Example #1
from codecs import open as copen
from os import listdir
from re import match
from sys import maxint


def gutenbergStats(f_gproj):
    """
    Print word and sentence statistics for the Project Gutenberg corpus.

    Relies on the project-local helper getStopwords().
    """
    stopwords = getStopwords()
    lost_words = 0
    lost_sents = 0
    words = 0
    sents = 0
    sent_max = 0
    sent_min = maxint

    # Change between testing and training folder manually
    corpus_type = "test/"

    for f in listdir(f_gproj + corpus_type):
        with copen(f_gproj + corpus_type + f, encoding="utf-8") as proj_f:
            for line in proj_f:
                # Extract the sentence text from between the tags.
                m = match(".*>(.+)</.*", line)

                if m:
                    sentence = m.group(1).strip().split(" ")
                    clean = []

                    for word in sentence:
                        word = word.lower()

                        if word not in stopwords and len(word) > 1 and word.isalpha():
                            words += 1
                            clean.append(word)
                        else:
                            lost_words += 1
                    if len(clean) > 1:
                        sents += 1

                        if len(clean) > sent_max:
                            sent_max = len(clean)
                        if len(clean) < sent_min:
                            sent_min = len(clean)
                    else:
                        lost_sents += 1

    print "Lost Words: %d" % lost_words
    print "Lost Sentences: %d" % lost_sents
    print "Words: %d" % words
    print "Sentences: %d" % sents
    print "Sentence minimum length: %d" % sent_min
    print "Sentence maximum length: %d" % sent_max
    print "Sentence average length: %f" % (float(words) / float(sents))
Example #2
from codecs import open as copen
from os import listdir
from re import match


def convertGutenberg(f_root, f_type, f_gproj, slda=True, filter=True):
    """ Converts the Gutenberg corpus to SLDA or LLDA format.

    Relies on the project-local helper getStopwords().
    """
    stopwords = getStopwords()
    formal = 0
    informal = 0
    neutral = 0
    double = 0
    nn_lost = 0
    on_lost = 0
    no_lost = 0
    total = 0
    form = "SLDA"
    f_ttype = f_type

    if not slda:
        form = "LLDA"

    if f_type == "test":
        f_ttype = "gold"

    print "Converting Gutenberg corpus to %s format for %sing" % (form, f_type)

    with copen(f_root + f_ttype, "w", encoding="utf-8") as lda_f:
        for f in listdir(f_gproj + f_type):
            with copen(f_gproj + f_type + "/" + f, encoding="utf-8") as proj_f:
                for line in proj_f:
                    m = match(".*>(.+)</.*", line)

                    if m:
                        sentence = m.group(1).strip().split(" ")
                        clean = []

                        for word in sentence:
                            word = word.lower()

                            if filter:
                                if word not in stopwords and len(word) > 1 and word.isalpha():
                                    clean.append(word)
                            else:
                                if word.isalpha():
                                    clean.append(word)

                        if len(clean) > 1:
                            sentence = " ".join(clean)

                            # Neutral sentences are kept only when they do not
                            # contain "you"; formal/informal only when they do.
                            if 'f="0" i="0"' in line:
                                if "you" not in clean:
                                    neutral += 1
                                    total += 1
                                    if slda:
                                        lda_f.write("[2|%s]." % sentence)
                                    else:
                                        lda_f.write("[2] %s\n" % sentence)
                                else:
                                    nn_lost += 1
                            elif 'f="1" i="0"' in line:
                                if not "you" in clean:
                                    on_lost += 1
                                else:
                                    formal += 1
                                    total += 1
                                    if slda:
                                        lda_f.write("[1|%s]." % sentence)
                                    else:
                                        lda_f.write("[1] %s\n" % sentence)
                            elif 'f="0" i="1"' in line:
                                if not "you" in clean:
                                    no_lost += 1
                                else:
                                    informal += 1
                                    total += 1
                                    if slda:
                                        lda_f.write("[0|%s]." % sentence)
                                    else:
                                        lda_f.write("[0] %s\n" % sentence)
                            elif 'f="1" i="1"' in line:
                                double += 0
                lda_f.write("\n")
    print "Formal: %d" % formal
    print "Informal: %d" % informal
    print "Neutral: %d" % neutral
    print "Formal + Informal: %d" % double
    print "You in Neutral: %d" % nn_lost
    print "You not in Formal: %d" % on_lost
    print "You not in Informal: %d" % no_lost
    print "Total sentences %d" % total
    print
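The two output modes differ only in the per-sentence record format. A small sketch of what one labelled sentence looks like in each mode, using the label scheme the branches above assign (0 = informal, 1 = formal, 2 = neutral):

# One cleaned sentence written in both formats used above.
sentence = "and what do you make of it sir"
label = 1  # 0 = informal, 1 = formal, 2 = neutral

print "[%d|%s]." % (label, sentence)  # SLDA: [1|and what do you make of it sir].
print "[%d] %s" % (label, sentence)   # LLDA: [1] and what do you make of it sir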
Example #3
from codecs import open as copen
from gzip import open as gopen
from re import match


def cleanCopyDocuments(f_align_p, f_corpus, f_clean, f_stats, f_rem, filter=True):
    """ Copies the documents that have an alignment, in a clean format, to a new folder as text files.

    Relies on the project-local helpers loadStruct, dumpStruct, getStopwords and createPath.
    """
    align_p_f = loadStruct(f_align_p)
    stopwords = getStopwords()
    n_docs = len(align_p_f)
    words_total = 0
    words_lost = 0
    sents_lost = 0

    with open(f_rem, "w") as rem_f:
        for i, key in enumerate(align_p_f.iterkeys()):
            # Progress report every 500 documents and for the last one.
            if i % 500 == 0 or i == n_docs - 1:
                print "Documents: %d/%d" % (i + 1, n_docs)

            for lang in key:
                fname = f_clean + lang.replace(".gz", "")
                createPath(fname)

                with copen(fname, "w", encoding="utf-8") as xml_f:
                    doc = []
                    last_id = 0
                    words = 0

                    with gopen(f_corpus + lang) as clean_f:
                        for line in clean_f:
                            line = line.strip()

                            if line.startswith("<s"):
                                last_id = match(".*id=\"([0-9]+)\"",
                                                line).group(1)
                                doc.append([])
                            if line.startswith("<w"):
                                m = match(".*>(.+)</", line)
                                if m:
                                    word = m.group(1)
                                    words += 1
                                    if lang.startswith("en"):
                                        words_total += 1
                                        word = word.strip().lower().replace(
                                            "\'", "")

                                        if filter and word not in stopwords and len(
                                                word) > 1 and word.isalpha():
                                            doc[-1].append(word)
                                        elif not filter:
                                            doc[-1].append(word)
                                        else:
                                            words_lost += 1
                                    elif lang.startswith("de"):
                                        doc[-1].append(word)

                    # Document header: s = last sentence id, w = token count, f = source file.
                    xml_f.write(
                        '<?xml version="1.0" encoding="utf-8"?>\n<d s="%s" w="%s" f="%s">\n'
                        % (last_id, words, lang.replace(".gz", "")))

                    for k, v in enumerate(doc):
                        sid = k + 1

                        if len(v) > 1:
                            xml_f.write("<s id=\"%s\">%s</s>\n" %
                                        (sid, " ".join(v).decode("utf-8")))
                        if len(v) <= 1:
                            sents_lost += 1
                            rem_f.write("[R] %s %s %s\n" %
                                        (str(key), lang[0:2], sid))

                            # Drop the alignment projection that referenced the removed sentence.
                            for projection in align_p_f[key]:
                                if lang.startswith("de") and str(sid) in projection[0].split(" "):
                                    align_p_f[key].remove(projection)
                                    break
                                elif lang.startswith("en") and str(sid) in projection[1].split(" "):
                                    align_p_f[key].remove(projection)
                                    break
                    xml_f.write("</d>\n")
                    xml_f.flush()
    with open(f_stats, "a") as stats_f:
        stats_f.write("Removed: %d sentences\n" % sents_lost)
        scount = 0

        for v in align_p_f.itervalues():
            scount += len(v)

        stats_f.write("Remaining: %d sentences\n" % scount)
        stats_f.write("Total words: %d\n" % words_total)
        stats_f.write("Words lost: %d\n" % words_lost)
        stats_f.write("Words remmaining: %d\n" % (words_total - words_lost))

    dumpStruct(f_align_p, align_p_f)
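A hedged usage sketch for this step. The paths are hypothetical, and the structure of the alignment dict (document-pair keys mapping to lists of (de ids, en ids) projections) is inferred from how the code above indexes it:

# Hypothetical paths; loadStruct/dumpStruct are assumed to (de)serialise
# the alignment dict keyed by (de-file, en-file) name pairs.
cleanCopyDocuments(
    f_align_p="data/alignments.struct",
    f_corpus="data/corpus/",    # gzipped per-document XML input
    f_clean="data/clean/",      # cleaned XML documents written here
    f_stats="data/stats.txt",   # summary statistics appended here
    f_rem="data/removed.log",   # dropped sentences logged here
    filter=True,                # stop-word filter the English side
)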