Example #1
File: process.py Project: shaik1990/SLDA
def processGutenberg(f_gutenberg, f_gproj):
    """
    Processing the Project Gutenberg corpus.
    """
    for f_g in ["test/", "train/"]:
        createPath(f_gproj + f_g)

        for f_novel in listdir(f_gutenberg + f_g):
            if f_novel.endswith("_en.txt"):

                with copen(f_gproj + f_g + f_novel, "w",
                           encoding="utf-8") as gproj_f:
                    gproj_f.write("<d src=\"%s\">\n" % f_novel)

                    with copen(f_gutenberg + f_g + f_novel,
                               encoding="utf-8") as novel_f:
                        j = 2

                        for i, line in enumerate(novel_f.readlines()):
                            if i in xrange(j - 2, j):
                                line = line.strip()

                                if line.startswith("<S"):
                                    m = match(
                                        ".*sentNum:([0-9]+).*F:([0|1]) I:([0|1])",
                                        line)
                                    gproj_f.write(
                                        "<s id=\"%s\" f=\"%s\" i=\"%s\">" %
                                        (m.group(1), m.group(2), m.group(3)))
                                else:
                                    gproj_f.write("%s</s>\n" % line)
                            elif i == j:
                                j += 4
                    gproj_f.write("</d>\n")
Example #2
File: process.py Project: akullpp/SLDA
def processGutenberg(f_gutenberg, f_gproj):
    """
    Processing the Project Gutenberg corpus.
    """
    for f_g in ["test/", "train/"]:
        createPath(f_gproj + f_g)

        for f_novel in listdir(f_gutenberg + f_g):
            if f_novel.endswith("_en.txt"):

                with copen(f_gproj + f_g + f_novel, "w", encoding="utf-8") as gproj_f:
                    gproj_f.write("<d src=\"%s\">\n" % f_novel)

                    with copen(f_gutenberg + f_g + f_novel, encoding="utf-8") as novel_f:
                        j = 2

                        for i, line in enumerate(novel_f.readlines()):
                            if i in xrange(j - 2, j):
                                line = line.strip()

                                if line.startswith("<S"):
                                    m = match(".*sentNum:([0-9]+).*F:([0|1]) I:([0|1])", line)
                                    gproj_f.write("<s id=\"%s\" f=\"%s\" i=\"%s\">" % (m.group(1), m.group(2), m.group(3)))
                                else:
                                    gproj_f.write("%s</s>\n" % line)
                            elif i == j:
                                j += 4
                    gproj_f.write("</d>\n")
Example #3
def extractAlignmentsRX(f_align, f_align_p, f_stats):
    """ Extracts the alignments with regex.

    Makes it easier to detect HUN-aligned files, which are dropped due to inconsistencies. Mainly used for the small
    OpenSubtitles corpus, not the 2011 one.
    """
    print "Extracting alignments"

    alignments = {}
    final = {}
    hun_files = set()
    doc_count = 0
    link_count = 0

    with gopen(f_align) as align_f:
        for line in align_f:
            line = line.strip()

            if line.startswith("<linkGrp"):
                doc_count += 1
                m = search("fromDoc=\"(.+)\"\stoDoc=\"(.+)\"", line)

                if m:
                    key = (m.group(1), m.group(2))
                elif not m:
                    m = search("toDoc=\"(.+)\"\sfromDoc=\"(.+)\"", line)
                    key = (m.group(2), m.group(1))
                alignments.setdefault(key, [])
            elif line.startswith("<link id="):
                link_count += 1
                m = search("xtargets=\"(.+?)\"", line)
                alignments[key].append(m.group(1).split(";"))
            elif line.startswith("<link certainty="):
                hun_files.add(key)

                if key in alignments:
                    del alignments[key]
                continue

    empty = set()

    for k, v in alignments.iteritems():
        if len(v) != 0:
            final.setdefault(k, v)
        else:
            empty.add(k)
    dumpStruct(f_align_p, final)
    createPath(f_stats)

    with open(f_stats, "w") as stats:
            stats.write("DOCS: %d\nHUN: %d\nEMPTY: %d\nLEFT: %d\nLINKS: %d\n\n" %
                       (doc_count, len(hun_files), len(empty), len(final), link_count))

            for k in hun_files:
                stats.write(k[0] + " || " + k[1] + "\n")
            stats.write("\n")
Example #4
def extractAlignmentsLXML(f_align, f_align_p, f_stats):
    """ Extracts alignment information from the alignments file with LXML.

    Used for the large OpenSubtitles 2011 corpus for faster processing.
    """
    print "Extracting alignments"

    class Target(object):
        def __init__(self):
            self.d = dict()
            self.n_links = 0
            self.n_docs = 0

        def start(self, tag, attr):
            if tag == "linkGrp":
                self.n_docs += 1
                self.k = (attr["fromDoc"], attr["toDoc"])
                self.group = self.d[self.k] = []
            elif tag == "link":
                self.n_links += 1
                self.group.append(tuple(attr["xtargets"].split(";")))

                if "certainty" in attr:
                    print "Attention HUN: %s" % self.k

        def close(self):
            pass

    with gopen(f_align) as xml:
        targets = Target()
        parser = etree.XMLParser(target=targets)
        etree.parse(xml, parser)

    alignments = targets.d

    # Documents with no alignments
    empty = set()

    # Use items() here: it returns a copy, so entries can be removed while
    # looping (iteritems() would raise "dictionary changed size during iteration").
    for k, v in alignments.items():
        if not v:
            empty.add(k)
            del alignments[k]

    dumpStruct(f_align_p, alignments)
    createPath(f_stats)

    with open(f_stats, "w") as stats:
        stats.write("DOCS: %d\nEMPTY: %d\nLEFT: %d\nLINKS: %d\n\n" %
                    (targets.n_docs, len(empty), len(alignments), targets.n_links))

        for k in empty:
            stats.write("!!! Empty files\n%s || %s\n" % (k[0], k[1]))
            stats.write("\n")
Example #5
def plainCopyDocuments(f_align_p, f_corpus, f_clean):
    """ Copies the files with alignments to a seperate folder.
    """
    align_p_f = loadStruct(f_align_p)

    print "Copying %d documents" % len(align_p_f)

    for key in align_p_f.iterkeys():
        to_de = f_clean + key[0]
        to_en = f_clean + key[1]
        createPath(to_de)
        createPath(to_en)
        copy(f_corpus + key[0], to_de)
        copy(f_corpus + key[1], to_en)
Example #6
File: preprocess.py Project: caomw/SLDA
def plainCopyDocuments(f_align_p, f_corpus, f_clean):
    """ Copies the files with alignments to a seperate folder.
    """
    align_p_f = loadStruct(f_align_p)

    print "Copying %d documents" % len(align_p_f)

    for key in align_p_f.iterkeys():
        to_de = f_clean + key[0]
        to_en = f_clean + key[1]
        createPath(to_de)
        createPath(to_en)
        copy(f_corpus + key[0], to_de)
        copy(f_corpus + key[1], to_en)
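
The helpers used throughout these examples (createPath, loadStruct, dumpStruct, getStopwords, copen, gopen, etc.) come from the project itself and are not shown in the excerpts. The sketch below lists the presumed imports and rough stand-ins for those helpers; it is a guess for readability, not the project's actual code:

# Presumed imports, inferred from how the names are used in the excerpts.
import os
import pickle
from codecs import open as copen        # unicode-aware file access
from gzip import open as gopen          # transparent .gz reading
from os import listdir
from re import match, search
from shutil import copy
from xml.dom.minidom import parse
from lxml import etree

def createPath(path):
    # make sure the directory containing `path` exists
    d = os.path.dirname(path)
    if d and not os.path.exists(d):
        os.makedirs(d)

def dumpStruct(path, obj):
    createPath(path)
    with open(path, "wb") as f:
        pickle.dump(obj, f)

def loadStruct(path):
    with open(path, "rb") as f:
        return pickle.load(f)

def getStopwords():
    # stand-in: the project presumably loads an English stopword list
    return set(["the", "a", "an", "is", "to", "of"])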
Example #7
File: process.py Project: akullpp/SLDA
def createProjection(f_align_p, f_stats, f_clean, f_proj, f_unknown_p):
    """ Creates the projection based on rules.
    """
    fcount = 0
    de_count = 0
    en_count = 0
    pos = 0
    neg = 0
    lost = 0
    nn = 0
    on = 0
    no = 0
    oo = 0
    de_lost = 0
    scount = 0
    align_p_f = loadStruct(f_align_p)
    total = len(align_p_f)
    unknown = {}

    for lang, rels in align_p_f.iteritems():
        fcount += 1

        if fcount % 500 == 0 or fcount == total or fcount == 1:
            print "Documents: %d/%d" % (fcount, total)

        with copen(f_clean + lang[0].replace(".gz", "")) as xml_f:
            proj = {}
            dom = parse(xml_f)
            nodes = dom.getElementsByTagName("s")
            de_count += len(nodes)

            for link in rels:
                for node in nodes:
                    id_de = node.getAttribute("id")
                    links_de = link[0].split(" ")

                    if id_de in links_de and link[1] != "":
                        sentence = node.firstChild.nodeValue.split(" ")
                        meta = "<s id=\"0\" f=\"0\" i=\"0\">"

                        if "du" in sentence or "Du" in sentence:
                            meta = meta.replace("i=\"0\"", "i=\"1\"")
                        if "Sie" in sentence[1:]:
                            meta = meta.replace("f=\"0\"", "f=\"1\"")

                        if "f=\"0\" i=\"0\"" in meta:
                            nn += 1
                        elif "f=\"1\" i=\"0\"" in meta:
                            on += 1
                        elif "f=\"0\" i=\"1\"" in meta:
                            no += 1
                        elif "f=\"1\" i=\"1\"" in meta:
                            oo += 1

                        if "f=\"1\" i=\"1\"" not in meta:
                            for id_en in link[1].split(" "):
                                proj[id_en] = meta.replace("id=\"0\"", "id=\"%s\"" % id_en)
                    else:
                        de_lost += 1
            en_count += len(proj)

        with copen(f_clean + lang[1].replace(".gz", "")) as xml_e:
            unknown.setdefault(lang, [])
            fname_e = f_proj + "_".join(lang[1].split("/")).replace(".xml.gz", ".txt").replace("en_", "")
            createPath(fname_e)

            with copen(fname_e, "w", encoding="utf-8") as txt_e:
                txt_e.write("<d src=\"%s\">\n" % lang[0].replace(".gz", ""))
                dom_e = parse(xml_e)
                nodes_e = dom_e.getElementsByTagName("s")

                for node in nodes_e:
                    id_e = node.getAttribute("id")
                    sent_e = node.firstChild.nodeValue

                    if id_e in proj:
                        proj_e = proj[id_e]
                        s_sent_e = sent_e.split(" ")

                        if "you" in s_sent_e and "f=\"0\" i=\"0\"" not in proj_e:
                            pos += 1
                            scount += 1
                            txt_e.write("%s%s</s>\n" % (proj_e, sent_e))
                        elif "you" in s_sent_e and "f=\"0\" i=\"0\"" in proj_e:
                            neg += 1
                            unknown[lang].append(id_e)
                        elif "you" not in s_sent_e and "f=\"0\" i=\"0\"" in proj_e:
                            scount += 1
                            txt_e.write("%s%s</s>\n" % (proj_e, sent_e))
                        elif "you" not in s_sent_e and "f=\"0\" i=\"0\"" not in proj_e:
                            lost += 1
                txt_e.write("</d>\n")
                txt_e.flush()

    with open(f_stats, "a") as stats:
        stats.write("PROJECTED DE_%d TO %d_EN\n"
                    "DE 0 0: %d\n"
                    "DE 1 0: %d\n"
                    "DE 0 1: %d\n"
                    "DE 1 1: %d\n"
                    "Y-Found: %d\n"
                    "Y-NotFound: %d\n"
                    "F-Lost: %d\n"
                    "Sentences: %d\n"
                    "DE no EN: %d" %
                   (de_count, en_count, nn, on, no, oo, pos, neg, lost, scount, de_lost))

    dumpStruct(f_unknown_p, unknown)
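
The projection rule itself is compact: a German sentence containing "du"/"Du" is tagged informal (i="1"), a non-sentence-initial "Sie" is tagged formal (f="1") (a sentence-initial "Sie" is skipped, presumably because it may simply mean "she"/"they"), and sentences tagged both ways are dropped. A stand-alone sketch of just that rule, with toy sentences:

def classify(tokens):
    # tokens: the German sentence split on spaces, as in createProjection
    meta = "<s id=\"0\" f=\"0\" i=\"0\">"
    if "du" in tokens or "Du" in tokens:
        meta = meta.replace("i=\"0\"", "i=\"1\"")
    if "Sie" in tokens[1:]:
        meta = meta.replace("f=\"0\"", "f=\"1\"")
    return meta

print(classify("Kannst du mir helfen ?".split(" ")))   # ... f="0" i="1"  (informal)
print(classify("Was meinen Sie ?".split(" ")))         # ... f="1" i="0"  (formal)
print(classify("Sie ist schon hier .".split(" ")))     # ... f="0" i="0"  (undecided)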
Example #8
File: process.py Project: shaik1990/SLDA
def createProjection(f_align_p, f_stats, f_clean, f_proj, f_unknown_p):
    """ Creates the projection based on rules.
    """
    fcount = 0
    de_count = 0
    en_count = 0
    pos = 0
    neg = 0
    lost = 0
    nn = 0
    on = 0
    no = 0
    oo = 0
    de_lost = 0
    scount = 0
    align_p_f = loadStruct(f_align_p)
    total = len(align_p_f)
    unknown = {}

    for lang, rels in align_p_f.iteritems():
        fcount += 1

        if fcount % 500 == 0 or fcount == total or fcount == 1:
            print "Documents: %d/%d" % (fcount, total)

        with copen(f_clean + lang[0].replace(".gz", "")) as xml_f:
            proj = {}
            dom = parse(xml_f)
            nodes = dom.getElementsByTagName("s")
            de_count += len(nodes)

            for link in rels:
                for node in nodes:
                    id_de = node.getAttribute("id")
                    links_de = link[0].split(" ")

                    if id_de in links_de and link[1] != "":
                        sentence = node.firstChild.nodeValue.split(" ")
                        meta = "<s id=\"0\" f=\"0\" i=\"0\">"

                        if "du" in sentence or "Du" in sentence:
                            meta = meta.replace("i=\"0\"", "i=\"1\"")
                        if "Sie" in sentence[1:]:
                            meta = meta.replace("f=\"0\"", "f=\"1\"")

                        if "f=\"0\" i=\"0\"" in meta:
                            nn += 1
                        elif "f=\"1\" i=\"0\"" in meta:
                            on += 1
                        elif "f=\"0\" i=\"1\"" in meta:
                            no += 1
                        elif "f=\"1\" i=\"1\"" in meta:
                            oo += 1

                        if "f=\"1\" i=\"1\"" not in meta:
                            for id_en in link[1].split(" "):
                                proj[id_en] = meta.replace(
                                    "id=\"0\"", "id=\"%s\"" % id_en)
                    else:
                        de_lost += 1
            en_count += len(proj)

        with copen(f_clean + lang[1].replace(".gz", "")) as xml_e:
            unknown.setdefault(lang, [])
            fname_e = f_proj + "_".join(lang[1].split("/")).replace(
                ".xml.gz", ".txt").replace("en_", "")
            createPath(fname_e)

            with copen(fname_e, "w", encoding="utf-8") as txt_e:
                txt_e.write("<d src=\"%s\">\n" % lang[0].replace(".gz", ""))
                dom_e = parse(xml_e)
                nodes_e = dom_e.getElementsByTagName("s")

                for node in nodes_e:
                    id_e = node.getAttribute("id")
                    sent_e = node.firstChild.nodeValue

                    if id_e in proj:
                        proj_e = proj[id_e]
                        s_sent_e = sent_e.split(" ")

                        if "you" in s_sent_e and "f=\"0\" i=\"0\"" not in proj_e:
                            pos += 1
                            scount += 1
                            txt_e.write("%s%s</s>\n" % (proj_e, sent_e))
                        elif "you" in s_sent_e and "f=\"0\" i=\"0\"" in proj_e:
                            neg += 1
                            unknown[lang].append(id_e)
                        elif "you" not in s_sent_e and "f=\"0\" i=\"0\"" in proj_e:
                            scount += 1
                            txt_e.write("%s%s</s>\n" % (proj_e, sent_e))
                        elif "you" not in s_sent_e and "f=\"0\" i=\"0\"" not in proj_e:
                            lost += 1
                txt_e.write("</d>\n")
                txt_e.flush()

    with open(f_stats, "a") as stats:
        stats.write("PROJECTED DE_%d TO %d_EN\n"
                    "DE 0 0: %d\n"
                    "DE 1 0: %d\n"
                    "DE 0 1: %d\n"
                    "DE 1 1: %d\n"
                    "Y-Found: %d\n"
                    "Y-NotFound: %d\n"
                    "F-Lost: %d\n"
                    "Sentences: %d\n"
                    "DE no EN: %d" % (de_count, en_count, nn, on, no, oo, pos,
                                      neg, lost, scount, de_lost))

    dumpStruct(f_unknown_p, unknown)
Example #9
def cleanCopyDocuments(f_align_p,
                       f_corpus,
                       f_clean,
                       f_stats,
                       f_rem,
                       filter=True):
    """ Copies the documents with alignment in a clean format to a new folder as text files.
    """
    align_p_f = loadStruct(f_align_p)
    stopwords = getStopwords()
    n_docs = len(align_p_f)
    words_total = 0
    words_lost = 0
    sents_lost = 0

    with open(f_rem, "w") as rem_f:
        for i, key in enumerate(align_p_f.iterkeys()):
            if i % 500 == 0:
                print "Documents: %d/%d" % (i, n_docs)
            elif i == 0 or i == n_docs - 1:
                print "Documents: %d/%d" % (i + 1, n_docs)

            for lang in key:
                fname = f_clean + lang.replace(".gz", "")
                createPath(fname)

                with copen(fname, "w", encoding="utf-8") as xml_f:
                    doc = []
                    last_id = 0
                    words = 0

                    with gopen(f_corpus + lang) as clean_f:
                        for line in clean_f:
                            line = line.strip()

                            if line.startswith("<s"):
                                last_id = match(".*id=\"([0-9]+)\"",
                                                line).group(1)
                                doc.append([])
                            if line.startswith("<w"):
                                m = match(".*>(.+)</", line)
                                if m:
                                    word = m.group(1)
                                    words += 1
                                    if lang.startswith("en"):
                                        words_total += 1
                                        word = word.strip().lower().replace(
                                            "\'", "")

                                        if filter and word not in stopwords and len(
                                                word) > 1 and word.isalpha():
                                            doc[-1].append(word)
                                        elif not filter:
                                            doc[-1].append(word)
                                        else:
                                            words_lost += 1
                                    elif lang.startswith("de"):
                                        doc[-1].append(word)

                    xml_f.write(
                        "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<d s=\"%s\" w=\"%s\" f=\"%s\">\n"
                        % (last_id, words, lang.replace(".gz", "")))

                    for k, v in enumerate(doc):
                        sid = k + 1

                        if len(v) > 1:
                            xml_f.write("<s id=\"%s\">%s</s>\n" %
                                        (sid, " ".join(v).decode("utf-8")))
                        if len(v) <= 1:
                            sents_lost += 1
                            rem_f.write("[R] %s %s %s\n" %
                                        (str(key), lang[0:2], sid))

                            for projection in align_p_f[key]:
                                if lang.startswith("de") and str(
                                        sid) in projection[0].split(" "):
                                    align_p_f[key].remove(projection)
                                    break
                                elif lang.startswith("en") and str(
                                        sid) in projection[1].split(" "):
                                    align_p_f[key].remove(projection)
                                    break
                    xml_f.write("</d>\n")
                    xml_f.flush()
    with open(f_stats, "a") as stats_f:
        stats_f.write("Removed: %d sentences\n" % sents_lost)
        scount = 0

        for v in align_p_f.itervalues():
            scount += len(v)

        stats_f.write("Remaining: %d sentences\n" % scount)
        stats_f.write("Total words: %d\n" % words_total)
        stats_f.write("Words lost: %d\n" % words_lost)
        stats_f.write("Words remmaining: %d\n" % (words_total - words_lost))

    dumpStruct(f_align_p, align_p_f)
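
On the English side the token filter keeps a word only if, after lowercasing and stripping apostrophes, it is not a stopword, is longer than one character, and is purely alphabetic; everything else counts towards words_lost. A stand-alone sketch of that filter (the stopword set is a stand-in for whatever getStopwords() returns):

stopwords = set(["the", "a", "i", "is", "to"])      # stand-in for getStopwords()

def clean(token):
    word = token.strip().lower().replace("'", "")
    if word not in stopwords and len(word) > 1 and word.isalpha():
        return word
    return None

tokens = ["The", "dog", "didn't", "bark", ",", "I", "think"]
print([w for w in (clean(t) for t in tokens) if w])
# ['dog', 'didnt', 'bark', 'think']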
Example #10
File: preprocess.py Project: caomw/SLDA
def cleanCopyDocuments(f_align_p, f_corpus, f_clean, f_stats, f_rem, filter=True):
    """ Copies the documents with alignment in a clean format to a new folder as text files.
    """
    align_p_f = loadStruct(f_align_p)
    stopwords = getStopwords()
    n_docs = len(align_p_f)
    words_total = 0
    words_lost = 0
    sents_lost = 0

    with open(f_rem, "w") as rem_f:
        for i, key in enumerate(align_p_f.iterkeys()):
            if i % 500 == 0:
                print "Documents: %d/%d" % (i, n_docs)
            elif i == 0 or i == n_docs - 1:
                print "Documents: %d/%d" % (i + 1, n_docs)

            for lang in key:
                fname = f_clean + lang.replace(".gz", "")
                createPath(fname)

                with copen(fname, "w", encoding="utf-8") as xml_f:
                    doc = []
                    last_id = 0
                    words = 0

                    with gopen(f_corpus + lang) as clean_f:
                        for line in clean_f:
                            line = line.strip()

                            if line.startswith("<s"):
                                last_id = match('.*id="([0-9]+)"', line).group(1)
                                doc.append([])
                            if line.startswith("<w"):
                                m = match(".*>(.+)</", line)
                                if m:
                                    word = m.group(1)
                                    words += 1
                                    if lang.startswith("en"):
                                        words_total += 1
                                        word = word.strip().lower().replace("'", "")

                                        if filter and word not in stopwords and len(word) > 1 and word.isalpha():
                                            doc[-1].append(word)
                                        elif not filter:
                                            doc[-1].append(word)
                                        else:
                                            words_lost += 1
                                    elif lang.startswith("de"):
                                        doc[-1].append(word)

                    xml_f.write(
                        '<?xml version="1.0" encoding="utf-8"?>\n<d s="%s" w="%s" f="%s">\n'
                        % (last_id, words, lang.replace(".gz", ""))
                    )

                    for k, v in enumerate(doc):
                        sid = k + 1

                        if len(v) > 1:
                            xml_f.write('<s id="%s">%s</s>\n' % (sid, " ".join(v).decode("utf-8")))
                        if len(v) <= 1:
                            sents_lost += 1
                            rem_f.write("[R] %s %s %s\n" % (str(key), lang[0:2], sid))

                            for projection in align_p_f[key]:
                                if lang.startswith("de") and str(sid) in projection[0].split(" "):
                                    align_p_f[key].remove(projection)
                                    break
                                elif lang.startswith("en") and str(sid) in projection[1].split(" "):
                                    align_p_f[key].remove(projection)
                                    break
                    xml_f.write("</d>\n")
                    xml_f.flush()
    with open(f_stats, "a") as stats_f:
        stats_f.write("Removed: %d sentences\n" % sents_lost)
        scount = 0

        for v in align_p_f.itervalues():
            scount += len(v)

        stats_f.write("Remaining: %d sentences\n" % scount)
        stats_f.write("Total words: %d\n" % words_total)
        stats_f.write("Words lost: %d\n" % words_lost)
        stats_f.write("Words remmaining: %d\n" % (words_total - words_lost))

    dumpStruct(f_align_p, align_p_f)
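
Taken together, the signatures suggest one plausible order of invocation for the OpenSubtitles side of the pipeline; the driver below is purely hypothetical (the paths are made up and the ordering is inferred from the parameters, not stated by these excerpts):

# Hypothetical driver -- paths and ordering are assumptions, not project code.
extractAlignmentsLXML("data/alignments/de-en.xml.gz", "work/align.p", "work/stats.txt")
cleanCopyDocuments("work/align.p", "data/OpenSubtitles/", "work/clean/",
                   "work/stats.txt", "work/removed.txt")
createProjection("work/align.p", "work/stats.txt", "work/clean/",
                 "work/proj/", "work/unknown.p")

# The Project Gutenberg corpus is handled by a separate path:
processGutenberg("data/gutenberg/", "work/gutenberg/")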