Python language示例，w2w.vocabulary.language Python示例

示例#1

0

显示文件

文件： examples.py 项目： AcademiaSinicaNLPLab/neural-language-model

    def __init__(self, l1, l1seq, w1):
        """
        Record the source-side fields of an example.

        l1 = source language
        l1seq = sequence of word IDs in source language
        w1 = focus word ID in source language

        Unless the form of w1 is "*UNKNOWN*", asserts that l1 equals
        language(w1).
        """
        self.l1, self.l1seq, self.w1 = l1, l1seq, w1

        # The language consistency check is only applied when the focus
        # word's form is something other than "*UNKNOWN*".
        focus_is_known = wordform(self.w1) != "*UNKNOWN*"
        assert not focus_is_known or self.l1 == language(self.w1)

示例#2

0

显示文件

文件： examples.py 项目： zbxzc35/neural-language-model

    def __init__(self, l1, l1seq, w1):
        """
        Initialise the example from its source-language description.

        l1 = source language
        l1seq = sequence of word IDs in source language
        w1 = focus word ID in source language

        Asserts that language(w1) matches l1, except when the form of w1
        is "*UNKNOWN*".
        """
        self.l1 = l1
        self.l1seq = l1seq
        self.w1 = w1

        # Nothing to verify when the focus word's form is "*UNKNOWN*".
        if wordform(self.w1) == "*UNKNOWN*":
            return
        assert self.l1 == language(self.w1)

示例#3

0

显示文件

文件： examples.py 项目： AcademiaSinicaNLPLab/neural-language-model

def get_training_biexample(l1, l2, f1, f2, falign):
    """
    Generator of bilingual training examples from this bicorpus.

    All five arguments are forwarded unchanged to
    bicorpus_sentences_and_alignments(), which yields (ws1, ws2, links)
    triples.  Each (i1, i2) pair in links selects the word pair
    (w1, w2) = (ws1[i1], ws2[i2]).
    NOTE(review): presumably l1/l2 are language codes and f1/f2/falign are
    corpus and alignment filenames -- confirm against the callers.

    A link is skipped when:
      * the form of w2 or of w1 is "*UNKNOWN*";
      * the "W2W FOCUS LEMMAS" hyperparameter is non-empty and the form of
        w1 is not listed in it;
      * w1 is not in targetmap(), has no entry for the language of w2, or
        w2 is not among its translations for that language;
      * w1 has exactly one translation in that language.

    For every remaining link this yields BilingualExample(l1, seq, w1, w2),
    where seq is a list of WINDOW_SIZE word IDs centred on w1, padded with
    the *LBOUNDARY* / *RBOUNDARY* IDs wherever the window extends past the
    sentence.  The window assertions can only hold for an odd WINDOW_SIZE.
    """
    import common.hyperparameters
    HYPERPARAMETERS = common.hyperparameters.read("language-model")
    WINDOW = HYPERPARAMETERS["WINDOW_SIZE"]
    # Floor division keeps the offset an integer regardless of whether "/"
    # means integer or true division; a float here would break the slice
    # and the index lookup below.
    half = (WINDOW - 1) // 2

    for ws1, ws2, links in bicorpus_sentences_and_alignments(l1, l2, f1, f2, falign):
        for i1, i2 in links:
            w1 = ws1[i1]
            w2 = ws2[i2]

            l2new = language(w2)
            assert HYPERPARAMETERS["W2W SKIP TRANSLATIONS TO UNKNOWN WORD"]
            # Skip translations to unknown words
            if wordform(w2) == "*UNKNOWN*":
                continue
            assert l2new == l2

            # Skip translations from unknown words
            if wordform(w1) == "*UNKNOWN*":
                continue

            # If we are filtering examples by lemma
            focus_lemmas = HYPERPARAMETERS["W2W FOCUS LEMMAS"]
            if focus_lemmas is not None and len(focus_lemmas) != 0:
                assert language(w1) == "en"
                # The raw word form is matched against the list; w1 is not
                # lemmatized first.
                if wordform(w1) not in focus_lemmas:
                    logging.debug("Focus word %s not in our list of focus lemmas", repr(wordmap().str(w1)))
                    continue

            if w1 not in targetmap():
                logging.warning("No translations for word %s, skipping", repr(wordmap().str(w1)))
                continue

            if l2new not in targetmap()[w1]:
                logging.warning("Word %s has no translations for language %s, skipping", repr(wordmap().str(w1)), l2new)
                continue

            if w2 not in targetmap()[w1][l2new]:
                logging.error("Word %s cannot translate to word %s, skipping", repr(wordmap().str(w1)), repr(wordmap().str(w2)))
                continue

            # A word with a single possible translation is not used as an example.
            if len(targetmap()[w1][l2new]) == 1:
                logging.debug("Word %s has only one translation in language %s, skipping", repr(wordmap().str(w1)), l2new)
                continue

            # Extract the window of tokens around index i1. Pad with *LBOUNDARY* and *RBOUNDARY* as necessary.
            # lo/hi are the inclusive sentence indices actually covered;
            # lpad/rpad count the window positions falling outside the sentence.
            last = len(ws1) - 1
            lpad = max(0, half - i1)
            rpad = max(0, i1 + half - last)
            lo = max(0, i1 - half)
            hi = min(last, i1 + half)
            assert lpad + (hi - lo + 1) + rpad == WINDOW

            lboundary = wordmap().id((None, "*LBOUNDARY*"))
            window_tokens = ws1[lo:hi + 1]
            rboundary = wordmap().id((None, "*RBOUNDARY*"))
            seq = [lboundary] * lpad + window_tokens + [rboundary] * rpad
            assert len(seq) == WINDOW

            # The focus word must sit exactly in the middle of the window.
            assert seq[half] == w1
            yield BilingualExample(l1, seq, w1, w2)

示例#4

0

显示文件

文件： examples.py 项目： AcademiaSinicaNLPLab/neural-language-model

 def l2(self):
     """Return the language of self.w2, as reported by language()."""
     word = self.w2
     return language(word)

示例#5

0

显示文件

    reversecnt = {}
    for l1, l2, f1, f2, falign in w2w.corpora.bicorpora_filenames():
        for ws1, ws2, links in w2w.corpora.bicorpus_sentences_and_alignments(
                l1, l2, f1, f2, falign):
            for i1, i2 in links:
                if len(ws1) <= i1 or len(ws2) <= i2:
                    print >> sys.stderr, "This is going to break on link (%d, %d) because lens = (%d, %d)" % (
                        i1, i2, len(ws1), len(ws2))
                    print >> sys.stderr, [wordform(w) for w in ws1]
                    print >> sys.stderr, [wordform(w) for w in ws2]
                    print >> sys.stderr, links
                w1 = ws1[i1]
                w2 = ws2[i2]
                #                print wordmap.str(w1)[1], wordmap.str(w2)[1]

                l2new = language(w2)

                assert HYPERPARAMETERS["W2W SKIP TRANSLATIONS TO UNKNOWN WORD"]
                # Skip translations to unknown words
                if wordform(w2) == "*UNKNOWN*": continue

                assert l2new == l2

                # We don't filter here, otherwise we will get a reversemap that only maps to focus lemmas.
                #                # If we are filtering examples by lemma
                #                if not(HYPERPARAMETERS["W2W FOCUS LEMMAS"] is None or len (HYPERPARAMETERS["W2W FOCUS LEMMAS"]) == 0):
                #                    assert language(w1) == "en"
                #                    from lemmatizer import lemmatize
                #                    if lemmatize(language(w1), wordform(w1)) not in HYPERPARAMETERS["W2W FOCUS LEMMAS"]:
                ##                        logging.debug("Focus word %s (lemma %s) not in our list of focus lemmas" % (`wordmap().str(w1)`, lemmatize(language(w1), wordform(w1))))
                #                        continue

示例#6

0

显示文件

文件： build-initial-embeddings.py 项目： AcademiaSinicaNLPLab/neural-language-model

        else:
            original_embeddings[word] = numpy.array([float(v) for v in vals[1:]])
    print >> sys.stderr, "...done reading embeddings from %s" % HYPERPARAMETERS["W2W INITIAL EMBEDDINGS"]
    print >> sys.stderr, "Skipped %s words for which we had duplicate embeddings" % percent(tot-len(original_embeddings), tot)
    print >> sys.stderr, stats()

    reversemap = targetmap(name="reverse")

    embeddings = numpy.zeros((wordmap().len, HYPERPARAMETERS["EMBEDDING_SIZE"]))
    assert embeddings.shape == (wordmap().len, HYPERPARAMETERS["EMBEDDING_SIZE"])

    ELANG = HYPERPARAMETERS["W2W INITIAL EMBEDDINGS LANGUAGE"]
    for w in range(wordmap().len):
        embedding = None
        # If this word is in a different language than the embeddings.
        if language(w) != HYPERPARAMETERS["W2W INITIAL EMBEDDINGS LANGUAGE"]:
            if w not in reversemap:
                print >> sys.stderr, "Word %s is not even in target map! Using *UNKNOWN*" % `wordmap().str(w)`
                embedding = original_embeddings["*UNKNOWN*"]
            elif ELANG not in reversemap[w]:
                print >> sys.stderr, "Have no %s translations for word %s, only have %s, using *UNKNOWN*" % (ELANG, wordmap().str(w), reversemap[w].keys())
                embedding = original_embeddings["*UNKNOWN*"]
            else:
                # Mix the target word embedding over the weighted translation into the source language

                mixcnt = {}
                for w2 in reversemap[w][ELANG]:
                    if language(w2) is None:
                        assert HYPERPARAMETERS["W2W SKIP TRANSLATIONS TO UNKNOWN WORD"]
                        continue
                    assert language(w2) == ELANG

示例#7

0

显示文件

    print >> sys.stderr, "Skipped %s words for which we had duplicate embeddings" % percent(
        tot - len(original_embeddings), tot)
    print >> sys.stderr, stats()

    reversemap = targetmap(name="reverse")

    embeddings = numpy.zeros(
        (wordmap().len, HYPERPARAMETERS["EMBEDDING_SIZE"]))
    assert embeddings.shape == (wordmap().len,
                                HYPERPARAMETERS["EMBEDDING_SIZE"])

    ELANG = HYPERPARAMETERS["W2W INITIAL EMBEDDINGS LANGUAGE"]
    for w in range(wordmap().len):
        embedding = None
        # If this word is in a different language than the embeddings.
        if language(w) != HYPERPARAMETERS["W2W INITIAL EMBEDDINGS LANGUAGE"]:
            if w not in reversemap:
                print >> sys.stderr, "Word %s is not even in target map! Using *UNKNOWN*" % ` wordmap(
                ).str(w) `
                embedding = original_embeddings["*UNKNOWN*"]
            elif ELANG not in reversemap[w]:
                print >> sys.stderr, "Have no %s translations for word %s, only have %s, using *UNKNOWN*" % (
                    ELANG, wordmap().str(w), reversemap[w].keys())
                embedding = original_embeddings["*UNKNOWN*"]
            else:
                # Mix the target word embedding over the weighted translation into the source language

                mixcnt = {}
                for w2 in reversemap[w][ELANG]:
                    if language(w2) is None:
                        assert HYPERPARAMETERS[

示例#8

0

显示文件

文件： build-target-vocabulary.py 项目： AcademiaSinicaNLPLab/neural-language-model

    cnt = {}
    reversecnt = {}
    for l1, l2, f1, f2, falign in w2w.corpora.bicorpora_filenames():
        for ws1, ws2, links in w2w.corpora.bicorpus_sentences_and_alignments(l1, l2, f1, f2, falign):
            for i1, i2 in links:
                if len(ws1) <= i1 or len(ws2) <= i2:
                    print >> sys.stderr, "This is going to break on link (%d, %d) because lens = (%d, %d)" % (i1,i2, len(ws1), len(ws2))
                    print >> sys.stderr, [wordform(w) for w in ws1]
                    print >> sys.stderr, [wordform(w) for w in ws2]
                    print >> sys.stderr, links
                w1 = ws1[i1]
                w2 = ws2[i2]
#                print wordmap.str(w1)[1], wordmap.str(w2)[1]

                l2new = language(w2)

                assert HYPERPARAMETERS["W2W SKIP TRANSLATIONS TO UNKNOWN WORD"]
                # Skip translations to unknown words
                if wordform(w2) == "*UNKNOWN*": continue

                assert l2new == l2


                # We don't filter here, otherwise we will get a reversemap that only maps to focus lemmas.
#                # If we are filtering examples by lemma
#                if not(HYPERPARAMETERS["W2W FOCUS LEMMAS"] is None or len (HYPERPARAMETERS["W2W FOCUS LEMMAS"]) == 0):
#                    assert language(w1) == "en"
#                    from lemmatizer import lemmatize
#                    if lemmatize(language(w1), wordform(w1)) not in HYPERPARAMETERS["W2W FOCUS LEMMAS"]:
##                        logging.debug("Focus word %s (lemma %s) not in our list of focus lemmas" % (`wordmap().str(w1)`, lemmatize(language(w1), wordform(w1))))

示例#9

0

显示文件

文件： examples.py 项目： zbxzc35/neural-language-model

def get_training_biexample(l1, l2, f1, f2, falign):
    """
    Generator of bilingual training examples from this bicorpus.

    All five arguments are forwarded unchanged to
    bicorpus_sentences_and_alignments(), which yields (ws1, ws2, links)
    triples.  Each (i1, i2) pair in links selects the word pair
    (w1, w2) = (ws1[i1], ws2[i2]).
    NOTE(review): presumably l1/l2 are language codes and f1/f2/falign are
    corpus and alignment filenames -- confirm against the callers.

    A link is skipped when:
      * the form of w2 or of w1 is "*UNKNOWN*";
      * the "W2W FOCUS LEMMAS" hyperparameter is non-empty and the form of
        w1 is not listed in it;
      * w1 is not in targetmap(), has no entry for the language of w2, or
        w2 is not among its translations for that language;
      * w1 has exactly one translation in that language.

    For every remaining link this yields BilingualExample(l1, seq, w1, w2),
    where seq is a list of WINDOW_SIZE word IDs centred on w1, padded with
    the *LBOUNDARY* / *RBOUNDARY* IDs wherever the window extends past the
    sentence.  The window assertions can only hold for an odd WINDOW_SIZE.
    """
    import common.hyperparameters
    HYPERPARAMETERS = common.hyperparameters.read("language-model")
    WINDOW = HYPERPARAMETERS["WINDOW_SIZE"]
    # Floor division keeps the offset an integer regardless of whether "/"
    # means integer or true division; a float here would break the slice
    # and the index lookup below.
    half = (WINDOW - 1) // 2

    for ws1, ws2, links in bicorpus_sentences_and_alignments(
            l1, l2, f1, f2, falign):
        for i1, i2 in links:
            w1 = ws1[i1]
            w2 = ws2[i2]

            l2new = language(w2)
            assert HYPERPARAMETERS["W2W SKIP TRANSLATIONS TO UNKNOWN WORD"]
            # Skip translations to unknown words
            if wordform(w2) == "*UNKNOWN*":
                continue
            assert l2new == l2

            # Skip translations from unknown words
            if wordform(w1) == "*UNKNOWN*":
                continue

            # If we are filtering examples by lemma
            focus_lemmas = HYPERPARAMETERS["W2W FOCUS LEMMAS"]
            if focus_lemmas is not None and len(focus_lemmas) != 0:
                assert language(w1) == "en"
                # The raw word form is matched against the list; w1 is not
                # lemmatized first.
                if wordform(w1) not in focus_lemmas:
                    logging.debug(
                        "Focus word %s not in our list of focus lemmas",
                        repr(wordmap().str(w1)))
                    continue

            if w1 not in targetmap():
                logging.warning("No translations for word %s, skipping",
                                repr(wordmap().str(w1)))
                continue

            if l2new not in targetmap()[w1]:
                logging.warning(
                    "Word %s has no translations for language %s, skipping",
                    repr(wordmap().str(w1)), l2new)
                continue

            if w2 not in targetmap()[w1][l2new]:
                logging.error("Word %s cannot translate to word %s, skipping",
                              repr(wordmap().str(w1)), repr(wordmap().str(w2)))
                continue

            # A word with a single possible translation is not used as an example.
            if len(targetmap()[w1][l2new]) == 1:
                logging.debug(
                    "Word %s has only one translation in language %s, skipping",
                    repr(wordmap().str(w1)), l2new)
                continue

            # Extract the window of tokens around index i1. Pad with *LBOUNDARY* and *RBOUNDARY* as necessary.
            # lo/hi are the inclusive sentence indices actually covered;
            # lpad/rpad count the window positions falling outside the sentence.
            last = len(ws1) - 1
            lpad = max(0, half - i1)
            rpad = max(0, i1 + half - last)
            lo = max(0, i1 - half)
            hi = min(last, i1 + half)
            assert lpad + (hi - lo + 1) + rpad == WINDOW

            lboundary = wordmap().id((None, "*LBOUNDARY*"))
            window_tokens = ws1[lo:hi + 1]
            rboundary = wordmap().id((None, "*RBOUNDARY*"))
            seq = [lboundary] * lpad + window_tokens + [rboundary] * rpad
            assert len(seq) == WINDOW

            # The focus word must sit exactly in the middle of the window.
            assert seq[half] == w1
            yield BilingualExample(l1, seq, w1, w2)

示例#10

0

显示文件

文件： examples.py 项目： zbxzc35/neural-language-model

 def l2(self):
     """Look up and return the language of self.w2 via language()."""
     lang_of_w2 = language(self.w2)
     return lang_of_w2