def __init__(self, l1, l1seq, w1):
    """
    Record the source-side fields of an example.

    l1 = source language
    l1seq = sequence of word IDs in source language
    w1 = focus word ID in source language

    When the focus word is not *UNKNOWN*, its language (as reported by
    language()) is asserted to equal l1.
    """
    self.l1 = l1
    self.l1seq = l1seq
    self.w1 = w1

    # The language consistency check is only applied to known focus words.
    if wordform(self.w1) == "*UNKNOWN*":
        return
    assert self.l1 == language(self.w1)
def get_training_biexample(l1, l2, f1, f2, falign):
    """
    Generator of bilingual training examples from this bicorpus.

    For every alignment link (i1, i2) produced by
    bicorpus_sentences_and_alignments(l1, l2, f1, f2, falign), yields
    BilingualExample(l1, seq, w1, w2), where w1 = ws1[i1], w2 = ws2[i2] and
    seq is the WINDOW_SIZE-token window of ws1 centred on i1, padded with
    the *LBOUNDARY* / *RBOUNDARY* word IDs where the window runs off
    either end of the sentence.

    A link is skipped (nothing is yielded for it) when:
      * w2 or w1 has the wordform *UNKNOWN*;
      * "W2W FOCUS LEMMAS" is non-empty and wordform(w1) is not in it;
      * w1 is not in targetmap(), has no entry there for the language of
        w2, does not list w2 among those translations, or has exactly one
        translation in that language.

    The window arithmetic only satisfies its own assertions when
    WINDOW_SIZE is odd (the window is half + 1 + half tokens wide).
    """
    import common.hyperparameters
    HYPERPARAMETERS = common.hyperparameters.read("language-model")
    WINDOW = HYPERPARAMETERS["WINDOW_SIZE"]
    # Floor division: same result as "/" on Python 2 ints, and keeps the
    # indices integral when true division is in effect.
    half = (WINDOW - 1) // 2

    for ws1, ws2, links in bicorpus_sentences_and_alignments(l1, l2, f1, f2, falign):
        for i1, i2 in links:
            w1 = ws1[i1]
            w2 = ws2[i2]

            l2new = language(w2)
            assert HYPERPARAMETERS["W2W SKIP TRANSLATIONS TO UNKNOWN WORD"]
            # Skip translations to unknown words. This check comes before
            # the language assertion, so unknown target words are never
            # subjected to it.
            if wordform(w2) == "*UNKNOWN*":
                continue
            assert l2new == l2

            # Skip translations from unknown words
            if wordform(w1) == "*UNKNOWN*":
                continue

            # If we are filtering examples by focus word. Despite the
            # hyperparameter's name, the comparison is made on the surface
            # form (wordform), not on a lemmatized form.
            focus_lemmas = HYPERPARAMETERS["W2W FOCUS LEMMAS"]
            if focus_lemmas is not None and len(focus_lemmas) > 0:
                assert language(w1) == "en"
                if wordform(w1) not in focus_lemmas:
                    logging.debug("Focus word %s not in our list of focus lemmas",
                                  repr(wordmap().str(w1)))
                    continue

            if w1 not in targetmap():
                logging.warning("No translations for word %s, skipping",
                                repr(wordmap().str(w1)))
                continue

            if l2new not in targetmap()[w1]:
                logging.warning("Word %s has no translations for language %s, skipping",
                                repr(wordmap().str(w1)), l2new)
                continue

            if w2 not in targetmap()[w1][l2new]:
                logging.error("Word %s cannot translate to word %s, skipping",
                              repr(wordmap().str(w1)), repr(wordmap().str(w2)))
                continue

            # A word with a single candidate translation is not used as a
            # training example.
            if len(targetmap()[w1][l2new]) == 1:
                logging.debug("Word %s has only one translation in language %s, skipping",
                              repr(wordmap().str(w1)), l2new)
                continue

            # Extract the window of tokens around index i1.
            # Pad with *LBOUNDARY* and *RBOUNDARY* as necessary.
            lo = i1 - half
            hi = i1 + half
            lpad = 0
            rpad = 0
            if lo < 0:
                lpad = -lo
                lo = 0
            if hi >= len(ws1):
                rpad = hi - (len(ws1) - 1)
                hi = len(ws1) - 1
            assert lpad + (hi - lo + 1) + rpad == WINDOW

            seq = ([wordmap().id((None, "*LBOUNDARY*"))] * lpad
                   + ws1[lo:hi + 1]
                   + [wordmap().id((None, "*RBOUNDARY*"))] * rpad)
            assert len(seq) == WINDOW
            # The focus word must sit exactly in the middle of the window.
            assert seq[half] == w1
            yield BilingualExample(l1, seq, w1, w2)
def l2(self):
    """Return the language of the target word self.w2, as given by language()."""
    target_language = language(self.w2)
    return target_language
reversecnt = {} for l1, l2, f1, f2, falign in w2w.corpora.bicorpora_filenames(): for ws1, ws2, links in w2w.corpora.bicorpus_sentences_and_alignments( l1, l2, f1, f2, falign): for i1, i2 in links: if len(ws1) <= i1 or len(ws2) <= i2: print >> sys.stderr, "This is going to break on link (%d, %d) because lens = (%d, %d)" % ( i1, i2, len(ws1), len(ws2)) print >> sys.stderr, [wordform(w) for w in ws1] print >> sys.stderr, [wordform(w) for w in ws2] print >> sys.stderr, links w1 = ws1[i1] w2 = ws2[i2] # print wordmap.str(w1)[1], wordmap.str(w2)[1] l2new = language(w2) assert HYPERPARAMETERS["W2W SKIP TRANSLATIONS TO UNKNOWN WORD"] # Skip translations to unknown words if wordform(w2) == "*UNKNOWN*": continue assert l2new == l2 # We don't filter here, otherwise we will get a reversemap that only maps to focus lemmas. # # If we are filtering examples by lemma # if not(HYPERPARAMETERS["W2W FOCUS LEMMAS"] is None or len (HYPERPARAMETERS["W2W FOCUS LEMMAS"]) == 0): # assert language(w1) == "en" # from lemmatizer import lemmatize # if lemmatize(language(w1), wordform(w1)) not in HYPERPARAMETERS["W2W FOCUS LEMMAS"]: ## logging.debug("Focus word %s (lemma %s) not in our list of focus lemmas" % (`wordmap().str(w1)`, lemmatize(language(w1), wordform(w1)))) # continue
else: original_embeddings[word] = numpy.array([float(v) for v in vals[1:]]) print >> sys.stderr, "...done reading embeddings from %s" % HYPERPARAMETERS["W2W INITIAL EMBEDDINGS"] print >> sys.stderr, "Skipped %s words for which we had duplicate embeddings" % percent(tot-len(original_embeddings), tot) print >> sys.stderr, stats() reversemap = targetmap(name="reverse") embeddings = numpy.zeros((wordmap().len, HYPERPARAMETERS["EMBEDDING_SIZE"])) assert embeddings.shape == (wordmap().len, HYPERPARAMETERS["EMBEDDING_SIZE"]) ELANG = HYPERPARAMETERS["W2W INITIAL EMBEDDINGS LANGUAGE"] for w in range(wordmap().len): embedding = None # If this word is in a different language than the embeddings. if language(w) != HYPERPARAMETERS["W2W INITIAL EMBEDDINGS LANGUAGE"]: if w not in reversemap: print >> sys.stderr, "Word %s is not even in target map! Using *UNKNOWN*" % `wordmap().str(w)` embedding = original_embeddings["*UNKNOWN*"] elif ELANG not in reversemap[w]: print >> sys.stderr, "Have no %s translations for word %s, only have %s, using *UNKNOWN*" % (ELANG, wordmap().str(w), reversemap[w].keys()) embedding = original_embeddings["*UNKNOWN*"] else: # Mix the target word embedding over the weighted translation into the source language mixcnt = {} for w2 in reversemap[w][ELANG]: if language(w2) is None: assert HYPERPARAMETERS["W2W SKIP TRANSLATIONS TO UNKNOWN WORD"] continue assert language(w2) == ELANG
print >> sys.stderr, "Skipped %s words for which we had duplicate embeddings" % percent( tot - len(original_embeddings), tot) print >> sys.stderr, stats() reversemap = targetmap(name="reverse") embeddings = numpy.zeros( (wordmap().len, HYPERPARAMETERS["EMBEDDING_SIZE"])) assert embeddings.shape == (wordmap().len, HYPERPARAMETERS["EMBEDDING_SIZE"]) ELANG = HYPERPARAMETERS["W2W INITIAL EMBEDDINGS LANGUAGE"] for w in range(wordmap().len): embedding = None # If this word is in a different language than the embeddings. if language(w) != HYPERPARAMETERS["W2W INITIAL EMBEDDINGS LANGUAGE"]: if w not in reversemap: print >> sys.stderr, "Word %s is not even in target map! Using *UNKNOWN*" % ` wordmap( ).str(w) ` embedding = original_embeddings["*UNKNOWN*"] elif ELANG not in reversemap[w]: print >> sys.stderr, "Have no %s translations for word %s, only have %s, using *UNKNOWN*" % ( ELANG, wordmap().str(w), reversemap[w].keys()) embedding = original_embeddings["*UNKNOWN*"] else: # Mix the target word embedding over the weighted translation into the source language mixcnt = {} for w2 in reversemap[w][ELANG]: if language(w2) is None: assert HYPERPARAMETERS[
cnt = {} reversecnt = {} for l1, l2, f1, f2, falign in w2w.corpora.bicorpora_filenames(): for ws1, ws2, links in w2w.corpora.bicorpus_sentences_and_alignments(l1, l2, f1, f2, falign): for i1, i2 in links: if len(ws1) <= i1 or len(ws2) <= i2: print >> sys.stderr, "This is going to break on link (%d, %d) because lens = (%d, %d)" % (i1,i2, len(ws1), len(ws2)) print >> sys.stderr, [wordform(w) for w in ws1] print >> sys.stderr, [wordform(w) for w in ws2] print >> sys.stderr, links w1 = ws1[i1] w2 = ws2[i2] # print wordmap.str(w1)[1], wordmap.str(w2)[1] l2new = language(w2) assert HYPERPARAMETERS["W2W SKIP TRANSLATIONS TO UNKNOWN WORD"] # Skip translations to unknown words if wordform(w2) == "*UNKNOWN*": continue assert l2new == l2 # We don't filter here, otherwise we will get a reversemap that only maps to focus lemmas. # # If we are filtering examples by lemma # if not(HYPERPARAMETERS["W2W FOCUS LEMMAS"] is None or len (HYPERPARAMETERS["W2W FOCUS LEMMAS"]) == 0): # assert language(w1) == "en" # from lemmatizer import lemmatize # if lemmatize(language(w1), wordform(w1)) not in HYPERPARAMETERS["W2W FOCUS LEMMAS"]: ## logging.debug("Focus word %s (lemma %s) not in our list of focus lemmas" % (`wordmap().str(w1)`, lemmatize(language(w1), wordform(w1))))
def get_training_biexample(l1, l2, f1, f2, falign):
    """
    Generator of bilingual training examples from this bicorpus.

    For every alignment link (i1, i2) produced by
    bicorpus_sentences_and_alignments(l1, l2, f1, f2, falign), yields
    BilingualExample(l1, seq, w1, w2), where w1 = ws1[i1], w2 = ws2[i2] and
    seq is the WINDOW_SIZE-token window of ws1 centred on i1, padded with
    the *LBOUNDARY* / *RBOUNDARY* word IDs where the window runs off
    either end of the sentence.

    A link is skipped (nothing is yielded for it) when:
      * w2 or w1 has the wordform *UNKNOWN*;
      * "W2W FOCUS LEMMAS" is non-empty and wordform(w1) is not in it;
      * w1 is not in targetmap(), has no entry there for the language of
        w2, does not list w2 among those translations, or has exactly one
        translation in that language.

    The window arithmetic only satisfies its own assertions when
    WINDOW_SIZE is odd (the window is half + 1 + half tokens wide).
    """
    import common.hyperparameters
    HYPERPARAMETERS = common.hyperparameters.read("language-model")
    WINDOW = HYPERPARAMETERS["WINDOW_SIZE"]
    # Floor division: same result as "/" on Python 2 ints, and keeps the
    # indices integral when true division is in effect.
    half = (WINDOW - 1) // 2

    for ws1, ws2, links in bicorpus_sentences_and_alignments(l1, l2, f1, f2, falign):
        for i1, i2 in links:
            w1 = ws1[i1]
            w2 = ws2[i2]

            l2new = language(w2)
            assert HYPERPARAMETERS["W2W SKIP TRANSLATIONS TO UNKNOWN WORD"]
            # Skip translations to unknown words. This check comes before
            # the language assertion, so unknown target words are never
            # subjected to it.
            if wordform(w2) == "*UNKNOWN*":
                continue
            assert l2new == l2

            # Skip translations from unknown words
            if wordform(w1) == "*UNKNOWN*":
                continue

            # If we are filtering examples by focus word. Despite the
            # hyperparameter's name, the comparison is made on the surface
            # form (wordform), not on a lemmatized form.
            focus_lemmas = HYPERPARAMETERS["W2W FOCUS LEMMAS"]
            if focus_lemmas is not None and len(focus_lemmas) > 0:
                assert language(w1) == "en"
                if wordform(w1) not in focus_lemmas:
                    logging.debug("Focus word %s not in our list of focus lemmas",
                                  repr(wordmap().str(w1)))
                    continue

            if w1 not in targetmap():
                logging.warning("No translations for word %s, skipping",
                                repr(wordmap().str(w1)))
                continue

            if l2new not in targetmap()[w1]:
                logging.warning("Word %s has no translations for language %s, skipping",
                                repr(wordmap().str(w1)), l2new)
                continue

            if w2 not in targetmap()[w1][l2new]:
                logging.error("Word %s cannot translate to word %s, skipping",
                              repr(wordmap().str(w1)), repr(wordmap().str(w2)))
                continue

            # A word with a single candidate translation is not used as a
            # training example.
            if len(targetmap()[w1][l2new]) == 1:
                logging.debug("Word %s has only one translation in language %s, skipping",
                              repr(wordmap().str(w1)), l2new)
                continue

            # Extract the window of tokens around index i1.
            # Pad with *LBOUNDARY* and *RBOUNDARY* as necessary.
            lo = i1 - half
            hi = i1 + half
            lpad = 0
            rpad = 0
            if lo < 0:
                lpad = -lo
                lo = 0
            if hi >= len(ws1):
                rpad = hi - (len(ws1) - 1)
                hi = len(ws1) - 1
            assert lpad + (hi - lo + 1) + rpad == WINDOW

            seq = ([wordmap().id((None, "*LBOUNDARY*"))] * lpad
                   + ws1[lo:hi + 1]
                   + [wordmap().id((None, "*RBOUNDARY*"))] * rpad)
            assert len(seq) == WINDOW
            # The focus word must sit exactly in the middle of the window.
            assert seq[half] == w1
            yield BilingualExample(l1, seq, w1, w2)