def load_fastText_trans_vectors(fname, vocab, lang):
    """Load fastText vectors from `fname`, align them into the shared space for
    `lang`, and keep only the words that occur in `vocab`."""
    word_vecs = {}
    words = FastVector(vector_file=fname)
    words.apply_transform("fastText_multilingual/alignment_matrices/" + lang + ".txt")
    for word in words.id2word:
        # Decode byte strings once; leave already-decoded strings untouched.
        key = word.decode('utf8').strip() if isinstance(word, bytes) else word.strip()
        if key in vocab:
            word_vecs[key] = words[word]
    return word_vecs
def upload_fasttext_embeddings(embeddings_path):
    """Load a fastText .vec file into a FastVector dictionary.

    `embeddings_path` is the full path to the vectors,
    e.g. embeddings_path + 'wiki.' + language_code + '.vec'."""
    from fastText_multilingual.fasttext import FastVector
    dictionary = FastVector(vector_file=embeddings_path)
    print('Dictionary loaded.')
    return dictionary
def get_sim1_fasttext(dictionary, list1, list2):
    """Average pairwise cosine similarity between the words of list1 and list2.

    Identical words count as 1.0; pairs with a word missing from `dictionary`
    contribute 0.0 rather than reusing the previous pair's score."""
    final_score = 0.0
    for word1 in list1:
        score = 0.0
        for word2 in list2:
            similarity = 0.0
            if word1 == word2:
                similarity = 1.0
            elif word1 in dictionary and word2 in dictionary:
                similarity = FastVector.cosine_similarity(dictionary[word1],
                                                          dictionary[word2])
            score += similarity
        final_score += score / float(len(list2))
    return final_score / float(len(list1))
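# Minimal usage sketch for the two helpers above (an illustration, not part of
# the original snippets): assumes fastText_multilingual is importable and a
# 'wiki.en.vec' embedding file is available locally.
from fastText_multilingual.fasttext import FastVector  # needed by get_sim1_fasttext

en_dictionary = upload_fasttext_embeddings('wiki.en.vec')
similarity = get_sim1_fasttext(en_dictionary,
                               ['dog', 'cat', 'house'],
                               ['puppy', 'kitten', 'building'])
print('average similarity: %.4f' % similarity)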
def align(_log, fasttext_dir="fasttext", langs="en,id", output_dir="aligned_fasttext"): """Align fasttext embeddings with the method of Smith et al. (2017).""" output_dir = Path(output_dir) output_dir.mkdir() for lang in langs.split(","): _log.info("Aligning embedding for %s", lang) output_path = output_dir / f"wiki.multi.{lang}.vec" if output_path.exists(): _log.info("Aligned embedding already exists, skipping") continue dictionary = FastVector(vector_file=Path(fasttext_dir) / f"wiki.{lang}.vec") dictionary.apply_transform( str( Path("fastText_multilingual") / "alignment_matrices" / f"{lang}.txt")) dictionary.export(output_path)
def word2vec(word2vec_model, vocabulary, lang):
    """Return a list of aligned embedding vectors, one per word in `vocabulary`."""
    # f = open('word2vec.txt', 'w')
    word2vec2 = []
    fr_model = FastVector(vector_file=word2vec_model)
    if lang == 'es':
        fr_model.apply_transform('./fastText_multilingual/alignment_matrices/es.txt')
    else:
        fr_model.apply_transform('./fastText_multilingual/alignment_matrices/en.txt')
    for word in vocabulary:
        try:
            word2vec = fr_model[word]
        except KeyError:
            # Out-of-vocabulary words get a small constant vector instead of failing.
            word2vec = [0.0000001] * 300
        # f.write(",".join(map(lambda x: str(x), word2vec)) + "\n")
        word2vec2.append(word2vec)
    return word2vec2
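# Illustrative call of word2vec() above (an assumption, not from the original
# code): 'wiki.es.vec' must be a real fastText .vec file on disk.
vectors = word2vec('wiki.es.vec', ['perro', 'gato', 'casa'], 'es')
print(len(vectors), len(vectors[0]))  # expected: 3 300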
def form_word_vec_dict(dataset, talks_read, talk_names, monolingual, src_word_set,
                       target_word_set, translated_word_dict, translated_pairs_file,
                       source_lang_embedding_file, target_lang_embedding_file,
                       source_lang_transformation_file, target_lang_transformation_file,
                       translation_complete):
    if dataset == 'SwDA':
        test_set_idx = swda_test_set_idx
    elif dataset == 'MRDA':
        test_set_idx = mrda_test_set_idx
    else:
        print("Dataset unknown!")
        exit(0)

    if monolingual:
        source_dictionary = FastVector(vector_file=source_lang_embedding_file)
        word_vec_dict = {}
        add_words_to_word_vec_dict(word_vec_dict, src_word_set, source_dictionary)
        print("Formed word dictionary with language vectors.")
        del source_dictionary
        del src_word_set
    else:
        if translated_word_dict is None:
            translated_word_dict = {}
        else:
            total_not_found_words = 0
            for word in src_word_set:
                if word not in translated_word_dict:
                    total_not_found_words += 1
            print("WARNING: %d words not found in translated_word_dict." % total_not_found_words)

        total_words = len(src_word_set)
        total_translated_words = len(translated_word_dict)
        print("Found %d translated word pairs." % total_translated_words)

        target_dictionary = FastVector(vector_file=target_lang_embedding_file)
        print("Target monolingual language data loaded successfully.")

        if not translation_complete:
            source_dictionary = FastVector(vector_file=source_lang_embedding_file)
            print("Source monolingual language data loaded successfully.")

            source_dictionary.apply_transform(source_lang_transformation_file)
            print("Transformation data applied to source language.")
            target_dictionary.apply_transform(target_lang_transformation_file)
            print("Transformation data applied to target language.")

            print("Translating words seen in dataset:")
            try:
                words_seen = 0
                for word in src_word_set:
                    if word not in translated_word_dict:
                        try:
                            translation = target_dictionary.translate_inverted_softmax(
                                source_dictionary[word], source_dictionary, 1500,
                                recalculate=False)
                            # translation = target_dictionary.translate_nearest_neighbor(
                            #     source_dictionary[word])
                            translated_word_dict[word] = translation
                            total_translated_words += 1
                        except KeyError:
                            pass
                        words_seen += 1
                        if words_seen % 100 == 0:
                            print("\t- Translated %d out of %d."
                                  % (words_seen + total_translated_words, total_words))
            except KeyboardInterrupt:
                if translated_pairs_file is not None:
                    write_word_translation_dict_to_file(translated_pairs_file,
                                                        translated_word_dict)
                sys.exit(0)

            print("Word translation complete.")
            del source_dictionary
            del target_dictionary
            if translated_pairs_file is not None:
                write_word_translation_dict_to_file(translated_pairs_file,
                                                    translated_word_dict, True)
            print("Source and target dictionaries are deleted.")
            target_dictionary = FastVector(vector_file=target_lang_embedding_file)

        word_vec_dict = {}
        add_words_to_word_vec_dict(word_vec_dict, src_word_set, target_dictionary,
                                   translated_word_dict)
        add_words_to_word_vec_dict(word_vec_dict, target_word_set, target_dictionary)
        print("Formed word dictionary with target language vectors.")
        del target_dictionary
        del target_word_set
        del src_word_set

        for k, c in enumerate(talks_read):
            if talk_names[k] not in test_set_idx:
                for u in c[0]:
                    for i, word in enumerate(u):
                        word_lowercase = word.lower()
                        if word_lowercase in translated_word_dict:
                            u[i] = translated_word_dict[word_lowercase]
        del translated_word_dict

    return word_vec_dict
def extract_links(text):
    for m in links_RE.finditer(text):
        yield [m.group(1).split('|')[0].strip()]


# Working with a sample of documents, to keep the output memory friendly.
for lang in ['ja', 'ar', 'en', 'fr', 'es', 'ru']:
    df = pd.read_pickle('articlesInSixLang.p').reset_index()
    wikidatadict = df[df.wiki == lang][['q', 'page']].set_index('page').to_dict()['q']
    del df  # save memory
    print(lang)
    paths = glob.glob('/mnt/data/xmldatadumps/public/%swiki/%s/%swiki-%s-pages-meta-current*.xml*.bz2'
                      % (lang, dumpDate, lang, dumpDate))
    if len(paths) > 1:
        # Remove the single combined file when split files exist; keep it for
        # small wikis that ship everything together in one file.
        paths.remove('/mnt/data/xmldatadumps/public/%swiki/%s/%swiki-%s-pages-meta-current.xml.bz2'
                     % (lang, dumpDate, lang, dumpDate))
    print(paths)

    lang_dictionary = FastVector(vector_file='fastText_multilingual/vectors/wiki.%s.vec' % lang)
    if lang != 'en':
        lang_dictionary.apply_transform('fastText_multilingual/my_alingments/apply_in_%s_to_en.txt' % lang)

    def process_dump(dump, path):
        for page in dump:
            if int(page.namespace) == 0:
                # if int(page.id) in pagesIds:
                if page.title in wikidatadict:
                    for revision in page:
                        pass  # iterate through all revisions to reach the last one
                    text = revision.text
                    sections = list(extract_sections(text) or "")
                    N = len(sections)
                    sectionContent = {}
                    for n, sec in enumerate(sections):
                                                precision=18)
        for j in range(1, matrix.shape[1]):
            s += ' %s' % np.format_float_scientific(matrix[i][j], unique=False,
                                                    precision=18)
        f.write('%s\n' % s)


source_languages = ['de', 'es', 'tr']
datasets = [('fasttextwiki/', 'wiki.%s.vec'),
            ('conll17word2vec/', 'conll17.%s.txt'),
            ('fasttext157/', 'cc.%s.300.vec')]
dictionaries = ['expert', 'automated']

for prefix, file_format in datasets:
    monolingual_language_files_path = '../word-embeddings/%smonolingual/' % prefix
    target_dictionary = FastVector(
        vector_file=monolingual_language_files_path + (file_format % 'en'))
    target_words = set(target_dictionary.word2id.keys())
    for signal in dictionaries:
        training_matrices_path = ('alignment_matrices/%s' % prefix) + signal + '/%s.txt'
        dimension = None
        for language in source_languages:
            source_dictionary = FastVector(
                vector_file=monolingual_language_files_path + (file_format % language))
            source_words = set(source_dictionary.word2id.keys())
            if signal == 'automated':  # compare strings with ==, not `is`
                # For pseudo-dictionary training