def load_fastText_trans_vectors(fname, vocab, lang):
    word_vecs = {}

    words = FastVector(vector_file=fname)
    words.apply_transform("fastText_multilingual/alignment_matrices/" + lang +
                          ".txt")

    for word in words.id2word:
        # decode the byte key once and strip whitespace before the vocab lookup
        key = word.decode('utf8').strip()
        if key in vocab:
            word_vecs[key] = words[word]

    return word_vecs
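
A minimal usage sketch for the loader above; the vector file path, vocabulary and language code are illustrative assumptions, not taken from the original snippet.

from fastText_multilingual.fasttext import FastVector

vocab = {'bonjour', 'monde'}
word_vecs = load_fastText_trans_vectors('vectors/wiki.fr.vec', vocab, 'fr')
print(len(word_vecs))
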
Example #2
def upload_fasttext_embeddings(embeddings_path):
    from fastText_multilingual.fasttext import FastVector

    # e.g. embeddings_path + 'wiki.' + language_code + '.vec'
    dictionary = FastVector(vector_file=embeddings_path)
    print('Dictionary loaded.')
    return dictionary
def get_sim1_fasttext(dictionary, list1, list2):
    final_score = 0.0
    for word1 in list1:
        for word2 in list2:
            # Reset the similarity for every pair so a stale value is not
            # reused when one of the words is out of vocabulary.
            similarity = 0.0
            if word1 == word2:
                similarity = 1.0
            elif word1 in dictionary and word2 in dictionary:
                similarity = FastVector.cosine_similarity(
                    dictionary[word1], dictionary[word2])
            final_score += similarity / float(len(list2))
    return final_score / float(len(list1))
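
A short usage sketch combining the two helpers above; the embeddings path and word lists are placeholders, not from the original.

dictionary = upload_fasttext_embeddings('embeddings/wiki.en.vec')
similarity = get_sim1_fasttext(dictionary, ['cat', 'dog'], ['kitten', 'puppy'])
print(similarity)
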
Example #4
def align(_log,
          fasttext_dir="fasttext",
          langs="en,id",
          output_dir="aligned_fasttext"):
    """Align fasttext embeddings with the method of Smith et al. (2017)."""
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    for lang in langs.split(","):
        _log.info("Aligning embedding for %s", lang)
        output_path = output_dir / f"wiki.multi.{lang}.vec"
        if output_path.exists():
            _log.info("Aligned embedding already exists, skipping")
            continue
        dictionary = FastVector(vector_file=Path(fasttext_dir) /
                                f"wiki.{lang}.vec")
        dictionary.apply_transform(
            str(
                Path("fastText_multilingual") / "alignment_matrices" /
                f"{lang}.txt"))
        dictionary.export(output_path)
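
align expects a logger-like object as its first argument; a minimal invocation sketch, assuming a standard-library logger stands in for it.

import logging

logging.basicConfig(level=logging.INFO)
align(logging.getLogger(__name__), fasttext_dir="fasttext", langs="en,id",
      output_dir="aligned_fasttext")
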
def word2vec(word2vec_model, vocabulary, lang):
    word2vec2 = []

    fr_model = FastVector(vector_file=word2vec_model)

    # Align the monolingual vectors into the shared multilingual space.
    if lang == 'es':
        fr_model.apply_transform('./fastText_multilingual/alignment_matrices/es.txt')
    else:
        fr_model.apply_transform('./fastText_multilingual/alignment_matrices/en.txt')

    for word in vocabulary:
        try:
            word2vec = fr_model[word]
        except KeyError:
            # Out-of-vocabulary words fall back to a near-zero 300-dim vector.
            word2vec = [0.0000001] * 300
        word2vec2.append(word2vec)

    return word2vec2
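
A quick usage sketch for word2vec; the vector file and vocabulary are illustrative assumptions.

vocabulary = ['hola', 'mundo']
vectors = word2vec('vectors/wiki.es.vec', vocabulary, 'es')
print(len(vectors), len(vectors[0]))
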
Example #6
def form_word_vec_dict(dataset, talks_read, talk_names, monolingual,
                       src_word_set, target_word_set, translated_word_dict,
                       translated_pairs_file, source_lang_embedding_file,
                       target_lang_embedding_file,
                       source_lang_transformation_file,
                       target_lang_transformation_file, translation_complete):
    if dataset == 'SwDA':
        test_set_idx = swda_test_set_idx
    elif dataset == 'MRDA':
        test_set_idx = mrda_test_set_idx
    else:
        print("Dataset unknown!")
        sys.exit(1)

    if monolingual:
        source_dictionary = FastVector(vector_file=source_lang_embedding_file)
        word_vec_dict = {}
        add_words_to_word_vec_dict(word_vec_dict, src_word_set,
                                   source_dictionary)
        print("Formed word dictionary with language vectors.")

        del source_dictionary
        del src_word_set
    else:
        if translated_word_dict is None:
            translated_word_dict = {}
        else:
            total_not_found_words = 0
            for word in src_word_set:
                if word not in translated_word_dict:
                    total_not_found_words += 1
            print("WARNING: %d words not found in translated_word_dict." %
                  total_not_found_words)

        total_words = len(src_word_set)
        total_translated_words = len(translated_word_dict)
        print("Found %d translated word pairs." % total_translated_words)

        target_dictionary = FastVector(vector_file=target_lang_embedding_file)
        print("Target  monolingual language data loaded successfully.")

        if not translation_complete:
            source_dictionary = FastVector(
                vector_file=source_lang_embedding_file)
            print("Source monolingual language data loaded successfully.")
            source_dictionary.apply_transform(source_lang_transformation_file)
            print("Transformation data applied to source language.")
            target_dictionary.apply_transform(target_lang_transformation_file)
            print("Transformation data applied to target language.")
            print("Translating words seen in dataset:")

            try:
                words_seen = 0
                for word in src_word_set:
                    if word not in translated_word_dict:
                        try:
                            translation = target_dictionary.translate_inverted_softmax(
                                source_dictionary[word],
                                source_dictionary,
                                1500,
                                recalculate=False)
                            #                translation = target_dictionary.translate_nearest_neighbor(source_dictionary[word])
                            translated_word_dict[word] = translation
                            total_translated_words += 1
                        except KeyError:
                            # Word missing from the source embedding; skip it.
                            pass
                        words_seen += 1
                        if words_seen % 100 == 0:
                            print(
                                "\t- Translated %d out of %d." %
                                (words_seen + total_translated_words, total_words))
            except KeyboardInterrupt:
                # Save partial progress before exiting on Ctrl+C.
                if translated_pairs_file is not None:
                    write_word_translation_dict_to_file(
                        translated_pairs_file, translated_word_dict)
                sys.exit(0)
            print("Word translation complete.")

            del source_dictionary
            del target_dictionary

            if translated_pairs_file is not None:
                write_word_translation_dict_to_file(translated_pairs_file,
                                                    translated_word_dict, True)

            print("Source and target dictionaries are deleted.")

            target_dictionary = FastVector(
                vector_file=target_lang_embedding_file)

        word_vec_dict = {}
        add_words_to_word_vec_dict(word_vec_dict, src_word_set,
                                   target_dictionary, translated_word_dict)
        add_words_to_word_vec_dict(word_vec_dict, target_word_set,
                                   target_dictionary)
        print("Formed word dictionary with target language vectors.")

        del target_dictionary
        del target_word_set
        del src_word_set

        for k, c in enumerate(talks_read):
            if talk_names[k] not in test_set_idx:
                for u in c[0]:
                    for i, word in enumerate(u):
                        word_lowercase = word.lower()
                        if word_lowercase in translated_word_dict:
                            u[i] = translated_word_dict[word_lowercase]

        del translated_word_dict

    return word_vec_dict
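
The cross-lingual branch above reduces to aligning both embedding spaces and translating each source word into the target vocabulary; a stripped-down sketch of that core step, with the vector files, alignment matrices and the query word as placeholder assumptions.

source_dictionary = FastVector(vector_file='wiki.de.vec')
target_dictionary = FastVector(vector_file='wiki.en.vec')
source_dictionary.apply_transform('fastText_multilingual/alignment_matrices/de.txt')
target_dictionary.apply_transform('fastText_multilingual/alignment_matrices/en.txt')

# Same inverted-softmax call as in the function above; 1500 is the sample size.
translation = target_dictionary.translate_inverted_softmax(
    source_dictionary['hund'], source_dictionary, 1500, recalculate=False)
print(translation)
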
Example #7
def extract_links(text):
    # links_RE is assumed to be a precompiled regex whose first group captures
    # the target of a [[wiki link]].
    for m in links_RE.finditer(text):
        yield [m.group(1).split('|')[0].strip()]

# Working with a sample of documents to keep the output memory-friendly
for lang in ['ja', 'ar', 'en', 'fr', 'es', 'ru']:
	df = pd.read_pickle('articlesInSixLang.p').reset_index()
	wikidatadict = df[df.wiki ==lang][['q','page']].set_index('page').to_dict()['q']
	del(df) #save memory
	print(lang)
	paths = glob.glob('/mnt/data/xmldatadumps/public/%swiki/%s/%swiki-%s-pages-meta-current*.xml*.bz2' % (lang,dumpDate,lang,dumpDate))
	if len(paths) > 1:  # remove the single combined dump when split parts exist; keep it only for small wikis that ship as one file
		paths.remove('/mnt/data/xmldatadumps/public/%swiki/%s/%swiki-%s-pages-meta-current.xml.bz2' % (lang,dumpDate,lang,dumpDate))

	print(paths)
	lang_dictionary = FastVector(vector_file='fastText_multilingual/vectors/wiki.%s.vec' % lang)
	if lang != 'en':
		lang_dictionary.apply_transform('fastText_multilingual/my_alingments/apply_in_%s_to_en.txt' % lang)

	def process_dump(dump, path):
		for page in dump:
			if int(page.namespace) == 0:  #if int(page.id) in pagesIds:
				if page.title in wikidatadict:
				#try:
					for revision in page: 
						pass #pass all , go to the last revision
					text =  revision.text
					sections = list(extract_sections(text) or "")
					N = len(sections)
					sectionContent = {}
					for n,sec  in enumerate(sections):
                                           precision=18)
            for j in range(1, matrix.shape[1]):
                s += ' %s' % np.format_float_scientific(
                    matrix[i][j], unique=False, precision=18)
            f.write('%s\n' % s)
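
The fragment above is cut off before its enclosing function; a hypothetical sketch of the matrix-export helper it appears to belong to, with the function name, signature and outer loop assumed rather than taken from the original.

import numpy as np

def write_matrix(matrix, f):
    # Hypothetical helper: only the per-element formatting
    # (np.format_float_scientific with unique=False, precision=18) comes from
    # the original fragment; everything else is an assumption.
    for i in range(matrix.shape[0]):
        s = '%s' % np.format_float_scientific(
            matrix[i][0], unique=False, precision=18)
        for j in range(1, matrix.shape[1]):
            s += ' %s' % np.format_float_scientific(
                matrix[i][j], unique=False, precision=18)
        f.write('%s\n' % s)
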


source_languages = ['de', 'es', 'tr']
datasets = [('fasttextwiki/', 'wiki.%s.vec'),
            ('conll17word2vec/', 'conll17.%s.txt'),
            ('fasttext157/', 'cc.%s.300.vec')]
dictionaries = ['expert', 'automated']

for prefix, file_format in datasets:
    monolingual_language_files_path = '../word-embeddings/%smonolingual/' % prefix
    target_dictionary = FastVector(
        vector_file=monolingual_language_files_path + (file_format % 'en'))
    target_words = set(target_dictionary.word2id.keys())

    for signal in dictionaries:
        training_matrices_path = ('alignment_matrices/%s' %
                                  prefix) + signal + '/%s.txt'
        dimension = None
        for language in source_languages:
            source_dictionary = FastVector(
                vector_file=monolingual_language_files_path +
                (file_format % language))

            source_words = set(source_dictionary.word2id.keys())

            if signal == 'automated':
                # For pseudo-dictionary training