def command_linear_trans(args_):
    model_es = word_vectors.load_model(args_.model_es_file_name)
    model_pt = word_vectors.load_model(args_.model_pt_file_name)
    # Choose the bilingual lexicon used to fit the transformation:
    # one random pair per synset, the most frequent pairs according to
    # external counts, or the full default lexicon.
    if args_.random_pair_per_synset:
        lexicon = bilingual_lexicon.random_pair_per_synset_bilingual_lexicon()
    elif args_.most_frequent:
        lexicon = bilingual_lexicon.most_frequent_bilingual_lexicon_based_on_external_count(
            model_es.vocab, model_pt.vocab)
    else:
        lexicon = bilingual_lexicon.bilingual_lexicon()
    X, Y = zip(*word_vectors.bilingual_lexicon_vectors(
        model_es, model_pt, bilingual_lexicon=lexicon))
    T = linear_trans.linear_transformation(X, Y, args_.backwards)
    linear_trans.save_linear_transformation(args_.translation_matrix_file_name, T)
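
# For reference, a minimal sketch of what linear_transformation(X, Y) plausibly
# computes, assuming the standard least-squares translation-matrix formulation
# (Mikolov et al., "Exploiting Similarities among Languages for Machine
# Translation"); this function and the meaning it gives the `backwards` flag
# are assumptions, not the project's actual implementation.
import numpy as np

def least_squares_transformation(X, Y, backwards=False):
    """Solve min_T ||X T - Y||_F by ordinary least squares."""
    X, Y = np.vstack(X), np.vstack(Y)
    if backwards:
        # Assumed semantics: fit the reverse direction instead.
        X, Y = Y, X
    # np.linalg.lstsq returns (solution, residuals, rank, singular_values).
    T, *_ = np.linalg.lstsq(X, Y, rcond=None)
    return T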
# -*- coding: utf-8 -*-
import logging
import random

from sklearn import svm

from falsefriends import classifier
from falsefriends import linear_trans
from falsefriends import util, word_vectors

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

training_friend_pairs = util.read_words('resources/sepulveda2011_training.txt')

model_es = word_vectors.load_model('resources/big/vectors_es_100.bin')
model_pt = word_vectors.load_model('resources/big/vectors_pt_100.bin')

logging.info("computing equal words...")
# Words spelled identically in both Wikipedias: the false-friend candidates.
equal_words = model_es.vocab.keys() & model_pt.vocab.keys()

print("Number of equal words in both Wikipedias:", len(equal_words))

SAMPLE_SIZE = 20
# random.sample needs a sequence, not a set, on Python >= 3.11.
print("Sample of", SAMPLE_SIZE, "equal words found:",
      random.sample(list(equal_words), SAMPLE_SIZE))

T = linear_trans.load_linear_transformation('resources/big/trans_es_100_pt_100.npz')

clf = classifier.build_classifier()
# -*- coding: utf-8 -*-
import logging
import os
import sys

import numpy as np

PARENT_DIR = os.path.abspath(
    os.path.dirname(os.path.realpath(__file__)) + '/..')
sys.path.insert(0, PARENT_DIR)

from falsefriends import bilingual_lexicon, classifier, linear_trans, util, word_vectors

# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

model_es = word_vectors.load_model(PARENT_DIR + '/resources/big/vectors_es_100.bin')
model_pt = word_vectors.load_model(PARENT_DIR + '/resources/big/vectors_pt_100.bin')

lexicon = bilingual_lexicon.most_frequent_bilingual_lexicon_based_on_external_count(
    model_es.vocab, model_pt.vocab)

logging.info("getting vector pairs")
X, Y = zip(*word_vectors.bilingual_lexicon_vectors(
    model_es, model_pt, bilingual_lexicon=lexicon))
X_array = np.vstack(X)
Y_array = np.vstack(Y)

logging.info("Computing linear transformations and classifying with cross-validation...")
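
# The cross-validation itself is cut off above; the sketch below shows one way
# it could look, fitting a least-squares translation matrix per fold and
# scoring it by mean cosine similarity. The 5-fold split, the lstsq fit, and
# the cosine score are all assumptions, not the project's actual protocol.
from sklearn.model_selection import KFold

for train_idx, test_idx in KFold(n_splits=5, shuffle=True, random_state=0).split(X_array):
    # Fit the translation matrix on the training folds only.
    T_fold, *_ = np.linalg.lstsq(X_array[train_idx], Y_array[train_idx], rcond=None)
    predicted = X_array[test_idx] @ T_fold
    # Mean cosine similarity between predicted and true target vectors.
    cosines = np.sum(predicted * Y_array[test_idx], axis=1) / (
        np.linalg.norm(predicted, axis=1) * np.linalg.norm(Y_array[test_idx], axis=1))
    logging.info("fold mean cosine similarity: %.4f", cosines.mean())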
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import logging
import random

from sklearn import svm

from falsefriends import classifier
from falsefriends import linear_trans
from falsefriends import util, word_vectors

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

training_friend_pairs = util.read_words('resources/sepulveda2011_training.txt')

model_es = word_vectors.load_model('resources/big/jairo/vectors_es.bin')
model_pt = word_vectors.load_model('resources/big/jairo/vectors_pt.bin')

logging.info("computing equal words...")
# Words spelled identically in both Wikipedias: the false-friend candidates.
equal_words = model_es.vocab.keys() & model_pt.vocab.keys()

print("Number of equal words in both Wikipedias:", len(equal_words))

SAMPLE_SIZE = 20
# random.sample needs a sequence, not a set, on Python >= 3.11.
print("Sample of", SAMPLE_SIZE, "equal words found:",
      random.sample(list(equal_words), SAMPLE_SIZE))

T = linear_trans.load_linear_transformation('resources/big/jairo/linear_trans.npz')

clf = svm.SVC()

X_train, y_train, scaler = classifier.features_labels_and_scaler(
    training_friend_pairs, model_es, model_pt, T)
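
# From here the script presumably fits the SVM on the scaled features; a
# minimal sketch of that step, assuming features_labels_and_scaler returns a
# feature matrix, a label vector, and the fitted scaler to reuse at
# prediction time. Real evaluation would of course use held-out pairs.
clf.fit(X_train, y_train)
print("Training accuracy:", clf.score(X_train, y_train))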
def read_models(_args):
    model_es = word_vectors.load_model(_args.model_es_file_name)
    model_pt = word_vectors.load_model(_args.model_pt_file_name)
    return model_es, model_pt
def command_linear_trans(_args):
    model_es = word_vectors.load_model(_args.model_es_file_name)
    model_pt = word_vectors.load_model(_args.model_pt_file_name)
    # This variant always fits on the default bilingual lexicon.
    X, Y = zip(*word_vectors.bilingual_lexicon_vectors(model_es, model_pt))
    T = linear_trans.linear_transformation(X, Y, _args.backwards)
    linear_trans.save_linear_transformation(_args.translation_matrix_file_name, T)
import os
import sys

PARENT_DIR = os.path.abspath(
    os.path.dirname(os.path.realpath(__file__)) + '/..')
sys.path.insert(0, PARENT_DIR)

from falsefriends import bilingual_lexicon, classifier, linear_trans, util, word_vectors

# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

print("Method\t\t Acc")

friend_pairs = util.read_words(PARENT_DIR + '/resources/sepulveda2011_original.txt')

# Grid over every combination of Spanish and Portuguese vector sizes.
VECTOR_SIZES = [100, 200, 400, 800]
for size_es in VECTOR_SIZES:
    model_es = word_vectors.load_model(
        PARENT_DIR + '/resources/big/vectors_es_{}.bin'.format(size_es))
    for size_pt in VECTOR_SIZES:
        model_pt = word_vectors.load_model(
            PARENT_DIR + '/resources/big/vectors_pt_{}.bin'.format(size_pt))
        clf = classifier.build_classifier()
        # Reuse a cached translation matrix when one exists for this size pair.
        T_path = PARENT_DIR + '/resources/big/trans_es_{}_pt_{}.npz'.format(
            size_es, size_pt)
        if os.path.exists(T_path):
            T = linear_trans.load_linear_transformation(T_path)
        else:
            lexicon = bilingual_lexicon.most_frequent_bilingual_lexicon_based_on_external_count(
                model_es.vocab, model_pt.vocab)
            X, Y = zip(*word_vectors.bilingual_lexicon_vectors(
                model_es, model_pt, bilingual_lexicon=lexicon))
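            # The snippet cuts off inside this else branch. Judging from
            # command_linear_trans above, the missing step presumably fits and
            # caches the transformation before evaluating; an assumed
            # continuation (False = forward direction, matching the positional
            # `backwards` flag used elsewhere):
            T = linear_trans.linear_transformation(X, Y, False)
            linear_trans.save_linear_transformation(T_path, T)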
def read_models(args_):
    model_es = word_vectors.load_model(args_.model_es_file_name)
    model_pt = word_vectors.load_model(args_.model_pt_file_name)
    return model_es, model_pt
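
# Hedged sketch of how read_models could be driven from a command line; the
# argparse wiring below is illustrative, and only the attribute names
# model_es_file_name / model_pt_file_name come from the function above.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('model_es_file_name')
parser.add_argument('model_pt_file_name')
args_ = parser.parse_args(['vectors_es.bin', 'vectors_pt.bin'])  # example paths
model_es, model_pt = read_models(args_)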