def test_save(self):
    """Round-trip embeddings through save/load, then export plain text."""
    source = load_from_dir("tests/data/embeddings/text/plain_with_file_header")
    target_dir = "/tmp/vecto/saved"
    source.save_to_dir(target_dir)
    # Reload what was just written and make sure it is usable.
    reloaded = load_from_dir(target_dir)
    print(reloaded.matrix.shape)
    reloaded.save_to_dir_plain_txt("/tmp/vecto/saved_plain")
def test_save(self):
    """Save embeddings to a temp dir, reload them, and dump plain text."""
    original = load_from_dir(
        path.join('tests', 'data', 'embeddings', 'text',
                  'plain_with_file_header'))
    dir_saved = path.join('/tmp', 'vecto', 'saved')
    original.save_to_dir(dir_saved)
    # Verify the saved directory loads back cleanly.
    restored = load_from_dir(dir_saved)
    print(restored.matrix.shape)
    restored.save_to_dir_plain_txt(path.join('/tmp', 'vecto', 'saved_plain'))
def main():
    """CLI entry point for the language-modeling benchmark.

    Parses command-line arguments, loads embeddings, runs the benchmark
    and either saves the results as JSON or prints them.
    """
    # config = load_config()
    # print(config)

    def str2bool(value):
        # BUG FIX: without a converter, argparse stores the raw string, and
        # any non-empty string (including "False") is truthy — so --test
        # could never actually be turned off from the command line.
        if isinstance(value, bool):
            return value
        return value.lower() not in ('false', '0', 'no', 'n')

    parser = argparse.ArgumentParser()
    parser.add_argument("embeddings")
    parser.add_argument("--window_size", default=5, type=int)
    parser.add_argument("--test", default=True, type=str2bool,
                        help='use small test dataset')
    parser.add_argument("--method", default='lstm',
                        choices=['lr', '2FFNN', 'lstm'],
                        help='name of method')
    parser.add_argument("--path_out", default=False,
                        help="destination folder to save results")
    args = parser.parse_args()
    embeddings = load_from_dir(args.embeddings)
    # print("embeddings", embeddings)
    language_modeling = Language_modeling(window_size=args.window_size,
                                          method=args.method,
                                          test=args.test)
    results = language_modeling.get_result(embeddings)
    if args.path_out:
        # Existing dir or trailing slash means "auto-name the output file".
        if os.path.isdir(args.path_out) or args.path_out.endswith("/"):
            name_file_out = os.path.join(args.path_out,
                                         "language_modeling",
                                         "results.json")
            save_json(results, name_file_out)
        else:
            save_json(results, args.path_out)
    else:
        print_json(results)
def test_api(self):
    """Exercise the Analogy API with every supported solving method."""
    embs = load_from_dir(
        path.join('tests', 'data', 'embeddings', 'text',
                  'plain_with_file_header'))
    # These methods must return a list whose first entry is a dict.
    for method in ("3CosAdd", "PairDistance", "3CosMul",
                   "3CosMul2", "3CosAvg"):
        result = Analogy(method=method).get_result(embs, path_analogy_dataset)
        self.assertIsInstance(result[0], dict)
    # These are only smoke-tested: the result is printed, not asserted.
    for method in ("SimilarToAny", "SimilarToB", "LRCos"):
        print(Analogy(method=method).get_result(embs, path_analogy_dataset))
def test_categorization_data(self):
    """Check the per-word statistics produced by KMeans categorization."""
    embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text',
                                   'plain_with_file_header'))
    result = KMeansCategorization().get_result(embs,
                                               path_categorization_dataset)
    word_stats = result[0]['word_stats']
    # self.assertEqual(word_stats['4. banana']['true_category'], 'food')
    self.assertEqual(len(word_stats.keys()), 7)
def main():
    """CLI entry point: run an analogy benchmark over embeddings."""
    # config = load_config()
    # print(config)
    parser = argparse.ArgumentParser()
    parser.add_argument("embeddings")
    parser.add_argument("dataset")
    parser.add_argument("--method", help="analogy solving method",
                        default="LRCos")
    parser.add_argument("--path_out", help="destination folder to save results")
    args = parser.parse_args()
    embeddings = load_from_dir(args.embeddings)
    # print("embeddings", embeddings)
    benchmark = select_method(args.method)
    results = benchmark.get_result(embeddings, args.dataset)
    if not args.path_out:
        print_json(results)
        return
    # Existing dir or trailing slash means "auto-name the output file".
    if os.path.isdir(args.path_out) or args.path_out.endswith("/"):
        dataset = os.path.basename(os.path.normpath(args.dataset))
        name_file_out = os.path.join(args.path_out, dataset,
                                     args.method, "results.json")
        save_json(results, name_file_out)
    else:
        save_json(results, args.path_out)
def test_filter(self):
    """filter_by_vocab with a small word list and with an empty list."""
    embs = load_from_dir("tests/data/embeddings/text/plain_with_file_header")
    path_vocab = "./tests/data/vocabs/plain"
    vocab = Vocabulary()
    vocab.load(path_vocab)
    # Empty vocabulary is the degenerate edge case.
    for word_list in (["the", "apple"], []):
        embs.filter_by_vocab(word_list)
def run_benchmark_by_name(name, args):
    """Import benchmark module ``vecto.benchmarks.<name>``, parse its extra
    CLI arguments from ``args`` (a list of strings) and run it.

    :param name: benchmark module name, e.g. "similarity".
    :param args: remaining command-line arguments for that benchmark.
    """
    print(name, args)
    print("running ", name)
    mod = importlib.import_module("vecto.benchmarks." + name)
    parser = argparse.ArgumentParser()
    # Direct attribute access instead of getattr with a literal name.
    mod.add_extra_args(parser)
    parser.add_argument("--path_out", default=None,
                        help="destination folder to save results")
    # Parse into a fresh name instead of shadowing the ``args`` parameter.
    parsed = parser.parse_args(args)
    dict_args = vars(parsed)
    embeddings = load_from_dir(parsed.embeddings)
    # TODO: this is ugly hack, do subparsers or something
    if name == "language_modeling":
        dataset = Dataset("/tmp/")
        dataset.name = "ptb"
    else:
        dataset = Dataset(parsed.dataset)
        dict_args.pop("dataset")
    dict_args.pop("embeddings")
    # TODO: not sure if all banchmarks use dataset arg
    path_out = dict_args.pop("path_out")
    Benchmark = getattr(mod, "Benchmark")
    benchmark = Benchmark(**dict_args)
    print("SHAPE:", embeddings.matrix.shape)
    print("vocab size:", embeddings.vocabulary.cnt_words)
    results = benchmark.run(embeddings, dataset)
    if path_out:
        save_results(results, path_out, dataset.metadata["name"])
    else:
        print_json(results)
def test_text_classification(self):
    """Run the text-classification benchmark with every encoder model,
    then reload the persisted model and query it."""
    embs = load_from_dir(
        "./tests/data/embeddings/text/plain_with_file_header")
    # Smoke-test each supported encoder architecture.
    for model_name in ('cnn', 'rnn', 'bow'):
        tc = Text_classification(model=model_name)
        result = tc.get_result(
            embs,
            path_text_classification_dataset,
            "/tmp/tests/data/benchmarks_results/text_classification/")
        print(result)
    # BUG FIX: the model was loaded twice with identical arguments;
    # a single load is sufficient.
    model = text_classification.load_model(
        "./tests/data/benchmarks_results/text_classification/args.json",
        embs.matrix)
    print(text_classification.predict(model, "I like this"))
    print(
        text_classification.get_vectors(model, ["I like this", "I hate this"]))
def test_analogy(self):
    """Smoke-test every analogy solver class, then plot accuracies."""
    embs = load_from_dir(
        path.join('tests', 'data', 'embeddings', 'text',
                  'plain_with_file_header'))
    # SimilarToAny and SimilarToB are deliberately left out (were disabled).
    solver_classes = (LinearOffset, PairDistance, ThreeCosMul,
                      ThreeCosMul2, ThreeCosAvg, LRCos)
    for solver_cls in solver_classes:
        print(solver_cls().get_result(embs, path_analogy_dataset))
    analogy_visualize.plot_accuracy()
def test_api(self):
    """Run Text_classification with each encoder and reload the saved model."""
    embs = load_from_dir(path_emb)
    dataset = Dataset(path_text_classification_dataset)
    out_dir = "/tmp/vecto/benchmarks/text_classification_model/"
    for encoder in ('cnn', 'rnn', 'bow'):
        result = Text_classification(model=encoder).run(embs, dataset, out_dir)
        self.assertIsInstance(result[0], dict)
        print(result)
    # Reload the model persisted by the runs above and query it.
    model = load_model(
        "/tmp/vecto/benchmarks/text_classification_model/args.json",
        embs.matrix)
    print(predict(model, "I like this"))
    print(get_vectors(model, ["I like this", "I hate this"]))
def test_filter(self):
    """Vocabulary filtering, including the empty-vocabulary edge case."""
    embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text',
                                   'plain_with_file_header'))
    path_vocab = path.join('.', 'tests', 'data', 'vocabs', 'plain')
    vocab = Vocabulary()
    vocab.load(path_vocab)
    for words in (["the", "apple"], []):
        embs.filter_by_vocab(words)
def main():
    """Build an embedding matrix for the reddit pairs-dataset vocabulary
    and save it to disk as a .npy file."""
    max_len = 30
    min_count = 2
    embeddings_dir = '/home/mattd/embeddings/reddit_2/'
    # dataset_path = '/home/mattd/datasets/AskReddit/'
    dataset_path = "/home/mattd/PycharmProjects/reddit/data/"
    dataset_train_filename = "{}train.csv".format(dataset_path)
    dataset_val_filename = "{}validation.csv".format(dataset_path)
    save_dir = "/home/mattd/PycharmProjects/reddit/embeddings/"
    dataset_train = PairsDataset(dataset_train_filename, max_len, min_count)
    # Validation set reuses the training vocabulary.
    dataset_val = PairsDataset(dataset_val_filename, max_len, min_count,
                               dataset_train.vocab)
    # dataset.add_file(eng_fr_filename2)
    vectors = embeddings.load_from_dir(embeddings_dir)
    # emb = embeddings.load_from_dir(embeddings_dir)
    dim = len(vectors.matrix[0])
    embs_matrix = np.zeros((len(dataset_val.vocab), dim))
    # Rows for tokens missing from the pretrained vectors stay zero.
    for row, token in enumerate(dataset_val.vocab.token2id):
        if vectors.has_word(token):
            embs_matrix[row] = vectors.get_vector(token)
    np.save('{}embeddings_min{}_max{}'.format(save_dir, min_count, max_len),
            embs_matrix)
def main():
    """CLI entry point for the text-classification benchmark."""
    # config = load_config()
    # print(config)
    parser = argparse.ArgumentParser()
    parser.add_argument("embeddings")
    parser.add_argument("dataset")
    parser.add_argument('--batchsize', '-b', type=int, default=64,
                        help='Number of images in each mini-batch')
    parser.add_argument('--epoch', '-e', type=int, default=30,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--layer', '-l', type=int, default=1,
                        help='Number of layers of RNN or MLP following CNN')
    parser.add_argument('--dropout', '-d', type=float, default=0.4,
                        help='Dropout rate')
    parser.add_argument('--model', '-model', default='cnn',
                        choices=['cnn', 'rnn', 'bow'],
                        help='Name of encoder model type')
    parser.add_argument("--path_out", default=False,
                        help="destination folder to save results")
    args = parser.parse_args()
    embeddings = load_from_dir(args.embeddings)
    # print("embeddings", embeddings)
    classifier = Text_classification(batchsize=args.batchsize,
                                     epoch=args.epoch,
                                     gpu=args.gpu,
                                     layer=args.layer,
                                     dropout=args.dropout,
                                     model=args.model)
    results = classifier.get_result(embeddings, args.dataset)
    if not args.path_out:
        print_json(results)
        return
    # Existing dir or trailing slash means "auto-name the output file".
    if os.path.isdir(args.path_out) or args.path_out.endswith("/"):
        dataset = os.path.basename(os.path.normpath(args.dataset))
        save_json(results, os.path.join(args.path_out, dataset, "results.json"))
    else:
        save_json(results, args.path_out)
def test_basic(self):
    """Word/vector comparison helpers, including OOV word and NaN vectors."""
    WordEmbeddingsDense()
    model = load_from_dir(path.join('tests', 'data', 'embeddings', 'text',
                                    'plain_with_file_header'))
    for other_word in ("banana", "bananaaaaa"):
        model.cmp_words("apple", other_word)
    # cmp_vectors must tolerate all-NaN input.
    nan_vec = np.full(3, np.nan)
    model.cmp_vectors(nan_vec, nan_vec)
def test_language_modeling(self):
    """Smoke-test Language_modeling with every supported method."""
    embs = load_from_dir(
        "./tests/data/embeddings/text/plain_with_file_header")
    for method in ['lr', '2FFNN', 'rnn', 'lstm']:
        benchmark = Language_modeling(test=True, method=method)
        print(benchmark.get_result(embs))
def test_basic(self):
    """cmp_words / cmp_vectors smoke tests, including NaN input."""
    WordEmbeddingsDense()
    model = load_from_dir("tests/data/embeddings/text/plain_with_file_header")
    for candidate in ("banana", "bananaaaaa"):
        model.cmp_words("apple", candidate)
    nan_vector = np.full(3, np.nan)
    model.cmp_vectors(nan_vector, nan_vector)
def test_fetcher(self):
    """Download benchmark data (unless cached) and run similarity on it."""
    dir_benchmarks = path.join('.', 'tests', 'data', 'benchmarks_test')
    if path.isdir(dir_benchmarks):
        # Data already fetched on a previous run — skip the download.
        return
    fetch_benchmarks(dir_benchmarks)
    embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text',
                                   'plain_with_file_header'))
    similarity = Similarity()
    path_similarity_dataset = path.join('.', 'tests', 'data',
                                        'benchmarks_test', 'benchmarks',
                                        'similarity', 'en')
    similarity.get_result(embs, path_similarity_dataset)
def test_categorization_scores(self):
    """Global statistics of KMeans categorization: scores and true labels."""
    embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text',
                                   'plain_with_file_header'))
    result = KMeansCategorization().get_result(embs,
                                               path_categorization_dataset)
    global_stats = result[0]['global_stats']
    self.assertEqual(len(global_stats['scores'].keys()), 7)
    self.assertEqual(len(global_stats['true_labels']), 7)
    self.assertEqual(global_stats['true_labels'][3], 1)
def test_load(self):
    """Loading embeddings from several on-disk formats, plus failure modes."""
    loadable = (
        "tests/data/embeddings/text/plain_with_file_header",  # TODO: assert right class
        "tests/data/embeddings/text/plain_no_file_header",  # TODO: assert right class
        "tests/data/embeddings/npy",
    )
    for directory in loadable:
        load_from_dir(directory)
    embs = load_from_dir("tests/data/embeddings/text/plain_with_file_header")
    embs.get_vector('apple')
    with self.assertRaises(RuntimeError):
        embs.get_vector('word_that_not_in_vocabulary_27')
    embs = load_from_dir("tests/data/embeddings/text/corrupted")
    with self.assertRaises(RuntimeError):
        embs = load_from_dir("tests/data/embeddings/text")
def test_load(self):
    """Load embeddings from text/npy dirs; an ambiguous dir must raise."""
    base = ('tests', 'data', 'embeddings')
    # TODO: assert right class
    load_from_dir(path.join(*base, 'text', 'plain_with_file_header'))
    # TODO: assert right class
    load_from_dir(path.join(*base, 'text', 'plain_no_file_header'))
    load_from_dir(path.join(*base, 'npy'))
    embs = load_from_dir(path.join(*base, 'text', 'plain_with_file_header'))
    embs.get_vector('apple')
    # with self.assertRaises(RuntimeError):
    #     embs.get_vector('word_that_not_in_vocabulary_27')
    embs = load_from_dir(path.join(*base, 'text', 'corrupted'))
    with self.assertRaises(RuntimeError):
        embs = load_from_dir(path.join(*base, 'text'))
def test_api(self):
    """Sequence_labeling API over both methods and all three subtasks."""
    embs = load_from_dir(path_emb)
    for method in ['lr', '2FFNN']:
        labeler = Sequence_labeling(method=method)
        for subtask in ['chunk', 'pos', 'ner']:
            result = labeler.get_result(
                embs, path.join(path_sequence_labeling_dataset, subtask))
            self.assertIsInstance(result[0], dict)
            print(result)
def test_sequence_labeling(self):
    """Run sequence labeling with every method on every subtask."""
    embs = load_from_dir(
        "./tests/data/embeddings/text/plain_with_file_header")
    for method in ['lr', '2FFNN']:
        # BUG FIX: the loop variable was ignored and 'lr' was hard-coded,
        # so the '2FFNN' method was never actually exercised.
        sequence_labeling = Sequence_labeling(method=method)
        for subtask in ['chunk', 'pos', 'ner']:
            results = sequence_labeling.get_result(
                embs, os.path.join(path_sequence_labeling_dataset, subtask))
            print(results)
def test_utils(self):
    """Nearest-neighbour queries before and after caching normalized copies."""
    embs = load_from_dir("tests/data/embeddings/text/plain_with_file_header")
    print(embs.get_most_similar_words('apple', 5))
    # Same query again once the normalized copy is cached.
    embs.cache_normalized_copy()
    print(embs.get_most_similar_words('apple', 5))
    # Query by raw vector instead of by word.
    print(embs.get_most_similar_words(embs.get_vector('apple'), 5))
    embs.get_x_label(0)
def test_utils(self):
    """Similar-word lookup by word and by vector, with and without cache."""
    embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text',
                                   'plain_with_file_header'))
    print(embs.get_most_similar_words('apple', 5))
    embs.cache_normalized_copy()
    # Repeat the same query against the cached normalized matrix.
    print(embs.get_most_similar_words('apple', 5))
    print(embs.get_most_similar_words(embs.get_vector('apple'), 5))
    embs.get_x_label(0)
def test_outliers_results(self):
    """Shape of the AveragePairwiseCosine outlier-detection result."""
    embs = load_from_dir(
        path.join('tests', 'data', 'embeddings', 'text',
                  'plain_with_file_header'))
    result = AveragePairwiseCosine().get_result(embs,
                                                path_outliers_dataset)['test']
    expected_categories = 2
    expected_words_in_cats = 3
    self.assertEqual(len(result.keys()), expected_categories)
    self.assertEqual(len(result['cats']), expected_words_in_cats)
def test_outliers_results(self):
    """Category and word counts in the AveragePairwiseCosine result."""
    embs = load_from_dir(
        path.join('tests', 'data', 'embeddings', 'text',
                  'plain_with_file_header'))
    result = AveragePairwiseCosine().get_result(embs,
                                                path_outliers_dataset)['test']
    expected_categories = 2
    # TODO: refactor to be understandable, check if ok after covab to UNK
    expected_words_in_cats = 4
    self.assertEqual(len(result.keys()), expected_categories)
    self.assertEqual(len(result['cats']), expected_words_in_cats)
def main():
    """Extract pretrained vectors for the dataset vocabulary and save them."""
    embeddings_dir = '/mnt/data1/embeddings/crawl/'
    eng_fr_filename = '/mnt/data1/datasets/yelp/merged/train'
    dataset = SentenceDataset(eng_fr_filename, 20, 2)
    emb = embeddings.load_from_dir(embeddings_dir)
    dim = emb.matrix.shape[1]
    vocab_embs = np.zeros((len(dataset.vocab), dim))
    # Tokens absent from the pretrained embeddings keep zero rows.
    for idx, token in enumerate(dataset.vocab.token2id):
        if emb.has_word(token):
            vocab_embs[idx] = emb.get_vector(token)
    np.save('embeddings', vocab_embs)
def test_similarity(self):
    """Similarity benchmark with and without OOV handling, then plot."""
    embs = load_from_dir(
        path.join('tests', 'data', 'embeddings', 'text',
                  'plain_with_file_header'))
    for benchmark in (Similarity(), Similarity(ignore_oov=False)):
        print(benchmark.get_result(embs, path_similarity_dataset))
    similarity_visualize.plot_accuracy()
def test_synonymy_results(self):
    """Spot-check the first CosineDistance synonymy entry for 'tiger'."""
    embs = load_from_dir(
        path.join('tests', 'data', 'embeddings', 'text',
                  'plain_with_file_header'))
    result = CosineDistance().get_result(embs, path_synonymy_dataset)['test']
    first_entry = result['tiger'][0]
    self.assertEqual(first_entry['is_synonym'], 'yes')
    self.assertEqual(first_entry['hit'], False)
    self.assertEqual(first_entry['distance'], 1.0)