def test_save(self):
    """Round-trip embeddings through save/load, then export plain text."""
    source = load_from_dir("tests/data/embeddings/text/plain_with_file_header")
    target_dir = "/tmp/vecto/saved"
    source.save_to_dir(target_dir)
    # Reload what was just written and make sure it is usable.
    reloaded = load_from_dir(target_dir)
    print(reloaded.matrix.shape)
    reloaded.save_to_dir_plain_txt("/tmp/vecto/saved_plain")
def test_save(self):
    """Save embeddings to a temp dir, reload them, and dump plain text."""
    original = load_from_dir(
        path.join('tests', 'data', 'embeddings', 'text',
                  'plain_with_file_header'))
    dir_saved = path.join('/tmp', 'vecto', 'saved')
    original.save_to_dir(dir_saved)
    # Verify the saved directory loads back cleanly.
    restored = load_from_dir(dir_saved)
    print(restored.matrix.shape)
    restored.save_to_dir_plain_txt(path.join('/tmp', 'vecto', 'saved_plain'))
def main():
    """CLI entry point for the language-modeling benchmark.

    Parses command-line arguments, loads embeddings, runs the benchmark
    and either saves the results as JSON or prints them.
    """
    # config = load_config()
    # print(config)

    def str2bool(value):
        # BUG FIX: without a converter, argparse stores the raw string, and
        # any non-empty string (including "False") is truthy — so --test
        # could never actually be turned off from the command line.
        if isinstance(value, bool):
            return value
        return value.lower() not in ('false', '0', 'no', 'n')

    parser = argparse.ArgumentParser()
    parser.add_argument("embeddings")
    parser.add_argument("--window_size", default=5, type=int)
    parser.add_argument("--test", default=True, type=str2bool,
                        help='use small test dataset')
    parser.add_argument("--method", default='lstm',
                        choices=['lr', '2FFNN', 'lstm'],
                        help='name of method')
    parser.add_argument("--path_out", default=False,
                        help="destination folder to save results")
    args = parser.parse_args()
    embeddings = load_from_dir(args.embeddings)
    # print("embeddings", embeddings)
    language_modeling = Language_modeling(window_size=args.window_size,
                                          method=args.method,
                                          test=args.test)
    results = language_modeling.get_result(embeddings)
    if args.path_out:
        # Existing dir or trailing slash means "auto-name the output file".
        if os.path.isdir(args.path_out) or args.path_out.endswith("/"):
            name_file_out = os.path.join(args.path_out,
                                         "language_modeling",
                                         "results.json")
            save_json(results, name_file_out)
        else:
            save_json(results, args.path_out)
    else:
        print_json(results)
def test_api(self):
    """Exercise the Analogy API with every supported solving method."""
    embs = load_from_dir(
        path.join('tests', 'data', 'embeddings', 'text',
                  'plain_with_file_header'))
    # These methods must return a list whose first entry is a dict.
    for method in ("3CosAdd", "PairDistance", "3CosMul",
                   "3CosMul2", "3CosAvg"):
        result = Analogy(method=method).get_result(embs, path_analogy_dataset)
        self.assertIsInstance(result[0], dict)
    # These are only smoke-tested: the result is printed, not asserted.
    for method in ("SimilarToAny", "SimilarToB", "LRCos"):
        print(Analogy(method=method).get_result(embs, path_analogy_dataset))
def test_categorization_data(self):
    """Check the per-word statistics produced by KMeans categorization."""
    embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text',
                                   'plain_with_file_header'))
    result = KMeansCategorization().get_result(embs,
                                               path_categorization_dataset)
    word_stats = result[0]['word_stats']
    # self.assertEqual(word_stats['4. banana']['true_category'], 'food')
    self.assertEqual(len(word_stats.keys()), 7)
def main():
    """CLI entry point: run an analogy benchmark over embeddings."""
    # config = load_config()
    # print(config)
    parser = argparse.ArgumentParser()
    parser.add_argument("embeddings")
    parser.add_argument("dataset")
    parser.add_argument("--method", help="analogy solving method",
                        default="LRCos")
    parser.add_argument("--path_out", help="destination folder to save results")
    args = parser.parse_args()
    embeddings = load_from_dir(args.embeddings)
    # print("embeddings", embeddings)
    benchmark = select_method(args.method)
    results = benchmark.get_result(embeddings, args.dataset)
    if not args.path_out:
        print_json(results)
        return
    # Existing dir or trailing slash means "auto-name the output file".
    if os.path.isdir(args.path_out) or args.path_out.endswith("/"):
        dataset = os.path.basename(os.path.normpath(args.dataset))
        name_file_out = os.path.join(args.path_out, dataset,
                                     args.method, "results.json")
        save_json(results, name_file_out)
    else:
        save_json(results, args.path_out)
def test_filter(self):
    """filter_by_vocab with a small word list and with an empty list."""
    embs = load_from_dir("tests/data/embeddings/text/plain_with_file_header")
    path_vocab = "./tests/data/vocabs/plain"
    vocab = Vocabulary()
    vocab.load(path_vocab)
    # Empty vocabulary is the degenerate edge case.
    for word_list in (["the", "apple"], []):
        embs.filter_by_vocab(word_list)
def run_benchmark_by_name(name, args):
    """Import benchmark module ``vecto.benchmarks.<name>``, parse its extra
    CLI arguments from ``args`` (a list of strings) and run it.

    :param name: benchmark module name, e.g. "similarity".
    :param args: remaining command-line arguments for that benchmark.
    """
    print(name, args)
    print("running ", name)
    mod = importlib.import_module("vecto.benchmarks." + name)
    parser = argparse.ArgumentParser()
    # Direct attribute access instead of getattr with a literal name.
    mod.add_extra_args(parser)
    parser.add_argument("--path_out", default=None,
                        help="destination folder to save results")
    # Parse into a fresh name instead of shadowing the ``args`` parameter.
    parsed = parser.parse_args(args)
    dict_args = vars(parsed)
    embeddings = load_from_dir(parsed.embeddings)
    # TODO: this is ugly hack, do subparsers or something
    if name == "language_modeling":
        dataset = Dataset("/tmp/")
        dataset.name = "ptb"
    else:
        dataset = Dataset(parsed.dataset)
        dict_args.pop("dataset")
    dict_args.pop("embeddings")
    # TODO: not sure if all banchmarks use dataset arg
    path_out = dict_args.pop("path_out")
    Benchmark = getattr(mod, "Benchmark")
    benchmark = Benchmark(**dict_args)
    print("SHAPE:", embeddings.matrix.shape)
    print("vocab size:", embeddings.vocabulary.cnt_words)
    results = benchmark.run(embeddings, dataset)
    if path_out:
        save_results(results, path_out, dataset.metadata["name"])
    else:
        print_json(results)
def test_text_classification(self):
    """Run the text-classification benchmark with every encoder model,
    then reload the persisted model and query it."""
    embs = load_from_dir(
        "./tests/data/embeddings/text/plain_with_file_header")
    # Smoke-test each supported encoder architecture.
    for model_name in ('cnn', 'rnn', 'bow'):
        tc = Text_classification(model=model_name)
        result = tc.get_result(
            embs,
            path_text_classification_dataset,
            "/tmp/tests/data/benchmarks_results/text_classification/")
        print(result)
    # BUG FIX: the model was loaded twice with identical arguments;
    # a single load is sufficient.
    model = text_classification.load_model(
        "./tests/data/benchmarks_results/text_classification/args.json",
        embs.matrix)
    print(text_classification.predict(model, "I like this"))
    print(
        text_classification.get_vectors(model, ["I like this", "I hate this"]))
def test_analogy(self):
    """Smoke-test every analogy solver class, then plot accuracies."""
    embs = load_from_dir(
        path.join('tests', 'data', 'embeddings', 'text',
                  'plain_with_file_header'))
    # SimilarToAny and SimilarToB are deliberately left out (were disabled).
    solver_classes = (LinearOffset, PairDistance, ThreeCosMul,
                      ThreeCosMul2, ThreeCosAvg, LRCos)
    for solver_cls in solver_classes:
        print(solver_cls().get_result(embs, path_analogy_dataset))
    analogy_visualize.plot_accuracy()
def test_api(self):
    """Run Text_classification with each encoder and reload the saved model."""
    embs = load_from_dir(path_emb)
    dataset = Dataset(path_text_classification_dataset)
    out_dir = "/tmp/vecto/benchmarks/text_classification_model/"
    for encoder in ('cnn', 'rnn', 'bow'):
        result = Text_classification(model=encoder).run(embs, dataset, out_dir)
        self.assertIsInstance(result[0], dict)
        print(result)
    # Reload the model persisted by the runs above and query it.
    model = load_model(
        "/tmp/vecto/benchmarks/text_classification_model/args.json",
        embs.matrix)
    print(predict(model, "I like this"))
    print(get_vectors(model, ["I like this", "I hate this"]))
def test_filter(self):
    """Vocabulary filtering, including the empty-vocabulary edge case."""
    embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text',
                                   'plain_with_file_header'))
    path_vocab = path.join('.', 'tests', 'data', 'vocabs', 'plain')
    vocab = Vocabulary()
    vocab.load(path_vocab)
    for words in (["the", "apple"], []):
        embs.filter_by_vocab(words)
def main():
    """Build an embedding matrix for the reddit pairs-dataset vocabulary
    and save it to disk as a .npy file."""
    max_len = 30
    min_count = 2
    embeddings_dir = '/home/mattd/embeddings/reddit_2/'
    # dataset_path = '/home/mattd/datasets/AskReddit/'
    dataset_path = "/home/mattd/PycharmProjects/reddit/data/"
    dataset_train_filename = "{}train.csv".format(dataset_path)
    dataset_val_filename = "{}validation.csv".format(dataset_path)
    save_dir = "/home/mattd/PycharmProjects/reddit/embeddings/"
    dataset_train = PairsDataset(dataset_train_filename, max_len, min_count)
    # Validation set reuses the training vocabulary.
    dataset_val = PairsDataset(dataset_val_filename, max_len, min_count,
                               dataset_train.vocab)
    # dataset.add_file(eng_fr_filename2)
    vectors = embeddings.load_from_dir(embeddings_dir)
    # emb = embeddings.load_from_dir(embeddings_dir)
    dim = len(vectors.matrix[0])
    embs_matrix = np.zeros((len(dataset_val.vocab), dim))
    # Rows for tokens missing from the pretrained vectors stay zero.
    for row, token in enumerate(dataset_val.vocab.token2id):
        if vectors.has_word(token):
            embs_matrix[row] = vectors.get_vector(token)
    np.save('{}embeddings_min{}_max{}'.format(save_dir, min_count, max_len),
            embs_matrix)
def main():
    """CLI entry point for the text-classification benchmark."""
    # config = load_config()
    # print(config)
    parser = argparse.ArgumentParser()
    parser.add_argument("embeddings")
    parser.add_argument("dataset")
    parser.add_argument('--batchsize', '-b', type=int, default=64,
                        help='Number of images in each mini-batch')
    parser.add_argument('--epoch', '-e', type=int, default=30,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--layer', '-l', type=int, default=1,
                        help='Number of layers of RNN or MLP following CNN')
    parser.add_argument('--dropout', '-d', type=float, default=0.4,
                        help='Dropout rate')
    parser.add_argument('--model', '-model', default='cnn',
                        choices=['cnn', 'rnn', 'bow'],
                        help='Name of encoder model type')
    parser.add_argument("--path_out", default=False,
                        help="destination folder to save results")
    args = parser.parse_args()
    embeddings = load_from_dir(args.embeddings)
    # print("embeddings", embeddings)
    classifier = Text_classification(batchsize=args.batchsize,
                                     epoch=args.epoch,
                                     gpu=args.gpu,
                                     layer=args.layer,
                                     dropout=args.dropout,
                                     model=args.model)
    results = classifier.get_result(embeddings, args.dataset)
    if not args.path_out:
        print_json(results)
        return
    # Existing dir or trailing slash means "auto-name the output file".
    if os.path.isdir(args.path_out) or args.path_out.endswith("/"):
        dataset = os.path.basename(os.path.normpath(args.dataset))
        save_json(results, os.path.join(args.path_out, dataset, "results.json"))
    else:
        save_json(results, args.path_out)
def test_basic(self):
    """Word/vector comparison helpers, including OOV word and NaN vectors."""
    WordEmbeddingsDense()
    model = load_from_dir(path.join('tests', 'data', 'embeddings', 'text',
                                    'plain_with_file_header'))
    for other_word in ("banana", "bananaaaaa"):
        model.cmp_words("apple", other_word)
    # cmp_vectors must tolerate all-NaN input.
    nan_vec = np.full(3, np.nan)
    model.cmp_vectors(nan_vec, nan_vec)
def test_language_modeling(self):
    """Smoke-test Language_modeling with every supported method."""
    embs = load_from_dir(
        "./tests/data/embeddings/text/plain_with_file_header")
    for method in ['lr', '2FFNN', 'rnn', 'lstm']:
        benchmark = Language_modeling(test=True, method=method)
        print(benchmark.get_result(embs))
def test_basic(self):
    """cmp_words / cmp_vectors smoke tests, including NaN input."""
    WordEmbeddingsDense()
    model = load_from_dir("tests/data/embeddings/text/plain_with_file_header")
    for candidate in ("banana", "bananaaaaa"):
        model.cmp_words("apple", candidate)
    nan_vector = np.full(3, np.nan)
    model.cmp_vectors(nan_vector, nan_vector)
def test_fetcher(self):
    """Download benchmark data (unless cached) and run similarity on it."""
    dir_benchmarks = path.join('.', 'tests', 'data', 'benchmarks_test')
    if path.isdir(dir_benchmarks):
        # Data already fetched on a previous run — skip the download.
        return
    fetch_benchmarks(dir_benchmarks)
    embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text',
                                   'plain_with_file_header'))
    similarity = Similarity()
    path_similarity_dataset = path.join('.', 'tests', 'data',
                                        'benchmarks_test', 'benchmarks',
                                        'similarity', 'en')
    similarity.get_result(embs, path_similarity_dataset)
def test_categorization_scores(self):
    """Global statistics of KMeans categorization: scores and true labels."""
    embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text',
                                   'plain_with_file_header'))
    result = KMeansCategorization().get_result(embs,
                                               path_categorization_dataset)
    global_stats = result[0]['global_stats']
    self.assertEqual(len(global_stats['scores'].keys()), 7)
    self.assertEqual(len(global_stats['true_labels']), 7)
    self.assertEqual(global_stats['true_labels'][3], 1)
def test_load(self):
    """Loading embeddings from several on-disk formats, plus failure modes."""
    loadable = (
        "tests/data/embeddings/text/plain_with_file_header",  # TODO: assert right class
        "tests/data/embeddings/text/plain_no_file_header",  # TODO: assert right class
        "tests/data/embeddings/npy",
    )
    for directory in loadable:
        load_from_dir(directory)
    embs = load_from_dir("tests/data/embeddings/text/plain_with_file_header")
    embs.get_vector('apple')
    with self.assertRaises(RuntimeError):
        embs.get_vector('word_that_not_in_vocabulary_27')
    embs = load_from_dir("tests/data/embeddings/text/corrupted")
    with self.assertRaises(RuntimeError):
        embs = load_from_dir("tests/data/embeddings/text")
def test_load(self):
    """Load embeddings from text/npy dirs; an ambiguous dir must raise."""
    base = ('tests', 'data', 'embeddings')
    # TODO: assert right class
    load_from_dir(path.join(*base, 'text', 'plain_with_file_header'))
    # TODO: assert right class
    load_from_dir(path.join(*base, 'text', 'plain_no_file_header'))
    load_from_dir(path.join(*base, 'npy'))
    embs = load_from_dir(path.join(*base, 'text', 'plain_with_file_header'))
    embs.get_vector('apple')
    # with self.assertRaises(RuntimeError):
    #     embs.get_vector('word_that_not_in_vocabulary_27')
    embs = load_from_dir(path.join(*base, 'text', 'corrupted'))
    with self.assertRaises(RuntimeError):
        embs = load_from_dir(path.join(*base, 'text'))
def test_api(self):
    """Sequence_labeling API over both methods and all three subtasks."""
    embs = load_from_dir(path_emb)
    for method in ['lr', '2FFNN']:
        labeler = Sequence_labeling(method=method)
        for subtask in ['chunk', 'pos', 'ner']:
            result = labeler.get_result(
                embs, path.join(path_sequence_labeling_dataset, subtask))
            self.assertIsInstance(result[0], dict)
            print(result)
def test_sequence_labeling(self):
    """Run sequence labeling with every method on every subtask."""
    embs = load_from_dir(
        "./tests/data/embeddings/text/plain_with_file_header")
    for method in ['lr', '2FFNN']:
        # BUG FIX: the loop variable was ignored and 'lr' was hard-coded,
        # so the '2FFNN' method was never actually exercised.
        sequence_labeling = Sequence_labeling(method=method)
        for subtask in ['chunk', 'pos', 'ner']:
            results = sequence_labeling.get_result(
                embs, os.path.join(path_sequence_labeling_dataset, subtask))
            print(results)
def test_utils(self):
    """Nearest-neighbour queries before and after caching normalized copies."""
    embs = load_from_dir("tests/data/embeddings/text/plain_with_file_header")
    print(embs.get_most_similar_words('apple', 5))
    # Same query again once the normalized copy is cached.
    embs.cache_normalized_copy()
    print(embs.get_most_similar_words('apple', 5))
    # Query by raw vector instead of by word.
    print(embs.get_most_similar_words(embs.get_vector('apple'), 5))
    embs.get_x_label(0)
def test_utils(self):
    """Similar-word lookup by word and by vector, with and without cache."""
    embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text',
                                   'plain_with_file_header'))
    print(embs.get_most_similar_words('apple', 5))
    embs.cache_normalized_copy()
    # Repeat the same query against the cached normalized matrix.
    print(embs.get_most_similar_words('apple', 5))
    print(embs.get_most_similar_words(embs.get_vector('apple'), 5))
    embs.get_x_label(0)
def test_outliers_results(self):
    """Shape of the AveragePairwiseCosine outlier-detection result."""
    embs = load_from_dir(
        path.join('tests', 'data', 'embeddings', 'text',
                  'plain_with_file_header'))
    result = AveragePairwiseCosine().get_result(embs,
                                                path_outliers_dataset)['test']
    expected_categories = 2
    expected_words_in_cats = 3
    self.assertEqual(len(result.keys()), expected_categories)
    self.assertEqual(len(result['cats']), expected_words_in_cats)
def test_outliers_results(self):
    """Category and word counts in the AveragePairwiseCosine result."""
    embs = load_from_dir(
        path.join('tests', 'data', 'embeddings', 'text',
                  'plain_with_file_header'))
    result = AveragePairwiseCosine().get_result(embs,
                                                path_outliers_dataset)['test']
    expected_categories = 2
    # TODO: refactor to be understandable, check if ok after covab to UNK
    expected_words_in_cats = 4
    self.assertEqual(len(result.keys()), expected_categories)
    self.assertEqual(len(result['cats']), expected_words_in_cats)
def main():
    """Extract pretrained vectors for the dataset vocabulary and save them."""
    embeddings_dir = '/mnt/data1/embeddings/crawl/'
    eng_fr_filename = '/mnt/data1/datasets/yelp/merged/train'
    dataset = SentenceDataset(eng_fr_filename, 20, 2)
    emb = embeddings.load_from_dir(embeddings_dir)
    dim = emb.matrix.shape[1]
    vocab_embs = np.zeros((len(dataset.vocab), dim))
    # Tokens absent from the pretrained embeddings keep zero rows.
    for idx, token in enumerate(dataset.vocab.token2id):
        if emb.has_word(token):
            vocab_embs[idx] = emb.get_vector(token)
    np.save('embeddings', vocab_embs)
def test_similarity(self):
    """Similarity benchmark with and without OOV handling, then plot."""
    embs = load_from_dir(
        path.join('tests', 'data', 'embeddings', 'text',
                  'plain_with_file_header'))
    for benchmark in (Similarity(), Similarity(ignore_oov=False)):
        print(benchmark.get_result(embs, path_similarity_dataset))
    similarity_visualize.plot_accuracy()
def test_synonymy_results(self):
    """Spot-check the first CosineDistance synonymy entry for 'tiger'."""
    embs = load_from_dir(
        path.join('tests', 'data', 'embeddings', 'text',
                  'plain_with_file_header'))
    result = CosineDistance().get_result(embs, path_synonymy_dataset)['test']
    first_entry = result['tiger'][0]
    self.assertEqual(first_entry['is_synonym'], 'yes')
    self.assertEqual(first_entry['hit'], False)
    self.assertEqual(first_entry['distance'], 1.0)