Example #1
def main(args):
    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    # load the vocab
    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    vocab = load_vocab(args.vocab_file, max_word_length)

    test_prefix = args.test_prefix

    kwargs = {
        'test': True,
        'shuffle_on_load': False,
    }

    permute_number = options.get('permute_number', 4)

    if options.get('bidirectional'):
        data = BidirectionalLMDataset(test_prefix, vocab, **kwargs)
    elif options.get('multidirectional'):
        data = MultidirectionalLMDataset(test_prefix, vocab, permute_number, **kwargs)
    else:
        data = LMDataset(test_prefix, vocab, **kwargs)

    test(options, ckpt_file, data, batch_size=args.batch_size,
         permute_number=permute_number)
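A main() like the one above is normally invoked from an argparse driver. The sketch below is a minimal, hypothetical driver: the flag names are inferred from the attributes the function reads (save_dir, vocab_file, test_prefix, batch_size) and are assumptions, not the original project's CLI.

import argparse

if __name__ == '__main__':
    # Flag names are assumptions inferred from what main() reads above.
    parser = argparse.ArgumentParser(description='Compute test perplexity')
    parser.add_argument('--save_dir', help='Directory with checkpoint files')
    parser.add_argument('--vocab_file', help='Vocabulary file')
    parser.add_argument('--test_prefix', help='Prefix of the test data files')
    parser.add_argument('--batch_size', type=int, default=256,
                        help='Batch size used during evaluation')
    main(parser.parse_args())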
Example #2
File: run_test.py  Project: cheng18/crs
def main(args):
    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    # load the vocab
    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    # Winfred stroke_vocab: max_word_length is hard-coded to 50 here,
    # so the value computed above is ignored.
    vocab = load_vocab(args.vocab_file, args.stroke_vocab_file, 50)

    test_prefix = args.test_prefix

    kwargs = {
        'test': True,
        'shuffle_on_load': False,
    }

    if options.get('bidirectional'):
        data = BidirectionalLMDataset(test_prefix, vocab, **kwargs)
    else:
        data = LMDataset(test_prefix, vocab, **kwargs)

    test(options, ckpt_file, data, batch_size=args.batch_size)
Example #3
def main(args):

    if args.gpu is not None:
        n_gpus = len(args.gpu)
        set_gpu(args.gpu)
    else:
        n_gpus = 0

    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    # load the vocab
    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    vocab = load_vocab(args.vocab_files,
                       max_word_length=max_word_length,
                       polyglot=True)

    test_prefix = args.test_prefix

    kwargs = {
        'test': True,
        'shuffle_on_load': False,
    }

    data = BidirectionalPolyglotLMDataset(test_prefix, vocab, **kwargs)

    test(options, ckpt_file, data, batch_size=args.batch_size)
Example #4
    def test_train_bilm_chars(self):
        vocab, data, options = self._get_vocab_data_options(True, True)
        train(options, data, 1, self.tmp_dir, self.tmp_dir)

        # now test
        tf.reset_default_graph()
        options, ckpt_file = load_options_latest_checkpoint(self.tmp_dir)
        data_test, vocab_test = self._get_data(True, True, True)
        perplexity = test(options, ckpt_file, data_test, batch_size=1)
        self.assertTrue(perplexity < 20.0)
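The train-then-test pattern in this and the following unit tests relies on TF1's global default graph: train() populates it, and tf.reset_default_graph() wipes it so test() can rebuild the model from the checkpoint. A minimal standalone illustration of that mechanism (the node name is only for demonstration):

import tensorflow as tf

# Simulate a node left over from training, then wipe the graph,
# as the tests do between train() and test().
tf.constant(0, name='leftover_from_training')
tf.reset_default_graph()
assert len(tf.get_default_graph().get_operations()) == 0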
Example #5
    def test_train_skip_connections(self):
        bidirectional = True
        use_chars = False
        vocab, data, options = self._get_vocab_data_options(
            bidirectional, use_chars)
        options['lstm']['use_skip_connections'] = True
        train(options, data, 1, self.tmp_dir, self.tmp_dir)

        # now test
        tf.reset_default_graph()
        options, ckpt_file = load_options_latest_checkpoint(self.tmp_dir)
        data_test, vocab_test = self._get_data(bidirectional, use_chars, test=True)
        perplexity = test(options, ckpt_file, data_test, batch_size=1)
        self.assertTrue(perplexity < 20.0)
Example #6
def main(args):
    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    # load the vocab
    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    vocab = load_vocab(args.vocab_file, max_word_length)

    test_prefix = args.test_prefix

    kwargs = {
        'test': True,
        'shuffle_on_load': False,
    }

    if options.get('bidirectional'):
        data = BidirectionalLMDataset(test_prefix, vocab, **kwargs)
    else:
        data = LMDataset(test_prefix, vocab, **kwargs)

    test(options, ckpt_file, data, batch_size=args.batch_size)
Example #7
    def test_train_shared_softmax_embedding(self):
        bidirectional = True
        use_chars = False

        vocab, data, options = self._get_vocab_data_options(
            bidirectional, use_chars, share_embedding_softmax=True)
        train(options, data, 1, self.tmp_dir, self.tmp_dir)

        # now test
        tf.reset_default_graph()
        options, ckpt_file = load_options_latest_checkpoint(self.tmp_dir)
        data_test, vocab_test = self._get_data(
            bidirectional, use_chars, test=True)
        perplexity = test(options, ckpt_file, data_test, batch_size=1)
        self.assertTrue(perplexity < 20.0)
Example #8
def top_level(args):
    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)
    vocab_file = os.path.join(args.save_dir, 'vocabs.txt')

    # load the vocab
    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    vocab = load_vocab(vocab_file, max_word_length)

    test_prefix = args.test_prefix

    kwargs = {
        'test': True,
        'shuffle_on_load': False,
    }

    if options.get('bidirectional'):
        data = BidirectionalLMDataset(test_prefix, vocab, **kwargs)
    else:
        data = LMDataset(test_prefix, vocab, **kwargs)

    test(options, ckpt_file, data, batch_size=args.batch_size)
Example #9
def main(args):
    ent_num = 14541
    with open(
            "/home/why2011btv/research/OpenKE/benchmarks/FB15K237/test2id.txt",
            'r') as f:
        lines = f.readlines()
        # the first line of an OpenKE *2id file holds the triplet count
        triplet_num = len(lines) - 1
        print("triplet_num:", triplet_num)
        test_set = np.zeros([triplet_num, 3], np.int32)
        i = 0
        for line in lines:
            a = line.split(' ')
            if len(a) > 1 and i < triplet_num:
                # lines are "head tail relation"; offset the relation id by
                # ent_num so entities and relations share a single id space
                test_set[i][0] = int(a[0])
                test_set[i][1] = int(a[2]) + ent_num
                test_set[i][2] = int(a[1])
                i += 1

    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)
    data = MYDataset(test_set)

    perplexity = test(options, ckpt_file, data, batch_size=2)
    return perplexity
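To make the id bookkeeping above concrete, here is the same parsing logic run on a fabricated three-line file (toy values, not real FB15K237 ids):

import numpy as np

ent_num = 14541
# toy test2id.txt contents: count line, then "head tail relation" lines
lines = ["2\n", "0 1 5\n", "3 2 7\n"]
test_set = np.zeros([len(lines) - 1, 3], np.int32)
i = 0
for line in lines:
    a = line.split(' ')
    if len(a) > 1:
        test_set[i] = [int(a[0]), int(a[2]) + ent_num, int(a[1])]
        i += 1
print(test_set)  # [[    0 14546     1]
                 #  [    3 14548     2]]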
Example #10
def main(args):
    is_load, load_path, save_path, budget = cuhk_prototype_tuner_v2.preprocess(
        t_id, params, args.save_dir)

    vocab = load_vocab(args.vocab_file, 50)

    batch_size = int(params['batch_size'])

    gpus_index_list = list(
        map(int, os.environ["CUDA_VISIBLE_DEVICES"].split(',')))
    n_gpus = len(os.environ["CUDA_VISIBLE_DEVICES"].split(','))

    n_train_tokens = 768648884

    sess_config = tf.compat.v1.ConfigProto(
        allow_soft_placement=True,
        inter_op_parallelism_threads=int(
            params['inter_op_parallelism_threads']),
        intra_op_parallelism_threads=int(
            params['intra_op_parallelism_threads']),
        graph_options=tf.compat.v1.GraphOptions(
            infer_shapes=params['infer_shapes'],
            place_pruned_graph=params['place_pruned_graph'],
            enable_bfloat16_sendrecv=params['enable_bfloat16_sendrecv'],
            optimizer_options=tf.compat.v1.OptimizerOptions(
                do_common_subexpression_elimination=params[
                    'do_common_subexpression_elimination'],
                max_folded_constant_in_bytes=int(
                    params['max_folded_constant']),
                do_function_inlining=params['do_function_inlining'],
                global_jit_level=params['global_jit_level'])))

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256],
                        [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': 261,
            'n_highway': 2
        },
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': int(budget),  # NNI modification
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192,
    }
    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix,
                                  vocab,
                                  test=False,
                                  shuffle_on_load=True)
    tf_save_dir = save_path
    tf_log_dir = save_path
    if not os.path.exists(tf_save_dir):
        os.makedirs(tf_save_dir)

    if params['tf_gpu_thread_mode'] in ["global", "gpu_private", "gpu_shared"]:
        os.environ['TF_GPU_THREAD_MODE'] = params['tf_gpu_thread_mode']
    if is_load:
        load_file = os.path.join(load_path, 'model.ckpt')
        start = time.time()
        final_perplexity = train(options,
                                 data,
                                 n_gpus,
                                 gpus_index_list,
                                 tf_save_dir,
                                 tf_log_dir,
                                 sess_config,
                                 restart_ckpt_file=load_file)
        end = time.time()
        shutil.rmtree(load_path)
    else:
        start = time.time()
        final_perplexity = train(options, data, n_gpus, gpus_index_list,
                                 tf_save_dir, tf_log_dir, sess_config)
        end = time.time()
    spent_time = (end - start) / 3600.0
    if args.test_prefix != '':
        options, ckpt_file = load_options_latest_checkpoint(tf_save_dir)
        kwargs = {
            'test': True,
            'shuffle_on_load': False,
        }
        test_data = BidirectionalLMDataset(args.test_prefix, vocab, **kwargs)
        final_perplexity = test(options, ckpt_file, test_data, batch_size=128)
    report_dict = {'runtime': spent_time, 'default': final_perplexity}
    nni.report_final_result(report_dict)
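This example reads params and t_id from the surrounding NNI trial rather than defining them itself, and reports results back through nni.report_final_result. A minimal sketch of the trial-side glue (the example values in the comment are assumptions):

import nni

# Ask the tuner for this trial's hyperparameters, e.g.
# {'batch_size': 128, 'inter_op_parallelism_threads': 2, ...}
params = nni.get_next_parameter()
t_id = nni.get_trial_id()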
Example #11
            res_perplexities2.append(res2)
            count_in += IN
            count_oov += OOV
            res_perplexities0.append(res0)
            res_perplexities1.append(res1)

        if args.model == 'elmo':

            filepath = subdir + os.sep
            if options.get('bidirectional'):
                data = BidirectionalLMDataset(filepath, vocab, **kwargs)
                # print(data)
            else:
                data = LMDataset(filepath, vocab, **kwargs)

            res2 = test(options, ckpt_file, data, batch_size=args.batch_size)

            res_perplexities2.append(res2)

        outfile.write(file + '\t' + label + '\t' + str(res2) + '\n')

        if count % 5 == 0:
            print('I have calculated perplexities for %s files' % count,
                  file=sys.stderr)

print('=== Just a sanity check on the perplexity calculations: ')
print(labels[:5], fns[:5], res_perplexities2[:5])

print('Texts with the most extreme text-level perplexities:')
df = pd.DataFrame(list(zip(fns, labels, res_perplexities2)),
                  columns=['files', 'label', 'perpl'])
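The excerpt stops right after the DataFrame is built; a plausible continuation (an assumption, not part of the original code) that actually prints the extremes announced above:

print(df.sort_values('perpl', ascending=False).head())  # highest perplexity
print(df.sort_values('perpl').head())                   # lowest perplexity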