def get_envs(env_files=None):
    dataset = []
    if env_files is None:
        fns = [get_train_shard_path(i) for i in range(0, 30)]
    else:
        fns = env_files
    for fn in fns:
        dataset += load_jsonl(fn)

    tables = load_jsonl(table_file)
    table_dict = dict([(table['name'], table) for table in tables])

    # Load pretrained embeddings.
    embedding_model = word_embeddings.EmbeddingModel(vocab_file,
                                                     embedding_file)

    with open(en_vocab_file, 'r') as f:
        vocab = json.load(f)
    en_vocab = data_utils.Vocab([])
    en_vocab.load_vocab(vocab)

    # Create environments.
    envs = create_envs(table_dict, dataset, en_vocab, embedding_model)
    return envs
def json_to_envs(dataset):
    tables = load_jsonl(FLAGS.table_file)
    table_dict = dict([(table['name'], table) for table in tables])

    embedding_model = word_embeddings.EmbeddingModel(FLAGS.vocab_file,
                                                     FLAGS.embedding_file)

    with open(FLAGS.en_vocab_file, 'r') as f:
        vocab = json.load(f)
    en_vocab = data_utils.Vocab([])
    en_vocab.load_vocab(vocab)

    # Create environments.
    envs = create_envs(table_dict, dataset, en_vocab, embedding_model)
    return envs
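# Illustrative usage sketch (not part of the original code): json_to_envs is
# the counterpart of get_envs above that takes already-loaded example dicts.
# The shard index is arbitrary; get_train_shard_path and load_jsonl are the
# helpers used in get_envs above and are assumed to be available here.
def _example_build_envs_for_one_shard(shard_id=0):
    examples = load_jsonl(get_train_shard_path(shard_id))
    return json_to_envs(examples)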
def main():
    '''
    1. Call get_examples_to_annotate() to get an ordered list of examples to
       annotate.
    2. Call annotate() on these examples and get the VERIFIED annotation.
    3. Call sync_results() to save the results.

    :return: None, all results are saved locally in files.
    '''
    # Load the tables.
    tables = load_jsonl(FLAGS.table_file)
    table_dict = dict([(table['name'], table) for table in tables])

    # Load pre-trained embeddings.
    embedding_model = word_embeddings.EmbeddingModel(FLAGS.vocab_file,
                                                     FLAGS.embedding_file)

    with open(FLAGS.en_vocab_file, 'r') as f:
        vocab = json.load(f)
    en_vocab = data_utils.Vocab([])
    en_vocab.load_vocab(vocab)

    annotation_result_list = get_examples_to_annotate()
    for i in range(len(annotation_result_list)):
        if annotation_result_list[i][2] is None:
            # Create a real environment.
            env = create_envs(table_dict, [annotation_result_list[i][0]],
                              en_vocab, embedding_model)[0]
            # Get the annotation.
            annotation = annotate(
                env, table_dict[env.question_annotation['context']])
            annotation_result_list[i] = (annotation_result_list[i][0],
                                         annotation_result_list[i][1],
                                         annotation)
    sync_results(annotation_result_list)
def init_experiment(fns, use_gpu=False, gpu_id='0'):
    dataset = []
    for fn in fns:
        dataset += load_jsonl(fn)
    tf.logging.info('{} examples in dataset.'.format(len(dataset)))

    tables = load_jsonl(FLAGS.table_file)
    table_dict = dict([(table['name'], table) for table in tables])
    tf.logging.info('{} tables.'.format(len(table_dict)))

    # Load pretrained embeddings.
    embedding_model = word_embeddings.EmbeddingModel(
        FLAGS.vocab_file, FLAGS.embedding_file)

    with open(FLAGS.en_vocab_file, 'r') as f:
        vocab = json.load(f)
    en_vocab = data_utils.Vocab([])
    en_vocab.load_vocab(vocab)
    tf.logging.info('{} unique tokens in encoder vocab'.format(
        len(en_vocab.vocab)))
    tf.logging.info('{} examples in the dataset'.format(len(dataset)))

    # Create environments.
    envs = create_envs(table_dict, dataset, en_vocab, embedding_model)
    if FLAGS.unittest:
        envs = envs[:25]
    tf.logging.info('{} environments in total'.format(len(envs)))

    graph_config = get_saved_graph_config()
    if graph_config:
        # If evaluating a saved model, just load its graph config.
        agent = create_agent(graph_config, get_init_model_path())
    else:
        if FLAGS.use_pretrained_embeddings:
            tf.logging.info('Using pretrained embeddings!')
            pretrained_embeddings = []
            for i in xrange(len(en_vocab.special_tks), en_vocab.size):
                pretrained_embeddings.append(
                    utils.average_token_embedding(
                        utils.find_tk_in_model(
                            en_vocab.lookup(i, reverse=True), embedding_model),
                        embedding_model,
                        embedding_size=FLAGS.pretrained_embedding_size))
            pretrained_embeddings = np.vstack(pretrained_embeddings)
        else:
            pretrained_embeddings = None

        # Model configuration and initialization.
        de_vocab = envs[0].de_vocab
        n_mem = FLAGS.max_n_mem
        n_builtin = de_vocab.size - n_mem
        en_pretrained_vocab_size = en_vocab.size - len(en_vocab.special_tks)

        graph_config = {}
        graph_config['core_config'] = dict(
            max_n_valid_indices=FLAGS.max_n_valid_indices,
            n_mem=n_mem,
            n_builtin=n_builtin,
            use_attn=True,
            attn_size=FLAGS.attn_size,
            attn_vec_size=FLAGS.attn_vec_size,
            input_vocab_size=de_vocab.size,
            en_input_vocab_size=en_vocab.size,
            hidden_size=FLAGS.hidden_size,
            n_layers=FLAGS.n_layers,
            en_hidden_size=FLAGS.hidden_size,
            en_n_layers=FLAGS.en_n_layers,
            en_use_embeddings=True,
            en_embedding_size=FLAGS.en_embedding_size,
            value_embedding_size=FLAGS.value_embedding_size,
            en_pretrained_vocab_size=en_pretrained_vocab_size,
            en_pretrained_embedding_size=FLAGS.pretrained_embedding_size,
            add_lm_loss=FLAGS.lm_loss_coeff > 0.0,
            en_bidirectional=FLAGS.en_bidirectional,
            en_attn_on_constants=FLAGS.en_attn_on_constants)
        graph_config['use_gpu'] = use_gpu
        graph_config['gpu_id'] = gpu_id

        graph_config['output_type'] = 'softmax'
        graph_config['output_config'] = dict(
            output_vocab_size=de_vocab.size, use_logits=True)

        aux_loss_list = [('ent_reg', FLAGS.entropy_reg_coeff)]
        if FLAGS.lm_loss_coeff > 0.0:
            aux_loss_list.append(('en_lm_loss', FLAGS.lm_loss_coeff))

        graph_config['train_config'] = dict(
            aux_loss_list=aux_loss_list,
            learning_rate=FLAGS.learning_rate,
            max_grad_norm=FLAGS.max_grad_norm,
            adam_beta1=FLAGS.adam_beta1,
            l2_coeff=FLAGS.l2_coeff,
            optimizer=FLAGS.optimizer,
            avg_loss_by_n=False)

        agent = create_agent(
            graph_config, get_init_model_path(),
            pretrained_embeddings=pretrained_embeddings)

    with open(os.path.join(get_experiment_dir(), 'graph_config.json'),
              'w') as f:
        json.dump(graph_config, f, sort_keys=True, indent=2)

    return agent, envs
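# Illustrative usage sketch (not part of the original code): a typical call to
# init_experiment over all training shards, mirroring the 30-shard layout used
# by get_envs above. Assumes get_train_shard_path is available in this module;
# the GPU arguments are examples only.
def _example_init_experiment():
    shard_fns = [get_train_shard_path(i) for i in range(30)]
    agent, envs = init_experiment(shard_fns, use_gpu=False, gpu_id='0')
    return agent, envs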
def main(unused_argv):
    assert tf.gfile.Exists(FLAGS.raw_input_dir)
    if not tf.gfile.Exists(FLAGS.processed_input_dir):
        tf.gfile.MkDir(FLAGS.processed_input_dir)

    data_folder = os.path.join(FLAGS.raw_input_dir,
                               'WikiTableQuestions/tagged')
    table_file = os.path.join(FLAGS.processed_input_dir, 'tables.jsonl')
    test_table_file = os.path.join(FLAGS.processed_input_dir,
                                   'test_table.json')
    stop_words_file = os.path.join(FLAGS.raw_input_dir, 'stop_words.json')
    train_file = os.path.join(FLAGS.processed_input_dir,
                              'train_examples.jsonl')
    train_tagged = os.path.join(
        FLAGS.raw_input_dir, 'WikiTableQuestions/tagged/data/training.tagged')
    test_tagged = os.path.join(
        FLAGS.raw_input_dir,
        'WikiTableQuestions/tagged/data/pristine-unseen-tables.tagged')

    # Preprocess the tables.
    subdirs = os.listdir(data_folder)
    subdirs.remove('data')
    table_dict = {}
    folders = []
    t1 = time.time()
    for d in subdirs:
        for fn in os.listdir(os.path.join(data_folder, d)):
            full_path = os.path.join(data_folder, d, fn)
            m = re.match(
                r'.*/(?P<first>[0-9]*)-tagged/(?P<second>[0-9]*)\.tagged',
                full_path)
            folders.append(full_path)
            table_name = 't_{}_{}'.format(m.group('first'), m.group('second'))
            kg = table2kg(
                table_name, data_folder,
                max_n_tokens_for_num_prop=FLAGS.max_n_tokens_for_num_prop,
                min_frac_for_ordered_prop=FLAGS.min_frac_for_ordered_prop)
            kg['name'] = table_name
            table_dict[table_name] = kg
    t2 = time.time()
    print('{} sec used processing the tables.'.format(t2 - t1))
    print 'total number of number cells: {}'.format(n_total_num)
    print 'total number of filtered number cells: {}'.format(n_filtered_num)
    print 'filtered ratio: {}'.format(n_filtered_num * 1.0 / n_total_num)
    print 'date and number ratio: {}'.format(n_date_and_num * 1.0 / n_total_num)

    # Save the preprocessed test table.
    with open(test_table_file, 'w') as f:
        json.dump({'t_203_375': table_dict['t_203_375']}, f)

    # Save the preprocessed tables.
    t1 = time.time()
    with open(table_file, 'w') as f:
        for i, (k, v) in enumerate(table_dict.iteritems()):
            if i % 1000 == 0:
                print 'number {}'.format(i)
            f.write(json.dumps(v))
            f.write('\n')
    t2 = time.time()
    print '{} sec used dumping tables'.format(t2 - t1)

    df = create_df_from_wtq_questions(train_tagged)

    with open(stop_words_file, 'r') as f:
        stop_words_list = json.load(f)
    stop_words = set(stop_words_list)

    t1 = time.time()
    examples = collect_examples_from_df(df, table_dict, stop_words)
    t2 = time.time()
    print '{} sec used collecting train examples.'.format(t2 - t1)
    dump_examples(examples, train_file)

    for split_id in xrange(1, 6):
        processed_input_dir = os.path.join(FLAGS.processed_input_dir,
                                           'data_split_{}'.format(split_id))
        if not tf.gfile.Exists(processed_input_dir):
            tf.gfile.MkDir(processed_input_dir)
        train_split_tsv = os.path.join(
            FLAGS.raw_input_dir,
            'WikiTableQuestions/data/random-split-{}-train.tsv'.format(
                split_id))
        dev_split_tsv = os.path.join(
            FLAGS.raw_input_dir,
            'WikiTableQuestions/data/random-split-{}-dev.tsv'.format(split_id))

        # Create all the split datasets.
        train_df = create_df_from_wtq_questions(train_split_tsv)
        dev_df = create_df_from_wtq_questions(dev_split_tsv)
        assert len(train_df) + len(dev_df) == len(df)
        train_ids = set(train_df['id'])
        train_examples = []
        dev_ids = set(dev_df['id'])
        dev_examples = []
        for e in examples:
            if e['id'] in train_ids:
                train_examples.append(e)
            elif e['id'] in dev_ids:
                dev_examples.append(e)
            else:
                raise ValueError('id {} not found'.format(e['id']))
        assert len(train_examples) + len(dev_examples) == len(df)

        train_split_jsonl = os.path.join(processed_input_dir,
                                         'train_split.jsonl')
        dev_split_jsonl = os.path.join(processed_input_dir, 'dev_split.jsonl')
        dump_examples(train_examples, train_split_jsonl)
        dump_examples(dev_examples, dev_split_jsonl)

        train_shards = []
        for i in range(FLAGS.n_train_shard):
            train_shards.append([])
        for i, e in enumerate(train_examples):
            train_shards[i % FLAGS.n_train_shard].append(e)
        for i, sh in enumerate(train_shards):
            train_shard_jsonl = os.path.join(
                processed_input_dir,
                'train_split_shard_{}-{}.jsonl'.format(FLAGS.n_train_shard, i))
            dump_examples(sh, train_shard_jsonl)

    test_df = create_df_from_wtq_questions(test_tagged)
    t1 = time.time()
    test_examples = collect_examples_from_df(test_df, table_dict, stop_words)
    t2 = time.time()
    print '{} sec used collecting test examples.'.format(t2 - t1)
    test_split_jsonl = os.path.join(FLAGS.processed_input_dir,
                                    'test_split.jsonl')
    dump_examples(test_examples, test_split_jsonl)

    # Load pretrained embeddings.
    vocab_file = os.path.join(FLAGS.raw_input_dir, "wikitable_glove_vocab.json")
    embedding_file = os.path.join(FLAGS.raw_input_dir,
                                  "wikitable_glove_embedding_mat.npy")
    embedding_model = word_embeddings.EmbeddingModel(vocab_file, embedding_file)

    def create_vocab(examples, embedding_model, min_count):
        token_count = {}
        for e in examples:
            for tk in e['tokens']:
                # Token must be in GloVe and also appear more than min_count
                # times.
                if find_tk_in_model(tk, embedding_model):
                    try:
                        token_count[tk] += 1
                    except KeyError:
                        token_count[tk] = 1
        en_vocab = data_utils.generate_vocab_from_token_count(
            token_count, min_count=min_count)
        return en_vocab

    for i in xrange(1, 11):
        en_vocab = create_vocab(train_examples + dev_examples,
                                embedding_model, i)
        vocab_file = os.path.join(FLAGS.processed_input_dir,
                                  "en_vocab_min_count_{}.json".format(i))
        with open(vocab_file, 'w') as f:
            json.dump(en_vocab.vocab, f, sort_keys=True, indent=2)
        print 'min_tk_count: {}, vocab size: {}'.format(i, len(en_vocab.vocab))
def main(unused_argv):
    assert tf.gfile.Exists(FLAGS.raw_input_dir)
    if not tf.gfile.Exists(FLAGS.processed_input_dir):
        tf.gfile.MkDir(FLAGS.processed_input_dir)

    table_file = os.path.join(FLAGS.processed_input_dir, 'tables.jsonl')
    stop_words_file = os.path.join(FLAGS.raw_input_dir, 'stop_words.json')
    with open(stop_words_file, 'r') as f:
        stop_words = json.load(f)

    # Load datasets.
    train_set = []
    with open(os.path.join(FLAGS.raw_input_dir, 'train.jsonl'), 'r') as f:
        for line in f:
            train_set.append(json.loads(line))
    dev_set = []
    with open(os.path.join(FLAGS.raw_input_dir, 'dev.jsonl'), 'r') as f:
        for line in f:
            dev_set.append(json.loads(line))
    test_set = []
    with open(os.path.join(FLAGS.raw_input_dir, 'test.jsonl'), 'r') as f:
        for line in f:
            test_set.append(json.loads(line))

    # Load tables.
    train_table_dict = {}
    with open(os.path.join(FLAGS.raw_input_dir, 'train.tables.jsonl'),
              'r') as f:
        for line in f:
            _table = json.loads(line)
            train_table_dict[_table['id']] = _table
    dev_table_dict = {}
    with open(os.path.join(FLAGS.raw_input_dir, 'dev.tables.jsonl'),
              'r') as f:
        for line in f:
            _table = json.loads(line)
            dev_table_dict[_table['id']] = _table
    test_table_dict = {}
    with open(os.path.join(FLAGS.raw_input_dir, 'test.tables.jsonl'),
              'r') as f:
        for line in f:
            _table = json.loads(line)
            test_table_dict[_table['id']] = _table

    # Collect all the tables.
    print 'Start collecting all the tables.'
    kg_dict = {}
    for tb_dict in [dev_table_dict, train_table_dict, test_table_dict]:
        for i, (k, v) in enumerate(tb_dict.iteritems()):
            if i % 1000 == 0:
                print i
            kg_dict[k] = table2kg(v)

    # Check if the string or number values have the correct type.
    for kg in kg_dict.values():
        for _, v in kg['kg'].iteritems():
            for prop, val in v.iteritems():
                if prop[-7:] == '-number':
                    for num in val:
                        if not (isinstance(num, int) or
                                isinstance(num, float)):
                            print kg
                            raise ValueError
                if prop[-7:] == '-string':
                    for num in val:
                        if not isinstance(num, unicode):
                            print kg
                            raise ValueError

    # Save the tables.
    with open(table_file, 'w') as f:
        for _, v in kg_dict.iteritems():
            f.write(json.dumps(v) + '\n')

    # Load the gold answers.
    with open(os.path.join(FLAGS.raw_input_dir, 'dev_gold.json'), 'r') as f:
        dev_answers = json.load(f)
    for q, ans in zip(dev_set, dev_answers):
        q['answer'] = ans
    with open(os.path.join(FLAGS.raw_input_dir, 'train_gold.json'), 'r') as f:
        train_answers = json.load(f)
    for q, ans in zip(train_set, train_answers):
        q['answer'] = ans
    with open(os.path.join(FLAGS.raw_input_dir, 'test_gold.json'), 'r') as f:
        test_answers = json.load(f)
    for q, ans in zip(test_set, test_answers):
        q['answer'] = ans

    # Annotate the examples and dump to files.
    train_split_jsonl = os.path.join(FLAGS.processed_input_dir,
                                     'train_split.jsonl')
    dev_split_jsonl = os.path.join(FLAGS.processed_input_dir,
                                   'dev_split.jsonl')
    test_split_jsonl = os.path.join(FLAGS.processed_input_dir,
                                    'test_split.jsonl')

    t1 = time.time()
    dev_examples = []
    print 'start annotating dev examples.'
    for i, q in enumerate(dev_set):
        if i % 500 == 0:
            print i
        e = annotate_question(q, 'dev-{}'.format(i), kg_dict, stop_words)
        expand_entities(e, kg_dict)
        dev_examples.append(e)
    t2 = time.time()
    print '{} sec used annotating dev examples.'.format(t2 - t1)
    dump_examples(dev_examples, dev_split_jsonl)

    t1 = time.time()
    train_examples = []
    print 'start annotating train examples.'
    for i, q in enumerate(train_set):
        if i % 500 == 0:
            print i
        e = annotate_question(q, 'train-{}'.format(i), kg_dict, stop_words)
        expand_entities(e, kg_dict)
        train_examples.append(e)
    t2 = time.time()
    print '{} sec used annotating train examples.'.format(t2 - t1)
    dump_examples(train_examples, train_split_jsonl)

    t1 = time.time()
    test_examples = []
    print 'start annotating test examples.'
    for i, q in enumerate(test_set):
        if i % 500 == 0:
            print i
        e = annotate_question(q, 'test-{}'.format(i), kg_dict, stop_words)
        expand_entities(e, kg_dict)
        test_examples.append(e)
    t2 = time.time()
    print '{} sec used annotating test examples.'.format(t2 - t1)
    dump_examples(test_examples, test_split_jsonl)

    train_shards = []
    for i in range(FLAGS.n_train_shard):
        train_shards.append([])
    for i, e in enumerate(train_examples):
        train_shards[i % FLAGS.n_train_shard].append(e)
    for i, sh in enumerate(train_shards):
        train_shard_jsonl = os.path.join(
            FLAGS.processed_input_dir,
            'train_split_shard_{}-{}.jsonl'.format(FLAGS.n_train_shard, i))
        dump_examples(sh, train_shard_jsonl)

    # Load pretrained embeddings.
    vocab_file = os.path.join(FLAGS.raw_input_dir, "wikisql_glove_vocab.json")
    embedding_file = os.path.join(FLAGS.raw_input_dir,
                                  "wikisql_glove_embedding_mat.npy")
    embedding_model = word_embeddings.EmbeddingModel(vocab_file, embedding_file)

    for i in xrange(1, 11):
        en_vocab = create_vocab(train_examples + dev_examples,
                                embedding_model, i)
        vocab_file = os.path.join(FLAGS.processed_input_dir,
                                  "en_vocab_min_count_{}.json".format(i))
        with open(vocab_file, 'w') as f:
            json.dump(en_vocab.vocab, f, sort_keys=True, indent=2)
        print 'min_tk_count: {}, vocab size: {}'.format(i, len(en_vocab.vocab))
def eval_examples(self, envs):
    # Declare some constant params.
    CLUSTER_NUM = 500
    CLUSTER_SAMPLE_SIZE = 15
    FIRST_N_CLUSTERS = 200
    assert (FIRST_N_CLUSTERS * CLUSTER_SAMPLE_SIZE == 3 * FLAGS.al_budget_n)

    # # get failed env names
    # env_eval_results = ActivePicker.parallel_eval(envs, failed_eval)
    # failed_env_names_set = set(map(lambda (score, env_name): env_name,
    #                                filter(lambda (score, env_name): score > 0,
    #                                       env_eval_results)))

    # Pure clustering, no failed information.
    failed_env_names_set = set(map(lambda env: env['id'], envs))

    # Get the question embedding for every environment.
    embedding_model = word_embeddings.EmbeddingModel(
        FLAGS.vocab_file, FLAGS.embedding_file)
    failed_envs = json_to_envs(
        filter(lambda env_json: env_json['id'] in failed_env_names_set, envs))
    failed_env_names = map(lambda env: env.name, failed_envs)
    embedding_matrix = preprocessing.normalize(
        np.vstack(
            map(lambda env: average_token_embedding(env.context[-1],
                                                    embedding_model),
                failed_envs)),
        copy=False)

    # Run the k-means++ algorithm on the embeddings to get clusters.
    print('##################################')
    print('Start running k-means algorithm on %d examples... '
          '(this could take a while)' % len(failed_env_names))
    print('##################################')
    labels = KMeans(n_clusters=CLUSTER_NUM,
                    random_state=0).fit(embedding_matrix).labels_
    print('##################################')
    print('K-means running done!')
    print('##################################')

    # Put envs into clusters, indexed by name.
    env_name_clusters = map(
        lambda i: np.array(failed_env_names)[labels == i].tolist(),
        range(CLUSTER_NUM))
    env_name_clusters = sorted(env_name_clusters, key=lambda x: len(x),
                               reverse=True)
    assert (len(env_name_clusters[FIRST_N_CLUSTERS - 1]) >= CLUSTER_SAMPLE_SIZE)

    # Choose CLUSTER_SAMPLE_SIZE examples from each of the first
    # FIRST_N_CLUSTERS clusters.
    choose_from_clusters = map(
        lambda cluster: np.random.choice(
            cluster, CLUSTER_SAMPLE_SIZE, replace=False).tolist(),
        env_name_clusters[:FIRST_N_CLUSTERS])
    chosen_env_names = set(reduce(lambda x, y: x + y, choose_from_clusters))
    result = map(
        lambda env: (1.0 if env['id'] in chosen_env_names else 0.0, env['id']),
        envs)

    # # Plot to see the performance.
    # pca = PCA(n_components=2)
    # X_r = pca.fit(embedding_matrix).transform(embedding_matrix)
    # plt.figure()
    # for i in range(CLUSTER_NUM):
    #     plt.scatter(X_r[labels == i, 0], X_r[labels == i, 1],
    #                 color=np.random.rand(3,), label='class_' + str(i))
    # plt.legend(loc='best', shadow=False, scatterpoints=1)
    # plt.title('PCA of clusters')

    return result
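# Illustrative sketch (not part of the original code): consuming the
# (score, env_id) pairs returned by eval_examples. Scores are 1.0 for the
# CLUSTER_SAMPLE_SIZE examples sampled from each of the FIRST_N_CLUSTERS
# largest clusters and 0.0 otherwise, so the annotation batch is just a
# filter on the score. `picker` stands for an instance of the class that
# defines eval_examples.
def _example_select_annotation_batch(picker, envs):
    scored = picker.eval_examples(envs)  # list of (score, env_id) pairs
    return [env_id for score, env_id in scored if score > 0.0]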