def preprocess(self):
    config = self.config
    input_dir = config.input_dir
    output_dir = config.output_dir

    # set file names
    file_names = {'train': 'train.txt',
                  'validation': 'dev.txt',
                  'test': 'test.txt'}

    # the third line of vocabs.txt holds the tab-separated type vocabulary
    with open(os.path.join(input_dir, 'vocabs.txt'), 'r') as f:
        for i, l in enumerate(f.readlines()):
            if i == 2:
                self.types_vocab = {k: v for v, k in enumerate(l.strip().split('\t'))}

    self.words_vocab = {str(x): x for x in range(2)}

    # preprocessing trees
    for tag_name, fname in file_names.items():
        self.__init_stats__(tag_name)
        data_list = []
        with open(os.path.join(input_dir, fname), 'r') as f:
            for l in tqdm(f.readlines(), desc='Preprocessing {}'.format(tag_name)):
                t = l.strip()
                nltk_t = string_to_nltk_tree(t)
                # internal labels look like '<y>_<type>'; leaves carry the word
                nx_t = nltk_tree_to_nx(
                    nltk_t,
                    get_internal_node_dict=lambda w: {'x': ConstValues.NO_ELEMENT,
                                                      'y': int(w.strip().split('_')[0]),
                                                      't': self.__get_type_id__(w.strip().split('_')[1])},
                    get_leaf_node_dict=lambda w: {'x': self.__get_word_id__(w),
                                                  'y': ConstValues.NO_ELEMENT,
                                                  't': ConstValues.NO_ELEMENT})
                self.__update_stats__(tag_name, nx_t)
                data_list.append(self.__nx_to_dgl__(nx_t))

        self.__print_stats__(tag_name)
        to_pkl_file(data_list, os.path.join(output_dir, '{}.pkl'.format(tag_name)))

    # save all stats
    self.__save_stats__()
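# A minimal, self-contained sketch of what string_to_nltk_tree followed by
# nltk_tree_to_nx achieves above, using only nltk and networkx. The helper
# name, the '-1' placeholder (standing in for ConstValues.NO_ELEMENT) and the
# child-to-parent edge direction are illustrative assumptions.
import networkx as nx
from nltk import Tree

def labelled_string_to_nx(s):
    g = nx.DiGraph()

    def visit(node, parent):
        nid = g.number_of_nodes()
        if isinstance(node, Tree):
            y, t = node.label().split('_')  # internal label: '<y>_<type>'
            g.add_node(nid, x=-1, y=int(y), t=t)
            for child in node:
                visit(child, nid)
        else:
            g.add_node(nid, x=node, y=-1, t=-1)  # leaf: word only
        if parent is not None:
            g.add_edge(nid, parent)  # edges point towards the root

    visit(Tree.fromstring(s), None)
    return g

# >>> labelled_string_to_nx('(3_NP (2_DT the) (4_NN movie))').nodes(data=True)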
def preprocess(self):
    config = self.config
    input_dir = config.input_dir
    output_dir = config.output_dir
    tree_type = config.preprocessor_config.tree_type

    # set file names
    file_names = {'train': ['train_{}.pkl'.format(x) for x in tree_type],
                  'validation': ['validation_{}.pkl'.format(x) for x in tree_type],
                  'test': ['test_{}.pkl'.format(x) for x in tree_type]}

    # preprocessing trees
    for tag_name, f_list in file_names.items():
        parsed_trees_list = []
        for f in f_list:
            parsed_trees_list.append(from_pkl_file(os.path.join(input_dir, f)))

        # merge the different tree types of each sample into a single record
        n_trees = len(parsed_trees_list[0])
        parsed_trees = [{'tree': tuple(v[i]['tree'] for v in parsed_trees_list),
                         'coarse_label': parsed_trees_list[0][i]['coarse_label'],
                         'fine_label': parsed_trees_list[0][i]['fine_label']}
                        for i in range(n_trees)]

        self.__init_stats__(tag_name)
        data_list = []
        for x in tqdm(parsed_trees, desc='Preprocessing {}'.format(tag_name)):
            t = self.tree_transformer.transform(*x['tree'])
            self.__assign_node_features__(t)
            self.__update_stats__(tag_name, t)
            dgl_t = self.__nx_to_dgl__(t)
            data_list.append((dgl_t, x['coarse_label'], x['fine_label']))

        self.__print_stats__(tag_name)
        to_pkl_file(data_list, os.path.join(output_dir, '{}.pkl'.format(tag_name)))

    # save all stats
    self.__save_stats__()
    self.__save_word_embeddings__()
def preprocess(self):
    config = self.config
    input_dir = config.input_dir
    output_dir = config.output_dir
    tree_type = config.preprocessor_config.tree_type
    output_type = self.__get_output_type()

    # set file names
    file_names = {'train': ['train_{}.pkl'.format(x) for x in tree_type],
                  'validation': ['validation_{}.pkl'.format(x) for x in tree_type],
                  'test': ['test_{}.pkl'.format(x) for x in tree_type]}
    sentiment_map_file = 'sentiment_map.pkl'

    # load sentiment map
    eprint('Loading sentiment map.')
    sentiment_map = from_pkl_file(os.path.join(input_dir, sentiment_map_file))

    # preprocessing trees
    for tag_name, f_list in file_names.items():
        nx_tree_list = []
        for f in f_list:
            nx_tree_list.append(from_pkl_file(os.path.join(input_dir, f)))
        nx_tree_list = list(zip(*nx_tree_list))

        self.__init_stats__(tag_name)
        tree_list = []
        for x in tqdm(nx_tree_list, desc='Preprocessing {}'.format(tag_name)):
            t = self.tree_transformer.transform(*x)
            # keep the tree only if the root has a label (a missing label means neutral)
            if self.__assign_node_features__(t, sentiment_map, output_type):
                self.__update_stats__(tag_name, t)
                tree_list.append(self.__nx_to_dgl__(t))

        self.__print_stats__(tag_name)
        to_pkl_file(tree_list, os.path.join(output_dir, '{}.pkl'.format(tag_name)))

    # save all stats
    self.__save_stats__()

    # compute and save word embeddings
    self.__save_word_embeddings__()
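# Hedged sketch of the root-label filter used above, assuming the usual SST
# convention for a binary output type: fine-grained labels {0, 1} map to
# negative, {3, 4} to positive, and trees whose root is neutral (label 2, or
# absent from the sentiment map) are dropped. The function name and the
# tuple-of-tokens key are illustrative, not the repo's exact API.
def binary_root_label(sentiment_map, root_tokens):
    fine = sentiment_map.get(tuple(root_tokens))
    if fine is None or fine == 2:  # missing label means neutral
        return None                # caller skips this tree
    return 0 if fine < 2 else 1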
def __save_word_embeddings__(self):
    eprint('Loading word embeddings.')
    pretrained_embs_file = self.config.pretrained_embs_file
    embedding_dim = self.config.embedding_dim
    pretrained_embs = load_embeddings(pretrained_embs_file,
                                      self.words_vocab,
                                      embedding_dim=embedding_dim)
    to_pkl_file(pretrained_embs,
                os.path.join(self.config.output_dir, 'pretrained_embs.pkl'))

    if 'type_pretrained_embs_file' in self.config:
        eprint('Loading type embeddings.')
        type_pretrained_embs = load_embeddings(self.config.type_pretrained_embs_file,
                                               self.types_vocab,
                                               embedding_dim=self.config.type_embedding_dim)
        to_pkl_file(type_pretrained_embs,
                    os.path.join(self.config.output_dir,
                                 'type_pretrained_embs.pkl'))
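# A plausible shape for load_embeddings (the real helper lives elsewhere in
# the repo): read a GloVe-style text file and fill a |V| x d matrix aligned
# with the vocabulary indices, leaving small random vectors for words missing
# from the file. The uniform(-0.05, 0.05) initialisation is an assumption.
import numpy as np

def load_embeddings(embs_file, vocab, embedding_dim):
    embs = np.random.uniform(-0.05, 0.05, (len(vocab), embedding_dim))
    with open(embs_file, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.rstrip().split(' ')
            word, vec = fields[0], fields[1:]
            if word in vocab and len(vec) == embedding_dim:
                embs[vocab[word]] = np.asarray(vec, dtype=np.float32)
    return embs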
def preprocess(self):
    config = self.config
    input_dir = config.input_dir
    output_dir = config.output_dir

    # set file names
    file_names = {'train': 'train.txt',
                  'validation': 'dev.txt',
                  'test': 'test.txt'}

    # the leaf vocabulary is just the digits 0-9
    self.words_vocab = {str(x): x for x in range(10)}

    # preprocessing trees
    for tag_name, fname in file_names.items():
        self.__init_stats__(tag_name)
        data_list = []
        with open(os.path.join(input_dir, fname), 'r') as f:
            for l in tqdm(f.readlines(), desc='Preprocessing {}'.format(fname)):
                ris, a = l.strip().split('\t')
                if a[0] != '(':
                    a = '(' + a + ')'
                nx_a = nltk_tree_to_nx(
                    string_to_nltk_tree(a),
                    get_internal_node_dict=lambda w: {'x': ConstValues.NO_ELEMENT,
                                                      'y': ConstValues.NO_ELEMENT,
                                                      't': self.__get_type_id__(w.strip())},
                    get_leaf_node_dict=lambda w: {'x': self.__get_word_id__(w.strip()),
                                                  'y': ConstValues.NO_ELEMENT,
                                                  't': ConstValues.NO_ELEMENT})
                self.__add_intermediate_results__(nx_a)
                # the computed root value must match the gold result
                # (except for degenerate single-node trees)
                assert nx_a.number_of_nodes() == 1 or nx_a.nodes[0]['y'] == int(ris)
                self.__update_stats__(tag_name, nx_a)
                data_list.append(self.__nx_to_dgl__(nx_a))

        self.__print_stats__(tag_name)
        to_pkl_file(data_list, os.path.join(output_dir, '{}.pkl'.format(tag_name)))

    # save all stats
    self.__save_stats__()
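# Hedged sketch of what __add_intermediate_results__ plausibly computes for
# ListOps-style trees: every operator node gets its intermediate result in
# 'y', evaluated bottom-up. The operator names, the assumption that 't' holds
# the operator rather than a type id, the '-1' placeholder (standing in for
# ConstValues.NO_ELEMENT) and the child-to-parent edge direction are all
# illustrative, not the repo's exact code.
import networkx as nx

OPS = {'MIN': min, 'MAX': max,
       'MED': lambda xs: sorted(xs)[len(xs) // 2],
       'SM': lambda xs: sum(xs) % 10}

def add_intermediate_results(g):
    # with child -> parent edges, topological order visits children first
    for n in nx.topological_sort(g):
        children = list(g.predecessors(n))
        if children:
            vals = [g.nodes[c]['y'] if g.nodes[c]['y'] != -1 else g.nodes[c]['x']
                    for c in children]  # leaves keep their digit in 'x'
            g.nodes[n]['y'] = OPS[g.nodes[n]['t']](vals)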
        tok_list = [x for x in parser.tokenize(txt, tokenizer_prop)]
    else:
        tok_list = [txt.lower()]

    tok_set = tuple(tok_list)
    if tok_set not in sentiment_map:
        sentiment_map[tok_set] = id2sentiment[id]
    elif sentiment_map[tok_set] == 2:  # neutral: overwrite with the later label
        sentiment_map[tok_set] = id2sentiment[id]

# store sentiment map
eprint('Saving sentiment map.')
to_pkl_file(sentiment_map, sentiment_map_out_file)

for f_name in fnames:
    rf_name = os.path.join(input_dir, f_name)
    wf_dep_name = os.path.join(output_dir, f_name.replace('.txt', '_dep.pkl'))
    wf_const_name = os.path.join(output_dir, f_name.replace('.txt', '_const.pkl'))
    wf_bin_const_name = os.path.join(output_dir, f_name.replace('.txt', '_bin_const.pkl'))

    # parse only if some of the outputs are missing
    if not path_exists_with_message(wf_dep_name) \
            or not path_exists_with_message(wf_const_name) \
            or not path_exists_with_message(wf_bin_const_name):
        ris = {'dep': [], 'const': [], 'bin_const': []}
        skip_first_line = True  # the first line of the raw file is a header
        with open(rf_name, 'r') as rf:
            for l in tqdm(rf.readlines(),
                          desc='Building trees from {}: '.format(f_name)):
                if skip_first_line:
                    skip_first_line = False
                    continue
                # SICK format: pair id, sentence A, sentence B, relatedness, entailment
                v = l.strip().split('\t')
                sent_a = v[1]
                sent_b = v[2]
                rel_score = float(v[3])
                ent_judgment = entailment_vocab[v[4]]
                ris_a, = parser.raw_parse(sent_a)
                ris_b, = parser.raw_parse(sent_b)
                for k in ris_a:
                    ris[k].append({'tree_a': ris_a[k],
                                   'tree_b': ris_b[k],
                                   'relatedness': rel_score,
                                   'entailment': ent_judgment})

        eprint('Saving parsed trees.')
        to_pkl_file(ris['dep'], wf_dep_name)
        to_pkl_file(ris['const'], wf_const_name)
        to_pkl_file(ris['bin_const'], wf_bin_const_name)

words_vocab_file = os.path.join(output_dir, 'words_vocab.pkl')
if not path_exists_with_message(words_vocab_file):
    eprint('Store word vocabulary.')
    to_pkl_file(parser.words_vocab, words_vocab_file)
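# Sketch of what the `parser` wrapper above plausibly returns: one record per
# sentence holding a dependency tree, a constituency tree, and a binarised
# constituency tree. This version leans on nltk's CoreNLP clients and assumes
# a CoreNLP server on localhost:9000; the repo's wrapper also maintains
# parser.words_vocab, which is omitted here.
from nltk.parse.corenlp import CoreNLPParser, CoreNLPDependencyParser

const_parser = CoreNLPParser(url='http://localhost:9000')
dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')

def raw_parse(sentence):
    const_t, = const_parser.raw_parse(sentence)
    dep_t, = dep_parser.raw_parse(sentence)
    bin_t = const_t.copy(deep=True)
    bin_t.chomsky_normal_form()  # binarise the constituency tree
    yield {'dep': dep_t, 'const': const_t, 'bin_const': bin_t}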
def preprocess(self):
    config = self.config
    input_dir = config.input_dir
    output_dir = config.output_dir
    train_max_num_ops = config.max_num_ops_in_training

    # set file names
    file_names = {'train': ['train{}'.format(x) for x in range(train_max_num_ops + 1)],
                  'validation': ['dev{}'.format(x) for x in range(13)],
                  'test': ['test{}'.format(x) for x in range(13)]}

    # both trees of a pair share the same node-feature extractors
    get_internal_node_dict = lambda w: {'x': ConstValues.NO_ELEMENT,
                                        'y': ConstValues.NO_ELEMENT,
                                        't': self.__get_type_id__(w.strip())}
    get_leaf_node_dict = lambda w: {'x': self.__get_word_id__(w.strip()),
                                    'y': ConstValues.NO_ELEMENT,
                                    't': ConstValues.NO_ELEMENT}

    # preprocessing trees
    for tag_name, fname_list in file_names.items():
        self.__init_stats__(tag_name)
        data_list = []
        for fname in fname_list:
            with open(os.path.join(input_dir, fname), 'r') as f:
                for l in tqdm(f.readlines(), desc='Preprocessing {}'.format(fname)):
                    entail, a, b = l.strip().split('\t')
                    if a[0] != '(':
                        a = '(' + a + ')'
                    if b[0] != '(':
                        b = '(' + b + ')'
                    ax, _ = parse_string_tree(a, 0)
                    bx, _ = parse_string_tree(b, 0)
                    nx_a = nltk_tree_to_nx(ax,
                                           get_internal_node_dict=get_internal_node_dict,
                                           get_leaf_node_dict=get_leaf_node_dict)
                    nx_b = nltk_tree_to_nx(bx,
                                           get_internal_node_dict=get_internal_node_dict,
                                           get_leaf_node_dict=get_leaf_node_dict)
                    self.__update_stats__(tag_name, nx_a)
                    self.__update_stats__(tag_name, nx_b)
                    data_list.append((self.__nx_to_dgl__(nx_a),
                                      self.__nx_to_dgl__(nx_b),
                                      self.__get_output_id__(entail)))

        self.__print_stats__(tag_name)
        to_pkl_file(data_list, os.path.join(output_dir, '{}.pkl'.format(tag_name)))

    # save all stats
    self.__save_stats__()
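# parse_string_tree is a repo helper; a plausible minimal version is an
# s-expression parser over a space-tokenised string: starting at token index
# pos, it returns (subtree, index of the first unread token), which matches
# the (ax, _) unpacking above. Nested lists stand in for whatever tree type
# the real helper builds.
def parse_string_tree(s, pos):
    tokens = s.split()

    def parse(i):
        if tokens[i] == '(':
            children, i = [], i + 1
            while tokens[i] != ')':
                child, i = parse(i)
                children.append(child)
            return children, i + 1  # skip the closing ')'
        return tokens[i], i + 1     # a leaf token

    return parse(pos)

# >>> parse_string_tree('( ( not a ) ( and b ) )', 0)
# ([['not', 'a'], ['and', 'b']], 10)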
            sent = v[1]
            # labels like 'LOC:city' (TREC-style): the coarse label is the prefix
            coarse_label = fine_label.split(':')[0]
            coarse_label_id = coarse_label_vocab.setdefault(coarse_label,
                                                            len(coarse_label_vocab))
            fine_label_id = fine_label_vocab.setdefault(fine_label,
                                                        len(fine_label_vocab))

            ris, = parser.raw_parse(sent)
            for kk in ris:
                parsed_trees[kk].append({'tree': ris[kk],
                                         'coarse_label': coarse_label_id,
                                         'fine_label': fine_label_id})

        to_pkl_file(parsed_trees, out_file)

        # save words vocab file
        eprint('Store word vocabulary.')
        words_vocab_file = os.path.join(output_dir, 'words_vocab.pkl')
        to_pkl_file(parser.words_vocab, words_vocab_file)

        # store label vocabs
        eprint('Store label vocabulary.')
        to_json_file(coarse_label_vocab,
                     os.path.join(output_dir, 'coarse_vocab.json'))
        to_json_file(fine_label_vocab,
                     os.path.join(output_dir, 'fine_vocab.json'))
    else:
        parsed_trees = from_pkl_file(out_file)