def process_files(self, input_text_file, input_da_file, skip_hello=False):
    """Load DAs & sentences, obtain abstraction instructions, and store it all in member
    variables (to be used later by writing methods).

    @param input_text_file: path to the input file with sentences
    @param input_da_file: path to the input file with DAs
    @param skip_hello: skip hello() DAs (remove them from the output)?
    """
    # load DAs
    self._das = []
    with codecs.open(input_da_file, 'r', encoding='UTF-8') as fh:
        for line in fh:
            self._das.append(DA.parse(line.strip()))
    # load & process sentences
    self._sents = []
    with codecs.open(input_text_file, 'r', encoding='UTF-8') as fh:
        for line in fh:
            self._sents.append(self.analyze(line.strip()))
    assert(len(self._das) == len(self._sents))
    # skip hello() DAs, if required
    if skip_hello:
        pos = 0
        while pos < len(self._das):
            da = self._das[pos]
            if len(da) == 1 and da[0].da_type == 'hello':
                del self._das[pos]
                del self._sents[pos]
            else:
                pos += 1
    # delexicalize DAs and sentences
    self._delex_texts()
    self._delex_das()
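A minimal standalone sketch of the skip_hello filtering above. It only uses calls that appear in this code (DA.parse, len(da), da[0].da_type); the import path tgen.data, the 'hello()' DA string, and the toy sentence lists are assumptions for illustration.

# Sketch of the hello()-filtering loop above; import path and sample data are assumed.
from tgen.data import DA  # assumed import path

das = [DA.parse('hello()'), DA.parse('inform()')]
sents = [['hi'], ['ok']]  # stand-ins for whatever analyze() returns

pos = 0
while pos < len(das):
    if len(das[pos]) == 1 and das[pos][0].da_type == 'hello':
        del das[pos]    # drop single-act hello() DAs...
        del sents[pos]  # ...and the corresponding sentences
    else:
        pos += 1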
def _init_training(self, das_file, ttree_file, data_portion):
    """Initialize training. Store input data, initialize 1-hot feature representations
    for input and output and transform training data accordingly, initialize the
    classification neural network.
    """
    # read input
    log_info('Reading DAs from ' + das_file + '...')
    das = read_das(das_file)
    log_info('Reading t-trees from ' + ttree_file + '...')
    ttree_doc = read_ttrees(ttree_file)
    trees = trees_from_doc(ttree_doc, self.language, self.selector)

    # make training data smaller if necessary
    train_size = int(round(data_portion * len(trees)))
    self.train_trees = trees[:train_size]
    self.train_das = das[:train_size]

    # add empty tree + empty DA to training data
    # (i.e. forbid the network to keep any of its outputs "always-on")
    train_size += 1
    self.train_trees.append(TreeData())
    empty_da = DA.parse('inform()')
    self.train_das.append(empty_da)

    self.train_order = range(len(self.train_trees))
    log_info('Using %d training instances.' % train_size)

    # initialize input features/embeddings
    if self.tree_embs:
        self.dict_size = self.tree_embs.init_dict(self.train_trees)
        self.X = np.array([self.tree_embs.get_embeddings(tree) for tree in self.train_trees])
    else:
        self.tree_feats = Features(['node: presence t_lemma formeme'])
        self.tree_vect = DictVectorizer(sparse=False, binarize_numeric=True)
        self.X = [self.tree_feats.get_features(tree, {}) for tree in self.train_trees]
        self.X = self.tree_vect.fit_transform(self.X)

    # initialize output features
    self.da_feats = Features(['dat: dat_presence', 'svp: svp_presence'])
    self.da_vect = DictVectorizer(sparse=False, binarize_numeric=True)
    self.y = [self.da_feats.get_features(None, {'da': da}) for da in self.train_das]
    self.y = self.da_vect.fit_transform(self.y)

    # initialize I/O shapes
    self.input_shape = [list(self.X[0].shape)]
    self.num_outputs = len(self.da_vect.get_feature_names())

    # initialize NN classifier
    self._init_neural_network()
def process_dataset(self, input_data):
    """Load DAs & sentences, obtain abstraction instructions, and store it all in member
    variables (to be used later by writing methods).

    @param input_data: path to the input JSON file with the data
    """
    # load data from JSON
    self._das = []
    self._texts = []
    with codecs.open(input_data, 'r', encoding='UTF-8') as fh:
        data = json.load(fh)
        for inst in data:
            da = DA.parse(inst['da'])
            da.sort()
            self._das.append(da)
            self._texts.append(self.analyze(inst['text']))
    # delexicalize DAs and sentences
    self._create_delex_texts()
    self._create_delex_das()
    # return the result
    out = []
    for da, text, delex_da, delex_text, abst in zip(self._das, self._texts,
                                                    self._delex_das, self._delex_texts,
                                                    self._absts):
        out.append(Inst(da, text, delex_da, delex_text, abst))
    return out
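For reference, process_dataset() only reads the 'da' and 'text' fields of each instance. A minimal input file could therefore look like the sketch below; the concrete DA strings and sentences are made up, and the exact DA syntax is assumed from DA.parse usage elsewhere in this code.

# Minimal input shape implied by the fields accessed above (inst['da'], inst['text']);
# the DA strings and sentences are invented examples.
import json

example_json = u"""
[
    {"da": "inform(name=X-name,food=Chinese)", "text": "X-name serves Chinese food."},
    {"da": "hello()", "text": "Hello, how may I help you?"}
]
"""
data = json.loads(example_json)
assert all('da' in inst and 'text' in inst for inst in data)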
def process_files(self, input_text_file, input_da_file):
    """Load DAs & sentences, obtain abstraction instructions, and store it all in member
    variables (to be used later by writing methods)."""
    # load DAs
    self._das = []
    with codecs.open(input_da_file, 'r', encoding='UTF-8') as fh:
        for line in fh:
            self._das.append(DA.parse(line.strip()))
    # load & process sentences
    self._sents = []
    with codecs.open(input_text_file, 'r', encoding='UTF-8') as fh:
        for line in fh:
            self._sents.append(self.analyze(line.strip()))
    assert(len(self._das) == len(self._sents))
    # delexicalize DAs and sentences
    self._delex_texts()
    self._delex_das()
def convert(args):
    """Read source MRs/DAs and references (plus optional system outputs and scores)
    from plain-text files and write them out as a single tab-separated file."""
    # read source MRs; optionally convert them to Cambridge-style DA strings
    src = lines_to_list(args.src_file)
    if args.das:
        src = [DA.parse(da_text).to_cambridge_da_string() for da_text in src]
    # read references
    ref = lines_to_list(args.ref_file)

    columns = ['mr', 'orig_ref']
    df = pd.DataFrame.from_dict({'mr': src, 'orig_ref': ref})

    # optional columns: system outputs and quality scores
    if args.system_output:
        sys = lines_to_list(args.system_output)
        df['system_ref'] = sys
        columns.append('system_ref')
    if args.score:
        score = [float(score) for score in lines_to_list(args.score)]
        df['quality'] = score
        columns.append('quality')

    df.to_csv(args.out_file, columns=columns, sep=b"\t", index=False, encoding='UTF-8')
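A rough sketch of the table this produces, using only the pandas calls the code already relies on. The column names ('mr', 'orig_ref', 'system_ref', 'quality') come from the code; the row contents and output file name are invented.

# Sketch of the TSV layout produced by convert() above; rows and file name are invented.
import pandas as pd

df = pd.DataFrame.from_dict({
    'mr': ['inform(food=Chinese)'],           # source MR / DA string
    'orig_ref': ['It serves Chinese food.'],  # human reference
})
df['system_ref'] = ['Serves Chinese food.']   # optional system output column
df['quality'] = [5.0]                         # optional human score column
df.to_csv('example.tsv', columns=['mr', 'orig_ref', 'system_ref', 'quality'],
          sep='\t', index=False, encoding='UTF-8')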
def _init_training(self, das, trees, data_portion):
    """Initialize training. Store input data, initialize 1-hot feature representations
    for input and output and transform training data accordingly, initialize the
    classification neural network.

    @param das: name of source file with training DAs, or list of DAs
    @param trees: name of source file with corresponding trees/sentences, or list of trees
    @param data_portion: portion of the training data to be used (0.0-1.0)
    """
    # read input from files or take it directly from parameters
    if not isinstance(das, list):
        log_info('Reading DAs from ' + das + '...')
        das = read_das(das)
    if not isinstance(trees, list):
        log_info('Reading t-trees from ' + trees + '...')
        ttree_doc = read_ttrees(trees)
        if self.mode == 'tokens':
            tokens = tokens_from_doc(ttree_doc, self.language, self.selector)
            trees = self._tokens_to_flat_trees(tokens)
        elif self.mode == 'tagged_lemmas':
            tls = tagged_lemmas_from_doc(ttree_doc, self.language, self.selector)
            trees = self._tokens_to_flat_trees(tls, use_tags=True)
        else:
            trees = trees_from_doc(ttree_doc, self.language, self.selector)
    elif self.mode in ['tokens', 'tagged_lemmas']:
        trees = self._tokens_to_flat_trees(trees, use_tags=self.mode == 'tagged_lemmas')

    # make training data smaller if necessary
    train_size = int(round(data_portion * len(trees)))
    self.train_trees = trees[:train_size]
    self.train_das = das[:train_size]

    # ignore contexts, if they are contained in the DAs
    if isinstance(self.train_das[0], tuple):
        self.train_das = [da for (context, da) in self.train_das]
    # delexicalize if DAs are lexicalized and we don't want that
    if self.delex_slots:
        self.train_das = [da.get_delexicalized(self.delex_slots) for da in self.train_das]

    # add empty tree + empty DA to training data
    # (i.e. forbid the network to keep any of its outputs "always-on")
    train_size += 1
    self.train_trees.append(TreeData())
    empty_da = DA.parse('inform()')
    self.train_das.append(empty_da)

    self.train_order = range(len(self.train_trees))
    log_info('Using %d training instances.' % train_size)

    # initialize input features/embeddings
    if self.tree_embs:
        self.dict_size = self.tree_embs.init_dict(self.train_trees)
        self.X = np.array([self.tree_embs.get_embeddings(tree) for tree in self.train_trees])
    else:
        self.tree_feats = Features(['node: presence t_lemma formeme'])
        self.tree_vect = DictVectorizer(sparse=False, binarize_numeric=True)
        self.X = [self.tree_feats.get_features(tree, {}) for tree in self.train_trees]
        self.X = self.tree_vect.fit_transform(self.X)

    # initialize output features
    self.da_feats = Features(['dat: dat_presence', 'svp: svp_presence'])
    self.da_vect = DictVectorizer(sparse=False, binarize_numeric=True)
    self.y = [self.da_feats.get_features(None, {'da': da}) for da in self.train_das]
    self.y = self.da_vect.fit_transform(self.y)

    log_info('Number of binary classes: %d.' % len(self.da_vect.get_feature_names()))

    # initialize I/O shapes
    if not self.tree_embs:
        self.input_shape = list(self.X[0].shape)
    else:
        self.input_shape = self.tree_embs.get_embeddings_shape()
    self.num_outputs = len(self.da_vect.get_feature_names())

    # initialize NN classifier
    self._init_neural_network()
    # initialize the NN variables
    self.session.run(tf.global_variables_initializer())
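The Features and DictVectorizer classes used above are TGen's own (note the binarize_numeric flag, which scikit-learn's vectorizer does not have). As a rough standalone analogue of the 1-hot output encoding only, scikit-learn's DictVectorizer behaves similarly on dictionaries of DA-type / slot-value indicators; the feature dictionaries below are invented for illustration and do not match TGen's exact feature names.

# Standalone analogue of the 1-hot output encoding, using scikit-learn's DictVectorizer
# instead of TGen's own classes; feature dictionaries are invented.
from sklearn.feature_extraction import DictVectorizer

da_feats = [
    {'dat_inform': 1, 'svp_food=Chinese': 1},
    {'dat_inform': 1, 'svp_area=centre': 1},
    {'dat_inform': 1},  # akin to the extra empty inform() DA appended above
]
vect = DictVectorizer(sparse=False)
y = vect.fit_transform(da_feats)  # one binary column per observed feature
print(y.shape)                    # (3, number of binary classes)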
def convert(args):
    """Main function – read in the JSON data and output TGEN-specific files."""

    # initialize storage
    items = 0
    das = []            # abstracted DAs
    concs = []          # concrete sentences
    texts = []          # abstracted sentences
    absts = []          # abstraction descriptions
    contexts = []       # abstracted contexts
    conc_contexts = []  # lexicalized contexts

    # process the input data and store it in memory
    with open(args.in_file, 'r') as fh:
        data = json.load(fh, encoding='UTF-8')
        for item in data:
            da = convert_abstr_da(DA.parse(item['response_da']))
            context = convert_abstractions(item['context_utt'])
            context_l = item['context_utt_l']
            conc_da = DA.parse(item['response_da_l'])
            concs_ = [tokenize(s) for s in item['response_nl_l']]
            absts_ = []
            texts_ = []
            for abst_text in item['response_nl']:
                text, abst = get_abstraction(abst_text, conc_da, args.slot_names)  # convert *SLOT -> X
                absts_.append(abst)
                texts_.append(text)

            das.append(da)
            contexts.append(context)
            conc_contexts.append(context_l)
            concs.append(concs_)
            absts.append(absts_)
            texts.append(texts_)
            items += 1

    print 'Processed', items, 'items.'

    if args.split:
        # get file name prefixes and compute data sizes for all the parts to be split
        out_names = re.split(r'[, ]+', args.out_name)
        data_sizes = [int(part_size) for part_size in args.split.split(':')]
        assert len(out_names) == len(data_sizes)
        # compute sizes for all but the 1st part (+ round them)
        total = float(sum(data_sizes))
        remain = items
        for part_no in xrange(len(data_sizes) - 1, 0, -1):
            part_size = int(round(items * (data_sizes[part_no] / total)))
            data_sizes[part_no] = part_size
            remain -= part_size
        # put whatever remained into the 1st part
        data_sizes[0] = remain
    else:
        # use just one part -- containing all the data
        data_sizes = [items]
        out_names = [args.out_name]

    # write all data parts
    for part_size, part_name in zip(data_sizes, out_names):
        # repeat DAs and contexts for synonymous paraphrases, unless for test data in multi-ref mode
        repeat_num = len(concs[0])
        if args.multi_ref and part_name in ['devel', 'test', 'dtest', 'etest']:
            repeat_num = 1
        write_part(part_name + '-das.txt', das, part_size, repeat_num)
        write_part(part_name + '-context.txt', contexts, part_size, repeat_num)
        write_part(part_name + '-conc_context.txt', conc_contexts, part_size, repeat_num)

        # write all other just once (here, each instance is a list, it will be unrolled)
        write_part(part_name + '-conc.txt', concs, part_size)
        write_part(part_name + '-abst.txt', absts, part_size)
        write_part(part_name + '-text.txt', texts, part_size)
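The proportional split above can be illustrated in isolation; this sketch repeats the same rounding scheme on made-up numbers (all parts except the first are rounded, and the first part absorbs whatever remains), so the sizes always add up to the total item count.

# Standalone illustration of the train/dev/test split arithmetic used above.
items = 1000
data_sizes = [int(s) for s in '8:1:1'.split(':')]   # e.g. args.split == '8:1:1'
total = float(sum(data_sizes))
remain = items
for part_no in range(len(data_sizes) - 1, 0, -1):
    part_size = int(round(items * (data_sizes[part_no] / total)))
    data_sizes[part_no] = part_size
    remain -= part_size
data_sizes[0] = remain
print(data_sizes)   # [800, 100, 100]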
def read_sfx_data():
    """Read the SF Restaurant (SFX) data: tag-split n-gram references and the
    corresponding MRs parsed into DAs; return (mrs, refs)."""
    with codecs.open('data/sfrest-refs.tag.ngram.txt', 'r', 'UTF-8') as fh:
        refs = [split_tags(inst.strip()) for inst in fh.readlines()]
    with codecs.open('data/sfrest-mrs.txt', 'r', 'UTF-8') as fh:
        mrs = [DA.parse(mr.strip()) for mr in fh.readlines()]
    return mrs, refs
def convert(args):
    """Main function – read in the JSON data and output TGEN-specific files."""

    # initialize storage
    items = 0
    conc_das = []       # concrete DAs
    das = []            # abstracted DAs
    concs = []          # concrete sentences
    texts = []          # abstracted sentences
    absts = []          # abstraction descriptions
    contexts = []       # abstracted contexts
    conc_contexts = []  # lexicalized contexts

    # process the input data and store it in memory
    with open(args.in_file, 'r') as fh:
        data = json.load(fh, encoding='UTF-8')
        for item in data:
            da = convert_abstr_da(DA.parse(item['response_da']))
            context = convert_abstractions(item['context_utt'])
            context_l = item['context_utt_l']
            conc_da = DA.parse(item['response_da_l'])
            concs_ = [tokenize(s) for s in item['response_nl_l']]
            absts_ = []
            texts_ = []
            for abst_text in item['response_nl']:
                text, abst = get_abstraction(abst_text, conc_da, args.slot_names)  # convert *SLOT -> X
                absts_.append(abst)
                texts_.append(text)

            das.append(da)
            conc_das.append(conc_da)
            contexts.append(context)
            conc_contexts.append(context_l)
            concs.append(concs_)
            absts.append(absts_)
            texts.append(texts_)
            items += 1

    print 'Processed', items, 'items.'

    if args.split:
        # get file name prefixes and compute data sizes for all the parts to be split
        out_names = re.split(r'[, ]+', args.out_name)
        data_sizes = [int(part_size) for part_size in args.split.split(':')]
        assert len(out_names) == len(data_sizes)
        # compute sizes for all but the 1st part (+ round them)
        total = float(sum(data_sizes))
        remain = items
        for part_no in xrange(len(data_sizes) - 1, 0, -1):
            part_size = int(round(items * (data_sizes[part_no] / total)))
            data_sizes[part_no] = part_size
            remain -= part_size
        # put whatever remained into the 1st part
        data_sizes[0] = remain
    else:
        # use just one part -- containing all the data
        data_sizes = [items]
        out_names = [args.out_name]

    # write all data parts
    for part_size, part_name in zip(data_sizes, out_names):
        # repeat DAs and contexts for synonymous paraphrases, unless for test data in multi-ref mode
        repeat_num = len(concs[0])
        if args.multi_ref and part_name in ['devel', 'test', 'dtest', 'etest']:
            repeat_num = 1
        write_part(part_name + '-das.txt', das, part_size, repeat_num)
        write_part(part_name + '-conc_das.txt', conc_das, part_size, repeat_num)
        write_part(part_name + '-context.txt', contexts, part_size, repeat_num)
        write_part(part_name + '-conc_context.txt', conc_contexts, part_size, repeat_num)

        # write all other just once (here, each instance is a list, it will be unrolled)
        write_part(part_name + '-ref.txt', concs, part_size, trunc=False, separate=True)
        write_part(part_name + '-conc.txt', concs, part_size)
        write_part(part_name + '-abst.txt', absts, part_size)
        write_part(part_name + '-text.txt', texts, part_size)
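A small sketch of the repeat_num choice made per output part above: DAs and contexts are repeated once per paraphrase so they stay aligned with the unrolled reference lists, except for dev/test parts in multi-reference mode, where they are written only once. The sample data and part names below are invented.

# Standalone illustration of the per-part repeat_num logic used when writing the data.
multi_ref = True
concs = [[['a', 'reference'], ['another', 'reference']]]  # each instance holds a list of paraphrases
for part_name in ['train', 'devel', 'test']:
    repeat_num = len(concs[0])
    if multi_ref and part_name in ['devel', 'test', 'dtest', 'etest']:
        repeat_num = 1
    print('%s: %d' % (part_name, repeat_num))  # train: 2, devel: 1, test: 1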