import numpy as np
import tensorflow as tf

# project-local modules providing the column converters and label constants
import constants
import data_converters


def conll_data_generator(filenames, data_config):
  """Reads CoNLL-formatted files and yields one sentence at a time.

  Each yielded sentence is a list of per-token tuples containing only the
  columns that data_config marks as features or labels, after applying the
  configured converter (or the default converter) to each column.
  """
  for filename in filenames:
    with open(filename, 'r') as f:
      sents = 0
      toks = 0
      buf = []
      for line in f:
        line = line.strip()
        if line:
          toks += 1
          split_line = line.split()
          data_vals = []
          for d in data_config.keys():
            # only return the data that we're actually going to use as inputs or outputs
            if ('feature' in data_config[d] and data_config[d]['feature']) or \
               ('label' in data_config[d] and data_config[d]['label']):
              datum_idx = data_config[d]['conll_idx']
              converter_name = data_config[d]['converter']['name'] if 'converter' in data_config[d] else 'default_converter'
              converter_params = data_converters.get_params(data_config[d], split_line, datum_idx)
              data = data_converters.dispatch(converter_name)(**converter_params)
              data_vals.extend(data)
          buf.append(tuple(data_vals))
        else:
          # blank line marks a sentence boundary
          if buf:
            sents += 1
            yield buf
            buf = []
      # catch the last sentence if the file does not end with a blank line
      if buf:
        yield buf
def conll_data_generator(filenames, data_config): """ Read CoNLL formated @filenames files. Yields each sentence. Select columns defined in @data_config. data_config optionaly specify converters. """ for filename in filenames: with open(filename, 'r') as f: sents = 0 toks = 0 buf = [] for line in f: line = line.strip() if line: toks += 1 split_line = line.split() data_vals = [] for d in data_config.keys(): # only return the data that we're actually going to use as inputs or outputs if (('feature' in data_config[d] and data_config[d]['feature']) or ('label' in data_config[d] and data_config[d]['label'])): datum_idx = data_config[d]['conll_idx'] converter_name = data_config[d]['converter'][ 'name'] if 'converter' in data_config[ d] else 'default_converter' converter_params = data_converters.get_params( data_config[d], split_line, datum_idx) tf.logging.log( tf.logging.INFO, f"conll_data_generator dispatching for {d}: " f"{converter_name}, " f"{converter_params}") data = data_converters.dispatch(converter_name)( **converter_params) data_vals.extend(data) # print(tuple(data_vals)) buf.append(tuple(data_vals)) else: if buf: sents += 1 tf.logging.log( tf.logging.INFO, f"data_generator.conll_data_generator " f"yielding buf: {buf}: ") yield buf buf = [] # catch the last one if buf: tf.logging.log( tf.logging.INFO, f"data_generator.conll_data_generator " f"yielding last buf from {filename}: {buf}: ") yield buf
def conll_data_generator(filenames, data_config):
  # print("debug <processing input data using config>: ", data_config)
  for filename in filenames:
    with open(filename, 'r') as f:
      sents = 0
      toks = 0
      buf = []
      for line in f:
        line = line.strip()
        # print("debug <input line>: ", line)
        if line:
          # if toks < 20:
          #   print("debug <input line>: ", line)
          toks += 1
          split_line = line.split()
          data_vals = []
          for d in data_config.keys():
            # only return the data that we're actually going to use as inputs or outputs
            if ('feature' in data_config[d] and data_config[d]['feature']) or \
               ('label' in data_config[d] and data_config[d]['label']):
              datum_idx = data_config[d]['conll_idx']
              converter_name = data_config[d]['converter']['name'] if 'converter' in data_config[d] else 'default_converter'
              converter_params = data_converters.get_params(data_config[d], split_line, datum_idx)
              # print(datum_idx, converter_name)
              # try:
              data = data_converters.dispatch(converter_name)(**converter_params)
              # except Exception as e:
              #   print(e)
              #   print("debug <converter_name, converter_params>", converter_name, '\n', converter_params)
              data_vals.extend(data)
          # if toks < 30:
          #   print("debug <data_vals>: ", tuple(data_vals))
          # print("debug <data_vals>:", tuple(data_vals))
          buf.append(tuple(data_vals))
        else:
          if buf:
            sents += 1
            yield buf
            buf = []
          # print()
      # catch the last one
      if buf:
        yield buf
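# A minimal usage sketch (not from the original source): the data_config below
# is a hypothetical example assuming a CoNLL-09-like layout where column 1
# holds the word form and column 4 holds the gold POS tag, and assuming the
# default converter simply returns the selected field as a single-element
# list. Field names, column indices, and the file name are illustrative only.
if __name__ == '__main__':
  example_data_config = {
      'word': {'conll_idx': 1, 'feature': True, 'vocab': 'word'},
      'gold_pos': {'conll_idx': 4, 'label': True, 'vocab': 'gold_pos'},
  }
  for sentence in conll_data_generator(['train.conllu'], example_data_config):
    # each sentence is a list of per-token tuples, e.g. ('Ms.', 'NNP')
    print(sentence)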
def create_load_or_update_vocab_files(self, data_config, save_dir, filenames=None, update_only=False):
  """Builds a vocab map for every vocab-typed field in data_config.

  If filenames are given, counts are gathered from those data files; otherwise
  previously saved vocab files are loaded from save_dir. Also builds reverse
  (int -> string) maps and joint-label lookup maps, writes the vocabs back to
  save_dir, and returns the size of each vocab.
  """
  # init maps
  vocabs = []
  vocabs_index = {}
  for d in data_config:
    updatable = 'updatable' in data_config[d] and data_config[d]['updatable']
    if 'vocab' in data_config[d] and data_config[d]['vocab'] == d and (updatable or not update_only):
      this_vocab = {}
      if update_only and updatable and d in self.vocab_maps:
        this_vocab = self.vocab_maps[d]
      vocabs.append(this_vocab)
      vocabs_index[d] = len(vocabs_index)

  # Create vocabs from data files
  if filenames:
    for filename in filenames:
      with open(filename, 'r') as f:
        for line in f:
          line = line.strip()
          if line:
            split_line = line.split()
            for d in vocabs_index.keys():
              datum_idx = data_config[d]['conll_idx']
              this_vocab_map = vocabs[vocabs_index[d]]
              converter_name = data_config[d]['converter']['name'] if 'converter' in data_config[d] else 'default_converter'
              converter_params = data_converters.get_params(data_config[d], split_line, datum_idx)
              this_data = data_converters.dispatch(converter_name)(**converter_params)
              for this_datum in this_data:
                if this_datum not in this_vocab_map:
                  this_vocab_map[this_datum] = 0
                this_vocab_map[this_datum] += 1
  # Assume we have the vocabs saved to disk; load them
  else:
    for d in vocabs_index.keys():
      this_vocab_map = vocabs[vocabs_index[d]]
      with open("%s/%s.txt" % (save_dir, d), 'r') as f:
        for line in f:
          datum, count = line.strip().split()
          this_vocab_map[datum] = int(count)

  # build reverse_maps, joint_label_lookup_maps
  for v in vocabs_index.keys():
    # build reverse lookup map, from int -> string
    this_counts_map = vocabs[vocabs_index[v]]
    this_map = dict(zip(this_counts_map.keys(), range(len(this_counts_map.keys()))))
    reverse_map = dict(zip(range(len(this_counts_map.keys())), this_counts_map.keys()))
    self.oovs[v] = False
    if 'oov' in self.data_config[v] and self.data_config[v]['oov']:
      self.oovs[v] = True
      # reverse_map[len(reverse_map)] = constants.OOV_STRING
      # this_map[len(this_map)] = constants.OOV_STRING
    self.reverse_maps[v] = reverse_map
    self.vocab_maps[v] = this_map

    # check whether we need to build joint_label_lookup_map
    if 'label_components' in self.data_config[v]:
      joint_vocab_map = vocabs[vocabs_index[v]]
      label_components = self.data_config[v]['label_components']
      component_keys = [vocabs[vocabs_index[d]].keys() for d in label_components]
      component_maps = [dict(zip(comp_keys, range(len(comp_keys)))) for comp_keys in component_keys]
      map_names = ["%s_to_%s" % (v, label_comp) for label_comp in label_components]
      joint_to_comp_maps = [np.zeros([len(joint_vocab_map), 1], dtype=np.int32) for _ in label_components]
      for joint_idx, joint_label in enumerate(joint_vocab_map.keys()):
        split_label = joint_label.split(constants.JOINT_LABEL_SEP)
        for label_comp, comp_map, joint_to_comp_map in zip(split_label, component_maps, joint_to_comp_maps):
          comp_idx = comp_map[label_comp]
          joint_to_comp_map[joint_idx] = comp_idx

      # add them to the master map
      for map_name, joint_to_comp_map in zip(map_names, joint_to_comp_maps):
        self.joint_label_lookup_maps[map_name] = joint_to_comp_map

  # write the (possibly updated) vocabs back to disk
  for d in vocabs_index.keys():
    this_vocab_map = vocabs[vocabs_index[d]]
    with open("%s/%s.txt" % (save_dir, d), 'w') as f:
      for k, v in this_vocab_map.items():
        print("%s\t%d" % (k, v), file=f)

  return {k: len(vocabs[vocabs_index[k]]) for k in vocabs_index.keys()}
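# Both the generator and the vocab builder above depend on a project-local
# `data_converters` module that is not shown here. The sketch below is only an
# assumption about its minimal interface (get_params, dispatch,
# default_converter), written to make the calling code above concrete; the
# real module in the project may define additional converters and a different
# get_params signature.

def default_converter(split_line, idx):
  # return the selected column(s) as a list of strings
  if isinstance(idx, list):
    return [split_line[i] for i in idx]
  return [split_line[idx]]


def get_params(datum_config, split_line, idx):
  # assemble keyword arguments for the converter named in datum_config
  return {'split_line': split_line, 'idx': idx}


def dispatch(converter_name):
  # look up a converter function by name, falling back to the default
  converters = {'default_converter': default_converter}
  return converters.get(converter_name, default_converter)

# Against this interface, conll_data_generator and
# create_load_or_update_vocab_files above resolve each configured column to a
# converter, call it on the split CoNLL line, and collect the returned values.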