def show_training_data(self):
    train = self.train
    dev = self.dev

    self.log('### Loaded data')
    self.log('# train: {} ... {}\n'.format(train.inputs[0][0], train.inputs[0][-1]))
    self.log('# train_gold: {} ... {}\n'.format(train.outputs[0][0], train.outputs[0][-1]))
    t2i_tmp = list(self.dic.tables[constants.UNIGRAM].str2id.items())
    self.log('# token2id: {} ... {}\n'.format(t2i_tmp[:10], t2i_tmp[-10:]))

    if self.dic.has_table(constants.BIGRAM):
        b2i_tmp = list(self.dic.tables[constants.BIGRAM].str2id.items())
        self.log('# bigram2id: {} ... {}\n'.format(b2i_tmp[:10], b2i_tmp[-10:]))
    if self.dic.has_trie(constants.CHUNK):
        id2chunk = self.dic.tries[constants.CHUNK].id2chunk
        n_chunks = len(self.dic.tries[constants.CHUNK])
        c2i_head = [(id2chunk[i], i) for i in range(0, min(10, n_chunks))]
        c2i_tail = [(id2chunk[i], i) for i in range(max(0, n_chunks - 10), n_chunks)]
        self.log('# chunk2id: {} ... {}\n'.format(c2i_head, c2i_tail))
    if self.dic.has_table(constants.SEG_LABEL):
        id2seg = {v: k for k, v in self.dic.tables[constants.SEG_LABEL].str2id.items()}
        self.log('# label_set: {}\n'.format(id2seg))

    attr_indexes = common.get_attribute_values(self.args.attr_indexes)
    for i in range(len(attr_indexes)):
        if self.dic.has_table(constants.ATTR_LABEL(i)):
            id2attr = {v: k for k, v in self.dic.tables[constants.ATTR_LABEL(i)].str2id.items()}
            self.log('# {}-th attribute labels: {}\n'.format(i, id2attr))

    self.report('[INFO] vocab: {}'.format(len(self.dic.tables[constants.UNIGRAM])))
    self.report('[INFO] data length: train={} devel={}'.format(
        len(train.inputs[0]), len(dev.inputs[0]) if dev else 0))
def show_training_data(self):
    train = self.train
    dev = self.dev

    self.log('### Loaded data')
    self.log('# train: {} ... {}\n'.format(train.inputs[0][0], train.inputs[0][-1]))
    self.log('# train_gold_attr: {} ... {}\n'.format(
        train.outputs[0][0], train.outputs[0][-1]))
    t2i_tmp = list(self.dic.tables[constants.UNIGRAM].str2id.items())
    self.log('# token2id: {} ... {}\n'.format(t2i_tmp[:10], t2i_tmp[-10:]))

    attr_indexes = common.get_attribute_values(self.args.attr_indexes)
    for i in range(len(attr_indexes)):
        if self.dic.has_table(constants.ATTR_LABEL(i)):
            id2attr = {v: k for k, v in self.dic.tables[constants.ATTR_LABEL(i)].str2id.items()}
            self.log('# {}-th attribute labels: {}\n'.format(i, id2attr))

    self.report('[INFO] vocab: {}'.format(len(self.dic.tables[constants.UNIGRAM])))
    self.report('[INFO] data length: train={} devel={}'.format(
        len(train.inputs[0]), len(dev.inputs[0]) if dev else 0))
def parse_commandline_input(self, line, dic):
    attr_delim = self.attr_delim if self.attr_delim else constants.SL_ATTR_DELIM

    get_unigram_id = dic.tables[constants.UNIGRAM].get_id
    if constants.ATTR_LABEL(1) in dic.tables:
        use_attr1 = True
        get_attr1_id = dic.tables[constants.ATTR_LABEL(1)].get_id
    else:
        use_attr1 = False
        get_attr1_id = None

    org_arr = line.split(' ')

    if use_attr1:
        org_attr1_seq = [
            self.preprocess_attribute(
                elem.split(attr_delim)[1] if attr_delim in elem else constants.UNK_SYMBOL,
                0,     # self.attr_depths[0]
                None,  # self.attr_target_labelsets[0]
            ) for elem in org_arr]
        org_attr1_seqs = [org_attr1_seq]
        attr1_seq = [get_attr1_id(attr) for attr in org_attr1_seq]
        attr1_seqs = [attr1_seq]
    else:
        org_attr1_seqs = []
        attr1_seqs = []

    org_token_seq = [elem.split(attr_delim)[0] for elem in org_arr]
    org_token_seqs = [org_token_seq]
    ptoken_seq = [self.preprocess_token(word) for word in org_token_seq]
    uni_seq = [get_unigram_id(word) for word in ptoken_seq]
    uni_seqs = [uni_seq]

    inputs = [uni_seqs, None, attr1_seqs]  # TODO fix
    outputs = []
    orgdata = [org_token_seqs, org_attr1_seqs]

    return RestorableData(inputs, outputs, orgdata=orgdata)
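# Illustrative sketch (not in the original source): the command-line input is
# one space-delimited sentence whose tokens may carry an attribute after
# attr_delim (constants.SL_ATTR_DELIM by default). Assuming '/' as the
# delimiter, a line such as
#
#     time/NN flies/VBZ
#
# yields org_token_seq == ['time', 'flies'] and, when the attribute table
# exists, org_attr1_seq == ['NN', 'VBZ'] before conversion to ids; tokens
# without a delimiter fall back to constants.UNK_SYMBOL for the attribute.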
def load_dic(self, dic_path):
    with open(dic_path, 'rb') as f:
        self.dic = pickle.load(f)
    self.log('Load dic: {}'.format(dic_path))
    self.log('Num of tokens: {}'.format(len(self.dic.tables[constants.UNIGRAM])))
    if self.dic.has_table(constants.BIGRAM):
        self.log('Num of bigrams: {}'.format(len(self.dic.tables[constants.BIGRAM])))
    if self.dic.has_trie(constants.CHUNK):
        self.log('Num of chunks: {}'.format(len(self.dic.tries[constants.CHUNK])))
    if self.dic.has_table(constants.SEG_LABEL):
        self.log('Num of segmentation labels: {}'.format(len(self.dic.tables[constants.SEG_LABEL])))
    for i in range(3):  # tmp
        if self.dic.has_table(constants.ATTR_LABEL(i)):
            self.log('Num of {}-th attribute labels: {}'.format(
                i, len(self.dic.tables[constants.ATTR_LABEL(i)])))
    if self.dic.has_table(constants.ARC_LABEL):
        self.log('Num of arc labels: {}'.format(len(self.dic.tables[constants.ARC_LABEL])))
    if self.dic.has_table(constants_sematt.SEM_LABEL):
        self.log('Num of sem labels: {}'.format(len(self.dic.tables[constants_sematt.SEM_LABEL])))
    self.log('')
def load_decode_data_SL(self, path, dic):
    attr_delim = self.attr_delim if self.attr_delim else constants.SL_ATTR_DELIM

    get_unigram_id = dic.tables[constants.UNIGRAM].get_id
    if constants.ATTR_LABEL(1) in dic.tables:
        use_attr1 = True
        get_attr1_id = dic.tables[constants.ATTR_LABEL(1)].get_id
    else:
        use_attr1 = False
        get_attr1_id = None

    org_token_seqs = []
    org_attr1_seqs = []
    token_seqs = []
    attr1_seqs = []

    ins_cnt = 0
    with open(path) as f:
        for line in f:
            line = self.normalize_input_line(line)
            if len(line) <= 1:
                continue
            elif line[0] == constants.COMMENT_SYM:
                continue

            org_arr = line.split(constants.SL_TOKEN_DELIM)

            if use_attr1:
                org_attr1_seq = [
                    self.preprocess_attribute(
                        elem.split(attr_delim)[1],
                        self.attr_depths[0],
                        self.attr_target_labelsets[0])
                    for elem in org_arr]
                org_attr1_seqs.append(org_attr1_seq)
                attr1_seq = [get_attr1_id(attr) for attr in org_attr1_seq]
                attr1_seqs.append(attr1_seq)

            org_token_seq = [elem.split(attr_delim)[0] for elem in org_arr]
            org_token_seqs.append(org_token_seq)
            ptoken_seq = [self.preprocess_token(token) for token in org_token_seq]
            token_seq = [get_unigram_id(ptoken, update=ptoken in self.unigram_vocab)
                         for ptoken in ptoken_seq]
            token_seqs.append(token_seq)

            ins_cnt += 1
            if ins_cnt % constants.NUM_FOR_REPORTING == 0:
                print('Read', ins_cnt, 'sentences', file=sys.stderr)

    inputs = [token_seqs]
    inputs.append(None)  # bigram
    inputs.append(attr1_seqs if attr1_seqs else None)
    outputs = []
    orgdata = [org_token_seqs, org_attr1_seqs]

    return RestorableData(inputs, outputs, orgdata=orgdata)
def setup_classifier(self):
    dic = self.dic
    hparams = self.hparams

    n_vocab = len(dic.tables['unigram'])
    unigram_embed_dim = hparams['unigram_embed_dim']

    if 'pretrained_unigram_embed_dim' in hparams and hparams['pretrained_unigram_embed_dim'] > 0:
        pretrained_unigram_embed_dim = hparams['pretrained_unigram_embed_dim']
    else:
        pretrained_unigram_embed_dim = 0

    if 'pretrained_embed_usage' in hparams:
        pretrained_embed_usage = models.util.ModelUsage.get_instance(hparams['pretrained_embed_usage'])
    else:
        pretrained_embed_usage = models.util.ModelUsage.NONE

    n_attr1 = len(dic.tables[constants.ATTR_LABEL(0)]) if (
        hparams['attr1_embed_dim'] > 0 and constants.ATTR_LABEL(0) in dic.tables) else 0
    n_labels = len(dic.tables[constants.ARC_LABEL]) if common.is_typed_parsing_task(self.task) else 0
    attr1_embed_dim = hparams['attr1_embed_dim'] if n_attr1 > 0 else 0

    if (pretrained_embed_usage == models.util.ModelUsage.ADD or
            pretrained_embed_usage == models.util.ModelUsage.INIT):
        if pretrained_unigram_embed_dim > 0 and pretrained_unigram_embed_dim != unigram_embed_dim:
            print('Error: pre-trained and randomly initialized unigram embedding vectors '
                  + 'must have the same dimension for {} operation'.format(hparams['pretrained_embed_usage'])
                  + ': d1={}, d2={}'.format(pretrained_unigram_embed_dim, unigram_embed_dim),
                  file=sys.stderr)
            sys.exit()

    predictor = models.parser.RNNBiaffineParser(
        n_vocab, unigram_embed_dim, n_attr1, attr1_embed_dim,
        hparams['rnn_unit_type'], hparams['rnn_bidirection'],
        hparams['rnn_n_layers'], hparams['rnn_n_units'],
        hparams['mlp4arcrep_n_layers'], hparams['mlp4arcrep_n_units'],
        hparams['mlp4labelrep_n_layers'], hparams['mlp4labelrep_n_units'],
        mlp4labelpred_n_layers=hparams['mlp4labelpred_n_layers'],
        mlp4labelpred_n_units=hparams['mlp4labelpred_n_units'],
        n_labels=n_labels,
        rnn_dropout=hparams['rnn_dropout'],
        hidden_mlp_dropout=hparams['hidden_mlp_dropout'],
        pred_layers_dropout=hparams['pred_layers_dropout'],
        pretrained_unigram_embed_dim=pretrained_unigram_embed_dim,
        pretrained_embed_usage=pretrained_embed_usage)

    self.classifier = classifiers.dependency_parser.DependencyParser(predictor)
def load_decode_data_SL(self, path, dic):
    attr_delim = self.attr_delim if self.attr_delim else constants.SL_ATTR_DELIM
    num_attrs = len(self.attr_indexes)
    word_clm = self.token_index

    get_unigram_id = dic.tables[constants.UNIGRAM].get_id
    get_attr_id = dic.tables[constants.ATTR_LABEL(0)].get_id if num_attrs > 0 else None
    root_token = constants.ROOT_SYMBOL

    org_token_seqs = []
    org_attr_seqs = []  # the second and later attributes are ignored
    token_seqs = []
    attr_seqs = []

    ins_cnt = 0
    with open(path) as f:
        for line in f:
            line = self.normalize_input_line(line)
            if len(line) <= 1:
                continue
            elif line[0] == constants.COMMENT_SYM:
                continue

            org_arr = line.split(constants.SL_TOKEN_DELIM)

            org_token_seq = [elem.split(attr_delim)[word_clm] for elem in org_arr]
            org_token_seq.insert(0, root_token)
            org_token_seqs.append(org_token_seq)
            ptoken_seq = [self.preprocess_token(token) for token in org_token_seq]
            token_seq = [get_unigram_id(ptoken, update=ptoken in self.unigram_vocab)
                         for ptoken in ptoken_seq]
            token_seqs.append(token_seq)

            if num_attrs > 0:
                org_attr_seq = [
                    self.preprocess_attribute(
                        elem.split(attr_delim)[self.attr_indexes[0]],
                        self.attr_depths[0],
                        self.attr_target_labelsets[0])
                    for elem in org_arr]
                org_attr_seq.insert(0, root_token)
                org_attr_seqs.append(org_attr_seq)
                attr_seq = [get_attr_id(attr) for attr in org_attr_seq]
                attr_seqs.append(attr_seq)

            ins_cnt += 1
            if ins_cnt % constants.NUM_FOR_REPORTING == 0:
                print('Read', ins_cnt, 'sentences', file=sys.stderr)

    inputs = [token_seqs, None]
    outputs = []
    outputs.append(attr_seqs if num_attrs > 0 else None)
    orgdata = [org_token_seqs]
    orgdata.append(org_attr_seqs if num_attrs > 0 else None)

    return RestorableData(inputs, outputs, orgdata=orgdata)
def run_epoch(self, data, train=True):
    classifier = self.classifier
    evaluator = self.evaluator

    inputs = self.gen_inputs(data)
    xs = inputs[0]
    n_sen = len(xs)
    golds = inputs[self.label_begin_index]

    if train:
        self.classifier.train(*inputs)
    ret = self.classifier.decode(*inputs[:self.label_begin_index])
    counts = self.evaluator.calculate(*[xs], *[golds], *[ret])

    if train:
        self.log('\n<training result>')
        res = evaluator.report_results(n_sen, counts, file=self.logger)
        self.report('train\t%s' % res)

        if self.args.devel_data:
            self.log('\n<development result>')
            v_res = self.run_epoch(self.dev, train=False)
            self.report('devel\t%s' % v_res)

        # save the model
        if not self.args.quiet:
            mdl_path = '{}/{}.pkl'.format(constants.MODEL_DIR, self.start_time)
            with open(mdl_path, 'wb') as f:
                pickle.dump(self.classifier, f)
            mdl_path_txt = '{}/{}.txt'.format(constants.MODEL_DIR, self.start_time)
            self.classifier.predictor.dump_model_as_txt(
                mdl_path_txt,
                self.dic.tables[constants_sematt.SEM_LABEL].id2str,
                self.dic.tables[constants.UNIGRAM].id2str,
                self.dic.tables[constants.ATTR_LABEL(0)].id2str)
            self.log('Save the model (binary): %s' % mdl_path)
            self.log('Save the model (text): %s' % mdl_path_txt)
            self.report('[INFO] Save the model (binary): %s\n' % mdl_path)
            self.report('[INFO] Save the model (text): %s\n' % mdl_path_txt)

        if not self.args.quiet:
            self.reporter.close()
            self.reporter = open('{}/{}.log'.format(constants.LOG_DIR, self.start_time), 'a')

    res = None if train else evaluator.report_results(n_sen, counts, file=self.logger)
    return res
def grow_embedding_layers(self, dic_grown, external_model=None, train=True):
    id2unigram_grown = dic_grown.tables[constants.UNIGRAM].id2str
    n_vocab_org = self.predictor.unigram_embed.W.shape[0]
    n_vocab_grown = len(id2unigram_grown)

    if (self.predictor.pretrained_embed_usage == models.util.ModelUsage.ADD or
            self.predictor.pretrained_embed_usage == models.util.ModelUsage.CONCAT):
        pretrained_unigram_embed = self.predictor.pretrained_unigram_embed
    else:
        pretrained_unigram_embed = None

    models.util.grow_embedding_layers(
        n_vocab_org, n_vocab_grown,
        self.predictor.unigram_embed, pretrained_unigram_embed,
        external_model, id2unigram_grown,
        self.predictor.pretrained_embed_usage, train=train)

    if constants.ATTR_LABEL(0) in dic_grown.tables:  # POS
        id2pos_grown = dic_grown.tables[constants.ATTR_LABEL(0)].id2str
        n_pos_org = self.predictor.pos_embed.W.shape[0]
        n_pos_grown = len(id2pos_grown)
        models.util.grow_embedding_layers(
            n_pos_org, n_pos_grown, self.predictor.pos_embed, train=train)
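# Illustrative sketch (assumptions: numpy only; external_model is dict-like,
# mapping word -> vector). This shows the general idea behind
# models.util.grow_embedding_layers, not its actual implementation: growing an
# embedding matrix appends one new row per newly registered vocabulary entry,
# copying a pre-trained vector when one is available.
def _grow_embedding_matrix_sketch(W, id2str_grown, external_model=None):
    import numpy as np
    n_org, dim = W.shape
    n_grown = len(id2str_grown)
    if n_grown <= n_org:
        return W
    # new rows: small random vectors for unseen words
    new_rows = np.random.normal(scale=0.1, size=(n_grown - n_org, dim)).astype(W.dtype)
    if external_model is not None:
        for i in range(n_org, n_grown):
            word = id2str_grown[i]
            if word in external_model:
                new_rows[i - n_org] = external_model[word]
    return np.concatenate([W, new_rows], axis=0)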
def load_model(self):
    model_path = self.args.model_path
    if model_path.endswith('.pkl'):
        model_format = 'pkl'
        array = model_path.split('.pkl')
    elif model_path.endswith('.txt'):
        model_format = 'txt'
        array = model_path.split('.txt')
    else:
        print('Error: invalid model format. The file name must end with \'.pkl\' or \'.txt\'.',
              file=sys.stderr)
        sys.exit()

    dic_path = '{}.s2i'.format(array[0])
    hparam_path = '{}.hyp'.format(array[0])
    param_path = model_path

    # dictionary
    self.load_dic(dic_path)

    # hyper parameters
    self.load_hyperparameters(hparam_path)
    self.log('Load hyperparameters: {}\n'.format(hparam_path))
    self.show_hyperparameters()

    # model
    if model_format == 'pkl':
        with open(model_path, 'rb') as f:
            self.classifier = pickle.load(f)
    elif model_format == 'txt':
        predictor = models.attribute_annotator.load_model_from_txt(
            model_path,
            self.dic.tables[constants_sematt.SEM_LABEL].str2id,
            self.dic.tables[constants.UNIGRAM].str2id,
            (self.dic.tables[constants.ATTR_LABEL(0)].str2id
             if self.dic.has_table(constants.ATTR_LABEL(0)) else None))
        self.classifier = classifiers.pattern_matcher.PatternMatcher(predictor)
    self.log('Load model: {}\n'.format(model_path))
def decode_batch(self, *inputs, org_tokens=None, org_attrs=None, file=sys.stdout):
    ys = self.classifier.decode(*inputs)
    id2label = (self.dic.tables[constants.SEG_LABEL
                                if common.is_segmentation_task(self.task)
                                else constants.ATTR_LABEL(0)].id2str)

    if not org_attrs:
        org_attrs = [None] * len(org_tokens)

    for x_str, a_str, y in zip(org_tokens, org_attrs, ys):
        y_str = [id2label[int(yi)] for yi in y]
        y_str = self.convert_to_valid_BIES_seq(y_str)

        if self.task == constants.TASK_TAG:
            if a_str:
                res = ['{}{}{}{}{}'.format(
                    xi_str, self.args.output_attr_delim, ai_str,
                    self.args.output_attr_delim, yi_str)
                       for xi_str, ai_str, yi_str in zip(x_str, a_str, y_str)]
            else:
                res = ['{}{}{}'.format(xi_str, self.args.output_attr_delim, yi_str)
                       for xi_str, yi_str in zip(x_str, y_str)]
            if self.args.output_data_format == 'wl':
                res.append('')
            res = self.args.output_token_delim.join(res)

        elif self.task == constants.TASK_SEG:
            res = ['{}{}'.format(
                xi_str,
                self.args.output_token_delim
                if (yi_str.startswith('E') or yi_str.startswith('S')) else '')
                   for xi_str, yi_str in zip(x_str, y_str)]
            res = ''.join(res).rstrip(' ')

        elif self.task == constants.TASK_SEGTAG:
            res = ['{}{}'.format(
                xi_str,
                (self.args.output_attr_delim + yi_str[2:] + self.args.output_token_delim)
                if (yi_str.startswith('E-') or yi_str.startswith('S-')) else '')
                   for xi_str, yi_str in zip(x_str, y_str)]
            res = ''.join(res).rstrip(' ')

        else:
            print('Error: Invalid decode type', file=self.logger)
            sys.exit()

        print(res, file=file)
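# Illustrative sketch (not part of the original code): for the SEG task the
# loop above emits a token boundary after every E(nd) or S(ingle) label of the
# BIES scheme. A self-contained rendering of that rule, assuming a space as
# the token delimiter:
def _join_bies_sketch(chars, labels, delim=' '):
    # chars:  ['全', '学', '生', 'が'] with labels ['B', 'E', 'S', 'S']
    # yields: '全学 生 が'
    out = ''.join(c + (delim if l.startswith(('E', 'S')) else '')
                  for c, l in zip(chars, labels))
    return out.rstrip(delim)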
def init_dictionary(num_attrs=0):
    dic = dictionary.Dictionary()

    # unigram
    dic.create_table(constants.UNIGRAM)
    dic.tables[constants.UNIGRAM].set_unk(constants.UNK_SYMBOL)

    # attributes
    for i in range(num_attrs):
        dic.create_table(constants.ATTR_LABEL(i))
        # dic.tables[constants.ATTR_LABEL(i)].set_unk(constants.UNK_SYMBOL)

    return dic
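# Illustrative usage sketch (assumes the repository's dictionary module; the
# exact id returned for unseen tokens depends on its get_id implementation):
#
#   dic = init_dictionary(num_attrs=1)
#   get_unigram_id = dic.tables[constants.UNIGRAM].get_id
#   wid = get_unigram_id('word', update=True)  # register 'word' and get its id
#   oid = get_unigram_id('unseen-word')        # falls back to UNK_SYMBOL's id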
def grow_inference_layers(self, dic_grown):
    n_labels_org = self.predictor.mlp.layers[-1].W.shape[0]
    if common.is_segmentation_task(self.task):
        n_labels_grown = len(dic_grown.tables[constants.SEG_LABEL].id2str)
    else:
        n_labels_grown = len(dic_grown.tables[constants.ATTR_LABEL(0)].id2str)

    models.util.grow_MLP(n_labels_org, n_labels_grown, self.predictor.mlp.layers[-1])

    if self.predictor.use_crf:
        models.util.grow_crf_layer(n_labels_org, n_labels_grown, self.predictor.crf)
def parse_commandline_input(self, line, dic):
    attr_delim = self.attr_delim if self.attr_delim else constants.SL_ATTR_DELIM
    num_attrs = len(self.attr_indexes)

    get_unigram_id = dic.tables[constants.UNIGRAM].get_id
    if constants.ATTR_LABEL(0) in dic.tables:
        use_attr0 = True
        get_attr0_id = dic.tables[constants.ATTR_LABEL(0)].get_id
    else:
        use_attr0 = False
        get_attr0_id = None

    org_arr = line.split(' ')

    if use_attr0:
        attr0_seq = [
            elem.split(attr_delim)[self.attr_indexes[0]] if attr_delim in elem else ''
            for elem in org_arr]
        org_attr0_seq = [
            self.preprocess_attribute(attr, self.attr_depths[0], self.attr_target_labelsets[0])
            for attr in attr0_seq]
        org_attr0_seqs = [org_attr0_seq]
        attr0_seq = [get_attr0_id(attr) for attr in org_attr0_seq]
        attr0_seqs = [attr0_seq]
    else:
        org_attr0_seqs = []
        attr0_seqs = []

    org_token_seq = [elem.split(attr_delim)[0] for elem in org_arr]
    org_token_seqs = [org_token_seq]
    ptoken_seq = [self.preprocess_token(word) for word in org_token_seq]
    uni_seq = [get_unigram_id(word) for word in ptoken_seq]
    uni_seqs = [uni_seq]

    inputs = [uni_seqs]
    outputs = [attr0_seqs]
    orgdata = [org_token_seqs, org_attr0_seqs]

    return RestorableData(inputs, outputs, orgdata=orgdata)
def setup_evaluator(self, evaluator=None):
    if self.task == constants.TASK_TAG and self.args.ignored_labels:  # TODO fix
        ignored_labels = set()
        for label in self.args.ignored_labels.split(','):
            label_id = self.dic.tables[constants.ATTR_LABEL(0)].get_id(label)
            if label_id >= 0:
                ignored_labels.add(label_id)
        self.log('Setup evaluator: labels to be ignored={}\n'.format(ignored_labels))
    else:
        ignored_labels = set()

    # TODO reflect ignored_labels
    evaluator1 = None
    if self.task == constants.TASK_SEG:
        if self.args.evaluation_method == 'normal':
            evaluator1 = FMeasureEvaluator(self.dic.tables[constants.SEG_LABEL].id2str)
        elif self.args.evaluation_method == 'each_length':
            evaluator1 = FMeasureEvaluatorForEachLength(self.dic.tables[constants.SEG_LABEL].id2str)
        elif self.args.evaluation_method == 'each_vocab':
            vocabs = self.gen_vocabs()
            evaluator1 = FMeasureEvaluatorForEachVocab(self.dic.tables[constants.SEG_LABEL].id2str, vocabs)
    elif self.task == constants.TASK_SEGTAG:
        evaluator1 = DoubleFMeasureEvaluator(self.dic.tables[constants.SEG_LABEL].id2str)
    elif self.task == constants.TASK_TAG:
        if common.use_fmeasure(self.dic.tables[constants.ATTR_LABEL(0)].str2id):
            evaluator1 = FMeasureEvaluator(self.dic.tables[constants.ATTR_LABEL(0)].id2str)
        else:
            evaluator1 = AccuracyEvaluator(self.dic.tables[constants.ATTR_LABEL(0)].id2str)

    evaluator1.calculator.id2token = self.dic.tables[constants.UNIGRAM].id2str  # tmp

    if not evaluator:
        self.evaluator = evaluator1
    else:
        evaluator = evaluator1  # note: rebinding the parameter has no effect on the caller
def setup_evaluator(self, evaluator=None):
    ignored_labels = set()
    if self.args.ignored_labels:
        for label in self.args.ignored_labels.split(','):
            label_id = self.dic.tables[constants.ATTR_LABEL(0)].get_id(label)
            if label_id >= 0:
                ignored_labels.add(label_id)
        self.args.ignored_labels = ignored_labels
        self.log('Setup evaluator: labels to be ignored={}\n'.format(ignored_labels))

    self.evaluator = AccuracyEvaluator(ignore_head=False, ignored_labels=ignored_labels)
def init_dictionary(num_attrs=0, use_arc_label=False):
    dic = dictionary.Dictionary()

    # unigram
    dic.create_table(constants.UNIGRAM)
    dic.tables[constants.UNIGRAM].set_unk(constants.UNK_SYMBOL)
    dic.tables[constants.UNIGRAM].get_id(constants.ROOT_SYMBOL, update=True)

    # attributes
    for i in range(num_attrs):
        dic.create_table(constants.ATTR_LABEL(i))
        # dic.tables[constants.ATTR_LABEL(i)].set_unk(constants.UNK_SYMBOL)

    # arc label
    if use_arc_label:
        dic.create_table(constants.ARC_LABEL)

    return dic
def load_gold_data_WL(self, path, dic, train=True):
    attr_delim = self.attr_delim if self.attr_delim else constants.WL_ATTR_DELIM
    num_attrs = len(self.attr_indexes)
    word_clm = self.token_index

    if not dic:
        dic = init_dictionary(num_attrs=num_attrs)

    get_unigram_id = dic.tables[constants.UNIGRAM].get_id
    get_ith_attr_id = []
    for i in range(num_attrs):
        get_ith_attr_id.append(dic.tables[constants.ATTR_LABEL(i)].get_id)

    token_seqs = []
    attr_seqs_list = [[] for i in range(num_attrs)]

    ins_cnt = 0
    with open(path) as f:
        uni_seq = []
        attr_seq_list = [[] for i in range(num_attrs)]

        for line in f:
            line = self.normalize_input_line(line)
            if len(line) == 0:
                if len(uni_seq) > 0:
                    token_seqs.append(uni_seq)
                    uni_seq = []
                    for i, attr_seq in enumerate(attr_seq_list):
                        if self.attr_chunking_flags[i]:
                            attr_seq = [get_ith_attr_id[i](attr, update=train)
                                        for attr in data_loader.get_labelseq_BIOES(attr_seq)]
                        attr_seqs_list[i].append(attr_seq)
                    attr_seq_list = [[] for i in range(num_attrs)]

                    ins_cnt += 1
                    if ins_cnt % constants.NUM_FOR_REPORTING == 0:
                        print('Read', ins_cnt, 'sentences', file=sys.stderr)
                continue
            elif line[0] == constants.COMMENT_SYM:
                continue

            array = line.split(attr_delim)
            token = self.preprocess_token(array[word_clm])

            attrs = [None] * max(num_attrs, 1)
            for i in range(num_attrs):
                attr = array[self.attr_indexes[i]] if len(array) > self.attr_indexes[i] else ''
                attrs[i] = self.preprocess_attribute(
                    attr, self.attr_depths[i], self.attr_target_labelsets[i])

            update_token = self.to_be_registered(token, train, self.freq_tokens, self.unigram_vocab)
            uni_seq.append(get_unigram_id(token, update=update_token))

            for i in range(num_attrs):
                attr = attrs[i]
                attr_tmp = attr if self.attr_chunking_flags[i] else get_ith_attr_id[i](attr, update=train)
                attr_seq_list[i].append(attr_tmp)

        # register the last sentence
        if len(uni_seq) > 0:
            token_seqs.append(uni_seq)
            for i, attr_seq in enumerate(attr_seq_list):
                if self.attr_chunking_flags[i]:
                    attr_seq = [get_ith_attr_id[i](attr, update=train)
                                for attr in data_loader.get_labelseq_BIOES(attr_seq)]
                attr_seqs_list[i].append(attr_seq)

    inputs = [token_seqs]
    inputs.append(None)  # bigram
    inputs.append(attr_seqs_list[1] if len(attr_seqs_list) > 1 else None)
    outputs = []
    if len(attr_seqs_list) > 0:
        outputs.append(attr_seqs_list[0])

    return Data(inputs, outputs), dic
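# Illustrative sketch (not in the original source): the WL ("word line") format
# read above has one token per line with attribute columns separated by
# attr_delim, and a blank line between sentences. Assuming a tab delimiter,
# token_index = 0 and attr_indexes = [1], a two-sentence file looks like:
#
#   time<TAB>NN
#   flies<TAB>VBZ
#
#   it<TAB>PRP
#   works<TAB>VBZ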
def load_decode_data_WL(self, path, dic):
    attr_delim = self.attr_delim if self.attr_delim else constants.WL_ATTR_DELIM
    num_attrs = len(self.attr_indexes)
    word_clm = self.token_index

    get_unigram_id = dic.tables[constants.UNIGRAM].get_id
    get_attr_id = dic.tables[constants.ATTR_LABEL(0)].get_id if num_attrs > 0 else None
    root_token = constants.ROOT_SYMBOL

    org_token_seqs = []
    org_attr_seqs = []
    token_seqs = []
    attr_seqs = []

    ins_cnt = 0
    with open(path) as f:
        org_token_seq = [root_token]
        org_attr_seq = [root_token]
        token_seq = [get_unigram_id(root_token)]

        for line in f:
            line = self.normalize_input_line(line)
            if len(line) == 0:
                if len(token_seq) > 1:  # the sequence always starts with ROOT
                    org_token_seqs.append(org_token_seq)
                    org_token_seq = [root_token]
                    token_seqs.append(token_seq)
                    token_seq = [get_unigram_id(root_token)]

                    if num_attrs > 0:
                        if self.attr_chunking_flags[0]:
                            # chunk the raw attribute strings
                            org_attr_seq = [attr for attr in data_loader.get_labelseq_BIOES(org_attr_seq)]
                        org_attr_seqs.append(org_attr_seq)
                        attr_seq = [get_attr_id(attr) for attr in org_attr_seq]
                        attr_seqs.append(attr_seq)
                        org_attr_seq = [root_token]

                    ins_cnt += 1
                    if ins_cnt % constants.NUM_FOR_REPORTING == 0:
                        print('Read', ins_cnt, 'sentences', file=sys.stderr)
                continue
            elif line[0] == constants.COMMENT_SYM:
                continue

            array = line.split(attr_delim)
            org_token = array[word_clm]
            org_token_seq.append(org_token)
            ptoken = self.preprocess_token(org_token)
            token_seq.append(get_unigram_id(ptoken, update=ptoken in self.unigram_vocab))

            if num_attrs > 0:
                attr = self.preprocess_attribute(
                    array[self.attr_indexes[0]],
                    self.attr_depths[0],
                    self.attr_target_labelsets[0])
                org_attr_seq.append(attr)

        # register the last sentence
        if len(token_seq) > 1:  # the initialized sequence already contains ROOT
            org_token_seqs.append(org_token_seq)
            token_seqs.append(token_seq)
            if num_attrs > 0:
                if self.attr_chunking_flags[0]:
                    org_attr_seq = [attr for attr in data_loader.get_labelseq_BIOES(org_attr_seq)]
                org_attr_seqs.append(org_attr_seq)
                attr_seq = [get_attr_id(attr) for attr in org_attr_seq]
                attr_seqs.append(attr_seq)

    inputs = [token_seqs, None]
    outputs = []
    outputs.append(attr_seqs if num_attrs > 0 else None)
    orgdata = [org_token_seqs]
    orgdata.append(org_attr_seqs if num_attrs > 0 else None)

    return RestorableData(inputs, outputs, orgdata=orgdata)
def load_gold_data_WL(self, path, dic, train=True):
    attr_delim = self.attr_delim if self.attr_delim else constants.WL_ATTR_DELIM
    num_attrs = len(self.attr_indexes)
    word_clm = self.token_index
    head_clm = self.head_index
    arc_clm = self.arc_index

    if not dic:
        dic = init_dictionary(num_attrs=num_attrs, use_arc_label=self.use_arc_label)

    get_unigram_id = dic.tables[constants.UNIGRAM].get_id
    get_arc_id = dic.tables[constants.ARC_LABEL].get_id if self.use_arc_label else None
    get_ith_attr_id = []
    for i in range(num_attrs):
        get_ith_attr_id.append(dic.tables[constants.ATTR_LABEL(i)].get_id)

    token_seqs = []
    head_seqs = []  # list of head id sequences
    arc_seqs = []   # list of arc label sequences
    attr_seqs_list = [[] for i in range(num_attrs)]

    ins_cnt = 0
    sen_len_th = 3  # ROOT + more than two tokens

    with open(path) as f:
        uni_seq = [get_unigram_id(constants.ROOT_SYMBOL)]
        head_seq = [constants.NO_PARENTS_ID]
        arc_seq = [constants.NO_PARENTS_ID] if self.use_arc_label else None
        attr_seq_list = [[get_ith_attr_id[i](constants.ROOT_SYMBOL, update=train)]
                         for i in range(num_attrs)]

        for line in f:
            line = self.normalize_input_line(line)
            if len(line) == 0:
                if len(uni_seq) >= sen_len_th:
                    token_seqs.append(uni_seq)
                    uni_seq = [get_unigram_id(constants.ROOT_SYMBOL)]
                    head_seqs.append(head_seq)
                    head_seq = [constants.NO_PARENTS_ID]
                    if self.use_arc_label:
                        arc_seqs.append(arc_seq)
                        arc_seq = [constants.NO_PARENTS_ID]
                    for i, attr_seq in enumerate(attr_seq_list):
                        if self.attr_chunking_flags[i]:  # TODO fix code for ROOT
                            attr_seq = [get_ith_attr_id[i](attr, update=train)
                                        for attr in data_loader.get_labelseq_BIOES(attr_seq)]
                        attr_seqs_list[i].append(attr_seq)
                    attr_seq_list = [[get_ith_attr_id[i](constants.ROOT_SYMBOL)]
                                     for i in range(num_attrs)]

                    ins_cnt += 1
                    if ins_cnt % constants.NUM_FOR_REPORTING == 0:
                        print('Read', ins_cnt, 'sentences', file=sys.stderr)
                continue
            elif line[0] == constants.COMMENT_SYM:
                continue

            array = line.split(attr_delim)
            token = self.preprocess_token(array[word_clm])

            attrs = [None] * max(num_attrs, 1)
            for i in range(num_attrs):
                org_attr = array[self.attr_indexes[i]] if self.attr_indexes[i] < len(array) else constants.UNK_SYMBOL
                attrs[i] = self.preprocess_attribute(
                    org_attr, self.attr_depths[i], self.attr_target_labelsets[i])
                attr_tmp = attrs[i] if self.attr_chunking_flags[i] else get_ith_attr_id[i](attrs[i], update=train)
                attr_seq_list[i].append(attr_tmp)

            update_token = self.to_be_registered(token, train, self.freq_tokens, self.unigram_vocab)
            uni_seq.append(get_unigram_id(token, update=update_token))

            head = int(array[head_clm])
            if head < 0:
                head = 0
            head_seq.append(head)

            if self.use_arc_label:
                arc = array[arc_clm]
                arc_seq.append(get_arc_id(arc, update=train))

        # register the last sentence
        if len(uni_seq) >= sen_len_th:
            token_seqs.append(uni_seq)
            head_seqs.append(head_seq)
            if self.use_arc_label:
                arc_seqs.append(arc_seq)
            for i, attr_seq in enumerate(attr_seq_list):
                if self.attr_chunking_flags[i]:
                    attr_seq = [get_ith_attr_id[i](attr, update=train)
                                for attr in data_loader.get_labelseq_BIOES(attr_seq)]
                attr_seqs_list[i].append(attr_seq)

    inputs = [token_seqs]
    inputs.append(attr_seqs_list[1] if len(attr_seqs_list) > 1 else None)
    outputs = []
    outputs.append(attr_seqs_list[0] if len(attr_seqs_list) > 0 else None)
    outputs.append(head_seqs)
    if self.use_arc_label:
        outputs.append(arc_seqs)

    return Data(inputs, outputs), dic
def setup_classifier(self):
    dic = self.dic
    hparams = self.hparams

    n_vocab = len(dic.tables['unigram'])
    unigram_embed_dim = hparams['unigram_embed_dim']

    if 'bigram_embed_dim' in hparams and hparams['bigram_embed_dim'] > 0:
        bigram_embed_dim = hparams['bigram_embed_dim']
        n_bigrams = len(dic.tables[constants.BIGRAM])
    else:
        bigram_embed_dim = n_bigrams = 0

    if 'pretrained_unigram_embed_dim' in hparams and hparams['pretrained_unigram_embed_dim'] > 0:
        pretrained_unigram_embed_dim = hparams['pretrained_unigram_embed_dim']
    else:
        pretrained_unigram_embed_dim = 0

    if 'pretrained_bigram_embed_dim' in hparams and hparams['pretrained_bigram_embed_dim'] > 0:
        pretrained_bigram_embed_dim = hparams['pretrained_bigram_embed_dim']
    else:
        pretrained_bigram_embed_dim = 0

    if 'pretrained_embed_usage' in hparams:
        pretrained_embed_usage = models.util.ModelUsage.get_instance(hparams['pretrained_embed_usage'])
    else:
        pretrained_embed_usage = models.util.ModelUsage.NONE

    if common.is_segmentation_task(self.task):
        n_label = len(dic.tables[constants.SEG_LABEL])
        n_labels = [n_label]
        attr1_embed_dim = n_attr1 = 0
    else:
        n_labels = []
        for i in range(3):  # tmp
            if constants.ATTR_LABEL(i) in dic.tables:
                n_label = len(dic.tables[constants.ATTR_LABEL(i)])
                n_labels.append(n_label)
        if 'attr1_embed_dim' in hparams and hparams['attr1_embed_dim'] > 0:
            attr1_embed_dim = hparams['attr1_embed_dim']
            n_attr1 = n_labels[1] if len(n_labels) > 1 else 0
        else:
            attr1_embed_dim = n_attr1 = 0

    if (pretrained_embed_usage == models.util.ModelUsage.ADD or
            pretrained_embed_usage == models.util.ModelUsage.INIT):
        if pretrained_unigram_embed_dim > 0 and pretrained_unigram_embed_dim != unigram_embed_dim:
            print('Error: pre-trained and randomly initialized unigram embedding vectors '
                  + 'must have the same dimension for {} operation'.format(hparams['pretrained_embed_usage'])
                  + ': d1={}, d2={}'.format(pretrained_unigram_embed_dim, unigram_embed_dim),
                  file=sys.stderr)
            sys.exit()
        if pretrained_bigram_embed_dim > 0 and pretrained_bigram_embed_dim != bigram_embed_dim:
            print('Error: pre-trained and randomly initialized bigram embedding vectors '
                  + 'must have the same dimension for {} operation'.format(hparams['pretrained_embed_usage'])
                  + ': d1={}, d2={}'.format(pretrained_bigram_embed_dim, bigram_embed_dim),
                  file=sys.stderr)
            sys.exit()

    predictor = models.tagger.construct_RNNTagger(
        n_vocab, unigram_embed_dim, n_bigrams, bigram_embed_dim,
        n_attr1, attr1_embed_dim, 0, 0,
        hparams['rnn_unit_type'], hparams['rnn_bidirection'],
        hparams['rnn_n_layers'], hparams['rnn_n_units'],
        hparams['rnn_n_layers2'] if 'rnn_n_layers2' in hparams else 0,
        hparams['rnn_n_units2'] if 'rnn_n_units2' in hparams else 0,
        hparams['mlp_n_layers'], hparams['mlp_n_units'], n_labels[0],
        use_crf=hparams['inference_layer'] == 'crf',
        feat_dim=hparams['additional_feat_dim'],
        mlp_n_additional_units=0,
        rnn_dropout=hparams['rnn_dropout'],
        embed_dropout=hparams['embed_dropout'] if 'embed_dropout' in hparams else 0.0,
        mlp_dropout=hparams['mlp_dropout'],
        pretrained_unigram_embed_dim=pretrained_unigram_embed_dim,
        pretrained_bigram_embed_dim=pretrained_bigram_embed_dim,
        pretrained_embed_usage=pretrained_embed_usage)

    self.classifier = classifiers.sequence_tagger.SequenceTagger(predictor, task=self.task)
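# Illustrative sketch (the values below are made-up examples, not repository
# defaults): the minimal hparams dictionary consumed by setup_classifier above.
#
#   hparams = {
#       'unigram_embed_dim': 300,
#       'rnn_unit_type': 'lstm',
#       'rnn_bidirection': True,
#       'rnn_n_layers': 2,
#       'rnn_n_units': 600,
#       'mlp_n_layers': 1,
#       'mlp_n_units': 300,
#       'inference_layer': 'crf',
#       'additional_feat_dim': 0,
#       'rnn_dropout': 0.4,
#       'mlp_dropout': 0.4,
#   }
#
# Optional keys ('bigram_embed_dim', 'attr1_embed_dim', 'rnn_n_layers2',
# 'rnn_n_units2', 'embed_dropout', 'pretrained_unigram_embed_dim',
# 'pretrained_bigram_embed_dim', 'pretrained_embed_usage') default to
# 0 / 0.0 / ModelUsage.NONE when absent.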
def load_gold_data_WL(self, path, dic, train=True):
    attr_delim = self.attr_delim if self.attr_delim else constants.WL_ATTR_DELIM
    num_attrs = len(self.attr_indexes)

    if not dic:
        dic = init_dictionary(num_attrs=num_attrs)

    get_unigram_id = dic.tables[constants.UNIGRAM].get_id
    get_label_id = dic.tables[constants_sematt.SEM_LABEL].get_id
    get_ith_attr_id = []
    for i in range(num_attrs):
        get_ith_attr_id.append(dic.tables[constants.ATTR_LABEL(i)].get_id)

    token_seqs = []
    label_seqs = []  # list of semantic attribute sequences
    attr_seqs_list = [[] for i in range(num_attrs)]

    ins_cnt = 0
    word_clm = self.token_index
    label_clm = self.label_index

    with open(path) as f:
        uni_seq = []
        label_seq = []
        attr_seq_list = [[] for i in range(num_attrs)]

        for line in f:
            line = self.normalize_input_line(line)
            if len(line) == 0:
                if len(uni_seq) > 0:
                    token_seqs.append(uni_seq)
                    uni_seq = []
                    label_seqs.append(label_seq)
                    label_seq = []
                    for i, attr_seq in enumerate(attr_seq_list):
                        if self.attr_chunking_flags[i]:
                            attr_seq = [get_ith_attr_id[i](attr, update=train)
                                        for attr in data_loader.get_labelseq_BIOES(attr_seq)]
                        attr_seqs_list[i].append(attr_seq)
                    attr_seq_list = [[] for i in range(num_attrs)]

                    ins_cnt += 1
                    if ins_cnt % constants.NUM_FOR_REPORTING == 0:
                        print('Read', ins_cnt, 'sentences', file=sys.stderr)
                continue
            elif line[0] == constants.COMMENT_SYM:
                continue

            array = line.split(attr_delim)
            if len(array) < 2 + num_attrs:
                continue

            token = self.preprocess_token(array[word_clm])

            attrs = [None] * max(num_attrs, 1)
            for i in range(num_attrs):
                attrs[i] = self.preprocess_attribute(
                    array[self.attr_indexes[i]], self.attr_depths[i], self.attr_target_labelsets[i])
                attr_tmp = attrs[i] if self.attr_chunking_flags[i] else get_ith_attr_id[i](attrs[i], update=train)
                attr_seq_list[i].append(attr_tmp)

            update_token = self.to_be_registered(token, train)
            uni_seq.append(get_unigram_id(token, update=update_token))

            label = array[label_clm] if len(array) > label_clm else constants.NONE_SYMBOL
            if label == '':
                label = constants.NONE_SYMBOL
            if DELIM in label:
                labels = label.split(DELIM)
                label = labels[0]
            if COLON in label:
                # ':' is used as a special character when reading/writing a txt-format model
                label = label.replace(COLON, COLON_ALT)
            label_seq.append(get_label_id(label, update=train))

        # register the last sentence
        if len(uni_seq) > 0:
            token_seqs.append(uni_seq)
            label_seqs.append(label_seq)
            for i, attr_seq in enumerate(attr_seq_list):
                if self.attr_chunking_flags[i]:
                    attr_seq = [get_ith_attr_id[i](attr, update=train)
                                for attr in data_loader.get_labelseq_BIOES(attr_seq)]
                attr_seqs_list[i].append(attr_seq)

    inputs = [token_seqs]
    inputs.append(attr_seqs_list[0] if len(attr_seqs_list) > 0 else None)
    outputs = [label_seqs]

    return Data(inputs, outputs), dic