def make_data(self, trainfilename, maxseqlen=None, maxclauselen=None,
              label_ind=None, train=False):
    use_attention = self.params["use_attention"]
    batch_size = self.params["batch_size"]
    str_seqs, label_seqs = read_passages(trainfilename, is_labeled=train)
    print("Filtering data")
    str_seqs = clean_words(str_seqs)
    label_seqs = to_BIO(label_seqs)
    if not label_ind:
        self.label_ind = {"none": 0}
    else:
        self.label_ind = label_ind
    seq_lengths = [len(seq) for seq in str_seqs]
    if self.maxseqlen is None:
        if maxseqlen:
            self.maxseqlen = maxseqlen
        elif self.params["maxseqlen"] is not None:
            self.maxseqlen = self.params["maxseqlen"]
        else:
            self.maxseqlen = max(seq_lengths)
    if self.maxclauselen is None:
        if maxclauselen:
            self.maxclauselen = maxclauselen
        elif self.params["maxclauselen"] is not None:
            self.maxclauselen = self.params["maxclauselen"]
        elif use_attention:
            sentence_lens = []
            for str_seq in str_seqs:
                for seq in str_seq:
                    tokens = self.tokenizer.tokenize(seq.lower())
                    sentence_lens.append(len(tokens))
            self.maxclauselen = np.round(
                np.mean(sentence_lens) + 3 * np.std(sentence_lens)).astype(int)
    if len(self.label_ind) <= 1:
        for str_seq, label_seq in zip(str_seqs, label_seqs):
            for label in label_seq:
                if label not in self.label_ind:
                    # Add new labels with values 0, 1, 2, ...
                    self.label_ind[label] = len(self.label_ind)
    self.rev_label_ind = {i: l for (l, i) in self.label_ind.items()}
    discourse_generator = BertDiscourseGenerator(
        self.bert, self.tokenizer, str_seqs, label_seqs, self.label_ind,
        batch_size, use_attention, self.maxseqlen, self.maxclauselen, train)
    return seq_lengths, discourse_generator
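# A minimal sketch (not from the repo) of the clause-length cutoff used
# above: when no explicit maxclauselen is available, it is derived from the
# distribution of tokenized clause lengths as mean + 3 standard deviations,
# so only extreme outliers get truncated. `toy_lengths` is illustrative data.
import numpy as np

toy_lengths = np.array([12, 18, 25, 9, 40, 22, 17, 31])
cutoff = np.round(np.mean(toy_lengths) + 3 * np.std(toy_lengths)).astype(int)
print(cutoff)  # clauses longer than this many tokens are truncated downstream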
def make_data(self, trainfilename, maxseqlen=None, maxclauselen=None,
              label_ind=None, train=False):
    use_attention = self.params["use_attention"]
    # Note: the params dict overrides the maxseqlen/maxclauselen arguments.
    maxseqlen = self.params["maxseqlen"]
    maxclauselen = self.params["maxclauselen"]
    batch_size = self.params["batch_size"]
    str_seqs, label_seqs = read_passages(trainfilename, is_labeled=train)
    print("Filtering data")
    str_seqs = clean_words(str_seqs)
    label_seqs = to_BIO(label_seqs)
    if not label_ind:
        self.label_ind = {"none": 0}
    else:
        self.label_ind = label_ind
    seq_lengths = [len(seq) for seq in str_seqs]
    if not maxseqlen:
        maxseqlen = max(seq_lengths)
    if not maxclauselen:
        if use_attention:
            clauselens = []
            for str_seq in str_seqs:
                clauselens.extend(
                    [len(clause.split()) for clause in str_seq])
            maxclauselen = np.round(
                np.mean(clauselens) + 3 * np.std(clauselens)).astype(int)
    # X, Y, Y_inds and init_word_rep_len are unused in this generator-based
    # variant.
    X = []
    Y = []
    Y_inds = []
    init_word_rep_len = len(self.rep_reader.word_rep)  # Vocab size
    if len(self.label_ind) <= 1:
        for str_seq, label_seq in zip(str_seqs, label_seqs):
            for label in label_seq:
                if label not in self.label_ind:
                    # Add new labels with values 0, 1, 2, ...
                    self.label_ind[label] = len(self.label_ind)
    self.rev_label_ind = {i: l for (l, i) in self.label_ind.items()}
    discourse_generator = DiscourseGenerator(
        self.rep_reader, str_seqs, label_seqs, self.label_ind, batch_size,
        use_attention, maxseqlen, maxclauselen, train, self.input_size)
    self.maxseqlen = maxseqlen
    self.maxclauselen = maxclauselen
    return seq_lengths, discourse_generator
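# to_BIO / from_BIO are called above but not defined in this section. The
# sketch below shows what a typical BIO round-trip looks like; the repo's
# actual implementation may treat the "none" label specially. `to_bio_sketch`
# and `from_bio_sketch` are hypothetical helpers, not the repo's functions.
def to_bio_sketch(label_seqs):
    out = []
    for seq in label_seqs:
        bio, prev = [], None
        for label in seq:
            bio.append(("I-" if label == prev else "B-") + label)
            prev = label
        out.append(bio)
    return out

def from_bio_sketch(label_seqs):
    # Strip the B-/I- prefix, keeping the raw label.
    return [[label.split("-", 1)[-1] for label in seq] for seq in label_seqs]

print(to_bio_sketch([["method", "method", "result"]]))
# [['B-method', 'I-method', 'B-result']]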
def make_data(self, clauses, use_attention, maxseqlen=None,
              maxclauselen=None, label_ind=None, train=False):
    print >> sys.stderr, "Reading data.."
    str_seqs, label_seqs = read_passages(clauses, is_labeled=train)
    print >> sys.stderr, "Sample data for train:" if train else "Sample data for test:"
    print >> sys.stderr, zip(str_seqs[0], label_seqs[0])
    if not label_ind:
        self.label_ind = {"none": 0}
    else:
        self.label_ind = label_ind
    seq_lengths = [len(seq) for seq in str_seqs]
    if not maxseqlen:
        maxseqlen = max(seq_lengths)
    if not maxclauselen:
        if use_attention:
            clauselens = []
            for str_seq in str_seqs:
                clauselens.extend(
                    [len(clause.split()) for clause in str_seq])
            maxclauselen = max(clauselens)
    X = []
    Y = []
    Y_inds = []
    #init_word_rep_len = len(self.rep_reader.word_rep)
    all_word_types = set([])
    for str_seq, label_seq in zip(str_seqs, label_seqs):
        for label in label_seq:
            if label not in self.label_ind:
                self.label_ind[label] = len(self.label_ind)
        if use_attention:
            x = numpy.zeros((maxseqlen, maxclauselen, self.input_size))
        else:
            x = numpy.zeros((maxseqlen, self.input_size))
        y_ind = numpy.zeros(maxseqlen)
        seq_len = len(str_seq)
        # The following conditional is true only when we've already trained,
        # and one of the sequences in the test set is longer than the longest
        # sequence in training.
        if seq_len > maxseqlen:
            str_seq = str_seq[:maxseqlen]
            seq_len = maxseqlen
        if train:
            for i, (clause, label) in enumerate(zip(str_seq, label_seq)):
                clause_rep = self.rep_reader.get_clause_rep(clause)
                for word in clause.split():
                    all_word_types.add(word)
                if use_attention:
                    if len(clause_rep) > maxclauselen:
                        clause_rep = clause_rep[:maxclauselen]
                    x[-seq_len + i][-len(clause_rep):] = clause_rep
                else:
                    x[-seq_len + i] = numpy.mean(clause_rep, axis=0)
                y_ind[-seq_len + i] = self.label_ind[label]
            X.append(x)
            Y_inds.append(y_ind)
        else:
            for i, clause in enumerate(str_seq):
                clause_rep = self.rep_reader.get_clause_rep(clause)
                for word in clause.split():
                    all_word_types.add(word)
                if use_attention:
                    if len(clause_rep) > maxclauselen:
                        clause_rep = clause_rep[:maxclauselen]
                    x[-seq_len + i][-len(clause_rep):] = clause_rep
                else:
                    x[-seq_len + i] = numpy.mean(clause_rep, axis=0)
            X.append(x)
    final_word_rep_len = len(self.rep_reader.word_rep)
    #oov_ratio = float(final_word_rep_len - init_word_rep_len)/len(all_word_types)
    #print >>sys.stderr, "OOV ratio: %f" % oov_ratio
    for y_ind in Y_inds:
        y = numpy.zeros((maxseqlen, len(self.label_ind)))
        for i, y_ind_i in enumerate(y_ind):
            y[i][int(y_ind_i)] = 1  # cast: y_ind holds floats
        Y.append(y)
    self.rev_label_ind = {i: l for (l, i) in self.label_ind.items()}
    return seq_lengths, numpy.asarray(X), numpy.asarray(Y)
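# The indexing pattern x[-seq_len + i][-len(clause_rep):] = clause_rep used
# above right-aligns real data inside a zero tensor, i.e. sequences are
# pre-padded with zeros at the front (both clause rows and word columns).
# A toy illustration of that alignment:
import numpy

x = numpy.zeros((4, 3))                            # maxseqlen=4, input_size=3
clause_reps = [numpy.ones(3), numpy.full(3, 2.0)]  # a 2-clause paragraph
for i, rep in enumerate(clause_reps):
    x[-len(clause_reps) + i] = rep
print(x)  # rows 0-1 stay zero (padding); rows 2-3 hold the two clauses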
clauses = codecs.open(test_file, "r", "utf-8")
test_seq_lengths, X_test, _ = nnt.make_data(
    clauses, use_attention, maxseqlen=maxseqlen,
    maxclauselen=maxclauselen, label_ind=label_ind, train=False)
print >> sys.stderr, "X_test shape:", X_test.shape
pred_probs, pred_label_seqs, _ = nnt.predict(
    X_test, bid, test_seq_lengths)
if show_att:
    att_weights = nnt.get_attention_weights(X_test.astype('float32'))
    clause_seqs, _ = read_passages(test_file, is_labeled=True)
    paralens = [[len(clause.split()) for clause in seq]
                for seq in clause_seqs]
    for clauselens, sample_att_weights, pred_label_seq in zip(
            paralens, att_weights, pred_label_seqs):
        for clauselen, clause_weights, pred_label in zip(
                clauselens, sample_att_weights[-len(clauselens):],
                pred_label_seq):
            print >> outfile, pred_label, " ".join(
                ["%.4f" % val for val in clause_weights[-clauselen:]])
        print >> outfile
else:
    for pred_label_seq in pred_label_seqs:
        for pred_label in pred_label_seq:
            print >> outfile, pred_label
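# Because inputs are pre-padded at the front, the meaningful attention rows
# and columns sit at the END of each weight matrix; the negative slices above
# (sample_att_weights[-len(clauselens):] and clause_weights[-clauselen:])
# strip that padding. A toy illustration:
import numpy as np

maxseqlen, maxclauselen = 5, 6
att = np.arange(maxseqlen * maxclauselen, dtype=float).reshape(
    maxseqlen, maxclauselen)
n_clauses, clause_len = 2, 3     # real paragraph: 2 clauses of 3 words each
for row in att[-n_clauses:]:     # drop the padded clause rows
    print(row[-clause_len:])     # drop the padded word columns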
args = argparser.parse_args()
if args.train_file:
    trainfile = args.train_file
    train = True
else:
    train = False
if args.test_files:
    testfiles = args.test_files
    test = True
else:
    test = False
p_count = 0
c_count = 0
if train:
    clauses = codecs.open(trainfile, "r", "utf-8")
    str_seqs, label_seqs = read_passages(clauses, is_labeled=True)
    p_count += len(str_seqs)
    c_count += len(list(itertools.chain(*label_seqs)))
if test:
    for testfile in testfiles:
        clauses = codecs.open(testfile, "r", "utf-8")
        str_seqs, label_seqs = read_passages(clauses, is_labeled=True)
        p_count += len(str_seqs)
        c_count += len(list(itertools.chain(*label_seqs)))
print("Paragraph Count: %d" % p_count)
print("Clause Count: %d" % c_count)
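# read_passages is used throughout but not shown here. Judging by how its
# output is consumed (parallel lists of clause strings and labels per
# paragraph), the input format appears to be one clause per line, a
# tab-separated label when is_labeled=True, and a blank line between
# paragraphs. `read_passages_sketch` is a hypothetical minimal reader under
# that assumption, not the repo's implementation.
def read_passages_sketch(lines, is_labeled=True):
    str_seqs, label_seqs = [], []
    str_seq, label_seq = [], []
    for line in lines:
        line = line.strip()
        if not line:  # blank line closes the current paragraph
            if str_seq:
                str_seqs.append(str_seq)
                label_seqs.append(label_seq)
                str_seq, label_seq = [], []
            continue
        if is_labeled:
            clause, label = line.rsplit("\t", 1)
            label_seq.append(label)
        else:
            clause = line
        str_seq.append(clause)
    if str_seq:  # flush a final paragraph with no trailing blank line
        str_seqs.append(str_seq)
        label_seqs.append(label_seq)
    return str_seqs, label_seqs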
def make_data(self, trainfilename, use_attention, maxseqlen=None,
              maxclauselen=None, label_ind=None, train=False):
    # list of list
    str_seqs, label_seqs = read_passages(trainfilename, is_labeled=train)
    if not label_ind:
        self.label_ind = {"none": 0}
    else:
        self.label_ind = label_ind
    seq_lengths = [len(seq) for seq in str_seqs]
    if not maxseqlen:
        maxseqlen = max(seq_lengths)
    if not maxclauselen:
        if use_attention:
            clauselens = []
            for str_seq in str_seqs:
                clauselens.extend(
                    [len(clause.split()) for clause in str_seq])
            maxclauselen = max(clauselens)
    X = []
    Y = []
    Y_inds = []
    init_word_rep_len = len(self.rep_reader.word_rep)  # Vocab size
    all_word_types = set([])
    for str_seq, label_seq in zip(str_seqs, label_seqs):
        for label in label_seq:
            if label not in self.label_ind:
                # Add new labels with values 0, 1, 2, ...
                self.label_ind[label] = len(self.label_ind)
        if use_attention:
            x = np.zeros((maxseqlen, maxclauselen, self.input_size))
        else:
            x = np.zeros((maxseqlen, self.input_size))
        y_ind = np.zeros(maxseqlen)
        seq_len = len(str_seq)
        # The following conditional is true only when we've already trained,
        # and one of the sequences in the test set is longer than the longest
        # sequence in training.
        if seq_len > maxseqlen:
            str_seq = str_seq[:maxseqlen]
            seq_len = maxseqlen
        if train:
            for i, (clause, label) in enumerate(zip(str_seq, label_seq)):
                clause_rep = self.rep_reader.get_clause_rep(
                    clause)  # Makes embedding non-trainable from the beginning.
                for word in clause.split():
                    all_word_types.add(word)  # Vocab
                if use_attention:
                    if len(clause_rep) > maxclauselen:
                        clause_rep = clause_rep[:maxclauselen]
                    x[-seq_len + i][-len(clause_rep):] = clause_rep
                else:
                    x[-seq_len + i] = np.mean(clause_rep, axis=0)
                y_ind[-seq_len + i] = self.label_ind[label]
            X.append(x)
            Y_inds.append(y_ind)
        else:
            for i, clause in enumerate(str_seq):
                clause_rep = self.rep_reader.get_clause_rep(clause)
                for word in clause.split():
                    all_word_types.add(word)
                if use_attention:
                    if len(clause_rep) > maxclauselen:
                        clause_rep = clause_rep[:maxclauselen]
                    x[-seq_len + i][-len(clause_rep):] = clause_rep
                else:
                    x[-seq_len + i] = np.mean(clause_rep, axis=0)
            X.append(x)
    # Once there is OOV, a new word vector is added to word_rep.
    final_word_rep_len = len(self.rep_reader.word_rep)
    oov_ratio = float(final_word_rep_len - init_word_rep_len) / len(all_word_types)
    for y_ind in Y_inds:
        y = np.zeros((maxseqlen, len(self.label_ind)))
        for i, y_ind_i in enumerate(y_ind):
            y[i][y_ind_i.astype(int)] = 1
        Y.append(y)
    self.rev_label_ind = {i: l for (l, i) in self.label_ind.items()}
    return seq_lengths, np.asarray(X), np.asarray(Y)  # One-hot representation of labels
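# The one-hot loop at the end of make_data is equivalent in spirit to a
# single fancy-indexing step over an identity matrix, shown here on toy data:
import numpy as np

n_labels = 4
y_ind = np.array([0, 0, 2, 3])      # per-clause label indices (0 = "none"/padding)
y_onehot = np.eye(n_labels)[y_ind]  # shape (maxseqlen, n_labels)
print(y_onehot)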
def make_data(self, trainfilename, use_attention, maxseqlen=None,
              maxclauselen=None, label_ind=None, train=False):
    print >> sys.stderr, "Reading data.."
    str_seqs, label_seqs = read_passages(trainfilename, train)
    if not label_ind:
        self.label_ind = {"none": 0}
    else:
        self.label_ind = label_ind
    if not maxseqlen:
        maxseqlen = max([len(label_seq) for label_seq in label_seqs])
    if not maxclauselen:
        if use_attention:
            clauselens = []
            for str_seq in str_seqs:
                clauselens.extend([len(clause.split()) for clause in str_seq])
            maxclauselen = max(clauselens)
    X = []
    Y = []
    Y_inds = []
    for str_seq, label_seq in zip(str_seqs, label_seqs):
        for label in label_seq:
            if label not in self.label_ind:
                self.label_ind[label] = len(self.label_ind)
        if use_attention:
            x = numpy.zeros((maxseqlen, maxclauselen, self.input_size))
        else:
            x = numpy.zeros((maxseqlen, self.input_size))
        y_ind = numpy.zeros(maxseqlen)
        seq_len = len(str_seq)
        # The following conditional is true only when we've already trained,
        # and one of the sequences in the test set is longer than the longest
        # sequence in training.
        if seq_len > maxseqlen:
            str_seq = str_seq[:maxseqlen]
            seq_len = maxseqlen
        if train:
            for i, (clause, label) in enumerate(zip(str_seq, label_seq)):
                clause_rep = self.rep_reader.get_clause_rep(clause)
                if use_attention:
                    if len(clause_rep) > maxclauselen:
                        clause_rep = clause_rep[:maxclauselen]
                    x[-seq_len + i][-len(clause_rep):] = clause_rep
                else:
                    x[-seq_len + i] = numpy.mean(clause_rep, axis=0)
                y_ind[-seq_len + i] = self.label_ind[label]
            X.append(x)
            Y_inds.append(y_ind)
        else:
            for i, clause in enumerate(str_seq):
                clause_rep = self.rep_reader.get_clause_rep(clause)
                if use_attention:
                    if len(clause_rep) > maxclauselen:
                        clause_rep = clause_rep[:maxclauselen]
                    x[-seq_len + i][-len(clause_rep):] = clause_rep
                else:
                    x[-seq_len + i] = numpy.mean(clause_rep, axis=0)
            X.append(x)
    for y_ind in Y_inds:
        y = numpy.zeros((maxseqlen, len(self.label_ind)))
        for i, y_ind_i in enumerate(y_ind):
            y[i][int(y_ind_i)] = 1  # cast: y_ind holds floats
        Y.append(y)
    self.rev_label_ind = {i: l for (l, i) in self.label_ind.items()}
    return numpy.asarray(X), numpy.asarray(Y)
model_config_file = open("model_%s_config.json" % model_ext, "r")
model_weights_file_name = "model_%s_weights" % model_ext
model_label_ind = "model_%s_label_ind.json" % model_ext
nnt.tagger = model_from_json(
    model_config_file.read(),
    custom_objects={
        "TensorAttention": TensorAttention,
        "HigherOrderTimeDistributedDense": HigherOrderTimeDistributedDense})
print >> sys.stderr, "Loaded model:"
print >> sys.stderr, nnt.tagger.summary()
nnt.tagger.load_weights(model_weights_file_name)
print >> sys.stderr, "Loaded weights"
label_ind_json = json.load(open(model_label_ind))
label_ind = {k: int(label_ind_json[k]) for k in label_ind_json}
print >> sys.stderr, "Loaded label index:", label_ind
for l in nnt.tagger.layers:
    if l.name == "tensorattention":
        maxseqlen, maxclauselen = l.td1, l.td2
        break
for test_file in args.test_files:
    print >> sys.stderr, "Predicting on file %s" % (test_file)
    test_out_file_name = test_file.split("/")[-1].replace(".txt", "") + \
        "_att=%s_cont=%s_bid=%s" % (str(use_attention), att_context, str(bid)) + ".out"
    outfile = open(test_out_file_name, "w")
    X_test, _ = nnt.make_data(test_file, use_attention, maxseqlen=maxseqlen,
                              maxclauselen=maxclauselen, label_ind=label_ind,
                              train=False)
    print >> sys.stderr, "X_test shape:", X_test.shape
    pred_probs, pred_label_seqs, _ = nnt.predict(X_test, bid)
    if use_attention:
        att_weights = nnt.get_attention_weights(X_test.astype('float32'))
        clause_seqs, _ = read_passages(test_file, False)
        paralens = [[len(clause.split()) for clause in seq]
                    for seq in clause_seqs]
        for clauselens, sample_att_weights, pred_label_seq in zip(
                paralens, att_weights, pred_label_seqs):
            for clauselen, clause_weights, pred_label in zip(
                    clauselens, sample_att_weights[-len(clauselens):],
                    pred_label_seq):
                print >> outfile, pred_label, " ".join(
                    ["%.4f" % val for val in clause_weights[-clauselen:]])
            print >> outfile
    break
if not os.path.exists(args.out_path):
    os.mkdir(args.out_path)
paper_paths = glob(os.path.join(params["test_path"], "*"))
random.shuffle(paper_paths)
for test_file in paper_paths:
    test_out_file_name = os.path.join(
        params["out_path"],
        test_file.split("/")[-1].replace(".txt", "") + ".tsv")
    if not os.path.exists(test_out_file_name):
        print("Predicting on file %s" % (test_file))
        raw_seqs, _ = read_passages(test_file, is_labeled=False)
        if len(raw_seqs) == 0:
            print("Empty file", test_file)
            continue
        outfile = open(test_out_file_name, "w")
        test_seq_lengths, test_generator = nnt.make_data(
            test_file, label_ind=label_ind, train=False)
        pred_probs, pred_label_seqs, _ = nnt.predict(
            test_generator, test_seq_lengths)
        pred_label_seqs = from_BIO(pred_label_seqs)
        for raw_seq, pred_label_seq in zip(raw_seqs, pred_label_seqs):
            for clause, pred_label in zip(raw_seq, pred_label_seq):
                print(clause + "\t" + pred_label, file=outfile)
            print("", file=outfile)
print >> sys.stderr, "Loaded label index:", label_ind
if not use_attention:
    assert nnt.tagger.layers[0].name == "timedistributeddense"
    maxseqlen = nnt.tagger.layers[0].input_length
    maxclauselen = None
else:
    for l in nnt.tagger.layers:
        if l.name == "tensorattention":
            maxseqlen, maxclauselen = l.td1, l.td2
            break
for test_file in testfiles:
    print >> sys.stderr, "Predicting on file %s" % (test_file)
    test_out_file_name = test_file.split("/")[-1].replace(".txt", "") + \
        "_att=%s_cont=%s_bid=%s" % (str(use_attention), att_context, str(bid)) + ".out"
    outfile = open(test_out_file_name, "w")
    test_seq_lengths, X_test, _ = nnt.make_data(
        test_file, use_attention, maxseqlen=maxseqlen,
        maxclauselen=maxclauselen, label_ind=label_ind, train=False)
    print >> sys.stderr, "X_test shape:", X_test.shape
    pred_probs, pred_label_seqs, _ = nnt.predict(X_test, bid, test_seq_lengths)
    if show_att:
        att_weights = nnt.get_attention_weights(X_test.astype('float32'))
        clause_seqs, _ = read_passages(test_file, is_labeled=True)
        paralens = [[len(clause.split()) for clause in seq]
                    for seq in clause_seqs]
        for clauselens, sample_att_weights, pred_label_seq in zip(
                paralens, att_weights, pred_label_seqs):
            for clauselen, clause_weights, pred_label in zip(
                    clauselens, sample_att_weights[-len(clauselens):],
                    pred_label_seq):
                print >> outfile, pred_label, " ".join(
                    ["%.4f" % val for val in clause_weights[-clauselen:]])
            print >> outfile
    else:
        for pred_label_seq in pred_label_seqs:
            for pred_label in pred_label_seq:
                print >> outfile, pred_label
            print >> outfile