def make_data(self,
              trainfilename,
              maxseqlen=None,
              maxclauselen=None,
              label_ind=None,
              train=False):
        """Read a passage file and wrap it in a ``BertDiscourseGenerator``.

        Args:
            trainfilename: Forwarded to ``read_passages``.
            maxseqlen: Passage length to use when ``self.maxseqlen`` is still
                ``None``; falls back to ``self.params["maxseqlen"]`` and then
                to the longest passage read.
            maxclauselen: Clause length to use when ``self.maxclauselen`` is
                still ``None``; falls back to ``self.params["maxclauselen"]``
                and then (attention only) to mean + 3 * std of the tokenized
                clause lengths.
            label_ind: Existing label -> index mapping; when falsy a fresh
                ``{"none": 0}`` mapping is started and filled from the data.
            train: Passed to ``read_passages`` as ``is_labeled`` and on to
                the generator.

        Returns:
            Tuple ``(seq_lengths, discourse_generator)`` where ``seq_lengths``
            holds the number of clauses of every passage.
        """
        attention_on = self.params["use_attention"]
        batch_sz = self.params["batch_size"]

        str_seqs, label_seqs = read_passages(trainfilename, is_labeled=train)
        print("Filtering data")
        str_seqs = clean_words(str_seqs)
        label_seqs = to_BIO(label_seqs)
        self.label_ind = label_ind if label_ind else {"none": 0}
        seq_lengths = list(map(len, str_seqs))

        # Sizes already stored on the instance are never overwritten.
        if self.maxseqlen is None:
            if maxseqlen:
                chosen_seqlen = maxseqlen
            else:
                chosen_seqlen = self.params["maxseqlen"]
                if chosen_seqlen is None:
                    chosen_seqlen = max(seq_lengths)
            self.maxseqlen = chosen_seqlen

        if self.maxclauselen is None:
            if maxclauselen:
                self.maxclauselen = maxclauselen
            else:
                configured_clauselen = self.params["maxclauselen"]
                if configured_clauselen is not None:
                    self.maxclauselen = configured_clauselen
                elif attention_on:
                    token_counts = [
                        len(self.tokenizer.tokenize(clause.lower()))
                        for passage in str_seqs
                        for clause in passage
                    ]
                    cutoff = np.mean(token_counts) + 3 * np.std(token_counts)
                    self.maxclauselen = np.round(cutoff).astype(int)

        # Only a mapping with at most one entry is extended from the data.
        if len(self.label_ind) <= 1:
            for _, passage_labels in zip(str_seqs, label_seqs):
                for tag in passage_labels:
                    # New labels receive the next free index: 0, 1, 2, ...
                    self.label_ind.setdefault(tag, len(self.label_ind))
        self.rev_label_ind = {
            index: tag for tag, index in self.label_ind.items()
        }

        discourse_generator = BertDiscourseGenerator(
            self.bert, self.tokenizer, str_seqs, label_seqs, self.label_ind,
            batch_sz, attention_on, self.maxseqlen, self.maxclauselen, train)
        return seq_lengths, discourse_generator
    def make_data(self,
                  trainfilename,
                  maxseqlen=None,
                  maxclauselen=None,
                  label_ind=None,
                  train=False):
        """Read a passage file and wrap it in a ``DiscourseGenerator``.

        Args:
            trainfilename: Forwarded to ``read_passages``.
            maxseqlen: Maximum number of clauses per passage. When falsy,
                ``self.params["maxseqlen"]`` is used, and when that is falsy
                too, the longest passage read.
            maxclauselen: Maximum number of words per clause. When falsy,
                ``self.params["maxclauselen"]`` is used, and when that is
                falsy too (attention only), mean + 3 * std of the
                whitespace-token counts of all clauses.
            label_ind: Existing label -> index mapping; when falsy a fresh
                ``{"none": 0}`` mapping is started and filled from the data.
            train: Passed to ``read_passages`` as ``is_labeled`` and on to
                the generator.

        Returns:
            Tuple ``(seq_lengths, discourse_generator)`` where ``seq_lengths``
            holds the number of clauses of every passage.

        Side effects:
            Sets ``self.label_ind``, ``self.rev_label_ind``,
            ``self.maxseqlen`` and ``self.maxclauselen``.
        """
        use_attention = self.params["use_attention"]
        batch_size = self.params["batch_size"]
        # Explicit arguments win; the configured values are only a fallback.
        # (Previously the arguments were overwritten unconditionally.)
        if not maxseqlen:
            maxseqlen = self.params["maxseqlen"]
        if not maxclauselen:
            maxclauselen = self.params["maxclauselen"]

        str_seqs, label_seqs = read_passages(trainfilename, is_labeled=train)
        print("Filtering data")
        str_seqs = clean_words(str_seqs)
        label_seqs = to_BIO(label_seqs)
        if not label_ind:
            self.label_ind = {"none": 0}
        else:
            self.label_ind = label_ind
        seq_lengths = [len(seq) for seq in str_seqs]
        if not maxseqlen:
            maxseqlen = max(seq_lengths)
        if not maxclauselen and use_attention:
            clauselens = [
                len(clause.split()) for str_seq in str_seqs
                for clause in str_seq
            ]
            maxclauselen = np.round(
                np.mean(clauselens) + 3 * np.std(clauselens)).astype(int)

        # Only a mapping with at most one entry is extended from the data.
        if len(self.label_ind) <= 1:
            for str_seq, label_seq in zip(str_seqs, label_seqs):
                for label in label_seq:
                    if label not in self.label_ind:
                        # Add new labels with values 0,1,2,....
                        self.label_ind[label] = len(self.label_ind)
        self.rev_label_ind = {i: l for (l, i) in self.label_ind.items()}
        discourse_generator = DiscourseGenerator(self.rep_reader, str_seqs,
                                                 label_seqs, self.label_ind,
                                                 batch_size, use_attention,
                                                 maxseqlen, maxclauselen,
                                                 train, self.input_size)
        self.maxseqlen = maxseqlen
        self.maxclauselen = maxclauselen
        return seq_lengths, discourse_generator
示例#3
0
    def make_data(self,
                  clauses,
                  use_attention,
                  maxseqlen=None,
                  maxclauselen=None,
                  label_ind=None,
                  train=False):
        """Read passages and build padded input and one-hot label arrays.

        NOTE: this block uses Python 2 ``print >>`` statements.

        Args:
            clauses: Forwarded to ``read_passages``.
            use_attention: When true every clause keeps one row per word
                (``maxclauselen`` rows); otherwise the clause representation
                is averaged over axis 0 into a single vector.
            maxseqlen: Number of clause slots per passage; defaults to the
                longest passage read.
            maxclauselen: Number of word slots per clause (attention only);
                defaults to the largest whitespace-token count of any clause.
            label_ind: Existing label -> index mapping; when falsy a fresh
                ``{"none": 0}`` mapping is started. Unseen labels are
                appended with the next free index.
            train: Passed to ``read_passages`` as ``is_labeled``; label
                arrays are only built when true.

        Returns:
            Tuple ``(seq_lengths, X, Y)``. ``X`` stacks one zero-initialised
            array per passage of shape ``(maxseqlen, maxclauselen,
            input_size)`` with attention, else ``(maxseqlen, input_size)``.
            Clauses are written into the last ``seq_len`` slots, so padding
            is at the front. ``Y`` stacks one-hot arrays of shape
            ``(maxseqlen, n_labels)`` and is empty when ``train`` is false.
        """
        print >> sys.stderr, "Reading data.."

        str_seqs, label_seqs = read_passages(clauses, is_labeled=train)

        print >> sys.stderr, "Sample data for train:" if train else "Sample data for test:"
        print >> sys.stderr, zip(str_seqs[0], label_seqs[0])
        if not label_ind:
            self.label_ind = {"none": 0}
        else:
            self.label_ind = label_ind
        seq_lengths = [len(seq) for seq in str_seqs]
        if not maxseqlen:
            maxseqlen = max(seq_lengths)
        if not maxclauselen and use_attention:
            maxclauselen = max(
                len(clause.split()) for str_seq in str_seqs
                for clause in str_seq)
        X = []
        Y_inds = []
        for str_seq, label_seq in zip(str_seqs, label_seqs):
            for label in label_seq:
                if label not in self.label_ind:
                    self.label_ind[label] = len(self.label_ind)
            if use_attention:
                x = numpy.zeros((maxseqlen, maxclauselen, self.input_size))
            else:
                x = numpy.zeros((maxseqlen, self.input_size))
            # Integer dtype: the values are used as array indices below and
            # NumPy rejects float indices.
            y_ind = numpy.zeros(maxseqlen, dtype=int)
            seq_len = len(str_seq)
            # The following conditional is true only when we've already
            # trained, and one of the sequences in the test set is longer
            # than the longest sequence in training.
            if seq_len > maxseqlen:
                str_seq = str_seq[:maxseqlen]
                seq_len = maxseqlen
            # Training pairs every clause with its label (stopping at the
            # shorter of the two); prediction has no labels to pair with.
            if train:
                clause_label_pairs = zip(str_seq, label_seq)
            else:
                clause_label_pairs = [(clause, None) for clause in str_seq]
            for i, (clause, label) in enumerate(clause_label_pairs):
                clause_rep = self.rep_reader.get_clause_rep(clause)
                if use_attention:
                    if len(clause_rep) > maxclauselen:
                        clause_rep = clause_rep[:maxclauselen]
                    # Words fill the last slots of the clause.
                    x[-seq_len + i][-len(clause_rep):] = clause_rep
                else:
                    x[-seq_len + i] = numpy.mean(clause_rep, axis=0)
                if train:
                    y_ind[-seq_len + i] = self.label_ind[label]
            X.append(x)
            if train:
                Y_inds.append(y_ind)
        # One-hot encoding is deferred until the label set is complete so
        # every array has the same width. Padding slots keep index 0.
        Y = []
        for y_ind in Y_inds:
            y = numpy.zeros((maxseqlen, len(self.label_ind)))
            y[numpy.arange(maxseqlen), y_ind] = 1
            Y.append(y)
        self.rev_label_ind = {i: l for (l, i) in self.label_ind.items()}
        return seq_lengths, numpy.asarray(X), numpy.asarray(Y)
示例#4
0
            # Fragment of a prediction routine (Python 2 ``print >>`` syntax).
            # NOTE(review): the handle opened here is not closed within the
            # visible code.
            clauses = codecs.open(test_file, "r", "utf-8")

            # Labels are not read (train=False); the third return value is
            # discarded.
            test_seq_lengths, X_test, _ = nnt.make_data(
                clauses,
                use_attention,
                maxseqlen=maxseqlen,
                maxclauselen=maxclauselen,
                label_ind=label_ind,
                train=False)
            print >> sys.stderr, "X_test shape:", X_test.shape
            pred_probs, pred_label_seqs, _ = nnt.predict(
                X_test, bid, test_seq_lengths)
            if show_att:
                # Write every predicted label followed by the attention
                # weights of its clause.
                att_weights = nnt.get_attention_weights(
                    X_test.astype('float32'))
                # NOTE(review): read_passages receives the path here, while
                # make_data above received the opened file object - confirm
                # that both are accepted.
                clause_seqs, _ = read_passages(test_file, is_labeled=True)
                # Whitespace-token count of every clause, grouped by passage.
                paralens = [[len(clause.split()) for clause in seq]
                            for seq in clause_seqs]
                for clauselens, sample_att_weights, pred_label_seq in zip(
                        paralens, att_weights, pred_label_seqs):
                    # Only the last len(clauselens) clause rows and the last
                    # clauselen weights are used; presumably the leading
                    # entries are padding - confirm against make_data.
                    for clauselen, clause_weights, pred_label in zip(
                            clauselens, sample_att_weights[-len(clauselens):],
                            pred_label_seq):
                        print >> outfile, pred_label, " ".join([
                            "%.4f" % val for val in clause_weights[-clauselen:]
                        ])
                    # Blank line after every passage.
                    print >> outfile
            else:
                # Labels only, one per line.
                for pred_label_seq in pred_label_seqs:
                    for pred_label in pred_label_seq:
                        print >> outfile, pred_label
示例#5
0
    args = argparser.parse_args()

    # ``train`` / ``test`` record which inputs were given on the command
    # line; ``trainfile`` / ``testfiles`` are only bound when present.
    if args.train_file:
        trainfile = args.train_file
        train = True
    else:
        train = False

    if args.test_files:
        testfiles = args.test_files
        test = True
    else:
        test = False

    # p_count: number of passages; c_count: number of labels, summed over
    # the training file and all test files.
    p_count = 0
    c_count = 0
    if train:
        # ``with`` closes the handle once the passages are read.
        with codecs.open(trainfile, "r", "utf-8") as clauses:
            str_seqs, label_seqs = read_passages(clauses, is_labeled=True)
        p_count += len(str_seqs)
        c_count += sum(len(label_seq) for label_seq in label_seqs)
    if test:
        for testfile in testfiles:
            with codecs.open(testfile, "r", "utf-8") as clauses:
                str_seqs, label_seqs = read_passages(clauses, is_labeled=True)
            p_count += len(str_seqs)
            c_count += sum(len(label_seq) for label_seq in label_seqs)

    print("Paragraph Count: %d" % p_count)
    print("Clause Count: %d" % c_count)
示例#6
0
 def make_data(self,
               trainfilename,
               use_attention,
               maxseqlen=None,
               maxclauselen=None,
               label_ind=None,
               train=False):
     """Read passages and build padded input and one-hot label arrays.

     Args:
         trainfilename: Forwarded to ``read_passages``.
         use_attention: When true every clause keeps one row per word
             (``maxclauselen`` rows); otherwise the clause representation
             is averaged over axis 0 into a single vector.
         maxseqlen: Number of clause slots per passage; defaults to the
             longest passage read.
         maxclauselen: Number of word slots per clause (attention only);
             defaults to the largest whitespace-token count of any clause.
         label_ind: Existing label -> index mapping; when falsy a fresh
             ``{"none": 0}`` mapping is started. Unseen labels are appended
             with the next free index.
         train: Passed to ``read_passages`` as ``is_labeled``; label arrays
             are only built when true.

     Returns:
         Tuple ``(seq_lengths, X, Y)``. ``X`` stacks one zero-initialised
         array per passage of shape ``(maxseqlen, maxclauselen,
         input_size)`` with attention, else ``(maxseqlen, input_size)``.
         Clauses are written into the last ``seq_len`` slots, so padding is
         at the front. ``Y`` stacks one-hot arrays of shape ``(maxseqlen,
         n_labels)`` and is empty when ``train`` is false.
     """
     # list of list
     str_seqs, label_seqs = read_passages(trainfilename, is_labeled=train)
     if not label_ind:
         self.label_ind = {"none": 0}
     else:
         self.label_ind = label_ind
     seq_lengths = [len(seq) for seq in str_seqs]
     if not maxseqlen:
         maxseqlen = max(seq_lengths)
     if not maxclauselen and use_attention:
         maxclauselen = max(
             len(clause.split()) for str_seq in str_seqs
             for clause in str_seq)
     X = []
     Y_inds = []
     for str_seq, label_seq in zip(str_seqs, label_seqs):
         for label in label_seq:
             if label not in self.label_ind:
                 # Add new labels with values 0,1,2,....
                 self.label_ind[label] = len(self.label_ind)
         if use_attention:
             x = np.zeros((maxseqlen, maxclauselen, self.input_size))
         else:
             x = np.zeros((maxseqlen, self.input_size))
         # Integer dtype: the values are used as array indices below.
         y_ind = np.zeros(maxseqlen, dtype=int)
         seq_len = len(str_seq)
         # The following conditional is true only when we've already
         # trained, and one of the sequences in the test set is longer than
         # the longest sequence in training.
         if seq_len > maxseqlen:
             str_seq = str_seq[:maxseqlen]
             seq_len = maxseqlen
         # Training pairs every clause with its label (stopping at the
         # shorter of the two); prediction has no labels to pair with.
         if train:
             clause_label_pairs = zip(str_seq, label_seq)
         else:
             clause_label_pairs = ((clause, None) for clause in str_seq)
         for i, (clause, label) in enumerate(clause_label_pairs):
             clause_rep = self.rep_reader.get_clause_rep(clause)
             if use_attention:
                 if len(clause_rep) > maxclauselen:
                     clause_rep = clause_rep[:maxclauselen]
                 # Words fill the last slots of the clause.
                 x[-seq_len + i][-len(clause_rep):] = clause_rep
             else:
                 x[-seq_len + i] = np.mean(clause_rep, axis=0)
             if train:
                 y_ind[-seq_len + i] = self.label_ind[label]
         X.append(x)
         if train:
             Y_inds.append(y_ind)
     # The former OOV-ratio computation was dropped: its result was never
     # used and it divided by zero when no word had been seen.
     # One-hot encoding is deferred until the label set is complete so every
     # array has the same width. Padding slots keep index 0.
     Y = []
     for y_ind in Y_inds:
         y = np.zeros((maxseqlen, len(self.label_ind)))
         y[np.arange(maxseqlen), y_ind] = 1
         Y.append(y)
     self.rev_label_ind = {i: l for (l, i) in self.label_ind.items()}
     return seq_lengths, np.asarray(X), np.asarray(
         Y)  # One-hot representation of labels
示例#7
0
 def make_data(self, trainfilename, use_attention, maxseqlen=None, maxclauselen=None, label_ind=None, train=False):
   """Read passages and build padded input and one-hot label arrays.

   NOTE: this block uses Python 2 ``print >>`` statements.

   Args:
     trainfilename: Forwarded to ``read_passages``.
     use_attention: When true every clause keeps one row per word
       (``maxclauselen`` rows); otherwise the clause representation is
       averaged over axis 0 into a single vector.
     maxseqlen: Number of clause slots per passage; defaults to the length
       of the longest label sequence read.
     maxclauselen: Number of word slots per clause (attention only);
       defaults to the largest whitespace-token count of any clause.
     label_ind: Existing label -> index mapping; when falsy a fresh
       ``{"none": 0}`` mapping is started. Unseen labels are appended with
       the next free index.
     train: Forwarded to ``read_passages``; label arrays are only built
       when true.

   Returns:
     Tuple ``(X, Y)``. ``X`` stacks one zero-initialised array per passage
     of shape ``(maxseqlen, maxclauselen, input_size)`` with attention, else
     ``(maxseqlen, input_size)``. Clauses are written into the last
     ``seq_len`` slots, so padding is at the front. ``Y`` stacks one-hot
     arrays of shape ``(maxseqlen, n_labels)`` and is empty when ``train``
     is false.
   """
   print >>sys.stderr, "Reading data.."
   str_seqs, label_seqs = read_passages(trainfilename, train)
   if not label_ind:
     self.label_ind = {"none": 0}
   else:
     self.label_ind = label_ind
   if not maxseqlen:
     maxseqlen = max([len(label_seq) for label_seq in label_seqs])
   if not maxclauselen and use_attention:
     maxclauselen = max(len(clause.split()) for str_seq in str_seqs for clause in str_seq)
   X = []
   Y_inds = []
   for str_seq, label_seq in zip(str_seqs, label_seqs):
     for label in label_seq:
       if label not in self.label_ind:
         self.label_ind[label] = len(self.label_ind)
     if use_attention:
       x = numpy.zeros((maxseqlen, maxclauselen, self.input_size))
     else:
       x = numpy.zeros((maxseqlen, self.input_size))
     # Integer dtype: the values are used as array indices below and NumPy
     # rejects float indices.
     y_ind = numpy.zeros(maxseqlen, dtype=int)
     seq_len = len(str_seq)
     # The following conditional is true only when we've already trained,
     # and one of the sequences in the test set is longer than the longest
     # sequence in training.
     if seq_len > maxseqlen:
       str_seq = str_seq[:maxseqlen]
       seq_len = maxseqlen
     # Training pairs every clause with its label (stopping at the shorter
     # of the two); prediction has no labels to pair with.
     if train:
       clause_label_pairs = zip(str_seq, label_seq)
     else:
       clause_label_pairs = [(clause, None) for clause in str_seq]
     for i, (clause, label) in enumerate(clause_label_pairs):
       clause_rep = self.rep_reader.get_clause_rep(clause)
       if use_attention:
         if len(clause_rep) > maxclauselen:
           clause_rep = clause_rep[:maxclauselen]
         # Words fill the last slots of the clause.
         x[-seq_len+i][-len(clause_rep):] = clause_rep
       else:
         x[-seq_len+i] = numpy.mean(clause_rep, axis=0)
       if train:
         y_ind[-seq_len+i] = self.label_ind[label]
     X.append(x)
     if train:
       Y_inds.append(y_ind)
   # One-hot encoding is deferred until the label set is complete so every
   # array has the same width. Padding slots keep index 0.
   Y = []
   for y_ind in Y_inds:
     y = numpy.zeros((maxseqlen, len(self.label_ind)))
     y[numpy.arange(maxseqlen), y_ind] = 1
     Y.append(y)
   self.rev_label_ind = {i: l for (l, i) in self.label_ind.items()}
   return numpy.asarray(X), numpy.asarray(Y)
示例#8
0
      # Fragment (Python 2 ``print >>`` syntax): restore a saved tagger from
      # its JSON config, weights file and label-index JSON, all named after
      # ``model_ext``.
      # NOTE(review): the config and label-index handles are not closed
      # within the visible code.
      model_config_file = open("model_%s_config.json"%model_ext, "r")
      model_weights_file_name = "model_%s_weights"%model_ext
      model_label_ind = "model_%s_label_ind.json"%model_ext
      # The custom layer classes are passed so the config can be rebuilt.
      nnt.tagger = model_from_json(model_config_file.read(), custom_objects={"TensorAttention":TensorAttention, "HigherOrderTimeDistributedDense":HigherOrderTimeDistributedDense})
      print >>sys.stderr, "Loaded model:"
      print >>sys.stderr, nnt.tagger.summary()
      nnt.tagger.load_weights(model_weights_file_name)
      print >>sys.stderr, "Loaded weights"
      label_ind_json = json.load(open(model_label_ind))
      # Convert the stored index values to int.
      label_ind = {k: int(label_ind_json[k]) for k in label_ind_json}
      print >>sys.stderr, "Loaded label index:", label_ind
    # Take the padding sizes from the first layer named "tensorattention".
    # NOTE(review): if no such layer exists, maxseqlen / maxclauselen are
    # not assigned here - confirm they are bound earlier.
    for l in nnt.tagger.layers:
      if l.name == "tensorattention":
        maxseqlen, maxclauselen = l.td1, l.td2
        break
    for test_file in args.test_files:
      print >>sys.stderr, "Predicting on file %s"%(test_file)
      # Output name: input base name without ".txt", plus the run settings.
      test_out_file_name = test_file.split("/")[-1].replace(".txt", "")+"_att=%s_cont=%s_bid=%s"%(str(use_attention), att_context, str(bid))+".out"
      # NOTE(review): outfile is not closed within the visible code.
      outfile = open(test_out_file_name, "w")
      X_test, _ = nnt.make_data(test_file, use_attention, maxseqlen=maxseqlen, maxclauselen=maxclauselen, label_ind=label_ind, train=False)
      print >>sys.stderr, "X_test shape:", X_test.shape
      pred_probs, pred_label_seqs, _ = nnt.predict(X_test, bid)
      if use_attention:
        # Write every predicted label followed by the attention weights of
        # its clause.
        att_weights = nnt.get_attention_weights(X_test.astype('float32'))
        clause_seqs, _ = read_passages(test_file, False)
        # Whitespace-token count of every clause, grouped by passage.
        paralens = [[len(clause.split()) for clause in seq] for seq in clause_seqs]
        for clauselens, sample_att_weights, pred_label_seq in zip(paralens, att_weights, pred_label_seqs):
          # Only the last len(clauselens) clause rows and the last clauselen
          # weights are used; presumably the leading entries are padding -
          # confirm against make_data.
          for clauselen, clause_weights, pred_label in zip(clauselens, sample_att_weights[-len(clauselens):], pred_label_seq):
            print >>outfile, pred_label, " ".join(["%.4f"%val for val in clause_weights[-clauselen:]])
          # Blank line after every passage.
          print >>outfile
                    break

        # Fragment: predict labels for every file under params["test_path"]
        # and write one .tsv per input file.
        # NOTE(review): the directory is created from args.out_path while
        # the files below are written under params["out_path"] - confirm
        # both refer to the same location.
        if not os.path.exists(args.out_path):
            os.mkdir(args.out_path)

        paper_paths = glob(os.path.join(params["test_path"], "*"))
        # Files are processed in random order.
        random.shuffle(paper_paths)

        for test_file in paper_paths:
            # Output name: input base name with ".txt" removed, plus ".tsv".
            test_out_file_name = os.path.join(
                params["out_path"],
                test_file.split("/")[-1].replace(".txt", "") + ".tsv")

            # Inputs whose output file already exists are skipped.
            if not os.path.exists(test_out_file_name):
                print("Predicting on file %s" % (test_file))
                raw_seqs, _ = read_passages(test_file, is_labeled=False)
                if len(raw_seqs) == 0:
                    # Nothing to predict; no output file is created.
                    print("Empty file", test_file)
                    continue
                # NOTE(review): outfile is not closed within the visible
                # code.
                outfile = open(test_out_file_name, "w")

                test_seq_lengths, test_generator = nnt.make_data(
                    test_file, label_ind=label_ind, train=False)
                pred_probs, pred_label_seqs, _ = nnt.predict(
                    test_generator, test_seq_lengths)

                pred_label_seqs = from_BIO(pred_label_seqs)
                # One "clause<TAB>label" line per clause and an empty line
                # after every passage.
                for raw_seq, pred_label_seq in zip(raw_seqs, pred_label_seqs):
                    for clause, pred_label in zip(raw_seq, pred_label_seq):
                        print(clause + "\t" + pred_label, file=outfile)
                    print("", file=outfile)
示例#10
0
      # Fragment (Python 2 ``print >>`` syntax): tail of the model-loading
      # branch followed by the prediction loop.
      print >>sys.stderr, "Loaded label index:", label_ind
    # Recover the padding sizes from the loaded model.
    if not use_attention:
      # Without attention the first layer must be "timedistributeddense";
      # its input_length is used as maxseqlen and no clause length is set.
      assert nnt.tagger.layers[0].name == "timedistributeddense"
      maxseqlen = nnt.tagger.layers[0].input_length
      maxclauselen = None
    else:
      # With attention both sizes come from the first layer named
      # "tensorattention".
      # NOTE(review): if no such layer exists, neither name is assigned
      # here - confirm they are bound earlier.
      for l in nnt.tagger.layers:
        if l.name == "tensorattention":
          maxseqlen, maxclauselen = l.td1, l.td2
          break
    for test_file in testfiles:
      print >>sys.stderr, "Predicting on file %s"%(test_file)
      # Output name: input base name without ".txt", plus the run settings.
      test_out_file_name = test_file.split("/")[-1].replace(".txt", "")+"_att=%s_cont=%s_bid=%s"%(str(use_attention), att_context, str(bid))+".out"
      # NOTE(review): outfile is not closed within the visible code.
      outfile = open(test_out_file_name, "w")
      test_seq_lengths, X_test, _ = nnt.make_data(test_file, use_attention, maxseqlen=maxseqlen, maxclauselen=maxclauselen, label_ind=label_ind, train=False)
      print >>sys.stderr, "X_test shape:", X_test.shape
      pred_probs, pred_label_seqs, _ = nnt.predict(X_test, bid, test_seq_lengths)
      if show_att:
        # Write every predicted label followed by the attention weights of
        # its clause.
        att_weights = nnt.get_attention_weights(X_test.astype('float32'))
        clause_seqs, _ = read_passages(test_file, is_labeled=True)
        # Whitespace-token count of every clause, grouped by passage.
        paralens = [[len(clause.split()) for clause in seq] for seq in clause_seqs]
        for clauselens, sample_att_weights, pred_label_seq in zip(paralens, att_weights, pred_label_seqs):
          # Only the last len(clauselens) clause rows and the last clauselen
          # weights are used; presumably the leading entries are padding -
          # confirm against make_data.
          for clauselen, clause_weights, pred_label in zip(clauselens, sample_att_weights[-len(clauselens):], pred_label_seq):
            print >>outfile, pred_label, " ".join(["%.4f"%val for val in clause_weights[-clauselen:]])
          # Blank line after every passage.
          print >>outfile
      else:
        # Labels only, one per line, with a blank line after every passage.
        for pred_label_seq in pred_label_seqs:
          for pred_label in pred_label_seq:
            print >>outfile, pred_label
          print >>outfile