Example #1
0
 def __init__(self, word_rep_file=None, pickled_rep_reader=None):
     if pickled_rep_reader:
         self.rep_reader = pickled_rep_reader
     elif word_rep_file:
         self.rep_reader = RepReader(word_rep_file)
     self.input_size = self.rep_reader.rep_shape[0]
     self.tagger = None
Example #2
0
 def __init__(self,
              word_rep_file,
              train=False,
              cv=True,
              folds=5,
              modeltype="mlp",
              trained_model_name="trained_model.pkl",
              tagset_file="tagset.pkl"):
     self.trained_model_name = "%s_%s" % (modeltype, trained_model_name)
     self.cv = cv
     self.folds = folds
     self.rep_reader = RepReader(word_rep_file)
     self.input_size = self.rep_reader.rep_shape[0]
     if modeltype == "mlp":
         self.hidden_sizes = [20, 10]
     else:
         self.hidden_size = 20
     self.max_iter = 100
     self.learning_rate = 0.01
     self.tag_index = None
     self.modeltype = modeltype
     if train:
         print >> sys.stderr, "Statement classifier initialized for training."
         if self.cv:
             print >> sys.stderr, "Cross-validation will be done"
         self.classifier = None
     else:
         self.classifier = cPickle.load(open(self.trained_model_name, "rb"))
         print >> sys.stderr, "Stored model loaded. Statement classifier initialized for prediction."
Example #3
0
 def __init__(self, word_rep_file=None, pickled_rep_reader=None):
   if pickled_rep_reader:
     self.rep_reader = pickled_rep_reader
   elif word_rep_file:
     self.rep_reader = RepReader(word_rep_file)
   self.input_size = self.rep_reader.rep_shape[0]
   self.tagger = None
Example #4
0
 def __init__(self, word_rep_file, train=False, cv=True, folds=5, modeltype="mlp", trained_model_name="trained_model.pkl", tagset_file="tagset.pkl"):
   self.trained_model_name = "%s_%s"%(modeltype, trained_model_name)
   self.cv = cv
   self.folds = folds
   self.rep_reader = RepReader(word_rep_file)
   self.input_size = self.rep_reader.rep_shape[0]
   if modeltype == "mlp":
     self.hidden_sizes = [20, 10]
   else:
     self.hidden_size = 20
   self.max_iter = 100
   self.learning_rate = 0.01
   self.tag_index = None
   self.modeltype = modeltype
   if train:
     print >>sys.stderr, "Statement classifier initialized for training."
     if self.cv:
       print >>sys.stderr, "Cross-validation will be done"
     self.classifier = None
   else:
     self.classifier = cPickle.load(open(self.trained_model_name, "rb"))
     print >>sys.stderr, "Stored model loaded. Statement classifier initialized for prediction."
Example #5
0
class PassageTagger(object):
    def __init__(self, word_rep_file=None, pickled_rep_reader=None):
        if pickled_rep_reader:
            self.rep_reader = pickled_rep_reader
        elif word_rep_file:
            self.rep_reader = RepReader(word_rep_file)
        else:
            self.rep_reader = RepReader(elastic=True)
        self.input_size = self.rep_reader.rep_shape[0]
        self.tagger = None

    def make_data(self,
                  clauses,
                  use_attention,
                  maxseqlen=None,
                  maxclauselen=None,
                  label_ind=None,
                  train=False):
        print >> sys.stderr, "Reading data.."

        str_seqs, label_seqs = read_passages(clauses, is_labeled=train)

        print >> sys.stderr, "Sample data for train:" if train else "Sample data for test:"
        print >> sys.stderr, zip(str_seqs[0], label_seqs[0])
        if not label_ind:
            self.label_ind = {"none": 0}
        else:
            self.label_ind = label_ind
        seq_lengths = [len(seq) for seq in str_seqs]
        if not maxseqlen:
            maxseqlen = max(seq_lengths)
        if not maxclauselen:
            if use_attention:
                clauselens = []
                for str_seq in str_seqs:
                    clauselens.extend(
                        [len(clause.split()) for clause in str_seq])
                maxclauselen = max(clauselens)
        X = []
        Y = []
        Y_inds = []
        #init_word_rep_len = len(self.rep_reader.word_rep)
        all_word_types = set([])
        for str_seq, label_seq in zip(str_seqs, label_seqs):
            for label in label_seq:
                if label not in self.label_ind:
                    self.label_ind[label] = len(self.label_ind)
            if use_attention:
                x = numpy.zeros((maxseqlen, maxclauselen, self.input_size))
            else:
                x = numpy.zeros((maxseqlen, self.input_size))
            y_ind = numpy.zeros(maxseqlen)
            seq_len = len(str_seq)
            # The following conditional is true only when we've already trained, and one of the sequences in the test set is longer than the longest sequence in training.
            if seq_len > maxseqlen:
                str_seq = str_seq[:maxseqlen]
                seq_len = maxseqlen
            if train:
                for i, (clause, label) in enumerate(zip(str_seq, label_seq)):
                    clause_rep = self.rep_reader.get_clause_rep(clause)
                    for word in clause.split():
                        all_word_types.add(word)
                    if use_attention:
                        if len(clause_rep) > maxclauselen:
                            clause_rep = clause_rep[:maxclauselen]
                        x[-seq_len + i][-len(clause_rep):] = clause_rep
                    else:
                        x[-seq_len + i] = numpy.mean(clause_rep, axis=0)
                    y_ind[-seq_len + i] = self.label_ind[label]
                X.append(x)
                Y_inds.append(y_ind)
            else:
                for i, clause in enumerate(str_seq):
                    clause_rep = self.rep_reader.get_clause_rep(clause)
                    for word in clause.split():
                        all_word_types.add(word)
                    if use_attention:
                        if len(clause_rep) > maxclauselen:
                            clause_rep = clause_rep[:maxclauselen]
                        x[-seq_len + i][-len(clause_rep):] = clause_rep
                    else:
                        x[-seq_len + i] = numpy.mean(clause_rep, axis=0)
                X.append(x)
        final_word_rep_len = len(self.rep_reader.word_rep)
        #oov_ratio = float(final_word_rep_len - init_word_rep_len)/len(all_word_types)
        #print >>sys.stderr, "OOV ratio: %f" % oov_ratio
        for y_ind in Y_inds:
            y = numpy.zeros((maxseqlen, len(self.label_ind)))
            for i, y_ind_i in enumerate(y_ind):
                y[i][int(y_ind_i)] = 1
            Y.append(y)
        self.rev_label_ind = {i: l for (l, i) in self.label_ind.items()}
        return seq_lengths, numpy.asarray(X), numpy.asarray(Y)

    def get_attention_weights(self, X_test):
        if not self.tagger:
            raise RuntimeError, "Tagger not trained yet!"
        inp = self.tagger.get_input()
        att_out = None
        for layer in self.tagger.layers:
            if layer.get_config()['name'].lower() == "tensorattention":
                att_out = layer.get_output()
                break
        if not att_out:
            raise RuntimeError, "No attention layer found!"
        f = theano.function([inp], att_out)
        return f(X_test)

    def predict(self, X, bidirectional, test_seq_lengths=None, tagger=None):
        if not tagger:
            tagger = self.tagger
        if not tagger:
            raise RuntimeError, "Tagger not trained yet!"
        if test_seq_lengths is None:
            # Determining actual lengths sans padding
            x_lens = []
            for x in X:
                x_len = 0
                for i, xi in enumerate(x):
                    if xi.sum() != 0:
                        x_len = len(x) - i
                        break
                x_lens.append(x_len)
        else:
            x_lens = test_seq_lengths
        if bidirectional:
            pred_probs = tagger.predict({'input': X})['output']
        else:
            pred_probs = tagger.predict(X)
        pred_inds = numpy.argmax(pred_probs, axis=2)
        pred_label_seqs = []
        for pred_ind, x_len in zip(pred_inds, x_lens):
            pred_label_seq = [self.rev_label_ind[pred]
                              for pred in pred_ind][-x_len:]
            # If the following number is positive, it means we ignored some clauses in the test passage to make it the same length as the ones we trained on.
            num_ignored_clauses = max(0, x_len - len(pred_label_seq))
            # Make labels for those if needed.
            if num_ignored_clauses > 0:
                warnings.warn(
                    "Test sequence too long. Ignoring %d clauses at the beginning and labeling them none."
                    % num_ignored_clauses)
                ignored_clause_labels = ["none"] * num_ignored_clauses
                pred_label_seq = ignored_clause_labels + pred_label_seq
            pred_label_seqs.append(pred_label_seq)
        return pred_probs, pred_label_seqs, x_lens

    def fit_model(self, X, Y, use_attention, att_context, bidirectional):
        print >> sys.stderr, "Input shape:", X.shape, Y.shape
        early_stopping = EarlyStopping(patience=2)
        num_classes = len(self.label_ind)
        if bidirectional:
            tagger = Graph()
            tagger.add_input(name='input', input_shape=X.shape[1:])
            if use_attention:
                tagger.add_node(TensorAttention(X.shape[1:],
                                                context=att_context),
                                name='attention',
                                input='input')
                lstm_input_node = 'attention'
            else:
                lstm_input_node = 'input'
            tagger.add_node(LSTM(X.shape[-1] / 2, return_sequences=True),
                            name='forward',
                            input=lstm_input_node)
            tagger.add_node(LSTM(X.shape[-1] / 2,
                                 return_sequences=True,
                                 go_backwards=True),
                            name='backward',
                            input=lstm_input_node)
            tagger.add_node(TimeDistributedDense(num_classes,
                                                 activation='softmax'),
                            name='softmax',
                            inputs=['forward', 'backward'],
                            merge_mode='concat',
                            concat_axis=-1)
            tagger.add_output(name='output', input='softmax')
            tagger.summary()
            tagger.compile('adam', {'output': 'categorical_crossentropy'})
            #tagger.fit({'input':X, 'output':Y}, validation_split=0.1, callbacks=[early_stopping], show_accuracy=True, nb_epoch=100, batch_size=10)
            tagger.fit({
                'input': X,
                'output': Y
            },
                       validation_split=0.1,
                       callbacks=[early_stopping],
                       nb_epoch=100,
                       batch_size=10)
        else:
            tagger = Sequential()
            word_proj_dim = 50
            if use_attention:
                _, input_len, timesteps, input_dim = X.shape
                tagger.add(
                    HigherOrderTimeDistributedDense(input_dim=input_dim,
                                                    output_dim=word_proj_dim))
                att_input_shape = (input_len, timesteps, word_proj_dim)
                print >> sys.stderr, "Attention input shape:", att_input_shape
                tagger.add(Dropout(0.5))
                tagger.add(
                    TensorAttention(att_input_shape, context=att_context))
            else:
                _, input_len, input_dim = X.shape
                tagger.add(
                    TimeDistributedDense(input_dim=input_dim,
                                         input_length=input_len,
                                         output_dim=word_proj_dim))
            tagger.add(
                LSTM(input_dim=word_proj_dim,
                     output_dim=word_proj_dim,
                     input_length=input_len,
                     return_sequences=True))
            tagger.add(TimeDistributedDense(num_classes, activation='softmax'))
            tagger.summary()
            tagger.compile(loss='categorical_crossentropy', optimizer='adam')
            tagger.fit(X,
                       Y,
                       validation_split=0.1,
                       callbacks=[early_stopping],
                       show_accuracy=True,
                       batch_size=10)

        return tagger

    def train(self,
              X,
              Y,
              use_attention,
              att_context,
              bidirectional,
              cv=True,
              folds=5):
        if cv:
            cv_folds = make_folds(X, Y, folds)
            accuracies = []
            fscores = []
            for fold_num, ((train_fold_X, train_fold_Y),
                           (test_fold_X, test_fold_Y)) in enumerate(cv_folds):
                tagger = self.fit_model(train_fold_X, train_fold_Y,
                                        use_attention, att_context,
                                        bidirectional)
                pred_probs, pred_label_seqs, x_lens = self.predict(
                    test_fold_X, bidirectional, tagger=tagger)
                pred_inds = numpy.argmax(pred_probs, axis=2)
                flattened_preds = []
                flattened_targets = []
                for x_len, pred_ind, test_target in zip(
                        x_lens, pred_inds, test_fold_Y):
                    flattened_preds.extend(pred_ind[-x_len:])
                    flattened_targets.extend(
                        [list(tt).index(1) for tt in test_target[-x_len:]])
                assert len(flattened_preds) == len(flattened_targets)
                accuracy, weighted_fscore, all_fscores = evaluate(
                    flattened_targets, flattened_preds)
                print >> sys.stderr, "Finished fold %d. Accuracy: %f, Weighted F-score: %f" % (
                    fold_num, accuracy, weighted_fscore)
                print >> sys.stderr, "Individual f-scores:"
                for cat in all_fscores:
                    print >> sys.stderr, "%s: %f" % (self.rev_label_ind[cat],
                                                     all_fscores[cat])
                accuracies.append(accuracy)
                fscores.append(weighted_fscore)
            accuracies = numpy.asarray(accuracies)
            fscores = numpy.asarray(fscores)
            print >> sys.stderr, "Accuracies:", accuracies
            print >> sys.stderr, "Average: %0.4f (+/- %0.4f)" % (
                accuracies.mean(), accuracies.std() * 2)
            print >> sys.stderr, "Fscores:", fscores
            print >> sys.stderr, "Average: %0.4f (+/- %0.4f)" % (
                fscores.mean(), fscores.std() * 2)
        self.tagger = self.fit_model(X, Y, use_attention, att_context,
                                     bidirectional)
        model_ext = "att=%s_cont=%s_bi=%s" % (str(use_attention), att_context,
                                              str(bidirectional))
        model_config_file = open("model_%s_config.json" % model_ext, "w")
        model_weights_file_name = "model_%s_weights" % model_ext
        model_label_ind = "model_%s_label_ind.json" % model_ext
        model_rep_reader = "model_%s_rep_reader.pkl" % model_ext
        print >> model_config_file, self.tagger.to_json()
        self.tagger.save_weights(model_weights_file_name, overwrite=True)
        json.dump(self.label_ind, open(model_label_ind, "w"))
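The left-padded input matrices and one-hot label arrays built inside make_data above are the crux of this example; the following minimal, standalone sketch (toy dimensions and a made-up label_ind) shows the same right-alignment and one-hot scheme with plain numpy.

import numpy

# Minimal sketch of the padding/encoding scheme in make_data above.
# Shapes and label_ind here are toy values for illustration only.
maxseqlen, input_size = 4, 3
label_ind = {"none": 0, "fact": 1, "method": 2}

clause_reps = [numpy.full(input_size, 0.1), numpy.full(input_size, 0.2)]
labels = ["fact", "method"]

x = numpy.zeros((maxseqlen, input_size))
y_ind = numpy.zeros(maxseqlen, dtype=int)
seq_len = len(clause_reps)
for i, (rep, label) in enumerate(zip(clause_reps, labels)):
    x[-seq_len + i] = rep                 # right-align the sequence
    y_ind[-seq_len + i] = label_ind[label]

y = numpy.zeros((maxseqlen, len(label_ind)))
for i, ind in enumerate(y_ind):
    y[i][ind] = 1                         # padding rows stay on label "none"

print(x)
print(y)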
Example #6
0
class PassageTagger(object):
    def __init__(self, word_rep_file=None, pickled_rep_reader=None):
        if pickled_rep_reader:
            self.rep_reader = pickled_rep_reader
        elif word_rep_file:
            self.rep_reader = RepReader(word_rep_file)
        try:
            self.input_size = self.rep_reader.rep_shape[0]
        except AttributeError:
            # No representation reader was provided, so the size is unknown here.
            self.input_size = 0
        self.tagger = None

    def make_data(self,
                  trainfilename,
                  use_attention,
                  maxseqlen=None,
                  maxclauselen=None,
                  label_ind=None,
                  train=False):
        # list of list
        str_seqs, label_seqs = read_passages(trainfilename, is_labeled=train)
        if not label_ind:
            self.label_ind = {"none": 0}
        else:
            self.label_ind = label_ind
        seq_lengths = [len(seq) for seq in str_seqs]
        if not maxseqlen:
            maxseqlen = max(seq_lengths)
        if not maxclauselen:
            if use_attention:
                clauselens = []
                for str_seq in str_seqs:
                    clauselens.extend(
                        [len(clause.split()) for clause in str_seq])
                maxclauselen = max(clauselens)
        X = []
        Y = []
        Y_inds = []
        init_word_rep_len = len(self.rep_reader.word_rep)  # Vocab size
        all_word_types = set([])
        for str_seq, label_seq in zip(str_seqs, label_seqs):
            for label in label_seq:
                if label not in self.label_ind:
                    # Add new labels with values 0,1,2,....
                    self.label_ind[label] = len(self.label_ind)
            if use_attention:
                x = np.zeros((maxseqlen, maxclauselen, self.input_size))
            else:
                x = np.zeros((maxseqlen, self.input_size))
            y_ind = np.zeros(maxseqlen)
            seq_len = len(str_seq)
            # The following conditional is true only when we've already trained, and one of the sequences in the test set is longer than the longest sequence in training.
            if seq_len > maxseqlen:
                str_seq = str_seq[:maxseqlen]
                seq_len = maxseqlen
            if train:
                for i, (clause, label) in enumerate(zip(str_seq, label_seq)):
                    clause_rep = self.rep_reader.get_clause_rep(
                        clause
                    )  # Makes embedding non-trainable from the beginning.
                    for word in clause.split():
                        all_word_types.add(word)  # Vocab
                    if use_attention:
                        if len(clause_rep) > maxclauselen:
                            clause_rep = clause_rep[:maxclauselen]
                        x[-seq_len + i][-len(clause_rep):] = clause_rep
                    else:
                        x[-seq_len + i] = np.mean(clause_rep, axis=0)
                    y_ind[-seq_len + i] = self.label_ind[label]
                X.append(x)
                Y_inds.append(y_ind)
            else:
                for i, clause in enumerate(str_seq):
                    clause_rep = self.rep_reader.get_clause_rep(clause)
                    for word in clause.split():
                        all_word_types.add(word)
                    if use_attention:
                        if len(clause_rep) > maxclauselen:
                            clause_rep = clause_rep[:maxclauselen]
                        x[-seq_len + i][-len(clause_rep):] = clause_rep
                    else:
                        x[-seq_len + i] = np.mean(clause_rep, axis=0)
                X.append(x)
        # Once there is OOV, new word vector is added to word_rep
        final_word_rep_len = len(self.rep_reader.word_rep)
        oov_ratio = float(final_word_rep_len -
                          init_word_rep_len) / len(all_word_types)
        for y_ind in Y_inds:
            y = np.zeros((maxseqlen, len(self.label_ind)))
            for i, y_ind_i in enumerate(y_ind):
                y[i][y_ind_i.astype(int)] = 1
            Y.append(y)
        self.rev_label_ind = {i: l for (l, i) in self.label_ind.items()}
        return seq_lengths, np.asarray(X), np.asarray(
            Y)  # One-hot representation of labels

    def make_data_cached_elmo(self,
                              use_attention,
                              maxseqlen=300,
                              maxclauselen=30,
                              label_ind=None,
                              train=False):
        textpath = "/nas/home/xiangcil/bio_corpus/Molecular_Interaction_Evidence_Fragment_Corpus/02_expt_spans_complete/pathway_logic/"
        all_texts = glob.glob(textpath + "*.tsv")
        cachepath = "/nas/home/xiangcil/bio_corpus/Elmo_Cached_Molecular_Interaction_Evidence_Fragment_Corpus/pathway_logic/"
        if not label_ind:
            self.label_ind = {"none": 0}
        else:
            self.label_ind = label_ind

        label_seqs = []
        label_seq = []
        elmo_layer = 3
        embedding_dim = 1024
        X = np.zeros((0, maxclauselen, maxseqlen, embedding_dim * elmo_layer))
        for filename in all_texts:
            shortfilename = filename.split("/")[-1].split(".")[0]
            embedding_numpy_file = cachepath + shortfilename + ".npy"
            X_paper = np.load(embedding_numpy_file)
            X = np.append(X, X_paper, axis=0)
            df = pd.read_csv(filename,
                             sep='\t',
                             header=0,
                             index_col=0,
                             engine='python')
            df = df[pd.notnull(df["Discourse Type"])]
            num_rec = df.shape[0]
            prev_paragraph = ""
            for i in range(num_rec):
                if df["Paragraph"][i][0] == "p":  # e.g. "p1"
                    if df["Paragraph"][i] != prev_paragraph:
                        prev_paragraph = df["Paragraph"][i]
                        if len(label_seq) > 0:
                            label_seqs.append(label_seq)
                        label_seq = []
                        clause_count = 0
                    if clause_count < maxclauselen:
                        label_seq.append(df["Discourse Type"][i])
                    clause_count += 1
            print("Loading pkl file: ", shortfilename)
        if not use_attention:
            X = np.mean(X, axis=2)

        seq_lengths = [len(label_seq) for label_seq in label_seqs]
        Y = []
        Y_inds = []
        for label_seq in label_seqs:
            for label in label_seq:
                if label not in self.label_ind:
                    # Add new labels with values 0,1,2,....
                    self.label_ind[label] = len(self.label_ind)
            y_ind = np.zeros(maxseqlen)
            seq_len = len(label_seq)
            if train:
                for i, label in enumerate(label_seq):
                    y_ind[-seq_len + i] = self.label_ind[label]
                Y_inds.append(y_ind)

        for y_ind in Y_inds:
            y = np.zeros((maxseqlen, len(self.label_ind)))
            for i, y_ind_i in enumerate(y_ind):
                y[i][y_ind_i.astype(int)] = 1
            Y.append(y)
        self.rev_label_ind = {i: l for (l, i) in self.label_ind.items()}

        return seq_lengths, X, np.asarray(Y)

    def get_attention_weights(self, X_test):
        if not self.tagger:
            raise RuntimeError("Tagger not trained yet!")
        inp = self.tagger.get_input()
        att_out = None
        for layer in self.tagger.layers:
            if layer.get_config()['name'].lower() == "tensorattention":
                att_out = layer.get_output()
                break
        if not att_out:
            raise RuntimeError("No attention layer found!")
        f = theano.function([inp], att_out)
        return f(X_test)

    def predict(self, X, bidirectional, test_seq_lengths=None, tagger=None):
        if not tagger:
            tagger = self.tagger
        if not tagger:
            raise RuntimeError("Tagger not trained yet!")
        if test_seq_lengths is None:
            # Determining actual lengths sans padding
            x_lens = []
            for x in X:
                x_len = 0
                for i, xi in enumerate(x):
                    if xi.sum() != 0:
                        x_len = len(x) - i
                        break
                x_lens.append(x_len)
        else:
            x_lens = test_seq_lengths
        # The bidirectional and unidirectional models are both plain
        # Sequential models in this example, so prediction is the same call.
        pred_probs = tagger.predict(X)
        pred_inds = np.argmax(pred_probs, axis=2)
        pred_label_seqs = []
        for pred_ind, x_len in zip(pred_inds, x_lens):
            pred_label_seq = [self.rev_label_ind[pred]
                              for pred in pred_ind][-x_len:]
            # If the following number is positive, it means we ignored some clauses in the test passage to make it the same length as the ones we trained on.
            num_ignored_clauses = max(0, x_len - len(pred_label_seq))
            # Make labels for those if needed.
            if num_ignored_clauses > 0:
                warnings.warn(
                    "Test sequence too long. Ignoring %d clauses at the beginning and labeling them none."
                    % num_ignored_clauses)
                ignored_clause_labels = ["none"] * num_ignored_clauses
                pred_label_seq = ignored_clause_labels + pred_label_seq
            pred_label_seqs.append(pred_label_seq)
        return pred_probs, pred_label_seqs, x_lens

    def fit_model(self, X, Y, use_attention, att_context, bidirectional, crf):
        early_stopping = EarlyStopping(patience=2)
        num_classes = len(self.label_ind)
        tagger = Sequential()
        word_proj_dim = 50
        if use_attention:
            sample_size, input_len, timesteps, input_dim = X.shape
            self.td1 = input_len
            self.td2 = timesteps
            tagger.add(
                HigherOrderTimeDistributedDense(input_dim=input_dim,
                                                output_dim=word_proj_dim))
            att_input_shape = (sample_size, input_len, timesteps,
                               word_proj_dim)
            tagger.add(Dropout(0.5))
            tagger.add(TensorAttention(att_input_shape, context=att_context))
        else:
            _, input_len, input_dim = X.shape
            tagger.add(
                TimeDistributed(Dense(input_dim=input_dim,
                                      units=word_proj_dim)))
        if bidirectional:
            tagger.add(
                Bidirectional(
                    LSTM(input_shape=(input_len, word_proj_dim),
                         units=word_proj_dim,
                         return_sequences=True)))
        else:
            tagger.add(
                LSTM(input_shape=(input_len, word_proj_dim),
                     units=word_proj_dim,
                     return_sequences=True))
        tagger.add(TimeDistributed(Dense(num_classes, activation='softmax')))

        def step_decay(epoch):
            initial_lrate = 0.1
            drop = 0.5
            epochs_drop = 5.0
            lrate = initial_lrate * np.power(
                drop, np.floor((1 + epoch) / epochs_drop))
            return lrate

        epoch = 100
        if crf:
            crf = CRF(num_classes, learn_mode="marginal")
            tagger.add(crf)
            #rmsprop = RMSprop(lr=0.05, rho=0.9, epsilon=None, decay=0.99)
            #lr = 0.1
            #decay = lr / epoch
            #sgd = SGD(lr=lr, decay=decay, momentum=0.9, nesterov=True)
            tagger.compile(optimizer='rmsprop',
                           loss=crf.loss_function,
                           metrics=[crf.accuracy])

        else:
            tagger.compile(loss='categorical_crossentropy',
                           optimizer='adam',
                           metrics=['accuracy'])
        #tagger.fit(X, Y, validation_split=0.1, epochs=100, callbacks=[early_stopping], batch_size=10)

        #tagger.fit(X, Y, validation_split=0.1, epochs=epoch, batch_size=10, callbacks = [LearningRateScheduler(step_decay)])
        tagger.fit(X, Y, validation_split=0.1, epochs=epoch, batch_size=10)
        tagger.summary()
        return tagger

    def train(self,
              X,
              Y,
              use_attention,
              att_context,
              bidirectional,
              cv=True,
              folds=5,
              crf=False):
        if cv:
            cv_folds = make_folds(X, Y, folds)
            accuracies = []
            fscores = []
            for fold_num, ((train_fold_X, train_fold_Y),
                           (test_fold_X, test_fold_Y)) in enumerate(cv_folds):
                self.tagger = self.fit_model(train_fold_X, train_fold_Y,
                                             use_attention, att_context,
                                             bidirectional, crf)
                pred_probs, pred_label_seqs, x_lens = self.predict(
                    test_fold_X, bidirectional, tagger=self.tagger)
                pred_inds = np.argmax(pred_probs, axis=2)
                flattened_preds = []
                flattened_targets = []
                for x_len, pred_ind, test_target in zip(
                        x_lens, pred_inds, test_fold_Y):
                    flattened_preds.extend(pred_ind[-x_len:])
                    flattened_targets.extend(
                        [list(tt).index(1) for tt in test_target[-x_len:]])
                assert len(flattened_preds) == len(flattened_targets)
                accuracy, weighted_fscore, all_fscores = evaluate(
                    flattened_targets, flattened_preds)
                print("Finished fold %d. Accuracy: %f, Weighted F-score: %f" %
                      (fold_num, accuracy, weighted_fscore))
                print("Individual f-scores:")
                for cat in all_fscores:
                    print("%s: %f" %
                          (self.rev_label_ind[cat], all_fscores[cat]))
                accuracies.append(accuracy)
                fscores.append(weighted_fscore)
            accuracies = np.asarray(accuracies)
            fscores = np.asarray(fscores)
            print("Accuracies:", accuracies)
            print("Average: %0.4f (+/- %0.4f)" %
                  (accuracies.mean(), accuracies.std() * 2))
            print(sys.stderr, "Fscores:", fscores)
            print(
                sys.stderr, "Average: %0.4f (+/- %0.4f)" %
                (fscores.mean(), fscores.std() * 2))
        else:
            self.tagger = self.fit_model(X, Y, use_attention, att_context,
                                         bidirectional, crf)
        model_ext = "att=%s_cont=%s_bi=%s" % (str(use_attention), att_context,
                                              str(bidirectional))
        model_config_file = open("model_%s_config.json" % model_ext, "w")
        model_weights_file_name = "model_%s_weights" % model_ext
        model_label_ind = "model_%s_label_ind.json" % model_ext
        model_rep_reader = "model_%s_rep_reader.pkl" % model_ext
        self.tagger.save_weights(model_weights_file_name, overwrite=True)
        json.dump(self.label_ind, open(model_label_ind, "w"))
        pickle.dump(self.rep_reader, open(model_rep_reader, "wb"))
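Since predict in this example recovers sequence lengths by scanning for the first non-zero row, a quick standalone check of that loop (arbitrary toy shapes) makes the left-zero-padding assumption explicit.

import numpy as np

# Standalone check of the length-detection loop in predict() above:
# sequences are left-padded with all-zero rows, so the effective length
# is the number of rows from the first non-zero row to the end.
X = np.zeros((2, 4, 3))
X[0, 2:] = 1.0   # effective length 2
X[1, 1:] = 1.0   # effective length 3

x_lens = []
for x in X:
    x_len = 0
    for i, xi in enumerate(x):
        if xi.sum() != 0:
            x_len = len(x) - i
            break
    x_lens.append(x_len)

print(x_lens)  # [2, 3]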
Example #7
0
class StatementClassifier(object):
  def __init__(self, word_rep_file, train=False, cv=True, folds=5, modeltype="mlp", trained_model_name="trained_model.pkl", tagset_file="tagset.pkl"):
    self.trained_model_name = "%s_%s"%(modeltype, trained_model_name)
    self.cv = cv
    self.folds = folds
    self.rep_reader = RepReader(word_rep_file)
    self.input_size = self.rep_reader.rep_shape[0]
    if modeltype == "mlp":
      self.hidden_sizes = [20, 10]
    else:
      self.hidden_size = 20
    self.max_iter = 100
    self.learning_rate = 0.01
    self.tag_index = None
    self.modeltype = modeltype
    if train:
      print >>sys.stderr, "Statement classifier initialized for training."
      if self.cv:
        print >>sys.stderr, "Cross-validation will be done"
      self.classifier = None
    else:
      self.classifier = cPickle.load(open(self.trained_model_name, "rb"))
      print >>sys.stderr, "Stored model loaded. Statement classifier initialized for prediction."

  def make_data(self, trainfile_name):
    print >>sys.stderr, "Reading data.."
    train_data = [tuple(x.strip().split("\t")) for x in codecs.open(trainfile_name, "r", "utf-8")]
    shuffle(train_data)
    train_labels, train_clauses = zip(*train_data)
    train_labels = [tl.lower() for tl in train_labels]
    tagset = list(set(train_labels))
    if not self.tag_index:
      self.tag_index = {l:i for (i, l) in enumerate(tagset)}
    Y = numpy.asarray([self.tag_index[label] for label in train_labels])
    if self.modeltype=="mlp":
      X = numpy.asarray([numpy.mean(self.rep_reader.get_clause_rep(clause.lower()), axis=0) for clause in train_clauses])
    else:
      X = numpy.asarray([self.rep_reader.get_clause_rep(clause.lower()) for clause in train_clauses])
    return X, Y, len(tagset)
    
  def classify(self, classifier, X):
    output_func = classifier.get_output_func()
    predictions = [numpy.argmax(output_func(x)) for x in X]
    return predictions

  def fit_model(self, X, Y, num_classes):
    if self.modeltype == "mlp":
      classifier = MLP(self.input_size, self.hidden_sizes, num_classes)
    else:
      classifier = RNN(self.input_size, self.hidden_size, num_classes)
    train_func = classifier.get_train_func(self.learning_rate)
    for num_iter in range(self.max_iter):
      for x, y in zip(X, Y):
        train_func(x, y)
    return classifier

  def train(self, trainfile_name):
    train_X, train_Y, num_classes = self.make_data(trainfile_name)
    accuracies = []
    fscores = []
    if self.cv:
      num_points = train_X.shape[0]
      fol_len = num_points / self.folds
      rem = num_points % self.folds
      X_folds = numpy.split(train_X, self.folds) if rem == 0 else numpy.split(train_X[:-rem], self.folds)
      Y_folds = numpy.split(train_Y, self.folds) if rem == 0 else numpy.split(train_Y[:-rem], self.folds)
      for i in range(self.folds):
        train_folds_X = []
        train_folds_Y = []
        for j in range(self.folds):
          if i != j:
            train_folds_X.append(X_folds[j])
            train_folds_Y.append(Y_folds[j])
        train_fold_X = numpy.concatenate(train_folds_X)
        train_fold_Y = numpy.concatenate(train_folds_Y)
        classifier = self.fit_model(train_fold_X, train_fold_Y, num_classes)
        predictions = self.classify(classifier, X_folds[i])
        accuracy, weighted_fscore, _ = self.evaluate(Y_folds[i], predictions)
        accuracies.append(accuracy)
        fscores.append(weighted_fscore)
      accuracies = numpy.asarray(accuracies)
      fscores = numpy.asarray(fscores)
      print >>sys.stderr, "Accuracies:", accuracies
      print >>sys.stderr, "Average: %0.4f (+/- %0.4f)"%(accuracies.mean(), accuracies.std() * 2)
      print >>sys.stderr, "Fscores:", fscores
      print >>sys.stderr, "Average: %0.4f (+/- %0.4f)"%(fscores.mean(), fscores.std() * 2)
    self.classifier = self.fit_model(train_X, train_Y, num_classes)
    cPickle.dump(self.classifier, open(self.trained_model_name, "wb"))
    #pickle.dump(tagset, open(self.stored_tagset, "wb"))
    print >>sys.stderr, "Done"

  def evaluate(self, y, pred):
    accuracy = float(sum([c == p for c, p in zip(y, pred)]))/len(pred)
    num_gold = {}
    num_pred = {}
    num_correct = {}
    for c, p in zip(y, pred):
      if c in num_gold:
        num_gold[c] += 1
      else:
        num_gold[c] = 1
      if p in num_pred:
        num_pred[p] += 1
      else:
        num_pred[p] = 1
      if c == p:
        if c in num_correct:
          num_correct[c] += 1
        else:
          num_correct[c] = 1
    fscores = {}
    for p in num_pred:
      precision = float(num_correct[p]) / num_pred[p] if p in num_correct else 0.0
      recall = float(num_correct[p]) / num_gold[p] if p in num_correct else 0.0
      fscores[p] = 2 * precision * recall / (precision + recall) if precision !=0 and recall !=0 else 0.0
    weighted_fscore = sum([fscores[p] * num_gold[p] if p in num_gold else 0.0 for p in fscores]) / sum(num_gold.values())
    return accuracy, weighted_fscore, fscores
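The per-class F-scores in evaluate above are straightforward to verify by hand; here is a tiny worked check on three toy labels, using collections.Counter in place of the manual count dictionaries.

from collections import Counter

# Toy check of the per-class F-score logic in evaluate() above.
# gold = [a, a, b], pred = [a, b, b]:
#   class a: precision 1/1, recall 1/2 -> F = 2/3
#   class b: precision 1/2, recall 1/1 -> F = 2/3
#   weighted F = (2/3 * 2 + 2/3 * 1) / 3 = 2/3
y = ["a", "a", "b"]
pred = ["a", "b", "b"]
num_gold = Counter(y)
num_pred = Counter(pred)
num_correct = Counter(c for c, p in zip(y, pred) if c == p)

fscores = {}
for p in num_pred:
    precision = float(num_correct[p]) / num_pred[p]
    recall = float(num_correct[p]) / num_gold[p] if num_gold[p] else 0.0
    fscores[p] = (2 * precision * recall / (precision + recall)
                  if precision and recall else 0.0)
weighted_fscore = sum(fscores[p] * num_gold[p] for p in fscores) / sum(num_gold.values())
print(fscores)          # {'a': 0.666..., 'b': 0.666...}
print(weighted_fscore)  # 0.666...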
Example #8
0
    parser.add_argument('-i', '--inFile', help='Input File')
    parser.add_argument('-t', '--textColumn', help='Name of text column')
    parser.add_argument('-l', '--labelColumn', help='Name of label column')
    parser.add_argument('-e', '--esIndex', help='ElasticSearch Index Name')
    parser.add_argument('-m', '--modelFile', help='Keras model file')
    '''
    '''
    SIGNATURE FOR ADDING FLAGS
    add_boolean_argument(parser, 'full_text_pdf')
    '''
    args = parser.parse_args()

    base_dir = '/Users/Gully/Documents/Projects/2_active/corpora_local/intact/2018-04-17-cleanup/'
    index_name = 'oa_all_fasttext'
    model_file_name = 'i_meth_label.model.h5'
    rep_reader = RepReader(index_name=index_name, elastic=True)
    # From https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/tutorials/input_fn/boston.py

    COLUMNS = ["ID", "i_meth", "p_meth", "pmid", "subfig", "text"]
    FEATURES = ["text"]
    LABEL = "p_meth"

    interaction_df = pd.read_csv(base_dir + 'ontologies/i_meth_codes.tsv',
                                 sep='\t',
                                 names=['text', 'uri', 'label'],
                                 index_col=0)
    interaction_df

    participant_df = pd.read_csv(base_dir + 'ontologies/p_meth_codes.tsv',
                                 sep='\t',
                                 names=['text', 'uri', 'label'],
Example #9
0
    parser.add_argument('inFile', help='Input File')
    parser.add_argument('textColumn', help='Name of text column')
    parser.add_argument('labelColumn', help='Name of label column')
    parser.add_argument('testSize', help='Size of held-out test set')
    parser.add_argument('--kerasFile', help='Keras model file')
    parser.add_argument('--esIndex',
                        help='ElasticSearch Representation Index Name')
    parser.add_argument('--repFile', help='Representation File Path')

    add_boolean_argument(parser, 'randomizeTestSet')

    args = parser.parse_args()

    rep_reader = None
    if args.repFile is not None:
        rep_reader = RepReader(embedding_file=args.repFile, elastic=False)
    elif args.esIndex is not None:
        rep_reader = RepReader(index_name=args.esIndex, elastic=True)
    else:
        raise ValueError(
            "You must specify either kerasFile or esIndex. Neither specified.")

    sd = SpreadsheetData(args.inFile, args.textColumn, args.labelColumn,
                         args.testSize, args.randomizeTestSet)

    # embedding matrix
    print('preparing embedding matrix...')
    words_not_found = []
    nb_words = min(sd.MAX_NB_WORDS, len(sd.word_index) + 1)
    embed_dim = rep_reader.rep_shape[0]
    embedding_matrix = np.zeros((nb_words, embed_dim))
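Example #9 stops just after allocating the embedding matrix; one common way to finish that preparation, sketched below with a hypothetical word_index and a toy word_vectors dict standing in for the rep_reader lookup, is to copy each known word's vector into its row and leave unknown words as all-zero rows.

import numpy as np

# Hypothetical stand-ins for sd.word_index and the rep_reader lookup.
word_index = {"kinase": 1, "binds": 2, "unknownword": 3}
word_vectors = {"kinase": np.array([0.1, 0.2]), "binds": np.array([0.3, 0.4])}

MAX_NB_WORDS = 10
embed_dim = 2
nb_words = min(MAX_NB_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((nb_words, embed_dim))
words_not_found = []
for word, i in word_index.items():
    if i >= nb_words:
        continue
    vec = word_vectors.get(word)
    if vec is not None:
        embedding_matrix[i] = vec      # row i holds the vector for word i
    else:
        words_not_found.append(word)   # stays an all-zero row

print(embedding_matrix)
print(words_not_found)  # ['unknownword']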
Example #10
0
class StatementClassifier(object):
  def __init__(self, word_rep_file, train=False, cv=True, folds=5, modeltype="mlp", trained_model_name="trained_model.pkl", tagset_file="tagset.pkl"):
    self.trained_model_name = "%s_%s"%(modeltype, trained_model_name)
    self.cv = cv
    self.folds = folds
    self.rep_reader = RepReader(word_rep_file)
    self.input_size = self.rep_reader.rep_shape[0]
    if modeltype == "mlp":
      self.hidden_sizes = [20, 10]
    else:
      self.hidden_size = 20
    self.max_iter = 100
    self.learning_rate = 0.01
    self.tag_index = None
    self.modeltype = modeltype
    if train:
      print >>sys.stderr, "Statement classifier initialized for training."
      if self.cv:
        print >>sys.stderr, "Cross-validation will be done"
      self.classifier = None
    else:
      self.classifier = cPickle.load(open(self.trained_model_name, "rb"))
      print >>sys.stderr, "Stored model loaded. Statement classifier initialized for prediction."

  def make_data(self, trainfile_name):
    print >>sys.stderr, "Reading data.."
    train_data = [tuple(x.strip().split("\t")) for x in codecs.open(trainfile_name, "r", "utf-8")]
    shuffle(train_data)
    train_clauses, train_labels = zip(*train_data)
    train_labels = [tl.lower() for tl in train_labels]
    tagset = list(set(train_labels))
    if not self.tag_index:
      self.tag_index = {l:i for (i, l) in enumerate(tagset)}
    Y = numpy.asarray([self.tag_index[label] for label in train_labels])
    if self.modeltype=="mlp":
      X = numpy.asarray([numpy.mean(self.rep_reader.get_clause_rep(clause.lower()), axis=0) for clause in train_clauses], dtype='float32')
    elif self.modeltype == "rnn":
      X = numpy.asarray([numpy.asarray(self.rep_reader.get_clause_rep(clause.lower()), dtype='float32') for clause in train_clauses])
    elif self.modeltype == "lstm":
      clause_reps = [self.rep_reader.get_clause_rep(clause.lower()) for clause in train_clauses]
      maxlen = max([len(clause_rep) for clause_rep in clause_reps])
      # Padding X with zeros at the end to make all sequences of same length
      X = numpy.zeros((len(train_clauses), maxlen, max(self.rep_reader.rep_shape)))
      for i in range(len(clause_reps)):
        x_len = len(clause_reps[i])
        X[i][-x_len:] = clause_reps[i]
    return X, Y, len(tagset)
    
  def classify(self, classifier, X):
    if self.modeltype == "mlp" or self.modeltype == "rnn":
      output_func = classifier.get_output_func()
      predictions = [numpy.argmax(output_func(x)) for x in X]
    elif self.modeltype == "lstm":
      predictions = [numpy.argmax(classifier.predict(numpy.asarray([x]))) for x in X]
    return predictions

  def fit_model(self, X, Y, num_classes):
    if self.modeltype == "mlp" or self.modeltype == "rnn":
      if self.modeltype == "mlp":
        classifier = MLP(self.input_size, self.hidden_sizes, num_classes)
      else:
        classifier = RNN(self.input_size, self.hidden_size, num_classes)
      train_func = classifier.get_train_func(self.learning_rate)
      for num_iter in range(self.max_iter):
        for x, y in zip(X, Y):
          train_func(x, y)
    elif self.modeltype == "lstm":
      classifier = Sequential()
      classifier.add(LSTM(input_dim=self.input_size, output_dim=self.input_size/2))
      #classifier.add(Dropout(0.3))
      classifier.add(Dense(num_classes, activation='softmax'))
      classifier.compile(loss='categorical_crossentropy', optimizer='adam')
      Y_indexed = numpy.zeros((len(Y), num_classes))
      for i in range(len(Y)):
        Y_indexed[i][Y[i]] = 1
      classifier.fit(X, Y_indexed, nb_epoch=20)
    return classifier

  def train(self, trainfile_name):
    train_X, train_Y, num_classes = self.make_data(trainfile_name)
    accuracies = []
    fscores = []
    if self.cv:
      cv_folds = make_folds(train_X, train_Y, self.folds)
      for i, ((train_fold_X, train_fold_Y), (test_fold_X, test_fold_Y)) in enumerate(cv_folds):
        classifier = self.fit_model(train_fold_X, train_fold_Y, num_classes)
        predictions = self.classify(classifier, test_fold_X)
        accuracy, weighted_fscore, _ = evaluate(test_fold_Y, predictions)
        print >>sys.stderr, "Finished fold %d. Accuracy: %f, F-score: %f"%(i, accuracy, weighted_fscore)
        accuracies.append(accuracy)
        fscores.append(weighted_fscore)
      accuracies = numpy.asarray(accuracies)
      fscores = numpy.asarray(fscores)
      print >>sys.stderr, "Accuracies:", accuracies
      print >>sys.stderr, "Average: %0.4f (+/- %0.4f)"%(accuracies.mean(), accuracies.std() * 2)
      print >>sys.stderr, "Fscores:", fscores
      print >>sys.stderr, "Average: %0.4f (+/- %0.4f)"%(fscores.mean(), fscores.std() * 2)
    #self.classifier = self.fit_model(train_X, train_Y, num_classes)
    #cPickle.dump(classifier, open(self.trained_model_name, "wb"))
    #pickle.dump(tagset, open(self.stored_tagset, "wb"))
    print >>sys.stderr, "Done"
Example #11
0
 def __init__(self, word_rep_file):
   self.rep_reader = RepReader(word_rep_file)
   self.input_size = self.rep_reader.rep_shape[0]
   self.tagger = None
Example #12
0
class PassageTagger(object):
  def __init__(self, word_rep_file):
    self.rep_reader = RepReader(word_rep_file)
    self.input_size = self.rep_reader.rep_shape[0]
    self.tagger = None

  def make_data(self, trainfilename, use_attention, maxseqlen=None, maxclauselen=None, label_ind=None, train=False):
    print >>sys.stderr, "Reading data.."
    str_seqs, label_seqs = read_passages(trainfilename, train)
    if not label_ind:
      self.label_ind = {"none": 0}
    else:
      self.label_ind = label_ind
    if not maxseqlen:
      maxseqlen = max([len(label_seq) for label_seq in label_seqs])
    if not maxclauselen:
      if use_attention:
        clauselens = []
        for str_seq in str_seqs:
          clauselens.extend([len(clause.split()) for clause in str_seq])
        maxclauselen = max(clauselens)
    X = []
    Y = []
    Y_inds = []
    for str_seq, label_seq in zip(str_seqs, label_seqs):
      for label in label_seq:
        if label not in self.label_ind:
          self.label_ind[label] = len(self.label_ind)
      if use_attention:
        x = numpy.zeros((maxseqlen, maxclauselen, self.input_size))
      else:
        x = numpy.zeros((maxseqlen, self.input_size))
      y_ind = numpy.zeros(maxseqlen)
      seq_len = len(str_seq)
      # The following conditional is true only when we've already trained, and one of the sequences in the test set is longer than the longest sequence in training.
      if seq_len > maxseqlen:
        str_seq = str_seq[:maxseqlen]
        seq_len = maxseqlen 
      if train:
        for i, (clause, label) in enumerate(zip(str_seq, label_seq)):
          clause_rep = self.rep_reader.get_clause_rep(clause)
          if use_attention:
            if len(clause_rep) > maxclauselen:
              clause_rep = clause_rep[:maxclauselen]
            x[-seq_len+i][-len(clause_rep):] = clause_rep
          else:
            x[-seq_len+i] = numpy.mean(clause_rep, axis=0)
          y_ind[-seq_len+i] = self.label_ind[label]
        X.append(x)
        Y_inds.append(y_ind)
      else:
        for i, clause in enumerate(str_seq):
          clause_rep = self.rep_reader.get_clause_rep(clause)
          if use_attention:
            if len(clause_rep) > maxclauselen:
              clause_rep = clause_rep[:maxclauselen]
            x[-seq_len+i][-len(clause_rep):] = clause_rep
          else:
            x[-seq_len+i] = numpy.mean(clause_rep, axis=0)
        X.append(x)
    for y_ind in Y_inds:
      y = numpy.zeros((maxseqlen, len(self.label_ind)))
      for i, y_ind_i in enumerate(y_ind):
        y[i][int(y_ind_i)] = 1
      Y.append(y) 
    self.rev_label_ind = {i: l for (l, i) in self.label_ind.items()}
    return numpy.asarray(X), numpy.asarray(Y)

  def get_attention_weights(self, X_test):
    if not self.tagger:
      raise RuntimeError, "Tagger not trained yet!"
    inp = self.tagger.get_input()
    att_out = None
    for layer in self.tagger.layers:
      if layer.get_config()['name'].lower() == "tensorattention":
        att_out = layer.get_output()
        break
    if not att_out:
      raise RuntimeError, "No attention layer found!"
    f = theano.function([inp], att_out)
    return f(X_test)

  def predict(self, X, bidirectional, tagger=None):
    if not tagger:
      tagger = self.tagger
    if not tagger:
      raise RuntimeError, "Tagger not trained yet!"
    # Determining actual lengths sans padding
    x_lens = []
    for x in X:
      x_len = 0
      for i, xi in enumerate(x):
        if xi.sum() != 0:
          x_len = len(x) - i
          break
      x_lens.append(x_len)
    if bidirectional:
      pred_probs = tagger.predict({'input':X})['output']
    else:
      pred_probs = tagger.predict(X)
    pred_inds = numpy.argmax(pred_probs, axis=2)
    pred_label_seqs = []
    for pred_ind, x_len in zip(pred_inds, x_lens):
      pred_label_seq = [self.rev_label_ind[pred] for pred in pred_ind][-x_len:]
      pred_label_seqs.append(pred_label_seq)
    return pred_probs, pred_label_seqs, x_lens

  def fit_model(self, X, Y, use_attention, att_context, bidirectional):
    print >>sys.stderr, "Input shape:", X.shape, Y.shape
    num_classes = len(self.label_ind)
    if bidirectional:
      tagger = Graph()
      tagger.add_input(name='input', input_shape=X.shape[1:])
      if use_attention:
        tagger.add_node(TensorAttention(X.shape[1:], context=att_context), name='attention', input='input')
        lstm_input_node = 'attention'
      else:
        lstm_input_node = 'input'
      tagger.add_node(LSTM(X.shape[-1]/2, return_sequences=True), name='forward', input=lstm_input_node)
      tagger.add_node(LSTM(X.shape[-1]/2, return_sequences=True, go_backwards=True), name='backward', input=lstm_input_node)
      tagger.add_node(TimeDistributedDense(num_classes, activation='softmax'), name='softmax', inputs=['forward', 'backward'], merge_mode='concat', concat_axis=-1)
      tagger.add_output(name='output', input='softmax')
      print >>sys.stderr, tagger.summary()
      tagger.compile('adam', {'output':'categorical_crossentropy'})
      tagger.fit({'input':X, 'output':Y})
    else:
      tagger = Sequential()
      word_proj_dim = 50
      if use_attention:
        _, input_len, timesteps, input_dim = X.shape
        tagger.add(HigherOrderTimeDistributedDense(input_dim=input_dim, output_dim=word_proj_dim))
        att_input_shape = (input_len, timesteps, word_proj_dim)
        print >>sys.stderr, "Attention input shape:", att_input_shape
        tagger.add(Dropout(0.5))
        tagger.add(TensorAttention(att_input_shape, context=att_context))
        #tagger.add(Dropout(0.5))
      else:
        _, input_len, input_dim = X.shape
        tagger.add(TimeDistributedDense(input_dim=input_dim, output_dim=word_proj_dim))
      tagger.add(LSTM(input_dim=word_proj_dim, output_dim=word_proj_dim, input_length=input_len, return_sequences=True))
      tagger.add(TimeDistributedDense(num_classes, activation='softmax'))
      print >>sys.stderr, tagger.summary()
      tagger.compile(loss='categorical_crossentropy', optimizer='adam')
      tagger.fit(X, Y, batch_size=10)

    return tagger

  def train(self, X, Y, use_attention, att_context, bidirectional, cv=True, folds=5):
    if cv:
      cv_folds = make_folds(X, Y, folds)
      accuracies = []
      fscores = []
      for fold_num, ((train_fold_X, train_fold_Y), (test_fold_X, test_fold_Y)) in enumerate(cv_folds):
        tagger = self.fit_model(train_fold_X, train_fold_Y, use_attention, att_context, bidirectional)
        pred_probs, pred_label_seqs, x_lens = self.predict(test_fold_X, bidirectional, tagger)
        pred_inds = numpy.argmax(pred_probs, axis=2)
        flattened_preds = []
        flattened_targets = []
        for x_len, pred_ind, test_target in zip(x_lens, pred_inds, test_fold_Y):
          flattened_preds.extend(pred_ind[-x_len:])
          flattened_targets.extend([list(tt).index(1) for tt in test_target[-x_len:]])
        assert len(flattened_preds) == len(flattened_targets)
        accuracy, weighted_fscore, all_fscores = evaluate(flattened_targets, flattened_preds)
        print >>sys.stderr, "Finished fold %d. Accuracy: %f, Weighted F-score: %f"%(fold_num, accuracy, weighted_fscore)
        print >>sys.stderr, "Individual f-scores:"
        for cat in all_fscores:
          print >>sys.stderr, "%s: %f"%(self.rev_label_ind[cat], all_fscores[cat])
        accuracies.append(accuracy)
        fscores.append(weighted_fscore)
      accuracies = numpy.asarray(accuracies)
      fscores = numpy.asarray(fscores)
      print >>sys.stderr, "Accuracies:", accuracies
      print >>sys.stderr, "Average: %0.4f (+/- %0.4f)"%(accuracies.mean(), accuracies.std() * 2)
      print >>sys.stderr, "Fscores:", fscores
      print >>sys.stderr, "Average: %0.4f (+/- %0.4f)"%(fscores.mean(), fscores.std() * 2)
    self.tagger = self.fit_model(X, Y, use_attention, att_context, bidirectional)
    model_ext = "att=%s_cont=%s_bi=%s"%(str(use_attention), att_context, str(bidirectional))
    model_config_file = open("model_%s_config.json"%model_ext, "w")
    model_weights_file_name = "model_%s_weights"%model_ext
    model_label_ind = "model_%s_label_ind.json"%model_ext
    print >>model_config_file, self.tagger.to_json()
    self.tagger.save_weights(model_weights_file_name)
    json.dump(self.label_ind, open(model_label_ind, "w"))
Example #13
0
class StatementClassifier(object):
    def __init__(self,
                 word_rep_file,
                 train=False,
                 cv=True,
                 folds=5,
                 modeltype="mlp",
                 trained_model_name="trained_model.pkl",
                 tagset_file="tagset.pkl"):
        self.trained_model_name = "%s_%s" % (modeltype, trained_model_name)
        self.cv = cv
        self.folds = folds
        self.rep_reader = RepReader(word_rep_file)
        self.input_size = self.rep_reader.rep_shape[0]
        if modeltype == "mlp":
            self.hidden_sizes = [20, 10]
        else:
            self.hidden_size = 20
        self.max_iter = 100
        self.learning_rate = 0.01
        self.tag_index = None
        self.modeltype = modeltype
        if train:
            print >> sys.stderr, "Statement classifier initialized for training."
            if self.cv:
                print >> sys.stderr, "Cross-validation will be done"
            self.classifier = None
        else:
            self.classifier = cPickle.load(open(self.trained_model_name, "rb"))
            print >> sys.stderr, "Stored model loaded. Statement classifier initialized for prediction."

    def make_data(self, trainfile_name):
        print >> sys.stderr, "Reading data.."
        train_data = [
            tuple(x.strip().split("\t"))
            for x in codecs.open(trainfile_name, "r", "utf-8")
        ]
        shuffle(train_data)
        train_labels, train_clauses = zip(*train_data)
        train_labels = [tl.lower() for tl in train_labels]
        tagset = list(set(train_labels))
        if not self.tag_index:
            self.tag_index = {l: i for (i, l) in enumerate(tagset)}
        Y = numpy.asarray([self.tag_index[label] for label in train_labels])
        if self.modeltype == "mlp":
            X = numpy.asarray([
                numpy.mean(self.rep_reader.get_clause_rep(clause.lower()),
                           axis=0) for clause in train_clauses
            ])
        else:
            X = numpy.asarray([
                self.rep_reader.get_clause_rep(clause.lower())
                for clause in train_clauses
            ])
        return X, Y, len(tagset)

    def classify(self, classifier, X):
        output_func = classifier.get_output_func()
        predictions = [numpy.argmax(output_func(x)) for x in X]
        return predictions

    def fit_model(self, X, Y, num_classes):
        if self.modeltype == "mlp":
            classifier = MLP(self.input_size, self.hidden_sizes, num_classes)
        else:
            classifier = RNN(self.input_size, self.hidden_size, num_classes)
        train_func = classifier.get_train_func(self.learning_rate)
        for num_iter in range(self.max_iter):
            for x, y in zip(X, Y):
                train_func(x, y)
        return classifier

    def train(self, trainfile_name):
        train_X, train_Y, num_classes = self.make_data(trainfile_name)
        accuracies = []
        fscores = []
        if self.cv:
            num_points = train_X.shape[0]
            fol_len = num_points / self.folds
            rem = num_points % self.folds
            X_folds = numpy.split(train_X,
                                  self.folds) if rem == 0 else numpy.split(
                                      train_X[:-rem], self.folds)
            Y_folds = numpy.split(train_Y,
                                  self.folds) if rem == 0 else numpy.split(
                                      train_Y[:-rem], self.folds)
            for i in range(self.folds):
                train_folds_X = []
                train_folds_Y = []
                for j in range(self.folds):
                    if i != j:
                        train_folds_X.append(X_folds[j])
                        train_folds_Y.append(Y_folds[j])
                train_fold_X = numpy.concatenate(train_folds_X)
                train_fold_Y = numpy.concatenate(train_folds_Y)
                classifier = self.fit_model(train_fold_X, train_fold_Y,
                                            num_classes)
                predictions = self.classify(classifier, X_folds[i])
                accuracy, weighted_fscore, _ = self.evaluate(
                    Y_folds[i], predictions)
                accuracies.append(accuracy)
                fscores.append(weighted_fscore)
            accuracies = numpy.asarray(accuracies)
            fscores = numpy.asarray(fscores)
            print >> sys.stderr, "Accuracies:", accuracies
            print >> sys.stderr, "Average: %0.4f (+/- %0.4f)" % (
                accuracies.mean(), accuracies.std() * 2)
            print >> sys.stderr, "Fscores:", fscores
            print >> sys.stderr, "Average: %0.4f (+/- %0.4f)" % (
                fscores.mean(), fscores.std() * 2)
        self.classifier = self.fit_model(train_X, train_Y, num_classes)
        cPickle.dump(self.classifier, open(self.trained_model_name, "wb"))
        #pickle.dump(tagset, open(self.stored_tagset, "wb"))
        print >> sys.stderr, "Done"

    def evaluate(self, y, pred):
        accuracy = float(sum([c == p for c, p in zip(y, pred)])) / len(pred)
        num_gold = {}
        num_pred = {}
        num_correct = {}
        for c, p in zip(y, pred):
            if c in num_gold:
                num_gold[c] += 1
            else:
                num_gold[c] = 1
            if p in num_pred:
                num_pred[p] += 1
            else:
                num_pred[p] = 1
            if c == p:
                if c in num_correct:
                    num_correct[c] += 1
                else:
                    num_correct[c] = 1
        fscores = {}
        for p in num_pred:
            precision = float(
                num_correct[p]) / num_pred[p] if p in num_correct else 0.0
            recall = float(
                num_correct[p]) / num_gold[p] if p in num_correct else 0.0
            fscores[p] = 2 * precision * recall / (
                precision + recall) if precision != 0 and recall != 0 else 0.0
        weighted_fscore = sum([
            fscores[p] * num_gold[p] if p in num_gold else 0.0 for p in fscores
        ]) / sum(num_gold.values())
        return accuracy, weighted_fscore, fscores
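Note that the fold construction in train above silently drops the remainder when the number of training points is not divisible by the fold count; the standalone sketch below (toy data) shows that behavior.

import numpy

# Sketch of the fold construction used in train() above: with 10 points
# and 3 folds, the last point is dropped before numpy.split.
train_X = numpy.arange(10).reshape(10, 1)
folds = 3
num_points = train_X.shape[0]
rem = num_points % folds
X_folds = numpy.split(train_X, folds) if rem == 0 else numpy.split(
    train_X[:-rem], folds)
print([fold.ravel().tolist() for fold in X_folds])  # [[0, 1, 2], [3, 4, 5], [6, 7, 8]]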