Example #1
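# Assumed context (not shown in the original snippet): the standard-library
# json module, a scorer from the `rouge` package, and the helper
# yield_candidate_text defined elsewhere in this file.
import json

from rouge import Rouge

rouge_engine = Rouge()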
def yieldRouge(CorpusFile):
    """yield ROUGE scores of all sentences in corpus
    >>> rouge = yieldRouge('BioASQ-trainingDataset5b.json')
    >>> target = (0, '15829955', 0, {'N-1': 0.1519, 'S4': 0.0, 'SU4': 0.04525, 'N-2': 0.0, 'L': 0.0}, 'The identification of common variants that contribute to the genesis of human inherited disorders remains a significant challenge.')
    >>> next(rouge) == target
    True
    >>> target2 = (0, '15829955', 1, {'N-1': 0.31915, 'S4': 0.02273, 'SU4': 0.09399, 'N-2': 0.13043, 'L': 0.04445}, 'Hirschsprung disease (HSCR) is a multifactorial, non-mendelian disorder in which rare high-penetrance coding sequence mutations in the receptor tyrosine kinase RET contribute to risk in combination with mutations at other genes.')
    >>> next(rouge) == target2
    True
    """
    with open(CorpusFile, encoding='utf-8') as f:
        data = json.load(f)['questions']
    for qi in range(len(data)):
        if 'snippets' not in data[qi]:
            print("Warning: No snippets in question %s" % data[qi]['body'])
            continue
        if isinstance(data[qi]['ideal_answer'], list):
            ideal_answers = data[qi]['ideal_answer']
        else:
            ideal_answers = [data[qi]['ideal_answer']]
        for (pubmedid, senti, sent) in yield_candidate_text(data[qi]):
            # Score the candidate against every ideal answer and keep the
            # best ROUGE-L F-score. (Note: the doctest above appears to come
            # from an earlier version that yielded a dict of ROUGE scores.)
            rouge_scores = [
                rouge_engine.get_scores(h, sent)[0] for h in ideal_answers
            ]
            rouge_l = max(r['rouge-l']['f'] for r in rouge_scores)
            yield (qi, pubmedid, senti, rouge_l, sent)
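
A minimal sketch of how this generator might be consumed to produce the rl-rouge5b.csv file that NNbaseline() reads in the next example; the 'qid' and 'L' column names are required by that code, while the remaining columns are assumptions.

import csv

with open('rl-rouge5b.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    # NNbaseline() looks rows up by 'qid' and reads the 'L' (ROUGE-L) column
    writer.writerow(['qid', 'pubmedid', 'sentid', 'L'])
    for qid, pubmedid, sentid, rouge_l, sent in yieldRouge('BioASQ-trainingDataset5b.json'):
        writer.writerow([qid, pubmedid, sentid, rouge_l])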
Example #2
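# Assumed context (not shown): numpy as np, tensorflow as tf (1.x), csv,
# sklearn's TfidfVectorizer, nltk's sent_tokenize, the Environment class and
# the yield_candidate_text and my_tokenize helpers, plus module-level
# constants such as N_HIDDEN, VERBOSE, RESTORE, SAVE_EPISODES,
# NN_CHECKPOINT_PATH, NN_LOGFILE and NN_EVALFILE. EVALFILE, the default
# testfile, is the evaluation log written by train() in Example #3, so both
# approaches are evaluated on the same test questions.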
def NNbaseline(testfile=EVALFILE):
    """Evaluate a baseline that uses supervised NN"""
    KEEP_PROB = 0.2 # Probability of keeping a unit at the dropout layers
    nanswers = {"summary": 6,
                "factoid": 2,
                "yesno": 2,
                "list": 3}
    with open(NN_LOGFILE, 'w') as f:
        f.write("episode,reward,QID,summary\n")

    with open(NN_EVALFILE, 'w') as f:
        f.write("episode,reward,QID,summary\n")

    env = Environment(jsonfile='BioASQ-trainingDataset5b.json')
    all_data = env.data
    with open('rl-rouge5b.csv') as f:
        reader = csv.DictReader(f)
        all_rouge = list(reader)

    if testfile is None:
        all_indices = list(range(len(all_data)))
        np.random.shuffle(all_indices)
        split_boundary = int(len(all_indices)*.8)
        train_indices = all_indices[:split_boundary]
        test_indices = all_indices[split_boundary:]
    else:
        with open(testfile) as f:
            reader = csv.DictReader(f)
            test_indices = list(set(int(l['QID']) for l in reader) & set(range(len(all_data))))
        train_indices = [i for i in range(len(all_data)) if i not in test_indices]

    print("Train indices:", train_indices)
    print("Test indices:", test_indices)

    tfidf_train_text = [all_data[x]['body'] for x in train_indices]
    tfidf_train_text += [c[2] for x in train_indices for c in yield_candidate_text(all_data[x])]
    ideal_summaries_sentences = []
    for x in train_indices:
        ideal_summaries = all_data[x]['ideal_answer']
        if not isinstance(ideal_summaries, list):
            ideal_summaries = [ideal_summaries]
        for ideal_sum in ideal_summaries:
            ideal_summaries_sentences += sent_tokenize(ideal_sum)
    tfidf_train_text += ideal_summaries_sentences
    #print(len(tfidf_train_text))
    #print(tfidf_train_text[:10])
    tfidf = TfidfVectorizer(tokenizer=my_tokenize)
    tfidf.fit(tfidf_train_text)
    vocabulary_size = len(tfidf.get_feature_names())

    graph = tf.Graph()
    with graph.as_default():
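        # Two-input regressor: X_state holds the tf-idf of the whole
        # candidate pool concatenated with one candidate sentence, Q_state
        # the tf-idf of the question, and Y_result the target ROUGE-L score.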
        X_state = tf.placeholder(tf.float32, shape=[None, 2*vocabulary_size]) # + 1])
        Q_state = tf.placeholder(tf.float32, shape=[None, vocabulary_size])
        Y_result = tf.placeholder(tf.float32, shape=[None, 1])
        keep_prob = tf.placeholder(tf.float32)
        dropout1 = tf.nn.dropout(tf.concat((X_state, Q_state), 1), keep_prob)
        hidden = tf.layers.dense(dropout1, N_HIDDEN, activation=tf.nn.relu,
                                 kernel_initializer=tf.contrib.layers.variance_scaling_initializer())
        dropout2 = tf.nn.dropout(hidden, keep_prob)
        outputs = tf.layers.dense(dropout2, 1, activation=None)

        mse = tf.reduce_mean(tf.square(Y_result - outputs))
        optimizer = tf.train.AdamOptimizer()
        train = optimizer.minimize(mse)
        init = tf.global_variables_initializer()
        saver = tf.train.Saver()

    if VERBOSE > 0:
        print("Training NN Baseline")
    with tf.Session(graph=graph) as sess:
        if RESTORE:
            saver.restore(sess, NN_CHECKPOINT_PATH)
        else:
            init.run()

        episode = 0
        while True:
            # 1. Train
            while True:
                train_x = np.random.choice(train_indices)
                observation = env.reset(train_x)
                if len(env.candidates) > 0:
                    break
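            # Build a training batch from this question: each row of X pairs
            # the tf-idf of all candidate text with one candidate sentence,
            # Q repeats the question tf-idf per row, and Y holds the
            # precomputed ROUGE-L targets from rl-rouge5b.csv.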
            tfidf_all_candidates = tfidf.transform(env.candidates).todense()
            tfidf_all_text = tfidf.transform([" ".join(env.candidates)]).todense()[0,:]
            Y = [[float(l['L'])] for l in all_rouge if int(l['qid']) == env.qid][:len(env.candidates)]
            Q = np.tile(tfidf.transform([env.question]).todense()[0,:], (len(env.candidates),1))
            X = np.vstack([np.hstack([tfidf_all_text, c]) for c in tfidf_all_candidates])
            sess.run(train,
                    feed_dict={X_state: X,
                               Q_state: Q,
                               Y_result: Y,
                               keep_prob: KEEP_PROB})
            # 2. Evaluate
            predicted = sess.run(outputs,
                                 feed_dict={X_state: X,
                                            Q_state: Q,
                                            keep_prob: 1.0})
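            # Greedy selection: keep the n top-scoring candidates, where n
            # depends on the question type; topn[0] is the score threshold.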
            n = nanswers[env.qtype]
            topn = sorted(predicted)[-n:]
            while not observation['done']:
                if predicted[observation['next_candidate']] >= topn[0]:
                    action = 1
                else:
                    action = 0
                observation = env.step(action)
            reward = observation['reward']
            print("Episode: %i, reward: %f" % (episode, reward))
            with open(NN_LOGFILE, 'a') as f:
                f.write('%i,%f,%i,"%s"\n' % (episode, reward, env.qid, " ".join([str(x) for x in observation['summary']])))

            episode += 1
            if episode % SAVE_EPISODES == 0:
                print("Saving checkpoint in %s" % (NN_CHECKPOINT_PATH))
                saver.save(sess, NN_CHECKPOINT_PATH)
                # 3. Evaluate test data
                print("Testing results")
                test_results = []
                for test_x in test_indices:
                    observation = env.reset(test_x)
                    if len(env.candidates) == 0:
                        continue

                    tfidf_all_candidates = tfidf.transform(env.candidates).todense()
                    tfidf_all_text = tfidf.transform([" ".join(env.candidates)]).todense()[0,:]
                    Q = np.tile(tfidf.transform([env.question]).todense()[0,:], (len(env.candidates), 1))
                    X = np.vstack([np.hstack([tfidf_all_text, c]) for c in tfidf_all_candidates])
                    predicted = sess.run(outputs,
                                         feed_dict={X_state: X,
                                                    Q_state: Q,
                                                    keep_prob: 1.0})
                    n = nanswers[env.qtype]
                    topn = sorted(predicted)[-n:]
                    while not observation['done']:
                        if predicted[observation['next_candidate']] >= topn[0]:
                            action = 1
                        else:
                            action = 0
                        observation = env.step(action)
                    reward = observation['reward']
                    test_results.append(reward)
                    with open(NN_EVALFILE, 'a') as f:
                        f.write('%i,%f,%i,"%s"\n' % (episode, reward, env.qid, " ".join([str(x) for x in observation['summary']])))
                print("Mean of evaluation results:", np.mean(test_results))
Example #3
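# Assumed context (not shown): numpy as np, tensorflow as tf (1.x),
# sklearn's TfidfVectorizer, nltk's sent_tokenize, the Environment class and
# helpers from the previous examples, the NNModel class (a sketch follows
# this example), and constants such as LOGFILE, EVALFILE, CHECKPOINT_PATH,
# RESTORE, SAVE_EPISODES and VERBOSE.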
def train():

    with open(LOGFILE, 'w') as f:
        f.write("episode,reward,QID,summary\n")

    with open(EVALFILE, 'w') as f:
        f.write("episode,reward,QID,summary\n")

    env = Environment(jsonfile='BioASQ-trainingDataset5b.json')
    alldata = list(range(len(env.data)))
    np.random.shuffle(alldata)
    split_boundary = int(len(alldata) * .8)
    train_indices = alldata[:split_boundary]
    test_indices = alldata[split_boundary:]

    # train tf.idf
    if VERBOSE > 0:
        print("Training tf.idf")
    tfidf_train_text = [env.data[x]['body'] for x in train_indices]
    tfidf_train_text += [
        c[2] for x in train_indices for c in yield_candidate_text(env.data[x])
    ]
    ideal_summaries_sentences = []
    for x in train_indices:
        ideal_summaries = env.data[x]['ideal_answer']
        if not isinstance(ideal_summaries, list):
            ideal_summaries = [ideal_summaries]
        for ideal_sum in ideal_summaries:
            ideal_summaries_sentences += sent_tokenize(ideal_sum)
    tfidf_train_text += ideal_summaries_sentences
    #print(len(tfidf_train_text))
    #print(tfidf_train_text[:10])
    tfidf = TfidfVectorizer(tokenizer=my_tokenize)
    tfidf.fit(tfidf_train_text)
    nnModel = NNModel(len(tfidf.get_feature_names()))

    if VERBOSE > 0:
        print("Training REINFORCE")
    with tf.Session(graph=nnModel.graph) as sess:
        if RESTORE:
            nnModel.saver.restore(sess, CHECKPOINT_PATH)
        else:
            nnModel.init.run()

        while True:
            train_x = np.random.choice(train_indices)
            observation = env.reset(train_x)  # Reset to a random question
            if len(env.candidates) > 0:
                break
        tfidf_all_candidates = tfidf.transform(env.candidates)
        tfidf_all_text = tfidf.transform([" ".join(env.candidates)]).todense()[0, :]

        all_gradients = []
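        # REINFORCE bookkeeping: all_gradients accumulates the per-step
        # policy gradients of the current episode; at episode end they are
        # scaled by the reward, averaged, and fed back through the gradient
        # placeholders (see the NNModel sketch after this example).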
        episode = 0
        while True:
            # The following code is based on "Policy Gradients"
            # at https://github.com/ageron/handson-ml/blob/master/16_reinforcement_learning.ipynb
            this_candidate = observation['next_candidate']
            tfidf_this_candidate = tfidf_all_candidates[
                this_candidate].todense()
            tfidf_remaining_candidates = tfidf.transform([
                " ".join(env.candidates[this_candidate + 1:])
            ]).todense()[0, :]
            tfidf_summary = tfidf.transform([
                " ".join([env.candidates[x] for x in observation['summary']])
            ]).todense()[0, :]
            tfidf_question = tfidf.transform([env.question]).todense()[0, :]
            #print(tfidf_question.shape)
            XState = np.hstack([
                tfidf_all_text, tfidf_this_candidate,
                tfidf_remaining_candidates, tfidf_summary
            ])  #, [[len(observation['summary'])]]])
            action_val, gradients_val = sess.run(
                [nnModel.action, nnModel.gradients],
                feed_dict={
                    nnModel.X_state: XState,
                    nnModel.Q_state: tfidf_question,
                    nnModel.episode: episode
                })
            all_gradients.append(gradients_val)
            #action = 1 if np.random.uniform() < action_prob else 0
            observation = env.step(action_val)

            if observation['done']:
                # Reward all actions that led to the summary
                reward = observation['reward']
                print("Episode: %i, reward: %f" % (episode, reward))
                with open(LOGFILE, 'a') as f:
                    f.write('%i,%f,%i,"%s"\n' %
                            (episode, reward, env.qid, " ".join(
                                [str(x) for x in observation['summary']])))

                feed_dict = {}
                #print(nnModel.gradient_placeholders[0].shape)
                for var_index, grad_placeholder in enumerate(
                        nnModel.gradient_placeholders):
                    mean_gradients = np.mean([
                        reward * one_gradient[var_index]
                        for one_gradient in all_gradients
                    ],
                                             axis=0)
                    feed_dict[grad_placeholder] = mean_gradients
                sess.run(nnModel.training_op, feed_dict=feed_dict)

                episode += 1
                if episode % SAVE_EPISODES == 0:
                    print("Saving checkpoint in %s" % (CHECKPOINT_PATH))
                    nnModel.saver.save(sess, CHECKPOINT_PATH)
                    print("Testing results")
                    test_results = []
                    for test_x in test_indices:
                        observation = env.reset(test_x)
                        if len(env.candidates) == 0:
                            continue

                        tfidf_all_candidates = tfidf.transform(env.candidates)
                        tfidf_all_text = tfidf.transform(
                            [" ".join(env.candidates)]).todense()[0, :]
                        while not observation['done']:
                            this_candidate = observation['next_candidate']
                            tfidf_this_candidate = tfidf_all_candidates[
                                this_candidate].todense()
                            tfidf_remaining_candidates = tfidf.transform([
                                " ".join(env.candidates[this_candidate + 1:])
                            ]).todense()[0, :]
                            tfidf_summary = tfidf.transform([
                                " ".join([
                                    env.candidates[x]
                                    for x in observation['summary']
                                ])
                            ]).todense()[0, :]
                            tfidf_question = tfidf.transform(
                                [env.question]).todense()[0, :]
                            #print(tfidf_question.shape)
                            XState = np.hstack([
                                tfidf_all_text, tfidf_this_candidate,
                                tfidf_remaining_candidates, tfidf_summary
                            ])  #, [[len(observation['summary'])]]])
                            output_val = sess.run(nnModel.outputs,
                                                  feed_dict={
                                                      nnModel.X_state:
                                                      XState,
                                                      nnModel.Q_state:
                                                      tfidf_question
                                                  })
                            # The net outputs the probability of action 0
                            # (skip), so add the sentence when it is below 0.5
                            action_val = 1 if output_val < 0.5 else 0
                            observation = env.step(action_val)
                        reward = observation['reward']
                        test_results.append(reward)
                        with open(EVALFILE, 'a') as f:
                            f.write(
                                '%i,%f,%i,"%s"\n' %
                                (episode, reward, env.qid, " ".join(
                                    [str(x) for x in observation['summary']])))
                    print("Mean of evaluation results:", np.mean(test_results))

                # Pick next training question
                while True:
                    train_x = np.random.choice(train_indices)
                    observation = env.reset(train_x)  # Reset to a random question
                    if len(env.candidates) > 0:
                        break
                all_gradients = []
                tfidf_all_candidates = tfidf.transform(env.candidates)
                tfidf_all_text = tfidf.transform([" ".join(env.candidates)]).todense()[0, :]
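
For reference, a minimal sketch of the NNModel interface, inferred from the attribute accesses in train() and bioasq_run() and from the "Policy Gradients" notebook cited above; the architecture, layer sizes, the role of the episode placeholder, and the state width are assumptions (Example #4 appends extra features to the state).

class NNModel:
    """Sketch only: a Bernoulli policy network whose output estimates the
    probability of action 0 (skip the current candidate sentence)."""

    def __init__(self, vocabulary_size):
        self.graph = tf.Graph()
        with self.graph.as_default():
            # State: tf-idf of all text, this candidate, the remaining
            # candidates and the summary so far (four blocks)
            self.X_state = tf.placeholder(tf.float32,
                                          shape=[None, 4 * vocabulary_size])
            self.Q_state = tf.placeholder(tf.float32,
                                          shape=[None, vocabulary_size])
            # Fed by train(); its use (e.g. an exploration schedule) is assumed
            self.episode = tf.placeholder(tf.int32, shape=[])
            hidden = tf.layers.dense(tf.concat((self.X_state, self.Q_state), 1),
                                     N_HIDDEN, activation=tf.nn.relu)
            self.outputs = tf.layers.dense(hidden, 1, activation=tf.nn.sigmoid)
            # Sample an action from the Bernoulli policy: P(action=0) = outputs
            p_actions = tf.concat([self.outputs, 1.0 - self.outputs], axis=1)
            self.action = tf.squeeze(
                tf.multinomial(tf.log(p_actions), num_samples=1))
            # REINFORCE trick: treat the sampled action as the correct one
            y = 1.0 - tf.to_float(self.action)
            loss = tf.reduce_mean(-(y * tf.log(self.outputs + 1e-10) +
                                    (1.0 - y) * tf.log(1.0 - self.outputs + 1e-10)))
            optimizer = tf.train.AdamOptimizer()
            grads_and_vars = optimizer.compute_gradients(loss)
            self.gradients = [grad for grad, var in grads_and_vars]
            # Placeholders through which train() feeds the reward-scaled
            # mean gradients at the end of each episode
            self.gradient_placeholders = []
            grads_and_vars_feed = []
            for grad, var in grads_and_vars:
                placeholder = tf.placeholder(tf.float32, shape=grad.get_shape())
                self.gradient_placeholders.append(placeholder)
                grads_and_vars_feed.append((placeholder, var))
            self.training_op = optimizer.apply_gradients(grads_and_vars_feed)
            self.init = tf.global_variables_initializer()
            self.saver = tf.train.Saver()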
Example #4
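# Assumed context (not shown): pickle, json, numpy as np, tensorflow as tf
# (1.x), the NNModel class and yield_candidate_text helper, and constants
# such as BEST_TFIDF_FILENAME, BEST_CHECKPOINT_PATH, QTYPES and DEBUG. Here
# the state also carries the summary length and a one-hot question type, so
# the NNModel input width must match this wider state.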
def bioasq_run(test_data='phaseB_3b_01.json',
               output_filename='bioasq-out-rl.json'):
    """Run model for BioASQ"""
    print("Running BioASQ")
    with open(BEST_TFIDF_FILENAME, 'rb') as f:
        tfidf = pickle.load(f)
    testset = json.load(open(test_data, encoding='utf-8'))['questions']
    if DEBUG:
        testset = testset[:10]
    result = []
    nnModel = NNModel(len(tfidf.get_feature_names()))
    with tf.Session(graph=nnModel.graph) as sess:
        nnModel.saver.restore(sess, BEST_CHECKPOINT_PATH)
        for r in testset:
            test_question = r['body']
            test_id = r['id']
            test_candidates = [
                sent for pubmedid, senti, sent in yield_candidate_text(r)
            ]
            test_candidates = test_candidates[:20]
            if len(test_candidates) == 0:
                print("Warning: no text to summarise")
                test_summary = ''
            else:
                if QTYPES:
                    q_types = [0.0] * len(QTYPES)
                    q_types[QTYPES.index(r['type'])] = 1.0
                else:
                    q_types = []
                tfidf_all_candidates = tfidf.transform(test_candidates)
                tfidf_all_text = tfidf.transform([" ".join(test_candidates)]).todense()[0, :]
                test_summary = ''
                len_summary = 0
                output_probs = []
                for this_candidate in range(len(test_candidates)):
                    tfidf_this_candidate = tfidf_all_candidates[
                        this_candidate].todense()
                    tfidf_remaining_candidates = tfidf.transform([
                        " ".join(test_candidates[this_candidate + 1:])
                    ]).todense()[0, :]
                    tfidf_summary = tfidf.transform([test_summary]).todense()[0, :]
                    tfidf_question = tfidf.transform([test_question]).todense()[0, :]
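                    # State vector: tf-idf of all candidates, this candidate,
                    # the remaining candidates and the summary so far, plus
                    # the summary length and the one-hot question type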
                    XState = np.hstack([
                        tfidf_all_text,
                        tfidf_this_candidate,
                        tfidf_remaining_candidates,
                        tfidf_summary,
                        #                                  [[len(observation['summary']), this_candidate]]])
                        [[len_summary]],
                        [q_types]
                    ])
                    output_val = sess.run(nnModel.outputs,
                                          feed_dict={
                                              nnModel.X_state: XState,
                                              nnModel.Q_state: tfidf_question
                                          })
                    output_probs.append(1 - output_val)
                    if output_val < 0.5:
                        len_summary += 1
                        if test_summary == '':
                            test_summary = test_candidates[this_candidate]
                        else:
                            test_summary += " " + test_candidates[
                                this_candidate]
            if test_summary == '' and len(test_candidates) > 0:
                print(
                    "Warning: no summary produced; returning top sentence %i" %
                    np.argmax(output_probs))
                #print("Output probabilities are:")
                #print(output_probs)
                test_summary = test_candidates[np.argmax(output_probs)]
            if r['type'] == "yesno":
                exactanswer = "yes"
            else:
                exactanswer = ""

            result.append({
                "id": test_id,
                "ideal_answer": test_summary,
                "exact_answer": exactanswer
            })
    print("Saving results in file %s" % output_filename)
    with open(output_filename, 'w') as f:
        f.write(json.dumps({"questions": result}, indent=2))