def train_recurrent(model_out, pubtator_file, directional_distant_directory,
                    symmetric_distant_directory, distant_entity_a_col,
                    distant_entity_b_col, distant_rel_col, entity_a, entity_b):
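    '''Distantly train the recurrent relation-extraction model.

    Relation pairs are loaded from the distant-supervision directories,
    training instances are built from PubTator abstract sentences, and the
    recurrent network is trained on the full set. Returns the path of the
    trained model; the feature dictionaries are pickled next to it.
    '''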

    #get distant_relations from external knowledge base file
    distant_interactions, reverse_distant_interactions = load_data.load_distant_directories(
        directional_distant_directory, symmetric_distant_directory,
        distant_entity_a_col, distant_entity_b_col, distant_rel_col)

    key_order = sorted(distant_interactions)
    #get pmids,sentences,
    training_pmids, training_forward_sentences, training_reverse_sentences, entity_a_text, entity_b_text = load_data.load_pubtator_abstract_sentences(
        pubtator_file, entity_a, entity_b)

    #training full model
    training_instances, \
    dep_path_list_dictionary, \
    dep_word_dictionary, \
    word2vec_embeddings = load_data.build_instances_training(training_forward_sentences,
                                                             training_reverse_sentences,
                                                             distant_interactions,
                                                             reverse_distant_interactions,
                                                             entity_a_text,
                                                             entity_b_text,
                                                             key_order, True)

    dep_path_list_features, dep_word_features, dep_type_path_length, dep_word_path_length, labels = load_data.build_recurrent_arrays(
        training_instances)
    features = [
        dep_path_list_features, dep_word_features, dep_type_path_length,
        dep_word_path_length
    ]
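    # the recurrent model consumes four parallel arrays: padded dependency-path
    # type ids, padded dependency-path word ids, and the true (unpadded) length
    # of each sequence, presumably so the network can mask the padding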

    if os.path.exists(model_out):
        shutil.rmtree(model_out)

    pickle.dump([dep_path_list_dictionary, dep_word_dictionary, key_order],
                open(model_out + 'a.pickle', 'wb'))
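    # the dictionaries and key_order pickled above let prediction code rebuild
    # the exact feature space used during training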

    trained_model_path = rnn.recurrent_train(features, labels,
                                             len(dep_path_list_dictionary),
                                             len(dep_word_dictionary),
                                             model_out + '/', key_order,
                                             word2vec_embeddings)

    print("trained model")

    return trained_model_path


def parallel_k_fold_cross_validation(batch_id, k, pmids, forward_sentences,
                                     reverse_sentences, distant_interactions,
                                     reverse_distant_interactions,
                                     entity_a_text, entity_b_text,
                                     hidden_array, key_order, recurrent):
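    '''Run a single fold (batch_id) of k-fold cross validation.

    Abstracts (PMIDs) are chunked into k folds; the batch_id chunk is held
    out for testing and the rest are used for training. Depending on the
    recurrent flag, either the feed-forward or the recurrent model is
    trained. Returns the predicted probabilities and the test instances.
    '''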

    pmids = list(pmids)
    #split abstracts into k folds for cross validation
    fold_size = len(pmids) // k
    all_chunks = [
        pmids[i:i + fold_size] for i in range(0, len(pmids), fold_size)
    ]
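    # e.g. with 103 PMIDs and k=10, fold_size is 10, giving ten chunks of 10
    # plus a remainder chunk of 3; the held-out fold may therefore be smaller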

    total_test = []  #test_labels for instances
    total_predicted_prob = []  #test_probability returns for instances
    total_instances = []

    fold_chunks = all_chunks[:]
    fold_test_abstracts = set(fold_chunks.pop(batch_id))
    fold_training_abstracts = set(
        list(itertools.chain.from_iterable(fold_chunks)))

    fold_training_forward_sentences = {}
    fold_training_reverse_sentences = {}
    fold_test_forward_sentences = {}
    fold_test_reverse_sentences = {}

    for key in forward_sentences:
        if key.split('|')[0] in fold_training_abstracts:
            fold_training_forward_sentences[key] = forward_sentences[key]
        elif key.split('|')[0] in fold_test_abstracts:
            fold_test_forward_sentences[key] = forward_sentences[key]

    for key in reverse_sentences:
        if key.split('|')[0] in fold_training_abstracts:
            fold_training_reverse_sentences[key] = reverse_sentences[key]
        elif key.split('|')[0] in fold_test_abstracts:
            fold_test_reverse_sentences[key] = reverse_sentences[key]
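    # sentence keys are assumed to be 'pmid|...' strings, so split('|')[0]
    # recovers the abstract id; splitting at the abstract level keeps all
    # sentences from one abstract in the same fold and avoids leakage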

    if not recurrent:
        fold_training_instances, \
        fold_dep_dictionary, \
        fold_dep_word_dictionary,\
        fold_dep_element_dictionary,\
        fold_between_word_dictionary = load_data.build_instances_training(fold_training_forward_sentences,
                                                                      fold_training_reverse_sentences,
                                                                      distant_interactions,
                                                                      reverse_distant_interactions,
                                                                      entity_a_text,
                                                                      entity_b_text,key_order)

        #train model
        X = []
        y = []
        for t in fold_training_instances:
            X.append(t.features)
            y.append(t.label)

        fold_train_X = np.array(X)
        fold_train_y = np.array(y)

        model_dir = os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            'model_building_meta_data',
            'test' + str(batch_id) + str(time.time()).replace('.', ''))
        if os.path.exists(model_dir):
            shutil.rmtree(model_dir)

        fold_test_instances = load_data.build_instances_testing(
            fold_test_forward_sentences, fold_test_reverse_sentences,
            fold_dep_dictionary, fold_dep_word_dictionary,
            fold_dep_element_dictionary, fold_between_word_dictionary,
            distant_interactions, reverse_distant_interactions, entity_a_text,
            entity_b_text, key_order)

        # group instances by pmid and build feature array
        fold_test_features = []
        fold_test_labels = []
        pmid_test_instances = {}
        for test_index, fti in enumerate(fold_test_instances):
            if fti.sentence.pmid not in pmid_test_instances:
                pmid_test_instances[fti.sentence.pmid] = []
            pmid_test_instances[fti.sentence.pmid].append(test_index)
            fold_test_features.append(fti.features)
            fold_test_labels.append(fti.label)

        fold_test_X = np.array(fold_test_features)
        fold_test_y = np.array(fold_test_labels)

        test_model = snn.feed_forward_train(fold_train_X, fold_train_y,
                                            fold_test_X, fold_test_y,
                                            hidden_array, model_dir + '/',
                                            key_order)

        fold_test_predicted_prob = snn.feed_forward_test(
            fold_test_X, fold_test_y, test_model)

        total_predicted_prob = fold_test_predicted_prob
        total_test = fold_test_y
        total_instances = fold_test_instances

        return total_predicted_prob, total_instances

    else:
        fold_training_instances, \
        fold_dep_path_list_dictionary, \
        fold_dep_word_dictionary, \
        word2vec_embeddings = load_data.build_instances_training(fold_training_forward_sentences,
                                                                 fold_training_reverse_sentences,
                                                                 distant_interactions,
                                                                 reverse_distant_interactions,
                                                                 entity_a_text,
                                                                 entity_b_text,
                                                                 key_order, True)

        dep_path_list_features, dep_word_features, dep_type_path_length, dep_word_path_length, labels = load_data.build_recurrent_arrays(
            fold_training_instances)

        features = [
            dep_path_list_features, dep_word_features, dep_type_path_length,
            dep_word_path_length
        ]

        model_dir = os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            'model_building_meta_data',
            'test' + str(batch_id) + str(time.time()).replace('.', ''))
        if os.path.exists(model_dir):
            shutil.rmtree(model_dir)

        trained_model_path = rnn.recurrent_train(
            features, labels, len(fold_dep_path_list_dictionary),
            len(fold_dep_word_dictionary), model_dir + '/', key_order,
            word2vec_embeddings)

        fold_test_instances = load_data.build_instances_testing(
            fold_test_forward_sentences, fold_test_reverse_sentences, None,
            fold_dep_word_dictionary, None, None, distant_interactions,
            reverse_distant_interactions, entity_a_text, entity_b_text,
            key_order, fold_dep_path_list_dictionary)

        # group instances by pmid and build feature array
        test_dep_path_list_features, test_dep_word_features, test_dep_type_path_length, test_dep_word_path_length, test_labels = load_data.build_recurrent_arrays(
            fold_test_instances)

        test_features = [
            test_dep_path_list_features, test_dep_word_features,
            test_dep_type_path_length, test_dep_word_path_length
        ]
        print(trained_model_path)
        fold_test_predicted_prob, fold_test_labels = rnn.recurrent_test(
            test_features, test_labels, trained_model_path)

        assert np.array_equal(fold_test_labels, test_labels)

        total_predicted_prob = fold_test_predicted_prob
        total_test = fold_test_labels
        total_instances = fold_test_instances

        return total_predicted_prob, total_instances


def one_fold_cross_validation(model_out,
                              pmids,
                              forward_sentences,
                              reverse_sentences,
                              distant_interactions,
                              reverse_distant_interactions,
                              entity_a_text,
                              entity_b_text,
                              hidden_array,
                              key_order,
                              recurrent,
                              pubtator_labels=None):
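    '''Train on a random half of the abstracts and test on the other half.

    Despite the name, this performs a single 50/50 split rather than k
    folds. Returns the test instances along with per-instance gradient and
    hidden-activation diagnostics (the latter are only populated for the
    feed-forward model).
    '''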
    pmids = list(pmids)
    #split training sentences for cross validation
    testlength = int(len(pmids) * 0.50)
    random.shuffle(pmids)
    fold_test_abstracts = set(pmids[:testlength])
    fold_training_abstracts = set(pmids[testlength:])

    fold_training_forward_sentences = {}
    fold_training_reverse_sentences = {}
    fold_test_forward_sentences = {}
    fold_test_reverse_sentences = {}

    for key in forward_sentences:
        if key.split('|')[0] in fold_training_abstracts:
            fold_training_forward_sentences[key] = forward_sentences[key]
        elif key.split('|')[0] in fold_test_abstracts:
            fold_test_forward_sentences[key] = forward_sentences[key]

    for key in reverse_sentences:
        if key.split('|')[0] in fold_training_abstracts:
            fold_training_reverse_sentences[key] = reverse_sentences[key]
        elif key.split('|')[0] in fold_test_abstracts:
            fold_test_reverse_sentences[key] = reverse_sentences[key]

    if not recurrent:
        if pubtator_labels:
            # curated PubTator labels take precedence over distant supervision
            fold_training_instances, \
            fold_dep_dictionary, \
            fold_dep_word_dictionary, \
            fold_dep_element_dictionary, \
            fold_between_word_dictionary = load_data.build_instances_labelled(fold_training_forward_sentences,
                                                                              fold_training_reverse_sentences,
                                                                              pubtator_labels, key_order[0],
                                                                              entity_a_text,
                                                                              entity_b_text,
                                                                              key_order)
        else:
            fold_training_instances, \
            fold_dep_dictionary, \
            fold_dep_word_dictionary, \
            fold_dep_element_dictionary, \
            fold_between_word_dictionary = load_data.build_instances_training(fold_training_forward_sentences,
                                                                              fold_training_reverse_sentences,
                                                                              distant_interactions,
                                                                              reverse_distant_interactions,
                                                                              entity_a_text, entity_b_text,
                                                                              key_order)

        pickle.dump([
            fold_dep_dictionary, fold_dep_word_dictionary,
            fold_dep_element_dictionary, fold_between_word_dictionary,
            key_order
        ], open(model_out + 'a.pickle', 'wb'))

        # train model
        X = []
        y = []
        for t in fold_training_instances:
            X.append(t.features)
            y.append(t.label)

        fold_train_X = np.array(X)
        fold_train_y = np.array(y)

        model_dir = model_out
        if os.path.exists(model_dir):
            shutil.rmtree(model_dir)

        fold_test_instances = load_data.build_instances_testing(
            fold_test_forward_sentences, fold_test_reverse_sentences,
            fold_dep_dictionary, fold_dep_word_dictionary,
            fold_dep_element_dictionary, fold_between_word_dictionary,
            distant_interactions, reverse_distant_interactions, entity_a_text,
            entity_b_text, key_order)
        if pubtator_labels:
            fold_test_instances = load_data.build_labelled_testing(
                fold_test_forward_sentences, fold_test_reverse_sentences,
                fold_dep_dictionary, fold_dep_word_dictionary,
                fold_dep_element_dictionary, fold_between_word_dictionary,
                pubtator_labels, key_order[0], entity_a_text, entity_b_text,
                key_order)

        # group instances by pmid and build feature array
        fold_test_features = []
        fold_test_labels = []
        pmid_test_instances = {}
        for test_index, fti in enumerate(fold_test_instances):
            if fti.sentence.pmid not in pmid_test_instances:
                pmid_test_instances[fti.sentence.pmid] = []
            pmid_test_instances[fti.sentence.pmid].append(test_index)
            fold_test_features.append(fti.features)
            fold_test_labels.append(fti.label)

        fold_test_X = np.array(fold_test_features)
        fold_test_y = np.array(fold_test_labels)
        print(fold_test_y.shape)
        print(fold_test_X.shape)

        test_model = snn.feed_forward_train(fold_train_X, fold_train_y,
                                            fold_test_X, fold_test_y,
                                            hidden_array, model_dir + '/',
                                            key_order)

        group_instances = load_data.batch_instances(fold_test_instances)

        probability_dict = {}
        label_dict = {}
        cs_grad_dict = {}
        cs_hidden_act_dict = {}
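        # per-instance diagnostics, keyed by index into fold_test_instances:
        #   cs_grad_dict[i]       -> [predicted prob, gold label, gradients, group indices]
        #   cs_hidden_act_dict[i] -> [predicted prob, gold label, hidden activations, group indices]
        # where the group lists the instances batched together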

        for g in group_instances:
            fold_test_features = []
            fold_test_labels = []
            for ti in group_instances[g]:
                fold_test_features.append(fold_test_instances[ti].features)
                fold_test_labels.append(fold_test_instances[ti].label)
            fold_test_X = np.array(fold_test_features)
            fold_test_y = np.array(fold_test_labels)

            fold_test_predicted_prob, fold_test_labels, fold_test_cs_grads, fold_test_cs_hidden_activations = snn.feed_forward_test(
                fold_test_X, fold_test_y, test_model)
            probability_dict[g] = fold_test_predicted_prob
            label_dict[g] = fold_test_labels
            for i in range(len(fold_test_cs_grads)):
                print(fold_test_cs_grads[i])
                print(fold_test_predicted_prob[i])
                cs_grad_dict[group_instances[g][i]] = [
                    fold_test_predicted_prob[i], fold_test_labels[i],
                    fold_test_cs_grads[i], group_instances[g]
                ]
                cs_hidden_act_dict[group_instances[g][i]] = [
                    fold_test_predicted_prob[i], fold_test_labels[i],
                    fold_test_cs_hidden_activations[i], group_instances[g]
                ]

    else:
        if pubtator_labels:
            # curated PubTator labels take precedence over distant supervision
            fold_training_instances, \
            fold_dep_path_list_dictionary, \
            fold_dep_word_dictionary, \
            word2vec_embeddings = load_data.build_instances_labelled(fold_training_forward_sentences,
                                                                     fold_training_reverse_sentences,
                                                                     pubtator_labels, key_order[0],
                                                                     entity_a_text,
                                                                     entity_b_text,
                                                                     key_order, True)
        else:
            fold_training_instances, \
            fold_dep_path_list_dictionary, \
            fold_dep_word_dictionary, \
            word2vec_embeddings = load_data.build_instances_training(fold_training_forward_sentences,
                                                                     fold_training_reverse_sentences,
                                                                     distant_interactions,
                                                                     reverse_distant_interactions,
                                                                     entity_a_text,
                                                                     entity_b_text,
                                                                     key_order, True)

        pickle.dump([
            fold_dep_path_list_dictionary, fold_dep_word_dictionary, key_order
        ], open(model_out + 'a.pickle', 'wb'))

        dep_path_list_features, dep_word_features, dep_type_path_length, dep_word_path_length, labels = load_data.build_recurrent_arrays(
            fold_training_instances)

        features = [
            dep_path_list_features, dep_word_features, dep_type_path_length,
            dep_word_path_length
        ]

        model_dir = model_out
        if os.path.exists(model_dir):
            shutil.rmtree(model_dir)

        test_model = rnn.recurrent_train(features, labels,
                                         len(fold_dep_path_list_dictionary),
                                         len(fold_dep_word_dictionary),
                                         model_dir + '/', key_order,
                                         word2vec_embeddings)

        fold_test_instances = load_data.build_instances_testing(
            fold_test_forward_sentences, fold_test_reverse_sentences, None,
            fold_dep_word_dictionary, None, None, distant_interactions,
            reverse_distant_interactions, entity_a_text, entity_b_text,
            key_order, fold_dep_path_list_dictionary)

        if pubtator_labels:
            fold_test_instances = load_data.build_labelled_testing(
                fold_test_forward_sentences, fold_test_reverse_sentences, None,
                fold_dep_word_dictionary, None, None, pubtator_labels,
                key_order[0], entity_a_text, entity_b_text, key_order,
                fold_dep_path_list_dictionary)

        group_instances = load_data.batch_instances(fold_test_instances)

        test_dep_path_list_features, test_dep_word_features, test_dep_type_path_length, test_dep_word_path_length, test_labels = load_data.build_recurrent_arrays(
            fold_test_instances)

        test_features = [
            test_dep_path_list_features, test_dep_word_features,
            test_dep_type_path_length, test_dep_word_path_length
        ]

        fold_test_predicted_prob, fold_test_labels, fold_test_cs_grads, fold_test_cs_hidden_activations = rnn.recurrent_test(
            test_features, test_labels, test_model)
        cs_grad_dict = {}
        cs_hidden_act_dict = {}  # stays empty for the recurrent model
        for g in group_instances:
            for ti in group_instances[g]:
                print(fold_test_predicted_prob[ti])
                cs_grad_dict[ti] = [
                    fold_test_predicted_prob[ti], fold_test_labels[ti], [],
                    group_instances[g]
                ]


    return fold_test_instances, cs_grad_dict, cs_hidden_act_dict


def train_feed_forward(model_out, pubtator_file, directional_distant_directory,
                       symmetric_distant_directory, distant_entity_a_col,
                       distant_entity_b_col, distant_rel_col, entity_a,
                       entity_b):
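    '''Distantly train the feed-forward relation-extraction model on the
    full data set and return the trained model path.'''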

    #get distant_relations from external knowledge base file
    distant_interactions, reverse_distant_interactions = load_data.load_distant_directories(
        directional_distant_directory, symmetric_distant_directory,
        distant_entity_a_col, distant_entity_b_col, distant_rel_col)

    key_order = sorted(distant_interactions)
    #get pmids,sentences,
    training_pmids, training_forward_sentences, training_reverse_sentences, entity_a_text, entity_b_text = load_data.load_pubtator_abstract_sentences(
        pubtator_file, entity_a, entity_b)

    #hidden layer structure
    hidden_array = [256]

    #k-cross val
    #instance_predicts, single_instances= cv.k_fold_cross_validation(10,training_pmids,training_forward_sentences,
    #                                                                training_reverse_sentences,distant_interactions,
    #                                                                reverse_distant_interactions,entity_a_text,
    #                                                                entity_b_text,hidden_array,key_order)

    #cv.write_cv_output(model_out+'_predictions',instance_predicts,single_instances,key_order)

    #training full model
    training_instances, \
    dep_dictionary, \
    dep_word_dictionary, \
    dep_element_dictionary, \
    between_word_dictionary = load_data.build_instances_training(training_forward_sentences,
                                                   training_reverse_sentences,
                                                   distant_interactions,
                                                   reverse_distant_interactions,
                                                   entity_a_text,
                                                   entity_b_text,
                                                   key_order)

    X = []
    y = []
    instance_sentences = set()
    for t in training_instances:
        instance_sentences.add(' '.join(t.sentence.sentence_words))
        X.append(t.features)
        y.append(t.label)

    X_train = np.array(X)
    y_train = np.array(y)

    if os.path.exists(model_out):
        shutil.rmtree(model_out)

    trained_model_path = ffnn.feed_forward_train(X_train, y_train, None, None,
                                                 hidden_array, model_out + '/',
                                                 key_order)

    print('Number of Sentences')
    print(len(instance_sentences))
    print('Number of Instances')
    print(len(training_instances))
    print('Number of dependency paths ')
    print(len(dep_dictionary))
    print('Number of dependency words')
    print(len(dep_word_dictionary))
    print('Number of between words')
    print(len(between_word_dictionary))
    print('Number of elements')
    print(len(dep_element_dictionary))
    print('length of feature space')
    print(
        len(dep_dictionary) + len(dep_word_dictionary) +
        len(dep_element_dictionary) + len(between_word_dictionary))
    pickle.dump([
        dep_dictionary, dep_word_dictionary, dep_element_dictionary,
        between_word_dictionary, key_order
    ], open(model_out + 'a.pickle', 'wb'))
    print("trained model")

    return trained_model_path


def distant_train(model_out, sentence_file, distant_file, distant_e1_col,
                  distant_e2_col, entity_1, entity_1_file, entity_1_col,
                  entity_2, entity_2_file, entity_2_col, symmetric):
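    '''Older XML-based variant: distantly train a logistic regression
    classifier and dump it with its feature dictionaries via joblib.'''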

    if entity_1_file.upper() != "NONE":
        entity_1_ids = load_data.load_id_list(entity_1_file, entity_1_col)
    else:
        entity_1_ids = None
    if entity_2_file.upper() != "NONE":
        entity_2_ids = load_data.load_id_list(entity_2_file, entity_2_col)
    else:
        entity_2_ids = None

    distant_interactions = load_data.load_distant_kb(distant_file,
                                                     distant_e1_col,
                                                     distant_e2_col)

    training_sentences = load_data.load_xml(sentence_file, entity_1, entity_2)

    training_instances, dep_dictionary, dep_word_dictionary, between_word_dictionary = load_data.build_instances_training(
        training_sentences, distant_interactions, entity_1_ids, entity_2_ids,
        symmetric)

    X = []
    y = []
    instance_sentences = set()
    for t in training_instances:
        instance_sentences.add(t.get_sentence())
        X.append(t.features)
        y.append(t.label)

    X_train = np.array(X)
    y_train = np.ravel(y)

    model = LogisticRegression()
    model.fit(X_train, y_train)
    print('Number of Sentences')
    print(len(instance_sentences))
    print('Number of Instances')
    print(len(training_instances))
    print('Number of Positive Instances')
    print(y.count(1))
    print(model.get_params())
    joblib.dump(
        (model, dep_dictionary, dep_word_dictionary, between_word_dictionary),
        model_out)

    print("trained model")


def k_fold_cross_validation(k, sentences_dict, distant_interactions,
                            reverse_distant_interactions, entity_1_ids,
                            entity_2_ids, symmetric):
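    '''Abstract-level k-fold cross validation for the logistic regression
    model; instance probabilities are aggregated per group with noisy-or
    and a precision-recall curve is plotted.'''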

    training_list = sorted(sentences_dict)

    #split training abstracts into k folds for cross validation
    fold_size = len(training_list) // k
    print(fold_size)
    all_chunks = [
        training_list[i:i + fold_size]
        for i in range(0, len(training_list), fold_size)
    ]

    total_test = np.array([])
    total_predicted_prob = np.array([])
    for i in range(len(all_chunks)):
        #print('building')
        print('Fold #: ' + str(i))
        fold_chunks = all_chunks[:]
        fold_test_abstracts = fold_chunks.pop(i)
        fold_training_abstracts = list(
            itertools.chain.from_iterable(fold_chunks))
        fold_training_sentences = []
        for key in fold_training_abstracts:
            fold_training_sentences.extend(sentences_dict[key])
        print(len(fold_training_sentences))

        fold_training_instances, fold_dep_dictionary, fold_dep_word_dictionary, fold_dep_element_dictionary, fold_between_word_dictionary = load_data.build_instances_training(
            fold_training_sentences, distant_interactions,
            reverse_distant_interactions, entity_1_ids, entity_2_ids,
            symmetric)

        #print('# of train instances: ' + str(len(fold_training_instances)))
        print(len(fold_training_instances))

        #train model
        X = []
        y = []
        for t in fold_training_instances:
            X.append(t.features)
            y.append(t.label)

        fold_train_X = np.array(X)
        fold_train_y = np.array(y)

        model = LogisticRegression()
        model.fit(fold_train_X, fold_train_y)

        for key in fold_test_abstracts:
            fold_test_sentences = sentences_dict[key]
            fold_test_instances = load_data.build_instances_testing(
                fold_test_sentences, fold_dep_dictionary,
                fold_dep_word_dictionary, fold_dep_element_dictionary,
                fold_between_word_dictionary, distant_interactions,
                reverse_distant_interactions, entity_1_ids, entity_2_ids,
                symmetric)

            instance_to_group_dict, group_to_instance_dict, instance_dict = create_instance_groupings(
                fold_test_instances, symmetric)

            for g in group_to_instance_dict:
                group_X = []
                group_y = []
                for ti in group_to_instance_dict[g]:
                    group_X.append(ti.features)
                    group_y.append(ti.label)

                group_test_X = np.array(group_X)
                group_test_y = np.unique(group_y)

                if group_test_y.size == 1:
                    total_test = np.append(total_test, group_test_y[0])
                else:
                    # a group should carry a single label; skip mixed groups
                    print('error: mixed labels in instance group')
                    continue
                #total_test = np.append(total_test,group_y)
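                # noisy-or over the group's sentence-level probabilities:
                #   P(rel) = 1 - prod_i (1 - p_i)
                # e.g. p = [0.6, 0.3] gives 1 - (0.4 * 0.7) = 0.72, so one
                # confident sentence is enough to flag the pair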

                predicted_prob = model.predict_proba(group_test_X)[:, 1]
                negation_predicted_prob = 1 - predicted_prob
                noisy_or = 1 - np.prod(negation_predicted_prob)
                total_predicted_prob = np.append(total_predicted_prob,
                                                 noisy_or)

                # Generate precision recall curves

    positives = collections.Counter(total_test)[1]
    # positive-class prevalence, plotted below as the random-guess baseline
    prevalence = float(positives) / total_test.size
    precision, recall, _ = metrics.precision_recall_curve(
        total_test, total_predicted_prob, pos_label=1)
    plt.step(recall, precision, color='b', alpha=0.2, where='post')
    plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')

    plt.plot((0.0, 1.0), (prevalence, prevalence))

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.show()


# NOTE: this definition shadows the distant_train defined above.
def distant_train(model_out, abstracts, distant_file, distant_e1_col,
                  distant_e2_col, distant_rel_col, entity_1, entity_1_file,
                  entity_1_col, entity_2, entity_2_file, entity_2_col,
                  symmetric):
    '''Method for distantly training the data'''

    #following is used to help differentiate genes that are both Human and Virus
    #get normalized ids for entity_1 optional
    if entity_1_file.upper() != "NONE":
        entity_1_ids = load_data.load_id_list(entity_1_file, entity_1_col)
    else:
        entity_1_ids = None
    #get normalized ids for entity_2
    if entity_2_file.upper() != "NONE":
        entity_2_ids = load_data.load_id_list(entity_2_file, entity_2_col)
    else:
        entity_2_ids = None

    #load the distant knowledge base
    distant_interactions, reverse_distant_interactions = load_data.load_distant_kb(
        distant_file, distant_e1_col, distant_e2_col, distant_rel_col)
    #load the sentence data
    if abstracts.endswith('.pkl'):
        training_abstract_sentences = load_data.load_abstracts_from_pickle(
            abstracts)
    else:
        training_abstract_sentences = load_data.load_abstracts_from_directory(
            abstracts, entity_1, entity_2)
    print(len(training_abstract_sentences))

    k_fold_cross_validation(10, training_abstract_sentences,
                            distant_interactions, reverse_distant_interactions,
                            entity_1_ids, entity_2_ids, symmetric)

    training_sentences = []
    for key in training_abstract_sentences:
        training_sentences.extend(training_abstract_sentences[key])

    training_instances, dep_dictionary, dep_word_dictionary, element_dictionary, between_word_dictionary = load_data.build_instances_training(
        training_sentences, distant_interactions, reverse_distant_interactions,
        entity_1_ids, entity_2_ids, symmetric)

    X = []
    y = []
    instance_sentences = set()
    for t in training_instances:
        instance_sentences.add(t.get_sentence())
        X.append(t.features)
        y.append(t.label)

    X_train = np.array(X)
    y_train = np.ravel(y)

    model = LogisticRegression()
    model.fit(X_train, y_train)
    print('Number of Sentences')
    print(len(instance_sentences))
    print('Number of Instances')
    print(len(training_instances))
    print('Number of Positive Instances')
    print(y.count(1))
    print(model.get_params())
    print('Number of dependency paths ')
    print(len(dep_dictionary))
    print('Number of dependency words')
    print(len(dep_word_dictionary))
    print('Number of between words')
    print(len(between_word_dictionary))
    print('Number of elements')
    print(len(element_dictionary))
    print('length of feature space')
    print(
        len(dep_dictionary) + len(dep_word_dictionary) +
        len(element_dictionary) + len(between_word_dictionary))
    joblib.dump((model, dep_dictionary, dep_word_dictionary,
                 element_dictionary, between_word_dictionary), model_out)

    print("trained model")