示例#1
0
文件: CNN.py 项目: bwallace/Deep-PICO
def get_X_y(wv, wv_dim):
    """Build per-abstract embedding, token-index and label sequences.

    Sentences come from ``parse_summerscales.get_tokens_and_lbls()``.
    Consecutive sentences that share a PMID are merged into one sequence,
    so every returned list holds one entry per run of identical PMIDs.

    Args:
        wv: token -> word-vector lookup, indexed as ``wv[token]``; expected
            to raise ``KeyError`` for tokens it does not contain.
        wv_dim: length of the random vector drawn for tokens missing
            from ``wv``.

    Returns:
        A tuple ``(X_embedded, X_tokens, y, vectorizer,
        unknown_words_to_vecs)`` where

        * ``X_embedded`` is a list of ``np.vstack``-ed word vectors,
        * ``X_tokens`` is a list of ``np.vstack``-ed
          ``vectorizer.vocabulary_`` entries,
        * ``y`` is a list of label arrays,
        * ``vectorizer`` is the object returned by the parser,
        * ``unknown_words_to_vecs`` maps each out-of-vocabulary token to
          the random vector that was assigned to it.
    """
    pmids, sentences, lbls, vectorizer = parse_summerscales.get_tokens_and_lbls()

    # see: https://github.com/fchollet/keras/issues/233
    X_embedded, X_tokens, y = [], [], []  # one sequence per doc/abstract
    unknown_words_to_vecs = {}

    # Nothing to group: return empty results instead of failing on
    # pmids[0] / np.vstack([]).
    if len(sentences) == 0:
        return X_embedded, X_tokens, y, vectorizer, unknown_words_to_vecs

    cur_pmid = pmids[0]
    cur_x_embedded, cur_x_tokens, cur_y = [], [], []

    for idx, s in enumerate(sentences):
        if cur_pmid != pmids[idx]:
            # PMID changed: close the sequence accumulated so far.
            X_embedded.append(np.vstack(cur_x_embedded))
            X_tokens.append(np.vstack(cur_x_tokens))
            y.append(np.array(cur_y))
            cur_x_embedded, cur_x_tokens, cur_y = [], [], []
            cur_pmid = pmids[idx]

        for t in s:
            try:
                v = wv[t]
            except KeyError:
                # Unknown token: reuse one random vector per distinct token
                # (or maybe use 0s???).
                if t not in unknown_words_to_vecs:
                    unknown_words_to_vecs[t] = np.random.uniform(-1, 1, wv_dim)
                v = unknown_words_to_vecs[t]

            cur_x_embedded.append(v)
            cur_x_tokens.append(vectorizer.vocabulary_[t])

        cur_y.extend(lbls[idx])

    # Close the final sequence, which the loop never flushes.
    X_embedded.append(np.vstack(cur_x_embedded))
    X_tokens.append(np.vstack(cur_x_tokens))
    y.append(np.array(cur_y))

    return X_embedded, X_tokens, y, vectorizer, unknown_words_to_vecs
示例#2
0
def get_PMIDs_to_X_y(use_pickle, use_coref):
    """Map every PMID to stacked token, feature and label arrays.

    Args:
        use_pickle: forwarded to ``parse_summerscales.get_tokens_and_lbls``.
        use_coref: forwarded to ``parse_summerscales.get_tokens_and_lbls``.

    Returns:
        A tuple ``(pmids_to_X_y, X_tokens)``. ``pmids_to_X_y[pmid]`` is
        ``(np.vstack(tokens), np.vstack(features), np.hstack(labels))`` for
        that abstract. ``X_tokens`` is the plain token list of the last PMID
        iterated, or an empty list when there are no PMIDs.
    """
    pmids_dict, token_to_features = \
                parse_summerscales.get_tokens_and_lbls(use_pickle=use_pickle, use_coref=use_coref)

    pmids_to_X_y = {}
    # Bound up front so the return below cannot hit an unbound local when
    # pmids_dict is empty.
    X_tokens = []
    for pmid in pmids_dict:
        pmid_sentences, pmid_lbls = pmids_dict[pmid]
        # Accumulators for this abstract only.
        X_tokens, X_features, y = [], [], []
        for sent_idx, s in enumerate(pmid_sentences):
            for token in s:
                X_features.append(token_to_features[token])
                X_tokens.append(token)

            y.extend(pmid_lbls[sent_idx])

        pmids_to_X_y[pmid] = (np.vstack(X_tokens), np.vstack(X_features),
                              np.hstack(y))
    return (pmids_to_X_y, X_tokens)
示例#3
0
def main():
    """Parse command-line options, then train and evaluate a group tagger.

    Recognised long options (all take a value): ``--window_size``,
    ``--wiki`` (``0`` selects the non-wiki word2vec file), ``--epochs``,
    ``--n_feature_maps``, ``--undersample`` (``1`` enables it),
    ``--criterion``, ``--optimizer`` and ``--model`` (``cnn`` or ``nn``).

    Loads a word2vec binary, splits the PMIDs with a shuffled 5-fold
    ``KFold``, prepares data through ``_prep_data``, trains the selected
    model and prints its test metrics.
    """
    n_folds = 5
    try:
        opts, args = getopt.getopt(sys.argv[1:], '', ['window_size=', 'wiki=', 'n_feature_maps=', 'epochs=',
                                                      'undersample=', 'criterion=',
                                                      'optimizer=', 'model='])
    except getopt.GetoptError as error:
        print(error)
        sys.exit(2)
    model_type = 'nn'
    window_size = 5
    wiki = True
    n_feature_maps = 100
    epochs = 20
    undersample = False
    binary_cross_entropy = False
    criterion = 'categorical_crossentropy'
    optimizer = 'adam'
    k = 2

    for opt, arg in opts:
        if opt == '--window_size':
            window_size = int(arg)
        elif opt == '--wiki':
            # getopt hands back strings; comparing the raw string with the
            # int 0 was never true, so the flag could not be switched off.
            if int(arg) == 0:
                wiki = False
        elif opt == '--epochs':
            epochs = int(arg)
        elif opt == '--n_feature_maps':
            n_feature_maps = int(arg)
        elif opt == '--undersample':
            if int(arg) == 1:
                undersample = True
        elif opt == '--criterion':
            criterion = arg
        elif opt == '--optimizer':
            optimizer = arg
        elif opt == '--model':
            model_type = arg
        else:
            print("Option {} is not valid!".format(opt))

    if criterion == 'binary_crossentropy':
        binary_cross_entropy = True
        k = 1

    print('Loading word2vec model...')

    if wiki:
        print('Using wiki word2vec...')
        word2vec_model = 'wikipedia-pubmed-and-PMC-w2v.bin'
    else:
        print('Using non-wiki word2vec...')
        word2vec_model = 'PubMed-w2v.bin'
    w2v = Word2Vec.load_word2vec_format(word2vec_model, binary=True)
    print('Loaded word2vec model')

    pmids_dict, pmids, abstracts, lbls, vectorizer, groups_map, one_hot, dicts = \
        parse_summerscales.get_tokens_and_lbls(
                make_pmids_dict=True, sen=True)
    # Materialise the keys so they can be indexed by fold position.
    all_pmids = list(pmids_dict.keys())
    n = len(all_pmids)
    kf = KFold(n, random_state=1337, shuffle=True, n_folds=n_folds)

    for fold_idx, (train, test) in enumerate(kf):
        print("on fold %s" % fold_idx)
        train_pmids = [all_pmids[pmid_idx] for pmid_idx in train]
        test_pmids = [all_pmids[pmid_idx] for pmid_idx in test]

        print(train_pmids)
        print('loading data...')

        # Both supported model types prepare their data the same way.
        if model_type in ('cnn', 'nn'):
            X_train, y_train = _prep_data(train_pmids, pmids_dict, w2v, window_size, model_type,
                                          binary_ce=binary_cross_entropy)
            X_test, y_test = _prep_data(test_pmids, pmids_dict, w2v, window_size, model_type,
                                        binary_ce=binary_cross_entropy)

        if undersample:
            # Undersample the non group tags at random....probably a bad idea...
            if binary_cross_entropy:
                # assumes binary labels are a flat 0/1 vector -- TODO confirm
                # against _prep_data
                positive_flags = y_train
            else:
                positive_flags = y_train[:, 1]
            idx_negative = numpy.where(positive_flags == 0)[0]
            idx_positive = numpy.where(positive_flags == 1)[0]
            random_negative_sample = numpy.random.choice(idx_negative, idx_positive.shape[0])

            # Index the leading axis only, so this works whatever the rank
            # of the feature array.
            X_train = numpy.concatenate((X_train[idx_positive],
                                         X_train[random_negative_sample]))
            y_train = numpy.concatenate((y_train[idx_positive],
                                         y_train[random_negative_sample]))

        print('loaded data...')

        if model_type == 'cnn':
            model = GroupCNN(window_size=window_size, n_feature_maps=n_feature_maps, k_output=k)
        elif model_type == 'nn':
            model = GroupNN(window_size=window_size, k=k)
        model.train(X_train, y_train, epochs, optim_algo=optimizer, criterion=criterion)

        accuracy, f1_score, precision, auc, recall = model.test(X_test, y_test)

        print("Accuracy: {}".format(accuracy))
        print("F1: {}".format(f1_score))
        print("Precision: {}".format(precision))
        print("AUC: {}".format(auc))
        print("Recall: {}".format(recall))

        # NOTE(review): exits after the first fold, so the remaining folds
        # never run -- kept as in the original; confirm this is intended.
        sys.exit()
示例#4
0
文件: crf.py 项目: bwallace/Deep-PICO
def run_crf(w2v, l2, l1, iters, shallow_parse, words_before, words_after, grid_search,tacc, name, transfer_learning=False):
    """Cross-validate a CRF mention tagger and report per-fold metrics.

    Runs a shuffled 5-fold split over the PMIDs returned by
    ``parse_summerscales.get_tokens_and_lbls``. For every fold it builds
    features, fits a tagger, prints the learned transition/state weights,
    tags the held-out abstracts and scores them with ``eveluate``. The
    fold scores are written to ``<model_name>_results.txt`` and the
    averages are printed at the end.

    Args:
        w2v: word-vector model handed to the feature builders.
        l2: L2 penalty coefficient (crfsuite ``c2``).
        l1: L1 penalty coefficient (crfsuite ``c1``).
        iters: maximum number of training iterations.
        shallow_parse: forwarded as ``use_genia`` to the parser and to
            ``abstracts2features``.
        words_before: forwarded to ``abstracts2features``.
        words_after: forwarded to ``abstracts2features``.
        grid_search: when true, tune ``c1``/``c2`` with
            ``RandomizedSearchCV`` over an ``sklearn_crfsuite.CRF`` instead
            of training a ``pycrfsuite.Trainer`` directly.
        tacc: forwarded as ``using_tacc`` to the parser.
        name: prefix for the saved model file and the results file.
        transfer_learning: when true, derive features from the saved
            ``GroupNN`` model instead of ``abstracts2features``.
    """
    pmids_dict, pmids, abstracts, lbls, vectorizer, groups_map, one_hot, dicts = \
        parse_summerscales.get_tokens_and_lbls(
                make_pmids_dict=True, sen=True, use_genia=shallow_parse, using_tacc=tacc)

    # Materialise the keys so they can be indexed by fold position.
    all_pmids = list(pmids_dict.keys())
    n = len(all_pmids)
    n_folds = 5
    kf = KFold(n, random_state=1337, shuffle=True, n_folds=n_folds)
    recall_scores = []
    precision_scores = []
    f1_scores = []
    model_type = 'nn'
    binary_cross_entropy = True

    def print_transitions(trans_features):
        for (label_from, label_to), weight in trans_features:
            print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

    def print_state_features(state_features):
        for (attr, label), weight in state_features:
            print("%0.6f %-6s %s" % (weight, label, attr))

    for fold_idx, (train, test) in enumerate(kf):
        print("on fold %s" % fold_idx)
        train_pmids = [all_pmids[pmid_idx] for pmid_idx in train]
        test_pmids = [all_pmids[pmid_idx] for pmid_idx in test]
        print('loading data...')

        if transfer_learning:
            nn_model = GroupNN.load_model(model_path='NNModel.hdf5', model_info_path='NNModel.hdf5.p')
            window_size = nn_model.model_info['window_size']

            train_x, train_y = GroupCNNExperiment._prep_data(train_pmids, pmids_dict, w2v, window_size, model_type,
                                                             binary_ce=binary_cross_entropy, crf=True)
            test_x, test_y = GroupCNNExperiment._prep_data(test_pmids, pmids_dict, w2v, window_size, model_type,
                                                           binary_ce=binary_cross_entropy, crf=True)

            train_x = transform_features(nn_model, train_x)
            test_x = transform_features(nn_model, test_x)

            train_y = _labels_to_str(train_y)
            test_y = _labels_to_str(test_y)
        else:
            train_x, train_y = abstracts2features(pmids_dict, train_pmids, words_before, words_after, w2v, shallow_parse)
            test_x, test_y = abstracts2features(pmids_dict, test_pmids, words_before, words_after, w2v, shallow_parse)

        print('loaded data...')

        # Defined for both branches: the results file below needs it even
        # when grid search is used.
        model_name = name + '_model {}'.format(fold_idx)

        if grid_search:
            crf = sklearn_crfsuite.CRF(
                algorithm='lbfgs',
                c1=l1,
                c2=l2,
                max_iterations=iters,
                all_possible_transitions=False
            )

            params_space = {
                'c1': scipy.stats.expon(scale=0.5),
                'c2': scipy.stats.expon(scale=0.05),
            }

            # use the same metric for evaluation
            # NOTE(review): `labels` is given the test label sequences
            # rather than a list of label names -- confirm this is intended.
            f1_scorer = make_scorer(metrics.flat_f1_score,
                                    average='weighted', labels=test_y)

            # search
            rs = RandomizedSearchCV(crf, params_space,
                                    cv=3,
                                    verbose=1,
                                    n_jobs=-1,
                                    n_iter=50,
                                    scoring=f1_scorer)
            rs.fit(train_x, train_y)
            tagger = rs.best_estimator_.tagger_
            info = tagger.info()
        else:
            # A fresh trainer per fold: appended sequences accumulate inside
            # a Trainer, so reusing one across folds would leak earlier
            # folds' training data (including this fold's test abstracts).
            model = pycrfsuite.Trainer(verbose=False)
            for x, y in zip(train_x, train_y):
                model.append(x, y)

            model.set_params({
                'c1': l1,   # coefficient for L1 penalty
                'c2': l2,  # coefficient for L2 penalty
                'max_iterations': iters,  # stop earlier

                # include transitions that are possible, but not observed
                'feature.possible_transitions': True
            })
            print('training model...')
            model.train(model_name)
            print('done...')
            tagger = pycrfsuite.Tagger()
            tagger.open(model_name)

            info = tagger.info()

        transition_counts = Counter(info.transitions)
        print("Top likely transitions:")
        print_transitions(transition_counts.most_common(80))

        print("\nTop unlikely transitions:")
        print_transitions(transition_counts.most_common()[-80:])

        state_feature_counts = Counter(info.state_features)
        print("Top positive:")
        print_state_features(state_feature_counts.most_common(80))

        print("\nTop negative:")
        print_state_features(state_feature_counts.most_common()[-80:])

        abstract_predicted_mentions, true_abstract_mentions = [], []

        # test_y is zipped in only to keep the iteration length identical to
        # the shortest of the three sequences.
        for pmid, x, _ in zip(test_pmids, test_x, test_y):
            print(pmid)
            abstract_words, abstract_labels, _, _, _ = pmids_dict[pmid]

            pred_labels = tagger.tag(x)
            pred_mentions = output2words(pred_labels, abstract_words)
            true_mentions = output2words(abstract_labels, abstract_words)

            print("Predicted: {}".format(pred_mentions))
            print("True: {}".format(true_mentions))
            print('\n')

            abstract_predicted_mentions.append(pred_mentions)
            true_abstract_mentions.append(true_mentions)

        fold_recall, fold_precision, fold_f1_score = eveluate(abstract_predicted_mentions, true_abstract_mentions)
        recall_scores.append(fold_recall)
        precision_scores.append(fold_precision)
        f1_scores.append(fold_f1_score)

        fold_recall_results = "Fold recall: {}".format(fold_recall)
        fold_precision_results = "Fold precision: {}".format(fold_precision)
        fold_f1_results = "Fold F1 Score: {}".format(fold_f1_score)
        print(fold_recall_results)
        print(fold_precision_results)
        print(fold_f1_results)

        # The context manager guarantees the per-fold results are flushed
        # and the handle is closed.
        with open(model_name + '_results.txt', 'w+') as results_file:
            results_file.write(fold_recall_results + '\n')
            results_file.write(fold_precision_results + '\n')
            results_file.write(fold_f1_results + '\n')

    recall_average = _compute_average(recall_scores)
    precision_average = _compute_average(precision_scores)
    f1_average = _compute_average(f1_scores)

    print("Recall Average: {}".format(recall_average))
    print("Precision Average: {}".format(precision_average))
    print("F1 Average: {}".format(f1_average))
示例#5
0
def main():
    """Parse command-line options, cross-validate a model, report means.

    Recognised long options (all take a value): ``--window_size``,
    ``--wiki`` (``0`` selects the non-wiki word2vec file), ``--epochs``,
    ``--layers`` (comma-separated), ``--n_feature_maps``, ``--undersample``,
    ``--criterion``, ``--optimizer``, ``--model``, ``--genia``, ``--tacc``,
    ``--hyperopt`` and ``--model_name``. The 0/1 switches are enabled by
    passing ``1``.

    For each of the 5 shuffled folds the data is prepared through
    ``_prep_data``, a model is trained (or tuned with hyperopt), and the
    predictions are scored. Mean metrics are printed and written to
    ``<model.model_name>_fold_results``.
    """
    n_folds = 5
    try:
        opts, args = getopt.getopt(sys.argv[1:], '', ['window_size=', 'wiki=', 'n_feature_maps=', 'epochs=',
                                                      'undersample=', 'criterion=',
                                                      'optimizer=', 'model=', 'genia=', 'tacc=', 'layers=',
                                                      'hyperopt=', 'model_name='])
    except getopt.GetoptError as error:
        print(error)
        sys.exit(2)
    model_type = 'nn'
    window_size = 5
    wiki = True
    n_feature_maps = 100
    epochs = 20
    undersample = False
    binary_cross_entropy = False
    criterion = 'categorical_crossentropy'
    optimizer = 'adam'
    k = 2
    use_genia = False
    using_tacc = False
    layer_sizes = []
    hyperopt = False
    model_name = 'model'

    for opt, arg in opts:
        if opt == '--window_size':
            window_size = int(arg)
        elif opt == '--wiki':
            # getopt hands back strings; comparing the raw string with the
            # int 0 was never true, so the flag could not be switched off.
            if int(arg) == 0:
                wiki = False
        elif opt == '--epochs':
            epochs = int(arg)
        elif opt == '--layers':
            # NOTE(review): parsed but not used anywhere below.
            layer_sizes = arg.split(',')
        elif opt == '--n_feature_maps':
            n_feature_maps = int(arg)
        elif opt == '--undersample':
            if int(arg) == 1:
                undersample = True
        elif opt == '--criterion':
            criterion = arg
        elif opt == '--optimizer':
            optimizer = arg
        elif opt == '--model':
            model_type = arg
        elif opt == '--genia':
            if int(arg) == 1:
                use_genia = True
        elif opt == '--tacc':
            if int(arg) == 1:
                using_tacc = True
        elif opt == '--hyperopt':
            if int(arg) == 1:
                hyperopt = True
        elif opt == '--model_name':
            model_name = arg
        else:
            print("Option {} is not valid!".format(opt))

    if criterion == 'binary_crossentropy':
        binary_cross_entropy = True
        k = 1

    print('Loading word2vec model...')

    if wiki:
        print('Using wiki word2vec...')
        word2vec_model = 'wikipedia-pubmed-and-PMC-w2v.bin'
    else:
        print('Using non-wiki word2vec...')
        word2vec_model = 'PubMed-w2v.bin'
    w2v = Word2Vec.load_word2vec_format(word2vec_model, binary=True)
    print('Loaded word2vec model')

    pmids_dict, pmids, abstracts, lbls, vectorizer, groups_map, one_hot, dicts = \
        parse_summerscales.get_tokens_and_lbls(
                make_pmids_dict=True, sen=True, use_genia=use_genia, using_tacc=using_tacc)
    # Materialise the keys so they can be indexed by fold position.
    all_pmids = list(pmids_dict.keys())
    n = len(all_pmids)
    kf = KFold(n, random_state=1337, shuffle=True, n_folds=n_folds)

    accuracies = []
    recalls = []
    precisions = []
    f1_scores = []
    aucs = []

    global model

    for fold_idx, (train, test) in enumerate(kf):
        print("on fold %s" % fold_idx)
        train_pmids = [all_pmids[pmid_idx] for pmid_idx in train]
        test_pmids = [all_pmids[pmid_idx] for pmid_idx in test]

        print(train_pmids)
        print('loading data...')

        # All supported model types prepare their data the same way.
        if model_type in ('cnn', 'nn', 'ladder'):
            X_train, y_train = _prep_data(train_pmids, pmids_dict, w2v, window_size, model_type,
                                          binary_ce=binary_cross_entropy)
            X_test, y_test = _prep_data(test_pmids, pmids_dict, w2v, window_size, model_type,
                                        binary_ce=binary_cross_entropy)

        if undersample:
            # Undersample the non group tags at random....probably a bad idea...
            if binary_cross_entropy:
                positive_flags = y_train
            else:
                positive_flags = y_train[:, 1]
            idx_negative = numpy.where(positive_flags == 0)[0]
            idx_positive = numpy.where(positive_flags == 1)[0]
            random_negative_sample = numpy.random.choice(idx_negative, idx_positive.shape[0])

            # Index and join along the leading axis only, which covers the
            # 2-D and 4-D feature arrays and both label layouts.
            X_train = numpy.concatenate((X_train[idx_positive],
                                         X_train[random_negative_sample]))
            y_train = numpy.concatenate((y_train[idx_positive],
                                         y_train[random_negative_sample]))

        print('loaded data...')

        # NOTE(review): no model is built for 'ladder'; the module-level
        # `model` (if any) is reused in that case -- confirm.
        if model_type == 'cnn':
            model = GroupCNN(window_size=window_size, n_feature_maps=n_feature_maps, k_output=k, name=model_name)
        elif model_type == 'nn':
            model = GroupNN(window_size=window_size, k=k, hyperparameter_search=hyperopt, name=model_name)

        if hyperopt:
            best_run, best_model = optim.minimize(model=_model,
                                                  data=_data,
                                                  algo=tpe.suggest,
                                                  max_evals=5,
                                                  trials=Trials())
            model.model = best_model
        else:
            model.train(X_train, y_train, epochs, optim_algo=optimizer, criterion=criterion)

        words = []
        for pmid in test_pmids:
            words.extend(pmids_dict[pmid][0])

        predictions = model.predict_classes(X_test)

        predicted_words = crf.output2words(predictions, words)
        y_test_arg_max = numpy.argmax(y_test, axis=1)
        true_words = crf.output2words(y_test_arg_max, words)

        accuracy, f1_score, precision, auc, recall = model.test(X_test, y_test)
        # The mention-level scores replace the token-level recall,
        # precision and F1 returned by model.test; accuracy and AUC are kept.
        recall, precision, f1_score = crf.eveluate(predicted_words, true_words)

        print("Accuracy: {}".format(accuracy))
        print("F1: {}".format(f1_score))
        print("Precision: {}".format(precision))
        print("AUC: {}".format(auc))
        print("Recall: {}".format(recall))

        accuracies.append(accuracy)
        f1_scores.append(f1_score)
        precisions.append(precision)
        aucs.append(auc)
        recalls.append(recall)

    summary_lines = [
        "Mean Accuracy: {}".format(numpy.mean(accuracies)),
        "Mean F1: {}".format(numpy.mean(f1_scores)),
        "Mean Precision: {}".format(numpy.mean(precisions)),
        "Mean AUC: {}".format(numpy.mean(aucs)),
        "Mean Recall: {}".format(numpy.mean(recalls)),
    ]

    for line in summary_lines:
        print(line)

    # One metric per line (the summaries used to be written back-to-back
    # with no separator), and the handle is closed on exit.
    with open('{}_fold_results'.format(model.model_name), 'w+') as results:
        results.write('\n'.join(summary_lines) + '\n')
示例#6
0
def get_X_y(wv, wv_dim, vectorizer=None, distant=False, n=None):
    """Build flat embedding, token-index and label arrays for all tokens.

    Args:
        wv: token -> word-vector lookup, indexed as ``wv[token]``; expected
            to raise ``KeyError`` for tokens it does not contain.
        wv_dim: length of the random vector drawn for tokens missing
            from ``wv``.
        vectorizer: replaced by the parser's vectorizer in the default
            (non-distant) mode; left as passed in distant mode.
        distant: when true, fetch data through
            ``distant_intervention_tag.distantly_annotate``.
        n: forwarded to ``distantly_annotate`` in distant mode.

    Returns:
        A tuple ``(X_embedded, X_tokens, y, vectorizer,
        unknown_words_to_vecs, token_pmid_list)``. The first three are the
        per-abstract arrays stacked together (``np.vstack`` / ``np.hstack``);
        ``token_pmid_list`` holds the PMID of every token in order.
    """
    sentences, lbls = None, None
    if distant:
        # NOTE(review): this branch never fills `sentences` or `lbls`, so
        # the loop below fails with a TypeError in distant mode -- the
        # conversion from tokens_and_lbls still needs to be written.
        pmids, tagged_abstracts, tokens_and_lbls, intervention_texts = \
                distant_intervention_tag.distantly_annotate(n=n)
    else:
        pmids, sentences, lbls, vectorizer = parse_summerscales.get_tokens_and_lbls(
        )

    # see: https://github.com/fchollet/keras/issues/233
    X_embedded, X_tokens = [], []  # here a sequence associated with each doc/abstract
    y = []

    cur_pmid = pmids[0]

    cur_x_embedded, cur_x_tokens, cur_y, token_pmid_list = [], [], [], []

    unknown_words_to_vecs = {}

    for idx, s in enumerate(sentences):
        if cur_pmid != pmids[idx]:
            # PMID changed: close the sequence accumulated so far.
            X_embedded.append(np.vstack(cur_x_embedded))
            X_tokens.append(np.vstack(cur_x_tokens))
            y.append(np.array(cur_y))
            cur_x_embedded, cur_x_tokens, cur_y = [], [], []
            cur_pmid = pmids[idx]

        for t in s:
            try:
                v = wv[t]
            except KeyError:
                print("%s not known!" % t)

                # Unknown token: reuse one random vector per distinct token
                # (or maybe use 0s???).
                if t not in unknown_words_to_vecs:
                    unknown_words_to_vecs[t] = np.random.uniform(-1, 1, wv_dim)

                v = unknown_words_to_vecs[t]

            cur_x_embedded.append(v)
            cur_x_tokens.append(vectorizer.vocabulary_[t])
            token_pmid_list.append(cur_pmid)

        cur_y.extend(lbls[idx])

    # Close the final sequence, which the loop never flushes.
    X_embedded.append(np.vstack(cur_x_embedded))
    X_tokens.append(np.vstack(cur_x_tokens))
    y.append(np.array(cur_y))

    # Collapse the per-abstract pieces into single flat arrays.
    X_embedded = np.vstack(X_embedded)
    X_tokens = np.vstack(X_tokens)
    y = np.hstack(y)
    return X_embedded, X_tokens, y, vectorizer, unknown_words_to_vecs, token_pmid_list
示例#7
0
def run_crf(w2v, words_before, words_after, shallow_parse):
    """Cross-validate a chain CRF (structured SVM) and print its predictions.

    For each of the 5 shuffled folds, fits a ``FrankWolfeSSVM`` over a
    ``ChainCRF``, prints its score, then prints the words tagged with
    label 1 for every test abstract.

    Args:
        w2v: word-vector model handed to ``abstract2features``.
        words_before: passed to ``abstract2features`` for the training data.
        words_after: passed to ``abstract2features`` for the test data.
        shallow_parse: forwarded to ``abstract2features``.
    """
    pmids_dict, pmids, abstracts, lbls, vectorizer, groups_map, one_hot, dicts = \
        parse_summerscales.get_tokens_and_lbls(
                make_pmids_dict=True, sen=True)

    # Create model
    model = ChainCRF(directed=False)
    ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=30)

    # Materialise the keys so they can be indexed by fold position.
    all_pmids = list(pmids_dict.keys())
    n = len(all_pmids)
    n_folds = 5
    kf = KFold(n, random_state=1337, shuffle=True, n_folds=n_folds)

    for fold_idx, (train, test) in enumerate(kf):
        print("on fold %s" % fold_idx)
        train_pmids = [all_pmids[pmid_idx] for pmid_idx in train]
        test_pmids = [all_pmids[pmid_idx] for pmid_idx in test]
        print('loading data...')
        # NOTE(review): neither call receives train_pmids/test_pmids, so both
        # appear to featurise the whole corpus, differing only in
        # words_before vs words_after -- verify against abstract2features.
        train_x, train_y = abstract2features(pmids_dict, words_before, w2v, shallow_parse)
        test_x, test_y = abstract2features(pmids_dict, words_after, w2v, shallow_parse)

        print('loaded data...')
        print('training...')
        ssvm.fit(train_x, train_y)

        print(ssvm.score(test_x, test_y))

        # test_y is zipped in only to keep the iteration length identical to
        # the shortest of the three sequences.
        for pmid, x, _ in zip(test_pmids, test_x, test_y):
            abstract_words, _, _ = pmids_dict[pmid]

            print(pmid)

            # predict() takes in a list returns another list
            prediction = ssvm.predict([x]).pop(0)

            # Pair each word with its predicted label by position. The label
            # value itself used to be used as the index, which always
            # selected abstract_words[1].
            predicted_words = [word for word, label in zip(abstract_words, prediction)
                               if label == 1]
            for word in predicted_words:
                print("word: {}".format(word))

            if predicted_words:
                output = 'predicted: {}'.format(' '.join(predicted_words))
            else:
                output = 'Predicted nothing!'
            print(output)
示例#8
0
def get_PMIDs_to_X_y(wv, wv_dim, max_length=None, distant=False, n=200):
    """Map every PMID to its embedded tokens, token indices and labels.

    Args:
        wv: token -> word-vector lookup, indexed as ``wv[token]``; expected
            to raise ``KeyError`` for tokens it does not contain.
        wv_dim: length of the random vector drawn for tokens missing
            from ``wv``.
        max_length: when given, each abstract's label list is truncated to
            this length or left-padded with zeros up to it. ``None`` leaves
            the labels untouched.
        distant: when true, also load distantly labelled tokens through
            ``_get_distantly_lbled_tokens`` and return them instead of the
            group map and PMID dict.
        n: forwarded to ``_get_distantly_lbled_tokens`` in distant mode.

    Returns:
        In distant mode ``(pmids_to_X_y, vectorizer, unknown_words_to_vecs,
        X_DS_embedded, X_DS_tokens, y_DS)``; otherwise ``(pmids_to_X_y,
        vectorizer, unknown_words_to_vecs, groups_map, pmids_dict)``.
        ``pmids_to_X_y[pmid]`` is ``(X_embedded, X_tokens, labels)`` with
        the first two as plain lists, one entry per token.
    """
    unknown_words_to_vecs = {}

    if distant:
        tokens_and_lbls, X_DS_embedded, y_DS, tokens_DS, unknown_words_to_vecs = \
                _get_distantly_lbled_tokens(n=n, wv=wv, wv_dim=wv_dim)

    pmids_dict, pmids, abstracts, lbls, vectorizer, groups_map, one_hot, dicts = \
        parse_summerscales.get_tokens_and_lbls(
                make_pmids_dict=True, sen=True)

    # now loop through and get X_tokens representation!
    if distant:
        # really token_indices maybe more correct
        X_DS_tokens = []
        for abs_tokens_and_lbls in tokens_and_lbls:
            for t, _ in abs_tokens_and_lbls:
                X_DS_tokens.append(vectorizer.vocabulary_[t])

    # see: https://github.com/fchollet/keras/issues/233
    pmids_to_X_y = {}

    for pmid in pmids_dict:
        abstract_tokens, abstract_output_labels, _ = pmids_dict[pmid]
        # Accumulators for this abstract only.
        X_embedded = []
        X_tokens = []

        for word_token in abstract_tokens:
            try:
                v = wv[word_token]
            except KeyError:
                # Unknown token: reuse one random vector per distinct token
                # (or maybe use 0s???).
                if word_token not in unknown_words_to_vecs:
                    print("word '%s' not known!" % word_token)
                    unknown_words_to_vecs[word_token] = np.random.uniform(-1, 1, wv_dim)

                v = unknown_words_to_vecs[word_token]
            X_embedded.append(v)
            X_tokens.append(vectorizer.vocabulary_[word_token])

        # Only resize when a target length was requested; the default None
        # used to fall through to the length assertion and always fail it.
        # NOTE(review): only the labels are resized, not X_embedded/X_tokens
        # -- confirm callers pad the inputs themselves.
        if max_length is not None:
            n_labels = len(abstract_output_labels)
            if n_labels > max_length:
                abstract_output_labels = abstract_output_labels[:max_length]
            elif n_labels < max_length:
                padding = [0] * (max_length - n_labels)
                abstract_output_labels = padding + abstract_output_labels
            assert len(abstract_output_labels) == max_length, 'Must be same size'

        pmids_to_X_y[pmid] = (X_embedded, X_tokens, abstract_output_labels)

    if distant:
        return pmids_to_X_y, vectorizer, unknown_words_to_vecs, X_DS_embedded, X_DS_tokens, y_DS

    return pmids_to_X_y, vectorizer, unknown_words_to_vecs, groups_map, pmids_dict
示例#9
0
def run_crf(w2v, words_before, words_after, shallow_parse):
    """Cross-validate a pystruct chain CRF and print the predicted words.

    Loads the corpus through ``parse_summerscales.get_tokens_and_lbls``,
    fits a ``FrankWolfeSSVM`` wrapped around a ``ChainCRF`` on each of five
    shuffled folds, prints the score on the test features and then, for
    every test abstract, the words whose predicted label is 1.

    Args:
        w2v: word-embedding model, forwarded to ``abstract2features``.
        words_before: forwarded to ``abstract2features`` for the training
            features.
        words_after: forwarded to ``abstract2features`` for the test
            features.
        shallow_parse: forwarded to ``abstract2features``.
    """
    pmids_dict, pmids, abstracts, lbls, vectorizer, groups_map, one_hot, dicts = \
        parse_summerscales.get_tokens_and_lbls(
            make_pmids_dict=True, sen=True)

    # Create model
    model = ChainCRF(directed=False)
    ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=30)

    # list() keeps the keys indexable when dict.keys() returns a view.
    all_pmids = list(pmids_dict.keys())
    n = len(all_pmids)
    n_folds = 5
    kf = KFold(n, random_state=1337, shuffle=True, n_folds=n_folds)

    for fold_idx, (train, test) in enumerate(kf):
        print("on fold %s" % fold_idx)
        test_pmids = [all_pmids[pmid_idx] for pmid_idx in test]
        print('loading data...')
        # NOTE(review): both calls receive the whole pmids_dict and no
        # fold-specific PMIDs, so the KFold split only influences
        # `test_pmids` below -- confirm against abstract2features.
        train_x, train_y = abstract2features(pmids_dict, words_before, w2v,
                                             shallow_parse)
        test_x, test_y = abstract2features(pmids_dict, words_after, w2v,
                                           shallow_parse)

        print('loaded data...')
        print('training...')
        ssvm.fit(train_x, train_y)

        print(ssvm.score(test_x, test_y))

        for pmid, x, _ in zip(test_pmids, test_x, test_y):
            abstract_words, _, _ = pmids_dict[pmid]

            print(pmid)

            # predict() takes in a list returns another list
            prediction = ssvm.predict([x]).pop(0)

            # Pair every label with the word at the same position; the
            # label value itself must not be used as the word index.
            # zip() stops at the shorter sequence, so a length mismatch
            # cannot raise IndexError.
            predicted_words = []
            for label, word in zip(prediction, abstract_words):
                if label == 1:
                    print("word: {}".format(word))
                    predicted_words.append(word)

            predicted = ' '.join(predicted_words)
            if predicted:
                output = 'predicted: {}'.format(predicted)
            else:
                output = 'Predicted nothing!'
            print(output)
示例#10
0
def main():
    """Run 5-fold cross-validation of the group-detection network.

    Command-line options (all long options taking a value):
    ``--window_size``, ``--wiki``, ``--n_feature_maps``, ``--epochs``,
    ``--undersample``, ``--criterion``, ``--optimizer``, ``--model``,
    ``--genia``, ``--tacc``, ``--layers``, ``--hyperopt``, ``--model_name``.
    Flag-like options are switched with the integers 0/1.

    For every fold the data is prepared with ``_prep_data``, optionally
    under-sampled, a model is trained (or tuned with hyperopt) and the fold
    metrics are printed.  The mean metrics are printed at the end and
    written, one per line, to ``<model_name>_fold_results``.
    """
    n_folds = 5
    try:
        opts, args = getopt.getopt(sys.argv[1:], '', [
            'window_size=', 'wiki=', 'n_feature_maps=', 'epochs=',
            'undersample=', 'criterion=', 'optimizer=',
            'model=', 'genia=', 'tacc=', 'layers=', 'hyperopt=', 'model_name='
        ])
    except getopt.GetoptError as error:
        print(error)
        sys.exit(2)

    # Defaults, overridden by the options parsed below.
    model_type = 'nn'
    window_size = 5
    wiki = True
    n_feature_maps = 100
    epochs = 20
    undersample = False
    binary_cross_entropy = False
    criterion = 'categorical_crossentropy'
    optimizer = 'adam'
    k = 2
    use_genia = False
    using_tacc = False
    layer_sizes = []
    hyperopt = False
    model_name = 'model'

    for opt, arg in opts:
        if opt == '--window_size':
            window_size = int(arg)
        elif opt == '--wiki':
            # getopt always yields strings: convert before comparing,
            # otherwise "--wiki 0" could never switch the flag off.
            if int(arg) == 0:
                wiki = False
        elif opt == '--epochs':
            epochs = int(arg)
        elif opt == '--layers':
            # Parsed for completeness; not consumed further down.
            layer_sizes = arg.split(',')
        elif opt == '--n_feature_maps':
            n_feature_maps = int(arg)
        elif opt == '--undersample':
            if int(arg) == 1:
                undersample = True
        elif opt == '--criterion':
            criterion = arg
        elif opt == '--optimizer':
            optimizer = arg
        elif opt == '--model':
            model_type = arg
        elif opt == '--genia':
            if int(arg) == 1:
                use_genia = True
        elif opt == '--tacc':
            if int(arg) == 1:
                using_tacc = True
        elif opt == '--hyperopt':
            if int(arg) == 1:
                hyperopt = True
        elif opt == '--model_name':
            model_name = arg
        else:
            print("Option {} is not valid!".format(opt))

    if criterion == 'binary_crossentropy':
        binary_cross_entropy = True
        k = 1

    print('Loading word2vec model...')

    if wiki:
        print('Using wiki word2vec...')
        word2vec_model = 'wikipedia-pubmed-and-PMC-w2v.bin'
    else:
        print('Using non-wiki word2vec...')
        word2vec_model = 'PubMed-w2v.bin'
    w2v = Word2Vec.load_word2vec_format(word2vec_model, binary=True)
    print('Loaded word2vec model')

    pmids_dict, pmids, abstracts, lbls, vectorizer, groups_map, one_hot, dicts = \
        parse_summerscales.get_tokens_and_lbls(
            make_pmids_dict=True, sen=True, use_genia=use_genia, using_tacc=using_tacc)
    # list() keeps the keys indexable when dict.keys() returns a view.
    all_pmids = list(pmids_dict.keys())
    n = len(all_pmids)
    kf = KFold(n, random_state=1337, shuffle=True, n_folds=n_folds)

    accuracies = []
    recalls = []
    precisions = []
    f1_scores = []
    aucs = []

    global model

    for fold_idx, (train, test) in enumerate(kf):
        print("on fold %s" % fold_idx)
        train_pmids = [all_pmids[pmid_idx] for pmid_idx in train]
        test_pmids = [all_pmids[pmid_idx] for pmid_idx in test]

        print(train_pmids)
        print('loading data...')

        # The three supported model types share exactly the same data
        # preparation call.
        if model_type in ('cnn', 'nn', 'ladder'):
            X_train, y_train = _prep_data(train_pmids,
                                          pmids_dict,
                                          w2v,
                                          window_size,
                                          model_type,
                                          binary_ce=binary_cross_entropy)
            X_test, y_test = _prep_data(test_pmids,
                                        pmids_dict,
                                        w2v,
                                        window_size,
                                        model_type,
                                        binary_ce=binary_cross_entropy)

        if undersample:
            # Undersample the non group tags at random....probably a bad idea...
            if binary_cross_entropy:
                idx_negative = numpy.where(y_train == 0)[0]
                idx_positive = numpy.where(y_train == 1)[0]
            else:
                idx_negative = numpy.where(y_train[:, 1] == 0)[0]
                idx_positive = numpy.where(y_train[:, 1] == 1)[0]
            # Draw as many negatives as there are positives.
            random_negative_sample = numpy.random.choice(
                idx_negative, idx_positive.shape[0])

            if model_type == 'nn':
                X_train_positive = X_train[idx_positive, :]
                X_train_negative = X_train[random_negative_sample, :]
            else:
                X_train_positive = X_train[idx_positive, :, :, :]
                X_train_negative = X_train[random_negative_sample, :, :, :]

            X_train = numpy.vstack((X_train_positive, X_train_negative))

            if binary_cross_entropy:
                y_train = numpy.hstack((y_train[idx_positive],
                                        y_train[random_negative_sample]))
            else:
                y_train = numpy.vstack((y_train[idx_positive, :],
                                        y_train[random_negative_sample, :]))

        print('loaded data...')

        # NOTE(review): no model is built here for model_type == 'ladder';
        # the code below then relies on whatever the global `model` holds.
        if model_type == 'cnn':
            model = GroupCNN(window_size=window_size,
                             n_feature_maps=n_feature_maps,
                             k_output=k,
                             name=model_name)
        elif model_type == 'nn':
            model = GroupNN(window_size=window_size,
                            k=k,
                            hyperparameter_search=hyperopt,
                            name=model_name)

        if hyperopt:
            best_run, best_model = optim.minimize(model=_model,
                                                  data=_data,
                                                  algo=tpe.suggest,
                                                  max_evals=5,
                                                  trials=Trials())
            model.model = best_model
        else:
            model.train(X_train,
                        y_train,
                        epochs,
                        optim_algo=optimizer,
                        criterion=criterion)

        words = []
        for pmid in test_pmids:
            words.extend(pmids_dict[pmid][0])

        predictions = model.predict_classes(X_test)

        predicted_words = crf.output2words(predictions, words)
        y_test_arg_max = numpy.argmax(y_test, axis=1)
        true_words = crf.output2words(y_test_arg_max, words)

        accuracy, f1_score, precision, auc, recall = model.test(X_test, y_test)
        # The values from crf.eveluate replace the recall / precision / F1
        # returned by model.test; accuracy and AUC are kept.
        recall, precision, f1_score = crf.eveluate(predicted_words, true_words)

        print("Accuracy: {}".format(accuracy))
        print("F1: {}".format(f1_score))
        print("Precision: {}".format(precision))
        print("AUC: {}".format(auc))
        print("Recall: {}".format(recall))

        accuracies.append(accuracy)
        f1_scores.append(f1_score)
        precisions.append(precision)
        aucs.append(auc)
        recalls.append(recall)

    summary_lines = [
        "Mean Accuracy: {}".format(numpy.mean(accuracies)),
        "Mean F1: {}".format(numpy.mean(f1_scores)),
        "Mean Precision: {}".format(numpy.mean(precisions)),
        "Mean AUC: {}".format(numpy.mean(aucs)),
        "Mean Recall: {}".format(numpy.mean(recalls)),
    ]

    for line in summary_lines:
        print(line)

    # The context manager guarantees the file is flushed and closed; each
    # metric goes on its own line so the file stays readable.
    with open('{}_fold_results'.format(model.model_name), 'w+') as results:
        for line in summary_lines:
            results.write(line + '\n')
示例#11
0
文件: crf.py 项目: bwallace/Deep-PICO
def run_crf(w2v,
            l2,
            l1,
            iters,
            shallow_parse,
            words_before,
            words_after,
            grid_search,
            tacc,
            name,
            transfer_learning=False):
    """Cross-validate a CRF tagger over five shuffled folds of abstracts.

    For every fold the features are built (either with
    ``abstracts2features`` or, when ``transfer_learning`` is set, from the
    activations of a stored ``GroupNN`` model), a tagger is obtained
    (randomised hyper-parameter search with ``sklearn_crfsuite`` when
    ``grid_search`` is set, a plain ``pycrfsuite`` training run otherwise),
    the strongest transitions / state features are printed, and the
    mentions predicted for each test abstract are scored with ``eveluate``.
    Fold scores are written to ``<name>_model <fold>_results.txt`` and the
    averages over all folds are printed at the end.

    Args:
        w2v: word-embedding model forwarded to the feature builders.
        l2: coefficient for the L2 penalty (``c2``).
        l1: coefficient for the L1 penalty (``c1``).
        iters: maximum number of training iterations.
        shallow_parse: forwarded as ``use_genia`` to the corpus loader and
            to ``abstracts2features``.
        words_before: forwarded to ``abstracts2features``.
        words_after: forwarded to ``abstracts2features``.
        grid_search: when true, tune ``c1``/``c2`` with
            ``RandomizedSearchCV`` instead of training a single model.
        tacc: forwarded as ``using_tacc`` to the corpus loader.
        name: prefix for the model and result file names.
        transfer_learning: when true, use features transformed by the
            network stored in ``NNModel.hdf5``.
    """
    pmids_dict, pmids, abstracts, lbls, vectorizer, groups_map, one_hot, dicts = \
        parse_summerscales.get_tokens_and_lbls(
            make_pmids_dict=True, sen=True, use_genia=shallow_parse, using_tacc=tacc)

    # list() keeps the keys indexable when dict.keys() returns a view.
    all_pmids = list(pmids_dict.keys())
    n = len(all_pmids)
    n_folds = 5
    kf = KFold(n, random_state=1337, shuffle=True, n_folds=n_folds)
    recall_scores = []
    precision_scores = []
    f1_scores = []
    model_type = 'nn'
    binary_cross_entropy = True

    def print_transitions(trans_features):
        for (label_from, label_to), weight in trans_features:
            print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

    def print_state_features(state_features):
        for (attr, label), weight in state_features:
            print("%0.6f %-6s %s" % (weight, label, attr))

    for fold_idx, (train, test) in enumerate(kf):
        print("on fold %s" % fold_idx)
        train_pmids = [all_pmids[pmid_idx] for pmid_idx in train]
        test_pmids = [all_pmids[pmid_idx] for pmid_idx in test]
        # Needed by both branches below (the results file uses it too), so
        # it must not be defined only in the non-grid-search path.
        model_name = name + '_model {}'.format(fold_idx)
        print('loading data...')

        if transfer_learning:
            nn_model = GroupNN.load_model(model_path='NNModel.hdf5',
                                          model_info_path='NNModel.hdf5.p')
            window_size = nn_model.model_info['window_size']

            train_x, train_y = GroupCNNExperiment._prep_data(
                train_pmids,
                pmids_dict,
                w2v,
                window_size,
                model_type,
                binary_ce=binary_cross_entropy,
                crf=True)
            test_x, test_y = GroupCNNExperiment._prep_data(
                test_pmids,
                pmids_dict,
                w2v,
                window_size,
                model_type,
                binary_ce=binary_cross_entropy,
                crf=True)

            train_x = transform_features(nn_model, train_x)
            test_x = transform_features(nn_model, test_x)

            train_y = _labels_to_str(train_y)
            test_y = _labels_to_str(test_y)
        else:
            train_x, train_y = abstracts2features(pmids_dict, train_pmids,
                                                  words_before, words_after,
                                                  w2v, shallow_parse)
            test_x, test_y = abstracts2features(pmids_dict, test_pmids,
                                                words_before, words_after, w2v,
                                                shallow_parse)

        print('loaded data...')

        if grid_search:
            crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                       c1=l1,
                                       c2=l2,
                                       max_iterations=iters,
                                       all_possible_transitions=False)

            params_space = {
                'c1': scipy.stats.expon(scale=0.5),
                'c2': scipy.stats.expon(scale=0.05),
            }

            # use the same metric for evaluation
            f1_scorer = make_scorer(metrics.flat_f1_score,
                                    average='weighted',
                                    labels=test_y)

            # search
            rs = RandomizedSearchCV(crf,
                                    params_space,
                                    cv=3,
                                    verbose=1,
                                    n_jobs=-1,
                                    n_iter=50,
                                    scoring=f1_scorer)
            rs.fit(train_x, train_y)
            tagger = rs.best_estimator_.tagger_
            info = tagger.info()
        else:
            # A fresh trainer per fold: appending to one shared trainer
            # would carry earlier folds' training sequences (which contain
            # this fold's test abstracts) into the current model.
            model = pycrfsuite.Trainer(verbose=False)
            for x, y in zip(train_x, train_y):
                model.append(x, y)

            model.set_params({
                'c1': l1,  # coefficient for L1 penalty
                'c2': l2,  # coefficient for L2 penalty
                'max_iterations': iters,  # stop earlier

                # include transitions that are possible, but not observed
                'feature.possible_transitions': True
            })
            print('training model...')
            model.train(model_name)
            print('done...')
            tagger = pycrfsuite.Tagger()
            tagger.open(model_name)

            info = tagger.info()

        print("Top likely transitions:")
        print_transitions(Counter(info.transitions).most_common(80))

        print("\nTop unlikely transitions:")
        print_transitions(Counter(info.transitions).most_common()[-80:])

        print("Top positive:")
        print_state_features(Counter(info.state_features).most_common(80))

        print("\nTop negative:")
        print_state_features(Counter(info.state_features).most_common()[-80:])

        abstract_predicted_mentions, true_abstract_mentions = [], []

        for pmid, x, _ in zip(test_pmids, test_x, test_y):
            print(pmid)
            abstract_words, abstract_labels, _, _, _ = pmids_dict[pmid]

            pred_labels = tagger.tag(x)
            pred_mentions = output2words(pred_labels, abstract_words)
            true_mentions = output2words(abstract_labels, abstract_words)

            print("Predicted: {}".format(pred_mentions))
            print("True: {}".format(true_mentions))
            print('\n')

            abstract_predicted_mentions.append(pred_mentions)
            true_abstract_mentions.append(true_mentions)

        fold_recall, fold_precision, fold_f1_score = eveluate(
            abstract_predicted_mentions, true_abstract_mentions)
        recall_scores.append(fold_recall)
        precision_scores.append(fold_precision)
        f1_scores.append(fold_f1_score)

        fold_recall_results = "Fold recall: {}".format(fold_recall)
        fold_precision_results = "Fold precision: {}".format(fold_precision)
        fold_f1_results = "Fold F1 Score: {}".format(fold_f1_score)
        print(fold_recall_results)
        print(fold_precision_results)
        print(fold_f1_results)

        # Context manager so the per-fold results are flushed and the
        # handle is released before the next fold starts.
        with open(model_name + '_results.txt', 'w+') as results_file:
            results_file.write(fold_recall_results + '\n')
            results_file.write(fold_precision_results + '\n')
            results_file.write(fold_f1_results + '\n')

    recall_average = _compute_average(recall_scores)
    precision_average = _compute_average(precision_scores)
    f1_average = _compute_average(f1_scores)

    print("Recall Average: {}".format(recall_average))
    print("Precision Average: {}".format(precision_average))
    print("F1 Average: {}".format(f1_average))
示例#12
0
def get_X_y(wv, wv_dim, vectorizer=None, distant=False, n=None):
    """Build token-level embeddings, vocabulary indices and labels.

    Sentences come from ``parse_summerscales.get_tokens_and_lbls`` and are
    walked in order.  Every token is looked up in ``wv``; tokens missing
    from ``wv`` get a random vector drawn uniformly from [-1, 1) that is
    remembered so the same token always maps to the same vector.

    Args:
        wv: mapping from token to embedding vector; a missing token must
            raise ``KeyError``.
        wv_dim: length of the random vectors created for unknown tokens.
        vectorizer: only used when ``distant`` is true; otherwise replaced
            by the vectorizer returned by the corpus loader.
        distant: when true, ``distant_intervention_tag.distantly_annotate``
            is called instead of the corpus loader.
        n: forwarded to ``distantly_annotate``.

    Returns:
        Tuple ``(X_embedded, X_tokens, y, vectorizer,
        unknown_words_to_vecs, token_pmid_list)`` where the first three are
        the per-document arrays stacked together and ``token_pmid_list``
        holds the PMID of every token in order.

    Raises:
        TypeError: if no sentences are available (currently always the case
            for ``distant=True``).
    """
    sentences, lbls = None, None
    if distant:
        # NOTE(review): this call does not yield `sentences` / `lbls`, so
        # the distant path cannot proceed past the check below.
        pmids, tagged_abstracts, tokens_and_lbls, intervention_texts = \
            distant_intervention_tag.distantly_annotate(n=n)
    else:
        pmids, sentences, lbls, vectorizer = parse_summerscales.get_tokens_and_lbls()

    if sentences is None:
        raise TypeError("no sentences available (distant=%r)" % (distant,))

    # One entry per document (PMID); a new document starts whenever the
    # PMID of the current sentence differs from the previous one.
    X_embedded, X_tokens, y = [], [], []
    cur_x_embedded, cur_x_tokens, cur_y = [], [], []
    token_pmid_list = []
    unknown_words_to_vecs = {}

    cur_pmid = pmids[0]

    for idx, sentence in enumerate(sentences):
        if cur_pmid != pmids[idx]:
            # Close the previous document before starting the next.
            X_embedded.append(np.vstack(cur_x_embedded))
            X_tokens.append(np.vstack(cur_x_tokens))
            y.append(np.array(cur_y))
            cur_x_embedded, cur_x_tokens, cur_y = [], [], []
            cur_pmid = pmids[idx]

        for token in sentence:
            try:
                v = wv[token]
            except KeyError:
                print("%s not known!" % token)

                # or maybe use 0s???
                if token not in unknown_words_to_vecs:
                    unknown_words_to_vecs[token] = np.random.uniform(-1, 1, wv_dim)

                v = unknown_words_to_vecs[token]

            cur_x_embedded.append(v)
            cur_x_tokens.append(vectorizer.vocabulary_[token])
            token_pmid_list.append(cur_pmid)

        cur_y.extend(lbls[idx])

    # Flush the last document.
    X_embedded.append(np.vstack(cur_x_embedded))
    X_tokens.append(np.vstack(cur_x_tokens))
    y.append(np.array(cur_y))

    X_embedded = np.vstack(X_embedded)
    X_tokens = np.vstack(X_tokens)
    y = np.hstack(y)
    return X_embedded, X_tokens, y, vectorizer, unknown_words_to_vecs, token_pmid_list
示例#13
0
def get_PMIDs_to_X_y(wv, wv_dim, max_length=None, distant=False, n=200):
    """Map every PMID to its token embeddings, token indices and labels.

    The corpus is loaded with ``parse_summerscales.get_tokens_and_lbls``.
    Each abstract token is looked up in ``wv``; tokens missing from ``wv``
    get a random vector drawn uniformly from [-1, 1) that is cached in
    ``unknown_words_to_vecs``.

    Args:
        wv: mapping from token to embedding vector; a missing token must
            raise ``KeyError``.
        wv_dim: length of the random vectors created for unknown tokens.
        max_length: when given, the label sequence of every abstract is cut
            to this length or left-padded with zeros up to it.  ``None``
            leaves the labels untouched.  Only the labels are adjusted; the
            embedding and token lists keep their original length.
        distant: when true, distantly labelled tokens are loaded as well
            (``_get_distantly_lbled_tokens``) and returned.
        n: forwarded to ``_get_distantly_lbled_tokens``.

    Returns:
        ``(pmids_to_X_y, vectorizer, unknown_words_to_vecs, X_DS_embedded,
        X_DS_tokens, y_DS)`` when ``distant`` is true, otherwise
        ``(pmids_to_X_y, vectorizer, unknown_words_to_vecs, groups_map,
        pmids_dict)``.  ``pmids_to_X_y[pmid]`` is the tuple
        ``(X_embedded, X_tokens, labels)``.
    """
    unknown_words_to_vecs = {}

    if distant:
        tokens_and_lbls, X_DS_embedded, y_DS, tokens_DS, unknown_words_to_vecs = \
            _get_distantly_lbled_tokens(n=n, wv=wv, wv_dim=wv_dim)

    pmids_dict, pmids, abstracts, lbls, vectorizer, groups_map, one_hot, dicts = \
        parse_summerscales.get_tokens_and_lbls(
            make_pmids_dict=True, sen=True)

    if distant:
        # really token_indices maybe more correct
        X_DS_tokens = [
            vectorizer.vocabulary_[t]
            for abs_tokens_and_lbls in tokens_and_lbls
            for t, _lbl in abs_tokens_and_lbls
        ]

    pmids_to_X_y = {}

    for pmid in pmids_dict:
        abstract_tokens, abstract_output_labels, _ = pmids_dict[pmid]
        X_embedded = []
        X_tokens = []

        for word_token in abstract_tokens:
            try:
                v = wv[word_token]
            except KeyError:
                # or maybe use 0s???
                if word_token not in unknown_words_to_vecs:
                    print("word '%s' not known!" % word_token)
                    unknown_words_to_vecs[word_token] = \
                        np.random.uniform(-1, 1, wv_dim)

                v = unknown_words_to_vecs[word_token]
            X_embedded.append(v)
            X_tokens.append(vectorizer.vocabulary_[word_token])

        # Without an explicit target length there is nothing to normalise;
        # comparing a length against None is meaningless.
        if max_length is not None:
            n_labels = len(abstract_output_labels)
            if n_labels > max_length:
                abstract_output_labels = abstract_output_labels[:max_length]
            elif n_labels < max_length:
                # Left-pad with the 0 label.
                padding = [0] * (max_length - n_labels)
                abstract_output_labels = padding + abstract_output_labels
            assert len(abstract_output_labels) == max_length, 'Must be same size'

        pmids_to_X_y[pmid] = (X_embedded, X_tokens, abstract_output_labels)

    if distant:
        return pmids_to_X_y, vectorizer, unknown_words_to_vecs, X_DS_embedded, X_DS_tokens, y_DS

    return pmids_to_X_y, vectorizer, unknown_words_to_vecs, groups_map, pmids_dict