Exemplo n.º 1
0
def apply_d2v_sectionwise():
    '''
    Evaluate doc2vec for comparison of patent sections.

    Loads a pretrained doc2vec model and the claims corpus, infers a
    feature vector per patent, computes linear-kernel similarity score
    distributions for cited / duplicate / random patent pairs, and plots
    the histograms together with the cited-vs-random AUC.
    '''
    # embedding size of the pretrained model; only used to name the histogram dir
    size = 50
    # load the pretrained doc2vec model; pickle data must be read in binary
    # mode, and the context manager closes the handle (the original left an
    # unclosed text-mode handle open)
    with open("../doc2vec/models/full_pat_corpus_dm_50_min5_iter18.model",
              "rb") as fin:
        model = pkl.load(fin)
    # corpus and the id sets that define the evaluation pairs
    # (.item() unwraps dicts stored inside 0-d numpy object arrays)
    corpus = np.load('../corpus/corpus_claims.npy').item()
    target_ids = np.load('../corpus/target_ids.npy')
    random_ids = np.load('../corpus/random_ids.npy')
    dupl_ids = np.load('../corpus/dupl_ids.npy').item()
    cited_ids = np.load('../corpus/cited_ids.npy').item()
    id_dict = make_combis(target_ids, random_ids, cited_ids, dupl_ids)
    # infer a doc2vec feature vector for every patent in the corpus
    patfeats_d2v = infer_patfeats(corpus, model)
    scores = calc_simcoef_distr(patfeats_d2v, ['cited', 'duplicate', 'random'],
                                id_dict, 'linear')
    # [2] picks the third element of calc_auc's return — presumably the AUC
    # value itself; TODO confirm against calc_auc's definition
    auc = calc_auc(scores['cited'], scores['random'])[2]
    plot_score_distr('human_eval',
                     'linear', ['random', 'cited', 'duplicate'], {
                         'cited': scores['cited'],
                         'random': scores['random'],
                         'duplicate': scores['duplicate']
                     },
                     auc, ['cited'],
                     histdir='doc2vec_full%i_claims' % size,
                     bins=50)
Exemplo n.º 2
0
def make_kpca_feats():
    '''
    Build bag-of-words features for the abstract corpus and split them
    into training and target patents.

    Saves both splits to 'human_eval/corpus_info/' and returns them.

    Returns:
        (train_feats, target_feats): dicts mapping patent id -> feature
        dict; train_feats excludes the target patents, target_feats
        contains only them.
    '''
    # id sets defining the evaluation pairs
    # NOTE(review): id_dict is never used below — kept for parity with the
    # other experiments / possible make_combis side effects; candidate for removal
    target_ids = np.load('../corpus/target_ids.npy')
    random_ids = np.load('../corpus/random_ids.npy')
    dupl_ids = np.load('../corpus/dupl_ids.npy').item()
    cited_ids = np.load('../corpus/cited_ids.npy').item()
    id_dict = make_combis(target_ids, random_ids, cited_ids, dupl_ids)

    # load corpus (.item() unwraps the dict stored in a 0-d object array)
    pat_corpus = np.load('../corpus/corpus_abstract.npy').item()
    # extract features (max-renormalized bag-of-words)
    ft = FeatureTransform(renorm='max')
    docfeats = ft.texts2features(pat_corpus)
    # split into target patents and the rest
    # (the unused local `doc_ids = docfeats.keys()` was removed)
    train_feats = {pid: pat for pid, pat in docfeats.items()
                   if pid not in target_ids}
    target_feats = {pid: docfeats[pid] for pid in target_ids}
    np.save('human_eval/corpus_info/train_feats_full.npy', train_feats)
    np.save('human_eval/corpus_info/target_feats_full.npy', target_feats)
    return train_feats, target_feats
Exemplo n.º 3
0
def apply_d2v_full_corpus():
    '''
    Evaluate a pretrained doc2vec model on the full patent corpus.

    For each embedding size, rebuilds the doc2vec corpus, infers feature
    vectors for the target patents, computes linear-kernel similarity
    distributions for cited / duplicate / random pairs, and plots the
    histograms with the cited-vs-random AUC.
    '''
    # id sets defining the evaluation pairs
    target_ids = np.load('../corpus/target_ids.npy')
    random_ids = np.load('../corpus/random_ids.npy')
    dupl_ids = np.load('../corpus/dupl_ids.npy').item()
    cited_ids = np.load('../corpus/cited_ids.npy').item()
    id_dict = make_combis(target_ids, random_ids, cited_ids, dupl_ids)
    # NOTE: the original also loaded '../corpus/corpus.npy' here, but that
    # value was immediately overwritten inside the loop, so the load is dropped
    for size in [50]:
        # first element of make_d2v_corpus's return is unused here
        _, target_pat_corpus = make_d2v_corpus(target_ids)
        # load the pretrained doc2vec model; binary mode is required for
        # pickle data and the context manager closes the handle (the
        # original leaked an unclosed text-mode handle)
        with open("../doc2vec/models/full_pat_corpus_dm_50_min5_iter18.model",
                  "rb") as fin:
            model = pkl.load(fin)
        patfeats_d2v = make_doc2vec_corpus(model, target_pat_corpus)

        scores = calc_simcoef_distr(patfeats_d2v,
                                    ['cited', 'duplicate', 'random'], id_dict,
                                    'linear')
        # [2] picks the third element of calc_auc's return — presumably the
        # AUC value; TODO confirm against calc_auc's definition
        auc = calc_auc(scores['cited'], scores['random'])[2]
        plot_score_distr('human_eval',
                         'linear', ['random', 'cited', 'duplicate'], {
                             'cited': scores['cited'],
                             'random': scores['random'],
                             'duplicate': scores['duplicate']
                         },
                         auc, ['cited'],
                         histdir='doc2vec_full%i_no_target' % size,
                         bins=50)
Exemplo n.º 4
0
                         },
                         human_auc, ['relevant'],
                         histdir='kpca_1000_rel_corp',
                         bins=20)


# Script entry point: evaluate KernelPCA (LSA-style) dimensionality
# reduction of bag-of-words claim features for several component counts.
# NOTE: Python 2 syntax (`print n_comp` below); the loop body appears to
# continue beyond this excerpt.
if __name__ == "__main__":
    #apply_kpca_rel_corpus()

    #train_feats, target_feats = make_kpca_feats()
    # id sets defining the evaluation pairs
    # NOTE(review): paths here are relative ('corpus/...'), unlike the
    # '../corpus/...' paths used in the functions above — confirm the
    # intended working directory
    target_ids = np.load('corpus/target_ids.npy')
    random_ids = np.load('corpus/random_ids.npy')
    dupl_ids = np.load('corpus/dupl_ids.npy').item()
    cited_ids = np.load('corpus/cited_ids.npy').item()

    id_dict = make_combis(target_ids, random_ids, cited_ids, dupl_ids)
    # precomputed claim features (dicts stored in 0-d numpy object arrays)
    train_feats = np.load(
        'human_eval/corpus_info/train_feats_claims.npy').item()
    target_feats = np.load(
        'human_eval/corpus_info/target_feats_claims.npy').item()
    # make feature matrices; target matrix reuses the training featurenames
    # so both share the same column order
    X_train, featurenames = features2mat(train_feats, train_feats.keys())
    #np.save('human_eval/corpus_info/featurenames_full_corpus.npy', featurenames)
    X_target, _ = features2mat(target_feats, target_feats.keys(), featurenames)
    for n_comp in [100, 250, 500, 1000]:
        print n_comp
        # fit LSA (linear-kernel KernelPCA == truncated SVD / LSA)
        kpca = KernelPCA(n_components=n_comp, kernel='linear')
        X_train_kpca = kpca.fit_transform(X_train)
        #pkl.dump(kpca, open('human_eval/models/kpca_%i.model' %n_comp, 'wb'), -1)
        # project the held-out target patents into the fitted subspace
        X_target_kpca = kpca.transform(X_target)