Example #1
def corpus_to_patfeats(model, corpus, target_ids):
    '''
    Transform corpus into doc2vec feature vectors
    Checks if the patents in the corpus are contained in the model
    If so: take the learned document vector
    otherwise: infer the vector
    '''
    patfeats_d2v = {}
    vecsize = len(model.docvecs[0])
    cont = 0
    not_cont = 0
    for pid, pat in corpus.items():
        # check if the patents in the corpus are contained in the model
        if pid in model.docvecs.doctags.keys():
            patfeats_d2v[pid] = norm_dict(
                dict(zip(range(vecsize), model.docvecs[pid])), 'length')
            cont += 1
        else:
            not_cont += 1
            patfeats_d2v[pid] = norm_dict(
                dict(
                    zip(range(vecsize),
                        model.infer_vector(pat.lower().split()))), 'length')
    for tid in target_ids:
        patfeats_d2v[tid] = norm_dict(
            dict(
                zip(range(vecsize),
                    model.infer_vector(corpus[tid].lower().split()))),
            'length')
    print(cont, not_cont)  # patents with learned vs. inferred vectors
    return patfeats_d2v
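The norm_dict helper used throughout these examples is not shown on this page; the calls above suggest it rescales a {key: value} feature dict by a chosen norm, with 'length' denoting the Euclidean norm. A minimal sketch of such a helper, written as an assumption about its behavior rather than the original implementation:

import math

def norm_dict(d, norm='length'):
    # hypothetical stand-in for the original norm_dict helper (assumption)
    # 'length': divide by the Euclidean norm; 'max': divide by the largest absolute value
    if norm == 'length':
        z = math.sqrt(sum(v * v for v in d.values()))
    elif norm == 'max':
        z = max(abs(v) for v in d.values())
    else:
        return dict(d)
    if not z:
        return dict(d)
    return {k: v / float(z) for k, v in d.items()}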
Example #2
def make_doc2vec_corpus(model, target_pat_corpus=False):
    patfeats_d2v = {}
    vecsize = len(model.docvecs[0])
    # get doc vecs for training documents
    for pid in model.docvecs.doctags.keys():
        patfeats_d2v[pid] = norm_dict(
            dict(zip(range(vecsize), model.docvecs[pid])), 'length')
    if target_pat_corpus:
        # infer doc vecs for target patents
        for pid, pat in target_pat_corpus.items():
            patfeats_d2v[pid] = norm_dict(
                dict(zip(range(vecsize), model.infer_vector(pat))), 'length')
    return patfeats_d2v
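A minimal usage sketch for make_doc2vec_corpus on a toy corpus, assuming gensim 3.x (where model.docvecs.doctags is available) and the norm_dict helper sketched after Example #1; the model settings and ids are illustrative only:

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# toy training corpus: patent id -> text
train_texts = {'p1': 'optical sensor for measuring distance',
               'p2': 'method for wireless data transmission'}
documents = [TaggedDocument(text.lower().split(), [pid])
             for pid, text in train_texts.items()]
model = Doc2Vec(documents, vector_size=10, min_count=1, epochs=20)

# unseen target patents are passed pre-tokenized and get inferred vectors
target_pat_corpus = {'t1': 'sensor for wireless distance measurement'.split()}
patfeats = make_doc2vec_corpus(model, target_pat_corpus)
print(sorted(patfeats.keys()))  # ['p1', 'p2', 't1']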
Example #3
def infer_patfeats(corpus, model):
    patfeats_d2v = {}
    vecsize = len(model.docvecs[0])
    for pid, pat in corpus.items():
        patfeats_d2v[pid] = norm_dict(
            dict(zip(range(vecsize), model.infer_vector(pat.lower().split()))),
            'length')
    return patfeats_d2v
Example #4
 def texts2features(self, textdict, fit_ids=[]):
     """
     preprocess texts, count how often each word occurs, weight counts, normalize
     On the first call, the idf weights and bigrams are computed as needed (using the documents
         specified in fit_ids); in later calls, e.g. when applying the routine to new test documents,
         the precomputed weights and bigrams are reused.
     Input:
         - textdict: a dict with {docid: text}
         - fit_ids: ids of the texts that should be used to compute the weights and identify bigrams
                    (e.g. only the training documents; only relevant in the first, initializing call)
     Returns:
         - docfeats: a dict with {docid: {term: (normalized/weighted) count}}
     """
     docids = set(textdict.keys())
     if not fit_ids:
         fit_ids = set(textdict.keys())
     # pre-process texts
     textdict_pp = {did:preprocess_text(textdict[did], self.to_lower, self.norm_num) for did in docids}
     # possibly find bigrams
     if self.identify_bigrams:
         if not self.bigrams:
             self.bigrams = find_bigrams(select_copy(textdict_pp, fit_ids), self.bg_threshold)
         textdict_pp = replace_bigrams(textdict_pp, self.bigrams)
     # split texts into tokens
     docfeats = {}
     for did in docids:
         featdict = dict(Counter(textdict_pp[did].split()))
         # normalize
         if self.norm:
             featdict = norm_dict(featdict, norm=self.norm)
         docfeats[did] = featdict
     # possibly compute idf weights and re-normalize
     if self.weight:
         if not self.Dw:
             self.Dw = compute_idf(select_copy(docfeats, fit_ids))
         for did in docids:
             # if the word was not in Dw (= not in the training set), delete it (otherwise it can mess with renormalization)
             docfeats[did] = {term:docfeats[did][term]*self.Dw[term] for term in docfeats[did] if term in self.Dw}
     if self.renorm:
         for did in docids:
             docfeats[did] = norm_dict(docfeats[did], norm=self.renorm)
     return docfeats
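As the docstring describes, the core of texts2features is counting terms per document, optionally weighting the counts, and normalizing. A standalone sketch of the count-and-length-normalize step on toy data, using only the standard library (the idf weighting step is illustrated after Example #6 below):

from collections import Counter
from math import sqrt

texts = {'d1': 'laser sensor module laser', 'd2': 'sensor array housing'}
docfeats = {}
for did, text in texts.items():
    counts = dict(Counter(text.lower().split()))        # raw term counts
    length = sqrt(sum(c * c for c in counts.values()))  # Euclidean length of the count vector
    docfeats[did] = {t: c / length for t, c in counts.items()}
print(docfeats['d1'])  # {'laser': 0.816..., 'sensor': 0.408..., 'module': 0.408...}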
Example #5
def apply_kpca_rel_corpus():
    # load combis for small corpus
    combis = np.load('human_eval/corpus_info/combis.npy')
    target_ids = list(set([comb[0] for comb in combis]))
    single_pat_corpus = np.load('human_eval/corpus_info/single_pat_corpus.npy').item()
    ft = FeatureTransform(renorm='max')
    docfeats = ft.texts2features(single_pat_corpus)
    doc_ids = docfeats.keys()
    train_feats = {pid : pat for pid, pat in docfeats.items() if pid not in target_ids}
    target_feats = {pid : docfeats[pid] for pid in target_ids}
    # make feature matrices
    X_train, featurenames = features2mat(train_feats, train_feats.keys())
    X_target, _ = features2mat(target_feats, target_feats.keys(), featurenames)
    # train on full patent corpus (excluding target patents)
    kpca = KernelPCA(n_components=250, kernel='linear')
    X_train_kpca = kpca.fit_transform(X_train)
    # make feat mat for small corpus
    X_target_kpca = kpca.transform(X_target)
    patfeats_lsa = {pid: norm_dict(dict(zip(range(250), X_train_kpca[i,:])), 'length') for i, pid in enumerate(train_feats.keys())}
    for i, pid in enumerate(target_feats.keys()):
        patfeats_lsa[pid] = norm_dict(dict(zip(range(250), X_target_kpca[i,:])), 'length')
    pat_ids = np.load('human_eval/corpus_info/pat_ids.npy')
    binary_label_pairs = np.load('human_eval/corpus_info/binary_label_pairs.npy').item()
    human_label_pairs = np.load('human_eval/corpus_info/human_label_pairs.npy').item()
    binary_sim_combis, binary_diff_combis = group_combis(binary_label_pairs)
    human_sim_combis, human_diff_combis = group_combis(human_label_pairs)
    for simcoef in ['linear']:
        binary_scores = calc_simcoef_distr(patfeats_lsa, ['random', 'cited'], 
                                           {'cited': binary_sim_combis, 'random': binary_diff_combis},
                                           simcoef)
        human_scores = calc_simcoef_distr(patfeats_lsa, ['irrelevant', 'relevant'],
                                          {'relevant': human_sim_combis, 'irrelevant': human_diff_combis},
                                          simcoef)
        binary_auc = calc_auc(binary_scores['cited'], binary_scores['random'])[2]
        human_auc = calc_auc(human_scores['relevant'], human_scores['irrelevant'])[2]
        plot_score_distr('human_eval', simcoef, ['random', 'cited'], 
                         {'cited': binary_scores['cited'], 'random': binary_scores['random']},
                         binary_auc, ['cited'], histdir='kpca_1000_rel_corp', bins=20)
        plot_score_distr('human_eval', simcoef, ['irrelevant', 'relevant'], 
                 {'relevant': human_scores['relevant'], 'irrelevant': human_scores['irrelevant']},
                 human_auc, ['relevant'], histdir='kpca_1000_rel_corp', bins=20)
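In this example the linear-kernel KernelPCA plays the role of LSA: it is fitted on the training patents only, and the held-out target patents are then projected into the same low-dimensional space. A minimal self-contained sketch of that fit/transform split on random data (shapes and component count are illustrative):

import numpy as np
from sklearn.decomposition import KernelPCA

rng = np.random.RandomState(0)
X_train = rng.rand(50, 200)   # (training documents) x (bow features)
X_target = rng.rand(5, 200)   # held-out documents in the same feature space

kpca = KernelPCA(n_components=20, kernel='linear')
X_train_low = kpca.fit_transform(X_train)  # fit only on the training documents
X_target_low = kpca.transform(X_target)    # project the targets into the same space
print(X_train_low.shape, X_target_low.shape)  # (50, 20) (5, 20)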
Example #6
def compute_idf(docfeats):
    """
    Inputs:
        - docfeats: a dict with doc_id:{term:count}
    Returns:
        - Dw: a dict with {term: weight}
    """
    # total number of documents
    N = float(len(docfeats))
    # invert the dictionary to be term:{doc_id:count}
    termdocs = invert_dict2(docfeats)
    # compute idf for every term
    return norm_dict({term:log(N/len(termdocs[term])) for term in termdocs})
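As a quick numeric check of compute_idf: with three documents and a term occurring in one, two, or all three of them, the raw idf values before norm_dict's normalization are log(3/1) ≈ 1.10, log(3/2) ≈ 0.41, and log(3/3) = 0, so a term that appears everywhere gets zero weight. A standalone reproduction of just the raw idf computation (invert_dict2 and norm_dict are helpers assumed from the surrounding code):

from math import log

docfeats = {'d1': {'laser': 2, 'common': 1},
            'd2': {'sensor': 1, 'common': 3},
            'd3': {'laser': 1, 'common': 1}}
N = float(len(docfeats))
df = {}
for feats in docfeats.values():
    for term in feats:
        df[term] = df.get(term, 0) + 1
raw_idf = {term: log(N / df[term]) for term in df}
print(raw_idf)  # {'laser': 0.405..., 'common': 0.0, 'sensor': 1.098...}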
Example #7
 ## our case: weights learned by regression
 # transform into very basic features, i.e. w/o idf weights
 print "making patent pair features"
 ft = FeatureTransform(identify_bigrams=False,
                       norm=None,
                       weight=False,
                       renorm=None)
 # transform into pair features + baseline cosine labels
 patfeats = ft.texts2features(pat_corpus)
 # make pairwise feature matrix
 print "making feature matrix"
 patfeats_pairs = {}
 for combi in combis:
     target_id, pid = combi.split('_')
     patfeats_pairs[target_id + '_' + pid] = norm_dict(
         pointwise_dict_multiply(patfeats[target_id], patfeats[pid]),
         'length')
 featmat, featurenames = features2mat(patfeats_pairs, combis)
 '''
 print "performing regression"
 # perform logistig regression
 log_reg = lm.LogisticRegression(C=1., fit_intercept=True, solver='liblinear', random_state=13)
 log_reg.fit(featmat, labels)
 weights_logreg = norm_dict(dict(zip(featurenames, log_reg.coef_)))
 Dw_all['logreg'] = weights_logreg
 '''
 # perform regression with lasso
 clf = lm.Lasso(alpha=0.00005, fit_intercept=True, random_state=13)
 clf.fit(featmat, labels)
 idf_weights = norm_dict(dict(zip(featurenames, clf.coef_)))
 weights = postprocess_weights(idf_weights, zero=True, sqrt=False)
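The Lasso above learns one coefficient per vocabulary term from the element-wise products of paired patent features, and the (mostly sparse) coefficients are then post-processed and reused as word weights. A minimal self-contained sketch of fitting such a Lasso and reading the coefficients back as per-term weights (toy data, illustrative alpha):

import numpy as np
import sklearn.linear_model as lm

rng = np.random.RandomState(13)
featurenames = ['laser', 'sensor', 'module', 'array']
featmat = rng.rand(30, len(featurenames))            # (pairs) x (vocabulary) pair features
labels = 0.8 * featmat[:, 0] + 0.05 * rng.randn(30)  # toy similarity labels tied to 'laser'

clf = lm.Lasso(alpha=0.01, fit_intercept=True, random_state=13)
clf.fit(featmat, labels)
weights = dict(zip(featurenames, clf.coef_))         # per-term weights; many are driven to zero
print(weights)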
Example #8
 train_feats = np.load(
     'human_eval/corpus_info/train_feats_claims.npy').item()
 target_feats = np.load(
     'human_eval/corpus_info/target_feats_claims.npy').item()
 # make feature matrices
 X_train, featurenames = features2mat(train_feats, train_feats.keys())
 #np.save('human_eval/corpus_info/featurenames_full_corpus.npy', featurenames)
 X_target, _ = features2mat(target_feats, target_feats.keys(), featurenames)
 for n_comp in [100, 250, 500, 1000]:
     print(n_comp)
     # fit LSA
     kpca = KernelPCA(n_components=n_comp, kernel='linear')
     X_train_kpca = kpca.fit_transform(X_train)
     #pkl.dump(kpca, open('human_eval/models/kpca_%i.model' %n_comp, 'wb'), -1)
     X_target_kpca = kpca.transform(X_target)
     kpca_feats = {
         pid: norm_dict(dict(zip(range(n_comp), X_train_kpca[i, :])),
                        'length')
         for i, pid in enumerate(train_feats.keys())
     }
     for i, pid in enumerate(target_feats.keys()):
         kpca_feats[pid] = norm_dict(
             dict(zip(range(n_comp), X_target_kpca[i, :])), 'length')
     np.save('human_eval/corpus_info/kpca_feats.npy', kpca_feats)
     scores = calc_simcoef_distr(kpca_feats,
                                 ['cited', 'duplicate', 'random'], id_dict,
                                 'linear')
     auc, aps = calc_auc(scores['cited'], scores['random'])[2::]
     print(auc, aps)
     plot_score_distr('human_eval',
                      'linear', ['random', 'cited', 'duplicate'], {
                          'cited': scores['cited'],
                          'random': scores['random'],
Example #9
def model_selection(combis, patfeats_pairs, single_pat_corpus,
                    binary_label_pairs, human_label_pairs):
    alphas = np.arange(10) / 100000.
    param_auc_dict = {}
    param_auc_dict['cited'] = {}
    param_auc_dict['human'] = {}
    for alpha in alphas:
        param_auc_dict['cited']['%.5f' % alpha] = {}
        param_auc_dict['human']['%.5f' % alpha] = {}
        for wtype in [
                'idf_weights', 'idf_weights_sqrt', 'idf_weights_zeroed',
                'idf_weights_zeroed_sqrt'
        ]:
            param_auc_dict['cited']['%.5f' % alpha][wtype] = []
            param_auc_dict['human']['%.5f' % alpha][wtype] = []
    ## model selection
    for n in range(5):
        print "testing for the %ith time" % n
        # train/test split
        combis_perm = np.random.permutation(combis)
        trainids = combis_perm[:int(np.ceil(len(combis) * 0.7))]
        testids = combis_perm[int(np.ceil(len(combis) * 0.7)):]
        patfeats_pairs_train = {}
        for combi in trainids:
            target_id, pid = combi
            patfeats_pairs_train[(target_id, pid)] = patfeats_pairs[(target_id,
                                                                     pid)]
        train_pair_ids = patfeats_pairs_train.keys()
        # transform into feature matrix (number of pairs) x (bow-dim)
        print "make feature matrix train"
        featmat_train, featurenames = features2mat(patfeats_pairs_train,
                                                   train_pair_ids)
        # same for test set
        patfeats_pairs_test = {}
        for combi in testids:
            target_id, pid = combi
            patfeats_pairs_test[(target_id, pid)] = patfeats_pairs[(target_id,
                                                                    pid)]
        test_pair_ids = patfeats_pairs_test.keys()
        print "make feature matrix test"
        featmat_test, featurenames = features2mat(patfeats_pairs_test,
                                                  test_pair_ids, featurenames)

        # get the corresponding label vectors
        y_human_train = [human_label_pairs[tid] for tid in train_pair_ids]
        y_human_test = [human_label_pairs[tid] for tid in test_pair_ids]
        y_binary_train = [binary_label_pairs[tid] for tid in train_pair_ids]
        y_binary_test = [binary_label_pairs[tid] for tid in test_pair_ids]

        for alpha in alphas:
            # perform the linear regression for binary (cited/not cited) labels
            print "perform regression for binary scoring"
            clf = lm.Lasso(alpha=alpha, fit_intercept=True, random_state=13)
            clf.fit(featmat_train, y_binary_train)
            ## calculate AUC-values
            # the fitted coefficients are now our word weights
            # perform regression for all weight postprocessings
            weights = {}
            weights['idf_weights'] = norm_dict(
                dict(zip(featurenames, clf.coef_)))
            weights['idf_weights_zeroed'] = postprocess_weights(
                weights['idf_weights'], zero=True, sqrt=False)
            weights['idf_weights_sqrt'] = postprocess_weights(
                weights['idf_weights'], zero=False, sqrt=False)
            weights['idf_weights_zeroed_sqrt'] = postprocess_weights(
                weights['idf_weights'], zero=True, sqrt=True)

            # multiply patfeats with idf weights
            for wtype in [
                    'idf_weights', 'idf_weights_sqrt', 'idf_weights_zeroed',
                    'idf_weights_zeroed_sqrt'
            ]:
                ft = FeatureTransform(identify_bigrams=False,
                                      norm=None,
                                      weight=True,
                                      renorm='length')
                ft.Dw = weights[wtype]
                patfeats_idf = ft.texts2features(single_pat_corpus)

                # calculate auc for cited/not cited on test set
                for simcoef in ['linear']:
                    y_true = []
                    y_pred = []
                    for combi in testids:
                        y_true.append(binary_label_pairs[(combi[0], combi[1])])
                        y_pred.append(
                            compute_sim(patfeats_idf[combi[0]],
                                        patfeats_idf[combi[1]], simcoef))
                    fpr, tpr, thresholds = roc_curve(y_true,
                                                     y_pred,
                                                     pos_label=1)
                    auc_val = auc(fpr, tpr)
                    print "cited, alpha: %.5f, AUC: %.4f" % (alpha, auc_val)
                    param_auc_dict['cited']['%.5f' %
                                            alpha][wtype].append(auc_val)

            print "perform regression for human scoring"
            clf = lm.Lasso(alpha=alpha, fit_intercept=True, random_state=13)
            clf.fit(featmat_train, y_human_train)
            ## calculate AUC-values
            # the fitted coefficients are now our word weights
            # perform regression for all weight postprocessings
            weights = {}
            weights['idf_weights'] = norm_dict(
                dict(zip(featurenames, clf.coef_)))
            weights['idf_weights_zeroed'] = postprocess_weights(
                weights['idf_weights'], zero=True, sqrt=False)
            weights['idf_weights_sqrt'] = postprocess_weights(
                weights['idf_weights'], zero=False, sqrt=False)
            weights['idf_weights_zeroed_sqrt'] = postprocess_weights(
                weights['idf_weights'], zero=True, sqrt=True)

            # multiply patfeats with idf weights
            for wtype in [
                    'idf_weights', 'idf_weights_sqrt', 'idf_weights_zeroed',
                    'idf_weights_zeroed_sqrt'
            ]:
                ft = FeatureTransform(identify_bigrams=False,
                                      norm=None,
                                      weight=True,
                                      renorm='length')
                ft.Dw = weights[wtype]
                patfeats_idf = ft.texts2features(single_pat_corpus)

                # calculate auc for cited/not cited on test set
                for simcoef in ['linear']:
                    y_true = []
                    y_pred = []
                    for combi in testids:
                        y_true.append(
                            int(human_label_pairs[(combi[0],
                                                   combi[1])] >= 0.5))
                        y_pred.append(
                            compute_sim(patfeats_idf[combi[0]],
                                        patfeats_idf[combi[1]], simcoef))
                    fpr, tpr, thresholds = roc_curve(y_true,
                                                     y_pred,
                                                     pos_label=1)
                    auc_val = auc(fpr, tpr)
                    print "human, alpha: %.5f, AUC: %.4f" % (alpha, auc_val)
                    param_auc_dict['human']['%.5f' %
                                            alpha][wtype].append(auc_val)
    np.save('human_eval/regression/param_auc_dict.npy', param_auc_dict)
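param_auc_dict stores, for each label type ('cited'/'human'), alpha value, and weight postprocessing, the AUC values from the five random splits. A possible way to load it back and pick the best setting by mean AUC; this is a sketch rather than part of the original pipeline (allow_pickle is needed on newer numpy versions to reload a saved dict):

import numpy as np

param_auc_dict = np.load('human_eval/regression/param_auc_dict.npy',
                         allow_pickle=True).item()
for label_type in ['cited', 'human']:
    candidates = [(np.mean(aucs), alpha, wtype)
                  for alpha, by_wtype in param_auc_dict[label_type].items()
                  for wtype, aucs in by_wtype.items() if aucs]
    best_auc, best_alpha, best_wtype = max(candidates)
    print('%s: best mean AUC %.4f (alpha=%s, %s)' % (label_type, best_auc, best_alpha, best_wtype))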