def corpus_to_patfeats(model, corpus, target_ids):
    """
    Transform the corpus into doc2vec feature vectors.
    Checks if the patents in the corpus are contained in the model:
    if so, take the learned document vector; otherwise, infer the vector.
    """
    patfeats_d2v = {}
    vecsize = len(model.docvecs[0])
    cont = 0
    not_cont = 0
    for pid, pat in corpus.items():
        # check if the patents in the corpus are contained in the model
        if pid in model.docvecs.doctags.keys():
            patfeats_d2v[pid] = norm_dict(
                dict(zip(range(vecsize), model.docvecs[pid])), 'length')
            cont += 1
        else:
            not_cont += 1
            patfeats_d2v[pid] = norm_dict(
                dict(zip(range(vecsize),
                         model.infer_vector(pat.lower().split()))), 'length')
    for tid in target_ids:
        patfeats_d2v[tid] = norm_dict(
            dict(zip(range(vecsize),
                     model.infer_vector(corpus[tid].lower().split()))), 'length')
    print cont, not_cont
    return patfeats_d2v
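# Hypothetical usage sketch (not part of the original script): it assumes a gensim
# Doc2Vec model trained with the patent ids as document tags and a corpus given as a
# {patent_id: raw text} dict; the model path and the toy corpus below are made up,
# only the call to corpus_to_patfeats is real.
def _example_corpus_to_patfeats():
    from gensim.models import Doc2Vec
    model = Doc2Vec.load('human_eval/models/doc2vec.model')  # assumed model path
    corpus = {'pat1': 'A method for cooling a combustion engine.',
              'pat2': 'An apparatus for filtering exhaust gas.'}  # toy corpus
    target_ids = ['pat1']
    # returns {patent_id: {dimension: length-normalized doc2vec value}}
    return corpus_to_patfeats(model, corpus, target_ids)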
def make_doc2vec_corpus(model, target_pat_corpus=False):
    patfeats_d2v = {}
    vecsize = len(model.docvecs[0])
    # get doc vecs for training documents
    for pid in model.docvecs.doctags.keys():
        patfeats_d2v[pid] = norm_dict(
            dict(zip(range(vecsize), model.docvecs[pid])), 'length')
    if target_pat_corpus:
        # infer doc vecs for target patents
        for pid, pat in target_pat_corpus.items():
            patfeats_d2v[pid] = norm_dict(
                dict(zip(range(vecsize), model.infer_vector(pat))), 'length')
    return patfeats_d2v
def infer_patfeats(corpus, model):
    patfeats_d2v = {}
    vecsize = len(model.docvecs[0])
    for pid, pat in corpus.items():
        patfeats_d2v[pid] = norm_dict(
            dict(zip(range(vecsize),
                     model.infer_vector(pat.lower().split()))), 'length')
    return patfeats_d2v
def texts2features(self, textdict, fit_ids=[]):
    """
    Preprocess the texts, count how often each word occurs, weight the counts, and normalize.

    When this is called for the first time, the idf weights and bigrams may be computed
    (using the documents specified in fit_ids); in later calls the precomputed weights
    and bigrams are reused, e.g. when applying the routine to new test documents.

    Input:
        - textdict: a dict with {docid: text}
        - fit_ids: if only a portion of all texts should be used to compute the weights
          and identify bigrams (e.g. only training data) - only used in the first,
          initializing run
    Returns:
        - docfeats: a dict with {docid: {term: (normalized/weighted) count}}
    """
    docids = set(textdict.keys())
    if not fit_ids:
        fit_ids = set(textdict.keys())
    # pre-process texts
    textdict_pp = {did: preprocess_text(textdict[did], self.to_lower, self.norm_num)
                   for did in docids}
    # possibly find bigrams
    if self.identify_bigrams:
        if not self.bigrams:
            self.bigrams = find_bigrams(select_copy(textdict_pp, fit_ids), self.bg_threshold)
        textdict_pp = replace_bigrams(textdict_pp, self.bigrams)
    # split texts into tokens
    docfeats = {}
    for did in docids:
        featdict = dict(Counter(textdict_pp[did].split()))
        # normalize
        if self.norm:
            featdict = norm_dict(featdict, norm=self.norm)
        docfeats[did] = featdict
    # possibly compute idf weights and re-normalize
    if self.weight:
        if not self.Dw:
            self.Dw = compute_idf(select_copy(docfeats, fit_ids))
        for did in docids:
            # if a word is not in Dw (i.e. not in the training set), delete it
            # (otherwise it can mess with the renormalization)
            docfeats[did] = {term: docfeats[did][term] * self.Dw[term]
                             for term in docfeats[did] if term in self.Dw}
    if self.renorm:
        for did in docids:
            docfeats[did] = norm_dict(docfeats[did], norm=self.renorm)
    return docfeats
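# Hypothetical usage sketch (assumed ids and texts): fit the idf weights on the training
# documents only via fit_ids, then reuse them for documents that arrive later, as
# described in the docstring above.
def _example_texts2features():
    textdict = {'train1': 'first patent text', 'train2': 'second patent text',
                'test1': 'unseen patent text'}
    ft = FeatureTransform(identify_bigrams=False, norm=None, weight=True, renorm='length')
    # first call: the idf weights (ft.Dw) are computed from fit_ids only
    feats = ft.texts2features(textdict, fit_ids=['train1', 'train2'])
    # later calls reuse ft.Dw, e.g. for documents that were not available before
    feats_new = ft.texts2features({'test2': 'another unseen text'})
    return feats, feats_new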
def apply_kpca_rel_corpus():
    # load combis for small corpus
    combis = np.load('human_eval/corpus_info/combis.npy')
    target_ids = list(set([comb[0] for comb in combis]))
    single_pat_corpus = np.load('human_eval/corpus_info/single_pat_corpus.npy').item()
    ft = FeatureTransform(renorm='max')
    docfeats = ft.texts2features(single_pat_corpus)
    doc_ids = docfeats.keys()
    train_feats = {pid: pat for pid, pat in docfeats.items() if pid not in target_ids}
    target_feats = {pid: docfeats[pid] for pid in target_ids}
    # make feature matrices
    X_train, featurenames = features2mat(train_feats, train_feats.keys())
    X_target, _ = features2mat(target_feats, target_feats.keys(), featurenames)
    # train on full patent corpus (excluding target patents)
    kpca = KernelPCA(n_components=250, kernel='linear')
    X_train_kpca = kpca.fit_transform(X_train)
    # make feature matrix for small corpus
    X_target_kpca = kpca.transform(X_target)
    patfeats_lsa = {pid: norm_dict(dict(zip(range(250), X_train_kpca[i, :])), 'length')
                    for i, pid in enumerate(train_feats.keys())}
    for i, pid in enumerate(target_feats.keys()):
        patfeats_lsa[pid] = norm_dict(dict(zip(range(250), X_target_kpca[i, :])), 'length')
    pat_ids = np.load('human_eval/corpus_info/pat_ids.npy')
    binary_label_pairs = np.load('human_eval/corpus_info/binary_label_pairs.npy').item()
    human_label_pairs = np.load('human_eval/corpus_info/human_label_pairs.npy').item()
    binary_sim_combis, binary_diff_combis = group_combis(binary_label_pairs)
    human_sim_combis, human_diff_combis = group_combis(human_label_pairs)
    for simcoef in ['linear']:
        binary_scores = calc_simcoef_distr(patfeats_lsa, ['random', 'cited'],
                                           {'cited': binary_sim_combis,
                                            'random': binary_diff_combis},
                                           simcoef)
        human_scores = calc_simcoef_distr(patfeats_lsa, ['irrelevant', 'relevant'],
                                          {'relevant': human_sim_combis,
                                           'irrelevant': human_diff_combis},
                                          simcoef)
        binary_auc = calc_auc(binary_scores['cited'], binary_scores['random'])[2]
        human_auc = calc_auc(human_scores['relevant'], human_scores['irrelevant'])[2]
        plot_score_distr('human_eval', simcoef, ['random', 'cited'],
                         {'cited': binary_scores['cited'],
                          'random': binary_scores['random']},
                         binary_auc, ['cited'],
                         histdir='kpca_1000_rel_corp', bins=20)
        plot_score_distr('human_eval', simcoef, ['irrelevant', 'relevant'],
                         {'relevant': human_scores['relevant'],
                          'irrelevant': human_scores['irrelevant']},
                         human_auc, ['relevant'],
                         histdir='kpca_1000_rel_corp', bins=20)
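# Minimal sketch of the dimensionality-reduction pattern used above (toy random matrices,
# assumed shapes): KernelPCA with a linear kernel is fit on the training rows only, and the
# held-out target rows are projected with the same transform.
def _example_kpca_projection():
    from sklearn.decomposition import KernelPCA
    X_train = np.random.rand(20, 50)   # 20 training docs, 50 bow features
    X_target = np.random.rand(3, 50)   # 3 target docs, same feature space
    kpca = KernelPCA(n_components=5, kernel='linear')
    X_train_low = kpca.fit_transform(X_train)  # shape (20, 5)
    X_target_low = kpca.transform(X_target)    # shape (3, 5)
    return X_train_low, X_target_low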
def compute_idf(docfeats):
    """
    Inputs:
        - docfeats: a dict with {doc_id: {term: count}}
    Returns:
        - Dw: a dict with {term: idf weight}
    """
    # total number of documents
    N = float(len(docfeats))
    # invert the dictionary to be {term: {doc_id: count}}
    termdocs = invert_dict2(docfeats)
    # compute the idf for every term
    return norm_dict({term: log(N / len(termdocs[term])) for term in termdocs})
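# Small worked example (toy numbers, not from the original data): with N = 3 documents,
# a term occurring in 2 of them gets a raw idf of log(3/2) ~ 0.405 and a term occurring
# in only 1 gets log(3/1) ~ 1.099, before the final norm_dict normalization.
def _example_compute_idf():
    docfeats = {'d1': {'motor': 2, 'valve': 1},
                'd2': {'motor': 1},
                'd3': {'valve': 3, 'pump': 1}}
    # 'motor' and 'valve' occur in 2 of 3 docs, 'pump' in 1 of 3,
    # so 'pump' receives the largest (normalized) idf weight
    return compute_idf(docfeats)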
## our case: weights learned by regression
# transform into very basic features, i.e. w/o idf weights
print "making patent pair features"
ft = FeatureTransform(identify_bigrams=False, norm=None, weight=False, renorm=None)
# transform into pair features + baseline cosine labels
patfeats = ft.texts2features(pat_corpus)
# make pairwise feature matrix
print "making feature matrix"
patfeats_pairs = {}
for combi in combis:
    target_id, pid = combi.split('_')
    patfeats_pairs[target_id + '_' + pid] = norm_dict(
        pointwise_dict_multiply(patfeats[target_id], patfeats[pid]), 'length')
featmat, featurenames = features2mat(patfeats_pairs, combis)
'''
print "performing regression"
# perform logistic regression
log_reg = lm.LogisticRegression(C=1., fit_intercept=True, solver='liblinear', random_state=13)
log_reg.fit(featmat, labels)
weights_logreg = norm_dict(dict(zip(featurenames, log_reg.coef_)))
Dw_all['logreg'] = weights_logreg
'''
# perform regression with lasso
clf = lm.Lasso(alpha=0.00005, fit_intercept=True, random_state=13)
clf.fit(featmat, labels)
idf_weights = norm_dict(dict(zip(featurenames, clf.coef_)))
weights = postprocess_weights(idf_weights, zero=True, sqrt=False)
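# Sketch of the pair-feature idea (the helper below is only for illustration; it assumes
# pointwise_dict_multiply multiplies the counts of terms shared by both documents, the real
# implementation is imported elsewhere): a pair feature is non-zero for a word exactly when
# the word occurs in both patents, so the fitted Lasso coefficient of that word acts as a
# learned, idf-like word weight.
def _pointwise_dict_multiply_example(d1, d2):
    return {term: d1[term] * d2[term] for term in d1 if term in d2}

# e.g. _pointwise_dict_multiply_example({'motor': 2, 'valve': 1}, {'motor': 1, 'pump': 3})
# --> {'motor': 2}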
train_feats = np.load(
    'human_eval/corpus_info/train_feats_claims.npy').item()
target_feats = np.load(
    'human_eval/corpus_info/target_feats_claims.npy').item()
# make feature matrices
X_train, featurenames = features2mat(train_feats, train_feats.keys())
#np.save('human_eval/corpus_info/featurenames_full_corpus.npy', featurenames)
X_target, _ = features2mat(target_feats, target_feats.keys(), featurenames)
for n_comp in [100, 250, 500, 1000]:
    print n_comp
    # fit LSA
    kpca = KernelPCA(n_components=n_comp, kernel='linear')
    X_train_kpca = kpca.fit_transform(X_train)
    #pkl.dump(kpca, open('human_eval/models/kpca_%i.model' % n_comp, 'wb'), -1)
    X_target_kpca = kpca.transform(X_target)
    kpca_feats = {
        pid: norm_dict(dict(zip(range(n_comp), X_train_kpca[i, :])), 'length')
        for i, pid in enumerate(train_feats.keys())
    }
    for i, pid in enumerate(target_feats.keys()):
        kpca_feats[pid] = norm_dict(
            dict(zip(range(n_comp), X_target_kpca[i, :])), 'length')
    np.save('human_eval/corpus_info/kpca_feats.npy', kpca_feats)
    scores = calc_simcoef_distr(kpca_feats, ['cited', 'duplicate', 'random'],
                                id_dict, 'linear')
    auc, aps = calc_auc(scores['cited'], scores['random'])[2::]
    print(auc, aps)
    plot_score_distr('human_eval', 'linear', ['random', 'cited', 'duplicate'],
                     {'cited': scores['cited'],
                      'random': scores['random'],
def model_selection(combis, patfeats_pairs, single_pat_corpus,
                    binary_label_pairs, human_label_pairs):
    alphas = np.arange(10) / 100000.
    param_auc_dict = {}
    param_auc_dict['cited'] = {}
    param_auc_dict['human'] = {}
    for alpha in alphas:
        param_auc_dict['cited']['%.5f' % alpha] = {}
        param_auc_dict['human']['%.5f' % alpha] = {}
        for wtype in ['idf_weights', 'idf_weights_sqrt',
                      'idf_weights_zeroed', 'idf_weights_zeroed_sqrt']:
            param_auc_dict['cited']['%.5f' % alpha][wtype] = []
            param_auc_dict['human']['%.5f' % alpha][wtype] = []
    ## model selection
    for n in range(5):
        print "testing for the %ith time" % n
        # train/test split
        combis_perm = np.random.permutation(combis)
        trainids = combis_perm[:int(np.ceil(len(combis) * 0.7))]
        testids = combis_perm[int(np.ceil(len(combis) * 0.7)):]
        patfeats_pairs_train = {}
        for combi in trainids:
            target_id, pid = combi
            patfeats_pairs_train[(target_id, pid)] = patfeats_pairs[(target_id, pid)]
        train_pair_ids = patfeats_pairs_train.keys()
        # transform into feature matrix: (number of pairs) x (bow-dim)
        print "make feature matrix train"
        featmat_train, featurenames = features2mat(patfeats_pairs_train, train_pair_ids)
        # same for the test set
        patfeats_pairs_test = {}
        for combi in testids:
            target_id, pid = combi
            patfeats_pairs_test[(target_id, pid)] = patfeats_pairs[(target_id, pid)]
        test_pair_ids = patfeats_pairs_test.keys()
        print "make feature matrix test"
        featmat_test, featurenames = features2mat(patfeats_pairs_test, test_pair_ids,
                                                  featurenames)
        # get the corresponding label vectors
        y_human_train = [human_label_pairs[tid] for tid in train_pair_ids]
        y_human_test = [human_label_pairs[tid] for tid in test_pair_ids]
        y_binary_train = [binary_label_pairs[tid] for tid in train_pair_ids]
        y_binary_test = [binary_label_pairs[tid] for tid in test_pair_ids]
        for alpha in alphas:
            # perform the linear regression for binary (cited/not cited) labels
            print "perform regression for binary scoring"
            clf = lm.Lasso(alpha=alpha, fit_intercept=True, random_state=13)
            clf.fit(featmat_train, y_binary_train)
            ## calculate AUC values
            # the fitted coefficients are now our word weights;
            # compute all weight postprocessings
            weights = {}
            weights['idf_weights'] = norm_dict(dict(zip(featurenames, clf.coef_)))
            weights['idf_weights_zeroed'] = postprocess_weights(
                weights['idf_weights'], zero=True, sqrt=False)
            weights['idf_weights_sqrt'] = postprocess_weights(
                weights['idf_weights'], zero=False, sqrt=True)
            weights['idf_weights_zeroed_sqrt'] = postprocess_weights(
                weights['idf_weights'], zero=True, sqrt=True)
            # multiply patfeats with the learned idf weights
            for wtype in ['idf_weights', 'idf_weights_sqrt',
                          'idf_weights_zeroed', 'idf_weights_zeroed_sqrt']:
                ft = FeatureTransform(identify_bigrams=False, norm=None,
                                      weight=True, renorm='length')
                ft.Dw = weights[wtype]
                patfeats_idf = ft.texts2features(single_pat_corpus)
                # calculate the AUC for cited/not cited on the test set
                for simcoef in ['linear']:
                    y_true = []
                    y_pred = []
                    for combi in testids:
                        y_true.append(binary_label_pairs[(combi[0], combi[1])])
                        y_pred.append(compute_sim(patfeats_idf[combi[0]],
                                                  patfeats_idf[combi[1]], simcoef))
                    fpr, tpr, thresholds = roc_curve(y_true, y_pred, pos_label=1)
                    auc_val = auc(fpr, tpr)
                    print "cited, alpha: %.5f, AUC: %.4f" % (alpha, auc_val)
                    param_auc_dict['cited']['%.5f' % alpha][wtype].append(auc_val)
            print "perform regression for human scoring"
            clf = lm.Lasso(alpha=alpha, fit_intercept=True, random_state=13)
            clf.fit(featmat_train, y_human_train)
            ## calculate AUC values
            # the fitted coefficients are now our word weights;
            # compute all weight postprocessings
            weights = {}
            weights['idf_weights'] = norm_dict(dict(zip(featurenames, clf.coef_)))
            weights['idf_weights_zeroed'] = postprocess_weights(
                weights['idf_weights'], zero=True, sqrt=False)
            weights['idf_weights_sqrt'] = postprocess_weights(
                weights['idf_weights'], zero=False, sqrt=True)
            weights['idf_weights_zeroed_sqrt'] = postprocess_weights(
                weights['idf_weights'], zero=True, sqrt=True)
            # multiply patfeats with the learned idf weights
            for wtype in ['idf_weights', 'idf_weights_sqrt',
                          'idf_weights_zeroed', 'idf_weights_zeroed_sqrt']:
                ft = FeatureTransform(identify_bigrams=False, norm=None,
                                      weight=True, renorm='length')
                ft.Dw = weights[wtype]
                patfeats_idf = ft.texts2features(single_pat_corpus)
                # calculate the AUC for relevant/irrelevant (human labels) on the test set
                for simcoef in ['linear']:
                    y_true = []
                    y_pred = []
                    for combi in testids:
                        y_true.append(int(human_label_pairs[(combi[0], combi[1])] >= 0.5))
                        y_pred.append(compute_sim(patfeats_idf[combi[0]],
                                                  patfeats_idf[combi[1]], simcoef))
                    fpr, tpr, thresholds = roc_curve(y_true, y_pred, pos_label=1)
                    auc_val = auc(fpr, tpr)
                    print "human, alpha: %.5f, AUC: %.4f" % (alpha, auc_val)
                    param_auc_dict['human']['%.5f' % alpha][wtype].append(auc_val)
    np.save('human_eval/regression/param_auc_dict.npy', param_auc_dict)
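# Hypothetical follow-up (not part of the original function): average the five AUC values
# per (alpha, weight postprocessing) combination stored in param_auc_dict to pick the best
# regularization strength and weight variant for a given label type.
def _example_select_best_alpha(param_auc_dict, label='human'):
    best = None
    for alpha_str, wtype_dict in param_auc_dict[label].items():
        for wtype, auc_list in wtype_dict.items():
            mean_auc = np.mean(auc_list)
            if best is None or mean_auc > best[2]:
                best = (alpha_str, wtype, mean_auc)
    return best  # (alpha, weight type, mean AUC over the five random splits)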