class QuoraSVM(Quora):
    """Duplicate-question classifier trained on pairwise similarity scores.

    Every question pair becomes a vector with one score per enabled feature
    (``bm25``, ``softcosine``, ``translation``, ``cosine`` -- always appended
    in that order).  The vectors are rescaled to ``[-1, 1]`` and passed to
    ``Model`` for training.  Training is triggered by the constructor.
    """

    def __init__(self, model='svm', features='bm25,', comment_features='bm25,', stop=True, vector='word2vec',
                 path=FEATURES_PATH, alpha=0.1, sigma=0.9, gridsearch='random'):
        """Build the requested scorers and train the classifier.

        :param model: ``'svm'`` trains via ``Model.train_svm``; any other value
            trains via ``Model.train_regression``.
        :param features: comma-separated feature names used for the vectors.
        :param comment_features: comma-separated feature names; in this class
            they only influence which scorers get instantiated.
        :param stop: forwarded to the base class and to the scorers.
        :param vector: forwarded to the base class and the soft-cosine scorer.
        :param path: cache file name, joined under ``FEATURES_PATH/<split>``.
        :param alpha: forwarded to ``QuoraTranslations``.
        :param sigma: forwarded to ``QuoraTranslations``.
        :param gridsearch: forwarded to the ``Model`` training call.
        """
        Quora.__init__(self, stop=stop, vector=vector)
        self.path = path
        self.features = features.split(',')
        self.comment_features = comment_features.split(',')
        self.gridsearch = gridsearch
        self.svm = Model()
        self.model = model

        # A scorer is only built when some feature list asks for it.
        requested = self.features + self.comment_features
        self.bm25 = QuoraBM25(stop=stop) if 'bm25' in requested else None
        self.cosine = QuoraCosine(stop=stop) if 'cosine' in requested else None
        self.softcosine = QuoraSoftCosine(stop=stop, vector=vector) if 'softcosine' in requested else None
        self.translation = QuoraTranslations(
            alpha=alpha, sigma=sigma, stop=stop, vector=self.vector
        ) if 'translation' in requested else None

        self.train()

    def extract_features(self, pairdata, elmoidx, elmovec, fullelmoidx, fullelmovec):
        """Compute the feature vector and label of every pair in ``pairdata``.

        :param pairdata: sequence of dicts read through the keys
            ``tokens_proc1``, ``tokens_proc2``, ``is_duplicate`` and,
            optionally, ``qid1`` / ``qid2``.
        :param elmoidx: passed to ``self.encode`` when ``self.stop`` is truthy.
        :param elmovec: passed to ``self.encode`` when ``self.stop`` is truthy.
        :param fullelmoidx: passed to ``self.encode`` otherwise.
        :param fullelmovec: passed to ``self.encode`` otherwise.
        :return: ``(feat, X, y)`` where ``feat`` is a list of ``(x, label)``
            tuples and ``X`` / ``y`` hold the same vectors and labels.

        Pairs whose processing raises are reported on stdout and skipped.
        """
        X, y = [], []
        feat = []

        # The choice between the two ELMo resources does not depend on the pair.
        if self.stop:
            idx, vec = elmoidx, elmovec
        else:
            idx, vec = fullelmoidx, fullelmovec
        # The second question's embedding is shared by softcosine and translation.
        needs_q2_emb = 'softcosine' in self.features or 'translation' in self.features

        for i, pair in enumerate(pairdata):
            try:
                percentage = round(float(i + 1) / len(pairdata), 2)
                print('Extracting features: ', percentage, i + 1, sep='\t', end = '\r')

                # Fall back to position-derived ids when the pair carries none.
                q1id = pair['qid1'] if 'qid1' in pair else str(i) + '1'
                q2id = pair['qid2'] if 'qid2' in pair else str(i) + '2'
                q1, q2 = pair['tokens_proc1'], pair['tokens_proc2']

                x = []
                q1_emb = self.encode(q1id, q1, idx, vec)
                # Encoded once per pair instead of once per feature.
                # NOTE(review): assumes self.encode is a pure lookup -- confirm.
                q2_emb = self.encode(q2id, q2, idx, vec) if needs_q2_emb else None

                # bm25 (scored against the id of the second question)
                if 'bm25' in self.features:
                    x.append(self.bm25.model(q1, q2id))
                # softcosine
                if 'softcosine' in self.features:
                    x.append(self.softcosine.model(q1, q1_emb, q2, q2_emb))
                # translation (only the last of the three probabilities is kept)
                if 'translation' in self.features:
                    lmprob, trmprob, trlmprob, proctime = self.translation.model(q1, q1_emb, q2, q2_emb)
                    x.append(trlmprob)
                # cosine
                if 'cosine' in self.features:
                    x.append(self.cosine.model(q1, q2))

                y_ = int(pair['is_duplicate'])
                feat.append((x, y_))
                X.append(x)
                y.append(y_)
            except Exception:
                # Best effort: a malformed pair is skipped, but interpreter-level
                # signals (KeyboardInterrupt, SystemExit) are no longer swallowed.
                print('Error')
                print(pair)
        return feat, X, y

    def train(self):
        """Load or compute the training features, fit the scaler, train the model.

        Features are cached as a pickle at ``FEATURES_PATH/train/<self.path>``.
        The cache is read with ``pickle``; it must be a trusted local file.
        """
        path = os.path.join(FEATURES_PATH, 'train', self.path)
        self.X, self.y = [], []
        if not os.path.exists(path):
            feat, self.X, self.y = self.extract_features(
                self.trainset, self.trainidx, self.trainelmo, self.fulltrainidx, self.fulltrainelmo)
            with open(path, 'wb') as handle:
                p.dump(feat, handle)
        else:
            with open(path, 'rb') as handle:
                feat = p.load(handle)
            for row in feat:
                self.X.append(row[0])
                self.y.append(row[1])

        self.scaler = MinMaxScaler(feature_range=(-1, 1))
        self.scaler.fit(self.X)
        self.X = self.scaler.transform(self.X)

        if self.model == 'svm':
            self.svm.train_svm(
                trainvectors=self.X,
                labels=self.y,
                c='search',
                kernel='search',
                gamma='search',
                jobs=10,
                gridsearch=self.gridsearch
            )
        else:
            self.svm.train_regression(trainvectors=self.X, labels=self.y, c='search', penalty='search',
                                      tol='search', gridsearch=self.gridsearch, jobs=10)

    def validate(self):
        """Score the development pairs with the trained model.

        Features are cached as a pickle at ``FEATURES_PATH/dev/<self.path>``.

        :return: ``(y_real, y_pred, parameter_settings)`` -- gold labels,
            predicted labels and whatever
            ``Model.return_parameter_settings`` reports for ``self.model``.
        """
        path = os.path.join(FEATURES_PATH, 'dev', self.path)
        if not os.path.exists(path):
            feat, X, y = self.extract_features(
                self.devset, self.devidx, self.develmo, self.fulldevidx, self.fulldevelmo)
            with open(path, 'wb') as handle:
                p.dump(feat, handle)
        else:
            with open(path, 'rb') as handle:
                feat = p.load(handle)

        y_real, y_pred = [], []
        for pair in feat:
            # Apply the scaling learned on the training vectors.
            X = self.scaler.transform([pair[0]])[0]
            score, pred_label = self.svm.score(X)
            y_pred.append(pred_label)
            y_real.append(pair[1])

        parameter_settings = self.svm.return_parameter_settings(clf=self.model)
        return y_real, y_pred, parameter_settings
class SemevalTreeKernel(Semeval):
    """SVM over a precomputed tree-kernel matrix for question pairs.

    The feature vector of a pair ``(q1, q2)`` holds, for every training pair
    ``(c1, c2)``, the sum of the normalized kernels ``K(q1, c1) + K(q2, c2)``.
    Kernel values are memoized per pair of ids.  Training is triggered by the
    constructor.
    """

    def __init__(self, alpha=0, decay=1, ignore_leaves=True, smoothed=True, vector='word2vec', w2vdim=300,
                 lowercase=True, tree='tree', kernel_path=KERNEL_PATH):
        """Set up the kernel and train the SVM.

        :param alpha: forwarded to ``TreeKernel``.
        :param decay: forwarded to ``TreeKernel``.
        :param ignore_leaves: forwarded to ``TreeKernel``.
        :param smoothed: forwarded to ``TreeKernel``.
        :param vector: forwarded to the base class; the value ``'alignments'``
            additionally enables alignment matrices in the kernel calls.
        :param w2vdim: forwarded to the base class.
        :param lowercase: forwarded to the base class and to ``TreeKernel``.
        :param tree: ``'tree'`` reads the ``*_tree`` entries of a pair, any
            other value reads the ``subj_*_tree`` entries.
        :param kernel_path: cache file name, joined under ``kernel/<split>``.
        """
        Semeval.__init__(self, vector=vector, stop=False, lowercase=lowercase, punctuation=False, w2vdim=w2vdim)
        self.path = kernel_path
        self.tree = tree
        self.memoization = {}
        self.svm = Model()
        self.flat_traindata()
        self.treekernel = TreeKernel(alpha=alpha, decay=decay, ignore_leaves=ignore_leaves, smoothed=smoothed,
                                     lowercase=lowercase)
        self.train()

        # NOTE(review): `additional` is presumably set by the base class; it is
        # dropped once training has finished -- confirm it is not needed later.
        del self.additional

    def memoize(self, q1id, q1, q1_emb, q1_token2lemma, q2id, q2, q2_emb, q2_token2lemma, alignments):
        """Return the tree kernel of two questions, caching it by id pair.

        The cache is consulted in both directions and, after a miss, the value
        is stored under ``[q1id][q2id]`` and ``[q2id][q1id]``.
        """
        row1 = self.memoization.setdefault(q1id, {})
        if q2id in row1:
            return row1[q2id]

        row2 = self.memoization.setdefault(q2id, {})
        if q1id in row2:
            return row2[q1id]

        k = self.treekernel(q1, q1_emb, q1_token2lemma, q2, q2_emb, q2_token2lemma, alignments)
        row1[q2id] = k
        row2[q1id] = k
        return k

    def flat_traindata(self):
        """Flatten the nested ``self.traindata`` dict into ``self.flattraindata``."""
        self.flattraindata = [
            self.traindata[q1id][q2id]
            for q1id in self.traindata
            for q2id in self.traindata[q1id]
        ]

    def get_alignment(self, c1, c2):
        """Build the ``len(c1) x len(c2)`` matrix of alignment weights.

        Each weight is read from ``self.alignments[t[0]][t][w[0]][w]`` for a
        token ``w`` of ``c1`` and ``t`` of ``c2``; a failed lookup yields 0.0.
        """
        alignments = []
        for w in c1:
            alignments_i = []
            for t in c2:
                try:
                    w_t = self.alignments[t[0]][t][w[0]][w]
                except (LookupError, TypeError):
                    # Missing entry or empty token: treat as "no alignment".
                    w_t = 0.0
                alignments_i.append(w_t)
            alignments.append(alignments_i)
        return alignments

    def _pair_side(self, pair, side, elmoidx, elmovec):
        """Prepare one question (``side`` is ``'q1'`` or ``'q2'``) of ``pair``.

        :return: ``(id, tokens, tree, embedding, token2lemma, self_kernel)``,
            where ``self_kernel`` is the kernel of the question with itself.
        """
        qid = pair[side + '_id']
        tokens = pair[side + '_full']
        tree = pair[side + '_tree'] if self.tree == 'tree' else pair['subj_' + side + '_tree']
        emb = self.encode(qid, tokens, elmoidx, elmovec)
        token2lemma = dict(zip(tokens, pair[side + '_lemmas']))
        alignments = self.get_alignment(tokens, tokens) if self.vector == 'alignments' else []
        self_kernel = self.memoize(qid, tree, emb, token2lemma, qid, tree, emb, token2lemma, alignments)
        return qid, tokens, tree, emb, token2lemma, self_kernel

    def _normalized_kernel(self, left, right):
        """Return ``K(a, b) / sqrt(K(a, a) * K(b, b))`` for two prepared sides.

        Yields 0.0 when either self-kernel is zero, avoiding a zero division.
        """
        aid, atokens, atree, aemb, alemma, akernel = left
        bid, btokens, btree, bemb, blemma, bkernel = right
        if akernel == 0 or bkernel == 0:
            return 0.0
        alignments = self.get_alignment(atokens, btokens) if self.vector == 'alignments' else []
        k = self.memoize(aid, atree, aemb, alemma, bid, btree, bemb, blemma, alignments)
        return float(k) / np.sqrt(akernel * bkernel)  # normalized

    def extract_features(self, procdata, elmoidx, elmovec):
        """Compute the kernel row and label of every pair in ``procdata``.

        :param procdata: nested dict ``{q1id: {q2id: pair}}``; each pair is
            read through ``q1_id``, ``q1_full``, ``q1_lemmas``, the tree
            entries, their ``q2`` counterparts and ``label``.
        :param elmoidx: passed to ``self.encode`` for the pairs of ``procdata``.
        :param elmovec: passed to ``self.encode`` for the pairs of ``procdata``.
        :return: ``(feat, X, y)``; ``feat[q1id][q2id]`` is ``(x, label)`` and
            ``X`` / ``y`` hold the same rows and labels in iteration order.

        The result is indexed by the keys of ``procdata``.  The ids stored
        inside each pair are only used for encoding and memoization, so they
        can no longer clobber the loop variables.
        """
        feat, X, y = {}, [], []
        for i, q1id in enumerate(procdata):
            feat[q1id] = {}
            percentage = round(float(i + 1) / len(procdata), 2)
            for q2id in procdata[q1id]:
                q_pair = procdata[q1id][q2id]
                query_q1 = self._pair_side(q_pair, 'q1', elmoidx, elmovec)
                query_q2 = self._pair_side(q_pair, 'q2', elmoidx, elmovec)

                if i % 10 == 0:
                    print('Path: ', self.path, 'Progress: ', percentage, i + 1, sep=10 * ' ', end='\r')

                # One column per training pair, in `flattraindata` order.
                x = []
                for c in self.flattraindata:
                    train_q1 = self._pair_side(c, 'q1', self.trainidx, self.trainelmo)
                    train_q2 = self._pair_side(c, 'q2', self.trainidx, self.trainelmo)
                    kq1c1 = self._normalized_kernel(query_q1, train_q1)
                    kq2c2 = self._normalized_kernel(query_q2, train_q2)
                    x.append(kq1c1 + kq2c2)

                y_ = q_pair['label']
                feat[q1id][q2id] = (x, y_)
                X.append(x)
                y.append(y_)
        return feat, X, y

    def _cached_features(self, path, data, elmoidx, elmovec):
        """Load the pickled features at ``path``, or extract and store them.

        The cache is read with ``pickle``; it must be a trusted local file.
        """
        if os.path.exists(path):
            with open(path, 'rb') as handle:
                return p.load(handle)
        feat, _, _ = self.extract_features(data, elmoidx, elmovec)
        with open(path, 'wb') as handle:
            p.dump(feat, handle)
        return feat

    def _rank(self, feat):
        """Score every cached pair and group the results per first question."""
        ranking = {}
        y_real, y_pred = [], []
        for q1id in feat:
            ranking[q1id] = []
            for q2id in feat[q1id]:
                X = feat[q1id][q2id][0]
                score, pred_label = self.svm.score(X)
                y_pred.append(pred_label)

                real_label = feat[q1id][q2id][1]
                y_real.append(real_label)
                ranking[q1id].append((real_label, score, q2id))

        parameter_settings = self.svm.return_parameter_settings(clf='svm')
        return ranking, y_real, y_pred, parameter_settings

    def train(self):
        """Load or compute the training kernel matrix and train the SVM.

        Features are cached as a pickle at ``kernel/train/<self.path>``.
        """
        path = os.path.join('kernel', 'train', self.path)
        self.X, self.y = [], []
        if not os.path.exists(path):
            feat, self.X, self.y = self.extract_features(self.traindata, self.trainidx, self.trainelmo)
            with open(path, 'wb') as handle:
                p.dump(feat, handle)
        else:
            with open(path, 'rb') as handle:
                feat = p.load(handle)
            for q1id in feat:
                for q2id in feat[q1id]:
                    self.X.append(feat[q1id][q2id][0])
                    self.y.append(feat[q1id][q2id][1])

        self.X = np.array(self.X)
        self.svm.train_svm(trainvectors=self.X, labels=self.y, c='search', kernel='precomputed', gamma='search',
                           jobs=10)

    def validate(self):
        """Score the development data (cache: ``kernel/dev/<self.path>``).

        :return: ``(ranking, y_real, y_pred, parameter_settings)`` where
            ``ranking[q1id]`` lists ``(real_label, score, q2id)`` tuples.
        """
        path = os.path.join('kernel', 'dev', self.path)
        feat = self._cached_features(path, self.devdata, self.devidx, self.develmo)
        return self._rank(feat)

    def test(self, testdata, elmoidx, elmovec, test_='test2016'):
        """Score ``testdata`` with the trained model.

        :param testdata: nested dict in the format ``extract_features`` takes.
        :param elmoidx: passed on to ``extract_features``.
        :param elmovec: passed on to ``extract_features``.
        :param test_: selects the cache directory: ``'test2016'``, ``'train'``
            and ``'dev'`` map to the directory of the same name, anything else
            maps to ``'test2017'``.
        :return: same tuple as ``validate``.
        """
        # 'dev' used to resolve to the test2016 cache, so dev runs silently
        # reused (or overwrote) the test2016 features.
        split = test_ if test_ in ('test2016', 'train', 'dev') else 'test2017'
        path = os.path.join('kernel', split, self.path)

        self.testdata = testdata
        feat = self._cached_features(path, self.testdata, elmoidx, elmovec)
        return self._rank(feat)
class SemevalSVM(Semeval):
    """SVM / regression ranker over similarity scores for Semeval questions.

    For each enabled feature (``bm25``, ``softcosine``, ``translation``,
    ``cosine`` -- always appended in that order) the vector of a pair holds
    the score of the query against the related question followed by one score
    per comment.  Vectors are rescaled to ``[-1, 1]`` before training, which
    is triggered by the constructor.
    """

    # Order in which enabled features are appended to a feature vector.
    _FEATURE_ORDER = ('bm25', 'softcosine', 'translation', 'cosine')

    def __init__(self, model='svm', features='bm25,', comment_features='bm25,', stop=True, vector='word2vec',
                 lowercase=True, punctuation=True, proctrain=True, path=FEATURES_PATH, alpha=0.1, sigma=0.9,
                 gridsearch='random'):
        """Build the requested scorers and train the classifier.

        :param model: ``'svm'`` trains via ``Model.train_svm``; any other value
            trains via ``Model.train_regression``.
        :param features: comma-separated feature names used for the vectors.
        :param comment_features: comma-separated feature names; in this class
            they only influence which scorers get instantiated.
        :param stop: forwarded to the base class and to the scorers.
        :param vector: forwarded to the base class and the soft-cosine scorer.
        :param lowercase: forwarded to the base class and to the scorers.
        :param punctuation: forwarded to the base class and to the scorers.
        :param proctrain: forwarded to the scorers.
        :param path: cache file name, joined under ``feature/<split>``.
        :param alpha: forwarded to ``SemevalTranslation``.
        :param sigma: forwarded to ``SemevalTranslation``.
        :param gridsearch: forwarded to the ``Model`` training call.
        """
        Semeval.__init__(self, stop=stop, vector=vector, lowercase=lowercase, punctuation=punctuation)
        self.path = path
        self.features = features.split(',')
        self.comment_features = comment_features.split(',')
        self.gridsearch = gridsearch
        self.svm = Model()
        self.model = model

        # A scorer is only built when some feature list asks for it.
        requested = self.features + self.comment_features
        self.bm25 = SemevalBM25(
            stop=stop, lowercase=lowercase, punctuation=punctuation, proctrain=proctrain
        ) if 'bm25' in requested else None
        self.cosine = SemevalCosine(
            stop=stop, lowercase=lowercase, punctuation=punctuation, proctrain=proctrain
        ) if 'cosine' in requested else None
        self.softcosine = SemevalSoftCosine(
            stop=stop, lowercase=lowercase, punctuation=punctuation, proctrain=proctrain, vector=vector
        ) if 'softcosine' in requested else None
        self.translation = SemevalTranslation(
            alpha=alpha, sigma=sigma, stop=stop, lowercase=lowercase, punctuation=punctuation,
            proctrain=proctrain, vector=self.vector
        ) if 'translation' in requested else None

        self.train()

    def _feature_score(self, name, q1, q1_emb, qid, tokens, elmoidx, elmovec):
        """Score the query ``q1`` against one candidate text for feature ``name``.

        :param name: one of the names in ``_FEATURE_ORDER``.
        :param qid: id of the candidate (used by bm25 and for encoding).
        :param tokens: tokens of the candidate.
        """
        if name == 'bm25':
            return self.bm25.model(q1, qid)
        if name == 'cosine':
            return self.cosine.model(q1, tokens)

        aligned = self.vector == 'alignments'
        if name == 'softcosine':
            if aligned:
                return self.softcosine.model.score(q1, tokens, self.alignments)
            emb = self.encode(qid, tokens, elmoidx, elmovec)
            return self.softcosine.model(q1, q1_emb, tokens, emb)

        # translation: only the last of the three probabilities is kept
        if aligned:
            lmprob, trmprob, trlmprob, proctime = self.translation.model.score(q1, tokens)
        else:
            emb = self.encode(qid, tokens, elmoidx, elmovec)
            lmprob, trmprob, trlmprob, proctime = self.translation.model(q1, q1_emb, tokens, emb)
        return trlmprob

    def extract_features(self, procdata, elmoidx, elmovec):
        """Compute the feature vector and label of every pair in ``procdata``.

        :param procdata: nested dict ``{q1id: {q2id: pair}}``; each pair is
            read through ``q1``, ``q2``, ``comments`` (dicts with ``id`` and
            ``tokens``) and ``label``.
        :param elmoidx: passed to ``self.encode``.
        :param elmovec: passed to ``self.encode``.
        :return: ``(feat, X, y)``; ``feat[q1id][q2id]`` is ``(x, label)`` and
            ``X`` / ``y`` hold the same vectors and labels in iteration order.

        Two defects of the earlier implementation are fixed here:

        * every feature listed in ``self.features`` now contributes (the
          branches used to be chained with ``elif``, so only the first listed
          feature was ever extracted);
        * in ``'alignments'`` mode the soft-cosine score of a comment is
          computed against the comment itself (it used to be computed against
          the related question ``q2`` for every comment).
        """
        X, y = [], []
        feat = {}
        enabled = [name for name in self._FEATURE_ORDER if name in self.features]

        for i, q1id in enumerate(procdata):
            feat[q1id] = {}
            percentage = round(float(i + 1) / len(procdata), 2)
            print('Extracting features: ', percentage, i + 1, sep='\t', end='\r')
            for q2id in procdata[q1id]:
                query_question = procdata[q1id][q2id]
                q1, q2 = query_question['q1'], query_question['q2']
                x = []
                q1_emb = self.encode(q1id, q1, elmoidx, elmovec)

                for name in enabled:
                    x.append(self._feature_score(name, q1, q1_emb, q2id, q2, elmoidx, elmovec))
                    for comment in query_question['comments']:
                        q3id = comment['id']
                        q3 = comment['tokens']
                        if len(q3) > 0:
                            x.append(self._feature_score(name, q1, q1_emb, q3id, q3, elmoidx, elmovec))
                        else:
                            # Empty comments keep their slot so vectors stay aligned.
                            x.append(0)

                y_ = query_question['label']
                feat[q1id][q2id] = (x, y_)
                X.append(x)
                y.append(y_)
        return feat, X, y

    def _cached_features(self, path, data, elmoidx, elmovec):
        """Load the pickled features at ``path``, or extract and store them.

        The cache is read with ``pickle``; it must be a trusted local file.
        """
        if os.path.exists(path):
            with open(path, 'rb') as handle:
                return p.load(handle)
        feat, _, _ = self.extract_features(data, elmoidx, elmovec)
        with open(path, 'wb') as handle:
            p.dump(feat, handle)
        return feat

    def _rank(self, feat):
        """Scale and score every cached pair, grouped per first question."""
        ranking = {}
        y_real, y_pred = [], []
        for q1id in feat:
            ranking[q1id] = []
            for q2id in feat[q1id]:
                X = feat[q1id][q2id][0]
                # Apply the scaling learned on the training vectors.
                X = self.scaler.transform([X])[0]
                score, pred_label = self.svm.score(X)
                y_pred.append(pred_label)

                real_label = feat[q1id][q2id][1]
                y_real.append(real_label)
                ranking[q1id].append((real_label, score, q2id))

        parameter_settings = self.svm.return_parameter_settings(clf=self.model)
        return ranking, y_real, y_pred, parameter_settings

    def train(self):
        """Load or compute the training features, fit the scaler, train the model.

        Features are cached as a pickle at ``feature/train/<self.path>``.
        """
        self.X, self.y = [], []
        path = os.path.join('feature', 'train', self.path)
        if not os.path.exists(path):
            feat, self.X, self.y = self.extract_features(self.traindata, self.trainidx, self.trainelmo)
            with open(path, 'wb') as handle:
                p.dump(feat, handle)
        else:
            with open(path, 'rb') as handle:
                feat = p.load(handle)
            for q1id in feat:
                for q2id in feat[q1id]:
                    self.X.append(feat[q1id][q2id][0])
                    self.y.append(feat[q1id][q2id][1])

        self.X = np.array(self.X)
        self.scaler = MinMaxScaler(feature_range=(-1, 1))
        self.scaler.fit(self.X)
        self.X = self.scaler.transform(self.X)

        if self.model == 'svm':
            self.svm.train_svm(trainvectors=self.X, labels=self.y, c='search', kernel='search', gamma='search',
                               jobs=10, gridsearch=self.gridsearch)
        else:
            self.svm.train_regression(trainvectors=self.X, labels=self.y, c='search', penalty='search',
                                      tol='search', gridsearch=self.gridsearch, jobs=10)

    def validate(self):
        """Score the development data (cache: ``feature/dev/<self.path>``).

        :return: ``(ranking, y_real, y_pred, parameter_settings)`` where
            ``ranking[q1id]`` lists ``(real_label, score, q2id)`` tuples.
        """
        path = os.path.join('feature', 'dev', self.path)
        feat = self._cached_features(path, self.devdata, self.devidx, self.develmo)
        return self._rank(feat)

    def test(self, testdata, elmoidx, elmovec, test_='test2016'):
        """Score ``testdata`` with the trained model.

        :param testdata: nested dict in the format ``extract_features`` takes.
        :param elmoidx: passed on to ``extract_features``.
        :param elmovec: passed on to ``extract_features``.
        :param test_: ``'test2016'`` selects the ``feature/test2016`` cache,
            any other value selects ``feature/test2017``.
        :return: same tuple as ``validate``.
        """
        if test_ == 'test2016':
            path = os.path.join('feature', 'test2016', self.path)
        else:
            path = os.path.join('feature', 'test2017', self.path)

        self.testdata = testdata
        feat = self._cached_features(path, self.testdata, elmoidx, elmovec)
        return self._rank(feat)