示例#1
0
class QuoraSVM(Quora):
    """Classifier over pairwise similarity features for Quora question pairs.

    Constructing an instance builds the requested similarity models and
    immediately calls ``train``, so instantiation performs feature
    extraction (or loads the feature cache) and fits the classifier.
    """

    def __init__(self, model='svm', features='bm25,', comment_features='bm25,', stop=True, vector='word2vec', path=FEATURES_PATH, alpha=0.1, sigma=0.9, gridsearch='random'):
        """Build the similarity models named in the feature lists and train.

        Args:
            model: ``'svm'`` trains through ``Model.train_svm``; any other
                value trains through ``Model.train_regression``.
            features: Comma-separated feature names. ``extract_features``
                recognises ``'bm25'``, ``'softcosine'``, ``'translation'``
                and ``'cosine'``.
            comment_features: Comma-separated feature names; in this class
                they only influence which similarity models are built.
            stop: Forwarded to ``Quora.__init__`` and every similarity model.
            vector: Forwarded to ``Quora.__init__`` and the soft-cosine model.
            path: Last path component of the feature cache; ``train`` and
                ``validate`` join it under ``FEATURES_PATH/<split>/``.
            alpha: Forwarded to ``QuoraTranslations``.
            sigma: Forwarded to ``QuoraTranslations``.
            gridsearch: Forwarded to the ``Model`` training call.
        """
        Quora.__init__(self, stop=stop, vector=vector)
        self.path = path
        self.features = features.split(',')
        self.comment_features = comment_features.split(',')
        self.gridsearch = gridsearch
        self.svm = Model()

        self.model = model
        # A similarity model is only built when some feature list asks for it.
        requested = self.features + self.comment_features
        self.bm25 = QuoraBM25(stop=stop) if 'bm25' in requested else None
        self.cosine = QuoraCosine(stop=stop) if 'cosine' in requested else None
        self.softcosine = QuoraSoftCosine(stop=stop, vector=vector) if 'softcosine' in requested else None
        self.translation = QuoraTranslations(alpha=alpha, sigma=sigma, stop=stop, vector=self.vector) if 'translation' in requested else None

        self.train()

    def extract_features(self, pairdata, elmoidx, elmovec, fullelmoidx, fullelmovec):
        """Compute one feature vector and label per question pair.

        Args:
            pairdata: Sequence of dict-like pairs providing ``tokens_proc1``,
                ``tokens_proc2``, ``is_duplicate`` and optionally ``qid1`` /
                ``qid2`` (synthetic ids are derived from the position when
                these are absent).
            elmoidx, elmovec: Index/vectors passed to ``encode`` when
                ``self.stop`` is truthy.
            fullelmoidx, fullelmovec: Index/vectors passed to ``encode``
                when ``self.stop`` is falsy.

        Returns:
            ``(feat, X, y)`` where ``feat`` is a list of ``(x, label)``
            tuples and ``X`` / ``y`` are the same vectors and labels as two
            parallel lists. Feature order inside ``x`` is bm25, softcosine,
            translation, cosine (only those present in ``self.features``).
            Pairs that raise are reported on stdout and skipped.
        """
        X, y = [], []
        feat = []

        # self.stop selects which index/vector pair is handed to encode().
        if self.stop:
            idx, vec = elmoidx, elmovec
        else:
            idx, vec = fullelmoidx, fullelmovec
        # Soft-cosine and translation both need q2's embedding: encode it
        # once per pair rather than once per feature.
        needs_q2_emb = 'softcosine' in self.features or 'translation' in self.features

        for i, pair in enumerate(pairdata):
            try:
                percentage = round(float(i + 1) / len(pairdata), 2)
                print('Extracting features: ', percentage, i + 1, sep='\t', end = '\r')
                q1id = pair['qid1'] if 'qid1' in pair else str(i) + '1'
                q2id = pair['qid2'] if 'qid2' in pair else str(i) + '2'
                q1, q2 = pair['tokens_proc1'], pair['tokens_proc2']

                x = []

                q1_emb = self.encode(q1id, q1, idx, vec)
                q2_emb = self.encode(q2id, q2, idx, vec) if needs_q2_emb else None

                # bm25 (scored against q2's id, not its tokens)
                if 'bm25' in self.features:
                    x.append(self.bm25.model(q1, q2id))
                # softcosine
                if 'softcosine' in self.features:
                    x.append(self.softcosine.model(q1, q1_emb, q2, q2_emb))
                # translation: only the third returned value is used
                if 'translation' in self.features:
                    _, _, trlmprob, _ = self.translation.model(q1, q1_emb, q2, q2_emb)
                    x.append(trlmprob)
                # cosine
                if 'cosine' in self.features:
                    x.append(self.cosine.model(q1, q2))

                y_ = int(pair['is_duplicate'])
                feat.append((x, y_))
                X.append(x)
                y.append(y_)
            except Exception as exc:
                # Best-effort extraction: report the failing pair and keep
                # going. KeyboardInterrupt/SystemExit are deliberately not
                # caught so a long run can still be aborted.
                print('Error', repr(exc))
                print(pair)
        return feat, X, y

    def train(self):
        """Load or build the training features, fit the scaler and classifier.

        Features are cached at ``FEATURES_PATH/train/<self.path>``. The
        cache is read with ``p.load`` (``p`` is presumably a pickle-style
        module), so it must only point at files written by this class.
        Sets ``self.X``, ``self.y`` and ``self.scaler``.
        """
        path = os.path.join(FEATURES_PATH, 'train', self.path)
        self.X, self.y = [], []
        if os.path.exists(path):
            with open(path, 'rb') as handle:
                feat = p.load(handle)
            for row in feat:
                self.X.append(row[0])
                self.y.append(row[1])
        else:
            feat, self.X, self.y = self.extract_features(self.trainset, self.trainidx, self.trainelmo, self.fulltrainidx, self.fulltrainelmo)
            with open(path, 'wb') as handle:
                p.dump(feat, handle)

        # Features are rescaled to [-1, 1]; validate() reuses this scaler.
        self.scaler = MinMaxScaler(feature_range=(-1, 1))
        self.scaler.fit(self.X)
        self.X = self.scaler.transform(self.X)

        if self.model == 'svm':
            self.svm.train_svm(
                trainvectors=self.X,
                labels=self.y,
                c='search',
                kernel='search',
                gamma='search',
                jobs=10,
                gridsearch=self.gridsearch
            )
        else:
            self.svm.train_regression(trainvectors=self.X, labels=self.y, c='search', penalty='search', tol='search', gridsearch=self.gridsearch, jobs=10)

    def validate(self):
        """Score the development pairs with the trained classifier.

        Features are cached at ``FEATURES_PATH/dev/<self.path>``.

        Returns:
            ``(y_real, y_pred, parameter_settings)``: gold labels, predicted
            labels, and whatever ``Model.return_parameter_settings`` reports
            for ``self.model``.
        """
        path = os.path.join(FEATURES_PATH, 'dev', self.path)
        if os.path.exists(path):
            with open(path, 'rb') as handle:
                feat = p.load(handle)
        else:
            feat, _, _ = self.extract_features(self.devset, self.devidx, self.develmo, self.fulldevidx, self.fulldevelmo)
            with open(path, 'wb') as handle:
                p.dump(feat, handle)

        y_real, y_pred = [], []
        for row in feat:
            # Apply the scaling learned in train() to one vector at a time.
            scaled = self.scaler.transform([row[0]])[0]
            score, pred_label = self.svm.score(scaled)
            y_pred.append(pred_label)
            y_real.append(row[1])

        parameter_settings = self.svm.return_parameter_settings(clf=self.model)

        return y_real, y_pred, parameter_settings
示例#2
0
class SemevalTreeKernel(Semeval):
    """SVM trained on a precomputed tree-kernel matrix for SemEval pairs.

    Every question pair is represented by its normalised tree-kernel
    similarity to each training pair (see ``extract_features``); the SVM is
    trained with ``kernel='precomputed'``. Constructing an instance trains
    the model immediately.
    """

    def __init__(self,
                 alpha=0,
                 decay=1,
                 ignore_leaves=True,
                 smoothed=True,
                 vector='word2vec',
                 w2vdim=300,
                 lowercase=True,
                 tree='tree',
                 kernel_path=KERNEL_PATH):
        """Set up the tree kernel, flatten the training data and train.

        Args:
            alpha, decay, ignore_leaves, smoothed: Forwarded to ``TreeKernel``.
            vector: Forwarded to ``Semeval.__init__``; when ``self.vector``
                equals ``'alignments'`` the kernel is given alignment
                matrices from ``get_alignment``.
            w2vdim: Forwarded to ``Semeval.__init__``.
            lowercase: Forwarded to ``Semeval.__init__`` and ``TreeKernel``.
            tree: ``'tree'`` uses the ``q*_tree`` entries of a pair, any
                other value uses the ``subj_q*_tree`` entries.
            kernel_path: Last path component of the feature caches, joined
                under ``kernel/<split>/``.
        """
        Semeval.__init__(self,
                         vector=vector,
                         stop=False,
                         lowercase=lowercase,
                         punctuation=False,
                         w2vdim=w2vdim)
        self.path = kernel_path
        self.tree = tree
        self.memoization = {}
        self.svm = Model()
        self.flat_traindata()
        self.treekernel = TreeKernel(alpha=alpha,
                                     decay=decay,
                                     ignore_leaves=ignore_leaves,
                                     smoothed=smoothed,
                                     lowercase=lowercase)
        self.train()

        # Not needed once training has finished (presumably set by
        # Semeval.__init__ - TODO confirm).
        del self.additional

    def memoize(self, q1id, q1, q1_emb, q1_token2lemma, q2id, q2, q2_emb,
                q2_token2lemma, alignments):
        """Return the tree kernel for an id pair, computing it at most once.

        The result is stored under both ``[q1id][q2id]`` and
        ``[q2id][q1id]`` in ``self.memoization``, so the kernel is treated
        as symmetric in its two arguments.
        """
        row1 = self.memoization.setdefault(q1id, {})
        if q2id in row1:
            return row1[q2id]

        row2 = self.memoization.setdefault(q2id, {})
        if q1id in row2:
            return row2[q1id]

        k = self.treekernel(q1, q1_emb, q1_token2lemma, q2, q2_emb,
                            q2_token2lemma, alignments)
        row1[q2id] = k
        row2[q1id] = k

        return k

    def flat_traindata(self):
        """Flatten the nested ``self.traindata`` into ``self.flattraindata``."""
        self.flattraindata = [
            self.traindata[q1id][q2id]
            for q1id in self.traindata
            for q2id in self.traindata[q1id]
        ]

    def get_alignment(self, c1, c2):
        """Return a ``len(c1) x len(c2)`` matrix of alignment weights.

        Entry ``[i][j]`` is ``self.alignments[t[0]][t][w[0]][w]`` for
        ``w = c1[i]`` and ``t = c2[j]``, or ``0.0`` when that lookup fails.
        """
        alignments = []
        for w in c1:
            row = []
            for t in c2:
                try:
                    w_t = self.alignments[t[0]][t][w[0]][w]
                except (KeyError, IndexError, TypeError):
                    # Unknown or empty token: no alignment evidence.
                    w_t = 0.0
                row.append(w_t)
            alignments.append(row)
        return alignments

    def _question_side(self, pair, prefix, elmoidx, elmovec):
        """Prepare one question (``prefix`` is ``'q1'`` or ``'q2'``) of a pair.

        Returns ``(id, tokens, tree, emb, token2lemma, self_kernel)`` where
        ``self_kernel`` is the question's kernel with itself, later used as
        the normalisation term.
        """
        qid = pair[prefix + '_id']
        tokens = pair[prefix + '_full']
        if self.tree == 'tree':
            tree = pair[prefix + '_tree']
        else:
            tree = pair['subj_' + prefix + '_tree']
        emb = self.encode(qid, tokens, elmoidx, elmovec)
        token2lemma = dict(zip(tokens, pair[prefix + '_lemmas']))
        alignments = self.get_alignment(
            tokens, tokens) if self.vector == 'alignments' else []
        self_kernel = self.memoize(qid, tree, emb, token2lemma, qid, tree,
                                   emb, token2lemma, alignments)
        return qid, tokens, tree, emb, token2lemma, self_kernel

    def _normalized_kernel(self, left, right):
        """Return ``K(l, r) / sqrt(K(l, l) * K(r, r))``, or ``0.0`` if a self-kernel is 0."""
        lid, ltokens, ltree, lemb, llemma, lself = left
        rid, rtokens, rtree, remb, rlemma, rself = right
        if lself == 0 or rself == 0:
            return 0.0
        alignments = self.get_alignment(
            ltokens, rtokens) if self.vector == 'alignments' else []
        cross = self.memoize(lid, ltree, lemb, llemma, rid, rtree, remb,
                             rlemma, alignments)
        return float(cross) / np.sqrt(lself * rself)

    def extract_features(self, procdata, elmoidx, elmovec):
        """Build one kernel row per question pair in ``procdata``.

        For a pair ``(q1, q2)`` the row holds, for every training pair
        ``(c1, c2)`` in ``self.flattraindata``, the value
        ``Knorm(q1, c1) + Knorm(q2, c2)``.

        Args:
            procdata: Nested mapping ``{q1id: {q2id: pair}}``; each pair
                provides ``q*_id``, ``q*_full``, ``q*_lemmas``, the tree
                entries and ``label``.
            elmoidx, elmovec: Passed to ``encode`` for the ``procdata``
                questions (training questions always use
                ``self.trainidx`` / ``self.trainelmo``).

        Returns:
            ``(feat, X, y)``: ``feat`` maps ``q1id -> q2id -> (row, label)``;
            ``X`` and ``y`` hold the same rows and labels as parallel lists.
        """
        feat, X, y = {}, [], []

        for i, outer_q1id in enumerate(procdata):
            feat[outer_q1id] = {}
            percentage = round(float(i + 1) / len(procdata), 2)
            for outer_q2id in procdata[outer_q1id]:
                q_pair = procdata[outer_q1id][outer_q2id]

                query1 = self._question_side(q_pair, 'q1', elmoidx, elmovec)
                query2 = self._question_side(q_pair, 'q2', elmoidx, elmovec)

                if i % 10 == 0:
                    print('Path: ',
                          self.path,
                          'Progress: ',
                          percentage,
                          i + 1,
                          sep=10 * ' ',
                          end='\r')

                x = []
                for c in self.flattraindata:
                    train1 = self._question_side(c, 'q1', self.trainidx,
                                                 self.trainelmo)
                    train2 = self._question_side(c, 'q2', self.trainidx,
                                                 self.trainelmo)
                    kq1c1 = self._normalized_kernel(query1, train1)
                    kq2c2 = self._normalized_kernel(query2, train2)
                    x.append(kq1c1 + kq2c2)

                y_ = q_pair['label']
                # NOTE(review): results are keyed by the ids stored inside
                # the pair, which are assumed to equal the dictionary keys
                # being iterated - confirm against the data loader.
                feat[query1[0]][query2[0]] = (x, y_)
                X.append(x)
                y.append(y_)
        return feat, X, y

    def _cached_features(self, path, extract):
        """Load the feature cache at ``path``; build and save it if missing.

        ``extract`` is a zero-argument callable returning
        ``(feat, X, y)``; it is only invoked on a cache miss. The cache is
        read with ``p.load`` (``p`` is presumably a pickle-style module), so
        it must only point at files written by this class.
        """
        if os.path.exists(path):
            with open(path, 'rb') as handle:
                return p.load(handle)
        feat, _, _ = extract()
        with open(path, 'wb') as handle:
            p.dump(feat, handle)
        return feat

    def _rank(self, feat):
        """Score every cached row and collect labels and per-query rankings."""
        ranking = {}
        y_real, y_pred = [], []
        for q1id in feat:
            ranking[q1id] = []
            for q2id in feat[q1id]:
                row = feat[q1id][q2id][0]
                real_label = feat[q1id][q2id][1]

                score, pred_label = self.svm.score(row)
                y_pred.append(pred_label)
                y_real.append(real_label)
                ranking[q1id].append((real_label, score, q2id))

        parameter_settings = self.svm.return_parameter_settings(clf='svm')

        return ranking, y_real, y_pred, parameter_settings

    def train(self):
        """Load or build the training kernel matrix and fit the SVM.

        Features are cached at ``kernel/train/<self.path>``. Sets
        ``self.X`` (as a NumPy array) and ``self.y``.
        """
        path = os.path.join('kernel', 'train', self.path)
        self.X, self.y = [], []
        if os.path.exists(path):
            with open(path, 'rb') as handle:
                feat = p.load(handle)
            for q1id in feat:
                for q2id in feat[q1id]:
                    self.X.append(feat[q1id][q2id][0])
                    self.y.append(feat[q1id][q2id][1])
        else:
            feat, self.X, self.y = self.extract_features(
                self.traindata, self.trainidx, self.trainelmo)
            with open(path, 'wb') as handle:
                p.dump(feat, handle)

        self.X = np.array(self.X)
        self.svm.train_svm(trainvectors=self.X,
                           labels=self.y,
                           c='search',
                           kernel='precomputed',
                           gamma='search',
                           jobs=10)

    def validate(self):
        """Rank the development data (cache: ``kernel/dev/<self.path>``).

        Returns:
            ``(ranking, y_real, y_pred, parameter_settings)`` where
            ``ranking[q1id]`` is a list of ``(real_label, score, q2id)``.
        """
        path = os.path.join('kernel', 'dev', self.path)
        feat = self._cached_features(
            path, lambda: self.extract_features(self.devdata, self.devidx,
                                                self.develmo))
        return self._rank(feat)

    def test(self, testdata, elmoidx, elmovec, test_='test2016'):
        """Rank ``testdata`` with the trained model.

        Args:
            testdata: Nested pair mapping, as for ``extract_features``;
                also stored on ``self.testdata``.
            elmoidx, elmovec: Passed to ``encode`` for the test questions.
            test_: Selects the cache directory under ``kernel/``:
                ``'test2016'``, ``'train'`` or ``'dev'``; any other value
                uses ``test2017``.

        Returns:
            ``(ranking, y_real, y_pred, parameter_settings)``, as for
            ``validate``.
        """
        # 'dev' previously resolved to the test2016 directory, so dev
        # features could be read from, or written over, the test2016 cache.
        if test_ in ('test2016', 'train', 'dev'):
            split = test_
        else:
            split = 'test2017'
        path = os.path.join('kernel', split, self.path)

        self.testdata = testdata
        feat = self._cached_features(
            path,
            lambda: self.extract_features(self.testdata, elmoidx, elmovec))
        return self._rank(feat)
示例#3
0
class SemevalSVM(Semeval):
    """Classifier over similarity features for SemEval question pairs.

    Each pair is described by the similarity between the query and the
    related question, followed by the similarity between the query and each
    of the related question's comments, for every requested feature.
    Constructing an instance trains the model immediately.
    """

    def __init__(self,
                 model='svm',
                 features='bm25,',
                 comment_features='bm25,',
                 stop=True,
                 vector='word2vec',
                 lowercase=True,
                 punctuation=True,
                 proctrain=True,
                 path=FEATURES_PATH,
                 alpha=0.1,
                 sigma=0.9,
                 gridsearch='random'):
        """Build the similarity models named in the feature lists and train.

        Args:
            model: ``'svm'`` trains through ``Model.train_svm``; any other
                value trains through ``Model.train_regression``.
            features: Comma-separated feature names. ``extract_features``
                recognises ``'bm25'``, ``'softcosine'``, ``'translation'``
                and ``'cosine'``.
            comment_features: Comma-separated feature names; in this class
                they only influence which similarity models are built.
            stop, lowercase, punctuation: Forwarded to ``Semeval.__init__``
                and to every similarity model.
            vector: Forwarded to ``Semeval.__init__`` and the soft-cosine
                model.
            proctrain: Forwarded to every similarity model.
            path: Last path component of the feature caches, joined under
                ``feature/<split>/``.
            alpha, sigma: Forwarded to ``SemevalTranslation``.
            gridsearch: Forwarded to the ``Model`` training call.
        """
        Semeval.__init__(self,
                         stop=stop,
                         vector=vector,
                         lowercase=lowercase,
                         punctuation=punctuation)
        self.path = path
        self.features = features.split(',')
        self.comment_features = comment_features.split(',')
        self.gridsearch = gridsearch
        self.svm = Model()

        self.model = model
        # A similarity model is only built when some feature list asks for it.
        requested = self.features + self.comment_features
        self.bm25 = SemevalBM25(
            stop=stop,
            lowercase=lowercase,
            punctuation=punctuation,
            proctrain=proctrain) if 'bm25' in requested else None
        self.cosine = SemevalCosine(
            stop=stop,
            lowercase=lowercase,
            punctuation=punctuation,
            proctrain=proctrain) if 'cosine' in requested else None
        self.softcosine = SemevalSoftCosine(
            stop=stop,
            lowercase=lowercase,
            punctuation=punctuation,
            proctrain=proctrain,
            vector=vector) if 'softcosine' in requested else None
        self.translation = SemevalTranslation(
            alpha=alpha,
            sigma=sigma,
            stop=stop,
            lowercase=lowercase,
            punctuation=punctuation,
            proctrain=proctrain,
            vector=self.vector) if 'translation' in requested else None

        self.train()

    # The four scorers share one signature so extract_features can apply
    # them uniformly to the related question and to each comment.

    def _bm25_score(self, q1, q1_emb, other_id, other, elmoidx, elmovec):
        """BM25 score of ``q1`` against the document identified by ``other_id``."""
        return self.bm25.model(q1, other_id)

    def _softcosine_score(self, q1, q1_emb, other_id, other, elmoidx,
                          elmovec):
        """Soft-cosine score of ``q1`` against ``other``."""
        if self.vector == 'alignments':
            return self.softcosine.model.score(q1, other, self.alignments)
        other_emb = self.encode(other_id, other, elmoidx, elmovec)
        return self.softcosine.model(q1, q1_emb, other, other_emb)

    def _translation_score(self, q1, q1_emb, other_id, other, elmoidx,
                           elmovec):
        """Third value returned by the translation model for ``q1`` vs ``other``."""
        if self.vector == 'alignments':
            _, _, trlmprob, _ = self.translation.model.score(q1, other)
        else:
            other_emb = self.encode(other_id, other, elmoidx, elmovec)
            _, _, trlmprob, _ = self.translation.model(
                q1, q1_emb, other, other_emb)
        return trlmprob

    def _cosine_score(self, q1, q1_emb, other_id, other, elmoidx, elmovec):
        """Cosine score of ``q1`` against ``other``."""
        return self.cosine.model(q1, other)

    def extract_features(self, procdata, elmoidx, elmovec):
        """Compute one feature vector and label per question pair.

        Every feature named in ``self.features`` contributes, in the order
        bm25, softcosine, translation, cosine: one score for the related
        question, then one score per comment (``0`` for a comment without
        tokens). Features used to be chained with ``elif`` so only the first
        requested one was extracted; caches written with several features
        requested are therefore stale and must be regenerated.

        Args:
            procdata: Nested mapping ``{q1id: {q2id: pair}}``; each pair
                provides ``q1``, ``q2``, ``comments`` (dicts with ``id`` and
                ``tokens``) and ``label``.
            elmoidx, elmovec: Passed to ``encode``.

        Returns:
            ``(feat, X, y)``: ``feat`` maps ``q1id -> q2id -> (x, label)``;
            ``X`` and ``y`` hold the same vectors and labels as parallel
            lists.
        """
        X, y = [], []
        feat = {}

        scorers = (
            ('bm25', self._bm25_score),
            ('softcosine', self._softcosine_score),
            ('translation', self._translation_score),
            ('cosine', self._cosine_score),
        )
        active = [scorer for name, scorer in scorers if name in self.features]

        for i, q1id in enumerate(procdata):
            feat[q1id] = {}
            percentage = round(float(i + 1) / len(procdata), 2)
            print('Extracting features: ',
                  percentage,
                  i + 1,
                  sep='\t',
                  end='\r')
            for q2id in procdata[q1id]:
                query_question = procdata[q1id][q2id]
                q1, q2 = query_question['q1'], query_question['q2']
                x = []

                q1_emb = self.encode(q1id, q1, elmoidx, elmovec)

                for scorer in active:
                    x.append(scorer(q1, q1_emb, q2id, q2, elmoidx, elmovec))

                    for comment in query_question['comments']:
                        q3id = comment['id']
                        q3 = comment['tokens']

                        if len(q3) > 0:
                            # Score the comment itself; the soft-cosine
                            # alignments path used to re-score q2 here.
                            x.append(
                                scorer(q1, q1_emb, q3id, q3, elmoidx,
                                       elmovec))
                        else:
                            x.append(0)

                y_ = query_question['label']
                feat[q1id][q2id] = (x, y_)
                X.append(x)
                y.append(y_)
        return feat, X, y

    def _cached_features(self, path, extract):
        """Load the feature cache at ``path``; build and save it if missing.

        ``extract`` is a zero-argument callable returning
        ``(feat, X, y)``; it is only invoked on a cache miss. The cache is
        read with ``p.load`` (``p`` is presumably a pickle-style module), so
        it must only point at files written by this class.
        """
        if os.path.exists(path):
            with open(path, 'rb') as handle:
                return p.load(handle)
        feat, _, _ = extract()
        with open(path, 'wb') as handle:
            p.dump(feat, handle)
        return feat

    def _rank(self, feat):
        """Scale and score every cached vector; collect labels and rankings."""
        ranking = {}
        y_real, y_pred = [], []
        for q1id in feat:
            ranking[q1id] = []
            for q2id in feat[q1id]:
                real_label = feat[q1id][q2id][1]
                # Apply the scaling learned in train() to one vector.
                scaled = self.scaler.transform([feat[q1id][q2id][0]])[0]

                score, pred_label = self.svm.score(scaled)
                y_pred.append(pred_label)
                y_real.append(real_label)
                ranking[q1id].append((real_label, score, q2id))

        parameter_settings = self.svm.return_parameter_settings(clf=self.model)

        return ranking, y_real, y_pred, parameter_settings

    def train(self):
        """Load or build the training features, fit the scaler and classifier.

        Features are cached at ``feature/train/<self.path>``. Sets
        ``self.X``, ``self.y`` and ``self.scaler``.
        """
        self.X, self.y = [], []
        path = os.path.join('feature', 'train', self.path)
        if os.path.exists(path):
            with open(path, 'rb') as handle:
                feat = p.load(handle)
            for q1id in feat:
                for q2id in feat[q1id]:
                    self.X.append(feat[q1id][q2id][0])
                    self.y.append(feat[q1id][q2id][1])
        else:
            feat, self.X, self.y = self.extract_features(
                self.traindata, self.trainidx, self.trainelmo)
            with open(path, 'wb') as handle:
                p.dump(feat, handle)

        self.X = np.array(self.X)
        # Features are rescaled to [-1, 1]; validate()/test() reuse the scaler.
        self.scaler = MinMaxScaler(feature_range=(-1, 1))
        self.scaler.fit(self.X)
        self.X = self.scaler.transform(self.X)

        if self.model == 'svm':
            self.svm.train_svm(trainvectors=self.X,
                               labels=self.y,
                               c='search',
                               kernel='search',
                               gamma='search',
                               jobs=10,
                               gridsearch=self.gridsearch)
        else:
            self.svm.train_regression(trainvectors=self.X,
                                      labels=self.y,
                                      c='search',
                                      penalty='search',
                                      tol='search',
                                      gridsearch=self.gridsearch,
                                      jobs=10)

    def validate(self):
        """Rank the development data (cache: ``feature/dev/<self.path>``).

        Returns:
            ``(ranking, y_real, y_pred, parameter_settings)`` where
            ``ranking[q1id]`` is a list of ``(real_label, score, q2id)``.
        """
        path = os.path.join('feature', 'dev', self.path)
        feat = self._cached_features(
            path, lambda: self.extract_features(self.devdata, self.devidx,
                                                self.develmo))
        return self._rank(feat)

    def test(self, testdata, elmoidx, elmovec, test_='test2016'):
        """Rank ``testdata`` with the trained model.

        Args:
            testdata: Nested pair mapping, as for ``extract_features``;
                also stored on ``self.testdata``.
            elmoidx, elmovec: Passed to ``encode``.
            test_: ``'test2016'`` uses the ``feature/test2016`` cache
                directory; any other value uses ``feature/test2017``.

        Returns:
            ``(ranking, y_real, y_pred, parameter_settings)``, as for
            ``validate``.
        """
        if test_ == 'test2016':
            path = os.path.join('feature', 'test2016', self.path)
        else:
            path = os.path.join('feature', 'test2017', self.path)

        self.testdata = testdata
        feat = self._cached_features(
            path,
            lambda: self.extract_features(self.testdata, elmoidx, elmovec))
        return self._rank(feat)