Example #1
    def validate(self):
        logging.info('Validating', extra=d)
        softranking, simranking = {}, {}
        for j, q1id in enumerate(self.devset):
            softranking[q1id], simranking[q1id] = [], []
            percentage = round(float(j + 1) / len(self.devset), 2)
            print('Progress: ', percentage, j + 1, sep='\t', end='\r')

            query = self.devset[q1id]
            q1 = query['tokens_proc']
            elmo_emb1 = self.develmo.get(str(self.devidx[q1id]))
            w2v_emb = features.encode(q1, self.word2vec)
            # q1emb = features.glove_encode(q1, self.glove, self.voc2id)
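            # concatenate the word2vec and ELMo vectors token by token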
            q1emb = [
                np.concatenate([w2v_emb[i], elmo_emb1[i]])
                for i in range(len(w2v_emb))
            ]

            duplicates = query['duplicates']
            for duplicate in duplicates:
                rel_question = duplicate['rel_question']
                q2id = rel_question['id']

                q2 = rel_question['tokens_proc']
                elmo_emb2 = self.develmo.get(str(self.devidx[q2id]))
                w2v_emb = features.encode(q2, self.word2vec)
                # q2emb = features.glove_encode(q2, self.glove, self.voc2id)
                q2emb = [
                    np.concatenate([w2v_emb[i], elmo_emb2[i]])
                    for i in range(len(w2v_emb))
                ]

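                # similarity with tokens only (simple_score) and with the concatenated embeddings (score)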
                simple_score = self.simple_score(q1, q2)
                score = self.score(q1, q1emb, q2, q2emb)
                real_label = 0
                if rel_question['relevance'] != 'Irrelevant':
                    real_label = 1
                simranking[q1id].append((real_label, simple_score, q2id))
                softranking[q1id].append((real_label, score, q2id))

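        # write the ranking: query id, candidate id, a constant rank field (0), score and gold label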
        with open(os.path.join(DATA_QUESTION_PATH, 'softranking.txt'),
                  'w') as f:
            for q1id in softranking:
                for row in softranking[q1id]:
                    label = 'false'
                    if row[0] == 1:
                        label = 'true'
                    f.write('\t'.join([
                        str(q1id),
                        str(row[2]),
                        str(0),
                        str(row[1]), label, '\n'
                    ]))

        logging.info('Finished validating.', extra=d)
        return softranking, simranking
Example #2
    def get_features(self, q1id, q1, q2id, q2, set='train'):
        X = []
        if set == 'train':
            q1_elmo = self.trainelmo.get(str(self.trainidx[q1id]))
            q2_elmo = self.trainelmo.get(str(self.trainidx[q2id]))
        else:
            q1_elmo = self.develmo.get(str(self.devidx[q1id]))
            q2_elmo = self.develmo.get(str(self.devidx[q2id]))

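        # pair the word2vec vectors with each of the three ELMo layers (bottom, middle and top)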
        q1_w2v = features.encode(q1, self.word2vec)
        q1_elmo_bottom = [
            np.concatenate([q1_w2v[i], q1_elmo[0][i]])
            for i in range(len(q1_w2v))
        ]
        q1_elmo_middle = [
            np.concatenate([q1_w2v[i], q1_elmo[1][i]])
            for i in range(len(q1_w2v))
        ]
        q1_elmo_top = [
            np.concatenate([q1_w2v[i], q1_elmo[2][i]])
            for i in range(len(q1_w2v))
        ]

        q2_w2v = features.encode(q2, self.word2vec)
        q2_elmo_bottom = [
            np.concatenate([q2_w2v[i], q2_elmo[0][i]])
            for i in range(len(q2_w2v))
        ]
        q2_elmo_middle = [
            np.concatenate([q2_w2v[i], q2_elmo[1][i]])
            for i in range(len(q2_w2v))
        ]
        q2_elmo_top = [
            np.concatenate([q2_w2v[i], q2_elmo[2][i]])
            for i in range(len(q2_w2v))
        ]

        # X.append(self.simbow.score(q1, q1_w2v, q2, q2_w2v))
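        # one soft-cosine (simbow) similarity feature per ELMo layer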
        X.append(self.simbow.score(q1, q1_elmo_bottom, q2, q2_elmo_bottom))
        X.append(self.simbow.score(q1, q1_elmo_middle, q2, q2_elmo_middle))
        X.append(self.simbow.score(q1, q1_elmo_top, q2, q2_elmo_top))
        return X
Example #3
    def validate(self):
        logging.info('Validating', extra=d)
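        # rankings for the language-model (lm), translation-model (trm) and combined (trlm) scores,
        # computed without (simple*) and with the concatenated embeddings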
        simplelm, simpletrm, simpletrlm = {}, {}, {}
        lm, trm, trlm = {}, {}, {}
        for j, q1id in enumerate(self.devset):
            simplelm[q1id] = []
            simpletrm[q1id] = []
            simpletrlm[q1id] = []
            lm[q1id] = []
            trm[q1id] = []
            trlm[q1id] = []
            percentage = round(float(j + 1) / len(self.devset), 2)
            print('Progress: ', percentage, j + 1, sep='\t', end='\r')

            query = self.devset[q1id]
            q1 = query['tokens_proc']
            elmo_emb1 = self.develmo.get(str(self.devidx[q1id]))
            w2v_emb = features.encode(q1, self.word2vec)
            q1emb = [
                np.concatenate([w2v_emb[i], elmo_emb1[i]])
                for i in range(len(w2v_emb))
            ]

            duplicates = query['duplicates']
            for duplicate in duplicates:
                rel_question = duplicate['rel_question']
                q2id = rel_question['id']

                q2 = rel_question['tokens_proc']
                elmo_emb2 = self.develmo.get(str(self.devidx[q2id]))
                w2v_emb = features.encode(q2, self.word2vec)
                q2emb = [
                    np.concatenate([w2v_emb[i], elmo_emb2[i]])
                    for i in range(len(w2v_emb))
                ]

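                # token-based and embedding-based translation scores for the candidate pair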
                slmprob, strmprob, strlmprob, _ = self.translation.score(
                    q1, q2)
                lmprob, trmprob, trlmprob, _ = self.translation.score_embeddings(
                    q1, q1emb, q2, q2emb)
                real_label = 0
                if rel_question['relevance'] != 'Irrelevant':
                    real_label = 1
                simplelm[q1id].append((real_label, slmprob, q2id))
                simpletrm[q1id].append((real_label, strmprob, q2id))
                simpletrlm[q1id].append((real_label, strlmprob, q2id))
                lm[q1id].append((real_label, lmprob, q2id))
                trm[q1id].append((real_label, trmprob, q2id))
                trlm[q1id].append((real_label, trlmprob, q2id))

        with open('data/translationranking.txt', 'w') as f:
            for q1id in trlm:
                for row in trlm[q1id]:
                    label = 'false'
                    if row[0] == 1:
                        label = 'true'
                    f.write('\t'.join([
                        str(q1id),
                        str(row[2]),
                        str(0),
                        str(row[1]), label, '\n'
                    ]))

        logging.info('Finished validating.', extra=d)
        return simplelm, simpletrm, simpletrlm, lm, trm, trlm
Example #4
    def validate(self):
        logging.info('Validating tree svm.', extra=d)
        treekernel = features.TreeKernel()
        ranking = {}
        y_real, y_pred = [], []
        for i, q1id in enumerate(self.devset):
            ranking[q1id] = []
            percentage = round(float(i + 1) / len(self.devset), 2)

            query = self.devset[q1id]
            q1_token2lemma = dict(zip(query['tokens'], query['lemmas']))
            q1_tree = utils.binarize(
                utils.parse_tree(query['tree'], q1_token2lemma))

            q1_w2v = features.encode(query['tokens'], self.word2vec)
            q1_elmo = self.fulldevelmo.get(str(self.fulldevidx[q1id]))
            q1_emb = [
                np.concatenate([q1_w2v[i], q1_elmo[i]])
                for i in range(len(q1_w2v))
            ]

            duplicates = query['duplicates']
            for duplicate in duplicates:
                rel_question = duplicate['rel_question']
                q2id = rel_question['id']
                # tree kernel
                q2_token2lemma = dict(
                    zip(rel_question['tokens'], rel_question['lemmas']))
                q2_tree = utils.binarize(
                    utils.parse_tree(rel_question['tree'], q2_token2lemma))

                # word2vec vectors
                q2_w2v = features.encode(rel_question['tokens'], self.word2vec)
                q2_elmo = self.fulldevelmo.get(str(self.fulldevidx[q2id]))
                q2_emb = [
                    np.concatenate([q2_w2v[i], q2_elmo[i]])
                    for i in range(len(q2_w2v))
                ]

                q1_tree, q2_tree = treekernel.similar_terminals(
                    q1_tree, q2_tree)

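                # build one row of the precomputed kernel: a normalized tree-kernel score against every training pair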
                X = []
                for j, trainrow in enumerate(self.traindata):
                    c1id, c2id = trainrow['q1_id'], trainrow['q2_id']
                    c1_token2lemma = dict(
                        zip(trainrow['q1_full'], trainrow['q1_lemmas']))
                    c2_token2lemma = dict(
                        zip(trainrow['q2_full'], trainrow['q2_lemmas']))
                    c1_tree = utils.binarize(
                        utils.parse_tree(trainrow['q1_tree'], c1_token2lemma))
                    c2_tree = utils.binarize(
                        utils.parse_tree(trainrow['q2_tree'], c2_token2lemma))

                    # word2vec vectors
                    c1_w2v = features.encode(trainrow['q1_full'],
                                             self.word2vec)
                    c1_elmo = self.fulltrainelmo.get(
                        str(self.fulltrainidx[c1id]))
                    c1_emb = [
                        np.concatenate([c1_w2v[i], c1_elmo[i]])
                        for i in range(len(c1_w2v))
                    ]

                    c2_w2v = features.encode(trainrow['q2_full'],
                                             self.word2vec)
                    c2_elmo = self.fulltrainelmo.get(
                        str(self.fulltrainidx[c2id]))
                    c2_emb = [
                        np.concatenate([c2_w2v[i], c2_elmo[i]])
                        for i in range(len(c2_w2v))
                    ]

                    c1_tree, c2_tree = treekernel.similar_terminals(
                        c1_tree, c2_tree)

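                    # cosine-normalize each kernel value by the self-kernels before summing the two pair contributions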
                    kq1 = self.memoize(q1id, q1_tree, q1_emb, q1id, q1_tree,
                                       q1_emb, treekernel)
                    kc1 = self.memoize(c1id, c1_tree, c1_emb, c1id, c1_tree,
                                       c1_emb, treekernel)
                    kq1c1 = float(
                        self.memoize(q1id, q1_tree, q1_emb, c1id,
                                     c1_tree, c1_emb, treekernel)) / np.sqrt(
                                         kq1 * kc1)  # normalized

                    kq2 = self.memoize(q2id, q2_tree, q2_emb, q2id, q2_tree,
                                       q2_emb, treekernel)
                    kc2 = self.memoize(c2id, c2_tree, c2_emb, c2id, c2_tree,
                                       c2_emb, treekernel)
                    kq2c2 = float(
                        self.memoize(q2id, q2_tree, q2_emb, c2id,
                                     c2_tree, c2_emb, treekernel)) / np.sqrt(
                                         kq2 * kc2)  # normalized

                    # kq1c2 = float(self.memoize(q1id, q1_tree, q1_emb, c2id, c2_tree, c2_emb, treekernel)) / np.sqrt(kq1 * kc2) # normalized
                    # kq2c1 = float(self.memoize(q2id, q2_tree, q2_emb, c1id, c1_tree, c1_emb, treekernel)) / np.sqrt(kq2 * kc1) # normalized

                    k = kq1c1 + kq2c2
                    X.append(k)
                print('Progress: ', percentage, i + 1, sep='\t', end='\r')

                score = self.model.decision_function([X])[0]
                pred_label = self.model.predict([X])[0]
                y_pred.append(pred_label)

                real_label = 0
                if rel_question['relevance'] != 'Irrelevant':
                    real_label = 1
                y_real.append(real_label)
                ranking[q1id].append((real_label, score, q2id))

        with open('data/treeranking.txt', 'w') as f:
            for qid in ranking:
                for row in ranking[qid]:
                    label = 'false'
                    if row[0] == 1:
                        label = 'true'
                    f.write('\t'.join([
                        str(qid),
                        str(row[2]),
                        str(0),
                        str(row[1]), label, '\n'
                    ]))

        logging.info('Finished validating tree svm.', extra=d)
        return ranking, y_real, y_pred
Example #5
    def train(self):
        logging.info('Training tree svm.', extra=d)
        treekernel = features.TreeKernel()

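        # compute the pairwise kernel matrix over all training pairs unless a cached copy exists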
        if not os.path.exists(KERNEL_PATH):
            X, y = [], []
            for i, q in enumerate(self.traindata):
                percentage = round(float(i + 1) / len(self.traindata), 2)
                x = []
                q1id, q2id = q['q1_id'], q['q2_id']
                # trees
                q1_token2lemma = dict(zip(q['q1_full'], q['q1_lemmas']))
                q2_token2lemma = dict(zip(q['q2_full'], q['q2_lemmas']))
                q1 = utils.binarize(
                    utils.parse_tree(q['q1_tree'], q1_token2lemma))
                q2 = utils.binarize(
                    utils.parse_tree(q['q2_tree'], q2_token2lemma))

                # word2vec and elmo vectors
                q1_w2v = features.encode(q['q1_full'], self.word2vec)
                q1_elmo = self.fulltrainelmo.get(str(self.fulltrainidx[q1id]))
                q1_emb = [
                    np.concatenate([q1_w2v[i], q1_elmo[i]])
                    for i in range(len(q1_w2v))
                ]

                q2_w2v = features.encode(q['q2_full'], self.word2vec)
                q2_elmo = self.fulltrainelmo.get(str(self.fulltrainidx[q2id]))
                q2_emb = [
                    np.concatenate([q2_w2v[i], q2_elmo[i]])
                    for i in range(len(q2_w2v))
                ]

                q1, q2 = treekernel.similar_terminals(q1, q2)
                for j, c in enumerate(self.traindata):
                    c1id, c2id = c['q1_id'], c['q2_id']
                    # trees
                    c1_token2lemma = dict(zip(c['q1_full'], c['q1_lemmas']))
                    c2_token2lemma = dict(zip(c['q2_full'], c['q2_lemmas']))
                    c1 = utils.binarize(
                        utils.parse_tree(c['q1_tree'], c1_token2lemma))
                    c2 = utils.binarize(
                        utils.parse_tree(c['q2_tree'], c2_token2lemma))
                    # word2vec vectors
                    c1_w2v = features.encode(c['q1_full'], self.word2vec)
                    c1_elmo = self.fulltrainelmo.get(
                        str(self.fulltrainidx[c1id]))
                    c1_emb = [
                        np.concatenate([c1_w2v[i], c1_elmo[i]])
                        for i in range(len(c1_w2v))
                    ]

                    c2_w2v = features.encode(c['q2_full'], self.word2vec)
                    c2_elmo = self.fulltrainelmo.get(
                        str(self.fulltrainidx[c2id]))
                    c2_emb = [
                        np.concatenate([c2_w2v[i], c2_elmo[i]])
                        for i in range(len(c2_w2v))
                    ]

                    c1, c2 = treekernel.similar_terminals(c1, c2)
                    kq1 = self.memoize(q1id, q1, q1_emb, q1id, q1, q1_emb,
                                       treekernel)
                    kc1 = self.memoize(c1id, c1, c1_emb, c1id, c1, c1_emb,
                                       treekernel)
                    kq1c1 = float(
                        self.memoize(q1id, q1, q1_emb, c1id,
                                     c1, c1_emb, treekernel)) / np.sqrt(
                                         kq1 * kc1)  # normalized

                    kq2 = self.memoize(q2id, q2, q2_emb, q2id, q2, q2_emb,
                                       treekernel)
                    kc2 = self.memoize(c2id, c2, c2_emb, c2id, c2, c2_emb,
                                       treekernel)
                    kq2c2 = float(
                        self.memoize(q2id, q2, q2_emb, c2id,
                                     c2, c2_emb, treekernel)) / np.sqrt(
                                         kq2 * kc2)  # normalized

                    # kq1c2 = float(self.memoize(q1id, q1, q1_emb, c2id, c2, c2_emb, treekernel)) / np.sqrt(kq1 * kc2) # normalized
                    # kq2c1 = float(self.memoize(q2id, q2, q2_emb, c1id, c1, c1_emb, treekernel)) / np.sqrt(kq2 * kc1) # normalized

                    k = kq1c1 + kq2c2
                    x.append(k)
                    print('Preparing kernel: ',
                          percentage,
                          i + 1,
                          j + 1,
                          sep='\t',
                          end='\r')
                X.append(x)
                y.append(q['label'])
            with open(KERNEL_PATH, 'wb') as fh:
                p.dump(list(zip(X, y)), fh)
            X = np.array(X)
        else:
            with open(KERNEL_PATH, 'rb') as fh:
                f = p.load(fh)
            X = np.array([x[0] for x in f])
            y = list(map(lambda x: x[1], f))

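        # fit the SVM on the precomputed kernel, searching over C and gamma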
        self.model = self.train_svm(trainvectors=X,
                                    labels=y,
                                    c='search',
                                    kernel='precomputed',
                                    gamma='search',
                                    jobs=4)
        logging.info('Finished training tree svm.', extra=d)
Example #6
    def validate(self):
        logging.info('Validating svm.', extra=d)
        treekernel = features.TreeKernel(alpha=0,
                                         decay=1,
                                         ignore_leaves=True,
                                         smoothed=False)
        ranking = {}
        y_real, y_pred = [], []
        for i, q1id in enumerate(self.devset):
            ranking[q1id] = []
            percentage = round(float(i + 1) / len(self.devset), 2)
            print('Progress: ', percentage, i + 1, sep='\t', end='\r')

            query = self.devset[q1id]
            q1 = query['tokens_proc']
            # q1_lemma = query['lemmas']
            # q1_pos = query['pos']
            # q1_token2lemma = dict(zip(query['tokens'], query['lemmas']))
            # q1_tree = utils.parse_tree(query['subj_tree'], q1_token2lemma)

            q1_elmo = self.develmo.get(str(self.devidx[q1id]))
            q1_w2v = features.encode(q1, self.word2vec)
            q1_emb = [
                np.concatenate([q1_w2v[i], q1_elmo[i]])
                for i in range(len(q1_w2v))
            ]

            duplicates = query['duplicates']
            for duplicate in duplicates:
                rel_question = duplicate['rel_question']
                q2id = rel_question['id']
                q2 = rel_question['tokens_proc']
                # X = self.get_features(q1id, q1, q2id, q2, set='dev')
                # X = self.__transform__(q1, q2)
                X = []

                q2_elmo = self.develmo.get(str(self.devidx[q2id]))
                q2_w2v = features.encode(q2, self.word2vec)
                q2_emb = [
                    np.concatenate([q2_w2v[i], q2_elmo[i]])
                    for i in range(len(q2_w2v))
                ]

                # # translation
                # lmprob, trmprob, trlmprob, proctime = self.translation.score_embeddings(q1, q1_emb, q2, q2_emb)
                # X.append(trlmprob)
                #
                # # bm25
                # bm25_score = self.bm25_model.get_score(q1, self.bm25_qid_index[q2id], self.avg_idf)
                # X.append(bm25_score)
                #
                # # cosine
                # q2_lemma = rel_question['lemmas']
                # q2_pos = rel_question['pos']
                # for n in range(1,5):
                #     try:
                #         X.append(features.cosine(' '.join(q1), ' '.join(q2), n=n))
                #     except:
                #         X.append(0.0)
                #     try:
                #         X.append(features.cosine(' '.join(q1_lemma), ' '.join(q2_lemma), n=n))
                #     except:
                #         X.append(0.0)
                #     try:
                #         X.append(features.cosine(' '.join(q1_pos), ' '.join(q2_pos), n=n))
                #     except:
                #         X.append(0.0)
                #
                # # tree kernel
                # q2_token2lemma = dict(zip(rel_question['tokens'], rel_question['lemmas']))
                # q2_tree = utils.parse_tree(rel_question['subj_tree'], q2_token2lemma)
                # q1_tree, q2_tree = treekernel.similar_terminals(q1_tree, q2_tree)
                # X.append(treekernel(q1_tree, q2_tree))
                #
                # # frobenius norm
                # X.append(features.frobenius_norm(q1_emb, q2_emb))

                # softcosine
                simbow = self.simbow.score(q1, q1_emb, q2, q2_emb)
                X.append(simbow)

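                # one soft-cosine feature per related comment (0 when the comment has no tokens)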
                for comment in duplicate['rel_comments']:
                    q3id = comment['id']
                    q3 = comment['tokens_proc']
                    simbow_q1q3, simbow_q2q3 = 0, 0
                    if len(q3) > 0:
                        # X.extend(self.get_features(q1id, q1, q3id, q3, set='dev'))
                        q3_elmo = self.develmo.get(
                            str(self.devidx[comment['id']]))
                        q3_w2v = features.encode(q3, self.word2vec)
                        q3_emb = [
                            np.concatenate([q3_w2v[i], q3_elmo[i]])
                            for i in range(len(q3_w2v))
                        ]
                        simbow_q1q3 = self.simbow.score(q1, q1_emb, q3, q3_emb)
                        # simbow_q2q3 = self.simbow.score(q2, q2_emb, q3, q3_emb)
                        # bm25_score = self.bm25_model.get_score(q1, self.bm25_qid_index[comment['id']], self.avg_idf)
                        # lmprob, trmprob, trlmprob, proctime = self.translation.score_embeddings(q1, q1_emb, q3, q3_emb)
                    # X.append(trlmprob)
                    # X.append(bm25_score)
                    X.append(simbow_q1q3)
                    # X.append(simbow_q2q3)

                # scale
                X = self.scaler.transform([X])
                # feature selection
                X = self.feat_selector.transform(X)

                score = self.model.decision_function(X)[0]
                pred_label = self.model.predict(X)[0]
                y_pred.append(pred_label)

                real_label = 0
                if rel_question['relevance'] != 'Irrelevant':
                    real_label = 1
                y_real.append(real_label)
                ranking[q1id].append((real_label, score, q2id))

        with open('data/ranking.txt', 'w') as f:
            for q1id in ranking:
                for row in ranking[q1id]:
                    label = 'false'
                    if row[0] == 1:
                        label = 'true'
                    f.write('\t'.join([
                        str(q1id),
                        str(row[2]),
                        str(0),
                        str(row[1]), label, '\n'
                    ]))

        logging.info('Finished validating svm.', extra=d)
        return ranking, y_real, y_pred
Example #7
    def train(self):
        logging.info('Training svm.', extra=d)
        treekernel = features.TreeKernel(alpha=0,
                                         decay=1,
                                         ignore_leaves=True,
                                         smoothed=False)
        self.bm25_model, self.avg_idf, self.bm25_qid_index = features.init_bm25(
            traindata=self.trainset, devdata=self.devset, testdata=[])

        if not os.path.exists(FEATURE_PATH):
            X, y = [], []
            for i, query_question in enumerate(self.traindata):
                percentage = round(float(i + 1) / len(self.traindata), 2)
                print('Preparing traindata: ',
                      percentage,
                      i + 1,
                      sep='\t',
                      end='\r')
                q1id = query_question['q1_id']
                q2id = query_question['q2_id']
                q1, q2 = query_question['q1'], query_question['q2']
                # x = self.get_features(q1id, q1, q2id, q2)
                x = []
                # x = self.__transform__(q1, q2)
                #
                # # elmo and word2vec embeddings
                q1_elmo = self.trainelmo.get(str(self.trainidx[q1id]))
                q1_w2v = features.encode(q1, self.word2vec)
                q1_emb = [
                    np.concatenate([q1_w2v[i], q1_elmo[i]])
                    for i in range(len(q1_w2v))
                ]

                q2_elmo = self.trainelmo.get(str(self.trainidx[q2id]))
                q2_w2v = features.encode(q2, self.word2vec)
                q2_emb = [
                    np.concatenate([q2_w2v[i], q2_elmo[i]])
                    for i in range(len(q2_w2v))
                ]

                # # translation
                # lmprob, trmprob, trlmprob, proctime = self.translation.score_embeddings(q1, q1_emb, q2, q2_emb)
                # x.append(trlmprob)
                #
                # # bm25
                # bm25_score = self.bm25_model.get_score(q1, self.bm25_qid_index[q2id], self.avg_idf)
                # x.append(bm25_score)
                #
                # # cosine
                # q1_lemma = query_question['q1_lemmas']
                # q1_pos = query_question['q1_pos']
                # q2_lemma = query_question['q2_lemmas']
                # q2_pos = query_question['q2_pos']
                # for n in range(1,5):
                #     try:
                #         x.append(features.cosine(' '.join(q1), ' '.join(q2), n=n))
                #     except:
                #         x.append(0.0)
                #     try:
                #         x.append(features.cosine(' '.join(q1_lemma), ' '.join(q2_lemma), n=n))
                #     except:
                #         x.append(0.0)
                #     try:
                #         x.append(features.cosine(' '.join(q1_pos), ' '.join(q2_pos), n=n))
                #     except:
                #         x.append(0.0)
                #
                # # tree kernels
                # q1_token2lemma = dict(zip(query_question['q1_full'], query_question['q1_lemmas']))
                # q2_token2lemma = dict(zip(query_question['q2_full'], query_question['q2_lemmas']))
                # q1_tree, q2_tree = utils.parse_tree(query_question['q1_tree'], q1_token2lemma), utils.parse_tree(query_question['q2_tree'], q2_token2lemma)
                # q1_tree, q2_tree = treekernel.similar_terminals(q1_tree, q2_tree)
                # x.append(treekernel(q1_tree, q2_tree))
                #
                # # frobenius norm
                # x.append(features.frobenius_norm(q1_emb, q2_emb))
                #
                # # softcosine
                simbow = self.simbow.score(q1, q1_emb, q2, q2_emb)
                x.append(simbow)

                for comment in query_question['comments']:
                    q3id = comment['id']
                    q3 = comment['tokens']
                    simbow_q1q3, simbow_q2q3 = 0, 0
                    if len(q3) > 0:
                        # x.extend(self.get_features(q1id, q1, q3id, q3))
                        q3_elmo = self.trainelmo.get(str(self.trainidx[q3id]))
                        q3_w2v = features.encode(q3, self.word2vec)
                        q3_emb = [
                            np.concatenate([q3_w2v[i], q3_elmo[i]])
                            for i in range(len(q3_w2v))
                        ]
                        simbow_q1q3 = self.simbow.score(q1, q1_emb, q3, q3_emb)
                        # simbow_q2q3 = self.simbow.score(q2, q2_emb, q3, q3_emb)
                        # lmprob, trmprob, trlmprob, proctime = self.translation.score_embeddings(q1, q1_emb, q3, q3_emb)
                        # bm25_score = self.bm25_model.get_score(q1, self.bm25_qid_index[comment['id']], self.avg_idf)

                    # x.append(trlmprob)
                    # x.append(bm25_score)
                    x.append(simbow_q1q3)
                    # x.append(simbow_q2q3)

                X.append(x)
                y.append(query_question['label'])

            with open(FEATURE_PATH, 'wb') as fh:
                p.dump(list(zip(X, y)), fh)
        else:
            with open(FEATURE_PATH, 'rb') as fh:
                f = p.load(fh)
            X = list(map(lambda x: x[0], f))
            y = list(map(lambda x: x[1], f))

        # scale features
        self.scaler = MinMaxScaler(feature_range=(-1, 1))
        self.scaler.fit(X)
        X = self.scaler.transform(X)

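        # select informative features with a LassoCV-based SelectFromModel before training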
        clf = LassoCV(cv=10)
        self.feat_selector = SelectFromModel(clf)
        self.feat_selector.fit(X, y)
        X = self.feat_selector.transform(X)

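        # train the SVM, searching over C, kernel, gamma and degree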
        self.model = self.train_svm(trainvectors=X,
                                    labels=y,
                                    c='search',
                                    kernel='search',
                                    gamma='search',
                                    degree='search',
                                    jobs=4)
        # self.model = self.train_regression(trainvectors=X, labels=y, c='search', penalty='search', tol='search')
        logging.info('Finished training svm.', extra=d)