示例#1
0
    def run(self):
        """Fit a decomposition on stacked train+test word-count vectors and
        compute per-pair distances over the decomposed training rows.

        The q1/q2 raw count matrices for train and test are stacked into one
        matrix so a single decomposition basis is shared by all four parts,
        then split back apart by row offsets.
        """
        self.output().makedirs()
        wc_mat_train = rf_word_count_features.WordCountMatrix(
        ).load_raw_vectors('train')
        wc_mat_test = rf_word_count_features.WordCountMatrix(
        ).load_raw_vectors('test')

        # Row layout after stacking: [train_q1, train_q2, test_q1, test_q2].
        all_vecs = sp.vstack(list(wc_mat_train) + list(wc_mat_test))
        decomp = self.decomposition()
        decomposed = decomp.fit_transform(all_vecs)
        train_size = wc_mat_train[0].shape[0]
        test_size = wc_mat_test[0].shape[0]
        train_decomp = decomposed[:train_size * 2]
        test_decomp = decomposed[train_size * 2:]
        assert test_decomp.shape[0] == test_size * 2
        decomp_train = {
            'q1': train_decomp[:train_size],
            'q2': train_decomp[train_size:]
        }
        decomp_test = {
            'q1': test_decomp[:test_size],
            'q2': test_decomp[test_size:]
        }

        # BUG FIX: the pool was created inline (multiprocessing.Pool().imap)
        # and never closed or joined, leaking worker processes; manage its
        # lifetime with a context manager instead.
        with multiprocessing.Pool() as pool:
            train_dists = list(
                tqdm(pool.imap(self.decomp_dist,
                               zip(decomp_train['q1'],
                                   decomp_train['q2']),
                               chunksize=50_000),
                     total=train_size,
                     desc='vectorizing the training data'))
示例#2
0
    def run(self):
        """Compute word-count distance features for both splits and persist
        them in a single compressed archive.
        """
        self.output().makedirs()

        q1_train, q2_train = rf_word_count_features.WordCountMatrix(
        ).load_raw_vectors('train')
        dists_train = distances_from_mats(q1_train, q2_train)

        q1_test, q2_test = rf_word_count_features.WordCountMatrix(
        ).load_raw_vectors('test')
        dists_test = distances_from_mats(q1_test, q2_test)

        # Write to a temp file first, then rename, so the output path only
        # ever exists in a fully-written state.
        np.savez_compressed(self.make_path('done_tmp.npz'),
                            train_distances=dists_train,
                            test_distances=dists_test)
        os.rename(self.make_path('done_tmp.npz'), self.output().path)
示例#3
0
    def run(self):
        """Fit the classifier on this fold's word-count features, score the
        validation split, and predict the test split.

        Saves valid.npz / test.npz prediction arrays, writes the score to the
        task output, and returns the validation score.
        """
        self.output().makedirs()
        wc_data = rf_word_count_features.WordCountMatrix()

        train_X = wc_data.load('train', self.fold).astype(np.float32)
        train_y = rf_dataset.Dataset().load('train', self.fold,
                                            as_df=True).is_duplicate

        clf = self.make_cls()
        clf.fit(train_X, train_y)

        valid_X = wc_data.load('valid', self.fold).astype(np.float32)
        valid_y = rf_dataset.Dataset().load('valid', self.fold,
                                            as_df=True).is_duplicate

        valid_pred = clf.predict_proba(valid_X)[:, 1]
        np.savez_compressed(self.make_path('valid.npz'), data=valid_pred)
        score = core.score_data(valid_y, valid_pred)

        # Drop the train/valid matrices before loading the (large) test set.
        del train_X, train_y, valid_X, valid_y
        test_X = wc_data.load('test', None).astype(np.float32)
        test_pred = clf.predict_proba(test_X)[:, 1]
        np.savez_compressed(self.make_path('test.npz'), data=test_pred)

        score_msg = 'Score: {:s}: {:f}'.format(repr(self), score)
        print(colors.green | score_msg)

        with self.output().open('w') as f:
            f.write(score_msg)
        return score
示例#4
0
 def requires(self):
     """Declare every upstream feature task this dataset is built from."""
     yield from (
         rf_decomposition.AllDecompositions(),
         rf_word_count_distances.WordCountDistances(),
         rf_distances.RFDistanceCalculator(),
         rf_vectorspaces.VectorSpaceTask(include_space=False),
         rf_magic_features.QuestionFrequency(),
         rf_magic_features.NeighbourhoodFeature(),
         rf_magic_features.QuestionOrderFeature(),
         rf_leaky.RF_LeakyXGB_Dataset(),
         rf_pos_distances.RF_POS_Distance(),
         rf_word_count_features.WordCountMatrix(),
     )
示例#5
0
    def run(self):
        """Fit a Bernoulli naive-Bayes on shared-word indicator features for
        this fold, report the validation score, and save valid/test
        predictions.
        """
        self.output().makedirs()
        m1, m2 = rf_word_count_features.WordCountMatrix().load_raw_vectors(
            'train')
        # Indicator features: 1 where a word occurs in BOTH questions.
        X = (m1 > 0).multiply(m2 > 0)
        fold_ids = (rf_dataset.Dataset().load_dataset_folds() +
                    self.fold) % fold_max

        train_y = rf_dataset.Dataset().load('train',
                                            fold=self.fold,
                                            as_df=True).is_duplicate.values
        model = naive_bayes.BernoulliNB()
        # Fold 0 is held out for validation; everything else trains.
        model.fit(X[fold_ids != 0], train_y)

        valid_y = rf_dataset.Dataset().load('valid',
                                            fold=self.fold,
                                            as_df=True).is_duplicate.values
        valid_pred = model.predict_proba(X[fold_ids == 0])[:, 1]

        score = score_data(valid_y, valid_pred)

        print(colors.green | "Score for {:s}: {:f}".format(repr(self), score))

        t1, t2 = rf_word_count_features.WordCountMatrix().load_raw_vectors(
            'test')
        test_pred = model.predict_proba((t1 > 0).multiply(t2 > 0))[:, 1]
        # Temp-file-then-rename keeps the final artifact atomic.
        np.savez_compressed(self.make_path('done_tmp.npz'),
                            valid=valid_pred,
                            test=test_pred)
        os.rename(self.make_path('done_tmp.npz'), self.make_path('done.npz'))
        return score
示例#6
0
 def requires(self):
     """Upstream dependencies: the base dataset and the word-count matrix."""
     yield from (rf_dataset.Dataset(),
                 rf_word_count_features.WordCountMatrix())
示例#7
0
 def requires(self):
     """The only upstream dependency is the word-count feature matrix."""
     yield from (rf_word_count_features.WordCountMatrix(),)