예제 #1
0
    def generate_outer_feature(self):
        """Build FastText embedding features for each ``kiji_id_raw`` value.

        Train and test rows are concatenated and every user's sequence of
        ``kiji_id`` values is treated as one document. Ids that occur fewer
        than 5 times overall are replaced by the token ``'None'``. The model
        is loaded from ``self.fast_model_path`` when that path exists;
        otherwise it is trained on the documents and saved there.

        Returns:
            pandas.DataFrame: one row per row of ``self.df_outer``, with the
            embedding components in columns prefixed ``kiji_wv_`` and the raw
            id stored under ``self.merge_key``.
        """
        train_df, _ = read_data()
        test_df = read_data(test=True)
        all_df = pd.concat([train_df, test_df], ignore_index=True)

        vc = all_df['kiji_id'].value_counts()
        to_none_ids = vc[vc < 5].index

        def to_word(d):
            # Rare ids share one token so they get a common vector.
            if d in to_none_ids:
                return 'None'
            return d

        if os.path.exists(self.fast_model_path):
            model = FastText.load(self.fast_model_path)
        else:
            # Documents are only needed for training, so build them here.
            # A single groupby replaces one boolean mask per user;
            # sort=False keeps users in first-appearance order and rows keep
            # their original order inside each group.
            # NOTE(review): rows with a missing user_id are dropped by
            # groupby; previously they produced an empty document.
            docs = [
                [to_word(w) for w in kiji_ids.values]
                for _, kiji_ids in all_df.groupby('user_id',
                                                  sort=False)['kiji_id']
            ]
            with timer(logger,
                       format_str='create kiji_id fast_model' + ' {:.3f}[s]'):
                model = FastText(docs, workers=6, size=64)
            model.save(self.fast_model_path)

        z = self.df_outer['kiji_id_raw'].map(to_word).map(
            lambda x: model.wv[x])
        df = pd.DataFrame(np.array(z.values.tolist())).add_prefix('kiji_wv_')
        # Assign positionally: df has a fresh RangeIndex, so assigning the
        # Series itself would align on df_outer's index and could misplace
        # or blank out keys when that index is not 0..n-1.
        df[self.merge_key] = self.df_outer['kiji_id_raw'].values
        return df
예제 #2
0
 def call(self, df_input, y=None):
     """Aggregate ``df_input`` per user and attach the extra atom features.

     Every column is summarised per ``user_id`` with mean, sum, max, min,
     std and nunique; the resulting two-level column names are flattened by
     joining the parts with ``'_'``. The features generated by ``UedaAtom``
     are then left-merged on ``user_id``.

     The atom input is re-read with ``read_data``: the test split when ``y``
     is None, the train split otherwise. ``atom.generate`` is always called
     with ``y=None``.

     Returns:
         pandas.DataFrame: one row per user with a fresh integer index.
     """
     stat_names = ['mean', 'sum', 'max', 'min', 'std', 'nunique']
     per_user = df_input.groupby('user_id').agg(stat_names)
     per_user = per_user.sort_values('user_id')
     per_user.columns = [
         '_'.join(parts) for parts in per_user.columns.to_flat_index()
     ]
     merged = per_user.reset_index()

     use_test_split = y is None
     for atom in [UedaAtom()]:
         if use_test_split:
             raw_df = read_data(test=True)
         else:
             raw_df, _ = read_data()
         atom_df = atom.generate(raw_df, y=None)
         merged = pd.merge(merged, atom_df, on='user_id', how='left')

     return merged.reset_index(drop=True)
예제 #3
0
import os

import numpy as np
import pandas as pd
from vivid.featureset.molecules import MoleculeFeature, find_molecule

from kaggle_days.dataset import read_data
from kaggle_days.dataset import read_sample_submit
from kaggle_days.models.classifiers import LGBMCls
from kaggle_days.molecules import user_merge_molecule

if __name__ == '__main__':
    # Train an LGBM classifier on the distinct target values and write the
    # probability-weighted target as the submission's 'age' column.
    m = find_molecule('benchmark')[0]
    raw_feature = MoleculeFeature(
        m, root_dir='/analysis/data/checkpoint/classification')
    entry_feature = MoleculeFeature(user_merge_molecule, parent=raw_feature)

    train_df, y = read_data()
    test_df = read_data(test=True)

    # Map each distinct target value to a class index 0..K-1.
    # np.unique already returns the values sorted, so no extra sort needed.
    origin = np.unique(y)
    y2k = {value: k for k, value in enumerate(origin)}
    y_labels = pd.Series(y).map(y2k).values

    clf = LGBMCls(name='lgbm_cls', parent=entry_feature)
    oof_df = clf.fit(train_df, y_labels)
    prob_predict = clf.predict(test_df).values
    # Weight each original target value by its predicted column and sum.
    # NOTE(review): assumes predict() returns one probability column per
    # class, ordered like `origin` - confirm against LGBMCls.
    predict = np.sum(prob_predict * origin, axis=1)

    sub_df = read_sample_submit()
    sub_df['age'] = predict
    # Requires `import os` at the top of the file.
    sub_df.to_csv(os.path.join(clf.output_dir, 'predict.csv'), index=False)
예제 #4
0
                            formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--molecule',
        default='benchmark',
        choices=[str(m.name) for m in MoleculeFactory.molecules],
        help='molecule name (see kaggle_days.molecules.py file)')
    parser.add_argument(
        '--simple',
        action='store_true',
        help=
        'If True, run on small models (LightGBMx3 different objective function)'
    )
    return parser.parse_args()


if __name__ == '__main__':
    # Repeated training rounds: each round rebuilds the training set from
    # the previous round's best test prediction (None on the first round),
    # fits the composer, and keeps the prediction of the lowest-rmse model.
    args = parse_argument()
    test_df = read_data(test=True)
    m = find_molecule(args.molecule)[0]

    n_rounds = 5
    pred_in_best = None
    for i in range(n_rounds):
        train_df, y = generate_next_step_dataset(y_pred=pred_in_best)

        round_suffix = f'psudo_{i}'
        composer = TrainComposer(
            molecule=m,
            simple=args.simple,
            suffix=round_suffix,
        )
        score_df, pred_dict = composer.fit(train_df, y, test_df)

        # The first index after an ascending sort on 'rmse' is the best model.
        ranked_scores = score_df.sort_values('rmse')
        best_model = ranked_scores.index[0]
        pred_in_best = pred_dict.get(best_model)