import os
from functools import partial

import pandas as pd

# `utils` and `config` are project-local modules that provide the grouping
# helpers and aggregation recipes used below.

def get_installment_payments(path, num_rows=None):
    """Preprocess and extract features from installments_payments.

    Arguments:
        path: Path to the folder where files are saved (string).
        num_rows: Number of rows to read; None reads all rows (int, default: None).

    Returns:
        df: DataFrame with processed data.
    """
    pay = pd.read_csv(os.path.join(path, 'installments_payments.csv'), nrows=num_rows)
    # Group payments made against the same instalment and compute the payment difference
    pay = utils.do_sum(pay, ['SK_ID_PREV', 'NUM_INSTALMENT_NUMBER'], 'AMT_PAYMENT', 'AMT_PAYMENT_GROUPED')
    pay['PAYMENT_DIFFERENCE'] = pay['AMT_INSTALMENT'] - pay['AMT_PAYMENT_GROUPED']
    pay['PAYMENT_RATIO'] = pay['AMT_INSTALMENT'] / pay['AMT_PAYMENT_GROUPED']
    pay['PAID_OVER_AMOUNT'] = pay['AMT_PAYMENT'] - pay['AMT_INSTALMENT']
    pay['PAID_OVER'] = (pay['PAID_OVER_AMOUNT'] > 0).astype(int)
    # Days past due (DPD) and days before due (DBD) for each payment entry
    pay['DPD'] = (pay['DAYS_ENTRY_PAYMENT'] - pay['DAYS_INSTALMENT']).clip(lower=0)
    pay['DBD'] = (pay['DAYS_INSTALMENT'] - pay['DAYS_ENTRY_PAYMENT']).clip(lower=0)
    # Flag late payments: the payment entry was recorded after the due date
    pay['LATE_PAYMENT'] = (pay['DPD'] > 0).astype(int)
    # Payment-to-instalment amount ratio, kept only for late payments
    pay['INSTALMENT_PAYMENT_RATIO'] = pay['AMT_PAYMENT'] / pay['AMT_INSTALMENT']
    pay['LATE_PAYMENT_RATIO'] = pay['INSTALMENT_PAYMENT_RATIO'].where(pay['LATE_PAYMENT'] == 1, 0)
    # Flag late payments that have a significant amount
    pay['SIGNIFICANT_LATE_PAYMENT'] = pay['LATE_PAYMENT_RATIO'].apply(lambda x: 1 if x > 0.05 else 0)
    # Flag payments more than 7 and more than 15 days past due
    pay['DPD_7'] = (pay['DPD'] > 7).astype(int)
    pay['DPD_15'] = (pay['DPD'] > 15).astype(int)
    # Aggregations by SK_ID_CURR
    pay_agg = utils.group(pay, 'INS_', config.INSTALLMENTS_AGG)

    # Installments in the last x months
    for months in [18, 36]:
        recent_prev_id = pay[pay['DAYS_INSTALMENT'] >= -30*months]['SK_ID_PREV'].unique()
        pay_recent = pay[pay['SK_ID_PREV'].isin(recent_prev_id)]
        prefix = 'INS_{}M_'.format(months)
        pay_agg = utils.group_and_merge(pay_recent, pay_agg, prefix, config.INSTALLMENTS_TIME_AGG)

    # Last x periods trend features
    group_features = ['SK_ID_CURR', 'SK_ID_PREV', 'DPD', 'LATE_PAYMENT',
                      'PAID_OVER_AMOUNT', 'PAID_OVER', 'DAYS_INSTALMENT']
    group = pay[group_features].groupby('SK_ID_CURR')
    func = partial(_trend_in_last_k_instalment_features, periods=[12, 24, 60, 120])
    g = utils.parallel_apply(group, func, index_name='SK_ID_CURR', chunk_size=10000).reset_index()
    pay_agg = pay_agg.merge(g, on='SK_ID_CURR', how='left')

    # Last loan features
    g = utils.parallel_apply(group, _installments_last_loan_features, index_name='SK_ID_CURR', chunk_size=10000).reset_index()
    pay_agg = pay_agg.merge(g, on='SK_ID_CURR', how='left')
    return pay_agg
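utils.parallel_apply itself is not shown in this example. Judging from the call sites (a pandas GroupBy object, a per-group feature function, an index_name, and a chunk_size), a minimal sketch could look like the following; the exact signature and chunking behaviour are assumptions, not the project's implementation.

import multiprocessing as mp

import pandas as pd


def parallel_apply(groups, func, index_name='Index', num_workers=4,
                   chunk_size=100000):
    # Materialize (key, group) pairs from the GroupBy object.
    # `func` must be a top-level function so it can be pickled.
    items = list(groups)
    keys, rows = [], []
    with mp.Pool(num_workers) as pool:
        # Work through the groups in chunks to bound peak memory.
        for start in range(0, len(items), chunk_size):
            chunk = items[start:start + chunk_size]
            keys.extend(k for k, _ in chunk)
            rows.extend(pool.map(func, [g for _, g in chunk]))
    # One row of features per group, indexed by the group key.
    features = pd.DataFrame(rows)
    features.index = keys
    features.index.name = index_name
    return features

Under this reading, _trend_in_last_k_instalment_features receives one customer's payment history and returns a dict of feature values, which becomes one row of g.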
Example #2
import os
import time
from multiprocessing import Pool, Value

import pandas as pd

# DATA_DIR, csv_files, and the helpers init, get_ens_det, parallel_apply,
# and set_pred_str are defined at module level.

def ensemble(args):
    global dfs, ens_dets, MAX_NUM, bg_time
    MAX_NUM = args.max_num

    df_test = pd.read_csv(os.path.join(DATA_DIR,
                                       'sample_empty_submission.csv'))
    df_test.PredictionString = df_test.PredictionString.fillna('')
    print(df_test.head())

    print('loading {} ...'.format(csv_files))
    dfs = [pd.read_csv(fn) for fn in csv_files]
    for df in dfs:
        df.PredictionString = df.PredictionString.fillna('')
    # Align each prediction frame with the submission's ImageID order.
    for i in range(len(dfs)):
        dfs[i] = (dfs[i].set_index('ImageID')
                        .reindex(index=df_test['ImageID'])
                        .reset_index())

    print('ensembling...')
    bg_time = time.time()
    counter = Value('i', 0)
    with Pool(24, initializer=init, initargs=(counter, )) as p:
        num_imgs = len(dfs[0])
        ens_dets = p.map(get_ens_det, range(num_imgs))

    print('creating submission...')

    df_test['img_index'] = df_test.index
    df_test = parallel_apply(df_test, set_pred_str)
    df_test = df_test.drop(columns=['img_index'])

    df_test.to_csv(args.out, index=False)
    print('done')
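Note that here parallel_apply is called with a whole DataFrame and a transformation function rather than a GroupBy, so this project evidently defines its own variant. A plausible sketch, assuming it partitions the frame and transforms each partition in a process pool (the partition count and signature are guesses):

import multiprocessing as mp

import numpy as np
import pandas as pd


def parallel_apply(df, func, num_workers=24):
    # Split the frame into contiguous partitions, let each worker
    # transform one partition, and concatenate the results in order.
    parts = np.array_split(df, num_workers)
    with mp.Pool(num_workers) as pool:
        parts = pool.map(func, parts)
    return pd.concat(parts)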
Example #3
import os.path as osp
import pickle

import pandas as pd

# DATA_DIR and the helpers get_top_classes, get_fn, get_image_size,
# parallel_apply, and set_pred_str are defined at module level.

def submit(args):
    global preds, classes

    classes, _ = get_top_classes(args.start_index, args.end_index,
                                 args.class_file)
    print('loading {}...'.format(args.pred_file))
    with open(args.pred_file, 'rb') as f:
        preds = pickle.load(f)

    print('len(preds):', len(preds))
    print('num classes of preds:', len(preds[0][1]))
    print('specified num classes:', len(classes))
    #assert len(preds[0][1]) == len(classes)

    print('creating submission...')
    df_test = pd.read_csv(osp.join(DATA_DIR, 'sample_empty_submission.csv'))
    # Look up each image's size once, then unpack width and height.
    sizes = df_test.ImageID.map(lambda x: get_image_size(get_fn(x)))
    df_test.ImageWidth = sizes.map(lambda s: s[0])
    df_test.ImageHeight = sizes.map(lambda s: s[1])
    df_test['img_index'] = df_test.index
    df_test = parallel_apply(df_test, set_pred_str)
    df_test = df_test.drop(columns=['img_index'])

    df_test.to_csv(args.out, index=False)
    print('done')
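get_fn and get_image_size are project helpers that are not shown. A minimal PIL-based stand-in for get_image_size (an assumption, not the project's code) would be:

from PIL import Image


def get_image_size(path):
    # PIL only reads the header here, so this is cheap even for large files.
    with Image.open(path) as img:
        return img.size  # (width, height)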
Example #4
def _update(self, update_helper_func):
    self._current_resolution_level = parallel_apply(
        self._current_resolution_level,
        update_helper_func,
        self._n_threads, 1,
        current_vector=self._current_resolution.values)
Example #5
import os.path as osp
import pickle
import time
from multiprocessing import Pool, Value

import pandas as pd

# DATA_DIR, pred_files, and the helpers get_top_classes, init, get_ens_det,
# parallel_apply, and set_pred_str are defined at module level.
all_preds = []


def ensemble(args):
    global all_preds, classes, ens_dets, MAX_NUM, bg_time
    MAX_NUM = args.max_num

    df_test = pd.read_csv(osp.join(DATA_DIR, 'sample_empty_submission.csv'))
    print(df_test.head())

    classes, _ = get_top_classes(args.start_index, args.end_index,
                                 args.class_file)
    for fn in pred_files:
        print('loading {} ...'.format(fn))
        with open(fn, 'rb') as f:
            all_preds.append(pickle.load(f))

    print('len(preds):', len(all_preds[0]))
    print('num classes of preds:', len(all_preds[0][0][1]))
    print('specified num classes:', len(classes))
    # assert len(all_preds[0][0][1]) == len(classes)

    print('ensembling...')
    bg_time = time.time()
    counter = Value('i', 0)

    with Pool(24, initializer=init, initargs=(counter, )) as p:
        num_imgs = len(all_preds[0])
        ens_dets = p.map(get_ens_det, range(num_imgs))

    print('creating submission...')

    df_test['img_index'] = df_test.index
    df_test = parallel_apply(df_test, set_pred_str)
    df_test = df_test.drop(columns=['img_index'])

    df_test.to_csv(args.out, index=False)
    print('done')
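The Pool is created with an initializer and a shared Value, a standard multiprocessing pattern for progress reporting: init runs once per worker and stores the counter in a module global that get_ens_det can read and update. The project's init is not shown; a typical version, with the tick helper added purely for illustration, is:

counter = None  # set in each worker by init()


def init(shared_counter):
    # Pool initializer: runs once per worker process and stashes the
    # shared multiprocessing.Value so get_ens_det can update it.
    global counter
    counter = shared_counter


def tick():
    # Hypothetical helper: how a worker might bump the shared counter.
    with counter.get_lock():
        counter.value += 1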
Example #6
import os
from functools import partial

import pandas as pd

import utils

# PATH, PREF, and last_k_instalment_features_with_fractions are defined
# elsewhere in this project.

historical_transactions = pd.read_csv(
    os.path.join(PATH, 'historical_transactions.csv'))

historical_transactions['purchase_date'] = pd.to_datetime(
    historical_transactions['purchase_date'])
# Days elapsed between the purchase and the reference date 2018-02-28.
historical_transactions['days'] = (
    pd.Timestamp('2018-02-28') -
    historical_transactions['purchase_date'].dt.normalize()).dt.days
historical_transactions = historical_transactions.query(
    '0 <= installments <= 12')

# =============================================================================
#
# =============================================================================

groupby = historical_transactions.groupby('card_id')

func = partial(last_k_instalment_features_with_fractions,
               periods=[60, 180, 360, 540],
               fraction_periods=[(60, 180), (60, 360), (180, 540), (360, 540)])

g = utils.parallel_apply(groupby,
                         func,
                         index_name='card_id',
                         num_workers=4,
                         chunk_size=10000).reset_index()
g.to_pickle(f'../feature/{PREF}.pkl')

#==============================================================================
utils.end(__file__)
Example #7
         print("flag:", 7)
         data_link_split = data_link_split.reset_index(drop=True)
         data_link_split[[
             'link_id', 'link_time', 'link_ratio', 'link_current_status',
             'link_arrival_status'
         ]] = data_link_split['link_info'].str.split(':|,', 5, expand=True)
         print("flag:", 8)
         data_link_split = data_link_split[['order_id', 'link_id']]
         data_link_split['link_id'] = data_link_split['link_id'].astype(int)
         features = pd.DataFrame(
             {'order_id': data_link_split['order_id'].unique()})
         groupby = data_link_split.groupby(['order_id'])
         func = partial(link_id_find)
         g = parallel_apply(groupby,
                            func,
                            index_name='order_id',
                            num_workers=5,
                            chunk_size=10000)
         g = pd.DataFrame(g, columns=['from_id', 'to_id'])
         g = g.drop_duplicates()
         nextlinks_new.append(g)
     nextlinks_new = pd.concat(nextlinks_new, axis=0)
     nextlinks_new = nextlinks_new.drop_duplicates()
     nextlinks_new = nextlinks_new.sort_values(by='from_id').reset_index(
         drop=True)
     nextlinks = pd.concat([nextlinks, nextlinks_new], axis=0)
     nextlinks = nextlinks.drop_duplicates()
     nextlinks = nextlinks.sort_values(by='from_id').reset_index(drop=True)
     print('save all csv')
     nextlinks.to_csv(root_path + 'nextlinks_allday.csv', index=False)
 print('calcute weight')
Example #8
def _score(self, score_helper_func):
    score = parallel_apply(
        self._current_resolution_level,
        score_helper_func,
        self._n_threads, 1)
    return score
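Examples #4 and #8 call yet another parallel_apply variant, one that takes the thread count and a second positional argument (read here as an apply axis) and forwards keyword arguments such as current_vector to the helper function. A sketch of that shape, with all names and defaults assumed:

from concurrent.futures import ThreadPoolExecutor

import numpy as np
import pandas as pd


def parallel_apply(df, func, n_threads, axis=1, **kwargs):
    # Split the frame, row-apply `func` on each part in a thread pool,
    # and forward any extra keyword arguments (e.g. current_vector)
    # through pandas' apply to the helper function.
    parts = np.array_split(df, n_threads)
    with ThreadPoolExecutor(n_threads) as pool:
        done = pool.map(lambda part: part.apply(func, axis=axis, **kwargs),
                        parts)
        return pd.concat(list(done))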