Example #1
    def append_to_file(self):
        start_path = os.getenv("HOME") + '/bigdata/database' if os.path.isdir(
            os.getenv("HOME") + '/bigdata') else '/bigdata/database'
        os.chdir(start_path)

        df_historical_inc = pd.read_csv("allfiles.csv", sep=',')
        df_historical_trade = pd.read_csv("alltrades.csv", sep=',')

        df_new_trade = self._load_postgres_data_for_trades()
        df_new_inc = self._cleanser_for_inc()

        df_updated_trade = pd.concat([df_historical_trade, df_new_trade])
        df_updated_inc = pd.concat([df_historical_inc, df_new_inc])

        df_updated_trade.drop_duplicates(subset="title",
                                         keep='first',
                                         inplace=True)
        df_updated_inc.drop_duplicates(subset="title",
                                       keep='first',
                                       inplace=True)

        df_updated_inc.to_csv(os.path.join(start_path, 'allfiles.csv'),
                              sep=',',
                              header=True,
                              index=False)
        df_updated_trade.to_csv(os.path.join(start_path, 'alltrades.csv'),
                                sep=',',
                                header=True,
                                index=False)
Example #2
def main():
    pd.options.display.max_rows = 999999
    # report_attacks = [None]*10
    #shift_attacks = pd.read_csv(f'..\\results\\old\\cpc {2}-shifted_XORED_csv.csv')
    #wc_attacks = pd.read_csv(f'..\\results\\cpc {2}-wc_new_XORED_csv.csv')
    shift_attacks = pd.read_csv(
        f'..\\results\\old\\cpc {2}-shifted_XORED_csv.csv')[[
            'original_address', 'address'
        ]]
    for cpc in range(3, 11):
        try:
            shift_attacks = pd.concat([
                shift_attacks,
                pd.read_csv(
                    f'..\\results\\old\\cpc {cpc}-shifted_XORED_csv.csv')[[
                        'original_address', 'address'
                    ]]
            ],
                                      ignore_index=True)

            #shift_attacks = pd.concat([shift_attacks, pd.read_csv(f'..\\results\\cpc {x}-shifted_XORED_csv.csv')])
            #shift_attacks = pd.read_csv(f'..\\results\\cpc {x}-shifted_XORED_csv.csv')
            #wc_attacks = pd.concat([wc_attacks, pd.read_csv(f'..\\results\\cpc {x}-wc_new_XORED_csv.csv')])
            #shift_attacks = shift_attacks.drop_duplicates(['full_write_word', 'full_read_word','write_red_with_ADR','read_red_with_ADR','shift_err_detected'])
            #shift_attacks.to_csv(f'..\\results\\filtered cpc {x}-shifted_XORED_csv.csv')
            """wc_attacks = pd.read_csv(f'..\\results\\cpc {x}-wc_new_XORED_csv.csv')
            wc_attacks = (wc_attacks[wc_attacks['wc fault'] == True]).drop_duplicates(['full_write_word', 'full_read_word','write_red_with_ADR','read_red_with_ADR','wc_new_detected'])
            wc_attacks.to_csv(f'..\\results\\filtered cpc {x}-wc_new_XORED_csv.csv')"""
        except FileNotFoundError:
            print(f'cpc {cpc} does not have files..')
    shift_attacks.to_csv(f'..\\results\\old\\all_shifted_attacks.csv')
    """
Example #3
def get_spectra_from_file(file):
    extension = file.split('.')[-1].lower()
    if extension == 'mzml':
        raw_data = mzml_to_df(
            file)  #returns a dict of dataframes from an mzml file
    elif extension in ('h5', 'hdf5', 'hdf'):
        raw_data = mgd.df_container_from_metatlas_file(
            file)  #This is used when input is an hdf5 file
    else:
        raise ValueError('Unsupported file type: %s' % file)
    spectra = None
    if isinstance(raw_data['ms2_pos'], pd.DataFrame) and isinstance(
            raw_data['ms2_neg'],
            pd.DataFrame):  #it has both pos and neg spectra
        spectra = pd.concat([
            create_msms_dataframe(raw_data['ms2_pos']),
            create_msms_dataframe(raw_data['ms2_neg'])
        ])
    elif isinstance(raw_data['ms2_pos'], pd.DataFrame):
        spectra = create_msms_dataframe(raw_data['ms2_pos'])
    elif isinstance(raw_data['ms2_neg'], pd.DataFrame):
        spectra = create_msms_dataframe(raw_data['ms2_neg'])
    else:
        print('File has no MSMS data.')  #, file=sys.stderr)
        open(make_output_filename(file), 'a').close()  #make empty file

    return spectra
Example #4
def main():
    args = parse_args()
    #make sure the label hdf5 inputs are matched with prediction hdf5 inputs
    assert (len(args.labels_hdf5) == len(args.predictions_hdf5))
    num_datasets = len(args.labels_hdf5)
    sample_to_auprc = dict()
    for i in range(num_datasets):
        print(args.labels_hdf5[i])
        print(args.predictions_hdf5[i])
        cur_preds = pd.read_hdf(args.predictions_hdf5[i])
        cur_labels = pd.read_hdf(args.labels_hdf5[i])
        num_tasks = cur_preds.shape[1]
        if (num_tasks > 1) and (args.multitask == True):
            #score all the tasks
            for cur_task in range(num_tasks):
                task_labels = cur_labels[cur_task]
                task_preds = cur_preds[cur_task]
                cur_subset = pd.concat([task_labels, task_preds],
                                       axis=1).dropna()
                cur_subset.columns = ['labels', 'preds']
                task_name = colname_to_task_name[cur_task]
                cur_sample = args.labels_hdf5[i].replace(
                    '.labels.0', '') + '.' + task_name
                cur_auprc = average_precision_score(cur_subset['labels'],
                                                    cur_subset['preds'])
                sample_to_auprc[cur_sample] = cur_auprc
        elif (num_tasks > 1):
            #get the actual task column
            for key in task_name_to_colname:
                if key in args.labels_hdf5[i]:
                    #extract the corresponding column
                    cur_task_colname = task_name_to_colname[key]
                    cur_labels = cur_labels[cur_task_colname]
                    #assert the labels and predictions dataframes are matched
                    assert key in args.predictions_hdf5[i]
                    cur_preds = cur_preds[cur_task_colname]

                    cur_data = pd.concat((cur_preds, cur_labels), axis=1)
                    cur_data = cur_data.dropna()
                    cur_data.columns = ['preds', 'labels']
                    cur_auprc = average_precision_score(
                        cur_data['labels'], cur_data['preds'])
                    cur_sample = args.labels_hdf5[i].replace('.labels.0', '')
                    sample_to_auprc[cur_sample] = cur_auprc
        else:
            cur_data = pd.concat((cur_preds, cur_labels), axis=1).dropna()
            cur_data.columns = ['preds', 'labels']
            #pdb.set_trace()
            cur_auprc = average_precision_score(cur_data['labels'],
                                                cur_data['preds'])
            cur_sample = args.labels_hdf5[i].replace('.labels.0', '')
            sample_to_auprc[cur_sample] = cur_auprc
    print(sample_to_auprc)
    outf = open(args.outf + "/perf.metrics.txt", 'w')
    outf.write('Dataset\tauPRC\n')
    for key in sample_to_auprc:
        outf.write(key + '\t' + str(sample_to_auprc[key]) + '\n')
    outf.close()
Example #5
def add_new_country_schedule(filename):
    """Summary

    Args:
        filename (TYPE): Description

    Returns:
        TYPE: Description
    """
    df_holidays = reindex_holidays(filename)
    df_holidays = add_region_id(df_holidays)
    df_current_school_holidays = pd.read_csv(get_file_path(
        'data/school_holidays.csv', fileDir),
                                             parse_dates=['date'])
    df_current_school_holidays = pd.concat(
        [df_holidays, df_current_school_holidays], axis=0)
    df_current_school_holidays.to_csv('data/school_holidays.csv')

df_installed['count'] = np.ones(df_installed.shape[0])
df_group_exist = df_installed.groupby(
    ['userID', 'appID']).count().rename(columns={
        'count': 'inst_is_installed'
    }).reset_index()

df_group_app = df_installed.groupby('appID').count().rename(
    columns={
        'userID': 'inst_app_installed'
    }).reset_index()

df_train = pd.read_csv('df_basic_train.csv')
df_test = pd.read_csv('df_basic_test.csv')
df_result = pd.concat([df_train, df_test])

df_result = pd.merge(df_result,
                     df_percent,
                     how='left',
                     on=['userID', 'appCategory'])
df_result['inst_cate_percent'].fillna(0.0, inplace=True)  # proportion of installed apps in the same category
df_result['inst_cnt_installed'].fillna(0, inplace=True)

df_result = pd.merge(df_result,
                     df_group_exist,
                     how='left',
                     on=['userID', 'appID'])
df_result['inst_is_installed'].fillna(0, inplace=True)
del df_installed['count']
def main(args: Namespace):

    ratings = pd.read_feather(
        os.path.join(args.data_path, args.data_name + '_smaple'))
    user_num, item_num = ratings.uidx.max() + 1, ratings.iidx.max() + 1

    #df = pd.read_feather(os.path.join(args.sim_path, f'{args.prefix}_sim_full.feather'))
    tr_df = pd.read_feather(
        os.path.join(args.sim_path, f'{args.prefix}_sim_train.feather'))
    val_df = pd.read_feather(
        os.path.join(args.sim_path, f'{args.prefix}_sim_val.feather'))
    te_df = pd.read_feather(
        os.path.join(args.sim_path, f'{args.prefix}_sim_test.feather'))

    if args.tune_mode:
        tr_df = pd.concat([tr_df, val_df])
        te_df = te_df
    else:
        tr_df = tr_df
        te_df = val_df

    past_hist = tr_df.groupby('uidx').apply(lambda x: set(x.iidx)).to_dict()
    item_cnt_dict = tr_df.groupby('iidx').count().uidx.to_dict()
    item_cnt = np.array(
        [item_cnt_dict.get(iidx, 0) for iidx in range(item_num)])

    logger.info(f'test data size: {te_df.shape}')

    dim = args.dim

    rel_factor = FactorModel(user_num, item_num, dim)
    PATH = os.path.join(args.sim_path, f'{args.prefix}_rel.pt')
    rel_factor.load_state_dict(torch.load(PATH))
    rel_factor.eval()

    train_expo_factor = FactorModel(user_num, item_num, dim)
    PATH = os.path.join(args.sim_path, f'{args.prefix}_expo.pt')
    train_expo_factor.load_state_dict(torch.load(PATH))
    train_expo_factor.eval()

    train_expo_factor = NoiseFactor(train_expo_factor, args.dim)
    train_expo_factor = train_expo_factor.to(
        torch.device(f'cuda:{args.cuda_idx}'))
    train_expo_factor.load_state_dict(
        torch.load(os.path.join(args.sim_path,
                                f'{args.prefix}_expo_noise.pt')))
    train_expo_factor.eval()

    expo_factor = FactorModel(user_num, item_num, dim)
    PATH = os.path.join(args.sim_path, f'{args.prefix}_expo_bs.pt')
    expo_factor.load_state_dict(torch.load(PATH))
    expo_factor.eval()

    rating_model = RatingEstimator(user_num, item_num, rel_factor)
    expo_model = ClassRecommender(user_num, item_num, expo_factor)
    tr_mat = frame2mat(tr_df, user_num, item_num)
    val_mat = frame2mat(val_df, user_num, item_num)

    choices = args.models
    logging.info(f'Running {choices}')

    def get_model(model_str, user_num, item_num, factor_num):
        if model_str == 'mlp':
            return MLPRecModel(user_num, item_num, factor_num)
        elif model_str == 'gmf':
            return FactorModel(user_num, item_num, factor_num)
        elif model_str == 'ncf':
            return NCFModel(user_num, item_num, factor_num)
        else:
            raise NotImplementedError(f'{model_str} is not implemented')

    logging.info('-------The Popularity model-------')
    pop_factor = PopularModel(item_cnt)
    pop_model = PopRecommender(pop_factor)
    logger.info('unbiased eval for plain popular model on test')
    unbiased_eval(user_num,
                  item_num,
                  te_df,
                  pop_model,
                  epsilon=args.epsilon,
                  rel_model=rating_model,
                  past_hist=past_hist,
                  expo_model=expo_model,
                  expo_compound=args.p)

    logger.info('-------The SVD model---------')
    sv = SVDRecommender(tr_mat.shape[0], tr_mat.shape[1], dim)
    logger.info(f'model with dimension {dim}')
    sv.fit(tr_mat)
    logger.info('un-biased eval for SVD model on test')
    unbiased_eval(user_num,
                  item_num,
                  te_df,
                  sv,
                  epsilon=args.epsilon,
                  rel_model=rating_model,
                  past_hist=past_hist,
                  expo_model=expo_model,
                  expo_compound=args.p)

    def complete_experiment(model_str, user_num, item_num, dim):
        logging.info(f'-------The {model_str} model-------')
        base_factor = get_model(model_str,
                                user_num=user_num,
                                item_num=item_num,
                                factor_num=dim)
        base_model = ClassRecommender(user_num, item_num, base_factor)
        base_model.fit(tr_df,
                       num_epochs=args.epoch,
                       cuda=args.cuda_idx,
                       decay=1e-8,
                       num_neg=args.num_neg,
                       past_hist=past_hist,
                       lr=args.lr)
        logger.info(f'unbiased eval for {model_str}  model on test')
        unbiased_eval(user_num,
                      item_num,
                      te_df,
                      base_model,
                      epsilon=args.epsilon,
                      rel_model=rating_model,
                      past_hist=past_hist,
                      expo_model=expo_model,
                      expo_compound=args.p)

        logging.info(f'-------The {model_str} Pop Adjust model-------')
        pop_adjust_factor = get_model(model_str,
                                      user_num=user_num,
                                      item_num=item_num,
                                      factor_num=dim)
        pop_adjust_model = ClassRecommender(user_num,
                                            item_num,
                                            pop_adjust_factor,
                                            pop_factor,
                                            expo_thresh=0.1)
        pop_adjust_model.fit(tr_df,
                             num_epochs=args.epoch,
                             cuda=args.cuda_idx,
                             decay=args.decay,
                             num_neg=args.num_neg,
                             past_hist=past_hist,
                             lr=args.lr)
        logger.info(
            f'unbiased eval for adjust {model_str} with popular model on test')
        unbiased_eval(user_num,
                      item_num,
                      te_df,
                      pop_adjust_model,
                      epsilon=args.epsilon,
                      rel_model=rating_model,
                      past_hist=past_hist,
                      expo_model=expo_model,
                      expo_compound=args.p)
        del pop_adjust_factor

        logging.info(f'-------The {model_str} Mirror Adjust model-------')
        adjust_factor = get_model(model_str,
                                  user_num=user_num,
                                  item_num=item_num,
                                  factor_num=dim)
        adjust_model = ClassRecommender(user_num,
                                        item_num,
                                        adjust_factor,
                                        base_factor,
                                        expo_thresh=0.1)
        adjust_model.fit(tr_df,
                         num_epochs=args.epoch,
                         cuda=args.cuda_idx,
                         num_neg=args.num_neg,
                         past_hist=past_hist,
                         decay=args.decay,
                         lr=args.lr)

        logger.info(f'un-biased eval for {model_str} mirror adjusted model')
        unbiased_eval(user_num,
                      item_num,
                      te_df,
                      adjust_model,
                      epsilon=args.epsilon,
                      rel_model=rating_model,
                      past_hist=past_hist,
                      expo_model=expo_model,
                      expo_compound=args.p)
        del adjust_factor

        logger.info(f'-------The {model_str} Oracle Adjust model---------')
        oracle_factor = get_model(model_str,
                                  user_num=user_num,
                                  item_num=item_num,
                                  factor_num=dim)
        oracle_model = ClassRecommender(user_num,
                                        item_num,
                                        oracle_factor,
                                        train_expo_factor,
                                        expo_thresh=0.1,
                                        expo_compound=args.p)

        oracle_model.fit(tr_df,
                         num_epochs=args.epoch,
                         cuda=args.cuda_idx,
                         num_neg=args.num_neg,
                         past_hist=past_hist,
                         decay=args.decay,
                         lr=args.lr)

        logger.info('un-biased eval for oracle model on test')
        unbiased_eval(user_num,
                      item_num,
                      te_df,
                      oracle_model,
                      epsilon=args.epsilon,
                      rel_model=rating_model,
                      past_hist=past_hist,
                      expo_model=expo_model,
                      expo_compound=args.p)
        del oracle_factor

    for model_str in choices:
        if model_str != 'acgan':
            complete_experiment(model_str, user_num, item_num, dim)

    if 'acgan' in choices:
        logger.info('-------The AC GAN model---------')
        f = get_model(args.f_model, user_num, item_num, dim)
        g = get_model(args.g_model, user_num, item_num, dim)
        beta = BetaModel(user_num=user_num, item_num=item_num)
        f_recommender = ClassRecommender(user_num, item_num, f)
        g_recommender = ClassRecommender(user_num, item_num, g)
        g_recommender.fit(tr_df,
                          num_epochs=args.g_round_head,
                          cuda=args.cuda_idx,
                          num_neg=args.num_neg,
                          past_hist=past_hist,
                          decay=args.decay,
                          lr=args.lr)
        ac_train_v3(f,
                    False,
                    g,
                    False,
                    beta,
                    tr_df,
                    user_num=user_num,
                    item_num=item_num,
                    num_neg=args.num_neg,
                    past_hist=past_hist,
                    val_df=te_df,
                    rating_model=rating_model,
                    expo_model=expo_model,
                    num_epochs=args.epoch,
                    decay=args.decay,
                    cuda_idx=args.cuda_idx,
                    lr=args.lr,
                    g_weight=0.5,
                    expo_compound=args.p,
                    epsilon=args.epsilon)

        logger.info(f'eval on test with f_model ({args.f_model})')
        unbiased_eval(user_num,
                      item_num,
                      te_df,
                      f_recommender,
                      epsilon=args.epsilon,
                      rel_model=rating_model,
                      past_hist=past_hist,
                      expo_model=expo_model,
                      expo_compound=args.p)
        logger.info(f'eval on test with g_model ({args.g_model})')
        unbiased_eval(user_num,
                      item_num,
                      te_df,
                      g_recommender,
                      epsilon=args.epsilon,
                      rel_model=rating_model,
                      past_hist=past_hist,
                      expo_model=expo_model,
                      expo_compound=args.p)
Example #8
tea_counts = tea_counts.rename(columns = {'id' : 'counts'})

high_earners = df.groupby('category').wage.apply(lambda x: np.percentile(x, 75)).reset_index()
df.groupby(['location', 'Day of week'])['Total sales'].mean().reset_index()

#pivot table
df.pivot(columns = 'column to pivot',
		index = 'column to be row',
		values = 'column to be values')
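# A minimal sketch of pivot on made-up data (store / product / sales are
# illustrative names, not from these notes; assumes pandas is imported as pd)
sales_long = pd.DataFrame({'store': ['A', 'A', 'B', 'B'],
                           'product': ['tea', 'coffee', 'tea', 'coffee'],
                           'sales': [10, 20, 30, 40]})
# each unique 'product' value becomes a column, each 'store' value a row label
sales_wide = sales_long.pivot(index = 'store', columns = 'product', values = 'sales')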
		
#merge df
new_df = pd.merge(df1, df2)
new_df = df.merge(df1).merge(df3)

#concat
menu = pd.concat([df1, df2])
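# Hedged sketch: pd.concat takes one list of frames; ignore_index=True renumbers the
# stacked rows, while axis=1 would place them side by side (menu_renumbered is an
# illustrative name)
menu_renumbered = pd.concat([df1, df2], ignore_index=True)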

#merge left / right
#how = 'left'
#only rows from the left df will be kept

#merge inner / outer
df_new = pd.merge(df1, df2, how = 'outer')
#an outer merge keeps all rows from both dfs; unmatched cells are filled with NaN / None
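# Hedged sketch of how= with small made-up frames (orders_df / customers_df are
# illustrative names, not the real data used elsewhere in these notes)
orders_df = pd.DataFrame({'customer_id': [1, 2, 4], 'total': [10, 20, 30]})
customers_df = pd.DataFrame({'customer_id': [1, 2, 3], 'name': ['Ana', 'Bo', 'Cy']})
# how='left': every orders_df row is kept; customer_id 4 has no match, so name is NaN
left_merged = pd.merge(orders_df, customers_df, how = 'left', on = 'customer_id')
# how='outer': rows from both frames are kept; unmatched cells become NaN
outer_merged = pd.merge(orders_df, customers_df, how = 'outer', on = 'customer_id')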

#merge and change column name
pd.merge(
		orders,
		customers,
		left_on = 'customer_id',
		right_on = 'id')
    keras_fold_val = model.predict(X_val_list).ravel()
    keras_oof[valid_idx] += keras_fold_val / folds.n_splits
    keras_fold_auc = roc_auc_score(valid_y, keras_fold_val)
    keras_preds += model.predict(X_test_list).ravel() / folds.n_splits

    #######################CatBoost#########################################

    layer_name = 'batch_normalization_2'
    intermediate_layer_model = Model(
        inputs=model.input, outputs=model.get_layer(layer_name).output)

    X_train_k = pd.DataFrame(intermediate_layer_model.predict(X_train_list))
    X_val_k = pd.DataFrame(intermediate_layer_model.predict(X_val_list))
    X_test_k = pd.DataFrame(intermediate_layer_model.predict(X_test_list))
    tempdata = pd.concat((X_train_k, X_val_k), axis=0, ignore_index=True)
    encode_features += tempdata.values / folds.n_splits

    # cls.fit(X_train_k, train_y, eval_set=(X_val_k, valid_y),early_stopping_rounds=50,verbose=100,plot=False)

    # cls_fold_val=cls.predict_proba(X_val_k)[:,1]
    # cls_oof[valid_idx] += cls_fold_val/folds.n_splits
    # cls_fold_AUC=roc_auc_score(valid_y, cls_fold_val)
    # cls_preds +=cls.predict_proba(X_test_k)[:,1]/folds.n_splits

    ###########################Fold Results#########################

    print("\n")
    print('-' * 30)
    print('Fold {} - Keras_OOF = {}'.format(fold + 1, keras_fold_auc))
    # print('Fold {} - CatBoost_OOF = {}'.format(fold + 1,cls_fold_AUC))