Example #1
File: settings.py Project: sg-s/pyvocab
def build_distractor_db():
    bar = st.progress(0)

    words = read_df("words.csv")
    words = words["word"]

    distractors = []

    for i, word in enumerate(words):

        percent_complete = int(i / len(words) * 100)

        bar.progress(percent_complete)
        distractors.append(find_n_closest_words(word))
    bar.progress(100)

    distractors_db = dict()
    distractors_db["word"] = words
    distractors_db["distractors"] = distractors

    distractors_db = pd.DataFrame(distractors_db)

    st.write(distractors_db)

    save_df(distractors_db, "distractors.csv")
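The read_df and save_df calls above are small project helpers from sg-s/pyvocab that are not shown on this page (as is find_n_closest_words). A minimal sketch of what the two I/O helpers might look like, assuming plain CSV round-tripping with pandas; only the names come from the snippet, the bodies are guesses:

import pandas as pd

def read_df(fname):
    # Hypothetical helper: load a CSV file into a DataFrame.
    return pd.read_csv(fname)

def save_df(df, fname):
    # Hypothetical helper: write a DataFrame back out as CSV, without the index.
    df.to_csv(fname, index=False)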
Example #2
def extract(df_ori, des):
    print(df_ori.shape)
    df_ori = df_ori.merge(df_hist, on=['session_id'], how='left')
    df_ori = df_ori.merge(df_sid, on=['session_id'], how='left')
    print(df_ori.head(10))
    df_ori.columns = df_ori.columns.astype(str)
    utils.save_df(df_ori, des)
Example #3
def convert(df_ori, des):
    print(df_ori.shape)
    df_ori = df_ori.merge(df, on=['session_id', 'impressions'], how='left')
    df_ori['cur_ts_sub_last'] = df_ori['timestamp'] - df_ori['timestamp_x']
    df_ori.drop(['timestamp', 'timestamp_x'], axis=1, inplace=True)
    df_ori.columns = df_ori.columns.astype(str)
    utils.save_df(df_ori, des)
def extract(df_ori, des):
    print(df_ori.shape)
    df_ori = df_ori.merge(df, on=['session_id'], how='left')
    df_ori['last_act_gap'] = df_ori['timestamp'] - df_ori['timestamp_x']
    df_ori.drop(['timestamp', 'timestamp_x'], axis=1, inplace=True)
    print(df_ori.head(10))
    df_ori.columns = df_ori.columns.astype(str)
    utils.save_df(df_ori, des)
Example #5
def gen_tr_click(df):
    df = df[['session_id',
             'reference']].drop_duplicates(subset='session_id',
                                           keep='last').reset_index(drop=True)
    print(df.shape)
    df = df[pd.notnull(df.reference)].reset_index(drop=True)
    print(df.shape)
    utils.save_df(df, config.data + 'm3_tr_click.ftr')
Example #6
def convert(ori, des, feats):
    df_ori = utils.load_df(ori)
    for f in feats:
        tmp = utils.load_df(config.feat + 'm3_' + f)
        print(f)
        df_ori = pd.concat([df_ori, tmp.drop(['session_id', 'impressions'], axis=1)], axis=1)
    df_ori.columns = df_ori.columns.astype(str)
    utils.save_df(df_ori,des)
Example #7
    def snapshot(self):
        """Save a snapshot of the cluster
        """
        if self.add_t_snap:
            utils.save_df(self.cluster.withColumn("t", lit(float(self.t))),
                          f"t{self.t}", **self.save_params)
        else:
            utils.save_df(self.cluster, f"t{self.t}", **self.save_params)
Example #8
def extract(df_ori, des):
    print(df_ori.shape)
    df_ori = df_ori.merge(df_uid, on=['user_id'], how='left')
    df_ori = df_ori.merge(df_session, on=['session_id'], how='left')
    df_ori.drop('user_id', axis=1, inplace=True)
    print(df_ori.head())
    df_ori.columns = df_ori.columns.astype(str)
    utils.save_df(df_ori, des)
Example #9
def extract(df_ori, des):
    print(df_ori.shape)
    df_ori = df_ori.merge(df_meta, on=['impressions'], how='left')
    df_ori = df_ori.merge(df_feat, on=['impressions'], how='left')
    df_ori['item_price_div_median'] = df_ori['prices'] / df_ori['impressions_by_prices_median']
    df_ori['item_rank_sub_median'] = df_ori['impr_rank'] - df_ori['impressions_by_impr_rank_median']
    df_ori.drop(['impr_rank', 'prices'], axis=1, inplace=True)
    print(df_ori.head())
    df_ori.columns = df_ori.columns.astype(str)
    utils.save_df(df_ori, des)
Example #10
def convert(ori, des, feat):
    df_ori = utils.load_df(ori)
    print(df_ori.shape)
    df_feat = utils.load_df(config.feat + feat)
    df_ori = df_ori.merge(df_feat,
                          on=['session_id', 'impressions'],
                          how='left')
    print(df_ori.shape)
    df_ori.columns = df_ori.columns.astype(str)
    utils.save_df(df_ori, des)
Example #11
File: app.py Project: rajat-1994/DIF
def get_path(path, children):
    files = read_files(path)
    if path:
        children.append(html.P(f"files found : {len(files)}"))
        embedding = Embedding()
        embs = np.array(embedding.embeddings(files))
        matrix = similarity_matrix(embs, embs)
        index_pair = sort_matrix(matrix)
        np.save('index_pair.npy', index_pair)
        df = new_df(files)
        save_df(df, './files.csv')
        return children
    return []
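Embedding, similarity_matrix, and sort_matrix are DIF project helpers that this page does not show. A hedged sketch of the latter two, assuming cosine similarity between embedding rows and upper-triangle index pairs sorted by similarity (the names are from the snippet, the behavior is an assumption):

import numpy as np

def similarity_matrix(a, b):
    # Hypothetical: cosine similarity between every row of a and every row of b.
    a_norm = a / np.linalg.norm(a, axis=1, keepdims=True)
    b_norm = b / np.linalg.norm(b, axis=1, keepdims=True)
    return a_norm @ b_norm.T

def sort_matrix(matrix):
    # Hypothetical: index pairs (i, j) above the diagonal, most similar first.
    i, j = np.triu_indices_from(matrix, k=1)
    order = np.argsort(-matrix[i, j])
    return np.stack([i[order], j[order]], axis=1)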
Example #12
def convert(ori, des, prefix):
    df_ori = utils.load_df(ori)
    print(df_ori.shape)
    for feat in feats:
        df_feat = utils.load_df(config.model + prefix +
                                '%s.csv' % feat).rename(
                                    columns={'target': feat})
        df_ori = df_ori.merge(df_feat[['session_id', 'impressions', feat]],
                              on=['session_id', 'impressions'],
                              how='left')
        print(df_ori.shape)
    df_ori.columns = df_ori.columns.astype(str)
    utils.save_df(df_ori, des)
Example #13
def convert(ori, des, sample):
    tr = utils.load_df(ori)
    print(tr.shape)
    tr_out = tr[['session_id', 'impressions']]
    dfs = utils.load_df(sample)
    dfs['impr_rank'] = dfs.groupby(['session_id', 'step']).cumcount().values
    print(dfs.head())
    tr_out = cate_encoding.cate_num_stat(dfs, tr_out,
                                         ['session_id', 'impressions'],
                                         'impr_rank', ['min', 'max', 'median'])
    tr_out.columns = tr_out.columns.astype(str)
    print(tr_out.head())
    utils.save_df(tr_out, des)
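cate_encoding.cate_num_stat is a project utility that is not reproduced on this page. A plausible minimal version, assuming it computes groupby aggregates of a numeric column and left-joins them onto a target frame using the '{group}_by_{column}_{agg}' naming visible in the other examples (the body below is an assumption, not the project's code):

import pandas as pd

def cate_num_stat(df_src, df_dst, group_cols, num_col, aggs):
    # Hypothetical: aggregate num_col per group and merge the stats onto df_dst.
    stats = df_src.groupby(group_cols)[num_col].agg(aggs).reset_index()
    stats.columns = group_cols + [
        '{}_by_{}_{}'.format('_'.join(group_cols), num_col, agg) for agg in aggs
    ]
    return df_dst.merge(stats, on=group_cols, how='left')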
Example #14
def convert(ori, des, feats):
    df_ori = utils.load_df(ori)
    print(df_ori.shape)
    for feat in feats:
        df_feat = utils.load_df(config.feat + feat)
        df_ori = df_ori.merge(df_feat,
                              on=['session_id', 'impressions'],
                              how='left')
        print(df_ori.shape)
        del df_feat
        gc.collect()
    df_ori = utils.reduce_mem(df_ori)
    df_ori.columns = df_ori.columns.astype(str)
    utils.save_df(df_ori, des)
def convert(df_ori, des):
    print(df_ori.shape)
    df_ori = df_ori.merge(df_last, on=['session_id'], how='left')
    df_ori['last_item_rank_diff'] = df_ori['impr_rank'] - df_ori['last_item_impr_rank']
    df_ori['last_item_price_div'] = df_ori['prices'] / df_ori['last_item_price']
    df_ori.drop(['last_item_impr_rank', 'last_item_price', 'prices', 'impr_rank'],
                axis=1,
                inplace=True)
    df_ori.columns = df_ori.columns.astype(str)
    print(df_ori.head())
    utils.save_df(df_ori, des)
Example #16
def convert(ori, des, feat):
    df_ori = utils.load_df(ori)
    print(df_ori.shape)
    for c in cols:
        df_ori = cate_encoding.cate_num_rank(df_ori, ['session_id'],
                                             c,
                                             ascending=True,
                                             show_agg=True)
    df_ori = df_ori.reset_index(drop=True)
    df_ori.columns = df_ori.columns.astype(str)
    utils.save_df(df_ori, des)
    utils.save_df(
        df_ori[[
            'session_id', 'impressions', 'session_id_by_prices_rank',
            'session_id_by_ctr_rank', 'session_id_by_last_ts_sub_max_rank'
        ]], feat)
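cate_encoding.cate_num_rank presumably ranks a numeric column within each group and stores the result under a '{group}_by_{column}_rank' name, which would match 'session_id_by_prices_rank' above. A hedged sketch under that assumption:

def cate_num_rank(df, group_cols, num_col, ascending=True, show_agg=True):
    # Hypothetical: per-group rank of num_col, e.g. 'session_id_by_prices_rank'.
    name = '{}_by_{}_rank'.format('_'.join(group_cols), num_col)
    if show_agg:
        print(name)
    df[name] = df.groupby(group_cols)[num_col].rank(ascending=ascending, method='dense')
    return df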
Example #17
File: app.py Project: rajat-1994/DIF
def display_image(children, nc1, nc2):
    global SESSION_ID
    changed_id = [p['prop_id'] for p in dash.callback_context.triggered][0]
    print("changed_id", changed_id)
    print("clicks", children)
    print(f"cccc {nc1,nc2}")
    if children != []:
        # Reading files
        index_pair = np.load('index_pair.npy')
        files_path = pd.read_csv('./files.csv')
        # Checking which button is clicked and update Session id
        update_session(index_pair, files_path, changed_id)
        # Reading images and filename
        image1, image2, filename1, filename2 = encoded_images(
            index_pair, files_path)
        # Saving updated files dataframe
        save_df(files_path)

        return image1, image2, filename1, filename2
    return "", "", "", ""
Example #18
def get_test_sample(df):
    df['target'] = (df['reference'] == df['impressions']).astype(int)
    # drop noisy sample
    mask = (df.session_id == 'cbe3752713eee') & (df.timestamp == 1541660358)
    df = df[~mask]
    df_session = df[['session_id', 'step'
                     ]].drop_duplicates(subset='session_id',
                                        keep='last').reset_index(drop=True)
    df = df_session.merge(df, on=['session_id', 'step'],
                          how='left').reset_index(drop=True)
    te = df[pd.isnull(df['reference'])].reset_index(drop=True)
    print(te.shape)
    tr = df[pd.notnull(df['reference'])].reset_index(drop=True)
    print(tr.shape)
    tr.drop(['current_filters', 'reference', 'action_type'],
            axis=1,
            inplace=True)
    te.drop(['current_filters', 'reference', 'action_type', 'target'],
            axis=1,
            inplace=True)
    utils.save_df(te, config.data + 'm3_te.ftr')
    return tr, te
def extract(sample, ori, feat):
    nrows = None
    df = pd.read_csv(
        sample,
        nrows=nrows,
        usecols=['session_id', 'step', 'reference', 'impressions'])
    print(df.head())
    df_ori = utils.load_df(ori)
    print(df_ori.head())
    df = df.merge(df_ori[['session_id', 'step']].drop_duplicates(),
                  on='session_id',
                  how='left')
    print(df.head())
    df = df[df.step_x < df.step_y]

    tmp = df.drop_duplicates(subset=['session_id', 'step_x'])
    df_clk = tmp.groupby(['session_id',
                          'reference'])['step_x'].agg('count').reset_index()
    print(df_clk.head())
    df_clk.rename(columns={
        'reference': 'impressions',
        'step_x': 'item_sid_clk_cnt'
    },
                  inplace=True)
    df_impr = df.groupby(['session_id',
                          'impressions'])['step_x'].agg('count').reset_index()
    print(df_impr.head())
    df_impr.rename(columns={'step_x': 'item_sid_impr_cnt'}, inplace=True)

    df_out = df_ori[['session_id', 'impressions']]
    df_out = df_out.merge(df_clk, on=['session_id', 'impressions'], how='left')
    df_out = df_out.merge(df_impr,
                          on=['session_id', 'impressions'],
                          how='left')
    print(df_out.head())
    df_out.columns = df_out.columns.astype(str)
    utils.save_df(df_out, feat)
Example #20
    def diag(self):
        """Save diagnostic information about the cluster energy
        """

        T, U = cluster.calc_T(self.cluster,
                              self.G), cluster.calc_U(self.cluster, self.G)
        E = T + U

        if not self.E_initial:
            self.E_initial = E

        dE = (E - self.E_initial) / self.E_initial
        diagInfo = (self.t, E, dE)

        if self.saveDiag:
            # necessary to match the spark schema
            diagInfo = [float(x) for x in diagInfo]
            df_diag = self.spark.createDataFrame([diagInfo],
                                                 schema=schemas.diag)

            utils.save_df(df_diag, f"diag_t{self.t}", **self.save_params)
        else:
            print("{: >30} {: >30} {: >30}".format(*("t", "E", "dE")))
            print("{: >30} {: >30} {: >30}".format(*diagInfo))
Example #21
def dump_feat(ori, des):
    df = utils.load_df(ori)
    df = df[cols + ['session_id', 'impressions']]
    df.columns = re_cols + ['session_id', 'impressions']
    print(df.shape)
    utils.save_df(df, des)
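Most snippets on this page pass '.ftr' (feather) paths to utils.load_df and utils.save_df, while a few pass '.csv' paths with options such as nrows. A plausible pair of wrappers under that assumption; this is a sketch, not the project's actual utils module:

import pandas as pd

def load_df(path, **kwargs):
    # Hypothetical: feather for .ftr files, CSV otherwise (kwargs such as nrows pass through).
    if path.endswith('.ftr'):
        return pd.read_feather(path)
    return pd.read_csv(path, **kwargs)

def save_df(df, path):
    # Hypothetical: feather for .ftr files, CSV otherwise.
    if path.endswith('.ftr'):
        df.reset_index(drop=True).to_feather(path)
    else:
        df.to_csv(path, index=False)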
Example #22
import pandas as pd
import sys
import utils
import config
import cate_encoding

tr = utils.load_df(config.data+'m3_tr.ftr')
te = utils.load_df(config.data+'m3_te.ftr')
df = pd.concat([tr,te]).reset_index(drop=True)
df['dt'] = pd.to_datetime(df['timestamp'], unit='s')
df['hour'] = df['dt'].dt.hour

cols = ['city','device','platform']
for c in cols:
    df = cate_encoding.label_encode(df, c)

# impr rank 
df['impr_rank'] = df.groupby(['session_id']).cumcount().values
# price statistics by session
df = cate_encoding.cate_num_stat(df,df,['session_id'],'prices',['median','std','count'])

df['price_sub'] = df['prices'] - df['session_id_by_prices_median']
df['price_div'] = df['prices'] / df['session_id_by_prices_median']
df.drop(['dt'],axis=1,inplace=True)
df.columns = df.columns.astype(str)

utils.save_df(df[pd.isnull(df['target'])].reset_index(drop=True), config.feat+'m3_te_0.ftr')
utils.save_df(df[pd.notnull(df['target'])].reset_index(drop=True),config.feat+'m3_tr_0.ftr')
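cate_encoding.label_encode above is also project code that this page omits; a minimal sketch, assuming it simply replaces a categorical column with integer codes:

def label_encode(df, col):
    # Hypothetical: map each category of col to an integer code in place.
    df[col] = df[col].astype('category').cat.codes
    return df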
Example #23
args = parser.parse_args()
"""/arguments"""

G = 1
TOLERANCE = 1e-04
res = []
data = [
    'c_0500.csv', 'c_0700.csv', 'c_0600.csv', 'c_1000.csv', 'c_0900.csv',
    'c_1200.csv', 'c_1100.csv', 'c_1500.csv', 'c_0300.csv', 'c_1800.csv',
    'c_1300.csv', 'c_0800.csv', 'c_1700.csv', 'c_0200.csv', 'c_0100.csv',
    'c_0400.csv', 'c_0000.csv', 'c_1600.csv', 'c_1400.csv'
]

for fname in data:
    df = utils.load_df(os.path.join(args.input, fname),
                       schema=schemas.clust,
                       part="id")
    e = cluster.calc_E(df)
    diff = abs(e - (-0.25))
    res.append([
        fname,
        e,
        -0.25,
        diff,
    ])

sc = SparkContext.getOrCreate()
res = sc.parallelize(res).toDF(schema=schemas.E_test_res)

utils.save_df(res, "E_TEST", args.outputDir, fformat="csv")
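In the Spark-based examples (#7, #20, #23), save_df receives an output name, an output directory, and a fformat keyword, so it likely wraps Spark's DataFrameWriter. A hedged sketch under that assumption; the parameter names mirror the call in Example #23, but the body is guessed:

import os

def save_df(df, name, outputDir, fformat="parquet"):
    # Hypothetical: write a Spark DataFrame to outputDir/name in the requested format.
    df.write.format(fformat).mode("overwrite").save(os.path.join(outputDir, name))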
Example #24
for cv in range(5):
    mask = (df.cv == cv)
    val_tr = df.loc[mask][['session_id', 'impressions']].drop_duplicates()
    tra_tr = df.loc[~mask]
    tmp = tra_tr.groupby('impressions')['click'].agg(['sum',
                                                      'count']).reset_index()
    val_tr = val_tr.merge(tmp, on='impressions', how='left')
    tr_lis.append(val_tr)

tr_ctr = pd.concat(tr_lis, axis=0).reset_index(drop=True)
tr_ctr['ctr'] = tr_ctr['sum'] / tr_ctr['count']
te_ctr['ctr'] = te_ctr['sum'] / te_ctr['count']

trs = utils.load_df(config.feat + 'm3_tr_0.ftr')
tes = utils.load_df(config.feat + 'm3_te_0.ftr')

tr_out = trs[['session_id', 'impressions']]
te_out = tes[['session_id', 'impressions']]

te_out = te_out.merge(te_ctr.drop(['sum', 'count'], axis=1),
                      on=['impressions'],
                      how='left')
tr_out = tr_out.merge(tr_ctr.drop(['sum', 'count'], axis=1),
                      on=['session_id', 'impressions'],
                      how='left')

tr_out.columns = tr_out.columns.astype(str)
te_out.columns = te_out.columns.astype(str)
utils.save_df(tr_out, config.feat + 'm3_tr_ctr.ftr')
utils.save_df(te_out, config.feat + 'm3_te_ctr.ftr')
Example #25
df = pd.concat([tr, te])

trs = utils.load_df(config.feat + 'm3_tr_0.ftr')
tes = utils.load_df(config.feat + 'm3_te_0.ftr')

tr_out = trs[['session_id', 'impressions']]
te_out = tes[['session_id', 'impressions']]

for act in actions:
    tmp = df[df.action_type == act][['reference', 'user_id']]
    tmp = tmp.groupby(['reference'])['user_id'].agg(['count',
                                                     'nunique']).reset_index()
    tmp.rename(columns={
        'reference': 'impressions',
        'count': act + '_pv',
        'nunique': act + '_uv'
    },
               inplace=True)
    tmp['impressions'] = tmp['impressions'].astype(str)
    num_index = tmp['impressions'].str.isnumeric()
    tmp = tmp[num_index]
    tmp['impressions'] = tmp['impressions'].astype('int')
    print(tmp.head())
    tr_out = tr_out.merge(tmp, on=['impressions'], how='left')
    te_out = te_out.merge(tmp, on=['impressions'], how='left')

tr_out.columns = tr_out.columns.astype(str)
te_out.columns = te_out.columns.astype(str)
utils.save_df(tr_out, config.feat + 'm3_tr_item_act_pv.ftr')
utils.save_df(te_out, config.feat + 'm3_te_item_act_pv.ftr')
def convert(df_ori, des, df_out):
    print(df_ori.shape)
    df_ori = df_ori.merge(df_out, on=['session_id', 'impressions'], how='left')
    df_ori.columns = df_ori.columns.astype(str)
    print(df_ori.head())
    utils.save_df(df_ori, des)
Example #27
def ETL(extractor,
        components,
        data_dict,
        same_dt_aggregator,
        hdf5_fname=None,
        joined_path=None,
        hadm_ids=ALL,
        use_base_df=True,
        to_pandas=False,
        chunksize=500000):

    logger.log('***ETL***', new_level=True)
    logger.log('SETUP', new_level=True)

    category_map = mimic_category_map(data_dict)
    ureg = units.MedicalUreg()

    transformer = transform_pipeline()

    standard_clean_pipeline = Pipeline([
        ('aggregate_same_datetime', same_dt_aggregator),
        ('split_dtype', transformers.split_dtype()),
        ('standardize_columns',
         transformers.column_standardizer(data_dict, ureg)),
        ('standardize_categories',
         transformers.standardize_categories(data_dict, category_map)),
        ('split_bad_categories', transformers.split_bad_categories(data_dict)),
        # ('one_hotter',transformers.nominal_to_onehot()),
        ('drop_oob_values', transformers.oob_value_remover(data_dict))
    ])

    should_save = (hdf5_fname is not None)

    df_base = None

    if should_save and use_base_df:
        try:
            df_base = utils.open_df(hdf5_fname, joined_path)
        except Exception:
            # no existing joined DF in the store yet; rebuild from scratch
            pass

    if df_base is not None:

        existing_components = df_base.columns.get_level_values(
            column_names.COMPONENT).unique().tolist()
        existing_ids = set(
            df_base.index.get_level_values(column_names.ID).tolist())
        requested_ids = hadm_ids if hadm_ids != ALL else get_all_hadm_ids()

        new_ids = [ID for ID in requested_ids if ID not in existing_ids]

        #case 1: new ids in existing columns, don't try to be smart with ALL unless not a lot of IDs
        if len(new_ids) > 0:
            df_addition = ETL(extractor,
                              existing_components,
                              data_dict,
                              same_dt_aggregator,
                              hadm_ids=new_ids,
                              to_pandas=True)
            if df_addition is not None:
                df_base = pd.concat([df_base, df_addition])
            #now we only need to load NEW components
            components = [
                comp for comp in components if comp not in existing_components
            ]

        logger.log('Base DF to Dask')
        df_base = dd.from_pandas(df_base.reset_index(), chunksize=chunksize)

    df_all = df_base

    logger.log('BEGIN ETL for {} admissions and {} components: {}'.format(
        hadm_ids if hadm_ids == ALL else len(hadm_ids), len(components),
        components),
               new_level=True,
               end_level=True)
    for component in components:
        logger.log('{}: {}/{}'.format(component.upper(),
                                      components.index(component) + 1,
                                      len(components)),
                   new_level=True)
        """
        @@@@@@@@@@@@@@@
        ----EXTRACT----
        @@@@@@@@@@@@@@@
        """

        logger.log("Extracting...", new_level=True)
        df_extracted = extractor.extract_component(component, hadm_ids)

        if df_extracted.empty:
            print 'EMPTY Dataframe EXTRACTED for {}, n={} ids'.format(
                component, len(hadm_ids))
            logger.end_log_level()
            continue

        if should_save:
            logger.log('Save EXTRACTED DF = {}'.format(df_extracted.shape))
            utils.save_df(df_extracted, hdf5_fname,
                          'extracted/{}'.format(component))
        logger.end_log_level()
        """
        @@@@@@@@@@@@@@@@@
        ----TRANSFORM----
        @@@@@@@@@@@@@@@@@
        """

        logger.log("Transforming... {}".format(df_extracted.shape),
                   new_level=True)
        transformer.set_params(add_level__level_val=component)
        df_transformed = transformer.transform(df_extracted)

        print 'Data Loss (Extract > Transformed):', utils.data_loss(
            df_extracted.set_index(column_names.ID).value.to_frame(),
            df_transformed)

        if df_transformed.empty:
            print 'EMPTY Dataframe TRANSFORMED for {}, n={} ids'.format(
                component, len(hadm_ids))
            logger.end_log_level()
            continue

        if should_save:
            logger.log('Save TRANSFORMED DF = {}'.format(df_transformed.shape))
            utils.save_df(df_transformed, hdf5_fname,
                          'transformed/{}'.format(component))
        logger.end_log_level()
        """
        @@@@@@@@@@@@@@@
        -----CLEAN-----
        @@@@@@@@@@@@@@@
        """

        logger.log("Cleaning... {}".format(df_transformed.shape),
                   new_level=True)
        df = standard_clean_pipeline.transform(df_transformed)

        print 'Data Loss (Extract > Cleaned):', utils.data_loss(
            df_extracted.set_index(column_names.ID).value.to_frame(), df)

        if df.empty:
            print 'EMPTY Dataframe CLEANED for {}, n={} ids'.format(
                component, len(hadm_ids))
            logger.end_log_level()
            continue

        if should_save:
            logger.log('Save CLEANED DF = {}'.format(df.shape))
            utils.save_df(df, hdf5_fname, 'cleaned/{}'.format(component))
        logger.end_log_level()

        del df_extracted, df_transformed

        logger.log('Filter & sort - {}'.format(df.shape))

        df.sort_index(inplace=True)
        df.sort_index(inplace=True, axis=1)

        logger.log('Convert to dask - {}'.format(df.shape))
        df_dask = dd.from_pandas(df.reset_index(), chunksize=chunksize)
        del df

        logger.log('Join to big DF')

        if df_all is None: df_all = df_dask
        else:
            df_all = df_all.merge(df_dask, how='outer', on=['id', 'datetime'])
            del df_dask

        logger.end_log_level()
    logger.end_log_level()

    if df_all is None or not to_pandas:
        logger.end_log_level()
        return df_all

    logger.log('Dask DF back to pandas')
    df_pd = df_all.compute()
    del df_all
    df_pd.set_index(['id', 'datetime'], inplace=True)

    logger.log('SORT Joined DF')
    df_pd.sort_index(inplace=True)
    df_pd.sort_index(inplace=True, axis=1)

    if should_save:
        logger.log('SAVE Big DF')
        utils.save_df(df_pd, hdf5_fname, joined_path)
    logger.end_log_level()

    return df_pd
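In this MIMIC ETL example, save_df and open_df take an HDF5 filename plus a key such as 'extracted/{component}', so they presumably wrap pandas' HDFStore. A hedged sketch under that assumption (the real implementation is not shown on this page):

import pandas as pd

def save_df(df, hdf5_fname, key):
    # Hypothetical: persist df under `key` inside the HDF5 store (requires PyTables).
    df.to_hdf(hdf5_fname, key=key, mode='a')

def open_df(hdf5_fname, key):
    # Hypothetical: load the table stored under `key`.
    return pd.read_hdf(hdf5_fname, key=key)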
Example #28
    tr.drop(['current_filters', 'reference', 'action_type'],
            axis=1,
            inplace=True)
    te.drop(['current_filters', 'reference', 'action_type', 'target'],
            axis=1,
            inplace=True)
    utils.save_df(te, config.data + 'm3_te.ftr')
    return tr, te


def gen_tr_click(df):
    df = df[['session_id',
             'reference']].drop_duplicates(subset='session_id',
                                           keep='last').reset_index(drop=True)
    print(df.shape)
    df = df[pd.notnull(df.reference)].reset_index(drop=True)
    print(df.shape)
    utils.save_df(df, config.data + 'm3_tr_click.ftr')


if __name__ == '__main__':
    nrow = None
    train = utils.load_df(config.data + 'sample_train.csv', nrows=nrow)
    test = utils.load_df(config.data + 'sample_test.csv', nrows=nrow)
    df = pd.concat([train, test]).reset_index(drop=True)
    tr1 = gen_train_sample(train)
    tr2, te = get_test_sample(test)
    tr = pd.concat([tr1, tr2]).reset_index(drop=True)
    utils.save_df(tr1, config.data + 'm3_tr.ftr')
    gen_tr_click(df)