Example #1
def read_and_translate(filename, destname):
    print('>> reading', filename)
    if READMODE != '.feather':
        df = read_file(filename)
    else:
        df = mlc.load(filename)

    print_memory()
    df = drop_to_save_memory(df)
    print_memory()
    print(df.head(5))

    for feature in df:
        df = desc_missing(df, feature)

    df_translated = df
    for feature in CAT_TRANSLATE:
        print('>> doing', feature)
        df_translated = translate_col_to_en(df_translated, feature)

    mlc.save(df_translated, destname)  # DataFrames can be saved in the ultra-fast feather format.
    del df_translated
    gc.collect()
    df_translated = mlc.load(destname)
    print(df_translated.head())
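
The helper `translate_col_to_en` is not shown in this example; a minimal sketch of what it might look like, following the pickled ru-to-en dictionary pattern visible in Example #6 (the dict path and the extra `which_dataset` parameter are assumptions):

import pickle

def translate_col_to_en(df, feature, which_dataset='train'):
    # Hypothetical sketch mirroring Example #6: look each value up in a
    # pre-built ru->en dict and write it to a new '<feature>_en' column.
    dstname = '../dict/dict_ru_to_en_{}_{}.pickle'.format(which_dataset, feature)
    with open(dstname, 'rb') as handle:
        map_dict = pickle.load(handle)
    # .map leaves NaN for values missing from the dict instead of raising
    df[feature + '_en'] = df[feature].map(map_dict)
    return df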
Example #2
    def prepare_data(self):
        if os.path.exists(
                self.section.get('train_file_tmp')) and os.path.exists(
                    self.section.get('test_file_tmp')):
            train_data = mlc.load(self.section.get('train_file_tmp'))
            test_data = mlc.load(self.section.get('test_file_tmp'))
        else:
            train_data, test_data = self.feature_more()

        train_data = train_data.drop('ip', axis=1)
        self.len_train = train_data.shape[0]
        self.len_test = test_data.shape[0]
        self.sample_cols = train_data.columns.values[sample_match(
            train_data.columns.values)].tolist()
        # self.embedding_cols = train_data.columns.difference(self.sample_cols).difference([self.label]).tolist()
        self.embedding_cols = train_data.columns.difference(
            [self.label, 'click_id']).tolist()

        for col in self.embedding_cols:
            self.col_max[col] = max(train_data[col].max(),
                                    test_data[col].max()) + 1
        # self.col_max['hour'] = 24
        # self.col_max['day'] = 31
        # self.col_max['wday'] = 7

        return train_data, test_data
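
`col_max` records each categorical column's maximum id across train and test plus one, which is the standard way to size embedding vocabularies so every id fits. A minimal sketch of how it could be consumed (PyTorch here; `build_embeddings` and the `embedding_dim` default are assumptions, not part of the original class):

import torch.nn as nn

def build_embeddings(col_max, embedding_cols, embedding_dim=8):
    # Hypothetical: one embedding table per categorical column, with the
    # vocabulary size taken from col_max (max id + 1).
    return nn.ModuleDict({
        col: nn.Embedding(num_embeddings=col_max[col],
                          embedding_dim=embedding_dim)
        for col in embedding_cols
    })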
Example #3
def check_if_all_translated():
    for name in ['train_translated', 'train_active_translated',
                 'test_translated', 'test_active_translated']:
        print('--------------------------------------------------------------------------')
        debug = DEBUG
        if debug:
            dstname = '../input/debug{}/{}_debug{}.feather'.format(debug, name, debug)
        else:
            dstname = '../input/{}.feather'.format(name)
        t_start = time.time()
        print('>> loading', dstname)
        df = mlc.load(dstname)
        print('no. of rows:', len(df))
        t_end = time.time()
        print('loading time:', t_end - t_start)
        for feature in CAT_TRANSLATE:
            print('>> doing', feature)
            list_not_translated = []
            count_not_translated = 0
            for index, row in df.iterrows():
                if index % 100000 == 0:
                    print(index, '/', len(df))
                if row[feature] == row[feature + '_en']:
                    count_not_translated += 1
                    list_not_translated.append(row[feature])

            print('feature {} not translated {}/{}'.format(feature, count_not_translated, len(df)))
            list_not_translated_unique, count_not_translated_unique = find_unique_element_and_count(list_not_translated)
            print('list not translated', list_not_translated_unique)
            print('count not translated', count_not_translated_unique)
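
`find_unique_element_and_count` is defined elsewhere; a trivial stand-in with `collections.Counter` that matches how its two return values are printed above (the exact return types are an assumption):

from collections import Counter

def find_unique_element_and_count(values):
    # Hypothetical helper: the distinct values and how often each occurs.
    counter = Counter(values)
    return list(counter.keys()), list(counter.values())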
Example #4
def write_debug_mode_csv():
    for name in ['train', 'test', 'train_active', 'test_active',
                 'periods_train', 'periods_test', 'train_translated',
                 'train_active_translated', 'test_translated', 'test_active_translated']:
        for debug in [1, 2]:
            print('----------------------------------------------------------------')
            dstname = '../input/{}.feather'.format(name)
            t_start = time.time()
            print('>> loading', dstname)
            df = mlc.load(dstname)
            print('no. of rows:', len(df))
            print(df.head())
            # del df; gc.collect()
            t_end = time.time()
            print('loading time:', t_end - t_start)

            savename = '../input/debug{}/{}_debug{}.csv'.format(debug, name, debug)
            if debug == 1:
                df_extracted = df.sample(frac=0.1, random_state=SEED)
            else:
                df_extracted = df.sample(frac=0.001, random_state=SEED)
            print('no. of rows:', len(df_extracted))
            print('>> saving to', savename)
            df_extracted.to_csv(savename, index=False)
            print('done')
Example #5
def test_load_time():
    for name in [
            'train', 'test', 'train_active', 'test_active', 'periods_train',
            'periods_test', 'train_translated', 'train_active_translated',
            'test_translated', 'test_active_translated'
    ]:
        print(
            '----------------------------------------------------------------')
        dstname = '../input/{}.feather'.format(name)
        t_start = time.time()
        print('>> loading', dstname)
        df = mlc.load(dstname)
        print('no. of rows:', len(df))
        print(df.head())
        del df
        gc.collect()
        t_end = time.time()
        print('loading time:', t_end - t_start)
Example #6
def read_and_translate(filename, which_dataset):
    print('>> reading', filename)
    if READMODE != '.feather':
        df = read_file(filename)
    else:
        df = mlc.load(filename)
    # for feature in df:
    #     df = desc_missing(df,feature)
    df_translated = df
    for feature in CAT_TRANSLATE:
        print('>> translating', feature)
        dstname = '../dict/dict_ru_to_en_{}_{}.pickle'.format(
            which_dataset, feature)
        map_dict = pickle.load(open(dstname, "rb"))
        # map_dict['n/a'] = 'n/a'
        new_feature = feature + '_en'
        df_translated[new_feature] = df[feature].apply(lambda x: map_dict[x])

    return df_translated
Example #7
def read_and_translate(filename, destname):
    print('>> reading...')
    df = read_file(filename)
    # df.head(5)

    for feature in df:
        df = desc_missing(df, feature)

    df_translated = df
    for feature in CAT_TRANSLATE:
        print('>> doing', feature)
        df_translated = translate_col_to_en(df_translated, feature)

    mlc.save(df_translated, destname)  # DataFrames can be saved in the ultra-fast feather format.
    del df_translated
    gc.collect()
    df_translated = mlc.load(destname)
    print(df_translated.head())
Example #8
def read_and_build_dict(filename, which_dataset, from_iloc):
    print('>> reading', filename)
    if READMODE != '.feather':
        df = read_file(filename)
    else:
        df = mlc.load(filename)

    print_memory()
    df = drop_to_save_memory(df)
    print_memory()
    print(df.head(5))

    for feature in df:
        df = desc_missing(df, feature)

    df_translated = df
    for feature in CAT_TRANSLATE:
        print('>> doing', feature)
        translate_col_and_save(df_translated, feature, which_dataset,
                               from_iloc)
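
`translate_col_and_save` itself is not shown. Given the per-chunk pickle files that Example #9 merges and the TextBlob usage suggested by Example #10, a plausible reconstruction follows; the TextBlob `translate` call (removed in newer TextBlob releases), the resume meaning of `from_iloc`, and the absence of error handling are all assumptions:

import os
import pickle
from textblob import TextBlob

def translate_col_and_save(df, feature, which_dataset, from_iloc):
    # Hypothetical: translate the column's unique values in chunks of
    # NCHUNKS and pickle each chunk separately, so a crash only loses
    # one chunk. Example #9 merges these part files into a single dict.
    unique_element = df[feature].unique()
    for k, start in enumerate(range(0, len(unique_element), NCHUNKS)):
        if k < from_iloc:
            continue  # resume past chunks finished in an earlier run
        savename = '../dict_part/translated_{}_{}_{}.pickle'.format(
            which_dataset, feature, k)
        if os.path.exists(savename):
            continue
        chunk = unique_element[start:start + NCHUNKS]
        map_temp = {
            x: str(TextBlob(str(x)).translate(from_lang='ru', to='en'))
            for x in chunk
        }
        with open(savename, 'wb') as handle:
            pickle.dump(map_temp, handle, protocol=pickle.HIGHEST_PROTOCOL)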
Example #9
def read_and_build_map(filename, which_dataset):
    print('>> reading', filename)
    if READMODE != '.feather':
        df = read_file(filename)
    else:
        df = mlc.load(filename)

    # for feature in df:
    #     df = desc_missing(df,feature)

    for feature in CAT_TRANSLATE:
        map_dict = dict()
        dstname = '../dict/dict_ru_to_en_{}_{}.pickle'.format(
            which_dataset, feature)
        print('>> doing', feature)
        unique_element = df[feature].unique()
        num_split = len(range(0, len(unique_element), NCHUNKS))
        print_memory()
        is_cont = True
        if os.path.exists(dstname):
            print('done already')
        else:
            for k in range(num_split):
                if is_cont:
                    savename = '../dict_part/translated_{}_{}_{}.pickle'.format(
                        which_dataset, feature, k)
                    if os.path.exists(savename):
                        print('loading', savename)
                        map_temp = pickle.load(open(savename, "rb"))
                        print('updating map')
                        map_dict.update(map_temp)
                    else:
                        print('missing', savename, '. Please check!!')
                        is_cont = False
            if is_cont:
                print('saving final dict to', dstname)
                with open(dstname, 'wb') as handle:
                    pickle.dump(map_dict,
                                handle,
                                protocol=pickle.HIGHEST_PROTOCOL)
Example #10
def translate_textblob():
    print('--------------------------------------------------------------------------')
    debug = DEBUG
    if debug == 3:
        debug = 2
    name = DATASET
    if debug:
        dstname = '../input/debug{}/{}_debug{}.feather'.format(debug, name, debug)
        savename = '../input/debug{}/{}_textblob_debug{}.feather'.format(debug, name, debug)
    else:
        dstname = '../input/{}.feather'.format(name)
        savename = '../input/{}_textblob.feather'.format(name)

    if os.path.exists(savename):
        print('done already')
    else:
        t_start = time.time()
        print('>> loading', dstname)
        df = mlc.load(dstname)
        if DEBUG == 3:
            df = df.sample(frac=0.01)
        print('no. of rows:', len(df))
        t_end = time.time()
        print('loading time:', t_end - t_start)
        print_memory()

        print('>> translating')
        df_translated = map_translate(df)
        print(df_translated.head())
        print(df_translated.tail())

        print('>> saving', savename)
        mlc.save(df_translated, savename)
Example #11
def convert_to_pickle():
    for name in ['train_active_translated', 'test_active_translated']:
        filename = '../input/{}.feather'.format(name)
        print(
            '----------------------------------------------------------------')
        print('>> loading', filename)
        t_start = time.time()
        df = mlc.load(filename)
        t_end = time.time()
        print('loading time feather:', t_end - t_start)
        print('no. of rows:', len(df))
        dstname = '../input/{}.pickle'.format(name)
        print('>> saving', dstname)
        with open(dstname, 'wb') as handle:
            pickle.dump(df, handle, protocol=pickle.HIGHEST_PROTOCOL)
        del df
        gc.collect()
        print('>> loading', dstname)
        t_start = time.time()
        df = pickle.load(open(dstname, "rb"))
        t_end = time.time()
        print('loading time pickle:', t_end - t_start)
        print('no. of rows:', len(df))
Example #12
def main():
    global args, debug, NCHUNKS, READMODE
    args = parser.parse_args()
    datasets = args.dataset
    debug = args.debug
    READMODE = args.readmode

    if debug == 2:
        NCHUNKS = 13
    if debug == 1:
        NCHUNKS = 130
    if debug == 0:
        NCHUNKS = 2000

    print('summary: debug {}, chunks {}'.format(debug, NCHUNKS))

    if datasets == 'all':
        datasets = ['train', 'test', 'train_active', 'test_active']
    else:
        datasets = [datasets]

    for which_dataset in datasets:
        filename = '../input/' + which_dataset + READMODE
        read_and_build_map(filename, which_dataset)
        df_translated = read_and_translate(filename, which_dataset)
        destname = '../input/' + which_dataset + '_translated.feather'
        print('>> saving to ...', destname)
        mlc.save(df_translated, destname)
        del df_translated
        gc.collect()
        print('>> loading ...', destname)
        df_translated = mlc.load(destname)
        print(df_translated.head())
        df2 = df_translated.sample(frac=0.01)
        print(df2.head(5))
        print(df2.tail(5))
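
`main()` reads `parser` from module scope; that definition is not shown. A plausible argparse setup covering the three attributes used above (`dataset`, `debug`, `readmode`; the defaults and help comments are assumptions):

import argparse

parser = argparse.ArgumentParser(description='translate dataset columns to English')
# --dataset: a single dataset name, or 'all' for the four datasets in main()
parser.add_argument('--dataset', default='all')
# --debug: 0 = full run, 1/2 = progressively smaller debug runs (sets NCHUNKS)
parser.add_argument('--debug', type=int, default=0)
# --readmode: input file extension, e.g. '.feather' or '.csv'
parser.add_argument('--readmode', default='.feather')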
Example #13
def load_feather(filename):
    print_doing('read {}'.format(filename))
    df = mlc.load(filename)
    return df
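
`print_doing` is a small logging helper defined elsewhere; a trivial stand-in consistent with the '>> ...' progress prints used throughout these examples (its exact format is an assumption):

def print_doing(text):
    # Hypothetical stand-in for the project's progress-logging helper.
    print('>>', text)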
Example #14
                                        num_workers=2)

kuzu_keep = ImageFolder(
    kuzu, transform=torchvision.transforms.Compose(transforms_keep))
img_batch_keep = torch.utils.data.DataLoader(kuzu_keep,
                                             batch_size=batch_size,
                                             shuffle=True,
                                             num_workers=2)

img_sizes = {}
all_image_lst = []

# This loop gets file sizes and builds a list of all image names
print('Getting image sizes...')
try:
    img_sizes, all_image_lst = mlc.load('cache/image_sizes.pkl')
    print('Loaded image sizes from cache')
except FileNotFoundError:
    for _, (images, labels, file_locs) in enumerate(tqdm(img_batch_keep)):
        for file_loc in file_locs:
            size2 = lycon.load(file_loc).shape
            new_size = torch.Size([1, 3, *size2])
            img_sizes[file_loc] = new_size
            all_image_lst.append(file_loc)

    mlc.save([img_sizes, all_image_lst], 'cache/image_sizes.pkl')

print('{} images loaded'.format(len(all_image_lst)))

# This randomly takes 50 images for the "test set".
test_images = random.sample(all_image_lst, 50)