示例#1
0
    ]


# Build one feature record per row of `train` by calling process_row on each
# row position, either serially (nthreads == 1) or through a worker pool.
t0 = time.time()
if nthreads == 1:
    print('Extracting features with 1 thread ...')
    n_rows = len(train.index)
    for i in range(n_rows):
        # Report progress every 10,000 rows.
        if i % 10000 == 0:
            a.print_progress(i, t0, n_rows)
        ftrs.append(process_row(i))
else:
    print('Extracting features multi-threaded ... ', end='', flush=True)
    pool = Pool(nthreads)
    try:
        # map() returns the results in row order, replacing ftrs wholesale.
        ftrs = pool.map(process_row, range(len(train.index)))
    finally:
        # Close the pool even when a row fails, so it is not left open
        # while the exception propagates.
        pool.close()
    a.print_elapsed(t0)

# Convert the extracted rows into a labelled DataFrame and write it to the
# feather cache that matches the current mode.
start = time.time()
print('Caching data to disk ... ', end='', flush=True)
ftrs = pd.DataFrame(ftrs)
ftrs.columns = [
    'itemID_1',
    'itemID_2',
    'simtitle',
    'mattitle1',
    'mattitle2',
    'nwords1',
    'nwords2',
]

# Save updated dataset: mode 0 writes the train-set cache, mode 1 the
# test-set cache; any other mode writes nothing.
cache_files = (
    (0, 'features_train_set3b.fthr'),
    (1, 'features_test_set3b.fthr'),
)
for cache_mode, cache_file in cache_files:
    if mode == cache_mode:
        feather.write_dataframe(ftrs, cache_loc + cache_file)
示例#2
0
                allkey[x] = allkey[x] + 1
            else:
                allkey[x] = 1
    except KeyboardInterrupt:
        raise
    except Exception as e:
        pa += 1

# Give every key of `allkey` a consecutive integer index (0, 1, 2, ...) in
# the dictionary's iteration order.
t0 = time.time()
print('Transforming key dict ... ', end='', flush=True)
keydict = {key: index for index, key in enumerate(allkey)}
# icount holds the number of keys that were indexed.
icount = len(keydict)
a.print_elapsed(t0)

ftrs_train = []
print('Generating for train ... ')
t0 = time.time()
pa = 0
for i in range(0, len(df_train.index)):
    if i % 10000 == 0:
        a.print_progress(i, t0, len(df_train.index))
    try:
        jx = df_train.iloc[i]['cleanjson_1'].replace("'", "")
        jy = df_train.iloc[i]['cleanjson_2'].replace("'", "")
        resx = json.loads(jx)
        resy = json.loads(jy)
    except KeyboardInterrupt:
        raise