Example No. 1
# Assumes: `files`, `nthreads`, `process_line`, the result lists (l_id, l_width,
# l_height, l_ratio, l_hash, l_size) and the helper module `a` are defined
# earlier in the script.
from multiprocessing import Pool
import gc
import time

start = time.time()
o = len(files)  # total number of files, used for progress reporting
if nthreads == 1:
    print('Extracting image info with 1 thread ...')
    k = 0
    # Iterate over files sequentially
    for f in files:
        x = process_line(f)
        l_id.append(x[0])
        l_width.append(x[1])
        l_height.append(x[2])
        l_ratio.append(x[3])
        l_hash.append(x[4])
        l_size.append(x[5])
        k += 1
        if k % 1000 == 0:
            a.print_progress(k, start, o)
# Otherwise map process_line over the files with a worker pool
else:
    print('Extracting image info multi-threaded ... ', end='', flush=True)
    pool = Pool(nthreads)
    newdata = pool.map(process_line, files)
    pool.close()
    pool.join()
    # Unpack the mapped results into the individual column lists
    for x in newdata:
        l_id.append(x[0])
        l_width.append(x[1])
        l_height.append(x[2])
        l_ratio.append(x[3])
        l_hash.append(x[4])
        l_size.append(x[5])
    del newdata
    gc.collect()
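
The dispatch pattern above (a sequential loop when nthreads == 1, otherwise Pool.map over the same worker) can be tried in isolation. The sketch below is a minimal, self-contained illustration; fake_process_line and sample_paths are placeholder names invented for the demo and are not part of the original script.

from multiprocessing import Pool


def fake_process_line(path):
    # Stand-in for process_line: returns (id, width, height, ratio, hash, size)
    return (path, 640, 480, 640 / 480, 'dummyhash', 12345)


def extract_info(paths, nthreads=1):
    if nthreads == 1:
        rows = [fake_process_line(p) for p in paths]
    else:
        with Pool(nthreads) as pool:
            rows = pool.map(fake_process_line, paths)
    # Transpose the row tuples into per-column lists, as the example does
    l_id, l_width, l_height, l_ratio, l_hash, l_size = map(list, zip(*rows))
    return l_id, l_width, l_height, l_ratio, l_hash, l_size


if __name__ == '__main__':
    sample_paths = ['img_%d.jpg' % i for i in range(8)]
    print(extract_info(sample_paths, nthreads=2)[0])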
Example No. 2
    # Tail of the per-row feature function (presumably def process_row(i)); the
    # token lists tx, ty and the title similarity sim_t are computed earlier in
    # the function body, which this snippet does not show. `train`,
    # `ratio_of_matches` and the helper module `a` come from the surrounding
    # script.
    mat1_t = ratio_of_matches(tx, ty)
    mat2_t = ratio_of_matches(ty, tx)
    return [
        train.iloc[i]['itemID_1'], train.iloc[i]['itemID_2'], sim_t, mat1_t,
        mat2_t,
        len(tx),
        len(ty)
    ]


t0 = time.time()
ftrs = []  # per-row feature vectors, filled below
if nthreads == 1:
    print('Extracting features with 1 thread ...')
    for i in range(0, len(train.index)):
        if i % 10000 == 0:
            a.print_progress(i, t0, len(train.index))
        ftrs.append(process_row(i))
else:
    print('Extracting features multi-threaded ... ', end='', flush=True)
    pool = Pool(nthreads)
    ftrs = pool.map(process_row, range(0, len(train.index)))
    pool.close()
    pool.join()
    a.print_elapsed(t0)

start = time.time()
print('Caching data to disk ... ', end='', flush=True)
ftrs = pd.DataFrame(ftrs)
ftrs.columns = [
    'itemID_1', 'itemID_2', 'simtitle', 'mattitle1', 'mattitle2', 'nwords1',
    'nwords2'
]
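
ratio_of_matches is defined elsewhere in the project, so its exact behaviour is not shown here. As a rough, hypothetical sketch of what such a helper could compute (the fraction of tokens of one title that also occur in the other), something like the following would fit the way it is called above; ratio_of_matches_sketch and the sample token lists are made up for illustration.

# Hypothetical sketch only; the real ratio_of_matches may differ.
def ratio_of_matches_sketch(a_tokens, b_tokens):
    # Fraction of tokens in a_tokens that also appear in b_tokens
    if len(a_tokens) == 0:
        return 0.0
    b_set = set(b_tokens)
    return sum(1 for t in a_tokens if t in b_set) / float(len(a_tokens))

tx = ['red', 'leather', 'sofa']
ty = ['red', 'sofa', 'two', 'seats']
print(ratio_of_matches_sketch(tx, ty))  # 2 of 3 tokens match -> 0.666...
print(ratio_of_matches_sketch(ty, tx))  # 2 of 4 tokens match -> 0.5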
Example No. 3
# Assumes: `cache_loc`, `df_train` (loaded earlier in the script, not shown)
# and the helper module `a` are already defined.
import json
import time

import feather
import pandas as pd

df_test = feather.read_dataframe(cache_loc + 'test.fthr')

df_train = df_train[['itemID_1', 'itemID_2', 'cleanjson_1', 'cleanjson_2']]
df_test = df_test[['itemID_1', 'itemID_2', 'cleanjson_1', 'cleanjson_2']]

df = pd.concat([df_train, df_test])

clean_jsons = df['cleanjson_1'].tolist() + df['cleanjson_2'].tolist()

print('Creating key dict ... ')
allkey = {}  # key -> number of JSON records containing that key
pa = 0       # number of strings that failed to parse
t0 = time.time()
for i in range(0, len(clean_jsons)):
    if i % 100000 == 0:
        a.print_progress(i, t0, len(clean_jsons))
    try:
        # Strip stray single quotes so json.loads can parse the string
        jx = clean_jsons[i].replace("'", "")
        resx = json.loads(jx)
        # Count how often each attribute key occurs
        for x in resx.keys():
            if x in allkey:
                allkey[x] = allkey[x] + 1
            else:
                allkey[x] = 1
    except KeyboardInterrupt:
        raise
    except Exception:
        # Track how many JSON strings could not be parsed
        pa += 1

t0 = time.time()
print('Transforming key dict ... ', end='', flush=True)
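
The key-counting loop above can also be expressed with collections.Counter. The sketch below reproduces the same counting on a few made-up JSON strings; sample_jsons and the variable names are illustrative only and not taken from the original script.

import json
from collections import Counter

sample_jsons = [
    '{"color": "red", "material": "leather"}',
    '{"color": "blue", "size": "large"}',
    'not valid json',
]

allkey_counter = Counter()
parse_failures = 0
for s in sample_jsons:
    try:
        record = json.loads(s.replace("'", ""))
        allkey_counter.update(record.keys())
    except ValueError:
        parse_failures += 1

print(dict(allkey_counter))  # {'color': 2, 'material': 1, 'size': 1}
print(parse_failures)        # 1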