] t0 = time.time() if nthreads == 1: print('Extracting features with 1 thread ...') for i in range(0, len(train.index)): if i % 10000 == 0: a.print_progress(i, t0, len(train.index)) ftrs.append(process_row(i)) else: print('Extracting features multi-threaded ... ', end='', flush=True) pool = Pool(nthreads) ftrs = pool.map(process_row, range(0, len(train.index))) pool.close() a.print_elapsed(t0) start = time.time() print('Caching data to disk ... ', end='', flush=True) ftrs = pd.DataFrame(ftrs) ftrs.columns = [ 'itemID_1', 'itemID_2', 'simtitle', 'mattitle1', 'mattitle2', 'nwords1', 'nwords2' ] # Save updated dataset if mode == 0: feather.write_dataframe(ftrs, cache_loc + 'features_train_set3b.fthr') if mode == 1: feather.write_dataframe(ftrs, cache_loc + 'features_test_set3b.fthr')
# NOTE(review): the statements on the next line have lost their line breaks
# and indentation. Read in order, the fragment contains:
#   1. The tail of a key-counting step: allkey[x] is incremented when the key
#      was seen before, otherwise set to 1. KeyboardInterrupt is re-raised;
#      any other exception only increments the counter `pa`.
#   2. Construction of `keydict`, which gives every key of `allkey` a
#      sequential integer index (0, 1, 2, ...) in dict iteration order. The
#      counts stored in `allkey` (`n`) are not used for this.
#   3. The start of the train pass: `pa` is reset to 0, and for each row of
#      df_train (progress reported every 10,000 rows) single quotes are
#      stripped from 'cleanjson_1' / 'cleanjson_2' before both strings are
#      parsed with json.loads.
# The loop/try that opens part 1 and the remainder of the try/except that
# closes part 3 lie outside this section, so the original nesting has to be
# restored against the full file rather than inferred from here.
allkey[x] = allkey[x] + 1 else: allkey[x] = 1 except KeyboardInterrupt: raise except Exception as e: pa += 1 t0 = time.time() print('Transforming key dict ... ', end='', flush=True) icount = 0 keydict = {} for k, n in allkey.items(): keydict[k] = icount icount += 1 a.print_elapsed(t0) ftrs_train = [] print('Generating for train ... ') t0 = time.time() pa = 0 for i in range(0, len(df_train.index)): if i % 10000 == 0: a.print_progress(i, t0, len(df_train.index)) try: jx = df_train.iloc[i]['cleanjson_1'].replace("'", "") jy = df_train.iloc[i]['cleanjson_2'].replace("'", "") resx = json.loads(jx) resy = json.loads(jy) except KeyboardInterrupt: raise