Example #1
def make_file(input_file, output_file):
    # csv_loader, dd (dask.dataframe), timer and do_it_all come from the surrounding module.
    dtypes = csv_loader.get_dtypes()
    # Read the CSV with dask, then materialize it as a pandas DataFrame.
    input_df = dd.read_csv(input_file, dtype=dtypes).compute()
    timer.time("started")
    do_it_all(input_df)

    input_df.to_csv(output_file, float_format='%.6f', index=False)
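These examples lean on project helpers (csv_loader, timer) imported from talkingdata.common that are not shown here. Purely as an illustration of the interface the snippets call, minimal stand-ins could look like the sketch below; the column dtypes are assumed from the public TalkingData AdTracking data and may differ from the project's real get_dtypes().

# Hypothetical stand-ins for the helpers used in these examples; the real ones
# live in talkingdata.common and are not shown on this page.
import time

_START = time.time()

class timer:
    @staticmethod
    def time(label):
        # Print seconds elapsed since import, tagged with a label.
        print(f"{label}: {time.time() - _START:.1f}s")

def get_dtypes():
    # Narrow dtypes for the click columns to keep memory down
    # (assumed layout; the real csv_loader.get_dtypes() may differ).
    return {
        'ip': 'uint32',
        'app': 'uint16',
        'device': 'uint16',
        'os': 'uint16',
        'channel': 'uint16',
    }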
Example #2
def make_file(input_file, output_file):
    dtypes = csv_loader.get_dtypes()
    # Read lazily with dask and split into 32 partitions so later steps can parallelize.
    input_df = dd.read_csv(input_file,
                           dtype=dtypes).repartition(npartitions=32)
    timer.time("load csv")
    get_time(input_df)
    timer.time("got time")
    input_df = input_df.compute()
    timer.time("got pandas dataframe")

    # Run the feature-engineering helpers in parallel threads; each task
    # returns a list of (column_name, Series) tuples.
    with futures.ThreadPoolExecutor(max_workers=16) as executor:
        future_list = [executor.submit(get_last_try, input_df)]
        # future_list.append(executor.submit(get_first_appear_hour, input_df))
        future_list.extend(submit_tasks(input_df, executor))
    timer.time("done executor")

    # Merge the computed feature columns back into the DataFrame.
    for one_future in future_list:
        for col_name, series in one_future.result():
            print(col_name)
            input_df[col_name] = series
    timer.time("done fitting to df")

    input_df.to_csv(output_file, float_format='%.6f', index=False)
    timer.time("done output")
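Example #2 never shows get_time, get_last_try, get_first_appear_hour or submit_tasks, but the merge loop fixes their contract: each submitted task must return a list of (column_name, Series) tuples aligned to input_df's index. A hypothetical task of that shape, assuming click_time has already been parsed to datetime (presumably what get_time does), might be:

def get_last_try(input_df):
    # Hypothetical feature task: seconds from each click to the last click made
    # by the same ip/app/device/os combination. Returns (column_name, Series)
    # tuples aligned to input_df's index, as the merge loop above expects.
    group_cols = ['ip', 'app', 'device', 'os']
    last_click = input_df.groupby(group_cols)['click_time'].transform('max')
    seconds_to_last = (last_click - input_df['click_time']).dt.total_seconds()
    return [('seconds_to_last_try', seconds_to_last)]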
Example #3
def make_file(input_file, output_file, num_rows=None):
    dtypes = csv_loader.get_dtypes()
    # num_rows lets the pipeline run on just the first rows of the file.
    if num_rows is None:
        input_df = pd.read_csv(input_file, dtype=dtypes)
    else:
        input_df = pd.read_csv(input_file, nrows=num_rows, dtype=dtypes)

    input_df.info()  # .info() prints its report itself; wrapping it in print() also prints "None"
    do_it_all(input_df)
    input_df.info()

    input_df.to_csv(output_file, float_format='%.6f', index=False)
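The num_rows parameter makes it cheap to smoke-test the whole pipeline on a prefix of the file before running it on the full data, for example (file names here are illustrative, not the project's actual paths):

# Quick smoke test on the first million rows; file names are illustrative.
make_file(os.path.join(INPUT_DIR, "train.csv"),
          os.path.join(OUTPUT_DIR, "train_sample_featured.csv"),
          num_rows=1_000_000)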
Example #4
import os
import sys
ROOT = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../'))
sys.path.append(ROOT)
APP_ROOT = os.path.join(ROOT, "talkingdata")
INPUT_DIR = os.path.join(APP_ROOT, "input")
OUTPUT_DIR = os.path.join(APP_ROOT, "output")
TRAIN_DATA = os.path.join(INPUT_DIR, "test_old.csv")
OUTPUT_DATA = os.path.join(OUTPUT_DIR, "channel_eda_test.csv")

import pandas as pd
import numpy as np
from dask import dataframe as dd
from talkingdata.common import csv_loader

print("started")
dtypes = csv_loader.get_dtypes()
df = dd.read_csv(TRAIN_DATA, dtype=dtypes).compute()
grouped = df.groupby("channel")["ip"].count()
output_df = grouped.reset_index()
output_df.to_csv(OUTPUT_DATA, index=False)
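Calling .compute() straight after read_csv pulls the entire CSV into pandas before grouping. Since only the per-channel counts are needed, the same result can be produced lazily so that dask only materializes the small aggregate (a sketch reusing the names above):

# Aggregate lazily in dask; only the per-channel counts are brought into memory.
ddf = dd.read_csv(TRAIN_DATA, dtype=dtypes)
counts = ddf.groupby("channel")["ip"].count().compute()  # pandas Series indexed by channel
counts.reset_index().to_csv(OUTPUT_DATA, index=False)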