def write_to_parquet(file, batches, base, quote, append=False):
    """Writes a batch of candlestick data to a parquet file."""

    df = pd.concat(batches, ignore_index=True)
    df = pp.quick_clean(df)

    if append:
        pp.append_raw_to_parquet(df, file, False)
    else:
        pp.write_raw_to_parquet(df, file, False)

    return len(df.index)


def all_candles_to_csv(base, quote, interval='1m', with_parquet=False):
    """Collect a list of candlestick batches with all candlesticks of a trading pair,
    concat into a dataframe and write it to CSV.
    """

    # see if there is any data saved on disk already
    try:
        batches = [pd.read_csv(f'data/{base}-{quote}_interval-{interval}.csv')]
        last_timestamp = batches[-1]['open_time'].max()
    except FileNotFoundError:
        batches = [pd.DataFrame([], columns=LABELS)]
        last_timestamp = 0
    old_lines = len(batches[-1].index)

    # gather all candlesticks available, starting from the last timestamp loaded from disk or 0
    # stop if the timestamp that comes back from the api is the same as the last one
    previous_timestamp = None
    while previous_timestamp != last_timestamp:
        # stop if we reached data from today
        if date.fromtimestamp(last_timestamp / 1000) >= date.today():
            break

        previous_timestamp = last_timestamp

        new_batch = get_batch(
            symbol=base + quote,
            interval=interval,
            start_time=last_timestamp + 1
        )

        # requesting candles from the future returns empty
        # also stop in case response code was not 200
        if new_batch.empty:
            break

        last_timestamp = new_batch['open_time'].max()

        # sometimes no new trades took place yet on date.today();
        # in this case the batch is nothing new
        if previous_timestamp == last_timestamp:
            break

        batches.append(new_batch)
        last_datetime = datetime.fromtimestamp(last_timestamp / 1000)

        covering_spaces = 20 * ' '
        print(datetime.now(), base, quote, interval, str(last_datetime) + covering_spaces,
              end='\r', flush=True)

    df = pd.concat(batches, ignore_index=True)
    df = pp.quick_clean(df)

    if with_parquet:
        # write clean version of csv to parquet
        parquet_name = f'{base}-{quote}.parquet'
        full_path = f'compressed/{parquet_name}'
        pp.write_raw_to_parquet(df, full_path)

        METADATA['data'].append({
            'description': f'All trade history for the pair {base} and {quote} at {interval} intervals. '
                           f'Counts {df.index.size} records.',
            'name': parquet_name,
            'totalBytes': os.stat(full_path).st_size,
            'columns': []
        })

    # in the case that new data was gathered write it to disk
    if len(batches) > 1:
        df.to_csv(f'data/{base}-{quote}_interval-{interval}.csv', index=False)
        return len(df.index) - old_lines
    return 0
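

# A minimal usage sketch (not part of the original script): the pair list below is
# illustrative only, and it assumes the data/ directory exists so the CSV can be written.
def example_backfill():
    for base, quote in [('BTC', 'USDT'), ('ETH', 'BTC')]:
        new_lines = all_candles_to_csv(base, quote, interval='1m')
        print(f'{datetime.now()} {base}-{quote}: {new_lines} new lines written')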