Example #1
    def patable(self):
        for file in os.listdir():
            if file.endswith(".csv"):
                file1 = file
                df = pd.read_csv(os.path.abspath(file1))

                #select required columns and enrich data with partition attributes

                df = df[[
                    'ForecastSiteCode', 'ObservationTime', 'ObservationDate',
                    'ScreenTemperature', 'SiteName', 'Region'
                ]]
                df = df.sort_values(by=[
                    'ForecastSiteCode', 'ObservationDate', 'ObservationTime'
                ])
                df = df.reset_index(drop=True)
                df['ObsYear'] = pd.DatetimeIndex(df['ObservationDate']).year
                df['ObsMonth'] = pd.DatetimeIndex(df['ObservationDate']).month
                df['ObsDay'] = pd.DatetimeIndex(df['ObservationDate']).day

                table = pa.Table.from_pandas(df)

                #create additional files for testing

                file1 = file1.replace(".csv", ".")
                file2 = file1 + 'parquet.snappy'
                pq.write_table(table, file2, compression='snappy')

                pq.write_to_dataset(
                    table,
                    root_path='weather_results',
                    partition_cols=['ObsYear', 'ObsMonth', 'Region'])
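
A minimal read-back sketch for the dataset written above (assuming the same 'weather_results' directory; the filter values are hypothetical). The partition columns ObsYear, ObsMonth and Region come back as columns of the result:

import pyarrow.parquet as pq

# read only the partitions matching the (hypothetical) filter values
weather = pq.read_table('weather_results',
                        filters=[('ObsYear', '=', 2016), ('ObsMonth', '=', 2)])
print(weather.to_pandas().head())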
Example #2
def _chunk_readwrite(archive_url, dest_path, chunksize, header, encoding,
                     dtype, dataset):
    """stream read and write archives

    pandas reads and parquet writes

    notes
    -----
    * dest_path can be either a file.parquet or, in the case of partitioned parquet,
      only the destination folder for the parquet partition files
    """
    pqwriter = None
    header = []
    for i, df in enumerate(
            pd.read_csv(archive_url,
                        chunksize=chunksize,
                        names=header,
                        encoding=encoding,
                        dtype=dtype)):
        table = pa.Table.from_pandas(df)
        if i == 0:
            if dataset:
                header = np.copy(table.schema)
            else:
                pqwriter = pq.ParquetWriter(dest_path, table.schema)
        if dataset:
            pq.write_to_dataset(table,
                                root_path=dest_path,
                                partition_cols=partition_cols)
        else:
            pqwriter.write_table(table)
    if pqwriter:
        pqwriter.close()

    return header
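
A self-contained sketch of the single-file branch above (the file names are hypothetical): each CSV chunk becomes one ParquetWriter.write_table call, so the archive is streamed without ever being held in memory at once.

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

writer = None
for chunk in pd.read_csv('archive.csv', chunksize=100_000):  # hypothetical input file
    table = pa.Table.from_pandas(chunk)
    if writer is None:
        # open the writer lazily so it can reuse the schema of the first chunk
        writer = pq.ParquetWriter('archive.parquet', table.schema)
    writer.write_table(table)
if writer is not None:
    writer.close()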
Example #3
def tokenize(partition):
    partition_name = "{}-{}-{}".format(partition["tw_year"].iloc[0],
                                       partition["tw_month"].iloc[0],
                                       partition["tw_day"].iloc[0])
    start = timer()
    print("Begining Tokenization: {}".format(partition_name))
    tokenizer = CrazyTokenizer(extra_patterns=PATTERNS,
                               lowercase=True,
                               normalize=3,
                               ignore_quotes=False,
                               ignore_stopwords=True,
                               stem="lemm",
                               remove_punct=True,
                               remove_numbers=True,
                               remove_breaks=True,
                               decontract=True,
                               hashtags="split",
                               twitter_handles='',
                               urls=False)
    partition["tokens"] = partition["full_text"].apply(tokenizer.tokenize)
    table = pa.Table.from_pandas(partition)
    pq.write_to_dataset(table,
                        root_path=OUTPUT_DIR,
                        partition_cols=['tw_year', 'tw_month', 'tw_day'])
    end = timer()
    print("Tokenization Finished for {}. Took {} seconds.".format(
        partition_name, end - start))
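
One hedged way to drive the function above (the DataFrame name is hypothetical, since the surrounding driver code is not shown): feed it one day's slice of tweets at a time, matching the per-day partition columns it expects.

for _, day_slice in tweets_df.groupby(['tw_year', 'tw_month', 'tw_day']):
    tokenize(day_slice)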
Example #4
def write_pq(
    file_path,
    dataf,
    partition_cols=None,
    flavor='spark',
    filesystem=None,
    append=False,
    log=log,
):
    "Write to Parquet, python3 compatible. 'data' must be list of interables"
    s_t = now()

    if not append and os.path.exists(file_path):
        shutil.rmtree(file_path, ignore_errors=True)

    table = pa.Table.from_pandas(dataf, nthreads=psutil.cpu_count())
    counter = table.num_rows
    pq.write_to_dataset(
        table,
        root_path=file_path,
        partition_cols=partition_cols,
        flavor=flavor,
        preserve_index=False,
        filesystem=filesystem,
        use_deprecated_int96_timestamps=True,
        compression='snappy')  # will append. delete folder for overwrite

    secs = (now() - s_t).total_seconds()
    rate = round(counter / secs, 1)
    log("Wrote: {} rows to {} [{} r/s].".format(counter, file_path, rate))
    return counter
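
Hypothetical usage of write_pq (the path, DataFrame and partition column are placeholders):

rows_written = write_pq('warehouse/events', events_df, partition_cols=['event_date'])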
Example #5
    def _make_parquet_file(
        filename,
        nrows=NROWS,
        ncols=2,
        force=True,
        directory=False,
        partitioned_columns=[],
    ):
        """Helper function to generate parquet files/directories.

        Args:
            filename: The name of the test file that should be created.
            nrows: Number of rows for the dataframe.
            ncols: Number of cols for the dataframe.
            force: Create a new file/directory even if one already exists.
            directory: Create a partitioned directory using pyarrow.
            partitioned_columns: Create a partitioned directory using pandas.
                Will be ignored if directory=True.
        """
        if force or not os.path.exists(filename):
            df = pandas.DataFrame(
                {f"col{x + 1}": np.arange(nrows)
                 for x in range(ncols)})
            if directory:
                if os.path.exists(filename):
                    shutil.rmtree(filename)
                else:
                    os.makedirs(filename)
                table = pa.Table.from_pandas(df)
                pq.write_to_dataset(table, root_path=filename)
            elif len(partitioned_columns) > 0:
                df.to_parquet(filename, partition_cols=partitioned_columns)
            else:
                df.to_parquet(filename)
            filenames.append(filename)
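
Hypothetical calls to the helper above, one per code path it covers (the file names are placeholders):

_make_parquet_file("test_data.parquet")                             # single parquet file
_make_parquet_file("test_data_dir", directory=True)                 # pyarrow dataset directory
_make_parquet_file("test_data_part", partitioned_columns=["col1"])  # pandas-partitioned directory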
Example #6
def parse(xml_dir, output, fs, seed):
    xml_dir = xml_dir.rstrip('/')  # remove trailing /
    output = output.rstrip('/')
    os.makedirs(output,
                exist_ok=True)  # create output directory if it does not exist
    if seed is not None:
        random.seed(int(seed))
        npr.seed(int(seed))

    parser = CtakesXmlParser()
    files = get_files(fs, xml_dir)
    parsed = []
    for f in files:
        xml_result = parser.parse(f)
        parsed.append(xml_result)

    def filenamer(x):
        print(x)
        try:
            return '-'.join(x) + '.parquet'
        except TypeError:
            return str(x) + '.parquet'

    for p in parsed:
        for key, val in p.items():
            feature_df = pd.DataFrame(list(val))
            if feature_df.shape[0] > 0:
                table = pa.Table.from_pandas(feature_df)
                #pq.write_to_dataset(table, output + f'/{key}', partition_filename_cb=filenamer,
                #pq.write_to_dataset(table, output + f'/{key}', partition_filename_cb=lambda x:'-'.join(x)+'.parquet',
                pq.write_to_dataset(table, output + f'/{key}', filesystem=None)
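
A hedged sketch of what the commented-out attempts above seem to be aiming for, using the legacy write_to_dataset API (the partition column name is hypothetical). partition_filename_cb receives the tuple of partition values for each output file and returns its filename, so it only takes effect when partition_cols is also given:

pq.write_to_dataset(
    table,
    output + f'/{key}',
    partition_cols=['some_partition_col'],  # hypothetical
    partition_filename_cb=lambda keys: '-'.join(str(k) for k in keys) + '.parquet')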
Example #7
    def df2parquet(self,
                   pandasDF,
                   bucket: str,
                   folder: str,
                   file: str,
                   overwrite: bool = False,
                   engine: str = 'auto',
                   compression: str = 'snappy',
                   use_dictionary: bool = False,
                   coerce_timestamps: str = 'ms',
                   partition_cols: list = None,
                   row_group_size: int = None,
                   **kwargs):
        s3Path = "s3://%s/%s/%s" % (bucket, folder, file) if folder is not None \
                 else "s3://%s/%s" % (bucket, file)

        pq.write_to_dataset(
            table=pa.Table.from_pandas(pandasDF),
            root_path=s3Path,
            partition_cols=partition_cols,
            filesystem=self._s3fs,
            preserve_index=False,
            compression=compression,
            flavor='spark',  # enable Spark compatibility
            coerce_timestamps=coerce_timestamps,  # limit timestamps to milliseconds
            allow_truncated_timestamps=True,  # don't raise on timestamp truncation
            use_dictionary=use_dictionary,
            version='2.0')
Example #8
def move_labels_to_datalake(label_files, wikis):

    fs = pa.hdfs.connect(host='an-coord1001.eqiad.wmnet', port=10000)
    fs = fs.connect()
    parquet_path = "/user/nathante/ores_bias_data/ores_label_editors"
    if fs.exists(parquet_path):
        fs.rm(parquet_path, recursive=True)

    out_schema = [
        'wiki', 'ns', 'pageid', 'title', 'revid', 'parentid', 'user', 'userid'
    ]
    print("collecting userids")

    for label_file, context in zip(label_files, wikis):
        if label_file is not None:

            labels = load_labels(label_file)

            rows = get_editor_traits(labels, context, out_schema)
            pddf = pd.DataFrame(rows)

            pddf.to_pickle("ores_label_editors.pickle")
            out_table = pa.Table.from_pandas(pddf)

            pq.write_to_dataset(out_table,
                                root_path=parquet_path,
                                partition_cols=['wiki'],
                                filesystem=fs,
                                flavor='spark')

            print("pushed labels for {0}".format(context))
Example #9
File: do.py  Project: Sdoof/notebooks
def one_off_update(product='etfs'):
    start = datetime.date(2010, 1, 1)
    end = datetime.date(2018, 11, 29)
    names = _meta[product]
    base = os.path.join('raw/yahoo/')
    if not os.path.exists(base):
        os.makedirs(base)

    # filenames = glob.glob(os.path.join(base, '*.parquet'))
    # TODO: check dates on existing for the update

    filename = os.path.join(base, '{}_to_{}'.format(start, end))
    filename_check = os.path.join(
        filename, 'product={}'.format(product))  # this is terrible
    if os.path.exists(filename_check):
        print("{} exists".format(filename_check))
        return
    print('getting {} names'.format(len(names)))
    df = get_data_yahoo(names, start, end)
    df['product'] = product
    table = pa.Table.from_pandas(df, preserve_index=False)
    # partitioning by name is less efficient storage wise but makes for better joins in the next step
    pq.write_to_dataset(table,
                        root_path=filename,
                        partition_cols=['product', 'name'],
                        preserve_index=False)
    return df
Example #10
    def write_data(self, data):
        cdir = "{}/{}/".format(self.root_output_dir, data["topic"])
        if not os.path.isdir(cdir):
            os.makedirs(cdir)

        # dtypes = {x: data['schema'].field(x).type.__str__()
        #           if 'list' not in data['schema'].field(x).type.__str__()
        #           else data['schema'].field(x).type.to_pandas_dtype()
        #           for x in data['schema'].names}

        df = pd.DataFrame.from_dict(data["records"])
        # df.to_parquet(
        #     path=cdir,
        #     partition_cols=data['partition_cols'],
        #     index=True,
        #     engine='pyarrow')
        # pq.write_metadata(
        #     self.schema,'{}/_metadata'.format(cdir),
        #     version='2.0',
        #     coerce_timestamps='us')

        table = pa.Table.from_pandas(df, schema=data["schema"],
                                     preserve_index=False)

        pq.write_to_dataset(
            table,
            root_path=cdir,
            partition_cols=data['partition_cols'],
            version="2.0",
            compression='ZSTD',
            row_group_size=100000,
        )
Example #11
File: conftest.py  Project: yyz940922/modin
    def _make_parquet_file(
        filename,
        row_size=NROWS,
        force=True,
        directory=False,
        partitioned_columns=[],
    ):
        """Helper function to generate parquet files/directories.

        Args:
            filename: The name of the test file that should be created.
            row_size: Number of rows for the dataframe.
            force: Create a new file/directory even if one already exists.
            directory: Create a partitioned directory using pyarrow.
            partitioned_columns: Create a partitioned directory using pandas.
                Will be ignored if directory=True.
        """
        df = pandas.DataFrame(
            {"col1": np.arange(row_size), "col2": np.arange(row_size)}
        )
        if os.path.exists(filename) and not force:
            pass
        elif directory:
            if os.path.exists(filename):
                shutil.rmtree(filename)
            else:
                os.mkdir(filename)
            table = pa.Table.from_pandas(df)
            pq.write_to_dataset(table, root_path=filename)
        elif len(partitioned_columns) > 0:
            df.to_parquet(filename, partition_cols=partitioned_columns)
        else:
            df.to_parquet(filename)

        filenames.append(filename)
Example #12
 def _send_to_s3(self, force=False):
     """Copy in-memory batches to s3"""
     for table_name, batches in self._batches.items():
         if not force and len(batches) <= CACHE_SIZE:
             continue
         if table_name == SITE_VISITS_INDEX:
             out_str = '\n'.join([json.dumps(x) for x in batches])
             if not isinstance(out_str, six.binary_type):
                 out_str = out_str.encode('utf-8')
             fname = '%s/site_index/instance-%s-%s.json.gz' % (
                 self.dir, self._instance_id,
                 hashlib.md5(out_str).hexdigest())
             self._write_str_to_s3(out_str, fname)
         else:
             try:
                 table = pa.Table.from_batches(batches)
                 pq.write_to_dataset(table,
                                     self._s3_bucket_uri % table_name,
                                     filesystem=self._fs,
                                     preserve_index=False,
                                     partition_cols=['instance_id'],
                                     compression='snappy',
                                     flavor='spark')
             except pa.lib.ArrowInvalid as e:
                 self.logger.error(
                     "Error while sending record:\n%s\n%s\n%s\n" %
                     (table_name, type(e), e))
                 pass
         self._batches[table_name] = list()
Example #13
    def write_file(self, stream_name:str, data:DataStream.data, file_mode:str) -> bool:
        """
        Write pyspark DataFrame to a file storage system

        Args:
            stream_name (str): name of the stream
            data (object): pyspark DataFrame object
            file_mode (str): write mode, "append" is currently supported

        Returns:
            bool: True if data is stored successfully or throws an Exception.
        Raises:
            Exception: if DataFrame write operation fails
        """
        data_path = self._get_storage_path(stream_name=stream_name)
        if isinstance(data, pd.DataFrame):
            try:
                table = pa.Table.from_pandas(data, preserve_index=False)
                pq.write_to_dataset(table, root_path=data_path, partition_cols=["version", "user"])
                return True
            except Exception as e:
                raise Exception("Cannot store pandas dataframe: "+str(e))
        else:
            try:
                data.write.partitionBy(["version","user"]).format('parquet').mode(file_mode).save(data_path)
                return True
            except Exception as e:
                raise Exception("Cannot store spark dataframe: "+str(e))
Example #14
    def execute(cls, ctx, op):
        df = ctx[op.input.key]
        out = op.outputs[0]
        i = op.outputs[0].index[0]
        path = op.path
        has_wildcard = False
        if '*' in path:
            path = path.replace('*', str(i))
            has_wildcard = True

        if op.partition_cols is None:
            if not has_wildcard:
                fs = get_fs(path, op.storage_options)
                path = fs.pathsep.join([path.rstrip(fs.pathsep), f'{i}.parquet'])
            if op.engine == 'fastparquet':
                df.to_parquet(path, engine=op.engine, compression=op.compression,
                              index=op.index, open_with=open_file, **op.additional_kwargs)
            else:
                with open_file(path, mode='wb', storage_options=op.storage_options) as f:
                    df.to_parquet(f, engine=op.engine, compression=op.compression,
                                  index=op.index, **op.additional_kwargs or dict())
        else:
            if op.engine == 'pyarrow':
                pq.write_to_dataset(pa.Table.from_pandas(df), path,
                                    partition_cols=op.partition_cols)
            else:  # pragma: no cover
                raise NotImplementedError('Only support pyarrow engine when '
                                          'specify `partition_cols`.')

        ctx[out.key] = pd.DataFrame()
Example #15
async def normalization() -> None:
    """
    Waits until Record is available, dump record to parquet format,
    then checkpoint last successful offset

    This function runs as a separate task in the asyncio event loop

    :return: None
    """
    while True:
        record: Record = await app.data_queue.get()

        df: DataFrame = pandas.DataFrame.from_records(record.payload)

        df: DataFrame = pandas_transform(df=df)

        table = pyarrow.Table.from_pandas(df)

        parquet.write_to_dataset(
            table=table,
            root_path=str(app.parquet_path.joinpath('restaurant_inspections')),
            compression='snappy',
            partition_cols=['geohash', 'year', 'month', 'day'])

        await set_current_offset(offset=record.checkpoint_offset)
Example #16
def run_partition_test(input_file: str, output_dir: str, filters: Optional[list] = None):
    milliseconds_since_epoch = int(time() * 1000)

    print('Parquet metadata: ' + str(pq.read_metadata(input_file)))
    print('Parquet schema: ' + pq.read_schema(input_file).to_string())

    data = pq.read_table(source=input_file, filters=filters)

    # Write a dataset and collect metadata information of all written files
    metadata_collector = []
    root_path = output_dir + 'partitioned_' + str(milliseconds_since_epoch)
    pq.write_to_dataset(data,
                        root_path=root_path,
                        partition_cols=['start_year'],
                        metadata_collector=metadata_collector)

    # Write the ``_common_metadata`` parquet file without row group statistics
    pq.write_metadata(data.schema, root_path + '/_common_metadata')

    # Write the ``_metadata`` parquet file with row group statistics of all files
    # Gives following error:
    #       File "pyarrow/_parquet.pyx", line 616, in pyarrow._parquet.FileMetaData.append_row_groups
    #       RuntimeError: AppendRowGroups requires equal schemas.
    # data.schema has one more column than partitioned files when partitioning by one column
    # Related? https://github.com/dask/dask/issues/6243
    # pq.write_metadata(data.schema, root_path + '/_metadata', metadata_collector=metadata_collector)

    # Read from partitioned dataset
    # use the new generic Dataset API
    start_year = 2018
    value = 50000
    table = pq.read_table(root_path,
                          filters=[('start_year', '>=', start_year), ('value', '>', value)])
                          # filters=[('start_year', '>=', start_year)])
    print(table.to_pandas())
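
One hedged workaround for the commented-out _metadata write above (an untested sketch, reusing the names from the example): drop the partition column from the schema before writing, so it matches the schemas recorded in metadata_collector for the partitioned files.

schema_without_partition = data.schema.remove(
    data.schema.get_field_index('start_year'))
pq.write_metadata(schema_without_partition,
                  root_path + '/_metadata',
                  metadata_collector=metadata_collector)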
Example #17
File: parquet.py  Project: tingzhendu/dask
def _write_partition_pyarrow(
    df, path, fs, filename, write_index, partition_on, metadata_path=None, **kwargs
):
    import pyarrow as pa
    from pyarrow import parquet

    t = pa.Table.from_pandas(df, preserve_index=write_index)

    if partition_on:
        parquet.write_to_dataset(
            t,
            path,
            partition_cols=partition_on,
            preserve_index=write_index,
            filesystem=fs,
            **kwargs
        )
    else:
        with fs.open(filename, "wb") as fil:
            parquet.write_table(t, fil, **kwargs)

    if metadata_path is not None:
        with fs.open(metadata_path, "wb") as fil:
            # Get only arguments specified in the function
            kwargs_meta = {
                k: v for k, v in kwargs.items() if k in _pyarrow_write_metadata_kwargs
            }
            parquet.write_metadata(t.schema, fil, **kwargs_meta)
Example #18
def _test_write_to_dataset_with_partitions(base_path, filesystem=None):
    # ARROW-1400
    import pyarrow.parquet as pq

    output_df = pd.DataFrame({
        'group1':
        list('aaabbbbccc'),
        'group2':
        list('eefeffgeee'),
        'num':
        list(range(10)),
        'date':
        np.arange('2017-01-01', '2017-01-11', dtype='datetime64[D]')
    })
    cols = output_df.columns.tolist()
    partition_by = ['group1', 'group2']
    output_table = pa.Table.from_pandas(output_df)
    pq.write_to_dataset(output_table,
                        base_path,
                        partition_by,
                        filesystem=filesystem)
    input_table = pq.ParquetDataset(base_path, filesystem=filesystem).read()
    input_df = input_table.to_pandas()

    # Read data back in and compare with original DataFrame
    # Partitioned columns added to the end of the DataFrame when read
    input_df_cols = input_df.columns.tolist()
    assert partition_by == input_df_cols[-1 * len(partition_by):]

    # Partitioned columns become 'categorical' dtypes
    input_df = input_df[cols]
    for col in partition_by:
        output_df[col] = output_df[col].astype('category')
    assert output_df.equals(input_df)
Example #19
File: do.py  Project: Sdoof/notebooks
def download_update_one_from_yahoo(product,
                                   name,
                                   start_date=None,
                                   end_date=None):
    # example tan, if start_date is None use last avail date in the raw data
    output_dirname = download_update_target()
    filename = os.path.join
    if start_date is None:
        missing_back_dates, start_date = get_missing_dates(product, name)
    # TODO: worrying about exact timing
    if start_date >= (datetime.date.today() - datetime.timedelta(days=1)):
        print("{} {} has data up to but not including {}".format(
            product, name, start_date))
        return
    df = get_data_yahoo([name], start=start_date, end=end_date)
    if missing_back_dates:
        raise Exception('nip')
    print('writing {}'.format(output_dirname))
    df['product'] = product  # TODO: better ways to do this but this is safer
    df['name'] = name  # TODO: better ways to do this but this is safer
    df = df.reset_index()
    df.columns = [x.lower() for x in df.columns]
    table = pa.Table.from_pandas(df, preserve_index=False)
    # this CAN lead to duplicate entries so deduping is necessary, idea is that this is basically append-only like
    pq.write_to_dataset(table,
                        root_path=output_dirname,
                        partition_cols=['product', 'name'],
                        preserve_index=False)
Example #20
def save_training_data(dataframe, path):
    """ Convert dataframe into pyarrow table and save it on s3 """
    s3 = S3FileSystem()
    table = pa.Table.from_pandas(dataframe)
    print(f"Saving for machine learning team on {path}")
    pq.write_to_dataset(table, root_path=path, filesystem=s3)
    print("OK")
Example #21
File: writer.py  Project: Mu-L/airbyte
 def _flush(self) -> None:
     """
     Intermediate data flush that's triggered during the
     buffering operation. Uploads data stored in memory to the S3.
     """
     for table, data in self._buffer.items():
         key_list, ts_list, payload = zip(*data)
         upload_data = [
             pa.array(key_list),
             pa.array(ts_list),
             pa.array(payload)
         ]
         pa_table = pa.table(upload_data,
                             names=[
                                 "_airbyte_ab_id", "_airbyte_emitted_at",
                                 "_airbyte_data"
                             ])
         pq.write_to_dataset(
             table=pa_table,
             root_path=f"{self.s3_bucket}/airbyte_output/{self.unique_dir}/{table}",
             filesystem=self.fs)
     # Update tables
     self._updated_tables.update(self._buffer.keys())
     self._buffer.clear()
     self._values = 0
Example #22
def convert_bin_to_parquet(static_path: str) -> None:
    """
    Converts the data from a binary file to a parquet file.

    Args:
        static_path: (str) the path to the static file

    Returns: None
    """
    with ExitStack() as stack:
        footprint_obj = stack.enter_context(Footprint.load(static_path=static_path,
                                                           ignore_file_type={'z', 'csv', 'parquet'}))
        index_data = footprint_obj.footprint_index

        meta_data = {
            "num_intensity_bins": footprint_obj.num_intensity_bins,
            "has_intensity_uncertainty": True if footprint_obj.has_intensity_uncertainty == 1 else False
        }

        for event_id in index_data.keys():
            data_slice = footprint_obj.get_event(event_id)
            df = pd.DataFrame(data_slice)
            df["event_id"] = event_id
            pq.write_to_dataset(
                pa.Table.from_pandas(df),
                root_path=f'{static_path}/footprint.parquet',
                partition_cols=['event_id'],
                compression="BROTLI"
            )
        with open(f'{static_path}/footprint_parquet_meta.json', 'w') as outfile:
            json.dump(meta_data, outfile)
Example #23
 def _send_to_s3(self, force=False):
     """Copy in-memory batches to s3"""
     for table_name, batches in self._batches.items():
         if not force and len(batches) <= CACHE_SIZE:
             continue
         if table_name == SITE_VISITS_INDEX:
             out_str = '\n'.join([json.dumps(x) for x in batches])
             if not isinstance(out_str, six.binary_type):
                 out_str = out_str.encode('utf-8')
             fname = '%s/site_index/instance-%s-%s.json.gz' % (
                 self.dir, self._instance_id,
                 hashlib.md5(out_str).hexdigest()
             )
             self._write_str_to_s3(out_str, fname)
         else:
             try:
                 table = pa.Table.from_batches(batches)
                 pq.write_to_dataset(
                     table, self._s3_bucket_uri % table_name,
                     filesystem=self._fs,
                     preserve_index=False,
                     partition_cols=['instance_id'],
                     compression='snappy',
                     flavor='spark'
                 )
             except pa.lib.ArrowInvalid as e:
                 self.logger.error(
                     "Error while sending record:\n%s\n%s\n%s\n"
                     % (table_name, type(e), e)
                 )
                 pass
         self._batches[table_name] = list()
Example #24
def write_to_s3_parquet(s3_: Union[s3fs.S3FileSystem, None], df: pd.DataFrame = None, path: str = None,
                        partition_cols: List[str] = None):
    assert df is not None
    df = pyarrow.Table.from_pandas(df)
    df = df.drop([c for c in df.column_names if '__index_level_' in c])
    print('Writing parquet to {}'.format(path))
    pq.write_to_dataset(table=df, root_path=path, filesystem=s3_, partition_cols=partition_cols, preserve_index=False)
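
An equivalent, slightly simpler construction (a sketch, assuming df is still the original pandas DataFrame): dropping the index at conversion time makes the manual removal of '__index_level_*' columns unnecessary.

table = pyarrow.Table.from_pandas(df, preserve_index=False)
pq.write_to_dataset(table=table, root_path=path, filesystem=s3_, partition_cols=partition_cols)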
Example #25
def lambda_handler(event, context):
    for record in event['Records']:
        input_file_name = record['s3']['object']['key'].replace(
            INPUT_OBJECT_PATH,
            '')
        output_file_name = input_file_name.replace('.csv', '.parquet')

        # Getting data from the bucket
        s3 = boto3.client('s3', region_name=BUCKET_REGION)
        print("Reading data from the following path:")
        print(INPUT_OBJECT_PATH + input_file_name)
        bucket_object = s3.get_object(
            Bucket=BUCKET_NAME,
            Key=INPUT_OBJECT_PATH + input_file_name)
        input_body = pd.read_csv(bucket_object['Body'])
        print("Creating parquet file in the following path: ")
        print(OUTPUT_OBJECT_PATH)
        # Transforming to parquet
        data_table = pyar.Table.from_pandas(input_body)
        pypa.write_to_dataset(table=data_table,
                              root_path=OUTPUT_OBJECT_PATH,
                              partition_cols=PARTITION_COLUMNS,
                              filesystem=S3_FILE_SYSTEM)

        print("Transformation from CSV to parquet finished successfully.")
        print("New parquet file created in " +
              OUTPUT_OBJECT_PATH +
              output_file_name)
Example #26
def test_write_to_dataset_pandas_preserve_index(tempdir, use_legacy_dataset):
    # ARROW-8251 - preserve pandas index in roundtrip

    df = pd.DataFrame({'part': ['a', 'a', 'b'], "col": [1, 2, 3]})
    df.index = pd.Index(['a', 'b', 'c'], name="idx")
    table = pa.table(df)
    df_cat = df[["col", "part"]].copy()
    df_cat["part"] = df_cat["part"].astype("category")

    pq.write_to_dataset(
        table, str(tempdir / "case1"), partition_cols=['part'],
        use_legacy_dataset=use_legacy_dataset
    )
    result = pq.read_table(
        str(tempdir / "case1"), use_legacy_dataset=use_legacy_dataset
    ).to_pandas()
    tm.assert_frame_equal(result, df_cat)

    pq.write_to_dataset(
        table, str(tempdir / "case2"), use_legacy_dataset=use_legacy_dataset
    )
    result = pq.read_table(
        str(tempdir / "case2"), use_legacy_dataset=use_legacy_dataset
    ).to_pandas()
    tm.assert_frame_equal(result, df)

    pq.write_table(table, str(tempdir / "data.parquet"))
    result = pq.read_table(
        str(tempdir / "data.parquet"), use_legacy_dataset=use_legacy_dataset
    ).to_pandas()
    tm.assert_frame_equal(result, df)
Example #27
def _test_write_to_dataset_no_partitions(base_path, filesystem=None):
    # ARROW-1400
    import pyarrow.parquet as pq

    output_df = pd.DataFrame({'group1': list('aaabbbbccc'),
                              'group2': list('eefeffgeee'),
                              'num': list(range(10)),
                              'date': np.arange('2017-01-01', '2017-01-11',
                                                dtype='datetime64[D]')})
    cols = output_df.columns.tolist()
    output_table = pa.Table.from_pandas(output_df)

    if filesystem is None:
        filesystem = LocalFileSystem.get_instance()

    # Without partitions, append files to root_path
    n = 5
    for i in range(n):
        pq.write_to_dataset(output_table, base_path,
                            filesystem=filesystem)
    output_files = [file for file in filesystem.ls(base_path)
                    if file.endswith(".parquet")]
    assert len(output_files) == n

    # Deduplicated incoming DataFrame should match
    # original outgoing Dataframe
    input_table = pq.ParquetDataset(base_path,
                                    filesystem=filesystem).read()
    input_df = input_table.to_pandas()
    input_df = input_df.drop_duplicates()
    input_df = input_df[cols]
    assert output_df.equals(input_df)
Example #28
def test_write_to_dataset_pandas_preserve_extensiondtypes(
    tempdir, use_legacy_dataset
):
    # ARROW-8251 - preserve pandas extension dtypes in roundtrip
    if Version(pd.__version__) < Version("1.0.0"):
        pytest.skip("__arrow_array__ added to pandas in 1.0.0")

    df = pd.DataFrame({'part': 'a', "col": [1, 2, 3]})
    df['col'] = df['col'].astype("Int64")
    table = pa.table(df)

    pq.write_to_dataset(
        table, str(tempdir / "case1"), partition_cols=['part'],
        use_legacy_dataset=use_legacy_dataset
    )
    result = pq.read_table(
        str(tempdir / "case1"), use_legacy_dataset=use_legacy_dataset
    ).to_pandas()
    tm.assert_frame_equal(result[["col"]], df[["col"]])

    pq.write_to_dataset(
        table, str(tempdir / "case2"), use_legacy_dataset=use_legacy_dataset
    )
    result = pq.read_table(
        str(tempdir / "case2"), use_legacy_dataset=use_legacy_dataset
    ).to_pandas()
    tm.assert_frame_equal(result[["col"]], df[["col"]])

    pq.write_table(table, str(tempdir / "data.parquet"))
    result = pq.read_table(
        str(tempdir / "data.parquet"), use_legacy_dataset=use_legacy_dataset
    ).to_pandas()
    tm.assert_frame_equal(result[["col"]], df[["col"]])
Example #29
def _test_write_to_dataset_with_partitions(base_path, filesystem=None):
    # ARROW-1400
    import pyarrow.parquet as pq

    output_df = pd.DataFrame({'group1': list('aaabbbbccc'),
                              'group2': list('eefeffgeee'),
                              'num': list(range(10)),
                              'date': np.arange('2017-01-01', '2017-01-11',
                                                dtype='datetime64[D]')})
    cols = output_df.columns.tolist()
    partition_by = ['group1', 'group2']
    output_table = pa.Table.from_pandas(output_df)
    pq.write_to_dataset(output_table, base_path, partition_by,
                        filesystem=filesystem)
    input_table = pq.ParquetDataset(base_path, filesystem=filesystem).read()
    input_df = input_table.to_pandas()

    # Read data back in and compare with original DataFrame
    # Partitioned columns added to the end of the DataFrame when read
    input_df_cols = input_df.columns.tolist()
    assert partition_by == input_df_cols[-1 * len(partition_by):]

    # Partitioned columns become 'categorical' dtypes
    input_df = input_df[cols]
    for col in partition_by:
        output_df[col] = output_df[col].astype('category')
    assert output_df.equals(input_df)
Example #30
    def _make_parquet_file(row_size=SMALL_ROW_SIZE,
                           force=False,
                           directory=False,
                           partitioned_columns=[]):
        """Helper function to generate parquet files/directories.

        Args:
            row_size: Number of rows for the dataframe.
            force: Create a new file/directory even if one already exists.
            directory: Create a partitioned directory using pyarrow.
            partitioned_columns: Create a partitioned directory using pandas.
                Will be ignored if directory=True.
        """
        df = pandas.DataFrame({
            "col1": np.arange(row_size),
            "col2": np.arange(row_size)
        })
        if os.path.exists(TEST_PARQUET_FILENAME) and not force:
            pass
        elif directory:
            if os.path.exists(TEST_PARQUET_FILENAME):
                shutil.rmtree(TEST_PARQUET_FILENAME)
            else:
                os.mkdir(TEST_PARQUET_FILENAME)
            table = pa.Table.from_pandas(df)
            pq.write_to_dataset(table, root_path=TEST_PARQUET_FILENAME)
        elif len(partitioned_columns) > 0:
            df.to_parquet(TEST_PARQUET_FILENAME,
                          partition_cols=partitioned_columns)
        else:
            df.to_parquet(TEST_PARQUET_FILENAME)
Example #31
def test_parquet_row_group_fragments(tempdir):
    import pyarrow as pa
    import pyarrow.parquet as pq

    table = pa.table({'a': ['a', 'a', 'b', 'b'], 'b': [1, 2, 3, 4]})

    # write_to_dataset currently requires pandas
    pq.write_to_dataset(table,
                        str(tempdir / "test_parquet_dataset"),
                        partition_cols=["a"])

    import pyarrow.dataset as ds
    dataset = ds.dataset(str(tempdir / "test_parquet_dataset/"),
                         format="parquet",
                         partitioning="hive")

    fragments = list(dataset.get_fragments())
    f = fragments[0]
    parquet_format = f.format
    parquet_format.make_fragment(f.path,
                                 f.filesystem,
                                 partition_expression=f.partition_expression)
    parquet_format.make_fragment(f.path,
                                 f.filesystem,
                                 partition_expression=f.partition_expression,
                                 row_groups={1})
Example #32
def write_df_to_parquet_to_s3(df: pd.DataFrame,
                              filename: str,
                              s3_bucketname: str,
                              s3_bucketkey=None):
    # TODO: Figure out how to upload directly to S3 instead of writing the
    # parquet file to the current working directory first and then uploading it.

    assert 's3://' not in s3_bucketname, 'prefix "s3://" not required'
    assert filename[-8:] == '.parquet', 'filename must have suffix ".parquet"'

    if 's3://' in s3_bucketname:
        pass
    else:
        s3_bucketname = 's3://' + s3_bucketname

    table = pa.Table.from_pandas(df)
    pq.write_table(table, filename)

    if s3_bucketkey is not None:
        key_to_use = s3_bucketkey + '/' + filename
    else:
        key_to_use = filename

    outputfile = s3_bucketname + '/' + key_to_use

    s3 = S3FileSystem()
    pq.write_to_dataset(table=table, root_path=outputfile, filesystem=s3)
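
A hedged sketch for the TODO above: write straight to S3 through an s3fs file object instead of producing a local parquet file first.

s3 = S3FileSystem()
with s3.open(outputfile, 'wb') as f:
    pq.write_table(table, f)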
Example #33
File: parquet.py  Project: dremio/arrow
    def setup(self, num_partitions, num_threads):
        if pq is None:
            raise NotImplementedError

        self.tmpdir = tempfile.mkdtemp('benchmark_parquet')
        num1 = [random.choice(range(0, num_partitions))
                for _ in range(self.size)]
        num2 = [random.choice(range(0, 1000)) for _ in range(self.size)]
        output_df = pd.DataFrame({'num1': num1, 'num2': num2})
        output_table = pa.Table.from_pandas(output_df)
        pq.write_to_dataset(output_table, self.tmpdir, ['num1'])
Example #34
    def setup(self, num_partitions, num_threads):
        if pq is None:
            raise NotImplementedError("Parquet support not enabled")

        self.tmpdir = tempfile.mkdtemp('benchmark_parquet')
        rnd = np.random.RandomState(42)
        num1 = rnd.randint(0, num_partitions, size=self.size)
        num2 = rnd.randint(0, 1000, size=self.size)
        output_df = pd.DataFrame({'num1': num1, 'num2': num2})
        output_table = pa.Table.from_pandas(output_df)
        pq.write_to_dataset(output_table, self.tmpdir, ['num1'])
Example #35
def write_parquet_table_as_partitioned_dataset(parquet_file) -> pq.ParquetDataset:
    """ Write a parquet table as a parititioned dataset (i.e. multiple Parquet files)
    An example of a dataset partitioned by year and month on disk might look like:
        dataset_name/
            year=2018/
                month=09/
                    0.parq
                    1.parq
                month=10/
                    0.parq
                    1.parq
    """
    parquet_table = pq.read_table(parquet_file)  # Read back Parquet File as a Table
    #pq.write_to_dataset(parquet_table, root_path='starships', partition_cols=['created'])
    pq.write_to_dataset(parquet_table, root_path='starships', partition_cols=['year', 'month', 'day'], flavor='spark')
    dataset = pq.ParquetDataset('starships')
    return dataset
Example #36
File: parquet.py  Project: fortizc/dask
def _write_partition_pyarrow(df, open_with, path, fs, filename, write_index,
                             partition_on, metadata_path=None, **kwargs):
    import pyarrow as pa
    from pyarrow import parquet
    t = pa.Table.from_pandas(df, preserve_index=write_index)

    if partition_on:
        parquet.write_to_dataset(t, path, partition_cols=partition_on, filesystem=fs)
    else:
        with open_with(filename, 'wb') as fil:
            parquet.write_table(t, fil, **kwargs)

    if metadata_path is not None:
        with open_with(metadata_path, 'wb') as fil:
            # Get only arguments specified in the function
            kwargs_meta = {k: v for k, v in kwargs.items()
                           if k in _pyarrow_write_metadata_kwargs}
            parquet.write_metadata(t.schema, fil, **kwargs_meta)
Example #37
File: do.py  Project: cottrell/notebooks
def run_raw(nrows=None, force=False):
    """ parse the data, save a parquet """
    # not a lot of data here ... do it in memory, forget about parallelism
    outfile = raw_target(nrows=nrows)
    if force:
        if os.path.exists(outfile):
            shutil.rmtree(outfile)
    if os.path.exists(outfile):
        print('{} exists. force=True to rerun'.format(outfile))
        return
    for k in ['ETFs', 'Stocks']:
        data = list()
        product = k.lower()
        filenames = raw_source(k)
        t = time.time()
        i = 0
        for filename in filenames:
            name, market = os.path.basename(filename).split('.')[:2]
            product, market, name, filename, dict(nrows=nrows)
            lc = mylib.io.get_capped_line_count(filename)
            if lc >= 2:
                df = pd.read_csv(filename, nrows=nrows, encoding='utf-8')
                df['product'] = product
                df['market'] = market
                df['name'] = name
                df['Date'] = pd.to_datetime(df.Date)
                data.append(df)
            else:
                print('skipping {}'.format(filename))
            i += 1
            if i % 10 == 0:
                print('{} of {} eta {} seconds for {}'.format(i, len(filenames), (time.time() - t) * (len(filenames) - i) / i, product))
        df = pd.concat(data, axis=0)
        df = df.sort_values(['name', 'Date'])
        # TODO: add market back later if relevant
        df = df.drop(['market', 'OpenInt'], axis=1)
        df.columns = [x.lower() for x in df.columns]
        print('writing {}'.format(outfile))
        # plain to_parquet seems to be always writing the index, also partition_cols not in pandas yet
        table = pa.Table.from_pandas(df, preserve_index=False)
        # partitioning by name is less efficient storage wise but makes for better joins in the next step
        pq.write_to_dataset(table, root_path=outfile, partition_cols=['product', 'name'], preserve_index=False)
Example #38
File: do.py  Project: cottrell/notebooks
def download_update_one_from_yahoo(product, name, start_date=None, end_date=None):
    # example tan, if start_date is None use last avail date in the raw data
    output_dirname = download_update_target()
    filename = os.path.join
    if start_date is None:
        missing_back_dates, start_date = get_missing_dates(product, name)
    # TODO: worrying about exact timing
    if start_date >= (datetime.date.today() - datetime.timedelta(days=1)):
        print("{} {} has data up to but not including {}".format(product, name, start_date))
        return
    df = get_data_yahoo([name], start=start_date, end=end_date)
    if missing_back_dates:
        raise Exception('nip')
    print('writing {}'.format(output_dirname))
    df['product'] = product # TODO: better ways to do this but this is safer
    df['name'] = name # TODO: better ways to do this but this is safer
    df = df.reset_index()
    df.columns = [x.lower() for x in df.columns]
    table = pa.Table.from_pandas(df, preserve_index=False)
    # this CAN lead to duplicate entries so deduping is necessary, idea is that this is basically append-only like
    pq.write_to_dataset(table, root_path=output_dirname, partition_cols=['product', 'name'], preserve_index=False)
Example #39
File: do.py  Project: cottrell/notebooks
def one_off_update(product='etfs'):
    start = datetime.date(2010, 1, 1)
    end = datetime.date(2018, 11, 29)
    names = _meta[product]
    base = os.path.join('raw/yahoo/')
    if not os.path.exists(base):
        os.makedirs(base)

    # filenames = glob.glob(os.path.join(base, '*.parquet'))
    # TODO: check dates on existing for the update

    filename = os.path.join(base, '{}_to_{}'.format(start, end))
    filename_check = os.path.join(filename, 'product={}'.format(product)) # this is terrible
    if os.path.exists(filename_check):
        print("{} exists".format(filename_check))
        return
    print('getting {} names'.format(len(names)))
    df = get_data_yahoo(names, start, end)
    df['product'] = product
    table = pa.Table.from_pandas(df, preserve_index=False)
    # partitioning by name is less efficient storage wise but makes for better joins in the next step
    pq.write_to_dataset(table, root_path=filename, partition_cols=['product', 'name'], preserve_index=False)
    return df
Example #40
File: do.py  Project: cottrell/notebooks
 def read_transform_write(infile, outfile):
     print('{} -> {}'.format(infile, outfile))
     df = pd.read_parquet(infile)
     enrich_pandas_single(df, inplace=True)
     table = pa.Table.from_pandas(df, preserve_index=False)
     pq.write_to_dataset(table, root_path=outfile, preserve_index=False)