Example #1
    def to_pandas(id: int, columns: list = None, samples: list = None):
        """
		- After unzipping `gzip.open()`, bytesio still needed to be read into PyArrow before being read into Pandas.
		- All methods return all columns by default if they receive None: 
		  `pc.read_csv(read_options.column_names)`, `pa.read_table()`, `pd.read_csv(uscols)`, `pd.read_parquet(columns)`
		"""
        d = Dataset.get_by_id(id)
        is_compressed = d.is_compressed
        ff = d.file_format

        # When the user passes a single column name and forgets to wrap it in a list (e.g. the label column).
        if isinstance(columns, str):
            columns = [columns]

        data = d.data
        bytesio_data = io.BytesIO(data)
        if (ff == 'csv') or (ff == 'tsv'):
            # `pc.ReadOptions.column_names` verifies that the names exist; it does not filter for them.
            if is_compressed:
                bytesio_csv = gzip.open(bytesio_data)
                if ff == 'tsv':
                    parse_opt = pc.ParseOptions(delimiter='\t')
                    tbl = pc.read_csv(bytesio_csv, parse_options=parse_opt)
                else:
                    tbl = pc.read_csv(bytesio_csv)
                df = tbl.to_pandas()
                if columns is not None:
                    df = df.filter(columns)
            else:
                if ff == 'tsv':
                    df = pd.read_csv(bytesio_data, sep='\t', usecols=columns)
                else:
                    df = pd.read_csv(bytesio_data, usecols=columns)
        elif ff == 'parquet':
            if is_compressed:
                bytesio_parquet = gzip.open(bytesio_data)
                tbl = pq.read_table(bytesio_parquet, columns=columns)
                df = tbl.to_pandas()
            else:
                df = pd.read_parquet(bytesio_data, columns=columns)

        if samples is not None:
            df = df.iloc[samples]

        d_dtype = d.dtype
        if d_dtype is not None:
            if isinstance(d_dtype, dict):
                if columns is None:
                    columns = d.columns
                # need to prune out the excluded columns from the dtype dict
                d_dtype_cols = list(d_dtype.keys())
                for col in d_dtype_cols:
                    if col not in columns:
                        del d_dtype[col]
            df = df.astype(d_dtype)

        return df
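As the docstring notes, gzip-compressed CSV bytes have to pass through PyArrow before reaching Pandas, with column pruning applied only afterwards. Below is a minimal, self-contained sketch of that path; the sample bytes and column names are made up for illustration.

import gzip
import io

import pyarrow.csv as pc

raw_csv = b"label,feature\n1,0.5\n0,0.25\n"
compressed = gzip.compress(raw_csv)            # stand-in for d.data

bytesio_csv = gzip.open(io.BytesIO(compressed))
tbl = pc.read_csv(bytesio_csv)                 # PyArrow reads the decompressed stream
df = tbl.to_pandas()
df = df.filter(['label'])                      # columns are pruned only after the read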
Example #2
def load_model_movies_mtx():
    genome_dim = read_csv('genome-tags.csv').num_rows
    genome_table = read_csv('genome-scores.csv')
    movie_ids = genome_table['movieId'].to_numpy()
    scores = genome_table['relevance'].to_numpy()
    model_movies_mtx = {}
    for i in range(0, len(scores), genome_dim):
        model_movies_mtx[movie_ids[i]] = scores[i:i + genome_dim]
    return model_movies_mtx, genome_dim
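A hypothetical use of the returned mapping: each entry is a dense tag-relevance vector of length genome_dim, so two movies can be compared directly. The movie ids below are examples and assume the MovieLens genome CSVs are present.

import numpy as np

model_movies_mtx, genome_dim = load_model_movies_mtx()

def cosine(a, b):
    # cosine similarity between two tag-relevance vectors
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

print(cosine(model_movies_mtx[1], model_movies_mtx[2]))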
Example #3
 def test_import_table_long_csv(self):
     pa_table = csv.read_csv(self.csv_file)
     new_table = self.session.import_table(pa_table)
     pa_table2 = new_table.snapshot()
     self.assertEqual(pa_table2, pa_table)
     df = pa_table2.to_pandas()
     self.assertEqual(1000, len(df.index))
def parse_green_taxi_csv(fobj):
    """
    Parse a binary file object of cleaned "green taxi" CSV data as
    returned by the "read_green_taxi_csv" function, and return a PyArrow
    table.
    """

    convert_options = ConvertOptions(
        column_types=SCHEMA,
        false_values=['N'],
        null_values=[''],
        timestamp_parsers=['%Y-%m-%d %H:%M:%S'],
        true_values=['Y'],
    )
    parse_options = ParseOptions(quote_char=False)
    read_options = ReadOptions(
        column_names=SCHEMA.names,
        encoding=ENCODING,
    )

    return read_csv(
        fobj,
        convert_options=convert_options,
        parse_options=parse_options,
        read_options=read_options,
    )
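The parser relies on module-level SCHEMA and ENCODING constants that are not part of the snippet. The sketch below shows hypothetical definitions (the column names and types are illustrative only, not the project's actual schema); because ReadOptions is given explicit column_names, the cleaned input is assumed to carry no header row.

import pyarrow as pa

ENCODING = 'utf-8'
SCHEMA = pa.schema([
    ('lpep_pickup_datetime', pa.timestamp('s')),   # matches timestamp_parsers above
    ('lpep_dropoff_datetime', pa.timestamp('s')),
    ('store_and_fwd_flag', pa.bool_()),            # parsed via true_values/false_values
    ('passenger_count', pa.int32()),
    ('trip_distance', pa.float64()),
    ('total_amount', pa.float64()),
])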
Example #5
File: io.py  Project: pranabsarkar/modin
def _read_csv_with_offset_pyarrow_on_ray(fname, num_splits, start, end, kwargs,
                                         header):  # pragma: no cover
    """Use a Ray task to read a chunk of a CSV into a pyarrow Table.
     Note: Ray functions are not detected by codecov (thus pragma: no cover)
     Args:
        fname: The filename of the file to open.
        num_splits: The number of splits (partitions) to separate the DataFrame into.
        start: The start byte offset.
        end: The end byte offset.
        kwargs: The kwargs for the pyarrow `read_csv` function.
        header: The header of the file.
     Returns:
         A list containing the split pyarrow Tables and the number of
         rows of the tables as the last element. This is used to determine
         the total length of the DataFrame to build a default Index.
    """
    bio = open(fname, "rb")
    # The header line for the CSV file
    first_line = bio.readline()
    bio.seek(start)
    to_read = header + first_line + bio.read(end - start)
    bio.close()
    table = csv.read_csv(BytesIO(to_read),
                         parse_options=csv.ParseOptions(header_rows=1))
    chunksize = get_default_chunksize(table.num_columns, num_splits)
    chunks = [
        pa.Table.from_arrays(table.columns[chunksize * i:chunksize * (i + 1)])
        for i in range(num_splits)
    ]
    return chunks + [
        table.num_rows,
        pandas.Series([t.to_pandas_dtype() for t in table.schema.types],
                      index=table.schema.names),
    ]
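The reader above expects newline-aligned (start, end) byte offsets. A rough sketch of how a caller might produce them (this is not Modin's actual partitioning code, just an illustration of the idea):

import os

def newline_aligned_offsets(fname, num_chunks):
    size = os.path.getsize(fname)
    step = max(size // num_chunks, 1)
    offsets = []
    with open(fname, "rb") as f:
        f.readline()                      # skip the header line
        start = f.tell()
        while start < size:
            f.seek(min(start + step, size))
            f.readline()                  # advance to the next newline boundary
            end = f.tell()
            offsets.append((start, end))
            start = end
    return offsets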
def pa_read_csv(
    input_file: Union[IO, str],
    schema: Union[pa.Schema, None] = None,
    expect_full_schema: bool = True,
    **kwargs,
):
    """Read a csv file into an Arrow table.
    Args:
        input_file (Union[IO, str]): the CSV you want to read. string, path or
            file-like object.
        schema (pyarrow.Schema): pyarrow Schema with the expected columns.
            If unset, pyarrow will infer the datatypes.
        expect_full_schema (bool, optional): if True, the pyarrow reader
            expects the input schema to have a field for every column in the
            input file. If False, only the columns listed in the schema are
            cast, leaving all other columns at their default type on read.
        **kwargs (optional): Additional kwargs are passed to pyarrow.csv.read_csv
    Returns:
        pyarrow.Table: the csv file in pyarrow format.
    """

    if schema:
        schema = _get_arrow_schema(schema)

    pa_csv_table = csv.read_csv(input_file=input_file, **kwargs)
    if schema:
        pa_csv_table = cast_arrow_table_to_schema(
            pa_csv_table, schema=schema, expect_full_schema=expect_full_schema)

    return pa_csv_table
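A hypothetical call to pa_read_csv with a partial schema (the file name and columns are illustrative): only the listed columns are cast, and the remaining columns keep whatever types pyarrow infers because expect_full_schema is False.

import pyarrow as pa

partial_schema = pa.schema([('id', pa.int64()), ('amount', pa.float64())])
table = pa_read_csv('transactions.csv', schema=partial_schema,
                    expect_full_schema=False)
print(table.schema)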
def import_taxi_records(dh_session: Session) -> Table:

    # download the CSV data and read it into a pyarrow table and prepare it for uploading into DH
    csv_file_name = download_csv(
        url=
        "https://nyc-tlc.s3.amazonaws.com/trip+data/yellow_tripdata_2020-12.csv"
    )
    pa_table = csv.read_csv(csv_file_name)

    # drop unwanted columns
    unwanted_columns = [
        "tpep_pickup_datetime", "tpep_dropoff_datetime", "RatecodeID",
        "store_and_fwd_flag", "PULocationID", "DOLocationID"
    ]
    pa_table = pa_table.drop(unwanted_columns)

    # drop any column with an unsupported data type
    for column, column_name in zip(pa_table.columns, pa_table.column_names):
        if not is_deephaven_compatible(column.type):
            print(
                f"drop column: {column_name} because of unsupported data type {column.type}"
            )
            pa_table = pa_table.drop([column_name])

    # upload the pyarrow table to the Deephaven server
    return dh_session.import_table(pa_table)
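is_deephaven_compatible is a helper from the surrounding project and is not shown here. A hypothetical sketch of such a check, keeping only the Arrow types one might expect the server to accept (the actual rules belong to the original project):

import pyarrow as pa

def is_deephaven_compatible(dtype: pa.DataType) -> bool:
    # illustrative type filter, not the project's real compatibility list
    return (pa.types.is_integer(dtype) or pa.types.is_floating(dtype)
            or pa.types.is_boolean(dtype) or pa.types.is_string(dtype)
            or pa.types.is_timestamp(dtype))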
Example #8
def feature_split():
    train_df = csv.read_csv('./new_train_normal.csv').to_pandas()

    print('train dataset feature split')

    cid_features = [
        'id', 'click', 'C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20',
        'C21'
    ]
    banner_features = ['id', 'click', 'banner_pos']
    app_features = ['id', 'click', 'app_id', 'app_domain', 'app_category']
    site_features = ['id', 'click', 'site_id', 'site_domain', 'site_category']
    device_features = [
        'id', 'click', 'device_id', 'device_ip', 'device_type',
        'device_conn_type'
    ]
    day_features = ['id', 'click', 'day', 'date']

    preprocess.feature_split(train_df, cid_features)
    preprocess.feature_split(train_df, banner_features)
    preprocess.feature_split(train_df, app_features)
    preprocess.feature_split(train_df, site_features)
    preprocess.feature_split(train_df, device_features)
    preprocess.feature_split(train_df, day_features)

    print("train feature split compeleted.")
Example #9
 def test_count_by(self):
     pa_table = csv.read_csv(self.csv_file)
     test_table = self.session.import_table(pa_table)
     num_distinct_a = test_table.select_distinct(column_names=["a"]).size
     result_table = test_table.count_by(count_column="b",
                                        column_names=["a"])
     self.assertEqual(result_table.size, num_distinct_a)
Example #10
def test_time_parsing():
    target = {
        'timestamp': [
            datetime.datetime(2010, 4, 1, 0, 0),
            datetime.datetime(2010, 4, 1, 0, 30),
            datetime.datetime(2010, 4, 1, 1, 0)
        ],
        'symbol': ['USD/JPY', 'USD/JPY', 'USD/JPY'],
        'high': [93.526, 93.475, 93.421],
        'low': [93.361, 93.352, 93.326],
        'open': [93.518, 93.385, 93.391],
        'close': [93.382, 93.391, 93.384],
        'spread': [0.005, 0.006, 0.006],
        'volume': [3049, 2251, 1577]
    }
    fp = io.BytesIO(
        b'timestamp,symbol,high,low,open,close,spread,volume\n'
        b'2010-04-01 00:00:00,USD/JPY,93.52600,93.36100,93.51800,93.38200,0.00500,3049\n'
        b'2010-04-01 00:30:00,USD/JPY,93.47500,93.35200,93.38500,93.39100,0.00600,2251\n'
        b'2010-04-01 01:00:00,USD/JPY,93.42100,93.32600,93.39100,93.38400,0.00600,1577\n'
    )
    fp.seek(0)
    table = csv.read_csv(fp)
    assert table
    engine.importArrowTable("time_parsing", table)
    assert bool(engine.closed) == False
    cursor = engine.executeDML("select * from time_parsing")
    assert cursor
    batch = cursor.getArrowRecordBatch()
    assert batch
    assert batch.to_pydict() == target
Example #11
def test_usecols_csv():
    target = {
        'a': [1, 2, 3, 4, 5, 6],
        'b': [2, 3, 4, 5, 6, 7],
        'c': [3, 4, 5, 6, 7, 8],
        'd': [4, 5, 6, 7, 8, 9],
        'e': ['5', '6', '7', '8', '9', '0']
    }
    fp = io.BytesIO(
        b'a,b,c,d,e\n1,2,3,4,5\n2,3,4,5,6\n3,4,5,6,7\n4,5,6,7,8\n5,6,7,8,9\n6,7,8,9,0'
    )
    fp.seek(0)
    table = csv.read_csv(fp,
                         convert_options=csv.ConvertOptions(
                             column_types={
                                 'a': pa.int32(),
                                 'b': pa.int64(),
                                 'c': pa.int64(),
                                 'd': pa.int64(),
                                 'e': pa.string(),
                             }))
    assert table
    engine.importArrowTable("usecols", table)
    assert bool(engine.closed) == False
    cursor = engine.executeDML("select * from usecols")
    assert cursor
    batch = cursor.getArrowRecordBatch()
    assert batch
    assert batch.to_pydict() == target
Example #12
def test_pyarrow_statistics():
    table = pv.read_csv("./data/pets/pets1.csv")
    pq.write_table(table, "./tmp/pyarrow_out/pets1.parquet")
    parquet_file = pq.ParquetFile("./tmp/pyarrow_out/pets1.parquet")
    stats = parquet_file.metadata.row_group(0).column(1).statistics
    assert stats.min == 1
    assert stats.max == 9
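A small follow-on sketch for the same (assumed) Parquet file: walk every row group and column and print whatever min/max statistics the writer recorded.

import pyarrow.parquet as pq

parquet_file = pq.ParquetFile("./tmp/pyarrow_out/pets1.parquet")
meta = parquet_file.metadata
for rg in range(meta.num_row_groups):
    for col in range(meta.num_columns):
        stats = meta.row_group(rg).column(col).statistics
        if stats is not None:
            print(rg, meta.schema.column(col).name, stats.min, stats.max)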
Example #13
    def _read_table_arrow(self, source: tp.BinaryIO,
                          schema: pa.Schema) -> pa.Table:

        try:

            read_options = pa_csv.ReadOptions()
            read_options.encoding = 'utf-8'
            read_options.use_threads = False

            parse_options = pa_csv.ParseOptions()
            parse_options.newlines_in_values = True

            convert_options = pa_csv.ConvertOptions()
            convert_options.include_columns = schema.names
            convert_options.column_types = {
                n: t
                for (n, t) in zip(schema.names, schema.types)
            }
            convert_options.strings_can_be_null = True
            convert_options.quoted_strings_can_be_null = False

            return pa_csv.read_csv(source, read_options, parse_options,
                                   convert_options)

        except pa.ArrowInvalid as e:
            err = "CSV file decoding failed, content is garbled"
            self._log.exception(err)
            raise _ex.EDataCorruption(err) from e

        except pa.ArrowKeyError as e:
            err = "CSV file decoding failed, one or more columns are missing"
            self._log.error(err)
            self._log.exception(str(e))
            raise _ex.EDataCorruption(err) from e
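The same pattern can be used outside the class: derive ConvertOptions from a pyarrow Schema so that only the schema's columns are read and each one is cast to its declared type. The schema and file name below are illustrative.

import pyarrow as pa
import pyarrow.csv as pa_csv

schema = pa.schema([('id', pa.int64()), ('name', pa.string())])

convert_options = pa_csv.ConvertOptions(
    include_columns=schema.names,
    column_types=dict(zip(schema.names, schema.types)),
    strings_can_be_null=True,
)
table = pa_csv.read_csv('people.csv', convert_options=convert_options)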
Example #14
 def test_head_tail(self):
     ops = [Table.head, Table.tail]
     pa_table = csv.read_csv(self.csv_file)
     test_table = self.session.import_table(pa_table)
     for op in ops:
         result_table = op(test_table, num_rows=50)
         self.assertEqual(result_table.size, 50)
Example #15
 def test_select_distinct(self):
     pa_table = csv.read_csv(self.csv_file)
     test_table = self.session.import_table(pa_table)
     unique_table = test_table.select_distinct(cols=["a"])
     self.assertLessEqual(unique_table.size, pa_table.num_rows)
     unique_table = test_table.select_distinct(cols=[])
     self.assertLessEqual(unique_table.size, pa_table.num_rows)
Example #16
def read_feats(file_name):
    attrs = csv.read_csv(
        file_name,
        read_options=pyarrow.csv.ReadOptions(autogenerate_column_names=True),
        parse_options=pyarrow.csv.ParseOptions(delimiter=' '))
    num_cols = len(attrs.columns)
    return np.stack([attrs.columns[i].to_numpy() for i in range(num_cols)], 1)
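A quick illustrative check of read_feats on a tiny, made-up space-delimited file: one matrix row per line, one column per field.

with open('feats.txt', 'w') as f:
    f.write('0.1 0.2 0.3\n0.4 0.5 0.6\n')

feats = read_feats('feats.txt')
print(feats.shape)   # expected (2, 3)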
Example #17
 def test_group(self):
     pa_table = csv.read_csv(self.csv_file)
     test_table = self.session.import_table(pa_table)
     grouped_table = test_table.group_by(by=["a", "c"])
     self.assertLessEqual(grouped_table.size, test_table.size)
     grouped_table = test_table.group_by()
     self.assertLessEqual(grouped_table.size, 1)
Example #18
 def make(self, key):
     key = (CompassFile * CompassTable & key).fetch1()
     print('populating for {file}'.format(**key))
     tab = csv.read_csv(key['file'])
     u_names = list(tab[key['partition_col']].unique())
     rows = [dict(procedure=n, type=key['type']) for n in u_names]
     self.insert(rows, skip_duplicates=True)
Example #19
 def test_head_tail_by(self):
     ops = [Table.head_by, Table.tail_by]
     pa_table = csv.read_csv(self.csv_file)
     test_table = self.session.import_table(pa_table)
     for op in ops:
         result_table = op(test_table, num_rows=1, by=["a"])
         self.assertLessEqual(result_table.size, test_table.size)
Example #20
    def parse(self, **kwargs):
        import pyarrow as pa
        import pyarrow.csv as csv

        fname = kwargs.pop("fname", None)
        num_splits = kwargs.pop("num_splits", None)
        start = kwargs.pop("start", None)
        end = kwargs.pop("end", None)
        header = kwargs.pop("header", None)
        bio = open(fname, "rb")
        # The header line for the CSV file
        first_line = bio.readline()
        bio.seek(start)
        to_read = header + first_line + bio.read(end - start)
        bio.close()
        table = csv.read_csv(BytesIO(to_read),
                             parse_options=csv.ParseOptions(header_rows=1))
        chunksize = get_default_chunksize(table.num_columns, num_splits)
        chunks = [
            pa.Table.from_arrays(table.columns[chunksize * i:chunksize *
                                               (i + 1)])
            for i in range(num_splits)
        ]
        return chunks + [
            table.num_rows,
            pandas.Series(
                [t.to_pandas_dtype() for t in table.schema.types],
                index=table.schema.names,
            ),
        ]
Example #21
def format_to_parquet(src_file):
    if not src_file.endswith('.csv'):
        logging.error(
            "Can only accept source files in CSV format, for the moment")
        return
    table = pv.read_csv(src_file)
    pq.write_table(table, src_file.replace('.csv', '.parquet'))
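A hypothetical driver for format_to_parquet that converts every CSV in a local directory (the directory name is an example):

import glob

for path in glob.glob('./data/*.csv'):
    format_to_parquet(path)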
Example #22
def test_csv_fillna():
    target = {
        'CRIM': [0.00632],
        'ZN': [18.0],
        'INDUS': [2.31],
        'CHAS': [0.0],
        'NOX': [0.538],
        'RM': [6.575],
        'AGE': [65.2],
        'DIS': [4.09],
        'RAD': [1.0],
        'TAX': [296.0],
        'PTRATIO': [15.3],
        'B': [396.9],
        'LSTAT': [4.98],
        'PRICE': [24.0]
    }
    fp = io.BytesIO(
        b',CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE\n'
        b'0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0\n'
    )
    fp.seek(0)
    table = csv.read_csv(fp)
    assert table
    engine.importArrowTable("csv_fillna", table)
    assert bool(engine.closed) == False
    cursor = engine.executeDML(
        "select CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE from csv_fillna"
    )
    assert cursor
    batch = cursor.getArrowRecordBatch()
    assert batch
    assert batch.to_pydict() == target
Example #23
def multi_calc(path_list):

    for j in range(len(path_list)):
        train_df = csv.read_csv(path_list[j]).to_pandas()

        # train_df = train_df.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)
        train_df = train_df.sample(frac=0.2, replace=True).reset_index()

        not_click = train_df['click'] == 0
        not_clk_df = train_df[not_click]
        not_clk_df = not_clk_df.sample(frac=0.2, replace=True)

        is_click = train_df['click'] == 1
        is_clk_df = train_df[is_click]

        print(len(not_clk_df))
        print(len(is_clk_df))

        train_df = pd.concat([not_clk_df, is_clk_df])
        train_df = train_df.reset_index()
        train_df = train_df.drop(['level_0'], axis=1)

        print(f'{path_list[j]} : training dataset preprocessing....')

        n = 10000
        list_df = [train_df[i:i+n] for i in range(0,train_df.shape[0], n)]

        targets = ['site_id','site_domain','site_category',
                   'app_id','app_domain','app_category',
                   'device_id','device_ip','device_model']


        # process the first chunk
        # pre_df = drop_cols(list_df[0])
        pre_df = list_df[0].copy()
        for k in range(len(targets)):
            pre_df = hashing_info(pre_df, targets[k])
            # pre_df, label[k] = mapping_info(pre_df, targets[k], label[k])
        pre_df = split_hour(pre_df)
        list_df[0] = pre_df


        # process the remaining chunks + concat
        for idx in tqdm(range(1, len(list_df))):
            # pre_df = drop_cols(list_df[idx])
            pre_df = list_df[idx].copy()
            for k in range(len(targets)):
                pre_df = hashing_info(pre_df, targets[k])

            pre_df = split_hour(pre_df)
            list_df[0] = pd.concat([list_df[0], pre_df])

        train_df = list_df[0].reset_index()
        train_df = train_df.drop(['level_0'], axis=1)
        train_df = train_df.drop(['index'], axis=1)

        # name = './train_preprocess/new_prepros_' + str(k) + '.csv'
        name = path_list[j].replace('./split/split_train_', './train_preprocess/new_prepros_')
        print(name)
        train_df.to_csv(name, index=False)
Example #24
    def read_stores(self):
        s_cols = [
            'retailer_code', 'parent_code', 'fips_state_code',
            'fips_county_code', 'dma_code', 'store_zip3'
        ]
        # To reduce space -- update with dictionary arrays later
        store_convert = {
            'year': pa.uint16(),
            'dma_code': pa.uint16(),
            'retailer_code': pa.uint16(),
            'parent_code': pa.uint16(),
            'store_zip3': pa.uint16(),
            'fips_county_code': pa.uint16(),
            'fips_state_code': pa.uint8()
        }

        # Use pyarrow to read CSVs and parse using the dict -- we have to fix some types again later.
        tmp = pa.concat_tables([
            csv.read_csv(
                x,
                parse_options=csv.ParseOptions(delimiter='\t'),
                convert_options=csv.ConvertOptions(column_types=store_convert))
            for x in self.stores_dict.values()
        ]).to_pandas().rename(columns={'year': 'panel_year'})

        # some columns have blanks --fill with zero to avoid converting to floats(!)
        tmp.loc[:, s_cols] = tmp.loc[:, s_cols].fillna(0)

        # use the compressed types
        my_dict = {
            key: value
            for (key, value) in type_dict.items() if key in tmp.columns
        }
        self.stores_df = tmp.astype(my_dict)
        return
Example #25
def run_preprocess(num):
    pd.set_option('display.max_columns', None)
    path = './split/split_train_' + str(num) + '.csv'
    train_df = csv.read_csv(path).to_pandas()

    # train_df = train_df.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)
    start = time.time()
    train_df = train_df.sample(frac=0.2, replace=True).reset_index()
    print("time :", time.time() - start)

    not_click = train_df['click'] == 0
    not_clk_df = train_df[not_click]
    not_clk_df = not_clk_df.sample(frac=0.2, replace=True)

    is_click = train_df['click'] == 1
    is_clk_df = train_df[is_click]

    print(len(not_clk_df))
    print(len(is_clk_df))

    train_df = pd.concat([not_clk_df, is_clk_df])
    train_df = train_df.reset_index()
    train_df = train_df.drop(['level_0'], axis=1)

    print(f'{path} : training dataset preprocessing....')

    n = 10000
    list_df = [train_df[i:i+n] for i in range(0,train_df.shape[0], n)]

    targets = ['site_id','site_domain','site_category',
               'app_id','app_domain','app_category',
               'device_id','device_ip','device_model']


    # process the first chunk
    # pre_df = drop_cols(list_df[0])
    pre_df = list_df[0].copy()
    for k in range(len(targets)):
        pre_df = hashing_info(pre_df, targets[k])
        # pre_df, label[k] = mapping_info(pre_df, targets[k], label[k])
    pre_df = split_hour(pre_df)
    list_df[0] = pre_df


    # process the remaining chunks + concat
    for idx in tqdm(range(1, len(list_df))):
        # pre_df = drop_cols(list_df[idx])
        pre_df = list_df[idx].copy()
        for k in range(len(targets)):
            pre_df = hashing_info(pre_df, targets[k])

        pre_df = split_hour(pre_df)
        list_df[0] = pd.concat([list_df[0], pre_df])

    train_df = list_df[0].reset_index()
    train_df = train_df.drop(['level_0'], axis=1)
    train_df = train_df.drop(['index'], axis=1)

    name = './train_preprocess/new_prepros_' + str(num) + '.csv'
Example #26
    def test_sort(self):
        pa_table = csv.read_csv(self.csv_file)
        test_table = self.session.import_table(pa_table)
        sorted_table = test_table.sort(order_by=["a", "b"],
                                       order=[SortDirection.DESCENDING])
        df = sorted_table.snapshot().to_pandas()

        self.assertTrue(df.iloc[:, 0].is_monotonic_decreasing)
Example #27
 def test_drop_columns(self):
     pa_table = csv.read_csv(self.csv_file)
     table1 = self.session.import_table(pa_table)
     column_names = []
     for f in table1.schema:
         column_names.append(f.name)
     table2 = table1.drop_columns(cols=column_names[:-1])
     self.assertEqual(1, len(table2.schema))
Example #28
 def test_create_data_table_then_update(self):
     pa_table = csv.read_csv(self.csv_file)
     new_table = self.session.import_table(pa_table).update(
         formulas=['Sum = a + b + c + d'])
     pa_table2 = new_table.snapshot()
     df = pa_table2.to_pandas()
     self.assertEqual(df.shape[1], 6)
     self.assertEqual(1000, len(df.index))
Example #29
 def test_bind_table(self):
     pa_table = csv.read_csv(self.csv_file)
     user_table = self.session.import_table(pa_table)
     variable_name = "t"
     self.session.bind_table(variable_name, user_table)
     server_script = '''t2 = t.update("col1 = i*i")'''
     self.session.run_script(server_script)
     self.assertIn('t2', self.session.tables)
Example #30
 def test_close(self):
     pa_table = csv.read_csv(self.csv_file)
     table = self.session.import_table(pa_table)
     table.close()
     self.assertTrue(table.is_closed)
     table = self.session.empty_table(10)
     table.close()
     self.assertTrue(table.is_closed)
Example #31
File: test_csv.py  Project: wesm/arrow
 def test_random_csv(self):
     csv, expected = make_random_csv(num_cols=2, num_rows=100)
     csv_path = os.path.join(self.tmpdir, self.csv_filename)
     self.write_file(csv_path, csv)
     try:
         table = read_csv(csv_path)
     except pa.ArrowNotImplementedError as e:
         pytest.skip(str(e))
         return
     table._validate()
     assert table.schema == expected.schema
     assert table.equals(expected)
     assert table.to_pydict() == expected.to_pydict()
Example #32
File: test_csv.py  Project: wesm/arrow
 def read_csv(self, *args, **kwargs):
     read_options = kwargs.setdefault('read_options', ReadOptions())
     read_options.use_threads = True
     table = read_csv(*args, **kwargs)
     table._validate()
     return table