Example #1
    def to_pandas(id: int, columns: list = None, samples: list = None):
        """
		- After unzipping `gzip.open()`, bytesio still needed to be read into PyArrow before being read into Pandas.
		- All methods return all columns by default if they receive None: 
		  `pc.read_csv(read_options.column_names)`, `pa.read_table()`, `pd.read_csv(uscols)`, `pd.read_parquet(columns)`
		"""
        d = Dataset.get_by_id(id)
        is_compressed = d.is_compressed
        ff = d.file_format

        # When the user passes a single column name and forgets to wrap it in a list (e.g. the label column).
        if isinstance(columns, str):
            columns = [columns]

        data = d.data
        bytesio_data = io.BytesIO(data)
        if (ff == 'csv') or (ff == 'tsv'):
            # `pc.ReadOptions.column_names` verifies that the names exist; it does not filter for them.
            if is_compressed:
                bytesio_csv = gzip.open(bytesio_data)
                if ff == 'tsv':
                    parse_opt = pc.ParseOptions(delimiter='\t')
                    tbl = pc.read_csv(bytesio_csv, parse_options=parse_opt)
                else:
                    tbl = pc.read_csv(bytesio_csv)
                df = tbl.to_pandas()
                if columns is not None:
                    df = df.filter(columns)
            else:
                if ff == 'tsv':
                    df = pd.read_csv(bytesio_data, sep='\t', usecols=columns)
                else:
                    df = pd.read_csv(bytesio_data, usecols=columns)
        elif ff == 'parquet':
            if is_compressed:
                bytesio_parquet = gzip.open(bytesio_data)
                tbl = pq.read_table(bytesio_parquet, columns=columns)
                df = tbl.to_pandas()
            else:
                df = pd.read_parquet(bytesio_data, columns=columns)

        if samples is not None:
            df = df.iloc[samples]

        d_dtype = d.dtype
        if d_dtype is not None:
            if isinstance(d_dtype, dict):
                if columns is None:
                    columns = d.columns
                # need to prune out the excluded columns from the dtype dict
                d_dtype_cols = list(d_dtype.keys())
                for col in d_dtype_cols:
                    if col not in columns:
                        del d_dtype[col]
            df = df.astype(d_dtype)

        return df
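As the docstring notes, gzip-compressed CSV bytes have to pass through PyArrow before reaching Pandas, with column pruning applied only afterwards. Below is a minimal, self-contained sketch of that path; the sample bytes and column names are made up for illustration.

import gzip
import io

import pyarrow.csv as pc

raw_csv = b"label,feature\n1,0.5\n0,0.25\n"
compressed = gzip.compress(raw_csv)            # stand-in for d.data

bytesio_csv = gzip.open(io.BytesIO(compressed))
tbl = pc.read_csv(bytesio_csv)                 # PyArrow reads the decompressed stream
df = tbl.to_pandas()
df = df.filter(['label'])                      # columns are pruned only after the read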
Example #2
def load_model_movies_mtx():
    genome_dim = read_csv('genome-tags.csv').num_rows
    genome_table = read_csv('genome-scores.csv')
    movie_ids = genome_table['movieId'].to_numpy()
    scores = genome_table['relevance'].to_numpy()
    model_movies_mtx = {}
    for i in range(0, len(scores), genome_dim):
        model_movies_mtx[movie_ids[i]] = scores[i:i + genome_dim]
    return model_movies_mtx, genome_dim
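A hypothetical use of the returned mapping: each entry is a dense tag-relevance vector of length genome_dim, so two movies can be compared directly. The movie ids below are examples and assume the MovieLens genome CSVs are present.

import numpy as np

model_movies_mtx, genome_dim = load_model_movies_mtx()

def cosine(a, b):
    # cosine similarity between two tag-relevance vectors
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

print(cosine(model_movies_mtx[1], model_movies_mtx[2]))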
Example #3
 def test_import_table_long_csv(self):
     pa_table = csv.read_csv(self.csv_file)
     new_table = self.session.import_table(pa_table)
     pa_table2 = new_table.snapshot()
     self.assertEqual(pa_table2, pa_table)
     df = pa_table2.to_pandas()
     self.assertEqual(1000, len(df.index))
def parse_green_taxi_csv(fobj):
    """
    Parse a binary file object of cleaned "green taxi" CSV data as
    returned by the "read_green_taxi_csv" function, and return a PyArrow
    table.
    """

    convert_options = ConvertOptions(
        column_types=SCHEMA,
        false_values=['N'],
        null_values=[''],
        timestamp_parsers=['%Y-%m-%d %H:%M:%S'],
        true_values=['Y'],
    )
    parse_options = ParseOptions(quote_char=False)
    read_options = ReadOptions(
        column_names=SCHEMA.names,
        encoding=ENCODING,
    )

    return read_csv(
        fobj,
        convert_options=convert_options,
        parse_options=parse_options,
        read_options=read_options,
    )
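The parser relies on module-level SCHEMA and ENCODING constants that are not part of the snippet. The sketch below shows hypothetical definitions (the column names and types are illustrative only, not the project's actual schema); because ReadOptions is given explicit column_names, the cleaned input is assumed to carry no header row.

import pyarrow as pa

ENCODING = 'utf-8'
SCHEMA = pa.schema([
    ('lpep_pickup_datetime', pa.timestamp('s')),   # matches timestamp_parsers above
    ('lpep_dropoff_datetime', pa.timestamp('s')),
    ('store_and_fwd_flag', pa.bool_()),            # parsed via true_values/false_values
    ('passenger_count', pa.int32()),
    ('trip_distance', pa.float64()),
    ('total_amount', pa.float64()),
])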
Example #5
File: io.py  Project: pranabsarkar/modin
def _read_csv_with_offset_pyarrow_on_ray(fname, num_splits, start, end, kwargs,
                                         header):  # pragma: no cover
    """Use a Ray task to read a chunk of a CSV into a pyarrow Table.
     Note: Ray functions are not detected by codecov (thus pragma: no cover)
     Args:
        fname: The filename of the file to open.
        num_splits: The number of splits (partitions) to separate the DataFrame into.
        start: The start byte offset.
        end: The end byte offset.
        kwargs: The kwargs for the pyarrow `read_csv` function.
        header: The header of the file.
     Returns:
         A list containing the split pyarrow Tables and the number of
         rows of the tables as the last element. This is used to determine
         the total length of the DataFrame to build a default Index.
    """
    bio = open(fname, "rb")
    # The header line for the CSV file
    first_line = bio.readline()
    bio.seek(start)
    to_read = header + first_line + bio.read(end - start)
    bio.close()
    table = csv.read_csv(BytesIO(to_read),
                         parse_options=csv.ParseOptions(header_rows=1))
    chunksize = get_default_chunksize(table.num_columns, num_splits)
    chunks = [
        pa.Table.from_arrays(table.columns[chunksize * i:chunksize * (i + 1)])
        for i in range(num_splits)
    ]
    return chunks + [
        table.num_rows,
        pandas.Series([t.to_pandas_dtype() for t in table.schema.types],
                      index=table.schema.names),
    ]
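The reader above expects newline-aligned (start, end) byte offsets. A rough sketch of how a caller might produce them (this is not Modin's actual partitioning code, just an illustration of the idea):

import os

def newline_aligned_offsets(fname, num_chunks):
    size = os.path.getsize(fname)
    step = max(size // num_chunks, 1)
    offsets = []
    with open(fname, "rb") as f:
        f.readline()                      # skip the header line
        start = f.tell()
        while start < size:
            f.seek(min(start + step, size))
            f.readline()                  # advance to the next newline boundary
            end = f.tell()
            offsets.append((start, end))
            start = end
    return offsets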
def pa_read_csv(
    input_file: Union[IO, str],
    schema: Union[pa.Schema, None] = None,
    expect_full_schema: bool = True,
    **kwargs,
):
    """Read a csv file into an Arrow table.
    Args:
        input_file (Union[IO, str]): the CSV you want to read. string, path or
            file-like object.
        schema (pyarrow.Schema): pyarrow Schema with the expected columns.
            If unset, pyarrow will infer the datatypes.
        expect_full_schema (bool, optional): if True, the pyarrow reader
            expects the input schema to have a field for every column in the
            input file. If False, only the columns listed in the schema are
            cast, leaving all other columns at their default type on read.
        **kwargs (optional): Additional kwargs are passed to pyarrow.csv.read_csv
    Returns:
        pyarrow.Table: the csv file in pyarrow format.
    """

    if schema:
        schema = _get_arrow_schema(schema)

    pa_csv_table = csv.read_csv(input_file=input_file, **kwargs)
    if schema:
        pa_csv_table = cast_arrow_table_to_schema(
            pa_csv_table, schema=schema, expect_full_schema=expect_full_schema)

    return pa_csv_table
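A hypothetical call to pa_read_csv with a partial schema (the file name and columns are illustrative): only the listed columns are cast, and the remaining columns keep whatever types pyarrow infers because expect_full_schema is False.

import pyarrow as pa

partial_schema = pa.schema([('id', pa.int64()), ('amount', pa.float64())])
table = pa_read_csv('transactions.csv', schema=partial_schema,
                    expect_full_schema=False)
print(table.schema)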
def import_taxi_records(dh_session: Session) -> Table:

    # download the CSV data and read it into a pyarrow table and prepare it for uploading into DH
    csv_file_name = download_csv(
        url=
        "https://nyc-tlc.s3.amazonaws.com/trip+data/yellow_tripdata_2020-12.csv"
    )
    pa_table = csv.read_csv(csv_file_name)

    # drop unwanted columns
    unwanted_columns = [
        "tpep_pickup_datetime", "tpep_dropoff_datetime", "RatecodeID",
        "store_and_fwd_flag", "PULocationID", "DOLocationID"
    ]
    pa_table = pa_table.drop(unwanted_columns)

    # drop any column with an unsupported data type
    for column, column_name in zip(pa_table.columns, pa_table.column_names):
        if not is_deephaven_compatible(column.type):
            print(
                f"drop column: {column_name} because of unsupported data type {column.type}"
            )
            pa_table = pa_table.drop([column_name])

    # upload the pyarrow table to the Deephaven server
    return dh_session.import_table(pa_table)
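is_deephaven_compatible is a helper from the surrounding project and is not shown here. A hypothetical sketch of such a check, keeping only the Arrow types one might expect the server to accept (the actual rules belong to the original project):

import pyarrow as pa

def is_deephaven_compatible(dtype: pa.DataType) -> bool:
    # illustrative type filter, not the project's real compatibility list
    return (pa.types.is_integer(dtype) or pa.types.is_floating(dtype)
            or pa.types.is_boolean(dtype) or pa.types.is_string(dtype)
            or pa.types.is_timestamp(dtype))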
Example #8
def feature_split():
    train_df = csv.read_csv('./new_train_normal.csv').to_pandas()

    print('train dataset feature split')

    cid_features = [
        'id', 'click', 'C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20',
        'C21'
    ]
    banner_features = ['id', 'click', 'banner_pos']
    app_features = ['id', 'click', 'app_id', 'app_domain', 'app_category']
    site_features = ['id', 'click', 'site_id', 'site_domain', 'site_category']
    device_features = [
        'id', 'click', 'device_id', 'device_ip', 'device_type',
        'device_conn_type'
    ]
    day_features = ['id', 'click', 'day', 'date']

    preprocess.feature_split(train_df, cid_features)
    preprocess.feature_split(train_df, banner_features)
    preprocess.feature_split(train_df, app_features)
    preprocess.feature_split(train_df, site_features)
    preprocess.feature_split(train_df, device_features)
    preprocess.feature_split(train_df, day_features)

    print("train feature split compeleted.")
Example #9
 def test_count_by(self):
     pa_table = csv.read_csv(self.csv_file)
     test_table = self.session.import_table(pa_table)
     num_distinct_a = test_table.select_distinct(column_names=["a"]).size
     result_table = test_table.count_by(count_column="b",
                                        column_names=["a"])
     self.assertEqual(result_table.size, num_distinct_a)
Example #10
def test_time_parsing():
    target = {
        'timestamp': [
            datetime.datetime(2010, 4, 1, 0, 0),
            datetime.datetime(2010, 4, 1, 0, 30),
            datetime.datetime(2010, 4, 1, 1, 0)
        ],
        'symbol': ['USD/JPY', 'USD/JPY', 'USD/JPY'],
        'high': [93.526, 93.475, 93.421],
        'low': [93.361, 93.352, 93.326],
        'open': [93.518, 93.385, 93.391],
        'close': [93.382, 93.391, 93.384],
        'spread': [0.005, 0.006, 0.006],
        'volume': [3049, 2251, 1577]
    }
    fp = io.BytesIO(
        b'timestamp,symbol,high,low,open,close,spread,volume\n'
        b'2010-04-01 00:00:00,USD/JPY,93.52600,93.36100,93.51800,93.38200,0.00500,3049\n'
        b'2010-04-01 00:30:00,USD/JPY,93.47500,93.35200,93.38500,93.39100,0.00600,2251\n'
        b'2010-04-01 01:00:00,USD/JPY,93.42100,93.32600,93.39100,93.38400,0.00600,1577\n'
    )
    fp.seek(0)
    table = csv.read_csv(fp)
    assert table
    engine.importArrowTable("time_parsing", table)
    assert bool(engine.closed) == False
    cursor = engine.executeDML("select * from time_parsing")
    assert cursor
    batch = cursor.getArrowRecordBatch()
    assert batch
    assert batch.to_pydict() == target
Example #11
def test_usecols_csv():
    target = {
        'a': [1, 2, 3, 4, 5, 6],
        'b': [2, 3, 4, 5, 6, 7],
        'c': [3, 4, 5, 6, 7, 8],
        'd': [4, 5, 6, 7, 8, 9],
        'e': ['5', '6', '7', '8', '9', '0']
    }
    fp = io.BytesIO(
        b'a,b,c,d,e\n1,2,3,4,5\n2,3,4,5,6\n3,4,5,6,7\n4,5,6,7,8\n5,6,7,8,9\n6,7,8,9,0'
    )
    fp.seek(0)
    table = csv.read_csv(fp,
                         convert_options=csv.ConvertOptions(
                             column_types={
                                 'a': pa.int32(),
                                 'b': pa.int64(),
                                 'c': pa.int64(),
                                 'd': pa.int64(),
                                 'e': pa.string(),
                             }))
    assert table
    engine.importArrowTable("usecols", table)
    assert bool(engine.closed) == False
    cursor = engine.executeDML("select * from usecols")
    assert cursor
    batch = cursor.getArrowRecordBatch()
    assert batch
    assert batch.to_pydict() == target
Example #12
def test_pyarrow_statistics():
    table = pv.read_csv("./data/pets/pets1.csv")
    pq.write_table(table, "./tmp/pyarrow_out/pets1.parquet")
    parquet_file = pq.ParquetFile("./tmp/pyarrow_out/pets1.parquet")
    stats = parquet_file.metadata.row_group(0).column(1).statistics
    assert stats.min == 1
    assert stats.max == 9
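A small follow-on sketch for the same (assumed) Parquet file: walk every row group and column and print whatever min/max statistics the writer recorded.

import pyarrow.parquet as pq

parquet_file = pq.ParquetFile("./tmp/pyarrow_out/pets1.parquet")
meta = parquet_file.metadata
for rg in range(meta.num_row_groups):
    for col in range(meta.num_columns):
        stats = meta.row_group(rg).column(col).statistics
        if stats is not None:
            print(rg, meta.schema.column(col).name, stats.min, stats.max)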
Example #13
    def _read_table_arrow(self, source: tp.BinaryIO,
                          schema: pa.Schema) -> pa.Table:

        try:

            read_options = pa_csv.ReadOptions()
            read_options.encoding = 'utf-8'
            read_options.use_threads = False

            parse_options = pa_csv.ParseOptions()
            parse_options.newlines_in_values = True

            convert_options = pa_csv.ConvertOptions()
            convert_options.include_columns = schema.names
            convert_options.column_types = {
                n: t
                for (n, t) in zip(schema.names, schema.types)
            }
            convert_options.strings_can_be_null = True
            convert_options.quoted_strings_can_be_null = False

            return pa_csv.read_csv(source, read_options, parse_options,
                                   convert_options)

        except pa.ArrowInvalid as e:
            err = "CSV file decoding failed, content is garbled"
            self._log.exception(err)
            raise _ex.EDataCorruption(err) from e

        except pa.ArrowKeyError as e:
            err = "CSV file decoding failed, one or more columns are missing"
            self._log.error(err)
            self._log.exception(str(e))
            raise _ex.EDataCorruption(err) from e
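The same pattern can be used outside the class: derive ConvertOptions from a pyarrow Schema so that only the schema's columns are read and each one is cast to its declared type. The schema and file name below are illustrative.

import pyarrow as pa
import pyarrow.csv as pa_csv

schema = pa.schema([('id', pa.int64()), ('name', pa.string())])

convert_options = pa_csv.ConvertOptions(
    include_columns=schema.names,
    column_types=dict(zip(schema.names, schema.types)),
    strings_can_be_null=True,
)
table = pa_csv.read_csv('people.csv', convert_options=convert_options)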
Example #14
 def test_head_tail(self):
     ops = [Table.head, Table.tail]
     pa_table = csv.read_csv(self.csv_file)
     test_table = self.session.import_table(pa_table)
     for op in ops:
         result_table = op(test_table, num_rows=50)
         self.assertEqual(result_table.size, 50)
Example #15
 def test_select_distinct(self):
     pa_table = csv.read_csv(self.csv_file)
     test_table = self.session.import_table(pa_table)
     unique_table = test_table.select_distinct(cols=["a"])
     self.assertLessEqual(unique_table.size, pa_table.num_rows)
     unique_table = test_table.select_distinct(cols=[])
     self.assertLessEqual(unique_table.size, pa_table.num_rows)
Example #16
def read_feats(file_name):
    attrs = csv.read_csv(
        file_name,
        read_options=pyarrow.csv.ReadOptions(autogenerate_column_names=True),
        parse_options=pyarrow.csv.ParseOptions(delimiter=' '))
    num_cols = len(attrs.columns)
    return np.stack([attrs.columns[i].to_numpy() for i in range(num_cols)], 1)
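A quick illustrative check of read_feats on a tiny, made-up space-delimited file: one matrix row per line, one column per field.

with open('feats.txt', 'w') as f:
    f.write('0.1 0.2 0.3\n0.4 0.5 0.6\n')

feats = read_feats('feats.txt')
print(feats.shape)   # expected (2, 3)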
Example #17
 def test_group(self):
     pa_table = csv.read_csv(self.csv_file)
     test_table = self.session.import_table(pa_table)
     grouped_table = test_table.group_by(by=["a", "c"])
     self.assertLessEqual(grouped_table.size, test_table.size)
     grouped_table = test_table.group_by()
     self.assertLessEqual(grouped_table.size, 1)
Example #18
 def make(self, key):
     key = (CompassFile * CompassTable & key).fetch1()
     print('populating for {file}'.format(**key))
     tab = csv.read_csv(key['file'])
     u_names = list(tab[key['partition_col']].unique())
     rows = [dict(procedure=n, type=key['type']) for n in u_names]
     self.insert(rows, skip_duplicates=True)
Example #19
 def test_head_tail_by(self):
     ops = [Table.head_by, Table.tail_by]
     pa_table = csv.read_csv(self.csv_file)
     test_table = self.session.import_table(pa_table)
     for op in ops:
         result_table = op(test_table, num_rows=1, by=["a"])
         self.assertLessEqual(result_table.size, test_table.size)
Example #20
    def parse(self, **kwargs):
        import pyarrow as pa
        import pyarrow.csv as csv

        fname = kwargs.pop("fname", None)
        num_splits = kwargs.pop("num_splits", None)
        start = kwargs.pop("start", None)
        end = kwargs.pop("end", None)
        header = kwargs.pop("header", None)
        bio = open(fname, "rb")
        # The header line for the CSV file
        first_line = bio.readline()
        bio.seek(start)
        to_read = header + first_line + bio.read(end - start)
        bio.close()
        table = csv.read_csv(BytesIO(to_read),
                             parse_options=csv.ParseOptions(header_rows=1))
        chunksize = get_default_chunksize(table.num_columns, num_splits)
        chunks = [
            pa.Table.from_arrays(table.columns[chunksize * i:chunksize *
                                               (i + 1)])
            for i in range(num_splits)
        ]
        return chunks + [
            table.num_rows,
            pandas.Series(
                [t.to_pandas_dtype() for t in table.schema.types],
                index=table.schema.names,
            ),
        ]
Example #21
def format_to_parquet(src_file):
    if not src_file.endswith('.csv'):
        logging.error(
            "Can only accept source files in CSV format, for the moment")
        return
    table = pv.read_csv(src_file)
    pq.write_table(table, src_file.replace('.csv', '.parquet'))
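A hypothetical driver for format_to_parquet that converts every CSV in a local directory (the directory name is an example):

import glob

for path in glob.glob('./data/*.csv'):
    format_to_parquet(path)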
Example #22
def test_csv_fillna():
    target = {
        'CRIM': [0.00632],
        'ZN': [18.0],
        'INDUS': [2.31],
        'CHAS': [0.0],
        'NOX': [0.538],
        'RM': [6.575],
        'AGE': [65.2],
        'DIS': [4.09],
        'RAD': [1.0],
        'TAX': [296.0],
        'PTRATIO': [15.3],
        'B': [396.9],
        'LSTAT': [4.98],
        'PRICE': [24.0]
    }
    fp = io.BytesIO(
        b',CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE\n'
        b'0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0\n'
    )
    fp.seek(0)
    table = csv.read_csv(fp)
    assert table
    engine.importArrowTable("csv_fillna", table)
    assert bool(engine.closed) == False
    cursor = engine.executeDML(
        "select CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE from csv_fillna"
    )
    assert cursor
    batch = cursor.getArrowRecordBatch()
    assert batch
    assert batch.to_pydict() == target
Example #23
def multi_calc(path_list):

    for j in range(len(path_list)):
        train_df = csv.read_csv(path_list[j]).to_pandas()

        # train_df = train_df.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)
        train_df = train_df.sample(frac=0.2, replace=True).reset_index()

        not_click = train_df['click'] == 0
        not_clk_df = train_df[not_click]
        not_clk_df = not_clk_df.sample(frac=0.2, replace=True)

        is_click = train_df['click'] == 1
        is_clk_df = train_df[is_click]

        print(len(not_clk_df))
        print(len(is_clk_df))

        train_df = pd.concat([not_clk_df, is_clk_df])
        train_df = train_df.reset_index()
        train_df = train_df.drop(['level_0'], axis=1)

        print(f'{path_list[j]} : training dataset preprocessing....')

        n = 10000
        list_df = [train_df[i:i+n] for i in range(0,train_df.shape[0], n)]

        targets = ['site_id','site_domain','site_category',
                   'app_id','app_domain','app_category',
                   'device_id','device_ip','device_model']


        # process the first chunk
        # pre_df = drop_cols(list_df[0])
        pre_df = list_df[0].copy()
        for k in range(len(targets)):
            pre_df = hashing_info(pre_df, targets[k])
            # pre_df, label[k] = mapping_info(pre_df, targets[k], label[k])
        pre_df = split_hour(pre_df)
        list_df[0] = pre_df


        # process the remaining chunks + concat
        for idx in tqdm(range(1, len(list_df))):
            # pre_df = drop_cols(list_df[idx])
            pre_df = list_df[idx].copy()
            for k in range(len(targets)):
                pre_df = hashing_info(pre_df, targets[k])

            pre_df = split_hour(pre_df)
            list_df[0] = pd.concat([list_df[0], pre_df])

        train_df = list_df[0].reset_index()
        train_df = train_df.drop(['level_0'], axis=1)
        train_df = train_df.drop(['index'], axis=1)

        # name = './train_preprocess/new_prepros_' + str(k) + '.csv'
        name = path_list[j].replace('./split/split_train_', './train_preprocess/new_prepros_')
        print(name)
        train_df.to_csv(name, index=False)
Example #24
    def read_stores(self):
        s_cols = [
            'retailer_code', 'parent_code', 'fips_state_code',
            'fips_county_code', 'dma_code', 'store_zip3'
        ]
        # To reduce space -- update with dictionary arrays later
        store_convert = {
            'year': pa.uint16(),
            'dma_code': pa.uint16(),
            'retailer_code': pa.uint16(),
            'parent_code': pa.uint16(),
            'store_zip3': pa.uint16(),
            'fips_county_code': pa.uint16(),
            'fips_state_code': pa.uint8()
        }

        # Use pyarrow to read CSVs and parse using the dict -- we have to fix some types again later.
        tmp = pa.concat_tables([
            csv.read_csv(
                x,
                parse_options=csv.ParseOptions(delimiter='\t'),
                convert_options=csv.ConvertOptions(column_types=store_convert))
            for x in self.stores_dict.values()
        ]).to_pandas().rename(columns={'year': 'panel_year'})

        # some columns have blanks --fill with zero to avoid converting to floats(!)
        tmp.loc[:, s_cols] = tmp.loc[:, s_cols].fillna(0)

        # use the compressed types
        my_dict = {
            key: value
            for (key, value) in type_dict.items() if key in tmp.columns
        }
        self.stores_df = tmp.astype(my_dict)
        return
Example #25
def run_preprocess(num):
    pd.set_option('display.max_columns', None)
    path = './split/split_train_' + str(num) + '.csv'
    train_df = csv.read_csv(path).to_pandas()

    # train_df = train_df.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)
    start = time.time()
    train_df = train_df.sample(frac=0.2, replace=True).reset_index()
    print("time :", time.time() - start)

    not_click = train_df['click'] == 0
    not_clk_df = train_df[not_click]
    not_clk_df = not_clk_df.sample(frac=0.2, replace=True)

    is_click = train_df['click'] == 1
    is_clk_df = train_df[is_click]

    print(len(not_clk_df))
    print(len(is_clk_df))

    train_df = pd.concat([not_clk_df, is_clk_df])
    train_df = train_df.reset_index()
    train_df = train_df.drop(['level_0'], axis=1)

    print(f'{path} : training dataset preprocessing....')

    n = 10000
    list_df = [train_df[i:i+n] for i in range(0,train_df.shape[0], n)]

    targets = ['site_id','site_domain','site_category',
               'app_id','app_domain','app_category',
               'device_id','device_ip','device_model']


    # process the first chunk
    # pre_df = drop_cols(list_df[0])
    pre_df = list_df[0].copy()
    for k in range(len(targets)):
        pre_df = hashing_info(pre_df, targets[k])
        # pre_df, label[k] = mapping_info(pre_df, targets[k], label[k])
    pre_df = split_hour(pre_df)
    list_df[0] = pre_df


    # process the remaining chunks + concat
    for idx in tqdm(range(1, len(list_df))):
        # pre_df = drop_cols(list_df[idx])
        pre_df = list_df[idx].copy()
        for k in range(len(targets)):
            pre_df = hashing_info(pre_df, targets[k])

        pre_df = split_hour(pre_df)
        list_df[0] = pd.concat([list_df[0], pre_df])

    train_df = list_df[0].reset_index()
    train_df = train_df.drop(['level_0'], axis=1)
    train_df = train_df.drop(['index'], axis=1)

    name = './train_preprocess/new_prepros_' + str(num) + '.csv'
Example #26
    def test_sort(self):
        pa_table = csv.read_csv(self.csv_file)
        test_table = self.session.import_table(pa_table)
        sorted_table = test_table.sort(order_by=["a", "b"],
                                       order=[SortDirection.DESCENDING])
        df = sorted_table.snapshot().to_pandas()

        self.assertTrue(df.iloc[:, 0].is_monotonic_decreasing)
Example #27
 def test_drop_columns(self):
     pa_table = csv.read_csv(self.csv_file)
     table1 = self.session.import_table(pa_table)
     column_names = []
     for f in table1.schema:
         column_names.append(f.name)
     table2 = table1.drop_columns(cols=column_names[:-1])
     self.assertEqual(1, len(table2.schema))
Example #28
 def test_create_data_table_then_update(self):
     pa_table = csv.read_csv(self.csv_file)
     new_table = self.session.import_table(pa_table).update(
         formulas=['Sum = a + b + c + d'])
     pa_table2 = new_table.snapshot()
     df = pa_table2.to_pandas()
     self.assertEqual(df.shape[1], 6)
     self.assertEqual(1000, len(df.index))
Example #29
 def test_bind_table(self):
     pa_table = csv.read_csv(self.csv_file)
     user_table = self.session.import_table(pa_table)
     variable_name = "t"
     self.session.bind_table(variable_name, user_table)
     server_script = '''t2 = t.update("col1 = i*i")'''
     self.session.run_script(server_script)
     self.assertIn('t2', self.session.tables)
Example #30
 def test_close(self):
     pa_table = csv.read_csv(self.csv_file)
     table = self.session.import_table(pa_table)
     table.close()
     self.assertTrue(table.is_closed)
     table = self.session.empty_table(10)
     table.close()
     self.assertTrue(table.is_closed)
Example #31
File: test_csv.py  Project: wesm/arrow
 def test_random_csv(self):
     csv, expected = make_random_csv(num_cols=2, num_rows=100)
     csv_path = os.path.join(self.tmpdir, self.csv_filename)
     self.write_file(csv_path, csv)
     try:
         table = read_csv(csv_path)
     except pa.ArrowNotImplementedError as e:
         pytest.skip(str(e))
         return
     table._validate()
     assert table.schema == expected.schema
     assert table.equals(expected)
     assert table.to_pydict() == expected.to_pydict()
Example #32
File: test_csv.py  Project: wesm/arrow
 def read_csv(self, *args, **kwargs):
     read_options = kwargs.setdefault('read_options', ReadOptions())
     read_options.use_threads = True
     table = read_csv(*args, **kwargs)
     table._validate()
     return table