Python read_csv примеры, pyarrow.csv.read_csv Python примеры использования

Пример #1

0

Показать файл

    def to_pandas(id: int, columns: list = None, samples: list = None):
        """
		- After unzipping `gzip.open()`, bytesio still needed to be read into PyArrow before being read into Pandas.
		- All methods return all columns by default if they receive None: 
		  `pc.read_csv(read_options.column_names)`, `pa.read_table()`, `pd.read_csv(uscols)`, `pd.read_parquet(columns)`
		"""
        d = Dataset.get_by_id(id)
        is_compressed = d.is_compressed
        ff = d.file_format

        # When user provides only 1 column and forgets to [] it (e.g. the label column).
        if type(columns) == str:
            columns = [columns]

        data = d.data
        bytesio_data = io.BytesIO(data)
        if (ff == 'csv') or (ff == 'tsv'):
            # `pc.ReadOptions.column_names` verifies the existence of the names, does not filter for them.
            if is_compressed:
                bytesio_csv = gzip.open(bytesio_data)
                if ff == 'tsv':
                    parse_opt = pc.ParseOptions(delimiter='\t')
                    tbl = pc.read_csv(bytesio_csv, parse_options=parse_opt)
                else:
                    tbl = pc.read_csv(bytesio_csv)
                df = tbl.to_pandas()
                if columns is not None:
                    df = df.filter(columns)
            else:
                if ff == 'tsv':
                    df = pd.read_csv(bytesio_data, sep='\t', usecols=columns)
                else:
                    df = pd.read_csv(bytesio_data, usecols=columns)
        elif ff == 'parquet':
            if is_compressed:
                bytesio_parquet = gzip.open(bytesio_data)
                tbl = pq.read_table(bytesio_parquet, columns=columns)
                df = tbl.to_pandas()
            else:
                df = pd.read_parquet(bytesio_data, columns=columns)

        if samples is not None:
            df = df.iloc[samples]

        d_dtype = d.dtype
        if d_dtype is not None:
            if (type(d_dtype) == dict):
                if columns is None:
                    columns = d.columns
                # need to prune out the excluded columns from the dtype dict
                d_dtype_cols = list(d_dtype.keys())
                for col in d_dtype_cols:
                    if col not in columns:
                        del d_dtype[col]
            df = df.astype(d_dtype)

        return df

Пример #2

0

Показать файл

def load_model_movies_mtx():
    genome_dim = read_csv('genome-tags.csv').num_rows
    genome_table = read_csv('genome-scores.csv')
    movie_ids = genome_table['movieId'].to_numpy()
    scores = genome_table['relevance'].to_numpy()
    model_movies_mtx = {}
    for i in range(0, len(scores), genome_dim):
        model_movies_mtx[movie_ids[i]] = scores[i:i + genome_dim]
    return model_movies_mtx, genome_dim

Пример #3

0

Показать файл

 def test_import_table_long_csv(self):
     pa_table = csv.read_csv(self.csv_file)
     new_table = self.session.import_table(pa_table)
     pa_table2 = new_table.snapshot()
     self.assertEqual(pa_table2, pa_table)
     df = pa_table2.to_pandas()
     self.assertEquals(1000, len(df.index))

Пример #4

0

Показать файл

Файл: 01.py Проект: manselmi/data-engineering-assessment

def parse_green_taxi_csv(fobj):
    """
    Parse a binary file object of cleaned "green taxi" CSV data as
    returned by the "read_green_taxi_csv" function, and return a PyArrow
    table.
    """

    convert_options = ConvertOptions(
        column_types=SCHEMA,
        false_values=['N'],
        null_values=[''],
        timestamp_parsers=['%Y-%m-%d %H:%M:%S'],
        true_values=['Y'],
    )
    parse_options = ParseOptions(quote_char=False)
    read_options = ReadOptions(
        column_names=SCHEMA.names,
        encoding=ENCODING,
    )

    return read_csv(
        fobj,
        convert_options=convert_options,
        parse_options=parse_options,
        read_options=read_options,
    )

Пример #5

0

Показать файл

Файл: io.py Проект: pranabsarkar/modin

def _read_csv_with_offset_pyarrow_on_ray(fname, num_splits, start, end, kwargs,
                                         header):  # pragma: no cover
    """Use a Ray task to read a chunk of a CSV into a pyarrow Table.
     Note: Ray functions are not detected by codecov (thus pragma: no cover)
     Args:
        fname: The filename of the file to open.
        num_splits: The number of splits (partitions) to separate the DataFrame into.
        start: The start byte offset.
        end: The end byte offset.
        kwargs: The kwargs for the pyarrow `read_csv` function.
        header: The header of the file.
     Returns:
         A list containing the split pyarrow Tables and the the number of
         rows of the tables as the last element. This is used to determine
         the total length of the DataFrame to build a default Index.
    """
    bio = open(fname, "rb")
    # The header line for the CSV file
    first_line = bio.readline()
    bio.seek(start)
    to_read = header + first_line + bio.read(end - start)
    bio.close()
    table = csv.read_csv(BytesIO(to_read),
                         parse_options=csv.ParseOptions(header_rows=1))
    chunksize = get_default_chunksize(table.num_columns, num_splits)
    chunks = [
        pa.Table.from_arrays(table.columns[chunksize * i:chunksize * (i + 1)])
        for i in range(num_splits)
    ]
    return chunks + [
        table.num_rows,
        pandas.Series([t.to_pandas_dtype() for t in table.schema.types],
                      index=table.schema.names),
    ]

Пример #6

0

Показать файл

Файл: _arrow_parsers.py Проект: moj-analytical-services/mojap-arrow-pd-parser

def pa_read_csv(
    input_file: Union[IO, str],
    schema: Union[pa.Schema, None] = None,
    expect_full_schema: bool = True,
    **kwargs,
):
    """Read a csv file into an Arrow table.
    Args:
        input_file (Union[IO, str]): the CSV you want to read. string, path or
            file-like object.
        schema (pyarrow.Schema): pyarrow Schema with the expected columns wanted.
            If unset pyarrow will infer datatypes.
        expect_full_schema (bool, optional): if True, pyarrow reader will
            expect the input schema to have fields for every col in the
            input file. If False, then will only cast columns that
            are listed in the schema, leaving all other columns to their
            default type on read.
        **kwargs (optional): Additional kwargs are passed to pyarrow.csv.read_csv
    Returns:
        pyarrow.Table: the csv file in pyarrow format.
    """

    if schema:
        schema = _get_arrow_schema(schema)

    pa_csv_table = csv.read_csv(input_file=input_file, **kwargs)
    if schema:
        pa_csv_table = cast_arrow_table_to_schema(
            pa_csv_table, schema=schema, expect_full_schema=expect_full_schema)

    return pa_csv_table

Пример #7

0

Показать файл

Файл: import_test_data.py Проект: chipkent/deephaven-core

def import_taxi_records(dh_session: Session) -> Table:

    # download the CSV data and read it into a pyarrow table and prepare it for uploading into DH
    csv_file_name = download_csv(
        url=
        "https://nyc-tlc.s3.amazonaws.com/trip+data/yellow_tripdata_2020-12.csv"
    )
    pa_table = csv.read_csv(csv_file_name)

    # drop unwanted columns
    unwanted_columns = [
        "tpep_pickup_datetime", "tpep_dropoff_datetime", "RatecodeID",
        "store_and_fwd_flag", "PULocationID", "DOLocationID"
    ]
    pa_table = pa_table.drop(unwanted_columns)

    # drop any column with a unsupported data type
    for column, column_name in zip(pa_table.columns, pa_table.column_names):
        if not is_deephaven_compatible(column.type):
            print(
                f"drop column: {column_name} because of unsupported data type {column.type}"
            )
            pa_table = pa_table.drop([column_name])

    # upload the pyarrow table to the Deephaven server
    return dh_session.import_table(pa_table)

Пример #8

0

Показать файл

Файл: features.py Проект: seculayer/AI_Competitions_2

def feature_split():
    train_df = csv.read_csv('./new_train_normal.csv').to_pandas()

    print('train dataset feature split')

    cid_features = [
        'id', 'click', 'C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20',
        'C21'
    ]
    banner_features = ['id', 'click', 'banner_pos']
    app_features = ['id', 'click', 'app_id', 'app_domain', 'app_category']
    site_features = ['id', 'click', 'site_id', 'site_domain', 'site_category']
    device_features = [
        'id', 'click', 'device_id', 'device_ip', 'device_type',
        'device_conn_type'
    ]
    day_features = ['id', 'click', 'day', 'date']

    preprocess.feature_split(train_df, cid_features)
    preprocess.feature_split(train_df, banner_features)
    preprocess.feature_split(train_df, app_features)
    preprocess.feature_split(train_df, site_features)
    preprocess.feature_split(train_df, device_features)
    preprocess.feature_split(train_df, day_features)

    print("train feature split compeleted.")

Пример #9

0

Показать файл

Файл: test_table.py Проект: JamesXNelson/deephaven-core

 def test_count_by(self):
     pa_table = csv.read_csv(self.csv_file)
     test_table = self.session.import_table(pa_table)
     num_distinct_a = test_table.select_distinct(column_names=["a"]).size
     result_table = test_table.count_by(count_column="b",
                                        column_names=["a"])
     self.assertEqual(result_table.size, num_distinct_a)

Пример #10

0

Показать файл

def test_time_parsing():
    target = {
        'timestamp': [
            datetime.datetime(2010, 4, 1, 0, 0),
            datetime.datetime(2010, 4, 1, 0, 30),
            datetime.datetime(2010, 4, 1, 1, 0)
        ],
        'symbol': ['USD/JPY', 'USD/JPY', 'USD/JPY'],
        'high': [93.526, 93.475, 93.421],
        'low': [93.361, 93.352, 93.326],
        'open': [93.518, 93.385, 93.391],
        'close': [93.382, 93.391, 93.384],
        'spread': [0.005, 0.006, 0.006],
        'volume': [3049, 2251, 1577]
    }
    fp = io.BytesIO(
        b'timestamp,symbol,high,low,open,close,spread,volume\n'
        b'2010-04-01 00:00:00,USD/JPY,93.52600,93.36100,93.51800,93.38200,0.00500,3049\n'
        b'2010-04-01 00:30:00,USD/JPY,93.47500,93.35200,93.38500,93.39100,0.00600,2251\n'
        b'2010-04-01 01:00:00,USD/JPY,93.42100,93.32600,93.39100,93.38400,0.00600,1577\n'
    )
    fp.seek(0)
    table = csv.read_csv(fp)
    assert table
    engine.importArrowTable("time_parsing", table)
    assert bool(engine.closed) == False
    cursor = engine.executeDML("select * from time_parsing")
    assert cursor
    batch = cursor.getArrowRecordBatch()
    assert batch
    assert batch.to_pydict() == target

Пример #11

0

Показать файл

def test_usecols_csv():
    target = {
        'a': [1, 2, 3, 4, 5, 6],
        'b': [2, 3, 4, 5, 6, 7],
        'c': [3, 4, 5, 6, 7, 8],
        'd': [4, 5, 6, 7, 8, 9],
        'e': ['5', '6', '7', '8', '9', '0']
    }
    fp = io.BytesIO(
        b'a,b,c,d,e\n1,2,3,4,5\n2,3,4,5,6\n3,4,5,6,7\n4,5,6,7,8\n5,6,7,8,9\n6,7,8,9,0'
    )
    fp.seek(0)
    table = csv.read_csv(fp,
                         convert_options=csv.ConvertOptions(
                             column_types={
                                 'a': pa.int32(),
                                 'b': pa.int64(),
                                 'c': pa.int64(),
                                 'd': pa.int64(),
                                 'e': pa.string(),
                             }))
    assert table
    engine.importArrowTable("usecols", table)
    assert bool(engine.closed) == False
    cursor = engine.executeDML("select * from usecols")
    assert cursor
    batch = cursor.getArrowRecordBatch()
    assert batch
    assert batch.to_pydict() == target

Пример #12

0

Показать файл

def test_pyarrow_statistics():
    table = pv.read_csv("./data/pets/pets1.csv")
    pq.write_table(table, "./tmp/pyarrow_out/pets1.parquet")
    parquet_file = pq.ParquetFile("./tmp/pyarrow_out/pets1.parquet")
    stats = parquet_file.metadata.row_group(0).column(1).statistics
    assert stats.min == 1
    assert stats.max == 9

Пример #13

0

Показать файл

    def _read_table_arrow(self, source: tp.BinaryIO,
                          schema: pa.Schema) -> pa.Table:

        try:

            read_options = pa_csv.ReadOptions()
            read_options.encoding = 'utf-8'
            read_options.use_threads = False

            parse_options = pa_csv.ParseOptions()
            parse_options.newlines_in_values = True

            convert_options = pa_csv.ConvertOptions()
            convert_options.include_columns = schema.names
            convert_options.column_types = {
                n: t
                for (n, t) in zip(schema.names, schema.types)
            }
            convert_options.strings_can_be_null = True
            convert_options.quoted_strings_can_be_null = False

            return pa_csv.read_csv(source, read_options, parse_options,
                                   convert_options)

        except pa.ArrowInvalid as e:
            err = f"CSV file decoding failed, content is garbled"
            self._log.exception(err)
            raise _ex.EDataCorruption(err) from e

        except pa.ArrowKeyError as e:
            err = f"CSV file decoding failed, one or more columns is missing"
            self._log.error(err)
            self._log.exception(str(e))
            raise _ex.EDataCorruption(err) from e

Пример #14

0

Показать файл

 def test_head_tail(self):
     ops = [Table.head, Table.tail]
     pa_table = csv.read_csv(self.csv_file)
     test_table = self.session.import_table(pa_table)
     for op in ops:
         result_table = op(test_table, num_rows=50)
         self.assertEqual(result_table.size, 50)

Пример #15

0

Показать файл

 def test_select_distinct(self):
     pa_table = csv.read_csv(self.csv_file)
     test_table = self.session.import_table(pa_table)
     unique_table = test_table.select_distinct(cols=["a"])
     self.assertLessEqual(unique_table.size, pa_table.num_rows)
     unique_table = test_table.select_distinct(cols=[])
     self.assertLessEqual(unique_table.size, pa_table.num_rows)

Пример #16

0

Показать файл

def read_feats(file_name):
    attrs = csv.read_csv(
        file_name,
        read_options=pyarrow.csv.ReadOptions(autogenerate_column_names=True),
        parse_options=pyarrow.csv.ParseOptions(delimiter=' '))
    num_cols = len(attrs.columns)
    return np.stack([attrs.columns[i].to_numpy() for i in range(num_cols)], 1)

Пример #17

0

Показать файл

 def test_group(self):
     pa_table = csv.read_csv(self.csv_file)
     test_table = self.session.import_table(pa_table)
     grouped_table = test_table.group_by(by=["a", "c"])
     self.assertLessEqual(grouped_table.size, test_table.size)
     grouped_table = test_table.group_by()
     self.assertLessEqual(grouped_table.size, 1)

Пример #18

0

Показать файл

 def make(self, key):
     key = (CompassFile * CompassTable & key).fetch1()
     print('populating for {file}'.format(**key))
     tab = csv.read_csv(key['file'])
     u_names = list(tab[key['partition_col']].unique())
     rows = [dict(procedure=n, type=key['type']) for n in u_names]
     self.insert(rows, skip_duplicates=True)

Пример #19

0

Показать файл

 def test_head_tail_by(self):
     ops = [Table.head_by, Table.tail_by]
     pa_table = csv.read_csv(self.csv_file)
     test_table = self.session.import_table(pa_table)
     for op in ops:
         result_table = op(test_table, num_rows=1, by=["a"])
         self.assertLessEqual(result_table.size, test_table.size)

Пример #20

0

Показать файл

    def parse(self, **kwargs):
        import pyarrow as pa
        import pyarrow.csv as csv

        fname = kwargs.pop("fname", None)
        num_splits = kwargs.pop("num_splits", None)
        start = kwargs.pop("start", None)
        end = kwargs.pop("end", None)
        header = kwargs.pop("header", None)
        bio = open(fname, "rb")
        # The header line for the CSV file
        first_line = bio.readline()
        bio.seek(start)
        to_read = header + first_line + bio.read(end - start)
        bio.close()
        table = csv.read_csv(BytesIO(to_read),
                             parse_options=csv.ParseOptions(header_rows=1))
        chunksize = get_default_chunksize(table.num_columns, num_splits)
        chunks = [
            pa.Table.from_arrays(table.columns[chunksize * i:chunksize *
                                               (i + 1)])
            for i in range(num_splits)
        ]
        return chunks + [
            table.num_rows,
            pandas.Series(
                [t.to_pandas_dtype() for t in table.schema.types],
                index=table.schema.names,
            ),
        ]

Пример #21

0

Показать файл

Файл: taxi_gcs_dag.py Проект: rparthas/data

def format_to_parquet(src_file):
    if not src_file.endswith('.csv'):
        logging.error(
            "Can only accept source files in CSV format, for the moment")
        return
    table = pv.read_csv(src_file)
    pq.write_table(table, src_file.replace('.csv', '.parquet'))

Пример #22

0

Показать файл

def test_csv_fillna():
    target = {
        'CRIM': [0.00632],
        'ZN': [18.0],
        'INDUS': [2.31],
        'CHAS': [0.0],
        'NOX': [0.538],
        'RM': [6.575],
        'AGE': [65.2],
        'DIS': [4.09],
        'RAD': [1.0],
        'TAX': [296.0],
        'PTRATIO': [15.3],
        'B': [396.9],
        'LSTAT': [4.98],
        'PRICE': [24.0]
    }
    fp = io.BytesIO(
        b',CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE\n'
        b'0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0\n'
    )
    fp.seek(0)
    table = csv.read_csv(fp)
    assert table
    engine.importArrowTable("csv_fillna", table)
    assert bool(engine.closed) == False
    cursor = engine.executeDML(
        "select CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE from csv_fillna"
    )
    assert cursor
    batch = cursor.getArrowRecordBatch()
    assert batch
    assert batch.to_pydict() == target

Пример #23

0

Показать файл

Файл: preprocess.py Проект: seculayer/AI_Competitions_2

def multi_calc(path_list):

    for j in range(len(path_list)):
        train_df = csv.read_csv(path_list[j]).to_pandas()

        # train_df = train_df.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)
        train_df = train_df.sample(frac=0.2, replace=True).reset_index()

        not_click = train_df['click'] == 0
        not_clk_df = train_df[not_click]
        not_clk_df = not_clk_df.sample(frac=0.2, replace=True)

        is_click = train_df['click'] == 1
        is_clk_df = train_df[is_click]

        print(len(not_clk_df))
        print(len(is_clk_df))

        train_df = pd.concat([not_clk_df, is_clk_df])
        train_df = train_df.reset_index()
        train_df = train_df.drop(['level_0'], axis=1)

        print(f'{path_list[j]} : traing dataset preprocessing....')

        n = 10000
        list_df = [train_df[i:i+n] for i in range(0,train_df.shape[0], n)]

        targets = ['site_id','site_domain','site_category',
                   'app_id','app_domain','app_category',
                   'device_id','device_ip','device_model']


        #  첫번째 청크 처리
        # pre_df = drop_cols(list_df[0])
        pre_df = list_df[0].copy()
        for k in range(len(targets)):
            pre_df = hashing_info(pre_df, targets[k])
            # pre_df, label[k] = mapping_info(pre_df, targets[k], label[k])
        pre_df = split_hour(pre_df)
        list_df[0] = pre_df


        #  남은 청크들 처리 + concat
        for idx in tqdm(range(1, len(list_df))):
            # pre_df = drop_cols(list_df[idx])
            pre_df = list_df[idx].copy()
            for k in range(len(targets)):
                pre_df = hashing_info(pre_df, targets[k])

            pre_df = split_hour(pre_df)
            list_df[0] = pd.concat([list_df[0], pre_df])

        train_df = list_df[0].reset_index()
        train_df = train_df.drop(['level_0'], axis=1)
        train_df = train_df.drop(['index'], axis=1)

        # name = './train_preprocess/new_prepros_' + str(k) + '.csv'
        name = path_list[j].replace('./split/split_train_', './train_preprocess/new_prepros_')
        print(name)
        train_df.to_csv(name, index=False)

Пример #24

0

Показать файл

Файл: nielsen_read2.py Проект: chrisconlon/kiltsnielsen

    def read_stores(self):
        s_cols = [
            'retailer_code', 'parent_code', 'fips_state_code',
            'fips_county_code', 'dma_code', 'store_zip3'
        ]
        # To reduce space -- update with dictionary arrays later
        store_convert = {
            'year': pa.uint16(),
            'dma_code': pa.uint16(),
            'retailer_code': pa.uint16(),
            'parent_code': pa.uint16(),
            'store_zip3': pa.uint16(),
            'fips_county_code': pa.uint16(),
            'fips_state_code': pa.uint8()
        }

        # Use pyarrow to read CSVs and parse using the dict -- we have to fix some types again later.
        tmp = pa.concat_tables([
            csv.read_csv(
                x,
                parse_options=csv.ParseOptions(delimiter='\t'),
                convert_options=csv.ConvertOptions(column_types=store_convert))
            for x in self.stores_dict.values()
        ]).to_pandas().rename(columns={'year': 'panel_year'})

        # some columns have blanks --fill with zero to avoid converting to floats(!)
        tmp.loc[:, s_cols] = tmp.loc[:, s_cols].fillna(0)

        # use the compressed types
        my_dict = {
            key: value
            for (key, value) in type_dict.items() if key in tmp.columns
        }
        self.stores_df = tmp.astype(my_dict)
        return

Пример #25

0

Показать файл

Файл: preprocess.py Проект: seculayer/AI_Competitions_2

def run_preprocess(num):
    pd.set_option('display.max_columns', None)
    path = './split/split_train_' + str(num) + '.csv'
    train_df = csv.read_csv(path).to_pandas()

    # train_df = train_df.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)
    start = time.time()
    train_df = train_df.sample(frac=0.2, replace=True).reset_index()
    print("time :", time.time() - start)

    not_click = train_df['click'] == 0
    not_clk_df = train_df[not_click]
    not_clk_df = not_clk_df.sample(frac=0.2, replace=True)

    is_click = train_df['click'] == 1
    is_clk_df = train_df[is_click]

    print(len(not_clk_df))
    print(len(is_clk_df))

    train_df = pd.concat([not_clk_df, is_clk_df])
    train_df = train_df.reset_index()
    train_df = train_df.drop(['level_0'], axis=1)

    print(f'{path} : traing dataset preprocessing....')

    n = 10000
    list_df = [train_df[i:i+n] for i in range(0,train_df.shape[0], n)]

    targets = ['site_id','site_domain','site_category',
               'app_id','app_domain','app_category',
               'device_id','device_ip','device_model']


    #  첫번째 청크 처리
    # pre_df = drop_cols(list_df[0])
    pre_df = list_df[0].copy()
    for k in range(len(targets)):
        pre_df = hashing_info(pre_df, targets[k])
        # pre_df, label[k] = mapping_info(pre_df, targets[k], label[k])
    pre_df = split_hour(pre_df)
    list_df[0] = pre_df


    #  남은 청크들 처리 + concat
    for idx in tqdm(range(1, len(list_df))):
        # pre_df = drop_cols(list_df[idx])
        pre_df = list_df[idx].copy()
        for k in range(len(targets)):
            pre_df = hashing_info(pre_df, targets[k])

        pre_df = split_hour(pre_df[idx])
        list_df[0] = pd.concat([list_df[0], pre_df])

    train_df = list_df[0].reset_index()
    train_df = train_df.drop(['level_0'], axis=1)
    train_df = train_df.drop(['index'], axis=1)

    name = './train_preprocess/new_prepros_' + str(num) + '.csv'

Пример #26

0

Показать файл

    def test_sort(self):
        pa_table = csv.read_csv(self.csv_file)
        test_table = self.session.import_table(pa_table)
        sorted_table = test_table.sort(order_by=["a", "b"],
                                       order=[SortDirection.DESCENDING])
        df = sorted_table.snapshot().to_pandas()

        self.assertTrue(df.iloc[:, 0].is_monotonic_decreasing)

Пример #27

0

Показать файл

 def test_drop_columns(self):
     pa_table = csv.read_csv(self.csv_file)
     table1 = self.session.import_table(pa_table)
     column_names = []
     for f in table1.schema:
         column_names.append(f.name)
     table2 = table1.drop_columns(cols=column_names[:-1])
     self.assertEquals(1, len(table2.schema))

Пример #28

0

Показать файл

 def test_create_data_table_then_update(self):
     pa_table = csv.read_csv(self.csv_file)
     new_table = self.session.import_table(pa_table).update(
         formulas=['Sum = a + b + c + d'])
     pa_table2 = new_table.snapshot()
     df = pa_table2.to_pandas()
     self.assertEquals(df.shape[1], 6)
     self.assertEquals(1000, len(df.index))

Пример #29

0

Показать файл

 def test_bind_table(self):
     pa_table = csv.read_csv(self.csv_file)
     user_table = self.session.import_table(pa_table)
     variable_name = "t"
     self.session.bind_table(variable_name, user_table)
     server_script = '''t2 = t.update("col1 = i*i")'''
     self.session.run_script(server_script)
     self.assertIn('t2', self.session.tables)

Пример #30

0

Показать файл

 def test_close(self):
     pa_table = csv.read_csv(self.csv_file)
     table = self.session.import_table(pa_table)
     table.close()
     self.assertTrue(table.is_closed)
     table = self.session.empty_table(10)
     table.close()
     self.assertTrue(table.is_closed)

Пример #31

0

Показать файл

Файл: test_csv.py Проект: wesm/arrow

 def test_random_csv(self):
     csv, expected = make_random_csv(num_cols=2, num_rows=100)
     csv_path = os.path.join(self.tmpdir, self.csv_filename)
     self.write_file(csv_path, csv)
     try:
         table = read_csv(csv_path)
     except pa.ArrowNotImplementedError as e:
         pytest.skip(str(e))
         return
     table._validate()
     assert table.schema == expected.schema
     assert table.equals(expected)
     assert table.to_pydict() == expected.to_pydict()

Пример #32

0

Показать файл

Файл: test_csv.py Проект: wesm/arrow

 def read_csv(self, *args, **kwargs):
     read_options = kwargs.setdefault('read_options', ReadOptions())
     read_options.use_threads = True
     table = read_csv(*args, **kwargs)
     table._validate()
     return table

Python read_csv примеры использования