Example #1
File: test_csv.py Project: ahasha/odo
def test_csv_to_compressed_csv():
    with tmpfile('.csv') as fn:
        with open(fn, 'w') as f:
            f.write('a,1\nb,2\nc,3')
        with tmpfile('.csv.gz') as gfn:
            result = odo(fn, gfn)
            assert odo(result, list) == odo(fn, list)
Example #2
def into(input, output, name, identifier):
    """

    :param input:
    :param output:
    :param name:
    :param identifier:

    :return:

    """

    with tempfile.TemporaryDirectory() as directory:
        source = os.path.join(directory, os.path.basename(input))

        with open(input, "r") as fin, open(source, "w") as fout:
            reader = csv.reader(fin)
            writer = csv.writer(fout)

            headers = next(reader)
            headers = [__format__(name, header) for header in headers]
            headers = ["TableNumber"] + headers

            writer.writerow(headers)

            for row in reader:
                writer.writerow([identifier] + row)

        odo.odo(source, "{}::{}".format(output, name), has_header=True, delimiter=",")
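A minimal usage sketch for the helper above; the CSV path, database URI, table name, and identifier are hypothetical:

into("Cells.csv", "sqlite:///example.db", "cells", "plate_01_well_A01")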
Example #3
def test_nan_to_nat():
    assert odo(float('nan'), pd.Timestamp) is pd.NaT
    assert odo(np.nan, pd.Timestamp) is pd.NaT

    with pytest.raises(NetworkXNoPath):
        # Check that only nan can be converted.
        odo(0.5, pd.Timestamp)
Example #4
def test_dist(nyc):
    def distance(lat1, lon1, lat2, lon2, R=3959):
        # http://andrew.hedges.name/experiments/haversine/
        dlon = radians(lon2 - lon1)
        dlat = radians(lat2 - lat1)
        a = sin(dlat / 2.0) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2.0) ** 2
        return R * 2 * atan2(sqrt(a), sqrt(1 - a))

    t = symbol('t', discover(nyc))

    filtered = t[
        (t.pickup_latitude >= 40.477399) &
        (t.pickup_latitude <= 40.917577) &
        (t.dropoff_latitude >= 40.477399) &
        (t.dropoff_latitude <= 40.917577) &
        (t.pickup_longitude >= -74.259090) &
        (t.pickup_longitude <= -73.700272) &
        (t.dropoff_longitude >= -74.259090) &
        (t.dropoff_longitude <= -73.700272) &
        (t.passenger_count < 6)
    ]
    dist = distance(filtered.pickup_latitude, filtered.pickup_longitude,
                    filtered.dropoff_latitude, filtered.dropoff_longitude)
    transformed = transform(filtered, dist=dist)
    assert (
        odo(compute(transformed.dist.max(), nyc), float) ==
        odo(compute(transformed.dist, nyc), pd.Series).max().item()
    )
Example #5
def test_shift_arithmetic(sql, n):
    t = symbol('t', discover(sql))
    expr = t.B - t.B.shift(n)
    result = odo(compute(expr, sql), pd.Series)
    df = odo(sql, pd.DataFrame)
    expected = df.B - df.B.shift(n)
    tm.assert_series_equal(result, expected)
Example #6
def test_strlen(ctx, db):
    expr = db.t.name.strlen()
    result = odo(compute(expr, ctx), pd.Series)
    expected = compute(expr, {db: {'t': df}})
    assert result.name == 'name'
    assert expected.name == 'name'
    assert odo(result, set) == odo(expected, set)
Example #7
def test_convert_logfiles_to_bag():
    with filetexts({'a1.log': 'Hello\nWorld', 'a2.log': 'Hola\nMundo'}) as fns:
        logs = chunks(TextFile)(list(map(TextFile, fns)))
        b = odo(logs, Bag)
        assert isinstance(b, Bag)
        assert 'a1.log' in str(b.dask.values())
        assert odo(b, list) == odo(logs, list)
Example #8
def concrete_head(expr, n=10):
    """ Return head of computed expression """
    if not expr._resources():
        raise ValueError("Expression does not contain data resources")
    if not iscollection(expr.dshape):
        return compute(expr)

    head = expr.head(n + 1)

    if not iscollection(expr.dshape):
        return odo(head, object)
    elif isrecord(expr.dshape.measure):
        return odo(head, DataFrame)
    else:
        df = odo(head, DataFrame)
        df.columns = [expr._name]
        return df
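    # NOTE: every branch above returns, so the code below is unreachable dead code.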
    result = compute(head)

    if len(result) == 0:
        return DataFrame(columns=expr.fields)
    if isrecord(expr.dshape.measure):
        return odo(result, DataFrame, dshape=expr.dshape)
    else:
        df = odo(result, DataFrame, dshape=expr.dshape)
        df.columns = [expr._name]
        return df
Example #9
File: core.py Project: kongscn/zipline
def get_materialized_checkpoints(checkpoints, colnames, lower_dt, odo_kwargs):
    """
    Computes a lower bound and a DataFrame of materialized checkpoints.

    Parameters
    ----------
    checkpoints : Expr
        Bound blaze expression for a checkpoints table from which to get a
        computed lower bound.
    colnames : iterable of str
        The names of the columns for which checkpoints should be computed.
    lower_dt : pd.Timestamp
        The lower date being queried for that serves as an upper bound for
        checkpoints.
    odo_kwargs : dict, optional
        The extra keyword arguments to pass to ``odo``.
    """
    if checkpoints is not None:
        ts = checkpoints[TS_FIELD_NAME]
        checkpoints_ts = odo(ts[ts <= lower_dt].max(), pd.Timestamp)
        if pd.isnull(checkpoints_ts):
            materialized_checkpoints = pd.DataFrame(columns=colnames)
            lower = None
        else:
            materialized_checkpoints = odo(
                checkpoints[ts == checkpoints_ts][colnames],
                pd.DataFrame,
                **odo_kwargs
            )
            lower = checkpoints_ts
    else:
        materialized_checkpoints = pd.DataFrame(columns=colnames)
        lower = None
    return lower, materialized_checkpoints
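A minimal usage sketch for the helper above; the bound checkpoints expression, column names, and timestamp are hypothetical:

lower, materialized = get_materialized_checkpoints(
    checkpoints=checkpoints_expr,        # hypothetical bound blaze expression
    colnames=['sid', 'value'],           # hypothetical column names
    lower_dt=pd.Timestamp('2014-01-06'),
    odo_kwargs={},
)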
Example #10
def test_seed(dataset):
    munge_file = pkg_resources.resource_filename(__name__, "../perturbation/scripts/munge.sh")

    if dataset["munge"]:
        subprocess.call([munge_file, dataset["data_dir"]])

    config_file = os.path.join(dataset["data_dir"], "config.ini")

    config = configparser.ConfigParser()

    config.read(config_file)

    with tempfile.TemporaryDirectory() as temp_dir:
        sqlite_file = os.path.join(temp_dir, "test.db")

        perturbation.ingest.seed(config=config, source=dataset["data_dir"], target="sqlite:///{}".format(str(sqlite_file)))

        for (k, v) in dict({"cells": "Cells.csv", "cytoplasm": "Cytoplasm.csv", "nuclei": "Nuclei.csv"}).items():
            config["filenames"][k] = v

        for table_key in ["image", "cells", "cytoplasm", "nuclei"]:
            csv_filename = os.path.join(temp_dir, config["filenames"][table_key])

            table_name = config["filenames"][table_key].split(".")[0]

            odo.odo("sqlite:///{}::{}".format(str(sqlite_file), table_name), csv_filename)

            df = pd.read_csv(csv_filename)

            assert df.shape[0] == dataset["ingest"]["{}_nrows".format(table_name)]

            assert df.shape[1] == dataset["ingest"]["{}_ncols".format(table_name)] + 1

            if table_key != "image":
                assert df.groupby(["TableNumber", "ImageNumber"]).size().sum() == dataset["ingest"]["{}_nrows".format(table_name)]
Example #11
def create_plot(team="LAA", year=2012):
    expr = bz.by(db.Salaries.teamID,
                 avg=db.Salaries.salary.mean(),
                 max=db.Salaries.salary.max(),
                 ratio=db.Salaries.salary.max() / db.Salaries.salary.min())
    expr = expr.sort('ratio', ascending=False)

    df_salary_gb = odo(expr, pd.DataFrame)
    source1 = odo(df_salary_gb[["teamID", "avg"]], ColumnDataSource)

    plot1 = plt.figure(title="Salary ratio by team", x_range=list(df_salary_gb["teamID"]))
    plot1.scatter(x="teamID", y="avg", source=source1, size=20)
    plot1.xaxis.major_label_orientation = np.pi/3

    df = odo(db.Salaries, pd.DataFrame)
    df = df[df["teamID"] == team]
    df = df[df["yearID"] == year]

    df = df[["playerID","salary"]].sort('salary')
    source_team = odo(df, ColumnDataSource)
    p_team = plt.figure(title="Salary of players for %s during %s" % (team, year),
                        x_range=list(df["playerID"]))#, tools=TOOLS)
    p_team.scatter(x="playerID", y="salary", source=source_team, size=20)
    p_team.xaxis.major_label_orientation = np.pi/3

    p = plt.gridplot([[plot1, p_team]])
    return p
Example #12
def test_convert_logfiles_to_bag():
    with filetexts({'a1.log': 'Hello\nWorld', 'a2.log': 'Hola\nMundo'}) as fns:
        logs = chunks(TextFile)(list(map(TextFile, fns)))
        b = odo(logs, Bag)
        assert isinstance(b, Bag)
        assert (list(map(methodcaller('strip'), odo(b, list))) ==
                list(map(methodcaller('strip'), odo(logs, list))))
Example #13
def test_append_chunks():
    tbl = resource("sqlite:///:memory:::test", dshape="var * {a: int, b: int}")
    res = odo(chunks(np.ndarray)((np.array([[0, 1], [2, 3]]), np.array([[4, 5], [6, 7]]))), tbl)
    assert res is tbl
    assert (
        odo(tbl, np.ndarray) == np.array([(0, 1), (2, 3), (4, 5), (6, 7)], dtype=[("a", "<i4"), ("b", "<i4")])
    ).all()
Example #14
def test_df_to_in_memory_db():
    df = pd.DataFrame([[1, 2], [3, 4]], columns=list('ab'))
    tbl = odo(df, 'sqlite:///:memory:::tbl')
    pd.util.testing.assert_frame_equal(
        odo(tbl, pd.DataFrame),
        df,
    )
Example #15
    def handle(self, *args, **options):
        # set up
        config = get_config()
        if config is None:
            raise CommandError('Unable to process configuration file p_to_p.yml')

        connection = get_connection(config)
        pedsnet_session = init_pedsnet(connection)
        init_pcornet(connection)

        observation_period = pedsnet_session.query(ObservationPeriod.person_id,
                                                   ObservationPeriod.observation_period_start_date,
                                                   ObservationPeriod.observation_period_end_date,
                                                   ObservationPeriod.site,
                                                   bindparam("chart", 'Y'),
                                                   bindparam("enr_basis", 'E')
                                                   ).filter(
            exists().where(ObservationPeriod.person_id == PersonVisit.person_id)).all()

        odo(observation_period, Enrollment.__table__,
            dshape='var * {patid: string, enr_start_date: date, enr_end_date: date, site: string, chart: string, '
                   'enr_basis: string}'
            )
        # close session
        pedsnet_session.close()

        # output result
        self.stdout.ending = ''
        print('Enrollment ETL completed successfully', end='', file=self.stdout)
Example #16
def test_str_len(ctx, db):
    expr = db.t.name.str.len()
    result = odo(compute(expr, ctx, return_type='native'), pd.Series)
    expected = compute(expr, {db: {'t': df}}, return_type='native')
    assert result.name == 'name'
    assert expected.name == 'name'
    assert odo(result, set) == odo(expected, set)
Example #17
File: test_postgres.py Project: jni/odo
def test_na_value(sql, csv):
    sql = odo(null_data, sql)
    with tmpfile('.csv') as fn:
        csv = odo(sql, fn, na_value='NA')
        with open(csv.path, 'rt') as f:
            raw = f.read()
    assert raw == 'a,b\n1,NA\n10,20\n100,200\n'
Example #18
def test_compute_kwargs(test, serial):
    expr = t.dumb.sort()
    bad_query = {'expr': to_tree(expr)}

    result = test.post(
        '/compute',
        headers=mimetype(serial),
        data=serial.dumps(bad_query),
    )
    assert result.status_code == 500
    assert b'return_df must be passed' in result.data

    good_query = {
        'expr': to_tree(expr),
        'compute_kwargs': {
            'return_df': odo(DumbResource.df, list),
        },
    }
    result = test.post(
        '/compute',
        headers=mimetype(serial),
        data=serial.dumps(good_query)
    )
    assert result.status_code == 200
    data = serial.loads(result.data)
    dshape = discover(DumbResource.df)
    assert_dshape_equal(
        datashape.dshape(data['datashape']),
        dshape,
    )
    assert_frame_equal(
        odo(data['data'], DataFrame, dshape=dshape),
        DumbResource.df,
    )
Example #19
def test_sql_to_csv(sql, csv, tmpdir):
    sql, bind = sql
    sql = odo(csv, sql, bind=bind)
    with tmpfile('.csv', dir=tmpdir) as fn:
        csv = odo(sql, fn, bind=bind)
        assert odo(csv, list) == data
        assert discover(csv).measure.names == discover(sql).measure.names
Example #20
def merge_temp_files(dbname, image_names=None, do_odo=False):
    if do_odo:
        logging.info('Merging temp files with odo.')
    else:
        logging.info('Merging temp files manually.')

    if image_names is None:
        image_names = get_image_names(dbname)

    dbname_base, ext = os.path.splitext(dbname)
    dbnamenew = dbname_base + '_cleaned' + ext
    logging.info('Creating concatenated db file {}'.format(dbnamenew))
    if not do_odo:
        df = []
    for image_name in image_names:
        try:
            if do_odo:
                odo('hdfstore://{}::df'.format(get_temp_fname(image_name)),
                    'hdfstore://{}::df'.format(dbnamenew))
            else:
                df.append(pd.read_hdf(get_temp_fname(image_name), 'df'))
        except OSError:
            continue
        else:
            os.remove(get_temp_fname(image_name))
    if not do_odo:
        # With odo, rows were already appended to the new store inside the loop,
        # so the manual concatenation only applies to the pandas path.
        df = pd.concat(df, ignore_index=True)
        df.to_hdf(dbnamenew, 'df',
                  format='table',
                  data_columns=data_columns)
    logging.info('Duplicates removal complete.')
    return dbnamenew
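A minimal usage sketch for the helper above; the HDF5 store name and image names are hypothetical:

cleaned_db = merge_temp_files('observations.h5',
                              image_names=['image_001', 'image_002'],
                              do_odo=False)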
Example #21
def each_partition(values):
    for fpath in values:
        print('Processing: ', fpath)
        reader = FACC1Reader(fpath)
        for fname, df in reader:
            output_path = get_output_path(fpath, fname)
            odo(df, output_path)
Example #22
File: test_csv.py Project: shoyer/odo
def test_csv_to_compressed_csv():
    with tmpfile(".csv") as fn:
        with open(fn, "w") as f:
            f.write("a,1\nb,2\nc,3")
        with tmpfile(".csv.gz") as gfn:
            result = odo(fn, gfn)
            assert odo(result, list) == odo(fn, list)
Example #23
def test_math(ctx, db, func):
    expr = func(db.t.amount)
    result = compute(expr, ctx)
    expected = compute(expr, {db: {'t': df}})
    np.testing.assert_allclose(np.sort(odo(result, np.ndarray,
                                           dshape=expr.dshape)),
                               np.sort(odo(expected, np.ndarray)))
Example #24
def test_sql_select_to_csv(sql, csv, tmpdir):
    sql, bind = sql
    sql = odo(csv, sql, bind=bind)
    query = sa.select([sql.c.a])
    with tmpfile('.csv', dir=tmpdir) as fn:
        csv = odo(query, fn, bind=bind)
        assert odo(csv, list) == [(x,) for x, _ in data]
Example #25
def test_invalid_escapechar(sql, csv):
    sql, bind = sql
    with pytest.raises(ValueError):
        odo(csv, sql, escapechar='12', bind=bind)

    with pytest.raises(ValueError):
        odo(csv, sql, escapechar='', bind=bind)
Example #26
    def handle(self, *args, **options):
        # set up
        config = get_config()
        if config is None:
            raise CommandError('Unable to process configuration file p_to_p.yml')

        connection = get_connection(config)
        pedsnet_session = init_pedsnet(connection)
        init_pcornet(connection)

        for df in pd.read_sql(pedsnet_session.query(VisitOccurrence.person_id,
                                                    VisitOccurrence.visit_occurrence_id.label('visit_id')) \
                                      .filter(extract('year', VisitOccurrence.visit_start_date) >= 2001).statement,
                              pedsnet_session.bind, chunksize=50000):

            odo(df, PersonVisit.__table__,
                dshape='var * {person_id: int, visit_id: int}'
                )

        # close session
        pedsnet_session.close()

        # output result
        self.stdout.ending = ''
        print('Person Visit ETL completed successfully', end='', file=self.stdout)
Example #27
def join_dataframe_to_selectable(expr, lhs, rhs, scope=None, **kwargs):
    lexpr, rexpr = expr._leaves()
    return compute(
        expr,
        {lexpr: odo(lhs, pd.DataFrame, dshape=lexpr.dshape), rexpr: odo(rhs, pd.DataFrame, dshape=rexpr.dshape)},
        **kwargs
    )
Example #28
def demographic_etl(config):
    # set up
    connection = get_connection(config)
    pedsnet_session = init_pedsnet(connection)
    init_pcornet(connection)

    # multiple aliases for pedsnet_pcornet_valueset_map
    # to allow the three named joins
    gender_value_map = aliased(ValueSetMap)
    ethnicity_value_map = aliased(ValueSetMap)
    race_value_map = aliased(ValueSetMap)

    # extract the data from the person table
    person = pedsnet_session.query(Person.person_id,
                                   Person.birth_date,
                                   Person.birth_time,
                                   coalesce(gender_value_map.target_concept, 'OT'),
                                   coalesce(ethnicity_value_map.target_concept, 'OT'),
                                   coalesce(race_value_map.target_concept, 'OT'),
                                   bindparam("biobank_flag", "N"),
                                   Person.gender_source_value,
                                   Person.ethnicity_source_value,
                                   Person.race_source_value,
                                   Person.site,
                                   bindparam("gender_identity", None),
                                   bindparam("raw_gender_identity", None),
                                   bindparam("sexual_orientation", None),
                                   bindparam("raw_sexual_orientation", None)
                                   ). \
        outerjoin(gender_value_map,
                  and_(gender_value_map.source_concept_class == 'Gender',
                       case([(and_(Person.gender_concept_id == None,
                                   gender_value_map.source_concept_id == None), True)],
                            else_=cast(Person.gender_concept_id, String(200)) ==
                                  gender_value_map.source_concept_id))). \
        outerjoin(ethnicity_value_map,
                  and_(ethnicity_value_map.source_concept_class == 'Hispanic',
                       case([(and_(Person.ethnicity_concept_id == None,
                                   ethnicity_value_map.source_concept_id == None), True)],
                            else_=cast(Person.ethnicity_concept_id, String(200)) ==
                                  ethnicity_value_map.source_concept_id))). \
        outerjoin(race_value_map,
                  and_(race_value_map.source_concept_class == 'Race',
                       case([(and_(Person.race_concept_id == None,
                                   race_value_map.source_concept_id == None), True)],
                            else_=cast(Person.race_concept_id, String(200)) ==
                                  race_value_map.source_concept_id))).all()

    # transform data to pcornet names and types
    # load to demographic table
    odo(person, Demographic.__table__,
        dshape='var * {patid: string, birth_date: date, birth_time: string, sex: string,'
               'hispanic: string, race: string, biobank_flag: string, raw_sex: string,'
               'raw_hispanic: string, raw_race:string, site: string, gender_identity: string,'
               'raw_gender_identity: string, sexual_orientation: string, raw_sexual_orientation: string}'
        )
    # close session

    pedsnet_session.close()
Example #29
def test_sample(sql):
    t = symbol('t', discover(sql))
    result = compute(t.sample(n=1), sql)
    s = odo(result, pd.DataFrame)
    assert len(s) == 1
    result2 = compute(t.sample(frac=0.5), sql)
    s2 = odo(result2, pd.DataFrame)
    assert len(s) == len(s2)
Example #30
def test_from_dataframe_strings(sql_with_strings):
    sql_with_strings, bind = sql_with_strings

    input_ = pd.DataFrame([['ayy', 'hello "world"'], ['lmao', None]],
                          columns=['non_optional', 'optional'])
    odo(input_, sql_with_strings, bind=bind)
    output = odo(sql_with_strings, pd.DataFrame, bind=bind)
    pd.util.testing.assert_frame_equal(output, input_)
Example #31
def convert_base(typ, x):
    x = compute(x)
    try:
        return typ(x)
    except:
        return typ(odo(x, typ))
Example #32
    def tosqlite(self):
        odo(self.df, 'sqlite:///cme.db::ticktable')
Example #33
# import a.csv
pd.read_sql_table('imported', sql_engine)
# import b.csv => fix with d6tstack

# pandas to sql
df.to_sql('a', sql_engine, if_exists='replace', index=False)
pd.read_sql_table('a', sql_engine)
df.to_sql('a', sql_engine, if_exists='append', index=False)
pd.read_sql_table('a', sql_engine)

# dask NO to sql

# odo import to db
import odo

odo.odo('data/s3-201806/a.csv',
        'mysql+mysqlconnector://augvest:augvest@localhost/augvest::imported')
pd.read_sql_table('imported', sql_engine)

# excel? convert to csv using d6stack

#****************************************
# regular updates
#****************************************

# csv, txt, excel
# option 1: rerun with more data
# option 2: incremental run

# database
dfts1 = pd.DataFrame({
    'date': pd.date_range('2018-01-01', periods=5),
Example #34
def dumb_to_df(d, return_df=None, **kwargs):
    if return_df is None:
        raise DumbResource.NoResource('return_df must be passed')
    to_return = odo(return_df, DataFrame, dshape=discover(d))
    assert_frame_equal(to_return, DumbResource.df)
    return to_return
Example #35
def test_convert_bag_to_list():
    assert odo(b, list) == L
Example #36
from odo import odo
import pymongo
import pandas as pd

# This is a fix for fields previouslySmoking and currentlySmoking being exclusive.
# From the UKBB, if you are currently smoking, it assumes that you were not previously smoking. This script
# populates 'previouslySmoking' with 1, if 'currentlySmoking' is 1, and adjusts the -perDay numbers as well.
client = pymongo.MongoClient('localhost', 27017)
db = client.ukbb
collection = db['ahriCleaner2']

cursor = collection.find()
df = pd.DataFrame(list(cursor))
df.drop(inplace=True, columns=["_id"])

df.loc[df['currentlySmoking'] == 1, ['previouslySmoked']] = 1
df['noOfCigarettesPreviouslyPerDay'] = df.apply(
    lambda row: row['noOfCigarettesPerDay']
    if row['currentlySmoking'] == 1 else row['noOfCigarettesPreviouslyPerDay'],
    axis=1)
odo(df, db.ahriSmokingFix)
Example #37
def gl_data(sql_two_tables):
    u_data, t_data = sql_two_tables
    # populate the tables with some data and return it
    return data(odo([(1, )], u_data)), data(odo([(2, )], t_data))
Example #38
def test_sequence():
    b = odo([1, 2, 3], Bag)
    assert set(b.map(inc)) == set([2, 3, 4])
Example #39
def test_least(gl_data):
    u, t = gl_data
    assert odo(least(u.a.max(), t.a.max()), int) == 1
Example #40
def test_greatest(gl_data):
    u, t = gl_data
    assert odo(greatest(u.a.max(), t.a.max()), int) == 2
Example #41
def test_postgres_create(sql):
    assert odo(sql, list) == [('a', 1), ('b', 2)]
Example #42
def test_postgres_isnan(sql_with_float):
    data = (1.0, ), (float('nan'), )
    table = odo(data, sql_with_float)
    sym = symbol('s', discover(data))
    assert odo(compute(sym.isnan(), table), list) == [(False, ), (True, )]
Example #43
def test_shift_on_column(n, column, sql):
    t = symbol('t', discover(sql))
    expr = t[column].shift(n)
    result = odo(compute(expr, sql), pd.Series)
    expected = odo(sql, pd.DataFrame)[column].shift(n)
    tm.assert_series_equal(result, expected)
Example #44
                         names=[r[0] for r in village_schema])
village_df.to_csv('villagecd.csv', index=False)

# csv => postgresql
import odo
import sqlalchemy
engine = sqlalchemy.create_engine('postgresql://jimmy@localhost:5432/jimmy')
conn = engine.connect()

history_cols = [
    "{} character varying({})".format(r[0], r[2]) for r in history_schema
]
history_ct = "create table qvf_det_history ({})".format(
    ", ".join(history_cols))
conn.execute(history_ct)
odo.odo('./detroit_history.csv',
        'postgresql://jimmy@localhost:5432/jimmy::qvf_det_history')

voter_cols = [
    "{} character varying({})".format(r[0], r[2]) for r in voter_schema
]
voter_ct = "create table qvf_det_voters ({})".format(", ".join(voter_cols))
conn.execute(voter_ct)
odo.odo('./detroit_voters.csv',
        "postgresql://jimmy@localhost:5432/jimmy::qvf_det_voters")

county_cols = [
    "{} character varying({})".format(r[0], r[2]) for r in county_schema
]
county_ct = "create table qvf_county ({})".format(", ".join(county_cols))
conn.execute(county_ct)
odo.odo('./countycd.csv',
        'postgresql://jimmy@localhost:5432/jimmy::qvf_county')
Example #45
def test_csv_infer_header():
    with tmpfile('db') as dbfilename:
        with filetext('a,b\n1,2\n3,4', extension='csv') as csvfilename:
            t = odo(csvfilename, 'sqlite:///%s::mytable' % dbfilename)
            assert discover(t) == dshape('var * {a: int64, b: int64}')
            assert odo(t, set) == set([(1, 2), (3, 4)])
Example #46
def test_quoted_name(csv, quoted_sql):
    with tmpfile('csv') as filename:
        csv = odo(data, filename, dshape=ds, has_header=True)
        s = odo(csv, quoted_sql)
        t = odo(csv, list)
        assert sorted(odo(s, list)) == sorted(t)
Example #47
def test_nunique_spark_dataframe(ctx, db):
    assert (odo(compute(db.t.nunique(), ctx), int) ==
            ctx.table('t').distinct().count())
Example #48
def sync_from_csv_to_sqlite(csv_path, sqlite_uri):
    dshape = discover(resource(csv_path))
    odo(csv_path, sqlite_uri, dshape=dshape)
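A minimal usage sketch for the helper above; the CSV path and SQLite URI are hypothetical:

sync_from_csv_to_sqlite('accounts.csv', 'sqlite:///accounts.db::accounts')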
Example #49
def csv():
    with tmpfile('csv') as filename:
        yield odo(data, filename, dshape=ds, has_header=False)
Example #50
def test_ugly_schema(csv, sql_with_ugly_schema):
    sql_with_ugly_schema, bind = sql_with_ugly_schema
    assert (odo(odo(csv, sql_with_ugly_schema, bind=bind), list,
                bind=bind) == data)
Example #51
    def _store(self, df):
        # self.schema.conform_df(df, storage_target=self.storage_target_type, skip_sort=True)
        odo.odo(df, self.odo_target)  # , dshape=schema_to_dshape(self.schema))
Example #52
def test_by_with_date(ctx, db, attr):
    expr = by(getattr(db.dates.ds, attr),
              mean=db.dates.amount.mean())
    result = odo(compute(expr, ctx), set)
    expected = odo(compute(expr, {db: {'dates': date_df}}), set)
    assert result == expected
Example #53
def test_datetime_to_timestamp():
    dt = datetime(2014, 1, 1)
    ts = odo(dt, pd.Timestamp)
    assert isinstance(ts, pd.Timestamp)
    assert ts == pd.Timestamp('2014-01-01')
Example #54
def test_quoted_name(quoted_sql, csv):
    s = odo(csv, quoted_sql)
    t = odo(csv, list)
    assert sorted(odo(s, list)) == sorted(t)
Example #55
def test_isin(ctx, db, keys):
    expr = db.t[db.t.id.isin(keys)]
    result = odo(compute(expr, ctx), set)
    expected = odo(compute(expr, {db: {'t': df}}), set)
    assert (set(map(frozenset, odo(result, list))) ==
            set(map(frozenset, odo(expected, list))))