Example #1
def read_csv_dask(filepath, usecols=None):
    # Pandas writes CSV files out as a single file
    if os.path.isfile(filepath):
        return dd.read_csv(filepath, usecols=usecols)
    # Dask may have written out CSV files in partitions
    filepath_expr = filepath.replace('.csv', '*.csv')
    return dd.read_csv(filepath_expr, usecols=usecols)
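For context, a minimal sketch of the partitioned case this helper handles; the out/ directory and file names below are hypothetical, not part of the original. Dask's to_csv writes one CSV per partition, so reading the result back needs the glob form the helper builds.

import os
import pandas as pd
import dask.dataframe as dd

os.makedirs('out', exist_ok=True)

# Dask writes one CSV file per partition, e.g. out/data-0.csv and out/data-1.csv
ddf = dd.from_pandas(pd.DataFrame({'x': range(10)}), npartitions=2)
ddf.to_csv('out/data-*.csv', index=False)

# read_csv_dask('out/data.csv') finds no single file with that name, rewrites the
# path to 'out/data*.csv', and reads both partition files back as one dask DataFrame
df = read_csv_dask('out/data.csv')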
Example #2
def test_late_dtypes():
    text = 'numbers,names,more_numbers,integers\n'
    for i in range(1000):
        text += '1,foo,2,3\n'
    text += '1.5,bar,2.5,3\n'
    with filetext(text) as fn:
        sol = pd.read_csv(fn)
        with pytest.raises(ValueError) as e:
            dd.read_csv(fn, sample=50).compute(get=get_sync)

        msg = ("Mismatched dtypes found.\n"
               "Expected integers, but found floats for columns:\n"
               "- 'more_numbers'\n"
               "- 'numbers'\n"
               "\n"
               "To fix, specify dtypes manually by adding:\n"
               "\n"
               "dtype={'more_numbers': float,\n"
               "       'numbers': float}\n"
               "\n"
               "to the call to `read_csv`/`read_table`.\n"
               "\n"
               "Alternatively, provide `assume_missing=True` to interpret "
               "all unspecified integer columns as floats.")

        assert str(e.value) == msg

        # Specifying dtypes works
        res = dd.read_csv(fn, sample=50,
                          dtype={'more_numbers': float, 'numbers': float})
        assert_eq(res, sol)
Example #3
def test_index_col():
    with filetext(text) as fn:
        try:
            dd.read_csv(fn, chunkbytes=30, index_col='name')
            assert False
        except ValueError as e:
            assert 'set_index' in str(e)
Example #4
def test_read_csv_sync(loop):
    import dask.dataframe as dd
    import pandas as pd
    with cluster(nworkers=3) as (s, [a, b, c]):
        with make_hdfs() as (hdfs, basedir):
            with hdfs.open('%s/1.csv' % basedir, 'wb') as f:
                f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

            with hdfs.open('%s/2.csv' % basedir, 'wb') as f:
                f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

            with Client(('127.0.0.1', s['port']), loop=loop) as e:
                values = dd.read_csv('hdfs://%s/*.csv' % basedir,
                                     lineterminator='\n',
                                     collection=False, header=0)
                futures = e.compute(values)
                assert all(isinstance(f, Future) for f in futures)
                L = e.gather(futures)
                assert isinstance(L[0], pd.DataFrame)
                assert list(L[0].columns) == ['name', 'amount', 'id']

                df = dd.read_csv('hdfs://%s/*.csv' % basedir,
                                 lineterminator='\n',
                                 collection=True, header=0)
                assert isinstance(df, dd.DataFrame)
                assert list(df.head().iloc[0]) == ['Alice', 100, 1]
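For reference, a local sketch of the collection=False form exercised above (the file pattern is hypothetical, and the keyword comes from the older dask API this test uses): it returns a list of delayed partitions instead of a dask DataFrame, which is why e.compute(values) yields one future per partition.

import dask.dataframe as dd

parts = dd.read_csv('data-*.csv', collection=False)  # list of dask.delayed objects, one per file/block
frames = [p.compute() for p in parts]                # each delayed computes to a pandas DataFrame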
Example #5
def test_read_csv_raises_on_no_files():
    fn = '.not.a.real.file.csv'
    try:
        dd.read_csv(fn)
        assert False
    except (OSError, IOError) as e:
        assert fn in str(e)
Example #6
def test_index_col():
    with filetext(csv_text) as fn:
        try:
            dd.read_csv(fn, blocksize=30, index_col='name')
            assert False
        except ValueError as e:
            assert 'set_index' in str(e)
Example #7
def test_assume_missing():
    text = 'numbers,names,more_numbers,integers\n'
    for i in range(1000):
        text += '1,foo,2,3\n'
    text += '1.5,bar,2.5,3\n'
    with filetext(text) as fn:
        sol = pd.read_csv(fn)

        # assume_missing affects all columns
        res = dd.read_csv(fn, sample=50, assume_missing=True)
        assert_eq(res, sol.astype({'integers': float}))

        # assume_missing doesn't override specified dtypes
        res = dd.read_csv(fn, sample=50, assume_missing=True,
                          dtype={'integers': 'int64'})
        assert_eq(res, sol)

        # assume_missing works with dtype=None
        res = dd.read_csv(fn, sample=50, assume_missing=True, dtype=None)
        assert_eq(res, sol.astype({'integers': float}))

    text = 'numbers,integers\n'
    for i in range(1000):
        text += '1,2\n'
    text += '1.5,2\n'

    with filetext(text) as fn:
        sol = pd.read_csv(fn)

        # assume_missing ignored when all dtypes specified
        df = dd.read_csv(fn, sample=30, dtype='int64', assume_missing=True)
        assert df.numbers.dtype == 'int64'
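The behaviour assume_missing works around, sketched with a hypothetical scratch file: pandas integer dtypes cannot hold NaN, so a column whose sampled head looks like int64 fails later if other blocks contain floats or missing values. assume_missing=True tells dask to treat all unspecified integer columns as float64 from the start.

import dask.dataframe as dd

with open('mixed.csv', 'w') as f:               # hypothetical scratch file
    f.write('x\n' + '1\n' * 1000 + '1.5\n')     # sampled head looks integer, last row is a float

ddf = dd.read_csv('mixed.csv', sample=50, assume_missing=True)
assert ddf.x.dtype == 'float64'                 # inferred as float, so no dtype mismatch at compute time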
Example #8
def test_read_csv_header_issue_823():
    text = '''a b c-d\n1 2 3\n4 5 6'''.replace(' ', '\t')
    with filetext(text) as fn:
        df = dd.read_csv(fn, sep='\t')
        assert_eq(df, pd.read_csv(fn, sep='\t'))

        df = dd.read_csv(fn, delimiter='\t')
        assert_eq(df, pd.read_csv(fn, delimiter='\t'))
Example #9
def test_string_blocksize():
    with filetext(timeseries) as fn:
        a = dd.read_csv(fn, blocksize='30B')
        b = dd.read_csv(fn, blocksize='30')
        assert a.npartitions == b.npartitions

        c = dd.read_csv(fn, blocksize='64MiB')
        assert c.npartitions == 1
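For reference, a sketch (with a hypothetical path) of why the string and integer forms above are interchangeable: blocksize accepts either an integer number of bytes or a byte string, which dask parses with dask.utils.parse_bytes.

import dask.dataframe as dd
from dask.utils import parse_bytes

assert parse_bytes('30B') == 30 and parse_bytes('30') == 30   # why a and b above get the same partitioning

df_a = dd.read_csv('timeseries.csv', blocksize=64_000_000)    # 64 MB given as an integer
df_b = dd.read_csv('timeseries.csv', blocksize='64MB')        # the same size as a string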
Example #10
def animate(i, symbol):
    print(symbol)

    df = dd.read_csv(
        './{0}_*.csv'.format(symbol),
        header=None,
        names=['DateTime', 'Ticker', 'Open', 'High', 'Low', 'Close', 'Volume'])
    df['Volume'] = df['Volume'].apply(value_to_float)
    df = df.compute()

    #Read the FB_Max and FB_Min from file
    dfFB = pd.read_csv('./FB_Max_Min.csv', header=0)
    price_max = dfFB['FB_Max'].tail(1).values[0]
    price_min = dfFB['FB_Min'].tail(1).values[0]
    if (price_max == 0.0 and price_min == 0.0):
        #price_max = df['Close'].max()
        #price_min = df['Close'].min()
        price_max = (2 * (df['Close'].max() + df['Close'].min()
                          + df['Close'].tail(1).values[0]) / 3
                     - df['Close'].min())
        price_min = (2 * (df['Close'].max() + df['Close'].min()
                          + df['Close'].tail(1).values[0]) / 3
                     - df['Close'].max())

    # Fibonacci Levels considering original trend as upward move
    diff = price_max - price_min
    level1 = price_max - 0.236 * diff
    level2 = price_max - 0.382 * diff
    level3 = price_max - 0.618 * diff

    df['Volume_New'] = df['Volume'] - df['Volume'].shift(1)
    df['Volume_New'] = df['Volume_New'].fillna(0)
    df['Volume_New'] = np.where(df['Volume_New'] < 0, 0, df['Volume_New'])
    #df['Volume_New'] = df['Volume_New']/1e6  # dollar volume in millions

    #df['SMA(10)'] = df['Close'].rolling(window=10).mean()
    #df['SMA(20)'] = df['Close'].rolling(window=20).mean()
    #df['SMA(50)'] = df['Close'].rolling(window=50).mean()
    df['SMA(150)'] = df['Close'].rolling(window=150).mean()
    df['SMA(200)'] = df['Close'].rolling(window=200).mean()

    #y = df['Close'].tail(60).values
    #x = df.tail(60).index.values

    #fit = np.polyfit(x, y, deg=1)
    titleColor = 'red'
    pctChange = 100 * (
        df['Close'].tail(1).values[0] -
        df['Open'].tail(1).values[0]) / df['Open'].tail(1).values[0]
    if pctChange > 0:
        titleColor = 'green'
    elif pctChange == 0.0:
        titleColor = 'black'

    df2 = df[['DateTime', 'Open', 'High', 'Low', 'Close', 'Volume']]
    print(df2.head(5))

    #clear ax1, ax2
    ax1.clear()
    ax2.clear()
    ax1.set_title('{0} ({1:.2f}%)'.format(symbol, pctChange),
                  color=titleColor,
                  fontsize=15)
    #ax1.plot(x, fit[0] * x + fit[1], color='red', linewidth=5.0)

    ax1.axhspan(level1, price_min, alpha=0.4, color='lightsalmon')
    #ax1.axhspan(level2, level1, alpha=0.5, color='palegoldenrod')
    ax1.axhspan(level2, level1, alpha=0.5, color='gold')
    ax1.axhspan(level3, level2, alpha=0.5, color='palegreen')
    ax1.axhspan(price_max, level3, alpha=0.5, color='powderblue')

    #Plot Close, High as line
    df.plot(y=['Close', 'High'], color=['Blue', 'Green'], ax=ax1)
    #df.plot(y= ['SMA(10)', 'SMA(20)', 'SMA(50)', 'SMA(200)'], color=['Red', 'Yellow', 'Purple', 'Orange'], ax=ax1)
    #df.plot(y= ['SMA(10)', 'SMA(50)', 'SMA(200)'], color=['Red', 'Purple', 'Orange'], ax=ax1)
    df.plot(y=['SMA(150)', 'SMA(200)'], color=['Yellow', 'Purple'], ax=ax1)

    yLast = df.tail(1)['Close'].values[0]
    #print(yLast)
    ax1.annotate('%0.3f' % yLast,
                 xy=(0.95, yLast),
                 xytext=(8, 0),
                 xycoords=('axes fraction', 'data'),
                 textcoords='offset points')
    ax1.axhline(y=yLast, color='y', linestyle='-.')
    ax1.legend(loc='upper left')

    #Plot Volume as positive and negative bar
    #df['Volume']=df['Volume'].loc[::10]
    quotes = list(
        zip(df.index.tolist(), df['Open'].tolist(), df['High'].tolist(),
            df['Low'].tolist(), df['Close'].tolist(),
            df['Volume_New'].tolist()))
    bc = volume_overlay3(ax2,
                         quotes,
                         colorup='g',
                         colordown='r',
                         width=2.5,
                         alpha=1.0)
    ax2.set_ylim(df['Volume_New'].min(), 5 * df['Volume_New'].max())
    ax2.add_collection(bc)

    # Formatter Class to eliminate weekend data gaps on chart
    class MyFormatter(Formatter):
        def __init__(self, datetimes, fmt='%Y-%m-%d %H:%M:%S'):
            self.datetimes = datetimes
            self.fmt = fmt

        def __call__(self, x, pos=0):
            'Return the label for time x at position pos'
            ind = int(round(x))
            #print(ind)

            if ind >= len(self.datetimes) or ind < 0:
                return ''

            #print(self.datetimes[ind])
            return self.datetimes[ind].strftime(self.fmt)
            #return self.dates[ind]

    dff = pd.to_datetime(df['DateTime'])
    #print(type(dff))
    formatter = MyFormatter(dff)

    ax1.set_xticklabels(df["DateTime"].tolist(), rotation=15, ha='right')
    ax1.xaxis.set_major_formatter(formatter)
    #ax1.set_xlim(0, len(df)-1)

    ax1.minorticks_on()
    # Customize the major grid
    ax1.grid(which='major', linestyle='-', linewidth=0.5, color='red')
    # Customize the minor grid
    ax1.grid(which='minor', linestyle=':', linewidth=0.5, color='black')
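A quick numeric check of the Fibonacci retracement arithmetic used above, with hypothetical prices rather than real data:

price_max, price_min = 200.0, 100.0
diff = price_max - price_min         # 100.0
level1 = price_max - 0.236 * diff    # 176.4  (23.6% retracement)
level2 = price_max - 0.382 * diff    # 161.8  (38.2% retracement)
level3 = price_max - 0.618 * diff    # 138.2  (61.8% retracement)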
Example #11
# In[ ]:

# https://www.kaggle.com/cttsai/blend-app-channel-and-app-mean


def mean_feat(train, test, attrs=[]):
    return pd.merge(test,
                    train.groupby(attrs)['is_attributed'].mean().reset_index(),
                    on=attrs,
                    how='left').fillna(0).set_index('click_id')
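
# A small usage sketch of mean_feat with made-up frames (the app/channel columns and
# their values are hypothetical; only is_attributed and click_id come from the snippet):
# it target-mean-encodes the given attribute combination and indexes the result by click_id.
import pandas as pd

train_df = pd.DataFrame({'app': [1, 1, 2], 'channel': [3, 3, 4], 'is_attributed': [0, 1, 0]})
test_df = pd.DataFrame({'click_id': [10, 11], 'app': [1, 2], 'channel': [3, 4]})
encoded = mean_feat(train_df, test_df, attrs=['app', 'channel'])
# click_id 10 -> 0.5 (mean of the two matching train rows), click_id 11 -> 0.0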


# In[ ]:

import dask.dataframe as dd
dask_df = dd.read_csv('../input/train.csv', dtype=dtypes)
dask_df.npartitions

# In[ ]:

df_pos = dask_df[(dask_df['is_attributed'] == 1)].compute()
print("Total positives : ", df_pos.shape[0])
df_neg = dask_df[(dask_df['is_attributed'] == 0)].compute()
print("Total Negatives : ", df_neg.shape[0])
print("Base percentage of positives : ",
      100 * df_pos.shape[0] / df_neg.shape[0])

# In[ ]:

df_neg = df_neg.sample(n=3000000)  # 2.25 million = 20% , 4.5 = ~10%
Example #12
def test_empty_csv_file():
    with filetext('a,b') as fn:
        df = dd.read_csv(fn, header=0)
        assert len(df.compute()) == 0
        assert list(df.columns) == ['a', 'b']
Example #13
path = r'C:\Users\trist\Documents\db_loc\home_loan\files'

params = {
    'num_leaves': 1023,
    'objective': 'regression',
    'min_data_in_leaf': 100,
    'learning_rate': 0.01,
    'feature_fraction': 1.0,
    'bagging_fraction': 0.8,
    'bagging_freq': 2,
    'metric': 'auc',
    'num_threads': 12
}
MAX_ROUNDS = 10000

df_app_train = dd.read_csv(path + '/application_train.csv')
df_app_test = dd.read_csv(path + '/application_test.csv')

df_concat = dd.concat([df_app_test, df_app_train])


def fill_na_encodings(df):
    df = df.mask(df == 'XNA/XAP', '')
    df = df.mask(df == 'XNA', '')
    # df = df.replace('XAP', np.nan)
    df = df.mask(df == 365243.00, np.nan)

    for i, j in zip(df.dtypes, df.columns):
        if i == 'object':
            pass
        else:
Example #14
    def calculate_avg_arrival_rates_per_hour(self):
        hour_stats = {
            event_name: "Not Recorded"
            for event_name in self.event_names
        }
        for event_name in tqdm_notebook(self.event_names):
            mapped_name = self.source_map[event_name]
            if mapped_name is not None:
                all_columns = [self.unique_id, mapped_name]
                df = dd.read_csv(self.data_path, usecols=all_columns)
                df = df.dropna()
                df = df.compute()
                df[mapped_name] = pd.to_datetime(df[mapped_name],
                                                 errors="coerce",
                                                 infer_datetime_format=True)
                df.index = df[mapped_name]
                total_counts = df.groupby([
                    df.index.year, df.index.month, df.index.day, df.index.hour
                ])[self.unique_id].count().values
                fig, ax = plt.subplots()
                x = [e for e in range(0, 24 * 10)]
                x = np.array(x)
                days = np.random.choice(360, 10, replace=False)
                total = []
                [
                    total.extend(total_counts[(12 * x):(12 * x) + 24])
                    for x in days
                ]
                c = np.mean(total)
                fs, pw = ss.periodogram(x)
                max_y = max(pw)  # Find the maximum y value
                dom_freq = fs[pw.argmax()]
                amp = sqrt(sum(n * n for n in total) / len(total)) * sqrt(2)
                #print(dom_freq)
                #print(amp)
                #print(c)
                #print(fs)
                #ax.plot(fs,pw)
                params, params_covariance = optimize.curve_fit(sin_func,
                                                               x,
                                                               total,
                                                               p0=[amp, c])
                ax.plot(x, total, 'bo')
                ax.plot(x,
                        sin_func(x, params[0], params[1]),
                        label='Fitted function')
                hour_avgs = {}
                for x in self.hour_ranges:
                    hour_slice = df.between_time(*x)
                    total_counts = hour_slice.groupby([
                        hour_slice.index.year, hour_slice.index.month,
                        hour_slice.index.day, hour_slice.index.hour
                    ])[self.unique_id].count().values
                    mean = np.mean(total_counts)
                    hour_avgs[x] = mean
                hour_stats[event_name] = hour_avgs

        for k, v in hour_stats.items():
            x = list(v.keys())
            x_vals = np.array([e for e in range(0, 24)])
            y = list(v.values())
            fig, ax = plt.subplots()
            ax.plot(x_vals, y, 'bo')
            ax.plot(x_vals,
                    sin_func(x_vals, params[0], params[1]),
                    label='Fitted function')

        return hour_stats
Example #15
def time_read_csv(self, get):
    return dd.read_csv('{}/*.csv'.format(self.data_dir)).compute(get=get)
Example #16
import sys

import dask.dataframe as dd
import numpy as np
from dask.diagnostics import progress
from IPHeatmap.settings_local import DATABASES

if __name__ == '__main__':
    columns = ['network', 'geoname_id', 'registered_country_geoname_id', 'represented_country_geoname_id',
               'is_anonymous_proxy',
               'is_satellite_provider', 'postal_code', 'latitude', 'longitude', 'accuracy_radius']

    types = {'geoname_id': np.int32,
             'registered_country_geoname_id': np.int32,
             'latitude': np.float16,
             'longitude': np.float16}
    used_cols = ['geoname_id', 'latitude', 'longitude', 'registered_country_geoname_id']
    df = dd.read_csv('data/GeoLite2-City-CSV_20190618/GeoLite2-City-Blocks-IPv4.csv', assume_missing=True, usecols=used_cols)
    df = df.dropna()
    df = df.astype(dtype=types)  # "reduce resolution"

    print(df.head(100), df.dtypes, df.index, sep='\n')
    uri = 'postgresql://*****:*****@localhost/geodata'.format(DATABASES.get('default').get('password'))
    with progress.ProgressBar():
        dd.to_sql(df, 'heatmapAPI_geonode', uri, if_exists='append', index=False, parallel=True)
    sys.exit(0)
Example #17
def test_categorical_known():
    text1 = normalize_text(
        """
    A,B
    a,a
    b,b
    a,a
    """
    )
    text2 = normalize_text(
        """
    A,B
    a,a
    b,b
    c,c
    """
    )
    dtype = pd.api.types.CategoricalDtype(["a", "b", "c"], ordered=False)
    with filetexts({"foo.1.csv": text1, "foo.2.csv": text2}):
        result = dd.read_csv("foo.*.csv", dtype={"A": "category", "B": "category"})
        assert result.A.cat.known is False
        assert result.B.cat.known is False
        expected = pd.DataFrame(
            {
                "A": pd.Categorical(
                    ["a", "b", "a", "a", "b", "c"], categories=dtype.categories
                ),
                "B": pd.Categorical(
                    ["a", "b", "a", "a", "b", "c"], categories=dtype.categories
                ),
            },
            index=[0, 1, 2, 0, 1, 2],
        )
        assert_eq(result, expected)

        # Specify a dtype
        result = dd.read_csv("foo.*.csv", dtype={"A": dtype, "B": "category"})
        assert result.A.cat.known is True
        assert result.B.cat.known is False
        tm.assert_index_equal(result.A.cat.categories, dtype.categories)
        assert result.A.cat.ordered is False
        assert_eq(result, expected)

        # ordered
        dtype = pd.api.types.CategoricalDtype(["a", "b", "c"], ordered=True)
        result = dd.read_csv("foo.*.csv", dtype={"A": dtype, "B": "category"})
        expected["A"] = expected["A"].cat.as_ordered()
        assert result.A.cat.known is True
        assert result.B.cat.known is False
        assert result.A.cat.ordered is True

        assert_eq(result, expected)

        # Specify "unknown" categories
        result = dd.read_csv(
            "foo.*.csv", dtype=pd.api.types.CategoricalDtype(ordered=False)
        )
        assert result.A.cat.known is False

        result = dd.read_csv("foo.*.csv", dtype="category")
        assert result.A.cat.known is False
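A short follow-up sketch using the same foo.*.csv files: when categories come back unknown, dask can scan the data once and record them with the categorical accessor's as_known(), after which .cat.known is True.

import dask.dataframe as dd

ddf = dd.read_csv('foo.*.csv', dtype={'A': 'category', 'B': 'category'})
assert ddf.A.cat.known is False     # categories are not known until the data is scanned

ddf['A'] = ddf.A.cat.as_known()     # computes the category set once and stores it in the metadata
assert ddf.A.cat.known is True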
Example #18
def test_usecols():
    with filetext(timeseries) as fn:
        df = dd.read_csv(fn, blocksize=30, usecols=["High", "Low"])
        expected = pd.read_csv(fn, usecols=["High", "Low"])
        assert (df.compute().values == expected.values).all()
Example #19
def test_read_csv_skiprows_range():
    with filetext(csv_text) as fn:
        f = dd.read_csv(fn, skiprows=range(5))
        result = f
        expected = pd.read_csv(fn, skiprows=range(5))
        assert_eq(result, expected)
Example #20
def test_read_csv_singleton_dtype():
    data = b"a,b\n1,2\n3,4\n5,6"
    with filetext(data, mode="wb") as fn:
        assert_eq(pd.read_csv(fn, dtype=float), dd.read_csv(fn, dtype=float))
Example #21
def test_none_usecols():
    with filetext(csv_text) as fn:
        df = dd.read_csv(fn, usecols=None)
        assert_eq(df, pd.read_csv(fn, usecols=None))
Example #22
print(q5.qsize())
result_count = q5.qsize()
data = client.gather(q5)
while data.qsize() < result_count:
    print('sleeping')
    sleep(.1)
print(data.qsize())
iterdata = [*iterq(data)]
print(len(iterdata))
df = pd.DataFrame(iterdata)
print(len(df))
ddf = dd.from_pandas(df, npartitions=4)
remote_ddf = client.scatter(ddf)
remote_result = remote_ddf.result()
remote_result.to_csv('./export4-*.csv')
new_ddf = dd.read_csv('./export4-*.csv')
new_ddf.compute()

#%%
print(len(remote_result))

#%%

#%%
df.info()

#%%
df.to_hdf('./export7.hdf', 'key')

#%%
new_df = pd.read_hdf('./export7.hdf')
Example #23
def test_csv_with_integer_names():
    with filetext("alice,1\nbob,2") as fn:
        df = dd.read_csv(fn, header=None)
        assert list(df.columns) == [0, 1]
Example #24
def test_late_dtypes():
    text = "numbers,names,more_numbers,integers,dates\n"
    for i in range(1000):
        text += "1,,2,3,2017-10-31 00:00:00\n"
    text += "1.5,bar,2.5,3,4998-01-01 00:00:00\n"

    date_msg = (
        "\n"
        "\n"
        "-------------------------------------------------------------\n"
        "\n"
        "The following columns also failed to properly parse as dates:\n"
        "\n"
        "- dates\n"
        "\n"
        "This is usually due to an invalid value in that column. To\n"
        "diagnose and fix it's recommended to drop these columns from the\n"
        "`parse_dates` keyword, and manually convert them to dates later\n"
        "using `dd.to_datetime`."
    )

    with filetext(text) as fn:
        sol = pd.read_csv(fn)
        msg = (
            "Mismatched dtypes found in `pd.read_csv`/`pd.read_table`.\n"
            "\n"
            "+--------------+---------+----------+\n"
            "| Column       | Found   | Expected |\n"
            "+--------------+---------+----------+\n"
            "| more_numbers | float64 | int64    |\n"
            "| names        | object  | float64  |\n"
            "| numbers      | float64 | int64    |\n"
            "+--------------+---------+----------+\n"
            "\n"
            "- names\n"
            "  ValueError(.*)\n"
            "\n"
            "Usually this is due to dask's dtype inference failing, and\n"
            "*may* be fixed by specifying dtypes manually by adding:\n"
            "\n"
            "dtype={'more_numbers': 'float64',\n"
            "       'names': 'object',\n"
            "       'numbers': 'float64'}\n"
            "\n"
            "to the call to `read_csv`/`read_table`."
        )

        with pytest.raises(ValueError) as e:
            dd.read_csv(fn, sample=50, parse_dates=["dates"]).compute(scheduler="sync")
        assert e.match(msg + date_msg)

        with pytest.raises(ValueError) as e:
            dd.read_csv(fn, sample=50).compute(scheduler="sync")
        assert e.match(msg)

        msg = (
            "Mismatched dtypes found in `pd.read_csv`/`pd.read_table`.\n"
            "\n"
            "+--------------+---------+----------+\n"
            "| Column       | Found   | Expected |\n"
            "+--------------+---------+----------+\n"
            "| more_numbers | float64 | int64    |\n"
            "| numbers      | float64 | int64    |\n"
            "+--------------+---------+----------+\n"
            "\n"
            "Usually this is due to dask's dtype inference failing, and\n"
            "*may* be fixed by specifying dtypes manually by adding:\n"
            "\n"
            "dtype={'more_numbers': 'float64',\n"
            "       'numbers': 'float64'}\n"
            "\n"
            "to the call to `read_csv`/`read_table`.\n"
            "\n"
            "Alternatively, provide `assume_missing=True` to interpret\n"
            "all unspecified integer columns as floats."
        )

        with pytest.raises(ValueError) as e:
            dd.read_csv(fn, sample=50, dtype={"names": "O"}).compute(scheduler="sync")
        assert str(e.value) == msg

        with pytest.raises(ValueError) as e:
            dd.read_csv(
                fn, sample=50, parse_dates=["dates"], dtype={"names": "O"}
            ).compute(scheduler="sync")
        assert str(e.value) == msg + date_msg

        msg = (
            "Mismatched dtypes found in `pd.read_csv`/`pd.read_table`.\n"
            "\n"
            "The following columns failed to properly parse as dates:\n"
            "\n"
            "- dates\n"
            "\n"
            "This is usually due to an invalid value in that column. To\n"
            "diagnose and fix it's recommended to drop these columns from the\n"
            "`parse_dates` keyword, and manually convert them to dates later\n"
            "using `dd.to_datetime`."
        )

        with pytest.raises(ValueError) as e:
            dd.read_csv(
                fn,
                sample=50,
                parse_dates=["dates"],
                dtype={"more_numbers": float, "names": object, "numbers": float},
            ).compute(scheduler="sync")
        assert str(e.value) == msg

        # Specifying dtypes works
        res = dd.read_csv(
            fn,
            sample=50,
            dtype={"more_numbers": float, "names": object, "numbers": float},
        )
        assert_eq(res, sol)
Example #25
def test_empty_csv_file():
    with filetext("a,b") as fn:
        df = dd.read_csv(fn, header=0)
        assert len(df.compute()) == 0
        assert list(df.columns) == ["a", "b"]
Example #26
def time_read_csv_meta(self, get):
    return dd.read_csv('{}/*.csv'.format(self.data_dir))
Example #27
def test_read_csv_no_sample():
    with filetexts(csv_files, mode="b") as fn:
        df = dd.read_csv(fn, sample=False)
        assert list(df.columns) == ["name", "amount", "id"]
Example #28
import pandas as pd
import dask.dataframe as dd
import numpy as np
import boto3
import s3fs
import psycopg2
import sqlalchemy
from sqlalchemy import create_engine
import os

# NBA Data Sources
nba_team = dd.read_csv(
    's3://sportsdatawarehouse/NBA/teams/NBA_teams_team stats_*.csv').compute()
nba_player = dd.read_csv(
    's3://sportsdatawarehouse/NBA/players/NBA_Players_per game stats_*.csv'
).compute()
nba_salary = dd.read_csv(
    's3://sportsdatawarehouse/NBA/salary/*nbasalary.csv').compute()
nba_teamid = dd.read_csv(
    's3://sportsdatawarehouse/key_identifiers/nba_team.csv').compute()
nba_playerid = dd.read_csv(
    's3://sportsdatawarehouse/key_identifiers/nba_players.csv').compute()
nba_game_details = dd.read_csv(
    's3://sportsdatawarehouse/NBA/games_details.csv').compute()
# WNBA Data Sources
wnba_team = dd.read_csv(
    's3://sportsdatawarehouse/WNBA/team/wbna_team*.csv').compute()
wnba_player = dd.read_csv(
    's3://sportsdatawarehouse/WNBA/players/wbna_player*.csv').compute()
wnba_salary = dd.read_csv(
    's3://sportsdatawarehouse/WNBA/salaries/*_team_salaries.csv').compute()
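A note on the S3 reads above: dd.read_csv reaches S3 through s3fs, so credentials and other filesystem settings can be passed explicitly with storage_options instead of relying on environment configuration. A minimal sketch with placeholder bucket and keys:

import dask.dataframe as dd

df = dd.read_csv(
    's3://example-bucket/path/*.csv',                       # placeholder bucket and path
    storage_options={'key': 'AKIA...', 'secret': '...'},    # or {'anon': True} for public buckets
).compute()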
Example #29
def get_HQDetail():
    data = dd.read_csv('F:\\Data\\HQFact\\HQ20*.csv', encoding='gbk', dtype={'ORGCODE': object, 'PLUID': object})
    return data
Example #30
def get_DCDetail():
    data = dd.read_csv('F:\\Data\\FactWithCost\\20*.csv', encoding='gbk', dtype={'CUSTCODE': object, 'PLUCODE': object})
    return data
Example #31
def phase_converter(soi, outputdir, nt, input_file, lods_cut_off,
                    snp_threshold, num_of_hets, maxed_as, bed_file, refhap,
                    use_sample, hapstats, writelod, addmissingsites):
    '''Assign the number of processes - this is the optimal position to start multiprocessing!
       **note: the number of processes should be declared after all the global variables are declared,
       because each pool will need to copy the values of the global variables. '''
    pool = Pool(processes=nt)  # number of pool to run at once; default at 1
    ''' Step 01: Read the input file and and prepare two files as output '''
    # a) One output file contains extended phase-states for the sample of interest (soi)
    # b) another output file contains the lines that have missing data for sample of interest
    data = dd.read_csv(input_file, sep='\t')
    data_header = list(data.columns)
    pg_al_set = {al for al in data_header if al.endswith(':PG_al')}
    pi_set = {pi for pi in data_header if pi.endswith(':PI')}
    soi_PI_index = soi + ':PI'
    soi_PG_index = soi + ':PG_al'

    # check if soi is in header
    if not soi_PI_index in pi_set:
        assert False, "soi pi index is not found"

    if not soi_PG_index in pg_al_set:
        assert False, "soi pg index is not found"

    os.makedirs(outputdir, exist_ok=True)
    missing_fpath = outputdir + '/' + "missingdata_" + soi + ".txt"

    missing = data[(data[soi_PI_index] == '.') | (data[soi_PG_index] == '.')]
    missing.to_csv(
        missing_fpath,
        sep='\t',
        index=False,
    )
    good_data = data[(data[soi_PI_index] != '.') | (data[soi_PG_index] != '.')]
    ''' Step 01 - B: check if "bed file" and "haplotype reference" file are given.
        - then read the "bed file" and "haplotype file" into the memory.
        - these data will be used downstream after reading the haplotype file as "good_data" '''

    # check and load bed file
    if bed_file:
        ''' we want to extend the phase state only within bed boundaries.
            - so, we merge the "input haplotype-file" with the "bed-file". '''
        my_bed = dd.read_csv(bed_file,
                             sep='\t',
                             names=['CHROM', 'start', 'end'])
        my_bed['CHROM'] = my_bed['CHROM'].astype(
            str)  # setting CHROM column as string type ..
        #  this is necessary because there has been problem with groupby operations downstream

    else:
        print('# Genomic bed file is not provided ... ')

    # check and load "haplotype reference panel"
    if refhap:
        hap_panel = dd.read_csv(refhap, sep='\t').drop(['REF', 'ALT'], axis=1)
        hap_panel['CHROM'] = hap_panel['CHROM'].astype(
            str)  # setting CHROM as string type data

        # also find the sample in refHap panel
        hap_panel_samples = find_samples(list(hap_panel.keys()))

    else:
        hap_panel_samples = []
        print('# Haplotype reference panel is not provided ... ')
        print(
            '  So, phase extension will run using the samples available in the input haplotype file. '
        )
    ''' Step 01 - D: Prepare the samples to use the data from. '''
    ''' Prepare a list of tuples of samples (PI, PG_al) from the input data and update it as needed.
        - **Note: the sample list should always include the soi (sample of interest)
            - this is done to include observation from soi rather than introducing a pseudo count
              when transition is missing from some observation (n to m). '''
    sample_list = find_samples(
        data_header)  # returns data from "input haplotype file"

    # update the names in "sample_list" if other samples are requested by the user:
    if use_sample == "" or use_sample == 'input':
        sample_list = sample_list

    # use all the samples from hapRefPanel and input samples
    elif use_sample == 'all':
        sample_list = sample_list + hap_panel_samples

    elif use_sample == 'refHap':
        sample_list = hap_panel_samples + [
            (soi + ":PI", soi + ":PG_al")
        ]  # add the self sample name to account ..
        # .. for missing observations instead of using pseudo count

    # if specific select samples are of interest, split the sample names and then prepare ..
    # .. the list of tuples of sample "PI" and "PG_al"
    else:
        sample_list = use_sample.split(',')
        sample_list = [((x + ':PI'), (x + ':PG_al')) for x in sample_list] + \
                        [(soi + ":PI", soi +":PG_al")]
    ''' Step 02: pipe the data into "pandas", then:
        A) group the data by "contig" which helps in multiprocessing/threading.
            A - optional: if "bed regions" are given, add the bed_regions boundaries as "start_end"
        B) within each group, group again by "PI keys" of soi and then sort by
            minimum "POS" value for each "PI key"
        C) then pipe the data within each "PI key" for phase-extension computation.'''
    ''' Step 02 - A : read good part of the data into "pandas" as dataframe.'''
    # good_data = pd.read_table(StringIO(good_data), delimiter='\t')
    good_data['CHROM'] = good_data['CHROM'].astype(
        str)  # setting CHROM as string type data # this is necessary
    # to maintain proper groupby downstream

    # ** only if "good_data" is desired as text output
    #pd.DataFrame.to_csv(good_data, 'good_data_test.txt', sep='\t', header=True, index=False)
    ''' Step 02 - A (add on - i) ** merge reference haplotype if provided '''
    if refhap:
        # update the "good_data" (i.e, haplotype data)
        print(
            'Merging input haplotype data with data from the hap-reference panel'
        )

        good_data = dd.merge(good_data,
                             hap_panel,
                             on=['CHROM', 'POS'],
                             how='left').fillna('.')
        good_data.sort_values(by=['CHROM', 'POS'], inplace=True)

        # if haplotype and reference panel merged lines are desired
        #pd.DataFrame.to_csv(good_data, 'hap_and_refPanel_merged.txt', sep='\t',
        #header=True, index=False)
        del hap_panel

    else:
        print('# Haplotype reference panel is not provided....\n'
              '  - Only using the samples in the input ("%s") data.' %
              (input_file))
    ''' Step 02 - A (add on - ii) ** merge bed-regions if provided to limit phase extension
                                        and group the data by "contig". '''
    if not bed_file:
        # group data only at "contig" level, keep the sort as it is
        print('# No bed file is given ... ')
        print(
            '  - So, grouping the haplotype file only by chromosome (contig)')

        good_data_by_group = good_data.groupby('CHROM')

    elif bed_file:
        print(
            '# Merging the bed boundaries from "%s" with the input haplotype file ... "%s" '
            % (bed_file, input_file))

        # merge/intersect the "bed regions" and "haplotype file"
        # then groupy "contig" and "bed regions" by passing it to function "merge_hap_with_bed()"
        good_data_by_group = merge_hap_with_bed(my_bed, good_data)
        # ** for future: we can also run multiprocessing while merging "hap file" with "bed regions"
        del my_bed

    ch_vals = [x for x in good_data['CHROM'].unique()]
    ''' Step 02 - A (**add on - iii):
        - Write the initial haplotype data.
        - Compute the statistics of the initial phased file for SOI if required '''

    print('# Writing initial haplotype for sample "%s" in the file "%s" ' %
          (soi, 'initial_haplotype_' + soi + '.txt'))

    # select the colums of interest
    # initial_haplotype = good_data[['CHROM', 'POS', 'REF', 'all-alleles', soi + ':PI', soi + ':PG_al']]. \
    #     sort_values(by=['CHROM', 'POS'])
    initial_haplotype_dd = good_data[[
        'CHROM', 'POS', 'REF', 'all-alleles', soi + ':PI', soi + ':PG_al'
    ]]
    initial_haplotype = initial_haplotype_dd.compute().sort_values(
        by=['CHROM', 'POS'])

    # write this initial haplotype to a file
    initial_haplotype.to_csv(outputdir + '/' + 'initial_haplotype_' + soi +
                             '.txt',
                             sep='\t',
                             header=True,
                             index=False)

    if hapstats == 'yes':
        print(
            '  - Computing the descriptive statistics of the haplotype data before phase extension'
        )

        # pipe the data to a function to compute haplotype statistics
        compute_haplotype_stats(initial_haplotype, soi, 'initial', outputdir)
    else:
        print(
            '  - Proceeding to phase-extension without preparing descriptive statistics of initial haplotype state.'
        )
    ''' Step 02 - B: - Split the data (grouped by chromosome (contig) values.
                        - Store data in disk or memory.
                        - Multiprocess each chunks separately '''
    print()
    print('# Starting multiprocessing using "%i" processes ' % (nt))

    # ** new method: create a folder to store the data to disk (rather than memory)
    # ** (see old method for comparison)
    # if os.path.exists('chunked_Data_' + soi):
    #     shutil.rmtree('chunked_Data_' + soi, ignore_errors=False, onerror=None)
    # os.makedirs('chunked_Data_' + soi + '/', exist_ok=True)
    ''' Step 02 - B (i)'''

    ################### old method - ** if possible reuse this method in future.
    # take the large dataframe that is grouped by contig and ..
    # .. keep chunks of dataframes as as OrderedDict(list of (keys, Dataframe object))
    #df_list = collections.OrderedDict()
    ########################################

    # # new method - storing data to disk
    # for chr_, data_by_chr in good_data_by_group:
    #     chunked_path = 'chunked_Data_' + soi + '/' + soi + ':' + str(chr_)
    #     data_by_chr.to_csv(chunked_path,sep='\t', index=False, header=True)

    # clear memory - does it do its job? **
    # initial_haplotype = None; good_data = None; input_file = None
    # # good_data_by_group = None; samples = None
    # data_by_chr = None
    # del initial_haplotype, good_data, input_file, good_data_by_group, samples, data_by_chr
    ''' Now, pipe the procedure to next function for multiprocessing (i.e Step 02 - C) '''
    multiproc(sample_list, pool, hapstats, soi, outputdir, addmissingsites,
              bed_file, snp_threshold, num_of_hets, lods_cut_off, maxed_as,
              writelod, good_data_by_group, ch_vals)

    # remove the chunked data folder ** (this can be retained if need be)
    # shutil.rmtree('chunked_Data_' + soi, ignore_errors=False, onerror=None)

    print('End :)')
Example #32
import gc
from sklearn import model_selection
from dask import dataframe as dd
from avito.common import csv_loader, column_selector, pocket_xgb, pocket_timer, pocket_logger, holdout_validator
from avito.fe import additional_fe

logger = pocket_logger.get_my_logger()
timer = pocket_timer.GoldenTimer(logger)
dtypes = csv_loader.get_featured_dtypes()
predict_col = column_selector.get_predict_col()
lgb_col = column_selector.get_stem_col()
lgb_col = [c.replace(" ", "_") for c in lgb_col]
# tail = lgb_col[-5:]
# print(tail)

train = dd.read_csv(PRED_TRAIN).compute()
gazou = dd.read_csv(GAZOU_TRAIN).compute()
#gazou["image"] = gazou["image"].apply(lambda w: w.replace(".jpg", ""))
train = pd.merge(train, gazou, on="image", how="left")
desc_train = scipy.sparse.load_npz(DENSE_TF_TRAIN)
title_train = scipy.sparse.load_npz(TITLE_CNT_TRAIN)
timer.time("load csv in ")

cat_col = [
    "region", "city", "parent_category_name", "category_name", "param_1",
    "param_2", "param_3", "param_all", "image_top_1", "user_type"
]
train = pd.get_dummies(
    data=train,
    prefix=cat_col,
    dummy_na=True,
Example #33
def test_header_None():
    with filetexts({".tmp.1.csv": "1,2", ".tmp.2.csv": "", ".tmp.3.csv": "3,4"}):
        df = dd.read_csv(".tmp.*.csv", header=None)
        expected = pd.DataFrame({0: [1, 3], 1: [2, 4]})
        assert_eq(df.compute().reset_index(drop=True), expected)
Example #34
def test_multiple_read_csv_has_deterministic_name():
    with filetexts({"_foo.1.csv": csv_text, "_foo.2.csv": csv_text}):
        a = dd.read_csv("_foo.*.csv")
        b = dd.read_csv("_foo.*.csv")

        assert sorted(a.dask.keys(), key=str) == sorted(b.dask.keys(), key=str)
Example #35
def test_read_csv_has_different_names_based_on_blocksize():
    with filetext(csv_text) as fn:
        a = dd.read_csv(fn, blocksize="10kB")
        b = dd.read_csv(fn, blocksize="20kB")
        assert a._name != b._name
Example #36
def test_read_csv_sensitive_to_enforce():
    with filetexts(csv_files, mode="b"):
        a = dd.read_csv("2014-01-*.csv", enforce=True)
        b = dd.read_csv("2014-01-*.csv", enforce=False)
        assert a._name != b._name
Example #37
def get_branchDetail():
    data = dd.read_csv('F:\\Data\\BranchFact\\20*.csv', encoding='gbk', dtype={'ORGCODE': object, 'PLUID': object})
    return data
Example #38
def test_windows_line_terminator():
    text = "a,b\r\n1,2\r\n2,3\r\n3,4\r\n4,5\r\n5,6\r\n6,7"
    with filetext(text) as fn:
        df = dd.read_csv(fn, blocksize=5, lineterminator="\r\n")
        assert df.b.sum().compute() == 2 + 3 + 4 + 5 + 6 + 7
        assert df.a.sum().compute() == 1 + 2 + 3 + 4 + 5 + 6
Example #39
"""
@author: divya
"""

#import dask, sys
from dask.diagnostics import ProgressBar
import dask.dataframe as dd
import pandas as pd, numpy as np
import dask
from pandas import to_datetime

#Progress Bar
pbar = ProgressBar()
pbar.register()

#read GNDITEM file
df_gnditem = dd.read_csv('GNDITEM_utf_8.csv', encoding='utf-8')
print(df_gnditem.head())

#Select only price > 0 and quantity = 1, and drop NA values
df2 = df_gnditem[(df_gnditem.PRICE > 0.0) & (df_gnditem.QUANTITY == 1.0)]
df2 = df2.dropna()
print(df2.head())
print(dd.compute(df2.count()))

#read Loyalty file
df_loyalty = pd.read_csv('loyalty_utf_8.csv', header=0, encoding='utf-8')

len(df_loyalty)
#Drop NA values
df_loyalty = df_loyalty.dropna()
len(df_loyalty)