def read_csv_dask(filepath, usecols=None):
    # Pandas writes CSV files out as a single file
    if os.path.isfile(filepath):
        return dd.read_csv(filepath, usecols=usecols)
    # Dask may have written out CSV files in partitions
    filepath_expr = filepath.replace('.csv', '*.csv')
    return dd.read_csv(filepath_expr, usecols=usecols)
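# --- A minimal usage sketch for read_csv_dask above (not part of the original
# snippet). The file names and toy data are illustrative assumptions; it only
# exercises the two branches: a single pandas-written CSV vs. a dask-written
# set of per-partition CSVs picked up by the '*.csv' glob.
import os
import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({'id': [1, 2, 3], 'value': [10.0, 20.0, 30.0]})

# Case 1: pandas wrote a single CSV file.
pdf.to_csv('single.csv', index=False)
print(read_csv_dask('single.csv', usecols=['id', 'value']).compute())

# Case 2: dask wrote one CSV per partition (parts0.csv, parts1.csv, ...),
# so read_csv_dask falls back to reading the 'parts*.csv' glob.
dd.from_pandas(pdf, npartitions=2).to_csv('parts*.csv', index=False)
print(read_csv_dask('parts.csv').compute())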
def test_late_dtypes():
    text = 'numbers,names,more_numbers,integers\n'
    for i in range(1000):
        text += '1,foo,2,3\n'
    text += '1.5,bar,2.5,3\n'
    with filetext(text) as fn:
        sol = pd.read_csv(fn)
        with pytest.raises(ValueError) as e:
            dd.read_csv(fn, sample=50).compute(get=get_sync)

        msg = ("Mismatched dtypes found.\n"
               "Expected integers, but found floats for columns:\n"
               "- 'more_numbers'\n"
               "- 'numbers'\n"
               "\n"
               "To fix, specify dtypes manually by adding:\n"
               "\n"
               "dtype={'more_numbers': float,\n"
               "       'numbers': float}\n"
               "\n"
               "to the call to `read_csv`/`read_table`.\n"
               "\n"
               "Alternatively, provide `assume_missing=True` to interpret "
               "all unspecified integer columns as floats.")

        assert str(e.value) == msg

        # Specifying dtypes works
        res = dd.read_csv(fn, sample=50,
                          dtype={'more_numbers': float, 'numbers': float})
        assert_eq(res, sol)
def test_index_col():
    with filetext(text) as fn:
        try:
            dd.read_csv(fn, chunkbytes=30, index_col='name')
            assert False
        except ValueError as e:
            assert 'set_index' in str(e)
def test_read_csv_sync(loop):
    import dask.dataframe as dd
    import pandas as pd
    with cluster(nworkers=3) as (s, [a, b, c]):
        with make_hdfs() as (hdfs, basedir):
            with hdfs.open('%s/1.csv' % basedir, 'wb') as f:
                f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')
            with hdfs.open('%s/2.csv' % basedir, 'wb') as f:
                f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

            with Client(('127.0.0.1', s['port']), loop=loop) as e:
                values = dd.read_csv('hdfs://%s/*.csv' % basedir,
                                     lineterminator='\n',
                                     collection=False, header=0)
                futures = e.compute(values)
                assert all(isinstance(f, Future) for f in futures)

                L = e.gather(futures)
                assert isinstance(L[0], pd.DataFrame)
                assert list(L[0].columns) == ['name', 'amount', 'id']

                df = dd.read_csv('hdfs://%s/*.csv' % basedir,
                                 lineterminator='\n',
                                 collection=True, header=0)
                assert isinstance(df, dd.DataFrame)
                assert list(df.head().iloc[0]) == ['Alice', 100, 1]
def test_read_csv_raises_on_no_files():
    fn = '.not.a.real.file.csv'
    try:
        dd.read_csv(fn)
        assert False
    except (OSError, IOError) as e:
        assert fn in str(e)
def test_index_col():
    with filetext(csv_text) as fn:
        try:
            dd.read_csv(fn, blocksize=30, index_col='name')
            assert False
        except ValueError as e:
            assert 'set_index' in str(e)
def test_assume_missing():
    text = 'numbers,names,more_numbers,integers\n'
    for i in range(1000):
        text += '1,foo,2,3\n'
    text += '1.5,bar,2.5,3\n'
    with filetext(text) as fn:
        sol = pd.read_csv(fn)

        # assume_missing affects all columns
        res = dd.read_csv(fn, sample=50, assume_missing=True)
        assert_eq(res, sol.astype({'integers': float}))

        # assume_missing doesn't override specified dtypes
        res = dd.read_csv(fn, sample=50, assume_missing=True,
                          dtype={'integers': 'int64'})
        assert_eq(res, sol)

        # assume_missing works with dtype=None
        res = dd.read_csv(fn, sample=50, assume_missing=True, dtype=None)
        assert_eq(res, sol.astype({'integers': float}))

    text = 'numbers,integers\n'
    for i in range(1000):
        text += '1,2\n'
    text += '1.5,2\n'
    with filetext(text) as fn:
        sol = pd.read_csv(fn)

        # assume_missing is ignored when all dtypes are specified
        df = dd.read_csv(fn, sample=30, dtype='int64', assume_missing=True)
        assert df.numbers.dtype == 'int64'
def test_read_csv_header_issue_823():
    text = '''a b c-d\n1 2 3\n4 5 6'''.replace(' ', '\t')
    with filetext(text) as fn:
        df = dd.read_csv(fn, sep='\t')
        assert_eq(df, pd.read_csv(fn, sep='\t'))

        df = dd.read_csv(fn, delimiter='\t')
        assert_eq(df, pd.read_csv(fn, delimiter='\t'))
def test_string_blocksize():
    with filetext(timeseries) as fn:
        a = dd.read_csv(fn, blocksize='30B')
        b = dd.read_csv(fn, blocksize='30')
        assert a.npartitions == b.npartitions

        c = dd.read_csv(fn, blocksize='64MiB')
        assert c.npartitions == 1
def animate(i, symbol):
    print(symbol)
    df = dd.read_csv(
        './{0}_*.csv'.format(symbol),
        header=None,
        names=['DateTime', 'Ticker', 'Open', 'High', 'Low', 'Close', 'Volume'])
    df['Volume'] = df['Volume'].apply(value_to_float)
    df = df.compute()

    #Read the FB_Max and FB_Min from file
    dfFB = pd.read_csv('./FB_Max_Min.csv', header=0)
    price_max = dfFB['FB_Max'].tail(1).values[0]
    price_min = dfFB['FB_Min'].tail(1).values[0]
    if (price_max == 0.0 and price_min == 0.0):
        #price_max = df['Close'].max()
        #price_min = df['Close'].min()
        price_max = 2 * (df['Close'].max() + df['Close'].min() +
                         df['Close'].tail(1).values[0]) / 3 - df['Close'].min()
        price_min = 2 * (df['Close'].max() + df['Close'].min() +
                         df['Close'].tail(1).values[0]) / 3 - df['Close'].max()

    # Fibonacci Levels considering original trend as upward move
    diff = price_max - price_min
    level1 = price_max - 0.236 * diff
    level2 = price_max - 0.382 * diff
    level3 = price_max - 0.618 * diff

    df['Volume_New'] = df['Volume'] - df['Volume'].shift(1)
    df['Volume_New'] = df['Volume_New'].fillna(0)
    df['Volume_New'] = np.where(df['Volume_New'] < 0, 0, df['Volume_New'])
    #df['Volume_New'] = df['Volume_New']/1e6  # dollar volume in millions

    #df['SMA(10)'] = df['Close'].rolling(window=10).mean()
    #df['SMA(20)'] = df['Close'].rolling(window=20).mean()
    #df['SMA(50)'] = df['Close'].rolling(window=50).mean()
    df['SMA(150)'] = df['Close'].rolling(window=150).mean()
    df['SMA(200)'] = df['Close'].rolling(window=200).mean()

    #y = df['Close'].tail(60).values
    #x = df.tail(60).index.values
    #fit = np.polyfit(x, y, deg=1)

    titleColor = 'red'
    pctChange = 100 * (df['Close'].tail(1).values[0] -
                       df['Open'].tail(1).values[0]) / df['Open'].tail(1).values[0]
    if pctChange > 0:
        titleColor = 'green'
    elif pctChange == 0.0:
        titleColor = 'black'

    df2 = df[['DateTime', 'Open', 'High', 'Low', 'Close', 'Volume']]
    print(df2.head(5))

    #clear ax1, ax2
    ax1.clear()
    ax2.clear()
    ax1.set_title('{0} ({1:.2f}%)'.format(symbol, pctChange),
                  color=titleColor, fontsize=15)
    #ax1.plot(x, fit[0] * x + fit[1], color='red', linewidth=5.0)
    ax1.axhspan(level1, price_min, alpha=0.4, color='lightsalmon')
    #ax1.axhspan(level2, level1, alpha=0.5, color='palegoldenrod')
    ax1.axhspan(level2, level1, alpha=0.5, color='gold')
    ax1.axhspan(level3, level2, alpha=0.5, color='palegreen')
    ax1.axhspan(price_max, level3, alpha=0.5, color='powderblue')

    #Plot Close, High as line
    df.plot(y=['Close', 'High'], color=['Blue', 'Green'], ax=ax1)
    #df.plot(y=['SMA(10)', 'SMA(20)', 'SMA(50)', 'SMA(200)'], color=['Red', 'Yellow', 'Purple', 'Orange'], ax=ax1)
    #df.plot(y=['SMA(10)', 'SMA(50)', 'SMA(200)'], color=['Red', 'Purple', 'Orange'], ax=ax1)
    df.plot(y=['SMA(150)', 'SMA(200)'], color=['Yellow', 'Purple'], ax=ax1)

    yLast = df.tail(1)['Close'].values[0]
    #print(yLast)
    ax1.annotate('%0.3f' % yLast, xy=(0.95, yLast), xytext=(8, 0),
                 xycoords=('axes fraction', 'data'),
                 textcoords='offset points')
    ax1.axhline(y=yLast, color='y', linestyle='-.')
    ax1.legend(loc='upper left')

    #Plot Volume as positive and negative bar
    #df['Volume']=df['Volume'].loc[::10]
    quotes = list(
        zip(df.index.tolist(), df['Open'].tolist(), df['High'].tolist(),
            df['Low'].tolist(), df['Close'].tolist(), df['Volume_New'].tolist()))
    bc = volume_overlay3(ax2, quotes, colorup='g', colordown='r',
                         width=2.5, alpha=1.0)
    ax2.set_ylim(df['Volume_New'].min(), 5 * df['Volume_New'].max())
    ax2.add_collection(bc)

    # Formatter Class to eliminate weekend data gaps on chart
    class MyFormatter(Formatter):
        def __init__(self, datetimes, fmt='%Y-%m-%d %H:%M:%S'):
            self.datetimes = datetimes
            self.fmt = fmt

        def __call__(self, x, pos=0):
            'Return the label for time x at position pos'
            ind = int(round(x))
            #print(ind)
            if ind >= len(self.datetimes) or ind < 0:
                return ''
            #print(self.datetimes[ind])
            return self.datetimes[ind].strftime(self.fmt)
            #return self.dates[ind]

    dff = pd.to_datetime(df['DateTime'])
    #print(type(dff))
    formatter = MyFormatter(dff)

    ax1.set_xticklabels(df["DateTime"].tolist(), rotation=15, ha='right')
    ax1.xaxis.set_major_formatter(formatter)
    #ax1.set_xlim(0, len(df)-1)
    ax1.minorticks_on()
    # Customize the major grid
    ax1.grid(which='major', linestyle='-', linewidth='0.5', color='red')
    # Customize the minor grid
    ax1.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
# In[ ]:

# https://www.kaggle.com/cttsai/blend-app-channel-and-app-mean
def mean_feat(train, test, attrs=[]):
    return pd.merge(test,
                    train.groupby(attrs)['is_attributed'].mean().reset_index(),
                    on=attrs, how='left').fillna(0).set_index('click_id')


# In[ ]:

import dask.dataframe as dd

dask_df = dd.read_csv('../input/train.csv', dtype=dtypes)
dask_df.npartitions


# In[ ]:

df_pos = dask_df[(dask_df['is_attributed'] == 1)].compute()
print("Total positives : ", df_pos.shape[0])
df_neg = dask_df[(dask_df['is_attributed'] == 0)].compute()
print("Total Negatives : ", df_neg.shape[0])
print("Base percentage of positives : ",
      100 * df_pos.shape[0] / df_neg.shape[0])


# In[ ]:

df_neg = df_neg.sample(n=3000000)  # 2.25 million = 20% , 4.5 = ~10%
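# In[ ]:

# A tiny illustrative call of mean_feat defined above (not part of the original
# notebook): the toy frames and column values here are assumptions chosen only
# to show the group-mean merge; the notebook itself runs on the '../input' CSVs.
import pandas as pd

toy_train = pd.DataFrame({'app': [1, 1, 2], 'is_attributed': [0, 1, 1]})
toy_test = pd.DataFrame({'click_id': [10, 11], 'app': [1, 2]})
print(mean_feat(toy_train, toy_test, attrs=['app']))
# click_id 10 gets the mean rate for app 1 (0.5); click_id 11 gets 1.0 for app 2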
def test_empty_csv_file():
    with filetext('a,b') as fn:
        df = dd.read_csv(fn, header=0)
        assert len(df.compute()) == 0
        assert list(df.columns) == ['a', 'b']
path = r'C:\Users\trist\Documents\db_loc\home_loan\files'

params = {
    'num_leaves': 1023,
    'objective': 'regression',
    'min_data_in_leaf': 100,
    'learning_rate': 0.01,
    'feature_fraction': 1.0,
    'bagging_fraction': 0.8,
    'bagging_freq': 2,
    'metric': 'auc',
    'num_threads': 12
}
MAX_ROUNDS = 10000

df_app_train = dd.read_csv(path + '/application_train.csv')
df_app_test = dd.read_csv(path + '/application_test.csv')
df_concat = dd.concat([df_app_test, df_app_train])


def fill_na_encodings(df):
    df = df.mask(df == 'XNA/XAP', '')
    df = df.mask(df == 'XNA', '')
    # df = df.replace('XAP', np.nan)
    df = df.mask(df == 365243.00, np.nan)
    for i, j in zip(df.dtypes, df.columns):
        if i == 'object':
            pass
        else:
def calculate_avg_arrival_rates_per_hour(self):
    hour_stats = {
        event_name: "Not Recorded"
        for event_name in self.event_names
    }
    for event_name in tqdm_notebook(self.event_names):
        mapped_name = self.source_map[event_name]
        if mapped_name is not None:
            all_columns = [self.unique_id, mapped_name]
            df = dd.read_csv(self.data_path, usecols=all_columns)
            df = df.dropna()
            df = df.compute()
            df[mapped_name] = pd.to_datetime(df[mapped_name],
                                             errors="coerce",
                                             infer_datetime_format=True)
            df.index = df[mapped_name]
            total_counts = df.groupby([
                df.index.year, df.index.month, df.index.day, df.index.hour
            ])[self.unique_id].count().values

            fig, ax = plt.subplots()
            x = [e for e in range(0, 24 * 10)]
            x = np.array(x)
            days = np.random.choice(360, 10, replace=False)
            total = []
            [
                total.extend(total_counts[(12 * x):(12 * x) + 24])
                for x in days
            ]
            c = np.mean(total)
            fs, pw = ss.periodogram(x)
            max_y = max(pw)  # Find the maximum y value
            dom_freq = fs[pw.argmax()]
            amp = sqrt(sum(n * n for n in total) / len(total)) * sqrt(2)
            #print(dom_freq)
            #print(amp)
            #print(c)
            #print(fs)
            #ax.plot(fs,pw)
            params, params_covariance = optimize.curve_fit(sin_func, x, total,
                                                           p0=[amp, c])
            ax.plot(x, total, 'bo')
            ax.plot(x, sin_func(x, params[0], params[1]),
                    label='Fitted function')

            hour_avgs = {}
            for x in self.hour_ranges:
                hour_slice = df.between_time(*x)
                total_counts = hour_slice.groupby([
                    hour_slice.index.year, hour_slice.index.month,
                    hour_slice.index.day, hour_slice.index.hour
                ])[self.unique_id].count().values
                mean = np.mean(total_counts)
                hour_avgs[x] = mean
            hour_stats[event_name] = hour_avgs

    for k, v in hour_stats.items():
        x = list(v.keys())
        x_vals = np.array([e for e in range(0, 24)])
        y = list(v.values())
        fig, ax = plt.subplots()
        ax.plot(x_vals, y, 'bo')
        ax.plot(x_vals, sin_func(x_vals, params[0], params[1]),
                label='Fitted function')
    return hour_stats
def time_read_csv(self, get):
    return dd.read_csv('{}/*.csv'.format(self.data_dir)).compute(get=get)
import sys

import dask.dataframe as dd
import numpy as np
from dask.diagnostics import progress

from IPHeatmap.settings_local import DATABASES

if __name__ == '__main__':
    columns = ['network', 'geoname_id', 'registered_country_geoname_id',
               'represented_country_geoname_id', 'is_anonymous_proxy',
               'is_satellite_provider', 'postal_code', 'latitude',
               'longitude', 'accuracy_radius']
    types = {'geoname_id': np.int32,
             'registered_country_geoname_id': np.int32,
             'latitude': np.float16,
             'longitude': np.float16}
    used_cols = ['geoname_id', 'latitude', 'longitude',
                 'registered_country_geoname_id']

    df = dd.read_csv('data/GeoLite2-City-CSV_20190618/GeoLite2-City-Blocks-IPv4.csv',
                     assume_missing=True, usecols=used_cols)
    df = df.dropna()
    df = df.astype(dtype=types)  # "reduce resolution"
    print(df.head(100), df.dtypes, df.index, sep='\n')

    uri = 'postgresql://*****:*****@localhost/geodata'.format(DATABASES.get('default').get('password'))

    with progress.ProgressBar():
        dd.to_sql(df, 'heatmapAPI_geonode', uri, if_exists='append',
                  index=False, parallel=True)
    sys.exit(0)
def test_categorical_known():
    text1 = normalize_text(
        """
    A,B
    a,a
    b,b
    a,a
    """
    )
    text2 = normalize_text(
        """
    A,B
    a,a
    b,b
    c,c
    """
    )
    dtype = pd.api.types.CategoricalDtype(["a", "b", "c"], ordered=False)
    with filetexts({"foo.1.csv": text1, "foo.2.csv": text2}):
        result = dd.read_csv("foo.*.csv", dtype={"A": "category", "B": "category"})
        assert result.A.cat.known is False
        assert result.B.cat.known is False
        expected = pd.DataFrame(
            {
                "A": pd.Categorical(
                    ["a", "b", "a", "a", "b", "c"], categories=dtype.categories
                ),
                "B": pd.Categorical(
                    ["a", "b", "a", "a", "b", "c"], categories=dtype.categories
                ),
            },
            index=[0, 1, 2, 0, 1, 2],
        )
        assert_eq(result, expected)

        # Specify a dtype
        result = dd.read_csv("foo.*.csv", dtype={"A": dtype, "B": "category"})
        assert result.A.cat.known is True
        assert result.B.cat.known is False
        tm.assert_index_equal(result.A.cat.categories, dtype.categories)
        assert result.A.cat.ordered is False
        assert_eq(result, expected)

        # ordered
        dtype = pd.api.types.CategoricalDtype(["a", "b", "c"], ordered=True)
        result = dd.read_csv("foo.*.csv", dtype={"A": dtype, "B": "category"})
        expected["A"] = expected["A"].cat.as_ordered()
        assert result.A.cat.known is True
        assert result.B.cat.known is False
        assert result.A.cat.ordered is True
        assert_eq(result, expected)

        # Specify "unknown" categories
        result = dd.read_csv(
            "foo.*.csv", dtype=pd.api.types.CategoricalDtype(ordered=False)
        )
        assert result.A.cat.known is False

        result = dd.read_csv("foo.*.csv", dtype="category")
        assert result.A.cat.known is False
def test_usecols():
    with filetext(timeseries) as fn:
        df = dd.read_csv(fn, blocksize=30, usecols=["High", "Low"])
        expected = pd.read_csv(fn, usecols=["High", "Low"])
        assert (df.compute().values == expected.values).all()
def test_read_csv_skiprows_range():
    with filetext(csv_text) as fn:
        f = dd.read_csv(fn, skiprows=range(5))
        result = f
        expected = pd.read_csv(fn, skiprows=range(5))
        assert_eq(result, expected)
def test_read_csv_singleton_dtype():
    data = b"a,b\n1,2\n3,4\n5,6"
    with filetext(data, mode="wb") as fn:
        assert_eq(pd.read_csv(fn, dtype=float), dd.read_csv(fn, dtype=float))
def test_none_usecols():
    with filetext(csv_text) as fn:
        df = dd.read_csv(fn, usecols=None)
        assert_eq(df, pd.read_csv(fn, usecols=None))
print(q5.qsize())
result_count = q5.qsize()

data = client.gather(q5)
while data.qsize() < result_count:
    print('sleeping')
    sleep(.1)
print(data.qsize())

iterdata = [*iterq(data)]
print(len(iterdata))

df = pd.DataFrame(iterdata)
print(len(df))

ddf = dd.from_pandas(df, npartitions=4)
remote_ddf = client.scatter(ddf)
remote_result = remote_ddf.result()
remote_result.to_csv('./export4-*.csv')

new_ddf = dd.read_csv('./export4-*.csv')
new_ddf.compute()

#%%
print(len(remote_result))

#%%

#%%
df.info()

#%%
df.to_hdf('./export7.hdf', 'key')

#%%
new_df = pd.read_hdf('./export7.hdf')
def test_csv_with_integer_names():
    with filetext("alice,1\nbob,2") as fn:
        df = dd.read_csv(fn, header=None)
        assert list(df.columns) == [0, 1]
def test_late_dtypes():
    text = "numbers,names,more_numbers,integers,dates\n"
    for i in range(1000):
        text += "1,,2,3,2017-10-31 00:00:00\n"
    text += "1.5,bar,2.5,3,4998-01-01 00:00:00\n"

    date_msg = (
        "\n"
        "\n"
        "-------------------------------------------------------------\n"
        "\n"
        "The following columns also failed to properly parse as dates:\n"
        "\n"
        "- dates\n"
        "\n"
        "This is usually due to an invalid value in that column. To\n"
        "diagnose and fix it's recommended to drop these columns from the\n"
        "`parse_dates` keyword, and manually convert them to dates later\n"
        "using `dd.to_datetime`."
    )

    with filetext(text) as fn:
        sol = pd.read_csv(fn)
        msg = (
            "Mismatched dtypes found in `pd.read_csv`/`pd.read_table`.\n"
            "\n"
            "+--------------+---------+----------+\n"
            "| Column       | Found   | Expected |\n"
            "+--------------+---------+----------+\n"
            "| more_numbers | float64 | int64    |\n"
            "| names        | object  | float64  |\n"
            "| numbers      | float64 | int64    |\n"
            "+--------------+---------+----------+\n"
            "\n"
            "- names\n"
            "  ValueError(.*)\n"
            "\n"
            "Usually this is due to dask's dtype inference failing, and\n"
            "*may* be fixed by specifying dtypes manually by adding:\n"
            "\n"
            "dtype={'more_numbers': 'float64',\n"
            "       'names': 'object',\n"
            "       'numbers': 'float64'}\n"
            "\n"
            "to the call to `read_csv`/`read_table`."
        )

        with pytest.raises(ValueError) as e:
            dd.read_csv(fn, sample=50, parse_dates=["dates"]).compute(scheduler="sync")
        assert e.match(msg + date_msg)

        with pytest.raises(ValueError) as e:
            dd.read_csv(fn, sample=50).compute(scheduler="sync")
        assert e.match(msg)

        msg = (
            "Mismatched dtypes found in `pd.read_csv`/`pd.read_table`.\n"
            "\n"
            "+--------------+---------+----------+\n"
            "| Column       | Found   | Expected |\n"
            "+--------------+---------+----------+\n"
            "| more_numbers | float64 | int64    |\n"
            "| numbers      | float64 | int64    |\n"
            "+--------------+---------+----------+\n"
            "\n"
            "Usually this is due to dask's dtype inference failing, and\n"
            "*may* be fixed by specifying dtypes manually by adding:\n"
            "\n"
            "dtype={'more_numbers': 'float64',\n"
            "       'numbers': 'float64'}\n"
            "\n"
            "to the call to `read_csv`/`read_table`.\n"
            "\n"
            "Alternatively, provide `assume_missing=True` to interpret\n"
            "all unspecified integer columns as floats."
        )

        with pytest.raises(ValueError) as e:
            dd.read_csv(fn, sample=50, dtype={"names": "O"}).compute(scheduler="sync")
        assert str(e.value) == msg

        with pytest.raises(ValueError) as e:
            dd.read_csv(
                fn, sample=50, parse_dates=["dates"], dtype={"names": "O"}
            ).compute(scheduler="sync")
        assert str(e.value) == msg + date_msg

        msg = (
            "Mismatched dtypes found in `pd.read_csv`/`pd.read_table`.\n"
            "\n"
            "The following columns failed to properly parse as dates:\n"
            "\n"
            "- dates\n"
            "\n"
            "This is usually due to an invalid value in that column. To\n"
            "diagnose and fix it's recommended to drop these columns from the\n"
            "`parse_dates` keyword, and manually convert them to dates later\n"
            "using `dd.to_datetime`."
        )

        with pytest.raises(ValueError) as e:
            dd.read_csv(
                fn,
                sample=50,
                parse_dates=["dates"],
                dtype={"more_numbers": float, "names": object, "numbers": float},
            ).compute(scheduler="sync")
        assert str(e.value) == msg

        # Specifying dtypes works
        res = dd.read_csv(
            fn,
            sample=50,
            dtype={"more_numbers": float, "names": object, "numbers": float},
        )
        assert_eq(res, sol)
def test_empty_csv_file():
    with filetext("a,b") as fn:
        df = dd.read_csv(fn, header=0)
        assert len(df.compute()) == 0
        assert list(df.columns) == ["a", "b"]
def time_read_csv_meta(self, get):
    return dd.read_csv('{}/*.csv'.format(self.data_dir))
def test_read_csv_no_sample():
    with filetexts(csv_files, mode="b") as fn:
        df = dd.read_csv(fn, sample=False)
        assert list(df.columns) == ["name", "amount", "id"]
import pandas as pd
import dask.dataframe as dd
import numpy as np
import boto3
import s3fs
import psycopg2
import sqlalchemy
from sqlalchemy import create_engine
import os

# NBA Data Sources
nba_team = dd.read_csv(
    's3://sportsdatawarehouse/NBA/teams/NBA_teams_team stats_*.csv').compute()
nba_player = dd.read_csv(
    's3://sportsdatawarehouse/NBA/players/NBA_Players_per game stats_*.csv'
).compute()
nba_salary = dd.read_csv(
    's3://sportsdatawarehouse/NBA/salary/*nbasalary.csv').compute()
nba_teamid = dd.read_csv(
    's3://sportsdatawarehouse/key_identifiers/nba_team.csv').compute()
nba_playerid = dd.read_csv(
    's3://sportsdatawarehouse/key_identifiers/nba_players.csv').compute()
nba_game_details = dd.read_csv(
    's3://sportsdatawarehouse/NBA/games_details.csv').compute()

# WNBA Data Sources
wnba_team = dd.read_csv(
    's3://sportsdatawarehouse/WNBA/team/wbna_team*.csv').compute()
wnba_player = dd.read_csv(
    's3://sportsdatawarehouse/WNBA/players/wbna_player*.csv').compute()
wnba_salary = dd.read_csv(
    's3://sportsdatawarehouse/WNBA/salaries/*_team_salaries.csv').compute()
def get_HQDetail():
    data = dd.read_csv('F:\\Data\\HQFact\\HQ20*.csv', encoding='gbk',
                       dtype={'ORGCODE': object, 'PLUID': object})
    return data
def get_DCDetail():
    data = dd.read_csv('F:\\Data\\FactWithCost\\20*.csv', encoding='gbk',
                       dtype={'CUSTCODE': object, 'PLUCODE': object})
    return data
def phase_converter(soi, outputdir, nt, input_file, lods_cut_off,
                    snp_threshold, num_of_hets, maxed_as, bed_file, refhap,
                    use_sample, hapstats, writelod, addmissingsites):
    '''Assign the number of process - this is the optimal position to start multiprocessing !
    **note: number of process should be declared after all the global variables are declared,
    because each pool will need to copy the variable/value of global variables. '''
    pool = Pool(processes=nt)  # number of pool to run at once; default at 1

    ''' Step 01: Read the input file and and prepare two files as output '''
    # a) One output file contains extended phase-states for the sample of interest (soi)
    # b) another output file contains the lines that have missing data for sample of interest
    data = dd.read_csv(input_file, sep='\t')
    data_header = list(data.columns)

    pg_al_set = {al for al in data_header if al.endswith(':PG_al')}
    pi_set = {pi for pi in data_header if pi.endswith(':PI')}

    soi_PI_index = soi + ':PI'
    soi_PG_index = soi + ':PG_al'

    # check if soi is in header
    if not soi_PI_index in pi_set:
        assert False, "soi pi index is not found"
    if not soi_PG_index in pg_al_set:
        assert False, "soi pg index is not found"

    os.makedirs(outputdir, exist_ok=True)
    missing_fpath = outputdir + '/' + "missingdata_" + soi + ".txt"

    missing = data[(data[soi_PI_index] == '.') | (data[soi_PG_index] == '.')]
    missing.to_csv(
        missing_fpath,
        sep='\t',
        index=False,
    )
    good_data = data[(data[soi_PI_index] != '.') | (data[soi_PG_index] != '.')]

    ''' Step 01 - B: check if "bed file" and "haplotype reference" file are given.
        - then read the "bed file" and "haplotype file" into the memory.
        - these data will be used downstream after reading the haplotype file as "good_data" '''

    # check and load bed file
    if bed_file:
        ''' we want to extend phase state only within bed boundries.
            - so, we merge the "input haplotype-file" with "bed-file". '''
        my_bed = dd.read_csv(bed_file, sep='\t', names=['CHROM', 'start', 'end'])
        my_bed['CHROM'] = my_bed['CHROM'].astype(str)  # setting CHROM column as string type ..
        # this is necessary because there has been problem with groupby operations downstream
    else:
        print('# Genomic bed file is not provided ... ')

    # check and load "haplotype reference panel"
    if refhap:
        hap_panel = dd.read_csv(refhap, sep='\t').drop(['REF', 'ALT'], axis=1)
        hap_panel['CHROM'] = hap_panel['CHROM'].astype(str)  # setting CHROM as string type data

        # also find the sample in refHap panel
        hap_panel_samples = find_samples(list(hap_panel.keys()))
    else:
        hap_panel_samples = []
        print('# Haplotype reference panel is not provided ... ')
        print('  So, phase extension will run using the samples available in the input haplotype file. ')

    ''' Step 01 - D: Prepare the samples to use the data from. '''
    ''' Prepare a list of tuples of samples (PI, PG_al) from the input data and update it as needed.
        - **Note: the sample list should always include the soi (sample of interest)
            - this is done to include observation from soi rather than introducing a pseudo count
              when transition is missing from some observation (n to m). '''
    sample_list = find_samples(data_header)  # returns data from "input haplotype file"

    # update the names in "sample_list" if other samples are requested by the user:
    if use_sample == "" or use_sample == 'input':
        sample_list = sample_list

    # use all the samples from hapRefPanel and input samples
    elif use_sample == 'all':
        sample_list = sample_list + hap_panel_samples

    elif use_sample == 'refHap':
        sample_list = hap_panel_samples + [
            (soi + ":PI", soi + ":PG_al")
        ]  # add the self sample name to account ..
        # .. for missing observations instead of using pseudo count

    # if specific select samples are of interest, split the sample names and then prepare ..
    # .. the list of tuples of sample "PI" and "PG_al"
    else:
        sample_list = use_sample.split(',')
        sample_list = [((x + ':PI'), (x + ':PG_al')) for x in sample_list] + \
                      [(soi + ":PI", soi + ":PG_al")]

    ''' Step 02: pipe the data into "pandas", then:
        A) group the data by "contig" which helps in multiprocessing/threading.
           A - optional: if "bed regions" are given add the bed_regions boundries as "start_end"
        B) within each group, group again by "PI keys" of soi and then sort by
           minimum "POS" value for each "PI key"
        C) then pipe the data within each "PI key" for phase-extension computation.'''

    ''' Step 02 - A : read good part of the data into "pandas" as dataframe.'''
    # good_data = pd.read_table(StringIO(good_data), delimiter='\t')
    good_data['CHROM'] = good_data['CHROM'].astype(str)  # setting CHROM as string type data
    # this is necessary
    # to maintain proper groupby downstream

    # ** only if "good_data" is desired as text output
    #pd.DataFrame.to_csv(good_data, 'good_data_test.txt', sep='\t', header=True, index=False)

    ''' Step 02 - A (add on - i) ** merge reference haplotype if provided '''
    if refhap:
        # update the "good_data" (i.e, haplotype data)
        print('Merging input haplotype data with data from the hap-reference panel')

        good_data = dd.merge(good_data, hap_panel,
                             on=['CHROM', 'POS'], how='left').fillna('.')
        good_data.sort_values(by=['CHROM', 'POS'], inplace=True)

        # if haplotype and reference panel merged lines are desired
        #pd.DataFrame.to_csv(good_data, 'hap_and_refPanel_merged.txt', sep='\t',
        #header=True, index=False)
        del hap_panel
    else:
        print('# Haplotype reference panel is not provided....\n'
              '  - Only using the samples in the input ("%s") data.' % (input_file))

    ''' Step 02 - A (add on - ii) ** merge bed-regions if provided to limit phase extension
        and group the data by "contig". '''
    if not bed_file:
        # group data only at "contig" level, keep the sort as it is
        print('# No bed file is given ... ')
        print('  - So, grouping the haplotype file only by chromosome (contig)')

        good_data_by_group = good_data.groupby('CHROM')

    elif bed_file:
        print('# Merging the bed boundries from "%s" with the input haplotype file ... "%s" '
              % (bed_file, input_file))

        # merge/intersect the "bed regions" and "haplotype file"
        # then groupy "contig" and "bed regions" by passing it to function "merge_hap_with_bed()"
        good_data_by_group = merge_hap_with_bed(my_bed, good_data)
        # ** for future: we can also run multiprocessing while merging "hap file" with "bed regions"
        del my_bed

    ch_vals = [x for x in good_data['CHROM'].unique()]

    ''' Step 02 - A (**add on - iii):
        - Write the initial haplotype data.
        - Compute the statistics of the initial phased file for SOI if required '''
    print('# Writing initial haplotype for sample "%s" in the file "%s" '
          % (soi, 'initial_haplotype_' + soi + '.txt'))

    # select the colums of interest
    # initial_haplotype = good_data[['CHROM', 'POS', 'REF', 'all-alleles', soi + ':PI', soi + ':PG_al']]. \
    #     sort_values(by=['CHROM', 'POS'])
    initial_haplotype_dd = good_data[[
        'CHROM', 'POS', 'REF', 'all-alleles', soi + ':PI', soi + ':PG_al'
    ]]
    initial_haplotype = initial_haplotype_dd.compute().sort_values(
        by=['CHROM', 'POS'])

    # write this initial haplotype to a file
    initial_haplotype.to_csv(outputdir + '/' + 'initial_haplotype_' + soi + '.txt',
                             sep='\t', header=True, index=False)

    if hapstats == 'yes':
        print('  - Computing the descriptive statistics of the haplotype data before phase extension')
        # pipe the data to a function to compute haplotype statistics
        compute_haplotype_stats(initial_haplotype, soi, 'initial', outputdir)
    else:
        print('  - Proceeding to phase-extension without preparing descriptive statistics of initial haplotype state.')

    ''' Step 02 - B:
        - Split the data (grouped by chromosome (contig) values.
        - Store data in disk or memory.
        - Multiprocess each chunks separately '''
    print()
    print('# Starting multiprocessing using "%i" processes ' % (nt))

    # ** new method: create a folder to store the data to disk (rather than memory)
    # ** (see old method for comparison)
    # if os.path.exists('chunked_Data_' + soi):
    #     shutil.rmtree('chunked_Data_' + soi, ignore_errors=False, onerror=None)
    # os.makedirs('chunked_Data_' + soi + '/', exist_ok=True)

    ''' Step 02 - B (i)'''
    ################### old method - ** if possible reuse this method in future.
    # take the large dataframe that is grouped by contig and ..
    # .. keep chunks of dataframes as as OrderedDict(list of (keys, Dataframe object))
    #df_list = collections.OrderedDict()

    ########################################
    # # new method - storing data to disk
    # for chr_, data_by_chr in good_data_by_group:
    #     chunked_path = 'chunked_Data_' + soi + '/' + soi + ':' + str(chr_)
    #     data_by_chr.to_csv(chunked_path, sep='\t', index=False, header=True)

    # clear memory - does it do it's job ** ??
    # initial_haplotype = None; good_data = None; input_file = None
    # # good_data_by_group = None; samples = None
    # data_by_chr = None
    # del initial_haplotype, good_data, input_file, good_data_by_group, samples, data_by_chr

    ''' Now, pipe the procedure to next function for multiprocessing (i.e Step 02 - C) '''
    multiproc(sample_list, pool, hapstats, soi, outputdir, addmissingsites,
              bed_file, snp_threshold, num_of_hets, lods_cut_off, maxed_as,
              writelod, good_data_by_group, ch_vals)

    # remove the chunked data folder ** (this can be retained if need be)
    # shutil.rmtree('chunked_Data_' + soi, ignore_errors=False, onerror=None)

    print('End :)')
import gc

from sklearn import model_selection
from dask import dataframe as dd

from avito.common import csv_loader, column_selector, pocket_xgb, pocket_timer, pocket_logger, holdout_validator
from avito.fe import additional_fe

logger = pocket_logger.get_my_logger()
timer = pocket_timer.GoldenTimer(logger)
dtypes = csv_loader.get_featured_dtypes()
predict_col = column_selector.get_predict_col()

lgb_col = column_selector.get_stem_col()
lgb_col = [c.replace(" ", "_") for c in lgb_col]
# tail = lgb_col[-5:]
# print(tail)

train = dd.read_csv(PRED_TRAIN).compute()
gazou = dd.read_csv(GAZOU_TRAIN).compute()
#gazou["image"] = gazou["image"].apply(lambda w: w.replace(".jpg", ""))
train = pd.merge(train, gazou, on="image", how="left")
desc_train = scipy.sparse.load_npz(DENSE_TF_TRAIN)
title_train = scipy.sparse.load_npz(TITLE_CNT_TRAIN)
timer.time("load csv in ")

cat_col = [
    "region", "city", "parent_category_name", "category_name", "param_1",
    "param_2", "param_3", "param_all", "image_top_1", "user_type"
]
train = pd.get_dummies(
    data=train, prefix=cat_col, dummy_na=True,
def test_header_None():
    with filetexts({".tmp.1.csv": "1,2", ".tmp.2.csv": "", ".tmp.3.csv": "3,4"}):
        df = dd.read_csv(".tmp.*.csv", header=None)
        expected = pd.DataFrame({0: [1, 3], 1: [2, 4]})
        assert_eq(df.compute().reset_index(drop=True), expected)
def test_multiple_read_csv_has_deterministic_name():
    with filetexts({"_foo.1.csv": csv_text, "_foo.2.csv": csv_text}):
        a = dd.read_csv("_foo.*.csv")
        b = dd.read_csv("_foo.*.csv")

        assert sorted(a.dask.keys(), key=str) == sorted(b.dask.keys(), key=str)
def test_read_csv_has_different_names_based_on_blocksize():
    with filetext(csv_text) as fn:
        a = dd.read_csv(fn, blocksize="10kB")
        b = dd.read_csv(fn, blocksize="20kB")
        assert a._name != b._name
def test_read_csv_sensitive_to_enforce():
    with filetexts(csv_files, mode="b"):
        a = dd.read_csv("2014-01-*.csv", enforce=True)
        b = dd.read_csv("2014-01-*.csv", enforce=False)
        assert a._name != b._name
def get_branchDetail():
    data = dd.read_csv('F:\\Data\\BranchFact\\20*.csv', encoding='gbk',
                       dtype={'ORGCODE': object, 'PLUID': object})
    return data
def test_windows_line_terminator():
    text = "a,b\r\n1,2\r\n2,3\r\n3,4\r\n4,5\r\n5,6\r\n6,7"
    with filetext(text) as fn:
        df = dd.read_csv(fn, blocksize=5, lineterminator="\r\n")
        assert df.b.sum().compute() == 2 + 3 + 4 + 5 + 6 + 7
        assert df.a.sum().compute() == 1 + 2 + 3 + 4 + 5 + 6
@author: divya
"""
#import dask, sys
from dask.diagnostics import ProgressBar
import dask.dataframe as dd
import pandas as pd, numpy as np
import dask
from pandas.tseries.tools import to_datetime

# Progress Bar
pbar = ProgressBar()
pbar.register()

# read GNDITEM file
df_gnditem = dd.read_csv('GNDITEM_utf_8.csv', encoding='utf-8')
print(df_gnditem.head())

# Select only price > 0 and quantity = 1, and drop NA values
# (element-wise '&' is required here; a plain 'and' raises an error on dataframes)
df2 = df_gnditem[(df_gnditem.PRICE > 0.0) & (df_gnditem.QUANTITY == 1.0)]
df2 = df2.dropna()
print(df2.head())
print(dd.compute(df2.count()))

# read Loyalty file
df_loyalty = pd.read_csv('loyalty_utf_8.csv', header=0, encoding='utf-8')
len(df_loyalty)

# Drop NA values
df_loyalty = df_loyalty.dropna()
len(df_loyalty)