def format_data(data):
    data['Date1'] = dd.to_datetime(data['Date-Time'])
    data['hour'] = dd.to_datetime(data['Date-Time'], format='%H:%M').dt.hour
    data['minute'] = dd.to_datetime(data['Date-Time'], format='%H:%M').dt.minute
    data['second'] = dd.to_datetime(data['Date-Time'], format='%H:%M').dt.second
    data['date'] = dd.to_datetime(data['Date-Time'])
    data['date1'] = data['date'].dt.date
    return data
def load_lineitem(data_folder):
    data_path = data_folder + "/lineitem.pq"
    df = pd.read_parquet(data_path)
    df.L_SHIPDATE = pd.to_datetime(df.L_SHIPDATE, format='%Y-%m-%d')
    df.L_RECEIPTDATE = pd.to_datetime(df.L_RECEIPTDATE, format='%Y-%m-%d')
    df.L_COMMITDATE = pd.to_datetime(df.L_COMMITDATE, format='%Y-%m-%d')
    return df
def format_date_columns(data: dd.DataFrame = None, date_columns: List[str] = None) -> dd.DataFrame:
    """
    Parse all columns specified in date_columns to datetime (ISO 8601 dates, %Y-%m-%d)

    :param data: dask dataframe
    :param date_columns: names of the date columns
    :return: dask dataframe
    """
    for date_column in date_columns:
        data[date_column] = dd.to_datetime(data[date_column])
    return data
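# A short usage sketch for format_date_columns, assuming a CSV with ISO-style
# date strings; the file name and column names below are hypothetical.
import dask.dataframe as dd

ddf = dd.read_csv("events.csv")  # e.g. columns: signup_date, last_seen_date, value
ddf = format_date_columns(ddf, date_columns=["signup_date", "last_seen_date"])
print(ddf.dtypes)  # both date columns are now datetime64[ns]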
def members_to_timedate(ddf):
    ddf["max_transaction_date"] = dd.to_datetime(
        ddf["max_transaction_date"], format='%Y%m%d', errors='coerce')
    ddf["registration_init_time"] = dd.to_datetime(
        ddf["registration_init_time"], format='%Y%m%d', errors='coerce')
    ddf["registration_init_year"] = ddf["registration_init_time"].dt.year
    ddf["registration_init_month"] = ddf["registration_init_time"].dt.month
    ddf["registration_init_day"] = ddf["registration_init_time"].dt.day
    ddf["max_transaction_year"] = ddf["max_transaction_date"].dt.year
    ddf["max_transaction_month"] = ddf["max_transaction_date"].dt.month
    ddf["max_transaction_day"] = ddf["max_transaction_date"].dt.day
    ddf["account_age"] = ddf["max_transaction_date"].dt.year - ddf["registration_init_time"].dt.year
    return ddf
def get_cutoffs(data, column, current_format, desired_format, period='end'):
    if period == 'end':
        # snap each date to the end of its month, then re-parse in the desired format
        return dd.to_datetime(
            (dd.to_datetime(data[column], format=current_format) + MonthEnd(0)).dt.strftime(desired_format),
            format=desired_format)
    elif period == 'start':
        return dd.to_datetime(data[column], format=current_format)
    else:
        print('Periodicity not supported')
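# A minimal sketch of how the period='end' branch behaves, assuming MonthEnd is
# pandas.tseries.offsets.MonthEnd; the column name and formats below are
# hypothetical. Adding MonthEnd(0) rolls a date forward to the last day of its
# month, and a date already on a month end stays put.
import pandas as pd
import dask.dataframe as dd
from pandas.tseries.offsets import MonthEnd

pdf = pd.DataFrame({'order_date': ['2021-02-10', '2021-03-31']})
ddf = dd.from_pandas(pdf, npartitions=1)

cutoffs = get_cutoffs(ddf, 'order_date', current_format='%Y-%m-%d',
                      desired_format='%Y%m%d', period='end')
print(cutoffs.compute())  # 2021-02-28 and 2021-03-31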
def format_data(data):
    import pandas as pd
    import numpy as np
    import time
    from datetime import datetime
    import dask
    import dask.dataframe as dd

    data['Date1'] = dd.to_datetime(data['Date-Time'])
    data['hour'] = dd.to_datetime(data['Date-Time'], format='%H:%M').dt.hour
    data['minute'] = dd.to_datetime(data['Date-Time'], format='%H:%M').dt.minute
    data['second'] = dd.to_datetime(data['Date-Time'], format='%H:%M').dt.second
    data['date'] = dd.to_datetime(data['Date-Time'])
    data['date1'] = data['date'].dt.date
    return data
def spec_filter(df: dd.DataFrame):
    # properly assigning date format
    df["date"] = dd.to_datetime(df["date"], format="%Y-%m-%d")
    print("Passed dates!")
    # general filtering
    df["clean"] = df["content"].str.findall(pattern).str.join(" ")
    print("Passed general!")
    # pipelining filtering out stopwords
    # df["clean"] = [str(tok.lemma_).lower() for doc in nlp.pipe(df["clean"], batch_size=30) for tok in doc
    #                if tok.is_alpha and tok.text.lower() not in stopwords]
    df["clean"] = df["clean"].map(nltk.word_tokenize)
    print(df["clean"].compute())
    # keep alphanumeric, non-stopword tokens, then lowercase, lemmatize and stem each one
    df["clean"] = df["clean"].map(
        lambda tokens: [stemmer.stem(lemmatizer.lemmatize(tok.lower()))
                        for tok in tokens
                        if tok.isalnum() and tok.lower() not in stopwords])
    print(df["clean"].compute())
    print("Passed nlp!")
    df["summary"] = summarizer(df["clean"], min_length=10, max_length=100, do_sample=False)
    print("Passed summary!")
    # df["title_tok"]
    # df["title_tag"]
    # df["content_tok"]
    # df["content_tag"]
    return df
def preprocess(jsonl):
    """Ingest data and preprocess comment text"""
    bag = (
        db.read_text(jsonl, blocksize="10MiB")
        .map(json.loads)
        .map(
            lambda r: {
                "created_utc": r["created_utc"],
                "subreddit": r["subreddit"],
                "text": regex_replace(r["body"]),
            }
        )
    )
    df = bag.to_dataframe()
    df = df[df["text"].str.len() > 30]
    df["created_utc"] = dd.to_datetime(df["created_utc"], unit="s")

    ## dask and spacy multiprocessing don't play nicely
    ## nlp.pipe might not be the fastest way to preprocess
    df = df.compute()
    df["tokens"] = tokenize(df["text"].astype("unicode"))
    df = df[df["tokens"] != ""]
    df = df.drop("text", axis=1)
    return df
def analyze(self, granularity):
    ddf = dd.read_parquet(
        os.getenv('local_location') + 'rates/' + self.instrument[0] + '_' + granularity + '/' + 'part.*.parquet')
    ddf = ddf.astype({
        'complete': 'bool',
        'volume': 'int64',
        'o': 'float64',
        'h': 'float64',
        'l': 'float64',
        'c': 'float64'
    })
    ddf = ddf[ddf['complete'] == True]
    ddf['time'] = dd.to_datetime(ddf['time'])
    ddf = ddf.set_index('time')
    # per-candle range (low to high) and move (open to close)
    ddf['vol'] = ddf['h'] - ddf['l']
    ddf['mov'] = ddf['c'] - ddf['o']
    pdf = ddf.compute()
    ax = sns.jointplot(x=pdf['mov'], y=pdf['vol'], alpha=0.2)
    ax.set_axis_labels('Open to close', 'Low to high', fontsize=14)
    if granularity == 'D':
        plt.title('Daily volatility {}'.format(self.instrument[0]), fontsize=14)
        name = 'daily_volatility_{}.png'.format(self.instrument[0])
    else:
        plt.title('Weekly volatility {}'.format(self.instrument[0]), fontsize=14)
        name = 'weekly_volatility_{}.png'.format(self.instrument[0])
    plt.tight_layout()
    # create the output directory if needed instead of relying on a bare except
    os.makedirs(os.getenv('local_location') + 'images/', exist_ok=True)
    ax.savefig(os.getenv('local_location') + 'images/' + name)
def get_am_pm_speed_los(df_cmp, conflation, cmp_segs, conf_len, ss_threshold, cur_year):
    # Create date and time fields for subsequent filtering
    df_cmp['Date_Time'] = df_cmp['Date Time'].str[:16]
    df_cmp['Date_Time'] = df_cmp['Date_Time'].str.replace('T', " ")
    df_cmp['Date'] = df_cmp['Date_Time'].str[:10]
    df_cmp['Day'] = dd.to_datetime(df_cmp['Date_Time'])
    df_cmp['DOW'] = df_cmp.Day.dt.dayofweek  # Tue-1, Wed-2, Thu-3
    df_cmp['Hour'] = df_cmp.Day.dt.hour
    df_cmp['Minute'] = df_cmp.Day.dt.minute

    # Get AM (7-9am) speeds on Tue, Wed, and Thu
    df_am = df_cmp[((df_cmp['DOW'] >= 1) & (df_cmp['DOW'] <= 3)) &
                   ((df_cmp['Hour'] == 7) | (df_cmp['Hour'] == 8))]

    # Get PM (4:30-6:30pm) speeds on Tue, Wed, and Thu
    df_pm = df_cmp[((df_cmp['DOW'] >= 1) & (df_cmp['DOW'] <= 3)) &
                   (((df_cmp['Hour'] == 16) & (df_cmp['Minute'] >= 30)) |
                    (df_cmp['Hour'] == 17) |
                    ((df_cmp['Hour'] == 18) & (df_cmp['Minute'] < 30)))]

    # Use merge to attach INRIX speeds to CMP segments
    df_cmp_am = df_am.merge(conflation, left_on='Segment ID', right_on='INRIX_SegID', how='outer')
    df_cmp_am['Speed(miles/hour)'] = df_cmp_am['Speed(miles/hour)'].astype(float)
    df_cmp_pm = df_pm.merge(conflation, left_on='Segment ID', right_on='INRIX_SegID', how='outer')
    df_cmp_pm['Speed(miles/hour)'] = df_cmp_pm['Speed(miles/hour)'].astype(float)

    cmp_am_agg = cmp_seg_level_speed_and_los(df_cmp_am, cmp_segs, conf_len, ss_threshold, cur_year, cur_period='AM')
    cmp_pm_agg = cmp_seg_level_speed_and_los(df_cmp_pm, cmp_segs, conf_len, ss_threshold, cur_year, cur_period='PM')
    cmp_segs_los = cmp_am_agg.append(cmp_pm_agg, ignore_index=True)

    return cmp_segs_los
def load_data():
    header = [
        'timestamp', 'line_id', 'direction', 'jrny_patt_id', 'time_frame',
        'journey_id', 'operator', 'congestion', 'lon', 'lat', 'delay',
        'block_id', 'vehicle_id', 'stop_id', 'at_stop'
    ]
    types = {
        'timestamp': np.int64,
        'journey_id': np.int32,
        'congestion': np.int8,
        'lon': np.float64,
        'lat': np.float64,
        'delay': np.int8,
        'vehicle_id': np.int32,
        'at_stop': np.int8
    }
    file_name = 'data/month.csv'
    df = dd.read_csv(file_name, header=None, names=header, dtype=types,
                     parse_dates=['time_frame'], infer_datetime_format=True)
    null_replacements = {'line_id': 0, 'stop_id': 0}
    df = df.fillna(value=null_replacements)
    df['line_id'] = df['line_id'].astype(np.int32)
    df['stop_id'] = df['stop_id'].astype(np.int32)
    df['timestamp'] = dd.to_datetime(df['timestamp'], unit='us')
    return df
def filter(self, df):
    # filter by date: keep only the most recent `sample_days` days
    df['Timestamp'] = dd.to_datetime(df.Timestamp, unit='ns')
    max_timestamp = df.Timestamp.max().compute()
    init_timestamp = max_timestamp - timedelta(days=self.sample_days)
    df = df[df.Timestamp >= init_timestamp]
    print(init_timestamp, max_timestamp)

    # Filter by minimum interactions per item
    df_item = df.groupby("ItemID").count()
    df_item = df_item[df_item.SessionID >= self.minimum_interactions].reset_index()
    #print("Filter minimum interactions", df_item.count())

    # Filter by session size
    df_session = df.groupby("SessionID").count()
    df_session = df_session[df_session.ItemID >= self.min_session_size].reset_index()
    #print("Filter session size", df_session.count())

    df = df \
        .merge(df_item[["ItemID"]], on="ItemID") \
        .merge(df_session[["SessionID"]], on="SessionID")

    return df
def order_flow(self, dsk):
    df = dsk.compute()
    for i in df["instrument"].unique():
        temp_df = df[df["instrument"] == i]
        temp_df = temp_df.copy()
        temp_df["transaction"] = np.where(temp_df.units > 0, 1, 0)
        task = build([GetHistoricRates(instrument=i, granularity="H1")],
                     local_scheduler=True)
        ddf_rate = dd.read_parquet(
            os.getenv("local_location") + "rates/" + i + "_" + "H1" + "/" + "part.*.parquet")
        ddf_rate["time"] = dd.to_datetime(ddf_rate["time"])
        ddf_rate = ddf_rate[["c", "time"]]
        ddf_rate["close"] = ddf_rate["c"].astype("float64")
        ddf_rate = ddf_rate.compute()
        fig, ax = plt.subplots()
        scatter = ax.scatter(temp_df["time"], temp_df["price"], c=temp_df["transaction"])
        ax.plot(ddf_rate["time"], ddf_rate["close"], alpha=0.4)
        plt.xlim(
            temp_df["time"].max() - datetime.timedelta(days=30),
            temp_df["time"].max(),
        )
        plt.xticks(rotation=45)
        plt.ylabel("USD")
        plt.title("Transactions for {}".format(i))
        plt.legend(handles=scatter.legend_elements()[0], labels=['Sell', 'Buy'])
        fig.savefig(
            os.getenv("local_location") + "images/" + "order_flow_{}.png".format(i))
def load_orders(data_folder):
    data_path = data_folder + "/orders.pq"
    df = pd.read_parquet(data_path)
    df.O_ORDERDATE = pd.to_datetime(df.O_ORDERDATE, format='%Y-%m-%d')
    return df
def get_day_parts(time):
    """
    Separate time 24h into day parts.
    ref: https://stackoverflow.com/questions/55571311/get-part-of-day-morning-afternoon-evening-night-in-python-dataframe

    Parameters
    ----------
    time: str
        time.

    Returns
    -------
    pandas series
        day parts.
    """
    datetime = dd.to_datetime(time)
    datetime2 = (datetime.dt.hour % 24 + 4) // 4
    datetime2 = datetime2.replace({1: 'late_night',
                                   2: 'early_morning',
                                   3: 'morning',
                                   4: 'afternoon',
                                   5: 'evening',
                                   6: 'night'})
    return datetime2
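# A small usage sketch for get_day_parts, assuming it is fed a dask series of
# timestamp strings; the column name below is hypothetical. The bucketing is
# (hour % 24 + 4) // 4, so hour 13 -> (13 + 4) // 4 = 4 -> 'afternoon',
# and hour 2 -> 6 // 4 = 1 -> 'late_night'.
import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({'pickup_time': ['2021-05-01 02:15:00', '2021-05-01 13:40:00']})
ddf = dd.from_pandas(pdf, npartitions=1)
print(get_day_parts(ddf['pickup_time']).compute())  # late_night, afternoon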
def get_seasons(time):
    """
    Separate the 12 months into seasons.
    ref: https://stackoverflow.com/questions/55571311/get-part-of-day-morning-afternoon-evening-night-in-python-dataframe

    Parameters
    ----------
    time: str
        time.

    Returns
    -------
    pandas series
        seasons.
    """
    datetime = dd.to_datetime(time)
    datetime2 = (datetime.dt.month % 24 + 2) // 3  # Jan-Mar -> 1, Apr-Jun -> 2, Jul-Sep -> 3, Oct-Dec -> 4
    datetime2 = datetime2.replace({1: 'spring',
                                   2: 'summer',
                                   3: 'fall',
                                   4: 'winter'})
    return datetime2
def _convert_dayid_to_datetime(ddf: dd.DataFrame) -> dd.DataFrame:
    ddf["datetime"] = dd.to_datetime(
        ddf["day"],
        origin="01/01/2009",
        unit="D",
    ) + dd.to_timedelta(ddf["halfhourly_id"] / 2, unit="h")
    return ddf.drop(columns=["day", "halfhourly_id"])
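# A minimal sketch of the day-id conversion above, assuming `day` counts days
# from the 01/01/2009 origin and `halfhourly_id` counts half-hour slots
# (whether the slots are 0- or 1-based is not stated here). With day=10 and
# halfhourly_id=3 the result is 2009-01-11 01:30:00.
import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({"day": [10], "halfhourly_id": [3]})
ddf = dd.from_pandas(pdf, npartitions=1)
print(_convert_dayid_to_datetime(ddf).compute())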
def calculate(self, ddf, instrument):
    # The calculations for the report
    ddf = ddf[ddf["complete"] == True]
    ddf = ddf[["c", "time"]]
    ddf = ddf.astype({"c": "float64"})
    ddf = ddf.rename(columns={"c": instrument})
    ddf["time"] = dd.to_datetime(ddf["time"])
    ddf = ddf.set_index("time")
    return ddf
def basic(df: dd.DataFrame):
    #df['day'] = df.click_time.str[8:10].astype(int)
    #df['hour'] = df.click_time.str[11:13].astype(int)
    df["click_time"] = dd.to_datetime(df["click_time"])
    df["hour"] = df["click_time"].dt.hour
    #df["telling_ip"] = np.where(df["ip"] <= 126420, 1, 0)
    #df["idoa_is_last_try"] = df.groupby(["ip", "app", "device", "os"])["channel"].diff(periods=-1)
    #df["idoa_is_last_try"] = np.where(df["idoa_is_last_try"].isnull(), 1, 0)
    timer.time("done basic")
def apply_index_and_date_dd(index_col: int, parse_dates: bool, df: dd.DataFrame) -> dd.DataFrame:
    if index_col is not None:
        df = df.set_index(df.columns[index_col])
        df.index = df.index.rename(None)

    if parse_dates:
        # noinspection PyTypeChecker
        df.index = dd.to_datetime(df.index)

    return df
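# A short usage sketch for apply_index_and_date_dd, assuming a CSV whose first
# column holds date strings; the file name and columns are hypothetical.
import dask.dataframe as dd

ddf = dd.read_csv("prices.csv")  # e.g. columns: date, close
ddf = apply_index_and_date_dd(index_col=0, parse_dates=True, df=ddf)
print(ddf.head())  # the index is now parsed datetimes with no name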
def change_to_datetime_index_test(testdata, ts_column, str_format=''):
    testdata = copy.deepcopy(testdata)
    if str_format:
        print('Date_time string format given as %s' % str_format)
    else:
        print('    Alert: No strftime format given for %s. Provide a strftime format during "setup" for better results.' % ts_column)

    ##### This is where we change the time index of test data #############
    try:
        if isinstance(testdata, (pd.Series, pd.DataFrame)):
            if ts_column in testdata.columns:
                ###### If the str_format is there, set the column as time series index ##
                ts_index = testdata.pop(ts_column)
                if str_format:
                    ts_index = pd.to_datetime(ts_index, format=str_format)
                else:
                    ts_index = pd.to_datetime(ts_index)
                testdata.index = ts_index
            elif ts_column in testdata.index.name:
                ts_index = testdata.index
                ### now set the index to datetime format
                if str_format:
                    ts_index = pd.to_datetime(ts_index, format=str_format)
                else:
                    ts_index = pd.to_datetime(ts_index)
                testdata.index = ts_index
        elif type(testdata) == dask.dataframe.core.DataFrame:
            #### the below tests work for a dask dataframe as well ##
            if ts_column in testdata.columns:
                ####### Now set the index to datetime index and drop the ts_column #########
                testdata.index = dd.to_datetime(testdata[ts_column].compute())
                testdata = testdata.drop(ts_column, axis=1)
            elif ts_column in testdata.index.name:
                #### the above test works for a dask dataframe as well ##
                ts_index = testdata.index
                if type(testdata.index.compute().values[0]) in [np.datetime64]:
                    ## it is already in date-time index format - do nothing
                    pass
                else:
                    if str_format:
                        testdata.index = pd.to_datetime(ts_index.compute(), format=str_format)
                    else:
                        testdata.index = pd.to_datetime(ts_index.compute())
            else:
                print("Error: Cannot detect %s either in columns or index. Please check input and try again." % ts_column)
        else:
            print('Unknown type of testdata. Please check input and try again.')
    except:
        print('    Converting testdata to a datetime index failed. Please check input and try again.')

    #### this is where we return the testdata and format
    return testdata, str_format
def read_tasks_raw(log_path):
    """
    Read tasks from one or multiple task log files.
    """
    df_tasks = (
        db.read_text(log_path + "/task*.jsonl")
        .map(str.split, "\n")
        .flatten()
        .filter(lambda item: item != "")
        .map(json.loads)
        .map(_flatten_dict, "tasks", ["datetime", "client_id"])
        .flatten()
        .map(
            _flatten_dict,
            "startstops",
            [
                "worker",
                "status",
                "nbytes",
                "thread",
                "type",
                "typename",
                "key",
                "datetime",
                "client_id",
            ],
        )
        .flatten()
        .to_dataframe({
            "action": str,
            "start": "float64",
            "stop": "float64",
            "worker": str,
            "status": str,
            "nbytes": "int64",
            "thread": "int64",
            "type": str,
            "typename": str,
            "key": object,
            "datetime": str,
            "client_id": "int64",
        })
    )

    for column_name in ["start", "stop"]:
        df_tasks[column_name] = dd.to_datetime(df_tasks[column_name], unit="s")
    df_tasks["datetime"] = dd.to_datetime(df_tasks["datetime"], utc=True)
    for column_name in ["action", "status"]:
        df_tasks[column_name] = df_tasks[column_name].astype("category")

    return df_tasks
def load_parse_file(file_path, date_column="date"):
    """Load a file into a Dask dataframe and parse the date column.

    Arguments:
        file_path: string path to the input file.
        date_column: name of the column holding %Y-%m-%d date strings.

    Returns:
        Dataframe: dask.dataframe read from the file
    """
    data = dd.read_csv(file_path)
    data[date_column] = dd.to_datetime(data[date_column], format='%Y-%m-%d')
    return data
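# A brief usage sketch for load_parse_file; the file name and column name are
# hypothetical, and the CSV is assumed to hold %Y-%m-%d date strings.
ddf = load_parse_file("sales.csv", date_column="order_date")
print(ddf.dtypes)                         # order_date is now datetime64[ns]
print(ddf["order_date"].dt.year.head())   # datetime accessors become available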
def basic(df: pd.DataFrame):
    #df['day'] = df.click_time.str[8:10].astype(int)
    df['hour'] = df.click_time.str[11:13].astype(int)
    df["click_time"] = dd.to_datetime(df["click_time"])
    #df["ip_count"] = df.groupby("ip")["channel"].transform('count')
    #df["app_count"] = df.groupby("app")["channel"].transform('count')
    #df["os_count"] = df.groupby("os")["channel"].transform('count')
    df["idoa_is_last_try"] = df.groupby(["ip", "app", "device", "os"])["channel"].shift(-1)
    df["idoa_is_last_try"] = np.where(df["idoa_is_last_try"].isnull(), 1, 0)
    df["telling_ip"] = np.where(df["ip"] <= 126420, 1, 0)
def job():
    t = datetime.datetime.now()
    coll = pymongo.MongoClient(mongo_ip).qa.ASKBIDCAPPED
    data = from_sequence([
        item for item in coll.find(
            # {'code':'000001'},
            {},
            {
                '_id': 0,
                'close': 1,
                'amount': 1,
                'vol': 1,
                'code': 1,
                'market': 1,
                'datetime': 1
            },
            batch_size=10000000)
    ]).to_dataframe(
        meta={
            'close': 'f64',
            'amount': 'f64',
            'vol': 'f64',
            'code': 'str',
            'market': 'str',
            'datetime': 'str'
        })
    res = data.assign(datetime=dd.to_datetime(data.datetime)).set_index(
        'datetime').groupby('code').apply(lambda x: x.resample('1min').apply(
            {
                'close': 'ohlc',
                'vol': 'last',
                # 'code': 'last',
                'amount': 'last'
            },
            meta={
                'open': 'f64',
                'high': 'f64',
                'low': 'f64',
                'close': 'f64',
                'vol': 'f64',
                'amount': 'f64'
            }))
    res.columns = res.columns.droplevel(0)
    last_minute = '{} 15:00:00'.format(datetime.date.today())
    #data_min = res.loc[(slice(None), last_minute), :]
    re = compute(res.reindex(1))
    #re = (data_min)
    print(re)
    pymongo.MongoClient(mongo_ip).qa.stock_1min.insert_many(
        QA.QA_util_to_json_from_pandas(re.reset_index()))
    print(datetime.datetime.now())
def extract(self, instrument, granularity):
    ddf = dd.read_parquet(
        os.getenv('local_location') + 'rates/' + instrument + '_' + granularity + '/' + 'part.*.parquet')
    ddf = ddf[ddf['complete'] == True]
    ddf = ddf[['c', 'time']]
    ddf = ddf.astype({'c': 'float64'})
    ddf = ddf.rename(columns={'c': instrument})
    ddf['time'] = dd.to_datetime(ddf['time'])
    ddf = ddf.set_index('time')
    return ddf
def test_as_array():
    df = DaskDataFrame([], "a:str,b:int")
    assert [] == df.as_array()
    assert [] == df.as_array(type_safe=True)
    assert [] == list(df.as_array_iterable())
    assert [] == list(df.as_array_iterable(type_safe=True))

    df = DaskDataFrame([["a", 1]], "a:str,b:int")
    assert [["a", 1]] == df.as_array()
    assert [["a", 1]] == df.as_array(["a", "b"])
    assert [[1, "a"]] == df.as_array(["b", "a"])

    # prevent pandas auto type casting
    df = DaskDataFrame([[1.0, 1.1]], "a:double,b:int")
    assert [[1.0, 1]] == df.as_array()
    assert isinstance(df.as_array()[0][0], float)
    assert isinstance(df.as_array()[0][1], int)
    assert [[1.0, 1]] == df.as_array(["a", "b"])
    assert [[1, 1.0]] == df.as_array(["b", "a"])

    df = DaskDataFrame([[np.float64(1.0), 1.1]], "a:double,b:int")
    assert [[1.0, 1]] == df.as_array()
    assert isinstance(df.as_array()[0][0], float)
    assert isinstance(df.as_array()[0][1], int)

    df = DaskDataFrame([[pandas.Timestamp("2020-01-01"), 1.1]], "a:datetime,b:int")
    df.native["a"] = pd.to_datetime(df.native["a"])
    assert [[datetime(2020, 1, 1), 1]] == df.as_array()
    assert isinstance(df.as_array()[0][0], datetime)
    assert isinstance(df.as_array()[0][1], int)

    df = DaskDataFrame([[pandas.NaT, 1.1]], "a:datetime,b:int")
    df.native["a"] = pd.to_datetime(df.native["a"])
    assert isinstance(df.as_array()[0][0], datetime)
    assert isinstance(df.as_array()[0][1], int)

    df = DaskDataFrame([[1.0, 1.1]], "a:double,b:int")
    assert [[1.0, 1]] == df.as_array(type_safe=True)
    assert isinstance(df.as_array()[0][0], float)
    assert isinstance(df.as_array()[0][1], int)
def process_data(data):
    # process users
    users = data["users"]
    users["zipcode1"] = users["zipcode"].str.get(0)
    users["zipcode2"] = users["zipcode"].str.slice(0, 2)
    users["zipcode3"] = users["zipcode"].str.slice(0, 3)
    data["users"] = users.persist()
    logger.debug("users data processed.")

    # process items
    items = data["items"]
    items = items[items["title"] != "unknown"]  # remove "unknown" movie
    items["release_date"] = dd.to_datetime(items["release"])
    items["release_year"] = items["release_date"].dt.year
    data["items"] = items.persist()
    logger.debug("items data processed.")

    # process context
    for el in ["all", "train", "test"]:
        context = data[el]
        context["datetime"] = dd.to_datetime(context["timestamp"], unit="s")
        context["year"] = context["datetime"].dt.year
        context["month"] = context["datetime"].dt.month
        context["day"] = context["datetime"].dt.day
        context["week"] = context["datetime"].dt.week
        context["dayofweek"] = context["datetime"].dt.dayofweek + 1
        data[el] = context
        logger.debug("context data processed.")

    # merge data
    dfs = {
        item: (data[item]
               .merge(data["users"], "inner", "user_id")
               .merge(data["items"], "inner", "item_id")
               .persist())
        for item in ["all", "train", "test"]
    }
    dfs.update({"users": users, "items": items})
    logger.info("data merged.")

    return dfs
def get_average_distance(local_file_path):
    logging.info("> started executing get_average_distance() ")
    global slack_notification_message
    try:
        df = dd.read_csv(local_file_path, header=0, low_memory=False,
                         usecols=use_column_list)  # , nrows = '10' , ,usecols = ''
        df['year'] = dd.to_datetime(df['tpep_dropoff_datetime']).dt.year
        df['month'] = dd.to_datetime(df['tpep_dropoff_datetime']).dt.month

        average_trip_distance = str(
            df.loc[df['trip_distance'] != 0].groupby(['year', 'month'])['trip_distance'].mean().compute().round(2))
        logging.info("Average Trip distance : {}".format(average_trip_distance))

        rolling_average_trip_distance = str(get_rooling_average_distance(df))
        logging.info("45 days Average Trip distance : {}".format(rolling_average_trip_distance))

        slack_notification_message = """ Note > Average Trip distance for a month : \n {} \n {} \n Note > from past 45 days Average Trip distance : \n{}""".format(
            average_trip_distance, '*' * 50, rolling_average_trip_distance)

        return slack_notification_message
    except Exception as e:
        logging.error('ERROR !!: some issue with the file at location {} and error is {}'.format(local_file_path, str(e)))
        raise
def extract_restrictions(period):
    df = dd.from_pandas(pd.read_csv(
        const.BASE_RESTRICTIONS_URL,
        usecols=list(const.RESTRICTIONS_COLUMNS_TYPES),
        dtype=const.RESTRICTIONS_COLUMNS_TYPES,
    ), npartitions=10)
    df['timestamp'] = dd.to_datetime(df['Date'], format='%Y%m%d')
    df = df.drop(['Date'], axis=1)
    df = df[df['timestamp'].between(period[0], period[-1])]
    df['timestamp'] = df['timestamp'].map(lambda x: x.isoformat())
    return df