Example #1
def format_data(data):

    # Parse the Date-Time column once and derive all time fields from the result,
    # instead of re-parsing the same column for every component.
    parsed = dd.to_datetime(data['Date-Time'])
    data['Date1'] = parsed
    data['hour'] = parsed.dt.hour
    data['minute'] = parsed.dt.minute
    data['second'] = parsed.dt.second
    data['date'] = parsed
    data['date1'] = parsed.dt.date
    return data
Example #2
def load_lineitem(data_folder):
    data_path = data_folder + "/lineitem.pq"
    df = pd.read_parquet(
        data_path,
    )
    df.L_SHIPDATE = pd.to_datetime(df.L_SHIPDATE, format='%Y-%m-%d')
    df.L_RECEIPTDATE = pd.to_datetime(df.L_RECEIPTDATE, format='%Y-%m-%d')
    df.L_COMMITDATE = pd.to_datetime(df.L_COMMITDATE, format='%Y-%m-%d')
    return df
Example #3
def format_date_columns(data: dd.DataFrame = None, date_columns: List[str] = None) -> dd.DataFrame:
    """
    Parse all columns listed in date_columns as ISO 8601 dates
    :param data: dask dataframe
    :param date_columns: names of the date columns
    :return: dask dataframe
    """
    for date_column in date_columns:
        # Parse the column using the ISO date format.
        data[date_column] = dd.to_datetime(data[date_column], format='%Y-%m-%d')
    return data
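A minimal usage sketch for the helper above; the sample frame and column names here are invented for illustration:

import pandas as pd
import dask.dataframe as dd

# Hypothetical sample data with ISO-formatted date strings.
pdf = pd.DataFrame({"order_date": ["2021-01-05", "2021-02-10"],
                    "ship_date": ["2021-01-07", "2021-02-12"]})
ddf = dd.from_pandas(pdf, npartitions=1)
ddf = format_date_columns(ddf, date_columns=["order_date", "ship_date"])
print(ddf.dtypes)  # both columns become datetime64[ns]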
Example #4
def members_to_timedate(ddf):
    ddf["max_transaction_date"] = dd.to_datetime( ddf["max_transaction_date"], format='%Y%m%d', errors='coerce')
    ddf["registration_init_time"] = dd.to_datetime( ddf["registration_init_time"], format='%Y%m%d', errors='coerce')
    ddf["registration_init_year"] =  ddf["registration_init_time"].dt.year
    ddf["registration_init_month"] =  ddf["registration_init_time"].dt.month
    ddf["registration_init_day"] =  ddf["registration_init_time"].dt.day
    ddf["max_transaction_year"] =  ddf["max_transaction_date"].dt.year
    ddf["max_transaction_month"] =  ddf["max_transaction_date"].dt.month
    ddf["max_transaction_day"] =  ddf["max_transaction_date"].dt.day
    ddf["account_age"] =  ddf["max_transaction_date"].dt.year -  ddf["registration_init_time"].dt.year
    return ddf
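For reference, errors='coerce' in the calls above turns values that do not match the %Y%m%d format into NaT instead of raising; a minimal sketch with invented sample values:

import pandas as pd
import dask.dataframe as dd

s = dd.from_pandas(pd.Series(["20190115", "not-a-date"]), npartitions=1)
parsed = dd.to_datetime(s, format="%Y%m%d", errors="coerce")
print(parsed.compute())  # the malformed value becomes NaT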
Example #5
 def get_cutoffs(data,
                 column,
                 current_format,
                 desired_format,
                 period='end'):
     if period == 'end':
         return dd.to_datetime(
             (dd.to_datetime(data[column], format=current_format) +
              MonthEnd(0)).dt.strftime(desired_format),
             format=desired_format)
     elif period == 'start':
         return dd.to_datetime(data[column], format=current_format)
     else:
         print('Periodicity not supported')
Example #6
def format_data(data):
    import pandas as pd
    import numpy as np
    import time
    from datetime import datetime
    import dask
    import dask.dataframe as dd
    # Parse the Date-Time column once and derive all time fields from the result,
    # instead of re-parsing the same column for every component.
    parsed = dd.to_datetime(data['Date-Time'])
    data['Date1'] = parsed
    data['hour'] = parsed.dt.hour
    data['minute'] = parsed.dt.minute
    data['second'] = parsed.dt.second
    data['date'] = parsed
    data['date1'] = parsed.dt.date
    return data
Example #7
def spec_filter(df: dd.DataFrame):
    # properly assigning date format
    df["date"] = dd.to_datetime(df["date"], format="%Y-%m-%d")
    print("Passed dates!")

    # general filtering
    df["clean"] = df["content"].str.findall(pattern).str.join(" ")
    print("Passed general!")

    # pipelining filtering out stopwords
    # df["clean"] = [str(tok.lemma_).lower() for doc in nlp.pipe(df["clean"], batch_size=30) for tok in doc
    #                if tok.is_alpha and tok.text.lower() not in stopwords]

    df["clean"] = df["clean"].map(nltk.word_tokenize)
    print(df["clean"].compute())

    df["clean"] = list(
        filter(lambda word: word.isalnum(),
               df["clean"])).map(lambda text: text.lower()).filter(
                   lambda word: word not in stopwords).map(
                       lemmatizer.lemmatize).map(stemmer.stem)
    print(df["clean"].compute())
    print("Passed nlp!")

    df["summary"] = summarizer(df["clean"],
                               min_length=10,
                               max_length=100,
                               do_sample=False)
    print("Passed summary!")
    # df["title_tok"]
    # df["title_tag"]
    # df["content_tok"]
    # df["content_tag"]

    return df
Example #8
def preprocess(jsonl):
    """Ingest data and preprocess comment text"""
    bag = (
        db.read_text(jsonl, blocksize="10MiB")
        .map(json.loads)
        .map(
            lambda r: {
                "created_utc": r["created_utc"],
                "subreddit": r["subreddit"],
                "text": regex_replace(r["body"]),
            }
        )
    )
    df = bag.to_dataframe()

    df = df[df["text"].str.len() > 30]
    df["created_utc"] = dd.to_datetime(df["created_utc"], unit="s")

    ## dask and spaCy multiprocessing don't play nicely together
    ## nlp.pipe might not be the fastest way to preprocess
    df = df.compute()
    df["tokens"] = tokenize(df["text"].astype("unicode"))

    df = df[df["tokens"] != ""]
    df = df.drop("text", axis=1)

    return df
Example #9
 def analyze(self, granularity):
     ddf = dd.read_parquet(
         os.getenv('local_location') + 'rates/' + self.instrument[0] + '_' +
         granularity + '/' + 'part.*.parquet')
     ddf = ddf.astype({
         'complete': 'bool',
         'volume': 'int64',
         'o': 'float64',
         'h': 'float64',
         'l': 'float64',
         'c': 'float64'
     })
     ddf = ddf[ddf['complete'] == True]
     ddf['time'] = dd.to_datetime(ddf['time'])
     ddf = ddf.set_index('time')
     ddf['vol'] = ddf['h'] - ddf['l']
     ddf['mov'] = ddf['c'] - ddf['o']
     pdf = ddf.compute()
     ax = sns.jointplot(x=pdf['mov'], y=pdf['vol'], alpha=0.2)
     # x is the open-to-close move, y is the low-to-high range.
     ax.set_axis_labels('Open to close', 'Low to high', fontsize=14)
     if granularity == 'D':
         plt.title('Daily volatility {}'.format(self.instrument[0]),
                   fontsize=14)
         name = 'daily_volatility_{}.png'.format(self.instrument[0])
     else:
         plt.title('Weekly volatility {}'.format(self.instrument[0]),
                   fontsize=14)
         name = 'weekly_volatility_{}.png'.format(self.instrument[0])
     plt.tight_layout()
     try:
         ax.savefig(os.getenv('local_location') + 'images/' + name)
     except FileNotFoundError:
         os.mkdir(os.getenv('local_location') + 'images/')
         ax.savefig(os.getenv('local_location') + 'images/' + name)
Example #10
def get_am_pm_speed_los(df_cmp, conflation, cmp_segs, conf_len, ss_threshold, cur_year):
    #Create date and time fields for subsequent filtering
    df_cmp['Date_Time'] = df_cmp['Date Time'].str[:16]
    df_cmp['Date_Time'] = df_cmp['Date_Time'].str.replace('T', " ")
    df_cmp['Date'] = df_cmp['Date_Time'].str[:10]
    df_cmp['Day']=dd.to_datetime(df_cmp['Date_Time'])
    df_cmp['DOW']=df_cmp.Day.dt.dayofweek #Tue-1, Wed-2, Thu-3
    df_cmp['Hour']=df_cmp.Day.dt.hour
    df_cmp['Minute']=df_cmp.Day.dt.minute

    #Get AM (7-9am) speeds on Tue, Wed, and Thu
    df_am=df_cmp[((df_cmp['DOW']>=1) & (df_cmp['DOW']<=3)) & ((df_cmp['Hour']==7) | (df_cmp['Hour']==8))]

    #Get PM (4:30-6:30pm) speeds on Tue, Wed, and Thu
    df_pm=df_cmp[((df_cmp['DOW']>=1) & (df_cmp['DOW']<=3)) & (((df_cmp['Hour']==16) & (df_cmp['Minute']>=30)) | (df_cmp['Hour']==17) | ((df_cmp['Hour']==18) & (df_cmp['Minute']<30)))]

    # Use merge to attach INRIX speeds to CMP segments
    df_cmp_am = df_am.merge(conflation, left_on='Segment ID', right_on='INRIX_SegID', how='outer')
    df_cmp_am['Speed(miles/hour)'] = df_cmp_am['Speed(miles/hour)'].astype(float)
    df_cmp_pm = df_pm.merge(conflation, left_on='Segment ID', right_on='INRIX_SegID', how='outer')
    df_cmp_pm['Speed(miles/hour)'] = df_cmp_pm['Speed(miles/hour)'].astype(float)

    cmp_am_agg = cmp_seg_level_speed_and_los(df_cmp_am, cmp_segs, conf_len, ss_threshold, cur_year, cur_period = 'AM')

    cmp_pm_agg = cmp_seg_level_speed_and_los(df_cmp_pm, cmp_segs, conf_len, ss_threshold, cur_year, cur_period = 'PM')

    # DataFrame.append was removed in pandas 2.0; use concat (assuming pandas is imported as pd).
    cmp_segs_los = pd.concat([cmp_am_agg, cmp_pm_agg], ignore_index=True)
    
    return cmp_segs_los
Example #11
def load_data():
    header = [
        'timestamp', 'line_id', 'direction', 'jrny_patt_id', 'time_frame',
        'journey_id', 'operator', 'congestion', 'lon', 'lat', 'delay',
        'block_id', 'vehicle_id', 'stop_id', 'at_stop'
    ]
    types = {
        'timestamp': np.int64,
        'journey_id': np.int32,
        'congestion': np.int8,
        'lon': np.float64,
        'lat': np.float64,
        'delay': np.int8,
        'vehicle_id': np.int32,
        'at_stop': np.int8
    }
    file_name = 'data/month.csv'
    df = dd.read_csv(file_name,
                     header=None,
                     names=header,
                     dtype=types,
                     parse_dates=['time_frame'],
                     infer_datetime_format=True)
    null_replacements = {'line_id': 0, 'stop_id': 0}
    df = df.fillna(value=null_replacements)
    df['line_id'] = df['line_id'].astype(np.int32)
    df['stop_id'] = df['stop_id'].astype(np.int32)
    df['timestamp'] = dd.to_datetime(df['timestamp'], unit='us')
    return df
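The unit='us' argument above interprets the integer timestamps as microseconds since the Unix epoch; a quick sketch with an invented value:

import pandas as pd
import dask.dataframe as dd

s = dd.from_pandas(pd.Series([1357000000000000]), npartitions=1)
print(dd.to_datetime(s, unit="us").compute())
# 1357000000000000 microseconds after 1970-01-01 -> 2013-01-01 00:26:40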
Example #12
    def filter(self, df):
        # filter date

        df['Timestamp'] = dd.to_datetime(df.Timestamp, unit='ns')

        max_timestamp = df.Timestamp.max().compute()
        init_timestamp = max_timestamp - timedelta(days=self.sample_days)
        df = df[df.Timestamp >= init_timestamp]
        print(init_timestamp, max_timestamp)

        # Filter minin interactions
        df_item = df.groupby("ItemID").count()
        df_item = df_item[
            df_item.SessionID >= self.minimum_interactions].reset_index()
        #print("Filter minin interactions", df_item.count())

        # Filter session size
        df_session = df.groupby("SessionID").count()
        df_session = df_session[
            df_session.ItemID >= self.min_session_size].reset_index()
        #print("Filter session size", df_session.count())
        df = df \
            .merge(df_item[["ItemID"]], on="ItemID") \
            .merge(df_session[["SessionID"]], on="SessionID")

        return df
Example #13
 def order_flow(self, dsk):
     df = dsk.compute()
     for i in df["instrument"].unique():
         temp_df = df[df["instrument"] == i]
         temp_df = temp_df.copy()
         temp_df["transaction"] = np.where(temp_df.units > 0, 1, 0)
         task = build([GetHistoricRates(instrument=i, granularity="H1")],
                      local_scheduler=True)
         ddf_rate = dd.read_parquet(
             os.getenv("local_location") + "rates/" + i + "_" + "H1" + "/" +
             "part.*.parquet")
         ddf_rate["time"] = dd.to_datetime(ddf_rate["time"])
         ddf_rate = ddf_rate[["c", "time"]]
         ddf_rate["close"] = ddf_rate["c"].astype("float64")
         ddf_rate = ddf_rate.compute()
         fig, ax = plt.subplots()
         scatter = ax.scatter(temp_df["time"],
                              temp_df["price"],
                              c=temp_df["transaction"])
         ax.plot(ddf_rate["time"], ddf_rate["close"], alpha=0.4)
         plt.xlim(
             temp_df["time"].max() - datetime.timedelta(days=30),
             temp_df["time"].max(),
         )
         plt.xticks(rotation=45)
         plt.ylabel("USD")
         plt.title("Transactions for {}".format(i))
         plt.legend(handles=scatter.legend_elements()[0],
                    labels=['Sell', 'Buy'])
         fig.savefig(
             os.getenv("local_location") + "images/" +
             "order_flow_{}.png".format(i))
Example #14
def load_orders(data_folder):
    data_path = data_folder + "/orders.pq"
    df = pd.read_parquet(
        data_path,
    )
    df.O_ORDERDATE = pd.to_datetime(df.O_ORDERDATE, format='%Y-%m-%d')
    return df
Example #15
def get_day_parts(time):
    """
    Split the 24-hour day into day parts.
    ref: https://stackoverflow.com/questions/55571311/get-part-of-day-morning-afternoon-evening-night-in-python-dataframe

    Parameters
    ----------
    time: str
          time.

    Returns
    -------
    pandas series
        day parts.

    """

    datetime = dd.to_datetime(time)
    datetime2 = (datetime.dt.hour % 24 + 4) // 4
    datetime2 = datetime2.replace({1: 'late_night',
                                   2: 'early_morning',
                                   3: 'morning',
                                   4: 'afternoon',
                                   5: 'evening',
                                   6: 'night'})

    return datetime2
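The (hour % 24 + 4) // 4 expression above buckets the 24 hours into six 4-hour blocks; a quick check of the mapping in plain Python:

labels = {1: 'late_night', 2: 'early_morning', 3: 'morning',
          4: 'afternoon', 5: 'evening', 6: 'night'}
for hour in (0, 3, 4, 8, 12, 16, 20, 23):
    print(hour, labels[(hour % 24 + 4) // 4])
# 0-3 -> late_night, 4-7 -> early_morning, 8-11 -> morning,
# 12-15 -> afternoon, 16-19 -> evening, 20-23 -> night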
Example #16
def get_seasons(time):
    """
    Map the 12 months into seasons.
    ref: https://stackoverflow.com/questions/55571311/get-part-of-day-morning-afternoon-evening-night-in-python-dataframe

    Parameters
    ----------
    time: str
          time.

    Returns
    -------
    pandas series
        seasons.

    """

    datetime = dd.to_datetime(time)
    datetime2 = (datetime.dt.month + 2) // 3  # Jan-Mar -> 1, Apr-Jun -> 2, Jul-Sep -> 3, Oct-Dec -> 4
    datetime2 = datetime2.replace({1: 'spring',
                                   2: 'summer',
                                   3: 'fall',
                                   4: 'winter',
                                   })

    return datetime2
Example #17
def _convert_dayid_to_datetime(ddf: dd.DataFrame) -> dd.DataFrame:

    ddf["datetime"] = dd.to_datetime(
        ddf["day"],
        origin="01/01/2009",
        unit="D",
    ) + dd.to_timedelta(ddf["halfhourly_id"] / 2, unit="h")

    return ddf.drop(columns=["day", "halfhourly_id"])
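A small check of the conversion above, assuming day is a 0-based offset from the 01/01/2009 origin and halfhourly_id counts half-hour slots (the sample values are invented):

import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({"day": [0, 1], "halfhourly_id": [3, 48]})
ddf = _convert_dayid_to_datetime(dd.from_pandas(pdf, npartitions=1))
print(ddf.compute())
# day 0, slot 3  -> 2009-01-01 01:30:00
# day 1, slot 48 -> 2009-01-03 00:00:00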
Example #18
 def calculate(self, ddf, instrument):
     # The calculations for the report
     ddf = ddf[ddf["complete"] == True]
     ddf = ddf[["c", "time"]]
     ddf = ddf.astype({"c": "float64"})
     ddf = ddf.rename(columns={"c": instrument})
     ddf["time"] = dd.to_datetime(ddf["time"])
     ddf = ddf.set_index("time")
     return ddf
Example #19
def basic(df: dd.DataFrame):
    #df['day'] = df.click_time.str[8:10].astype(int)
    #df['hour'] = df.click_time.str[11:13].astype(int)
    df["click_time"] = dd.to_datetime(df["click_time"])
    df["hour"] = df["click_time"].dt.hour
    #df["telling_ip"] = np.where(df["ip"] <= 126420, 1, 0)
    #df["idoa_is_last_try"] = df.groupby(["ip", "app", "device", "os"])["channel"].diff(periods=-1)
    #df["idoa_is_last_try"] = np.where(df["idoa_is_last_try"].isnull(), 1, 0)
    timer.time("done basic")
Example #20
 def apply_index_and_date_dd(index_col: int, parse_dates: bool,
                             df: dd.DataFrame) -> dd.DataFrame:
     if index_col is not None:
         df = df.set_index(df.columns[index_col])
         df.index = df.index.rename(None)
     if parse_dates:
         # noinspection PyTypeChecker
         df.index = dd.to_datetime(df.index)
     return df
Example #21
def change_to_datetime_index_test(testdata, ts_column, str_format=''):
    testdata = copy.deepcopy(testdata)
    if str_format:
        print('Date_time string format given as %s' % str_format)
    else:
        print(
            '    Alert: No strf_time_format given for %s. Provide strf_time format during "setup" for better results.'
            % ts_column)
    ##### This is where we change the time index of test data #############

    try:
        if isinstance(testdata, pd.Series) or isinstance(
                testdata, pd.DataFrame):
            if ts_column in testdata.columns:
                ###### If the str_format is there, set the column as time series index ##
                ts_index = testdata.pop(ts_column)
                if str_format:
                    ts_index = pd.to_datetime(ts_index, format=str_format)
                else:
                    ts_index = pd.to_datetime(ts_index)
                testdata.index = ts_index
            elif ts_column in testdata.index.name:
                ts_index = testdata.index
                ### now set the index to datetime format
                if str_format:
                    ts_index = pd.to_datetime(ts_index, format=str_format)
                else:
                    ts_index = pd.to_datetime(ts_index)
                testdata.index = ts_index
        elif type(testdata) == dask.dataframe.core.DataFrame:
            #### the below tests work for a dask dataframe as well ##
            if ts_column in testdata.columns:
                ####### Now set the index to datetime index and drop the ts_colum #########
                testdata.index = dd.to_datetime(testdata[ts_column].compute())
                testdata = testdata.drop(ts_column, axis=1)
            elif ts_column in testdata.index.name:
                #### the above test works for a dask dataframe as well ##
                ts_index = testdata.index
                if type(testdata.index.compute().values[0]) in [np.datetime64]:
                    ## it is already in date-time index format - do nothing
                    pass
                else:
                    # convert the existing index values to a datetime index
                    testdata.index = pd.to_datetime(ts_index.compute(),
                                                    format=str_format)
            else:
                print(
                    "Error: Cannot detect %s either in columns or index. Please check input and try again."
                    % ts_column)
        else:
            print(
                'Unknown type of testdata. Please check input and try again.')
    except Exception:
        print(
            '    Error converting testdata to a datetime index. Please check input and try again.'
        )
    #### this is where we return the testdata and format
    return testdata, str_format
Example #22
def read_tasks_raw(log_path):
    """
    Read tasks from one or multiple task log files.

    """
    df_tasks = (db.read_text(log_path + "/task*.jsonl").map(
        str.split,
        "\n").flatten().filter(lambda item: item != "").map(json.loads).map(
            _flatten_dict, "tasks", ["datetime", "client_id"]).flatten().map(
                _flatten_dict,
                "startstops",
                [
                    "worker",
                    "status",
                    "nbytes",
                    "thread",
                    "type",
                    "typename",
                    "key",
                    "datetime",
                    "client_id",
                ],
            ).flatten().to_dataframe({
                "action": str,
                "start": "float64",
                "stop": "float64",
                "worker": str,
                "status": str,
                "nbytes": "int64",
                "thread": "int64",
                "type": str,
                "typename": str,
                "key": object,
                "datetime": str,
                "client_id": "int64",
            }))

    for column_name in ["start", "stop"]:
        df_tasks[column_name] = dd.to_datetime(df_tasks[column_name], unit="s")
    df_tasks["datetime"] = dd.to_datetime(df_tasks["datetime"], utc=True)
    for column_name in ["action", "status"]:
        df_tasks[column_name] = df_tasks[column_name].astype("category")

    return df_tasks
Example #23
def load_parse_file(file_path, date_column="date"):
    """Loads a file into Pandas dataframe, and parse the datetime columns
        Arguments:
            file_path: string path to the input file.
        Returns:
            Dataframe: dask.dataframe from the file
    """
    data = dd.read_csv(file_path)
    data[date_column] = dd.to_datetime(data[date_column], format='%Y-%m-%d')
    return data
Example #24
def basic(df: pd.DataFrame):
    #df['day'] = df.click_time.str[8:10].astype(int)
    df['hour'] = df.click_time.str[11:13].astype(int)
    df["click_time"] = dd.to_datetime(df["click_time"])
    #df["ip_count"] = df.groupby("ip")["channel"].transform('count')
    #df["app_count"] = df.groupby("app")["channel"].transform('count')
    #df["os_count"] = df.groupby("os")["channel"].transform('count')
    df["idoa_is_last_try"] = df.groupby(["ip", "app", "device",
                                         "os"])["channel"].shift(-1)
    df["idoa_is_last_try"] = np.where(df["idoa_is_last_try"].isnull(), 1, 0)
    df["telling_ip"] = np.where(df["ip"] <= 126420, 1, 0)
Example #25
def job():
    t = datetime.datetime.now()
    coll = pymongo.MongoClient(mongo_ip).qa.ASKBIDCAPPED
    data = from_sequence([
        item for item in coll.find(
            # {'code':'000001'},
            {},
            {
                '_id': 0,
                'close': 1,
                'amount': 1,
                'vol': 1,
                'code': 1,
                'market': 1,
                'datetime': 1
            },
            batch_size=10000000)
    ]).to_dataframe(
        meta={
            'close': 'float64',
            'amount': 'float64',
            'vol': 'float64',
            'code': 'str',
            'market': 'str',
            'datetime': 'str'
        })
    res = data.assign(datetime=dd.to_datetime(data.datetime)).set_index(
        'datetime').groupby('code').apply(lambda x: x.resample('1min').apply(
            {
                'close': 'ohlc',
                'vol': 'last',
                # 'code': 'last',
                'amount': 'last'
            },
            meta={
                'open': 'float64',
                'high': 'float64',
                'low': 'float64',
                'close': 'float64',
                'vol': 'float64',
                'amount': 'float64'
            }))
    res.columns = res.columns.droplevel(0)

    last_minute = '{} 15:00:00'.format(datetime.date.today())
    #data_min =res.loc[(slice(None), last_minute), :]
    re = compute(res.reindex(1))[0]
    #re = (data_min)
    print(re)

    pymongo.MongoClient(mongo_ip).qa.stock_1min.insert_many(
        QA.QA_util_to_json_from_pandas(re.reset_index()))

    print(datetime.datetime.now())
Example #26
 def extract(self, instrument, granularity):
     ddf = dd.read_parquet(
         os.getenv('local_location') + 'rates/' + instrument + '_' +
         granularity + '/' + 'part.*.parquet')
     ddf = ddf[ddf['complete'] == True]
     ddf = ddf[['c', 'time']]
     ddf = ddf.astype({'c': 'float64'})
     ddf = ddf.rename(columns={'c': instrument})
     ddf['time'] = dd.to_datetime(ddf['time'])
     ddf = ddf.set_index('time')
     return ddf
Example #27
def test_as_array():
    df = DaskDataFrame([], "a:str,b:int")
    assert [] == df.as_array()
    assert [] == df.as_array(type_safe=True)
    assert [] == list(df.as_array_iterable())
    assert [] == list(df.as_array_iterable(type_safe=True))

    df = DaskDataFrame([["a", 1]], "a:str,b:int")
    assert [["a", 1]] == df.as_array()
    assert [["a", 1]] == df.as_array(["a", "b"])
    assert [[1, "a"]] == df.as_array(["b", "a"])

    # prevent pandas auto type casting
    df = DaskDataFrame([[1.0, 1.1]], "a:double,b:int")
    assert [[1.0, 1]] == df.as_array()
    assert isinstance(df.as_array()[0][0], float)
    assert isinstance(df.as_array()[0][1], int)
    assert [[1.0, 1]] == df.as_array(["a", "b"])
    assert [[1, 1.0]] == df.as_array(["b", "a"])

    df = DaskDataFrame([[np.float64(1.0), 1.1]], "a:double,b:int")
    assert [[1.0, 1]] == df.as_array()
    assert isinstance(df.as_array()[0][0], float)
    assert isinstance(df.as_array()[0][1], int)

    df = DaskDataFrame([[pandas.Timestamp("2020-01-01"), 1.1]],
                       "a:datetime,b:int")
    df.native["a"] = pd.to_datetime(df.native["a"])
    assert [[datetime(2020, 1, 1), 1]] == df.as_array()
    assert isinstance(df.as_array()[0][0], datetime)
    assert isinstance(df.as_array()[0][1], int)

    df = DaskDataFrame([[pandas.NaT, 1.1]], "a:datetime,b:int")
    df.native["a"] = pd.to_datetime(df.native["a"])
    assert isinstance(df.as_array()[0][0], datetime)
    assert isinstance(df.as_array()[0][1], int)

    df = DaskDataFrame([[1.0, 1.1]], "a:double,b:int")
    assert [[1.0, 1]] == df.as_array(type_safe=True)
    assert isinstance(df.as_array()[0][0], float)
    assert isinstance(df.as_array()[0][1], int)
Example #28
def process_data(data):
    # process users
    users = data["users"]
    users["zipcode1"] = users["zipcode"].str.get(0)
    users["zipcode2"] = users["zipcode"].str.slice(0, 2)
    users["zipcode3"] = users["zipcode"].str.slice(0, 3)
    data["users"] = users.persist()
    logger.debug("users data processed.")

    # process items
    items = data["items"]
    items = items[items["title"] != "unknown"]  # remove "unknown" movie
    items["release_date"] = dd.to_datetime(items["release"])
    items["release_year"] = items["release_date"].dt.year
    data["items"] = items.persist()
    logger.debug("items data processed.")

    # process context
    for el in ["all", "train", "test"]:
        context = data[el]
        context["datetime"] = dd.to_datetime(context["timestamp"], unit="s")
        context["year"] = context["datetime"].dt.year
        context["month"] = context["datetime"].dt.month
        context["day"] = context["datetime"].dt.day
        context["week"] = context["datetime"].dt.week
        context["dayofweek"] = context["datetime"].dt.dayofweek + 1
        data[el] = context
    logger.debug("context data processed.")

    # merge data
    dfs = {
        item: (data[item].merge(data["users"], "inner",
                                "user_id").merge(data["items"], "inner",
                                                 "item_id").persist())
        for item in ["all", "train", "test"]
    }
    dfs.update({"users": users, "items": items})
    logger.info("data merged.")

    return dfs
Example #29
def get_average_distance(local_file_path):
    logging.info("> started executing get_average_duration() ")
    global slack_notification_message
    try:
        df = dd.read_csv(local_file_path, header=0, low_memory=False,
                         usecols=use_column_list)  # , nrows = '10' , ,usecols = ''
        df['year'] = dd.to_datetime(df['tpep_dropoff_datetime']).dt.year
        df['month'] = dd.to_datetime(df['tpep_dropoff_datetime']).dt.month
        average_trip_distance  = str(df.loc[ df['trip_distance']!=0 ].groupby(['year', 'month'])['trip_distance'].mean().compute().round(2))
        logging.info("Average Trip distance : {}".format(average_trip_distance))

        rolling_average_trip_distance = str(get_rooling_average_distance(df))
        logging.info("45 days Average Trip distance : {}".format(rolling_average_trip_distance))

        slack_notification_message  = """ Note >  Average Trip distance for a month : \n {} \n {} \n Note > from past 45 days Average Trip distance : \n{}""".format(average_trip_distance, '*'*50 ,rolling_average_trip_distance)


        return slack_notification_message

    except Exception as e:
        logging.error('ERROR !!: some issue with File in the location {} and error is {}'.format(local_file_path , str(e)))
        raise
Example #30
def extract_restrictions(period):
    df = dd.from_pandas(pd.read_csv(
        const.BASE_RESTRICTIONS_URL,
        usecols=list(const.RESTRICTIONS_COLUMNS_TYPES),
        dtype=const.RESTRICTIONS_COLUMNS_TYPES,
    ),
                        npartitions=10)
    df['timestamp'] = dd.to_datetime(df['Date'], format='%Y%m%d')
    df = df.drop(['Date'], axis=1)
    df = df[df['timestamp'].between(period[0], period[-1])]
    df['timestamp'] = df['timestamp'].map(lambda x: x.isoformat())

    return df