def test_to_datetime():
    """Compare ``pd.to_datetime`` with ``pandas.to_datetime`` on several inputs.

    Exercises a DataFrame with year/month/day columns, a Series of date
    strings, scalar epoch values (``unit="s"`` and ``unit="ns"``), and a list
    of day offsets counted from an explicit ``origin``.
    """
    # DataFrame input for to_datetime
    ymd_columns = {"year": [2015, 2016], "month": [2, 3], "day": [4, 5]}
    modin_frame = pd.DataFrame(ymd_columns)
    pandas_frame = pandas.DataFrame(ymd_columns)
    df_equals(pd.to_datetime(modin_frame), pandas.to_datetime(pandas_frame))

    # Series input for to_datetime
    date_strings = ["3/11/2000", "3/12/2000", "3/13/2000"] * 1000
    modin_series = pd.Series(date_strings)
    pandas_series = pandas.Series(date_strings)
    df_equals(pd.to_datetime(modin_series), pandas.to_datetime(pandas_series))

    # Other inputs for to_datetime: scalar epochs in seconds and nanoseconds
    for epoch, unit in ((1490195805, "s"), (1490195805433502912, "ns")):
        assert pd.to_datetime(epoch, unit=unit) == pandas.to_datetime(
            epoch, unit=unit
        )

    # List input measured in days from an explicit origin
    day_offsets = [1, 2, 3]
    actual = pd.to_datetime(
        day_offsets, unit="D", origin=pd.Timestamp("2000-01-01")
    )
    expected = pandas.to_datetime(
        day_offsets, unit="D", origin=pandas.Timestamp("2000-01-01")
    )
    assert actual.equals(expected)
def test_to_datetime():
    """Compare ``pd.to_datetime`` with ``pandas.to_datetime`` on non-frame inputs.

    Covers a scalar epoch in seconds, a scalar epoch in nanoseconds, and a
    list of day offsets counted from an explicit ``origin``.

    NOTE(review): another definition named ``test_to_datetime`` is visible
    earlier in this source; if both live in one module, this one shadows it.
    """
    # Scalar epoch expressed in seconds
    seconds_epoch = 1490195805
    left = pd.to_datetime(seconds_epoch, unit="s")
    right = pandas.to_datetime(seconds_epoch, unit="s")
    assert left == right

    # Scalar epoch expressed in nanoseconds
    nanos_epoch = 1490195805433502912
    left = pd.to_datetime(nanos_epoch, unit="ns")
    right = pandas.to_datetime(nanos_epoch, unit="ns")
    assert left == right

    # List of day offsets relative to 2000-01-01
    day_offsets = [1, 2, 3]
    left = pd.to_datetime(
        day_offsets, unit="D", origin=pd.Timestamp("2000-01-01")
    )
    right = pandas.to_datetime(
        day_offsets, unit="D", origin=pandas.Timestamp("2000-01-01")
    )
    assert left.equals(right)
def test_to_datetime_inplace_side_effect():
    """Compare ``to_datetime`` on a column selected from a two-column frame.

    Builds 200,000-row frames with integer ``time`` and ``value`` columns via
    both ``pd`` and ``pandas`` and checks that converting the ``time`` column
    with ``unit="s"`` gives equal results.
    """
    # See GH#3063
    frame_data = {
        "time": list(range(1617993360, 1618193360)),
        "value": list(range(215441, 415441)),
    }
    modin_df = pd.DataFrame(frame_data)
    pandas_df = pandas.DataFrame(frame_data)

    modin_converted = pd.to_datetime(modin_df["time"], unit="s")
    pandas_converted = pandas.to_datetime(pandas_df["time"], unit="s")
    df_equals(modin_converted, pandas_converted)
def error345(df):
    """Flag suspicious rows of ``df`` by rewriting its ``Usable`` column.

    Rows are examined per ``DEVICE_ID`` group, looking only at rows whose
    ``Usable`` value is 0. The codes written are:

    * 5 -- the group has at most one such row;
    * 3 -- distance divided by elapsed hours between consecutive rows
      exceeds 90;
    * 4 -- the distance to the previous row is non-zero while the elapsed
      time is zero.

    Code 3 is applied before code 4, so a row matching both ends up as 4.

    Args:
        df: DataFrame with ``DEVICE_ID``, ``Usable``, ``LAT``, ``LONGITUDE``
            and ``IST_DATE`` columns. ``IST_DATE`` is parsed with the format
            ``%Y-%m-%d %H:%M:%S``.

    Returns:
        The same ``df`` object, with ``Usable`` updated.
    """
    index_3 = []
    index_4 = []
    print('In 3', df.head())
    # Materialise the groups up front: df['Usable'] is reassigned in the loop.
    groups = [group for _, group in df.groupby('DEVICE_ID')]
    for group in groups:
        # .copy() gives an independent frame, so the column assignments below
        # do not trigger SettingWithCopyWarning.
        x = group[group['Usable'] == 0].copy()
        if len(x) <= 1:
            try:
                df['Usable'] = df['Usable'].mask(
                    df.index.isin(x.index.values), 5)
            except Exception:
                continue
            # NOTE(review): there is no `continue` on success, so these rows
            # also go through the checks below and may be re-coded as 4.
            # Confirm whether that is intended.
        x['prev_LAT'] = x['LAT'].shift(1)
        x['prev_LONGITUDE'] = x['LONGITUDE'].shift(1)
        # NOTE(review): the first row of each group has no predecessor, so its
        # prev_* values become 0 here -- confirm calc_dist copes with that.
        x = x.fillna(0)
        try:
            dist = x[['LAT', 'LONGITUDE', 'prev_LAT', 'prev_LONGITUDE']].apply(
                calc_dist, axis=1)
            dist.columns = ['dist']
        except Exception:
            # Best effort: show the offending group and move on.
            print(x.head())
            continue
        dist_zeroes = (dist['dist'] == 0)

        # %d (day of month), not %j (day of year): with %j the parsed month is
        # overridden and differences across month boundaries come out wrong.
        timevals = pd.to_datetime(x['IST_DATE'], format='%Y-%m-%d %H:%M:%S')
        timevals = timevals.diff().fillna(pd.Timedelta(seconds=0))
        timevals = timevals / np.timedelta64(1, 'h')  # elapsed time in hours
        time_zeroes = (timevals == 0)

        speed = dist['dist'] / timevals
        # Division by a zero time delta yields inf; map it below the threshold.
        speed = speed.replace(np.inf, -1)
        too_fast = speed > 90

        moved_in_no_time = ~np.array(dist_zeroes) & np.array(time_zeroes)
        index_4.extend(x.loc[moved_in_no_time].index.values)
        index_3.extend(x.loc[too_fast].index.values)

    df['Usable'] = df['Usable'].mask(df.index.isin(index_3), 3)
    df['Usable'] = df['Usable'].mask(df.index.isin(index_4), 4)
    return df
if not isVaildDate(str(df_row[field])): df_row[field] = np.nan return df_row purge_pat_files('../data', r'^[^_]+_log.csv$') logs1 = pd.read_csv('../data/log_reduced.csv', encoding='utf-8', chunksize=c_sz) for df, _ in zip(logs1, trange(1000)): df = pd.merge(df, ad_static, left_on='曝光广告id', right_on='广告id', how='inner') # 3. 去掉非法时间行 df['广告请求时间'] = pd.to_datetime(df['广告请求时间'], unit='s') # 转为日期 df['广告请求时间_date'] = df['广告请求时间'].apply(lambda x: x.date()) df = df[col_names1] # 1. 去空值 df.dropna(axis=0, how='any', inplace=True) # 数据分割 _ = df.apply(save_csv, axis=1) print('done')
df[col] = df[col].astype(np.float32) else: df[col] = df[col].astype(np.float64) end_mem = df.memory_usage().sum() / 1024**2 print('{:.2f} Mb, {:.2f} Mb ({:.2f} %)'.format( start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem)) gc.collect() return df print( '=============================================== read train ===============================================' ) t = time.time() train_df = pd.read_csv('./dataset/train.csv') train_df['date'] = pd.to_datetime(train_df['ts'].apply( lambda x: time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(x / 1000)))) train_df['day'] = train_df['date'].dt.day train_df.loc[train_df['day'] == 7, 'day'] = 8 train_df['hour'] = train_df['date'].dt.hour train_df['minute'] = train_df['date'].dt.minute train_num = train_df.shape[0] labels = train_df['target'].values print('runtime:', time.time() - t) print( '=============================================== click data ===============================================' ) click_df = train_df[train_df['target'] == 1].sort_values( 'timestamp').reset_index(drop=True) click_df['exposure_click_gap'] = click_df['timestamp'] - click_df['ts'] click_df = click_df[click_df['exposure_click_gap'] >= 0].reset_index(drop=True)
ax=axes[0]) data1.plot( kind="hist", y="Defense", bins=50, range=(0, 250), normed=True, ax=axes[1], cumulative=True, ) plt.savefig("graph.png") plt data.describe() time_list = ["1992-03-08", "1992-04-12"] print(type(time_list[1])) # As you can see date is string datetime_object = pd.to_datetime(time_list) print(type(datetime_object)) import warnings warnings.filterwarnings("ignore") data2 = data.head() date_list = [ "1992-01-10", "1992-02-10", "1992-03-10", "1993-03-15", "1993-03-16" ] datetime_object = pd.to_datetime(date_list) data2["date"] = datetime_object data2 = data2.set_index("date") data2 print(data2.loc["1993-03-16"]) print(data2.loc["1992-03-10":"1993-03-16"]) data2.resample("A").mean()