예제 #1
0
def test_to_datetime():
    """Check that ``pd.to_datetime`` matches ``pandas.to_datetime``.

    Covers a year/month/day DataFrame, a Series of date strings, scalar
    epoch values in seconds and nanoseconds, and a list of day offsets
    from a custom origin.
    """
    # DataFrame input: the same column data is fed to both libraries.
    ymd_columns = {"year": [2015, 2016], "month": [2, 3], "day": [4, 5]}
    modin_df = pd.DataFrame(ymd_columns)
    pandas_df = pandas.DataFrame(ymd_columns)
    df_equals(pd.to_datetime(modin_df), pandas.to_datetime(pandas_df))

    # Series input: 3000 date strings built from three repeated values.
    date_strings = ["3/11/2000", "3/12/2000", "3/13/2000"] * 1000
    modin_s = pd.Series(date_strings)
    pandas_s = pandas.Series(date_strings)
    df_equals(pd.to_datetime(modin_s), pandas.to_datetime(pandas_s))

    # Scalar inputs: one epoch in seconds, one in nanoseconds.
    for epoch, unit in ((1490195805, "s"), (1490195805433502912, "ns")):
        assert pd.to_datetime(epoch, unit=unit) == pandas.to_datetime(
            epoch, unit=unit)

    # List input: day counts measured from 2000-01-01.
    day_offsets = [1, 2, 3]
    modin_result = pd.to_datetime(
        day_offsets, unit="D", origin=pd.Timestamp("2000-01-01"))
    pandas_result = pandas.to_datetime(
        day_offsets, unit="D", origin=pandas.Timestamp("2000-01-01"))
    assert modin_result.equals(pandas_result)
예제 #2
0
def test_to_datetime():
    """Compare ``pd.to_datetime`` with ``pandas.to_datetime`` on non-frame inputs.

    Two scalar epochs (seconds, nanoseconds) are compared with ``==``; a
    list of day offsets from a custom origin is compared with ``.equals``.
    """
    scalar_cases = (
        (1490195805, "s"),
        (1490195805433502912, "ns"),
    )
    for epoch, unit in scalar_cases:
        expected = pandas.to_datetime(epoch, unit=unit)
        assert pd.to_datetime(epoch, unit=unit) == expected

    # Day offsets are counted from 2000-01-01 in both libraries.
    day_offsets = [1, 2, 3]
    actual_index = pd.to_datetime(
        day_offsets, unit="D", origin=pd.Timestamp("2000-01-01"))
    expected_index = pandas.to_datetime(
        day_offsets, unit="D", origin=pandas.Timestamp("2000-01-01"))
    assert actual_index.equals(expected_index)
예제 #3
0
def test_to_datetime_inplace_side_effect():
    """Compare ``to_datetime`` on a large integer "time" column (see GH#3063).

    Both frames are built from the same 200000-row "time"/"value" data and
    the converted "time" columns are checked with ``df_equals``.
    """
    # See GH#3063
    columns = {
        "time": list(range(1617993360, 1618193360)),
        "value": list(range(215441, 415441)),
    }
    modin_df = pd.DataFrame(columns)
    pandas_df = pandas.DataFrame(columns)

    modin_converted = pd.to_datetime(modin_df["time"], unit="s")
    pandas_converted = pandas.to_datetime(pandas_df["time"], unit="s")
    df_equals(modin_converted, pandas_converted)
예제 #4
0
def error345(df):
    """Re-code the ``Usable`` column of *df* for rows that look erroneous.

    Rows are grouped by ``DEVICE_ID`` and only rows whose ``Usable`` is 0
    are examined. For each group:

    * code 5 is written straight away when the group has at most one such
      row;
    * code 3 is collected for rows whose distance divided by elapsed hours
      exceeds 90;
    * code 4 is collected for rows with a non-zero distance but zero
      elapsed time.

    Codes 3 and 4 are applied after every group has been processed, 3 first
    and then 4, so a row collected for both ends up as 4.

    Args:
        df: DataFrame with ``DEVICE_ID``, ``Usable``, ``LAT``, ``LONGITUDE``
            and ``IST_DATE`` columns. Its ``Usable`` column is overwritten.

    Returns:
        The same ``df`` object that was passed in.
    """
    index_3 = []
    index_4 = []
    print('In 3', df.head())
    groups = [x for _, x in df.groupby('DEVICE_ID')]
    for x in groups:
        # Copy the filtered rows so the column assignments below act on an
        # independent frame rather than on a slice of the group.
        x = x[x['Usable'] == 0].copy()
        if len(x) <= 1:
            try:
                df.Usable = df['Usable'].mask(df.index.isin(x.index.values), 5)
            except Exception:
                # Best effort: skip this group, but let KeyboardInterrupt and
                # SystemExit propagate (a bare ``except`` would swallow them).
                continue
            # NOTE(review): there is no ``continue`` on the success path, so a
            # group marked 5 is still processed below and may later be
            # overwritten with 3 or 4 -- confirm this is intended.
        x['prev_LAT'] = x['LAT'].shift(1)
        x['prev_LONGITUDE'] = x['LONGITUDE'].shift(1)
        # The first row of a group has no predecessor; its prev_* become 0.
        x = x.fillna(0)
        try:
            # calc_dist is defined elsewhere; the code below indexes
            # dist['dist'], so it presumably yields one column per row --
            # TODO confirm.
            dist = x[['LAT', 'LONGITUDE', 'prev_LAT',
                      'prev_LONGITUDE']].apply(calc_dist, axis=1)
            dist.columns = ['dist']
        except Exception:
            print(x.head())
            continue
        dist_zeroes = (dist['dist'] == 0)
        # NOTE(review): %j is day-of-year; %d (day of month) may have been
        # intended -- confirm against the IST_DATE data.
        timevals = pd.to_datetime(x['IST_DATE'], format='%Y-%m-%j %H:%M:%S')
        timevals = pd.Series(timevals).diff()
        timevals = timevals.fillna(pd.Timedelta(seconds=0))
        # Express the gap to the previous row in hours.
        timevals = timevals / np.timedelta64(1, 'h')
        timevals.index = x.index
        time_zeroes = (timevals == 0)
        speed = dist['dist'] / timevals
        # Division by a zero time gap gives inf; map it to -1 so it never
        # passes the "> 90" check.
        speed = speed.replace(np.inf, -1)
        too_fast = speed > 90
        moved_in_zero_time = (~np.array(dist_zeroes) & np.array(time_zeroes))

        index_4.extend(x.loc[moved_in_zero_time].index.values)
        index_3.extend(x.loc[too_fast].index.values)
    df.Usable = df['Usable'].mask(df.index.isin(index_3), 3)
    df.Usable = df['Usable'].mask(df.index.isin(index_4), 4)

    return df
    if not isVaildDate(str(df_row[field])):
        df_row[field] = np.nan

    return df_row


# purge_pat_files is defined elsewhere; presumably it removes files in
# ../data whose names match the pattern -- confirm against its definition.
purge_pat_files('../data', r'^[^_]+_log.csv$')

# Read the reduced log in chunks of c_sz rows rather than all at once.
logs1 = pd.read_csv('../data/log_reduced.csv', encoding='utf-8', chunksize=c_sz)

# Zipping with trange(1000) caps the loop at 1000 chunks.
for df, _ in zip(logs1, trange(1000)):
    # Keep only log rows whose exposed-ad id has a match in ad_static.
    df = pd.merge(df, ad_static, how='inner',
                  left_on='曝光广告id', right_on='广告id')

    # 3. Remove rows with invalid times.
    # Convert the request time from epoch seconds to datetimes.
    df['广告请求时间'] = pd.to_datetime(df['广告请求时间'], unit='s')
    df['广告请求时间_date'] = df['广告请求时间'].apply(lambda ts: ts.date())

    # 1. Drop null values: keep the selected columns, then discard any row
    # that has a missing value in one of them.
    df = df[col_names1].dropna(axis=0, how='any')

    # Split the data: save_csv is called once per row.
    _ = df.apply(save_csv, axis=1)

print('done')
예제 #6
0
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    print('{:.2f} Mb, {:.2f} Mb ({:.2f} %)'.format(
        start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem))
    gc.collect()
    return df


print(
    '=============================================== read train ==============================================='
)
t = time.time()
train_df = pd.read_csv('./dataset/train.csv')

# 'ts' is divided by 1000 before being passed to time.localtime, i.e. it is
# handled as epoch milliseconds and rendered in the machine's local time.
train_df['date'] = pd.to_datetime(
    train_df['ts'].apply(
        lambda ms: time.strftime('%Y-%m-%d %H:%M:%S',
                                 time.localtime(ms / 1000))))

# Calendar parts derived from the local-time date.
train_df['hour'] = train_df['date'].dt.hour
train_df['minute'] = train_df['date'].dt.minute
train_df['day'] = train_df['date'].dt.day
# Rows falling on day 7 are relabelled as day 8.
train_df.loc[train_df['day'] == 7, 'day'] = 8

train_num = len(train_df)
labels = train_df['target'].values
print('runtime:', time.time() - t)

print(
    '=============================================== click data ==============================================='
)
# Rows with target == 1, ordered by 'timestamp'.
click_df = train_df[train_df['target'] == 1]
click_df = click_df.sort_values('timestamp').reset_index(drop=True)
# Gap between 'timestamp' and 'ts'; rows with a negative gap are dropped.
click_df['exposure_click_gap'] = click_df['timestamp'] - click_df['ts']
click_df = click_df[click_df['exposure_click_gap'] >= 0]
click_df = click_df.reset_index(drop=True)
예제 #7
0
           ax=axes[0])
# Cumulative, normalised histogram of "Defense" on the second axes.
# ``density=True`` replaces the ``normed=True`` keyword, which matplotlib
# deprecated in 2.1 and removed in 3.1.
data1.plot(
    kind="hist",
    y="Defense",
    bins=50,
    range=(0, 250),
    density=True,
    ax=axes[1],
    cumulative=True,
)
plt.savefig("graph.png")
# Bare expressions such as the next two only display output in a notebook.
plt
data.describe()

# Convert a list of date strings into pandas datetimes.
time_list = ["1992-03-08", "1992-04-12"]
print(type(time_list[1]))  # As you can see date is string
datetime_object = pd.to_datetime(time_list)
print(type(datetime_object))
import warnings

warnings.filterwarnings("ignore")
# Copy the first rows so that adding a column does not write into a slice of
# ``data``.
data2 = data.head().copy()
date_list = [
    "1992-01-10", "1992-02-10", "1992-03-10", "1993-03-15", "1993-03-16"
]
datetime_object = pd.to_datetime(date_list)
data2["date"] = datetime_object
# Index by date so rows can be selected and resampled by time.
data2 = data2.set_index("date")
data2
print(data2.loc["1993-03-16"])
print(data2.loc["1992-03-10":"1993-03-16"])
# "A" is the year-end frequency alias; pandas 2.2+ prefers "YE" but still
# accepts "A" with a deprecation warning.
data2.resample("A").mean()