Python make_df 예제들, masterthesis.eval_1.util.make_df Python 예제들

예제 #1

0

파일 보기

def run(path, verbose=1):
    """Rank TimeSensitiveMostFrequentRoute predictions for one data path.

    The trajectories read from ``path`` are filtered, turned into a trip
    frame by ``make_df``, clustered into spots, and split chronologically:
    roughly the first 4/5 of the covered weeks (whole weeks only) are used
    for fitting, everything from the end of that window onwards for testing.

    Args:
        path: Location handed to ``read_geolife``.
        verbose: ``> 0`` prints a summary, ``> 1`` also prints progress.

    Returns:
        A tuple ``(ranks, n_test)``. ``ranks`` holds one
        ``(rank, n_predictions)`` pair per prediction, where ``rank`` is the
        1-based position of the true ``end_cluster`` in the prediction index
        or ``None`` when it was not predicted. ``n_test`` is ``len(test)``.
    """
    print(path)
    trajs = read_geolife(path)
    # NOTE(review): presumably drops points implying > 300 km/h - confirm
    # against speed_filter_abs.
    preprocessed = [
        speed_filter_abs(time_duplication_filter(traj), 300, in_kmh=True)
        for traj in trajs
    ]

    if verbose > 1:
        print('Trajectories preprocessed')

    df = make_df(preprocessed)

    if verbose > 1:
        print('Trajectories split at staypoints')

    df = cluster_into_spots(df, init_eps=300, levels=1, threshold=0.2)

    if verbose > 1:
        print('Trajectories clustered in sps')

    # Chronological split: train on the first 4/5 of the observed weeks.
    covered = df['end_date'].iloc[-1] - df['start_date'].iloc[0]
    train_weeks = int((covered.days / 7) * 4 / 5)
    train_start = df['start_date'].iloc[0]
    train_end = train_start + timedelta(days=7 * train_weeks)
    # NOTE(review): both slices include train_end, so a row indexed exactly
    # at train_end would land in train and test - confirm this is intended.
    train = df[train_start:train_end]
    test = df[train_end:]

    estimator = TimeSensitiveMostFrequentRoute()
    estimator.fit(train)
    preds = estimator.predict_proba(test)

    ranks = []
    for i in range(len(preds)):
        candidates = preds[i]
        truth = test['end_cluster'].iloc[i]
        # 1-based position of the true end cluster, None if it is absent.
        rank = next(
            (pos for pos, label in enumerate(candidates.index, start=1)
             if label == truth),
            None,
        )
        ranks.append((rank, len(candidates)))

    if verbose > 0:
        # Misses are stored as None; averaging them directly raises a
        # TypeError, so map them to NaN and ignore them in the mean.
        numeric = np.array(
            [(np.nan if rank is None else rank, total)
             for rank, total in ranks],
            dtype=float,
        )
        print('Average rank/pred', np.nanmean(numeric, axis=0))
        print('Total predictions:', len(test))

    return ranks, len(test)

예제 #2

0

파일 보기

파일: eval_process1.py 프로젝트: M20190649/fuzzy-umbrella

def prepare(path, verbose=1):
    """Load, filter and k-means-cluster the trajectories found at ``path``.

    Runs the preprocessing pipeline (duplicate-time filter, absolute speed
    filter, ``make_df``, ``kmeans_cluster_into_spots``) and prints progress
    messages when ``verbose > 1``. As shown here the function returns
    nothing; the clustered frame only lives in a local variable.

    Args:
        path: Location handed to ``read_geolife``.
        verbose: Progress messages are printed when greater than 1.
    """
    print(path)

    cleaned = [
        speed_filter_abs(time_duplication_filter(raw), 300, in_kmh=True)
        for raw in read_geolife(path)
    ]
    if verbose > 1:
        print('Trajectories preprocessed')

    frame = make_df(cleaned)
    if verbose > 1:
        print('Trajectories split at staypoints')

    frame = kmeans_cluster_into_spots(frame, 20, 100)
    if verbose > 1:
        print('Trajectories clustered in sps')

예제 #3

0

파일 보기

파일: eval_process2.py 프로젝트: M20190649/fuzzy-umbrella

def run(path, test_traj_len, predict_after_frac=100, verbose=1):
    """Rank KnownStartEstimator predictions on partial test trajectories.

    The trajectories read from ``path`` are filtered, turned into a trip
    frame by ``make_df``, clustered into spots, and split chronologically:
    roughly the first 4/5 of the covered weeks are used for fitting, the
    rest for testing. Each sufficiently long test trajectory is truncated
    and passed to the estimator.

    Args:
        path: Location handed to ``read_geolife``.
        test_traj_len: Minimum ``len()`` a test trajectory must have to be
            evaluated.
        predict_after_frac: Despite the name, this is used as a slice end:
            the estimator sees ``traj[:predict_after_frac]``.
        verbose: Progress messages are printed when greater than 1.

    Returns:
        A list with one ``(rank, n_predictions)`` pair per evaluated
        trajectory. ``rank`` is the 1-based position of the true end cluster
        in the prediction index, or ``None`` when it was not predicted. The
        pair is ``(None, None)`` when the estimator returned nothing.
    """
    print(path)
    trajs = read_geolife(path)
    preprocessed = [
        speed_filter_abs(time_duplication_filter(traj), 300, in_kmh=True)
        for traj in trajs
    ]

    if verbose > 1:
        print('Trajectories preprocessed')

    df = make_df(preprocessed)

    if verbose > 1:
        print('Trajectories split at staypoints')

    df = cluster_into_spots(df, init_eps=300, levels=0, threshold=0.2)

    if verbose > 1:
        print('Trajectories clustered in sps')

    # Chronological split: train on the first 4/5 of the observed weeks.
    covered = df['end_date'].iloc[-1] - df['start_date'].iloc[0]
    train_weeks = int((covered.days / 7) * 4 / 5)
    train_start = df['start_date'].iloc[0]
    train_end = train_start + timedelta(days=7 * train_weeks)
    train = df[train_start:train_end]
    test = df[train_end:]

    test_trajs = get_test_trajs(test, preprocessed)

    kse = KnownStartEstimator()
    kse = kse.fit(train)

    filtered_test_trajs = [
        traj for traj in test_trajs if len(traj) >= test_traj_len
    ]

    if verbose > 1:
        print(len(filtered_test_trajs), "of", len(test_trajs), "have the minimal length of", test_traj_len)

    # Keyed by the last element of each trajectory; trajectories sharing
    # that element overwrite each other, exactly as before.
    res = dict()
    for traj in filtered_test_trajs:
        res[traj[-1]] = kse.predict_proba(traj[:predict_after_frac])

    ranks = []
    for endpoint, predictions in res.items():
        # NOTE(review): the truth is resolved against `train` although the
        # endpoint comes from a test trajectory - confirm this is intended.
        truth = resolve_endcluster(train, endpoint)
        if predictions is None or len(predictions) < 1:
            ranks.append((None, None))
            continue

        for pos, label in enumerate(predictions.index, start=1):
            if label == truth:
                ranks.append((pos, len(predictions)))
                break
        else:
            # The true end cluster is not among the predictions.
            ranks.append((None, len(predictions)))

    return ranks

예제 #4

0

파일 보기

def run(path, verbose=1):
    """Rank FrequentistEstimator predictions per test day for one data path.

    The trajectories read from ``path`` are filtered, turned into a trip
    frame by ``make_df``, clustered agglomeratively, and split
    chronologically (about the first 4/5 of the covered weeks for fitting).
    For every distinct consecutive (month, day) in the test index the
    estimator is queried with one fixed location and time of day, and each
    trip of that calendar day is ranked within the returned predictions.

    Args:
        path: Location handed to ``read_geolife``.
        verbose: ``> 0`` prints a summary, ``> 1`` progress, ``> 2`` details
            for every query and trip.

    Returns:
        ``None`` when fewer than 30 trajectories were read. Otherwise a
        tuple ``(ranks, n_test, counter_no_prob)``: ``ranks`` is a list of
        ``(rank, n_predictions)`` pairs, ``n_test`` is ``len(test)`` and
        ``counter_no_prob`` counts trips whose (start, end) cluster pair had
        no prediction.
    """
    print(path)
    trajs = read_geolife(path)
    if len(trajs) < 30:
        return None
    preprocessed = [
        speed_filter_abs(time_duplication_filter(traj), 300, in_kmh=True)
        for traj in trajs
    ]

    if verbose > 1:
        print('Trajectories preprocessed')

    df = make_df(preprocessed)

    if verbose > 1:
        print('Trajectories split at staypoints')

    df = agglomerative_cluster_into_spots(df, 20, 100)

    if verbose > 1:
        print('Trajectories clustered in sps')

    # Chronological split: train on the first 4/5 of the observed weeks.
    covered = df['end_date'].iloc[-1] - df['start_date'].iloc[0]
    train_weeks = int((covered.days / 7) * 4 / 5)
    train_start = df['start_date'].iloc[0]
    train_end = train_start + timedelta(days=7 * train_weeks)
    train = df[train_start:train_end]
    test = df[train_end:]

    # Collapse consecutive test rows that fall on the same calendar day.
    # The previous `len(dates) > 1` guard skipped the comparison while the
    # list held a single entry, so the first day could be added twice and
    # then evaluated twice.
    dates = []
    for stamp in test.index:
        day_key = (stamp.month, stamp.day)
        if dates and dates[-1] == day_key:
            continue
        dates.append(day_key)

    bwe = FrequentistEstimator()
    bwe = bwe.fit(train)

    counter_no_prob = 0
    ranks = []

    for month, day in dates:
        # One fixed query per day: hard-coded coordinates, year and time.
        x = pd.DataFrame(
            data=[[49.475752, 8.482531]],
            index=pd.DatetimeIndex(
                [pd.Timestamp("2008-{}-{} 19:45:21".format(month, day))]),
            columns=['lat', 'lon'])
        if verbose > 2:
            print(x)
        pred = bwe.predict_proba(x)

        # Most probable (start, end) pair first.
        sorted_pred = sorted(pred.items(),
                             key=operator.itemgetter(1),
                             reverse=True)

        # NOTE(review): rows are selected from the full `df`, not `test`, so
        # training rows with the same month/day are ranked too - confirm.
        same_day = (df.index.month == month) & (df.index.day == day)
        for _, row in df.loc[same_day].iterrows():
            pair = (row['start_cluster'], row['end_cluster'])
            if verbose > 2:
                print(pair)
            if pair in pred:
                if verbose > 2:
                    print(pred[pair])
                for j, s_pred in enumerate(sorted_pred):
                    if s_pred[0] == pair:
                        if verbose > 2:
                            print('Ranked:', j + 1, 'of total', len(pred),
                                  'predictions')
                        ranks.append((j + 1, len(pred)))
            else:
                if verbose > 2:
                    print("no prob")
                counter_no_prob += 1

    if verbose > 0:
        print('Average rank/pred', np.mean(ranks, axis=0))
        print('Number of no prob:', counter_no_prob)
        print('Total predictions:', len(test))

    return ranks, len(test), counter_no_prob

예제 #5

0

파일 보기

파일: eval_process5.py 프로젝트: M20190649/fuzzy-umbrella

def run(path, verbose=1):
    """Rank BayesDepartureTimeEstimator predictions for one data path.

    The trajectories read from ``path`` are filtered, turned into a trip
    frame by ``make_df``, clustered into spots, and split chronologically
    (about the first 4/5 of the covered weeks for fitting). The training
    part goes through ``DenseDepartureTimes(0.03).fit_transform`` before the
    estimator is fitted. Each test row is then queried by its start
    coordinates, and every predicted key whose resolved time window
    contains the row's timestamp is ranked.

    Args:
        path: Location handed to ``read_geolife``.
        verbose: ``> 0`` prints a summary, ``> 1`` also prints progress and
            each rank.

    Returns:
        A tuple ``(ranks, n_test, counter_prob)``: ``ranks`` is a list of
        ``(rank, n_predictions)`` pairs, one per matching window;
        ``n_test`` is ``len(test)``; ``counter_prob`` is the number of
        matches recorded.
    """
    print(path)
    trajs = read_geolife(path)
    preprocessed = [
        speed_filter_abs(time_duplication_filter(traj), 300, in_kmh=True)
        for traj in trajs
    ]

    if verbose > 1:
        print('Trajectories preprocessed')

    df = make_df(preprocessed)

    if verbose > 1:
        print('Trajectories split at staypoints')

    df = cluster_into_spots(df, init_eps=300, levels=1, threshold=0.2)

    if verbose > 1:
        print('Trajectories clustered in sps')

    # Chronological split: train on the first 4/5 of the observed weeks.
    covered = df['end_date'].iloc[-1] - df['start_date'].iloc[0]
    train_weeks = int((covered.days / 7) * 4 / 5)
    train_start = df['start_date'].iloc[0]
    train_end = train_start + timedelta(days=7 * train_weeks)
    train = df[train_start:train_end]
    test = df[train_end:]

    ddf = DenseDepartureTimes(0.03)
    # Work on a copy so the transform cannot modify `train`.
    train_ddt = ddf.fit_transform(train.copy())
    bdte = BayesDepartureTimeEstimator()
    bdte = bdte.fit(train_ddt)

    counter_prob = 0
    ranks = []

    for stamp, row in test.iterrows():
        # Single-row query frame indexed by the trip's own timestamp.
        x = pd.DataFrame(data=[[row['start_lat'], row['start_lon']]],
                         index=[stamp],
                         columns=['lat', 'lon'])
        pred = bdte.predict_proba(x)
        # Most probable key first.
        sorted_pred = sorted(pred.items(),
                             key=operator.itemgetter(1),
                             reverse=True)
        for key in pred.keys():
            time_wa, time_wb = bdte.resolve_start_time_cluster(key)
            # Only rank keys whose time window contains this trip's time.
            # NOTE(review): a row can match several windows and then adds
            # several entries - confirm this is intended.
            if len(x.between_time(time_wa, time_wb)) == 1:
                for j, s_pred in enumerate(sorted_pred):
                    if s_pred[0] == key:
                        if verbose > 1:
                            print('Ranked:', j + 1, 'of total', len(pred),
                                  'predictions')
                        # Previously the rank was only printed, so the
                        # returned list was always empty.
                        ranks.append((j + 1, len(pred)))
                        counter_prob += 1

    if verbose > 0:
        print('Average rank/pred', np.mean(ranks, axis=0))
        # The label says "no prob", but the value counts successful matches.
        print('Number of no prob:', counter_prob)
        print('Total predictions:', len(test))

    return ranks, len(test), counter_prob