# Load the five-city PM2.5 CSVs and run the per-season train/dev/test split.
# NOTE(review): this chunk relies on names defined elsewhere in the file
# (pd, utils, pipeline, preprocessing, lm, CITY, seasons, create_proxies,
# anchor, proxies, rho) — confirm against the full file.
DATA_PATH = "data"
files = [
    'BeijingPM20100101_20151231.csv',
    'GuangzhouPM20100101_20151231.csv',
    'ShenyangPM20100101_20151231.csv',
    'ChengduPM20100101_20151231.csv',
    'ShanghaiPM20100101_20151231.csv'
]
# Read every city's CSV; CITY (defined elsewhere) selects which one to model.
dfs = [pd.read_csv(f"{DATA_PATH}/{f}") for f in files]
raw_df = dfs[CITY].drop('No', axis=1)  # 'No' is a row-index column, not a feature
filt_df = raw_df.dropna()              # drop rows with any missing values
df, X, y = utils.process_df(filt_df)

# Get Proxy Info for this city / season
# Pipeline: standardize features, then ordinary least squares with intercept.
lr = pipeline.Pipeline([('scaler', preprocessing.StandardScaler()),
                        ('lr', lm.LinearRegression(fit_intercept=True))])
for test_season in seasons:
    print(f"\t Season: {test_season}")
    dev_year = 2013  # presumably the held-out dev year — TODO confirm
    if create_proxies:
        data = utils.get_dev_train_test_data_anchors(
            df, X, y, test_season, dev_year, anchor, proxies, rho)
    else:
        data = utils.get_dev_train_test_data(
# NOTE(review): source chunk is truncated here — the call above continues
# beyond this view; do not edit without the remainder of the file.
# Prepare train/validation arrays for a LightGBM model from a preprocessed
# pickled DataFrame. Labels are log1p-transformed before training.
import os
import numpy as np
import pandas as pd
import lightgbm as lgb                      # NOTE(review): unused in this view — presumably used later in the file
from sklearn.model_selection import KFold   # NOTE(review): unused in this view
import hyperopt.hp as hp                    # NOTE(review): unused in this view
from hyperopt import tpe, fmin, STATUS_OK   # NOTE(review): unused in this view

import utils

# Load dataframe
df_pp = pd.read_pickle(os.getcwd() + '/data/full_df_preproc.pkl')
print('> Loaded full preprocessed dataframe')

# Feat engineering + processing
df = utils.process_df(df_pp)

# Separate train and validation dfs
# assumes df has a MultiIndex whose first level contains 'train'/'test' — TODO confirm
train_test_df = df.loc['train'].copy()
val_df = df.loc['test'].copy()
print('> Separated train+test / validation sets')

# To numpy
# Column layout assumed: col 0 = user id, cols 1..-2 = features, col -1 = label.
x_train_test = train_test_df.iloc[:, 1:-1].values
y_train_test = train_test_df.iloc[:, -1].values
user_ids = train_test_df.iloc[:, 0].astype(float).values
x_val = val_df.iloc[:, 1:-1].values  # Drop dummy labels

# Set log1p of labels
y_train_test = np.log1p(y_train_test)
def drawArrow(A, B, label, color='red'):
    """Draw an arrow from point A to point B on the module-level axes `ax`,
    annotating its midpoint with `label`.

    NOTE(review): depends on a global `ax` defined elsewhere in the file.
    A and B are indexable 2-D points (x, y).
    """
    # print('label = ',label)
    ax.arrow(A[0], A[1], B[0] - A[0], B[1] - A[1],
             head_width=0.02, length_includes_head=True,
             label=label, color=color)
    # Label sits at the arrow's midpoint.
    ax.annotate(label, ((A[0] + B[0]) / 2, (A[1] + B[1]) / 2))


# Load the short-state sequence and build id <-> point lookup tables.
df = pd.read_csv('short_states.csv')
print('df = ', df)
short_states = utils.process_df(df['state'])
seq = df['digit'].values
print('short states = ', short_states)
# for i in range(len(short_states) - 1):
#     drawArrow(short_states[i], short_states[i + 1], label=seq[i],color='blue')
plt.show()

print('-----generate dictionaries----')
# id_to_p: index -> (state tuple, corresponding entry of state_list_sm)
# p_to_id: state tuple -> index
# NOTE(review): `states_df` and `state_list_sm` are defined elsewhere in the
# file and are assumed to be parallel (same length/order) — TODO confirm.
id_to_p = {}
p_to_id = {}
for i, state in enumerate(states_df):
    id_to_p[i] = (tuple(state), state_list_sm[i])
    p_to_id[tuple(state)] = i
state_points = {}
# Build data for a folium COVID-19 US map: per-state confirmed / recovered /
# death counts from the JHU CSSE time-series CSVs, plus state GeoJSON for a
# choropleth layer.
import pandas as pd
import numpy as np
import folium
from folium.plugins import MarkerCluster, BeautifyIcon
from folium import FeatureGroup, LayerControl
import json

from utils import process_df, get_data_from_sheets

# dictionary mapping state names with abbreviation
# Fix: use a context manager so the file handle is closed (the original
# called open() inline and never closed it).
with open("st_abbrev.json", "r") as f:
    st_abbrev = json.load(f)

# url of the data
curl = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv"
rurl = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Recovered.csv"
durl = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Deaths.csv"

# read data (process_df downloads and normalizes each CSV; see utils)
curr_df = process_df(curl, st_abbrev)
recovery_df = process_df(rurl, st_abbrev)
death_df = process_df(durl, st_abbrev)

# get states coordinates for heatmap
url = 'https://raw.githubusercontent.com/python-visualization/folium/master/examples/data'
us_states = f'{url}/us-states.json'
US_Unemployment_Oct2012 = f'{url}/US_Unemployment_Oct2012.csv'
# NOTE(review): `requests` is used here but not imported in this view —
# confirm it is imported elsewhere, otherwise add `import requests` at top.
geo_json_data = json.loads(requests.get(us_states).text)

# coloring function for heatmap
# NOTE(review): `cm` (branca.colormap) is also not imported in this view —
# confirm `import branca.colormap as cm` exists elsewhere.
linear = cm.LinearColormap(['orange', 'red'], vmin=0, vmax=max(curr_df.cases))


def my_color_function(feature):
    """Map a GeoJSON state feature to a heat color.

    Looks up the state's case count in curr_df by matching the feature's
    'id' against the 'states' column, then scales it through `linear`.
    Assumes exactly one matching row exists per state id.
    """
    return linear(list(curr_df[curr_df["states"] == feature['id']].cases)[0])