# Load the five-city PM2.5 CSVs and run the per-season train/dev/test split.
# NOTE(review): this chunk relies on names defined elsewhere in the file
# (pd, utils, pipeline, preprocessing, lm, CITY, seasons, create_proxies,
# anchor, proxies, rho) — confirm against the full file.
DATA_PATH = "data"
files = [
    'BeijingPM20100101_20151231.csv',
    'GuangzhouPM20100101_20151231.csv',
    'ShenyangPM20100101_20151231.csv',
    'ChengduPM20100101_20151231.csv',
    'ShanghaiPM20100101_20151231.csv'
]
# Read every city's CSV; CITY (defined elsewhere) selects which one to model.
dfs = [pd.read_csv(f"{DATA_PATH}/{f}") for f in files]
raw_df = dfs[CITY].drop('No', axis=1)  # 'No' is a row-index column, not a feature
filt_df = raw_df.dropna()              # drop rows with any missing values
df, X, y = utils.process_df(filt_df)

# Get Proxy Info for this city / season
# Pipeline: standardize features, then ordinary least squares with intercept.
lr = pipeline.Pipeline([('scaler', preprocessing.StandardScaler()),
                        ('lr', lm.LinearRegression(fit_intercept=True))])
for test_season in seasons:
    print(f"\t Season: {test_season}")
    dev_year = 2013  # presumably the held-out dev year — TODO confirm
    if create_proxies:
        data = utils.get_dev_train_test_data_anchors(
            df, X, y, test_season, dev_year, anchor, proxies, rho)
    else:
        data = utils.get_dev_train_test_data(
# NOTE(review): source chunk is truncated here — the call above continues
# beyond this view; do not edit without the remainder of the file.
# Prepare train/validation arrays for a LightGBM model from a preprocessed
# pickled DataFrame. Labels are log1p-transformed before training.
import os
import numpy as np
import pandas as pd
import lightgbm as lgb                      # NOTE(review): unused in this view — presumably used later in the file
from sklearn.model_selection import KFold   # NOTE(review): unused in this view
import hyperopt.hp as hp                    # NOTE(review): unused in this view
from hyperopt import tpe, fmin, STATUS_OK   # NOTE(review): unused in this view

import utils

# Load dataframe
df_pp = pd.read_pickle(os.getcwd() + '/data/full_df_preproc.pkl')
print('> Loaded full preprocessed dataframe')

# Feat engineering + processing
df = utils.process_df(df_pp)

# Separate train and validation dfs
# assumes df has a MultiIndex whose first level contains 'train'/'test' — TODO confirm
train_test_df = df.loc['train'].copy()
val_df = df.loc['test'].copy()
print('> Separated train+test / validation sets')

# To numpy
# Column layout assumed: col 0 = user id, cols 1..-2 = features, col -1 = label.
x_train_test = train_test_df.iloc[:, 1:-1].values
y_train_test = train_test_df.iloc[:, -1].values
user_ids = train_test_df.iloc[:, 0].astype(float).values
x_val = val_df.iloc[:, 1:-1].values  # Drop dummy labels

# Set log1p of labels
y_train_test = np.log1p(y_train_test)
def drawArrow(A, B, label, color='red'):
    """Draw an arrow from point A to point B on the module-level axes `ax`,
    annotating its midpoint with `label`.

    NOTE(review): depends on a global `ax` defined elsewhere in the file.
    A and B are indexable 2-D points (x, y).
    """
    # print('label = ',label)
    ax.arrow(A[0], A[1], B[0] - A[0], B[1] - A[1],
             head_width=0.02, length_includes_head=True,
             label=label, color=color)
    # Label sits at the arrow's midpoint.
    ax.annotate(label, ((A[0] + B[0]) / 2, (A[1] + B[1]) / 2))


# Load the short-state sequence and build id <-> point lookup tables.
df = pd.read_csv('short_states.csv')
print('df = ', df)
short_states = utils.process_df(df['state'])
seq = df['digit'].values
print('short states = ', short_states)
# for i in range(len(short_states) - 1):
#     drawArrow(short_states[i], short_states[i + 1], label=seq[i],color='blue')
plt.show()

print('-----generate dictionaries----')
# id_to_p: index -> (state tuple, corresponding entry of state_list_sm)
# p_to_id: state tuple -> index
# NOTE(review): `states_df` and `state_list_sm` are defined elsewhere in the
# file and are assumed to be parallel (same length/order) — TODO confirm.
id_to_p = {}
p_to_id = {}
for i, state in enumerate(states_df):
    id_to_p[i] = (tuple(state), state_list_sm[i])
    p_to_id[tuple(state)] = i
state_points = {}
# Build data for a folium COVID-19 US map: per-state confirmed / recovered /
# death counts from the JHU CSSE time-series CSVs, plus state GeoJSON for a
# choropleth layer.
import pandas as pd
import numpy as np
import folium
from folium.plugins import MarkerCluster, BeautifyIcon
from folium import FeatureGroup, LayerControl
import json

from utils import process_df, get_data_from_sheets

# dictionary mapping state names with abbreviation
# Fix: use a context manager so the file handle is closed (the original
# called open() inline and never closed it).
with open("st_abbrev.json", "r") as f:
    st_abbrev = json.load(f)

# url of the data
curl = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv"
rurl = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Recovered.csv"
durl = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Deaths.csv"

# read data (process_df downloads and normalizes each CSV; see utils)
curr_df = process_df(curl, st_abbrev)
recovery_df = process_df(rurl, st_abbrev)
death_df = process_df(durl, st_abbrev)

# get states coordinates for heatmap
url = 'https://raw.githubusercontent.com/python-visualization/folium/master/examples/data'
us_states = f'{url}/us-states.json'
US_Unemployment_Oct2012 = f'{url}/US_Unemployment_Oct2012.csv'
# NOTE(review): `requests` is used here but not imported in this view —
# confirm it is imported elsewhere, otherwise add `import requests` at top.
geo_json_data = json.loads(requests.get(us_states).text)

# coloring function for heatmap
# NOTE(review): `cm` (branca.colormap) is also not imported in this view —
# confirm `import branca.colormap as cm` exists elsewhere.
linear = cm.LinearColormap(['orange', 'red'], vmin=0, vmax=max(curr_df.cases))


def my_color_function(feature):
    """Map a GeoJSON state feature to a heat color.

    Looks up the state's case count in curr_df by matching the feature's
    'id' against the 'states' column, then scales it through `linear`.
    Assumes exactly one matching row exists per state id.
    """
    return linear(list(curr_df[curr_df["states"] == feature['id']].cases)[0])