def read_data(self, file_path):
    """Read a header-less ``label,text`` CSV into a DataFrame.

    Rows with any missing value are discarded and the labels are shifted
    down by one.

    Args:
        file_path: path (or buffer) accepted by ``pandas.read_csv``.

    Returns:
        DataFrame with the columns ``labels`` and ``text``.
    """
    column_names = ["labels", "text"]
    frame = pd.read_csv(
        file_path,
        sep=",",
        header=None,
        names=column_names,
        index_col=False,
    )
    frame.dropna(inplace=True)
    # Shift the labels down by one.
    frame["labels"] -= 1
    return frame
def val(model, data, label):
    """Print the accuracy of a binary classifier on the last 253 rows of ``data``.

    Technical-indicator columns are appended to ``data``, rows containing
    missing values are dropped, and each of the final 253 rows is scaled with
    the module-level ``sc`` and fed through ``model``. The sigmoid of the
    model output is thresholded at 0.1 ... 0.9 and, for every threshold, the
    accuracy against ``label['label']`` is printed.

    Args:
        model: torch module called once per row with a ``(1, n_features)``
            float tensor; it must return a single value (one logit).
        data: DataFrame consumed by the indicator helpers (``SMA``, ``MOM``,
            ...). The first three indicator columns are written into the
            caller's frame before ``data`` is rebound to a new object.
        label: DataFrame with a ``label`` column holding one entry per
            evaluated row.

    Returns:
        None. Results are printed.
    """
    # Construct new features
    data['sma_10'] = pd.DataFrame(SMA(data, timeperiod=10))
    data['mom_10'] = pd.DataFrame(MOM(data, 10))
    data['wma_10'] = pd.DataFrame(WMA(data, 10))
    data = pd.concat(
        [data, STOCHF(data, fastk_period=14, fastd_period=3)], axis=1)
    data['macd'] = pd.DataFrame(
        MACD(data, fastperiod=12, slowperiod=26)['macd'])
    data['rsi'] = pd.DataFrame(RSI(data, timeperiod=14))
    data['willr'] = pd.DataFrame(WILLR(data, timeperiod=14))
    data['cci'] = pd.DataFrame(CCI(data, timeperiod=14))
    data['pct_change_20'] = ROC(data, timeperiod=20)
    data['pct_change_30'] = ROC(data, timeperiod=30)
    data['pct_change_60'] = ROC(data, timeperiod=60)
    data.dropna(inplace=True)
    data = data[-253:]

    # The model is never trained here, so eval mode is set once.
    model.eval()
    predictions = []
    with torch.no_grad():
        for row in data.values:
            scaled = sc.transform(row.reshape(1, -1))
            inputs = torch.tensor(scaled).float().to(device)
            # .cpu() is required before leaving torch when `device` is CUDA.
            predictions.append(torch.sigmoid(model(inputs)).cpu().item())

    # Build the column in one go; element-wise `df[col].iloc[i] = ...` is
    # chained assignment and does not update the frame under copy-on-write.
    pred_price = pd.DataFrame({'pred_label': predictions}, index=data.index)

    print("max(pred_price['pred_label']):", max(pred_price['pred_label']))
    for thrsh in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
        column = 'pred_label_{}'.format(thrsh)
        pred_price[column] = (pred_price['pred_label'] > thrsh).astype(int)
        accuracy = np.mean(pred_price[column].values == label['label'].values)
        print("thresh:{} ,{}".format(thrsh, accuracy))
def val(model, data, day, label):
    """Walk forward over the most recent rows, predicting and updating online.

    Technical-indicator columns are appended to ``data`` and rows with
    missing values are dropped. For each of the last ``253 + day`` rows the
    model predicts one value; from the second row on, the model also takes
    one optimisation step on the pair (previous row, scaled column 3 of the
    current row). The sign of the day-to-day change in the predictions is
    then compared with ``label['label']`` and the accuracy is printed.

    Relies on the module-level ``sc``, ``device``, ``optimizer`` and
    ``loss_function``.

    Args:
        model: torch module called with a ``(1, n_features)`` float tensor;
            it must return a single value.
        data: DataFrame consumed by the indicator helpers. The first three
            indicator columns are written into the caller's frame before
            ``data`` is rebound to a new object.
        day: number of extra leading rows to run through the model; their
            direction labels are discarded before scoring.
        label: DataFrame with a ``label`` column, one entry per scored row.

    Returns:
        None. The accuracy is printed.
    """
    # Construct new features
    data['sma_10'] = pd.DataFrame(SMA(data, timeperiod=10))
    data['mom_10'] = pd.DataFrame(MOM(data, 10))
    data['wma_10'] = pd.DataFrame(WMA(data, 10))
    data = pd.concat(
        [data, STOCHF(data, fastk_period=14, fastd_period=3)], axis=1)
    data['macd'] = pd.DataFrame(
        MACD(data, fastperiod=12, slowperiod=26)['macd'])
    data['rsi'] = pd.DataFrame(RSI(data, timeperiod=14))
    data['willr'] = pd.DataFrame(WILLR(data, timeperiod=14))
    data['cci'] = pd.DataFrame(CCI(data, timeperiod=14))
    data['pct_change_20'] = ROC(data, timeperiod=20)
    data['pct_change_30'] = ROC(data, timeperiod=30)
    data['pct_change_60'] = ROC(data, timeperiod=60)
    data.dropna(inplace=True)
    data = data[-253 - day:]

    rows = data.values
    predictions = []
    for ind, row in enumerate(rows):
        # eval() must be re-applied every iteration because the update step
        # below switches the model back to train mode.
        model.eval()
        with torch.no_grad():
            # Scale the input exactly like the online-update samples below;
            # previously the raw, unscaled row was fed to the model.
            inputs = sc.transform(row.reshape(1, -1))
            inputs = torch.tensor(inputs).float().to(device)
            # .cpu() is required before leaving torch when `device` is CUDA.
            predictions.append(model(inputs).cpu().item())

        # x at T includes the true price at T-1, so we have a new pair (x,y) sample to update model.
        # Note that it is not a leak.
        if ind > 0:
            model.train()
            new_x = sc.transform(rows[ind - 1].reshape(1, -1))
            new_x = torch.tensor(new_x).float().to(device)
            new_y = sc.transform(rows[ind].reshape(1, -1))[0, 3]
            new_y = torch.tensor(new_y).float().to(device)
            optimizer.zero_grad()
            y1 = model(new_x)
            loss = loss_function(y1, new_y)
            loss.backward()
            optimizer.step()

    # Build the column in one go; element-wise `df[col].iloc[i] = ...` is
    # chained assignment and does not update the frame under copy-on-write.
    pred_price = pd.DataFrame({'pred_T+1_price': predictions}, index=data.index)

    # 1 when the prediction rose versus the previous row, else 0. The first
    # `day` rows get no label and are removed by dropna().
    direction = (pred_price['pred_T+1_price'].diff(1) > 0).astype(int)
    pred_price['label'] = direction[day:]
    pred_price = pred_price.dropna()
    accuracy = np.mean(pred_price['label'].values == label['label'].values)
    print(accuracy)
def read_data(self, file_path):
    """Read a header-less ``label,title,description`` CSV.

    Title and description are joined with ``". "`` into a single ``text``
    column, labels are shifted down by one, and rows with any missing value
    are discarded.

    Args:
        file_path: path (or buffer) accepted by ``pandas.read_csv``.

    Returns:
        DataFrame with the columns ``labels`` and ``text``.
    """
    frame = pd.read_csv(
        file_path,
        sep=",",
        header=None,
        names=["labels", "title", "description"],
        index_col=False,
    )
    # pop() removes the source columns while handing back their contents.
    title = frame.pop("title")
    description = frame.pop("description")
    frame["text"] = title + ". " + description
    frame["labels"] -= 1
    frame.dropna(inplace=True)
    return frame
def __init__(self, path, preprocess=False, preprocess_fn=process_str, delim=',',
             binarize_sent=False, drop_unlabeled=False, text_key='sentence',
             label_key='label'):
    """Load texts and labels from a delimited file.

    Args:
        path: location of the file.
        preprocess: when true, every text is passed through ``preprocess_fn``.
        preprocess_fn: callable invoked as ``fn(s, maxlen=None, encode=None)``.
        delim: column separator.
        binarize_sent: when true, labels become 1 where
            ``label / max(label) > .5`` and 0 otherwise.
        drop_unlabeled: when true, a ``Sentiment`` column is also read, kept
            on ``self.Sentiment`` and rows without a sentiment are removed;
            otherwise missing cells are filled with 0.
        text_key: name of the text column.
        label_key: name of the label column.
    """
    self.path, self.delim = path, delim
    self.text_key, self.label_key = text_key, label_key
    self.drop_unlabeled = drop_unlabeled

    if not drop_unlabeled:
        frame = pd.read_csv(path, sep=delim, usecols=[text_key, label_key])
        frame = frame.fillna(value=0)
    else:
        frame = pd.read_csv(path, sep=delim,
                            usecols=['Sentiment', text_key, label_key],
                            encoding='unicode_escape')
        # Captured before the rows without a sentiment are removed.
        self.Sentiment = frame['Sentiment'].values
        frame = frame.dropna(axis=0, subset=['Sentiment'])

    self.X = frame[text_key].values.tolist()
    if preprocess:
        self.X = [preprocess_fn(text, maxlen=None, encode=None) for text in self.X]

    if label_key not in frame:
        # No label column: mark every sample with -1.
        self.Y = np.ones(len(self.X)) * -1
    else:
        self.Y = frame[label_key].values
        if binarize_sent:
            scaled = self.Y / np.max(self.Y)
            self.Y = (scaled > .5).astype(int)
def load_motionsense_data(path, attributes):
    """Load the ``time`` column plus the requested attributes from a CSV.

    Rows with a missing value in any selected column are dropped and every
    attribute value is rounded to five decimal places.

    Args:
        path: CSV file to read.
        attributes: list of column names to keep alongside ``time``.

    Returns:
        DataFrame with the columns ``['time'] + attributes``.
    """
    def _round5(value):
        return round(value, 5)

    wanted = ['time'] + attributes
    frame = pd.read_csv(path)
    frame = frame[wanted].dropna()
    for name in attributes:
        frame[name] = frame[name].apply(_round5)
    return frame
def read_data(self, file_path):
    """Read a header-less ``label,question_title,question_content,best_answer`` CSV.

    Rows with any missing value are discarded first; the three text fields
    are then concatenated (no separator) into ``text`` and the labels are
    shifted down by one.

    Args:
        file_path: path (or buffer) accepted by ``pandas.read_csv``.

    Returns:
        DataFrame with the columns ``labels`` and ``text``.
    """
    text_fields = ["question_title", "question_content", "best_answer"]
    frame = pd.read_csv(
        file_path,
        sep=",",
        header=None,
        names=["labels"] + text_fields,
        index_col=False,
    )
    frame.dropna(inplace=True)

    # Concatenate the text fields left to right.
    combined = frame[text_fields[0]]
    for field in text_fields[1:]:
        combined = combined + frame[field]
    frame["text"] = combined

    frame["labels"] -= 1
    frame.drop(columns=text_fields, inplace=True)
    return frame
def __init__(self, filepath, labelpath, flag):
    """Load prices and labels, derive indicator features and keep the training arrays.

    Args:
        filepath: string, path of the price CSV. Columns at positions 1-6 are
            read as Index/open/high/low/close/volume; the first row is skipped.
        labelpath: path of the label CSV. Columns at positions 1-2 are read
            as Index/label; the first row is skipped.
        flag: string, 'train' or 'Validation', to split the whole dataset.
    """
    # filepath: string, path of data
    # flag: string, 'train' or 'Validation' , to split the whole dataset
    data = pd.read_csv(
        filepath, delimiter=',', index_col=0, usecols=(1, 2, 3, 4, 5, 6),
        names=['Index', 'open', 'high', 'low', 'close', 'volume'], skiprows=1)
    label = pd.read_csv(labelpath, index_col=0, usecols=(1, 2),
                        names=['Index', 'label'], skiprows=1)

    # Construct new features
    data['sma_10'] = pd.DataFrame(SMA(data, timeperiod=10))
    data['mom_10'] = pd.DataFrame(MOM(data, 10))
    data['wma_10'] = pd.DataFrame(WMA(data, 10))
    data = pd.concat(
        [data, STOCHF(data, fastk_period=14, fastd_period=3)], axis=1)
    data['macd'] = pd.DataFrame(
        MACD(data, fastperiod=12, slowperiod=26)['macd'])
    data['rsi'] = pd.DataFrame(RSI(data, timeperiod=14))
    data['willr'] = pd.DataFrame(WILLR(data, timeperiod=14))
    data['cci'] = pd.DataFrame(CCI(data, timeperiod=14))
    data['pct_change_20'] = ROC(data, timeperiod=20)
    data['pct_change_30'] = ROC(data, timeperiod=30)
    data['pct_change_60'] = ROC(data, timeperiod=60)
    # The indicators leave missing values in some rows; drop those rows.
    data.dropna(inplace=True)

    if flag == 'train':
        # Don't fit the MinMaxScaler with Validation set, which causes data leak.
        traindata_len = 1000
        # The scaler `sc` is fitted on the last `traindata_len` feature rows.
        train_data = data[-traindata_len:]
        x = sc.fit_transform(train_data.values)
        # Labels are taken as the last `traindata_len` entries of the label file.
        # NOTE(review): assumes both files end on the same row so that x and y line up -- confirm.
        y = label['label'][-traindata_len:].values
        # NOTE(review): x and y are only defined for flag == 'train' in the code visible here.
        self.x = x
        self.y = y
def __init__(self, path, tokenizer=None, preprocess_fn=None, delim=',', binarize_sent=False,
             drop_unlabeled=False, text_key='sentence', label_key='label', **kwargs):
    """Load texts and, when present, labels from a delimited file.

    Args:
        path: location of the file; a path containing ``.tsv`` forces a tab
            delimiter.
        tokenizer: forwarded to ``self.SetTokenizer``.
        preprocess_fn: stored on the instance; not applied here.
        delim: column separator used unless the path contains ``.tsv``.
        binarize_sent: when truthy, labels are passed through
            ``binarize_labels(..., hard=binarize_sent)``.
        drop_unlabeled: stored on the instance; not otherwise used here.
        text_key: name of the text column.
        label_key: name of the label column, or a list of such names.
        **kwargs: accepted and ignored.
    """
    self.is_lazy = False
    self.preprocess_fn = preprocess_fn
    self.SetTokenizer(tokenizer)
    self.path = path
    self.delim = delim
    self.text_key = text_key
    self.label_key = label_key
    self.drop_unlabeled = drop_unlabeled

    if '.tsv' in self.path:
        self.delim = '\t'

    self.X = []
    self.Y = []

    label_cols = label_key if isinstance(label_key, list) else [label_key]
    try:
        data = pd.read_csv(self.path, sep=self.delim,
                           usecols=[text_key] + label_cols, encoding='latin-1')
    except ValueError:
        # pandas raises ValueError when a requested column is absent; fall
        # back to the text column only. A bare `except:` here would also
        # have swallowed KeyboardInterrupt / SystemExit.
        data = pd.read_csv(self.path, sep=self.delim, usecols=[text_key],
                           encoding='latin-1')

    data = data.dropna(axis=0)

    self.X = data[text_key].values.tolist()
    try:
        self.Y = data[label_key].values
    except KeyError:
        # No label column was loaded: mark every sample with -1.
        self.Y = np.ones(len(self.X)) * -1

    if binarize_sent:
        self.Y = binarize_labels(self.Y, hard=binarize_sent)
def __init__(self, path, preprocess=False, preprocess_fn=process_str, delim=',',
             binarize_sent=False, drop_unlabeled=False, text_key='sentence',
             label_key='label', **kwargs):
    """Load texts and labels, preprocessing and caching them when required.

    Args:
        path: location of the file; a path containing ``.tsv`` forces a tab
            delimiter.
        preprocess: force preprocessing even when
            ``get_load_path_and_should_process`` reports it is not needed.
        preprocess_fn: callable invoked as ``fn(s, maxlen=None, encode=None)``.
        delim: column separator used unless the path contains ``.tsv``.
        binarize_sent: when truthy, labels are passed through
            ``binarize_labels(..., hard=binarize_sent)``.
        drop_unlabeled: stored on the instance; not otherwise used here.
        text_key: name of the text column.
        label_key: name of the label column.
        **kwargs: accepted and ignored.
    """
    self.path = path
    self.processed_path = path
    self.delim = delim
    self.text_key = text_key
    self.label_key = label_key
    self.drop_unlabeled = drop_unlabeled

    if '.tsv' in self.path:
        self.delim = '\t'

    load_path, should_process = get_load_path_and_should_process(self.path, text_key, label_key)
    should_process = should_process or preprocess

    self.X = []
    self.Y = []
    try:
        data = pd.read_csv(load_path, sep=self.delim,
                           usecols=[text_key, label_key], encoding='latin-1')
    except ValueError:
        # pandas raises ValueError when a requested column is absent; fall
        # back to the text column only. A bare `except:` here would also
        # have swallowed KeyboardInterrupt / SystemExit.
        data = pd.read_csv(load_path, sep=self.delim, usecols=[text_key],
                           encoding='latin-1')

    data = data.dropna(axis=0)

    self.X = data[text_key].values.tolist()
    if should_process:
        self.X = [preprocess_fn(s, maxlen=None, encode=None) for s in self.X]

    if label_key in data:
        self.Y = data[label_key].values
    else:
        # No label column was loaded: mark every sample with -1.
        self.Y = np.ones(len(self.X)) * -1

    if should_process:
        self.processed_path = save_preprocessed(self, text_key=text_key, label_key=label_key)
    else:
        self.processed_path = load_path

    if binarize_sent:
        self.Y = binarize_labels(self.Y, hard=binarize_sent)
def __init__(self, csv_path):
    """Read the CSV and expose each column as a NumPy array attribute."""
    super().__init__()
    frame = pd.read_csv(csv_path).dropna().reset_index(drop=True)

    def as_float(column):
        return frame[column].astype(float).values

    def as_int(column):
        return frame[column].astype(int).values

    # One month value per row, derived from the date column.
    self.months = np.array([date2month(day) for day in frame["날짜"]])

    self.avg_temperatures = as_float("평균기온")
    self.rainfalls = as_float("강수량")
    self.fine_dust = as_float("미세먼지")
    self.superfine_dust = as_float("초미세먼지")

    # A row counts as a holiday when the marker column is "1", "토" or "일".
    marker = frame["주말공휴일"]
    is_holiday = (marker == "1") | (marker == "토") | (marker == "일")
    frame["holiday"] = is_holiday.astype(int).values
    self.holiday = as_int("holiday")

    self.spring = as_int("봄")
    self.summer = as_int("여름")
    self.autumn = as_int("가을")
    # Winter is whatever the other three season flags leave over.
    other_seasons = frame["봄"] + frame["여름"] + frame["가을"]
    self.winter = (1 - other_seasons).astype(int).values

    self.social_distance = as_int("사회적거리두기")
    self.fog = as_int("안개")
    self.visitors = as_int("방문객수")
    self.len = len(frame)
correct = (y == torch.argmax(y_prime, dim=1)).sum().double() return 1.0 - correct / float(y.size(0)) # Device configuration device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') print("CUDA is available: {}".format(torch.cuda.is_available())) if torch.cuda.is_available(): print("Using device: {}".format( torch.cuda.get_device_name(torch.cuda.current_device()))) # Load data data = pd.read_csv("data/processed.csv", index_col=0) # Drop null columns data.dropna(axis='columns', how='any', inplace=True) # Isolate season 1980 data # season_1980_data = data[data.index.str.contains('198009|198010|198011|198012|198101|198102')] # data = data[~data.index.str.contains('198009|198010|198011|198012|198101|198102')] # Training data y_data = data[['win', 'tie', 'loss']] x_data = data.drop(['win', 'tie', 'loss', 'team'], axis='columns').dropna(axis='columns', how='any') # Data parameters input_size = len(x_data.columns) output_size = len(y_data.columns) # Create dataset
from torch.nn import functional as F
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import os
from torch.autograd import Variable

# Load the tab-delimited survey export.
data = pd.read_csv('Tab.delimited.Cleaned.dataset.WITH.variable.labels.csv', sep='\t', engine='python')

# mixture.txt lists the columns to keep, one name per line. The handle is
# closed as soon as the names are read instead of being leaked.
with open('mixture.txt') as file:
    labels = [line.rstrip('\n') for line in file]

data = data.loc[:, labels]  # The selected columns prediction
# Cells holding a single blank count as missing, then incomplete rows go.
data = data.replace(' ', np.nan)
#print(data.info(verbose=True))
data = data.dropna()
#print(data.info(verbose=True))
#17 rows and 49 cols

# NOTE(review): this second handle is never read or closed in the code
# visible here; it is kept in case later code iterates over `file`.
file = open('mixture.txt')
categories = ["diseaseframinga", "reciprocityothera", "reciprocityusa", "allowedforbiddena",
              "flagtimeestimate1", "flagtimeestimate2", "flagtimeestimate3", "flagtimeestimate4",
              "sex", "citizenship"]
df = pd.get_dummies(data, columns=categories)
#print(df.info(verbose=True))

# Map the textual scale end points to their numeric codes (still strings),
# applied one after another in this order.
scale_codes = [
    ("Very much", "11"),
    ("Not at all", "1"),
    ("Republican", "7"),
    ("Democrat", "1"),
    ("Conservative", "7"),
    ("Liberal", "1"),
]
for answer, code in scale_codes:
    df = df.replace(answer, code)
def preprocess(self):
    """Clean the raw pickles and write train / test / eval tensors to disk.

    Steps, in order:
      1. create the train, test and eval folders under ``self.root``
         (raises ``FileExistsError`` if one already exists);
      2. drop training rows with missing values, with a fare outside
         (0, 250) or a passenger count outside [1, 6];
      3. shift ``passenger_count`` to start at 0 and replace
         ``pickup_datetime`` by year / month / weekday / quarter-hour
         columns in both frames;
      4. keep only training rows whose coordinates fall inside the
         integer-rounded bounding box of the eval coordinates;
      5. normalise the four coordinate columns with ``self.mean`` /
         ``self.std``;
      6. write the first ``int(self.testsplit * len(data))`` rows as the
         test split and the remainder as the train split.

    Column layout relied upon after step 3: training frame = target,
    four coordinate columns, categorical columns; eval frame = four
    coordinate columns, categorical columns.
    """
    print("Preprocessing data")
    for folder in (self.train_folder, self.test_folder, self.eval_folder):
        (self.root / folder).mkdir()

    data = pd.read_pickle(self.root / self.raw_file)
    edata = pd.read_pickle(self.root / self.raw_eval)

    def add_time_features(frame):
        # Replace the pickup timestamp with categorical time features.
        stamps = frame['pickup_datetime']
        frame['year'] = stamps.map(lambda x: x.year) - self.min_year
        frame['month'] = stamps.map(lambda x: x.month) - 1
        frame['weekday'] = stamps.map(lambda x: x.weekday())
        frame['quaterhour'] = stamps.map(lambda x: x.hour * 4 + x.minute // 15)
        # `columns=` replaces the positional axis argument (`drop(name, 1)`),
        # which pandas 2.x no longer accepts.
        return frame.drop(columns='pickup_datetime')

    def save(values, dtype, folder, filename):
        torch.save(torch.tensor(values, dtype=dtype), self.root / folder / filename)

    # Filter null data
    data = data.dropna(how='any', axis='rows')
    # Filter negative fare amount
    data = data[(data['fare_amount'] > 0) & (data['fare_amount'] < 250)]
    # Filter passenger count; copy so the writes below never hit a view.
    data = data[(data['passenger_count'] <= 6) & (data['passenger_count'] >= 1)].copy()
    data['passenger_count'] -= 1
    edata['passenger_count'] -= 1

    # Convert datetime to usable data
    data = add_time_features(data)
    edata = add_time_features(edata)

    # Filter location data: keep training rows inside the eval bounding box.
    in_box = pd.Series(True, index=data.index)
    for column in ('pickup_longitude', 'dropoff_longitude',
                   'pickup_latitude', 'dropoff_latitude'):
        upper = np.ceil(edata[column].max())
        lower = np.floor(edata[column].min())
        in_box &= (data[column] <= upper) & (data[column] >= lower)
    data = data[in_box].copy()
    print(data.iloc[:, 1:5].columns)

    # Normalize data
    data.iloc[:, 1:5] = (data.iloc[:, 1:5] - self.mean) / self.std
    edata.iloc[:, 0:4] = (edata.iloc[:, 0:4] - self.mean) / self.std

    # train-test split
    t = int(self.testsplit * len(data))

    print("Writing train files")
    save(data.iloc[t:, 0].values, torch.float, self.train_folder, self.target_file)
    save(data.iloc[t:, 1:5].values, torch.float, self.train_folder, self.gps_file)
    save(data.iloc[t:, 5:].values, torch.long, self.train_folder, self.cat_file)

    print("Writing test files")
    save(data.iloc[:t, 0].values, torch.float, self.test_folder, self.target_file)
    save(data.iloc[:t, 1:5].values, torch.float, self.test_folder, self.gps_file)
    save(data.iloc[:t, 5:].values, torch.long, self.test_folder, self.cat_file)

    print("Writing eval file")
    save(edata.iloc[:, :4].values, torch.float, self.eval_folder, self.gps_file)
    save(edata.iloc[:, 4:].values, torch.long, self.eval_folder, self.cat_file)
def __get_data():
    """Build the daily feature table and next-day bloom labels.

    Reads the 2015-2018 CLA observation files, the MDCP buoy file and the
    weather file from ``../Data/``, cleans and resamples each to daily
    frequency, inner-joins them on date, and derives cyclical and rolling
    features.

    Returns:
        tuple: ``(data, labels)``. ``data`` is the date-indexed feature
        DataFrame; ``labels`` is a one-column DataFrame holding
        ``algalBloomSheen`` shifted by -1, i.e. each row carries the
        following row's value.
    """
    data_path = '../Data/'
    filenames = [
        # 'CM2014_edit.csv',
        'CM2015_edit.csv',
        'CM2016_edit.csv',
        'CM2017_edit.csv',
        'CM2018_edit.csv',
        'mdcp.csv',
        'major_ion.csv',
        'Weather_Data.csv'
    ]

    cla_raw = [pd.read_csv(data_path + name, low_memory=False) for name in filenames[:4]]
    mdcp_raw = pd.read_csv(data_path + filenames[4], low_memory=False)  # Mendota buoy
    # `on_bad_lines='skip'` replaces `error_bad_lines=False`, which was
    # removed in pandas 2.0 (requires pandas >= 1.3).
    weather_raw = pd.read_csv(data_path + filenames[6], on_bad_lines='skip', low_memory=False)

    ########### Clean Data ###########

    ########### CLA Data ###########
    keep15 = [  # features to keep for years 2015-2017
        'correct_timestamp',
        'collectionSiteId',
        'lake',
        'algalBloom',
        'algalBloomSheen',
        'turbidity',
        # 'waterTemp',
        # 'waveIntensity',
        'lat',
        'long'
    ]
    keep18 = [  # features to keep for 2018
        'sample_collection_time',
        'collectionSiteId',
        'lake',
        'algalBloom',
        'algalBloomSheen',
        'turbidity',
        # 'waterTemp',
        # 'waveIntensity',
        'latitiude',
        'longitude'
    ]
    rename15 = {  # rename features for 2015-2017
        'collectionSiteId': 'site',
        'lat': 'latitude',
        'long': 'longitude',
        'correct_timestamp': 'date'
    }
    rename18 = {  # rename features for 2018
        'collectionSiteId': 'site',
        'sample_collection_time': 'date',
        'latitiude': 'latitude'
    }
    numeric = [  # list of numeric features
        'algalBloom',
        'algalBloomSheen',
        'turbidity',
        # 'waterTemp',
        # 'waveIntensity',
        'latitude',
        'longitude'
    ]

    def _clean_cla(raw, keep, rename):
        # One year of CLA observations -> daily numeric frame, Mendota only.
        # rename() returns a new frame, so the raw slice is never mutated.
        frame = raw[keep].rename(columns=rename)
        for col in numeric:
            frame[col] = pd.to_numeric(frame[col], errors='coerce')
        for col in ('site', 'lake'):
            frame[col] = frame[col].astype(str)
        frame['date'] = pd.to_datetime(frame['date'], errors='coerce')
        # remove nans
        frame = frame.dropna(axis='rows', how='any')
        # remove any data point not on lake mendota
        frame = frame[frame['lake'].str.contains('Mendota')]
        frame = frame.set_index('date').sort_index()
        # Average only the numeric columns: older pandas silently dropped
        # the string columns here, pandas 2.x raises on them.
        frame = frame[numeric].resample('D').mean()
        frame = frame.ffill().bfill()
        # Any positive daily mean counts as a sheen observation.
        frame.loc[frame['algalBloomSheen'] > 0, 'algalBloomSheen'] = 1
        return frame

    cla_specs = [
        (cla_raw[0], keep15, rename15),
        (cla_raw[1], keep15, rename15),
        (cla_raw[2], keep15, rename15),
        (cla_raw[3], keep18, rename18),
    ]
    # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
    cla = pd.concat([_clean_cla(raw, keep, rename) for raw, keep, rename in cla_specs])

    ########### MDCP Data ###########
    mdcp_features = [
        'air_temp',
        'rel_hum',
        'wind_speed',
        'wind_dir',
        'chlor',
        'phycocyanin',
        'do_raw',
        'do_sat',
        'do_wtemp',
        'pco2_ppm',
        'par',
        'par_below'
    ]
    mdcp = mdcp_raw[['sampledate', 'sampletime'] + mdcp_features].copy()
    mdcp = mdcp.ffill().bfill()
    mdcp['date'] = pd.to_datetime(mdcp['sampledate'] + ' ' + mdcp['sampletime'],
                                  errors='coerce')
    mdcp = mdcp.dropna(axis='rows', how='any')
    mdcp = mdcp[['date'] + mdcp_features].set_index('date')
    mdcp = mdcp.resample('D').mean()
    mdcp = mdcp.ffill().bfill()

    ########### Weather Data ###########
    keep_weather = [
        'DATE',
        'REPORTTPYE',
        'DAILYMaximumDryBulbTemp',
        'DAILYMinimumDryBulbTemp',
        'DAILYAverageDryBulbTemp',
        'DAILYDeptFromNormalAverageTemp',
        'DAILYAverageDewPointTemp',
        'DAILYAverageWetBulbTemp',
        'DAILYPrecip',
        'DAILYAverageStationPressure',
        'DAILYAverageSeaLevelPressure'
    ]
    weather = weather_raw[keep_weather]
    weather = weather.iloc[:-1]  # remove last entry since it has NaN in REPORTTPYE
    # only keep summary of day (SOD) info; na=False guards against any
    # further missing report types instead of failing on the mask.
    weather = weather[weather['REPORTTPYE'].str.contains('SOD', na=False)]
    weather = weather.drop(['REPORTTPYE'], axis='columns')
    weather['DATE'] = pd.to_datetime(weather['DATE'], errors='coerce')
    weather = weather.set_index('DATE')
    weather = weather.resample('D').ffill()
    weather = weather.ffill().bfill()

    # Join CLA, MDCP, and Weather Data
    data = cla.join(mdcp, how='inner')
    data = data.join(weather, how='inner')

    # sine/cosine transformation of month of year and wind direction
    data['cos_month'] = np.cos(2 * np.pi * (data.index.month.values / 12))
    data['sin_month'] = np.sin(2 * np.pi * (data.index.month.values / 12))
    # NOTE(review): wind_dir is divided by 12 like the month; if it is
    # recorded in degrees the period should be 360 -- confirm before changing.
    data['cos_wind_dir'] = np.cos(2 * np.pi * (data['wind_dir'] / 12))
    data['sin_wind_dir'] = np.sin(2 * np.pi * (data['wind_dir'] / 12))
    data = data.drop(['wind_dir'], axis='columns')

    # Replace 'T' and 's' in 'DAILYPrecip' column: entries containing 'T'
    # become 0.01, remaining entries containing 's' become 0. Comparing on a
    # string view keeps this safe when a cell is not a string.
    precip_text = data['DAILYPrecip'].astype(str)
    has_t = precip_text.str.contains('T', regex=False)
    has_s = ~has_t & precip_text.str.contains('s', regex=False)
    data['DAILYPrecip'] = (data['DAILYPrecip'].astype(object)
                           .mask(has_t, 0.01)
                           .mask(has_s, 0))

    # Make every feature numeric (dtype check also works on an empty frame)
    for col in data.columns:
        if data[col].dtype != np.float64:
            data[col] = pd.to_numeric(data[col], errors='coerce')

    # create indicator features for whether there was rain or a bloom one day ago,
    # or within three days or a week ago
    precip = (data['DAILYPrecip'] > 0).astype(int)  # convert precipitation to boolean values
    sheen = data['algalBloomSheen']

    # The label is shifted by -1 below, so the current row's values play the
    # role of "one day ago" relative to the label being predicted.
    data['DAILYPrecip_one_day'] = precip
    data['DAILYPrecip_three_day'] = precip.rolling(window=3, min_periods=1).sum()
    data['DAILYPrecip_one_week'] = precip.rolling(window=7, min_periods=1).sum()
    data['algalBloomSheen_one_day'] = sheen
    data['algalBloomSheen_three_day'] = sheen.rolling(3, min_periods=1).sum()
    data['algalBloomSheen_one_week'] = sheen.rolling(7, min_periods=1).sum()

    # shift algalbloomsheen by -1 to predict next day algal bloom
    data['algalBloomSheen'] = sheen.shift(-1)
    data.dropna(axis='rows', how='any', inplace=True)

    labels = data[['algalBloomSheen']]
    data = data.drop(['latitude', 'longitude', 'algalBloom', 'algalBloomSheen'],
                     axis='columns')
    return data, labels