Example #1
    def read_data(self, file_path):
        data = pd.read_csv(file_path,
                           header=None,
                           sep=",",
                           names=["labels", "text"],
                           index_col=False)
        data.dropna(inplace=True)
        data["labels"] = data["labels"] - 1  # shift 1-based labels to 0-based
        return data
def val(model, data, label):

    # Construct new features
    data['sma_10'] = pd.DataFrame(SMA(data, timeperiod=10))
    data['mom_10'] = pd.DataFrame(MOM(data, 10))
    data['wma_10'] = pd.DataFrame(WMA(data, 10))
    data = pd.concat(
        [data, STOCHF(data, fastk_period=14, fastd_period=3)], axis=1)

    data['macd'] = pd.DataFrame(
        MACD(data, fastperiod=12, slowperiod=26)['macd'])
    data['rsi'] = pd.DataFrame(RSI(data, timeperiod=14))
    data['willr'] = pd.DataFrame(WILLR(data, timeperiod=14))
    data['cci'] = pd.DataFrame(CCI(data, timeperiod=14))

    data['pct_change_20'] = ROC(data, timeperiod=20)
    data['pct_change_30'] = ROC(data, timeperiod=30)
    data['pct_change_60'] = ROC(data, timeperiod=60)
    data.dropna(inplace=True)

    data = data[-253:]  # keep roughly the last trading year (~253 sessions)
    pred_price = pd.DataFrame([], index=data.index, columns=['pred_label'])

    for ind, x in enumerate(data.values):
        model.eval()
        with torch.no_grad():
            x = sc.transform(x.reshape(1, -1))
            x = torch.tensor(x).float().to(device)
            # move to CPU before converting to NumPy (no-op when already on CPU)
            pred_y = torch.sigmoid(model(x)).cpu().numpy()

            # positional write avoids pandas chained-assignment pitfalls
            pred_price.iloc[ind, 0] = pred_y.item()

        # The features at time T include the true price from T-1, so each new
        # bar yields a fresh (x, y) pair for an online model update.
        # Note that this is not a look-ahead leak.
        # if(ind>0):
        #     model.train()
        #     new_x = data.values[ind-1]
        #     new_x = sc.transform(new_x.reshape(1,-1))
        #     new_x = torch.tensor(new_x).float().to(device)

        #     new_y = sc.transform((data.values[ind]).reshape(1,-1))[0,3]
        #     new_y = torch.tensor(new_y).float().to(device)

        #     optimizer.zero_grad()
        #     y1 = model(new_x)
        #     loss = loss_function(y1,new_y)
        #     loss.backward()
        #     optimizer.step()

    print("max(pred_price['pred_label']):", max(pred_price['pred_label']))
    for thrsh in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
        pred_price['pred_label_{}'.format(thrsh)] = pred_price[
            'pred_label'].apply(lambda x: 1 if x > thrsh else 0)

        accuracy = np.mean(pred_price['pred_label_{}'.format(thrsh)].values ==
                           label['label'].values)
        print("thresh:{} ,{}".format(thrsh, accuracy))
Example #3
def val(model, data, day, label):

    # Construct new features
    data['sma_10'] = pd.DataFrame(SMA(data, timeperiod=10))
    data['mom_10'] = pd.DataFrame(MOM(data, 10))
    data['wma_10'] = pd.DataFrame(WMA(data, 10))
    data = pd.concat(
        [data, STOCHF(data, fastk_period=14, fastd_period=3)], axis=1)

    data['macd'] = pd.DataFrame(
        MACD(data, fastperiod=12, slowperiod=26)['macd'])
    data['rsi'] = pd.DataFrame(RSI(data, timeperiod=14))
    data['willr'] = pd.DataFrame(WILLR(data, timeperiod=14))
    data['cci'] = pd.DataFrame(CCI(data, timeperiod=14))

    data['pct_change_20'] = ROC(data, timeperiod=20)
    data['pct_change_30'] = ROC(data, timeperiod=30)
    data['pct_change_60'] = ROC(data, timeperiod=60)
    data.dropna(inplace=True)

    data = data[-253 - day:]  # last trading year (~253 sessions) plus `day` extra rows
    pred_price = pd.DataFrame([], index=data.index, columns=['pred_T+1_price'])
    temp = np.array([])

    for ind, x in enumerate(data.values):
        model.eval()
        with torch.no_grad():

            # scale with the fitted scaler, matching the online updates below
            # (the original snippet fed the model unscaled rows here)
            x = sc.transform(x.reshape(1, -1))
            x = torch.tensor(x).float().to(device)
            pred_y = model(x).cpu().numpy()

            pred_price.iloc[ind, 0] = pred_y.item()

        # The features at time T include the true price from T-1, so each new
        # bar yields a fresh (x, y) pair for an online model update.
        # Note that this is not a look-ahead leak.
        if ind > 0:
            model.train()
            new_x = data.values[ind - 1]
            new_x = sc.transform(new_x.reshape(1, -1))
            new_x = torch.tensor(new_x).float().to(device)

            new_y = sc.transform((data.values[ind]).reshape(1, -1))[0, 3]
            new_y = torch.tensor(new_y).float().to(device)

            optimizer.zero_grad()
            y1 = model(new_x)
            loss = loss_function(y1, new_y)
            loss.backward()
            optimizer.step()

    # a positive day-over-day change in predicted price becomes label 1, else 0
    pred_price['label'] = pred_price['pred_T+1_price'].diff(1).apply(
        lambda x: 1 if x > 0 else 0)[day:]
    pred_price = pred_price.dropna()

    accuracy = np.mean(pred_price['label'].values == label['label'].values)
    print("accuracy:", accuracy)
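For clarity, the labeling rule at the end (diff, then threshold at zero) on a toy series:

import pandas as pd

pred = pd.Series([10.0, 10.5, 10.2, 10.8])
# a positive day-over-day change becomes label 1, anything else 0
labels = pred.diff(1).apply(lambda x: 1 if x > 0 else 0)
print(labels.tolist())  # [0, 1, 0, 1] -- the first diff is NaN, so it maps to 0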
Example #4
    def read_data(self, file_path):
        data = pd.read_csv(file_path,
                           header=None,
                           sep=",",
                           names=["labels", "title", "description"],
                           index_col=False)
        data["text"] = data["title"] + ". " + data["description"]
        data["labels"] = data["labels"] - 1  # shift 1-based labels to 0-based
        data.drop(columns=["title", "description"], inplace=True)
        data.dropna(inplace=True)
        return data
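This reader expects a headerless, comma-separated file whose first column is a 1-based class label followed by title and description (the AG News layout, for instance). A usage sketch on a made-up in-memory sample:

import io
import pandas as pd

sample = io.StringIO('3,"Wall St. slides","Short-sellers return."\n'
                     '4,"New handset","A phone maker unveiled a device."\n')
data = pd.read_csv(sample, header=None, sep=",",
                   names=["labels", "title", "description"], index_col=False)
data["text"] = data["title"] + ". " + data["description"]
data["labels"] = data["labels"] - 1
print(data[["labels", "text"]])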
Example #5
	def __init__(self, path, preprocess=False, preprocess_fn=process_str, delim=',',
				binarize_sent=False, drop_unlabeled=False, text_key='sentence', label_key='label'):
		self.path = path
		self.delim = delim
		self.text_key = text_key
		self.label_key = label_key
		self.drop_unlabeled = drop_unlabeled

		if drop_unlabeled:
			data = pd.read_csv(path, sep=delim, usecols=['Sentiment', text_key, label_key],
				encoding='unicode_escape')
			self.Sentiment = data['Sentiment'].values
			data = data.dropna(axis=0, subset=['Sentiment'])
		else:
			data = pd.read_csv(path, sep=delim, usecols=[text_key, label_key])

		data = data.fillna(value=0)

		self.X = data[text_key].values.tolist()
		if preprocess:
			self.X = [preprocess_fn(s, maxlen=None, encode=None) for s in self.X]
		if label_key in data:
			self.Y = data[label_key].values
			if binarize_sent:
				self.Y = ((self.Y/np.max(self.Y)) > .5).astype(int)
		else:
			self.Y = np.ones(len(self.X))*-1
Example #6
def load_motionsense_data(path, attributes):
    times = ['time']
    data = pd.read_csv(path)[times + attributes]
    data = data.dropna()
    for attr in attributes:
        # round each requested attribute to 5 decimal places
        data[attr] = data[attr].round(5)
    return data
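A usage sketch with a hypothetical path and attribute names; the function only assumes a 'time' column plus whatever attributes are requested:

# hypothetical file and column names, for illustration only
df = load_motionsense_data('data/motionsense/sub_1.csv',
                           ['userAcceleration.x', 'userAcceleration.y'])
print(df.head())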
Example #7
    def read_data(self, file_path):
        data = pd.read_csv(file_path,
                           header=None,
                           sep=",",
                           names=[
                               "labels", "question_title", "question_content",
                               "best_answer"
                           ],
                           index_col=False)
        data.dropna(inplace=True)
        data["text"] = (data["question_title"] + data["question_content"] +
                        data["best_answer"])
        data["labels"] = data["labels"] - 1  # shift 1-based labels to 0-based
        data.drop(
            columns=["question_title", "question_content", "best_answer"],
            inplace=True)
        return data

    def __init__(self, filepath, labelpath, flag):
        # filepath: string, path of the data file
        # flag: string, 'train' or 'validation', selects the dataset split

        data = pd.read_csv(
            filepath,
            delimiter=',',
            index_col=0,
            usecols=(1, 2, 3, 4, 5, 6),
            names=['Index', 'open', 'high', 'low', 'close', 'volume'],
            skiprows=1)
        label = pd.read_csv(labelpath,
                            index_col=0,
                            usecols=(1, 2),
                            names=['Index', 'label'],
                            skiprows=1)
        # Construct new features
        data['sma_10'] = pd.DataFrame(SMA(data, timeperiod=10))
        data['mom_10'] = pd.DataFrame(MOM(data, 10))
        data['wma_10'] = pd.DataFrame(WMA(data, 10))
        data = pd.concat(
            [data, STOCHF(data, fastk_period=14, fastd_period=3)], axis=1)

        data['macd'] = pd.DataFrame(
            MACD(data, fastperiod=12, slowperiod=26)['macd'])
        data['rsi'] = pd.DataFrame(RSI(data, timeperiod=14))
        data['willr'] = pd.DataFrame(WILLR(data, timeperiod=14))
        data['cci'] = pd.DataFrame(CCI(data, timeperiod=14))

        data['pct_change_20'] = ROC(data, timeperiod=20)
        data['pct_change_30'] = ROC(data, timeperiod=30)
        data['pct_change_60'] = ROC(data, timeperiod=60)
        data.dropna(inplace=True)

        if flag == 'train':
            # Don't fit the MinMaxScaler on the validation set; fitting on
            # validation rows would leak their statistics into the scaling.
            traindata_len = 1000
            train_data = data[-traindata_len:]
            x = sc.fit_transform(train_data.values)

            y = label['label'][-traindata_len:].values
        else:
            # Validation split: transform only, reusing the scaler already
            # fitted on the training rows. (The original snippet omits this
            # branch; the slice below is an assumption.)
            x = sc.transform(data.values)
            y = label['label'][-len(data):].values

        self.x = x
        self.y = y
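The comment above generalizes: fit scaling statistics on the training rows only and reuse the fitted scaler everywhere else. A minimal sketch of the pattern on stand-in data:

import numpy as np
from sklearn.preprocessing import MinMaxScaler

features = np.random.rand(1253, 17)            # stand-in for the engineered features
train_rows, val_rows = features[:1000], features[1000:]

sc = MinMaxScaler()
x_train = sc.fit_transform(train_rows)  # statistics come from training rows only
x_val = sc.transform(val_rows)          # validation rows are only transformed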
Example #9
File: datasets.py  Project: yrchen92/CoDIR
    def __init__(self,
                 path,
                 tokenizer=None,
                 preprocess_fn=None,
                 delim=',',
                 binarize_sent=False,
                 drop_unlabeled=False,
                 text_key='sentence',
                 label_key='label',
                 **kwargs):
        self.is_lazy = False
        self.preprocess_fn = preprocess_fn
        self.SetTokenizer(tokenizer)
        self.path = path
        self.delim = delim
        self.text_key = text_key
        self.label_key = label_key
        self.drop_unlabeled = drop_unlabeled

        if '.tsv' in self.path:
            self.delim = '\t'

        self.X = []
        self.Y = []
        try:
            cols = [text_key]
            if isinstance(label_key, list):
                cols += label_key
            else:
                cols += [label_key]
            data = pd.read_csv(self.path,
                               sep=self.delim,
                               usecols=cols,
                               encoding='latin-1')
        except ValueError:  # label column(s) missing from the file
            data = pd.read_csv(self.path,
                               sep=self.delim,
                               usecols=[text_key],
                               encoding='latin-1')

        data = data.dropna(axis=0)

        self.X = data[text_key].values.tolist()
        try:
            self.Y = data[label_key].values
        except KeyError:
            # no labels available; use -1 as the "unlabeled" marker
            self.Y = np.ones(len(self.X)) * -1

        if binarize_sent:
            self.Y = binarize_labels(self.Y, hard=binarize_sent)
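binarize_labels is defined elsewhere in the project; a plausible stand-in modeled on the inline rule in Example #5 (scale by the max label and threshold at 0.5), offered as an assumption rather than the project's definition:

import numpy as np

def binarize_labels(y, hard=True):
    # hypothetical stand-in mirroring Example #5's inline binarization
    y = np.asarray(y, dtype=float)
    return ((y / np.max(y)) > .5).astype(int)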
Example #10
    def __init__(self, path, preprocess=False, preprocess_fn=process_str, delim=',',
                binarize_sent=False, drop_unlabeled=False, text_key='sentence', label_key='label',
                **kwargs):
        self.processed_path = self.path = path
        self.delim = delim
        self.text_key = text_key
        self.label_key = label_key
        self.drop_unlabeled = drop_unlabeled

        if '.tsv' in self.path:
            self.delim = '\t'

        load_path, should_process = get_load_path_and_should_process(self.path, text_key, label_key)
        should_process = should_process or preprocess

        self.X = []
        self.Y = []
#        if drop_unlabeled:
#            data = pd.read_csv(load_path, sep=delim, usecols=['Sentiment', text_key, label_key],
#                encoding='unicode_escape')
#            self.Sentiment = data['Sentiment'].values
#            data = data.dropna(axis=0, subset=['Sentiment'])
#        else:
#            data = pd.read_csv(load_path, sep=delim, usecols=[text_key, label_key])

#        data = data.fillna(value=-1)
        try:
            data = pd.read_csv(load_path, sep=self.delim, usecols=[text_key, label_key], encoding='latin-1')
        except ValueError:  # label column missing from the file
            data = pd.read_csv(load_path, sep=self.delim, usecols=[text_key], encoding='latin-1')

        data = data.dropna(axis=0)

        self.X = data[text_key].values.tolist()
        if should_process:
            self.X = [preprocess_fn(s, maxlen=None, encode=None) for s in self.X]
        if label_key in data:
            self.Y = data[label_key].values
        else:
            self.Y = np.ones(len(self.X))*-1

        if should_process:
            self.processed_path = save_preprocessed(self, text_key=text_key, label_key=label_key)
        else:
            self.processed_path = load_path

        if binarize_sent:
            self.Y = binarize_labels(self.Y, hard=binarize_sent)
Example #11
    def __init__(self, csv_path):
        """Parse and store data."""
        super().__init__()
        # Column names are Korean: 날짜=date, 평균기온=mean temperature,
        # 강수량=rainfall, 미세먼지=fine dust, 초미세먼지=ultrafine dust,
        # 주말공휴일=weekend/holiday, 봄/여름/가을=spring/summer/autumn,
        # 사회적거리두기=social distancing, 안개=fog, 방문객수=visitor count.
        data = pd.read_csv(csv_path)
        data = data.dropna().reset_index(drop=True)
        self.months = np.array(list(map(date2month, data["날짜"])))
        self.avg_temperatures = data["평균기온"].astype(float).values
        self.rainfalls = data["강수량"].astype(float).values
        self.fine_dust = data["미세먼지"].astype(float).values
        self.superfine_dust = data["초미세먼지"].astype(float).values
        # holiday if the weekend/holiday column is "1", "토" (Sat), or "일" (Sun)
        data["holiday"] = ((data["주말공휴일"] == "1") | (data["주말공휴일"] == "토") |
                           (data["주말공휴일"] == "일")).astype(int).values
        self.holiday = data["holiday"].astype(int).values
        self.spring = data["봄"].astype(int).values
        self.summer = data["여름"].astype(int).values
        self.autumn = data["가을"].astype(int).values
        self.winter = (
            1 - (data["봄"] + data["여름"] + data["가을"])).astype(int).values
        self.social_distance = data["사회적거리두기"].astype(int).values
        self.fog = data["안개"].astype(int).values
        self.visitors = data["방문객수"].astype(int).values
        self.len = len(data)
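date2month is not shown in the snippet; a plausible stand-in, assuming the 날짜 (date) column holds ISO-style strings such as '2021-03-15':

def date2month(date_str):
    # assumed helper: pull the month number out of 'YYYY-MM-DD'
    return int(date_str.split("-")[1])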
Example #12
        correct = (y == torch.argmax(y_prime, dim=1)).sum().double()
        return 1.0 - correct / float(y.size(0))


# Module-level setup (imports shown here for the excerpt's completeness)
import pandas as pd
import torch

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("CUDA is available: {}".format(torch.cuda.is_available()))
if torch.cuda.is_available():
    print("Using device: {}".format(
        torch.cuda.get_device_name(torch.cuda.current_device())))

# Load data
data = pd.read_csv("data/processed.csv", index_col=0)

# Drop null columns
data.dropna(axis='columns', how='any', inplace=True)

# Isolate season 1980 data
# season_1980_data = data[data.index.str.contains('198009|198010|198011|198012|198101|198102')]
# data = data[~data.index.str.contains('198009|198010|198011|198012|198101|198102')]

# Training data
y_data = data[['win', 'tie', 'loss']]
x_data = data.drop(['win', 'tie', 'loss', 'team'],
                   axis='columns').dropna(axis='columns', how='any')

# Data parameters
input_size = len(x_data.columns)
output_size = len(y_data.columns)

# Create dataset
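The snippet stops at the dataset-creation comment; a minimal sketch of what plausibly follows, using torch.utils.data.TensorDataset (an assumption, not necessarily the author's code):

from torch.utils.data import TensorDataset, DataLoader

x_tensor = torch.tensor(x_data.values, dtype=torch.float)
y_tensor = torch.tensor(y_data.values, dtype=torch.float)
dataset = TensorDataset(x_tensor, y_tensor)
loader = DataLoader(dataset, batch_size=64, shuffle=True)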
Example #13
import numpy as np
import pandas as pd
from torch.nn import functional as F
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import os
from torch.autograd import Variable
data = pd.read_csv('Tab.delimited.Cleaned.dataset.WITH.variable.labels.csv', sep='\t', engine='python')
# read the list of columns to keep, one name per line
with open('mixture.txt') as file:
    labels = [line.rstrip('\n') for line in file]

data = data.loc[:, labels]  # keep only the selected columns
data = data.replace(' ', np.nan)
# print(data.info(verbose=True))
data = data.dropna()
# print(data.info(verbose=True))
# 17 rows and 49 cols remain
categories=["diseaseframinga","reciprocityothera","reciprocityusa","allowedforbiddena","flagtimeestimate1",
            "flagtimeestimate2","flagtimeestimate3","flagtimeestimate4","sex","citizenship"]
df=pd.get_dummies(data,columns=categories)
#print(df.info(verbose=True))

df=df.replace("Very much","11")
#rint(train.loc[:,"flagsupplement1"])
df=df.replace("Not at all","1")
df=df.replace("Republican","7")
df=df.replace("Democrat","1")
df=df.replace("Conservative","7")
df=df.replace("Liberal","1")
Example #14
    def preprocess(self):
        print("Preprocessing data")
        (self.root / self.train_folder).mkdir()
        (self.root / self.test_folder).mkdir()
        (self.root / self.eval_folder).mkdir()

        data = pd.read_pickle(self.root / self.raw_file)
        edata = pd.read_pickle(self.root / self.raw_eval)

        # Filter null data
        data = data.dropna(how='any', axis='rows')

        # Filter negative fare amount
        data = data[data['fare_amount'] > 0]
        data = data[data['fare_amount'] < 250]

        # Filter passenger count
        data = data[(data['passenger_count'] <= 6) &
                    (data['passenger_count'] >= 1)]
        data['passenger_count'] -= 1
        edata['passenger_count'] -= 1

        # Convert datetime to usable data
        data['year'] = (data['pickup_datetime'].map(lambda x: x.year)
                        - self.min_year)
        data['month'] = data['pickup_datetime'].map(lambda x: x.month) - 1
        data['weekday'] = data['pickup_datetime'].map(lambda x: x.weekday())
        data['quarterhour'] = data['pickup_datetime'].map(
            lambda x: x.hour*4 + x.minute//15
        )
        data.drop(columns='pickup_datetime', inplace=True)

        edata['year'] = (edata['pickup_datetime'].map(lambda x: x.year)
                         - self.min_year)
        edata['month'] = edata['pickup_datetime'].map(lambda x: x.month) - 1
        edata['weekday'] = edata['pickup_datetime'].map(lambda x: x.weekday())
        edata['quarterhour'] = edata['pickup_datetime'].map(
            lambda x: x.hour*4 + x.minute//15
        )
        edata.drop(columns='pickup_datetime', inplace=True)

        # Filter location data
        lx = data['pickup_longitude']
        ly = data['dropoff_longitude']
        px = data['pickup_latitude']
        py = data['dropoff_latitude']

        elx = edata['pickup_longitude']
        ely = edata['dropoff_longitude']
        epx = edata['pickup_latitude']
        epy = edata['dropoff_latitude']

        data = data[
            (lx <= np.ceil(elx.max())) &
            (lx >= np.floor(elx.min())) &
            (ly <= np.ceil(ely.max())) &
            (ly >= np.floor(ely.min())) &
            (px <= np.ceil(epx.max())) &
            (px >= np.floor(epx.min())) &
            (py <= np.ceil(epy.max())) &
            (py >= np.floor(epy.min()))
        ]

        print(data.iloc[:, 1:5].columns)

        # Normalize data
        data.iloc[:, 1:5] = (data.iloc[:, 1:5] - self.mean) / self.std
        edata.iloc[:, 0:4] = (edata.iloc[:, 0:4] - self.mean) / self.std

        # train-test split
        t = int(self.testsplit * len(data))

        print("Writing train files")
        torch.save(
            torch.tensor(data.iloc[t:, 0].values, dtype=torch.float),
            self.root / self.train_folder / self.target_file
        )
        torch.save(
            torch.tensor(data.iloc[t:, 1:5].values, dtype=torch.float),
            self.root / self.train_folder / self.gps_file
        )
        torch.save(
            torch.tensor(data.iloc[t:, 5:].values, dtype=torch.long),
            self.root / self.train_folder / self.cat_file
        )

        print("Writing test files")
        torch.save(
            torch.tensor(data.iloc[:t, 0].values, dtype=torch.float),
            self.root / self.test_folder / self.target_file
        )
        torch.save(
            torch.tensor(data.iloc[:t, 1:5].values, dtype=torch.float),
            self.root / self.test_folder / self.gps_file
        )
        torch.save(
            torch.tensor(data.iloc[:t, 5:].values, dtype=torch.long),
            self.root / self.test_folder / self.cat_file
        )

        print("Writing eval file")
        torch.save(
            torch.tensor(edata.iloc[:, :4].values, dtype=torch.float),
            self.root / self.eval_folder / self.gps_file
        )
        torch.save(
            torch.tensor(edata.iloc[:, 4:].values, dtype=torch.long),
            self.root / self.eval_folder / self.cat_file
        )
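A sketch of reading the saved tensors back for training; the folder and file names below are hypothetical stand-ins for the class attributes used above:

import torch
from pathlib import Path
from torch.utils.data import TensorDataset, DataLoader

root = Path('data')  # hypothetical root / train_folder / file names
target = torch.load(root / 'train' / 'target.pt')
gps = torch.load(root / 'train' / 'gps.pt')
cat = torch.load(root / 'train' / 'cat.pt')
loader = DataLoader(TensorDataset(gps, cat, target),
                    batch_size=256, shuffle=True)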
Example #15
    def __get_data(self):
        data_path = '../Data/'

        filenames = [
        #     'CM2014_edit.csv',
            'CM2015_edit.csv',
            'CM2016_edit.csv',
            'CM2017_edit.csv',
            'CM2018_edit.csv',
            'mdcp.csv',
            'major_ion.csv',
            'Weather_Data.csv'
        ]

        # cla_2014 = pd.read_csv(data_path + filenames[0], low_memory=False)
        cla_2015_raw = pd.read_csv(data_path + filenames[0], low_memory=False)
        cla_2016_raw = pd.read_csv(data_path + filenames[1], low_memory=False)
        cla_2017_raw = pd.read_csv(data_path + filenames[2], low_memory=False)
        cla_2018_raw = pd.read_csv(data_path + filenames[3], low_memory=False)
        mdcp_raw = pd.read_csv(data_path + filenames[4], low_memory=False)    # Mendota buoy
        # on_bad_lines='skip' replaces the removed error_bad_lines=False (pandas >= 1.3)
        weather_raw = pd.read_csv(data_path + filenames[6], on_bad_lines='skip', low_memory=False)

        ########### Clean Data ###########
        ########### CLA Data ###########

        keep15 = [     # features to keep for years 2015-2017
            'correct_timestamp',
            'collectionSiteId',
            'lake',
            'algalBloom',
            'algalBloomSheen',
            'turbidity',
        #     'waterTemp',
        #     'waveIntensity',
            'lat',
            'long'
        ]

        keep18 = [    # features to keep for 2018
            'sample_collection_time',
            'collectionSiteId',
            'lake',
            'algalBloom',
            'algalBloomSheen',
            'turbidity',
        #     'waterTemp',
        #     'waveIntensity',
            'latitiude',   # (sic) the 2018 source file misspells this column
            'longitude'
        ]

        rename15 = {   # rename features for 2015-2017
            'collectionSiteId': 'site',
            'lat': 'latitude',
            'long': 'longitude',
            'correct_timestamp': 'date'
        }

        rename18 = {   # rename features for 2018
            'collectionSiteId': 'site',
            'sample_collection_time': 'date',
            'latitiude': 'latitude'
        }

        cla_2015 = cla_2015_raw[keep15]
        cla_2016 = cla_2016_raw[keep15]
        cla_2017 = cla_2017_raw[keep15]
        cla_2018 = cla_2018_raw[keep18]

        cla_2015.rename(rename15, axis='columns', inplace=True)
        cla_2016.rename(rename15, axis='columns', inplace=True)
        cla_2017.rename(rename15, axis='columns', inplace=True)
        cla_2018.rename(rename18, axis='columns', inplace=True)

        # change data types
        numeric = [    # list of numeric features
            'algalBloom',
            'algalBloomSheen',
            'turbidity',
        #     'waterTemp',
        #     'waveIntensity',
            'latitude',
            'longitude'
        ]

        # convert data types
        features = cla_2015.columns.values

        for feat in features:
            if feat in numeric:
                cla_2015[feat] = pd.to_numeric(cla_2015[feat], errors='coerce')
                cla_2016[feat] = pd.to_numeric(cla_2016[feat], errors='coerce')
                cla_2017[feat] = pd.to_numeric(cla_2017[feat], errors='coerce')
                cla_2018[feat] = pd.to_numeric(cla_2018[feat], errors='coerce')

            if feat in ['site', 'lake']:
                cla_2015[feat] = cla_2015[feat].astype(str)
                cla_2016[feat] = cla_2016[feat].astype(str)
                cla_2017[feat] = cla_2017[feat].astype(str)
                cla_2018[feat] = cla_2018[feat].astype(str)

            if feat == 'date':
                cla_2015[feat] = pd.to_datetime(cla_2015[feat], errors='coerce')
                cla_2016[feat] = pd.to_datetime(cla_2016[feat], errors='coerce')
                cla_2017[feat] = pd.to_datetime(cla_2017[feat], errors='coerce')
                cla_2018[feat] = pd.to_datetime(cla_2018[feat], errors='coerce')

        # remove nans
        cla_2015.dropna(axis='rows', how='any', inplace=True)
        cla_2016.dropna(axis='rows', how='any', inplace=True)
        cla_2017.dropna(axis='rows', how='any', inplace=True)
        cla_2018.dropna(axis='rows', how='any', inplace=True)

        # remove any data point not on lake mendota
        cla_2015 = cla_2015[cla_2015['lake'].str.contains('Mendota')]
        cla_2016 = cla_2016[cla_2016['lake'].str.contains('Mendota')]
        cla_2017 = cla_2017[cla_2017['lake'].str.contains('Mendota')]
        cla_2018 = cla_2018[cla_2018['lake'].str.contains('Mendota')]

        # set date as index
        cla_2015.set_index('date', inplace=True)
        cla_2016.set_index('date', inplace=True)
        cla_2017.set_index('date', inplace=True)
        cla_2018.set_index('date', inplace=True)

        # sort data by dates
        cla_2015.sort_values(by='date', inplace=True)
        cla_2016.sort_values(by='date', inplace=True)
        cla_2017.sort_values(by='date', inplace=True)
        cla_2018.sort_values(by='date', inplace=True)

        # resample each year to daily frequency, ffill and bfill the gaps,
        # then binarize the sheen flag (any positive daily mean counts as a bloom day)
        daily = []
        for cla in (cla_2015, cla_2016, cla_2017, cla_2018):
            # numeric_only: the string 'site'/'lake' columns cannot be averaged
            cla = cla.resample('D').mean(numeric_only=True)
            cla.ffill(inplace=True)
            cla.bfill(inplace=True)
            cla.loc[cla['algalBloomSheen'] > 0, 'algalBloomSheen'] = 1
            daily.append(cla)
        cla_2015, cla_2016, cla_2017, cla_2018 = daily

        # only keep the dates of the official sampling season of each year
        # cla_2015 = cla_2015[(cla_2015.index >= '2015-5-18') & (cla_2015.index <= '2015-9-4')]
        # cla_2016 = cla_2016[(cla_2016.index >= '2016-5-25') & (cla_2016.index <= '2016-9-4')]
        # cla_2017 = cla_2017[(cla_2017.index >= '2017-5-25') & (cla_2017.index <= '2017-9-4')]
        # cla_2018 = cla_2018[(cla_2018.index >= '2018-5-25') & (cla_2018.index <= '2018-9-4')]

        ########### MDCP Data ###########

        keep_mdcp = [
            'sampledate',
            'sampletime',
            'air_temp',
            'rel_hum',
            'wind_speed',
            'wind_dir',
            'chlor',
            'phycocyanin',
            'do_raw',
            'do_sat',
            'do_wtemp',
            'pco2_ppm',
            'par',
            'par_below'
        ]

        mdcp = mdcp_raw[keep_mdcp].copy()  # copy so ffill/bfill don't hit a view
        mdcp.ffill(inplace=True)
        mdcp.bfill(inplace=True)

        mdcp['date'] = mdcp['sampledate'] + ' ' + mdcp['sampletime']
        mdcp['date'] = pd.to_datetime(mdcp['date'], errors='coerce')
        mdcp.dropna(axis='rows', how='any', inplace=True)

        mdcp = mdcp[[
            'date',
            'air_temp',
            'rel_hum',
            'wind_speed',
            'wind_dir',
            'chlor',
            'phycocyanin',
            'do_raw',
            'do_sat',
            'do_wtemp',
            'pco2_ppm',
            'par',
            'par_below'
        ]]
        mdcp.set_index('date', inplace=True)

        mdcp = mdcp.resample('D').mean()
        mdcp.ffill(inplace=True)
        mdcp.bfill(inplace=True)

        ########### Weather Data ###########

        keep_weather = [
            'DATE',
            'REPORTTPYE',   # (sic) misspelled in the source file
            'DAILYMaximumDryBulbTemp',
            'DAILYMinimumDryBulbTemp',
            'DAILYAverageDryBulbTemp',
            'DAILYDeptFromNormalAverageTemp',
            'DAILYAverageDewPointTemp',
            'DAILYAverageWetBulbTemp',
            'DAILYPrecip',
            'DAILYAverageStationPressure',
            'DAILYAverageSeaLevelPressure'
        ]

        weather = weather_raw[keep_weather]
        # weather['REPORTTPYE'].dropna(axis='rows', how='any', inplace=True)
        weather = weather.iloc[:-1]  # remove last entry since it has NaN in REPORTTPYE

        weather = weather[weather['REPORTTPYE'].str.contains('SOD')]    # only keep summary of day (SOD) info
        weather = weather.drop(['REPORTTPYE'], axis='columns')
        weather['DATE'] = pd.to_datetime(weather['DATE'], errors='coerce')

        weather.set_index('DATE', inplace=True)
        weather = weather.resample('D').ffill()
        weather.ffill(inplace=True)
        weather.bfill(inplace=True)

        # Join CLA, MDCP, and Weather Data

        # Append the per-year CLA frames
        # (DataFrame.append was removed in pandas 2.0; use pd.concat instead)
        cla = pd.concat([cla_2015, cla_2016, cla_2017, cla_2018])

        # Insert MDCP data
        data = cla.join(mdcp, how='inner')

        # Insert weather data
        data = data.join(weather, how='inner')

        # sine/cosine transformation of month of year and wind direction
        data['cos_month'] = np.cos(2 * np.pi * (data.index.month.values / 12))
        data['sin_month'] = np.sin(2 * np.pi * (data.index.month.values / 12))

        # wind direction is measured in degrees, so its period is 360
        # (the original reused the month's /12 divisor, an apparent copy-paste slip)
        data['cos_wind_dir'] = np.cos(2 * np.pi * (data['wind_dir'] / 360))
        data['sin_wind_dir'] = np.sin(2 * np.pi * (data['wind_dir'] / 360))
        data = data.drop(['wind_dir'], axis='columns')

        # Replace 'T' (trace) and 's'-suffixed (suspect) readings in 'DAILYPrecip'
        for date in data.index:
            if 'T' in data.loc[date, 'DAILYPrecip']:
                data.loc[date, 'DAILYPrecip'] = 0.01
            elif 's' in data.loc[date, 'DAILYPrecip']:
                data.loc[date, 'DAILYPrecip'] = 0

        # Make every feature numeric
        for col in data.columns.values:
            if type(data[col].values[0]) != np.float64:
                data[col] = pd.to_numeric(data[col], errors='coerce')

        # indicator features: whether there was rain or a bloom one day ago,
        # within the last three days, or within the last week
        precip = (data['DAILYPrecip'] > 0).astype(int)   # convert precipitation to boolean values
        # data['DAILYPrecip_one_day'] = precip.shift(1)
        # data['DAILYPrecip_three_day'] = precip.rolling(window=3, min_periods=1).sum()    # NOTE THAT THIS IS DEPENDENT ON CURRENT VALUE
        # data['DAILYPrecip_one_week'] = precip.rolling(window=7, min_periods=1).sum()

        # data['algalBloomSheen_one_day'] = data['algalBloomSheen'].shift(1)
        # data['algalBloomSheen_three_day'] = data[['algalBloomSheen']].shift(1).rolling(3).sum()
        # data['algalBloomSheen_one_week'] = data[['algalBloomSheen']].shift(1).rolling(7).sum()

        # shift algalbloomsheen by -1 to predict next day algal bloom
        data['DAILYPrecip_one_day'] = precip
        data['DAILYPrecip_three_day'] = precip.rolling(window=3, min_periods=1).sum()    # NOTE THAT THIS IS DEPENDENT ON CURRENT VALUE
        data['DAILYPrecip_one_week'] = precip.rolling(window=7, min_periods=1).sum()
        data['algalBloomSheen_one_day'] = data['algalBloomSheen']
        data['algalBloomSheen_three_day'] = data[['algalBloomSheen']].rolling(3, min_periods=1).sum()
        data['algalBloomSheen_one_week'] = data[['algalBloomSheen']].rolling(7, min_periods=1).sum()
        data['algalBloomSheen'] = data['algalBloomSheen'].shift(-1)

        data.dropna(axis='rows', how='any', inplace=True)

        # display(data[['DAILYPrecip',
        #       'DAILYPrecip_one_day',
        #       'DAILYPrecip_three_day',
        #       'DAILYPrecip_one_week',
        #       'algalBloomSheen',
        #       'algalBloomSheen_one_day',
        #       'algalBloomSheen_three_day',
        #       'algalBloomSheen_one_week'
        #      ]].head(15))

        labels = data[['algalBloomSheen']]
        data = data.drop(['latitude', 'longitude', 'algalBloom', 'algalBloomSheen'], axis='columns')

        return data, labels
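Downstream, a chronological split keeps future rows out of training, matching the leak-avoidance theme above; a short sketch on the frames this method returns (the 80/20 split point is arbitrary):

# data, labels: the frames returned by __get_data
split = int(0.8 * len(data))
x_train, x_test = data.iloc[:split], data.iloc[split:]
y_train, y_test = labels.iloc[:split], labels.iloc[split:]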