def main():
    """CLI entry point: clean a Q/A dataset CSV, preprocess its text columns,
    and write the result next to the given destination directory.

    Positional args (argparse):
        path          -- input dataset CSV.
        destino_path  -- directory in which the output CSV is written.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("path", type=str, help="Dataset path (*.csv)")
    parser.add_argument("destino_path", type=str, help="Path to save the file (*.csv)")
    args = parser.parse_args()

    # Read and clean the dataset. The positional/keyword values here are
    # project-specific knobs of clean_csv.clean (length cap, flags,
    # confidence threshold) — semantics defined elsewhere; kept as-is.
    csv = pd.read_csv(args.path)
    csv = clean_csv(csv).clean(700, 'yes', 'yes', 3, confi=0.65)
    csv = csv.reset_index(drop=True)

    # Preprocess both text columns; responses may contain non-string
    # values, so they are stringified first.
    p1 = Preprocessing()
    lista_query = p1.preprocess_text(csv['query'].tolist())
    lista_response = p1.preprocess_text([str(i) for i in csv['response'].tolist()])

    # Replace the original columns with their preprocessed versions
    # (re-added at the end of the frame).
    csv.drop(['query', 'response'], axis=1, inplace=True)
    csv['query'] = lista_query
    csv['response'] = lista_response
    csv = csv.reset_index(drop=True)

    # BUG FIX: the original concatenated strings (destino_path + name),
    # which silently produces a wrong path when destino_path lacks a
    # trailing separator; os.path.join handles both cases.
    import os
    csv.to_csv(os.path.join(args.destino_path, 'dataset_confidence_065__3.csv'),
               index=False)
def process_csv(filename, outfile):
    """Normalize a two-column (timestamp, raw value) CSV and write it out.

    The input file's first row is treated as a header and discarded;
    timestamps are rebased to start at 0 and the raw data is centered on
    its modal value.

    Parameters:
        filename -- path of the input CSV.
        outfile  -- path the normalized CSV is written to (no index column).
    """
    csv = pd.read_csv(filename, names=['timeStamp', 'rawData'])
    # With names= the original header line becomes data row 0 — drop it.
    csv.drop(csv.index[0], inplace=True)
    csv.reset_index(drop=True, inplace=True)

    # BUG FIX: the absorbed header row left both columns as object dtype,
    # which breaks the subtractions below — coerce to numeric explicitly.
    csv['timeStamp'] = pd.to_numeric(csv['timeStamp'])
    csv['rawData'] = pd.to_numeric(csv['rawData'])

    # Rebase time so the series starts at 0.
    csv['timeStamp'] -= csv['timeStamp'].iloc[0]

    # BUG FIX: Series.mode() returns a Series (possibly multi-valued);
    # float(Series) raises on multi-modal data and is rejected by modern
    # pandas even for a single element. Take the first (smallest) mode.
    csv['rawData'] -= csv['rawData'].mode().iloc[0]

    csv.to_csv(outfile, sep=',', index=False)
def get_price_list(self, years):
    """Load the trailing `years`-year window of closing prices from each
    file in self.files, then compute per-series log returns.

    Side effects:
        self.price_list -- one list of closing prices appended per file
                           (last years/3 fraction of each series).
        self.returns    -- one np.ndarray of log returns appended per series.

    NOTE(review): assumes each CSV has Date/Open/High/Low/Close/Adj Close/
    Volume columns so that Close is the sole column left after the drop.
    """
    window_fraction = years / 3  # fraction of each series kept (3-year base)
    for price_file in self.files:
        frame = pd.read_csv(price_file)
        frame.drop(['Date', 'Open', 'High', 'Low', 'Adj Close', 'Volume'],
                   axis=1, inplace=True)
        # BUG FIX: DataFrame.as_matrix() was removed in pandas 1.0; the
        # original iterated matrix rows taking item[0], i.e. the single
        # remaining (Close) column — read it directly instead. Also stops
        # shadowing the builtin `list`.
        prices = frame.iloc[:, 0].tolist()
        keep = int(window_fraction * len(prices))
        # keep == 0 yields prices[-0:] == the whole list, matching the
        # original slice semantics.
        self.price_list.append(prices[-keep:])

    # Log returns: ln(1 + simple return) for each consecutive price pair.
    # NOTE(review): placed after the file loop so each series is processed
    # exactly once; nesting it inside the loop would duplicate appends.
    for series in self.price_list:
        step_returns = [
            (series[j + 1] - series[j]) / series[j]
            for j in range(len(series) - 1)
        ]
        self.returns.append(np.log(1 + np.asarray(step_returns)))
def load_csv(self):
    """Load and normalize the behavioral events CSV for this session.

    Reads <self.folder>/behavior/output.csv (UTF-16), extracts the gamble
    side, converts event times to sample counts, fills the probability
    column per trial bin, and shortens event names.

    Returns:
        pd.DataFrame with columns ['Event Time', 'Start', 'Event',
        'Probability'].

    NOTE(review): relies on self.convert_to_datetime and
    self.match_probability, defined elsewhere in this class.
    """
    csv_file = self.folder + '/behavior/output.csv'
    # Header is row 0; row 1 is skipped — presumably a units/subheader
    # row; verify against the actual file format.
    csv = pd.read_csv(csv_file, delimiter=',', encoding='utf-16', header=0, skiprows=[1])
    csv.columns = ['Event Time', 'Event', 'Probability', 'Side']
    # Determine gamble side from the first non-null 'Side' value.
    gamble_string = csv.loc[csv['Side'].notnull(), 'Side'].values[0]
    if 'RIGHT' in gamble_string:
        self.gamble_side = 'right'
    if 'LEFT' in gamble_string:
        self.gamble_side = 'left'
    # NOTE(review): if gamble_string contains neither 'RIGHT' nor 'LEFT',
    # self.gamble_side is never assigned — confirm the data guarantees one.
    # 'Side' has served its purpose; drop it.
    csv.drop('Side', axis=1, inplace=True)
    # Convert the 'Event Time' strings to datetimes.
    csv['Event Time'] = csv['Event Time'].apply(self.convert_to_datetime)
    # NOTE(review): start_dateteime (sic) is assigned but never used below.
    start_dateteime = csv.loc[0, 'Event Time']
    # Elapsed time from session start, expressed in samples
    # (x 20000 — presumably a 20 kHz sampling rate; confirm).
    delta = csv['Event Time'] - csv.loc[0, 'Event Time']
    csv.insert(1, 'Start', (delta.dt.total_seconds() * 20000).astype('uint64'))
    # ===== Clean up the probability column =====
    # Find the row indices where the (sparse) probability value changes;
    # the last non-null index is appended so three bin boundaries exist.
    prob = csv.loc[csv['Probability'].notnull(), 'Probability']
    prob_change = np.where(prob.values[:-1] != prob.values[1:])[0]
    prob_change_idx = prob.iloc[prob_change].index.values
    prob_change_idx = np.append(prob_change_idx, prob.index[-1])
    # Fill each of the three probability bins via match_probability.
    # First bin: session start up to the first change.
    start = 0
    stop = prob_change_idx[0]
    self.match_probability(csv, start, stop)
    # Second bin.
    start = prob_change_idx[0] + 1
    stop = prob_change_idx[1]
    self.match_probability(csv, start, stop)
    # Third bin.
    start = prob_change_idx[1] + 1
    # NOTE(review): duplicated assignment ('stop = stop =') is a harmless
    # typo kept verbatim.
    stop = stop = prob_change_idx[2]
    self.match_probability(csv, start, stop)
    # Forward-fill the probability into the trailing NaN rows using the
    # last known value.
    nan = np.where(csv['Probability'].isnull())[0]
    csv.loc[nan[0]:, 'Probability'] = csv.loc[nan[0] - 1, 'Probability']
    # ===== Shorten event names =====
    # Mapping from raw logger event strings to short names.
    replace = dict()
    replace['TIstarts'] = 'start'
    replace['IND-CUE_pres_start'] = 'cue'
    replace['SOUND_start'] = 'sound'
    replace['resp-time-window_start'] = 'openloop'
    replace['right_rewarded'] = 'right_rw'
    replace['right_NOreward'] = 'right_norw'
    replace['left_rewarded'] = 'left_rw'
    replace['left_NOreward'] = 'left_norw'
    # NOTE(review): this literal was line-wrapped in the source dump;
    # the value was reconstructed by joining the fragments — verify.
    replace['no response in time'] = 'no response in time'
    replace['ITIstarts'] = 'iti'
    replace['ITIends'] = 'end'
    replace['start'] = 'session start'
    replace['end'] = 'session end'
    # Apply the mapping; unknown event names pass through unchanged.
    csv['Event'] = csv['Event'].apply(
        lambda event: replace[event] if event in replace.keys() else event)
    return csv