def main():
    """CLI entry point: read a dataset CSV, clean it, preprocess the
    'query'/'response' text columns, and save the result.

    Usage: script.py <path> <destino_path>
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("path", type=str, help="Dataset path (*.csv)")
    parser.add_argument("destino_path", type=str, help="Path to save the file (*.csv)")
    args = parser.parse_args()

    # Local import keeps this fix self-contained even if the module
    # header does not already import os.
    import os

    # Read and clean the raw dataset.
    # NOTE(review): clean()'s positional flags ('yes'/'not') and the 700 /
    # 3 / confi=0.65 arguments are defined by clean_csv -- confirm their
    # semantics there before changing them.
    df = pd.read_csv(args.path)
    df = clean_csv(df).clean(700, 'yes', 'yes', 3, confi=0.65)
    df = df.reset_index(drop=True)

    # Preprocess both text columns; responses may contain non-string
    # values (e.g. NaN), so coerce them to str first.
    p1 = Preprocessing()
    lista_query = p1.preprocess_text(df['query'].tolist())
    lista_response = p1.preprocess_text([str(i) for i in df['response'].tolist()])

    # Replace the original columns with their preprocessed versions.
    df.drop(['query', 'response'], axis=1, inplace=True)
    df['query'] = lista_query
    df['response'] = lista_response
    df = df.reset_index(drop=True)

    # BUG FIX: use os.path.join instead of raw string concatenation,
    # which silently produced 'dirdataset_...' when destino_path had no
    # trailing separator.
    df.to_csv(os.path.join(args.destino_path, 'dataset_confidence_065__3.csv'),
              index=False)
# ---- Example 2 ----
def process_csv(filename, outfile):
    """Normalise a two-column capture file and write it back out.

    Reads *filename* as (timeStamp, rawData) rows, drops the first row
    (assumed header/garbage line), rebases timestamps so the first sample
    is t=0, centres rawData on its most common value, and writes the
    result to *outfile*.
    """
    df = pd.read_csv(filename, names=['timeStamp', 'rawData'])
    # Drop the first row and renumber from 0 (replaces the original
    # drop / reset_index / drop-'index' three-step).
    df = df.iloc[1:].reset_index(drop=True)

    # Rebase time so the first remaining sample is t=0.
    df['timeStamp'] -= df['timeStamp'][0]
    # Centre the signal on its mode.
    # BUG FIX: float(Series) raises on modern pandas because mode()
    # returns a Series; take the first mode value explicitly.
    df['rawData'] = df['rawData'] - float(df['rawData'].mode().iloc[0])

    df.to_csv(outfile, sep=',', index=False)
# ---- Example 3 ----
    def get_price_list(self, years):
        """Load closing prices and compute per-file log returns.

        For each CSV in ``self.files``: keep the Close column, take the
        trailing ``years/3`` fraction of the series, append it to
        ``self.price_list``; then append log(1 + simple returns) for each
        price series to ``self.returns``.
        """
        # Fraction of each series to keep (years=3 -> the full series).
        fraction = years / 3
        for file in self.files:
            df = pd.read_csv(file)
            # BUG FIX: DataFrame.as_matrix() was removed in pandas 1.0;
            # after dropping the other columns only Close remained, so
            # read that column directly.
            closes = df['Close'].tolist()
            # Negative slice keeps the trailing window; -0 keeps all.
            self.price_list.append(closes[-int(fraction * len(closes)):])

        for prices in self.price_list:
            # Simple return between consecutive prices, then log(1 + r).
            rets = [(prices[j + 1] - prices[j]) / prices[j]
                    for j in range(len(prices) - 1)]
            self.returns.append(np.log(1 + np.asarray(rets)))
# ---- Example 4 ----
    def load_csv(self):
        """Load and clean the behaviour CSV for this session.

        Reads ``<self.folder>/behavior/output.csv`` (UTF-16, comma
        separated), records the gamble side, converts event times to
        datetimes plus a sample-count 'Start' column (20 kHz), fills the
        sparse probability column per bin, and normalises event names.

        Returns:
            pandas.DataFrame with columns
            ['Event Time', 'Start', 'Event', 'Probability'].
        """
        csv_file = self.folder + '/behavior/output.csv'
        # skiprows=[1] drops the sub-header row directly below the header.
        csv = pd.read_csv(csv_file,
                          delimiter=',',
                          encoding='utf-16',
                          header=0,
                          skiprows=[1])
        csv.columns = ['Event Time', 'Event', 'Probability', 'Side']

        # The gamble side appears once in the 'Side' column as a string
        # containing 'RIGHT' or 'LEFT'.
        gamble_string = csv.loc[csv['Side'].notnull(), 'Side'].values[0]
        if 'RIGHT' in gamble_string:
            self.gamble_side = 'right'
        if 'LEFT' in gamble_string:
            self.gamble_side = 'left'

        # Side column is no longer needed once gamble_side is recorded.
        csv.drop('Side', axis=1, inplace=True)

        # Parse event times into datetimes.
        csv['Event Time'] = csv['Event Time'].apply(self.convert_to_datetime)

        # Convert elapsed time to sample counts at 20 kHz and insert as
        # the 'Start' column right after 'Event Time'.
        # (Removed the unused 'start_dateteime' local from the original.)
        delta = csv['Event Time'] - csv.loc[0, 'Event Time']
        csv.insert(1, 'Start',
                   (delta.dt.total_seconds() * 20000).astype('uint64'))

        # ===== clean up probability column =====
        # Indices where the (sparsely populated) probability changes; the
        # session is assumed to contain exactly three probability bins --
        # fewer change points raise IndexError, exactly as before.
        prob = csv.loc[csv['Probability'].notnull(), 'Probability']
        prob_change = np.where(prob.values[:-1] != prob.values[1:])[0]
        prob_change_idx = prob.iloc[prob_change].index.values
        prob_change_idx = np.append(prob_change_idx, prob.index[-1])
        # Map each of the three bins onto its probability value.
        # (Also fixes the original's duplicated 'stop = stop =' typo.)
        bin_starts = [0, prob_change_idx[0] + 1, prob_change_idx[1] + 1]
        bin_stops = [prob_change_idx[0], prob_change_idx[1],
                     prob_change_idx[2]]
        for start, stop in zip(bin_starts, bin_stops):
            self.match_probability(csv, start, stop)
        # Fill the trailing rows after the last change point with the
        # last known probability.
        nan = np.where(csv['Probability'].isnull())[0]
        csv.loc[nan[0]:, 'Probability'] = csv.loc[nan[0] - 1, 'Probability']

        # Normalise raw event names to short canonical labels; events not
        # in the mapping are kept unchanged.
        replace = dict()
        replace['TIstarts'] = 'start'
        replace['IND-CUE_pres_start'] = 'cue'
        replace['SOUND_start'] = 'sound'
        replace['resp-time-window_start'] = 'openloop'
        replace['right_rewarded'] = 'right_rw'
        replace['right_NOreward'] = 'right_norw'
        replace['left_rewarded'] = 'left_rw'
        replace['left_NOreward'] = 'left_norw'
        replace['no response in time'] = 'no response in time'
        replace['ITIstarts'] = 'iti'
        replace['ITIends'] = 'end'
        replace['start'] = 'session start'
        replace['end'] = 'session end'
        csv['Event'] = csv['Event'].apply(
            lambda event: replace[event] if event in replace.keys() else event)

        return csv