def getArrivalRates(infile1, infile2, infile3, Origin, slotInMinutes=5, windowInMinutes=20, numWindows=9, Destination=None):
    '''
    ------------------------------------------------------------------------------------------------------
    Returns the arrival rates using data from past days.
    infile1, infile2, and infile3 are CSV files containing trip info on similar days.
    For NYC data: Feb 07, Feb 14, and Feb 21, same hour intervals.
    ------------------------------------------------------------------------------------------------------
    '''
    dDict1, head1 = readCSV(infile1)  # read data on previous days
    dDict2, head2 = readCSV(infile2)
    dDict3, head3 = readCSV(infile3)
    lamSlots1, lamMint1 = getLamPerRegionMLE(dDict1, Origin, slotInMinutes, windowInMinutes, numWindows, Destination)
    lamSlots2, lamMint2 = getLamPerRegionMLE(dDict2, Origin, slotInMinutes, windowInMinutes, numWindows, Destination)
    lamSlots3, lamMint3 = getLamPerRegionMLE(dDict3, Origin, slotInMinutes, windowInMinutes, numWindows, Destination)
    lamSlots = dict()
    lamMint = dict()
    for elem in lamSlots1:
        lamSlots[elem] = np.mean([lamSlots1[elem], lamSlots2[elem], lamSlots3[elem]])
        lamMint[elem] = np.mean([lamMint1[elem], lamMint2[elem], lamMint3[elem]])
    return lamSlots, lamMint, lamSlots1, lamSlots2, lamSlots3
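# Hedged usage sketch for getArrivalRates. The CSV paths and the Origin region id
# below are hypothetical placeholders, not file names taken from this project.
lamSlots, lamMint, lamSlots1, lamSlots2, lamSlots3 = getArrivalRates(
    'data/ridesFeb07.csv',   # assumed: trips on the first comparable past day
    'data/ridesFeb14.csv',   # assumed: trips on the second comparable past day
    'data/ridesFeb21.csv',   # assumed: trips on the third comparable past day
    Origin=1,                # region whose arrival process we are estimating
    slotInMinutes=5, windowInMinutes=20, numWindows=9)
print(lamMint)  # per-minute arrival-rate estimates, averaged over the three days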
def prepare_data():
    datas = np.asarray(readCSV(args.train_feature_file))
    # print(np.array(datas).shape)
    # feat2idx = {}
    # idx2feat = {}
    # fid2idx = {}
    # idx2fid = {}
    # for idx, line in enumerate(datas):
    #     if idx == 0:
    #         feats = line.split(',')[1:]
    #         for feat_idx in range(len(feats)):
    #             feat2idx[feats[feat_idx]] = feat_idx
    #             idx2feat[feat_idx] = feats[feat_idx]
    #     else:
    #         fid = line.split(',', 1)[0]
    #         fid2idx[fid] = idx - 1
    #         idx2fid[idx - 1] = fid
    trainX = datas[1:, 1:]
    logger.info('Selected train features shape = ( %d , %d )', trainX.shape[0], trainX.shape[1])
    datas = np.asarray(readCSV(args.train_ans_file))
    trainY = datas[1:, 1:]
    logger.info('Selected train ans shape = ( %d )', trainY.shape[0])
    datas = np.asarray(readCSV(args.test_feature_file))
    testX = datas[1:, 1:]
    testFid = datas[1:, 0]
    logger.info('Selected test features shape = ( %d , %d )', testX.shape[0], testX.shape[1])
    return trainX, trainY, testX, testFid  # , feat2idx, idx2feat, fid2idx, idx2fid
def __init__(self, config_dir, train_close_pricesDF=None, trainDF=None,
             val_close_pricesDF=None, valDF=None, load_val=True, nrows=None):
    """Data class. You have the option of loading all four dataframes to avoid reloading them."""
    self.train_close_pricesDF = None
    self.trainDF = None
    self.val_close_pricesDF = None
    self.valDF = None
    self.train_minutes = None
    self.val_minutes = None
    self.config_dir = config_dir
    self.load_val = load_val
    if (train_close_pricesDF is not None and trainDF is not None
            and val_close_pricesDF is not None and valDF is not None):
        self.train_close_pricesDF = train_close_pricesDF.copy()
        self.trainDF = trainDF.copy()
        self.val_close_pricesDF = val_close_pricesDF.copy()
        self.valDF = valDF.copy()
        if nrows is not None:
            print('presupplied data with nrows =', nrows, '... subsetting the DFs.')
            self.train_close_pricesDF = self.train_close_pricesDF[:nrows]
            self.trainDF = self.trainDF[:nrows]
            self.val_close_pricesDF = self.val_close_pricesDF[:nrows]
            self.valDF = self.valDF[:nrows]
    else:
        print('loading train_close_pricesDF')
        self.train_close_pricesDF = readCSV(config_dir + 'Postprocessed_Data/train_close_prices.csv', nrows=nrows)
        print('loading trainDF')
        self.trainDF = readCSV(config_dir + 'Postprocessed_Data/train_minutesDF.csv', nrows=nrows)
        print('load train complete')
        if load_val:
            print('loading val_close_pricesDF')
            self.val_close_pricesDF = readCSV(config_dir + 'Postprocessed_Data/val_close_prices.csv', nrows=nrows)
            print('loading valDF')
            self.valDF = readCSV(config_dir + 'Postprocessed_Data/val_minutesDF.csv', nrows=nrows)
            print('load val complete')
    self.splitMinutes()
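# Hedged instantiation sketch for the __init__ above. The docstring calls this the
# "Data class", so the class name Data and the config directory are assumptions.
data = Data('Configs/my_config/',  # hypothetical config dir containing Postprocessed_Data/
            load_val=True,         # also load the validation frames
            nrows=10000)           # optional row cap while experimenting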
def main():
    attendFileName = sys.argv[1]
    statsFileName = sys.argv[2]
    attendData = utils.readCSV(attendFileName)
    statsData = utils.readCSV(statsFileName)
    people = {}
    for row in statsData:
        p = Person(row)
        people[p.name] = p
    utils.getGitHubStats('asdf')
def globalBouncedEmails(self, csv):
    csv = utils.readCSV(csv)
    if csv[0][4] == "Global Bounce" and csv[0][1] == "Portal Bounce":
        emails = [row[0] for row in csv if row[1] == "TRUE" or row[4] == "TRUE"]
        return emails
    else:
        print "CSV header mismatch: expected 'Portal Bounce' in column 2 and 'Global Bounce' in column 5"
def main():
    # List of sentence objects
    sentences = readCSV(STSS_131_DATA)
    STSS_values = []
    scores = []
    final_data = []
    init = 66
    for i, j in enumerate(sentences):
        # for i in range(0, 3):
        result = measureSimilarity(sentences[i].first_sentence, sentences[i].second_sentence)
        # scores.append((init, result))
        print(f" CASE {init}")
        # if init == 127:
        #     break
        if isinstance(result, str):
            pass
        else:
            scores.append(result)
            final_data.append((init, result))
            STSS_values.append(j.human_SS)
        init += 1
    # print(final_data)
    p = stats.pearsonr(scores, STSS_values)
    print(f"Pearson's correlation coefficient against human judgement: {p}")
    with open('new_method_no_punc_stopword.csv', 'w') as f:
        writer = csv.writer(f)
        for row in final_data:
            writer.writerow(row)
def main():
    args = processCommandline()
    occupancyData = utils.readCSV(args['input'])
    chargesData = json.loads(open(args['charges']).read())
    print(chargesData)
    month = args['month']
    year = args['year']
    chargeDay = args['chargeday']
    dueDay = args['dueday']
    dueMonth = args['duemonth']
    if dueMonth == '-':
        dueMonth = month
    transformedRecords = []
    for ocd in occupancyData:
        if ocd['Residing'] == '':
            continue  # refugee flat, ignore
        trec = transformToApnaComplexFormat(ocd, chargesData, month, year, chargeDay, dueDay, dueMonth)
        print(ocd, trec)
        for rec in trec:
            transformedRecords.append(rec)
    writeOutput(transformedRecords, args['output'])
def getQuantiMonth(month):
    all_df = []
    for f in os.listdir('../../data/quanti/' + month):
        df = readCSV(os.path.join('../../data/quanti/' + month, f), dtype=str)
        all_df.append(df)
    all_df = pd.concat(all_df)
    all_df.set_index("gigyaid", inplace=True)
    cols = all_df.columns
    new_df = pd.DataFrame(index=all_df.index.unique(), columns=cols)
    for col in cols:
        if col == "contentswatched":
            # print(getContentsUnique(all_df, col))
            contents = getContentsUnique(all_df, col)
            new_df = pd.merge(new_df, contents, left_index=True, right_on='gigyaid')
            new_df.drop("contentswatched_x", axis=1, inplace=True)
            new_df.rename({"contentswatched_y": "contentswatched"}, axis=1, inplace=True)
        else:
            all_df[col] = all_df[col].astype(float)
            new_df[col] = getSum(all_df, col)[col].values
    toCSV(new_df, "../../data/aggregated/quanti" + month + ".csv")
def main():
    random.seed(23333)
    # imgs, other_infos = utils.deserialize("../data", fontList)
    # imgs_shuffled = []
    # for i in imgs:
    #     indices = [j for j in range(i.shape[0])]
    #     random.shuffle(indices)
    #     imgs_shuffled.append(i[indices])
    # train = np.array([utils.hog(img) for i in imgs_shuffled for img in i[:len(i) * 4 // 5]])
    # train = np.reshape(train, [train.shape[0], -1])
    # train_label = np.array([i for i, _ in enumerate(imgs) for j in _[:len(_) * 4 // 5]])
    # test = np.array([utils.hog(img) for i in imgs_shuffled for img in i[len(i) * 4 // 5:]])
    # test = np.reshape(test, [test.shape[0], -1])
    # test_label = np.array([i for i, _ in enumerate(imgs) for j in _[len(_) * 4 // 5:]])
    imgs, other_infos = utils.readCSV(
        "../fonts", True,
        lambda x: int(x['m_label']) < 128 and int(x['m_label']) not in [83, 84, 85, 115, 116, 117],
        fontList=fontList)
    train = np.array([utils.daisy(img) for i in imgs for img in i])
    train = np.reshape(train, [train.shape[0], -1])
    train_label = np.array([i for i, _ in enumerate(imgs) for j in _])
    imgs, other_infos = utils.readCSV(
        "../fonts", True,
        lambda x: int(x['m_label']) in [83, 84, 85, 115, 116, 117],
        fontList=fontList)
    test = np.array([utils.daisy(img) for i in imgs for img in i])
    test = np.reshape(test, [test.shape[0], -1])
    test_label = np.array([i for i, _ in enumerate(imgs) for j in _])
    for k in [1, 4, 8, 16, 24, 32]:
        print('k =', k, end=' ')
        tp = 0
        for idx, t in enumerate(test):
            lst, dist = utils.nearest_neighbour(t, train)
            cnt = [0 for j in range(len(fontList))]
            for i in lst[:k]:
                cnt[train_label[i]] += 1
            if max(cnt) == cnt[test_label[idx]]:
                tp += 1
        print('acc =', tp / test.shape[0])
def populateDB(filename, csv_delimiter, header, language='EN', dbname='TwitterDB', mode=0, serialized=False):
    start = time.time()
    h, lines = utils.readCSV(filename, csv_delimiter, header)
    populateDatabase(lines, language, dbname, mode, serialized)
    end = time.time()
    print "time_populate.append(", (end - start), ")"
def main():
    args = processCommandline()
    contactsData = utils.readCSV(args['input'])
    db, cursor = dbutils.dbConnect()
    dbutils.createTable(
        db, cursor, "Owner",
        ("building text", "flat_number text", "primary_first_name text",
         "primary_last_name text", "primary_email text", "primary_mobile text",
         "secondary_email text", "secondary_mobile text", "flat_type text"),
        True)
    importData(db, cursor, contactsData, "Owner")
def getOrderedServiceSingle(infile1, region, slotInMinutes=5, windowInMinutes=20, numWindows=9):
    '''
    ------------------------------------------------------------------------------------------------------
    Gets the service distribution from past data.
    ------------------------------------------------------------------------------------------------------
    '''
    dDictSer, headSer = readCSV(infile1)
    dictofLists = getOrderedService(dDictSer, region, slotInMinutes, windowInMinutes, numWindows)
    return dictofLists
def main():
    csvName = 'data/original.csv'
    csv_lines = readCSV(csvName)
    spec_num = len(csv_lines) - 1
    shuffle_seed_name = 'data/shuffle_seed.npy'
    shuffle_seed = np.random.permutation(spec_num)
    with open(shuffle_seed_name, 'wb') as frs:
        np.save(frs, shuffle_seed)
    shuffle_csv(csv_lines, 'data/shuffle.csv', shuffle_seed)
    spec_length = 1473
    spec_name = 'data/original.bin'
    spec_shuffle_name = 'data/shuffle.bin'
    shuffle_bin(spec_name, spec_shuffle_name, shuffle_seed, spec_length=spec_length)
def getQualiMonth(month):
    all_df = []
    for f in os.listdir('../../data/quali/' + month):
        df = readCSV(os.path.join('../../data/quali/' + month, f), converters=converters)
        all_df.append(df)
    all_df = pd.concat(all_df)
    all_df.set_index("gigyaid", inplace=True)
    cols = all_df.columns
    new_df = pd.DataFrame(index=all_df.index.unique(), columns=cols)
    for col in cols:
        all_df[col] = all_df[col].apply(lambda x: [i.upper() for i in x])
        new_df[col] = getUnique(all_df, col)[col].values
    new_df.index.name = "gigyaid"
    toCSV(new_df, "../../data/aggregated/quali" + month + ".csv")
def getArrivalRatesSingle(infile1, Origin, slotInMinutes=5, windowInMinutes=20, numWindows=9, Destination=None):
    '''
    ------------------------------------------------------------------------------------------------------
    Gets the arrival rates using observations from a single past day.
    NYC data: use Feb 07, 2018 data to predict Feb 14, 2018 rates.
    ------------------------------------------------------------------------------------------------------
    '''
    dDictArate, headArate = readCSV(infile1)
    lamSlots, lamMint = getLamPerRegionMLE(dDictArate, Origin, slotInMinutes, windowInMinutes, numWindows, Destination)
    return lamSlots, lamMint
def split_train_val():
    bin_name = 'data/train_tmp.bin'
    csv_name = 'data/train_tmp.csv'
    csv_train = 'data/train.csv'
    csv_val = 'data/val.csv'
    bin_train = 'data/train.bin'
    bin_val = 'data/val.bin'
    length = (10 + 1473) * 4
    csv_lines = readCSV(csv_name)
    csv_header = csv_lines[0]
    csv_lines = csv_lines[1:]
    train_length = int(len(csv_lines) * 0.9)
    with open(csv_train, 'wb') as fcsvtrain, open(bin_train, 'wb') as fbintrain,\
            open(csv_val, 'wb') as fcsvval, open(bin_val, 'wb') as fbinval,\
            open(bin_name, 'rb') as fbin:
        trainwriter = csv.writer(fcsvtrain)
        valwriter = csv.writer(fcsvval)
        trainwriter.writerow(csv_header)
        valwriter.writerow(csv_header)
        buf_train = buf_val = ''
        for i, line in enumerate(tqdm(csv_lines)):
            buf = fbin.read(length)
            if i < train_length:
                trainwriter.writerow(line)
                buf_train += buf
                if i % 100 == 0 and i != 0:
                    fbintrain.write(buf_train)
                    buf_train = ''
            else:
                valwriter.writerow(line)
                buf_val += buf
                if i % 100 == 0 and i != 0:
                    fbinval.write(buf_val)
                    buf_val = ''
        else:
            fbintrain.write(buf_train)
            fbinval.write(buf_val)
    bin_names = [bin_name, csv_name]
    bin_names.insert(0, 'rm')
    os.system(' '.join(bin_names))
def split_train_test():
    bin_name = 'data/shuffle.bin'
    csv_name = 'data/shuffle_new.csv'
    test_no = ['3', '9', '19', '24']
    csv_train = 'data/train_tmp.csv'
    csv_test = 'data/test.csv'
    bin_train = 'data/train_tmp.bin'
    bin_test = 'data/test.bin'
    length = (10 + 1473) * 4
    csv_lines = readCSV(csv_name)
    csv_header = csv_lines[0]
    csv_lines = csv_lines[1:]
    with open(csv_train, 'wb') as fcsvtrain, open(bin_train, 'wb') as fbintrain,\
            open(csv_test, 'wb') as fcsvtest, open(bin_test, 'wb') as fbintest,\
            open(bin_name, 'rb') as fbin:
        trainwriter = csv.writer(fcsvtrain)
        testwriter = csv.writer(fcsvtest)
        trainwriter.writerow(csv_header)
        testwriter.writerow(csv_header)
        buf_train = buf_test = ''
        for i, line in enumerate(tqdm(csv_lines)):
            buf = fbin.read(length)
            if line[0] in test_no:
                testwriter.writerow(line)
                buf_test += buf
                if i % 100 == 0 and i != 0:
                    fbintest.write(buf_test)
                    buf_test = ''
            else:
                trainwriter.writerow(line)
                buf_train += buf
                if i % 100 == 0 and i != 0:
                    fbintrain.write(buf_train)
                    buf_train = ''
        else:
            fbintrain.write(buf_train)
            fbintest.write(buf_test)
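# Note (assumption): the constant length = (10 + 1473) * 4 used in split_train_val
# and split_train_test above appears to be the byte size of one binary record:
# 10 leading float32 values (apparently a class label plus the nine element
# fractions written further down in this collection) followed by a 1473-point
# spectrum, at 4 bytes each, i.e. 1483 * 4 = 5932 bytes. A minimal sketch of the
# same bookkeeping with hypothetical names:
import struct

META_FLOATS = 10          # assumed number of leading metadata values per record
SPEC_LENGTH = 1473        # spectrum length used elsewhere in these scripts
RECORD_BYTES = (META_FLOATS + SPEC_LENGTH) * struct.calcsize('<f')  # 5932

def read_record(fbin):
    """Read one fixed-size record and unpack it into float32 values."""
    raw = fbin.read(RECORD_BYTES)
    if len(raw) < RECORD_BYTES:
        return None  # end of file
    return struct.unpack('<%df' % (META_FLOATS + SPEC_LENGTH), raw)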
    return similarity


# S1 = "The president was assinated in his car"
# S2 = "The president was assinated and driver could do nothing"
# soc_sts(S1, S2)
# S1 = "Would you like to go out to drink with me tonight?"
# S2 = "I really don't know what to eat tonight so I might go out somewhere."
# soc_sts(S1, S2)

from scipy import stats

STSS_131_DATA = "data/STSS-131.csv"
sentences = readCSV(STSS_131_DATA)
sim_values, STSS_values = [], []
for s in sentences:
    # soc_sts(s.first_sentence, s.second_sentence)
    sim_values.append(soc_sts(s.first_sentence, s.second_sentence) * 4)
    STSS_values.append(s.human_SS)
print("******************************************************")
p = stats.pearsonr(sim_values, STSS_values)
print("soc_sts correlation with STSS: ", p)

"""
S1 = "Would you like to go out to drink with me tonight?"
S2 = "I really don't know what to eat tonight so I might go out somewhere."
soc_sts(S1, S2)
"""
def createPnlVisPlot(config_dir, desired_vis_plot_fn, ts_window_size=5000):
    """
    Creates a PnL visualization plot and saves it in the Plots/ directory.
    ts_window_size determines how many timesteps to consider when determining the
    <worst> and <best> time windows.
    """
    # [x] find and load the trade_log
    # filenames are pnl_vis_<model_name>_<sess_type>_<episode>_<total, best, worst>.png
    print('creating pnl vis plot:', desired_vis_plot_fn)
    # we need to first remove the model_str, then we can split by '_'
    before_model_str = desired_vis_plot_fn.split('[')[0]
    after_model_str = desired_vis_plot_fn.split(']')[-1]
    model_str = desired_vis_plot_fn.split(before_model_str)[1].split(after_model_str)[0]
    sess_type = after_model_str.split('_')[1]
    episode = after_model_str.split('_')[2]
    plot_type = after_model_str.split('_')[3].split('.')[0]
    sess_type = '' if sess_type == 'train' else sess_type  # trade_log files are saved with the sess_type only if it != 'train'
    trade_log_fn = config_dir + 'Deep_Models/' + model_str + '/Trade_Logs/' + sess_type + episode + '.csv'
    trade_logDF = readCSV(trade_log_fn)

    # [x] create plot
    full_logDF = trade_logDF.copy()
    full_logDF['Midpt'] = (full_logDF.C_B + full_logDF.C_A) / 2
    full_logDF['DayPNL'] = (full_logDF.ActualAction != 0) * (full_logDF.C_B - full_logDF.Midpt)  # it's always negative half the bid-ask spread
    full_logDF['OpenPNL'] = full_logDF.NewPosition.shift() * (full_logDF.Midpt - full_logDF.Midpt.shift())
    full_logDF.OpenPNL.loc[0] = 0
    full_logDF['TotalPNL'] = full_logDF.DayPNL + full_logDF.OpenPNL

    # subset the log if plotting the best or worst window
    if plot_type == 'best':
        rolling_sum = full_logDF.TotalPNL.rolling(window=ts_window_size).sum()
        idx_max = rolling_sum[::-1].idxmax()  # the [::-1] selects the last occurrence of the max
        full_logDF = full_logDF[(idx_max - ts_window_size):idx_max]
    if plot_type == 'worst':
        rolling_sum = full_logDF.TotalPNL.rolling(window=ts_window_size).sum()
        idx_min = rolling_sum[::-1].idxmin()  # the [::-1] selects the last occurrence of the min
        full_logDF = full_logDF[(idx_min - ts_window_size):idx_min]
    full_logDF['CumPNL'] = full_logDF.TotalPNL.cumsum()

    # [x] graph (from http://kitchingroup.cheme.cmu.edu/blog/2013/09/13/Plotting-two-datasets-with-very-different-scales/)
    buys = full_logDF.loc[full_logDF.ActualAction == 1][['Minute', 'C_A']]
    sells = full_logDF.loc[full_logDF.ActualAction == 2][['Minute', 'C_B']]
    fig = plt.figure()
    ax1 = fig.add_subplot(111)
    ax1.plot(full_logDF.Minute, full_logDF.Midpt, 'k-', linewidth=0.5)
    ax1.plot(buys.Minute, buys.C_A, 'g+')
    ax1.plot(sells.Minute, sells.C_B, 'r+')
    ax1.set_ylabel('Price')
    # [x] add plot title
    ax1.set_title(plot_type)
    ax2 = ax1.twinx()
    ax2.plot(full_logDF.Minute, full_logDF.CumPNL, 'b-')
    ax2.set_ylabel('PNL')
    # [x] Add pnl=0 line
    # ax2.hlines(y=0, xmin=full_logDF.Minute.iloc[0], xmax=full_logDF.Minute.iloc[-1], colors='0.75', linestyles='dashed')
    ax2.axhline(y=0, color='0.75', linestyle='dashed')  # using this instead of hlines frees us from supplying xmin and xmax
    # save plot
    plt.savefig(config_dir + 'Plots/' + desired_vis_plot_fn, dpi=256)
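# Hedged illustration of the filename parsing above. The example file name is
# hypothetical, invented to match the pattern described in the comments
# (pnl_vis_<[model_name]>_<sess_type>_<episode>_<total|best|worst>.png).
fn = 'pnl_vis_[lstm_64x2]_val_0012_best.png'
before = fn.split('[')[0]                       # 'pnl_vis_'
after = fn.split(']')[-1]                       # '_val_0012_best.png'
model = fn.split(before)[1].split(after)[0]     # '[lstm_64x2]'
sess_type = after.split('_')[1]                 # 'val'
episode = after.split('_')[2]                   # '0012'
plot_type = after.split('_')[3].split('.')[0]   # 'best'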
def processData(comb_row, config_dir, day_chg_incs, minute_incs):
    """
    1. Locates the Postprocessed_Data/*. If no Postprocessed_Data/* exists, then...
    2. Locates the Data/* and generates Postprocessed_Data/*. If no Data/* exists, then...
    3. Generate Data/* then generate Postprocessed_Data/*.
    """
    # [x] Locates the Postprocessed_Data/*
    # [x] check that it is in the correct format.
    sec_guide_fn = config_dir + 'Data/sec_guide.csv'
    if os.path.exists(sec_guide_fn):
        sec_guideDF = pd.read_csv(sec_guide_fn)
        num_secs = len(sec_guideDF)
        postprocessed_data_dir = config_dir + 'Postprocessed_Data/'
        if (os.path.exists(postprocessed_data_dir + 'train_close_prices.csv')
                and os.path.exists(postprocessed_data_dir + 'train_minutesDF.csv')
                and os.path.exists(postprocessed_data_dir + 'val_close_prices.csv')
                and os.path.exists(postprocessed_data_dir + 'val_minutesDF.csv')):
            print('All Postprocessed_Data files exist. Checking columns')
            cols_set_actual = set(
                pd.read_csv(postprocessed_data_dir + 'train_minutesDF.csv', nrows=0).columns.tolist())
            cols_set_expected = set(
                getExpectedPostprocessedCols(num_secs, day_chg_incs, minute_incs))
            assert cols_set_actual == cols_set_expected, (
                'Problem with the columns.'
                '\nCols in Postprocessed actual but not in expected: ' + str(cols_set_actual.difference(cols_set_expected)) +
                '\nCols in expected but not in Postprocessed actual: ' + str(cols_set_expected.difference(cols_set_actual)))
            print('Columns passed the check. Loading postprocessed data.')
            print('loading train_close_pricesDF')
            train_close_pricesDF = readCSV(postprocessed_data_dir + 'train_close_prices.csv')
            print('loading train_minutesDF')
            train_minutesDF = readCSV(postprocessed_data_dir + 'train_minutesDF.csv')
            print('loading val_close_pricesDF')
            val_close_pricesDF = readCSV(postprocessed_data_dir + 'val_close_prices.csv')
            print('loading val_minutesDF')
            val_minutesDF = readCSV(postprocessed_data_dir + 'val_minutesDF.csv')
            print('loading complete.')
            return train_close_pricesDF, train_minutesDF, val_close_pricesDF, val_minutesDF
        else:
            print('There are Postprocessed_Data/* missing. Here are the files there currently:')
            print(os.listdir(postprocessed_data_dir))
    else:
        print(sec_guide_fn, 'not found. Continuing...')
    print('\n')

    # [x] Locates or creates the Data/*
    data_dir = config_dir + 'Data/'
    daily_fn = data_dir + 'daily_summary.csv'
    all_minutes_fn = data_dir + 'all_minutes.csv'
    sec_guide_fn = data_dir + 'sec_guide.csv'
    all_minutesDF, dailyDF, sec_guideDF = 'not set yet', 'not set yet', 'not set yet'
    if not (os.path.exists(daily_fn) and os.path.exists(all_minutes_fn) and os.path.exists(sec_guide_fn)):
        print('Data/ has not been loaded and pre-processed yet. Doing so now.')
        all_minutesDF, dailyDF, sec_guideDF = loadAndProcessData(
            comb_row, config_dir, day_chg_incs, minute_incs)
    else:
        print('Data found! Loading data...')
        # [x] check the order of other_secs
        sec_guideDF = readCSV(sec_guide_fn)
        assert (comb_row.Sec1 == sec_guideDF.iloc[0].Sec)
        other_secs_comb_row = comb_row[[
            col for col in comb_row.index if (col[:3] == 'Sec' and col != 'Sec1')
        ]].values
        other_secs_comb_row = [i for i in other_secs_comb_row if type(i) == str]
        other_secs_sec_guide = sec_guideDF.iloc[1:].Sec.values
        assert (np.all(other_secs_comb_row == other_secs_sec_guide))
        # [x] check the dates
        dailyDF = readCSV(daily_fn)
        first_date = dailyDF.Date.iloc[0]
        last_date = dailyDF.Date.iloc[-1]
        assert abs((pd.to_datetime(first_date) - pd.to_datetime(comb_row.TrainStartDate)).days) < 20, \
            str(first_date) + ' ' + str(comb_row.TrainStartDate)  # may want to relax these
        assert abs((pd.to_datetime(last_date) - pd.to_datetime(comb_row.ValEndDate)).days) < 20, \
            str(last_date) + ' ' + str(comb_row.ValEndDate)
        all_minutesDF = readCSV(all_minutes_fn)
        print('Data/ load complete.')
    return postprocessData(all_minutesDF, dailyDF, sec_guideDF, config_dir, day_chg_incs, minute_incs)
def loadAndProcessData(comb_row, config_dir, day_chg_incs, minute_incs, minute_dir=tdm_dir + 'Minute_Files/'):
    """
    1. Loads the minute files from the external hard drive
    2. Creates, saves, and returns all_minutesDF, sec_guideDF, and dailyDF
    """
    other_secs = comb_row[[
        col for col in comb_row.index if (col[:3] == 'Sec' and col != 'Sec1')
    ]].values
    other_secs = [i for i in other_secs if type(i) == str]
    print('No pre-loaded data found. Loading data for ' + comb_row.Sec1 + ' and ' + ','.join(other_secs))
    sec1_minuteDF = readCSV(minute_dir + comb_row.Sec1 + '.csv')
    sec1_minuteDF = sec1_minuteDF.loc[
        (sec1_minuteDF.Date >= comb_row.TrainStartDate)
        & (sec1_minuteDF.Date <= comb_row.ValEndDate)].reset_index(drop=True)
    print('Loading minuteDF for ' + comb_row.Sec2)
    other_secs_minuteDF = readCSV(minute_dir + comb_row.Sec2 + '.csv')[[
        'Product', 'Date', 'Minute', 'O_B', 'O_A', 'H_B', 'H_A', 'L_B', 'L_A',
        'C_B', 'C_A', 'Count', 'B_TickImb', 'A_TickImb', 'M_TickImb'
    ]]
    for sec in other_secs[1:]:
        print('Loading minuteDF for ' + sec)
        other_secs_minuteDF = other_secs_minuteDF.append(
            readCSV(minute_dir + sec + '.csv')[[
                'Product', 'Date', 'Minute', 'O_B', 'O_A', 'H_B', 'H_A', 'L_B',
                'L_A', 'C_B', 'C_A', 'Count', 'B_TickImb', 'A_TickImb', 'M_TickImb'
            ]],
            ignore_index=True)
    other_secs_minuteDF = other_secs_minuteDF.loc[
        (other_secs_minuteDF.Date >= comb_row.TrainStartDate)
        & (other_secs_minuteDF.Date <= comb_row.ValEndDate)].reset_index(drop=True)
    print('other_secs_minuteDF has ' + str(len(other_secs_minuteDF)) + ' rows.')
    print('sec1_minuteDF has ' + str(len(sec1_minuteDF)) + ' rows.')
    print("readCSVs complete. Subsetting dates...")

    # [x] subset for dates
    dates_in_common = set(sec1_minuteDF.Date.unique())
    for sec in other_secs:
        print(sec)
        other_sec_dates = set(other_secs_minuteDF.loc[
            other_secs_minuteDF.Product == sec].Date.unique())
        print('removing', [
            str(d) for d in sorted(list(dates_in_common.difference(other_sec_dates)))
        ])
        dates_in_common = dates_in_common.intersection(other_sec_dates)
        print(len(dates_in_common), 'dates_in_common')
    sec1_dates_to_remove = set(sec1_minuteDF.Date.unique()).difference(dates_in_common)
    print(str(len(dates_in_common)) + ' dates_in_common')
    if len(sec1_dates_to_remove) > 0:
        print('- removing ' + str(len(sec1_dates_to_remove)) + ' dates from ' + comb_row.Sec1)
        sec1_minuteDF = sec1_minuteDF.loc[sec1_minuteDF.Date.isin(dates_in_common)].reset_index(drop=True)
    for sec in other_secs:
        sec_dates_to_remove = set(
            other_secs_minuteDF.loc[other_secs_minuteDF.Product == sec].Date.unique()
        ).difference(dates_in_common)
        if len(sec_dates_to_remove) > 0:
            print('- removing ' + str(len(sec_dates_to_remove)) + ' dates from ' + sec)
            other_secs_minuteDF = other_secs_minuteDF.loc[
                other_secs_minuteDF.Date.isin(dates_in_common)].reset_index(drop=True)
    print('other_secs_minuteDF has ' + str(len(other_secs_minuteDF)) + ' rows.')
    print('sec1_minuteDF has ' + str(len(sec1_minuteDF)) + ' rows.')
    print("Date subset complete. Determining each day's Open/Closes...")

    # [x] determine each day's open and close
    dailyDF = pd.DataFrame(columns=['Date', 'Open', 'Close'])
    dailyDF['Date'] = sec1_minuteDF.Date.unique()
    for i in tqdm(range(len(dailyDF))):
        date = dailyDF.loc[i].Date
        lastOpen = sec1_minuteDF.loc[sec1_minuteDF.Date == date].Minute.min()
        firstClose = sec1_minuteDF.loc[sec1_minuteDF.Date == date].Minute.max()
        other_sec_date_subDF = other_secs_minuteDF.loc[other_secs_minuteDF.Date == date]
        for sec in other_secs:
            lastOpen = max(
                lastOpen,
                other_sec_date_subDF.loc[other_sec_date_subDF.Product == sec].Minute.min())
            firstClose = min(
                firstClose,
                other_sec_date_subDF.loc[other_sec_date_subDF.Product == sec].Minute.max())
        dailyDF.loc[i, 'Open'] = lastOpen
        dailyDF.loc[i, 'Close'] = firstClose
    dailyDF.Open = dailyDF.Open.dt.strftime(date_format='%H:%M')
    dailyDF.Close = dailyDF.Close.dt.strftime(date_format='%H:%M')
    dailyDF.to_csv(config_dir + 'Data/daily_summary.csv', index=False)
    print("Each day's Open/Closes determination complete. Creating all_minutesDF...")

    # [x] create all_minutesDF
    all_minutesDF = pd.DataFrame(columns=['Date', 'Minute'])
    # enumerate minutes
    for i in range(len(dailyDF)):
        open_dt = pd.to_datetime(
            dailyDF.loc[i].Date.strftime(format='%Y-%m-%d') + ' ' + dailyDF.loc[i].Open,
            format='%Y-%m-%d %H:%M')
        close_dt = pd.to_datetime(
            dailyDF.loc[i].Date.strftime(format='%Y-%m-%d') + ' ' + dailyDF.loc[i].Close,
            format='%Y-%m-%d %H:%M')
        minute_range = pd.date_range(start=open_dt, end=close_dt, freq='T')
        day_minutesDF = pd.DataFrame({
            'Date': minute_range.date,
            'Minute': minute_range.values
        })
        all_minutesDF = all_minutesDF.append(day_minutesDF, ignore_index=True)

    # populate minute data
    col_stems = [
        'O_B', 'O_A', 'H_B', 'H_A', 'L_B', 'L_A', 'C_B', 'C_A', 'Count',
        'B_TickImb', 'A_TickImb', 'M_TickImb'
    ]
    first_minute_populate_stems = [
        'O_B', 'O_A', 'H_B', 'H_A', 'L_B', 'L_A', 'C_B', 'C_A'
    ]
    for sec_num in range(1, len(other_secs) + 2):
        sec_cols = [col_stem + str(sec_num) for col_stem in col_stems]
        for sec_col in sec_cols:
            all_minutesDF[sec_col] = np.nan
    all_minutesDF[[c + '1' for c in col_stems]] = pd.merge(
        all_minutesDF[['Minute']], sec1_minuteDF[['Minute'] + col_stems],
        on='Minute', how='left')[col_stems]
    print('Merging into all_minutesDF...')
    for sec_num in range(2, len(other_secs) + 2):
        other_sec = other_secs[sec_num - 2]
        all_minutesDF[[c + str(sec_num) for c in col_stems]] = pd.merge(
            all_minutesDF[['Minute']],
            other_secs_minuteDF[['Minute'] + col_stems].loc[other_secs_minuteDF.Product == other_sec],
            on='Minute', how='left')[col_stems]
    print('Getting the first datapoint of each day...')
    # get first datapoint of each day
    for date in tqdm(dates_in_common):
        date = dailyDF.loc[i].Date
        if date in dates_in_common:
            open_dt = pd.to_datetime(
                dailyDF.loc[i].Date.strftime(format='%Y-%m-%d') + ' ' + dailyDF.loc[i].Open,
                format='%Y-%m-%d %H:%M')
            sec1_last_row = sec1_minuteDF.loc[(sec1_minuteDF.Date == date)
                                              & (sec1_minuteDF.Minute <= open_dt)].iloc[-1]
            if sec1_last_row.Minute < open_dt:
                if (open_dt - sec1_last_row.Minute).seconds / 60 > 20:
                    raise ValueError(
                        'Too much time has elapsed. ' + comb_row.Sec1 +
                        ' open quote is stale at ' + open_dt.strftime(format='%Y-%m-%d %H:%M') +
                        ' by ' + str((open_dt - sec1_last_row.Minute).seconds / 60) + ' minutes.')
                else:
                    all_minutesDF.loc[all_minutesDF.Minute == open_dt,
                                      [c + '1' for c in col_stems]] = 0
                    all_minutesDF.loc[all_minutesDF.Minute == open_dt,
                                      [c + '1' for c in first_minute_populate_stems]] = \
                        sec1_last_row[first_minute_populate_stems]
            other_secs_subDF = other_secs_minuteDF.loc[(other_secs_minuteDF.Date == date)
                                                       & (other_secs_minuteDF.Minute <= open_dt)]
            for sec_num in range(2, len(other_secs) + 2):
                other_sec = other_secs[sec_num - 2]
                other_sec_last_row = other_secs_subDF.loc[
                    other_secs_subDF.Product == other_sec].iloc[-1]
                if other_sec_last_row.Minute < open_dt:
                    if (open_dt - other_sec_last_row.Minute).seconds / 60 > 20:
                        raise ValueError(
                            "Too much time has elapsed. " + other_sec +
                            " open quote is stale at " + open_dt.strftime(format='%Y-%m-%d %H:%M') +
                            ' by ' + str((open_dt - other_sec_last_row.Minute).seconds / 60) + ' minutes.')
                    else:
                        all_minutesDF.loc[all_minutesDF.Minute == open_dt,
                                          [c + str(sec_num) for c in col_stems]] = 0
                        all_minutesDF.loc[all_minutesDF.Minute == open_dt,
                                          [c + str(sec_num) for c in first_minute_populate_stems]] = \
                            other_sec_last_row[first_minute_populate_stems]
    print('Saving all_minutesDF...')
    all_minutesDF.to_csv(config_dir + 'Data/all_minutes.csv', index=False)
    print('Save complete.')
    sec_guideDF = pd.DataFrame({'Sec': [comb_row.Sec1] + list(other_secs)})
    sec_guideDF.to_csv(config_dir + 'Data/sec_guide.csv', index=False)
    return all_minutesDF, dailyDF, sec_guideDF
        numIdle['avg'].append(
            np.mean([
                numIdle[1][key], numIdle[2][key], numIdle[3][key], numIdle[4][key]
            ]))
        numBlocked['avg'].append(
            np.mean([
                numBlocked[1][key], numBlocked[2][key], numBlocked[3][key], numBlocked[4][key]
            ]))
    return efficiency, percentBlocked, numIdle, numBlocked, totaltime


if __name__ == '__main__':
    dDict, head = readCSV('data/ridesLyftMHTN14.csv')
    dDict = addTimeStamp(dDict, slotInMinutes=5)
    dictNum = computeNumberOfUsers(dDict)
    adjacency = {1: [2], 2: [1, 3, 4], 3: [2, 4], 4: [2, 3]}
    initSupply = {
        1: {
            'idle': 0,
            'active': 0,
            'arrived': 0,
            'departed': 0
        },
        2: {
            'idle': 0,
            'active': 0,
            'arrived': 0,
            'departed': 0
def populateDB(filename, csv_delimiter, header, language='EN', dbname='TwitterDB',
               host='localhost', port=27017, mode=0, serialized=False):
    start = time.time()
    h, lines = utils.readCSV(filename, csv_delimiter, header)
    populateDatabase(lines, language, dbname, host, port, mode, serialized)
    end = time.time()
    print "time_populate.append(", (end - start), ")"
def populateDB(filename, csv_delimiter, header, language='EN', dbname='TwitterDB', mode=0):
    start = time.time()
    h, lines = utils.readCSV(filename, csv_delimiter, header)
    populateDatabase(lines, dbname)
    end = time.time()
    print "time_populate.append(", (end - start), ")"
def main():
    args = processCommandline()
    contactsData = utils.readCSV(args['input'])
    transformedContactsData = transformData(contactsData)
    writeOutput(transformedContactsData, args['output'])
            words = []
            for w in lemmas.wordList:
                word = dict()
                word['word'] = w.word
                word['tf'] = w.tf
                word['count'] = w.count
                word['pos'] = w.wtype
                words.append(word)
            document['cleanText'] = cleanText  # .encode('latin-1').encode('string_escape').replace('\r', '').replace('\n', '')
            document['lemmaText'] = lemmaText
            document['words'] = words
        except Exception as e:
            print e, 'in lemmaText'
    except Exception as e:
        print e, 'here, try 1', elem
    else:
        print 'here in else', elem
    return document


if __name__ == "__main__":
    dbname = 'EGCDB'
    filename = 'RNTI_articles_export.csv'
    header = False
    csv_delimiter = '\t'
    corpus = utils.readCSV(filename)
    # createDB = CreateDB_EGC(dbname=dbname)
    # createDB.insert_data(corpus=corpus)
    print len(corpus)
    insert_data(dbname, corpus)
    for i in range(len(folder_length)):
        dct[str(i + 1)] = [str(j) for j in range(1, folder_length[i] + 1)]
    label_dict = {}
    for i in range(1, len(folder_length) + 1):
        label_dict[i] = label_dict.get(i, {})
        for j in dct[str(i)]:
            label_dict[i][int(j)] = i
    return label_dict


if __name__ == '__main__':
    folder_length = [6, 10, 5, 6]
    # gen_path_dict(path)
    path_dict = get_path_dict('path_dict.pkl')
    label_dict = gen_label_dict()
    elements = [l[1:] for l in readCSV('data/elements.csv')[1:]]
    with open('data/original.bin', 'wb') as f, open('data/original.csv', 'wb') as fcsv:
        cw = csv.writer(fcsv)
        cw.writerow(['No', 'Class', 'SubNO', 'Category'] + 'Mn Si Ni Cr V Mo Ti Cu Fe'.split())
        for cls, subfolders in path_dict.items():
            for subno, filenames in subfolders.items():
                no = sum(folder_length[:cls - 1]) + subno
                label = label_dict[cls][subno]
                csv_list = [no, cls, subno, label] + elements[no - 1]
                for fn in filenames:
                    cw.writerow(csv_list)
                    spa_array = read_spa(fn)
                    f.write(struct.pack('<f', label))
                    np.array(elements[no - 1], np.float32).tofile(f)
def get_size():
    return (len(readCSV('data/train.csv')[1:]),
            len(readCSV('data/test.csv')[1:]),
            len(readCSV('data/val.csv')[1:]))
    'angel1k_o', 'angel3k_o', 'angel5k_o', 'angel10k_o', 'angel25k_o', 'angel50k'
]
models['Armadillo'] = [
    'Armadillo1k_o', 'Armadillo3k_o', 'Armadillo5k_o', 'Armadillo10k_o',
    'Armadillo25k_o', 'Armadillo35k'
]
models['bunny'] = [
    'bunny1k_o', 'bunny3k_o', 'bunny5k_o', 'bunny10k_o', 'bunny25k_o', 'bunny70k'
]
numResolution = 6
rotatePeriod = np.array([20, 40, 60, 80, 100, 200, 300, 400, 500, 600], dtype=np.float)
numRotates = rotatePeriod.shape[0]
qualityOffset = [0, 1]
qualityOpt = ['PSNR', 'SSIM']
for modelId in range(4):
    # modelId = 0
    coarseResId = 2
    renderMode = '25'
    rootDir = "/Users/sfhan/Dropbox/stereoRepoj/Results"
    datafile = join(
        rootDir, modelName[modelId],
        models[modelName[modelId]][coarseResId] + "_rot1_{}_2.csv".format(renderMode))
    data = readCSV(datafile)
    plot_rotatePeriod_quality()
def STSS_tests():
    """Some tests for wordNetSimilarity using the STSS dataset"""
    import matplotlib.pyplot as plt
    sentences = readCSV(STSS_131_DATA)
    sim_values, STSS_values = [], []
    n = 0
    print("Using path_similarity")
    for s in sentences:
        val = wordNetSimilarity(s.first_sentence, s.second_sentence)
        sim_values.append(val)
        STSS_values.append(s.human_SS)
    print("******************************************************")
    print(stats.pearsonr(sim_values, STSS_values))

    sim_values, STSS_values = [], []
    n = 0
    for s in sentences:
        sim_values.append(wordNetSimilarity(s.first_sentence, s.second_sentence, use_wup=True))
        STSS_values.append(s.human_SS)
    print("******************************************************")
    print("Using wup")
    p = stats.pearsonr(sim_values, STSS_values)
    print(p)

    sim_values, STSS_values = [], []
    n = 0
    for s in sentences:
        sim_values.append(wordNetSimilarity(s.first_sentence, s.second_sentence, perform_stemming=True))
        STSS_values.append(s.human_SS)
    print("******************************************************")
    print("Preprocessing: Stemming")
    p = stats.pearsonr(sim_values, STSS_values)
    print(p)

    sim_values, STSS_values = [], []
    for s in sentences:
        sim_values.append(wordNetSimilarity(s.first_sentence, s.second_sentence, perform_lemmatization=True))
        STSS_values.append(s.human_SS)
    print("******************************************************")
    print("Preprocessing: lemmatization")
    p = stats.pearsonr(sim_values, STSS_values)
    print(p)

    sim_values, STSS_values = [], []
    n = 0
    for s in sentences:
        sim_values.append(wordNetSimilarity(s.first_sentence, s.second_sentence, use_idf=True))
        STSS_values.append(s.human_SS)
    print("******************************************************")
    print("Preprocessing: use tf_idf")
    p = stats.pearsonr(sim_values, STSS_values)
    print(p)
import utils
from operator import itemgetter

ACHIEVEMENTS = utils.readCSV("./static/achievements.csv")
TITLES = utils.readJSON("./static/titles.json")


def get_max_level(achievement_list):
    max_points = 0
    max_level = 0
    for achv in achievement_list:
        if int(achv['points']) > 0:
            max_points += int(achv['points'])
    tmp_max_points = max_points
    while tmp_max_points / 2 > 0:
        max_level += 1
        tmp_max_points = int(tmp_max_points / 2)
    return max_level, max_points


def get_levelup_requirements():
    max_level, max_points = get_max_level(ACHIEVEMENTS)
    level_info = list()
    while max_points / 2 > 0:
        level_info.append(max_points)
        max_points = int(max_points / 2)
    return list(reversed(level_info))
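# Hedged usage sketch for the level helpers above. The achievement rows here are
# hypothetical stand-ins for what ./static/achievements.csv presumably contains
# (dict-like rows with a 'points' field); only the halving logic is illustrated.
demo_achievements = [{'points': '40'}, {'points': '35'}, {'points': '25'}, {'points': '-5'}]
max_level, max_points = get_max_level(demo_achievements)
print(max_level, max_points)  # 7 100 -> 100 total points can be halved 7 times
# Level thresholds are the running halves of the total, smallest first:
# for a 100-point total that is [1, 3, 6, 12, 25, 50, 100].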