def splitData():
    for i in range(1, FILES + 1):
        print '-' * 50
        print 'Split dataset %d: ' % i
        rfile = file(DATA_SET, 'r')
        reader = csv.reader(rfile)
        j = i + 10
        if j != 32:
            train_file = file('splited_data/%d.csv' % i, 'w')
            result_file = file('splited_data/%s_%d.csv' % ('result', i), 'w')
            train_writer = csv.writer(train_file)
            result_writer = csv.writer(result_file)
            for line in reader:
                progressBar(reader.line_num, DATASET_SIZE)
                if int(line[5]) >= i and int(line[5]) < j:
                    train_writer.writerow(line)
                if int(line[5]) == j and int(line[2]) == 4:
                    result_writer.writerow([line[0], line[1]])
            train_file.close()
            result_file.close()
        else:
            train_file = file('splited_data/for_prediction.csv', 'w')
            train_writer = csv.writer(train_file)
            for line in reader:
                progressBar(reader.line_num, DATASET_SIZE)
                if int(line[5]) >= i and int(line[5]) < j:
                    train_writer.writerow(line)
            train_file.close()
        rfile.close()
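# The split routines in this listing all report progress through a
# progressBar(current, total) helper that is defined elsewhere in the project.
# A minimal sketch, assuming it simply redraws a text bar on stdout (the width
# argument is an assumption, not part of the original code):
import sys

def progressBar(current, total, width=50):
    # Hypothetical implementation: redraw a simple text bar in place.
    filled = int(width * current / float(total))
    bar = '#' * filled + '-' * (width - filled)
    sys.stdout.write('\r[%s] %d/%d' % (bar, current, total))
    if current >= total:
        sys.stdout.write('\n')
    sys.stdout.flush()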
def prepare_validation(filename):
    REPLICATE, DATES, INDIVIDUALS, WEIGHTED = 1, 2, 4, 8

    # Load the data, note the unique dates, replicates
    data = pd.read_csv(filename, header=None)
    dates = data[DATES].unique().tolist()
    replicates = data[REPLICATE].unique().tolist()

    # Calculate the 561H frequency for each date, replicate
    count, frequencies = 0, []
    for replicate in replicates:
        byReplicate = data[data[REPLICATE] == replicate]
        frequency = []
        for date in dates:
            byDate = byReplicate[byReplicate[DATES] == date]
            frequency.append(sum(byDate[WEIGHTED]) / sum(byDate[INDIVIDUALS]))
        if len(frequencies) != 0:
            frequencies = np.vstack((frequencies, frequency))
        else:
            frequencies = frequency

        # Update the progress bar
        count += 1
        progressBar(count, len(replicates))

    # Return the results
    return dates, frequencies
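# The frequencies returned above are stacked one row per replicate (via
# np.vstack), so downstream code can summarise across replicates column by
# column. A short usage sketch; the file name 'validation.csv' is illustrative:
import numpy as np

dates, frequencies = prepare_validation('validation.csv')
frequencies = np.atleast_2d(frequencies)

# Median and interquartile range of the 561H frequency for each date.
median = np.median(frequencies, axis=0)
lower, upper = np.percentile(frequencies, [25, 75], axis=0)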
def report(title, dates, districtData):
    for district in rwanda.DISTRICTS:
        rwanda.plot_summary(title, dates, districtData[district], district=district)
        progressBar(district, len(rwanda.DISTRICTS))
def sampling():
    cutoffLine('*')
    print 'Sampling using EasyEnsemble method'
    start_time = time.time()
    TRAIN_SET = 'training_set'
    if not os.path.exists(TRAIN_SET):
        os.mkdir(TRAIN_SET)
    proportion = 10
    negative_size = POSITIVE * proportion
    r_file = file(PRE_DIR + '/negative_set.csv', 'r')
    reader = csv.reader(r_file)
    positive_set = readCSV(PRE_DIR + '/positive_set.csv', int)
    negative_set = []
    set_count = 0
    for line in reader:
        progressBar(reader.line_num, NEGATIVE)
        line = map(int, line)
        if line[-1] == 1:
            positive_set.append(line)
        if line[-1] == 0:
            negative_set.append(line)
        # Once enough negatives are collected (or the file ends), write one
        # balanced training set and start collecting the next one
        if len(negative_set) == negative_size or reader.line_num == NEGATIVE:
            set_count += 1
            training_set = positive_set + negative_set
            random.shuffle(training_set)
            file_name = TRAIN_SET + '/' + '%d.csv' % set_count
            writeCSV(training_set, file_name)
            negative_set = []
    r_file.close()
    end_time = time.time()
    duration = timekeeper(start_time, end_time)
    cutoffLine('*')
    print 'It takes %s to sample' % duration
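# sampling() assumes readCSV and writeCSV helpers whose definitions are not
# shown in this section. A minimal sketch with signatures inferred from the
# calls above; the `cast` argument is an assumption:
import csv

def readCSV(file_name, cast=str):
    # Hypothetical helper: read a CSV file and apply `cast` to every field.
    with open(file_name, 'r') as f:
        return [[cast(field) for field in row] for row in csv.reader(f)]

def writeCSV(rows, file_name):
    # Hypothetical helper: write a list of rows out as CSV.
    with open(file_name, 'w') as f:
        csv.writer(f).writerows(rows)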
def splitData():
    cutoffLine('*')
    print 'Start split data with window %d' % WINDOW
    start_time = time.time()
    stat_file = file(PRE_DIR + '/stat.csv', 'w')
    stat_writer = csv.writer(stat_file)
    for i in range(1, FILES + 1):
        cutoffLine('-')
        print 'Split dataset %d/%d: ' % (i, FILES)
        rfile = file(DATA_SET, 'r')
        reader = csv.reader(rfile)
        j = i + WINDOW
        if j != TOTAL_DAY + 1:
            if j == TOTAL_DAY:
                train_file_name = 'test.csv'
                result_file_name = 'result_test.csv'
            else:
                train_file_name = '%d.csv' % i
                result_file_name = '%s_%d.csv' % ('result', i)
            train_file = file(PRE_DIR + '/' + train_file_name, 'w')
            result_file = file(PRE_DIR + '/' + result_file_name, 'w')
            train_writer = csv.writer(train_file)
            result_writer = csv.writer(result_file)
            train_count = 0
            result_count = 0
            for line in reader:
                progressBar(reader.line_num, DATASET_SIZE)
                if int(line[5]) >= i and int(line[5]) < j:
                    train_writer.writerow(line)
                    train_count += 1
                if int(line[5]) == j and int(line[2]) == 4:
                    result_writer.writerow([line[0], line[1]])
                    result_count += 1
            stat_writer.writerow([train_file_name, train_count])
            stat_writer.writerow([result_file_name, result_count])
            train_file.close()
            result_file.close()
        else:
            forpredict_file_name = 'for_prediction.csv'
            train_file = file(PRE_DIR + '/' + forpredict_file_name, 'w')
            train_writer = csv.writer(train_file)
            train_count = 0
            for line in reader:
                progressBar(reader.line_num, DATASET_SIZE)
                if int(line[5]) >= i and int(line[5]) < j:
                    train_writer.writerow(line)
                    train_count += 1
            stat_writer.writerow([forpredict_file_name, train_count])
            train_file.close()
        rfile.close()
    stat_file.close()
    end_time = time.time()
    duration = timekeeper(start_time, end_time)
    cutoffLine('-')
    print 'It takes ' + duration + ' to split dataset.'
    cutoffLine('*')
def process_datasets():
    print("Preparing data sets...")
    replicates = pd.read_csv(REPLICATES_LIST, header=None)
    configurations = replicates[2].unique()
    count = 0
    progressBar(count, len(configurations))
    for configuration in configurations:
        data = replicates[replicates[2] == configuration]
        filename = os.path.join(DATASET_DIRECTORY, configuration.replace('yml', 'csv'))
        merge_data(data[-50:][3].to_numpy(), filename)
        count = count + 1
        progressBar(count, len(configurations))
def prepare(filename):
    REPLICATE, DATES, DISTRICT, INDIVIDUALS, WEIGHTED = 1, 2, 3, 4, 8

    # Load the data, note the unique dates, replicates
    data = pd.read_csv(filename, header=None)
    dates = data[DATES].unique().tolist()
    replicates = data[REPLICATE].unique().tolist()

    # Build the dictionary that will be used to store data
    districtData = {}
    for district in rwanda.DISTRICTS:
        districtData[district] = {}
        for key in rwanda.REPORT_LAYOUT:
            districtData[district][key] = []

    # Start by filtering by the replicate
    count = 0
    for replicate in replicates:
        byReplicate = data[data[REPLICATE] == replicate]

        # Load the relevant data for each district
        for district in rwanda.DISTRICTS:
            byDistrict = byReplicate[byReplicate[DISTRICT] == district]

            # Load the simple data
            for key in rwanda.REPORT_LAYOUT:
                if key != 'frequency':
                    # Append the basic information
                    index = rwanda.REPORT_LAYOUT[key][rwanda.REPORT_INDEX]
                    if len(districtData[district][key]) != 0:
                        districtData[district][key] = np.vstack(
                            (districtData[district][key], byDistrict[index]))
                    else:
                        districtData[district][key] = byDistrict[index]
                else:
                    # Append the 561H frequency data
                    frequency = byDistrict[WEIGHTED] / byDistrict[INDIVIDUALS]
                    if len(districtData[district][key]) != 0:
                        districtData[district][key] = np.vstack(
                            (districtData[district][key], frequency))
                    else:
                        districtData[district][key] = frequency

        # Update the progress bar
        count += 1
        progressBar(count, len(replicates))

    # Return the results
    return dates, districtData
def main(method, filename, progress=True):
    # Load the relevant ASC data
    [ascHeader, pfpr] = load_asc("../../GIS/rwa_pfpr2to10.asc")
    [_, population] = load_asc("../../GIS/rwa_population.asc")
    [_, treatments] = load_asc("../../GIS/rwa_treatment.asc")

    beta = []
    for row in range(0, ascHeader['nrows']):
        beta.append([])
        for col in range(0, ascHeader['ncols']):
            # Append no data and continue
            if pfpr[row][col] == ascHeader['nodata']:
                beta[row].append(ascHeader['nodata'])
                continue

            # If PfPR is zero, then beta is zero
            if pfpr[row][col] == 0:
                beta[row].append(0)
                continue

            # Get the population bin, find the beta for the cell
            popBin = get_bin(population[row][col], POPULATION_BINS)
            target = round(pfpr[row][col] * 100.0, 8)
            result = method("data/calibration.csv", popBin, treatments[row][col], target)

            # Check for errors before updating the array
            if result is None:
                sys.stderr.write("Null value returned for beta, exiting\n")
                sys.exit(1)
            if result < 0:
                sys.stderr.write(
                    "Projected beta {} is less than zero, exiting\n".format(result))
                # Only exit if the debug flag isn't set
                if progress:
                    sys.exit(1)
            beta[row].append(result)

        # Note the progress
        if progress:
            progressBar(row + 1, ascHeader['nrows'])

    # Save the calculated beta values
    print("Saving {}".format(filename))
    write_asc(ascHeader, beta, filename)
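# main() relies on a get_bin helper to map a cell's population onto one of the
# calibration population bins. Its real implementation is not shown here; the
# following is only a plausible sketch, assuming POPULATION_BINS lists the bin
# ceilings and any value above the largest ceiling falls into the largest bin:
def get_bin(value, bins):
    # Hypothetical logic: return the first bin ceiling the value falls under.
    for ceiling in sorted(bins):
        if value <= ceiling:
            return ceiling
    return max(bins)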
def prepare_national(filename):
    REPLICATE, DATES, INDIVIDUALS, WEIGHTED = 1, 2, 4, 8

    # Load the data, note the unique dates, replicates
    data = pd.read_csv(filename, header=None)
    dates = data[DATES].unique().tolist()
    replicates = data[REPLICATE].unique().tolist()

    # Build the dictionary for the results
    results = {}
    for key in rwanda.REPORT_LAYOUT:
        results[key] = []

    # Start by filtering by the replicate
    count = 0
    for replicate in replicates:
        byReplicate = data[data[REPLICATE] == replicate]

        # Prepare the data structure for this replicate
        values = {}
        for key in rwanda.REPORT_LAYOUT:
            values[key] = []

        # Next, filter by date so we can properly aggregate
        for date in dates:
            byDate = byReplicate[byReplicate[DATES] == date]
            for key in rwanda.REPORT_LAYOUT:
                if key != 'frequency':
                    index = rwanda.REPORT_LAYOUT[key][rwanda.REPORT_INDEX]
                    values[key].append(sum(byDate[index]))
                else:
                    values[key].append(sum(byDate[WEIGHTED]) / sum(byDate[INDIVIDUALS]))

        # Append this replicate to our results
        for key in rwanda.REPORT_LAYOUT:
            if len(results[key]) != 0:
                results[key] = np.vstack((results[key], values[key]))
            else:
                results[key] = values[key]

        # Update the progress bar
        count += 1
        progressBar(count, len(replicates))

    # Return the results
    return dates, results
def splitData():
    stat_file = file('splited_data/stat.csv', 'w')
    stat_writer = csv.writer(stat_file)
    for i in range(1, FILES + 1):
        cutoffLine('-')
        print 'Split dataset %d: ' % i
        rfile = file(DATA_SET, 'r')
        reader = csv.reader(rfile)
        j = i + 10
        if j != TOTAL_DAY + 1:
            if j == TOTAL_DAY:
                train_file_name = 'test.csv'
                result_file_name = 'result_test.csv'
            else:
                train_file_name = '%d.csv' % i
                result_file_name = '%s_%d.csv' % ('result', i)
            train_file = file(PRE_DIR + '/' + train_file_name, 'w')
            result_file = file(PRE_DIR + '/' + result_file_name, 'w')
            train_writer = csv.writer(train_file)
            result_writer = csv.writer(result_file)
            train_count = 0
            result_count = 0
            for line in reader:
                progressBar(reader.line_num, DATASET_SIZE)
                if int(line[5]) >= i and int(line[5]) < j:
                    train_writer.writerow(line)
                    train_count += 1
                if int(line[5]) == j and int(line[2]) == 4:
                    result_writer.writerow([line[0], line[1]])
                    result_count += 1
            stat_writer.writerow([train_file_name, train_count])
            stat_writer.writerow([result_file_name, result_count])
            train_file.close()
            result_file.close()
        else:
            forpredict_file_name = 'for_prediction.csv'
            train_file = file(PRE_DIR + '/' + forpredict_file_name, 'w')
            train_writer = csv.writer(train_file)
            train_count = 0
            for line in reader:
                progressBar(reader.line_num, DATASET_SIZE)
                if int(line[5]) >= i and int(line[5]) < j:
                    train_writer.writerow(line)
                    train_count += 1
            stat_writer.writerow([forpredict_file_name, train_count])
            train_file.close()
        rfile.close()
def process_annual_data(studyId):
    # Percentage of public market treatments, drawn from configuration
    # 3: Status Quo
    # 4: Rapid Private Market Elimination
    # 5: Ten Year Private Market Elimination
    # 6: Status Quo
    # 7: Rapid Private Market Elimination
    publicmarket = {
        3: {2025: 0.832, 2030: 0.832, 2035: 0.832},
        4: {2025: 1.0, 2030: 1.0, 2035: 1.0},
        5: {2025: 0.916, 2030: 1.0, 2035: 1.0},
        6: {2025: 0.832, 2030: 0.832, 2035: 0.832},
        7: {2025: 1.0, 2030: 1.0, 2035: 1.0}
    }

    # Range of dates for the year
    ranges = {
        2025: (6575, 6606, 6634, 6665, 6695, 6726, 6756, 6787, 6818, 6848, 6879, 6909),
        2030: (8401, 8432, 8460, 8491, 8521, 8552, 8582, 8613, 8644, 8674, 8705, 8735),
        2035: (10227, 10258, 10286, 10317, 10347, 10378, 10408, 10439, 10470, 10500, 10531, 10561)
    }

    # Exit if a valid study was not provided
    if studyId not in publicmarket.keys():
        print("Invalid study provided, {}".format(studyId))
        exit(1)

    # Let the user know what we are doing
    print("Processing annual data...")
    count = 0
    progressBar(count, len(ranges.keys()))
    for key in ranges.keys():
        # Query and save the data
        data = get_annual_data(studyId, ranges[key], publicmarket[studyId][key])
        with open("out/{}-{}-annual-data.csv".format(key, studyId), "wb") as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(["filename", "replicateid", "population", "cases",
                             "reportedcases", "clinicalper1000", "pfpr2to10"])
            for row in data:
                writer.writerow(list(row))

        # Note the progress
        count = count + 1
        progressBar(count, len(ranges.keys()))
def extract_feature(file_name, line_count, start_date, result_name=''):
    r_name = PRE_DIR + '/' + file_name
    w_name = PRE_DIR + '/set_' + file_name
    r_file = file(r_name, 'r')
    w_file = file(w_name, 'w')
    reader = csv.reader(r_file)
    writer = csv.writer(w_file)

    UI_feature = {}
    for line in reader:
        progressBar(reader.line_num, line_count)
        UI = (int(line[0]), int(line[1]))
        if UI not in UI_feature:
            UI_feature[UI] = [0] * 40
            ## 0 ~ 9   view counts for the previous ten days
            ## 10 ~ 19 favourite counts for the previous ten days
            ## 20 ~ 29 add-to-cart counts for the previous ten days
            ## 30 ~ 39 purchase counts for the previous ten days
        index = int(line[5]) - start_date
        if int(line[2]) == 1:
            UI_feature[UI][index] += 1
        if int(line[2]) == 2:
            UI_feature[UI][10 + index] += 1
        if int(line[2]) == 3:
            UI_feature[UI][20 + index] += 1
        if int(line[2]) == 4:
            UI_feature[UI][30 + index] += 1
    r_file.close()

    ## Label the samples
    result_set = set()
    if result_name:
        r_name = PRE_DIR + '/' + result_name
        r_file = file(r_name, 'r')
        reader = csv.reader(r_file)
        for line in reader:
            result_set.add((int(line[0]), int(line[1])))
        r_file.close()
    if result_set:
        for UI in UI_feature:
            if UI in result_set:
                writer.writerow(list(UI) + UI_feature[UI] + [1])
            else:
                writer.writerow(list(UI) + UI_feature[UI] + [0])
    else:
        for UI in UI_feature:
            writer.writerow(list(UI) + UI_feature[UI])
    w_file.close()
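# Illustrative usage of the extractor above. The line counts and start day are
# placeholder values, not figures from the original project; in practice they
# would come from stat.csv and the split configuration.
extract_feature('1.csv', 100000, 1, 'result_1.csv')          # labelled training split
extract_feature('for_prediction.csv', 120000, 22)            # prediction split, no labels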
def process_summaries(replicates, burnIn, modelStartDate):
    # Update the user
    print("Processing {} replicate summaries...".format(len(replicates)))

    # Note the progress
    total = 0
    progressBar(total, len(replicates) + 1)

    # Iterate through all of the rows
    for replicate in replicates:
        # Only download complete summaries
        if replicate[COMPLETE] == False:
            continue

        # Check to see if the work has already been done
        filename = GENOTYPE_TEMPLATE.format(replicate[LABEL], replicate[REPLICATEID])
        if not os.path.exists(filename):
            save_genotype_summary(replicate[LABEL], replicate[REPLICATEID], burnIn)
        filename = TREATMENT_TEMPLATE.format(replicate[LABEL], replicate[REPLICATEID])
        if not os.path.exists(filename):
            save_treatment_summary(replicate[LABEL], replicate[REPLICATEID], burnIn)
        filename = FREQUENCIES_TEMPLATE.format(replicate[LABEL], replicate[REPLICATEID])
        if not os.path.exists(filename):
            save_genotype_frequencies(replicate[LABEL], replicate[REPLICATEID], burnIn, modelStartDate)

        # Note the progress
        total = total + 1
        progressBar(total, len(replicates) + 1)

    # Note that we are done
    progressBar(len(replicates) + 1, len(replicates) + 1)
def sampling(proportion):
    cutoffLine('*')
    start_time = time.time()
    print 'sampling with proportion %d...' % proportion
    negative_needed = POSITIVE * proportion
    sample_times = 10
    mod = NEGATIVE / sample_times
    negative_eachtime = negative_needed / sample_times
    training_set = readCSV(PRE_DIR + '/positive_set.csv', int)

    ## Sample the negative examples
    rfile = file(PRE_DIR + '/' + 'negative_set.csv', 'r')
    reader = csv.reader(rfile)
    negative_tmp = []
    for line in reader:
        progressBar(reader.line_num, NEGATIVE)
        negative_tmp.append(map(int, line))
        if reader.line_num % mod == 0:
            random.shuffle(negative_tmp)
            training_set = training_set + negative_tmp[0:negative_eachtime]
            negative_tmp = []
    rfile.close()

    wfile = file('data/training_set_%d.csv' % proportion, 'w')
    writer = csv.writer(wfile)
    random.shuffle(training_set)
    writer.writerows(training_set)
    wfile.close()

    cutoffLine('-')
    print "Real proportion: %f" % ((len(training_set) - POSITIVE) / float(POSITIVE))
    cutoffLine('*')
    end_time = time.time()
    duration = timekeeper(start_time, end_time)
    print 'It takes %s to sample with proportion %d' % (duration, proportion)
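# Both sampling routines also lean on cutoffLine and timekeeper helpers defined
# elsewhere. A minimal sketch, assuming cutoffLine prints a separator row and
# timekeeper formats the elapsed wall-clock time; the exact width and time
# format are assumptions:
import sys

def cutoffLine(char, width=50):
    # Hypothetical helper: print a separator line such as '****...*'.
    sys.stdout.write(char * width + '\n')

def timekeeper(start_time, end_time):
    # Hypothetical helper: format the elapsed seconds as H:MM:SS.
    seconds = int(end_time - start_time)
    return '%d:%02d:%02d' % (seconds // 3600, (seconds % 3600) // 60, seconds % 60)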
def process_frequencies(replicates, subset):
    # If there are no replicates, then return
    if len(replicates) == 0:
        return

    # Update the user and note common data
    print("Processing {} replicate frequencies...".format(len(replicates)))
    nrows = replicates[0][1]
    ncols = replicates[0][2]

    # Note the progress
    total = 0
    progressBar(total, len(replicates) + 1)

    # Iterate through all of the rows
    currentRate = None
    for replicate in replicates:
        # Reset the replicate count on a new row
        if currentRate != replicate[0]:
            if currentRate is not None:
                save_frequencies(data, currentRate)
                del data
            currentRate = replicate[0]
            data = {}

        # Run a short query to see if we have anything to work with
        for row in get_580y_frequency_subset(replicate[REPLICATEID], subset):
            days = row[0]
            if days not in data:
                data[days] = [[[0, 0, 0] for _ in range(nrows)] for _ in range(ncols)]
            c = row[1]
            r = row[2]

            # Array formatted as: 0 - infectedindividuals (query index 3)
            #                     1 - weightedoccurrences (query index 5)
            #                     2 - count
            data[days][r][c][0] += row[3]
            data[days][r][c][1] += row[5]
            data[days][r][c][2] += 1

        # Note the progress
        total = total + 1
        progressBar(total, len(replicates) + 1)

    # Save the last data set
    save_frequencies(data, currentRate)
    progressBar(total + 1, len(replicates) + 1)
def process_replicates():
    # Process the replicates to make sure we have all of the data we need locally
    print("Querying for replicates list...")
    replicates = get_replicates()
    save_csv(REPLICATES_LIST, replicates)

    print("Processing replicates...")
    count = 0
    progressBar(count, len(replicates))
    for row in replicates:
        # Check to see if we already have the data
        filename = os.path.join(REPLICATE_DIRECTORY, "{}.csv".format(row[3]))
        if os.path.exists(filename):
            continue

        # Query and store the data
        replicate = get_replicate(row[3])
        save_csv(filename, replicate)

        # Update the progress bar
        count = count + 1
        progressBar(count, len(replicates))

    if count != len(replicates):
        progressBar(len(replicates), len(replicates))
def extract_feature(window, actday, file_name, line_count, start_date, result_name=''):
    r_name = PRE_DIR + '/' + file_name
    w_name = PRE_DIR + '/set_' + file_name
    r_file = file(r_name, 'r')
    w_file = file(w_name, 'w')
    reader = csv.reader(r_file)
    writer = csv.writer(w_file)

    ## Rank items within each category, using only sales recorded before
    ## (actday - window) so that no future information leaks into the features
    ci_rank = {}
    for c in ci_sale:
        ## Total sales of each item in the category before (actday - window)
        ci_rank[c] = {}
        for item in ci_sale[c]:
            ci_rank[c][item] = sum(ci_sale[c][item][0:actday - window])
        ## Rank by sales; better-selling items get higher ranks, which makes it
        ## easy to handle items with no sales at all (their rank is set to 0)
        rank_list = sorted(ci_rank[c].iteritems(), key=lambda x: x[1])
        for index, item in enumerate(rank_list):
            item = list(item)
            item[1] = index + 1
            rank_list[index] = item
        ci_rank[c] = dict(rank_list)

    UI_feature = {}
    for line in reader:
        progressBar(reader.line_num, line_count)
        UI = (int(line[0]), int(line[1]), int(line[4]))
        if UI not in UI_feature:
            UI_feature[UI] = [0] * (window * 4)  ## counts for the 4 behaviour types
        index = int(line[5]) - start_date
        if int(line[2]) == 1:
            UI_feature[UI][index] += 1
        if int(line[2]) == 2:
            UI_feature[UI][window + index] += 1
        if int(line[2]) == 3:
            UI_feature[UI][2 * window + index] += 1
        if int(line[2]) == 4:
            UI_feature[UI][3 * window + index] += 1
    r_file.close()

    ## Append the item's rank within its category
    for UI in UI_feature:
        if ci_rank[UI[2]].has_key(UI[1]):
            UI_feature[UI].append(ci_rank[UI[2]][UI[1]])
        else:
            UI_feature[UI].append(0)

    ## Label the samples
    result_set = set()
    if result_name:
        r_name = PRE_DIR + '/' + result_name
        r_file = file(r_name, 'r')
        reader = csv.reader(r_file)
        for line in reader:
            result_set.add((int(line[0]), int(line[1])))
        r_file.close()
    if result_set:
        for UI in UI_feature:
            if (UI[0], UI[1]) in result_set:
                writer.writerow(list(UI) + UI_feature[UI] + [1])
            else:
                writer.writerow(list(UI) + UI_feature[UI] + [0])
    else:
        for UI in UI_feature:
            writer.writerow(list(UI) + UI_feature[UI])
    w_file.close()