def get_predicting_data(trainfrom, testat):
    """
    Load a training csv and a testing csv from the 'db4school' folder and
    split each into features / class label.

    Only the attributes listed in ``settings.record_attrs`` are kept as
    features; the last column of each file is the class label.

    :param trainfrom: csv (model) name to train from
    :param testat: csv (model) name to test at
    :return: x_train, y_train, x_test, y_test (lists of numeric rows/values)
    """
    def _load_xy(csv_name):
        # One load path for both files — the original duplicated this
        # block verbatim for train and test.
        h, content = toolkit.load_csv('db4school', csv_name, True)
        contentT = [list(col) for col in zip(*content)]
        # The class label is the last column of the RAW table, taken before
        # the attribute filter is applied.
        y = [toolkit.str2num(val) for val in contentT[-1]]
        kept = [col for attr, col in zip(h, contentT)
                if attr in settings.record_attrs]
        x = [[toolkit.str2num(val) for val in row] for row in zip(*kept)]
        return x, y

    x_train, y_train = _load_xy(trainfrom)
    x_test, y_test = _load_xy(testat)
    return x_train, y_train, x_test, y_test
def CLIFF(model, db_folder, write_out_folder=None):
    """
    Core function for the CLIFF algorithm.
    Prune the data set according to the power; attributes are discretized.
    :param model: should be a csv file containing the original database
    :param db_folder: the folder name of db
    :param write_out_folder: where to write out the generated data base into "write_out_folder/model.csv"
    :return: the CLIFFED database
    """
    # load the database
    ori_attrs, alldata = toolkit.load_csv(db_folder, model)
    record_attrs = settings.record_attrs

    # Column-major view; keep only the recorded attributes plus the class
    # column (the last one) — can't miss the classification.
    columns = [list(col) for col in zip(*alldata)]
    kept = [col for attr, col in zip(ori_attrs, columns) if attr in record_attrs]
    kept.append(columns[-1])

    # Back to row-major form, converting every cell to a number where possible.
    numeric_rows = [[toolkit.str2num(cell) for cell in row] for row in zip(*kept)]

    after_cliff = cliff_core(numeric_rows)
    # add the header
    after_cliff.insert(0, record_attrs + [ori_attrs[-1]])

    if write_out_folder:
        toolkit.write_csv(write_out_folder, model, after_cliff)

    return after_cliff
def module2():
    """
    Box-plot the RMSE results from the PREDICTION_report csv: one box per
    (classifier, handling) pair, linear regression on the left, decision
    tree on the right. Saves the figure to 'school.png'.
    """
    print('this is the second module in this file.')
    head, content = toolkit.load_csv(settings.project_path + '/Reports', 'PREDICTION_report')
    content = [[toolkit.str2num(c) for c in r] for r in content]

    # Row predicates over the report columns.
    lg_rmse = lambda x: x[4] == 'linear regression' and x[5] == 'RMSE'
    dt_rmse = lambda x: x[4] == 'decision tree' and x[5] == 'RMSE'
    lace1 = lambda x: x[3] == 'Lace1Out'
    lace2 = lambda x: x[3] == 'Lace2Out'
    org = lambda x: x[3] == 'NoHandle'

    v = list()
    for clf in [lg_rmse, dt_rmse]:
        for alg in [org, lace1, lace2]:
            # BUG FIX: the original used `filter(clf and alg, content)`;
            # `clf and alg` evaluates to just `alg` (a lambda is always
            # truthy), so the classifier predicate was silently ignored.
            # Apply both predicates.
            selected = [row for row in content if clf(row) and alg(row)]
            v.append([row[-1] for row in selected])

    plt.clf()
    fig = plt.figure(1)
    fig.set_size_inches(7, 5)
    ax = fig.add_subplot(111)
    box = ax.boxplot(v)
    plt.xticks(range(1, 7), ['org', 'lace1', 'lace2'] * 2)
    ax.axvspan(0, 3.5, alpha=0.3, color='gray')
    ax.text(1, 6, 'Linear regression', fontsize=10)
    ax.text(4, 6, 'Decision tree', fontsize=10)
    ax.set_ylim([0, 10])
    ax.set_title('RMSE for predicting at the whole shoolcard set')
    # BUG FIX: `bbox_inhes` was a typo; matplotlib's savefig expects
    # `bbox_inches`.
    fig.savefig('school.png', bbox_inches='tight')
def load_csv_within_region(folder, model, region):
    """
    Load a csv and keep only the rows whose state abbreviation ('STABBR'
    column) belongs to the given region.

    :param folder: folder holding the csv
    :param model: csv (model) name
    :param region: key into the REGIONS mapping
    :return: (header, filtered rows)
    """
    header, content = toolkit.load_csv(folder, model, has_header=True)
    assert 'STABBR' in header, 'please make sure region info in database'
    state_col = header.index('STABBR')
    wanted_states = REGIONS[region]
    filtered = [row for row in content if row[state_col] in wanted_states]
    return header, filtered
def get_moprhed_train(source_folder, model):
    # type: (str, str) -> list, list
    """
    Load the morphed training set for *model* from *source_folder* and
    split it into features (all columns but the last) and labels (last
    column). Cells are converted to numbers where possible.
    """
    _, raw_rows = toolkit.load_csv(source_folder, model)
    # change the numbers if possible
    rows = [[toolkit.str2num(cell) for cell in row] for row in raw_rows]
    features = [row[:-1] for row in rows]
    labels = [row[-1] for row in rows]
    return features, labels
def get_test(model):
    """
    Load the test set for *model* from the 'TestSet' folder, keeping only
    the attributes in ``settings.record_attrs`` plus the class column
    (the last one), and split into features / labels.
    """
    ori_attrs, raw_rows = toolkit.load_csv('TestSet', model)
    # change the numbers if possible
    rows = [[toolkit.str2num(cell) for cell in row] for row in raw_rows]

    # Filter columns: recorded attributes first, then the class column.
    cols = [list(col) for col in zip(*rows)]
    kept = [col for attr, col in zip(ori_attrs, cols)
            if attr in settings.record_attrs]
    kept.append(cols[-1])

    rows = [list(row) for row in zip(*kept)]
    x = [row[:-1] for row in rows]
    y = [row[-1] for row in rows]
    return x, y
def apriori_cmpr(model, org_folder, ptz_folder):
    """
    Note: ignore the class attribute. Just focus on the independent attributes.
    Discretize the original and privatized data sets with SHARED bin ranges,
    then run Apriori on each.

    :param model: database (csv file) name
    :param org_folder: folder holding the original data set
    :param ptz_folder: folder holding the privatized data set
    :return: items_org, items_ptz, rules_org, rules_ptz, dis_org_data, dis_ptz_data
    """
    def _tag_column(attr_name, col, ranges):
        # Translate a continuous column into 'attr:level' tags: each element
        # gets the index of the first bin upper bound that covers it.
        # (Extracted — the original duplicated this loop for both data sets.)
        tags = []
        for cursor, element in enumerate(col):
            for cursor, upper_bound in enumerate(ranges):
                if upper_bound >= element:
                    break
            tags.append(attr_name + ':' + str(cursor))
        return tags

    # load the data sets
    org_attrs, org_data = toolkit.load_csv(org_folder, model)
    org_data = [[toolkit.str2num(c) for c in r] for r in org_data]
    ptz_attrs, ptz_data = toolkit.load_csv(ptz_folder, model)
    ptz_data = [[toolkit.str2num(c) for c in r] for r in ptz_data]
    # drop the class column from the privatized set
    ptz_data = toolkit.del_col_in_table(ptz_data, -1)

    # delete the useless columns: keep only the recorded attributes
    attributes = settings.record_attrs
    org_dataT = [list(c) for c in zip(*org_data)]
    org_dataT = [col for col, a1 in zip(org_dataT, org_attrs) if a1 in attributes]
    org_data = [list(r) for r in zip(*org_dataT)]

    # discretize the data: translate each continuous attribute into 'attr:level'
    dis_org_data = []
    dis_ptz_data = []
    for attr_name, col1, col2 in zip(attributes, zip(*org_data), zip(*ptz_data)):
        col1 = list(col1)
        col2 = list(col2)
        # NOTE: bin ranges are computed over both data sets together so the
        # two discretizations share the same levels.
        ranges = toolkit.binrange(col1 + col2)
        dis_org_data.append(_tag_column(attr_name, col1, ranges))
        dis_ptz_data.append(_tag_column(attr_name, col2, ranges))

    # back to row-major (one transaction per row)
    dis_org_data = [list(r) for r in zip(*dis_org_data)]
    dis_ptz_data = [list(r) for r in zip(*dis_ptz_data)]
    logging.info("Database discretization done.")

    org_iter = dataset_iter(dis_org_data)
    ptz_iter = dataset_iter(dis_ptz_data)
    items_org, rules_org = runApriori(org_iter, settings.apriori_min_support,
                                      settings.apriori_min_confidence)
    items_ptz, rules_ptz = runApriori(ptz_iter, settings.apriori_min_support,
                                      settings.apriori_min_confidence)
    return items_org, items_ptz, rules_org, rules_ptz, dis_org_data, dis_ptz_data
def module1():
    """
    For each source database ('school0'..'school3'), box-plot the RMSE of
    LACE1/LACE2 predictions (linear regression and decision tree) at every
    test region, highlighting the source case in red. Saves one png per case.
    """
    header, content = toolkit.load_csv(settings.project_path + "/db4school", "precision_report")
    content = [[toolkit.str2num(c) for c in r] for r in content]

    source_db = lambda x: x[2]
    test_for = lambda x: x[4]
    cases = ['school0', 'school1', 'school2', 'school3']
    lg_rmse = lambda x: x[5] == 'linear regression' and x[6] == 'RMSE'
    dt_rmse = lambda x: x[5] == 'decision tree' and x[6] == 'RMSE'

    for case in cases:
        plt.clf()
        fig = plt.figure(1)
        fig.set_size_inches(7, 5)
        match_cases = [x for x in content if source_db(x) == case]
        lace1_cases = [x for x in match_cases if 'LACE1' in x]
        lace2_cases = [x for x in match_cases if 'LACE2' in x]

        # BUG FIX: the boxes must be grouped method-major — all four regions
        # for LACE1/regression, then LACE1/tree, LACE2/regression, LACE2/tree —
        # to line up with the axvspan/text annotations and the setp highlight
        # indices (+0/+4/+8/+12) below. The original appended region-major
        # (lace1-lg, lace1-dt, lace2-lg, lace2-dt per region), so every box
        # was plotted under the wrong label.
        v = list()
        for pool, clf in [(lace1_cases, lg_rmse), (lace1_cases, dt_rmse),
                          (lace2_cases, lg_rmse), (lace2_cases, dt_rmse)]:
            for testat in cases:
                selected = [x for x in pool if test_for(x) == testat and clf(x)]
                v.append([x[-1] for x in selected])

        ax = fig.add_subplot(111)
        box = ax.boxplot(v)
        ax.axvspan(0, 4.5, alpha=0.3, color='gray')
        ax.axvspan(8.5, 12.5, alpha=0.3, color='gray')
        ax.text(1, 6, 'LACE1\nregression', fontsize=10)
        ax.text(5, 6, 'LACE1\ndecision tree', fontsize=10)
        ax.text(9, 6, 'LACE2\nregression', fontsize=10)
        ax.text(13, 6, 'LACE2\ndecision tree', fontsize=10)
        # Highlight the box for the source case within each method group.
        plt.setp(box['boxes'][cases.index(case)], color='red')
        plt.setp(box['boxes'][cases.index(case) + 4], color='red')
        plt.setp(box['boxes'][cases.index(case) + 8], color='red')
        plt.setp(box['boxes'][cases.index(case) + 12], color='red')
        plt.xticks(range(1, 17), ['NE', 'NW', 'S', 'W'] * 4)
        ax.set_ylim([0, 10])
        ax.set_title('RMSE for prediction from region data.')
        fig.savefig(case + '.png', bbox_inches='tight')