import os
import sys

import psycopg2

import prep


def rescale_data_file(path):
    # Rescale every numeric feature in each .prescale file and write the
    # result to a matching .train file.
    for f in prep.gen_file_list(path):
        if not f.endswith('.prescale'):
            continue
        print('rescaling file: %s' % f)
        fpath = f.rsplit('/', 1)[0]
        cols = prep.get_feature_columns(fpath + '/.columns')
        domains = prep.read_domains(cols, fpath + '/.prescale.domains')
        header = prep.get_header(fpath + '/.header')
        scaled_file = f.replace('.prescale', '.train')
        fin = open(f, 'r')
        fout = open(scaled_file, 'w')
        for line in fin:
            row = line.strip().split('\t')
            for c in cols:
                if prep.get_col_type(c, header) == 'num':
                    min_val = float(domains[c]['min'])
                    max_val = float(domains[c]['max'])
                    new_val = rescale(float(row[c]), min_val, max_val, 1e6)
                    # log_val = math.log(new_val + 1)
                    row[c] = str(new_val)
            fout.write('\t'.join(row) + '\n')
        fin.close()
        fout.close()
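# `rescale` is not defined in this file. The sketch below is a guess at its
# behavior: plain min-max normalization onto [0, scale] (1e6 in the call
# above), not necessarily the original implementation.
def rescale(val, min_val, max_val, scale):
    # Map val from [min_val, max_val] onto [0, scale]; assumes
    # max_val > min_val so the denominator is nonzero.
    return (val - min_val) / (max_val - min_val) * scale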
def cluster_all_tables(data_path):
    # Run k-means over the sampled training file of each table directory;
    # currently restricted to the lineitem table.
    for d in os.listdir(data_path):
        if not os.path.isdir(data_path + '/' + d):
            continue
        if d != 'lineitem':
            continue
        print('processing %s' % d)
        full_path = data_path.rstrip('/') + '/' + d.rstrip('/') + '/'
        sample_ratio = int(open(full_path + '.ratio').read())
        data_file = '%s%s.train.%d.sample' % (full_path, d, sample_ratio)
        k = int(open(full_path + '.k').read())
        if k > 1:
            feat_cols = prep.get_feature_columns(full_path + '.columns')
            table = prep.load_file(data_file, feat_cols)
            seeds = load_means(full_path + '.means')
            # output_weka(table, 'weka.arff')
            # return
            feat_doms = prep.read_domains(feat_cols, full_path + '.domains')
            header = prep.get_header(full_path + '.header')
            print('start clustering %s' % data_file)
            # model = clustering(k, feat_cols, feat_doms, header, table,
            #                    seeds, data_file + '.res')
            labels = kmeans(k, table)
            centers = get_centers(table, labels)
            classify_data_kmeans(k, feat_cols, full_path, centers)
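# `kmeans` and `get_centers` (like `load_means` and `classify_data_kmeans`)
# are external to this file. Below is a hypothetical, compatible sketch built
# on scikit-learn and numpy, assuming `table` is a row-major list of numeric
# feature vectors; imports are shown here only for self-containment.
import numpy as np
from sklearn.cluster import KMeans


def kmeans(k, table):
    # Stand-in: cluster the sampled rows into k groups and return one
    # integer label per row.
    return KMeans(n_clusters=k).fit_predict(np.asarray(table, dtype=float))


def get_centers(table, labels):
    # Mean feature vector of each cluster, ordered by label.
    data = np.asarray(table, dtype=float)
    return [data[labels == i].mean(axis=0) for i in range(labels.max() + 1)]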
def convert_data_file(path):
    # Convert each raw .txt file into a .prescale file, row by row.
    # get date mark
    # mindate = dt.date(1992, 1, 1).toordinal()
    # maxdate = dt.date(1998, 12, 31).toordinal()
    # weight = [10.0, 4.0, 3.0]
    # weight = [(99.0 * x + 1.0) for x in np.random.sample(5)]
    # date_mark = []
    # acc = 0
    # for x in weight:
    #     acc += x
    #     date_mark.append(acc / sum(weight) * (maxdate - mindate) + mindate)
    for f in prep.gen_file_list(path):
        if not f.endswith('.txt'):
            continue
        print('converting file: %s' % f)
        header_file = f.rsplit('/', 1)[0] + '/.header'
        header = prep.get_header(header_file)
        fin = open(f, 'r')
        fout = open(f.replace('.txt', '.prescale'), 'w')
        for line in fin:
            row = line.strip().split('\t')
            new_row = convert_row(row, header)
            fout.write('\t'.join(new_row) + '\n')
        fin.close()
        fout.close()
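# `convert_row` is defined elsewhere. Judging from the commented-out
# date-mark block above, date fields are probably turned into day ordinals;
# the sketch below assumes that, and assumes a 'date' type tag from
# prep.get_col_type. Both are guesses, not the original code.
import datetime as dt


def convert_row(row, header):
    # Copy the row, replacing YYYY-MM-DD date fields with their day
    # ordinal so they can be rescaled like any other numeric column.
    new_row = []
    for c, val in enumerate(row):
        if prep.get_col_type(c, header) == 'date':  # assumed type tag
            y, m, d = (int(x) for x in val.split('-'))
            new_row.append(str(dt.date(y, m, d).toordinal()))
        else:
            new_row.append(val)
    return new_row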
def create_unclustered_index(data_path, db_name, table_name, mode='P'):
    # Build an unclustered index on every non-comment column. Mode 'P'
    # targets one table per data file, named <table_name>_<file>; mode 'I'
    # targets the single table named table_name.
    conn = psycopg2.connect("dbname=%s port=11111" % db_name)
    conn.set_isolation_level(0)  # autocommit, so each DDL commits at once
    cur = conn.cursor()
    for f in prep.gen_file_list(data_path):
        file_name = f[f.rfind('/') + 1:]
        cur_path = f[:f.rfind('/')]
        full_path = os.path.abspath(cur_path)
        if mode == 'P':
            t_name = table_name + '_' + file_name
        elif mode == 'I':
            t_name = table_name
        else:
            print('unrecognized mode: %s' % mode)
            sys.exit(1)
        print(t_name)
        # if not os.path.exists(cur_path + '/.k'):
        #     k = -1
        # else:
        #     k = int(open(cur_path + '/.k').read())
        # if k == 1:
        #     continue
        # cols = prep.get_feature_columns(cur_path + '/.columns')
        header = prep.get_header(cur_path + '/.header')
        cols = range(1, len(header))  # all columns except the first
        for col in cols:
            field_name = header[col][0]
            if 'comment' in field_name:
                continue
            idx_name = '%s_%s_%s' % (t_name, str(col), field_name)
            cur.execute('DROP INDEX IF EXISTS %s' % idx_name)
            print('creating (unclustered) index %s' % idx_name)
            cur.execute('CREATE INDEX %s ON %s (%s)'
                        % (idx_name, t_name, field_name))
    cur.close()
    conn.close()