Example #1
import prep  # local preprocessing helpers (gen_file_list, get_header, ...)


def rescale_data_file(path):
    # Rescale every numeric column of each '.prescale' file under `path`,
    # writing the result to a matching '.train' file.
    for f in prep.gen_file_list(path):
        if not f.endswith('.prescale'):
            continue

        print('rescaling file: %s' % f)
        fpath = f.rsplit('/', 1)[0]
        cols = prep.get_feature_columns(fpath + '/.columns')
        domains = prep.read_domains(cols, fpath + '/.prescale.domains')
        header = prep.get_header(fpath + '/.header')

        scaled_file = f.replace('.prescale', '.train')

        with open(f, 'r') as fin, open(scaled_file, 'w') as fout:
            for line in fin:
                row = line.strip().split('\t')
                for c in cols:
                    if prep.get_col_type(c, header) == 'num':
                        min_val = float(domains[c]['min'])
                        max_val = float(domains[c]['max'])
                        new_val = rescale(float(row[c]), min_val, max_val, 1e6)
                        # log_val = math.log(new_val + 1)
                        row[c] = str(new_val)
                fout.write('\t'.join(row) + '\n')
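
The rescale helper itself is not shown above. A minimal sketch of one
plausible implementation, assuming plain min-max normalization onto
[0, scale]; the real helper may differ (e.g. it might also apply the
commented-out log transform):

def rescale(val, min_val, max_val, scale):
    # Hypothetical reconstruction: map val from [min_val, max_val]
    # linearly onto [0, scale]; guard against a degenerate domain.
    if max_val == min_val:
        return 0.0
    return (val - min_val) / (max_val - min_val) * scale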
Example #2
import os

import prep  # local preprocessing helpers


def cluster_all_tables(data_path):
    for d in os.listdir(data_path):
        if not os.path.isdir(data_path + '/' + d):
            continue

        # Only the 'lineitem' table is clustered here.
        if d != 'lineitem':
            continue

        print('processing %s' % d)
        full_path = data_path.rstrip('/') + '/' + d + '/'
        sample_ratio = int(open(full_path + '.ratio').read())
        data_file = '%s%s.train.%d.sample' % (full_path, d, sample_ratio)

        k = int(open(full_path + '.k').read())
        if k > 1:
            feat_cols = prep.get_feature_columns(full_path + '.columns')
            table = prep.load_file(data_file, feat_cols)
            # `seeds` is only consumed by the commented-out clustering() call below.
            seeds = load_means(full_path + '.means')
            # output_weka(table, 'weka.arff')
            # return
            feat_doms = prep.read_domains(feat_cols, full_path + '.domains')
            header = prep.get_header(full_path + '.header')

            print('start clustering %s' % data_file)
            # model = clustering(k, feat_cols, feat_doms, header, table, seeds, data_file + '.res')

            labels = kmeans(k, table)
            centers = get_centers(table, labels)
            classify_data_kmeans(k, feat_cols, full_path, centers)
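
kmeans and get_centers are likewise defined elsewhere. A sketch of the
contracts this caller appears to rely on, assuming table is a list of
numeric feature rows; the scikit-learn wrapper here is an illustration,
not the project's actual implementation:

import numpy as np
from sklearn.cluster import KMeans

def kmeans(k, table):
    # Hypothetical: return one integer cluster label per row of `table`.
    return KMeans(n_clusters=k, n_init=10).fit_predict(np.asarray(table, dtype=float))

def get_centers(table, labels):
    # Hypothetical: the mean of the rows assigned to each cluster label.
    arr = np.asarray(table, dtype=float)
    labels = np.asarray(labels)
    return [arr[labels == i].mean(axis=0).tolist() for i in np.unique(labels)]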
Example #3
import prep  # local preprocessing helpers


def convert_data_file(path):
    # get date mark
    # mindate = dt.date(1992, 1, 1).toordinal()
    # maxdate = dt.date(1998, 12, 31).toordinal()
    # weight = [10.0, 4.0, 3.0]
    # weight = [(99.0 * x + 1.0) for x in np.random.sample(5)]
    # date_mark = []
    # acc = 0
    # for x in weight:
    #     acc += x
    #     date_mark.append(acc / sum(weight) * (maxdate - mindate) + mindate)

    for f in prep.gen_file_list(path):
        if not f.endswith('.txt'):
            continue

        print('converting file: %s' % f)

        header_file = f.rsplit('/', 1)[0] + '/.header'
        header = prep.get_header(header_file)

        with open(f, 'r') as fin, open(f.replace('.txt', '.prescale'), 'w') as fout:
            for line in fin:
                row = line.strip().split('\t')
                new_row = convert_row(row, header)
                fout.write('\t'.join(new_row) + '\n')
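
Together with rescale_data_file above, this gives a two-stage pipeline:
'.txt' files are converted to '.prescale', which are then rescaled to
'.train'. A minimal driver, assuming both functions live in the same
module:

if __name__ == '__main__':
    import sys
    data_path = sys.argv[1]
    convert_data_file(data_path)  # .txt -> .prescale
    rescale_data_file(data_path)  # .prescale -> .train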
Example #4
import os
import sys

import psycopg2

import prep  # local preprocessing helpers


def create_unclustered_index(data_path, db_name, table_name, mode='P'):
    conn = psycopg2.connect("dbname=%s port=11111" % db_name)
    conn.set_isolation_level(0)  # autocommit, so each DDL statement takes effect immediately
    cur = conn.cursor()

    for f in prep.gen_file_list(data_path):
        file_name = f[f.rfind('/') + 1:]
        cur_path = f[:f.rfind('/')]
        full_path = os.path.abspath(cur_path)  # absolute path of the current directory (unused below)

        if mode == 'P':  # one table per partition file
            t_name = table_name + '_' + file_name
        elif mode == 'I':  # a single table for all files
            t_name = table_name
        else:
            print('unrecognized mode: %s' % mode)
            sys.exit(1)

        print(t_name)

        # if not os.path.exists(cur_path + '/.k'):
        #     k = -1
        # else:
        #     k = int(open(cur_path + '/.k').read())
        # if k == 1:
        #     continue
        # cols = prep.get_feature_columns(cur_path + '/.columns')
        header = prep.get_header(cur_path + '/.header')
        cols = range(1, len(header))  # skip the first column

        for col in cols:
            field_name = header[col][0]
            if 'comment' in field_name:
                continue
            idx_name = '%s_%s_%s' % (t_name, str(col), field_name)
            # Identifiers cannot be bound as query parameters, so they are
            # interpolated directly; the names come from local metadata files.
            cur.execute('DROP INDEX IF EXISTS %s' % idx_name)
            print('creating (unclustered) index %s' % idx_name)
            cur.execute('CREATE INDEX %s ON %s (%s)' %
                        (idx_name, t_name, field_name))

    cur.close()
    conn.close()
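
A usage sketch for the two modes, assuming a PostgreSQL server listening
on port 11111 and illustrative database/table names:

# mode='P': one table per partition file, named <table>_<file>
create_unclustered_index('/data/tpch/lineitem', 'tpch', 'lineitem', mode='P')

# mode='I': all files indexed against a single table
create_unclustered_index('/data/tpch/lineitem', 'tpch', 'lineitem', mode='I')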