Example #1
def rescale_data_file(path):
  for f in prep.gen_file_list(path):
    if not f.endswith('.prescale'):
      continue

    print('rescaling file: %s' % f)
    fpath = f.rsplit('/', 1)[0]
    cols = prep.get_feature_columns(fpath + '/.columns')
    domains = prep.read_domains(cols, fpath + '/.prescale.domains')
    header = prep.get_header(fpath + '/.header')

    scaled_file = f.replace('.prescale', '.train')

    # Stream the tab-separated input and write the rescaled copy.
    with open(f, 'r') as fin, open(scaled_file, 'w') as fout:
      for line in fin:
        row = line.strip().split('\t')
        for c in cols:
          if prep.get_col_type(c, header) == 'num':
            # Rescale numeric values into [0, 1e6] using the per-column
            # min/max recorded in the domains file.
            min_val = float(domains[c]['min'])
            max_val = float(domains[c]['max'])
            row[c] = str(rescale(float(row[c]), min_val, max_val, 1e6))
        fout.write('\t'.join(row) + '\n')
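The rescale() helper called above is not shown in this example. A minimal sketch of a linear min-max scaler that matches the call site rescale(value, min_val, max_val, scale) could look like the following; the body is an assumption inferred from the arguments, not the original project's implementation:

def rescale(val, min_val, max_val, scale):
  # Assumed helper (not in the original source): linearly map val
  # from [min_val, max_val] onto [0, scale].
  if max_val == min_val:
    return 0.0
  return (val - min_val) / (max_val - min_val) * scale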
Example #2
def clustering(k, feature_cols, feature_domains, header, table, seeds, result_file):
  best_loglike = None
  best_model = None

  data = mx.DataSet()
  data.fromArray(table)

  # Random-restart loop: build a k-component mixture, fit it with EM,
  # and keep the model with the best log-likelihood (one restart here).
  for r in range(1):
    weights_norm = [1.0 / k] * k  # start from uniform component weights
    components = []
    for i in range(k):
      products = []
      for j in range(table.shape[1]):
        col_type = prep.get_col_type(feature_cols[j], header)
        col_id = feature_cols[j]

        if col_type == 'cat':
          # Categorical feature: random discrete distribution over its domain.
          vals = list(feature_domains[col_id].keys())
          cnt_vals = len(vals)
          rand_dist = np.random.random_sample(cnt_vals)
          dist = mx.DiscreteDistribution(cnt_vals, rand_dist / sum(rand_dist),
                                         mx.Alphabet(vals))
        elif col_type == 'num':
          # Numeric feature: Gaussian seeded from the supplied per-cluster
          # means, with a spread derived from the column's range.
          min_val = feature_domains[col_id]['min']
          max_val = feature_domains[col_id]['max']
          mean = seeds[header[col_id][0]][i]
          stdev = (max_val - min_val) / 2.0 / k
          dist = mx.NormalDistribution(mean, stdev)
        else:
          sys.exit(1)  # unknown column type
        products.append(dist)

      components.append(mx.ProductDistribution(products))

    mix_table = mx.MixtureModel(k, weights_norm, components)
    print(mix_table)

    loglike = mix_table.randMaxEM(data, 1, 50, 50)
    if best_loglike is None or loglike > best_loglike:
      best_loglike = loglike
      best_model = copy.copy(mix_table)

  labels = best_model.classify(data, None, None, 1)

  ## output clustering results

  # count cluster sizes on sampled data
  total = len(labels)
  cnt = {}
  for l in labels:
    cnt[l] = 1 if l not in cnt else cnt[l] + 1

  with open(result_file + '.stats', 'w') as f:
    for l in cnt:
      f.write('%s %d %f%%\n' % (l, cnt[l], cnt[l] * 100.0 / total))

  mx.writeMixture(best_model, result_file + '.model')
  return best_model
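The mx, np, prep, copy, and sys names above come from the surrounding project. The mixture-model calls match the PyMix library's API (DataSet, MixtureModel, randMaxEM, classify, writeMixture), so a plausible import block is sketched below; treat it as an assumption about the original environment rather than a confirmed setup:

import copy
import sys

import numpy as np
import mixture as mx  # PyMix; assumed binding for the 'mx' used above
# 'prep' is a project-local preprocessing module and is not reproduced here.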