示例#1
0
def gen_query():
  """Generate the uniform-row and uniform-key query sets from CLI arguments.

  Reads three command-line arguments:
    sys.argv[1] -- path handed to discretize.load_file
    sys.argv[2] -- output name prefix; "row" / "key" is appended to it
    sys.argv[3] -- integer forwarded to both generators
                   (presumably the number of queries -- confirm against
                   gen_queries_uniform_row / gen_queries_uniform_key)
  """
  # Only the data part of the loaded file is needed; the header is ignored.
  _header, rows = discretize.load_file(sys.argv[1])

  out_prefix = sys.argv[2]
  amount = int(sys.argv[3])

  gen_queries_uniform_row(rows, out_prefix + "row", amount)
  gen_queries_uniform_key(rows, out_prefix + "key", amount)
示例#2
0
def gen_query():
    """Build both uniform query sets (by row and by key) for one data file.

    Command-line arguments used:
      sys.argv[1] -- input file, loaded through discretize.load_file
      sys.argv[2] -- prefix for the outputs; the suffixes "row" and "key"
                     are appended for the two generators
      sys.argv[3] -- integer passed unchanged to both generators
                     (presumably a query count -- verify in the generators)
    """
    loaded = discretize.load_file(sys.argv[1])
    # load_file yields a (header, data) pair; the header is not needed here.
    _unused_header, table = loaded

    prefix = sys.argv[2]
    size = int(sys.argv[3])

    gen_queries_uniform_row(table, prefix + "row", size)
    gen_queries_uniform_key(table, prefix + "key", size)
示例#3
0
def create_partitioned_db(file_name, force_split):
  """Split a raw data file into partition files and load them into PostgreSQL.

  The file ROOT_PATH + RAWDATA_PATH + file_name is loaded with
  discretize.load_file and written out as comma-separated partition files
  in the directory "<file_name>[_sp_<force_split>].splits":

    * force_split > 0: the rows are cut into `force_split` contiguous chunks
      of ceil(n_rows / force_split) rows each.
    * otherwise: one file per distinct value of the last column.

  If the directory already exists the splitting step is skipped. Afterwards
  a PostgreSQL server is started on port 11111, a database named after the
  data set is (re)created, a master table with DB_SCHEMA is created, and
  every partition file is loaded into a child table inheriting from it.

  Args:
    file_name: name of the raw data file, relative to ROOT_PATH + RAWDATA_PATH.
      It is also used as the database / table name prefix.
    force_split: number of equally sized partitions; a value <= 0 selects
      label-based partitioning.

  Returns:
    [header, data, dbname, tables], where header/data come from
    discretize.load_file, dbname is the database (and master table) name and
    tables is the os.listdir() result of the partition directory, i.e. the
    child table names.

  The process exits with status 1 if the server fails to start or to stop.
  """
  print('populating data to db')
  fname = ROOT_PATH + RAWDATA_PATH + file_name
  [header, data] = discretize.load_file(fname)

  force_name = ("_sp_" + str(force_split)) if force_split > 0 else ""
  data_dir = file_name + force_name + ".splits"
  print(data_dir)
  if os.path.exists(data_dir):
    print("seems the directory: %s exists already..." % data_dir)
    print("skipping this step")

  else:
    os.mkdir(data_dir)
    # Loop-invariant; presumably the number of columns declared in DB_SCHEMA.
    expected_cols = len(DB_SCHEMA.split(","))

    if force_split > 0:
      interval = int(math.ceil(data.shape[0] * 1.0 / force_split))
      for i in range(force_split):
        subset = np.array(data[i*interval:min((i+1)*interval, data.shape[0]), :])
        print(subset.shape)
        # `with` guarantees the file is closed even if a write fails.
        with open(data_dir + "/" + file_name + "_" + str(i), 'w') as fw:
          for row in subset:
            if len(row) != expected_cols:
              print("unmatched length!")
            fw.write(','.join(row) + '\n')

    else:
      print('partition based on labels')
      # Distinct labels are only needed for label-based partitioning.
      labels = list(set(data[:, -1]))
      for x in labels:
        print('creating partition %s' % x)
        # Rows are filtered while writing instead of materialising a subset;
        # the message below is kept so the log output stays unchanged.
        print('created subset')
        with open(data_dir + "/" + file_name + "_" + str(x), 'w') as fw:
          print('opened file')
          for row in data:
            if row[-1] != x:
              continue
            if len(row) != expected_cols:
              print("unmatched length!")
            fw.write(','.join(row) + '\n')

  # load each file into db as a table
  dbname = file_name + force_name

  if not execute_status('pg_ctl start -D %s -l pg.log -o "-p 11111"' % PGDATA_PATH):
    print("databaes failed to start...")
    sys.exit(1)

  # NOTE(review): database/table names and the COPY path are interpolated
  # directly into SQL, so file_name must be trusted and a valid identifier.
  try:
    # Presumably gives the freshly started server time to accept connections.
    sleep(2)

    conn = psycopg2.connect("dbname=postgres port=11111")
    try:
      # Level 0 is autocommit: DROP/CREATE DATABASE cannot run in a transaction.
      conn.set_isolation_level(0)
      cur = conn.cursor()
      cur.execute("DROP DATABASE IF EXISTS %s;" % dbname)
      cur.execute("CREATE DATABASE %s;" % dbname)
      cur.close()
    finally:
      conn.close()

    conn = psycopg2.connect("dbname=%s port=11111" % dbname)
    try:
      conn.set_isolation_level(0)
      cur = conn.cursor()
      tables = os.listdir(data_dir)

      # create master table
      master_table = dbname
      cur.execute("DROP TABLE IF EXISTS %s;" % master_table)
      cur.execute("CREATE TABLE %s %s;" % (master_table, DB_SCHEMA))

      # One child table per partition file, named after the file.
      for data_file in tables:
        cur.execute("DROP TABLE IF EXISTS %s;" % data_file)
        cur.execute("CREATE TABLE %s () INHERITS (%s);" % (data_file, master_table))
        cur.execute("COPY %s FROM '%s' WITH (FORMAT CSV);" %
            (data_file, ROOT_PATH + RAWDATA_PATH + data_dir + "/" + data_file))

      cur.close()
    finally:
      conn.close()
  finally:
    # Always try to stop the server, so a failed load does not leave it
    # running (which would make the next `pg_ctl start` fail).
    stopped = execute_status('pg_ctl stop -D %s' % PGDATA_PATH)
    if not stopped:
      print("databaes failed to stop...")

  # Only reached when loading succeeded; an exception above propagates as-is.
  if not stopped:
    sys.exit(1)

  return [header, data, dbname, tables]
示例#4
0
def create_partitioned_db(file_name, force_split):
    """Split a raw data file into partition files and load them into PostgreSQL.

    The file ROOT_PATH + RAWDATA_PATH + file_name is loaded with
    discretize.load_file and written out as comma-separated partition files
    in the directory "<file_name>[_sp_<force_split>].splits":

      * force_split > 0: the rows are cut into `force_split` contiguous
        chunks of ceil(n_rows / force_split) rows each.
      * otherwise: one file per distinct value of the last column.

    If the directory already exists the splitting step is skipped.
    Afterwards a PostgreSQL server is started on port 11111, a database named
    after the data set is (re)created, a master table with DB_SCHEMA is
    created, and every partition file is loaded into a child table that
    inherits from it.

    Args:
        file_name: name of the raw data file, relative to
            ROOT_PATH + RAWDATA_PATH. It is also used as the database /
            table name prefix.
        force_split: number of equally sized partitions; a value <= 0
            selects label-based partitioning.

    Returns:
        [header, data, dbname, tables], where header/data come from
        discretize.load_file, dbname is the database (and master table)
        name and tables is the os.listdir() result of the partition
        directory, i.e. the child table names.

    The process exits with status 1 if the server fails to start or stop.
    """
    print('populating data to db')
    fname = ROOT_PATH + RAWDATA_PATH + file_name
    [header, data] = discretize.load_file(fname)

    force_name = ("_sp_" + str(force_split)) if force_split > 0 else ""
    data_dir = file_name + force_name + ".splits"
    print(data_dir)
    if os.path.exists(data_dir):
        print("seems the directory: %s exists already..." % data_dir)
        print("skipping this step")

    else:
        os.mkdir(data_dir)
        # Loop-invariant; presumably the column count declared in DB_SCHEMA.
        expected_cols = len(DB_SCHEMA.split(","))

        if force_split > 0:
            n_rows = data.shape[0]
            interval = int(math.ceil(n_rows * 1.0 / force_split))
            for i in range(force_split):
                start = i * interval
                stop = min((i + 1) * interval, n_rows)
                subset = np.array(data[start:stop, :])
                print(subset.shape)
                part_path = data_dir + "/" + file_name + "_" + str(i)
                # `with` guarantees the file is closed even if a write fails.
                with open(part_path, 'w') as fw:
                    for row in subset:
                        if len(row) != expected_cols:
                            print("unmatched length!")
                        fw.write(','.join(row) + '\n')

        else:
            print('partition based on labels')
            # Distinct labels are only needed for label-based partitioning.
            labels = list(set(data[:, -1]))
            for x in labels:
                print('creating partition %s' % x)
                # Rows are filtered while writing instead of materialising a
                # subset; the message is kept so log output stays unchanged.
                print('created subset')
                part_path = data_dir + "/" + file_name + "_" + str(x)
                with open(part_path, 'w') as fw:
                    print('opened file')
                    for row in data:
                        if row[-1] != x:
                            continue
                        if len(row) != expected_cols:
                            print("unmatched length!")
                        fw.write(','.join(row) + '\n')

    # load each file into db as a table
    dbname = file_name + force_name

    if not execute_status(
            'pg_ctl start -D %s -l pg.log -o "-p 11111"' % PGDATA_PATH):
        print("databaes failed to start...")
        sys.exit(1)

    # NOTE(review): database/table names and the COPY path are interpolated
    # directly into SQL, so file_name must be trusted and a valid identifier.
    try:
        # Presumably gives the freshly started server time to accept
        # connections.
        sleep(2)

        conn = psycopg2.connect("dbname=postgres port=11111")
        try:
            # Level 0 is autocommit: DROP/CREATE DATABASE cannot run inside
            # a transaction block.
            conn.set_isolation_level(0)
            cur = conn.cursor()
            cur.execute("DROP DATABASE IF EXISTS %s;" % dbname)
            cur.execute("CREATE DATABASE %s;" % dbname)
            cur.close()
        finally:
            conn.close()

        conn = psycopg2.connect("dbname=%s port=11111" % dbname)
        try:
            conn.set_isolation_level(0)
            cur = conn.cursor()
            tables = os.listdir(data_dir)

            # create master table
            master_table = dbname
            cur.execute("DROP TABLE IF EXISTS %s;" % master_table)
            cur.execute("CREATE TABLE %s %s;" % (master_table, DB_SCHEMA))

            # One child table per partition file, named after the file.
            for data_file in tables:
                cur.execute("DROP TABLE IF EXISTS %s;" % data_file)
                cur.execute("CREATE TABLE %s () INHERITS (%s);" %
                            (data_file, master_table))
                cur.execute(
                    "COPY %s FROM '%s' WITH (FORMAT CSV);" %
                    (data_file,
                     ROOT_PATH + RAWDATA_PATH + data_dir + "/" + data_file))

            cur.close()
        finally:
            conn.close()
    finally:
        # Always try to stop the server, so a failed load does not leave it
        # running (which would make the next `pg_ctl start` fail).
        stopped = execute_status('pg_ctl stop -D %s' % PGDATA_PATH)
        if not stopped:
            print("databaes failed to stop...")

    # Only reached when loading succeeded; an exception above propagates.
    if not stopped:
        sys.exit(1)

    return [header, data, dbname, tables]