예제 #1
0
def process(table):
    """Run the migration pipeline for one table, or just print its row count.

    Reads the module-level names ``tables``, ``count``, ``update``, ``export``
    and ``db``, which are defined outside this function.

    When ``count`` is truthy, the row count reported by psql for ``table`` is
    printed and nothing else is done.  Otherwise the table is exported from
    RethinkDB as JSON, converted to CSV, optionally passed through the
    timestamp fixer, loaded into the database and finally turned into
    relational data.  Tables whose entry has a truthy ``skip`` are ignored,
    and in update mode so are tables whose entry lacks a truthy ``update``.

    Raises:
        RuntimeError: if ``table`` is not a key of ``tables``.
    """
    if table not in tables:
        raise RuntimeError("no such table: '%s'" % table)
    if count:
        # only count
        timing.start(table, 'count')
        # Emit and flush the label first so it is visible while psql runs;
        # write() is used because it behaves the same on Python 2 and 3.
        sys.stdout.write("%s: " % table)
        sys.stdout.flush()
        s = "echo 'select count(*) FROM %s' | psql %s" % (table, db)
        pipe = os.popen(s)
        try:
            c = pipe.read()
        finally:
            # Close the pipe explicitly instead of relying on garbage collection.
            pipe.close()
        # Keep the text between the last '-' and the last '(' of the output
        # (presumably psql's dashed header rule and its "(1 row)" footer).
        i = c.rfind('-') + 1
        j = c.rfind("(")
        print(c[i:j].strip())
        timing.done(table, 'count')
        return

    T = tables[table]
    print(T)
    if T.get('skip', False):
        return
    if update and not T.get('update', False):
        return
    print("get from rethinkdb as json")
    path_to_json = export_from_rethinkdb.process(table, export, update)
    print("convert json to csv")
    path_to_csv = json_to_csv.process(path_to_json, export)
    if T.get('fix_timestamps', False):
        print("fix timestamps in the csv file")
        path_to_csv = fix_timestamps.process(path_to_csv)  # path changes
    print("load csv into database")
    read_from_csv.process(path_to_csv)
    print("parse JSONB data in the database to relational data")
    # A full replace is done when the table asks for it or when this is not
    # an incremental update.
    populate_relational_table.process(table,
                                      T.get('replace', False) or not update)
예제 #2
0
def process(table, export=True, update=False):
    """Export ``table`` from RethinkDB as JSON and return the JSON file path.

    Args:
        table: name of the table; the export directory is
            ``/migrate/data/<table>``.
        export: when false, an existing export directory is reused and no
            export is run.  Forced to true if that directory does not exist.
        update: when true, no export is run at all; the path of the
            pre-generated ``update-<table>.json`` file is returned.

    Returns:
        Path to the JSON file for the table.

    Raises:
        RuntimeError: if the update file is missing in update mode, or if the
            email deduplication script or the rethinkdb export exits non-zero.
    """
    out = '/migrate/data/%s'%table
    if update:
        path_to_json = out + '/smc/update-%s.json'%table
        if not os.path.exists(path_to_json):
            raise RuntimeError("run the update query")
        # Return here: falling through to a full export would "rm -rf" the
        # directory containing the update file and leave the returned path
        # pointing at nothing.
        return path_to_json
    path_to_json = out + '/smc/%s.json'%table
    if not os.path.exists(out):
        export = True
    if not export:
        return path_to_json
    timing.start(table, 'export_from_rethinkdb')
    if os.path.exists(out):
        # Remove any previous export before writing a fresh one.
        os.system("rm -rf %s"%out)
    if table == 'accounts':
        # The accounts table gets an extra deduplication pass before export.
        s = "cd /migrate/smc/src&& . smc-env&& cd /migrate/smc/src/scripts/postgresql/migrate/&&time coffee repeated_emails.coffee"
        print(s)
        if os.system(s):
            raise RuntimeError("error deduplicating emails")
    s = "time rethinkdb export --password-file /migrate/secrets/rethinkdb --format json  -d %s -c db3 -e smc.%s"%(
        out, table)
    print(s)
    if os.system(s):
        raise RuntimeError("error exporting from rethinkdb - %s"%table)
    timing.done(table, 'export_from_rethinkdb')
    return path_to_json
예제 #3
0
def process(table, export=True, update=False):
    """Export ``table`` from RethinkDB as JSON and return the JSON file path.

    Args:
        table: name of the table; the export directory is
            ``/migrate/data/<table>``.
        export: when false, an existing export directory is reused and no
            export is run.  Forced to true if that directory does not exist.
        update: when true, no export is run at all; the path of the
            pre-generated ``update-<table>.json`` file is returned.

    Returns:
        Path to the JSON file for the table.

    Raises:
        RuntimeError: if the update file is missing in update mode, or if the
            email deduplication script or the rethinkdb export exits non-zero.
    """
    out = '/migrate/data/%s' % table
    if update:
        path_to_json = out + '/smc/update-%s.json' % table
        if not os.path.exists(path_to_json):
            raise RuntimeError("run the update query")
        # Return here: falling through to a full export would "rm -rf" the
        # directory containing the update file and leave the returned path
        # pointing at nothing.
        return path_to_json
    path_to_json = out + '/smc/%s.json' % table
    if not os.path.exists(out):
        export = True
    if not export:
        return path_to_json
    timing.start(table, 'export_from_rethinkdb')
    if os.path.exists(out):
        # Remove any previous export before writing a fresh one.
        os.system("rm -rf %s" % out)
    if table == 'accounts':
        # The accounts table gets an extra deduplication pass before export.
        s = "cd /migrate/smc/src&& . smc-env&& cd /migrate/smc/src/scripts/postgresql/migrate/&&time coffee repeated_emails.coffee"
        print(s)
        if os.system(s):
            raise RuntimeError("error deduplicating emails")
    s = "time rethinkdb export --password-file /migrate/secrets/rethinkdb --format json  -d %s -c db3 -e smc.%s" % (
        out, table)
    print(s)
    if os.system(s):
        raise RuntimeError("error exporting from rethinkdb - %s" % table)
    timing.done(table, 'export_from_rethinkdb')
    return path_to_json
예제 #4
0
def process(x):
    """Load the CSV file ``x`` into the staging table ``<name>_json`` via psql.

    ``<name>`` is the file's base name without its extension, with a trailing
    ``-time`` and then a leading ``update-`` removed when present.  The staging
    table is dropped, recreated with a single JSONB column ``a`` and filled
    with a ``copy ... from`` of the file.  The target database comes from the
    module-level ``db``.

    Raises:
        RuntimeError: if the psql pipeline exits with a non-zero status.
    """
    base = os.path.splitext(x)[0]
    name = os.path.split(base)[1]
    if name.endswith('-time'):
        name = name[:-len('-time')]
    if name.startswith('update-'):
        name = name[len('update-'):]
    timing.start(name, 'read_from_csv')
    # Bytes \1 and \2 serve as delimiter and quote characters, presumably so
    # that each whole line is read as one JSONB value -- TODO confirm.
    s = """time echo "drop table %s_json; create table %s_json (a JSONB); copy %s_json from '%s' with (format csv, DELIMITER e'\\1', QUOTE e'\\2');" | psql %s """%(name, name, name, os.path.abspath(x), db)
    print(s)
    if os.system(s):
        # This step loads a CSV into the database; report it as such rather
        # than as a rethinkdb export failure.
        raise RuntimeError("error loading csv into database - %s"%x)
    timing.done(name, 'read_from_csv')
예제 #5
0
def process(file):
    """Rewrite ``file`` line by line through ``fix_timestamps``.

    Each line of ``file`` is parsed as JSON, passed to ``fix_timestamps`` and
    written back as one JSON document per line to ``<base>-time.csv``, where
    ``<base>`` is ``file`` without its extension.  If that output file already
    exists it is left untouched and its path is returned immediately.

    Returns:
        Path of the ``-time.csv`` output file.
    """
    print("fix timestamps in %s"%file)
    base = os.path.splitext(file)[0]
    out_filename_csv = base + '-time.csv'
    if os.path.exists(out_filename_csv):
        print("output file %s already exists; not overwriting it"%out_filename_csv)
        return out_filename_csv
    name = os.path.split(base)[-1]
    timing.start(name, 'fix_timestamps')
    # Write to a temporary name and rename on success, so that a run that
    # fails midway cannot leave a partial file that the "already exists"
    # check above would silently reuse next time.
    tmp_filename = out_filename_csv + '.tmp'
    try:
        with open(file) as src, open(tmp_filename, 'w') as out:
            for x in src:
                # json.loads ignores the trailing newline, so a final line
                # without one is not truncated.
                out.write(json.dumps(fix_timestamps(json.loads(x))) + '\n')
        os.rename(tmp_filename, out_filename_csv)
    finally:
        # Only still present if something above failed.
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)
    timing.done(name, 'fix_timestamps')
    return out_filename_csv
예제 #6
0
def process(file):
    """Rewrite ``file`` line by line through ``fix_timestamps``.

    Each line of ``file`` is parsed as JSON, passed to ``fix_timestamps`` and
    written back as one JSON document per line to ``<base>-time.csv``, where
    ``<base>`` is ``file`` without its extension.  If that output file already
    exists it is left untouched and its path is returned immediately.

    Returns:
        Path of the ``-time.csv`` output file.
    """
    print("fix timestamps in %s" % file)
    base = os.path.splitext(file)[0]
    out_filename_csv = base + '-time.csv'
    if os.path.exists(out_filename_csv):
        print("output file %s already exists; not overwriting it" %
              out_filename_csv)
        return out_filename_csv
    name = os.path.split(base)[-1]
    timing.start(name, 'fix_timestamps')
    # Write to a temporary name and rename on success, so that a run that
    # fails midway cannot leave a partial file that the "already exists"
    # check above would silently reuse next time.
    tmp_filename = out_filename_csv + '.tmp'
    try:
        with open(file) as src, open(tmp_filename, 'w') as out:
            for x in src:
                # json.loads ignores the trailing newline, so a final line
                # without one is not truncated.
                out.write(json.dumps(fix_timestamps(json.loads(x))) + '\n')
        os.rename(tmp_filename, out_filename_csv)
    finally:
        # Only still present if something above failed.
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)
    timing.done(name, 'fix_timestamps')
    return out_filename_csv
예제 #7
0
def process(path_to_json, do_it=True):
    """Turn a JSON export file into a CSV file next to it and return its path.

    The CSV path is ``path_to_json`` with its extension replaced by ``.csv``.
    The conversion is skipped only when ``do_it`` is false and the CSV file
    already exists.

    Raises:
        RuntimeError: if the shell pipeline exits with a non-zero status.
    """
    stem = os.path.splitext(path_to_json)[0]
    path_to_csv = "%s.csv" % stem
    csv_present = os.path.exists(path_to_csv)
    if csv_present and not do_it:
        return path_to_csv

    label = os.path.split(stem)[-1]
    timing.start(label, 'json_to_csv')
    # Pipeline: strip a trailing comma from every line, drop the last and the
    # first line, then drop any record containing a \u0000 escape.  Such
    # records are not valid/meaningful for postgres and occur only in a very
    # small handful of non-important records.
    command = "time sed 's/,$//' %s | head -n -1 | tail -n +2 | grep -v '\\\\u0000' > %s" % (
        path_to_json, path_to_csv)
    print(command)
    status = os.system(command)
    if status:
        raise RuntimeError("error converting json to csv - %s" % path_to_json)
    timing.done(label, 'json_to_csv')
    return path_to_csv
예제 #8
0
def process(table, replace=False):
    """Populate the relational table ``table`` from its ``<table>_json`` data.

    With triggers disabled for the duration, this optionally deletes all rows
    of ``table`` (when ``replace`` is true), runs the SQL script
    ``<path>/sql/import-<table>_json.sql`` through psql, and then drops the
    intermediate ``<table>_json`` table.  Triggers are re-enabled even when a
    step fails.  ``db`` and ``path`` are module-level names.

    Raises:
        RuntimeError: if any of the shell commands exits non-zero.
    """
    def run_or_fail(command, failure_message):
        # Echo the command, run it, and turn a non-zero exit into an error.
        print(command)
        if os.system(command):
            raise RuntimeError(failure_message)

    timing.start(table, 'populate_relational_table')
    try:
        disable_triggers(table)
        if replace:
            run_or_fail(
                'echo "DELETE FROM %s;" | psql %s' % (table, db),
                "error populating relational data - %s - dropping table failed" % table)
        run_or_fail(
            "psql --set ON_ERROR_STOP=1 -d %s -a -f %s/sql/import-%s_json.sql" % (db, path, table),
            "error populating relational data - %s" % table)
        run_or_fail(
            "time echo 'drop table %s_json' | psql %s" % (table, db),
            "error dropping intermediate table %s_json" % table)
        timing.done(table, 'populate_relational_table')
    finally:
        enable_triggers(table)