def process(table):
    """
    Migrate one rethinkdb table into postgres.

    Pipeline: export table from rethinkdb as JSON, convert JSON to CSV,
    optionally rewrite timestamps in the CSV, bulk-load the CSV into a
    ``<table>_json`` staging table, then parse the JSONB staging data into
    the final relational table.

    Relies on module-level state: ``tables`` (per-table config dicts),
    ``count``/``update``/``export`` flags, ``db`` (postgres connection
    string), ``timing``, and the sibling stage modules.

    Raises RuntimeError if ``table`` is not a known table.
    """
    if table not in tables:
        raise RuntimeError("no such table: '%s'" % table)
    if count:  # count-only mode: report the postgres row count and stop
        timing.start(table, 'count')
        print "%s:" % table,
        sys.stdout.flush()
        s = "echo 'select count(*) FROM %s' | psql %s" % (table, db)
        c = os.popen(s).read()
        # Extract the number from psql's output: it sits between the last
        # '-' of the header separator line and the "(N rows)" footer.
        # NOTE(review): fragile screen-scrape of psql output -- assumes
        # default psql formatting.
        i = c.rfind('-') + 1
        j = c.rfind("(")
        print c[i:j].strip()
        timing.done(table, 'count')
        return
    T = tables[table]
    print T
    if T.get('skip', False):
        return
    # In incremental (update) mode only tables explicitly flagged for
    # updating are processed.
    if update and not T.get('update', False):
        return
    print "get from rethinkdb as json"
    path_to_json = export_from_rethinkdb.process(table, export, update)
    print "convert json to csv"
    path_to_csv = json_to_csv.process(path_to_json, export)
    if T.get('fix_timestamps', False):
        print "fix timestamps in the csv file"
        path_to_csv = fix_timestamps.process(path_to_csv)  # returns a new path
    print "load csv into database"
    read_from_csv.process(path_to_csv)
    print "parse JSONB data in the database to relational data"
    populate_relational_table.process(table, T.get('replace', False) or not update)
def process(table, export=True, update=False):
    """
    Export one table from rethinkdb as JSON and return the path to the dump.

    - update mode: expects a pre-made incremental dump at
      /migrate/data/<table>/smc/update-<table>.json and raises if missing
      (the update query must be run separately first).
    - full mode: returns the cached dump at
      /migrate/data/<table>/smc/<table>.json, re-exporting only when
      ``export`` is true or the data directory does not exist yet.

    Shells out to ``rethinkdb export``; for the ``accounts`` table it first
    runs a coffeescript pass to deduplicate repeated email addresses.

    Raises RuntimeError on any failing subprocess.
    """
    out = '/migrate/data/%s' % table
    if update:
        path_to_json = out + '/smc/update-%s.json' % table
        if not os.path.exists(path_to_json):
            raise RuntimeError("run the update query")
        return path_to_json
    else:
        path_to_json = out + '/smc/%s.json' % table
    # No data directory at all -> there is nothing cached, so force export.
    if not os.path.exists(out):
        export = True
    if not export:
        return path_to_json
    timing.start(table, 'export_from_rethinkdb')
    # Blow away any stale previous export before re-exporting.
    if os.path.exists(out):
        os.system("rm -rf %s" % out)
    if table == 'accounts':
        # Deduplicate repeated email addresses first, since accounts.email
        # must be unique in the postgres schema (presumably -- see the
        # repeated_emails.coffee script).
        s = "cd /migrate/smc/src&& . smc-env&& cd /migrate/smc/src/scripts/postgresql/migrate/&&time coffee repeated_emails.coffee"
        print s
        if os.system(s):
            raise RuntimeError("error deduplicating emails")
    s = "time rethinkdb export --password-file /migrate/secrets/rethinkdb --format json -d %s -c db3 -e smc.%s" % (
        out, table)
    print s
    if os.system(s):
        raise RuntimeError("error exporting from rethinkdb - %s" % table)
    timing.done(table, 'export_from_rethinkdb')
    return path_to_json
def process(table, export=True, update=False):
    """
    Dump one rethinkdb table to JSON and return the dump's path.

    In update mode the incremental dump must already exist (raises
    RuntimeError otherwise). In full mode a cached dump is reused unless
    ``export`` is requested or no data directory exists. The ``accounts``
    table gets an email-deduplication pass before exporting.
    """
    data_dir = '/migrate/data/%s' % table

    if update:
        # Incremental dump is produced by a separate update query.
        dump = data_dir + '/smc/update-%s.json' % table
        if not os.path.exists(dump):
            raise RuntimeError("run the update query")
        return dump

    dump = data_dir + '/smc/%s.json' % table
    if not os.path.exists(data_dir):
        # Nothing cached on disk, so exporting is mandatory.
        export = True
    if not export:
        return dump

    timing.start(table, 'export_from_rethinkdb')
    if os.path.exists(data_dir):
        # Remove any stale export before writing a fresh one.
        os.system("rm -rf %s" % data_dir)

    if table == 'accounts':
        dedup = "cd /migrate/smc/src&& . smc-env&& cd /migrate/smc/src/scripts/postgresql/migrate/&&time coffee repeated_emails.coffee"
        print(dedup)
        if os.system(dedup):
            raise RuntimeError("error deduplicating emails")

    cmd = "time rethinkdb export --password-file /migrate/secrets/rethinkdb --format json -d %s -c db3 -e smc.%s" % (data_dir, table)
    print(cmd)
    if os.system(cmd):
        raise RuntimeError("error exporting from rethinkdb - %s" % table)
    timing.done(table, 'export_from_rethinkdb')
    return dump
def process(x):
    """
    Bulk-load a CSV dump into a postgres staging table ``<name>_json``.

    ``x`` is the path to a CSV file produced by the earlier pipeline
    stages; the staging table name is derived from the filename with the
    '-time' suffix and 'update-' prefix stripped. Each CSV row is a single
    JSONB column ``a``; the control characters \\1 and \\2 are used as
    delimiter/quote so the JSON text passes through the CSV parser intact.

    Raises RuntimeError if the psql load fails. Uses module globals
    ``timing`` and ``db``.
    """
    base, ext = os.path.splitext(x)
    name = os.path.split(base)[1]
    # Strip decorations added by the fix_timestamps and update stages so we
    # recover the underlying table name.
    if name.endswith('-time'):
        name = name[:-len('-time')]
    if name.startswith('update-'):
        name = name[len('update-'):]
    timing.start(name, 'read_from_csv')
    s = """time echo "drop table %s_json; create table %s_json (a JSONB); copy %s_json from '%s' with (format csv, DELIMITER e'\\1', QUOTE e'\\2');" | psql %s """ % (name, name, name, os.path.abspath(x), db)
    print(s)
    if os.system(s):
        # BUGFIX: the original message said "error exporting from rethinkdb",
        # copy-pasted from the export stage; this stage loads a CSV.
        raise RuntimeError("error loading csv into database - %s" % x)
    timing.done(name, 'read_from_csv')
def process(file):
    """
    Rewrite every JSON record in ``file`` with timestamps fixed, writing
    the result to ``<base>-time.csv``; returns the output path.

    Idempotent: if the output file already exists it is returned untouched.
    Each input line is one JSON document; it is parsed, passed through the
    module-level ``fix_timestamps`` transform, and re-serialized one per
    line.
    """
    print "fix timestamps in %s" % file
    base = os.path.splitext(file)[0]
    out_filename_csv = base + '-time.csv'
    if os.path.exists(out_filename_csv):
        print("output file %s already exists; not overwriting it" % out_filename_csv)
        return out_filename_csv
    timing.start(os.path.split(base)[-1], 'fix_timestamps')
    out = open(out_filename_csv, 'w')
    # x[:-1] drops the trailing newline before parsing each JSON record.
    for x in open(file).xreadlines():
        out.write(json.dumps(fix_timestamps(json.loads(x[:-1]))) + '\n')
    out.close()
    timing.done(os.path.split(base)[-1], 'fix_timestamps')
    return out_filename_csv
def process(file):
    """
    Rewrite every JSON record in ``file`` with timestamps fixed, writing
    the result to ``<base>-time.csv``; returns the output path.

    Idempotent: if the output file already exists it is returned untouched.
    Each input line is one JSON document; it is parsed, run through the
    module-level ``fix_timestamps`` transform, and re-serialized one per
    line.

    Fixes over the previous version:
    - both files are managed with ``with`` so neither leaks on error
      (the input file was never closed; the output leaked on exception);
    - direct file iteration replaces the removed-in-py3 ``.xreadlines()``;
    - ``rstrip('\\n')`` replaces ``x[:-1]``, which chopped a JSON character
      off the final line when the file lacked a trailing newline.
    """
    print("fix timestamps in %s" % file)
    base = os.path.splitext(file)[0]
    out_filename_csv = base + '-time.csv'
    if os.path.exists(out_filename_csv):
        print("output file %s already exists; not overwriting it" % out_filename_csv)
        return out_filename_csv
    name = os.path.split(base)[-1]
    timing.start(name, 'fix_timestamps')
    with open(file) as src, open(out_filename_csv, 'w') as out:
        for line in src:
            out.write(json.dumps(fix_timestamps(json.loads(line.rstrip('\n')))) + '\n')
    timing.done(name, 'fix_timestamps')
    return out_filename_csv
def process(path_to_json, do_it=True):
    """
    Convert a rethinkdb JSON dump to a one-record-per-line CSV and return
    the CSV path. If the CSV already exists and ``do_it`` is false, the
    cached file is returned without reconverting.
    """
    stem = os.path.splitext(path_to_json)[0]
    target = "%s.csv" % stem
    if not os.path.exists(target):
        do_it = True  # nothing cached -- conversion is mandatory
    if not do_it:
        return target
    label = os.path.split(stem)[-1]
    timing.start(label, 'json_to_csv')
    # sed strips each record's trailing comma; head/tail drop the opening
    # and closing JSON-array bracket lines; grep -v skips any record
    # containing \u0000, which is not valid/meaningful for postgres and
    # occurs only in a tiny handful of unimportant records.
    cmd = "time sed 's/,$//' %s | head -n -1 | tail -n +2 | grep -v '\\\\u0000' > %s" % (
        path_to_json, target)
    print(cmd)
    if os.system(cmd):
        raise RuntimeError("error converting json to csv - %s" % path_to_json)
    timing.done(label, 'json_to_csv')
    return target
def process(table, replace=False):
    """
    Parse the JSONB staging data in ``<table>_json`` into the final
    relational table, then drop the staging table.

    If ``replace`` is true the target table is emptied first. Triggers on
    the table are disabled for the duration of the load and re-enabled in
    a ``finally`` so they come back even if the load fails. Uses module
    globals ``db``, ``path`` (location of the sql/ scripts) and ``timing``.

    Raises RuntimeError if any psql step fails.
    """
    timing.start(table, 'populate_relational_table')
    try:
        disable_triggers(table)
        if replace:
            s = 'echo "DELETE FROM %s;" | psql %s' % (table, db)
            print(s)
            if os.system(s):
                raise RuntimeError("error populating relational data - %s - dropping table failed" % table)
        # ON_ERROR_STOP makes psql exit non-zero on the first SQL error so
        # failures are detected here rather than silently ignored.
        s = "psql --set ON_ERROR_STOP=1 -d %s -a -f %s/sql/import-%s_json.sql" % (db, path, table)
        print(s)
        if os.system(s):
            raise RuntimeError("error populating relational data - %s" % table)
        s = "time echo 'drop table %s_json' | psql %s" % (table, db)
        print(s)
        if os.system(s):
            raise RuntimeError("error dropping intermediate table %s_json" % table)
        # NOTE: timing.done is intentionally inside the try, so a failed run
        # records no completion time.
        timing.done(table, 'populate_relational_table')
    finally:
        enable_triggers(table)