def test_all():
    connstr = os.environ['TEST_PSQLCONN']
    dbreq = PsqlRequests(psycopg2.connect(connstr))
    tables = collection_tables("a_inserts").tables
    for table in tables:
        create_table = generate_create_table_statement(tables[table], "", "")
        print create_table
        dbreq.cursor.execute(create_table)
        indexes = dbreq.get_table_max_indexes(tables[table], "")
        inserts = generate_insert_queries(tables[table], "", "",
                                          initial_indexes=indexes)
        for query in inserts[1]:
            dbreq.cursor.execute(inserts[0], query)
    dbreq.cursor.execute('COMMIT')
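# Note (illustrative, not part of the original tests): the tests in this file
# read a libpq/psycopg2 connection string from the TEST_PSQLCONN environment
# variable. A hypothetical local setup might export something like
#   TEST_PSQLCONN="host=localhost dbname=test user=postgres"
# since psycopg2.connect() accepts such a conninfo string directly.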
def test_tables2():
    """ Test collection with id of INT type """
    collection_name = 'a_somethings'
    tables = collection_tables(collection_name)
    root_t = tables.tables[collection_name]
    assert ('id' in root_t.sql_columns)
    assert (root_t.sql_columns['id'].values[0] == 777)
    comments_t = tables.tables['a_something_comments']
    assert ('a_somethings_id' in comments_t.sql_columns)
    assert (comments_t.sql_columns['a_somethings_id'].values[0] == 777)
    items_t = tables.tables['a_something_comment_items']
    assert ('a_somethings_id' in items_t.sql_columns)
    assert ('customer_id_bsontype' in root_t.sql_columns)
    print root_t.sql_columns['customer_id_bsontype'].values
    assert (root_t.sql_columns['customer_id_bsontype'].values[0] == 7)
    assert (root_t.sql_columns['customer_id_oid'].values[0]
            == '56b8f05cf9fcee1b00000000')
    # test load / unload data
    connstr = os.environ['TEST_PSQLCONN']
    dbreq = PsqlRequests(psycopg2.connect(connstr))
    insert_tables_data_into_dst_psql(dbreq, tables, '', '')
    create_psql_index(root_t, dbreq, '', '')
    loaded_tables = load_single_rec_into_tables_obj(dbreq,
                                                    tables.schema_engine,
                                                    '', 777)
    assert (tables.compare(loaded_tables) == True)
def main():
    """ main """
    parser = argparse.ArgumentParser()
    parser.add_argument("--config-file", action="store",
                        help="Settings file", type=file, required=True)
    parser.add_argument("--psql-section",
                        help="Psql section name from config",
                        type=str, required=True)
    parser.add_argument("-cn", "--collection-name",
                        help="Mongo collection name",
                        type=str, required=True)
    parser.add_argument("--psql-table-name", type=str, required=True)
    parser.add_argument("--psql-table-prefix", type=str, required=False)
    parser.add_argument("--input-csv-dir", type=str, required=True)
    args = parser.parse_args()

    config = configparser.ConfigParser()
    config.read_file(args.config_file)
    schema_name = config['psql']['psql-schema-name']
    schemas_dir = config['misc']['schemas-dir']
    schema_path = join(schemas_dir, args.collection_name + '.json')
    schema_file = open(schema_path, 'r')
    psql_settings = psql_settings_from_config(config, args.psql_section)

    table_prefix = ""
    if args.psql_table_prefix:
        table_prefix = args.psql_table_prefix

    schema = SchemaEngine(args.collection_name, [load(schema_file)])
    table = create_tables_load_bson_data(schema, None)\
        .tables[args.psql_table_name]
    dbreq = PsqlRequests(psql_conn_from_settings(psql_settings))
    create_psql_table(table, dbreq, schema_name, table_prefix, drop=True)
    create_psql_index(table, dbreq, schema_name, table_prefix)
    dbreq.conn.commit()

    csv_files = [f
                 for f in listdir(args.input_csv_dir)
                 if isfile(join(args.input_csv_dir, f))]
    csv_files.sort()
    for name in csv_files:
        csvpath = join(args.input_csv_dir, name)
        with open(csvpath, 'rb') as csv_f:
            schema_name_subst = schema_name
            if len(schema_name):
                schema_name_subst += '.'
            tname = '%s"%s%s"' % (schema_name_subst, table_prefix,
                                  args.psql_table_name)
            copy_from_csv(dbreq, csv_f, tname)
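# A minimal sketch of the settings file this script reads (only the keys
# accessed above are shown; the contents of the section named by
# --psql-section depend on psql_settings_from_config and are left out,
# and the values below are hypothetical):
#   [psql]
#   psql-schema-name = public
#   [misc]
#   schemas-dir = /path/to/schemas
#   [<section passed via --psql-section>]
#   ...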
def test_complete_partial_record2():
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)-8s %(message)s')
    PSQL_SCHEMA_NAME = ''
    # etalon of data
    sample_data_before = {
        'posts2': {
            'id': [133],
            "updated_at": [loads('{ "$date" : "2016-02-08T20:02:12.985Z"}')]
        },
        'posts2_comments': {
            'idx': [1, 2],
            'id_oid': [
                str(loads('{ "$oid": "56b8f35ef9fcee1b0000001a" }')),
                str(loads('{ "$oid": "56b8f344f9fcee1b00000018" }'))
            ],
            'updated_at': [
                loads('{ "$date" : "2016-02-08T20:02:12.985Z"}'),
                loads('{ "$date" : "2016-02-08T20:02:12.985Z"}')
            ]
        },
        'posts2_comment_struct_tests': {
            'v': [1, 2, 3],
            'idx': [1, 2, 1]
        },
        'posts2_comment_struct_test_nested': {
            'nested': [20, 23, 24, 25, 26],
            'idx': [1, 1, 2, 1, 2]
        }
    }
    wrong_raw_bson_data = '[{\
        "_id": 133,\
        "updated_at": { "$date" : "2016-02-08T20:02:12.985Z"},\
        "comments": "error in data"\
    }]'
    existing_raw_bson_data = '[{\
        "_id": 133,\
        "updated_at": { "$date" : "2016-02-08T20:02:12.985Z"},\
        "comments": [ {\
            "_id": { "$oid": "56b8f35ef9fcee1b0000001a" },\
            "updated_at": { "$date" : "2016-02-08T20:02:12.985Z"},\
            "struct" : {\
                "tests": [{\
                    "v": 1,\
                    "nested": [20]\
                }, {\
                    "v": 2,\
                    "nested": [23, 24]\
                }]}\
        }, {\
            "_id": { "$oid": "56b8f344f9fcee1b00000018" },\
            "updated_at": { "$date" : "2016-02-08T20:02:12.985Z"},\
            "struct" : {\
                "tests": [{\
                    "v": 3,\
                    "nested": [25, 26]\
                }]}\
        } ]\
    }]'
    oplog_object_id_bson_raw_data = '{"_id": 133}'
    # an insert request should be created to add a 'tests' item
    oplog_path_array_bson_raw_data = '{\
        "comments.0.struct.tests.0.nested.1": 21,\
        "comments.2": { \
            "_id": { "$oid": "56b8f35ef9fcee1b0000001a" },\
            "updated_at": { "$date" : "2016-02-08T20:02:14.985Z"},\
            "struct": {\
                "tests": [{\
                    "v": 12,\
                    "nested": [30]\
                }, {\
                    "v": 13,\
                    "nested": [32, 31]\
                }\
            ]}}\
    }'

    dbname = 'rails4_mongoid_development'
    db_schemas_path = '/'.join(['test_data', 'schemas', dbname])
    schemas = get_schema_engines_as_dict(db_schemas_path)
    schema_engine = schemas['posts2']

    connstr = os.environ['TEST_PSQLCONN']
    psql = PsqlRequests(psycopg2.connect(connstr))

    # test wrong bson data
    existing_bson_data = loads(wrong_raw_bson_data)
    tables_obj_before = \
        create_tables_load_bson_data(schema_engine, existing_bson_data)
    assert (False == tables_obj_before.compare_with_sample(sample_data_before))

    # tables loaded from existing_raw_bson_data
    existing_bson_data = loads(existing_raw_bson_data)
    tables_obj_before = \
        create_tables_load_bson_data(schema_engine, existing_bson_data)
    assert (False == tables_obj_before.compare_with_sample({}))
    assert (True == tables_obj_before.compare_with_sample(sample_data_before))

    # create table structure, drop existing
    create_psql_tables(tables_obj_before, psql, PSQL_SCHEMA_NAME, '', True)
    # insert data into tables
    insert_tables_data_into_dst_psql(psql, tables_obj_before,
                                     PSQL_SCHEMA_NAME, '')

    # oplog path with indexes: insert an array item
    bson_data = loads(oplog_path_array_bson_raw_data)
    object_id_bson_data = loads(oplog_object_id_bson_raw_data)
    partial_inserts_list = get_tables_data_from_oplog_set_command(\
        schema_engine, bson_data, object_id_bson_data)
    for partial_insert in partial_inserts_list:
        tables_for_insert = partial_insert.tables
        initial_indexes = partial_insert.initial_indexes
        for name, table in tables_for_insert.iteritems():
            query_tuple = generate_insert_queries(table, PSQL_SCHEMA_NAME, "",
                                                  initial_indexes)
            for query in query_tuple[1]:
                getLogger(__name__).debug("EXECUTE: " + \
                    str(query_tuple[0]) + str(query))
                psql.cursor.execute(query_tuple[0], query)

    # tables loaded from existing_bson_data
    rec_obj_id = object_id_bson_data['_id']
    tables_obj_after = load_single_rec_into_tables_obj(psql,
                                                       schema_engine,
                                                       PSQL_SCHEMA_NAME,
                                                       rec_obj_id)
    sample_data_after = sample_data_before
    sample_data_after['posts2_comments']['idx'].append(3)
    sample_data_after['posts2_comments']['id_oid'].append(\
        "56b8f35ef9fcee1b0000001a")
    sample_data_after['posts2_comments']['updated_at'].append(
        loads('{ "$date" : "2016-02-08T20:02:14.985Z"}'))
    sample_data_after['posts2_comment_struct_tests'] = {
        'v': [1, 2, 3, 12, 13],
        'idx': [1, 2, 1, 1, 2]
    }
    sample_data_after['posts2_comment_struct_test_nested'] = {
        'nested': [20, 21, 23, 24, 25, 26, 30, 32, 31],
        'idx': [1, 2, 1, 2, 1, 2, 1, 1, 2]
    }
    assert (False == tables_obj_after.compare_with_sample({}))
    assert (True == tables_obj_after.compare_with_sample(sample_data_after))
def test_complete_partial_record3():
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)-8s %(message)s')
    PSQL_SCHEMA_NAME = ''
    # etalon of data
    sample_data_before = {
        'posts': {
            'id_oid': ['56b8da59f9fcee1b00000007'],
            "updated_at": [loads('{ "$date" : "2016-02-08T20:02:12.985Z"}')]
        },
        'post_comments': {
            'id_oid': [
                str(loads('{ "$oid": "56b8f35ef9fcee1b0000001a" }')),
                str(loads('{ "$oid": "56b8f344f9fcee1b00000018" }'))
            ],
            'updated_at': [
                loads('{ "$date" : "2016-02-08T20:02:12.985Z"}'),
                loads('{ "$date" : "2016-02-08T20:02:13.985Z"}')
            ],
            'idx': [1, 2]
        },
        'post_comment_tests': {
            'tests': [0, 2],
            'idx': [1, 2]
        }
    }
    existing_raw_bson_data = '[{\
        "_id": { "$oid": "56b8da59f9fcee1b00000007" },\
        "updated_at": { "$date" : "2016-02-08T20:02:12.985Z"},\
        "comments": [ {\
            "_id": { "$oid": "56b8f35ef9fcee1b0000001a" },\
            "updated_at": { "$date" : "2016-02-08T20:02:12.985Z"}\
        }, {\
            "_id": { "$oid": "56b8f344f9fcee1b00000018" },\
            "updated_at": { "$date" : "2016-02-08T20:02:13.985Z"},\
            "tests": [0,2]\
        } ]\
    }]'
    oplog_object_id_bson_raw_data = '{\
        "_id": { "$oid": "56b8da59f9fcee1b00000007" }\
    }'
    # an insert request should be created to add a record with only a single
    # field: updated_at
    oplog_path_array_bson_raw_data = '{"comments.2.updated_at": \
        { "$date" : "2016-02-08T20:02:14.985Z"}}'

    dbname = 'rails4_mongoid_development'
    db_schemas_path = '/'.join(['test_data', 'schemas', dbname])
    schemas = get_schema_engines_as_dict(db_schemas_path)
    schema_engine = schemas['posts']

    connstr = os.environ['TEST_PSQLCONN']
    psql = PsqlRequests(psycopg2.connect(connstr))

    # tables loaded from existing_raw_bson_data
    existing_bson_data = loads(existing_raw_bson_data)
    tables_obj_before = \
        create_tables_load_bson_data(schema_engine, existing_bson_data)
    assert (True == tables_obj_before.compare_with_sample(sample_data_before))

    # create table structure, drop existing
    create_psql_tables(tables_obj_before, psql, PSQL_SCHEMA_NAME, '', True)
    # insert data into tables
    insert_tables_data_into_dst_psql(psql, tables_obj_before,
                                     PSQL_SCHEMA_NAME, '')

    # oplog path inserting just a field
    bson_data = loads(oplog_path_array_bson_raw_data)
    print bson_data
    object_id_bson_data = loads(oplog_object_id_bson_raw_data)
    partial_inserts_list = get_tables_data_from_oplog_set_command(\
        schema_engine, bson_data, object_id_bson_data)
    tables_for_insert = partial_inserts_list[0].tables
    initial_indexes = partial_inserts_list[0].initial_indexes
    print "tables_for_insert", tables_for_insert.keys()
    print "initial_indexes", initial_indexes
    insert_tests_t = tables_for_insert['post_comments']
    insert_query = generate_insert_queries(insert_tests_t, "", "",
                                           initial_indexes)
    print "columns", insert_tests_t.sql_column_names
    print "insert_query=", insert_query
    for query in insert_query[1]:
        print insert_query[0], query
        psql.cursor.execute(insert_query[0], query)

    # tables loaded from existing_bson_data
    rec_obj_id = object_id_bson_data['_id']
    tables_obj_after = load_single_rec_into_tables_obj(psql,
                                                       schema_engine,
                                                       PSQL_SCHEMA_NAME,
                                                       rec_obj_id)
    sample_data_after = sample_data_before
    sample_data_after['post_comments']['idx'].append(3)
    sample_data_after['post_comments']['id_oid'].append(None)
    sample_data_after['post_comments']['updated_at'].append(
        loads('{ "$date" : "2016-02-08T20:02:14.985Z"}'))
    assert (True == tables_obj_after.compare_with_sample(sample_data_after))
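# Note (explanatory comment, not from the original source): the dotted oplog
# paths used in the two tests above, e.g. "comments.2.updated_at" and
# "comments.0.struct.tests.0.nested.1", follow MongoDB's dot notation, where
# each numeric component is a zero-based array index. In the expected sample
# data these zero-based positions appear to map to the one-based 'idx' values
# (path index 2 -> idx 3).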
def run_oplog_engine_check(oplog_test, schemas_path):
    connstr = os.environ['TEST_PSQLCONN']
    dbreq = PsqlRequests(psycopg2.connect(connstr))
    psql_schema = MAIN_SCHEMA_NAME

    schema_engines = get_schema_engines_as_dict(schemas_path)
    getLogger(__name__).info("Loading oplog data...")

    oplog_readers, mongo_readers_after = get_readers(oplog_test,
                                                     enable_exceptions=False)
    create_truncate_psql_objects(dbreq, schemas_path, psql_schema)
    dbreq.cursor.execute('COMMIT')
    # do pseudo "init load", ignore inject_exception on this step
    for name, mongo_data_path in oplog_test.before.iteritems():
        load_mongo_data_to_psql(schema_engines[name],
                                mongo_data_path[0], dbreq, psql_schema)
    # recreate connection / cursor as rollback won't work after commit
    del dbreq
    dbreq = PsqlRequests(psycopg2.connect(connstr))
    dbreq_etl = PsqlRequests(psycopg2.connect(connstr))
    getLogger(__name__).info("Loading mongo data after initload")
    try:
        gizer.oplog_sync_unalligned_data.DO_OPLOG_READ_ATTEMPTS_COUNT \
            = oplog_test.max_attempts
        gizer.oplog_sync_alligned_data.DO_OPLOG_READ_ATTEMPTS_COUNT \
            = oplog_test.max_attempts
        # for better coverage
        gizer.oplog_sync_unalligned_data.SYNC_REC_COUNT_IN_ONE_BATCH = 2
        unalligned_sync \
            = OplogSyncUnallignedData(dbreq_etl, dbreq,
                                      mongo_readers_after, oplog_readers,
                                      schemas_path, schema_engines,
                                      psql_schema)
        # start syncing from the very start of the oplog
        ts_synced = unalligned_sync.sync(None)
        getLogger(__name__).info("Sync done ts_synced: %s" % str(ts_synced))
        getLogger(__name__).info("statistic %s"
                                 % str(unalligned_sync.statistic()))
        del unalligned_sync
        # sync failed
        if ts_synced is None and not oplog_test.skip_sync:
            return False

        del dbreq
        dbreq = PsqlRequests(psycopg2.connect(connstr))
        oplog_readers, mongo_readers_after = get_readers(
            oplog_test, enable_exceptions=True)
        # ignore sync results
        if oplog_test.skip_sync:
            ts_synced = {}
            for name in oplog_readers:
                ts_synced[name] = None

        alligned_sync \
            = OplogSyncAllignedData(dbreq, mongo_readers_after, oplog_readers,
                                    schemas_path, schema_engines, psql_schema)
        res = alligned_sync.sync(ts_synced)
        if res:
            getLogger(__name__).info("Test passed")
    except:
        # close the psql connection so subsequent tests can still run
        dbreq.conn.close()
        raise
    if res:
        return True
    else:
        return False
def main():
    """ main """
    parser = argparse.ArgumentParser()
    parser.add_argument("--config-file", action="store",
                        help="Config file with settings",
                        type=file, required=True)
    parser.add_argument("-init-load-status", action="store_true",
                        help="exit with status=0 if init load is not needed, "
                             "or status=-1 otherwise; also prints 1 if a load "
                             "is in progress, 0 if not")
    parser.add_argument(
        "-init-load-start-save-ts", action="store_true",
        help='Save latest oplog timestamp to psql etlstatus table')
    parser.add_argument("-init-load-finish",
                        help='values are: "ok" or "error"', type=str)
    args = parser.parse_args()

    config = configparser.ConfigParser()
    config.read_file(args.config_file)

    psql_settings = psql_settings_from_config(config, 'psql')
    psql_main = PsqlRequests(psql_conn_from_settings(psql_settings))
    oplog_settings = load_mongo_replicas_from_setting(config, 'mongo-oplog')

    status_table = PsqlEtlStatusTable(psql_main.cursor,
                                      config['psql']['psql-schema-name'],
                                      sorted(oplog_settings.keys()))
    res = 0
    if args.init_load_status:
        status = status_table.get_recent()
        if status:
            if (status.status == STATUS_OPLOG_SYNC or \
                status.status == STATUS_OPLOG_APPLY or \
                status.status == STATUS_INITIAL_LOAD or \
                status.status == STATUS_OPLOG_RESYNC) and not status.error:
                delta = datetime.now() - status.time_start
                # check whether the operation has been running too long
                if status.time_end:
                    res = 0
                elif delta.total_seconds() < 32400:  # < 9 hours
                    res = 0
                    if not status.time_end:
                        print 1  # means etl is in progress
                    else:
                        print 0  # means etl is not in progress
                else:
                    # takes too much time -> do init load
                    res = -1
            else:
                # error status -> do init load
                res = -1
        else:
            # empty status table -> do init load
            res = -1
    elif args.init_load_start_save_ts:
        # create oplog read transport/s to acquire ts
        max_ts_dict = {}
        for oplog_name, settings_list in oplog_settings.iteritems():
            print 'Fetch timestamp from oplog: %s' % oplog_name
            # settings list is a replica set (must be at least one in list)
            reader = mongo_reader_from_settings(settings_list, 'oplog.rs', {})
            reader.make_new_request({})
            reader.cursor.sort('ts', DESCENDING)
            reader.cursor.limit(1)
            timestamp = reader.next()
            if timestamp:
                max_ts_dict[oplog_name] = timestamp['ts']
            else:
                max_ts_dict[oplog_name] = None
            print 'Initload ts: %s, oplog: %s' % (max_ts_dict[oplog_name],
                                                  oplog_name)
        status_manager = PsqlEtlStatusTableManager(status_table)
        status_manager.init_load_start(max_ts_dict)
    elif args.init_load_finish:
        status_manager = PsqlEtlStatusTableManager(status_table)
        if args.init_load_finish == "ok":
            status_manager.init_load_finish(False)  # ok
        else:
            status_manager.init_load_finish(True)  # error

    return res
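# Example invocations (the script name and config path are hypothetical; the
# flags are the ones defined above):
#   python etl_init_load_status.py --config-file etl.ini -init-load-status
#   python etl_init_load_status.py --config-file etl.ini -init-load-start-save-ts
#   python etl_init_load_status.py --config-file etl.ini -init-load-finish ok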
def main():
    """ main """
    parser = argparse.ArgumentParser()
    parser.add_argument("--config-file", action="store",
                        help="Config file with settings",
                        type=file, required=True)
    args = parser.parse_args()

    config = configparser.ConfigParser()
    config.read_file(args.config_file)

    schemas_path = config['misc']['schemas-dir']
    logspath = config['misc']['logs-dir']
    oplog_settings = load_mongo_replicas_from_setting(config, 'mongo-oplog')

    mongo_settings = mongo_settings_from_config(config, 'mongo')
    psql_settings = psql_settings_from_config(config, 'psql')

    mongo_readers = {}
    schema_engines = get_schema_engines_as_dict(schemas_path)
    for collection_name in schema_engines:
        reader = mongo_reader_from_settings(mongo_settings, collection_name,
                                            {})
        mongo_readers[collection_name] = reader
        mongo_readers[collection_name].set_name(collection_name)

    # create oplog read transport/s
    oplog_readers = {}
    for oplog_name, settings_list in oplog_settings.iteritems():
        # settings list is a replica set (must be at least one in list)
        oplog_readers[oplog_name] = \
            mongo_reader_from_settings(settings_list, 'oplog.rs', {})
        oplog_readers[oplog_name].set_name(oplog_name)

    psql_qmetl = PsqlRequests(psql_conn_from_settings(psql_settings))
    psql_main = PsqlRequests(psql_conn_from_settings(psql_settings))

    status_table = PsqlEtlStatusTable(psql_qmetl.cursor,
                                      config['psql']['psql-schema-name'],
                                      sorted(oplog_settings.keys()))
    status_manager = PsqlEtlStatusTableManager(status_table)

    psql_schema = config['psql']['psql-schema-name']

    res = 0
    status = status_table.get_recent()
    if status:
        if (status.status == STATUS_INITIAL_LOAD \
                or status.status == STATUS_OPLOG_RESYNC) \
                and status.time_end and not status.error:
            create_logger(logspath, 'oplogsync')
            psql_sync = psql_main
            # initial load done, save oplog sync status and do oplog sync
            status_manager.oplog_sync_start(status.ts)
            unalligned_sync = OplogSyncUnallignedData(
                psql_qmetl, psql_sync, mongo_readers, oplog_readers,
                schemas_path, schema_engines, psql_schema)
            try:
                ts = unalligned_sync.sync(status.ts)
                stat = unalligned_sync.statistic()
                reinit_conn(psql_settings, psql_qmetl, status_manager)
                if ts:
                    # sync ok
                    status_manager.oplog_sync_finish(stat[0], stat[1],
                                                     ts, False)
                    res = 0
                else:
                    # error
                    status_manager.oplog_sync_finish(stat[0], stat[1],
                                                     None, True)
                    res = -1
            except Exception, e:
                getLogger(__name__).error(e, exc_info=True)
                getLogger(__name__).error('ROLLBACK CLOSE')
                psql_sync.conn.rollback()
                reinit_conn(psql_settings, psql_qmetl, status_manager)
                status_manager.oplog_sync_finish(None, None, None, True)
                res = -1
        elif (status.status == STATUS_OPLOG_SYNC or \
                status.status == STATUS_OPLOG_APPLY) \
                and status.time_end and not status.error:
            create_logger(logspath, 'oploguse')
            # sync done, now apply oplog patches to the main psql;
            # save oplog sync status
            getLogger(__name__).\
                info('Sync point is ts:{ts}'.format(ts=status.ts))
            status_manager.oplog_use_start(status.ts)
            alligned_sync = \
                OplogSyncAllignedData(psql_main, mongo_readers, oplog_readers,
                                      schemas_path, schema_engines,
                                      psql_schema)
            try:
                ts_res = alligned_sync.sync(status.ts)
                stat = alligned_sync.statistic()
                reinit_conn(psql_settings, psql_qmetl, status_manager)
                if ts_res == 'resync':
                    # some records were recovered, must do a resync
                    # at the next step
                    status_manager.oplog_resync_finish(stat[0], stat[1],
                                                       status.ts, False)
                    res = 0
                elif ts_res:
                    # oplog apply ok
                    status_manager.oplog_use_finish(stat[0], stat[1],
                                                    ts_res, False)
                    res = 0
                else:
                    # error
                    status_manager.oplog_use_finish(stat[0], stat[1],
                                                    None, True)
                    res = -1
            except Exception, e:
                getLogger(__name__).error(e, exc_info=True)
                getLogger(__name__).error('ROLLBACK CLOSE')
                psql_main.conn.rollback()
                reinit_conn(psql_settings, psql_qmetl, status_manager)
                status_manager.oplog_use_finish(None, None, None, True)
                res = -1