def test_insert_by_pandas_multi(conn_str, load_data):
    """
    Test inserting the data with the Pandas to_sql API
    using method='multi'
    """
    logging.info("Test case: insert data from Pandas DataFrame with to_sql")
    start_time = time()
    with DBConnector(conn_str) as db:
        load_data.to_sql("kaggle_data",
                         db.engine,
                         index=False,
                         method='multi',
                         chunksize=10000,
                         if_exists='append')
        logging.info("Completed writing the data to DB")
        end_time = time()
        total_write_time = end_time - start_time
        total_min, total_seconds = divmod(total_write_time, 60)
        logging.info("Took {0} minutes {1} seconds to write the data".format(
            total_min, total_seconds))
        db.engine.connect().execute("TRUNCATE TABLE kaggle_data")
        logging.info("Truncated the table")
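# Note on the to_sql call above, per the pandas documentation: method='multi'
# packs multiple rows into a single INSERT statement and chunksize caps how
# many rows are written per batch. A minimal sketch of the same call shape
# against a throwaway frame (the table name here is hypothetical):
#
#     pd.DataFrame({"a": [1, 2]}).to_sql(
#         "scratch_table", db.engine, index=False,
#         method='multi', chunksize=10000, if_exists='append')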
def test_insert_multiple_update(sample_dataframe, conn_str):
    """
    Test inserting the same data frame twice.
    The second attempt is an update. Since the records are identical
    and the columns carry a PK constraint, this should fail.
    """
    with DBConnector(conn_str) as db_conn:
        db_conn.db_session.bulk_insert_mappings(
            UserData, sample_dataframe.to_dict(orient="records"))
        all_records = db_conn.db_session.query(UserData).all()
        logging.info("total records in table {0}".format(len(all_records)))
        assert len(all_records) == 4
        logging.info("Inserted four records; inserting the same records again")
        try:
            db_conn.db_session.bulk_update_mappings(
                UserData, sample_dataframe.to_dict(orient="records"))
        except IntegrityError as sqlinterr:
            logging.error(sqlinterr)
            db_conn.db_session.rollback()
            logging.info("Failed the duplicate record insert")
        logging.info("If you see this message, the test is done")
def test_insert_multiple(sample_dataframe, conn_str):
    """
    Test inserting the same data frame twice.
    The second insert should fail.
    """
    with DBConnector(conn_str) as db_conn:
        logging.info("Inserting first set of data")
        db_conn.db_session.bulk_insert_mappings(
            UserData, sample_dataframe.to_dict(orient="records"))
        all_records = db_conn.db_session.query(UserData).all()
        logging.info("total records in table {0}".format(len(all_records)))
        assert len(all_records) == 4
        logging.info("Inserted four records; inserting the same records again")
        try:
            db_conn.db_session.bulk_insert_mappings(
                UserData, sample_dataframe.to_dict(orient="records"))
            logging.info("Completed bulk insert mapping")
        except IntegrityError as sqlinterr:
            logging.error(sqlinterr)
            db_conn.db_session.rollback()
            logging.info(
                "Failed the duplicate record insert and rolled back the changes")
        db_conn.db_session.rollback()
        logging.info("If you see this message, the test is done")
def test_insert_rt_by_skipping_existing(conn_str, sample_dataframe,
                                        sample_dataframe_half):
    """
    Test inserting by skipping the records that already exist
    """
    with DBConnector(conn_str) as conn:
        logging.info("Insert base data")
        conn.db_session.bulk_insert_mappings(
            UserData, sample_dataframe.to_dict(orient="records"))
        logging.info("Inserted base data")
        conn.db_session.commit()
        try:
            logging.info("Trying to insert data with duplicates")
            conn.db_session.bulk_insert_mappings(
                UserData, sample_dataframe_half.to_dict(orient="records"))
        except IntegrityError as sqleer:
            conn.db_session.rollback()
            integ_key_error = sqleer.orig.diag.message_detail
            logging.info(sqleer)
            primary_key, pk_val = parse_integrity_err(integ_key_error)
            pk_object = getattr(UserData, primary_key)
            existing_records = conn.db_session.query(pk_object).filter(
                pk_object.in_(
                    sample_dataframe_half[primary_key].to_list())).all()
            existing_records = list(itertools.chain(*existing_records))
            logging.info(existing_records)
            duplicate_eliminated_data = sample_dataframe_half[
                ~sample_dataframe_half[primary_key].isin(existing_records)]
            logging.info(
                "Found the existing records and created a new DataFrame")
            conn.db_session.bulk_insert_mappings(
                UserData, duplicate_eliminated_data.to_dict(orient="records"))
            retrieve_all = conn.db_session.query(UserData).all()
            assert len(retrieve_all) == 7
            assert retrieve_all[0].user_name == "jagan"
            assert retrieve_all[-1].user_name == "vidya"
            logging.info('Inserted the clean data and test complete')
        truncate_stmt = UserData.__table__.delete()
        conn.db_session.execute(truncate_stmt)
        conn.db_session.commit()
        logging.info("Cleaned the DB")
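# The test above relies on a parse_integrity_err helper imported from the
# project. On PostgreSQL, a unique-violation message_detail reads like
# "Key (user_name)=(jagan) already exists.", so a minimal sketch of such a
# helper could look like the following (hypothetical; shown only to document
# the assumed behaviour, the real implementation lives elsewhere in this repo):
import re


def parse_integrity_err_sketch(message_detail):
    """Return (column, value) parsed from a unique-violation detail string."""
    match = re.search(r"Key \((?P<col>\w+)\)=\((?P<val>[^)]*)\)",
                      message_detail)
    if match is None:
        raise ValueError("Unrecognised detail: {0}".format(message_detail))
    return match.group("col"), match.group("val")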
def test_insert_read_by_pandas(conn_str, load_data):
    """
    Test inserting the data with the Pandas to_sql API
    and reading it back
    """
    logging.info("Test case: insert data from Pandas DataFrame with to_sql")
    start_time = time()
    with DBConnector(conn_str) as db:
        load_data.to_sql(
            "kaggle_data",
            db.engine,
            index=False,
            #method='multi',
            if_exists='append')
        logging.info("Completed writing the data to DB")
        end_time = time()
        total_write_time = end_time - start_time
        total_min, total_seconds = divmod(total_write_time, 60)
        logging.info("Took {0} minutes {1} seconds to write the data".format(
            total_min, total_seconds))
        logging.info("Start read testing with read_sql_table")
        read_start = time()
        try:
            data_back = pd.read_sql_table("kaggle_data", con=db.engine)
            logging.info("Completed the reading")
        except Exception as err:
            logging.error("error in read data {0}".format(err))
        read_end = time()
        total_read_min, read_sec = divmod(read_end - read_start, 60)
        logging.info("Took {0} minutes {1} seconds to read data".format(
            total_read_min, read_sec))
        logging.info("Start read testing with read_sql")
        read_start_two = time()
        try:
            data_back_two = pd.read_sql("""SELECT * FROM kaggle_data""",
                                        con=db.engine)
            logging.info("Completed the reading")
        except Exception as err:
            logging.error("error in read data {0}".format(err))
        read_end_two = time()
        total_read_min_two, read_sec_two = divmod(
            read_end_two - read_start_two, 60)
        logging.info("Took {0} minutes {1} seconds to read data".format(
            total_read_min_two, read_sec_two))
        logging.info("Truncating the table")
        db.engine.connect().execute("TRUNCATE TABLE kaggle_data")
        logging.info("Truncated the table")
def test_get_pandas_schema(conn_str, sample_dataframe):
    """
    Test the behaviour of get_schema in Pandas.
    The generated DDL is not accurate with respect to the
    schema of the actual table.
    """
    from pandas.io.sql import get_schema
    with DBConnector(conn_str) as conn:
        logging.info(get_schema(sample_dataframe, "userdata", con=conn.engine))
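# For context: get_schema infers column types from the DataFrame dtypes, so an
# object column maps to TEXT and an int64 column to BIGINT, and no primary-key
# constraint is emitted. Assuming sample_dataframe carries the user_name and
# record_id columns used elsewhere in these tests, the logged DDL looks
# roughly like:
#
#     CREATE TABLE userdata (
#         user_name TEXT,
#         record_id BIGINT
#     )
#
# which is why the docstring above calls the output inaccurate relative to the
# real table, whose PK constraint is what the duplicate-insert tests rely on.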
def test_insert_single_dup(sample_dataframe_half, sample_dataframe, conn_str):
    """
    Test inserting a data frame that partially overlaps the
    existing data. This should fail.
    """
    logging.info("Testing single duplicate case one")
    with DBConnector(conn_str) as db_conn:
        db_conn.db_session.bulk_insert_mappings(
            UserData, sample_dataframe.to_dict(orient="records"))
        logging.info("Inserted base data")
        try:
            db_conn.db_session.bulk_insert_mappings(
                UserData, sample_dataframe_half.to_dict(orient="records"))
        except IntegrityError as sqleint:
            logging.error(sqleint)
            db_conn.db_session.rollback()
            logging.info("Rolled back the session")
        logging.info("Completed the single duplicate test")
def test_insert_read_by_model(conn_str, load_data):
    """
    Test inserting the Pandas data through the SQLAlchemy model
    """
    with DBConnector(conn_str) as db:
        logging.info("Starting the SQLAlchemy insert test")
        start_time = time()
        db.db_session.bulk_insert_mappings(KaggleData,
                                           load_data.to_dict(orient="records"))
        end_time = time()
        total_write_time = end_time - start_time
        total_min, total_seconds = divmod(total_write_time, 60)
        logging.info("Completed data insert in {0} minutes {1} seconds".format(
            total_min, total_seconds))
        db.db_session.commit()
        logging.info("Start the read operation")
        try:
            start_time = time()
            data_back = pd.read_sql(db.db_session.query(KaggleData).statement,
                                    con=db.engine)
            logging.info(data_back.head(2))
        except Exception as err:
            logging.error("Error in reading the data {0}".format(err))
        read_end = time()
        total_read_min, read_sec = divmod(read_end - start_time, 60)
        logging.info("Took {0} minutes {1} seconds to read data".format(
            total_read_min, read_sec))
        truncate_stmt = KaggleData.__table__.delete()
        db.db_session.execute(truncate_stmt)
        db.db_session.commit()
        logging.info("truncated the table")
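# The read above works because Query.statement renders the ORM query as a
# plain SELECT that pandas.read_sql can execute. An equivalent hand-written
# form, assuming KaggleData maps to the kaggle_data table used elsewhere in
# this module:
#
#     data_back = pd.read_sql("SELECT * FROM kaggle_data", con=db.engine)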
def test_insert_single(sample_dataframe, conn_str):
    """
    Insert with a no-duplicates scenario
    """
    logging.info("Starting the no duplicate insert test")
    logging.info(sample_dataframe.head(2))
    with DBConnector(conn_str) as db_conn:
        truncate_stmt = UserData.__table__.delete()
        db_conn.db_session.execute(truncate_stmt)
        db_conn.db_session.bulk_insert_mappings(
            UserData, sample_dataframe.to_dict(orient="records"))
        all_records = db_conn.db_session.query(UserData).all()
        logging.info("total records in table {0}".format(len(all_records)))
        assert len(all_records) == 4
        db_conn.db_session.rollback()
        logging.info("Completed the no duplicate insert test")
def test_update_with_pandas(sample_dataframe, sample_dataframe_update,
                            conn_str):
    """
    Test a data update with Pandas
    """
    logging.info("Testing single update case")
    with DBConnector(conn_str) as db_conn:
        db_conn.db_session.bulk_insert_mappings(
            UserData, sample_dataframe.to_dict(orient="records"))
        try:
            db_conn.db_session.bulk_update_mappings(
                UserData, sample_dataframe_update.to_dict(orient="records"))
            updated_record = db_conn.db_session.query(UserData).filter_by(
                user_name="jagan").first()
            assert updated_record.user_name == "jagan"
            assert updated_record.record_id == 13
        except IntegrityError as sqleint:
            logging.error(sqleint)
        db_conn.db_session.rollback()
        logging.info("Completed the single update test")
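# Note: bulk_update_mappings resolves the row to update from the primary-key
# entries in each dictionary, so every record produced by
# to_dict(orient="records") must include the mapped PK column; a mapping such
# as {"user_name": "jagan", "record_id": 13} (hypothetical values taken from
# the assertions above) updates the row whose PK matches.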
def test_alchemy_integrity_error_capture(sample_dataframe, conn_str):
    """
    Demo of capturing SQLAlchemy IntegrityError details
    """
    logging.info("Writing the initial data")
    with DBConnector(conn_str) as db:
        db.db_session.bulk_insert_mappings(
            UserData, sample_dataframe.to_dict(orient="records"))
        logging.info("Inserted base data; now the duplicate entry attempt")
        try:
            db.db_session.bulk_insert_mappings(
                UserData, sample_dataframe.to_dict(orient="records"))
            logging.info("Inserted duplicate :-(")
        except IntegrityError as sqlerr:
            logging.error(
                "Error in inserting data. Table {0} Constraint {1}".format(
                    sqlerr.orig.diag.table_name,
                    sqlerr.orig.diag.constraint_name))
            logging.info("Cause of the trouble is {0}".format(
                sqlerr.orig.diag.message_detail))
            # Leave the session clean for teardown.
            db.db_session.rollback()
def test_get_schema(load_data, conn_str):
    """
    Log the DDL that Pandas would generate for the Kaggle data set
    """
    with DBConnector(conn_str) as conn:
        logging.info(get_schema(load_data, "kaggle_data", con=conn.engine))
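# All of the tests in this module assume a DBConnector context manager that
# exposes both a raw `engine` and an ORM `db_session`. A minimal hypothetical
# sketch of such a wrapper (the real implementation is imported from the
# project):
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker


class DBConnectorSketch:
    """Illustrative stand-in for the project's DBConnector."""

    def __init__(self, conn_str):
        self.engine = create_engine(conn_str)
        self.db_session = sessionmaker(bind=self.engine)()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Close the session and release pooled connections on exit.
        self.db_session.close()
        self.engine.dispose()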