def get_documentum_test():
    """Get tables from Documentum test database."""
    logging.info('Getting files for documentum test')
    table_name = dn.table_name('schedule_daily') \
        + dn.table_name('schedule_hourly_15') \
        + dn.table_name('schedule_hourly_30')
    logging.info(table_name)
    for name in table_name:
        logging.info('Querying for {0} table'.format(name))
        query_string = 'SELECT * FROM SCSLEGIS.dbo.{0};'.format(name)
        logging.info('Connecting to MS Database')
        documentum_conn = MsSqlHook(mssql_conn_id='docm_test_sql')
        logging.info('Reading data to Pandas DataFrame')
        try:
            df = documentum_conn.get_pandas_df(query_string)
            logging.info('Correcting title column')
            df['TITLE'] = fix_title(df[['TITLE', 'OBJECT_NAME']])
            save_path = conf['prod_data_dir'] \
                + '/documentum_{0}_test.csv'.format(name.lower())
            general.pos_write_csv(df, save_path)
        except Exception as e:
            logging.info(f'Could not read {name} because {e}')

    return "Successfully retrieved Documentum tables"
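# NOTE: fix_title is used above but not defined in this module. The sketch
# below is a hypothetical implementation, assuming the intent is to fall back
# to OBJECT_NAME wherever TITLE is null or blank; the real helper may differ.
def fix_title(cols):
    """Return TITLE with OBJECT_NAME substituted for null/blank titles (sketch)."""
    # cols is the two-column slice passed above: df[['TITLE', 'OBJECT_NAME']]
    title = cols['TITLE'].fillna('').astype(str).str.strip()
    return title.where(title != '', cols['OBJECT_NAME'])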
def get_documentum(mode, **kwargs):
    """Get tables from Documentum."""
    logging.info('Getting files from documentum')
    table_name = dn.table_name(mode)
    for name in table_name:
        logging.info('Querying for {0} table'.format(name))
        query_string = 'SELECT * FROM SCSLEGIS.dbo.{0};'.format(name)
        logging.info('Connecting to MS Database')
        documentum_conn = MsSqlHook(mssql_conn_id='docm_sql')
        logging.info('Reading data to Pandas DataFrame')
        df = documentum_conn.get_pandas_df(query_string)
        logging.info('Correcting title column')
        df['TITLE'] = fix_title(df[['TITLE', 'OBJECT_NAME']])
        save_path = conf['prod_data_dir'] \
            + '/documentum_{0}.csv'.format(name.lower())
        logging.info('Writing Production file')
        general.pos_write_csv(df, save_path)

    return "Successfully retrieved Documentum tables"
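# NOTE: dn is imported elsewhere in this module; dn.table_name(mode) returns
# the list of Documentum table names refreshed on a given schedule. A minimal
# hypothetical sketch of that contract, with placeholder table names only:
DOCUMENTUM_TABLES = {
    'schedule_daily': ['SCS_EXAMPLE_DAILY'],      # placeholder name
    'schedule_hourly_15': ['SCS_EXAMPLE_15MIN'],  # placeholder name
    'schedule_hourly_30': ['SCS_EXAMPLE_30MIN'],  # placeholder name
}

def table_name(mode):
    """Return the table list for a schedule mode (hypothetical sketch)."""
    return DOCUMENTUM_TABLES.get(mode, [])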
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)

#: Execution rules
#: documentum_docs_latest_only must run before get_doc_tables
get_doc_tables.set_upstream(documentum_docs_latest_only)
#: get_doc_tables must run before div_doc_table
div_doc_table.set_upstream(get_doc_tables)
#: div_doc_table must run before upload_reso_ord
upload_reso_ord.set_upstream(div_doc_table)

files = [f for f in os.listdir(conf['prod_data_dir'])]
tables_other = dn.table_name(schedule_mode)
for f in files:
    file_name = f.split('.')[0]
    name_parts = file_name.split('_')
    if name_parts[0] == "documentum":
        file_check = '_'.join(name_parts[1:]).upper()
        if file_check in tables_other:
            #: Upload documentum prod files to S3
            upload_doc_tables = S3FileTransferOperator(
                task_id='upload_' + file_name,
                source_base_path=conf['prod_data_dir'],
                source_key='{}.csv'.format(file_name),
                dest_s3_conn_id=conf['default_s3_conn_id'],
                dest_s3_bucket=conf['dest_s3_bucket'],
                dest_s3_key='city_docs/{}.csv'.format(file_name),
                on_failure_callback=notify,
    start_date=start_date,
    schedule_interval=schedule)

prod_data = conf['prod_data_dir']

#: Get documentum tables
get_doc_tables = PythonOperator(
    task_id='get_documentum_tables',
    python_callable=get_documentum_test,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Execution rules
files = [f for f in os.listdir(conf['prod_data_dir'])]
tables_all = dn.table_name('schedule_daily') \
    + dn.table_name('schedule_hourly_15') \
    + dn.table_name('schedule_hourly_30')
for f in files:
    file_name = f.split('.')[0]
    name_parts = file_name.split('_')
    if name_parts[0] == "documentum":
        # Drop the trailing '_test' suffix before matching table names
        file_check = '_'.join(name_parts[1:-1]).upper()
        if file_check in tables_all:
            #: Upload documentum test files to S3
            upload_doc_tables = S3FileTransferOperator(
                task_id='upload_' + file_name,
                source_base_path=conf['prod_data_dir'],
                source_key='{}.csv'.format(file_name),
                dest_s3_conn_id=conf['default_s3_conn_id'],
                dest_s3_bucket=conf['dest_s3_bucket'],
                dest_s3_key='city_docs/{}.csv'.format(file_name),
#: Get documentum tables
get_doc_tables = PythonOperator(
    task_id='get_documentum_tables',
    python_callable=get_documentum,
    op_kwargs={'mode': schedule_mode},
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Execution rules
#: documentum_docs_latest_only must run before get_doc_tables
get_doc_tables.set_upstream(documentum_docs_latest_only)

files = [f for f in os.listdir(conf['prod_data_dir'])]
tables_other = dn.table_name(schedule_mode)
for f in files:
    file_name = f.split('.')[0]
    name_parts = file_name.split('_')
    if name_parts[0] == "documentum":
        file_check = '_'.join(name_parts[1:]).upper()
        if file_check in tables_other:
            #: Upload documentum prod files to S3
            upload_doc_tables = S3FileTransferOperator(
                task_id='upload_' + file_name,
                source_base_path=conf['prod_data_dir'],
                source_key='{}.csv'.format(file_name),
                dest_s3_conn_id=conf['default_s3_conn_id'],
                dest_s3_bucket=conf['dest_s3_bucket'],
                dest_s3_key='city_docs/{}.csv'.format(file_name),
                on_failure_callback=notify,