def load_rows_to_destination(df, conn_id_dest, table_name_dest, load_all):
    # extract every table as a data frame and convert it to object type -> NaT values are then treated as null
    # objects and can be converted to None
    df = df.astype(object)
    # convert nulls to None (needed in MySQL upload)
    logging.debug("Convert NaN, NaT -> None")
    df = df.where(pd.notnull(df), None)
    target_fields = list(df.keys())
    logging.info("Column fields from source: {}".format(target_fields))
    logging.info("Row Count from chunk source: '{}'".format(df.shape[0]))
    if not load_all:
        # just load the part that has updated_at > last_destination_updated_at
        mysql_hook_load = hooks.MyMysqlHook(conn_id_dest)
        # replace should be false, but cannot be sure that we are not repeating values
        mysql_hook_load.insert_update_on_duplicate_rows(
            table_name_dest,
            rows=df.values.tolist(),
            columns=target_fields,
            commit_every=1000)
    else:
        # load everything replacing any value if the same PK is found
        mysql_hook_load = MySqlHook(conn_id_dest)
        mysql_hook_load.insert_rows(
            table_name_dest,
            df.values.tolist(),
            target_fields=target_fields,
            commit_every=1000,
            replace=True)
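A minimal standalone sketch of the NaN/NaT-to-None conversion used above, with a small made-up DataFrame; the point is that MySQL should receive NULLs rather than NaN/NaT objects before insert_rows is called.

import pandas as pd

# hypothetical frame with a missing timestamp
df = pd.DataFrame({'id': [1, 2],
                   'seen_at': [pd.Timestamp('2020-01-01'), pd.NaT]})
df = df.astype(object)               # NaT/NaN become plain objects
df = df.where(pd.notnull(df), None)  # replace them with Python None
rows = df.values.tolist()            # [[1, Timestamp('2020-01-01')], [2, None]]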
def execute(self, context):
    hive = HiveServer2Hook(hiveserver2_conn_id=self.hiveserver2_conn_id)
    logging.info("Extracting data from Hive")
    logging.info(self.sql)

    if self.bulk_load:
        tmpfile = NamedTemporaryFile()
        hive.to_csv(self.sql, tmpfile.name, delimiter='\t',
                    lineterminator='\n', output_header=False)
    else:
        results = hive.get_records(self.sql)

    mysql = MySqlHook(mysql_conn_id=self.mysql_conn_id)
    if self.mysql_preoperator:
        logging.info("Running MySQL preoperator")
        mysql.run(self.mysql_preoperator)

    logging.info("Inserting rows into MySQL")

    if self.bulk_load:
        mysql.bulk_load(table=self.mysql_table, tmp_file=tmpfile.name)
        tmpfile.close()
    else:
        mysql.insert_rows(table=self.mysql_table, rows=results)

    if self.mysql_postoperator:
        logging.info("Running MySQL postoperator")
        mysql.run(self.mysql_postoperator)

    logging.info("Done.")
def execute(self, context):
    logging.info('Executing: ' + str(self.sql))
    src_mysql = MySqlHook(mysql_conn_id=self.src_mysql_conn_id)
    dest_mysql = MySqlHook(mysql_conn_id=self.dest_mysqls_conn_id)
    logging.info("Transferring Mysql query results into other Mysql database.")
    conn = src_mysql.get_conn()
    cursor = conn.cursor()
    cursor.execute(self.sql, self.query_parameters)

    if self.mysql_preoperator:
        logging.info("Running Mysql preoperator")
        dest_mysql.run(self.mysql_preoperator)

    if cursor.rowcount != 0:
        logging.info("Inserting rows into Mysql")
        # fetch the results first so the debug loop does not exhaust the
        # cursor before insert_rows() reads from it
        rows = cursor.fetchall()
        for i, row in enumerate(rows):
            print("row", row)
        dest_mysql.insert_rows(table=self.dest_table, rows=rows)
        logging.info(str(cursor.rowcount) + " rows inserted")
    else:
        logging.info("No rows inserted")

    if self.mysql_postoperator:
        logging.info("Running Mysql postoperator")
        dest_mysql.run(self.mysql_postoperator)

    logging.info("Done.")
def bulk_load_teams(table_name, **kwargs):
    local_filepath = '/home/vagrant/airflow/dags/baseballdatabank-master/core/top_teams_final.csv'
    conn = MySqlHook(mysql_conn_id='local_mysql')
    # conn.bulk_load(table_name, local_filepath)
    results = pandas.read_csv(local_filepath, sep='\t',
                              names=['yearID', 'franchID', 'teamID', 'W', 'L',
                                     'percentage', 'franchName'],
                              encoding='utf-8')
    conn.insert_rows(table=table_name, rows=results.values.tolist())
    return table_name
def execute(self, context):
    mysql_hook = MySqlHook(schema=self.database, mysql_conn_id=self.mysql_conn_id)
    for rows in self._bq_get_data():
        mysql_hook.insert_rows(self.mysql_table, rows, replace=self.replace)
def filter_db():
    api = MySqlHook()
    data = api.get_records(sql='select * from movie where vote_average > 7')
    # truncate the filter table
    api.run(sql='truncate table movie_filter')
    # insert into the filter table
    api.insert_rows(table='movie_filter', rows=data)
def sql_import(**kwargs):
    input_file = kwargs['templates_dict']['input_file']
    columns = ["WORD", "TIMES"]
    mysql = MySqlHook(mysql_conn_id='workshop_sql_conn_id')
    mysql.run("TRUNCATE WORDCOUNT")
    with open(input_file) as file:
        reader = csv.reader(file, delimiter=' ')
        data = list(reader)
        mysql.insert_rows('WORDCOUNT', data, target_fields=columns)
def sql_import(**kwargs):
    input_file = kwargs['templates_dict']['input_file']
    columns = [
        "CODE", "NUMBER_RELATED_ORDERS", "NUMBER_STATUSES", "NUMBER_PARTNERS",
        "NUMBER_COMMENTS"
    ]
    mysql = MySqlHook(mysql_conn_id='workshop_sql_conn_id')
    mysql.run("TRUNCATE PROCESSED_ORDER")
    with open(input_file) as file:
        reader = csv.reader(file, delimiter=' ')
        data = list(reader)
        mysql.insert_rows('PROCESSED_ORDER', data, target_fields=columns)
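For context, a hedged sketch of how a callable like sql_import above is typically wired up with Airflow 1.x's PythonOperator; the dag object, task_id, and the templated file path are assumptions for illustration, not taken from the original.

from airflow.operators.python_operator import PythonOperator

import_orders = PythonOperator(
    task_id='sql_import',                  # hypothetical task id
    python_callable=sql_import,
    provide_context=True,                  # exposes templates_dict via **kwargs
    templates_dict={'input_file': '/tmp/processed_orders_{{ ds }}.csv'},  # assumed path
    dag=dag,                               # assumed existing DAG object
)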
def execute(self, context):
    presto = PrestoHook(presto_conn_id=self.presto_conn_id)
    self.log.info("Extracting data from Presto: %s", self.sql)
    results = presto.get_records(self.sql)

    mysql = MySqlHook(mysql_conn_id=self.mysql_conn_id)
    if self.mysql_preoperator:
        self.log.info("Running MySQL preoperator")
        self.log.info(self.mysql_preoperator)
        mysql.run(self.mysql_preoperator)

    self.log.info("Inserting rows into MySQL")
    mysql.insert_rows(table=self.mysql_table, rows=results)
def execute(self, context):
    postgres = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    self.log.info("Extracting data from Redshift: %s", self.sql)
    results = postgres.get_records(self.sql)

    mysql = MySqlHook(mysql_conn_id=self.mysql_conn_id)
    if self.mysql_preoperator:
        self.log.info("Running MySQL preoperator")
        self.log.info(self.mysql_preoperator)
        mysql.run(self.mysql_preoperator)

    self.log.info("Inserting rows into MySQL")
    mysql.insert_rows(table=self.mysql_table, rows=results, replace=self.replace)
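The replace flag above changes how the hook writes rows: with replace=True, Airflow's DbApiHook-based insert_rows emits REPLACE INTO instead of INSERT INTO, so an incoming row that shares a primary or unique key with an existing row overwrites it. A minimal direct call as a sketch, with the connection id, table, and rows invented for illustration:

from airflow.hooks.mysql_hook import MySqlHook  # Airflow 1.x import path

mysql = MySqlHook(mysql_conn_id='mysql_default')   # assumed connection id
mysql.insert_rows(
    table='address',                               # hypothetical table
    rows=[(1, 'Main St'), (2, 'Second St')],       # hypothetical rows
    target_fields=['id', 'street'],
    commit_every=1000,
    replace=True)                                  # REPLACE INTO instead of INSERT INTO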
def insert_or_update_table(**kwargs):
    try:
        json_data = json.loads(kwargs["extra_json"])
        table_name = json_data['schedule_info']['output_table']
        sql = kwargs['sql']
        logging.info('trying the task')
        logging.info('connecting to source')
        src = MySqlHook(mysql_conn_id=kwargs['schema'])
        logging.info(f"Remotely received sql of {sql}")
        logging.info(f"Remotely received table name of {table_name}")
        logging.info('connecting to destination')
        dest = MySqlHook(mysql_conn_id='analytics')
        src_conn = src.get_conn()
        cursor = src_conn.cursor()
        cursor.execute(sql)
        dest.insert_rows(table=table_name, rows=cursor, replace=True)
    except Exception as e3:
        logging.error('Table update failed, please refer to the logs for more details')
        logging.exception(e3)
def copy(ds, **kwargs):
    source_query = """select * from address;"""
    dest_query = "insert into address values %s"
    source_hook = create_engine(
        'postgresql+psycopg2://airflow:airflow@postgres/airflow')
    source_conn = source_hook.connect()
    records = source_conn.execute(source_query)
    dest_hook = MySqlHook(mysql_conn_id="target", schema="mysql")
    dest_conn = dest_hook.get_conn()
    dest_cursor = dest_conn.cursor()
    if records:
        # logging.info("Inserting rows into MySQL")
        dest_hook.insert_rows(table="address", rows=records)
    dest_cursor.close()
    source_conn.close()
    dest_conn.close()
def execute(self, context):
    dest_mysql = MySqlHook(mysql_conn_id=self.dest_mysqls_conn_id)
    # use the cursor handed to the operator directly, or pull one from XCom
    # if a source task id was given
    self.cursor = (self.cursor if not self.data_cursor
                   else context['ti'].xcom_pull(key=None, task_ids=self.data_cursor))
    logging.info("Transferring cursor into new Mysql database.")

    if self.mysql_preoperator:
        logging.info("Running Mysql preoperator")
        dest_mysql.run(self.mysql_preoperator)

    if self.cursor:
        dest_mysql.insert_rows(table=self.dest_table, rows=self.cursor)
        logging.info("%s rows inserted", self.cursor.rowcount)
    else:
        logging.info("No rows inserted")

    if self.mysql_postoperator:
        logging.info("Running Mysql postoperator")
        dest_mysql.run(self.mysql_postoperator)

    logging.info("Done.")
def execute(self, context):
    vertica = VerticaHook(vertica_conn_id=self.vertica_conn_id)
    mysql = MySqlHook(mysql_conn_id=self.mysql_conn_id)
    tmpfile = None
    result = None
    selected_columns = []

    count = 0
    with closing(vertica.get_conn()) as conn:
        with closing(conn.cursor()) as cursor:
            cursor.execute(self.sql)
            selected_columns = [d.name for d in cursor.description]

            if self.bulk_load:
                tmpfile = NamedTemporaryFile("w")

                self.log.info(
                    "Selecting rows from Vertica to local file %s...",
                    tmpfile.name)
                self.log.info(self.sql)

                csv_writer = csv.writer(tmpfile, delimiter='\t', encoding='utf-8')
                for row in cursor.iterate():
                    csv_writer.writerow(row)
                    count += 1

                tmpfile.flush()
            else:
                self.log.info("Selecting rows from Vertica...")
                self.log.info(self.sql)

                result = cursor.fetchall()
                count = len(result)

            self.log.info("Selected rows from Vertica %s", count)

    if self.mysql_preoperator:
        self.log.info("Running MySQL preoperator...")
        mysql.run(self.mysql_preoperator)

    try:
        if self.bulk_load:
            self.log.info("Bulk inserting rows into MySQL...")
            with closing(mysql.get_conn()) as conn:
                with closing(conn.cursor()) as cursor:
                    cursor.execute(
                        "LOAD DATA LOCAL INFILE '%s' INTO "
                        "TABLE %s LINES TERMINATED BY '\r\n' (%s)" %
                        (tmpfile.name,
                         self.mysql_table,
                         ", ".join(selected_columns)))
                    conn.commit()

            tmpfile.close()
        else:
            self.log.info("Inserting rows into MySQL...")
            mysql.insert_rows(table=self.mysql_table,
                              rows=result,
                              target_fields=selected_columns)

        self.log.info("Inserted rows into MySQL %s", count)
    except (MySQLdb.Error, MySQLdb.Warning):
        self.log.info("Inserted rows into MySQL 0")
        raise

    if self.mysql_postoperator:
        self.log.info("Running MySQL postoperator...")
        mysql.run(self.mysql_postoperator)

    self.log.info("Done")
def execute(self, context=None):
    metastore = HiveMetastoreHook(metastore_conn_id=self.metastore_conn_id)
    table = metastore.get_table(table_name=self.table)
    field_types = {col.name: col.type for col in table.sd.cols}

    exprs = {
        ('', 'count'): 'COUNT(*)'
    }
    for col, col_type in list(field_types.items()):
        d = {}
        if self.assignment_func:
            d = self.assignment_func(col, col_type)
            if d is None:
                d = self.get_default_exprs(col, col_type)
        else:
            d = self.get_default_exprs(col, col_type)
        exprs.update(d)
    exprs.update(self.extra_exprs)
    exprs = OrderedDict(exprs)
    exprs_str = ",\n        ".join([
        v + " AS " + k[0] + '__' + k[1]
        for k, v in exprs.items()])

    where_clause = ["{} = '{}'".format(k, v) for k, v in self.partition.items()]
    where_clause = " AND\n        ".join(where_clause)
    sql = "SELECT {exprs_str} FROM {table} WHERE {where_clause};".format(
        exprs_str=exprs_str, table=self.table, where_clause=where_clause)

    presto = PrestoHook(presto_conn_id=self.presto_conn_id)
    self.log.info('Executing SQL check: %s', sql)
    row = presto.get_first(hql=sql)
    self.log.info("Record: %s", row)
    if not row:
        raise AirflowException("The query returned None")

    part_json = json.dumps(self.partition, sort_keys=True)

    self.log.info("Deleting rows from previous runs if they exist")
    mysql = MySqlHook(self.mysql_conn_id)
    sql = """
    SELECT 1 FROM hive_stats
    WHERE
        table_name='{table}' AND
        partition_repr='{part_json}' AND
        dttm='{dttm}'
    LIMIT 1;
    """.format(table=self.table, part_json=part_json, dttm=self.dttm)
    if mysql.get_records(sql):
        sql = """
        DELETE FROM hive_stats
        WHERE
            table_name='{table}' AND
            partition_repr='{part_json}' AND
            dttm='{dttm}';
        """.format(table=self.table, part_json=part_json, dttm=self.dttm)
        mysql.run(sql)

    self.log.info("Pivoting and loading cells into the Airflow db")
    rows = [(self.ds, self.dttm, self.table, part_json) +
            (r[0][0], r[0][1], r[1])
            for r in zip(exprs, row)]
    mysql.insert_rows(
        table='hive_stats',
        rows=rows,
        target_fields=[
            'ds',
            'dttm',
            'table_name',
            'partition_repr',
            'col',
            'metric',
            'value',
        ]
    )
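A small pure-Python illustration of the pivot in the last step above: zipping the (column, metric) keys of the ordered expression dict with the single wide result row yields one narrow (col, metric, value) record per statistic. The expressions and values below are made up.

from collections import OrderedDict

exprs = OrderedDict([(('', 'count'), 'COUNT(*)'),       # hypothetical stats
                     (('price', 'max'), 'MAX(price)')])
row = (42, 9.99)                                        # pretend Presto result
records = [(k[0], k[1], v) for k, v in zip(exprs, row)]
# -> [('', 'count', 42), ('price', 'max', 9.99)]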