import pandas
from airflow.hooks.mysql_hook import MySqlHook


def ingest_csv_into_mysql(input_csv):
    """Load a CSV file into the MySQL `users` table, replacing any existing rows."""
    df = pandas.read_csv(input_csv)
    hook = MySqlHook(mysql_conn_id='mysql_connection')
    # `if_exists='replace'` drops and recreates the table before inserting,
    # so the column types are re-inferred by pandas on every run.
    df.to_sql(
        name="users",
        if_exists='replace',
        con=hook.get_sqlalchemy_engine(),
    )
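# A minimal sketch of how a callable like ingest_csv_into_mysql might be wired
# into a DAG with PythonOperator. The dag_id, schedule, CSV path, and start
# date below are assumptions for illustration, not taken from the snippets here;
# the import paths follow the Airflow 1.x style used elsewhere in this file.
from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator

with DAG(dag_id='csv_to_mysql_example',
         start_date=datetime(2021, 1, 1),
         schedule_interval='@daily') as dag:
    load_users = PythonOperator(
        task_id='ingest_csv_into_mysql',
        python_callable=ingest_csv_into_mysql,
        op_kwargs={'input_csv': '/tmp/users.csv'},  # hypothetical path
    )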
import pandas as pd
from airflow.hooks.mysql_hook import MySqlHook


def upload_db(table, tmp_file, mysql_conn_id='default_mysql'):
    """Truncate the target table, then bulk-load a CSV into it."""
    df = pd.read_csv(tmp_file)
    mysql_hook = MySqlHook(mysql_conn_id=mysql_conn_id)
    print(df)
    # Truncate rather than drop: the table keeps its schema, indexes, and grants.
    conn = mysql_hook.get_conn()
    cursor = conn.cursor()
    cursor.execute('TRUNCATE TABLE {}'.format(table))
    conn.commit()
    cursor.close()
    conn.close()
    df.to_sql(table, mysql_hook.get_sqlalchemy_engine(),
              if_exists='append', index=False)
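# Why truncate + if_exists='append' rather than if_exists='replace'? 'replace'
# drops the table and lets pandas recreate it from inferred dtypes, losing any
# hand-tuned column types, indexes, and foreign keys. The one-step variant
# below (a hypothetical sibling of upload_db, not from the original snippet)
# is simpler but only safe when the table definition is disposable.
def upload_db_replace(table, tmp_file, mysql_conn_id='default_mysql'):
    df = pd.read_csv(tmp_file)
    mysql_hook = MySqlHook(mysql_conn_id=mysql_conn_id)
    df.to_sql(table, mysql_hook.get_sqlalchemy_engine(),
              if_exists='replace', index=False)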
import pandas as pd
from datetime import datetime

from airflow.hooks.mysql_hook import MySqlHook
# GoogleAnalyticsHook comes from a separate Google Analytics plugin,
# alongside the operator this execute() method belongs to.


def execute(self, context):
    ga_conn = GoogleAnalyticsHook(self.google_analytics_conn_id,
                                  key_file=self.key_file)
    # Accept either a full timestamp or an already-formatted value for the
    # reporting window; fall back to the raw value if parsing fails.
    try:
        since_formatted = datetime.strptime(
            self.since, '%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%d')
    except (TypeError, ValueError):
        since_formatted = str(self.since)
    try:
        until_formatted = datetime.strptime(
            self.until, '%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%d')
    except (TypeError, ValueError):
        until_formatted = str(self.until)

    report = ga_conn.get_analytics_report(
        self.view_id, since_formatted, until_formatted,
        self.sampling_level, self.dimensions, self.metrics,
        self.page_size, self.include_empty_rows,
        dimension_filter_clauses=self.dimension_filter_clauses)

    column_header = report.get('columnHeader', {})
    # All dimensions are hardcoded to varchar(255); a type map will be needed
    # if any non-varchar dimensions are used in the future. Unfortunately the
    # API does not send back types for dimensions like it does for metrics (yet).
    dimension_headers = [{
        'name': header.replace('ga:', ''),
        'type': 'varchar(255)'
    } for header in column_header.get('dimensions', [])]
    metric_headers = [{
        'name': entry.get('name').replace('ga:', ''),
        'type': self.metricMap.get(entry.get('type'), 'varchar(255)')
    } for entry in column_header.get('metricHeader', {}).get(
        'metricHeaderEntries', [])]

    # Flatten the report: one dict per dimension/metric combination.
    rows = report.get('data', {}).get('rows', [])
    all_data = []
    for row in rows:
        root_data_obj = {}
        dimensions = row.get('dimensions', [])
        metrics = row.get('metrics', [])
        for index, dimension in enumerate(dimensions):
            header = dimension_headers[index].get('name').lower()
            root_data_obj[header] = dimension
        for metric in metrics:
            data = {}
            data.update(root_data_obj)
            for index, value in enumerate(metric.get('values', [])):
                header = metric_headers[index].get('name').lower()
                data[header] = value
            data['viewid'] = self.view_id
            data['timestamp'] = self.since
            all_data.append(data)

    df_google_data = pd.DataFrame(all_data)
    mysql_hook = MySqlHook(self.mysql_conn_id)
    df_google_data.to_sql(name=self.destination_table,
                          con=mysql_hook.get_sqlalchemy_engine(),
                          dtype=self.destination_table_dtypes,
                          if_exists=self.if_exists,
                          schema=self.destination_schema)
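# execute() above relies on self.metricMap to translate Google Analytics metric
# types into MySQL column types. A plausible sketch of that mapping, assuming
# the Reporting API v4 metric type names; this dict would live as a class
# attribute on the operator, and the MySQL types chosen here are illustrative,
# not taken from the original operator.
metricMap = {
    'METRIC_TYPE_UNSPECIFIED': 'varchar(255)',
    'INTEGER': 'int(11)',
    'FLOAT': 'decimal(20,5)',
    'CURRENCY': 'decimal(20,5)',
    'PERCENT': 'decimal(20,5)',
    'TIME': 'time',
}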
from airflow.hooks.mysql_hook import MySqlHook
from sqlalchemy.orm import sessionmaker

# MySQL config: utf8mb4 so the session can store the full Unicode range
# (e.g. emoji in post titles).
mysqlhook = MySqlHook(mysql_conn_id='PTT')
engine_kwargs = {'connect_args': {'charset': 'utf8mb4'}}
Session = sessionmaker(bind=mysqlhook.get_sqlalchemy_engine(engine_kwargs))
session = Session()


def upsert(row, table):
    url = row['url']
    hits = row['hits']
    title = row['title']
    board = row['board']
    author = row['author']
    posted_date = row['timestamp'].split('T')[0]  # keep the date part of an ISO timestamp
    description = row['description']
    # Match on the natural key; insert a new record if none exists,
    # otherwise update the mutable fields in place.
    record = session.query(table).filter_by(title=title, author=author,
                                            board=board, url=url).first()
    if not record:
        record = table()
        record.url = url
        record.title = title
        record.board = board
        record.author = author
        session.add(record)
    record.hits = hits
    record.posted_date = posted_date
    record.description = description
    session.commit()
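# A minimal usage sketch for upsert(). The Post model, table name, and input
# file below are assumptions for illustration: the original snippet only shows
# that `table` is a SQLAlchemy mapped class whose columns match the row fields.
import pandas as pd
from sqlalchemy import Column, Integer, String, Text
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()


class Post(Base):
    __tablename__ = 'posts'  # hypothetical table name
    id = Column(Integer, primary_key=True)
    url = Column(String(255))
    title = Column(String(255))
    board = Column(String(64))
    author = Column(String(64))
    hits = Column(Integer)
    posted_date = Column(String(10))  # 'YYYY-MM-DD', stored as text for simplicity
    description = Column(Text)


df = pd.read_json('ptt_posts.json')  # hypothetical scrape output
for _, row in df.iterrows():
    upsert(row, Post)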