Example #1
# Assumed imports for this snippet (Airflow 1.x paths); `hooks` is the
# project-specific module that provides MyMysqlHook.
import logging

import hooks
import pandas as pd
from airflow.hooks.mysql_hook import MySqlHook


def load_rows_to_destination(df, conn_id_dest, table_name_dest, load_all):
    # Cast the frame to object dtype so NaT values are treated as plain null
    # objects and can be converted to None below.
    df = df.astype(object)

    # convert nulls to None (needed in MySQL upload)
    logging.debug("Convert NaN, NaT -> None")
    df = df.where(pd.notnull(df), None)

    target_fields = list(df.keys())

    logging.info("Column fields from source: {}".format(target_fields))
    logging.info("Row Count from chunk source: '{}'".format(df.shape[0]))

    if not load_all:  # just load the part that has updated_at > last_destination_updated_at
        mysql_hook_load = hooks.MyMysqlHook(conn_id_dest)

        # replace should be false, but cannot be sure that we are not repeating values
        mysql_hook_load.insert_update_on_duplicate_rows(
            table_name_dest,
            rows=df.values.tolist(),
            columns=target_fields,
            commit_every=1000)

    else:  # load everything replacing any value if the same PK is found
        mysql_hook_load = MySqlHook(conn_id_dest)
        mysql_hook_load.insert_rows(table_name_dest,
                                    df.values.tolist(),
                                    target_fields=target_fields,
                                    commit_every=1000,
                                    replace=True)
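A minimal sketch of how the loader above might be driven from a task; the connection IDs, query, and table name are placeholders, and get_pandas_df comes from Airflow's DbApiHook:

def sync_events_chunk():
    src_hook = MySqlHook(mysql_conn_id='source_mysql')              # placeholder conn id
    df = src_hook.get_pandas_df("SELECT * FROM events LIMIT 1000")  # placeholder query
    # load_all=True takes the plain MySqlHook.insert_rows(replace=True) branch,
    # so the project-specific MyMysqlHook is not needed for this sketch.
    load_rows_to_destination(df,
                             conn_id_dest='dest_mysql',             # placeholder conn id
                             table_name_dest='events',
                             load_all=True)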
Example #2
    def execute(self, context):
        hive = HiveServer2Hook(hiveserver2_conn_id=self.hiveserver2_conn_id)
        logging.info("Extracting data from Hive")
        logging.info(self.sql)

        if self.bulk_load:
            tmpfile = NamedTemporaryFile()
            hive.to_csv(self.sql,
                        tmpfile.name,
                        delimiter='\t',
                        lineterminator='\n',
                        output_header=False)
        else:
            results = hive.get_records(self.sql)

        mysql = MySqlHook(mysql_conn_id=self.mysql_conn_id)
        if self.mysql_preoperator:
            logging.info("Running MySQL preoperator")
            mysql.run(self.mysql_preoperator)

        logging.info("Inserting rows into MySQL")

        if self.bulk_load:
            mysql.bulk_load(table=self.mysql_table, tmp_file=tmpfile.name)
            tmpfile.close()
        else:
            mysql.insert_rows(table=self.mysql_table, rows=results)

        if self.mysql_postoperator:
            logging.info("Running MySQL postoperator")
            mysql.run(self.mysql_postoperator)

        logging.info("Done.")
Example #4
    def execute(self, context):
        logging.info('Executing: ' + str(self.sql))
        src_mysql = MySqlHook(mysql_conn_id=self.src_mysql_conn_id)
        dest_mysql = MySqlHook(mysql_conn_id=self.dest_mysqls_conn_id)

        logging.info(
            "Transferring Mysql query results into other Mysql database.")
        conn = src_mysql.get_conn()
        cursor = conn.cursor()
        cursor.execute(self.sql, self.query_parameters)

        if self.mysql_preoperator:
            logging.info("Running Mysql preoperator")
            dest_mysql.run(self.mysql_preoperator)

        if cursor.rowcount != 0:
            logging.info("Inserting rows into Mysql")
            # Fetch the result set once; looping over the cursor just to print
            # it would exhaust it before insert_rows could read any rows.
            rows = cursor.fetchall()
            dest_mysql.insert_rows(table=self.dest_table, rows=rows)
            logging.info(str(cursor.rowcount) + " rows inserted")
        else:
            logging.info("No rows inserted")

        if self.mysql_postoperator:
            logging.info("Running Mysql postoperator")
            dest_mysql.run(self.mysql_postoperator)

        logging.info("Done.")
Example #5
# Assumed imports for this snippet (Airflow 1.x path for the hook).
import pandas
from airflow.hooks.mysql_hook import MySqlHook


def bulk_load_teams(table_name, **kwargs):
    local_filepath = '/home/vagrant/airflow/dags/baseballdatabank-master/core/top_teams_final.csv'
    conn = MySqlHook(mysql_conn_id='local_mysql')
    # conn.bulk_load(table_name, local_filepath)
    results = pandas.read_csv(local_filepath, sep='\t',
                              names=['yearID', 'franchID', 'teamID', 'W', 'L',
                                     'percentage', 'franchName'],
                              encoding='utf-8')
    conn.insert_rows(table=table_name, rows=results.values.tolist())
    return table_name
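One way the function above might be wired into a DAG; the task id and table name are placeholders and the dag object is assumed to be defined elsewhere in the file:

from airflow.operators.python_operator import PythonOperator  # Airflow 1.x path

load_teams = PythonOperator(
    task_id='bulk_load_teams',
    python_callable=bulk_load_teams,
    op_kwargs={'table_name': 'top_teams'},   # placeholder table name
    dag=dag,                                 # assumed to exist in the DAG file
)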
Example #6
    def execute(self, context):
        mysql_hook = MySqlHook(schema=self.database,
                               mysql_conn_id=self.mysql_conn_id)
        for rows in self._bq_get_data():
            mysql_hook.insert_rows(self.mysql_table,
                                   rows,
                                   replace=self.replace)
Example #7
# Assumed import for this snippet (Airflow 1.x path).
from airflow.hooks.mysql_hook import MySqlHook


def filter_db():
    api = MySqlHook()
    data = api.get_records(sql='select * from movie where vote_average > 7')

    # truncate the filter table
    api.run(sql='truncate table movie_filter')

    # insert into the filter table
    api.insert_rows(table='movie_filter', rows=data)
Example #8
# Assumed imports for this snippet (Airflow 1.x path for the hook).
import csv

from airflow.hooks.mysql_hook import MySqlHook


def sql_import(**kwargs):
    input_file = kwargs['templates_dict']['input_file']
    columns = ["WORD", "TIMES"]
    mysql = MySqlHook(mysql_conn_id='workshop_sql_conn_id')
    mysql.run("TRUNCATE WORDCOUNT")
    with open(input_file) as file:
        reader = csv.reader(file, delimiter=' ')
        data = list(reader)
        mysql.insert_rows('WORDCOUNT', data, target_fields=columns)
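The input_file comes out of templates_dict, so the caller is presumably a PythonOperator with a templated path. A minimal sketch under that assumption; the path, task id, and dag object are placeholders:

from airflow.operators.python_operator import PythonOperator  # Airflow 1.x path

import_wordcount = PythonOperator(
    task_id='sql_import',
    python_callable=sql_import,
    provide_context=True,                                          # Airflow 1.x only
    templates_dict={'input_file': '/tmp/wordcount_{{ ds }}.txt'},  # placeholder path
    dag=dag,                                                       # assumed to exist
)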
Example #9
def sql_import(**kwargs):
    input_file = kwargs['templates_dict']['input_file']
    columns = [
        "CODE", "NUMBER_RELATED_ORDERS", "NUMBER_STATUSES", "NUMBER_PARTNERS",
        "NUMBER_COMMENTS"
    ]
    mysql = MySqlHook(mysql_conn_id='workshop_sql_conn_id')
    mysql.run("TRUNCATE PROCESSED_ORDER")
    with open(input_file) as file:
        reader = csv.reader(file, delimiter=' ')
        data = list(reader)
        mysql.insert_rows('PROCESSED_ORDER', data, target_fields=columns)
Example #10
    def execute(self, context):
        presto = PrestoHook(presto_conn_id=self.presto_conn_id)
        self.log.info("Extracting data from Presto: %s", self.sql)
        results = presto.get_records(self.sql)

        mysql = MySqlHook(mysql_conn_id=self.mysql_conn_id)
        if self.mysql_preoperator:
            self.log.info("Running MySQL preoperator")
            self.log.info(self.mysql_preoperator)
            mysql.run(self.mysql_preoperator)

        self.log.info("Inserting rows into MySQL")
        mysql.insert_rows(table=self.mysql_table, rows=results)
Example #12
    def execute(self, context):
        postgres = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        self.log.info("Extracting data from Redshift: %s", self.sql)
        results = postgres.get_records(self.sql)

        mysql = MySqlHook(mysql_conn_id=self.mysql_conn_id)
        if self.mysql_preoperator:
            self.log.info("Running MySQL preoperator")
            self.log.info(self.mysql_preoperator)
            mysql.run(self.mysql_preoperator)

        self.log.info("Inserting rows into MySQL")
        mysql.insert_rows(table=self.mysql_table,
                          rows=results,
                          replace=self.replace)
Example #13
# Assumed imports for this snippet (Airflow 1.x path for the hook).
import json
import logging

from airflow.hooks.mysql_hook import MySqlHook


def insert_or_update_table(**kwargs):
    try:
        json_data = json.loads(kwargs["extra_json"])
        table_name = json_data['schedule_info']['output_table']
        sql = kwargs['sql']
        logging.info('trying the task')
        logging.info('connecting to source')
        src = MySqlHook(mysql_conn_id=kwargs['schema'])
        logging.info(f"Remotely received sql of {sql}")
        logging.info(f"Remotely received sql of {table_name}")
        logging.info('connecting to destination')
        dest = MySqlHook(mysql_conn_id='analytics')
        src_conn = src.get_conn()
        cursor = src_conn.cursor()
        cursor.execute(sql)
        dest.insert_rows(table=table_name, rows=cursor, replace=True)
    except Exception as e3:
        logging.error('Table update failed, please refer to the logs for more details')
        logging.exception(e3)
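A sketch of how the callable above might be invoked; the op_kwargs values are placeholders shaped to match what the function reads (extra_json, sql, schema):

from airflow.operators.python_operator import PythonOperator  # Airflow 1.x path

update_output_table = PythonOperator(
    task_id='insert_or_update_table',
    python_callable=insert_or_update_table,
    op_kwargs={
        'schema': 'source_mysql',                 # placeholder source conn id
        'sql': 'SELECT * FROM daily_summary',     # placeholder query
        'extra_json': '{"schedule_info": {"output_table": "daily_summary"}}',
    },
    dag=dag,                                      # assumed to exist in the DAG file
)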
Example #14
# Assumed imports for this snippet (Airflow 1.x path for the hook).
from sqlalchemy import create_engine

from airflow.hooks.mysql_hook import MySqlHook


def copy(ds, **kwargs):
    source_query = """select * from address;"""
    dest_query = "insert into address values %s"

    source_hook = create_engine(
        'postgresql+psycopg2://airflow:airflow@postgres/airflow')
    source_conn = source_hook.connect()
    records = source_conn.execute(source_query)

    dest_hook = MySqlHook(mysql_conn_id="target", schema="mysql")
    dest_conn = dest_hook.get_conn()
    dest_cursor = dest_conn.cursor()

    if records:
        # logging.info("Inserting rows into MySQL")
        dest_hook.insert_rows(table="address", rows=records)

    dest_cursor.close()

    source_conn.close()
    dest_conn.close()
Example #15
    def execute(self, context):
        dest_mysql = MySqlHook(mysql_conn_id=self.dest_mysqls_conn_id)

        # If a producing task id is configured (assumed to live on
        # self.data_cursor), pull its cursor from XCom; otherwise keep the
        # cursor already set on the operator.
        if self.data_cursor:
            self.cursor = context['ti'].xcom_pull(key=None,
                                                  task_ids=self.data_cursor)

        logging.info("Transferring cursor into new Mysql database.")

        if self.mysql_preoperator:
            logging.info("Running Mysql preoperator")
            dest_mysql.run(self.mysql_preoperator)

        if self.cursor:
            dest_mysql.insert_rows(table=self.dest_table, rows=self.cursor)
            logging.info("%s rows inserted", self.cursor.rowcount)
        else:
            logging.info("No rows inserted")

        if self.mysql_postoperator:
            logging.info("Running Mysql postoperator")
            dest_mysql.run(self.mysql_postoperator)

        logging.info("Done.")
Example #16
    def execute(self, context):
        vertica = VerticaHook(vertica_conn_id=self.vertica_conn_id)
        mysql = MySqlHook(mysql_conn_id=self.mysql_conn_id)

        tmpfile = None
        result = None

        selected_columns = []

        count = 0
        with closing(vertica.get_conn()) as conn:
            with closing(conn.cursor()) as cursor:
                cursor.execute(self.sql)
                selected_columns = [d.name for d in cursor.description]

                if self.bulk_load:
                    # Open as UTF-8 text so the csv writer can write to it.
                    tmpfile = NamedTemporaryFile("w", encoding='utf-8')

                    self.log.info(
                        "Selecting rows from Vertica to local file %s...",
                        tmpfile.name)
                    self.log.info(self.sql)

                    # csv.writer has no encoding parameter; the temp file above
                    # is opened as UTF-8 text instead.
                    csv_writer = csv.writer(tmpfile, delimiter='\t')
                    for row in cursor.iterate():
                        csv_writer.writerow(row)
                        count += 1

                    tmpfile.flush()
                else:
                    self.log.info("Selecting rows from Vertica...")
                    self.log.info(self.sql)

                    result = cursor.fetchall()
                    count = len(result)

                self.log.info("Selected rows from Vertica %s", count)

        if self.mysql_preoperator:
            self.log.info("Running MySQL preoperator...")
            mysql.run(self.mysql_preoperator)

        try:
            if self.bulk_load:
                self.log.info("Bulk inserting rows into MySQL...")
                with closing(mysql.get_conn()) as conn:
                    with closing(conn.cursor()) as cursor:
                        cursor.execute(
                            "LOAD DATA LOCAL INFILE '%s' INTO "
                            "TABLE %s LINES TERMINATED BY '\r\n' (%s)" %
                            (tmpfile.name, self.mysql_table,
                             ", ".join(selected_columns)))
                        conn.commit()
                tmpfile.close()
            else:
                self.log.info("Inserting rows into MySQL...")
                mysql.insert_rows(table=self.mysql_table,
                                  rows=result,
                                  target_fields=selected_columns)
            self.log.info("Inserted rows into MySQL %s", count)
        except (MySQLdb.Error, MySQLdb.Warning):
            self.log.info("Inserted rows into MySQL 0")
            raise

        if self.mysql_postoperator:
            self.log.info("Running MySQL postoperator...")
            mysql.run(self.mysql_postoperator)

        self.log.info("Done")
Example #19
    def execute(self, context=None):
        metastore = HiveMetastoreHook(metastore_conn_id=self.metastore_conn_id)
        table = metastore.get_table(table_name=self.table)
        field_types = {col.name: col.type for col in table.sd.cols}

        exprs = {('', 'count'): 'COUNT(*)'}
        for col, col_type in list(field_types.items()):
            d = {}
            if self.assignment_func:
                d = self.assignment_func(col, col_type)
                if d is None:
                    d = self.get_default_exprs(col, col_type)
            else:
                d = self.get_default_exprs(col, col_type)
            exprs.update(d)
        exprs.update(self.extra_exprs)
        exprs = OrderedDict(exprs)
        exprs_str = ",\n        ".join(
            [v + " AS " + k[0] + '__' + k[1] for k, v in exprs.items()])

        where_clause = [
            "{} = '{}'".format(k, v) for k, v in self.partition.items()
        ]
        where_clause = " AND\n        ".join(where_clause)
        sql = "SELECT {exprs_str} FROM {table} WHERE {where_clause};".format(
            exprs_str=exprs_str, table=self.table, where_clause=where_clause)

        presto = PrestoHook(presto_conn_id=self.presto_conn_id)
        self.log.info('Executing SQL check: %s', sql)
        row = presto.get_first(hql=sql)
        self.log.info("Record: %s", row)
        if not row:
            raise AirflowException("The query returned None")

        part_json = json.dumps(self.partition, sort_keys=True)

        self.log.info("Deleting rows from previous runs if they exist")
        mysql = MySqlHook(self.mysql_conn_id)
        sql = """
        SELECT 1 FROM hive_stats
        WHERE
            table_name='{table}' AND
            partition_repr='{part_json}' AND
            dttm='{dttm}'
        LIMIT 1;
        """.format(table=self.table, part_json=part_json, dttm=self.dttm)
        if mysql.get_records(sql):
            sql = """
            DELETE FROM hive_stats
            WHERE
                table_name='{table}' AND
                partition_repr='{part_json}' AND
                dttm='{dttm}';
            """.format(table=self.table, part_json=part_json, dttm=self.dttm)
            mysql.run(sql)

        self.log.info("Pivoting and loading cells into the Airflow db")
        rows = [(self.ds, self.dttm, self.table, part_json) +
                (r[0][0], r[0][1], r[1]) for r in zip(exprs, row)]
        mysql.insert_rows(table='hive_stats',
                          rows=rows,
                          target_fields=[
                              'ds',
                              'dttm',
                              'table_name',
                              'partition_repr',
                              'col',
                              'metric',
                              'value',
                          ])