Example #1
    def run(self, conn_id='postgres_bills3'):
        """
        Fetches bill text for the train and val datasets and writes it to the outputs.
        """
        train = self.inputs["train"].read()
        val = self.inputs["val"].read()

        train_ids = [str(x) for x in set(train["bill_id"].values)]
        val_ids = [str(x) for x in set(val["bill_id"].values)]
        del train
        del val

        train_sql_query = """
        select bill_id, doc
        from ml_policy_class.bill_texts
        where bill_id in ({}) and type_id = 1
        """.format(", ".join(train_ids))

        val_sql_query = """
        select bill_id, doc
        from ml_policy_class.bill_texts
        where bill_id in ({}) and type_id = 1
        """.format(", ".join(val_ids))

        pg_hook = PostgresHook(postgres_conn_id=conn_id)
        txt_train = pg_hook.get_pandas_df(train_sql_query)
        txt_val = pg_hook.get_pandas_df(val_sql_query)

        self.outputs["txt_train"].write(txt_train)
        self.outputs["txt_val"].write(txt_val)
def get_error_dict(redshift_conn_id):

    get_table_name = """
    SELECT DISTINCT
        perm.name,
        stl.tbl AS id
    FROM 
        stl_load_errors stl
    LEFT JOIN 
        STV_TBL_PERM perm
    ON 
        stl.tbl = perm.id
    WHERE 
        perm.name != 'None'
    AND
        stl.session = (SELECT session FROM stl_load_errors ORDER BY session DESC LIMIT 1)
    """   
    
    redshift = PostgresHook(redshift_conn_id)
    
    # gets names and table IDs of tables in stl_load_errors table in current redshift session
    table_df = redshift.get_pandas_df(get_table_name)

    print('table_df: ', table_df)
    # creates dictionary of table names and IDs to loop over
    stl_table_dict = dict(zip(table_df['name'].apply(lambda name: name.strip()).values, table_df['id'].values))
    
    print('Staging tables within stl_load_errors table: ', list(stl_table_dict.keys()))
    
    return stl_table_dict
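The dictionary returned above maps staging-table names to their IDs in stl_load_errors. A minimal sketch of how it might be consumed, assuming it is paired with the create_stl_table helper shown further down; the connection ID and error-table naming are placeholders:

# Hypothetical wiring: build one error table per staging table found in stl_load_errors.
stl_table_dict = get_error_dict('redshift_default')          # placeholder connection ID
for table_name, table_id in stl_table_dict.items():
    create_stl_table(redshift_conn_id='redshift_default',
                     table=table_name,
                     error_table_name='{}_load_errors'.format(table_name),
                     table_id=table_id)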
Example #3
    def partitions(self):
        schema, table = request.args.get("table").split('.')
        sql = """
        SELECT
            a."PART_NAME",
            a."CREATE_TIME",
            c."LOCATION",
            c."IS_COMPRESSED",
            c."INPUT_FORMAT",
            c."OUTPUT_FORMAT"
        FROM "PARTITIONS" a
        JOIN "TBLS" b ON a."TBL_ID" = b."TBL_ID"
        JOIN "DBS" d ON b."DB_ID" = d."DB_ID"
        JOIN "SDS" c ON a."SD_ID" = c."SD_ID"
        WHERE
            b."TBL_NAME" like '{table}' AND
            d."NAME" like '{schema}'
        ORDER BY "PART_NAME" DESC
        """.format(**locals())
        h = PostgresHook(METASTORE_POSTGRE_CONN_ID)
        df = h.get_pandas_df(sql)
        return df.to_html(
            classes="table table-striped table-bordered table-hover",
            index=False,
            na_rep='',
        )
def _load_from_database(**context):
    params = context['params']
    postgres_conn_id = params['postgres_conn_id']
    pg_hook = PostgresHook(postgres_conn_id=postgres_conn_id)
    conn = pg_hook.get_conn()
    query = """SELECT * FROM repositories WHERE processed = %s AND locked = %s LIMIT 1"""
    repo_df = pg_hook.get_pandas_df(query, parameters=[False, False])

    if repo_df is None or len(repo_df) != 1:
        log.info("Could not load a valid repository from the database")
        raise AirflowSkipException(
            "Could not load a valid repository from the database")

    repo = repo_df.iloc[0, :]
    repo['repo_id'] = int(repo['repo_id'])

    log.info(
        f'Loaded repository {repo["url"]} with ID {repo["repo_id"]}. (stars={repo["stars"]}, '
        f'size={repo["disk_usage"]})')

    cur = conn.cursor()
    cur.execute("""UPDATE repositories SET locked = TRUE WHERE repo_id = %s""",
                [repo['repo_id']])
    conn.commit()
    log.info(
        f'Acquired lock for repository {repo["url"]} with ID {repo["repo_id"]}')
    task_instance = context['task_instance']
    task_instance.xcom_push('target_repository', repo)
    return True
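The repository row is pushed to XCom under the key 'target_repository'; a downstream task would read it back roughly as sketched below. The task_id used in xcom_pull is a placeholder, not taken from the original DAG:

def _process_repository(**context):
    task_instance = context['task_instance']
    # Pull the row pushed by the task above; 'load_from_database' is a hypothetical task_id.
    repo = task_instance.xcom_pull(task_ids='load_from_database',
                                   key='target_repository')
    log.info(f'Processing repository {repo["url"]}')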
Example #5
def SOHDailyToS3():
    import airflow.hooks.S3_hook
    from airflow.hooks.postgres_hook import PostgresHook
    import pandas as pd
    from io import StringIO
    import datetime as dt
    postgres_hook = PostgresHook(postgres_conn_id='redshift')
    s3_hook = airflow.hooks.S3_hook.S3Hook('s3connection')
    bucket = 'btq-bi'
    key = 'soh_' + str(dt.datetime.now().strftime('%d%m%y')) + '.csv'
    query = """select sku,config_sku,brand,gender,category1,category2,category3,category4,category_manager,bar_code,boutiqaat_exclusive,
            vendor_item_no, vendor_no,contract_type,payment_term_code,country_code,enable_date,last_selling_price,special_price,
            last_item_cost,last_item_cost_currency,shipping_cost_per_unit,first_grn_date,last_grn_date,total_grn_qty,total_grn_value,
            total_sellable_qty,toal_nav_non_sellable,soh,crs_available,nav2crs_total,full_pending_open_po_qty,partially_pending_open_po_qty,
            partial_pending_open_po_total_qty,partial_pending_open_po_received_qty,sku_avg_cost_2020,
            COALESCE(stock_refreshed_datetime,(select distinct stock_refreshed_datetime from analytics.soh_report where stock_refreshed_datetime is not null limit 1)) as stock_refreshed_datetime,
            report_time::date 
            from analytics.soh_report sr;"""

    df_ = postgres_hook.get_pandas_df(query)
    csv_buf = StringIO()
    df_.to_csv(csv_buf, header=True, index=False)
    csv_buf.seek(0)
    csv_content = csv_buf.getvalue()
    s3_hook.load_string(csv_content, key, bucket, replace=True)
    return True
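SOHDailyToS3 is a plain callable, so it would typically be scheduled with a PythonOperator. A minimal sketch assuming a daily DAG; the DAG ID, start date, and schedule are illustrative:

from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from datetime import datetime

dag = DAG('soh_daily_to_s3',                      # hypothetical DAG ID
          start_date=datetime(2021, 1, 1),
          schedule_interval='@daily',
          catchup=False)

export_soh = PythonOperator(task_id='export_soh_report',
                            python_callable=SOHDailyToS3,
                            dag=dag)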
Example #6
def monitor_redshift_table(**op_kwarg):
    """Redshift table monitor collects the following metrics:
    - record count
    - duplicate records
    - Null/NaN record counts in each column
    - mean, median, min, max, std of each numeric column
    """

    hook = PostgresHook(REDSHIFT_CONNECTION_ID)
    data = hook.get_pandas_df(SELECT_DATA,
                              parameters=[REDSHIFT_MONITOR_TABLE_LIMIT])

    log_dataframe(
        "{}".format(REDSHIFT_TABLE),
        data,
        with_histograms=True,
        with_stats=True,
        with_schema=True,
    )

    log_metric("record count", data.shape[0])
    log_metric("Duplicate records",
               data.shape[0] - data.drop_duplicates().shape[0])
    for column in data.columns:
        log_metric("{} null record count".format(column),
                   int(data[column].isna().sum()))

        if issubdtype(data[column].dtype, number):
            log_metric("{} mean".format(column), round(data[column].mean(), 2))
            log_metric("{} median".format(column), data[column].median())
            log_metric("{} min".format(column), data[column].min())
            log_metric("{} max".format(column), data[column].max())
            log_metric("{} std".format(column), round(data[column].std(), 2))
Example #7
    def execute(self, context):
        query = """
                SELECT
                    playername as name,
                    team as team,
                    LEFT(value,3)::int AS value,
                    points_total
                FROM api.solver_data
                WHERE season = '2018-2019'
                    AND points_total is not null
                ORDER BY points_total
                """

        pg = PostgresHook(postgres_conn_id=self.pg_conn_id)
        df = pg.get_pandas_df(query)

        players = df['name']
        teams = df['team']
        df = df.drop(['name'], axis=1)
        df = df.drop(['team'], axis=1)
        print(df)

        km = KMeans(n_clusters=6,
                    init='k-means++',
                    max_iter=300,
                    n_init=10,
                    random_state=0)
        y_means = km.fit_predict(df)
        df['clusters'] = y_means
        df["name"] = players
        df["team"] = teams

        df.to_csv(self.target_file, sep='|', index=False, header=False)
def monitor_redshift_db(**op_kwarg):
    """Redshift database monitor collects the following metrics:
        - Number of tables in database
        - Shape of each table in the database
        - Min, max, mean, median number of rows across all tables,
        - Min, max, mean, median number of columns across all tables,
        - Total number of rows and columns
        - Largest tables by row and column
        - Disk capacity, Free space on disk, Used space on disk (in GB)
        - Disk percent usage
    """
    hook = PostgresHook(REDSHIFT_CONN_ID)
    num_redshift_tables = hook.get_first(COUNT_TABLES, parameters=[TARGET_SCHEMA])[0]
    log_metric("table count", num_redshift_tables)

    table_row_counts = hook.get_records(COUNT_TABLE_ROWS, parameters=[TARGET_SCHEMA])
    num_rows_per_table = {}
    for tablename, row_count in table_row_counts:
        num_rows_per_table[tablename] = int(round(row_count))

    row_counts = list(num_rows_per_table.values())
    log_metric("Max table row count", max(row_counts))
    log_metric("Min table row count", min(row_counts))
    log_metric("Mean table row count", round(mean(row_counts), 2))
    log_metric("Median table row count", median(row_counts))

    tables = hook.get_pandas_df(DESCRIBE_TABLES, parameters=[TARGET_SCHEMA])
    table_shapes = DataFrame()
    table_shapes["columns"] = tables.groupby("tablename").nunique("column")["column"]
    table_shapes["tablename"] = tables["tablename"].unique()
    table_shapes["rows"] = (
        table_shapes["tablename"].map(num_rows_per_table).fillna(0).astype(int)
    )

    for _, row in table_shapes.iterrows():
        log_metric("{} shape".format(row["tablename"]), (row["columns"], row["rows"]))

    log_metric("Max table column count", table_shapes["columns"].max())
    log_metric("Min table column count", table_shapes["columns"].max())
    log_metric("Mean table column count", round(table_shapes["columns"].mean(), 2))
    log_metric("Median table column count", table_shapes["columns"].median())

    log_metric("Total columns", table_shapes["columns"].sum())
    log_metric("Total rows", table_shapes["rows"].sum())

    max_row_table = table_shapes[table_shapes["rows"] == table_shapes["rows"].max()]
    max_col_table = table_shapes[
        table_shapes["columns"] == table_shapes["columns"].max()
    ]
    log_metric("Largest table (by row count)", max_row_table["tablename"][0])
    log_metric("Largest table (by col count)", max_col_table["tablename"][0])

    disk_stats = hook.get_records(DISK_USAGE).pop()
    disk_capacity, disk_used, disk_free = disk_stats
    log_metric("Disk capacity (GB)", disk_capacity)
    log_metric("Disk used (GB)", disk_used)
    log_metric("Disk free (GB)", disk_free)
    log_metric("Percent Disk usage", round((disk_used / disk_capacity) * 100, 2))
    def execute(self, context):
        connection_info = BaseHook.get_connection(self.redshift_connection_id)
        self.log.info(
            'LoadDimensionOperator.execute: redshift_connection_id={}'.format(
                connection_info))
        pg_hook = PostgresHook(self.redshift_connection_id)
        df = pg_hook.get_pandas_df(self.sql)
        print(df)
        return df
class RunDataCheckOperator(BaseOperator):
    """
    Extension of Postgres Operator to do checks on data
    Checks should return no rows if passing
    """

    template_fields = ('sql', )
    template_ext = ('.sql', )
    ui_color = '#ededed'

    @apply_defaults
    def __init__(self,
                 sql,
                 postgres_conn_id='postgres_default',
                 autocommit=False,
                 parameters=None,
                 database=None,
                 check_name=None,
                 raise_error=False,
                 raise_warning=True,
                 *args,
                 **kwargs):
        super(RunDataCheckOperator, self).__init__(*args, **kwargs)
        self.sql = sql
        self.postgres_conn_id = postgres_conn_id
        self.autocommit = autocommit
        self.parameters = parameters
        self.database = database
        self.check_name = check_name
        self.raise_error = raise_error
        self.raise_warning = raise_warning

    def execute(self, context):
        self.log.info('Executing: %s', self.sql)
        self.hook = PostgresHook(postgres_conn_id=self.postgres_conn_id,
                                 schema=self.database)
        df = self.hook.get_pandas_df(self.sql, parameters=self.parameters)
        df_string = df.to_string(index=False, header=False)
        for output in self.hook.conn.notices:
            self.log.info(output)
        msg = None
        if len(df) > 0:
            logging.info(
                "Something is wrong with the data, checks return zero rows if everything is ok"
            )
            if self.raise_error:
                raise RuntimeError(
                    f"Check *{self.check_name}* has failed for the following dates:\n```\n{df_string}\n```"
                )
            elif self.raise_warning:
                msg = f"\n@here - :red-cross: Check *{self.check_name}* has failed for the following dates: :red-cross:\n```\n{df_string}\n```"
            else:
                msg = f"\n*{self.check_name}*\n```\n{df_string}\n```"
        else:
            msg = f"\nCheck *{self.check_name}* passed :tick:"
        return msg
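A sketch of how RunDataCheckOperator might be used in a DAG, assuming a check query that returns only offending rows; the task_id, SQL, and table names below are illustrative, and a dag object is assumed to exist:

orders_not_null_check = RunDataCheckOperator(
    task_id='check_orders_not_null',                 # hypothetical task
    postgres_conn_id='postgres_default',
    sql="SELECT order_date FROM orders WHERE customer_id IS NULL",
    check_name='orders_customer_id_not_null',
    raise_error=True,                                # fail the task instead of only warning
    dag=dag)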
Example #11
def _load_from_database(**context):
    params = context['params']
    postgres_conn_id = params['postgres_conn_id']
    pg_hook = PostgresHook(postgres_conn_id=postgres_conn_id)
    table_name = 'repositories'
    constraint_column = 'processed'
    query = f"""SELECT * from {table_name} WHERE {constraint_column} = %s LIMIT 20"""
    repos = pg_hook.get_pandas_df(query, parameters=[False])
    task_instance = context['task_instance']
    task_instance.xcom_push('repositories', repos)
    return True
Example #12
    def run(self, conn_id='postgres_bills3'):
        """
        Fetches data from the postgres schema defined as an Airflow hook.
        :param conn_id: schema to fetch from
        :return: dataframe containing the results of the sql query
        """
        sql_loc = self.inputs["sql_loc"]

        pg_hook = PostgresHook(postgres_conn_id=conn_id)
        with open(sql_loc, "r") as sql_file:
            data_out = pg_hook.get_pandas_df(sql_file.read())

        self.outputs["dataframe"].write(data_out)
Example #13
    def execute(self, context):
        source_hook = PostgresHook(postgres_conn_id=self.postgres_conn_id)
        destination_hook = S3Hook(s3_conn_id=self.s3_conn_id)

        df = source_hook.get_pandas_df("SELECT * FROM {}".format(self.table))

        # not suitable for large files
        destination_hook.load_string(
            df.to_csv(None, index=False),
            key=self.s3_key,
            bucket_name=self.s3_bucket,
            replace=True,
        )
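As the comment notes, building the whole CSV in memory does not scale. For larger tables, one common pattern is to stream the result to a local file with PostgresHook.copy_expert and upload it with S3Hook.load_file. A rough sketch under that assumption; the temporary file path is a placeholder:

    def execute(self, context):
        source_hook = PostgresHook(postgres_conn_id=self.postgres_conn_id)
        destination_hook = S3Hook(s3_conn_id=self.s3_conn_id)

        local_path = '/tmp/{}.csv'.format(self.table)          # placeholder temp file
        # Let Postgres write the CSV server-side and stream it to disk.
        source_hook.copy_expert(
            "COPY {} TO STDOUT WITH CSV HEADER".format(self.table),
            filename=local_path)
        destination_hook.load_file(local_path,
                                   key=self.s3_key,
                                   bucket_name=self.s3_bucket,
                                   replace=True)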
Example #14
    def execute(self, context):
        self.log.info('Running data quality checks')
        redshift_conn = PostgresHook(postgres_conn_id=self.redshift_conn_id)

        test_pairs = zip(self.sql_queries, self.test_results)
        for query, test_fn in test_pairs:
            self.log.debug(f'Run data quality query: {query}')
            result = redshift_conn.get_pandas_df(query)
            self.log.debug(f'Result: {result}')
            if test_fn(result):
                self.log.info('Data quality check passed.')
            else:
                self.log.info('Data quality check failed.')
                raise AssertionError('Data quality check failed.')
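The operator above pairs each SQL query with a callable that judges its result DataFrame. A hedged sketch of what such pairs might look like when the operator is instantiated; the operator class name, tables, and task_id are assumptions, not taken from the source:

data_quality_check = DataQualityOperator(        # hypothetical operator class name
    task_id='run_data_quality_checks',
    redshift_conn_id='redshift',
    sql_queries=[
        'SELECT COUNT(*) AS cnt FROM users',
        'SELECT COUNT(*) AS cnt FROM songs WHERE songid IS NULL',
    ],
    test_results=[
        lambda df: df.iloc[0]['cnt'] > 0,        # table must not be empty
        lambda df: df.iloc[0]['cnt'] == 0,       # no NULL primary keys allowed
    ],
    dag=dag)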
Example #15
def _load_from_database(**context):
    params = context['params']
    postgres_conn_id = params['postgres_conn_id']
    pg_hook = PostgresHook(postgres_conn_id=postgres_conn_id)
    table_name = 'repositories'
    constraint_col0 = 'processed'
    constraint_col1 = 'contains_logging'
    query = f"""SELECT * FROM {table_name} WHERE {constraint_col0} = %s AND {constraint_col1} = %s LIMIT 20"""
    repos = pg_hook.get_pandas_df(query, parameters=[True, True])
    task_instance = context['task_instance']
    task_instance.xcom_push('target_repositories', repos)
    return True
Example #16
    def execute(self, context):
        hook = PostgresHook(postgres_conn_id=self.conn_id)
        df = hook.get_pandas_df(sql='select * from atp_matches_log;')
        df_load, key = self.dim_mapping[self.table](df)
        prim_key = ', '.join(key)
        print(df.shape)
        print(df.columns)
        df_load = df_load.where((pd.notnull(df_load)), None)
        schema = ','.join(df_load.columns)
        for i, row in df_load.iterrows():
            insert_update_query = self.insert_update_pd(
                row, self.table, schema, prim_key)
            try:
                hook.run(insert_update_query)
            except Exception as e:
                print(e)
        self.log.info(f"{self.table} loaded successfully")
Example #17
    def index(self):
        sql = """
        SELECT
            a."NAME" as db, "DB_LOCATION_URI" as location,
            count(1) as object_count, a."DESC" as description
        FROM "DBS" a
        JOIN "TBLS" b ON a."DB_ID" = b."DB_ID"
        GROUP BY a."NAME", "DB_LOCATION_URI", a."DESC"
        """.format(**locals())
        h = PostgresHook(METASTORE_POSTGRE_CONN_ID)
        df = h.get_pandas_df(sql)
        df.db = ('<a href="/admin/metastorebrowserview/db/?db=' + df.db +
                 '">' + df.db + '</a>')
        table = df.to_html(
            classes="table table-striped table-bordered table-hover",
            index=False,
            escape=False,
            na_rep='',
        )
        return self.render("metastore_browser/dbs.html", table=table)
Example #18
def setup_new(*args, **kwargs):
    sql = '''select * from measurement where series_id=10261;'''
    db_url = kwargs['db_url']
    engine = create_engine(db_url, echo=True)
    conn = engine.connect()
    conn.execute('CREATE table IF NOT EXISTS data(x float, value float)')
    conn.execute('DELETE FROM data')

    try:
        pg = PostgresHook(postgres_conn_id='openaq-db')
        df = pg.get_pandas_df(sql, parameters=None)
        print(f'got the df: {df}')
        print(f'{df.columns}')

        for x, y in df['value'].iteritems():
            conn.execute(f'INSERT into data(x, value) values({x},{y} )')

        conn.close()
    except Exception:
        logging.error(
            'Remote database not defined. Use [openaq-db] connection')
        return None
Example #19
    def execute(self, context):
        # AWS Hook
        aws_hook = AwsHook(self.aws_credentials_id)
        credentials = aws_hook.get_credentials()
        # RedShift Hook
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        # Get number of records in the table
        records = redshift.get_records(
            f"SELECT COUNT(*) FROM {self.table_name}")
        # Fields and data
        df = redshift.get_pandas_df(self.sql)
        fields = list(df.columns.values)
        data_rows = redshift.get_records(self.sql)

        if self.load_mode == "clean":
            # Clear data
            self.log.info(f"Clearing data from {self.table_name} table")
            redshift.run("DELETE FROM {}".format(self.table_name))
            self.log.info(
                f"Deleted {records[0][0]} records from {self.table_name}")
        else:
            job_execution_ts = self.filter_key[0].format(**context)
            next_job_execution_ts = self.filter_key[1].format(**context)
            filtered_df = df[(df['start_time'] >= job_execution_ts)
                             & (df['start_time'] < next_job_execution_ts)]
            data_rows = [tuple(x) for x in filtered_df.values]

        # Populate table
        self.log.info("Populating data to {} table".format(self.table_name))
        redshift.insert_rows(table=self.table_name,
                             rows=data_rows,
                             target_fields=fields,
                             commit_every=1000,
                             replace=False)
        self.log.info("Inserted {} records to {}".format(
            len(data_rows), self.table_name))
Example #20
rows = metadb_hook.get_pandas_df("""
        select t2.type              source_type,
           t2.name              source_name,
           t2.host              source_host,
           t2.port              source_port,
           t2.db_name           source_db_name,
           t2.db_user           source_db_user,
           t2.db_psw            source_db_psw,
           t1.id                job_id,
           t1.job_num           job_num,
           t1.job_name          job_name,
           t1.description       job_desc,
           t1.layer             job_layer,
           t1.sql_text          job_sql_text,
           t4.table_name        source_table_name,
           t4.id_column         source_tbl_id_col,
           t1.target_table      target_table_name,
           t1.dependent_jobs    dependent_jobs,
           t5.id                schedule_id,
           t5.schedule_name     schedule_name,
           t5.schedule_interval schedule_interval,
           t1.job_type          job_type,
           t1.job_submit_args   job_submit_args,
           t1.owner as          owner
    from metadata.job t1
           join metadata.datasource t2 on t1.source_id = t2.id and t2.del = false
           left join metadata.ods_table t4 on t1.source_table_id = t4.id and t4.del = false
           join metadata.schedule t5 on t1.schedule_id = t5.id and t5.del = false
    where t1.del = false
      and t1.is_valid = true
      and t1.layer in ('DW', 'ADS')
""")
def compute_similarity_score(*args, **kwargs):
    dw1 = PostgresHook(postgres_conn_id='dw1_etl')

    job_post_sql = """
    SELECT
        id, title
    FROM job_postings
    WHERE is_deleted = False
    AND(
        description IS NOT NULL
        OR title IS NOT NULL
    )
    ORDER BY id
    """
    job_post_df = dw1.get_pandas_df(job_post_sql)

    work_experience_sql = """
    SELECT
        id, user_id, job_title, summary
    FROM users_work_experiences
    WHERE is_deleted = False
    AND(
        job_title IS NOT NULL
        OR summary IS NOT NULL
    )
    ORDER BY id
    """
    work_experience_df = dw1.get_pandas_df(work_experience_sql)

    job_post_title_features_df = \
        pd.read_csv(OUTPUT_DIR + 'bow_job_post_title_features.csv', header=0)

    work_experience_title_features_df = \
        pd.read_csv(OUTPUT_DIR + 'bow_work_experience_title_features.csv',
                    header=0)

    job_post_skill_features_df = \
        pd.read_csv(OUTPUT_DIR + 'bow_job_post_skill_features.csv', header=0)

    work_experience_skill_features_df = \
        pd.read_csv(OUTPUT_DIR + 'bow_work_experience_skill_features.csv',
                    header=0)

    datavecsavg_tfidf_job_post_df = \
        pd.read_csv(OUTPUT_DIR + 'datavecsavg_tfidf_job_post_features.csv',
                    header=0)

    datavecsavg_tfidf_work_experience_df = \
        pd.read_csv(OUTPUT_DIR +
                    'datavecsavg_tfidf_work_experience_features.csv', header=0)

    combine_job_post_features_df = \
        pd.merge(datavecsavg_tfidf_job_post_df,
                 job_post_title_features_df,
                 how='outer',
                 on='job_postings_id')
    combine_job_post_features_df = \
        pd.merge(combine_job_post_features_df,
                 job_post_skill_features_df,
                 how='outer',
                 on='job_postings_id')
    combine_job_post_features_df = \
        pd.merge(combine_job_post_features_df,
                 job_post_df,
                 left_on='job_postings_id',
                 right_on='id')

    # extract the new job post features first before dropping unused columns
    to_update_job_df = pd.read_csv(OUTPUT_DIR + 'to_update_job_posts.csv',
                                   header=0)
    to_add_job_df = pd.read_csv(OUTPUT_DIR + 'to_add_job_posts.csv', header=0)
    to_update_job_df = pd.merge(to_update_job_df,
                                to_add_job_df,
                                on='job_postings_id',
                                how='outer')
    combine_new_job_post_features_df = \
        combine_job_post_features_df[
            combine_job_post_features_df[
                'job_postings_id'].isin(to_update_job_df['job_postings_id'])]

    selected_new_post_id = combine_new_job_post_features_df['job_postings_id']
    combine_new_job_post_features_df = \
        combine_new_job_post_features_df.drop(['job_postings_id',
                                               'id',
                                               'title'], axis=1)

    combine_new_job_post_features_df = \
        combine_new_job_post_features_df.fillna(0)
    datavecsavg_tfidf_new_job_post_stack = \
        combine_new_job_post_features_df.as_matrix()

    combine_old_job_post_features_df = \
        combine_job_post_features_df[
            ~combine_job_post_features_df[
               'job_postings_id'].isin(to_update_job_df['job_postings_id'])]

    selected_old_post_id = combine_old_job_post_features_df['job_postings_id']
    combine_old_job_post_features_df = \
        combine_old_job_post_features_df.drop(['job_postings_id',
                                               'id',
                                               'title'], axis=1)

    combine_old_job_post_features_df = \
        combine_old_job_post_features_df.fillna(0)
    datavecsavg_tfidf_old_job_post_stack = \
        combine_old_job_post_features_df.as_matrix()

    del combine_job_post_features_df

    combine_work_experience_features_df = \
        pd.merge(datavecsavg_tfidf_work_experience_df,
                 work_experience_title_features_df,
                 how='outer',
                 on='id')
    combine_work_experience_features_df = \
        pd.merge(combine_work_experience_features_df,
                 work_experience_skill_features_df,
                 how='outer',
                 on='id')

    combine_work_experience_features_df = \
        pd.merge(combine_work_experience_features_df,
                 work_experience_df,
                 left_on='id',
                 right_on='id')

    to_update_work_experience_df = \
        pd.read_csv(OUTPUT_DIR + 'to_update_work_experiences.csv', header=0)

    to_add_work_experience_df = \
        pd.read_csv(OUTPUT_DIR + 'to_add_work_experiences.csv', header=0)

    to_update_work_experience_df = pd.merge(to_update_work_experience_df,
                                            to_add_work_experience_df,
                                            on='work_experience_id',
                                            how='outer')

    combine_new_work_experience_features_df = \
        combine_work_experience_features_df[
            combine_work_experience_features_df['id']
            .isin(to_update_work_experience_df['work_experience_id'])]

    selected_work_experience_id = combine_work_experience_features_df['id']
    selected_work_experience_user_id = \
        combine_work_experience_features_df['user_id']

    combine_work_experience_features_df = \
        combine_work_experience_features_df.drop(['id',
                                                  'user_id',
                                                  'job_title',
                                                  'summary'], axis=1)

    combine_work_experience_features_df = \
        combine_work_experience_features_df.fillna(0)
    datavecsavg_tfidf_work_experience_stack = \
        combine_work_experience_features_df.as_matrix()

    selected_new_work_experience_id = \
        combine_new_work_experience_features_df['id']
    selected_new_work_experience_user_id = \
        combine_new_work_experience_features_df['user_id']

    combine_new_work_experience_features_df = \
        combine_new_work_experience_features_df.drop(['id',
                                                      'user_id',
                                                      'job_title',
                                                      'summary'], axis=1)

    combine_new_work_experience_features_df = \
        combine_new_work_experience_features_df.fillna(0)
    datavecsavg_tfidf_new_work_experience_stack = \
        combine_new_work_experience_features_df.as_matrix()

    # compute cosine similarity
    scores_all_work_exp_new_job = []
    if datavecsavg_tfidf_new_job_post_stack.shape[0] > 0:
        scores_matrix_all_work_exp_new_job = \
            pairwise.cosine_similarity(datavecsavg_tfidf_work_experience_stack,
                                       datavecsavg_tfidf_new_job_post_stack)

        selected_indices = \
            np.where(np.round(scores_matrix_all_work_exp_new_job, 2) > 0.01)

        for i in np.arange(len(selected_indices[0])):
            if i % 1000000 == 0:
                logging.info('Processed %s work experiences vs job post' % (i))
            work_id = selected_indices[0][i]
            job_id = selected_indices[1][i]
            v = scores_matrix_all_work_exp_new_job[work_id, job_id]
            scores_all_work_exp_new_job\
                .append((selected_work_experience_id[work_id],
                         selected_work_experience_user_id[work_id],
                         selected_new_post_id.iloc[job_id],
                         v))

    scores_all_work_exp_new_job_df = \
        pd.DataFrame.from_records(scores_all_work_exp_new_job,
                                  columns=['work_experience_id',
                                           'user_id',
                                           'similar_job_postings_id',
                                           'score'])

    scores_new_work_exp_old_job = []
    if datavecsavg_tfidf_new_work_experience_stack.shape[0] > 0:
        scores_matrix_new_work_exp_old_job = \
            pairwise.cosine_similarity(datavecsavg_tfidf_new_work_experience_stack,
                                       datavecsavg_tfidf_old_job_post_stack)

        selected_indices = \
            np.where(np.round(scores_matrix_new_work_exp_old_job, 2) > 0.01)

        for i in np.arange(len(selected_indices[0])):
            if i % 1000000 == 0:
                logging.info('Processed %s work experiences vs job posts' % (i))
            work_id = selected_indices[0][i]
            job_id = selected_indices[1][i]
            v = scores_matrix_new_work_exp_old_job[work_id, job_id]

            scores_new_work_exp_old_job\
                .append((selected_new_work_experience_id.iloc[work_id],
                         selected_new_work_experience_user_id.iloc[work_id],
                         selected_old_post_id.iloc[job_id],
                         v))

    scores_new_work_exp_old_job_df = \
        pd.DataFrame.from_records(scores_new_work_exp_old_job,
                                  columns=['work_experience_id',
                                           'user_id',
                                           'similar_job_postings_id',
                                           'score'])

    scores_df = pd.concat([scores_all_work_exp_new_job_df,
                           scores_new_work_exp_old_job_df])
    scores_df['model_id'] = pd.Series(6, index=scores_df.index)

    output_filename = OUTPUT_DIR + 'scores_work_experience_to_job_posts.csv'
    scores_df.to_csv(output_filename, index=False, encoding='utf-8')
Example #22
def get_data(table_name, filepath, **kwargs):
    hook = PostgresHook(postgres_conn_id=POSTGRES_CONN)
    df = hook.get_pandas_df(sql=f"SELECT * FROM {table_name}")
    if not len(df):
        raise ValueError("There is no row")
    df.to_csv(filepath, index=False)
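get_data takes the table name and output path as arguments, so it would usually be wired up with op_kwargs. A small sketch assuming a dag object already exists; the task_id, table, and file path are illustrative:

from airflow.operators.python_operator import PythonOperator

export_orders = PythonOperator(
    task_id='export_orders_to_csv',                     # hypothetical task
    python_callable=get_data,
    op_kwargs={'table_name': 'orders', 'filepath': '/tmp/orders.csv'},
    provide_context=True,
    dag=dag)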
Example #23
class KModeSurveyRecOperator(BaseOperator):
    """
    This function calculates the centroid based on the data given using the
    K-mode method.

    https://arxiv.org/ftp/cs/papers/0603/0603120.pdf

    Param rs_conn_id:       Connection ID to Redshift
    Param rs_table:         Table to get data from
    Param features:         Columns to be included as features
    Param n_cluster:        The number of clusters to classify users into
    Param n_iter:           The number of iterations to produce the centroid
    Param init_method:      The algorithm to use - Cao Method: Cao et al.
                            [2009] OR Huang Method: Huang [1997]

    This function returns the Centroid of the survey response and the dataframe
    with predictions
    """
    @apply_defaults
    def __init__(self,
                 cluster_name,
                 rs_conn_id,
                 rs_table,
                 rs_schema="public",
                 features=[
                     'age', 'gender', 'weight', 'existing_conditions',
                     'light_exercise'
                 ],
                 n_cluster=5,
                 n_iter=5,
                 init_method="Cao",
                 *args,
                 **kwargs):
        super(KModeSurveyRecOperator, self).__init__(*args, **kwargs)
        self.cluster_name = cluster_name
        self.rs_conn_id = rs_conn_id
        self.rs_table = rs_table
        self.rs_schema = rs_schema
        self.features = features
        self.n_cluster = n_cluster
        self.n_iter = n_iter
        self.init_method = init_method

    def kmode_calculation(self, data):
        """
        This function calculates the centroid using the k-mode algorithm.

        This function takes in the cleaned data and returns:

        - Column element mapping dictionary
        - Centroids
        - The output data with classification
        """
        col_dict = {}

        for col in data.columns:
            data[col] = data[col].astype('category')
            col_dict.update({col: dict(enumerate(data[col].cat.categories))})

        # Get all the cols in the DataFrame
        cols = [col for col in data.columns]

        # Transform all values into categorical and numerical values
        for col in cols:
            data[col] = data[col].astype('category')
            data[col] = data[col].cat.codes

        # Run k-modes using the algorithm
        kmodes_method = KModes(n_clusters=self.n_cluster,
                               init=self.init_method,
                               n_init=self.n_iter,
                               verbose=1)
        kmode_result = kmodes_method.fit_predict(data[cols])

        # Attach the output label for each data point
        data['classification'] = pd.Series(kmode_result, index=data.index)

        return col_dict, kmodes_method.cluster_centroids_, data

    def get_rs_cols(self):
        """
        This function returns the distinct questions answered in the last
        6 days as a list of column names.
        """

        query = """
        SELECT
        DISTINCT question
        FROM survey_response
        WHERE 1=1
        AND response_time > (current_timestamp - interval '6 day')
        """.format(schema=self.rs_schema, table=self.rs_table)

        # Establish connection to Redshift
        self.rs_hook = PostgresHook(postgres_conn_id=self.rs_conn_id)

        # Get the cols in a list
        df = self.rs_hook.get_pandas_df(query)

        # Convert into list
        cols_list = df['question'].values.T.tolist()

        return cols_list

    def get_rs_query(self, cols_list):
        """
        This function generates the query that uses the column information to
        fetch all of the survey data.
        """
        rs_query = """
        SELECT
        user_id
        """

        for question in cols_list:
            if question in self.features:
                rs_query += """
                ,COALESCE(CASE WHEN question = '{question}' THEN regexp_replace(response, '\\[|\\]|"', '') END, 'unspecified') AS {question_cleaned}
                """.format(question=question,
                           question_cleaned=question.replace(" ", "_"))

        rs_query += """
        FROM {schema}.{table}
        WHERE 1=1
        AND response_time > (current_timestamp - interval '7 day')
        AND user_id IS NOT NULL
        """.format(schema=self.rs_schema, table=self.rs_table)

        return rs_query

    def get_rs_data(self, query):
        """
        This function returns the survey data in the dataframe format
        """
        # Establish connection to Redshift
        self.rs_hook = PostgresHook(postgres_conn_id=self.rs_conn_id)

        # Get the data in dataframe
        survey_df = self.rs_hook.get_pandas_df(query)

        return survey_df

    def rs_execute(self, rs_query):
        """
        This function executes the query passed in.
        """
        logging.info("Connecting to Redshift.")
        rs_conn = PostgresHook(self.rs_conn_id)
        logging.info("Connection Successful. Executing query.")

        if rs_query:
            rs_conn.run(rs_query, False)
            logging.info("Query Execution Complete.")
        else:
            logging.info("No Query to Execute")

    def dict_to_sql(self, dict_obj):
        create_query = """
        CREATE TABLE IF NOT EXISTS {schema}.{table}_{name}_dict (
            question        VARCHAR(64),
            value           VARCHAR(128),
            value_mapping   int);

        TRUNCATE {schema}.{table}_{name}_dict;
        """.format(name=self.cluster_name,
                   schema=self.rs_schema,
                   table=self.rs_table)

        insert_query = """
        INSERT INTO {schema}.{table}_{name}_dict VALUES
        """.format(name=self.cluster_name,
                   schema=self.rs_schema,
                   table=self.rs_table)
        for question, sub_dict in dict_obj.items():
            for value_mapping, value in sub_dict.items():
                insert_query += """
                ('{question}','{value}',{value_mapping}),
                """.format(question=question,
                           value=value,
                           value_mapping=value_mapping)

        insert_query = insert_query.strip()[:-1] + ';'

        return create_query, insert_query

    def col_mapping(self, dataframe, col_dict):
        for key, value in col_dict.items():
            value.update({-1: 'null'})
            dataframe[key].replace(value, inplace=True)
        return dataframe

    def df_to_sql(self, dataframe):
        # Generate Create Query
        rs_df_create_query = """
            CREATE TABLE IF NOT EXISTS {schema}.{table}_{name}_cluster (
            """.format(name=self.cluster_name,
                       schema=self.rs_schema,
                       table=self.rs_table)

        for column in dataframe.columns:
            rs_df_create_query += """
                {column} VARCHAR(128),
                """.format(column=column)

        rs_df_create_query = rs_df_create_query.strip()[:-1] + ');'
        rs_df_create_query += """
        TRUNCATE {schema}.{table}_{name}_cluster;
        """.format(name=self.cluster_name,
                   schema=self.rs_schema,
                   table=self.rs_table)

        # Generate Insert Query
        rs_df_insert_query = """
            INSERT INTO {schema}.{table}_{name}_cluster VALUES
            """.format(name=self.cluster_name,
                       schema=self.rs_schema,
                       table=self.rs_table)

        rs_insert_list = []
        for index, row in dataframe.iterrows():
            rs_insert_list.append(
                [dataframe[column][index] for column in dataframe.columns])

        for row in range(len(rs_insert_list)):
            if row % 500 == 0 and row != 0:
                rs_df_insert_query = rs_df_insert_query[:-1] + ';'
                rs_df_insert_query += """
                    INSERT INTO {schema}.{table}_{name}_cluster VALUES
                    """.format(name=self.cluster_name,
                               schema=self.rs_schema,
                               table=self.rs_table)
            rs_df_insert_query += str(rs_insert_list[row]).replace(
                "[", "(").replace("]", ")")
            rs_df_insert_query += ','

        rs_df_insert_query = rs_df_insert_query[:-1] + ';'

        return rs_df_create_query, rs_df_insert_query

    def list_to_sql(self, list_obj, col_dict):
        create_query = """
        CREATE TABLE IF NOT EXISTS {schema}.{table}_{name}_centroids (
        """.format(name=self.cluster_name,
                   schema=self.rs_schema,
                   table=self.rs_table)

        for key in col_dict.keys():
            create_query += """
            {key} INT,
            """.format(key=key)

        create_query += """
        cluster INT);

        TRUNCATE {schema}.{table}_{name}_centroids;
        """.format(name=self.cluster_name,
                   schema=self.rs_schema,
                   table=self.rs_table)

        insert_query = """
        INSERT INTO {schema}.{table}_{name}_centroids VALUES
        """.format(name=self.cluster_name,
                   schema=self.rs_schema,
                   table=self.rs_table)

        cluster_idx = 0
        for item in list_obj:
            insert_query += str(np.append(item, cluster_idx).tolist()).replace(
                "[", "(").replace("]", ")")
            insert_query += ","
            cluster_idx += 1

        insert_query = insert_query[:-1] + ";"
        return create_query, insert_query

    def execute(self, context):
        # Get the columns from Redshift
        cols_list = self.get_rs_cols()

        # Get the query to get the data
        data_query = self.get_rs_query(cols_list)

        # Get the data
        survey_df = self.get_rs_data(data_query)

        # Calculate clusters using kmodes clustering
        col_dict, kmodes_centroids, data_result_cat = self.kmode_calculation(
            survey_df)

        # Map the data back to original form
        data_result = self.col_mapping(data_result_cat, col_dict)

        # Convert the dict object to SQL insert queries and the list to insert query
        rs_df_create_query, rs_df_insert_query = self.df_to_sql(data_result)
        rs_col_dict_create_query, rs_col_dict_insert_query = self.dict_to_sql(
            col_dict)
        rs_centroid_create_query, rs_centroid_insert_query = self.list_to_sql(
            kmodes_centroids, col_dict)

        # Generate list of query to run
        create_sql_list = [
            rs_df_create_query, rs_col_dict_create_query,
            rs_centroid_create_query
        ]
        insert_sql_list = [
            rs_df_insert_query, rs_col_dict_insert_query,
            rs_centroid_insert_query
        ]

        # Insert the cluster data into Redshift
        for create_query in create_sql_list:
            self.rs_execute(create_query)

        for insert_query in insert_sql_list:
            self.rs_execute(insert_query)
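A sketch of how KModeSurveyRecOperator might be instantiated in a DAG; the connection, table, cluster settings, and dag object below are illustrative assumptions, not values from the source:

cluster_survey_responses = KModeSurveyRecOperator(
    task_id='cluster_survey_responses',      # hypothetical task
    cluster_name='weekly',
    rs_conn_id='redshift_default',
    rs_table='survey_response',
    rs_schema='public',
    n_cluster=5,
    n_iter=5,
    init_method='Cao',
    dag=dag)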
def create_stl_table(redshift_conn_id,
                     table,
                     error_table_name,
                     table_id):
    """
    Creates a Redshift table containing all stl error rows associated with the input staging table. All columns within the error table 
    will be converted to VARCHAR given that the errors may be linked to data type issues.

    Keyword Arguments:
    redshift_conn_id -- Redshift connection ID (str)
    table -- Staging table name (str)
    error_table_name -- Name to be used to create the error table (str)
    table_id -- The staging table's table_id defined in the stl_load_errors table
    """

    get_column_names = """
    SELECT 
        col_name 
    FROM
        (SELECT
            * 
        FROM
            pg_get_cols('{}')
        COLS(
            view_schema name, 
            view_name name, 
            col_name name, 
            col_type varchar,
            col_num int
            )
        )
    """
    
    create_error_table = """
        DROP TABLE IF EXISTS 
            {error_table_name};
        CREATE TABLE
            {error_table_name}
        (
            {cast}, 
            err_code INT,
            err_reason VARCHAR(72)
        );
        """

    insert_rows = """
        INSERT INTO 
            {error_table_name}
        SELECT 
            {split_part},
            err_code,
            err_reason
        FROM 
            stl_load_errors stl
        WHERE 
            stl.tbl = {id}
        """
            
    redshift = PostgresHook(redshift_conn_id)

    # load column names into pandas dataframe
    col_names_df = redshift.get_pandas_df(get_column_names.format(table))

    # put column names into list
    col_names_list = col_names_df['col_name'].values.tolist()

    cast_col = ""
    split_raw_line = ""
    # loop over table's column names
    for i,col in enumerate(col_names_list):
        # if last column don't include ',' at end of string
        if col == col_names_list[-1]:
            # adds CAST statement to cast_col string
            cast_col += "{} VARCHAR".format(col)
            # adds split_part function to split_raw_line string
            split_raw_line += "CAST(split_part(raw_line, ',', {}) AS VARCHAR(500))".format(i+1)
        else:
            cast_col += "{} VARCHAR, ".format(col)
            split_raw_line += "CAST(split_part(raw_line, ',', {}) AS VARCHAR(500)), ".format(i+1)

    format_dict = {
        'table': table, 
        'error_table_name': error_table_name,
        'cast': cast_col,
        'split_part':split_raw_line,
        'id': table_id
    }
    print(f'Creating error table: {error_table_name}')

    # creates an empty table with duplicate columns of looped table
    formatted_create_sql = create_error_table.format(**format_dict)
    redshift.run(formatted_create_sql)

    # inserts all stl_load_errors raw_line values as strings into the appropriate columns within the empty table
    formatted_insert_sql = insert_rows.format(**format_dict)
    redshift.run(formatted_insert_sql)
    
    error_table_count = redshift.get_records(f'SELECT COUNT(*) FROM {error_table_name}')[0][0]
    table_count = redshift.get_records(f'SELECT COUNT(*) FROM {table}')[0][0]

    print(f'{table} COUNT: {table_count}')
    print(f'{error_table_name} COUNT: {error_table_count}')
    
    return error_table_name
Example #25
    def execute(self, context):
        pg_hook = PostgresHook(postgres_conn_id=self._conn_id)
        df = pg_hook.get_pandas_df(sql='SELECT * FROM templates')
        task_instance = context['task_instance']            # type: TaskInstance
        task_instance.xcom_push('database_df', df)
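A downstream task could read the pushed DataFrame back with xcom_pull; a brief sketch, where the upstream task_id is a placeholder:

def consume_templates(**context):
    task_instance = context['task_instance']
    # 'fetch_templates' is a hypothetical task_id for the operator above.
    df = task_instance.xcom_pull(task_ids='fetch_templates', key='database_df')
    print(df.head())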
Example #26
def compute_description_feature(*args, **kwargs):
    dw1 = PostgresHook(postgres_conn_id='dw1_etl')

    job_post_sql = """
    SELECT *
    FROM job_postings
    WHERE is_deleted = False
    AND description IS NOT NULL
    ORDER BY id
    """
    job_post_df = dw1.get_pandas_df(job_post_sql)

    work_experience_sql = """
    SELECT *
    FROM users_work_experiences
    WHERE is_deleted = False
    AND summary IS NOT NULL
    ORDER BY id
    """
    work_experience_df = dw1.get_pandas_df(work_experience_sql)

    db1 = PostgresHook(postgres_conn_id='db1_etl')

    processed_descriptions_sql = """
    SELECT *
    FROM job_posting_description_meta
    ORDER BY job_posting_id
    """
    processed_descriptions_unpivot_df = \
        db1.get_pandas_df(processed_descriptions_sql)

    processed_descriptions_unpivot_df = \
        processed_descriptions_unpivot_df.drop(['created_at',
                                                'updated_at',
                                                'id',
                                                'content_type_metadata'],
                                               axis=1)
    # pivot back
    processed_descriptions_df = \
        processed_descriptions_unpivot_df.pivot(index='job_posting_id',
                                                columns='content_type',
                                                values='content').reset_index()

    processed_descriptions_df.sort_values(by='job_posting_id', inplace=True)

    # flatten words_after_lemma_no_stopwords
    processed_descriptions_df['words_after_lemma_no_stopwords'] = \
        processed_descriptions_df['lemmatized_words_with_no_stopwords']\
        .map(lambda x: " ".join([val for sublist in x for val in sublist]))

    processed_descriptions_df.loc[
        processed_descriptions_df['words_after_lemma_no_stopwords'].isnull(),
        'words_after_lemma_no_stopwords'] = ""

    # lowercase
    processed_descriptions_df['words_after_lemma_no_stopwords'] = \
        processed_descriptions_df['words_after_lemma_no_stopwords']\
        .map(lambda x: non_letter_removal(x.lower()))

    processed_descriptions_df['lemmatized_sentences'] = \
        processed_descriptions_df['lemmatized_sentences']\
        .map(lambda x: [non_letter_removal(sublist.lower()) for sublist in x])

    processed_descriptions_df['lemmatized_sentences'] = \
        processed_descriptions_df["lemmatized_sentences"]\
        .map(lambda y: list(filter(lambda x: x != ' ', y)))

    sentences_tmp = processed_descriptions_df['lemmatized_sentences']
    sentences = []

    for description in sentences_tmp:
        for sentence in description:
            sentences.append(sentence.split())

    files = []
    # --------------------train word2vec--------------------------------
    # configure logging
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    # Set values for various parameters
    num_features = 500    # Word vector dimensionality
    min_word_count = 20   # Minimum word count
    num_workers = 2       # Number of threads to run in parallel
    context = 10          # Context window size
    downsampling = 1e-3   # Downsample setting for frequent words

    # Initialize and train the model (this will take some time)
    logging.info('Training model...')

    # train model with POS and lemmatization
    model = word2vec.Word2Vec(sentences,
                              workers=num_workers,
                              size=num_features,
                              min_count=min_word_count,
                              window=context,
                              sample=downsampling)

    # If you don't plan to train the model any further, calling
    # init_sims will make the model much more memory-efficient.
    model.init_sims(replace=True)

    # save the model
    model_name = OUTPUT_DIR + 'w2v_500features_20minwords_10context'
    model.save(model_name)

    files.append(OUTPUT_DIR + 'w2v_500features_20minwords_10context')
    db1 = PostgresHook(postgres_conn_id='db1_etl')

    processed_work_summaries_sql = """
    SELECT *
    FROM user_work_experience_meta
    ORDER BY work_experience_id
    """
    processed_work_summaries_unpivot_df = \
        db1.get_pandas_df(processed_work_summaries_sql)

    processed_work_summaries_unpivot_df = \
        processed_work_summaries_unpivot_df.drop(['created_at',
                                                  'updated_at',
                                                  'id',
                                                  'content_type_metadata'],
                                                 axis=1)
    # pivot back
    processed_work_summaries_df = \
        processed_work_summaries_unpivot_df.pivot(index='work_experience_id',
                                                  columns='content_type',
                                                  values='content')\
        .reset_index()

    processed_work_summaries_df.sort_values(by='work_experience_id',
                                            inplace=True)

    processed_work_summaries_df['words_after_lemma_no_stopwords'] = \
        processed_work_summaries_df[
            'lemmatized_words_with_no_stopwords'].map(
            lambda x: " ".join([val for sublist in x for val in sublist]))

    processed_work_summaries_df\
        .loc[processed_work_summaries_df[
             'words_after_lemma_no_stopwords'].isnull(),
             'words_after_lemma_no_stopwords'] = ''
    processed_work_summaries_df['words_after_lemma_no_stopwords'] = \
        processed_work_summaries_df[
            'words_after_lemma_no_stopwords']\
        .map(lambda x: non_letter_removal(x.lower()))

    # ----------Creating features from Vector Averaging using Tfidf weights----
    selected_processed_descriptions_df = \
        processed_descriptions_df[
            processed_descriptions_df[
                  'job_posting_id'].isin(job_post_df['id'])]

    selected_descriptions = \
        selected_processed_descriptions_df['words_after_lemma_no_stopwords']

    clean_descriptions = \
        processed_descriptions_df['words_after_lemma_no_stopwords']

    vectorizer = TfidfVectorizer(analyzer='word',
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 max_features=5000)

    vectorizer.fit(clean_descriptions)

    # need to save when running full model
    # store the content
    with open(OUTPUT_DIR + 'description_tfidf.pkl', 'wb') as handle:
        pickle.dump(vectorizer, handle)

    files.append(OUTPUT_DIR + 'description_tfidf.pkl')

    logging.info('Compute feature vectors\n')
    datavecsavg_tfidf_job_post = \
        get_avgfeature_vec_tfidf(selected_descriptions,
                                 model,
                                 vectorizer,
                                 num_features)

    col_name = []
    for i in xrange(0, datavecsavg_tfidf_job_post.shape[1]):
        col_name.append('feature_%s' % i)

    datavecsavg_tfidf_job_post_df = \
        pd.DataFrame.from_records(datavecsavg_tfidf_job_post,
                                  columns=col_name)

    datavecsavg_tfidf_job_post_df['job_postings_id'] = \
        selected_processed_descriptions_df['job_posting_id']

    output_filename = OUTPUT_DIR + 'datavecsavg_tfidf_job_post_features.csv'
    datavecsavg_tfidf_job_post_df.to_csv(output_filename,
                                         index=False,
                                         encoding='utf-8')

    selected_processed_work_summaries_df = \
        processed_work_summaries_df[
            processed_work_summaries_df[
                'work_experience_id'].isin(work_experience_df['id'])]

    selected_summaries = \
        selected_processed_work_summaries_df['words_after_lemma_no_stopwords']

    datavecsavg_tfidf_work_experience = \
        get_avgfeature_vec_tfidf(selected_summaries,
                                 model,
                                 vectorizer,
                                 num_features)

    datavecsavg_tfidf_work_experience_df = \
        pd.DataFrame.from_records(datavecsavg_tfidf_work_experience,
                                  columns=col_name)

    datavecsavg_tfidf_work_experience_df['id'] = \
        selected_processed_work_summaries_df['work_experience_id'].values

    output_filename = \
        OUTPUT_DIR + 'datavecsavg_tfidf_work_experience_features.csv'
    datavecsavg_tfidf_work_experience_df.to_csv(output_filename,
                                                index=False,
                                                encoding='utf-8')
    return files
Example #27
def compute_similarity_score(*args, **kwargs):
    dw1 = PostgresHook(postgres_conn_id='dw1_etl')

    job_post_sql = """
    SELECT
        id, title
    FROM job_postings
    WHERE is_deleted = False
    AND(
        description IS NOT NULL
        OR title IS NOT NULL
    )
    ORDER BY id
    """
    job_post_df = dw1.get_pandas_df(job_post_sql)

    work_experience_sql = """
    SELECT
        id, user_id, job_title, summary
    FROM users_work_experiences
    WHERE is_deleted = False
    AND(
        job_title IS NOT NULL
        OR summary IS NOT NULL
    )
    ORDER BY id
    """
    work_experience_df = dw1.get_pandas_df(work_experience_sql)

    job_post_title_features_df = \
        pd.read_csv(OUTPUT_DIR + 'bow_job_post_title_features.csv', header=0)
    work_experience_title_features_df = \
        pd.read_csv(OUTPUT_DIR + 'bow_work_experience_title_features.csv',
                    header=0)

    job_post_skill_features_df = \
        pd.read_csv(OUTPUT_DIR + 'bow_job_post_skill_features.csv', header=0)
    work_experience_skill_features_df = \
        pd.read_csv(OUTPUT_DIR + 'bow_work_experience_skill_features.csv',
                    header=0)

    datavecsavg_tfidf_job_post_df = \
        pd.read_csv(OUTPUT_DIR + 'datavecsavg_tfidf_job_post_features.csv',
                    header=0)
    datavecsavg_tfidf_work_experience_df = \
        pd.read_csv(OUTPUT_DIR +
                    'datavecsavg_tfidf_work_experience_features.csv',
                    header=0)

    combine_job_post_features_df = pd.merge(datavecsavg_tfidf_job_post_df,
                                            job_post_title_features_df,
                                            how='outer',
                                            on='job_postings_id')

    combine_job_post_features_df = pd.merge(combine_job_post_features_df,
                                            job_post_skill_features_df,
                                            how='outer',
                                            on='job_postings_id')

    combine_job_post_features_df = pd.merge(combine_job_post_features_df,
                                            job_post_df,
                                            left_on='job_postings_id',
                                            right_on='id')

    selected_post_id = combine_job_post_features_df['job_postings_id']

    combine_job_post_features_df = \
        combine_job_post_features_df.drop(['job_postings_id',
                                           'id',
                                           'title'], axis=1)

    combine_job_post_features_df = combine_job_post_features_df.fillna(0)
    datavecsavg_tfidf_job_post_stack = combine_job_post_features_df.values

    combine_work_experience_features_df = \
        pd.merge(datavecsavg_tfidf_work_experience_df,
                 work_experience_title_features_df,
                 how='outer',
                 on='id')
    combine_work_experience_features_df = \
        pd.merge(combine_work_experience_features_df,
                 work_experience_skill_features_df,
                 how='outer',
                 on='id')

    combine_work_experience_features_df = \
        pd.merge(combine_work_experience_features_df,
                 work_experience_df, left_on='id',
                 right_on='id')

    selected_work_experience_id = combine_work_experience_features_df['id']

    selected_work_experience_user_id = \
        combine_work_experience_features_df['user_id']

    combine_work_experience_features_df = \
        combine_work_experience_features_df.drop(['id',
                                                  'user_id',
                                                  'job_title',
                                                  'summary'], axis=1)

    combine_work_experience_features_df = \
        combine_work_experience_features_df.fillna(0)
    datavecsavg_tfidf_work_experience_stack = \
        combine_work_experience_features_df.values

    # compute cosine similarity
    scores_matrix = \
        pairwise.cosine_similarity(datavecsavg_tfidf_work_experience_stack,
                                   datavecsavg_tfidf_job_post_stack)

    # log the dimensions of the cosine similarity matrix
    logging.info('Matrix size: %s x %s' % (scores_matrix.shape[0],
                                           scores_matrix.shape[1]))

    partnum = kwargs['params']['partnum']
    partindex = kwargs['params']['partindex']

    selected_indices = np.where(np.round(scores_matrix, 2) > 0.01)

    del combine_work_experience_features_df
    del datavecsavg_tfidf_work_experience_stack
    del datavecsavg_tfidf_job_post_stack
    del combine_job_post_features_df
    del work_experience_df
    del job_post_df
    del job_post_title_features_df
    del job_post_skill_features_df
    del datavecsavg_tfidf_job_post_df
    del work_experience_title_features_df
    del work_experience_skill_features_df
    del datavecsavg_tfidf_work_experience_df

    end_points = [0]
    for i in np.arange(partnum):
        end_points.append((i + 1) * len(selected_indices[0]) // partnum)

    scores = []
    for i in np.arange(end_points[partindex], end_points[partindex + 1]):
        if i % 1000000 == 0:
            logging.info('Processed %s score pairs' % (i))
        work_id = selected_indices[0][i]
        job_id = selected_indices[1][i]
        v = scores_matrix[work_id, job_id]
        scores.append((selected_work_experience_id[work_id],
                       selected_work_experience_user_id[work_id],
                       selected_post_id[job_id],
                       v))
    scores_df = pd.DataFrame.from_records(scores,
                                          columns=['work_experience_id',
                                                   'user_id',
                                                   'similar_job_posting_id',
                                                   'score'])

    scores_df['model_id'] = pd.Series(6, index=scores_df.index)

    output_filename = \
        OUTPUT_DIR + \
        'scores_work_experience_to_job_posts_part%d.csv' % (partindex + 1)
    scores_df.to_csv(output_filename, index=False, encoding='utf-8')
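
The callable above reads partnum and partindex from kwargs['params'], so it is presumably fanned out into several Airflow tasks, each scoring one slice of the similarity matrix. A sketch of how that wiring might look (Airflow 1.x style; PARTNUM and the dag object are assumptions, not part of the original example):

from airflow.operators.python_operator import PythonOperator

PARTNUM = 4  # assumed number of partitions

for part in range(PARTNUM):
    PythonOperator(
        task_id='compute_similarity_score_part%d' % (part + 1),
        python_callable=compute_similarity_score,
        provide_context=True,  # makes kwargs['params'] available to the callable
        params={'partnum': PARTNUM, 'partindex': part},
        dag=dag,  # `dag` is assumed to be defined elsewhere in the pipeline
    )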
Example #28
0
def compute_title_feature(*args, **kwargs):
    dw1 = PostgresHook(postgres_conn_id='dw1_etl')

    work_experience_sql = """
    SELECT *
    FROM users_work_experiences
    WHERE is_deleted = False
    AND job_title IS NOT NULL
    ORDER BY id
    """
    job_post_sql = """
    SELECT *
    FROM job_postings
    WHERE is_deleted = False
    AND title IS NOT NULL
    ORDER BY id
    """

    job_post_df = dw1.get_pandas_df(job_post_sql)
    work_experience_df = dw1.get_pandas_df(work_experience_sql)

    job_post_titles = job_post_df['title']
    num_job_post_titles = len(job_post_titles)

    work_experience_titles = work_experience_df['job_title']
    num_work_experience_titles = len(work_experience_titles)

    clean_job_post_titles = []
    logging.info('Cleaning and parsing the job titles...\n')
    count = 0
    for title in job_post_titles:
        # Print a progress message every 1,000 titles
        if ((count + 1) % 1000 == 0):
            logging.info('job title %d of %d\n' % (count + 1,
                                                   num_job_post_titles))
        (words, tagged_words) = (text_to_words(title,
                                               remove_stopwords=False,
                                               use_lem=False))
        clean_job_post_titles.append(words)
        count += 1

    (vectorizer, job_post_title_features) = create_bow_vectors(clean_job_post_titles)

    files = []
    # store title bow
    with open(OUTPUT_DIR + 'title_bow.pkl', 'wb') as handle:
        pickle.dump(vectorizer, handle)

    clean_work_experience_titles = []
    logging.info('Cleaning and parsing the work experience titles...\n')
    count = 0
    for title in work_experience_titles:
        # Print a progress message every 1,000 titles
        if ((count + 1) % 1000 == 0):
            logging.info('work experience title %d of %d\n' %
                         (count + 1, num_work_experience_titles))
        (words, tagged_words) = (text_to_words(title,
                                               remove_stopwords=False,
                                               use_lem=False))
        clean_work_experience_titles.append(words)
        count += 1

    # get work exp title feature
    work_experience_title_features = \
        vectorizer.transform(clean_work_experience_titles)

    col_name = []
    for i in range(0, job_post_title_features.shape[1]):
        col_name.append('feature_%s' % i)
    job_post_title_feature_df = \
        pd.DataFrame.from_records(job_post_title_features, columns=col_name)
    job_post_title_feature_df['job_postings_id'] = job_post_df['id']

    output_filename = OUTPUT_DIR + 'bow_job_post_title_features.csv'
    job_post_title_feature_df.to_csv(output_filename,
                                     index=False,
                                     encoding='utf-8')

    work_experience_title_feature_df = \
        pd.DataFrame.from_records(work_experience_title_features.toarray(),
                                  columns=col_name)
    work_experience_title_feature_df['id'] = work_experience_df['id']

    output_filename = OUTPUT_DIR + 'bow_work_experience_title_features.csv'
    work_experience_title_feature_df.to_csv(output_filename,
                                            index=False,
                                            encoding='utf-8')
    files.append(OUTPUT_DIR + 'title_bow.pkl')
    return files
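
This example also relies on two helpers, text_to_words and create_bow_vectors, whose definitions are not included. Minimal sketches, assuming text_to_words returns a cleaned token string (with POS tagging omitted) and create_bow_vectors fits a CountVectorizer and returns a dense document-term matrix:

import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

def text_to_words(raw_text, remove_stopwords=True, use_lem=True):
    """Hypothetical cleaner: keep letters, lowercase, optionally drop
    stopwords and lemmatize; POS tagging is omitted in this sketch."""
    words = re.sub('[^a-zA-Z]', ' ', raw_text).lower().split()
    if remove_stopwords:
        stops = set(stopwords.words('english'))
        words = [w for w in words if w not in stops]
    if use_lem:
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(w) for w in words]
    return ' '.join(words), None

def create_bow_vectors(clean_texts, max_features=500):
    """Hypothetical bag-of-words builder: fit a CountVectorizer and return
    it together with a dense document-term matrix."""
    vectorizer = CountVectorizer(analyzer='word', max_features=max_features)
    features = vectorizer.fit_transform(clean_texts).toarray()
    return vectorizer, features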
Example #29
0
 def execute(self, context):
     log.info('Run Pandas over postgres')
     postgres_instance = PostgresHook(postgres_conn_id=self.connection_id)
     df = postgres_instance.get_pandas_df(self.sql_query)
     self.etl_function(df)
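
The execute() method above clearly belongs to a custom operator that stores a connection id, a SQL query, and a transformation callback. The enclosing class is not part of the example; one plausible, purely hypothetical definition in Airflow 1.x style would be:

from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults

class PandasPostgresOperator(BaseOperator):  # hypothetical name
    """Fetches a DataFrame via PostgresHook and hands it to a
    user-supplied ETL function."""

    @apply_defaults
    def __init__(self, connection_id, sql_query, etl_function, *args, **kwargs):
        super(PandasPostgresOperator, self).__init__(*args, **kwargs)
        self.connection_id = connection_id
        self.sql_query = sql_query
        self.etl_function = etl_function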
Example #30
0
def compute_skill_feature(*args, **kwargs):
    dw1 = PostgresHook(postgres_conn_id='dw1_etl')

    job_post_sql = """
    SELECT *
    FROM job_postings
    WHERE is_deleted = False
    AND skills_group_id IS NOT NULL
    ORDER BY id
    """
    job_post_df = dw1.get_pandas_df(job_post_sql)

    skills_groups_sql = """SELECT * FROM skills_groups"""
    skills_groups_df = dw1.get_pandas_df(skills_groups_sql)

    join_df = pd.merge(job_post_df,
                       skills_groups_df,
                       left_on='skills_group_id',
                       right_on='id',
                       how='left')

    job_skills_sql = """
    SELECT *
    FROM job_skills
    WHERE is_deleted = False
    ORDER BY "order"
    """
    job_skills_df = dw1.get_pandas_df(job_skills_sql)
    job_skills = join_df['list']

    work_experience_sql = """
    SELECT *
    FROM users_work_experiences
    WHERE is_deleted = False
    ORDER BY id
    """
    work_experience_df = dw1.get_pandas_df(work_experience_sql)

    users_sql = """
    SELECT *
    FROM users
    WHERE is_deleted = False
    AND skills_group_id IS NOT NULL
    ORDER BY id
    """
    users_df = dw1.get_pandas_df(users_sql)
    users_join_df = pd.merge(users_df,
                             skills_groups_df,
                             left_on='skills_group_id',
                             right_on='id',
                             how='left')

    work_experience_join_df = pd.merge(work_experience_df,
                                       users_join_df,
                                       left_on='user_id',
                                       right_on='id_x')
    users_skills = work_experience_join_df['list']

    count_vectorizer = \
        CountVectorizer(lowercase=False,
                        analyzer=json_array_string_to_list,
                        vocabulary=job_skills_df['name'].tolist(),
                        max_features=500)

    files = []
    # store the content
    with open(OUTPUT_DIR + 'skills_bow.pkl', 'wb') as handle:
        pickle.dump(count_vectorizer, handle)

    job_post_skills_features = count_vectorizer.fit_transform(job_skills)

    col_name = []
    for i in range(0, job_post_skills_features.shape[1]):
        col_name.append('feature_%s' % i)
    # write to a temporary CSV so other tasks can reuse these features
    job_post_skill_feature_df = \
        pd.DataFrame.from_records(job_post_skills_features.toarray(),
                                  columns=col_name)

    job_post_skill_feature_df['job_postings_id'] = join_df['id_x']

    output_filename = OUTPUT_DIR + 'bow_job_post_skill_features.csv'
    job_post_skill_feature_df.to_csv(output_filename,
                                     index=False,
                                     encoding='utf-8')

    # users_skills
    work_experience_skills_features = count_vectorizer.fit_transform(users_skills)

    # write to a temporary CSV so other tasks can reuse these features
    work_experience_skill_feature_df = \
        pd.DataFrame.from_records(work_experience_skills_features.toarray(),
                                  columns=col_name)

    work_experience_skill_feature_df['id'] = work_experience_join_df['id']

    output_filename = OUTPUT_DIR + 'bow_work_experience_skill_features.csv'
    work_experience_skill_feature_df.to_csv(output_filename,
                                            index=False,
                                            encoding='utf-8')
    files.append(OUTPUT_DIR + 'skills_bow.pkl')
    return files
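
The CountVectorizer above uses json_array_string_to_list as its analyzer, which suggests the 'list' column stores skills as JSON-encoded arrays. A minimal sketch of such an analyzer (an assumption, since the helper is not shown):

import json

def json_array_string_to_list(value):
    """Hypothetical analyzer: turn a JSON-encoded array such as
    '["Python", "SQL"]' into a plain list of skill tokens."""
    if not value:
        return []
    try:
        return json.loads(value)
    except (TypeError, ValueError):
        # malformed or non-string values contribute no tokens
        return []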