示例#1
0
def get_news_meta(**kwargs):
    '''
    Retrieve metadata for news articles published on our set
    of sources during the time period between our last crawl
    and the current execution time.
    '''
    exec_date = kwargs.get('ds') or datetime.now()
    start_date = exec_date - timedelta(hours=3)

    api_key = kwargs.get('api_key')
    api = NewsAPI(api_key)

    query_args = {
        'sources': SOURCE_LIST,
        'page_size': 100,
        'from': convert_time(start_date),
        'to': convert_time(exec_date),
        'language': 'en'
    }

    pg_hook = PostgresHook(postgres_conn_id='sumzero')
    pagination = api.paginate_articles(**query_args)

    for resp in pagination:
        meta_list = resp.articles
        if meta_list:
            rows = (meta_to_row(d) for d in meta_list)
            pg_hook.insert_rows('news_meta1',
                                rows=rows,
                                target_fields=('title', 'description', 'url',
                                               'published_on'),
                                commit_every=1000)

    return meta_list
示例#2
0
def pull_and_insert(**kwargs):

    #Pull query from sql directory
    query = kwargs['templates_dict']['query']

    #Initialize snql hook and pull data
    snql_hook = PostgresHook('snql')
    results = snql_hook.get_records(query)

    #Initialize staging hook and insert data to staging table
    staging_hook = PostgresHook('snql_staging')
    staging_hook.insert_rows('dim_sneakers', results)
示例#3
0
def populate_target(**kwargs):

    #Pull data from staging
    staging_hook = PostgresHook('snql_staging')
    #Pull query from sql directory
    query = kwargs['templates_dict']['query']
    #Execute query
    staging_results = staging_hook.get_records(query)

    #Initialize hook to snql
    snql_hook = PostgresHook('snql')

    #Delete current rows in target table table
    snql_hook.run("""DELETE FROM dim_sneakers""")

    #Insert new rows into target table
    snql_hook.insert_rows('dim_sneakers', staging_results)
示例#4
0
def api_json_to_db(**kwargs):
    base_dir = Variable.get("tfl_park_base_dir")

    json_files = [
        f for f in listdir(base_dir) if isfile(join(base_dir, f))
        if '.json' in f
    ]

    db_tuples = []

    for current_file in json_files:
        time_pattern = re.compile('[0-9]*')
        pattern_match = time_pattern.match(current_file)

        if pattern_match is not None:
            timestamp = pattern_match.group()

            with open(join(base_dir, current_file), 'r') as f:
                car_parks = json.loads(f.read())

            for car_park in car_parks:
                park_id = car_park['id']
                park_name = car_park['name']

                for bay in car_park['bays']:
                    park_bay_type = bay['bayType']
                    park_bay_occupied = bay['occupied']
                    park_bay_free = bay['free']
                    park_bay_count = bay['bayCount']

                    db_tuples.append(
                        (timestamp, park_id, park_name, park_bay_type,
                         park_bay_occupied, park_bay_free, park_bay_count))

    target_fields = [
        't_stamp', 'park_id', 'park_name', 'park_bay_type',
        'park_bay_occupied', 'park_bay_free', 'park_bay_count'
    ]

    db_hook = PostgresHook(postgres_conn_id='tfl_db')
    db_hook.insert_rows('tfl_data', db_tuples, target_fields)

    for current_file in json_files:
        remove(join(base_dir, current_file))