import concurrent.futures
import glob
import logging
import os

from airflow.models import Variable

# read_object_list, setup_daos, add_to_db, split_record and the other helpers
# used below, as well as FETCHES_BUCKET, are defined elsewhere in the project.


def add_to_database(**kwargs):
    """Pull the object list produced upstream and load its records into the DB."""
    objs = kwargs['ti'].xcom_pull(key='object_location', task_ids='generate_object_list')
    logging.info(f'Processing object list from {objs}')
    with open(objs, 'r') as f:
        wl = read_object_list(f)
    # execution_date = kwargs['execution_date'].strftime('%Y-%m-%dT%H-%M')
    # previous_run = kwargs['prev_execution_date'].strftime('%Y-%m-%dT%H-%M')
    # filtered = list(filter_objects(all_objects=wl, start_date=previous_run,
    #                                end_date=execution_date))
    filtered = list(wl)
    station_dao, series_dao, mes_dao = setup_daos()
    records = 0
    for obj in filtered:
        for record in get_jsons_from_object(bucket=FETCHES_BUCKET, object_name=obj['Name']):
            station, measurement, _ = split_record(record)
            add_to_db(station_dao=station_dao, series_dao=series_dao, mes_dao=mes_dao,
                      station=station, measurement=measurement)
            records += 1
    logging.info(f'Number of records added to DB: {records}')
    print_db_stats(station_dao, series_dao, mes_dao)

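# Usage sketch (an assumption, not part of the original module): how
# add_to_database could be registered as an Airflow 1.x task. The dag_id and
# schedule below are hypothetical; provide_context=True is what injects
# kwargs['ti'] and the execution dates into the callable, and the upstream
# 'generate_object_list' task that pushes the 'object_location' XCom is
# assumed to be defined elsewhere.
from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator

with DAG('measurements_import', start_date=datetime(2020, 1, 1),
         schedule_interval='@daily') as example_dag:
    add_to_database_task = PythonOperator(
        task_id='add_to_database',
        python_callable=add_to_database,
        provide_context=True,
    )
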
def update_last(**kwargs):
    prefix = get_prefix(**kwargs)
    target_dir = os.path.join(Variable.get('target_dir'), prefix)
    logging.info(f'Will be processing [{target_dir}]')
    flist = list_directory(target_dir)
    logging.info(f'Files detected: {len(flist)}')
    previous_run = kwargs['prev_execution_date']
    next_run = kwargs['next_execution_date']
    filtered_list = filter_file_list(flist=flist, previous_run=previous_run,
                                     next_run=next_run)
    logging.info(f'Previous run was @{previous_run}, next will be @{next_run}. '
                 f'File list reduced to: {len(filtered_list)}')
    station_dao, series_dao, mes_dao = setup_daos()
    m = 0
    for fname in filtered_list:
        logging.info(f'Analyzing {fname}')
        with open(fname, 'rb') as f:
            for record in get_jsons_from_stream(stream=f, object_name=fname):
                station, measurement, _ = split_record(record)
                m += 1
                add_to_db(station_dao, series_dao, mes_dao,
                          station=station, measurement=measurement)
    logging.info(f'Number of measurements added to DB: {m}')
    print_db_stats(station_dao, series_dao, mes_dao)
    return True

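# Minimal sketch of the filtering step used above. The real filter_file_list
# is defined elsewhere; the name suffix and the mtime-based criterion here are
# assumptions. It keeps only files modified within the [previous_run,
# next_run) window, matching how update_last narrows the file list.
from datetime import datetime, timezone


def filter_file_list_sketch(flist, previous_run, next_run):
    kept = []
    for fname in flist:
        # Compare tz-aware timestamps, since Airflow execution dates are tz-aware.
        mtime = datetime.fromtimestamp(os.path.getmtime(fname), tz=timezone.utc)
        if previous_run <= mtime < next_run:
            kept.append(fname)
    return kept
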
def store_objects_in_db(**kwargs):
    objs = kwargs['ti'].xcom_pull(key='object_location', task_ids='generate_object_list')
    logging.info('Processing object list from %s', objs)
    with open(objs, 'r') as f:
        wl = read_object_list(f)
    execution_date = kwargs['execution_date']
    previous_run = kwargs['prev_execution_date']
    if kwargs['filter_objects']:
        logging.info('Filtering objects...')
        filtered = list(filter_objects(all_objects=wl, start_date=previous_run,
                                       end_date=execution_date))
        logging.info('Filtered objects. Number of objects from [%s, %s]: %d',
                     previous_run, execution_date, len(filtered))
    else:
        filtered = list(wl)
        logging.info('Number of non-filtered objects: %d', len(filtered))
    station_dao, series_dao, mes_dao = setup_daos()
    with concurrent.futures.ThreadPoolExecutor(max_workers=None) as executor:
        # Fan out: parse each object in a worker thread.
        processor_objects = {executor.submit(local_process_file, obj['Name']): obj['Name']
                             for obj in filtered}
        # Fan in: insert results into the DB from the main thread as they complete.
        for future in concurrent.futures.as_completed(processor_objects):
            object_name = processor_objects[future]
            try:
                rr = future.result()
            except Exception as exc:
                logging.warning('%r generated an exception: %s', object_name, exc)
            else:
                logging.info('Processing %s', object_name)
                for it in rr:
                    add_to_db(station_dao=station_dao, series_dao=series_dao,
                              mes_dao=mes_dao, station=it[0], measurement=it[1])
    print_db_stats(station_dao, series_dao, mes_dao)

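# Self-contained illustration of the submit/as_completed pattern used in
# store_objects_in_db, with stdlib pieces only (the work function here is just
# len, purely for demonstration). Results arrive in completion order, not
# submission order, and a worker's exception surfaces when future.result()
# is called.
def _fan_out_demo(names):
    results = {}
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = {executor.submit(len, name): name for name in names}
        for future in concurrent.futures.as_completed(futures):
            name = futures[future]
            try:
                results[name] = future.result()
            except Exception as exc:
                logging.warning('%r failed: %s', name, exc)
    return results
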
def go_through(**kwargs):
    prefix = get_prefix(**kwargs)
    target_dir = os.path.join(Variable.get('target_dir'), prefix)
    logging.info(f'Will be processing [{target_dir}]')
    flist = glob.glob(os.path.join(target_dir, '*'))
    logging.info(f'Files detected: {len(flist)}')
    station_dao, series_dao, mes_dao = setup_daos()
    for fname in flist:
        logging.info(f'Processing {fname}')
        with open(fname, 'rb') as f:
            for record in get_jsons_from_stream(stream=f, object_name=fname):
                station, measurement, _ = split_record(record)
                add_to_db(station_dao, series_dao, mes_dao,
                          station=station, measurement=measurement)
    print_db_stats(station_dao, series_dao, mes_dao)
    return True

def transform_objects(**kwargs):
    pfl = setup_objectlist(**kwargs)
    pfl.load()
    objects_count = len(pfl.get_list())
    logging.info(f'Loaded {objects_count} objects.')
    station_dao, series_dao, mes_dao = setup_daos()

    def process(x):
        return local_process_file(x['Name'])

    with concurrent.futures.ThreadPoolExecutor(max_workers=None) as executor:
        for obj, results in zip(pfl.get_list(), executor.map(process, pfl.get_list())):
            logging.info(f"Processing {obj['Name']} ({obj['Size']})")
            # DB inserts are linearized here in the main thread anyway.
            for it in results:
                add_to_db(station_dao, series_dao, mes_dao,
                          station=it[0], measurement=it[1])
    print_db_stats(station_dao, series_dao, mes_dao)

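# Design note: transform_objects relies on executor.map, which yields results
# in submission order (so each result pairs with its object via zip), while
# store_objects_in_db uses as_completed and receives results as soon as they
# are ready. In both variants the DB writes stay in the main thread, since
# the DAOs are not assumed to be thread-safe.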