def main() -> None:
    """Run the variable pad using environment settings and command-line indices.

    DATA_PATH, OUT_PATH and LOG_LEVEL are read from the environment. The
    index values come from the ``--yearindex``, ``--monthindex``,
    ``--dayindex``, ``--locindex`` and ``--subdirindex`` options, all of which
    are mandatory integers. A ``Config`` is built from these values and passed
    to ``VariablePad``, whose ``pad()`` method is then called.
    """
    env = environs.Env()
    data_path: Path = env.path('DATA_PATH')
    out_path: Path = env.path('OUT_PATH')
    log_level: str = env.log_level('LOG_LEVEL')
    log_config.configure(log_level)
    log = get_logger()
    log.debug(f'data_dir: {data_path} out_dir: {out_path}')
    parser = argparse.ArgumentParser()
    # Let argparse convert and require each index: a missing or non-numeric
    # option becomes a usage error instead of a TypeError/ValueError from int().
    for option in ('--yearindex', '--monthindex', '--dayindex', '--locindex', '--subdirindex'):
        parser.add_argument(option, type=int, required=True)
    args = parser.parse_args()
    config = Config(data_path=data_path,
                    out_path=out_path,
                    year_index=args.yearindex,
                    month_index=args.monthindex,
                    day_index=args.dayindex,
                    location_index=args.locindex,
                    data_type_index=args.subdirindex,
                    # These two are not configurable from this entry point.
                    relative_path_index=0,
                    window_size=0)
    variable_pad = VariablePad(config)
    variable_pad.pad()
def main() -> None:
    """Build the location group configuration from the environment and apply it.

    Every setting is read from an environment variable; only LOG_LEVEL has a
    default ('INFO'). The resulting Config is handed to LocationGroupPath and
    its add_groups_to_paths() method is called.
    """
    env = environs.Env()
    source_path: Path = env.path('SOURCE_PATH')
    group: str = env.str('GROUP')
    out_path: Path = env.path('OUT_PATH')
    log_level: str = env.log_level('LOG_LEVEL', 'INFO')
    # Integer index settings, read in declaration order.
    index_settings = {
        'source_type_index': env.int('SOURCE_TYPE_INDEX'),
        'year_index': env.int('YEAR_INDEX'),
        'month_index': env.int('MONTH_INDEX'),
        'day_index': env.int('DAY_INDEX'),
        'location_index': env.int('LOCATION_INDEX'),
        'data_type_index': env.int('DATA_TYPE_INDEX'),
    }
    log_config.configure(log_level)
    log.debug(
        f'source_path: {source_path} group: {group} out_path: {out_path}')
    config = Config(source_path=source_path,
                    out_path=out_path,
                    group=group,
                    **index_settings)
    LocationGroupPath(config).add_groups_to_paths()
    def setUp(self):
        """Configure logging and map four real parquet files into the fake filesystem."""
        log_config.configure('DEBUG')
        self.setUpPyfakefs()
        self.in_path = Path('/dir/in')
        self.out_path = Path('/out')
        self.metadata_path = Path('prt/2019/10')

        # (sub-directory below the metadata path, file located next to this module)
        fixtures = (
            ('02/6974/data', 'GRSM_prt_6974_2019-10-02.parquet'),
            ('02/6848/data', 'UNDE_prt_6848_2019-10-02.parquet'),
            ('02/6848/data', 'WREF_prt_6848_2019-10-02.parquet'),
            ('03/6848/data', 'CPER_prt_6848_2019-10-03.parquet'),
        )
        module_dir = os.path.dirname(__file__)
        for sub_dir, file_name in fixtures:
            fake_file_path = Path(self.in_path, self.metadata_path, sub_dir, file_name)
            real_file_path = Path(module_dir, file_name)
            self.fs.add_real_file(real_file_path, target_path=fake_file_path)
def main() -> None:
    """Read the parser settings from the environment and run ``array_parser.parse``.

    LOG_LEVEL defaults to 'INFO'; every other variable is read without a
    default. The values are collected into a ``Config`` which is passed to
    ``array_parser.parse``.
    """
    env = environs.Env()
    data_path: Path = env.path('DATA_PATH')
    schema_path: Path = env.path('SCHEMA_PATH')
    out_path: Path = env.path('OUT_PATH')
    parse_calibration = env.bool('PARSE_CALIBRATION')
    log_level: str = env.log_level('LOG_LEVEL', 'INFO')
    source_type_index: int = env.int('SOURCE_TYPE_INDEX')
    year_index: int = env.int('YEAR_INDEX')
    month_index: int = env.int('MONTH_INDEX')
    day_index: int = env.int('DAY_INDEX')
    source_id_index: int = env.int('SOURCE_ID_INDEX')
    data_type_index: int = env.int('DATA_TYPE_INDEX')
    test_mode: bool = env.bool("TEST_MODE")
    # Configure logging before the first message so the debug line below is
    # emitted under the requested LOG_LEVEL.
    log_config.configure(log_level)
    log.debug(
        f'data_path: {data_path} schema_path: {schema_path} out_path: {out_path}'
    )
    config = Config(data_path=data_path,
                    schema_path=schema_path,
                    out_path=out_path,
                    parse_calibration=parse_calibration,
                    source_type_index=source_type_index,
                    year_index=year_index,
                    month_index=month_index,
                    day_index=day_index,
                    source_id_index=source_id_index,
                    data_type_index=data_type_index,
                    test_mode=test_mode)
    array_parser.parse(config)
示例#5
0
def main() -> None:
    """Create a CalibratedLocationFileGrouper from environment settings and run it.

    LOG_LEVEL defaults to 'INFO'; all other variables are read without a default.
    """
    env = environs.Env()
    calibrated_path: Path = env.path('CALIBRATED_PATH')
    location_path: Path = env.path('LOCATION_PATH')
    out_path: Path = env.path('OUT_PATH')
    log_level: str = env.log_level('LOG_LEVEL', 'INFO')
    # Each variable name lower-cased is the matching Config keyword.
    index_variables = ('SOURCE_TYPE_INDEX', 'YEAR_INDEX', 'MONTH_INDEX',
                       'DAY_INDEX', 'SOURCE_ID_INDEX', 'DATA_TYPE_INDEX')
    indices = {name.lower(): env.int(name) for name in index_variables}
    log_config.configure(log_level)
    log.debug(
        f'calibrated_path: {calibrated_path} location_path: {location_path} out_path: {out_path}'
    )
    config = Config(calibrated_path=calibrated_path,
                    location_path=location_path,
                    out_path=out_path,
                    **indices)
    CalibratedLocationFileGrouper(config).group_files()
def main() -> None:
    """Assemble a Config from environment variables and pass it to process_files.

    DATA_PATH and CALIBRATION_PATH default to None and LOG_LEVEL to 'INFO';
    the remaining variables are read without a default.
    """
    env = environs.Env()
    data_path: Path = env.path('DATA_PATH', None)
    calibration_path: Path = env.path('CALIBRATION_PATH', None)
    out_path: Path = env.path('OUT_PATH')
    log_level: str = env.log_level('LOG_LEVEL', 'INFO')
    index_settings = dict(
        data_source_id_index=env.int('DATA_SOURCE_ID_INDEX'),
        data_source_type_index=env.int('DATA_SOURCE_TYPE_INDEX'),
        data_year_index=env.int('DATA_YEAR_INDEX'),
        data_month_index=env.int('DATA_MONTH_INDEX'),
        data_day_index=env.int('DATA_DAY_INDEX'),
        relative_path_index=env.int('RELATIVE_PATH_INDEX'),
    )

    log_config.configure(log_level)
    log = structlog.get_logger()
    log.debug(f'data_path: {data_path} calibration_path: {calibration_path} out_path: {out_path}')

    process_files(Config(data_path=data_path,
                         calibration_path=calibration_path,
                         out_path=out_path,
                         **index_settings))
def main() -> None:
    """Create an EventLocationGrouper from environment settings and group the files.

    All variables, including LOG_LEVEL, are read without a default.
    """
    env = environs.Env()
    data_path: Path = env.path('DATA_PATH')
    location_path: Path = env.path('LOCATION_PATH')
    out_path: Path = env.path('OUT_PATH')
    log_level: str = env.log_level('LOG_LEVEL')
    index_settings = {
        'source_type_index': env.int('SOURCE_TYPE_INDEX'),
        'year_index': env.int('YEAR_INDEX'),
        'month_index': env.int('MONTH_INDEX'),
        'day_index': env.int('DAY_INDEX'),
        'source_id_index': env.int('SOURCE_ID_INDEX'),
    }
    log_config.configure(log_level)
    log.debug(
        f'data_dir: {data_path} location_dir: {location_path} out_dir: {out_path}'
    )

    config = Config(data_path=data_path,
                    location_path=location_path,
                    out_path=out_path,
                    **index_settings)
    EventLocationGrouper(config).group_files()
示例#8
0
def main() -> None:
    """Group data by related location groups.

    Settings are read from environment variables (none has a default),
    collected into a Config and handed to RelatedLocationGrouper.
    """
    env = environs.Env()
    data_path: Path = env.path('DATA_PATH')
    out_path: Path = env.path('OUT_PATH')
    log_level: str = env.log_level('LOG_LEVEL')
    # Each variable name lower-cased is the matching Config keyword.
    index_variables = ('SOURCE_TYPE_INDEX', 'YEAR_INDEX', 'MONTH_INDEX', 'DAY_INDEX',
                       'GROUP_INDEX', 'LOCATION_INDEX', 'DATA_TYPE_INDEX')
    indices = {name.lower(): env.int(name) for name in index_variables}
    log_config.configure(log_level)
    log.debug(f'data_path: {data_path} out_path: {out_path}')
    config = Config(data_path=data_path, out_path=out_path, **indices)
    RelatedLocationGrouper(config).group_files()
示例#9
0
def main() -> None:
    """Configure logging and call ``package`` with the prefix and sort settings.

    LOG_LEVEL defaults to 'INFO'; the three integer settings have no default.
    """
    env = environs.Env()
    log_level: str = env.log_level('LOG_LEVEL', 'INFO')
    options = {
        'prefix_index': env.int('PREFIX_INDEX'),
        'prefix_length': env.int('PREFIX_LENGTH'),
        'sort_index': env.int('SORT_INDEX'),
    }
    log_config.configure(log_level)
    package(**options)
示例#10
0
def main():
    """Read IN_PATH, OUT_PATH, LOG_LEVEL and DEDUP_THRESHOLD, then call ``linkmerge``."""
    env = environs.Env()
    source_dir = env.path('IN_PATH')
    target_dir = env.path('OUT_PATH')
    level = env.str('LOG_LEVEL', 'INFO')
    # Deduplication threshold defaults to 0.3, i.e. 30 percent duplication.
    threshold = env.float('DEDUP_THRESHOLD', 0.3)
    log_config.configure(level)
    linkmerge(source_dir, target_dir, threshold)
示例#11
0
def main() -> None:
    """Configure logging and call ``link_location_files`` with environment settings.

    LOG_LEVEL defaults to 'INFO'; the other variables have no default.
    """
    env = environs.Env()
    arguments = {
        'location_path': env.path('LOCATION_PATH'),
        'out_path': env.path('OUT_PATH'),
        'schema_index': env.int('SCHEMA_INDEX'),
    }
    log_level: str = env.log_level('LOG_LEVEL', 'INFO')
    log_config.configure(log_level)
    link_location_files(**arguments)
示例#12
0
def main() -> None:
    """Read the input/output paths and path indices, then call ``order_paths``.

    PATH_INDICES is parsed as a list and passed through unchanged. None of the
    variables has a default.
    """
    env = environs.Env()
    in_path: Path = env.path('IN_PATH')
    out_path: Path = env.path('OUT_PATH')
    level: str = env.log_level('LOG_LEVEL')
    path_indices: list = env.list('PATH_INDICES')
    log_config.configure(level)
    structlog.get_logger().debug(f'in_path: {in_path} out_path: {out_path}')
    order_paths(in_path, out_path, path_indices)
def main() -> None:
    """Read the filter settings from the environment and call ``filter_directory``.

    FILTER_DIR is parsed as a list; LOG_LEVEL defaults to 'INFO'.
    """
    env = environs.Env()
    in_path: Path = env.path('IN_PATH')
    out_path: Path = env.path('OUT_PATH')
    filter_dirs: list = env.list('FILTER_DIR')
    level: str = env.log_level('LOG_LEVEL', 'INFO')
    path_index: int = env.int('RELATIVE_PATH_INDEX')
    log_config.configure(level)
    structlog.get_logger().debug(
        f'in_path: {in_path} filter_dirs: {filter_dirs} out_dir: {out_path}')
    filter_directory(in_path, out_path, filter_dirs, path_index)
def main() -> None:
    """Construct a FilterJoiner from environment settings and call its join().

    CONFIG is read as a plain string; LOG_LEVEL defaults to 'INFO'.
    """
    env = environs.Env()
    joiner_arguments = {
        'config': env.str('CONFIG'),
        'out_path': env.path('OUT_PATH'),
    }
    log_level: str = env.log_level('LOG_LEVEL', 'INFO')
    joiner_arguments['relative_path_index'] = env.int('RELATIVE_PATH_INDEX')
    log_config.configure(log_level)
    FilterJoiner(**joiner_arguments).join()
def main() -> None:
    """Construct a DataGapFillerLinker from environment settings and link the files.

    LOG_LEVEL defaults to 'INFO'; the remaining variables have no default.
    """
    env = environs.Env()
    source_dir: Path = env.path('IN_PATH')
    target_dir: Path = env.path('OUT_PATH')
    level: str = env.log_level('LOG_LEVEL', 'INFO')
    path_index: int = env.int('RELATIVE_PATH_INDEX')
    loc_index: int = env.int('LOCATION_INDEX')
    suffix: str = env.str('EMPTY_FILE_SUFFIX')

    log_config.configure(level)

    # Arguments are positional, in the order the linker is called with here.
    linker_arguments = (source_dir, target_dir, path_index, loc_index, suffix)
    DataGapFillerLinker(*linker_arguments).link_files()
def main() -> None:
    """Analyze padded time series data.

    Builds a PaddedTimeSeriesAnalyzer from DATA_PATH, OUT_PATH and
    RELATIVE_PATH_INDEX and calls its analyze() method.
    """
    env = environs.Env()
    data_path: Path = env.path('DATA_PATH')
    out_path: Path = env.path('OUT_PATH')
    level: str = env.log_level('LOG_LEVEL')
    path_index: int = env.int('RELATIVE_PATH_INDEX')
    log_config.configure(level)
    get_logger().debug(f'data_path: {data_path} out_path: {out_path}')
    PaddedTimeSeriesAnalyzer(data_path, out_path, path_index).analyze()
示例#17
0
 def setUp(self):
     """Prepare the fake filesystem and create the source data file in it."""
     self.setUpPyfakefs()
     self.log = get_logger()
     log_config.configure('DEBUG')
     self.out_dir = Path('/tmp/out')
     input_root = Path('/tmp/in')
     self.input_dir = Path(input_root, Path('CPER/2019/01'))
     # Source file name and the relative target path recorded for this fixture.
     self.source_file_name = 'NEON.D10.CPER.DP1.00041.001.001.501.001.ST_1_minute.2019-01.basic.20210720T001022Z.csv'
     self.target_file_name = 'NEON.DOM.SITE.DP1.00041.001/CPER/20190101T000000--20190201T000000/basic/NEON.D10.CPER.DP1.00041.001.001.501.001.ST_1_minute.2019-01.basic.20210720T001022Z.csv'
     # self.input_dir is already absolute, so it supersedes input_root when joined.
     source_file_path = Path(input_root, self.input_dir, self.source_file_name)
     self.fs.create_file(source_file_path)
def main() -> None:
    """Open a database connection and call ``load_locations``.

    ``get_named_locations`` is pre-bound to the open connection and
    LOCATION_TYPE and passed as the ``get_locations`` callable. The connection
    is closed when the ``with`` block exits.
    """
    env = environs.Env()
    location_type: str = env.str('LOCATION_TYPE')
    db_url: str = env.str('DATABASE_URL')
    out_path: Path = env.path('OUT_PATH')
    level: str = env.log_level('LOG_LEVEL', 'INFO')
    log_config.configure(level)

    with closing(connect(db_url)) as db_connection:
        fetch_locations = partial(get_named_locations,
                                  connection=db_connection,
                                  location_type=location_type)
        load_locations(out_path=out_path, get_locations=fetch_locations)
def main() -> None:
    """Construct an EventAssetLoader from environment settings and link the event files.

    None of the variables, including LOG_LEVEL, has a default.
    """
    env = environs.Env()
    source_path: Path = env.path('SOURCE_PATH')
    out_path: Path = env.path('OUT_PATH')
    level: str = env.log_level('LOG_LEVEL')
    loader_indices = {
        'source_type_index': env.int('SOURCE_TYPE_INDEX'),
        'source_id_index': env.int('SOURCE_ID_INDEX'),
    }
    log_config.configure(level)
    log.debug(f'source_path: {source_path} out_path: {out_path}')
    loader = EventAssetLoader(source_path=source_path,
                              out_path=out_path,
                              **loader_indices)
    loader.link_event_files()
示例#20
0
def main() -> None:
    """Open a database connection and call ``location_asset_loader.write_files``.

    ``get_assets`` and ``get_asset_locations`` are each pre-bound to the open
    connection before being passed on. The connection is closed when the
    ``with`` block exits.
    """
    env = environs.Env()
    out_path: Path = env.path('OUT_PATH')
    db_url: str = env.str('DATABASE_URL')
    level: str = env.log_level('LOG_LEVEL', 'INFO')
    log_config.configure(level)
    log.debug(f'out_path: {out_path}')

    with closing(connect(db_url)) as db_connection:
        fetch_assets = partial(get_assets, db_connection)
        fetch_asset_locations = partial(get_asset_locations, db_connection)
        location_asset_loader.write_files(get_assets=fetch_assets,
                                          get_asset_locations=fetch_asset_locations,
                                          out_path=out_path)
def main() -> None:
    """
    Link files in the data path into the output path.

    A specification file with multiple inputs will use the same 'DATA_PATH'
    name to group the inputs. LOG_LEVEL defaults to 'INFO'.
    """
    env = environs.Env()
    data_path: Path = env.path('DATA_PATH')
    out_path: Path = env.path('OUT_PATH')
    level: str = env.log_level('LOG_LEVEL', 'INFO')
    path_index: int = env.int('RELATIVE_PATH_INDEX')
    log_config.configure(level)
    log.debug(f'data_path: {data_path} out_path: {out_path}')
    arguments = {'path': data_path,
                 'out_path': out_path,
                 'relative_path_index': path_index}
    group_files(**arguments)
示例#22
0
def main() -> None:
    """Read files from the list of related paths and link them into the output path.

    RELATED_PATHS holds the names of other environment variables; each name
    is looked up in ``os.environ`` and its value is wrapped in a Path.
    """
    env = environs.Env()
    related_paths: list = env.list('RELATED_PATHS')
    out_path: Path = env.path('OUT_PATH')
    level: str = env.log_level('LOG_LEVEL', 'INFO')
    path_index: int = env.int('RELATIVE_PATH_INDEX')
    log_config.configure(level)
    log.debug(f'related_paths: {related_paths} out_path: {out_path}')
    # Indexing os.environ directly raises KeyError for a name that is not set.
    resolved_paths = [Path(os.environ[variable_name]) for variable_name in related_paths]
    join_files(related_paths=resolved_paths,
               out_path=out_path,
               relative_path_index=path_index)
示例#23
0
def main() -> None:
    """Build the reader Config from the environment and call ``run``.

    ``read_messages`` is pre-bound to the Config and passed to ``run``
    together with ``open_pipe``. ``is_test`` is always False here.
    """
    env = environs.Env()
    settings = {
        'out_path': env.path('OUT_PATH'),
        'bootstrap_server': env.str('BOOTSTRAP_SERVER'),
        'topic': env.str('TOPIC'),
        'group_id': env.str('GROUP_ID'),
        'auto_offset_reset': env.str('AUTO_OFFSET_RESET'),
        'enable_auto_commit': env.bool('ENABLE_AUTO_COMMIT'),
    }
    level: str = env.log_level('LOG_LEVEL', 'INFO')
    log_config.configure(level)
    config = Config(**settings, is_test=False)
    run(config, open_pipe, partial(read_messages, config))
示例#24
0
def main() -> None:
    """Run the egress upload using environment settings and command-line options.

    DATA_PATH, OUT_PATH and LOG_LEVEL are read from the environment. The
    ``--dateindex`` and ``--locindex`` options are mandatory integers;
    ``--outputname`` is optional and is passed through as given (None when
    omitted). An ``Egress`` object is built from these values and its
    ``upload()`` method is called.
    """
    env = environs.Env()
    data_path: Path = env.path('DATA_PATH')
    out_path: Path = env.path('OUT_PATH')
    log_level: str = env.log_level('LOG_LEVEL')
    log_config.configure(log_level)
    log = get_logger()
    log.debug(f'data_dir: {data_path}')
    log.debug(f'out_dir: {out_path}')

    parser = argparse.ArgumentParser()
    parser.add_argument('--outputname')
    # Let argparse convert and require the indices: a missing or non-numeric
    # option becomes a usage error instead of a TypeError/ValueError from int().
    parser.add_argument('--dateindex', type=int, required=True)
    parser.add_argument('--locindex', type=int, required=True)
    args = parser.parse_args()

    egress = Egress(data_path, out_path, args.outputname, args.dateindex,
                    args.locindex)
    egress.upload()
def main() -> None:
    """Create a ContextFilter from environment settings and filter the files.

    LOG_LEVEL defaults to 'INFO'; the remaining variables have no default.
    """
    env = environs.Env()
    in_path: Path = env.path('IN_PATH')
    out_path: Path = env.path('OUT_PATH')
    context: str = env.str('CONTEXT')
    level: str = env.log_level('LOG_LEVEL', 'INFO')
    index_settings = {
        'trim_index': env.int('TRIM_INDEX'),
        'source_id_index': env.int('SOURCE_ID_INDEX'),
        'data_type_index': env.int('DATA_TYPE_INDEX'),
    }
    log_config.configure(level)
    structlog.get_logger().debug(
        f'in_path: {in_path} out_path: {out_path} context: {context}')
    config = Config(in_path=in_path,
                    out_path=out_path,
                    context=context,
                    **index_settings)
    ContextFilter(config).filter_files()
def main() -> None:
    """
    Link input paths into the output path.

    RELATED_PATHS holds the names of other environment variables; each name
    is looked up in ``os.environ`` and its value is wrapped in a Path before
    everything is passed to ``location_group``.
    """
    env = environs.Env()
    related_paths: list = env.list('RELATED_PATHS')
    data_path: Path = env.path('DATA_PATH')
    location_path: Path = env.path('LOCATION_PATH')
    out_path: Path = env.path('OUT_PATH')
    level: str = env.log_level('LOG_LEVEL', 'INFO')
    extra_settings = {
        'relative_path_index': env.int('RELATIVE_PATH_INDEX'),
        'year_index': env.int('YEAR_INDEX'),
        'loc_index': env.int('LOC_INDEX'),
        'grouploc_key': env.str('GROUPLOC_KEY'),
    }
    log_config.configure(level)
    log.debug(f'related_paths: {related_paths} out_path: {out_path}')
    # Indexing os.environ directly raises KeyError for a name that is not set.
    resolved_paths = [Path(os.environ[variable_name]) for variable_name in related_paths]
    location_group(related_paths=resolved_paths,
                   data_path=data_path,
                   location_path=location_path,
                   out_path=out_path,
                   **extra_settings)
def main() -> None:
    """Create a ParquetFileMerger from environment settings and run merge().

    LOG_LEVEL defaults to 'INFO' and DEDUPLICATION_THRESHOLD to 0.3; the
    remaining variables have no default.
    """
    env = environs.Env()
    settings = {
        'in_path': env.path('IN_PATH'),
        'out_path': env.path('OUT_PATH'),
    }
    level: str = env.log_level('LOG_LEVEL', 'INFO')
    # Duplication threshold defaults to 30 percent.
    settings['duplication_threshold'] = env.float('DEDUPLICATION_THRESHOLD', 0.3)
    for variable_name in ('SOURCE_TYPE_INDEX', 'YEAR_INDEX', 'MONTH_INDEX',
                          'DAY_INDEX', 'SOURCE_ID_INDEX'):
        # Each variable name lower-cased is the matching Config keyword.
        settings[variable_name.lower()] = env.int(variable_name)
    log_config.configure(level)
    ParquetFileMerger(Config(**settings)).merge()
def main() -> None:
    """Create a ConstantPad from environment settings and run pad().

    LOG_LEVEL defaults to 'INFO'; the remaining variables have no default.
    """
    env = environs.Env()
    data_path: Path = env.path('DATA_PATH')
    out_path: Path = env.path('OUT_PATH')
    level: str = env.log_level('LOG_LEVEL', 'INFO')
    numeric_settings = {
        'window_size': env.int('WINDOW_SIZE'),
        'year_index': env.int('YEAR_INDEX'),
        'month_index': env.int('MONTH_INDEX'),
        'day_index': env.int('DAY_INDEX'),
        'location_index': env.int('LOCATION_INDEX'),
        'data_type_index': env.int('DATA_TYPE_INDEX'),
        'relative_path_index': env.int('RELATIVE_PATH_INDEX'),
    }
    log_config.configure(level)
    config = Config(data_path=data_path, out_path=out_path, **numeric_settings)
    ConstantPad(config).pad()
def main() -> None:
    """Configure logging and call ``transform`` with YEAR_INDEX from the environment.

    LOG_LEVEL defaults to 'INFO'; YEAR_INDEX has no default.
    """
    env = environs.Env()
    level: str = env.log_level('LOG_LEVEL', 'INFO')
    index_of_year: int = env.int('YEAR_INDEX')
    log_config.configure(level)
    transform(year_index=index_of_year)