示例#1
0
def group_related(path, out_path):
    """
    Link related data and location files into the output directory.

    :param path: Directory or file path.
    :type path: str
    :param out_path: The output path for related data.
    :type out_path: str
    """
    for file_path in file_crawler.crawl(path):
        trimmed_path = target_path.trim_path(file_path)
        # Expected trimmed layout:
        # source_type/year/month/day/group/location/data_type/<remainder...>
        parts = pathlib.Path(trimmed_path).parts
        source_type = parts[0]
        year = parts[1]
        month = parts[2]
        day = parts[3]
        group = parts[4]
        location = parts[5]
        data_type = parts[6]
        remainder = parts[7:]
        base_output_path = os.path.join(out_path, year, month, day, group)
        # Splat remainder directly; the former remainder[0:] slice was a
        # redundant full copy.
        target = os.path.join(base_output_path, source_type, location,
                              data_type, *remainder)
        log.debug(f'File target: {target}')
        file_linker.link(file_path, target)
def linkmerge(in_path, out_path, dedup_threshold):
    """
    Symlink single-source parquet files and merge multi-file sources.

    Files are grouped by the source ID embedded in the file name (the third
    underscore-delimited token). Groups with exactly one file are symlinked
    into the output path; groups with multiple files are merged via
    write_merged_parquet.

    :param in_path: The input directory to crawl for parquet files.
    :type in_path: str
    :param out_path: The output directory for links and merged files.
    :type out_path: str
    :param dedup_threshold: The duplication percentage for dictionary
        compression. NOTE(review): not used in this function; retained for
        interface compatibility — confirm whether write_merged_parquet
        should receive it.
    :type dedup_threshold: float
    """
    filedict = {}
    for parquet_file_path in file_crawler.crawl(in_path):
        # Source ID is the third underscore-delimited token of the filename.
        file_source_id = parquet_file_path.name.split('_')[2]
        filedict.setdefault(file_source_id, []).append(parquet_file_path)
    for source_id, source_files in filedict.items():
        # If there is only one file for the sourceid, we just symlink it
        if len(source_files) == 1:
            inpath = source_files[0]
            inpath_new_filename = '_'.join(inpath.name.split('_')[1:])
            # Strip off / pfs /IN_PATH (3 parts)
            stripped_inpath = pathlib.PurePosixPath().joinpath(
                *inpath.parts[3:])
            outpath = pathlib.Path(
                os.path.join(out_path, stripped_inpath.parent,
                             inpath_new_filename))
            if not os.path.exists(outpath.parent):
                log.info(f"{outpath.parent} directory not found, creating")
            # exist_ok avoids the race between the exists() check above and
            # directory creation.
            os.makedirs(outpath.parent, exist_ok=True)
            log.info(f"Linking {inpath} to {outpath}")
            os.symlink(inpath, outpath)
        else:
            write_merged_parquet(inputfiles=source_files,
                                 in_path=in_path,
                                 out_path=out_path)
def group(data_path, location_path, out_path):
    """
    Write event data and location files into output path.

    :param data_path: The path to the data files.
    :type data_path: str
    :param location_path: The path to the location file.
    :type location_path: str
    :param out_path: The path for writing results.
    :type out_path: str
    :return:
    """
    for file_path in file_crawler.crawl(data_path):
        trimmed_path = target_path.trim_path(file_path)
        log.debug(f'trimmed_path: {trimmed_path}')
        # Expected trimmed layout: source_type/year/month/day/source_id/filename
        parts = trimmed_path.parts
        source_type = parts[0]
        year = parts[1]
        month = parts[2]
        day = parts[3]
        source_id = parts[4]
        filename = parts[5]
        # Fixed broken f-string: it previously contained no placeholder and
        # always logged a literal instead of the actual filename.
        log.debug(f'filename: {filename}')
        log.debug(f'source type: {source_type} source_id: {source_id}')
        target_root = os.path.join(out_path, source_type, year, month, day, source_id)
        link_location(location_path, target_root)
        data_target_path = os.path.join(target_root, 'data', filename)
        log.debug(f'data_target_path: {data_target_path}')
        file_linker.link(file_path, data_target_path)
示例#4
0
def get_data_files(data_path, out_path, start_date=None, end_date=None):
    """
    Get the data file path keys between start and end dates.

    :param data_path: The path to the data file directory.
    :type data_path: str
    :param out_path: The path to write results.
    :type out_path: str
    :param start_date: The start date.
    :type start_date: datetime object
    :param end_date: The end date.
    :type end_date: datetime object
    :return: list of data files.
    """
    keys = []
    for crawled in file_crawler.crawl(data_path):
        p = crawled.parts
        # Path layout: .../source_type/year/month/day/location/data_type/file
        source_type = p[3]
        year = p[4]
        month = p[5]
        day = p[6]
        location_name = p[7]
        data_type = p[8]
        filename = p[9]
        # Skip files outside the requested date window.
        if not check_date(year, month, day, start_date, end_date):
            continue
        destination_root = os.path.join(out_path, source_type, year, month,
                                        day, location_name)
        file_linker.link(crawled,
                         os.path.join(destination_root, data_type, filename))
        keys.append('/'.join(['', source_type, year, month, day,
                              location_name]))
    return keys
示例#5
0
def group(calibrated_path, location_path, out_path):
    """
    Write calibrated data and location files into the output path.

    :param calibrated_path: The input path for calibrated files.
    :type calibrated_path: str
    :param location_path: The input path for location files.
    :type location_path: str
    :param out_path: The output path for writing grouped files.
    :type out_path: str
    :return:
    """
    location_linked = False
    for file_path in file_crawler.crawl(calibrated_path):
        parts = file_path.parts
        source_type = parts[3]
        year = parts[4]
        month = parts[5]
        day = parts[6]
        source_id = parts[7]
        data_type = parts[8]
        log.debug(f'year: {year}  month: {month}  day: {day}')
        log.debug(
            f'source type: {source_type} source_id: {source_id} data type: {data_type}'
        )
        target_root = os.path.join(out_path, source_type, year, month, day,
                                   source_id)
        # Link the location files only once, on the first crawled file.
        if not location_linked:
            link_location(location_path, target_root)
            location_linked = True
        # Carry over every directory and file found after the data type.
        target = os.path.join(target_root, data_type, *parts[9:])
        log.debug(f'target: {target}')
        file_linker.link(file_path, target)
示例#6
0
def group(data_path, location_path, out_path):
    """
    Write data and location files into the output path.

    :param data_path: The path to the data files.
    :type data_path: str
    :param location_path: The path to the location files.
    :type location_path: str
    :param out_path: The output path to write grouped files.
    :type out_path: str
    :return:
    """
    for file_path in file_crawler.crawl(data_path):
        parts = file_path.parts
        source_type = parts[3]
        year = parts[4]
        month = parts[5]
        day = parts[6]
        filename = parts[7]
        # Fixed broken f-strings below: they previously contained no
        # placeholder and always logged a literal instead of the filename.
        log.debug(f'data filename: {filename}')
        name = DataFilename(filename)
        source_id = name.source_id()
        log.debug(f'source type: {source_type} source_id: {source_id}')
        log.debug(f'year: {year}  month: {month}  day: {day}')
        log.debug(f'filename: {filename}')
        target_root = os.path.join(out_path, source_type, year, month, day,
                                   source_id)
        link_location(location_path, target_root)
        data_target_path = os.path.join(target_root, 'data', filename)
        log.debug(f'data_target_path: {data_target_path}')
        file_linker.link(file_path, data_target_path)
def group_data(data_path, out_path):
    """
    Write data and event files into output path.

    :param data_path: The path to the data files.
    :type data_path: str
    :param out_path: The output path for writing results.
    :type out_path: str
    :return:
    """
    target_root = None
    for file_path in file_crawler.crawl(data_path):
        # Trimmed layout:
        # year/month/day/group/source_type/location/data_type/filename
        p = target_path.trim_path(file_path).parts
        target_root = os.path.join(out_path, p[0], p[1], p[2], p[3])
        data_target_path = os.path.join(target_root, p[4], p[5], p[6], p[7])
        file_linker.link(file_path, data_target_path)
    return target_root
示例#8
0
def get_empty_file_paths(empty_files_path):
    """
    Get the paths to the collection of empty files.

    :param empty_files_path: The path to the directory containing empty files.
    :type empty_files_path: str
    :return: dict of file paths.
    """
    empty_data_path = None
    empty_flags_path = None
    empty_uncertainty_data_path = None
    for file_path in file_crawler.crawl(empty_files_path):
        parts = pathlib.Path(file_path).parts
        # Drop the first three path components (e.g. the /pfs/REPO prefix);
        # the directory name after that identifies the empty-file type.
        trimmed = parts[3:]
        directory_name = trimmed[1]
        if 'data' == directory_name:
            empty_data_path = file_path
        elif 'flags' == directory_name:
            empty_flags_path = file_path
        elif 'uncertainty_data' == directory_name:
            empty_uncertainty_data_path = file_path
    # Fail fast when a required empty file is missing. Use sys.exit rather
    # than the interactive-only exit() builtin (unavailable under -S or
    # frozen interpreters).
    if empty_data_path is None:
        log.error('Empty data file not found.')
        sys.exit(1)
    if empty_flags_path is None:
        log.error('Empty flags file not found.')
        sys.exit(1)
    if empty_uncertainty_data_path is None:
        log.error('Empty uncertainty data file not found.')
        sys.exit(1)
    return {'empty_data_path': empty_data_path,
            'empty_flags_path': empty_flags_path,
            'empty_uncertainty_data_path': empty_uncertainty_data_path}
def group(path, out_path):
    """
    Link files into the output directory.

    :param path: File or directory paths.
    :type path: str
    :param out_path: The output path for writing results.
    :type out_path: str
    """
    for source_file in file_crawler.crawl(path):
        # Derive the destination from the source path, then link in place.
        destination = target_path.get_path(source_file, out_path)
        log.debug(f'target: {destination}')
        file_linker.link(source_file, destination)
示例#10
0
def link_location(location_path, target_root):
    """
    Link the location file into the target root.

    :param location_path: The location file path.
    :type location_path: str
    :param target_root: The target directory to write the location file.
    :type target_root: str
    :return:
    """
    for location_file in file_crawler.crawl(location_path):
        # Place each crawled file under <target_root>/location/<filename>.
        destination = os.path.join(target_root, 'location',
                                   pathlib.Path(location_file).name)
        file_linker.link(location_file, destination)
示例#11
0
    def pad(self):
        """
        Pad the data for the window size.

        Links each data file into every date of its padded date range
        (computed by padder_util), writes thresholds per destination, and
        records manifest entries per location. Files outside the configured
        sub-directories are linked through unchanged.

        :return:
        """
        try:
            manifests = {}
            manifest_file_names = {}
            for file_path in file_crawler.crawl(self.data_path):
                parts = pathlib.Path(file_path).parts
                year = parts[self.year_index]
                month = parts[self.month_index]
                day = parts[self.day_index]
                location = parts[self.location_index]
                sub_dir = parts[self.sub_dir_index]
                if sub_dir in self.sub_dirs_to_process:
                    location_path = os.path.join(*parts[0:self.location_index + 1])
                    if location not in manifests:
                        manifests[location] = []
                    # get data date
                    date = datetime.date(int(year), int(month), int(day))
                    # get dates in padded range
                    dates_in_padded_range = padder_util.get_dates_in_padded_range(date, self.window_size)
                    # link file into each date in padded range
                    destination_parts = list(parts)
                    # Overwrite the leading path components (after the root)
                    # with the configured output directory components.
                    for index in range(1, len(self.out_dir_parts)):
                        destination_parts[index] = self.out_dir_parts[index]
                    for date_in_padded_range in dates_in_padded_range:
                        # Zero-pad month/day to match the 2-digit path layout.
                        destination_parts[self.year_index] = str(date_in_padded_range.year)
                        destination_parts[self.month_index] = str(date_in_padded_range.month).zfill(2)
                        destination_parts[self.day_index] = str(date_in_padded_range.day).zfill(2)
                        # generate destination path
                        destination_path = os.path.join(*destination_parts)
                        log.debug(f'source: {file_path}')
                        log.debug(f'destination: {destination_path}')
                        file_linker.link(file_path, destination_path)
                        manifests[location].append(date_in_padded_range)
                        if date_in_padded_range == date:
                            # construct manifest filename
                            manifest_path = os.path.dirname(destination_path)  # remove data filename
                            manifest_file_names[location] = os.path.join(manifest_path, 'manifest.txt')
                        output_writer.write_thresholds(location_path, destination_path)
                else:
                    # Not a processed sub-directory: link through unchanged,
                    # keeping path components from index 3 onward.
                    destination_path = os.path.join(self.out_path, *parts[3:len(parts) + 1])
                    file_linker.link(file_path, destination_path)
            output_writer.write_manifests(manifests, manifest_file_names)  # write manifest files
        except Exception:
            # NOTE(review): broad catch logs the failing line but swallows
            # the exception, so callers cannot detect failure — confirm this
            # is intended.
            exception_type, exception_obj, exception_tb = sys.exc_info()
            log.error("Exception at line " + str(exception_tb.tb_lineno) + ": " + str(sys.exc_info()))
def link_location(location_path, target_root):
    """
    Link the location file into the target directory.

    :param location_path: The location file path.
    :type location_path: str
    :param target_root: The target directory path.
    :type target_root: str
    :return:
    """
    for location_file in file_crawler.crawl(location_path):
        filename = pathlib.Path(location_file).name
        location_target_path = os.path.join(target_root, 'location', filename)
        log.debug(f'location_target_path: {location_target_path}')
        file_linker.link(location_file, location_target_path)
def process(source_path, group, out_path):
    """
    Link source files into the output directory with the related location group in the path.
    There must be only one location file under the source path.

    :param source_path: The input path.
    :type source_path: str
    :param group: The group to match in the location files.
    :type group: str
    :param out_path: The output path.
    :type out_path: str
    """
    paths = []
    group_names = []
    for file_path in file_crawler.crawl(source_path):
        # Parse path elements; layout is
        # .../source_type/year/month/day/location/data_type/<remainder...>
        parts = pathlib.Path(file_path).parts
        data_type = parts[8]
        # Record the original file path together with its parsed parts.
        paths.append({
            "file_path": file_path,
            "path_parts": {
                "source_type": parts[3],
                "year": parts[4],
                "month": parts[5],
                "day": parts[6],
                "location": parts[7],
                "data_type": data_type,
                "remainder": parts[9:]  # everything after the data type
            }
        })
        # The location file supplies the location context group name.
        if data_type == 'location':
            group_names = location_file_context.get_matching_items(file_path, group)

    if len(group_names) > 0:
        # Context group name found: link all files into the output directory.
        link(paths, group_names, out_path)
    else:
        # location context group name was not found!
        log.error('No location directory found.')
示例#14
0
def write_ancillary_data(out_dir, root):
    """
    Write any additional files present in the input directory
    beyond data and thresholds into the output directory.

    :param out_dir: The output directory for writing results.
    :type out_dir: str
    :param root: The threshold root directory.
    :type root: str
    :return:
    """
    parent_dir = pathlib.Path(root).parent
    for crawled in file_crawler.crawl(parent_dir):
        path_str = str(crawled)
        # Skip anything already handled as data or thresholds (substring
        # match on the full path, as before).
        if 'data' in path_str or 'threshold' in path_str:
            continue
        relative = os.path.join(*pathlib.Path(path_str).parts[3:])
        file_linker.link(path_str, os.path.join(out_dir, relative))
def group(paths, out_path):
    """
    Link all files into the output directory.

    :param paths: Comma separated list of environment variable names whose values are full directory paths.
    :type paths: str
    :param out_path: The output path for writing results.
    :type out_path: str
    """
    # Always split a string argument: previously a single name without a
    # comma was left as a bare string, so the loop iterated its characters
    # and os.environ lookups failed.
    if isinstance(paths, str):
        paths = paths.split(',')
    log.debug(f'paths: {paths}')
    for p in paths:
        log.debug(f'path: {p}')
        path = os.environ[p]
        for file_path in file_crawler.crawl(path):
            target = target_path.get_path(file_path, out_path)
            log.debug(f'target: {target}')
            file_linker.link(file_path, target)
示例#16
0
def process(source_path, group, out_path):
    """
    Link source files into the output directory with the related location group in the path.
    There must be only one location file under the source path.

    :param source_path: The input path.
    :type source_path: str
    :param group: The group to match in the location files.
    :type group: str
    :param out_path: The output path.
    :type out_path: str
    :return
    """
    paths = []
    group_names = []
    for file_path in file_crawler.crawl(source_path):
        # Parse path elements from the trimmed path:
        # source_type/source_id/data_type/filename
        parts = pathlib.Path(target_path.trim_path(file_path)).parts
        data_type = parts[2]
        # Record the original path together with its parsed parts.
        paths.append({
            "file_path": file_path,
            "path_parts": {
                "source_type": parts[0],
                "source_id": parts[1],
                "data_type": data_type,
                "filename": parts[3]
            }
        })
        # The location file supplies the full group name.
        if data_type == 'location':
            group_names = location_file_context.get_matching_items(file_path, group)

    if len(group_names) > 0:
        link(paths, group_names, out_path)
    else:
        log.error('No location directory found.')
示例#17
0
    def filter(self, in_path, out_path, context):
        """
        Group files in the input directory by context.

        :param in_path: The input path.
        :type in_path: str
        :param out_path: The output path.
        :type out_path: str
        :param context: The context to match.
        :type context: str
        """
        sources = {}
        for file_path in file_crawler.crawl(in_path):
            parts = pathlib.Path(file_path).parts
            source_id = parts[self.source_id_index]
            data_type = parts[self.data_type_index]
            log.debug(f'source_id: {source_id} data_type: {data_type}')
            # Accumulate {data_type: path} entries per source ID.
            sources.setdefault(source_id, []).append({data_type: file_path})
        self.group_sources(sources, context, out_path)
def process(data_path, out_path):
    """
    Load events from the asset data path.

    :param data_path: The data path.
    :type data_path: str
    :param out_path: The output path for writing results.
    :type out_path: str
    :return:
    """
    for file_path in file_crawler.crawl(data_path):
        trimmed_path = target_path.trim_path(file_path)
        # Trimmed layout: source_type/source_id/filename
        parts = trimmed_path.parts
        source_type = parts[0]
        source_id = parts[1]
        filename = parts[2]
        # Fixed broken f-string: it previously contained no placeholder and
        # always logged a literal instead of the source filename.
        log.debug(f'source filename: {filename}')
        log.debug(f'source type: {source_type} source_id: {source_id}')
        output_filename = source_type + '_' + source_id + '_events.json'
        output_path = os.path.join(out_path, source_type, source_id,
                                   output_filename)
        log.debug(f'output_path: {output_path}')
        # Do not overwrite an existing link for the same source.
        if not os.path.exists(output_path):
            file_linker.link(file_path, output_path)
def group_events(event_path, target_root):
    """
    Group the event files into the target directory.

    :param event_path: The path to the event files.
    :type event_path: str
    :param target_root: The root output path.
    :type target_root: str
    :return:
    """
    reference_group = pathlib.Path(target_root).name
    for file_path in file_crawler.crawl(event_path):
        # Trimmed layout: source_type/group/source_id/data_type/filename
        trimmed = pathlib.Path(target_path.trim_path(file_path)).parts
        event_target = os.path.join(target_root, trimmed[0], trimmed[2],
                                    trimmed[3], trimmed[4])
        log.debug(f'event_target: {event_target}')
        # Link only events belonging to the reference group.
        if trimmed[1] == reference_group:
            file_linker.link(file_path, event_target)
示例#20
0
def process_location_files(location_path,
                           keys,
                           out_path,
                           output_directories,
                           empty_data_path,
                           empty_flags_path,
                           empty_uncertainty_data_path,
                           start_date=None,
                           end_date=None):
    """
    Process the location files.

    :param location_path: The path to the location file.
    :type location_path: str
    :param keys: The path keys to the data files.
    :type keys: list
    :param out_path: The path to write results.
    :type out_path: str
    :param output_directories: The output directories to write.
    :type output_directories: list
    :param empty_data_path: Path to the empty data files.
    :type empty_data_path: str
    :param empty_flags_path: Path to the empty flag files.
    :type empty_flags_path: str
    :param empty_uncertainty_data_path: Path to the empty uncertainty data file.
    :type empty_uncertainty_data_path: str
    :param start_date: The start date.
    :type start_date: datetime object
    :param end_date: The end date.
    :type end_date: datetime object
    :return:
    """
    # Map each output directory name to its empty-file placeholder source.
    empty_paths = {
        'data': empty_data_path,
        'flags': empty_flags_path,
        'uncertainty_data': empty_uncertainty_data_path,
    }
    # Set lookup avoids O(n) list scans per file.
    known_keys = set(keys)
    for file_path in file_crawler.crawl(location_path):
        parts = file_path.parts
        source_type = parts[3]
        year = parts[4]
        month = parts[5]
        day = parts[6]
        named_location_name = parts[7]
        filename = parts[8]
        if not check_date(year, month, day, start_date, end_date):
            continue
        target_root = os.path.join(out_path, source_type, year, month, day,
                                   named_location_name)
        # link the location file into the output directory
        location_target = os.path.join(target_root, 'location', filename)
        file_linker.link(file_path, location_target)
        # ensure the calibration directory exists without clobbering content
        # (the previous comment wrongly said a file was created here)
        calibration_target = os.path.join(target_root, 'calibration')
        os.makedirs(calibration_target, exist_ok=True)
        # create key to find corresponding data for the sensor and date
        key = '/' + source_type + '/' + year + '/' + month + '/' + day + '/' + named_location_name
        if key not in known_keys:
            # key not found, create empty directories and files; use the
            # logger rather than print for consistency with the rest of
            # this module
            log.info(f'Key not found {key}')
            for directory in output_directories:
                target_dir = os.path.join(target_root, directory)
                if directory in empty_paths:
                    link_path(target_dir, empty_paths[directory],
                              named_location_name, year, month, day)
                elif directory == 'uncertainty_coef':
                    os.makedirs(target_dir, exist_ok=True)
def convert(in_path, out_path, dedup_threshold):
    """
    Convert .avro files in in_path into .parquet files in out_path.

    :param in_path: The input path for the .avro files.
    :type in_path: str
    :param out_path: The output path to write .parquet files.
    :type out_path: str
    :param dedup_threshold: The duplication percentage for dictionary compression.
    :type dedup_threshold: float
    :return:
    """
    for avro_file_path in file_crawler.crawl(in_path):
        log.info(f"Opening Avro file {avro_file_path}")
        # Abort the whole conversion run on the first non-Avro file.
        if not is_avro(str(avro_file_path)):
            log.error(f"error: {avro_file_path} is not an Avro file")
            sys.exit(1)

        with open(avro_file_path, "rb") as open_file:
            avro_data = reader(open_file)
            # Get the ordered list of field names from the avro schema
            avro_file_schema = avro_data.metadata['avro.schema']
            log.debug(f"avro_file_schema: {avro_file_schema}")
            avro_schema = avro_data.writer_schema
            log.debug(f"avro_schema: {avro_schema}")

            # Read Avro file into Pandas dataframe
            data_frame = pd.DataFrame(
                data=avro_data,
                # Preserve column ordering
                columns=[x['name'] for x in avro_schema['fields']])
            log.debug(f"Data Frame info: {data_frame}")
        # Get a list of columns with hashable types
        # NOTE(review): [x][0] inspects the first row only and assumes the
        # frame is non-empty with a 0 index label — confirm inputs.
        log.debug(f"All Columns: {[x for x in data_frame.columns]}")
        hashable_cols = [
            x for x in data_frame.columns
            if isinstance(data_frame[x][0], Hashable)
        ]
        log.debug(f"Hashable columns from the data_frame: {hashable_cols}")
        # Find columns whose duplication ratio exceeds dedup_threshold for
        # use with dictionary encoding
        dupcols = [
            x.encode('UTF-8') for x in hashable_cols
            if (data_frame[x].duplicated().sum() /
                (int(data_frame[x].size) - 1)) > dedup_threshold
        ]
        log.debug(f"Columns to dedup: {dupcols}")
        # Embed the original Avro schema so downstream readers can recover it.
        table = pa.Table.from_pandas(data_frame).replace_schema_metadata({
            'parquet.avro.schema':
            avro_file_schema,
            'writer.model.name':
            'avro'
        })
        # Mirror the source path (components 3+) under out_path, swapping
        # the extension for .parquet.
        parts = avro_file_path.parts
        parquet_file_path = pathlib.Path(os.path.join(out_path, *parts[3:]))
        parquet_file_path.parent.mkdir(parents=True, exist_ok=True)
        parquet_file_path = os.path.splitext(parquet_file_path)[0] + '.parquet'
        log.info(f"Writing parquet file: {parquet_file_path}")
        pq.write_table(table,
                       parquet_file_path,
                       compression='gzip',
                       use_dictionary=dupcols,
                       compression_level=5,
                       coerce_timestamps='ms',
                       allow_truncated_timestamps=False)