def test_find_input_files_sortkeys(self):
    """Check the sort keys produced when scanning with a sort-key expression.

    Scans ``**/ex*.xls*`` below ``PROJECT_ROOT`` with a single sort-key
    expression and expects the distinct sort keys of the returned entries
    to be exactly ``'2004'`` and ``'2005'``.
    """
    pattern = os.path.join(PROJECT_ROOT, "**/ex*.xls*")
    # NOTE(review): the expression looks like a sed-style /find/replace/flags
    # rule extracting the first run of digits -- confirm against ScanSource.
    source = ScanSource(include=pattern, sort_keys=[r'/.*?(\d+).*/\1/i'])
    found = file_scanner.find_input_files(source)

    observed_keys = {entry.sort_key for entry in found}
    self.assertSetEqual({'2004', '2005'}, observed_keys)
Exemplo n.º 2
0
    def test_find_input_files_deep(self):
        """Check that a recursive ``**/ex*.xlsx`` scan yields one known file.

        Exactly one result is expected, and the last 25 characters of its
        ``filename`` must equal ``'mples/example-B-2004.xlsx'``.
        """
        pattern = os.path.join(PROJECT_ROOT, "**/ex*.xlsx")
        matches = file_scanner.find_input_files(ScanSource(include=pattern))
        self.assertEqual(len(matches), 1)

        # Only the tail is compared so the check does not depend on where
        # PROJECT_ROOT lives on disk.
        filename_tail = matches[0].filename[-25:]
        self.assertEqual('mples/example-B-2004.xlsx', filename_tail)
Exemplo n.º 3
0
def __to_scan_source(
        input_value: Union[str, ScanSource, Iterable]) -> Iterable[ScanSource]:
    """Normalize ``input_value`` into a flat list of scan sources.

    * A ``str`` is wrapped as ``ScanSource(input_value)``.
    * Any other iterable is walked element by element, each element is
      normalized recursively, and the results are concatenated in order.
    * Anything else is returned unchanged inside a one-element list.
    """
    # Strings are iterable too, so they must be handled before the generic
    # iterable case to avoid recursing character by character.
    if isinstance(input_value, str):
        return [ScanSource(input_value)]

    if not isinstance(input_value, Iterable):
        return [input_value]

    return [
        source
        for element in input_value
        for source in __to_scan_source(element)
    ]
def process_report(match_input: Union[Iterable[MatchInput], pd.DataFrame, str],
                   data_sources: List[SourceConfig]):
    """Turn a match report into matched sheets/columns plus unmatched sheets.

    Each report row is handled according to which of its fields are set:

    * ``sheetname is None``: the row's ``filename`` (and optional
      ``sort_key``) is scanned for files, and every worksheet found is
      matched against ``data_sources`` via ``matcher.match_data_sources``.
    * ``sheetname`` set, but no ``column_name`` on any row sharing the same
      ``(filename, sort_key, sheetname, table)`` key, and ``table`` set: the
      sheet is paired with the data source named ``table`` and its columns
      are left to ``matcher.match_columns``.
    * ``column_name`` and ``header_name`` both set: the row is an explicit
      column/header mapping and is resolved directly.

    Args:
        match_input: An iterable of ``MatchInput`` rows, or a ``str`` /
            ``pandas.DataFrame`` that is first converted with
            ``parse_report``.
        data_sources: Source configurations, looked up by their ``name``
            against each row's ``table``.

    Returns:
        A tuple ``(sheet_with_headers, unmatched_list)``: the result of
        ``matcher.match_columns`` extended with one ``SheetWithHeaders`` per
        explicitly mapped key, and the unmatched worksheets accumulated from
        ``matcher.match_data_sources``.

    Raises:
        StopIteration: If a referenced sheet name, table name, column name
            or header value has no matching entry (behavior retained from
            the ``next(...)`` based lookups).
    """
    if isinstance(match_input, (str, pd.DataFrame)):
        match_input = parse_report(match_input)

    files_to_scan = set()
    columns_per_table = dict()
    mapping_dict: Dict[Tuple, List[MatchInput]] = dict()

    for entry in match_input:
        key = (entry.filename, entry.sort_key, entry.sheetname, entry.table)

        if entry.sheetname is None:
            # No sheet given: collect the files that still need scanning.
            sort_keys = None
            if entry.sort_key is not None:
                sort_keys = [f'/.*/{entry.sort_key}/']
            scan_source = ScanSource(include=entry.filename,
                                     sort_keys=sort_keys)
            files_to_scan.update(file_scanner.find_input_files(scan_source))
        else:
            # Track which columns are listed per table so that tables with
            # no listed columns can be detected afterwards.
            columns = columns_per_table.setdefault(key, set())
            if entry.column_name is not None:
                columns.add(entry.column_name)

        if entry.column_name is not None and entry.header_name is not None:
            mapping_dict.setdefault(key, []).append(entry)

    matched_list: List[MatchedSheet] = []
    unmatched_list: List[WorkSheetDetail] = []

    for file in files_to_scan:
        worksheets = workbook_util.find_worksheets(file)
        # Match datasources based on configuration
        matched, unmatched = matcher.match_data_sources(
            worksheets, data_sources)
        matched_list.extend(matched)
        unmatched_list.extend(unmatched)

    for key, columns in columns_per_table.items():
        file, sort_key, sheetname, table = key
        if columns or table is None:
            continue
        # Table named but no columns listed: pair the sheet with its source
        # configuration and leave column matching to the matcher below.
        worksheet, source_config = _lookup_sheet_and_config(
            file, sort_key, sheetname, table, data_sources)
        matched_list.append(
            MatchedSheet(sheet_detail=worksheet,
                         source_config=source_config))

    # Match headers to column configuration
    sheet_with_headers: List[SheetWithHeaders] = matcher.match_columns(
        matched_list)

    for key, mapping_list in mapping_dict.items():
        file, sort_key, sheetname, table = key
        sheet_detail, source_config = _lookup_sheet_and_config(
            file, sort_key, sheetname, table, data_sources)
        sheet = MatchedSheet(sheet_detail=sheet_detail,
                             source_config=source_config)

        column_list: List[MatchedColumn] = []
        for mapping in mapping_list:
            column_config = next(c for c in source_config.columns
                                 if c.name == mapping.column_name)
            header_config = next(h for h in sheet_detail.headers
                                 if h.value == mapping.header_name)
            column_list.append(
                MatchedColumn(column=column_config, header=header_config))

        sheet_with_headers.append(
            SheetWithHeaders(sheet=sheet,
                             columns=column_list,
                             unmatched_columns=[]))

    return sheet_with_headers, unmatched_list


def _lookup_sheet_and_config(file, sort_key, sheetname, table, data_sources):
    """Return the first worksheet named ``sheetname`` in ``file`` and the
    first data source named ``table``; raises ``StopIteration`` if either
    is missing."""
    worksheet_list = workbook_util.find_worksheets(
        FileSource(file, sort_key=sort_key))
    sheet_detail = next(w for w in worksheet_list if w.sheetname == sheetname)
    source_config = next(d for d in data_sources if d.name == table)
    return sheet_detail, source_config
 def test_find_input_files_empty(self):
     """Scanning for a path that does not exist must return an empty list."""
     missing_path = os.path.join(PROJECT_ROOT, "oh-no-I-do-not-exist.xlsx")
     found = file_scanner.find_input_files(ScanSource(include=missing_path))
     self.assertEqual(found, [])
 def test_find_input_files_multiext(self):
     """A ``**/ex*.xls*`` scan is expected to return exactly two entries."""
     pattern = os.path.join(PROJECT_ROOT, "**/ex*.xls*")
     found = file_scanner.find_input_files(ScanSource(include=pattern))
     self.assertEqual(2, len(found))
Exemplo n.º 7
0
 def test_find_input_files_sortkeys(self):
     """The first result of a sort-keyed scan must carry sort key '2004'."""
     pattern = os.path.join(PROJECT_ROOT, "**/ex*.xls*")
     # NOTE(review): the expression looks like a sed-style /find/replace/flags
     # rule extracting the first run of digits -- confirm against ScanSource.
     source = ScanSource(include=pattern, sort_keys=[r'/.*?(\d+).*/\1/i'])
     found = file_scanner.find_input_files(source)

     first_entry = found[0]
     self.assertEqual('2004', first_entry.sort_key)