def test_find_input_files_sortkeys(self):
    """Scanning with a sort-key expression yields the keys '2004' and '2005'.

    The include pattern is a ``**/ex*.xls*`` glob under PROJECT_ROOT and the
    single sort-key expression captures a run of digits.
    """
    pattern = os.path.join(PROJECT_ROOT, "**/ex*.xls*")
    # NOTE(review): the expression looks like a sed-style
    # /pattern/replacement/flags rule -- confirm against file_scanner.
    source = ScanSource(include=pattern, sort_keys=[r'/.*?(\d+).*/\1/i'])

    found = file_scanner.find_input_files(source)

    # Collapse to a set so the assertion ignores result order and duplicates.
    observed_keys = set(item.sort_key for item in found)
    self.assertSetEqual({'2004', '2005'}, observed_keys)
def test_find_input_files_deep(self):
    """A ``**/ex*.xlsx`` pattern under PROJECT_ROOT finds exactly one file."""
    pattern = os.path.join(PROJECT_ROOT, "**/ex*.xlsx")
    found = file_scanner.find_input_files(ScanSource(include=pattern))

    self.assertEqual(len(found), 1)

    only_file = found[0]
    # Only the last 25 characters are compared; the leading part of the
    # filename depends on PROJECT_ROOT.
    filename_tail = only_file.filename[-25:]
    self.assertEqual('mples/example-B-2004.xlsx', filename_tail)
def __to_scan_source(
        input_value: Union[str, ScanSource, Iterable]) -> Iterable[ScanSource]:
    """Normalise ``input_value`` into a flat list of scan sources.

    * A string is wrapped as ``ScanSource(input_value)``.
    * Any other iterable is flattened recursively, element by element.
    * Anything else is returned unchanged inside a one-element list
      (presumably an existing ``ScanSource`` -- no type check is performed).
    """
    # Strings must be handled before the generic iterable case, otherwise
    # they would be walked character by character.
    if isinstance(input_value, str):
        return [ScanSource(input_value)]

    if not isinstance(input_value, Iterable):
        return [input_value]

    flattened: List[ScanSource] = []
    for element in input_value:
        flattened.extend(__to_scan_source(element))
    return flattened
def process_report(match_input: Union[Iterable[MatchInput], pd.DataFrame, str],
                   data_sources: List[SourceConfig]):
    """Turn a match report into sheets paired with their column mappings.

    ``match_input`` may be a string or a DataFrame, in which case it is first
    converted with ``parse_report``; otherwise it is iterated as-is.

    Each report entry is handled as follows:

    * No ``sheetname``: the entry's ``filename`` is used as an include
      pattern for ``file_scanner.find_input_files`` and every file found is
      matched against ``data_sources`` via ``matcher.match_data_sources``.
    * ``sheetname`` present: the entry's column name (if any) is recorded for
      its (filename, sort_key, sheetname, table) key. Tables that end up with
      no column names but do have a ``table`` are paired with the data source
      of that name and passed to ``matcher.match_columns``.
    * Both ``column_name`` and ``header_name`` present: the header is mapped
      to the column explicitly and appended as an extra ``SheetWithHeaders``.

    Returns:
        A tuple ``(sheet_with_headers, unmatched_list)``: the result of
        ``matcher.match_columns`` extended with the explicit mappings, and
        the sheets ``matcher.match_data_sources`` reported as unmatched.

    Raises:
        StopIteration: when a sheet, data source, column or header named in
            the report cannot be found (lookups use ``next`` with no default).
    """
    if isinstance(match_input, (str, pd.DataFrame)):
        match_input = parse_report(match_input)

    def table_key_of(entry):
        # Identity of one table inside one sheet of one file.
        return (entry.filename, entry.sort_key, entry.sheetname, entry.table)

    def sheet_named(filename, sort_key, sheetname):
        # First worksheet of the given file whose name matches exactly.
        candidates = workbook_util.find_worksheets(
            FileSource(filename, sort_key=sort_key))
        return next(iter([s for s in candidates if s.sheetname == sheetname]))

    def config_named(table):
        # First data source whose name matches exactly.
        return next(iter([c for c in data_sources if c.name == table]))

    pending_files = set()
    columns_by_table = {}
    mappings_by_table: Dict[Tuple, List[MatchInput]] = {}

    for entry in match_input:
        if entry.sheetname is None:
            # Entry names a file (pattern) that has not been scanned yet.
            sort_keys = (None if entry.sort_key is None
                         else [f'/.*/{entry.sort_key}/'])
            pending_files.update(
                file_scanner.find_input_files(
                    ScanSource(include=entry.filename, sort_keys=sort_keys)))
        else:
            # Track which columns are listed per table, so tables without
            # any listed column can be detected afterwards.
            named_columns = columns_by_table.setdefault(
                table_key_of(entry), set())
            if entry.column_name is not None:
                named_columns.add(entry.column_name)

        if entry.column_name is None or entry.header_name is None:
            continue
        mappings_by_table.setdefault(table_key_of(entry), []).append(entry)

    matched_sheets: List[MatchedSheet] = []
    unmatched_sheets: List[WorkSheetDetail] = []

    # Match data sources based on configuration for every unscanned file.
    for source_file in pending_files:
        found_sheets = workbook_util.find_worksheets(source_file)
        hits, misses = matcher.match_data_sources(found_sheets, data_sources)
        matched_sheets.extend(hits)
        unmatched_sheets.extend(misses)

    # Tables that were named in the report but list no columns.
    for table_key, named_columns in columns_by_table.items():
        filename, sort_key, sheetname, table = table_key
        if named_columns or table is None:
            continue
        sheet_detail = sheet_named(filename, sort_key, sheetname)
        source_config = config_named(table)
        matched_sheets.append(
            MatchedSheet(sheet_detail=sheet_detail,
                         source_config=source_config))

    # Match headers to column configuration.
    sheet_with_headers: List[SheetWithHeaders] = matcher.match_columns(
        matched_sheets)

    # Append the header/column pairs that the report spells out explicitly.
    for table_key, mappings in mappings_by_table.items():
        filename, sort_key, sheetname, table = table_key
        sheet_detail = sheet_named(filename, sort_key, sheetname)
        source_config = config_named(table)
        sheet = MatchedSheet(sheet_detail=sheet_detail,
                             source_config=source_config)

        matched_columns: List[MatchedColumn] = []
        for mapping in mappings:
            column_config = next(iter([
                c for c in source_config.columns
                if c.name == mapping.column_name
            ]))
            header_config = next(iter([
                h for h in sheet_detail.headers
                if h.value == mapping.header_name
            ]))
            matched_columns.append(
                MatchedColumn(column=column_config, header=header_config))

        sheet_with_headers.append(
            SheetWithHeaders(sheet=sheet,
                             columns=matched_columns,
                             unmatched_columns=[]))

    return sheet_with_headers, unmatched_sheets
def test_find_input_files_empty(self):
    """An include path that matches nothing yields an empty list."""
    missing_path = os.path.join(PROJECT_ROOT, "oh-no-I-do-not-exist.xlsx")
    source = ScanSource(include=missing_path)

    found = file_scanner.find_input_files(source)

    self.assertEqual(found, [])
def test_find_input_files_multiext(self):
    """The wildcard extension ``.xls*`` is expected to match two files."""
    pattern = os.path.join(PROJECT_ROOT, "**/ex*.xls*")
    source = ScanSource(include=pattern)

    found = file_scanner.find_input_files(source)

    self.assertEqual(2, len(found))
def test_find_input_files_sortkeys(self):
    """The first file returned carries the sort key '2004'.

    The single sort-key expression captures a run of digits.
    """
    pattern = os.path.join(PROJECT_ROOT, "**/ex*.xls*")
    # NOTE(review): the expression looks like a sed-style
    # /pattern/replacement/flags rule -- confirm against file_scanner.
    source = ScanSource(include=pattern, sort_keys=[r'/.*?(\d+).*/\1/i'])

    found = file_scanner.find_input_files(source)

    first_file = found[0]
    self.assertEqual('2004', first_file.sort_key)