def check_headers(self, cells, sample):
    """Make sure every header cell is bound to a field.

    Cells that already carry a ``'field'`` key are left untouched. For the
    remaining cells the outcome depends on ``self.__infer_fields``:

    * truthy -- a one-column sample is collected for the cell and a field is
      inferred from it and stored under ``cell['field']``;
    * falsy -- an ``'extra-header'`` error is recorded and the cell is
      removed from ``cells`` (the list is modified in place).

    Returns the list of errors created during the check.
    """
    errors = []

    # Walk over a shallow copy because `cells` itself may shrink below.
    for cell in copy(cells):
        if 'field' in cell:
            continue

        # No inference allowed: report the header and drop the column.
        if not self.__infer_fields:
            errors.append(Error('extra-header', cell))
            cells.remove(cell)
            continue

        # Collect this column's values; rows too short for it contribute None.
        column_sample = []
        for row in sample:
            number = cell['column-number']
            value = row[number - 1] if len(row) >= number else None
            column_sample.append([value])

        schema = Schema()
        schema.infer(column_sample, headers=[cell.get('header')])
        cell['field'] = schema.fields[0]

    return errors
def test_infer():
    """Inferring from rows with a header line yields the expected descriptor."""
    rows = [
        ['id', 'age', 'name'],
        ['1', '39', 'Paul'],
        ['2', '23', 'Jimmy'],
        ['3', '36', 'Jane'],
        ['4', 'N/A', 'Judy'],
    ]
    expected = {
        'fields': [
            {'format': 'default', 'name': 'id', 'type': 'integer'},
            {'format': 'default', 'name': 'age', 'type': 'integer'},
            {'format': 'default', 'name': 'name', 'type': 'string'},
        ],
        'missingValues': [''],
    }

    schema = Schema()
    schema.infer(rows)

    assert schema.descriptor == expected
def test_infer():
    """Inference with default settings, a custom confidence and a custom guesser."""

    def make_rows(last_age):
        # Build a brand-new list on every call so each inference below
        # works on its own data.
        return [
            ['id', 'age', 'name'],
            ['1', '39', 'Paul'],
            ['2', '23', 'Jimmy'],
            ['3', '36', 'Jane'],
            ['4', last_age, 'Judy'],
        ]

    def make_fields(id_type, age_type, name_type):
        return [
            {'format': 'default', 'name': 'id', 'type': id_type},
            {'format': 'default', 'name': 'age', 'type': age_type},
            {'format': 'default', 'name': 'name', 'type': name_type},
        ]

    # Default settings: the single 'N/A' age is expected not to prevent
    # the column from being reported as integer.
    schema = Schema()
    schema.infer(make_rows('N/A'))
    assert schema.descriptor == {
        'fields': make_fields('integer', 'integer', 'string'),
        'missingValues': [''],
    }

    # With confidence=0.8 the same data is expected to give a string column.
    schema = Schema()
    schema.infer(make_rows('N/A'), confidence=0.8)
    assert schema.descriptor == {
        'fields': make_fields('integer', 'string', 'string'),
        'missingValues': [''],
    }

    # Custom guesser that proposes 'string' for every value.
    class AllStrings():
        def cast(self, value):
            return [('string', 'default', 0)]

    schema = Schema()
    schema.infer(make_rows('100'), confidence=0.8, guesser_cls=AllStrings)
    all_strings = make_fields('string', 'string', 'string')
    assert schema.descriptor['fields'] == all_strings
    assert schema.descriptor == {
        'fields': all_strings,
        'missingValues': [''],
    }
def __inspect_table(self, table):
    """Run the compiled checks against one table and build its report.

    ``table`` is a mapping with the keys ``'source'``, ``'stream'``,
    ``'schema'`` and ``'extra'``; an optional ``'checks'`` entry overrides
    the inspector-level checks.

    The inspection runs in stages (open stream, prepare schema, head
    checks, body checks, table checks). A fatal error in one stage is
    recorded in the report and skips the stages that follow.

    Returns a ``(warnings, report)`` tuple where ``warnings`` is a list of
    messages and ``report`` is a copy of ``table['extra']`` extended with
    the inspection results.
    """
    started_at = datetime.datetime.now()

    # Inspection state
    errors = []
    warnings = []
    headers = []
    row_number = 0
    fatal_error = False

    # Table parts
    source = table['source']
    stream = table['stream']
    schema = table['schema']
    extra = table['extra']

    checks = registry.compile_checks(
        table.get('checks', self.__checks),
        self.__skip_checks,
        order_fields=self.__order_fields,
        infer_fields=self.__infer_fields)

    # Stage 1: open the stream, settle headers and (optionally) infer schema
    try:
        stream.open()
        sample = stream.sample
        headers = stream.headers
        if headers is None:
            # No header row: one placeholder per column of the first sample row
            headers = [None] * len(sample[0]) if sample else []
        if (_filter_checks(checks, type='schema')
                and schema is None and self.__infer_schema):
            schema = Schema()
            schema.infer(sample, headers=headers)
        if schema is None:
            # Without a schema only the non-schema checks are kept
            checks = _filter_checks(checks, type='schema', inverse=True)
    except Exception as exc:
        fatal_error = True
        errors.append(_compose_error_from_exception(exc))

    # Stage 2: flag primary key fields and surface schema errors
    if not fatal_error and schema:
        if schema.primary_key:
            for descriptor in schema.descriptor.get('fields', []):
                if descriptor.get('name') in schema.primary_key:
                    descriptor['primaryKey'] = True
            schema.commit()
        for schema_error in schema.errors:
            fatal_error = True
            errors.append(_compose_error_from_schema_error(schema_error))

    # Stages 3-5 share one guard: nothing between them changes `fatal_error`
    if not fatal_error:

        # Stage 3: header cells (header paired with its field, by position)
        if schema is not None:
            fields = schema.fields
        else:
            fields = [None] * len(headers)
        cells = []
        pairs = zip_longest(headers, fields, fillvalue=_FILLVALUE)
        for number, (header, field) in enumerate(pairs, start=1):
            cell = {'number': number}
            if header is not _FILLVALUE:
                cell['header'] = header
                cell['value'] = header
            if field is not _FILLVALUE:
                cell['field'] = field
            cells.append(cell)

        # Stage 4: head checks, only when every header is known
        if None not in headers:
            for check in _filter_checks(checks, context='head'):
                if not cells:
                    break
                handler = getattr(
                    check['func'], 'check_headers', check['func'])
                handler(errors, cells, sample)
            for found in errors:
                found['row'] = None

        # Stage 5: body checks, row by row
        cellmap = {cell['number']: cell for cell in cells}
        body_checks = _filter_checks(checks, context='body')
        with stream:
            extended_rows = stream.iter(extended=True)
            while True:
                try:
                    row_number, _, row = next(extended_rows)
                except StopIteration:
                    break
                except Exception as exc:
                    fatal_error = True
                    errors.append(_compose_error_from_exception(exc))
                    break

                # Row cells reuse header/field from the header cell map
                cells = []
                pairs = zip_longest(headers, row, fillvalue=_FILLVALUE)
                for number, (header, value) in enumerate(pairs, start=1):
                    reference = cellmap.get(number, {})
                    cell = {'number': number}
                    if header is not _FILLVALUE:
                        cell['header'] = reference.get('header', header)
                    if 'field' in reference:
                        cell['field'] = reference['field']
                    if value is not _FILLVALUE:
                        cell['value'] = value
                    cells.append(cell)

                for check in body_checks:
                    if not cells:
                        break
                    handler = getattr(
                        check['func'], 'check_row', check['func'])
                    handler(errors, cells, row_number)

                # Attach the row to the trailing errors that have none yet
                for found in reversed(errors):
                    if 'row' in found:
                        break
                    found['row'] = row

                if row_number >= self.__row_limit:
                    warnings.append(
                        'Table "%s" inspection has reached %s row(s) limit' %
                        (source, self.__row_limit))
                    break
                if len(errors) >= self.__error_limit:
                    warnings.append(
                        'Table "%s" inspection has reached %s error(s) limit' %
                        (source, self.__error_limit))
                    break

    # Stage 6: table-level checks (body checks may have raised the flag)
    if not fatal_error:
        for check in checks:
            handler = getattr(check['func'], 'check_table', None)
            if handler:
                handler(errors)

    finished_at = datetime.datetime.now()

    # Compose report
    if None in headers:
        headers = None
    errors = _sort_errors(errors[:self.__error_limit])
    report = copy(extra)
    report.update({
        'time': round((finished_at - started_at).total_seconds(), 3),
        'valid': not bool(errors),
        'error-count': len(errors),
        'row-count': row_number,
        'source': source,
        'headers': headers,
        'scheme': stream.scheme,
        'format': stream.format,
        'encoding': stream.encoding,
        'schema': 'table-schema' if schema else None,
        'errors': errors,
    })

    return warnings, report
def getSchema(self):
    """Infer a schema from ``self.data``.

    Returns whatever ``Schema.infer`` returns for that data.
    """
    return Schema().infer(self.data)
class TableExtractor(AbstractExtractor):
    """
    Extracts tables from HTML as structured content and plain text.

    The extractor receives one parse event per ``extract`` call and keeps
    the state of the table being read between calls. A table opened inside
    another table pushes the outer table's state on a stack and restores it
    when the inner table ends.
    """

    def __init__(self):
        # Cells of the row being read, as ('th' | 'td', text) pairs.
        self.__current_table_row = []
        # Text gathered since the last cell / row boundary.
        self.__current_text = ''
        # Where in the table markup we currently are.
        self.__is_table = False
        self.__is_table_head = False
        self.__is_table_body = False
        # Structured record of the table being read and of its ancestors.
        self.__table_content = None
        self.__table_stack = []
        self.__table_index = 1
        # State of the anchor being read, if any.
        self.__is_anchor = False
        self.__anchor_text = ''
        self.__anchor_url = None
        self.schema = Schema()

    def extract(self,
                el,
                ev,
                structured_content: List[Dict[str, Any]],
                text_list: List[str],
                nlp=None):
        """
        Process a single parse event.

        :param el: element the event refers to; its ``tag``, ``text``,
            ``tail`` and ``get('href')`` are used.
        :param ev: event name; ``'start'`` and ``'end'`` are handled.
        :param structured_content: receives finished table records and the
            links found inside tables.
        :param text_list: receives one text entry per non-empty table row.
        :param nlp: not used by this extractor.
        """
        tag = el.tag

        if tag == 'table':
            if ev == 'start':
                self.__begin_table()
            elif ev == 'end':
                self.__end_table(structured_content)
            return

        # Everything else only matters while inside a table.
        if not self.__is_table:
            return

        # noinspection SpellCheckingInspection
        if tag == 'thead':
            if ev == 'start':
                self.__is_table_head = True
                self.__is_table_body = False
        elif tag == 'tbody':
            if ev == 'start':
                self.__is_table_head = False
                self.__is_table_body = True
        elif tag == 'tr':
            if ev == 'end':
                self.__end_row(text_list)
        elif tag == 'th':
            if ev == 'end':
                self.__end_cell('th')
        elif tag == 'td':
            if ev == 'end':
                self.__end_cell('td')
        elif tag == 'a':
            if ev == 'start':
                self.__begin_anchor(el)
            elif ev == 'end' and self.__is_anchor:
                self.__end_anchor(structured_content)

        # Accumulate the text that belongs to this event: the element's own
        # text on 'start', the text following it on 'end'.
        if ev == 'start':
            piece = el.text
        elif ev == 'end':
            piece = el.tail
        else:
            piece = None
        if piece:
            self.__current_text += piece
            if self.__is_anchor:
                self.__anchor_text += piece

    def __begin_table(self):
        """Start a new table, stacking the enclosing table's state if nested."""
        if self.__is_table:
            # Leave a '{table:N}' placeholder in the outer table's text and
            # remember the reference on the outer table record.
            ref = 'table:{}'.format(self.__table_index)
            self.__current_text += f'{{{ref}}} '
            self.__table_content.setdefault('references', []).append(ref)
            self.__table_stack.append(
                (self.__current_table_row, self.__current_text,
                 self.__is_table_head, self.__is_table_body,
                 self.__table_content))
        self.__current_table_row = []
        self.__current_text = ''
        self.__is_table = True
        self.__is_table_head = False
        self.__is_table_body = False
        self.__table_content = {
            'type': 'table',
            'index': self.__table_index,
            'head': [],
            'body': []
        }
        self.__table_index += 1

    def __end_table(self, structured_content):
        """Finish the current table, infer its fields and emit its record."""
        table = self.__table_content
        rows = table['body']
        if rows:
            head_rows = table['head']
            if head_rows:
                # NOTE(review): table['head'] is a list of rows (each one a
                # list of cell texts), so `headers` receives a list of lists
                # here -- confirm this is what Schema.infer expects.
                fields = self.schema.infer(rows, headers=head_rows)['fields']
            else:
                # No explicit head: infer with placeholder names first.
                head = rows[0]
                headers = ['name%d' % n for n in range(1, len(head) + 1)]
                fields = self.schema.infer(rows, headers=headers)['fields']
                if len(rows) > 1:
                    # If any first-row value does not match the type inferred
                    # for its column, treat the first row as the head.
                    dtypes = [field['type'] for field in fields]
                    mismatches = [
                        typ != guess_type(val)
                        for typ, val in zip(dtypes, head)
                    ]
                    if any(mismatches):
                        table['head'] = [head]
                        table['body'] = table['body'][1:]
                        for field, name in zip(fields, head):
                            field['name'] = name
            table['fields'] = fields
        structured_content.append(table)

        if self.__table_stack:
            # Back to the enclosing table.
            (self.__current_table_row, self.__current_text,
             self.__is_table_head, self.__is_table_body,
             self.__table_content) = self.__table_stack.pop()
        else:
            # Outermost table finished: reset everything.
            self.__is_table_body = False
            self.__is_table_head = False
            self.__is_table = False
            self.__current_text = ''
            self.__current_table_row = []
            self.__table_content = None
            self.__table_index = 1

    def __end_row(self, text_list):
        """Store the finished row in the head or body and emit its text."""
        if self.__is_current_table_row_not_empty():
            values = [v for _, v in self.__current_table_row]
            # NOTE(review): r'\t' is a raw string, so cells are joined with a
            # literal backslash followed by 't', not a tab -- confirm intended.
            text_list.append(strip_link_markers(r'\t'.join(values)))
            goes_to_body = (not self.__is_table_head) and (
                self.__is_table_body or not self.__is_header_row())
            if goes_to_body:
                self.__table_content['body'].append(values)
                self.__is_table_head = False
                self.__is_table_body = True
            else:
                self.__table_content['head'].append(values)
        self.__current_text = ''
        self.__current_table_row = []

    def __end_cell(self, kind):
        """Append the gathered text as a cell of the given kind."""
        self.__current_table_row.append(
            (kind, clean_text(self.__current_text)))
        self.__current_text = ''

    def __begin_anchor(self, el):
        """Open a link if the anchor element has a non-empty href."""
        anchor_url = el.get('href')
        if anchor_url:
            self.__is_anchor = True
            self.__current_text += LINK_OPEN_MARKER
            self.__anchor_url = anchor_url

    def __end_anchor(self, structured_content):
        """Close the open link, or drop its marker when it has no text."""
        self.__is_anchor = False
        if self.__anchor_text.strip():
            self.__current_text += LINK_CLOSE_MARKER
            if self.__anchor_url and self.__anchor_text:
                structured_content.append({
                    'type': 'link',
                    'url': self.__anchor_url,
                    'text': self.__anchor_text
                })
        else:
            # Cut the text back to the last open marker and add a space.
            n = self.__current_text.rfind(LINK_OPEN_MARKER)
            self.__current_text = self.__current_text[:n] + ' '
        self.__anchor_url = None
        self.__anchor_text = ''

    def __is_current_table_row_not_empty(self) -> bool:
        """Tell whether at least one cell of the current row has text."""
        return any(v for _, v in self.__current_table_row)

    def __is_header_row(self) -> bool:
        """Tell whether every cell of the current row is a 'th' cell."""
        return all(k == 'th' for k, _ in self.__current_table_row)
def test_schema_infer_with_non_headers_issues_goodtables_258():
    """A ``None`` header is expected to produce the field name 'field1'."""
    rows = [[1], [2], [3]]
    expected_names = ['field1']

    schema = Schema()
    schema.infer(rows, headers=[None])

    assert schema.field_names == expected_names