def test_prune(self):
    """Check that ``IngestInfo.prune()`` returns the expected trimmed tree.

    The input mixes objects built with no arguments (a person, a booking,
    an arrest, a charge, bonds, a sentence, a hold) with objects carrying
    a value (``is_life='False'`` on a sentence, ``hold_id=1`` on a hold).
    The expected result contains only the person -> booking -> charge ->
    sentence chain that carries the value, plus both holds, each with
    ``jurisdiction_name='UNSPECIFIED'``.
    """
    # --- Input tree -------------------------------------------------------
    bare_person = ingest_info.Person()
    bare_booking = ingest_info.Booking()
    bare_arrest = ingest_info.Arrest()
    input_charges = [
        # No fields at all.
        ingest_info.Charge(),
        # Children present, but both built with no arguments.
        ingest_info.Charge(
            bond=ingest_info.Bond(),
            sentence=ingest_info.Sentence(),
        ),
        # The only charge whose sentence carries a value.
        ingest_info.Charge(
            bond=ingest_info.Bond(),
            sentence=ingest_info.Sentence(is_life='False'),
        ),
    ]
    input_holds = [ingest_info.Hold(), ingest_info.Hold(hold_id=1)]
    populated_booking = ingest_info.Booking(
        arrest=bare_arrest,
        charges=input_charges,
        holds=input_holds,
    )
    populated_person = ingest_info.Person(
        bookings=[bare_booking, populated_booking],
    )
    ii = IngestInfo(people=[bare_person, populated_person])

    # --- Expected tree ----------------------------------------------------
    kept_charge = ingest_info.Charge(
        sentence=ingest_info.Sentence(is_life='False'),
    )
    kept_holds = [
        ingest_info.Hold(jurisdiction_name='UNSPECIFIED'),
        ingest_info.Hold(hold_id=1, jurisdiction_name='UNSPECIFIED'),
    ]
    kept_booking = ingest_info.Booking(charges=[kept_charge],
                                       holds=kept_holds)
    expected = IngestInfo(
        people=[ingest_info.Person(bookings=[kept_booking])],
    )

    self.assertEqual(ii.prune(), expected)
def extract_and_populate_data(self,
                              content: Union[Dict, List],
                              ingest_info: IngestInfo = None) -> IngestInfo:
    """Populate an ingest data model from already-parsed JSON content.

    This function does all the work of taking the user's yaml file and
    content and returning a populated data class. It iterates through
    every field in the object and builds a model based on the keys that
    it sees.

    Args:
        content: An already parsed JSON object or array.
        ingest_info: An IngestInfo object to use; if None, a new one is
            created by default.

    Returns:
        A populated ingest data model for a scrape (the result of calling
        ``prune()`` on the IngestInfo that was filled in).
    """
    if ingest_info is None:
        ingest_info = IngestInfo()

    # A fresh defaultdict(set) is handed to _extract on every call, so no
    # bookkeeping state is shared between separate extractions.
    self._extract(content, ingest_info, defaultdict(set))
    return ingest_info.prune()
def extract_and_populate_data(self,
                              content: Union[str, Iterable[str]],
                              ingest_info: IngestInfo = None) -> IngestInfo:
    """Populate an ingest data model from CSV text.

    Takes the user's yaml file and the supplied content and returns a
    populated data class, building the model from the keys it sees while
    iterating over the fields.

    Args:
        content: CSV-formatted text. Either a string with the full file
            contents, or an Iterable where each element is a single line
            of contents. Not a file object.
        ingest_info: An IngestInfo object to use; if None, a new one is
            created by default.

    Returns:
        A populated ingest data model for a scrape.
    """
    target = IngestInfo() if ingest_info is None else ingest_info

    self._extract(content, target)
    # File-level post hooks run once, after extraction has finished.
    self._run_file_post_hooks(target)

    return target.prune()
def extract_and_populate_data(
        self,
        content: HtmlElement,
        ingest_info: IngestInfo = None,
        search_for_keys: bool = True,
) -> IngestInfo:
    """Populate an ingest data model from parsed HTML.

    Takes the user's yaml file and the supplied content and returns a
    populated data class. Every cell found on the page is visited and the
    model is built from the keys that are recognised.

    Args:
        content: An already parsed html data structure.
        ingest_info: An IngestInfo object to use; if None, a new one is
            created by default.
        search_for_keys: Flag to allow searching for keys outside of table
            cells (<td> and <tr> elements).

    Returns:
        A populated ingest data model for a scrape.
    """
    # Pre-processing is applied to a deep copy of `content`, not to the
    # object the caller passed in.
    html_tree = copy.deepcopy(content)
    HtmlDataExtractor._process_html(html_tree)
    self._set_all_cells(html_tree, search_for_keys)

    if ingest_info is None:
        ingest_info = IngestInfo()
    seen_map: Dict[int, Set[str]] = defaultdict(set)

    # Every configured key starts out as "not yet found"; entries are
    # dropped as matching cells are encountered. Anything left over at the
    # end is reported through a debug log message (no exception is raised).
    unmatched_keys = set(self.keys.keys()).union(self.multi_keys.keys())

    for cell in self.cells:
        # Normalising the cell text lets us do a direct dictionary lookup
        # instead of scanning the whole key list for every cell.
        # Alternatives would be requiring the exact key text (including
        # trailing punctuation, e.g. 'hi:') in the yaml file, which might
        # be confusing, or pre-expanding the key mapping so that 'hi' and
        # 'hi:' both resolve to the same entry, which is a more expensive
        # preprocessing calculation.
        cell_text = self._normalize_cell(cell)
        targets = (self.keys.get(cell_text)
                   or self.multi_keys.get(cell_text))

        # A key may be configured with no value associated so that
        # |get_value()| can be used later. Such a key has nothing to
        # extract here, but it still counts as found (below), so no
        # warning is produced for it.
        if targets:
            extracted: List[Optional[str]]
            if cell_text in self.keys:
                extracted = [self._get_value_cell(cell)]
            elif cell_text in self.multi_keys:
                extracted = self._get_values_below_cell(cell)
            else:
                extracted = []

            if extracted:
                self._set_or_create_object(ingest_info, targets, extracted,
                                           seen_map)

        # Whether or not any values were extracted, this key has now been
        # seen on the page.
        unmatched_keys.discard(cell_text)

    # Report any configured keys that never appeared on the page.
    if unmatched_keys:
        logging.debug("The following keys could not be found: %s",
                      unmatched_keys)

    return ingest_info.prune()