def process_collection(
    collection,
    metamap,
    splitter,
    parser,
    ptb2dep,
    lemmatizer,
    neg_detector,
    cuis,
    sec_title_patterns,
):
    """Run every document of ``collection`` through the pipeline stages.

    Stages, in call order:

    1. Per document: ``normalize_mimiccxr.normalize``, then
       ``section_split.split_document`` (given ``sec_title_patterns``),
       then ``ssplit.ssplit`` (given ``splitter``).
    2. Once for the whole collection: ``dner_mm.run_metamap_col`` (given
       ``metamap`` and ``cuis``).
    3. Per document: ``parse.parse`` (given ``parser``),
       ``ptb2ud.convert`` (given ``ptb2dep`` and ``lemmatizer``),
       ``negdetect.detect`` (given ``neg_detector``) and finally
       ``cleanup.clean_sentences``.

    Returns:
        The same ``collection`` object that was passed in.
    """
    # Stage 1: the return values of these calls are ignored.
    for doc in collection.documents:
        normalize_mimiccxr.normalize(doc)
        section_split.split_document(doc, sec_title_patterns)
        ssplit.ssplit(doc, splitter)

    # Stage 2: operates on the collection as a whole, not per document.
    dner_mm.run_metamap_col(collection, metamap, cuis)

    # Stage 3: each call is fed the value returned by the previous one.
    # NOTE(review): the returned objects are never stored back into the
    # collection, so these stages presumably annotate in place - confirm.
    for doc in collection.documents:
        parsed = parse.parse(doc, parser)
        converted = ptb2ud.convert(parsed, ptb2dep, lemmatizer)
        annotated = negdetect.detect(converted, neg_detector)
        cleanup.clean_sentences(annotated)

    return collection
def load(self):
    """Read the reports file and build a BioC collection from it.

    The file at ``self.reports_path`` is read with ``pd.read_csv`` using
    ``header=None`` and ``names=[REPORTS]``; the ``REPORTS`` column is
    taken as a plain list. Each entry is then, in order:

    * passed through ``self.clean``;
    * turned into a document by ``text2bioc.text2document``, with the
      entry's list position (as a string) as its id;
    * when ``self.extract_impression`` is truthy, passed through
      ``section_split.split_document`` and then handed to
      ``self.extract_impression_from_passages``;
    * passed through ``self.splitter.split_doc`` and added to the
      collection.

    On completion the list of raw reports is stored in ``self.reports``
    and the collection in ``self.collection``. Nothing is returned.

    Raises:
        AssertionError: if a document coming out of
            ``self.splitter.split_doc`` does not have exactly one
            passage (the check is an ``assert``, so it is skipped when
            Python runs with ``-O``).
    """
    frame = pd.read_csv(self.reports_path, header=None, names=[REPORTS])
    reports = frame[REPORTS].tolist()

    bioc_collection = bioc.BioCCollection()
    for index, raw_report in enumerate(reports):
        cleaned = self.clean(raw_report)
        doc = text2bioc.text2document(str(index), cleaned)

        if self.extract_impression:
            doc = section_split.split_document(doc)
            # Return value is ignored; only ``doc`` itself is used below.
            self.extract_impression_from_passages(doc)

        sentence_doc = self.splitter.split_doc(doc)

        passage_count = len(sentence_doc.passages)
        assert passage_count == 1, (
            'Each document must have a single passage, '
            'the Impression section.'
        )

        bioc_collection.add_document(sentence_doc)

    # Attributes are only assigned once every report has been processed.
    self.reports = reports
    self.collection = bioc_collection
def prep_collection(self):
    """Build a BioC collection from the reports held in ``self.reports``.

    Each report is, in order:

    * passed through ``self.clean``;
    * turned into a document by ``text2bioc.text2document``, with the
      report's position in ``self.reports`` (as a string) as its id;
    * when ``self.extract_impression`` is truthy, passed through
      ``section_split.split_document`` and then handed to
      ``self.extract_impression_from_passages``;
    * passed through ``self.splitter.split_doc`` and added to the
      collection.

    The finished collection is stored in ``self.collection``. Nothing is
    returned and ``self.reports`` is not reassigned.

    Raises:
        AssertionError: if a document coming out of
            ``self.splitter.split_doc`` does not have exactly one
            passage (the check is an ``assert``, so it is skipped when
            Python runs with ``-O``).
    """
    built = bioc.BioCCollection()

    for position, raw_text in enumerate(self.reports):
        cleaned_text = self.clean(raw_text)
        doc = text2bioc.text2document(str(position), cleaned_text)

        if self.extract_impression:
            doc = section_split.split_document(doc)
            # Return value is ignored; only ``doc`` itself is used below.
            self.extract_impression_from_passages(doc)

        split_doc = self.splitter.split_doc(doc)

        n_passages = len(split_doc.passages)
        assert n_passages == 1, (
            'Each document must have a single passage, '
            'the Impression section.'
        )

        built.add_document(split_doc)

    self.collection = built