def example_doc(request):
    """Build a fixture dict from ``request.param``.

    Augments the parametrized dict with two Document instances: one
    constructed from the file path, and one from the file's raw text.
    """
    params = request.param
    path = params["file"]
    with open(path) as handle:
        raw_text = handle.read()
    return dict(
        params,
        document=Document(path),
        document_string=Document(raw_text),
    )
def identify_publications(doc: Document) -> Dict[str, float]:
    """
    Determine entities with high probability of being a publication name.

    Arguments:
        doc (Document): The document to search

    Returns:
        Dict[str, float]: A list of publication names and the probability
            that we believe that we've made a correct assessment. For
            example, {"My Paper Title": 1.0} means that we are 100% sure
            that this is a reference to a publication.

    """
    # NOTE(review): these marker lists are currently unused — presumably
    # intended for future document-ID / authorship heuristics. TODO: wire
    # them into the scoring below or remove them.
    _DOC_ID_MARKERS = [
        "arxiv:",
        "doi:",
    ]
    _AUTHORSHIP_MARKERS = [
        "et al",
    ]

    potential_publications: Dict[str, float] = {}

    # Anything in quotes gets a low probability; each repeated occurrence
    # of the same quoted string adds another 0.25 to its score.
    rxp = re.compile('"[^"]+"')
    for match in rxp.findall(doc.text()):
        potential_publications[match] = (
            potential_publications.get(match, 0.0) + 0.25
        )

    return potential_publications
def test_good_documents_pass_detector():
    """Clean documents should yield a two-line report (no extra flags)."""
    for sample in GOOD_DOCUMENTS:
        report_text = str(PersonalLifeDetector().get_report(Document(sample)))
        # Exactly one newline separator means exactly two lines.
        assert report_text.count("\n") == 1
def test_bad_documents_trip_detector():
    """Problematic documents should produce a report longer than two lines."""
    for sample in BAD_DOCUMENTS:
        report_text = str(PersonalLifeDetector().get_report(Document(sample)))
        # Two or more newlines means more than two lines of output.
        assert report_text.count("\n") >= 2
def test_good_documents_pass_detector():
    """Clean documents should render as a two-line report after setting a summary."""
    for sample in GOOD_DOCUMENTS:
        report = EffortDetector().get_report(Document(sample))
        report.set_summary("MY_SUMMARY")
        # Exactly one newline separator means exactly two lines.
        assert str(report).count("\n") == 1
def test_bad_documents_trip_detector():
    """Problematic documents should raise at least one flag."""
    for sample in BAD_DOCUMENTS:
        flags = EffortDetector().get_report(Document(sample)).get_flags()
        assert len(flags) > 0
def test_good_documents_pass_detector():
    """Clean documents should raise no flags at all."""
    for sample in GOOD_DOCUMENTS:
        flags = PersonalLifeDetector().get_report(Document(sample)).get_flags()
        assert len(flags) == 0
def test_bad_documents_trip_detector():
    """Problematic documents should mention the personal-life phrasing."""
    for sample in BAD_DOCUMENTS:
        report_text = PersonalLifeDetector().get_report(Document(sample)).to_string()
        # Echo the report so failures are easy to diagnose in pytest output.
        print(report_text)
        assert "tends to relate" in report_text