def acronyms_and_definitions(self, **kwargs):
    """
    Collect the acronyms found in this doc together with their most likely
    definitions, when a definition is available.

    When an acronym has several candidate definitions, only the one that
    occurs most frequently is kept.

    This method forwards ``self.spacy_doc`` and every keyword argument
    unchanged to the underlying extraction function and returns its result.

    .. seealso:: :func:`extract.acronyms_and_definitions() <textacy.extract.acronyms_and_definitions>`
    for all function kwargs.
    """
    spacy_doc = self.spacy_doc
    return extract.acronyms_and_definitions(spacy_doc, **kwargs)
def acronyms_and_definitions(self, **kwargs):
    """
    Return the acronyms present in this doc paired with their most likely
    definitions, if any could be found.

    If more than one definition is found for an acronym, only the most
    frequently occurring one is returned.

    All keyword arguments are passed straight through, along with
    ``self.spacy_doc``, to the extraction function.

    .. seealso:: :func:`extract.acronyms_and_definitions() <textacy.extract.acronyms_and_definitions>`
    for all function kwargs.
    """
    extractor = extract.acronyms_and_definitions
    return extractor(self.spacy_doc, **kwargs)
def acronyms_and_definitions(self, known_acro_defs=None):
    """
    Collect the acronyms found in this doc together with their most likely
    definitions, when a definition is available.

    When an acronym has several candidate definitions, only the one that
    occurs most frequently is kept.

    Args:
        known_acro_defs (dict, optional): acronym/definition pairs that are
            already known, given as {acronym (str): definition (str)}; the
            algorithm will not attempt to find new definitions for these

    Returns:
        dict: unique acronyms (keys) with matched definitions (values)

    .. seealso:: :func:`extract.acronyms_and_definitions() <textacy.extract.acronyms_and_definitions>`
    for all function kwargs.
    """
    spacy_doc = self.spacy_doc
    return extract.acronyms_and_definitions(
        spacy_doc,
        known_acro_defs=known_acro_defs,
    )
def acronyms_and_definitions(self, known_acro_defs=None):
    """
    Return the acronyms present in this doc paired with their most likely
    definitions, if any could be found.

    If more than one definition is found for an acronym, only the most
    frequently occurring one is returned.

    Args:
        known_acro_defs (dict, optional): pre-known acronym/definition
            pairs, as {acronym (str): definition (str)}; the algorithm will
            not attempt to find new definitions for them

    Returns:
        dict: unique acronyms (keys) with matched definitions (values)

    .. seealso:: :func:`extract.acronyms_and_definitions() <textacy.extract.acronyms_and_definitions>`
    for all function kwargs.
    """
    extractor = extract.acronyms_and_definitions
    return extractor(self.spacy_doc, known_acro_defs=known_acro_defs)
def test_acronyms_and_definitions(self):
    """
    With no known definitions supplied, the fixture doc should yield the
    acronym 'I.M.F.' mapped to an empty definition.
    """
    self.assertEqual(
        extract.acronyms_and_definitions(self.spacy_doc),
        {'I.M.F.': ''},
    )
def get_acronyms_and_definitions(doc, known_acro_defs=None):
    """
    Check the document type, then delegate to
    ``extract.acronyms_and_definitions``.

    Args:
        doc: a ``textacy.Doc`` or a ``spacy.tokens.Doc``
        known_acro_defs (optional): forwarded positionally as the second
            argument of ``extract.acronyms_and_definitions``; presumably a
            dict of already-known acronym -> definition pairs (confirm
            against that function's documentation)

    Returns:
        Whatever ``extract.acronyms_and_definitions`` returns for ``doc``.

    Raises:
        AssertionError: if ``doc`` is neither of the supported types. The
            check is an ``assert`` statement, so it is not performed when
            Python runs with assertions disabled (``-O``).
    """
    assert isinstance(doc, (textacy.Doc, spacy.tokens.Doc)), (
        "Only {} are supported".format(possible_docs)
    )
    result = extract.acronyms_and_definitions(doc, known_acro_defs)
    return result
def test_acronyms_and_definitions(self):
    """
    Extraction without known definitions should report 'I.M.F.' with an
    empty-string definition for the fixture doc.
    """
    found = extract.acronyms_and_definitions(self.spacy_doc)
    self.assertEqual(found, {'I.M.F.': ''})
def test_acronyms_and_definitions_known(self):
    """
    A definition supplied through ``known_acro_defs`` should be returned
    unchanged for its acronym.
    """
    known = {'I.M.F.': 'International Monetary Fund'}
    found = extract.acronyms_and_definitions(
        self.spacy_doc,
        known_acro_defs=known,
    )
    self.assertEqual(found, {'I.M.F.': 'International Monetary Fund'})
def test_known(self, spacy_doc):
    """
    A definition supplied through ``known_acro_defs`` should be returned
    unchanged for its acronym.
    """
    known = {"I.M.F.": "International Monetary Fund"}
    result = extract.acronyms_and_definitions(spacy_doc, known_acro_defs=known)
    assert result == {"I.M.F.": "International Monetary Fund"}
def test_default(self, spacy_doc):
    """
    Without known definitions, extraction currently yields 'I.M.F.' with an
    empty-string definition for the fixture doc.
    """
    # TODO: figure out if this function no longer works; the expectation
    # used to be {"I.M.F.": "International Monetary Fund"}, but the test
    # currently pins the empty definition instead.
    result = extract.acronyms_and_definitions(spacy_doc)
    assert result == {"I.M.F.": ""}
def extract_from_doc(doc):
    """
    Build a term -> tag mapping from a spacy doc.

    Four groups of features are collected and merged in this order (a key
    keeps the position of its first insertion; a later group overwrites the
    tag of a key that already exists):

    1. Single tokens accepted by ``should_keep`` for the NOUN / PROPN parts
       of speech, keyed by lemma and tagged with the token's POS.
    2. Noun chunks, rendered as the space-joined lemmas of their non-stop
       tokens and tagged "NOUN_CHUNK". Chunks whose text is already a
       single-token term, is empty, or is "-PRON-" are skipped.
    3. Entities, keyed by lemma and tagged "ENT". Entities labelled ORDINAL,
       CARDINAL, QUANTITY, DATE or PERCENT are skipped, as are entities
       whose lemma collides with a group 1 or group 2 key.
    4. Acronyms that have a non-empty definition, rendered as
       "<acronym> - <definition>" and tagged "ACRONYM".

    Args:
        doc (spacy.doc): a doc processed by the spacy 'en' model

    Returns:
        terms_tagged (dict): features with their respective tags

    Examples:
        The exact output depends on the spacy model and version in use.

        >>> from dsconcept.model import extract_from_doc
        >>> import spacy
        >>> nlp = spacy.load('en_core_web_sm')
        >>> txt = "The ship hung in the sky much the same way bricks don't."
        >>> doc = nlp(txt)
        >>> features = extract_from_doc(doc)
        >>> features
        {'ship': 'NOUN', 'sky': 'NOUN', 'way': 'NOUN', 'brick': 'NOUN', 'the ship': 'NOUN_CHUNK'}
    """
    # TODO: change this function such that it processes better but maintains the same interface.
    wanted_pos = ["NOUN", "PROPN"]
    skipped_chunk_texts = ("-PRON-", "")
    skipped_ent_labels = ("ORDINAL", "CARDINAL", "QUANTITY", "DATE", "PERCENT")

    # Group 1: single-token terms; which tokens qualify is decided by should_keep.
    unigrams = {tok.lemma_: tok.pos_ for tok in doc if should_keep(tok, wanted_pos)}

    # Group 2: lemmatize every non-stop token of each noun chunk and join with a space.
    chunk_labels = {}
    for chunk in doc.noun_chunks:
        chunk_text = " ".join(tok.lemma_ for tok in chunk if not tok.is_stop)
        chunk_labels[chunk_text] = chunk.label_

    # Drop chunks that duplicate a single-token term or are in the skip list.
    # The membership tests use the unstripped text; only the stored key is stripped.
    chunks = {}
    for chunk_text in chunk_labels:
        if chunk_text in unigrams or chunk_text in skipped_chunk_texts:
            continue
        chunks[chunk_text.strip()] = "NOUN_CHUNK"

    # TODO: entities take precedence over noun chunks
    # Group 3: entities, minus unwanted labels and collisions with groups 1 and 2.
    entity_labels = {
        ent.lemma_: ent.label_
        for ent in doc.ents
        if ent.label_ not in skipped_ent_labels
    }
    entities = {
        ent_text: "ENT"
        for ent_text in entity_labels
        if ent_text not in unigrams and ent_text not in chunks
    }

    # Group 4: acronyms which have definitions.
    # These acronyms could create noise if they are not good; it may be better to use their definitions.
    # This schema will only pull out identical definitions. No lemmatizing, no fuzzy matching.
    # TODO: add lemmatizing and fuzzy matching for acronyms. This code exists in acronyms project.
    acronyms = {}
    for acro, definition in acronyms_and_definitions(doc).items():
        if definition != "":
            acronyms["{} - {}".format(acro, definition)] = "ACRONYM"

    terms_tagged = {}
    for group in (unigrams, chunks, entities, acronyms):
        terms_tagged.update(group)
    return terms_tagged
def test_default(self, spacy_lang, text, known, exp):
    """
    Process ``text`` with ``spacy_lang``, extract acronyms while passing
    ``known`` as ``known_acro_defs``, and compare the result to ``exp``.
    """
    doc = spacy_lang(text)
    assert extract.acronyms_and_definitions(doc, known_acro_defs=known) == exp
def test_default(self, spacy_lang, text, exp):
    """
    Process ``text`` with ``spacy_lang``, extract acronyms with default
    arguments, and compare the result to ``exp``.
    """
    doc = spacy_lang(text)
    assert extract.acronyms_and_definitions(doc) == exp
def test_acronyms_and_definitions_known(spacy_doc):
    """
    A definition supplied through ``known_acro_defs`` should be returned
    unchanged for its acronym.
    """
    known = {'I.M.F.': 'International Monetary Fund'}
    result = extract.acronyms_and_definitions(spacy_doc, known_acro_defs=known)
    assert result == {'I.M.F.': 'International Monetary Fund'}
def test_acronyms_and_definitions(spacy_doc):
    """
    With no known definitions supplied, the fixture doc should yield the
    acronym 'I.M.F.' mapped to an empty definition.
    """
    result = extract.acronyms_and_definitions(spacy_doc)
    assert result == {'I.M.F.': ''}