def create_lookups_from_json_reader(path: Path) -> Lookups:
    lookups = Lookups()
    for p in path.glob("*.json"):
        table_name = p.stem
        data = srsly.read_json(p)
        lookups.add_table(table_name, data)
    return lookups

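# A small usage sketch for the reader above (the "lookups_data" directory and
# its contents are hypothetical): every *.json file in the folder becomes one
# table named after the file stem, e.g. lemma_lookup.json -> "lemma_lookup".
from pathlib import Path

lookups = create_lookups_from_json_reader(Path("lookups_data"))
print(lookups.tables)  # table names discovered from the JSON file names
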
def concept_sets(self, value):
    """
    Sets concept_sets and the attributes derived from it.

    Args:
        value (list of list of str): A list of lists of strings, each string
            being a concept; each inner list corresponds to a document and
            contains the tags seen in that document.
    """
    self._concept_sets = value
    LOG.debug("Extracting raw keywords as concepts.")
    all_concepts = [
        concept
        for concept_set in tqdm(self._concept_sets)
        for concept in concept_set
        if concept.strip() != ""
    ]
    raw_concepts = set(all_concepts)
    LOG.debug("Lemmatizing {} raw concepts.".format(len(raw_concepts)))
    concepts = [c.lower() for c in raw_concepts]
    self.raw2lemma = {rc: c for rc, c in zip(raw_concepts, concepts)}
    lookups = Lookups()
    lookups.add_table("lemma_lookup", self.raw2lemma)
    self.lemmatizer = Lemmatizer(lookups)
    self.lemma2raw = {v: k for k, v in self.raw2lemma.items()}
    lemma_concepts = [
        self.lemmatizer(concept, "NOUN")[0] for concept in all_concepts
    ]
    self.concepts_frequencies = Counter(lemma_concepts)
    self.concepts = set(lemma_concepts)
    self._fit_concept_indices()

def __init__(
    self,
    vocab: Vocab,
    name: str = "morphologizer",
    *,
    overwrite_lemma: bool = False,
) -> None:
    super().__init__()
    self.name = name
    self.vocab = vocab
    self.voikko = libvoikko.Voikko("fi")
    self.lookups = Lookups()
    self.overwrite_lemma = overwrite_lemma
    self.aux_labels = [vocab.strings.add(x) for x in ["aux", "aux:pass"]]
    self.cop_labels = [vocab.strings.add(x) for x in ["cop", "cop:own"]]
    self.nsubj_labels = [
        vocab.strings.add(x) for x in ["nsubj", "nsubj:cop"]
    ]
    self.ccomp_labels = [
        vocab.strings.add(x)
        for x in ["csubj", "csubj:cop", "xcomp", "xcomp:ds"]
    ]
    self.relcl_labels = [
        vocab.strings.add(x) for x in ["acl:relcl", "ccomp"]
    ]
    self.foreign_tag = vocab.strings.add('Foreign')

def cope_lookups():
    lookups = Lookups()
    lookups.add_table("lemma_lookup", {"cope": "cope", "coped": "cope"})
    lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
    lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
    lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
    return lookups

def lemmatizer():
    lookups = Lookups()
    lookups.add_table(
        "lemma_lookup", {"dogs": "dog", "boxen": "box", "mice": "mouse"}
    )
    return Lemmatizer(lookups)

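# A minimal sketch of what the fixture above provides, assuming the spaCy v2.x
# Lemmatizer API: with only a "lemma_lookup" table, lookup() returns the mapped
# lemma and falls back to the original string for unknown words.
lookups = Lookups()
lookups.add_table("lemma_lookup", {"dogs": "dog", "boxen": "box", "mice": "mouse"})
lemmatizer = Lemmatizer(lookups)
assert lemmatizer.lookup("dogs") == "dog"
assert lemmatizer.lookup("unknown") == "unknown"
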
def initialize(
    self,
    get_examples: Callable[[], Iterable[Example]],
    *,
    nlp: Language = None,
) -> None:
    lookups = Lookups()
    self._lookups = lookups.from_disk(path=self.source)

def test_issue595():
    """Test lemmatization of base forms"""
    words = ["Do", "n't", "feed", "the", "dog"]
    lookups = Lookups()
    lookups.add_table("lemma_rules", {"verb": [["ed", "e"]]})
    lookups.add_table("lemma_index", {"verb": {}})
    lookups.add_table("lemma_exc", {"verb": {}})
    vocab = Vocab()
    doc = Doc(vocab, words=words)
    doc[2].tag_ = "VB"
    assert doc[2].text == "feed"
    assert doc[2].lemma_ == "feed"

def test_issue1387():
    tag_map = {"VBG": {POS: VERB, VerbForm_part: True}}
    lookups = Lookups()
    lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
    lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
    lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
    lemmatizer = Lemmatizer(lookups)
    vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
    doc = Doc(vocab, words=["coping"])
    doc[0].tag_ = "VBG"
    assert doc[0].text == "coping"
    assert doc[0].lemma_ == "cope"

def test_lookups_api():
    table_name = "test"
    data = {"foo": "bar", "hello": "world"}
    lookups = Lookups()
    lookups.add_table(table_name, data)
    assert table_name in lookups
    assert lookups.has_table(table_name)
    table = lookups.get_table(table_name)
    assert table.name == table_name
    assert len(table) == 2
    assert table.get("hello") == "world"
    table.set("a", "b")
    assert table.get("a") == "b"
    table = lookups.get_table(table_name)
    assert len(table) == 3
    with pytest.raises(KeyError):
        lookups.get_table("xyz")

def test_issue595():
    """Test lemmatization of base forms"""
    words = ["Do", "n't", "feed", "the", "dog"]
    tag_map = {"VB": {POS: VERB, VerbForm_inf: True}}
    lookups = Lookups()
    lookups.add_table("lemma_rules", {"verb": [["ed", "e"]]})
    lookups.add_table("lemma_index", {"verb": {}})
    lookups.add_table("lemma_exc", {"verb": {}})
    lemmatizer = Lemmatizer(lookups)
    vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
    doc = Doc(vocab, words=words)
    doc[2].tag_ = "VB"
    assert doc[2].text == "feed"
    assert doc[2].lemma_ == "feed"

def __init__(self):
    self.entities = []
    self.columns = []
    self.relationships = []
    self.synonyms_col = []
    self.synonyms_tab = []
    self.entity_graph = []
    self.loaded_entities = []
    self.config = Configuration()
    self.conn = pyodbc.connect(self.config.get_sql_connection_string())
    lookups = Lookups()
    self.lemmatizer = Lemmatizer(lookups)
    self.load_db_model()

def test_ner_warns_no_lookups():
    nlp = Language()
    nlp.vocab.lookups = Lookups()
    assert not len(nlp.vocab.lookups)
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    with pytest.warns(UserWarning):
        nlp.begin_training()
    nlp.vocab.lookups.add_table("lexeme_norm")
    nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A"
    with pytest.warns(None) as record:
        nlp.begin_training()
        assert not record.list

def from_jsons(
    self, in_indices, in_raw2lemma
):  # a little strange because it does not fill in all attributes
    """
    Load index and raw2lemma dictionaries into an empty ConceptExtractor.

    Args:
        in_indices (str): Path to a JSON file mapping concepts to document indices.
        in_raw2lemma (str): Path to a JSON file mapping raw concepts to lemmas.
    """
    with open(in_indices, "r") as f0:
        self.concept_index_mapping = json.load(f0)
    with open(in_raw2lemma, "r") as f0:
        self.raw2lemma = json.load(f0)
    lookups = Lookups()
    lookups.add_table("lemma_lookup", self.raw2lemma)
    self.lemmatizer = Lemmatizer(lookups)
    self.lemma2raw = {v: k for k, v in self.raw2lemma.items()}
    self.concepts = self.concept_index_mapping.keys()
    tmp_frequencies = {
        concept: len(index)
        for concept, index in self.concept_index_mapping.items()
    }
    self.concepts_frequencies = Counter(tmp_frequencies)

def lemmatize(self, tokens, toke=False):
    lookups = Lookups()
    lookups.add_table('lemma_index', lemma_index)
    lookups.add_table('lemma_exc', lemma_exc)
    lookups.add_table('lemma_rules', lemma_rules)
    lemmatizer = Lemmatizer(lookups)
    lemmas = []
    for t in tokens:
        # The lemmatizer returns a list of candidate lemmas; keep the first one.
        lemmas.append(lemmatizer(t.text, t.tag_)[0])
    if toke:
        return lemmas
    return " ".join(lemmas)

def make_lookups_bin(self, lookup_name_pattern='lemma_lookup_{}',
                     filename_pattern='it_lemma_lookup_{}.json'):
    lookups = Lookups()
    lookup_keys = list(self.tag_map.keys())
    for lookup_pos in lookup_keys:
        lookup_name = lookup_name_pattern.format(lookup_pos.lower())
        filename = filename_pattern.format(lookup_pos.lower())
        with open(os.path.join(self.out_path, filename)) as json_file:
            lookup_dict = json.load(json_file)
        lookups.add_table(lookup_name, lookup_dict)
    with open(os.path.join(self.out_path, 'it_lemma_lookup.json')) as json_file:
        lookup_dict = json.load(json_file)
    lookups.add_table('lemma_lookup', lookup_dict)
    lookups.to_disk(self.out_path, 'lookups.bin')

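# A round-trip sketch for the method above, assuming spaCy's Lookups
# serialization; `out_path` stands in for the directory make_lookups_bin()
# wrote to, and "lookups.bin" is the default filename read by from_disk().
from spacy.lookups import Lookups

lookups = Lookups()
lookups.from_disk(out_path)
assert lookups.has_table("lemma_lookup")
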
def test_tagger_warns_no_lemma_lookups():
    nlp = Language()
    nlp.vocab.lookups = Lookups()
    assert not len(nlp.vocab.lookups)
    tagger = nlp.create_pipe("tagger")
    with pytest.warns(UserWarning):
        tagger.begin_training()
    nlp.add_pipe(tagger)
    with pytest.warns(UserWarning):
        nlp.begin_training()
    nlp.vocab.lookups.add_table("lemma_lookup")
    with pytest.warns(None) as record:
        nlp.begin_training()
        assert not record.list

def test_ner_warns_no_lookups(caplog):
    nlp = English()
    assert nlp.lang in util.LEXEME_NORM_LANGS
    nlp.vocab.lookups = Lookups()
    assert not len(nlp.vocab.lookups)
    nlp.add_pipe("ner")
    with caplog.at_level(logging.DEBUG):
        nlp.initialize()
        assert "W033" in caplog.text
    caplog.clear()
    nlp.vocab.lookups.add_table("lexeme_norm")
    nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A"
    with caplog.at_level(logging.DEBUG):
        nlp.initialize()
        assert "W033" not in caplog.text

def test_lemmatizer_without_is_base_form_implementation():
    # Norwegian example from #5658
    lookups = Lookups()
    lookups.add_table("lemma_rules", {"noun": []})
    lookups.add_table("lemma_index", {"noun": {}})
    lookups.add_table("lemma_exc", {"noun": {"formuesskatten": ["formuesskatt"]}})
    lemmatizer = Lemmatizer(lookups, is_base_form=None)
    assert lemmatizer(
        "Formuesskatten", "noun",
        {'Definite': 'def', 'Gender': 'masc', 'Number': 'sing'}
    ) == ["formuesskatt"]

def test_lemmatizer_init(nlp):
    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
    assert isinstance(lemmatizer.lookups, Lookups)
    assert not lemmatizer.lookups.tables
    assert lemmatizer.mode == "lookup"
    with pytest.raises(ValueError):
        nlp("test")
    nlp.initialize()
    assert lemmatizer.lookups.tables
    assert nlp("cope")[0].lemma_ == "cope"
    assert nlp("coped")[0].lemma_ == "cope"
    # replace any tables from spacy-lookups-data
    lemmatizer.lookups = Lookups()
    # lookup with no tables sets text as lemma
    assert nlp("cope")[0].lemma_ == "cope"
    assert nlp("coped")[0].lemma_ == "coped"
    nlp.remove_pipe("lemmatizer")
    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
    with pytest.raises(ValueError):
        # Can't initialize without required tables
        lemmatizer.initialize(lookups=Lookups())
    lookups = Lookups()
    lookups.add_table("lemma_lookup", {})
    lemmatizer.initialize(lookups=lookups)

docx = "" returnedSearch = HeadingSearch(filename) #If we didnt find anything with the heading search, just use the whole document. if (returnedSearch == False): document1 = docx2txt.process(filename) docx = nlp(document1) #Otherwise, send the headingsearch result through nlp else: docx = nlp(returnedSearch) word_frequencies = {} # how many times each word occurs int the document words = [ ] # a list of every word in the document stores in the same index of the frequency array #spacy lemmatizer to get root words lookups = Lookups() lemmatizer = Lemmatizer(lookups) for word in docx: # go through every word in document if word.text not in stopwords: # as long as the word isnt a stop word if lemmatizer.lookup(word.text) not in word_frequencies.keys( ): # if we havent come across the word yet word_frequencies[lemmatizer.lookup( word.text)] = 1 # its frequency is one words.append(lemmatizer.lookup(word.text)) # add it to words else: word_frequencies[lemmatizer.lookup( word.text )] += 1 # otherwise it is already in the list, so increment it #Sort through the array by bubble sort
def test_lookups_to_from_disk():
    lookups = Lookups()
    lookups.add_table("table1", {"foo": "bar", "hello": "world"})
    lookups.add_table("table2", {"a": 1, "b": 2, "c": 3})
    with make_tempdir() as tmpdir:
        lookups.to_disk(tmpdir)
        new_lookups = Lookups()
        new_lookups.from_disk(tmpdir)
    assert len(new_lookups) == 2
    assert "table1" in new_lookups
    assert "table2" in new_lookups
    table1 = new_lookups.get_table("table1")
    assert len(table1) == 2
    assert table1["foo"] == "bar"
    table2 = new_lookups.get_table("table2")
    assert len(table2) == 3
    assert table2["b"] == 2

def test_lookups_to_from_bytes():
    lookups = Lookups()
    lookups.add_table("table1", {"foo": "bar", "hello": "world"})
    lookups.add_table("table2", {"a": 1, "b": 2, "c": 3})
    lookups_bytes = lookups.to_bytes()
    new_lookups = Lookups()
    new_lookups.from_bytes(lookups_bytes)
    assert len(new_lookups) == 2
    assert "table1" in new_lookups
    assert "table2" in new_lookups
    table1 = new_lookups.get_table("table1")
    assert len(table1) == 2
    assert table1["foo"] == "bar"
    table2 = new_lookups.get_table("table2")
    assert len(table2) == 3
    assert table2["b"] == 2
    assert new_lookups.to_bytes() == lookups_bytes

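# A small sketch built on the byte round-trip above, assuming spaCy's Lookups
# and Vocab APIs: byte serialization makes it easy to copy tables into the
# lookups attached to a Vocab.
from spacy.vocab import Vocab
from spacy.lookups import Lookups

src = Lookups()
src.add_table("lexeme_norm", {"a": "A"})
vocab = Vocab()
vocab.lookups.from_bytes(src.to_bytes())
assert vocab.lookups.has_table("lexeme_norm")
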
def test_lookups_api():
    table_name = "test"
    data = {"foo": "bar", "hello": "world"}
    lookups = Lookups()
    lookups.add_table(table_name, data)
    assert len(lookups) == 1
    assert table_name in lookups
    assert lookups.has_table(table_name)
    table = lookups.get_table(table_name)
    assert table.name == table_name
    assert len(table) == 2
    assert table["hello"] == "world"
    table["a"] = "b"
    assert table["a"] == "b"
    table = lookups.get_table(table_name)
    assert len(table) == 3
    with pytest.raises(KeyError):
        lookups.get_table("xyz")
    with pytest.raises(ValueError):
        lookups.add_table(table_name)
    table = lookups.remove_table(table_name)
    assert table.name == table_name
    assert len(lookups) == 0
    assert table_name not in lookups
    with pytest.raises(KeyError):
        lookups.get_table(table_name)

def lemmatize():
    """Build a rule-based Lemmatizer that strips a plural "s" from nouns."""
    lookups = Lookups()
    lookups.add_table("lemma_rules", {"noun": [["s", ""]]})
    lemmatizer = Lemmatizer(lookups)
    return lemmatizer

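# A quick sketch of the rule above in action, assuming the spaCy v2.x
# rule-based Lemmatizer: the "noun" rule ["s", ""] strips a trailing "s",
# and words that match no rule are returned unchanged.
lemmatizer = lemmatize()
assert lemmatizer("dogs", "noun") == ["dog"]
assert lemmatizer("dog", "noun") == ["dog"]
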
def from_disk(self, path, exclude=tuple()) -> "LookupLemmatizer":
    path: Path = ensure_path(path)
    lookups = Lookups()
    self._lookups = lookups.from_disk(path=path)
    return self

# ```
# pip install -U spacy
# ```
#
# You will then need to download the English model:
# ```
# python -m spacy download en_core_web_sm
# ```

# %%
import spacy

# %%
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups

lookups = Lookups()
lookups.add_table("lemma_rules", {"noun": [["s", ""]]})
lemmatizer = Lemmatizer(lookups)

# %%
[lemmatizer.lookup(word) for word in word_list]

# %% [markdown]
# spaCy doesn't offer a stemmer (since lemmatization is considered better; this is an example of being opinionated!)

# %% [markdown]
# Stop words vary from library to library

# %%
nlp = spacy.load("en_core_web_sm")

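# %%
# A small illustration of the point above, assuming the en_core_web_sm model
# loaded into `nlp`: spaCy exposes its stop-word list on the language defaults.
print(len(nlp.Defaults.stop_words))
print("the" in nlp.Defaults.stop_words)
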
class MorphologizerLemmatizer(Pipe):
    """Pipeline component that assigns morphological features and lemmas to Docs.
    The actual morphological analysis is done by libvoikko.
    """

    compound_re = re.compile(r"\+(\w+)(?:\(\+?[\w=]+\))?")
    minen_re = re.compile(r"\b(\w+)\[Tn4\]mi")
    ny_re = re.compile(r"\[X\]\[\w+\]\[Ny\](\w+)")
    roman_numeral_structure_re = re.compile(r"=j+|=q+")

    voikko_cases = {
        "nimento": "Case=Nom",
        "omanto": "Case=Gen",
        "kohdanto": "Case=Acc",
        "olento": "Case=Ess",
        "osanto": "Case=Par",
        "tulento": "Case=Tra",
        "sisaolento": "Case=Ine",
        "sisaeronto": "Case=Ela",
        "sisatulento": "Case=Ill",
        "ulkoolento": "Case=Ade",
        "ulkoeronto": "Case=Abl",
        "ulkotulento": "Case=All",
        "vajanto": "Case=Abe",
        "seuranto": "Case=Com",
        "keinonto": "Case=Ins",
        # Should never occur: "kerrontosti" should only appear on ADVs,
        # which don't have cases.
        "kerrontosti": "Case=Nom"
    }
    voikko_classes_by_pos = {
        ADJ: frozenset(["laatusana", "nimisana_laatusana"]),
        ADP: frozenset(["nimisana", "seikkasana", "suhdesana"]),
        ADV: frozenset(["seikkasana"]),
        AUX: frozenset(["teonsana", "kieltosana"]),
        CCONJ: frozenset(["sidesana"]),
        INTJ: frozenset(["huudahdussana"]),
        NOUN: frozenset(["nimisana", "nimisana_laatusana", "lyhenne"]),
        NUM: frozenset(["lukusana"]),
        PRON: frozenset(["asemosana", "nimisana", "nimisana_laatusana"]),
        PROPN: frozenset(["nimi", "etunimi", "sukunimi", "paikannimi"]),
        SCONJ: frozenset(["sidesana"]),
        # Would be "teonsana" but MINEN-infinitives are treated as noun.
        # See _analysis_has_compatible_pos()
        VERB: frozenset([]),
        SYM: frozenset([]),
        X: frozenset([])
    }
    affix_to_sijamuoto = {
        "n": "omanto",
        "na": "olento",
        "nä": "olento",
        "a": "osanto",
        "ä": "osanto",
        "ta": "osanto",
        "tä": "osanto",
        "ksi": "tulento",
        "ssa": "sisaolento",
        "ssä": "sisaolento",
        "sta": "sisaeronto",
        "stä": "sisaeronto",
        "han": "sisatulento",
        "hin": "sisatulento",
        "hun": "sisatulento",
        "seen": "sisatulento",
        "siin": "sisatulento",
        "lla": "ulkoolento",
        "llä": "ulkoolento",
        "lta": "ulkoeronto",
        "ltä": "ulkoeronto",
        "lle": "ulkotulento",
        "tta": "vajanto",
        "ttä": "vajanto",
    }
    possessive_suffixes = {
        "1s": ["ni"],
        "2s": ["si"],
        "1p": ["mme"],
        "2p": ["nne"],
        "3": ["nsa", "nsä", "an", "en", "in", "on", "un", "yn", "än", "ön"],
    }
    voikko_degree = {
        "positive": "Degree=Pos",
        "comparative": "Degree=Cmp",
        "superlative": "Degree=Sup"
    }
    voikko_mood = {
        "A-infinitive": "InfForm=1",
        "E-infinitive": "InfForm=2",
        "MA-infinitive": "InfForm=3",
        "indicative": "Mood=Ind",
        "conditional": "Mood=Cnd",
        "potential": "Mood=Pot",
        "imperative": "Mood=Imp"
    }
    voikko_part_form = {
        "past_active": "PartForm=Past",
        "past_passive": "PartForm=Past",
        "present_active": "PartForm=Pres",
        "present_passive": "PartForm=Pres",
        "agent": "PartForm=Agt"
    }
    voikko_tense = {
        "present_active": "Tense=Pres",
        "present_passive": "Tense=Pres",
        "present_simple": "Tense=Pres",
        "past_active": "Tense=Past",
        "past_passive": "Tense=Past",
        "past_imperfective": "Tense=Past"
    }
    pron_types = {
        "minä": "Prs",
        "sinä": "Prs",
        "hän": "Prs",
        "me": "Prs",
        "te": "Prs",
        "he": "Prs",
        "tämä": "Dem",
        "tuo": "Dem",
        "se": "Dem",
        "nämä": "Dem",
        "nuo": "Dem",
        "ne": "Dem",
        # The relative "mikä" will be handled as a special case separately,
        # so here we label all occurrences of "mikä" as interrogative.
        "mikä": "Int",
        "kuka": "Int",
        "ken": "Int",  # ketä
        "kumpi": "Int",
        "millainen": "Int",
        "kuinka": "Int",
        "miksi": "Int",
        # The relative "joka" will be handled elsewhere. Here "joka" is
        # Voikko's lemmatization of jotakin, jollekin, jostakin, ...
        "joka": "Ind",
        "kaikki": "Ind",
        "jokainen": "Ind",
        "koko": "Ind",
        "harva": "Ind",
        "muutama": "Ind",
        "jokunen": "Ind",
        "yksi": "Ind",
        "ainoa": "Ind",
        "eräs": "Ind",
        "muuan": "Ind",
        "joku": "Ind",
        "jokin": "Ind",
        "kukin": "Ind",
        "moni": "Ind",
        "usea": "Ind",
        "molempi": "Ind",
        "kumpikin": "Ind",
        "kumpikaan": "Ind",
        "jompikumpi": "Ind",
        "sama": "Ind",
        "muu": "Ind",
        "kukaan": "Ind",
        "mikään": "Ind",
        "toinen": "Rcp"
    }
    pron_persons = {
        "minä": "1",
        "sinä": "2",
        "hän": "3",
        "me": "1",
        "te": "2",
        "he": "3"
    }
    infinite_moods = frozenset(
        ["A-infinitive", "E-infinitive", "MA-infinitive", "MAINEN-infinitive"])

    def __init__(
        self,
        vocab: Vocab,
        name: str = "morphologizer",
        *,
        overwrite_lemma: bool = False,
    ) -> None:
        super().__init__()
        self.name = name
        self.vocab = vocab
        self.voikko = libvoikko.Voikko("fi")
        self.lookups = Lookups()
        self.overwrite_lemma = overwrite_lemma
        self.aux_labels = [vocab.strings.add(x) for x in ["aux", "aux:pass"]]
        self.cop_labels = [vocab.strings.add(x) for x in ["cop", "cop:own"]]
        self.nsubj_labels = [
            vocab.strings.add(x) for x in ["nsubj", "nsubj:cop"]
        ]
        self.ccomp_labels = [
            vocab.strings.add(x)
            for x in ["csubj", "csubj:cop", "xcomp", "xcomp:ds"]
        ]
        self.relcl_labels = [
            vocab.strings.add(x) for x in ["acl:relcl", "ccomp"]
        ]
        self.foreign_tag = vocab.strings.add('Foreign')

    def __call__(self, doc: Doc) -> Doc:
        error_handler = self.get_error_handler()
        try:
            for token in doc:
                if token.pos in (PUNCT, SPACE):
                    if self.overwrite_lemma or token.lemma == 0:
                        token.lemma = token.orth
                else:
                    analysis = self._analyze(token)
                    morph = self.voikko_morph(token, analysis)
                    if morph:
                        token.set_morph(morph)
                    if self.overwrite_lemma or token.lemma == 0:
                        token.lemma_ = self.lemmatize(token, analysis)
            return doc
        except Exception as e:
            error_handler(self.name, self, [doc], e)

    def initialize(
        self,
        get_examples: Optional[Callable[[], Iterable[Example]]] = None,
        *,
        nlp: Optional[Language] = None,
        lookups: Optional[Lookups] = None,
    ):
        """Initialize the morphologizer and load in data.

        get_examples (Callable[[], Iterable[Example]]): Function that
            returns a representative sample of gold-standard Example objects.
        nlp (Language): The current nlp object the component is part of.
        lookups (Lookups): The lookups object containing the (optional)
            tables such as "lemma_exc" and "morphologizer_exc".
            Defaults to None.
        """
        if lookups is None:
            lookups = load_lookups(lang=self.vocab.lang,
                                   tables=["lemma_exc", "morphologizer_exc"])
        self.lookups = lookups

    def voikko_morph(self, token: Token, analysis: dict) -> Optional[str]:
        # Run Voikko's analysis and convert the result to morph features.
        exc_table = self.lookups.get_table("morphologizer_exc", {}).get(token.pos)
        if exc_table is not None:
            exc = exc_table.get(token.orth_.lower())
            if exc:
                return exc

        # Pre-compute some frequent morphs to avoid code duplication.
        # (Functions are not an option because the function call
        # overhead is too high.)

        # Clitic
        morph_clitic = None
        if "FOCUS" in analysis:
            focus = analysis["FOCUS"]
            if focus == "kin":
                morph_clitic = "Clitic=Kin"
            elif focus == "kaan":
                morph_clitic = "Clitic=Kaan"
            elif focus == "ka":
                morph_clitic = "Clitic=Ka"
        elif "KYSYMYSLIITE" in analysis:
            morph_clitic = "Clitic=Ko"

        morph_number = None
        morph_number_psor = None
        morph_person_psor = None
        if token.pos in (ADJ, ADP, ADV, AUX, NOUN, NUM, PRON, PROPN, VERB):
            # Number
            if "NUMBER" in analysis:
                number = analysis["NUMBER"]
                if number == "singular":
                    morph_number = "Number=Sing"
                elif number == "plural":
                    morph_number = "Number=Plur"

            # Number[psor] and Person[psor]
            if "POSSESSIVE" in analysis:
                possessive = analysis["POSSESSIVE"]
                if possessive == "1s":
                    morph_number_psor = "Number[psor]=Sing"
                    morph_person_psor = "Person[psor]=1"
                elif possessive == "1p":
                    morph_number_psor = "Number[psor]=Plur"
                    morph_person_psor = "Person[psor]=1"
                elif possessive == "3":
                    morph_person_psor = "Person[psor]=3"

        # Set morphs per POS
        morphology = []
        if token.pos in (ADJ, NOUN, PROPN):
            # Abbr
            if "CLASS" in analysis and analysis["CLASS"] == "lyhenne":
                morphology.append("Abbr=Yes")

            # Case
            if "SIJAMUOTO" in analysis:
                morphology.append(self.voikko_cases[analysis["SIJAMUOTO"]])

            # Clitic
            if morph_clitic is not None:
                morphology.append(morph_clitic)

            # Degree
            if token.pos == ADJ and "COMPARISON" in analysis:
                morphology.append(self.voikko_degree[analysis["COMPARISON"]])

            # Number
            if morph_number is not None:
                morphology.append(morph_number)

            # Number[psor]
            if morph_number_psor is not None:
                morphology.append(morph_number_psor)

            # NumType
            if token.pos == ADJ and "NUMTYPE" in analysis:
                morphology.append(f'NumType={analysis["NUMTYPE"]}')

            # Person[psor]
            if morph_person_psor is not None:
                morphology.append(morph_person_psor)

        elif token.pos in (AUX, VERB):
            vclass = analysis.get("CLASS")

            # Abbr
            if vclass == "lyhenne":
                morphology.append("Abbr=Yes")

            # Case
            if "SIJAMUOTO" in analysis:
                morphology.append(self.voikko_cases[analysis["SIJAMUOTO"]])

            # Clitic
            if morph_clitic is not None:
                morphology.append(morph_clitic)

            # Connegative
            if "CONNEGATIVE" in analysis:
                morphology.append("Connegative=Yes")

            # Degree
            if "COMPARISON" in analysis:
                morphology.append(self.voikko_degree[analysis["COMPARISON"]])

            # InfForm and Mood
            # These are mutually exclusive and both are based on MOOD
            mood = None
            if "MOOD" in analysis:
                mood = analysis["MOOD"]
                morph_inf_form_or_mood = self.voikko_mood.get(mood)
                if morph_inf_form_or_mood is not None:
                    morphology.append(morph_inf_form_or_mood)

            # Number
            if morph_number is not None:
                morphology.append(morph_number)

            # Number[psor]
            if morph_number_psor is not None:
                morphology.append(morph_number_psor)

            # PartForm
            participle = None
            if "PARTICIPLE" in analysis:
                participle = analysis["PARTICIPLE"]
                morph_part_form = self.voikko_part_form.get(participle)
                if morph_part_form:
                    morphology.append(morph_part_form)

            # Person
            person = None
            if "PERSON" in analysis:
                person = analysis["PERSON"]
                if person in ("0", "1", "2", "3"):
                    morphology.append(f"Person={person}")

            # Person[psor]
            if morph_person_psor is not None:
                morphology.append(morph_person_psor)

            # Polarity
            if vclass == "kieltosana":
                morphology.append("Polarity=Neg")

            # Tense
            if "TENSE" in analysis:
                morphology.append(self.voikko_tense[analysis["TENSE"]])

            # VerbForm
            if mood in self.infinite_moods:
                morphology.append("VerbForm=Inf")
            elif participle is not None:
                morphology.append("VerbForm=Part")
            else:
                morphology.append("VerbForm=Fin")

            # Voice
            if person in ("0", "1", "2", "3"):
                morphology.append("Voice=Act")
            elif person == "4":
                morphology.append("Voice=Pass")
            elif "VOICE" in analysis:
                morphology.append(f"Voice={analysis['VOICE']}")
            elif participle == "past_passive":
                morphology.append("Voice=Pass")
            elif participle in ("present_active", "past_active", "present_passive"):
                morphology.append("Voice=Act")

        elif token.pos == ADV:
            # Abbr
            if "CLASS" in analysis and analysis["CLASS"] == "lyhenne":
                morphology.append("Abbr=Yes")

            # Clitic
            if morph_clitic is not None:
                morphology.append(morph_clitic)

            # Degree
            if "COMPARISON" in analysis:
                degree = analysis["COMPARISON"]
                if degree in ("comparative", "superlative"):
                    morphology.append(self.voikko_degree[degree])

            # Number[psor]
            if morph_number_psor is not None:
                morphology.append(morph_number_psor)

            # Person[psor]
            if morph_person_psor is not None:
                morphology.append(morph_person_psor)

        elif token.pos == PRON:
            # Case
            if "SIJAMUOTO" in analysis:
                morphology.append(self.voikko_cases[analysis["SIJAMUOTO"]])

            # Clitic
            if morph_clitic is not None:
                morphology.append(morph_clitic)

            # Degree
            if "COMPARISON" in analysis:
                morphology.append(self.voikko_degree[analysis["COMPARISON"]])

            # Number
            if morph_number is not None:
                morphology.append(morph_number)

            # Number[psor]
            if morph_number_psor is not None:
                morphology.append(morph_number_psor)

            # Person
            if "PERSON" in analysis:
                person = analysis["PERSON"]
                if person in ("0", "1", "2", "3"):
                    morphology.append(f"Person={person}")

            # Person[psor]
            if morph_person_psor is not None:
                morphology.append(morph_person_psor)

            # PronType
            if "PRONTYPE" in analysis:
                morphology.append(f"PronType={analysis['PRONTYPE']}")

            # Reflex
            if "BASEFORM" in analysis and analysis["BASEFORM"] == "itse":
                morphology.append("Reflex=Yes")

        elif token.pos in (CCONJ, SCONJ):
            # Clitic
            if morph_clitic is not None:
                morphology.append(morph_clitic)

        elif token.pos == NUM:
            # Abbr
            if "CLASS" in analysis and analysis["CLASS"] == "lyhenne":
                morphology.append("Abbr=Yes")

            # Case
            if "SIJAMUOTO" in analysis:
                morphology.append(self.voikko_cases[analysis["SIJAMUOTO"]])

            # Clitic
            if morph_clitic is not None:
                morphology.append(morph_clitic)

            # Number
            if morph_number is not None:
                morphology.append(morph_number)

            # NumType
            if "NUMTYPE" in analysis:
                morphology.append(f'NumType={analysis["NUMTYPE"]}')

        elif token.pos == ADP:
            # AdpType
            if "ADPTYPE" in analysis:
                morphology.append(f"AdpType={analysis['ADPTYPE']}")

            # Clitic
            if morph_clitic is not None:
                morphology.append(morph_clitic)

            # Number[psor]
            if morph_number_psor is not None:
                morphology.append(morph_number_psor)

            # Person[psor]
            if morph_person_psor is not None:
                morphology.append(morph_person_psor)

        elif token.pos == SYM:
            # Case
            if "SIJAMUOTO" in analysis:
                morphology.append(self.voikko_cases[analysis["SIJAMUOTO"]])

        elif token.pos == X:
            # Foreign
            if token.tag == self.foreign_tag:
                morphology.append('Foreign=Yes')

        return "|".join(morphology) if morphology else None

    def lemmatize(self, token: Token, analysis: dict) -> str:
        cached_lower = None
        exc_table = self.lookups.get_table("lemma_exc", {}).get(token.pos)
        if exc_table is not None:
            cached_lower = token.orth_.lower()
            exc = exc_table.get(cached_lower)
            if exc:
                return exc

        # Some exceptions to Voikko's lemmatization algorithm to
        # better match UD lemmas
        if token.pos in (AUX, VERB) and "PARTICIPLE" in analysis:
            return self._participle_lemma(analysis)
        elif token.pos == NOUN and analysis.get("MOOD") == "MINEN-infinitive":
            return self._minen_noun_lemma(analysis)
        elif token.pos in (NOUN, NUM, PROPN) and (colon_i := token.orth_.find(":")) > 0:
            # Lemma of inflected abbreviations: BBC:n, EU:ssa
            return token.orth_[:colon_i]
        elif token.pos == ADV:
            cached_lower = cached_lower or token.orth_.lower()
            return self._adv_lemma(analysis, cached_lower)

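# A minimal wiring sketch for the component above, assuming a Finnish vocab and
# that libvoikko with its Finnish dictionary is installed; the table contents
# below are illustrative placeholders, not the project's real data.
import spacy
from spacy.lookups import Lookups

nlp = spacy.blank("fi")
morphologizer = MorphologizerLemmatizer(nlp.vocab)

lookups = Lookups()
lookups.add_table("lemma_exc", {})           # pos -> {lowercased form: lemma}
lookups.add_table("morphologizer_exc", {})   # pos -> {lowercased form: morph string}
morphologizer.initialize(lookups=lookups)
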
def create_lemmatizer():
    lookups = Lookups()
    with open("lookups/fi_lemma_exc.json") as f:
        lookups.add_table("lemma_exc", json.load(f))
    return FinnishLemmatizer(lookups)

def P(T):
    import pandas as pd
    import emoji  # checking if a character is an emoji
    from collections import Counter

    # remove the formatting of source
    T['source'] = T['source'].str.lower()
    T['source'] = T['source'].str.findall('>([^<]+?)<').apply(
        lambda x: x[0] if len(x) >= 1 else '')

    # import location dictionary and generate country
    T['location'] = [
        T.loc[k, 'place']['country_code']
        if not pd.isnull(T.loc[k, 'place']) else i['location']
        for k, i in enumerate(T['user'])
    ]
    Trans = pd.read_csv(
        '/Users/livi/Documents/2020 Fall/data mining/Proposal/Tweepy related files/transloc.csv',
        index_col=0)
    Trans['googlemap'] = Trans['googlemap'].apply(eval)
    Trans.set_index('UserInfo', inplace=True)
    locdict = Trans.T.to_dict('records')
    locdict = locdict[0]
    kys = list(locdict.keys())
    for k in kys:
        if locdict[k] == None:
            del locdict[k]
        elif len(locdict[k]) != 0:
            if 'address_components' in locdict[k][0]:
                for ii in locdict[k][0]['address_components']:
                    if 'country' in ii['types']:
                        locdict[k] = ii['long_name']
            else:
                del locdict[k]
        elif len(locdict[k]) > 1:
            if 'address_components' in locdict[k][1]:
                for ii in locdict[k][1]['address_components']:
                    if 'country' in ii['types']:
                        locdict[k] = ii['long_name']
            else:
                del locdict[k]
        else:
            del locdict[k]

    ## Generate the column
    l = []
    for i in T['location']:
        try:
            l.append(locdict[i])
        except:
            l.append(float('nan'))
    T['CountryCode'] = l
    print('Finish Generate Country Code')

    # Generate Extended tweets and SDGs
    for i in range(len(T)):
        quote = None
        comment = None
        # prepare quote part
        if not pd.isnull(T.loc[i, 'quoted_status']):
            try:
                quote = T.loc[i, 'quoted_status']['extended_tweet']['full_text']
            except:
                quote = T.loc[i, 'quoted_status']['text']
                # print('no extended_tweet for quote', i)
        # prepare comment part
        if pd.isnull(T.loc[i, 'extended_tweet']):
            if pd.isnull(T.loc[i, 'retweeted_status']):
                try:
                    comment = T.loc[i, 'text']
                except:
                    print('no text', i)
            else:
                try:
                    comment = T.loc[i, 'retweeted_status']['extended_tweet']['full_text']
                except:
                    comment = T.loc[i, 'retweeted_status']['text']
                    # print('no extended_tweet for retweeted status', i)
        else:
            try:
                comment = T.loc[i, 'extended_tweet']['full_text']
            except:
                print('no extended_tweet', i)
        # combine quote and comments
        if pd.isnull(quote):
            T.loc[i, 'extended_tweet'] = comment
        else:
            T.loc[i, 'extended_tweet'] = '\"' + comment + ' \" ' + quote

    ## remove some useless information
    T['extended_tweet'] = T['extended_tweet'].str.replace("http\S+", "")
    # T['extended_tweet'] = T['extended_tweet'].str.replace("@\S+", "")
    T['extended_tweet'] = T['extended_tweet'].str.replace("&", "")
    print('Finish Generate Extended Tweets')

    T = T.reset_index(drop=True)
    T['extended_tweet'] = T['extended_tweet'].str.lower()
    T['SDG'] = T['extended_tweet'].str.upper()
    T['SDG'] = T['SDG'].str.findall('(SDG\d+)')
    print('Finish Generate SDGs')

    # Generate User Information and hashtags
    T['id'] = [i['id'] for i in T['user']]
    # T['name'] = [i['name'] for i in T['user']]
    T['screen_name'] = [i['screen_name'] for i in T['user']]
    T['url'] = [i['url'] for i in T['user']]
    T['friends_count'] = T['user'].apply(lambda x: x['friends_count'])
    T['followers_count'] = T['user'].apply(lambda x: x['followers_count'])
    T['hashtags'] = T['extended_tweet'].str.findall('#\S+')
    print('Finish Generate UserInfo and Hashtags')

    # Prepare lemmatized analysis and tokenized extended tweets
    def char_is_emoji(character):
        return character in emoji.UNICODE_EMOJI

    # does the text contain an emoji?
    def text_has_emoji(text):
        for character in text:
            if character in emoji.UNICODE_EMOJI:
                return True
        return False

    # remove the emoji
    def deEmojify(inputString):
        return inputString.encode('ascii', 'ignore').decode('ascii')

    T['extended_tweet'] = T['extended_tweet'].apply(lambda x: deEmojify(x))

    import spacy
    from spacy.lemmatizer import Lemmatizer
    from spacy.lookups import Lookups
    sp = spacy.load('en')
    lookups = Lookups()
    lemm = Lemmatizer(lookups)

    def lemma_function(text):
        dummy = []
        # this is just a test to see if it works
        for word in sp(text):
            dummy.append(word.lemma_)
        return ' '.join(dummy)

    T['extended_tweet_lemmatized'] = T['extended_tweet'].apply(
        lambda x: lemma_function(x))
    T['extended_tweet_lemmatized'] = T['extended_tweet_lemmatized'].apply(
        lambda x: x.replace('-PRON-', ''))
    print('Finish deemoji and lemmatization')

    # Generate Sentiment Scores
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
    analyser = SentimentIntensityAnalyzer()

    def sentiment_analyzer_scores(sentence):
        score = analyser.polarity_scores(sentence)
        print("{:-<40} {}".format(sentence, str(score)))

    T['neg'] = T['extended_tweet_lemmatized'].apply(
        lambda x: analyser.polarity_scores(x)['neg'])
    T['neu'] = T['extended_tweet_lemmatized'].apply(
        lambda x: analyser.polarity_scores(x)['neu'])
    T['pos'] = T['extended_tweet_lemmatized'].apply(
        lambda x: analyser.polarity_scores(x)['pos'])
    T['compound'] = T['extended_tweet_lemmatized'].apply(
        lambda x: analyser.polarity_scores(x)['compound'])
    print('Finish Generate Sentiment Score')
    return T

def morphology():
    lemmatizer = Lemmatizer(Lookups())
    return Morphology(StringStore(), {}, lemmatizer)