def process_text(corpus):
    '''Lemmatize a raw text with the Hungarian model and return a
    token -> lemma mapping, skipping punctuation, numerals and stop words.'''
    import hu_core_ud_lg

    # Note: the model is (re)loaded on every call; callers processing many
    # documents may want to load it once and reuse the nlp object.
    nlp = hu_core_ud_lg.load()
    doc = nlp(corpus)
    return {
        str(token): str(token.lemma_)
        for token in doc
        if token.tag_ not in ['PUNCT', 'NUM'] and not token.is_stop
    }
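

# Hedged usage sketch, not part of the original module: the sample sentence is
# illustrative and the exact lemmas depend on the hu_core_ud_lg model.
if __name__ == '__main__':
    lemmas = process_text('Budapest Magyarország fővárosa.')  # "Budapest is the capital of Hungary."
    print(lemmas)  # roughly {'Budapest': 'Budapest', 'Magyarország': 'Magyarország', 'fővárosa': 'főváros'}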
import pickle

import spacy
from spacy.pipeline import EntityRuler

import hu_core_ud_lg

# Load the previously extracted entity patterns and drop the empty ones.
with open("data/interim/entities.p", "rb") as f:
    patterns = pickle.load(f)
patterns = [e for e in patterns if len(e["pattern"]) > 0]

# Build the pipeline: Hungarian model plus a rule-based entity matcher,
# then persist the patterns as JSONL.
nlp = hu_core_ud_lg.load()
ruler = EntityRuler(nlp)
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)
ruler.to_disk("data/processed/patterns.jsonl")

# Test it.
doc = nlp(
    "Magyarország miniszterelnöke Orbán Viktor nem határos az Amerikai Egyesült Államokkal."
)  # "Hungary's prime minister Orbán Viktor does not border the United States of America."
print([(ent.text, ent.label_) for ent in doc.ents])

doc = nlp("Gyurcsány Ferenc, Orbán Viktor és Antal József miniszterelnökök.")
# "Gyurcsány Ferenc, Orbán Viktor and Antal József are prime ministers."
print([(ent.text, ent.label_) for ent in doc.ents])

doc = nlp(
    "Nagy port kavart Márki-Zay Péter hódmezővásárhelyi polgármester, független ellenzéki politikus egy múlt heti interjúja, amiben hosszasan beszélt arról, hogy szerinte elfogadható (sőt szükséges), ha a szülő testi fenyítést alkalmaz gyerekeivel szemben fegyelmezésként."
)  # A longer news-style sentence about Márki-Zay Péter, mayor of Hódmezővásárhely.
print([(ent.text, ent.label_) for ent in doc.ents])
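

# Hedged variant, not in the original script: nlp.add_pipe(ruler) appends the
# ruler to the end of the pipeline, so by default its patterns do not overwrite
# entities already predicted by the statistical model. Assuming the loaded
# pipeline contains a "ner" component, the spaCy 2.x call below would place the
# ruler in front of it and let the hand-written patterns take precedence
# (the *_alt names are illustrative only).
nlp_alt = hu_core_ud_lg.load()
ruler_alt = EntityRuler(nlp_alt, overwrite_ents=True)  # patterns may replace model entities
ruler_alt.add_patterns(patterns)
nlp_alt.add_pipe(ruler_alt, before="ner")              # run the ruler before the statistical NER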
                word.lemma_,    # LEMMA
                word.pos_,      # UPOSTAG
                "_",            # XPOSTAG
                "_",            # FEATS
                str(head_idx),  # HEAD
                word.dep_,      # DEPREL
                "_",            # DEPS
                "_",            # MISC
            )
            res.append("\t".join(linetuple))
        res.append("")
    return "\n".join(res) + "\n"


nlp = hu_core_ud_lg.load()  # Load models
print(nlp.pipeline)         # Print loaded modules

nlp.remove_pipe("tagger")
nlp.remove_pipe("lemmatizer")
# nlp.remove_pipe("parser")  # Remove modules on demand
print(nlp.pipeline)

# Create a Document from text (tokenization only, no annotation yet).
# Gloss: "This should be the example sentence. This is another one."
doc = nlp.make_doc('Ennek kell lennie a példamondatnak. Ez egy másik.')

for name, proc in nlp.pipeline:  # Iterate over the remaining components in order
    doc = proc(doc)
    print(name)

print(format_as_conllu(doc))  # Retrieve the result in CoNLL-U
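

# Hedged follow-up, not part of the original example: one way to persist the
# CoNLL-U string; the output path is an assumption.
from pathlib import Path

Path("data/processed/sample.conllu").write_text(format_as_conllu(doc), encoding="utf-8")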
    ----------
    Cleaned docs as list of lists
    '''
    # Note: all_tokens.count(word) scans the whole corpus once per unique word
    # (quadratic overall); a collections.Counter would give the same counts in
    # a single pass.
    all_tokens = sum(df[tokens_col], [])
    tokens_once = set(word for word in set(all_tokens)
                      if all_tokens.count(word) == 1)
    tokenized_no_single = [[word for word in text if word not in tokens_once]
                           for text in df[tokens_col]]
    return tokenized_no_single


if __name__ == '__main__':

    # Load hu model
    nlp = hu.load()

    # Load corpus
    path_data = Path('../../data')
    df = pd.read_csv(path_data / 'processed' / 'input_data.csv')

    # Run tokenizer
    df['tokenized_raw'] = df['text'].apply(
        lambda x: tokenize_1gram(clean(x), model=nlp, ents=['PER', 'LOC']))
    df['tokenized_raw_cnt'] = df['tokenized_raw'].apply(lambda x: len(x))
    df['tokenized_mults'] = remove_single(df, 'tokenized_raw')
    df['tokenized_mults_cnt'] = df['tokenized_mults'].apply(lambda x: len(x))

    # Drop extra stops if available
    try:
        extra_stops = pd.read_csv(path_data / 'processed' / 'extra_stops.csv')
    def __init__(self, vector_output=False):
        # Store the output-mode flag and load the Hungarian model once per instance.
        self.vector_output = vector_output
        self.nlp = hu_core_ud_lg.load()