Example #1
def process_text(corpus):
    import hu_core_ud_lg

    nlp = hu_core_ud_lg.load()  # load the Hungarian model (slow; reloaded on every call)
    doc = nlp(corpus)
    # Map each remaining token to its lemma, skipping punctuation, numerals and stop words.
    return {
        str(token): str(token.lemma_)
        for token in doc
        if token.tag_ not in ['PUNCT', 'NUM'] and not token.is_stop
    }
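# Minimal usage sketch (not part of the original snippet): the input sentence is
# made up for illustration, and the exact output depends on the model version.
lemmas = process_text("A kutyák a kertben játszottak.")
print(lemmas)  # e.g. {'kutyák': 'kutya', 'kertben': 'kert', 'játszottak': 'játszik'}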
Example #2
import pickle

import spacy
from spacy.pipeline import EntityRuler
import hu_core_ud_lg

with open("data/interim/entities.p", "rb") as f:
    patterns = pickle.load(f)

patterns = [e for e in patterns if len(e["pattern"]) > 0]  # drop entries with empty patterns

nlp = hu_core_ud_lg.load()
ruler = EntityRuler(nlp)

ruler.add_patterns(patterns)
nlp.add_pipe(ruler)  # add the rule-based matcher to the pipeline
ruler.to_disk("data/processed/patterns.jsonl")  # save the ruler's patterns as JSONL

# test it
doc = nlp(
    "Magyarország miniszterelnöke Orbán Viktor nem határos az Amerikai Egyesült Államokkal."
)
print([(ent.text, ent.label_) for ent in doc.ents])

doc = nlp("Gyurcsány Ferenc, Orbán Viktor és Antal József miniszterelnökök.")
print([(ent.text, ent.label_) for ent in doc.ents])

doc = nlp(
    "Nagy port kavart Márki-Zay Péter hódmezővásárhelyi polgármester, független ellenzéki politikus egy múlt heti interjúja, amiben hosszasan beszélt arról, hogy szerinte elfogadható (sőt szükséges), ha a szülő testi fenyítést alkalmaz gyerekeivel szemben fegyelmezésként."
)
print([(ent.text, ent.label_) for ent in doc.ents])
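# Hedged sketch of the reverse step: reloading the saved patterns in a later
# session. Assumes EntityRuler.from_disk is available in the installed spaCy 2.x
# version (it is the counterpart of the to_disk call above); the path matches
# the one used above, and the test sentence is only an illustration.
import hu_core_ud_lg
from spacy.pipeline import EntityRuler

nlp = hu_core_ud_lg.load()
ruler = EntityRuler(nlp).from_disk("data/processed/patterns.jsonl")
nlp.add_pipe(ruler)

doc = nlp("Orbán Viktor Magyarország miniszterelnöke.")
print([(ent.text, ent.label_) for ent in doc.ents])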
Example #3
                word.lemma_,  # LEMMA
                word.pos_,  # UPOSTAG
                "_",  # XPOSTAG
                "_",  # FEATS
                str(head_idx),  # HEAD
                word.dep_,  # DEPREL
                "_",  # DEPS
                "_",  # MISC
            )
            res.append("\t".join(linetuple))

        res.append("")
    return "\n".join(res) + "\n"


nlp = hu_core_ud_lg.load()  # Load models

print(nlp.pipeline)  # Print loaded modules

nlp.remove_pipe("tagger")
nlp.remove_pipe("lemmatizer")
#nlp.remove_pipe("parser")    # Remove modules on demand
print(nlp.pipeline)

doc = nlp.make_doc('Ennek kell lennie a példamondatnak. Ez egy másik.')  # Create Document from text.
for name, proc in nlp.pipeline:  # iterate over components in order
    doc = proc(doc)
    print(name)

print(format_as_conllu(doc))  # Retrieve the result in CoNLL-U
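# The top of format_as_conllu is cut off in this snippet. Below is a self-contained
# sketch of what such a converter typically looks like; the sentence loop and the
# ID/FORM/HEAD handling are assumptions inferred from the visible tail of the
# function, not the author's original code.
def format_as_conllu_sketch(doc):
    res = []
    for sent in doc.sents:
        for i, word in enumerate(sent):
            # HEAD is 0 for the sentence root, otherwise the 1-based index of the head.
            head_idx = 0 if word.head is word else word.head.i - sent[0].i + 1
            linetuple = (
                str(i + 1),     # ID
                word.text,      # FORM
                word.lemma_,    # LEMMA
                word.pos_,      # UPOSTAG
                "_",            # XPOSTAG
                "_",            # FEATS
                str(head_idx),  # HEAD
                word.dep_,      # DEPREL
                "_",            # DEPS
                "_",            # MISC
            )
            res.append("\t".join(linetuple))
        res.append("")  # blank line terminates each sentence block
    return "\n".join(res) + "\n"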
Example #4
    ----------
    Cleaned docs as list of lists

    '''
    all_tokens = sum(df[tokens_col], [])
    tokens_once = set(word for word in set(all_tokens)
                      if all_tokens.count(word) == 1)
    tokenized_no_single = [[word for word in text if word not in tokens_once]
                           for text in df[tokens_col]]

    return tokenized_no_single
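
# Note (added): all_tokens.count(word) runs once per unique word, which makes
# remove_single quadratic on large corpora. A functionally equivalent, faster
# sketch using collections.Counter (not part of the original script):
from collections import Counter

def remove_single_fast(df, tokens_col):
    # Count every token once, then drop tokens that occur only once in the corpus.
    counts = Counter(tok for tokens in df[tokens_col] for tok in tokens)
    return [[word for word in text if counts[word] > 1] for text in df[tokens_col]]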


if __name__ == '__main__':
    # Load hu model
    nlp = hu.load()

    # Load corpus
    path_data = Path('../../data')
    df = pd.read_csv(path_data / 'processed' / 'input_data.csv')

    # Run tokenizer
    df['tokenized_raw'] = df['text'].apply(
        lambda x: tokenize_1gram(clean(x), model=nlp, ents=['PER', 'LOC']))
    df['tokenized_raw_cnt'] = df['tokenized_raw'].apply(lambda x: len(x))
    df['tokenized_mults'] = remove_single(df, 'tokenized_raw')
    df['tokenized_mults_cnt'] = df['tokenized_mults'].apply(lambda x: len(x))

    # Drop extra stops if available
    try:
        extra_stops = pd.read_csv(path_data / 'processed' / 'extra_stops.csv')
Example #5
    def __init__(self, vector_output=False):
        self.vector_output = vector_output
        self.nlp = hu_core_ud_lg.load()  # load the Hungarian pipeline once at construction
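    # The enclosing class is cut off in this snippet. A plausible, purely
    # hypothetical continuation, assuming a scikit-learn-style transformer:
    # depending on vector_output it yields either document vectors or lemmas.
    # The method below is illustrative, not the author's original code.
    def transform(self, texts):
        docs = [self.nlp(text) for text in texts]
        if self.vector_output:
            # One dense document vector per input text.
            return [doc.vector for doc in docs]
        # Otherwise: lemmatized tokens per text, punctuation skipped.
        return [[tok.lemma_ for tok in doc if not tok.is_punct] for doc in docs]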