import nltk
import nl_core_news_sm


def save_noun(doc_clean, language):
    """Keep only nouns in each document of doc_clean.

    For English, doc_clean is expected to contain token lists;
    for Dutch, space-separated strings.
    """
    if language == 'en':
        new_doc = []
        for doc in doc_clean:
            tagged = nltk.pos_tag(doc)
            # Keep only tokens whose Penn Treebank tag starts with "N" (nouns).
            nouns = [word for (word, tag) in tagged if tag.startswith("N")]
            new_doc.append(nouns)
        return new_doc
    elif language == 'nl':
        nlp = nl_core_news_sm.load()
        # Manually chosen (stemmed) Dutch tokens that should never be kept.
        stop_words = [
            "wer", "wel", "mooi", "onz", "gan", "mak",
            "waarschijn", "leuk", "hel",
        ]
        new_doc = []
        for doc in doc_clean:
            doc_for_tagging = nlp(doc)
            for word in doc_for_tagging:
                # Drop the word if it is in the manual stop list or is not a noun.
                if str(word) in stop_words or word.pos_ != "NOUN":
                    doc = doc.replace(str(word) + " ", "")
            new_doc.append(doc)
        return new_doc
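# Hedged usage sketch for save_noun: the sample documents below are assumptions
# for illustration, not taken from the original project. nltk.pos_tag also needs
# nltk.download('averaged_perceptron_tagger') to have been run once.
english_docs = [["the", "cat", "sat", "on", "the", "mat"]]
dutch_docs = ["de kat zat op de mat "]
print(save_noun(english_docs, 'en'))   # e.g. [['cat', 'mat']]
print(save_noun(dutch_docs, 'nl'))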
import nl_core_news_sm
from spacy.symbols import NOUN


def extract_tag(text):
    """Return lowercased "<token> <head>" pairs for tokens whose head is a noun."""
    nouns = []
    nlp = nl_core_news_sm.load()
    doc = nlp(text)
    for token in doc:
        # token.head.pos is an integer ID, so compare against the spaCy symbol NOUN.
        if token.head.pos == NOUN:
            nouns.append(("%s %s" % (token.text, token.head.text)).lower())
    return nouns
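# Hedged usage sketch for extract_tag; the example sentence is an assumption.
# Note that loading the model on every call is slow; for many texts you would
# normally load nl_core_news_sm once and reuse it.
pairs = extract_tag("De zwarte kat slaapt op de oude stoel.")
print(pairs)  # pairs such as 'zwarte kat' and 'oude stoel', depending on the parse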
# (fragment: the first lines close out the evaluation loop of an evaluate()
#  function; the criterion call they belong to is not shown in this excerpt)
                             trg[1:].view(-1))

            epoch_loss += loss.item()

    return epoch_loss / len(iterator)


if __name__ == '__main__':
    SEED = 1234

    random.seed(SEED)
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    torch.backends.cudnn.deterministic = True

    Multi30k.download('data')

    spacy_de = nl_core_news_sm.load()
    spacy_en = en_core_web_sm.load()

    SRC = Field(tokenize=tokenize_de, init_token='<sos>', eos_token='<eos>', lower=True)
    TRG = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>', lower=True)

    train, valid, test = TranslationDataset.splits(
        path='./data/multi30k/', exts=['.de', '.en'],
        fields=[('src', SRC), ('trg', TRG)],
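# The Field objects above reference tokenize_de and tokenize_en, which are not
# shown in this fragment. A minimal sketch of what they might look like, assuming
# spacy_de / spacy_en are the pipelines loaded in the main block; reversing the
# source tokens is an assumption borrowed from the common seq2seq setup, not
# necessarily what this script does.
def tokenize_de(text):
    # Tokenize the source text and reverse the token order.
    return [tok.text for tok in spacy_de.tokenizer(text)][::-1]


def tokenize_en(text):
    # Tokenize the target text in normal order.
    return [tok.text for tok in spacy_en.tokenizer(text)]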
import spacy

# wordless_conversion is assumed to be imported elsewhere in the module.


def check_spacy_models(main, lang, pipeline):
    if pipeline == 'word_tokenization':
        nlp_pipelines = []
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['sentence_tokenization', 'tokenization']:
        nlp_pipelines = ['sentencizer']
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['pos_tagging', 'lemmatization']:
        nlp_pipelines = ['tagger']
        nlp_disable = ['parser', 'ner']

    # Languages with models
    if lang in ['nld', 'eng', 'fra', 'deu', 'ell', 'ita', 'por', 'spa', 'other']:
        # Drop a cached pipeline whose components do not match the request.
        if f'spacy_nlp_{lang}' in main.__dict__:
            if main.__dict__[f'spacy_nlp_{lang}'].pipe_names != nlp_pipelines:
                del main.__dict__[f'spacy_nlp_{lang}']

        if f'spacy_nlp_{lang}' not in main.__dict__:
            # Dutch
            if lang == 'nld':
                import nl_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = nl_core_news_sm.load(disable=nlp_disable)
            # English
            elif lang == 'eng':
                import en_core_web_sm
                main.__dict__[f'spacy_nlp_{lang}'] = en_core_web_sm.load(disable=nlp_disable)
            # French
            elif lang == 'fra':
                import fr_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = fr_core_news_sm.load(disable=nlp_disable)
            # German
            elif lang == 'deu':
                import de_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = de_core_news_sm.load(disable=nlp_disable)
            # Greek (Modern)
            elif lang == 'ell':
                import el_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = el_core_news_sm.load(disable=nlp_disable)
            # Italian
            elif lang == 'ita':
                import it_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = it_core_news_sm.load(disable=nlp_disable)
            # Portuguese
            elif lang == 'por':
                import pt_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = pt_core_news_sm.load(disable=nlp_disable)
            # Spanish
            elif lang == 'spa':
                import es_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = es_core_news_sm.load(disable=nlp_disable)
            # Other languages (fall back to the English model)
            elif lang == 'other':
                import en_core_web_sm
                main.__dict__[f'spacy_nlp_{lang}'] = en_core_web_sm.load(disable=nlp_disable)
    # Languages without models
    else:
        # Serbian (Cyrillic) & Serbian (Latin); 'sr' is the ISO 639-1 code for Serbian.
        if lang in ['srp_cyrl', 'srp_latn']:
            main.__dict__['spacy_nlp_srp_cyrl'] = spacy.blank('sr')
            main.__dict__['spacy_nlp_srp_latn'] = spacy.blank('sr')
        else:
            main.__dict__[f'spacy_nlp_{lang}'] = spacy.blank(
                wordless_conversion.to_iso_639_1(main, lang))

    if 'sentencizer' in nlp_pipelines:
        nlp = main.__dict__[f'spacy_nlp_{lang}']

        if 'sentencizer' not in nlp.pipe_names:
            nlp.add_pipe(nlp.create_pipe('sentencizer'))
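# Hedged usage sketch: check_spacy_models caches pipelines as attributes of a
# `main` object. The SimpleNamespace stand-in below is an assumption for
# illustration; in the real application `main` is the main window object.
from types import SimpleNamespace

main = SimpleNamespace()
check_spacy_models(main, lang='nld', pipeline='pos_tagging')
doc = main.spacy_nlp_nld('Dit is een zin.')
print([(token.text, token.pos_) for token in doc])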
def check_spacy_models(main, lang, pipeline):
    if lang == 'other':
        lang = 'eng'

    if pipeline == 'word_tokenization':
        nlp_pipelines = []
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['tokenization', 'sentence_tokenization']:
        nlp_pipelines = ['sbd']
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['pos_tagging', 'lemmatization']:
        nlp_pipelines = ['tagger']
        nlp_disable = ['parser', 'ner']

    if lang in ['nld', 'eng', 'fra', 'deu', 'ell', 'ita', 'por', 'spa']:
        # Drop a cached pipeline whose components do not match the request.
        if f'spacy_nlp_{lang}' in main.__dict__:
            if main.__dict__[f'spacy_nlp_{lang}'].pipe_names != nlp_pipelines:
                del main.__dict__[f'spacy_nlp_{lang}']

        if f'spacy_nlp_{lang}' not in main.__dict__:
            # Dutch
            if lang == 'nld':
                import nl_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = nl_core_news_sm.load(disable=nlp_disable)
            # English
            elif lang == 'eng':
                import en_core_web_sm
                main.__dict__[f'spacy_nlp_{lang}'] = en_core_web_sm.load(disable=nlp_disable)
            # French
            elif lang == 'fra':
                import fr_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = fr_core_news_sm.load(disable=nlp_disable)
            # German
            elif lang == 'deu':
                import de_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = de_core_news_sm.load(disable=nlp_disable)
            # Greek (Modern)
            elif lang == 'ell':
                import el_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = el_core_news_sm.load(disable=nlp_disable)
            # Italian
            elif lang == 'ita':
                import it_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = it_core_news_sm.load(disable=nlp_disable)
            # Portuguese
            elif lang == 'por':
                import pt_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = pt_core_news_sm.load(disable=nlp_disable)
            # Spanish
            elif lang == 'spa':
                import es_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = es_core_news_sm.load(disable=nlp_disable)

    if 'sbd' in nlp_pipelines:
        nlp = main.__dict__[f'spacy_nlp_{lang}']

        if 'sbd' not in nlp.pipe_names:
            nlp.add_pipe(nlp.create_pipe('sentencizer'))
import nltk
from nltk.stem import WordNetLemmatizer
from html.parser import HTMLParser

import en_core_web_sm
nlp_en = en_core_web_sm.load()
import de_core_news_sm
nlp_de = de_core_news_sm.load()
import es_core_news_sm
nlp_es = es_core_news_sm.load()
import it_core_news_sm
nlp_it = it_core_news_sm.load()
import pt_core_news_sm
nlp_pt = pt_core_news_sm.load()
import nl_core_news_sm
nlp_nl = nl_core_news_sm.load()

# global variables
wnl = WordNetLemmatizer()
html_parser = HTMLParser()
stopword_list = []
language = ""


def init_lib(lang):
    global stopword_list, language

    # Download the NLTK resources needed for stop words, lemmatization and tokenization.
    nltk.download('stopwords')
    nltk.download('wordnet')
    nltk.download('punkt')

    language = lang
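# Hedged usage sketch: in this excerpt init_lib only records the language and
# fetches NLTK data; the stop-word list is presumably filled elsewhere in the
# module. The call below is an illustration, not part of the original code.
init_lib('en')
print(language)  # 'en'
print([token.pos_ for token in nlp_en('Loading all six models up front is memory-hungry.')])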
import pandas as pd
import csv
import spacy
import json
from spacy import displacy
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
import nl_core_news_sm

nlp = nl_core_news_sm.load()

df = pd.read_csv('dutch-news-articles.csv')
categories = df['category'].unique()


def export_named_entities(categories):
    for category in categories:
        df_category = df.loc[df['category'] == category]
        all_named_entities = df_category['content'].apply(get_all_named_entities)
        sum_named_entities = count_named_entities(all_named_entities)
        create_json(category, sum_named_entities)


def get_all_named_entities(row):
    doc = nlp(row)
    items = []
    for entity in doc.ents:
        # Skip numeric, temporal and monetary entity types.
        if entity.label_ not in {'CARDINAL', 'DATE', 'QUANTITY', 'TIME',
                                 'ORDINAL', 'PERCENT', 'MONEY'}:
            items.append(entity.text)
    print(items)
    return items
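# count_named_entities and create_json are referenced above but not shown in
# this excerpt. A minimal sketch of what they might look like, using the Counter
# and json imports already present; the output file name pattern is an assumption.
def count_named_entities(all_named_entities):
    # Flatten the per-article entity lists and count how often each entity occurs.
    counts = Counter()
    for entities in all_named_entities:
        counts.update(entities)
    return dict(counts.most_common())


def create_json(category, sum_named_entities):
    with open(f'named_entities_{category}.json', 'w', encoding='utf-8') as f:
        json.dump(sum_named_entities, f, ensure_ascii=False, indent=2)


# Illustrative entry point: export one JSON file of entity counts per category.
export_named_entities(categories)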