# spaCy v2 variant of the regression test (create_pipe/add_pipe object API, begin_training()).
def test_issue2179():
    """Test that spurious 'extra_labels' aren't created when initializing NER."""
    nlp = Italian()
    ner = nlp.create_pipe("ner")
    ner.add_label("CITIZENSHIP")
    nlp.add_pipe(ner)
    nlp.begin_training()

    nlp2 = Italian()
    nlp2.add_pipe(nlp2.create_pipe("ner"))
    nlp2.from_bytes(nlp.to_bytes())
    assert "extra_labels" not in nlp2.get_pipe("ner").cfg
    assert nlp2.get_pipe("ner").labels == ("CITIZENSHIP",)
# spaCy v3 variant of the same test: add_pipe() takes a factory name, initialize()
# replaces begin_training(), and the target NER model is resized before deserializing.
def test_issue2179():
    """Test that spurious 'extra_labels' aren't created when initializing NER."""
    nlp = Italian()
    ner = nlp.add_pipe("ner")
    ner.add_label("CITIZENSHIP")
    nlp.initialize()

    nlp2 = Italian()
    nlp2.add_pipe("ner")
    assert len(nlp2.get_pipe("ner").labels) == 0
    model = nlp2.get_pipe("ner").model
    model.attrs["resize_output"](model, nlp.get_pipe("ner").moves.n_moves)
    nlp2.from_bytes(nlp.to_bytes())
    assert "extra_labels" not in nlp2.get_pipe("ner").cfg
    assert nlp2.get_pipe("ner").labels == ("CITIZENSHIP",)
import re

from nltk.stem.snowball import ItalianStemmer
from spacy.lang.it import Italian


def preprocess(text, NUM_DOCS, num_preprocessed, stemming):
    """Tokenize, filter and stem/lemmatize one document.

    `i` (progress counter) and `it_stopwords` (Italian stop-word set) are
    module-level globals defined elsewhere in the original script.
    """
    global i
    if i == 0:
        i = num_preprocessed
    i += 1
    result = []
    stemmer = ItalianStemmer()
    if i % 20 == 0:
        print(f"\t{i} out of {NUM_DOCS + num_preprocessed} documents preprocessed")
    nlp = Italian()
    # Keep only the body of the judgment: the text between "Sentenza" and "Lingua processuale".
    t1 = text.split("Lingua processuale")[0].split("Sentenza")[-1]
    # Strip apostrophes, guillemets, docket numbers, bare numbers, table rules and URLs.
    t1 = re.sub(
        r"’|'|«|»|\d{1,4}\/\d{1,4}\/(cee|ce)|\d+|---\|*|^(http:\/\/www\.|https:\/\/www\.|http:\/\/|https:\/\/)?[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?(\/.*)?$",
        " ",
        t1,
        flags=re.IGNORECASE,
    )
    doc = nlp(t1)
    for token in doc:
        if token.text.lower() not in it_stopwords and not token.is_punct | token.is_space and len(token) > 3:
            assert token.lang_ == "it"
            if stemming:
                result.append(stemmer.stem(word=token.text))
            else:
                result.append(token.lemma_.lower())
                if "'" in result[-1] or "’" in result[-1]:
                    raise Exception(f"Detected_ {token.lemma_}")
    return result
def init_resources(self):
    self.punctuation_pattern = re.compile("|".join(PUNCTUATION))
    self.stemmer = None
    stopwords_path = os.path.join(
        os.path.dirname(assistant_dialog_skill_analysis.__file__),
        "resources",
        self.language_code,
        "stopwords",
    )
    if self.language_code == "en":
        from spacy.lang.en import English
        self.tokenizer = Tokenizer(English().vocab)
        self.stemmer = SnowballStemmer(language="english")
        self.stop_words = self.load_stop_words(stopwords_path)
    elif self.language_code == "fr":
        from spacy.lang.fr import French
        self.tokenizer = Tokenizer(French().vocab)
        self.stemmer = SnowballStemmer(language="french")
        self.stop_words = self.load_stop_words(stopwords_path)
    elif self.language_code == "de":
        from spacy.lang.de import German
        self.tokenizer = Tokenizer(German().vocab)
        self.stemmer = SnowballStemmer(language="german")
        self.stop_words = self.load_stop_words(stopwords_path)
    elif self.language_code == "it":
        from spacy.lang.it import Italian
        self.tokenizer = Tokenizer(Italian().vocab)
        self.stemmer = SnowballStemmer(language="italian")
        self.stop_words = self.load_stop_words(stopwords_path)
    elif self.language_code == "cs":
        from spacy.lang.cs import Czech
        self.tokenizer = Tokenizer(Czech().vocab)
        self.stop_words = self.load_stop_words(stopwords_path)
    elif self.language_code == "pt":
        from spacy.lang.pt import Portuguese
        self.tokenizer = Tokenizer(Portuguese().vocab)
        self.stemmer = SnowballStemmer(language="portuguese")
        self.stop_words = self.load_stop_words(stopwords_path)
    elif self.language_code == "es":
        from spacy.lang.es import Spanish
        self.tokenizer = Tokenizer(Spanish().vocab)
        self.stemmer = SnowballStemmer(language="spanish")
        self.stop_words = self.load_stop_words(stopwords_path)
    else:
        raise Exception("language code %s is not supported" % self.language_code)
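A minimal standalone sketch of what the "it" branch above wires together, outside the surrounding class: a plain spaCy Tokenizer over the Italian vocab plus NLTK's Italian Snowball stemmer. Stop-word loading from the package resources is omitted and the sample sentence is illustrative, not from the original project.

# Hedged sketch, not part of the original project.
from nltk.stem.snowball import SnowballStemmer
from spacy.lang.it import Italian
from spacy.tokenizer import Tokenizer

tokenizer = Tokenizer(Italian().vocab)
stemmer = SnowballStemmer(language="italian")
tokens = [t.text for t in tokenizer("Questa è una frase di prova.")]
stems = [stemmer.stem(t) for t in tokens]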
def get_tokenizers(self, lang):
    os.environ['TOKENIZERS_PARALLELISM'] = "True"
    if lang == 'de':
        spacy = German()
        bert = "deepset/gbert-base"
    elif lang == 'fr':
        spacy = French()
        bert = "camembert/camembert-base-ccnet"
    elif lang == 'it':
        spacy = Italian()
        bert = "dbmdz/bert-base-italian-cased"
    else:
        raise ValueError(
            f"Please choose one of the following languages: {self.languages}"
        )
    return spacy.tokenizer, AutoTokenizer.from_pretrained(bert)
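A hedged standalone sketch of the Italian branch above: pairing the blank spaCy Italian tokenizer with the dbmdz Italian BERT tokenizer from Hugging Face transformers. The model is downloaded on first use and the sample sentence is illustrative.

# Hedged sketch, not part of the original project.
from spacy.lang.it import Italian
from transformers import AutoTokenizer

spacy_tok = Italian().tokenizer
bert_tok = AutoTokenizer.from_pretrained("dbmdz/bert-base-italian-cased")
words = [t.text for t in spacy_tok("Una frase di prova.")]
pieces = bert_tok.tokenize("Una frase di prova.")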
def lang_change(language):
    if language == 'en':
        from spacy.lang.en import English
        from spacy.lang.en.stop_words import STOP_WORDS
        parser = English()
        configfile_path = os.path.join(os.getcwd(), "config_files", "config_spacy_en.yaml")
    elif language == 'de':
        from spacy.lang.de import German
        from spacy.lang.de.stop_words import STOP_WORDS
        parser = German()
        configfile_path = os.path.join(os.getcwd(), "config_files", "config_spacy_de.yaml")
    elif language == 'es':
        from spacy.lang.es import Spanish
        from spacy.lang.es.stop_words import STOP_WORDS
        parser = Spanish()
        configfile_path = os.path.join(os.getcwd(), "config_files", "config_spacy_es.yaml")
    elif language == 'pt':
        from spacy.lang.pt import Portuguese
        from spacy.lang.pt.stop_words import STOP_WORDS
        parser = Portuguese()
        configfile_path = os.path.join(os.getcwd(), "config_files", "config_spacy_pt.yaml")
    elif language == 'fr':
        from spacy.lang.fr import French
        from spacy.lang.fr.stop_words import STOP_WORDS
        parser = French()
        configfile_path = os.path.join(os.getcwd(), "config_files", "config_spacy_fr.yaml")
    elif language == 'it':
        from spacy.lang.it import Italian
        from spacy.lang.it.stop_words import STOP_WORDS
        parser = Italian()
        configfile_path = os.path.join(os.getcwd(), "config_files", "config_spacy_it.yaml")
    elif language == 'nl':
        from spacy.lang.nl import Dutch
        from spacy.lang.nl.stop_words import STOP_WORDS
        parser = Dutch()
        configfile_path = os.path.join(os.getcwd(), "config_files", "config_spacy_nl.yaml")
    else:
        # Avoid an UnboundLocalError for unsupported language codes.
        raise ValueError(f"Unsupported language code: {language}")
    return parser, STOP_WORDS, configfile_path
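A hedged usage sketch of lang_change(): the returned blank pipeline tokenizes text and the stop-word set filters it. The config file is only located here, not parsed, and the sample sentence is illustrative.

# Hedged sketch, not part of the original project.
parser, stop_words, config_path = lang_change('it')
doc = parser("Questa è una frase di prova.")
content_tokens = [t.text for t in doc if t.text.lower() not in stop_words]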
# Fragment: `doc`, `average` and `i` here come from an earlier loop over the files
# in data/converted/ that is not included in this excerpt.
text = word_tokenize(open("data/converted/" + doc, "r", encoding='utf-8').read(), language='italian')
average += len(text)
print(average / i, i)

av1 = 0
av2 = 0
i = 0
for dic in os.listdir("data/.preprocessed"):
    year = pickle.load(open("data/.preprocessed/" + dic, "rb"))
    i += len(year)
    print(len(year))
    for doc in year:
        if dic[0] == 's':
            av1 += len(doc)
        else:
            av2 += len(doc)
print(av1 / i, av2 / i, i)

nlp = Italian()
doc = nlp(open("data/converted/61999CJ0001.txt", "r", encoding='utf-8').read())
tok = []
for t in doc:
    tok.append(t.lemma_.lower())
dictionary = Dictionary([tok])
def test_issue2482():
    """Test we can serialize and deserialize a blank NER or parser model."""
    nlp = Italian()
    nlp.add_pipe(nlp.create_pipe("ner"))
    b = nlp.to_bytes()
    Italian().from_bytes(b)
import random
import string

import xxhash
from datasketch import MinHash, MinHashLSH
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from spacy.lang.en import English
from spacy.lang.it import Italian
import whoosh.index as index
from tqdm import tqdm
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import QueryParser, OrGroup
from semantic_sim import SimServer

tokenize_it = Italian().Defaults.create_tokenizer()
tokenize_en = English().Defaults.create_tokenizer()
wnl = WordNetLemmatizer()
punct = string.punctuation.replace('.', '').replace(',', '')


def to_shingles(doc, k=5):
    """Return the character k-shingles of `doc` as a list of UTF-8 byte strings."""
    shingles = set()
    doc_string = doc.lower()
    if len(doc_string) <= k:
        # Pad very short documents with a random suffix so they still hash uniquely.
        doc_string = doc + 'no_txt_' + str(xxhash.xxh64(str(random.random())).hexdigest())
    for i in range(len(doc_string) - k + 1):
        h = doc_string[i:i + k]
        shingles.add(h.encode('utf8'))
    return list(shingles)
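A hedged sketch of how the character shingles above can feed datasketch, as the MinHash/MinHashLSH imports suggest; the num_perm and threshold values and the sample strings are illustrative, not taken from the original project.

# Hedged sketch, not part of the original project.
def minhash_of(doc, num_perm=128):
    m = MinHash(num_perm=num_perm)
    for shingle in to_shingles(doc):
        m.update(shingle)  # shingles are already UTF-8 bytes
    return m

lsh = MinHashLSH(threshold=0.8, num_perm=128)
lsh.insert("doc1", minhash_of("sentenza della corte di giustizia"))
similar = lsh.query(minhash_of("sentenza della corte di giustizia europea"))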
import sys
import string

import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from spacy.lang.it.stop_words import STOP_WORDS
from spacy.lang.it import Italian
from sklearn import svm
from sklearn import neural_network
from sklearn import metrics

punctuations = string.punctuation
nlp = spacy.load("it_core_news_sm")
stop_words = spacy.lang.it.stop_words.STOP_WORDS
parser = Italian()


# Custom transformer using spaCy
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        # Cleaning Text
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}
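A hedged sketch of how these pieces are typically assembled: the predictors cleaner feeding a bag-of-words vectorizer and an SVM. clean_text and spacy_tokenizer are defined later in the original script; the minimal stand-ins below are assumptions, not the project's implementations.

# Hedged sketch, not part of the original project.
def clean_text(text):
    # Stand-in cleaner: trim whitespace and lowercase.
    return text.strip().lower()

def spacy_tokenizer(sentence):
    # Stand-in tokenizer: blank Italian pipeline, stop words and punctuation removed.
    return [tok.text.lower() for tok in parser(sentence)
            if tok.text.lower() not in stop_words and tok.text not in punctuations]

pipe = Pipeline([
    ("cleaner", predictors()),
    ("vectorizer", CountVectorizer(tokenizer=spacy_tokenizer)),
    ("classifier", svm.LinearSVC()),
])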
# Fragment: the tail of load_stoplist(), which builds per-language stop-word sets
# inside a try block that starts before this excerpt.
        elif lang == "nl":
            return set(
                get_stop_words("nl") + stopwords.words('dutch') + STOP_LIST_NL)
    except:
        print("warning: no stopwords were downloaded. check nltk corpora")
        print(format_exc())
        return set()


# load resources
_stop_words = load_stoplist()
print("Loading spacy model...")
_spacy = English()
_spacy_fr = French()
_spacy_nl = Dutch()
_spacy_it = Italian()


def get_stoplist():
    return _stop_words


def lemmatize(text, lowercase=True, lang="en"):
    """ Return lemmatized text """
    if lang == "en":
        tokens = _spacy(text)
    elif lang == "fr":
        tokens = _spacy_fr(text)
    elif lang == "nl":
        tokens = _spacy_nl(text)
from spacy.lang.en import English
from spacy.lang.es import Spanish
from spacy.lang.fr import French
from spacy.lang.it import Italian
from spacy.lang.de import German
from spacy.lang.ru import Russian
from spacy.lang.zh import Chinese
from spacy.lang.ja import Japanese
from spacy.lang.ca import Catalan
from spacy.lang.eu import Basque

from DataHandler import load_df_twitter_sent, load_df_lorelei
from util import clean_str as test_clean_str
from nltk.corpus import stopwords
from util import identity_fn, lang2id

# Blank spaCy pipelines (tokenizer only), keyed by language name.
language_dict = {
    'english': English(),
    'spanish': Spanish(),
    'french': French(),
    'italian': Italian(),
    'german': German(),
    'russian': Russian(),
    'chinese': Chinese(),
    'japanese': Japanese(),
    'catalan': Catalan(),
    'basque': Basque(),
}


class Tokenizer:
    def __init__(self, language, tokenizer_method='spacy', remove_stopwords=True, lowercase=True,
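A hedged usage sketch of language_dict: picking the blank pipeline for a language and tokenizing with it. The Tokenizer class above is cut off mid-signature, so only the dictionary itself is exercised here; the sample sentence is illustrative.

# Hedged sketch, not part of the original project.
nlp_it = language_dict['italian']
tokens = [t.text.lower() for t in nlp_it("Una frase di prova in italiano.")]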