def read_mtl_file(domain, filename):
    X = []
    Y = []
    if domain == 'en':
        # tokenizer = WordPunctTokenizer()
        tokenizer = English().Defaults.create_tokenizer()
    elif domain == 'fr':
        # tokenizer = nltk.data.load('tokenizers/punkt/french.pickle')
        tokenizer = French().Defaults.create_tokenizer()
    elif domain == 'de':
        # tokenizer = nltk.data.load('tokenizers/punkt/german.pickle')
        tokenizer = German().Defaults.create_tokenizer()
    with open(filename, 'r', encoding='utf-8') as inf:
        for line in inf.readlines():
            parts = line.split('\t')
            if len(parts) == 3:  # labeled
                Y.append(int(float(parts[1])))
            elif len(parts) == 2:  # unlabeled
                Y.append(0)
            else:
                raise Exception('Unknown format')
            clean = clean_sentence(parts[-1])
            # if domain is 'en':
            #     words = word_tokenize(clean, language='english')
            # elif domain is 'fr':
            #     words = word_tokenize(clean, language='french')
            # elif domain is 'de':
            #     words = word_tokenize(clean, language='german')
            words = [str(e) for e in tokenizer(clean)]
            tmp = {}
            tmp['tokens'] = words
            tmp['sent'] = clean
            X.append(tmp)
    # Y = torch.LongTensor(Y).to(opt.device)
    return (X, Y)
def __init__(self): if LANGUAGE == "DE": from spacy.lang.de.stop_words import STOP_WORDS self.nlp = spacy.load('de_core_news_sm') self.domain_stopwords = ["Ausschreibung", "Bekanntmachung"] from spacy.lang.de import German self.parser = German() elif LANGUAGE == "EN": from spacy.lang.en.stop_words import STOP_WORDS self.nlp = spacy.load('en') self.domain_stopwords = [ "contract", "system", "service", "tender", "company", "notice", "procurement", "work", "include", "support", "approximately", "management", "agreement", "office", "solution", "manage", "product", "design", "program", "project", "supply", "trust", "equipment" ] from spacy.lang.en import English self.parser = English() else: raise Exception("unknown language") self.stopwords = list(STOP_WORDS) self.stopwords.extend(self.domain_stopwords) self.pipe = None
def test_pipe_factories_from_source_language_subclass():
    class CustomEnglishDefaults(English.Defaults):
        stop_words = set(["custom", "stop"])

    @registry.languages("custom_en")
    class CustomEnglish(English):
        lang = "custom_en"
        Defaults = CustomEnglishDefaults

    source_nlp = English()
    source_nlp.add_pipe("tagger")

    # custom subclass
    nlp = CustomEnglish()
    nlp.add_pipe("tagger", source=source_nlp)
    assert "tagger" in nlp.pipe_names

    # non-subclass
    nlp = German()
    nlp.add_pipe("tagger", source=source_nlp)
    assert "tagger" in nlp.pipe_names

    # mismatched vectors
    nlp = English()
    nlp.vocab.vectors.resize((1, 4))
    nlp.vocab.vectors.add("cat", vector=[1, 2, 3, 4])
    logger = logging.getLogger("spacy")
    with mock.patch.object(logger, "warning") as mock_warning:
        nlp.add_pipe("tagger", source=source_nlp)
        mock_warning.assert_called()
def test_issue3002():
    """Test that the tokenizer doesn't hang on a long list of dots"""
    nlp = German()
    doc = nlp(
        '880.794.982.218.444.893.023.439.794.626.120.190.780.624.990.275.671 ist eine lange Zahl'
    )
    assert len(doc) == 5
def bleu_scores_europarl(input_texts, target_texts, predict, parser=German()):
    assert len(input_texts) == len(target_texts)
    N = len(input_texts)

    # to handle short sequences, see also
    # http://www.nltk.org/_modules/nltk/translate/bleu_score.html#SmoothingFunction.method3
    chencherry = SmoothingFunction()

    def remove_spaces_and_puncts(tokens):
        return [
            token.orth_ for token in tokens
            if not (token.is_space or token.is_punct)
        ]

    bleu_scores = np.zeros(N)
    for i in tqdm(range(N)):
        ref_tokens = remove_spaces_and_puncts(parser(target_texts.iloc[i]))
        pred_tokens = remove_spaces_and_puncts(
            parser(predict(input_texts.iloc[i])))
        bleu_scores[i] = sentence_bleu([ref_tokens], pred_tokens,
                                       smoothing_function=chencherry.method3)
    return bleu_scores
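# A minimal, self-contained sketch of the smoothed sentence-level BLEU used above
# (assumes NLTK is installed; the token lists are just illustrative):
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

chencherry = SmoothingFunction()
score = sentence_bleu([["das", "ist", "ein", "kurzer", "satz"]],
                      ["das", "ist", "ein", "satz"],
                      smoothing_function=chencherry.method3)
print(score)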
def test_pipe_factories_from_source_language_subclass():
    class CustomEnglishDefaults(English.Defaults):
        stop_words = set(["custom", "stop"])

    @registry.languages("custom_en")
    class CustomEnglish(English):
        lang = "custom_en"
        Defaults = CustomEnglishDefaults

    source_nlp = English()
    source_nlp.add_pipe("tagger")

    # custom subclass
    nlp = CustomEnglish()
    nlp.add_pipe("tagger", source=source_nlp)
    assert "tagger" in nlp.pipe_names

    # non-subclass
    nlp = German()
    nlp.add_pipe("tagger", source=source_nlp)
    assert "tagger" in nlp.pipe_names

    # mismatched vectors
    nlp = English()
    nlp.vocab.vectors.resize((1, 4))
    nlp.vocab.vectors.add("cat", vector=[1, 2, 3, 4])
    with pytest.warns(UserWarning):
        nlp.add_pipe("tagger", source=source_nlp)
def init_resources(self):
    self.punctuation_pattern = re.compile("|".join(PUNCTUATION))
    self.stemmer = None
    stopwords_path = os.path.join(
        os.path.dirname(assistant_dialog_skill_analysis.__file__),
        "resources",
        self.language_code,
        "stopwords",
    )
    if self.language_code == "en":
        from spacy.lang.en import English
        self.tokenizer = Tokenizer(English().vocab)
        self.stemmer = SnowballStemmer(language="english")
        self.stop_words = self.load_stop_words(stopwords_path)
    elif self.language_code == "fr":
        from spacy.lang.fr import French
        self.tokenizer = Tokenizer(French().vocab)
        self.stemmer = SnowballStemmer(language="french")
        self.stop_words = self.load_stop_words(stopwords_path)
    elif self.language_code == "de":
        from spacy.lang.de import German
        self.tokenizer = Tokenizer(German().vocab)
        self.stemmer = SnowballStemmer(language="german")
        self.stop_words = self.load_stop_words(stopwords_path)
    elif self.language_code == "it":
        from spacy.lang.it import Italian
        self.tokenizer = Tokenizer(Italian().vocab)
        self.stemmer = SnowballStemmer(language="italian")
        self.stop_words = self.load_stop_words(stopwords_path)
    elif self.language_code == "cs":
        from spacy.lang.cs import Czech
        self.tokenizer = Tokenizer(Czech().vocab)
        self.stop_words = self.load_stop_words(stopwords_path)
    elif self.language_code == "pt":
        from spacy.lang.pt import Portuguese
        self.tokenizer = Tokenizer(Portuguese().vocab)
        self.stemmer = SnowballStemmer(language="portuguese")
        self.stop_words = self.load_stop_words(stopwords_path)
    elif self.language_code == "es":
        from spacy.lang.es import Spanish
        self.tokenizer = Tokenizer(Spanish().vocab)
        self.stemmer = SnowballStemmer(language="spanish")
        self.stop_words = self.load_stop_words(stopwords_path)
    else:
        raise Exception("language code %s is not supported" % self.language_code)
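# A minimal sketch of the Tokenizer-on-a-blank-vocab pattern used above (assumes
# spaCy is installed): constructed with only a vocab, the Tokenizer splits on
# whitespace and leaves punctuation attached to the neighbouring token.
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer

tokenizer = Tokenizer(English().vocab)
print([t.text for t in tokenizer("Hello world, this is a test.")])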
def get_spacy_base_model(lang: str):
    if lang == 'english':
        return English()
    elif lang == 'german':
        return German()
    else:
        raise AttributeError(
            f'Language {lang} not supported for spacy-based tokenization')
def __init__(self):
    self.domain_stopwords = ["Ausschreibung", "Bekanntmachung"]
    self.parser = German()
    self.stopwords = list(STOP_WORDS)
    self.stopwords.extend(self.domain_stopwords)
    self.fast_text_model = None
    self.svm_average_model = None
    self.create_new_model()
def test_pipe_factories_language_specific():
    """Test that language sub-classes can have their own factories, with
    fallbacks to the base factories."""
    name1 = "specific_component1"
    name2 = "specific_component2"
    Language.component(name1, func=lambda: "base")
    English.component(name1, func=lambda: "en")
    German.component(name2, func=lambda: "de")

    assert Language.has_factory(name1)
    assert not Language.has_factory(name2)
    assert English.has_factory(name1)
    assert not English.has_factory(name2)
    assert German.has_factory(name1)
    assert German.has_factory(name2)

    nlp = Language()
    assert nlp.create_pipe(name1)() == "base"
    with pytest.raises(ValueError):
        nlp.create_pipe(name2)

    nlp_en = English()
    assert nlp_en.create_pipe(name1)() == "en"
    with pytest.raises(ValueError):
        nlp_en.create_pipe(name2)

    nlp_de = German()
    assert nlp_de.create_pipe(name1)() == "base"
    assert nlp_de.create_pipe(name2)() == "de"
def __init__(self, filename):
    self._nlp = German()
    self._myfile = open(filename, encoding="utf8")
    self._text = self._nlp(
        re.sub(
            r'[^a-zA-Z_\s_\t]+', '',
            self._myfile.read().replace('\n', '').replace('\t', '').replace(' ', '')))
def __get_nlp__(self):
    if self._blank_model != '':
        # i.e. the pipeline has only the tokenizer; all other pipes have to be added manually
        return spacy.blank(self._blank_model)
    if self._model == 'en':
        return English()
    elif self._model == 'de':
        return German()
    else:
        return spacy.load(
            'en_core_web_sm') if self._model == '' else spacy.load(self._model)
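# Hedged aside: spacy.blank("de") and German() should both yield a blank German
# pipeline (tokenizer only); a quick check of that assumption:
import spacy
from spacy.lang.de import German

print(isinstance(spacy.blank("de"), German))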
def main():
    nlp = English()
    doc = nlp("This is a sentence.")
    print(doc.text)

    nlp = German()
    doc = nlp('Liebe Grüße!')
    print(doc.text)

    nlp = Spanish()
    doc = nlp('¿Cómo estás?')
    print(doc.text)
def main():
    nlp = spacy.load('en_core_web_sm')
    cat_hash = nlp.vocab.strings['cat']
    print(cat_hash)
    cat_string = nlp.vocab.strings[cat_hash]
    print(cat_string)

    nlp = English()
    nlp_de = German()
    bowie_id = nlp.vocab.strings['Bowie']
    print(bowie_id)
def __init__(self):
    self.stopwords = list(STOP_WORDS)
    self.domain_stopwords = ["Ausschreibung", "Bekanntmachung"]
    self.stopwords.extend(self.domain_stopwords)
    self.parser = German()
    self.punctuations = string.punctuation
    self.domain_stopwords = ["contract", "system", "service", "tender", "company",
                             "notice", "procurement", "work", "include", "support",
                             "approximately", "management", "agreement", "office",
                             "solution", "manage", "product", "design", "program",
                             "project", "supply", "trust", "equipment"]
    self.stopwords = list(STOP_WORDS)
    self.stopwords.extend(self.domain_stopwords)
    self.create_new_model()
def get_tokenizers(self, lang):
    os.environ['TOKENIZERS_PARALLELISM'] = "True"
    if lang == 'de':
        spacy = German()
        bert = "deepset/gbert-base"
    elif lang == 'fr':
        spacy = French()
        bert = "camembert/camembert-base-ccnet"
    elif lang == 'it':
        spacy = Italian()
        bert = "dbmdz/bert-base-italian-cased"
    else:
        raise ValueError(
            f"Please choose one of the following languages: {self.languages}"
        )
    return spacy.tokenizer, AutoTokenizer.from_pretrained(bert)
def get_tokenizer(lang):
    if lang == "zh":
        # nlp = spacy.load("zh_core_web_sm")
        nlp = Chinese()
    elif lang == "en":
        # nlp = spacy.load("en_core_web_sm")
        nlp = English()
    elif lang == "cs":
        nlp = Czech()
    elif lang == "de":
        # nlp = spacy.load("de_core_web_sm")
        nlp = German()
    elif lang == "ru":
        nlp = Russian()
    else:
        raise Exception("Unacceptable language.")
    return nlp
def main():
    # create an English and a German nlp object
    nlp = English()  # spacy.load('en_core_web_sm')
    nlp_de = German()  # nlp = spacy.load("de_core_news_sm")

    # look up a string and a hash in the Vocab
    print('\nShared vocab and String Store -----------------')
    doc = nlp("I love coffee")
    print('hash value:', doc.vocab.strings['coffee'])
    print('string value:', doc.vocab.strings[3197928453018144401])

    # lexemes: entries in the vocabulary
    # a lexeme object is an entry in the vocabulary that
    # contains the context-independent information about a word
    # rem: orth means hash
    print('\nLexemes: entries in the Vocabulary -----------------')
    lexeme = doc.vocab['coffee']
    print('word: ', lexeme.text)
    print('hash: ', lexeme.orth)
    print('alphanumeric?: ', lexeme.is_alpha)

    # practice 1: look up a string in vocab to get the hash
    print('\npractice 1: English nlp obj ----------------------')
    doc = nlp('My favorite guitar is a parlor guitar by Art and Lutherie.')
    guitar_hash = doc.vocab.strings['guitar']
    print('guitar hash:\t', guitar_hash)
    guitar_string = doc.vocab.strings[13533102915073649304]
    print('guitar string:\t', guitar_string)

    # practice 2:
    print('\npractice 2: insert into EN vocab but not DE vocab ------')
    # get the ID for the string 'Jazz'
    jazz_id = nlp.vocab.strings['Jazz']
    print(jazz_id)
    # print(nlp.vocab.strings[jazz_id])  # throws error
    jazz_id = nlp('Jazz')
    print(jazz_id)
    print(jazz_id.vocab.strings[16658944612980789447])
    # look up the ID for 'Jazz' in the vocab
    # print('Jazz in EN vocab: ', nlp.vocab.strings[jazz_id])

    # end program
    print('\nDone.')
def lang_change(language):
    if language == 'en':
        from spacy.lang.en import English
        from spacy.lang.en.stop_words import STOP_WORDS
        parser = English()
        file = r"\config_files\config_spacy_en.yaml"
        configfile_path = os.getcwd() + file
    elif language == 'de':
        from spacy.lang.de import German
        from spacy.lang.de.stop_words import STOP_WORDS
        parser = German()
        file = r"\config_files\config_spacy_de.yaml"
        configfile_path = os.getcwd() + file
    elif language == 'es':
        from spacy.lang.es import Spanish
        from spacy.lang.es.stop_words import STOP_WORDS
        parser = Spanish()
        file = r"\config_files\config_spacy_es.yaml"
        configfile_path = os.getcwd() + file
    elif language == 'pt':
        from spacy.lang.pt import Portuguese
        from spacy.lang.pt.stop_words import STOP_WORDS
        parser = Portuguese()
        file = r"\config_files\config_spacy_pt.yaml"
        configfile_path = os.getcwd() + file
    elif language == 'fr':
        from spacy.lang.fr import French
        from spacy.lang.fr.stop_words import STOP_WORDS
        parser = French()
        file = r"\config_files\config_spacy_fr.yaml"
        configfile_path = os.getcwd() + file
    elif language == 'it':
        from spacy.lang.it import Italian
        from spacy.lang.it.stop_words import STOP_WORDS
        parser = Italian()
        file = r"\config_files\config_spacy_it.yaml"
        configfile_path = os.getcwd() + file
    elif language == 'nl':
        from spacy.lang.nl import Dutch
        from spacy.lang.nl.stop_words import STOP_WORDS
        parser = Dutch()
        file = r"\config_files\config_spacy_nl.yaml"
        configfile_path = os.getcwd() + file
    return parser, STOP_WORDS, configfile_path
def prepare_twitter_data(data_file, type_of_analysis):
    labels = []
    text_fake, text_normal = '', ''
    df = pd.read_csv(data_file, sep='|', encoding='utf-8', keep_default_na=False)
    print('removing duplicates')
    df = utils.remove_duplicates(df)
    print('getting preprocessed train articles')
    idx = 0
    for key, item in enumerate(df['article_text']):
        idx += 1
        if df['is_fake'].values[key] == 1:
            text_fake += get_preprocessed_text(item)
            labels.append('FAKE')
        else:
            text_normal += get_preprocessed_text(item)
            labels.append('NOT_FAKE')
        if idx % 100 == 0:
            print('got {} of {} preprocessed train articles'.format(idx, len(df)))
    print('Finished gathering train text items')

    train = pd.DataFrame()
    train['data'] = df[type_of_analysis]
    train['labels'] = df['is_fake']

    # TOPIC MODELLING
    nlp = German()
    stop_words = get_stop_words('de')
    stop_words.append('foto')
    stop_words.append('⬅')
    for stopword in stop_words:
        lexeme = nlp.vocab[stopword]
        lexeme.is_stop = True
    texts = get_spacy_corpus(train['data'], nlp, logging=True, topic_modelling=True)
    bigram = gensim.models.Phrases(texts)
    texts = [bigram[line] for line in texts]
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    ldamodel = LdaModel(corpus=corpus, num_topics=10, id2word=dictionary)
    print(ldamodel.show_topics())
    return do_create_twitter(train, None)
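# A minimal sketch of the lexeme-based stop-word marking used above (blank German
# pipeline; "foto" is just an illustrative word, as in the snippet):
from spacy.lang.de import German

nlp = German()
nlp.vocab["foto"].is_stop = True
doc = nlp("ein foto von der ausschreibung")
print([(token.text, token.is_stop) for token in doc])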
def create_p3ml_vocab(fdir='', odir=''):
    nlp = spacy.load('de_core_news_sm')
    deTokenizer = German().Defaults.create_tokenizer(nlp)
    i = 0
    for fn in os.listdir(fdir):
        print(i, fn)
        i += 1
        with codecs.open(os.path.join(fdir, fn), 'r', 'utf-8-sig') as fh:
            txt = fh.read()
        # txtLst = list(set(re.sub(r'[^\w\s]',' ', txt).split()))
        txtLst = [str(s) for s in list(deTokenizer(txt))]
        counter = collections.Counter(txtLst)
        ofile = 'VOC_' + fn
        codecs.open(os.path.join(odir, ofile), 'w').close()
        with codecs.open(os.path.join(odir, ofile), 'a+', 'utf-8-sig') as ofh:
            keys = list(counter.keys())
            keys.sort()
            for key in keys:
                if isinstance(key, str):
                    ofh.write(' '.join([key, str(counter[key])]) + '\n')
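# Hedged spaCy v3 variant (assumes spaCy >= 3.0, where Defaults.create_tokenizer()
# was removed): the blank language object already exposes its tokenizer.
from spacy.lang.de import German

nlp = German()
print([t.text for t in nlp.tokenizer("Hallo Welt, das ist ein Test.")])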
def tokenize(document, language, punctutation):
    if language == 'fr':
        nlp = French()
    if language == 'de':
        nlp = German()
    if language == 'en':
        nlp = English()
    if language == 'es':
        nlp = Spanish()
    sentencizer = nlp.create_pipe("sentencizer")
    nlp.add_pipe(sentencizer)
    doc = nlp(document)
    if punctutation:
        sentences = [[str(word) for word in sent if str(word) != '\n']
                     for sent in doc.sents]
    else:
        sentences = [[
            str(word) for word in sent
            if ((str(word) != '\n') and (str(word).isalpha()))
        ] for sent in doc.sents]
    return sentences
def get_nlp(self, language):
    """
    Return the corresponding spaCy language object for the given language code,
    doing the required import on the fly. This is certainly not the standard
    approach, but since this endpoint is deployed to Heroku (space limitation)
    and is only invoked rarely, it is the fastest approach.
    """
    if language == "en":
        from spacy.lang.en import English
        return English()
    elif language == "fr":
        from spacy.lang.fr import French
        return French()
    elif language == "de":
        from spacy.lang.de import German
        return German()
    elif language == "es":
        from spacy.lang.es import Spanish
        return Spanish()
    elif language == "pt":
        from spacy.lang.pt import Portuguese
        return Portuguese()
    else:
        return {"error": "invalid or not supported language entered"}
import json
from spacy.lang.de import German
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher

with open("exercises/de/countries.json") as f:
    COUNTRIES = json.loads(f.read())
with open("exercises/en/capitals.json") as f:
    CAPITALS = json.loads(f.read())

nlp = German()
matcher = PhraseMatcher(nlp.vocab)
matcher.add("COUNTRY", None, *list(nlp.pipe(COUNTRIES)))


def countries_component(doc):
    # Create an entity Span with the label "LOC" for all matches
    matches = matcher(doc)
    doc.ents = [Span(doc, start, end, label="LOC")
                for match_id, start, end in matches]
    return doc


# Add the component to the pipeline
nlp.add_pipe(countries_component)
print(nlp.pipe_names)

# Getter function that looks up the span text in the dictionary of capitals
get_capital = lambda span: CAPITALS.get(span.text)

# Register the Span extension "capital" with the getter function get_capital
Span.set_extension("capital", getter=get_capital)
print(device)

# Reading the English-German sentence pairs from the file
with open("../deu.txt", "r+") as file:
    deu = [x[:-1] for x in file.readlines()]

en = []
de = []
for line in deu:
    en.append(line.split("\t")[0])
    de.append(line.split("\t")[1])

# Setting the number of training sentences we'll use
training_examples = 10000

# We'll be using spaCy's English and German tokenizers
spacy_en = English()
spacy_de = German()

en_words = Counter()
de_words = Counter()
en_inputs = []
de_inputs = []

# Tokenizing the English and German sentences and creating our word banks for both languages
for i in tqdm_notebook(range(training_examples)):
    en_tokens = spacy_en(en[i])
    de_tokens = spacy_de(de[i])
    if len(en_tokens) == 0 or len(de_tokens) == 0:
        continue
    for token in en_tokens:
        en_words.update([token.text.lower()])
    en_inputs.append([token.text.lower() for token in en_tokens] + ['_EOS'])
def NumberComplexRemover(string):
    """
    Removes numbers in complex format, but not if a . follows, since that marks
    the end of a sentence. Run after DateRemover().
    Examples: 15.10 Uhr OR 3,5 bis 4 stunden. OR 100 000 euro. OR 20?000 förderanträge
    OR um 2025/2030 OR abc 18.000. a OR abc. 18.000. a OR abc 18. a OR abc 7.8.14. a
    OR abc 7. 14. 18. a OR abc 1970er. a OR abc 20?()/&!%000. a OR abc 2,9-3,5. a
    OR abc . 18. a OR abc . 7.8.14. a OR abc . 7. 14. 18. a OR abc 1790er
    OR abc . 20?()/&!%000 a OR abc . 2,9-3,5 a OR abc 45, 59 a OR abc . 14 z
    OR abc 1. e OR abc v. 2 a
    """
    string = re.sub(r'(?<!\w)(\d+)([\W\s]+|)|([\W\s]+)\d+', ' ', string)
    # TODO: check later
    # Alternative: ((\d+)(.|\s{1,3}|)\d+)(.|\s)(?! er)
    return string


nlp = German()
sbd = nlp.create_pipe('sentencizer')
nlp.add_pipe(sbd)


def Sentencizer(string, verbose=False):
    """
    Requires importing the language from spaCy and loading sentence boundary detection:
        from spacy.lang.de import German
        nlp = German()
        sbd = nlp.create_pipe('sentencizer')
        nlp.add_pipe(sbd)
    Some single strings nlp() cannot process (rare, e.g. 'nan'); exclude those
    via except/pass, solve later.
    """
    sents_list = []
import spacy  # python -m spacy download de_core_news_sm --user
from spacy.lang.de import German

nlp = spacy.load('de_core_news_sm')
tokenizer = German().Defaults.create_tokenizer(nlp)

doc = nlp(
    'Hallo Welt, ich finde es hier so spannend. der die das. dwwomdowmd 404')
for token in doc:
    if token.text in nlp.vocab:
        print('Im Wörterbuch')
    print(token, token.lemma, token.lemma_)

# for x in tokenizer('Hello world!'):
#     print(x)
import json from spacy.lang.de import German with open("exercises/de/countries.json", encoding="utf8") as f: COUNTRIES = json.loads(f.read()) nlp = German() doc = nlp("Tschechien könnte der Slowakei dabei helfen, ihren Luftraum zu schützen") # Importiere den PhraseMatcher und initialisiere ihn from spacy.matcher import PhraseMatcher matcher = PhraseMatcher(nlp.vocab) # Erstelle Pattern-Doc-Objekte und füge sie zum Matcher hinzu # Dies ist die schnellere Version von: [nlp(country) for country in COUNTRIES] patterns = list(nlp.pipe(COUNTRIES)) matcher.add("COUNTRY", None, *patterns) # Wende den Matcher auf das Test-Dokument an und drucke das Resultat matches = matcher(doc) print([doc[start:end] for match_id, start, end in matches])
# Import the German class and create the nlp object
from spacy.lang.de import German

nlp = German()

# Process the text
doc = nlp("Ich mag niedliche Katzen und Faultiere.")

# Select the first token
erster_token = doc[0]

# Print the text of the first token
print(erster_token.text)
def getSentences(text):
    nlp = German()
    nlp.add_pipe(nlp.create_pipe('sentencizer'))
    document = nlp(text)
    return [sent.string.strip() for sent in document.sents]
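# Hedged spaCy v3 variant of getSentences (assumes spaCy >= 3.0, where add_pipe
# takes the registered component name and Span.string was replaced by Span.text):
from spacy.lang.de import German

def get_sentences_v3(text):
    nlp = German()
    nlp.add_pipe("sentencizer")
    return [sent.text.strip() for sent in nlp(text).sents]

print(get_sentences_v3("Das ist ein Satz. Das ist noch einer."))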