def main(argv):
    """Run the stored model(s) for the detected language on a test set.

    ``argv`` is parsed with the parser returned by ``get_parser()``.  The
    language of ``options.input`` is detected; for ``en``, ``es`` and ``nl``
    the data is read, split, POS-tagged/lemmatised and handed to
    ``test_model`` together with the loaded model(s).  Dutch loads a single
    (gender) model, English and Spanish load a gender and an age model.

    Exits through ``parser.error`` when the input or output option is
    missing, and with status 1 when the language is not supported.
    """
    parser = get_parser()
    preprocessor = PreprocessingClass()
    (options, args) = parser.parse_args(argv)

    if not (options.input and options.output):
        parser.error("Required arguments not provided")
        return

    lang = detect_language(options.input)
    lang_code = lang.lower()
    if lang_code not in ('en', 'es', 'nl'):
        sys.stderr.write('Language other than en, es, nl\n')
        sys.exit(1)

    # Same bytes as the former ``print`` statements (blank line, then label).
    sys.stdout.write("\nCurrent Language:  %s\n" % (lang,))

    final_model_path = change_path_to_windows_style(options.model)
    final_output_path = change_path_to_windows_style(options.output)

    # Only the tagger of the detected language is needed, so only that one
    # is started (previously all three were created on every run).
    tagger = treetaggerwrapper.TreeTagger(TAGLANG=lang_code,
                                          TAGDIR="C:\\TreeTagger")

    main_classifier = FinalClassicationClass(lang_code, options.input,
                                             final_output_path, None, 2)
    dataset_input = preprocessor.read_all_files(options.input, "Test Set",
                                                lang_code)
    X, y_author = preprocessor.split_lists_dev(dataset_input)
    X_list_pos_tags, X_list_lemma = preprocessor.stem_and_pos(X, tagger)
    pipelined_dictionary = preprocessor.create_pipeline_dict(
        X, X_list_lemma, X_list_pos_tags, y_author)
    main_classifier.dataset_statistics_dev(X, y_author)

    # Load models: Dutch has a gender model only.
    loaded = load_model(final_model_path, lang_code)
    if lang_code == "nl":
        main_classifier.test_model(loaded, pipelined_dictionary, lang,
                                   y_author, final_output_path)
    else:
        gender_model, age_model = loaded
        main_classifier.test_model(gender_model, pipelined_dictionary, lang,
                                   y_author, final_output_path, age_model)
def __init__(self, index_path="/INDEX"):
    """Set up empty indexes, POS filters, taggers and the storage paths.

    index_path: folder used as ``save_folder``; the documents folder and
        the two JSON file names are derived from it.
    """
    # In-memory indexes, empty until documents are added.
    self.index = {}
    self.index_document = {}

    # POS-tag patterns used to recognise plain words per language.
    self.plain_word_fr = re.compile("ABR|ADJ|NAM|NOM|VER")
    self.plain_word_en = re.compile("JJ|NP|NN|VB")

    # One tagger per supported language.
    self.fr_tagger = treetaggerwrapper.TreeTagger(TAGLANG="fr")
    self.en_tagger = treetaggerwrapper.TreeTagger(TAGLANG="en")

    # Storage locations below the index folder.
    self.save_folder = index_path
    self.keep_path = "/".join((self.save_folder, "documentsIndex"))
    self.index_name = "/".join((self.save_folder, "index.json"))
    self.index_document_name = "/".join(
        (self.save_folder, "index_document.json"))
def tag_poslemma(dirName):
    """Tag the ``.txt`` witnesses of ``data/<dirName>`` and write two CSVs.

    ``dictionaries/taggedAll_<dirName>.csv`` receives one row per token,
    built as ``token<TAB>pos_lemma``: token and POS come from the
    ``froBfm`` tagger, the lemma from the cleaned ``stein`` tagger output.
    ``dictionaries/taggedDistinct_<dirName>.csv`` then receives a header
    row followed by the first occurrence of every (token, pos_lemma) pair.

    Both taggers are created once and every file is closed before it is
    re-read, so the second pass no longer depends on the first writer
    being garbage-collected.
    """
    tagged_all_path = 'dictionaries/taggedAll_' + dirName + '.csv'
    tagged_distinct_path = 'dictionaries/taggedDistinct_' + dirName + '.csv'
    witness_dir = 'data/' + dirName

    # Clean-up rules for the Stein output (it carries extra info and
    # symbols).  The '|' alternatives are kept on purpose so the different
    # output possibilities are saved.  The '�' rule works around an
    # encoding problem of unknown origin; it is a workaround, not a fix.
    patterns = [(re.compile(pattern), replacement)
                for pattern, replacement in (('_.*', ''),
                                             (r'\d.*', ''),
                                             ('�', 'ö'),
                                             ('<nolem>', 'UNKNOWN'))]

    taggerFro = treetaggerwrapper.TreeTagger(TAGLANG='froBfm')
    taggerStein = treetaggerwrapper.TreeTagger(TAGLANG='stein')

    # TAGGED-ALL
    with open(tagged_all_path, 'w') as all_file:
        writer = csv.writer(all_file)
        for witness in os.listdir(witness_dir):
            if not witness.endswith(".txt"):
                continue
            with open(witness_dir + '/' + witness) as wit:
                witText = wit.read()

            tagsFro = taggerFro.tag_text(witText)
            tagsSteinStr = '\n'.join(taggerStein.tag_text(witText))
            for pattern, replacement in patterns:
                tagsSteinStr = pattern.sub(replacement, tagsSteinStr)
            tagsStein = tagsSteinStr.split('\n')

            for itemFro, itemStein in zip(tagsFro, tagsStein):
                fro_fields = itemFro.split('\t')
                token, pos = fro_fields[0], fro_fields[1]
                lemma = itemStein.split('\t')[2]
                writer.writerow([token + '\t' + pos + '_' + lemma])

    # TAGGED-DISTINCT
    with open(tagged_all_path, 'r') as all_file, \
            open(tagged_distinct_path, 'w') as distinct_file:
        reader = csv.reader(all_file, delimiter='\t')
        writer = csv.writer(distinct_file, delimiter=',')
        writer.writerow(['Original', 'Normalised'])
        entries = set()
        for row in reader:
            key = (row[0], row[1])
            if key not in entries:
                writer.writerow(row)
                entries.add(key)
def tokenize_and_lemmatize_tweets(listTweets):
    """Tokenize and lemmatize a list of tweets in place.

    Every tweet dict gets a ``tokenArray`` and a ``lemmaArray`` computed
    from its lower-cased ``text`` with mentions removed.  Items shorter
    than two characters or listed in ``french_stop_words`` are dropped.
    Returns the same list.
    """
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='fr', TAGDIR=LOCALTAGDIR)

    def is_kept(value):
        return len(value) >= 2 and value not in french_stop_words

    for tweet in listTweets:
        cleaned = mention_regex.sub("", tweet["text"]).lower()
        parsed = treetaggerwrapper.make_tags(tagger.tag_text(cleaned))
        tokens = []
        lemma = []
        for tag in parsed:
            if hasattr(tag, 'word'):
                # Regular tag: filter on the lemma, keep word and lemma.
                if is_kept(tag.lemma):
                    tokens.append(tag.word)
                    lemma.append(tag.lemma)
                continue

            # Entry without a word: use its raw content.
            token = tag.what
            if not is_kept(token):
                continue
            if token.startswith("<repurl") or token.startswith("<repdns"):
                # Keep only the quoted part of the URL/DNS placeholder.
                # NOTE(review): such placeholders add a token but no lemma;
                # nesting reconstructed from a flattened line - confirm.
                token = token[token.find('"') + 1:token.rfind('"')]
            else:
                lemma.append(token)
            tokens.append(token)

        tweet["tokenArray"] = tokens
        tweet["lemmaArray"] = lemma
    return listTweets
def postag_directory(input_directory, output_directory):
    """POS-tag every ``*.txt`` document of a directory.

    Args:
        input_directory: directory whose ``*.txt`` files (read as UTF-8,
            in sorted name order) are tagged with the English tagger.
        output_directory: existing directory that receives one file per
            input document, named ``list<i>.txt`` where ``i`` is the
            position of the document in the sorted list; each line holds
            one tagger output item.

    Output is written as UTF-8, matching how the input is read, so
    non-ASCII tokens no longer depend on the platform default encoding.
    """
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='en')

    # Tag all documents first ...
    all_tags = []
    for filename in sorted(glob.glob(os.path.join(input_directory, '*.txt'))):
        with codecs.open(filename, encoding='utf-8') as f:
            all_tags.append(tagger.tag_text(f.read()))

    # ... then write one result file per document.
    for i, a_list in enumerate(all_tags):
        path = os.path.join(output_directory, "list%d.txt" % i)
        with codecs.open(path, "w", encoding='utf-8') as f:
            for item in a_list:
                f.write(item + "\n")
def postag_pandas(input_file, output_file):
    """POS-tag the ``content`` column of a pipe-separated file.

    The first line of ``input_file`` supplies the column names; every line
    is split at its first ``|`` only.  A ``POS-tagged_content`` column is
    added (a list of tuples, one per tagger output line split on tabs),
    leading double quotes are stripped from ``content``, the column names
    are printed and the frame is written to ``output_file`` with ``|`` as
    separator.  Returns the result of ``DataFrame.to_csv``.
    """
    # Reading in the file
    with open(input_file) as f:
        all_lines = [line.strip().split('|', 1) for line in f]
    df = pd.DataFrame(all_lines[1:], columns=all_lines[0])

    tagger = treetaggerwrapper.TreeTagger(TAGLANG='en')

    def postag_string(s):
        '''Return the tagger output for string s (decoded from UTF-8).'''
        if isinstance(s, basestring):
            s = s.decode('UTF-8')
        return tagger.tag_text(s)

    def fix_format(x):
        '''Turn each tab-separated item of x into a tuple of encoded fields.'''
        return [tuple(i.encode().split('\t')) for i in x]

    tagged = df['content'].apply(postag_string)
    df['POS-tagged_content'] = tagged
    df['POS-tagged_content'] = df['POS-tagged_content'].apply(fix_format)
    df['content'] = df['content'].map(lambda x: x.lstrip('"""'))

    print(list(df.columns.values))
    return df.to_csv(output_file, sep='|', index=False)
def __init__(self, input, output):
    """Create the English tagger and load the stop-word list.

    input: path prefix; the stop words are read from ``input + 'news_stops'``
        (whitespace-separated).
    output: stored as ``self.outp``.
    """
    self.tagger = treetaggerwrapper.TreeTagger(TAGLANG='en')
    self.inp = input
    self.outp = output

    stops_path = self.inp + 'news_stops'
    with open(stops_path) as f:
        raw_stops = f.read()
    self.stopwords = raw_stops.strip().split()

    # POS tags listed for exclusion, kept in their original order.
    self.excltags = (
        'CC CD DT EX IN LS MD PDT POS RP SYM TO WDT WP WP$ WRB '
        'VB VBD VBG VBN VBZ VBP '
        'VD VDD VDG VDN VDZ VDP '
        'VH VHD VHG VHN VHZ VHP '
        'RB RBR RBS JJ JJR JJS'
    ).split()
def Tagger(dataset):
    """Write the words of ``dataset`` that carry a selected POS tag.

    Every line of ``dataset`` has its ``http://`` URLs replaced by
    ``website`` and all characters other than letters, digits, space,
    apostrophe and hyphen replaced by a space; the line is then tagged.
    For each input line, the words whose tag is in the selected list are
    written, space-separated, on one line of ``DataTAG.txt``.

    Both files are closed when the function returns, including on error.
    """
    # Tags taken into consideration.
    ListeTags = frozenset([
        "JJ", "JJR", "JJS", "VV", "VVD", "VVG", "VVN", "RBR", "RBS", "UH",
        "RB"
    ])

    # Wrapper configuration.
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='en',
                                          TAGDIR='./TreeTagger',
                                          TAGINENC='utf-8',
                                          TAGOUTENC='utf-8')

    # Compiled once instead of once per line.
    url_pattern = re.compile(r'http:\/\/[0-9a-zA-Z-_\.]*(\.[a-z]{0,9}\/?)?')
    special_chars = re.compile("[^a-zA-Z0-9 '-]")

    with open(dataset, 'r') as D, open("DataTAG.txt", 'w') as Tag:
        for line in D:
            line = url_pattern.sub("website ", line)  # remove URLs
            line = special_chars.sub(" ", line)  # remove special characters
            tags = tagger.TagText(line.decode(encoding="utf-8"))
            for words in tags:
                w = words.split("\t")
                # Items without a tag column are skipped instead of
                # raising IndexError.
                if len(w) > 1 and w[1] in ListeTags:
                    Tag.write(w[0] + " ")
            Tag.write("\n")
def _set_treetagger(self, language):
    """Select TreeTagger as the morphological backend for ``language``.

    Stores the tagger in ``self._tagger`` and points ``self.morphy`` at
    ``self._treetagger_morphy``.

    Raises:
        ImportError: when creating the tagger raises ``TreeTaggerError``;
            the message now carries the language and the original error
            text instead of being empty.
    """
    import treetaggerwrapper as ttw
    try:
        self._tagger = ttw.TreeTagger(TAGLANG=language)
    except ttw.TreeTaggerError as err:
        raise ImportError(
            "TreeTagger is not usable for language %r: %s" % (language, err))
    self.morphy = self._treetagger_morphy
def _useTagger(self, langID:str):
    """Lend out a tagger for ``langID`` and take it back afterwards.

    Generator: yields an idle cached tagger when the language cache has
    one, otherwise a newly created one (firing ``__onTaggerCreated``).
    After the caller is done the tagger is appended to the cache's idle
    list and the used-instance counter is decremented.
    """
    # NOTE(review): presumably wrapped as a context manager - the decorator
    # is not visible here.  Indentation was lost in this view, so how far
    # the ``__mainLock`` section extends is a reconstruction - confirm.
    assert isinstance(langID, str)

    # Find or create the per-language cache under the pool-wide lock.
    with self.__mainLock:
        langIDCache = self.__unused.get(langID, None)
        if langIDCache is None:
            langIDCache = PoolOfThreadedTreeTaggers._LangSpecificCache(langID)
            self.__unused[langID] = langIDCache
        langIDCache.touch()

    # NOTE(review): as laid out here the emptiness test runs before
    # ``langLock`` is taken - verify nothing else can pop in between.
    if langIDCache.idleInstances:
        with langIDCache.langLock:
            # Reuse the most recently returned idle instance.
            tagger = langIDCache.idleInstances[-1]
            del langIDCache.idleInstances[-1]
            langIDCache.countUsedInstances += 1
    else:
        # No idle instance: create a new tagger outside the language lock.
        tagger = ttpw.TreeTagger(
            TAGLANG=langID,
            TAGOPT="-prob -threshold 0.7 -token -lemma -sgml -quiet",
            TAGDIR=self.__treeTaggerInstallationPath)
        self.__onTaggerCreated.fire(self, langID)
        with langIDCache.langLock:
            langIDCache.countUsedInstances += 1

    try:
        yield tagger
    finally:
        # Always hand the instance back, even if the caller raised.
        with langIDCache.langLock:
            langIDCache.countUsedInstances -= 1
            langIDCache.idleInstances.append(tagger)
def get_level(file_name):
    """Count the distinct lemma parts of ``<file_name>.txt`` per word level.

    Lemmas starting with a word character are lower-cased and split on
    ``-``; each distinct part is looked up with ``word_level.get_level``.
    Prints and returns a 13-slot list of counts indexed by level, and
    prints the parts whose level is 0.
    """
    import re

    tagger = ttw.TreeTagger(TAGLANG='en', TAGDIR=os.getcwd())
    parsed = ttw.make_tags(tagger.tag_file(file_name + '.txt'))

    words = []
    for tag in parsed:
        if re.search(r'^\w', tag.lemma):
            words.extend(tag.lemma.lower().split('-'))
    words = list(set(words))

    import word_level

    sentence_level = [0] * 13
    unknown_words = []
    for word in words:
        level = word_level.get_level(word)
        sentence_level[level] += 1
        if level == 0:
            unknown_words.append(word)

    print(sentence_level)
    print(unknown_words)
    return sentence_level
def __init__(self, text):
    """Split ``text`` into sentences and tag each of them.

    Sets ``text``, ``sentences`` (from ``sent_tokenize``), ``tagged`` (the
    tagger output per sentence) and ``parsed`` (each tagged sentence joined
    with spaces, tabs replaced by ``_``).
    """
    tagger = treetaggerwrapper.TreeTagger(
        TAGLANG='en', TAGDIR='/home/lr/hayashi/ra_web_app')

    # The text is kept in its original case: the original note says that
    # lower-casing makes items get missed.
    self.text = text
    self.sentences = sent_tokenize(self.text)

    tagged = []
    for sentence in self.sentences:
        tagged.append(tagger.TagText(sentence))
    self.tagged = tagged

    parsed = []
    for tagged_sentence in self.tagged:
        parsed.append(' '.join(tagged_sentence).replace('\t', '_'))
    self.parsed = parsed
def build_corpus(filename):
    """Build an ``ihmm.corpus`` from a UTF-8 text file, one sentence per line.

    Each line is stripped and tagged; lines with an empty tagger result are
    skipped.  For every tagger item with exactly three tab-separated
    fields the third field is used as the word (and ``collapse_true_tag``
    is called on the tag), otherwise the first field is used.  The words of
    each sentence are added to the corpus, which is returned.

    The file is read once; the former extra pass only filled a list that
    was never used.
    """
    corpus = ihmm.corpus()
    tagger = treetaggerwrapper.TreeTagger(TAGLANG="en")

    with codecs.open(filename, "r", "utf-8") as f:
        for i, sentence_str in enumerate(f):
            sentence_str = sentence_str.strip()
            if (i + 1) % 10 == 0:
                printr("データを準備しています ... {}".format(i + 1))
            result = tagger.tag_text(sentence_str)
            if len(result) == 0:
                continue

            # The training data is built while tagging.  English is usually
            # space-delimited, so this may look unnecessary, but TreeTagger
            # splits e.g. "$600" into "$ 600".  During evaluation in
            # plot_en.py the word count of a sentence could then differ
            # between space splitting and TreeTagger, so all word
            # segmentation is taken from TreeTagger.
            words = []
            for metadata in result:
                metadata = metadata.split("\t")
                if len(metadata) == 3:
                    word, true_tag, lowercase = metadata
                    # Result unused here; the call is kept in case it has
                    # side effects - TODO confirm.
                    true_tag = collapse_true_tag(true_tag, lowercase)
                else:
                    lowercase = metadata[0]
                words.append(lowercase)

            # Add the data.
            corpus.add_words(words)
    return corpus
# NOTE(review): indentation was lost in this view.  The nesting below (a
# helper nested in ``postTreatment``, followed by module-level script
# statements) is a reconstruction - confirm against the file.
def postTreatment(TW):
    # Function meant to turn numbers written out in words into digits.
    # Not implemented: merging tokens, e.g. "pomme de terre" into a single
    # token.
    def preTreatment(TW):
        for w in TW:
            # Only numerals whose lemma is not the cardinal placeholder.
            if w.posTag == "NUM" and w.lemma != "@card@":
                try:
                    w.word = numbers[w.word]
                except:
                    # Unknown number word: ask the user to type it in
                    # digits and remember the answer in ``numbers``.
                    ans = input("Je ne comprends pas bien ce nombre : " + w.word + ". Pourriez-vous l'écrire en chiffres ?\n")
                    taggedAns = formatTTG(tagger.TagText(ans))
                    for wAns in taggedAns:
                        if wAns.posTag == "NUM" and wAns.lemma == "@card@":
                            numbers[w.word] = int(wAns.word)
                            w.word = numbers[w.word]
    return TW


# French tagger reading and writing UTF-8.
tagger = treetaggerwrapper.TreeTagger(TAGLANG='fr', TAGDIR='./Ressources', TAGINENC='utf-8', TAGOUTENC='utf-8')
# Ask for the request, tag it and post-process the tagged words.
demande = input("Bonjour ! Que puis-je pour vous aujourd'hui ?\n")
taggedWords = formatTTG(tagger.TagText(demande))
taggedWords = postTreatment(taggedWords)
# Build and print a request only when an eating intention is detected.
if(eatingIntention(taggedWords)):
    r = Request()
    r.fillRequest(taggedWords)
    r.printRequest()
def tok(tex):
    """Tag the text and return it in an easy-to-read column layout.

    Input: text.
    Output: tagged text; every field of a tag is followed by a tab and
    every tag ends with a newline.  The parsed tags are also pretty-printed
    and printed as a list.
    """
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='fr')
    tags2 = treetaggerwrapper.make_tags(tagger.tag_text(tex))
    pprint.pprint(tags2)

    tag_list = list(tags2)
    print(tag_list)

    pieces = []
    for element in tag_list:
        for field in element:
            pieces.extend((field, "\t"))
        pieces.append("\n")
    return "{}".format("".join(pieces))
def pos_tag(inp: str, out, tagdir: str = '/usr/local/tree-tagger'):
    """Generate a POS representation of the data and pickle it into ``out``.

    Texts, genders and ages come from ``read_data(inp)``.  Each text is
    replaced by the space-joined tags of the tagger items that split into
    exactly two whitespace-separated parts.  Progress is logged at the
    document indexes that correspond to whole percentages.  The tuple
    ``(pos_texts, genders, ages)`` is pickled to ``out``.
    """
    texts, genders, ages = read_data(inp)
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='en',
                                          TAGDIR=tagdir,
                                          TAGOPT='-token -sgml')
    logging.info('POS tagging data')

    # Document index -> percentage label; later percentages overwrite
    # earlier ones that map to the same index.
    total = len(texts)
    d_infothresholds = {}
    for percent in range(0, 101):
        d_infothresholds[int((percent / 100.0 * total))] = "%i%%" % (percent)

    pos_texts = []
    for index, text in enumerate(texts):
        tags = []
        for el in tagger.tag_text(text):
            if len(el.split()) == 2:
                tags.append(el.split('\t')[1])
        pos_texts.append(' '.join(tags))

        label = d_infothresholds.get(index)
        if label is not None:
            logging.info('{} of documents processed'.format(label))

    logging.info('Pickling results to {}'.format(out.name))
    pickle.dump((pos_texts, genders, ages), out)
def process_files(source_path, target_path, languages=('en',),
                  exception_file=None):
    """Process the gzipped text files of every language folder.

    Args:
        source_path: root folder; files matching ``<lang>/**/*.gz`` below
            it are processed.
        target_path: root folder for the results; the relative path of
            each source file is kept and parent folders are created.
        languages: iterable of language codes (default: English only).
        exception_file: optional JSON file; its entry for the language is
            passed to ``process``.  When that entry is a dict, the values
            of all its keys are concatenated into one list.

    Lines starting with a newline, tab, space or ``<`` are skipped; every
    other line is replaced by the space-joined result of ``process``.
    """
    source_path = Path(source_path)
    target_path = Path(target_path)
    skipped_prefixes = ('\n', '\t', ' ', '<')

    for l in languages:
        tagger = treetaggerwrapper.TreeTagger(TAGLANG=l)
        paths = list(source_path.glob(str(Path(l) / '**/*.gz')))

        exceptions = None
        if exception_file:
            with open(exception_file) as f:
                exceptions = json.load(f)[l]
            if isinstance(exceptions, dict):
                # Flatten {group: [terms]} into a single list of terms.
                exceptions = [term
                              for terms in exceptions.values()
                              for term in terms]

        for path in tqdm(paths, desc=l, total=len(paths)):
            target_file = target_path / path.relative_to(source_path)
            os.makedirs(target_file.parent, exist_ok=True)
            with gzip.open(path, 'rt') as source, \
                    gzip.open(target_file, 'wt') as target:
                for line in source:
                    if line.startswith(skipped_prefixes):
                        continue
                    p = ' '.join(process(line, l, tagger, exceptions))
                    target.write(p + '\n')
def lemma(inpath, outpath, charFilter):
    '''
    Lemmatise the ``.txt`` files of folder ``inpath``.

    For each file the lemmas of the German tagger output are joined with
    single spaces (each lemma followed by one space), passed through
    ``replace(text, charFilter + ' ')`` and written to
    ``outpath + <2nd>_<3rd>.txt``, where 2nd/3rd are the underscore
    separated parts of the input file name.  An existing output file is
    overwritten.

    The tagger is created once, all files are closed after use, and only
    entries without a ``lemma`` attribute are skipped (previously any
    exception was silently swallowed).
    '''
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='de')

    for text in os.listdir(inpath):
        if not text.endswith('.txt'):
            continue

        with open(inpath + '/' + text, 'r') as source:
            content = source.read()

        tags2 = treetaggerwrapper.make_tags(tagger.tag_text(content))
        print("text", text)

        result = ''.join(tag.lemma + ' '
                         for tag in tags2 if hasattr(tag, 'lemma'))

        name_parts = text.split('_')
        target_path = outpath + name_parts[1] + '_' + name_parts[2] + '.txt'
        # 'w' both creates and truncates, replacing the former
        # truncate-then-append sequence.
        with open(target_path, 'w') as txtFile:
            txtFile.write(replace(result, charFilter + ' '))
def taggerTexte(texte):
    """Normalise the text and return its relevant lemmas.

    Args:
        texte: text to analyse; typographic apostrophes are replaced by
            ``'`` before language detection and tagging.

    Returns:
        Lower-cased lemmas of the tags whose POS contains one of the
        French markers (VER, NOM, ABR, ADJ) or English markers (VV, NN,
        NP, JJ, VH, VB, MD).  Entries without POS/lemma and entries whose
        POS or lemma contains ``@`` are ignored.

    Raises:
        ValueError: when the detected language is neither ``fr`` nor
            ``en`` (previously an unbound-variable error).
    """
    tex = texte.replace("’", "'")

    # Detect once: a second call is redundant and need not agree with the
    # first one.
    langdet = detect(tex)
    if langdet not in ("fr", "en"):
        raise ValueError("unsupported language: %r" % (langdet,))

    tagger = treetaggerwrapper.TreeTagger(TAGLANG=langdet)
    tags2 = treetaggerwrapper.make_tags(tagger.tag_text(tex))

    markers = ("VER", "NOM", "ABR", "ADJ",
               "VV", "NN", "NP", "JJ", "VH", "VB", "MD")
    lemmes = []
    for tag in tags2:
        # Entries with fewer than three fields carry no POS/lemma pair.
        if len(tag) < 3:
            continue
        pos, lemma = tag[1], tag[2]
        # Filtering while building the result replaces the former deletion
        # from a list during iteration, which skipped the element that
        # followed each removed one.
        if "@" in pos or "@" in lemma:
            continue
        if any(marker in pos for marker in markers):
            lemmes.append(lemma.lower())
    return lemmes
def fit_transform(self):
    """Compute linguistic and sentiment features for ``self.df``.

    Works on a copy of ``self.df`` (the original code wrote to a name
    ``df`` that is never bound in this method) and returns it with these
    added columns: ``tags``, ``lemma``, ``text_lemma``, ``pos``,
    ``number_verbs``, ``number_proper_nouns``, ``number_imperative_verb``,
    ``has_intensifier`` and ``avg_polarity``.
    """
    # NOTE(review): assumes no module-level ``df`` was intended - confirm.
    df = self.df.copy()

    # Parsing (lemmatisation and pos-tagging)
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='fr')
    df['tags'] = df[self.text_column].apply(
        lambda x: treetaggerwrapper.make_tags(tagger.tag_text(x)))
    df['lemma'] = df['tags'].apply(lambda x: [
        t.lemma.lower() if isinstance(t, treetaggerwrapper.Tag) else ''
        for t in x
    ])
    df['text_lemma'] = df['lemma'].apply(" ".join)
    df['pos'] = df['tags'].apply(lambda x: [
        t.pos if isinstance(t, treetaggerwrapper.Tag) else '' for t in x
    ])

    # surface based features
    df['number_verbs'] = df['pos'].apply(self.number_verbs)
    df['number_proper_nouns'] = df['pos'].apply(self.number_proper_nouns)
    df['number_imperative_verb'] = df['pos'].apply(
        self.number_imperative_verb)

    # sentiment features
    lex_dict = self.lex_df.to_dict('index')

    def matching_entries(lemmas):
        # Explicit ``== True`` kept: the return type of has_expression is
        # not visible here.  Falsy entries are dropped, as before.
        return [
            entry for entry in lex_dict.values()
            if self.has_expression(exp=entry['lemma'],
                                   string_ls=lemmas,
                                   group=entry['group']) == True and entry
        ]

    list_lex_exp = df['lemma'].apply(matching_entries)
    df['has_intensifier'] = self.has_intensifier(list_lex_exp)
    df['avg_polarity'] = self.avg_polarity(list_lex_exp)
    return df
def analyze_ft_others(outPath, name):
    """
    Calculate readability metrics for the fulltexts of one journal.

    :param outPath: Raw fulltext directory (used as a string prefix)
    :param name: Name of journal to be processed
    :return: None; ``readabilityFunctions.analyze`` is called with
        ``outPath/pmid/<name>.json`` as input path and
        ``outPath/metrics/<name>_metrics.json`` as save path
    """
    from functions import readabilityFunctions as rf
    import treetaggerwrapper

    print("Processing fulltexts of journal " + name)

    source_json = outPath + 'pmid/' + name + '.json'
    metrics_json = outPath + 'metrics/' + name + '_metrics.json'
    english_tagger = treetaggerwrapper.TreeTagger(TAGLANG='en')
    wanted_columns = {
        'year', 'pmid', 'doi', 'strippedText', 'wordLength', 'wordCount',
        'sentenceCount', 'sylCount', 'flesch', 'NDC', 'PercDiffWord',
        'DiffWord_lst'
    }

    rf.analyze(path=source_json,
               spath=metrics_json,
               tagger=english_tagger,
               textType='body',
               columnList=wanted_columns)
def __init__(self, data):
    """Lemmatise, sentence-split and featurise the elements of ``data``.

    For every element (a deep copy is modified, not the input) the text is
    replaced by its space-joined lemmas and split into sentences; the
    tokens and labels returned by ``word2features`` are collected in
    ``processed_data`` and ``labeling``.

    Raises:
        NormalizeError: when, after all sentences of an element were
            processed, one of its 'Name', 'Surname' or 'Location' entries
            is not an empty list.
    """
    self.processed_data = []
    self.labeling = []
    self.tagger = treetaggerwrapper.TreeTagger(TAGLANG='ru')
    entity_keys = ('Name', 'Surname', 'Location')

    # Elements 890..919 are left out: the original note says it hangs there.
    selected = data[:890] + data[920:]
    for el in tqdm(selected):
        elem = deepcopy(el)
        elem['text'] = ' '.join(lemmatize_words(elem['text']))

        for sent in sent_tokenize(elem['text']):
            tokens, labels = self.word2features(sent, elem)
            self.processed_data.append(tokens)
            self.labeling.append(labels)

        # NOTE(review): placement of this check (per element, after its
        # sentences) is reconstructed from a flattened line - confirm.
        for ne in entity_keys:
            if elem[ne] != []:
                raise NormalizeError((elem[ne], elem['text'],
                                      self.pos_tagging(elem['text'])))
def get_documents(docs, stopwords):
    """Extract the documents of the corpus.

    :param docs: sequence of (source, datetime, text) entries
    :param stopwords: not used by this function
    :return: list of dicts with keys ``idx``, ``source``, ``time``,
        ``raw`` and ``tags`` (only ``Tag`` entries of the tagger output)
    """
    # Drop short lines of fewer than 4 words.
    corpus = []
    for doc in docs:
        if len(doc[2].split()) > 3:
            corpus.append((doc[0], doc[1], doc[2]))

    tagger = tagr.TreeTagger(
        TAGLANG='fr',
        TAGDIR='c:/Applications/TreeTagger',
        TAGPARFILE='C:/Applications/TreeTagger/lib/french-utf8.par')

    documents = list()
    idx = 1
    start_time = time.time()
    for source, datetime, raw in corpus:
        parsed = tagr.make_tags(tagger.tag_text(clean_text(raw)))
        tags = [tag for tag in parsed if type(tag) is tagr.Tag]
        # Each element of the result is a dictionary.
        documents.append({
            'idx': idx,
            'source': source,
            'time': datetime,
            'raw': raw,
            'tags': tags
        })
        # Reports progress and returns the next index.
        idx = progress_per(idx, len(corpus), start_time)
    print()
    return documents
def __init__(self):
    """Initialise all language-information attributes and the German tagger."""
    # Raw and filtered text plus the name of the current drama.
    self._plainText = ""
    self._filteredText = ""
    self._currentDramaName = ""

    # Token-level information.
    self._tokens = []
    self._tokensAndPOS = []
    self._tokensWithoutStopwords = []

    # Lemma-level information.
    self._lemmas = []
    self._lemmasWithoutStopwords = []
    self._lemmasAndPOS = []
    self._lemmasWithLanguageInfo = []
    self._lemmaAndPOSDict = {}
    self._lemmasAndPOSAndTokensDict = {}

    # Stop words and the names of the available stop-word lists.
    self._stopwords = []
    self._stopwords_lemmatized = []
    self._stopwordLists = [
        "standardList", "enhancedList", "enhancedFilteredList"
    ]

    self._tagger = treetaggerwrapper.TreeTagger(TAGLANG='de')
def get_tagger(language: str):
    """Create a TreeTagger for the given language.

    :param language: language code e.g. de,nl
    :return: tagger object
    """
    return treetaggerwrapper.TreeTagger(TAGLANG=language)
def build_tree_tagger(text, source_file, output_path):
    """Tag ``text`` with the French tagger and index the result by word.

    Also tags ``source_file`` into ``output_path / 'tagger_result.txt'``
    (``output_path`` is created with parents when it does not exist).

    Returns a pair ``(tag_dict, tags2)``: ``tag_dict`` maps the
    ASCII-folded word (NFD-normalised, non-ASCII bytes dropped) of every
    entry that has a ``pos`` to its ``pos`` and ``lemma``; ``tags2`` is the
    full list returned by ``make_tags``.
    """
    # Build a TreeTagger wrapper and tag the text.
    tagger = treetaggerwrapper.TreeTagger(TAGDIR=dir_tree_tagger,
                                          TAGLANG="fr")
    raw_tags = tagger.tag_text(text)

    if not output_path.exists():
        output_path.mkdir(parents=True)
    result_file = output_path / 'tagger_result.txt'
    tagger.tag_file_to(str(source_file), str(result_file))

    tags2 = treetaggerwrapper.make_tags(raw_tags)
    tag_dict = {}
    for tag in tags2:
        if not hasattr(tag, 'pos'):
            continue
        folded = unicodedata.normalize('NFD', tag.word)
        key = folded.encode('ascii', 'ignore')
        tag_dict[key] = {"pos": tag.pos, "lemma": tag.lemma}
    return tag_dict, tags2
def process_file(out_file_name):
    """Lemmatise ``<out_file_name>.txt`` into ``<out_file_name>_lemmatized.txt``.

    Each input line is tagged with the Polish tagger under a 5-second
    timeout; the lemmas of all entries whose POS is neither ``SENT`` nor
    ``interp`` are written, space-separated, on one output line.  A line
    is skipped when a ``RuntimeError`` (the timeout's exception) is
    raised.  The count of written lines is printed every 100 lines.

    Both files are closed on return, and entries without a ``pos``
    attribute are skipped instead of raising.
    """
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='pl')
    wrong_pos = ('SENT', 'interp')
    i = 0

    with open(out_file_name + '.txt', 'r') as f, \
            open(out_file_name + '_lemmatized.txt', 'w') as w:
        for line in f:
            try:
                with timeout(5, exception=RuntimeError):
                    tags = tagger.tag_text(line)
                tags2 = treetaggerwrapper.make_tags(tags)
                tag_list = [
                    tag.lemma for tag in tags2
                    if hasattr(tag, 'pos') and tag.pos not in wrong_pos
                ]
                w.write(' '.join(tag_list) + '\n')
                i += 1
                # Was ``if i % 100``, which printed on every line except
                # multiples of 100.
                if i % 100 == 0:
                    print(i)
            except RuntimeError:
                continue
def run_treetagger(text, language):
    """Tag ``text`` with a TreeTagger created for ``language``.

    Returns the tagger's ``tag_text`` result for the text string.
    """
    return treetaggerwrapper.TreeTagger(TAGLANG=language).tag_text(text)
def lemmatize_input_files():
    """Tag every ``../texts/txt/*.txt`` file with the French tagger.

    Returns a dict mapping each file's base name to the ``make_tags``
    result of its tagged content (called with ``exclude_nottags=True``).
    """
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='fr')
    lemmatized = {}
    for f in glob.glob("../texts/txt/*.txt"):
        tagged_lines = tagger.tag_file(f)
        lemmatized[os.path.basename(f)] = treetaggerwrapper.make_tags(
            tagged_lines, exclude_nottags=True)
    return lemmatized
def split_words(path, doc_id=''):
    """Build a ``TaggedDocument`` from the selected lemmas of a file.

    The file at ``path`` is tagged with the English tagger; the lemmas of
    all tags whose POS is in ``NN_list`` become the document's words, and
    ``doc_id`` becomes its single tag.
    """
    tagger = ttw.TreeTagger(TAGLANG='en', TAGDIR=settings.TTBin)
    parsed = ttw.make_tags(tagger.tag_file(path))

    words = []
    for tag in parsed:
        if tag.pos in NN_list:
            words.append(tag.lemma)
    return TaggedDocument(tags=[doc_id], words=words)