def tok(tex):
    """
    Tags the text and returns it in a tabular layout that is easy
    for the user to read.
    Input: text
    Output: tagged text
    """
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='fr')
    tags = tagger.tag_text(tex)
    tags2 = treetaggerwrapper.make_tags(tags)
    pprint.pprint(tags2)
    empty = []
    for tag in tags2:
        tagg = tag
        empty.append(tagg)
    print(empty)
    grammar = []
    for element in empty:
        for i in element:
            # if element.index(i)==0 or element.index(i)==1:
            grammar.append(i)
            grammar.append("\t")
        grammar.append("\n")
    res = "".join(grammar)
    return res
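# Usage sketch for tok() above (an illustrative assumption, not part of the
# original source): requires the TreeTagger binary and its French parameter
# file to be installed where treetaggerwrapper can locate them, plus the
# pprint and treetaggerwrapper imports at module level.
#
#   print(tok("Le chat dort."))
#   # each row is "word<TAB>pos<TAB>lemma<TAB>", one token per line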
def write_to_json(words, data, destination, tagger):
    data.setdefault("list", [])
    listItem = {}
    if len(destination) > 0:
        listItem["party"] = destination[0]
        listItem["lastName"] = destination[1]
        listItem["firstName"] = destination[2]
        listItem["year"] = destination[3]
        listItem["month"] = destination[4]
        listItem["day"] = destination[5][:-4]
        listItem.setdefault("words", [])
    else:
        listItem.setdefault("text", [])
    for item in words:
        y = {}
        word, count = item
        tag = tagger.tag_text(str(word))  # was unicode(word) under Python 2
        maketags = treetaggerwrapper.make_tags(tag)
        for t in maketags:  # renamed from item to avoid shadowing the outer loop variable
            y["tag"] = t[1]
        y["word"] = word
        y["count"] = count
        if len(destination) > 0:
            listItem["words"].append(y)
        else:
            listItem["text"].append(y)
    data["list"].append(listItem)
def _pos_tag(words: list[str], lang: str):
    tagger = _get_tagger(lang)
    # we do our own chunking, so call the treetagger with a list of words instead
    tags_strs = tagger.tag_text(words, tagonly=True)
    return treetaggerwrapper.make_tags(tags_strs, exclude_nottags=False,
                                       allow_extra=True)
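# Usage sketch for _pos_tag() (hypothetical; _get_tagger is assumed to return
# a configured treetaggerwrapper.TreeTagger for the given language). With
# tagonly=True the wrapper skips its own tokenization, so the caller must
# pass pre-tokenized input, one token per list element:
#
#   _pos_tag(['The', 'cats', 'sleep', '.'], 'en')
#   # -> [Tag(word='The', pos='DT', lemma='the'), ...] or similar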
def tag(text, tt_home):
    # Default NLTK tokenizer:
    # TreebankWordTokenizer + PunktSentenceTokenizer
    nltk_start = time()
    tokens = word_tokenize(text)
    # Default NLTK POS tagger
    # ?
    # Use tagset='universal' for the universal tagset
    nltk_tagged = pos_tag(tokens)
    nltk_end = time()
    nltk_execution = nltk_end - nltk_start
    logger.info("NLTK took %f seconds" % nltk_execution)
    # TreeTagger wrapper
    # Tokenization: ?
    # Default language: English
    # English: trained on Penn treebank
    # Default flags: -token -lemma -sgml -quiet -no-unknown
    tt_start = time()
    tt = TreeTagger(TAGDIR=tt_home)
    raw_tags = tt.tag_text(text)
    tt_end = time()
    tt_execution = tt_end - tt_start
    tt_tagged = make_tags(raw_tags)
    logger.info("TreeTagger took %f seconds" % tt_execution)
    return (nltk_tagged, nltk_execution), (tt_tagged, tt_execution)
def fit_transform(self):
    # Parsing (lemmatisation and POS-tagging)
    df = self.df  # the original mixed bare df and self.df; work on one frame throughout
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='fr')
    df['tags'] = df[self.text_column].apply(
        lambda x: treetaggerwrapper.make_tags(tagger.tag_text(x)))
    df['lemma'] = df.tags.apply(lambda x: [
        t.lemma.lower() if isinstance(t, treetaggerwrapper.Tag) else ''
        for t in x
    ])
    df['text_lemma'] = df.apply(lambda row: " ".join(row.lemma), axis=1)
    df['pos'] = df.tags.apply(lambda x: [
        t.pos if isinstance(t, treetaggerwrapper.Tag) else '' for t in x
    ])

    # Surface-based features
    df['number_verbs'] = df.pos.apply(lambda x: self.number_verbs(x))
    df['number_proper_nouns'] = df.pos.apply(
        lambda x: self.number_proper_nouns(x))
    df['number_imperative_verb'] = df.pos.apply(
        lambda x: self.number_imperative_verb(x))

    # Sentiment features
    lex_dict = self.lex_df.to_dict('index')
    list_lex_exp = df.lemma.apply(lambda x: [
        lex_dict[key] if self.has_expression(exp=lex_dict[key]['lemma'],
                                             string_ls=x,
                                             group=lex_dict[key]['group'])
        else None for key in lex_dict.keys()
    ])
    list_lex_exp = list_lex_exp.apply(lambda x: list(filter(None, x)))
    df['has_intensifier'] = self.has_intensifier(list_lex_exp)
    df['avg_polarity'] = self.avg_polarity(list_lex_exp)
    return df
def __init__(self, auteur, numero, langue="fr"):
    """Create the Oeuvre object if it does not exist yet and save it to a
    file of the same name. If it already exists, simply reload it from
    the file."""
    self.auteur = auteur
    self.numero = numero
    self.langue = langue
    self.categorie = None
    emplacement_textes = (emplacement_dossier_groupe + "Corpus/"
                          + dico_langues[langue] + "/Fichiers txt/")
    emplacement_oeuvres = (emplacement_dossier_groupe + "Corpus/"
                           + dico_langues[langue] + "/Fichiers oeuvres/")
    #self.infos = Infos(auteur, numero)
    print(auteur + str(numero), end=" ")
    try:
        with open(emplacement_oeuvres + auteur + str(numero), "rb") as mon_fichier:
            o = pickle.load(mon_fichier)
        self.texte_brut = o.texte_brut
        self.tags = o.tags
        self.mots = o.mots
        self.racines = o.racines
        self.POS = o.POS
        print("(importation terminee)", end=" / ")
    except FileNotFoundError:
        tagger = TreeTagger(TAGLANG=self.langue)
        self.texte_brut = formater(importer(auteur, numero, emplacement_textes))
        self.tags = make_tags(tagger.tag_text(self.texte_brut))
        self.mots = [t[0] for t in self.tags if len(t) == 3]
        self.racines = [t[2] for t in self.tags if len(t) == 3]
        self.POS = [t[1] for t in self.tags if len(t) == 3]
        with open(emplacement_oeuvres + "/" + auteur + str(numero), "wb") as mon_fichier:
            pickle.dump(self, mon_fichier, protocol=2)
        print("(creation terminee)", end=" / ")
def tagText(self, text):
    cleared_text = preprocessTweetText(text)
    tags = self.tagger.tag_text(cleared_text, notagemail=True, notagdns=True)
    tags2 = treetaggerwrapper.make_tags(tags)
    return tags2
def get_documents(docs, stopwords):
    """Extract the documents from the corpus.

    :param docs: [(source, datetime, text)]
    :param stopwords: stop-word list (accepted but unused here)
    """
    documents = list()
    # drop short lines of fewer than 4 words
    corpus = [(doc[0], doc[1], doc[2]) for doc in docs if len(doc[2].split()) > 3]
    tagger = tagr.TreeTagger(
        TAGLANG='fr',
        TAGDIR='c:/Applications/TreeTagger',
        TAGPARFILE='C:/Applications/TreeTagger/lib/french-utf8.par')
    idx, start_time = 1, time.time()
    for doc in corpus:
        source, datetime, raw = doc[0], doc[1], doc[2]
        tags = tagr.make_tags(tagger.tag_text(clean_text(raw)))
        tags = [tag for tag in tags if type(tag) == tagr.Tag]
        # add all our elements to the array (documents);
        # each element in the array is a dictionary
        documents.append({
            'idx': idx,
            'source': source,
            'time': datetime,
            'raw': raw,
            'tags': tags
        })
        # print the progress percentage info
        idx = progress_per(idx, len(corpus), start_time)
    print()
    return documents
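# Usage sketch for get_documents() (hypothetical data; the hard-coded Windows
# TAGDIR/TAGPARFILE paths above must exist on the machine running it, and
# clean_text/progress_per come from the surrounding module):
#
#   docs = [("lemonde", "2020-01-01", "Le président a annoncé une réforme importante.")]
#   documents = get_documents(docs, stopwords=[])
#   documents[0]['tags']  # list of Tag(word=..., pos=..., lemma=...) tuples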
def choose_next_step(tree, text, context=None):
    responses = tree['next']
    responses.append("start>stop")
    tags = make_tags(tagger.tag_text(text), exclude_nottags=True)
    proper_name = []
    noun = []
    verb = []
    other = []
    for word, pos, lemma in tags:
        lemma = lemma.lower()
        if (pos == "NAM" or pos == "ADJ") and lemma not in proper_name:
            proper_name.append(lemma)
        elif pos == "NOM" and lemma not in noun:
            noun.append(lemma)
        elif pos.startswith("VER") and lemma not in verb:
            verb.append(lemma)
        else:
            other.append(lemma)
    list_tags = proper_name + noun + verb + other
    # check for other keywords
    for tag in list_tags:
        for response in responses:
            if type(response) == str:
                response = get_tree_by_tag(response)
            if tag in response['keywords']:
                if context:
                    response['context'] = context
                return response
    # cannot find an answer in the middle of an action => REPETE
    text_to_speech("Désolé, je n'ai pas compris")
    return REPETE
def lemmatizza(frase):
    frasefinale = ""
    try:
        b = frase.split()
        frasef = []
        for parole in b:
            tags = tagger.tag_text(parole)
            pos = treetaggerwrapper.make_tags(tags)
            #pprint(pos)
            for w in pos:
                if parole == "vai":
                    frasef.append("andare")
                else:
                    a = w[2].lower()
                    if "|" not in a:
                        frasef.append(w[2].lower())
                    else:
                        # ambiguous lemma like "essere|stare": keep the last alternative
                        a = a.replace("|", "I")
                        a = re.sub(r'.*I', '', a)
                        frasef.append(a)
        frasefinale = " ".join(frasef)
    except Exception:
        pass
    return frasefinale
def text_to_tags(text):
    tagger = treetaggerwrapper.TreeTagger(
        TAGLANG='en',
        TAGDIR='/home/yassine/EMSE 2015-2016/Projet Recherche/tree-tagger-linux-3.2')
    if isinstance(text, bytes):  # was unicode(text, encoding='utf-8') under Python 2
        text = text.decode('utf-8')
    tags = treetaggerwrapper.make_tags(tagger.tag_text(text))
    pos_tags = []
    for pos in tags:
        pos_tags.append(pos[1])
    return " ".join(pos_tags)
def _extract_tags(sentence):
    '''
    Internal method extracting the tags (word, part-of-speech
    class, lemma) from the treetaggerwrapper
    '''
    tagged_text = _TAGGER.tag_text(sentence)
    tags = treetaggerwrapper.make_tags(tagged_text)
    return tags
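# Usage sketch (hypothetical; _TAGGER must already be a module-level
# treetaggerwrapper.TreeTagger instance, e.g. TreeTagger(TAGLANG='fr')).
# Tag namedtuples unpack directly into their three fields:
#
#   for word, pos, lemma in _extract_tags("Les enfants jouent dehors."):
#       print(word, pos, lemma)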
def lemmatize_word(self, doc):
    tags = self.lemmatizer.tag_text(doc)
    tags2 = treetaggerwrapper.make_tags(tags)
    if tags2:
        return tags2[0].lemma
    else:
        return ""
def tag(self, text):
    """POS tag tokenized text."""
    if self.tagger_name == POSTagger.SFT or self.tagger_name == POSTagger.STAN:
        tokens = self.__tokenizer.tokenize(text)
        return self.__tagger.tag(tokens)
    elif self.tagger_name == POSTagger.TT:
        tags = self.__tagger.tag_text(text)
        tuple_list = []
        tag_list = treetaggerwrapper.make_tags(tags)
        for item in tag_list:
            tuple_list.append((item[0], item[1]))
        return tuple_list
    elif self.tagger_name == POSTagger.SPACY:
        tags = self.__tagger(text)
        tuple_list = []
        for word in tags:
            tuple_list.append((word.orth_, word.tag_))
        return tuple_list
    else:
        pass

#tagger = POSTagger("spacy-tagger")
#doc = tagger.tag(u"Bei mir zu Hause denken sie bestimmt, daß ich noch krank sei.")
#print(tagger.tag("Ich werde morgen in die Schule gehen."))
#print(tagger.tag("Hat Aglaja den Brief etwa der Alten gezeigt?«"))
def process_file(out_file_name):
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='pl')
    f = open(out_file_name + '.txt', 'r')
    w = open(out_file_name + '_lemmatized.txt', 'w')
    i = 0
    wrong_pos = ['SENT', 'interp']
    for line in f:
        try:
            with timeout(5, exception=RuntimeError):
                tags = tagger.tag_text(line)
                tag_list = []
                tags2 = treetaggerwrapper.make_tags(tags)
                for tag in tags2:
                    if tag.pos not in wrong_pos:
                        tag_list.append(tag.lemma)
                w.write(' '.join(tag_list) + '\n')
                i += 1
                if i % 100 == 0:  # was `if i % 100:`, which fired on all but every 100th line
                    print(i)
        except RuntimeError:
            continue
def lemma(inpath, outpath, charFilter):
    '''
    Lemmatizes the texts in the given folder inpath.
    '''
    for text in os.listdir(inpath):
        if text.endswith('.txt'):
            f_lemma = []
            result = ''
            infile = open(inpath + '/' + text, 'r')  # renamed from t to avoid shadowing the tag loop variable
            f = infile.read()
            tagger = treetaggerwrapper.TreeTagger(TAGLANG='de')
            tags = tagger.tag_text(f)
            tags2 = treetaggerwrapper.make_tags(tags)
            print("text", text)
            for t in tags2:
                try:
                    result += t.lemma
                    result += ' '
                except AttributeError:  # NotTag entries have no .lemma
                    pass
            f_lemma.append(result)
            out_name = outpath + text.split('_')[1] + '_' + text.split('_')[2] + '.txt'
            if os.path.exists(out_name):
                txtFile = open(out_name, 'w')
                txtFile.write('')
                txtFile.close()
            txtFile = open(out_name, 'a')
            for i in f_lemma:
                txtFile.write(replace(i, charFilter + ' '))
            txtFile.close()
    return
def build_tree_tagger(text, source_file, output_path):
    global dir_tree_tagger
    # build a TreeTagger wrapper
    tagger = treetaggerwrapper.TreeTagger(TAGDIR=dir_tree_tagger, TAGLANG="fr")
    # tag text
    tags = tagger.tag_text(text)
    if not output_path.exists():
        output_path.mkdir(parents=True)
    tagger.tag_file_to(str(source_file), str(output_path / 'tagger_result.txt'))
    # pprint.pprint(tags)
    tags2 = treetaggerwrapper.make_tags(tags)
    # pprint.pprint(tags2)
    tag_dict = dict()
    for tag in tags2:
        if hasattr(tag, 'pos'):
            tag_dict[unicodedata.normalize('NFD', tag.word).encode('ascii', 'ignore')] = {
                "pos": tag.pos,
                "lemma": tag.lemma
            }
    # pprint.pprint(tags2)
    return tag_dict, tags2
def getwords(doc):
    tags = [
        x.lemma
        for x in treetaggerwrapper.make_tags(tagger.tag_text(doc),
                                             exclude_nottags=True)
        if not re.search("[0-9a-f]{10,}|[^0-9A-Za-z]", x.lemma)
    ]
    return tags[0] if len(tags) > 0 else None
def part_of_speech(words):
    tags = []
    # unicode() call dropped; str is already unicode in Python 3
    t_tags = ttw.make_tags(tagger.tag_text(' '.join(words)),
                           exclude_nottags=True)
    for tag in t_tags:
        tags.append((tag.lemma, tag.pos))
    return dict([(tag, True) for tag in tags])
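# Usage sketch for part_of_speech() (hypothetical; a module-level tagger is
# assumed). The returned dict maps each (lemma, pos) pair to True, i.e. the
# boolean feature-dict format expected by NLTK classifiers:
#
#   part_of_speech(['the', 'cats', 'sleep'])
#   # -> {('the', 'DT'): True, ('cat', 'NNS'): True, ('sleep', 'VVP'): True} or similar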
def annotate_with_pos(articles, tagger):
    all_pos = []
    for art in articles:
        tokenized_art = []
        # Tokenize by word
        for sent in art.split('\n'):
            tokenized_art += word_tokenize(sent)
        tg_input = "\n".join([t for t in tokenized_art])
        # Apply tagger
        tg_output = tagger.tag_text(tg_input)
        # Receive tags for each word, exclude URLs and similar
        anno = treetaggerwrapper.make_tags(tg_output, exclude_nottags=True)
        pos_dict = defaultdict(int)
        # Count the POS tag occurrences
        # NOTE: Saving the list of tags caused memory errors.
        for a in anno:
            pos_dict[a[1]] += 1
        all_pos.append(pos_dict)
    vec = DictVectorizer()
    # Convert the dictionaries to a feature matrix
    pos_vectorized = vec.fit_transform(all_pos)
    return pos_vectorized.toarray()
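# Usage sketch for annotate_with_pos() (hypothetical article; tagger is a
# configured treetaggerwrapper.TreeTagger). Each row of the returned matrix
# holds the POS-tag counts of one article, with the columns fixed by
# DictVectorizer over all articles:
#
#   X = annotate_with_pos(["The cat sat.\nIt purred."], tagger)
#   X.shape  # (1, number_of_distinct_POS_tags_seen)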
def get_level(file_name):
    tagger = ttw.TreeTagger(TAGLANG='en', TAGDIR=os.getcwd())
    tags = tagger.tag_file(file_name + '.txt')
    tags2 = ttw.make_tags(tags)
    # with open(file_name + '.tag', 'w') as f:
    #     for tag in tags:
    #         f.write("%s\n" % tag)
    import re
    words = []
    for tag in tags2:
        if re.search(r'^\w', tag.lemma):
            for word in tag.lemma.lower().split('-'):
                words.append(word)
    words = list(set(words))
    import word_level
    sentence_level = [0] * 13
    unknown_words = []
    for word in words:
        level = word_level.get_level(word)
        sentence_level[level] += 1
        if level == 0:
            unknown_words.append(word)
    print(sentence_level)
    print(unknown_words)
    return sentence_level
def taggerTexte(texte):
    """
    Normalizes the text and returns the lemmas.
    Arguments: text
    Returns: list of the relevant lemmas
    """
    texxt = texte.split("’")
    tex = "'".join(texxt)
    langdet = 'fr'  # default; the original left langdet unset when detection matched neither language
    if detect(tex) == "fr":
        langdet = 'fr'
    if detect(tex) == 'en':
        langdet = 'en'
    tagger = treetaggerwrapper.TreeTagger(TAGLANG=langdet)
    tags = tagger.tag_text(tex)
    tags2 = treetaggerwrapper.make_tags(tags)
    #pprint(tags2)
    empty = []
    for tag in tags2:
        tagg = tag
        empty.append(tagg)
    #print(empty)
    grammar = []
    for element in empty:
        compt = 0
        for i in element:
            if compt == 1 or compt == 2:
                grammar.append(i)
                grammar.append("\t")
            compt += 1
        grammar.append("\n")
    del grammar[-1]
    res = "".join(grammar)
    ress = res.split("\n")
    # filter instead of deleting while iterating, which skips elements
    ress = [rrr for rrr in ress if len(rrr) > 0 and "@" not in rrr]
    lemmes = []
    for rr in ress:
        match_tag = re.search(r"(.*)\t(.*)\t", rr)
        if match_tag is None:
            continue
        pos = match_tag.group(1)
        # keep content words: French tags (VER, NOM, ABR, ADJ)
        # and their English counterparts (VV, NN, NP, JJ, VH, VB, MD)
        if any(k in pos for k in ("VER", "NOM", "ABR", "ADJ",
                                  "VV", "NN", "NP", "JJ", "VH", "VB", "MD")):
            lemmes.append(match_tag.group(2).lower())
    return lemmes
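# Usage sketch for taggerTexte() (hypothetical; requires langdetect's detect()
# plus the French and English TreeTagger parameter files). Only content-word
# lemmas (verbs, nouns, abbreviations, adjectives) survive the filtering:
#
#   taggerTexte("Les ministres ont adopté le projet.")
#   # -> ['ministre', 'adopter', 'projet'] or similar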
def tokenize_and_lemmatize_tweets(listTweets):
    """Tokenize & lemmatize a list of texts"""
    global french_stop_words
    global mention_regex
    global LOCALTAGDIR
    # Setting up TreeTagger
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='fr', TAGDIR=LOCALTAGDIR)
    for t in listTweets:
        text = mention_regex.sub("", t["text"]).lower()
        tags = tagger.tag_text(text)
        tags = treetaggerwrapper.make_tags(tags)
        tokens = []
        lemma = []
        # Filtering
        for tag in tags:
            if hasattr(tag, 'word'):
                if not (len(tag.lemma) < 2 or tag.lemma in french_stop_words):
                    tokens.append(tag.word)
                    lemma.append(tag.lemma)
            else:
                token = tag.what
                if not (len(token) < 2 or token in french_stop_words):
                    if token.startswith("<repurl") or token.startswith("<repdns"):
                        # keep only the URL/host between the quotes
                        token = token[token.find('"') + 1:token.rfind('"')]
                    else:
                        lemma.append(token)
                    tokens.append(token)
        t["tokenArray"] = tokens
        t["lemmaArray"] = lemma
    return listTweets
def split_words(path, doc_id=''):
    tagger = ttw.TreeTagger(TAGLANG='en', TAGDIR=settings.TTBin)
    tags = tagger.tag_file(path)
    tags = ttw.make_tags(tags)
    return TaggedDocument(
        tags=[doc_id],
        words=[tag.lemma for tag in tags if tag.pos in NN_list])
def compute(self):
    """
    Compute the feature value for the Bag of POS Tags attribute.

    First check whether there is a previously seen BoP skeleton.
    If there is not, call build_model(). Walk through text_set and
    compute the feature value for every text object, counting every
    POS tag occurrence in the text_set.
    Store the feature value in the text.features hash.
    """
    if self.bow_model is not None:
        print("BOW not None")
        for text in self._text_set:
            temp_model = dict(self.bow_model)
            tags = treetaggerwrapper.make_tags(
                self.tagger.tag_text(text.text.decode("utf-8")))
            for tag in tags:
                try:
                    temp_model[tag[1]] += 1
                except KeyError:
                    continue
            text.features["bag_of_pos"] = temp_model.values()
    else:
        print("BOW is None")
        self.build_model()
        for text in self._text_set:
            # for the test case ``test__bag_of_words__compute``, use the OrderedDict
            # to check the values against the term_frequency in test_suitcase.resource:
            # temp_model = collections.OrderedDict(sorted(self.model.items()))
            temp_model = dict(self.model)
            tags = treetaggerwrapper.make_tags(
                self.tagger.tag_text(text.text.decode("utf-8")))
            for tag in tags:
                try:
                    temp_model[tag[1]] += 1
                except KeyError:
                    continue
            text.features["bag_of_pos"] = temp_model.values()
        self.bow_model = self.model
def lemmatize_input_files():
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='fr')
    files = [f for f in glob.glob("../texts/txt/*.txt")]
    return {
        os.path.basename(f): treetaggerwrapper.make_tags(tagger.tag_file(f),
                                                         exclude_nottags=True)
        for f in files
    }
def french_labeling(New, lang):
    TAGS = GetTags.get_tags(lang)
    A = [1 / n for n in range(1, len(TAGS) + 1)]
    A.sort()
    # Ranking the tags
    d = {}
    for i in range(len(A)):
        d[TAGS[i]] = A[len(A) - 1 - i]
    # Generating positive and negative lexicons for word labeling
    Positive, Negative = Lexicons.lexicons(lang)
    # Extracting sentences
    L = []
    if lang == 'English':
        lang = 'en'
    elif lang == 'French':
        lang = 'fr'
    # 1) build a TreeTagger wrapper according to the language:
    tagger = treetaggerwrapper.TreeTagger(TAGLANG=lang)
    for i in range(len(New)):
        s = New['description'][i]
        s = re.sub('#', '', s)
        # 2) tag the text
        tags = tagger.tag_text(s)
        tags2 = treetaggerwrapper.make_tags(tags)
        pos_s = 1
        neg_s = 1
        for j in range(len(tags2)):
            if tags2[j][0] in Positive or tags2[j][2] in Positive:
                if tags2[j - 1][0] in ['jamais', 'pas', 'ne', 'non']:
                    pos_s = pos_s - (pos_s * d[tags2[j][1]])
                else:
                    pos_s = pos_s + (pos_s * d[tags2[j][1]])
            elif tags2[j][0] in Negative or tags2[j][2] in Negative:
                if tags2[j - 1][0] in ['jamais', 'pas', 'ne', 'non']:
                    neg_s = neg_s - (neg_s * d[tags2[j][1]])
                else:
                    neg_s = neg_s + (neg_s * d[tags2[j][1]])
        # Mapping scores to labels
        if pos_s / len(s) == neg_s / len(s) or abs(pos_s - neg_s) < 0.05:
            polarity = 'Neutral'
        elif neg_s / len(s) > pos_s / len(s):
            polarity = 'Negative'
        else:
            polarity = 'Positive'
        L.append(s + '\t' + polarity)
    LL = [a.split('\t')[1] for a in L]
    return LL
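# Worked example of the rank weights built above (hypothetical 3-tag list):
# with TAGS = ['VER', 'NOM', 'ADJ'], A sorts to [1/3, 1/2, 1.0] and d becomes
# {'VER': 1.0, 'NOM': 0.5, 'ADJ': 1/3}; i.e. the tag ranked i-th (0-based)
# receives weight 1/(i+1), so higher-ranked tags move pos_s/neg_s more strongly.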
def test_apply_gives_correct_relative_values_for_skip_grams(self):
    tags = treetaggerwrapper.make_tags(
        self.tagger.tag_text(
            'ceci est un texte très court à taguer, et un mot apparaît deux fois'
        ))
    result = SkipGramFeature().apply(tags)
    self.assertEqual({
        ('DET:ART', 'NOM'): 0.08,
        ('VER:pres', 'NOM'): 0.08
    }, result)
def comment_to_lemme(comment):
    t = treetaggerwrapper.make_tags(tagger.tag_text(comment))
    lemme = ''
    for i in t:
        if type(i) == treetaggerwrapper.Tag:
            if i.pos[:3] in ('ADJ', 'ADV', 'INT', 'KON', 'NOM', 'VER'):
                if i.lemma != 'dns-remplacé':
                    if len(i.lemma) > 1:
                        lemme = lemme + ' ' + i.lemma.split('|')[0].lower()
    return lemme
def tag_text(text):
    if language == 'german':
        country_code = 'de'
    else:
        country_code = 'en'
    tagger = treetaggerwrapper.TreeTagger(TAGLANG=country_code,
                                          TAGDIR=tree_tagger_dir)
    text_with_tags = tagger.tag_text(text)
    tags = treetaggerwrapper.make_tags(text_with_tags)
    return tags
def sentences_to_ngrams(sentences, ngram_size, fr_nouns_file):
    ngrams = []
    context_size = int(ngram_size / 2)
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='fr',
                                          TAGINENC='utf-8',
                                          TAGOUTENC='utf-8')
    with open(fr_nouns_file, "r") as file:
        fr_nouns = file.readlines()
    for s in sentences:
        s = s.replace(';', '')
        s = s.replace("'", chr(39))
        s = s.replace('\'', chr(39))
        # protect the elisions d'/l' with placeholder tokens so the tagger
        # does not split them, then restore them after tagging
        s = s.replace("d\'", " deeee ")
        s = s.replace("l\'", " leeee ")
        sentence_tagged = treetaggerwrapper.make_tags(tagger.tag_text(s))
        # getting only the tokens (not lemmas and POS)
        sentence = list(np.array(sentence_tagged)[:, 0])
        for i, token in enumerate(sentence):
            if token == "leeee":
                sentence[i] = "l\'"
            if token == 'deeee':
                sentence[i] = "d\'"
        index_left = sentence.index('[')
        index_right = sentence.index(']')
        phrase_ngram = []
        # add left context
        for i in range(context_size):
            try:
                phrase_ngram.append(sentence[index_left - context_size + i])
            except IndexError:
                # not enough words (e.g. pivot word starting the sentence)
                phrase_ngram.append(random.choice(fr_nouns).rstrip())
        # add pivot token(s) (can contain several tokens)
        phrase_ngram.append(' '.join(sentence[index_left + 1:index_right]))
        # add right context
        for i in range(context_size):
            try:
                phrase_ngram.append(sentence[index_right + 1 + i])
            except IndexError:
                # not enough words (e.g. pivot word ending the sentence)
                phrase_ngram.append(random.choice(fr_nouns).rstrip())
        ngrams.append(phrase_ngram)
    return ngrams
def extract_tags(text):
    ret = list()
    tags = tagger.tag_text(text)  # was tag_text(question): an undefined name
    tags = make_tags(tags)
    for tag in tags:
        tmp = dict()
        tmp['word'] = tag.word
        tmp['lemma'] = tag.lemma.split('|')[0]
        tmp['pos'] = translate_pos(tag.pos.split(':')[0])
        ret.append(tmp)
    return ret
def compute(self):
    """
    Compute the feature value for the Adjective attribute.
    Walk through text_set and compute the feature value for every
    text object. Store the feature value in the text.features hash.
    """
    for text in self._text_set:
        tag_list = treetaggerwrapper.make_tags(
            self.tagger.tag_text(text.text.decode("utf-8")))
        text.features["adjective"] = self.count_adj(tag_list)
def lemmatise(self, text):
    """Lemmatise a text."""
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='en')
    tags = tagger.tag_text(text)
    mytags = treetaggerwrapper.make_tags(tags)
    lemma_list = []
    for tag in mytags:
        lemma_list.append(tag.lemma)
    return ' '.join(lemma_list)
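# Usage sketch (hypothetical; needs the English TreeTagger parameter file).
# Ambiguous lemmas may come back pipe-joined (e.g. "lie|lay"), which this
# method keeps as-is:
#
#   obj.lemmatise("The cats were sleeping")
#   # -> 'the cat be sleep' or similar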
def build_model(self):
    """
    Build a Bag of POS Tags skeleton.

    The skeleton is a hash whose keys are every unique POS tag found
    in a text from the text_set; the initial value is the integer 0.
    """
    for text in self._text_set:
        tags = treetaggerwrapper.make_tags(
            self.tagger.tag_text(text.text.decode("utf-8")))
        for tag in tags:
            self.model[tag[1]] = 0
def treetag_paragraphs(self, paragraphs, tagger):
    try:
        tt_tags = [
            treetaggerwrapper.make_tags(tagger.tag_text(para.lower()),
                                        exclude_nottags=True)
            for para in paragraphs
        ]
    except Exception:  # was a bare except
        print(f'Treetagging error on id: {self.id}')
        tt_tags = []
    return tt_tags
def compute(self):
    """
    Compute the quantity of modal verbs in every text from the text_set.
    Walk through text_set and compute the feature value for every text
    object. Store the feature value in the text.features hash.
    """
    for text in self._text_set:
        count_modal = 0
        tags = treetaggerwrapper.make_tags(
            self.tagger.tag_text(text.text.decode("utf-8")))
        for tag in tags:
            if tag[1] in self.tag_list:
                count_modal += 1
        text.features[self._name] = count_modal
def applyTreeTagger(self, text):
    """
    Computes the TreeTagger result for the given text if not already done.
    @param text: string, text that will be tagged
    @return: list, TreeTagger result
    """
    if self.treetagged == "":
        tagger = tt.TreeTagger(TAGLANG="de")
        tagged_list = tt.make_tags(tagger.tag_text(self.cleanSource(text)))
        self.treetagged = tagged_list  # cache the result so the tagger runs only once,
                                       # as the docstring promises (the original never stored it)
        return tagged_list
    else:
        return self.treetagged
def tag_one(self, text, skip_unknown=True, **kwargs):
    """ POS-tags the given text, optionally skipping unknown lemmas

        :param unicode text: Text to be tagged
        :param bool skip_unknown: Automatically remove unrecognized tags from the result

        Sample usage:

        >>> from strephit.commons.pos_tag import TTPosTagger
        >>> from pprint import pprint
        >>> pprint(TTPosTagger('en').tag_one(u'sample sentence to be tagged fycgvkuhbj'))
        [Tag(word=u'sample', pos=u'NN', lemma=u'sample'),
         Tag(word=u'sentence', pos=u'NN', lemma=u'sentence'),
         Tag(word=u'to', pos=u'TO', lemma=u'to'),
         Tag(word=u'be', pos=u'VB', lemma=u'be'),
         Tag(word=u'tagged', pos=u'VVN', lemma=u'tag')]
    """
    return self._postprocess_tags(make_tags(self.tagger.tag_text(text, **kwargs)),
                                  skip_unknown)
def compute(self):
    """
    Compute the feature value for the SentenceStart attribute.
    Build lemmata and tags with TreeTagger. Walk through text_set and
    compute the feature value for every text object. Store the feature
    value in the text.features hash.
    """
    for text in self._text_set:
        for sent in text.sentencelist:
            tags = treetaggerwrapper.make_tags(self.tagger.tag_text(sent))[0:2]
            try:
                self.tuple_list_lemma.append((tags[0][2], tags[1][2]))
                self.tuple_list_tag.append((tags[0][1], tags[1][1]))
            except IndexError:
                continue
        text.features["sentence_start"] = [self.count_lemma(), self.count_tag()]
        self.tuple_list_tag = []
        self.tuple_list_lemma = []
def treetag(sentence, encoding=None):
    # TreeTagger helper function.
    if encoding is not None:
        # was unicode(sentence, "utf-8") under Python 2, which ignored the
        # encoding argument; decode with the one the caller passed
        return treetaggerwrapper.make_tags(tagger.tag_text(sentence.decode(encoding)))
    else:
        return treetaggerwrapper.make_tags(tagger.tag_text(sentence))
def lemmatize_chunk(self, doc):
    tags = self.lemmatizer.tag_text(doc)
    tags2 = treetaggerwrapper.make_tags(tags)
    return [item.lemma for item in tags2]
def get_tree_tagged_tokens(self):
    """Takes the tokens and tags them."""
    tagger = self.tagger
    return treetaggerwrapper.make_tags(tagger.tag_text(self.tokens))
def _finalize_batch(self, jobs, pos_tag_key):
    for item, job in jobs:
        job.wait_finished()
        item[pos_tag_key] = self._postprocess_tags(make_tags(job.result))
        yield item
def stemmed_token_count(self, token):
    stem = treetaggerwrapper.make_tags(self.tagger.tag_text(token))[0].lemma
    return FreqDist(self.stems)[stem]
message = re.sub(r'\n', '', message)
# delete vote comments
for comments in voteComments:
    for comment in comments:
        message = re.sub(comment, '', message)
        message = message.replace(comment, '')
comment = r'\([1-9]* inline comment(s)*\)'
message = re.sub(comment, '', message)
if message == '':
    continue
# set message tag
tagText = tagger.tag_text(message)  # .decode('utf-8') dropped; str is unicode in Python 3
tags = treetaggerwrapper.make_tags(tagText, exclude_nottags=True)
#print words
fv = {}     # count of each word
words = 0   # total count of words in the line
for word in list(fv_df.keys()):
    count_flag[word] = False
for tag in tags:
    words += 1
    word = tag.lemma
    if word in fv:  # was fv.has_key(word) under Python 2
        fv[word] += 1
    else:
        fv[word] = 1
count_flag = {}  # dictionary holding the flags needed to compute fv_df

# morphological analysis of each document and counting word occurrences
for txt_id, txt in enumerate(text):
    # initialization for MeCab (previous version)
    #tagger = MeCab.Tagger("-d /var/lib/mecab/dic/ipadic-utf8")
    #tagger.parse('')
    #node = tagger.parseToNode(txt)

    # initialization for TreeTagger
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='en', TAGDIR='../treetagger')
    tagText = tagger.tag_text(txt)
    tags = treetaggerwrapper.make_tags(tagText, exclude_nottags=False)
    fv = {}    # dictionary storing word occurrence counts
    words = 0  # total number of word occurrences in this document
    for word in list(fv_df.keys()):
        count_flag[word] = False
    # the old MeCab while-loop over node.next does not apply to the list
    # returned by make_tags; iterate over the tags instead
    for tag in tags:
        surface = tag.word  # word obtained from the morphological analysis
        words += 1
    sent_tokenizer=nltk.data.LazyLoader('tokenizers/punkt/spanish.pickle'))
stop_words = nltk.corpus.stopwords.words('spanish')
non_alphabetic = re.compile(r"\W|\d")
words = []
tags = []
# Using TreeTagger
# 1) pip install treetaggerwrapper
# 2) put TreeTagger in %PYHOME%\Lib\site-packages\TreeTagger
# 3) put spanish-utf8.par and spanish-chunker.par in \TreeTagger\lib
# See http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/spanish-tagset.txt for tag meanings
tagger = treetaggerwrapper.TreeTagger(TAGLANG='es')
for sentence in article_corpus.sents():
    tagged_sentence = tagger.tag_text(sentence)
    tags.extend(treetaggerwrapper.make_tags(tagged_sentence))
#TODO: create a tagger script, save the tagged files
#TODO: look at alternate taggers, compare
#TODO: profile this and see which part is taking so long
for tag in tags:
    lemma = tag[2].lower()
    if lemma not in stop_words and not non_alphabetic.search(lemma):
        words.append(lemma)
freq_dist = FreqDist(words)
with open('./frequency_distribution.txt', 'w', encoding='utf-8') as f:
    f.write("word, number of occurrences\n")
    for word in freq_dist.most_common():
def tag_pos(text, language):
    '''Tag parts-of-speech in text; return tagged text'''
    # ttw will throw an error if the code isn't supported
    tagger = ttw.TreeTagger(TAGLANG=lang_codes[language])
    tags = tagger.tag_text(text)
    return ttw.make_tags(tags)
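# Usage sketch for tag_pos() (hypothetical; lang_codes is assumed to map
# language names to TreeTagger codes, e.g. {'english': 'en', 'french': 'fr'}):
#
#   for tag in tag_pos("Time flies like an arrow.", 'english'):
#       print(tag.word, tag.pos, tag.lemma)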