def add_pos_tags(self, treetaggerlocation, taglang="es"):
    """POS-tag the tokenized sentences for both sides of the corpus.

    English sentences (self.tokenizedLines_en) are tagged with NLTK's
    pos_tag_sents; the second-language sentences (self.tokenizedLines_es)
    are tagged with TreeTagger using a language-specific parameter file.

    Args:
        treetaggerlocation: TreeTagger installation directory (TAGDIR).
        taglang: language code of the non-English side: 'fi', 'es',
            'de' or 'pl' (default 'es').

    Raises:
        ValueError: if taglang has no known parameter file.  (The original
            if-chain left `tagger` unbound for unknown languages and
            crashed later with an UnboundLocalError.)
    """
    print("Adding POS-Tags...")

    # English side via NLTK.
    # pos_tag_sents -> [[(w1, tag1), (w2, tag2)], [(w1, t1), ...], ...]
    for sent in pos_tag_sents(self.tokenizedLines_en):
        # -> [["w1t1", "w2t2", ...], ["w1t1", ...], ...]
        self.pos_tagged_sents_en.append([word + tag for word, tag in sent])

    # Second-language side via TreeTagger, one parameter file per language.
    parfiles = {
        "fi": "Preprocessing/fi_en/finnish.par",
        "es": "Preprocessing/es_en/spanish.par",
        "de": "Preprocessing/de_en/german.par",
        "pl": "Preprocessing/pl_en/polish.par",
    }
    if taglang not in parfiles:
        raise ValueError("Unsupported tag language: %r" % (taglang,))
    tagger = TreeTagger(TAGLANG=taglang, TAGDIR=treetaggerlocation,
                        TAGPARFILE=parfiles[taglang])

    pos_tagged_sents = []
    for line in self.tokenizedLines_es:
        # TreeTagger would split "EE.UU" on the dot; normalize the token.
        if "EE.UU" in line:
            line = ["EEUU" if w == "EE.UU" else w for w in line]
        # tag_text -> ['Esto\tDM\teste', 'es\tVSfin\tser', ...]
        pos_tagged_sents.append(tagger.tag_text(line))

    for sent in pos_tagged_sents:
        fo = []
        for word in sent:
            # 'esto\tDM\teste' => ['esto', 'DM', 'este'] => 'estoDM'
            parts = word.split('\t')
            fo.append(parts[0] + parts[1])
        self.pos_tagged_sents_es.append(fo)
def tag(text, tt_home):
    """Tag *text* with both NLTK's default tagger and TreeTagger.

    Returns two (tagged_tokens, elapsed_seconds) pairs: NLTK first,
    TreeTagger second.
    """
    # NLTK pipeline: default tokenizer (TreebankWordTokenizer +
    # PunktSentenceTokenizer) followed by the default POS tagger.
    # (Use tagset='universal' for the universal tagset.)
    started = time()
    nltk_tagged = pos_tag(word_tokenize(text))
    nltk_elapsed = time() - started
    logger.info("NLTK took %f seconds" % nltk_elapsed)

    # TreeTagger wrapper.  Default language: English (Penn treebank model);
    # default flags: -token -lemma -sgml -quiet -no-unknown.
    started = time()
    wrapper = TreeTagger(TAGDIR=tt_home)
    raw_tags = wrapper.tag_text(text)
    tt_elapsed = time() - started
    tt_tagged = make_tags(raw_tags)
    logger.info("TreeTagger took %f seconds" % tt_elapsed)

    return (nltk_tagged, nltk_elapsed), (tt_tagged, tt_elapsed)
def __init__(self, auteur, numero, langue = "fr"):
    """Create the Oeuvre object if it does not exist yet and save it to a
    file of the same name.  If it already exists, simply reload it from
    that file.

    auteur -- author identifier (string)
    numero -- work number for this author
    langue -- language code, used as a key into dico_langues (default "fr")
    """
    self.auteur = auteur
    self.numero = numero
    self.langue = langue
    self.categorie = None
    # Corpus layout: <group dir>/Corpus/<language>/Fichiers txt|oeuvres/
    emplacement_textes = emplacement_dossier_groupe + "Corpus/" + dico_langues[langue] + "/Fichiers txt/"
    emplacement_oeuvres = emplacement_dossier_groupe + "Corpus/" + dico_langues[langue] + "/Fichiers oeuvres/"
    #self.infos = Infos(auteur,numero)
    print(auteur + str(numero), end = " ")
    try:
        # Cached case: unpickle a previously built Oeuvre and copy its fields.
        # NOTE(review): pickle.load must only be used on trusted files.
        with open(emplacement_oeuvres + auteur + str(numero), "rb") as mon_fichier:
            o = pickle.load(mon_fichier)
            self.texte_brut = o.texte_brut
            self.tags = o.tags
            self.mots = o.mots
            self.racines = o.racines
            self.POS = o.POS
            print("(importation terminee)", end = " / ")
    except FileNotFoundError:
        # First run for this work: tag the raw text with TreeTagger
        # and cache the result on disk for next time.
        tagger = TreeTagger(TAGLANG = self.langue)
        self.texte_brut = formater(importer(auteur, numero,emplacement_textes))
        self.tags = make_tags(tagger.tag_text(self.texte_brut))
        # Keep only well-formed (word, POS, lemma) triples.
        self.mots = [t[0] for t in self.tags if len(t) == 3]
        self.racines = [t[2] for t in self.tags if len(t) == 3]
        self.POS = [t[1] for t in self.tags if len(t) == 3]
        with open(emplacement_oeuvres + "/" + auteur + str(numero), "wb") as mon_fichier:
            # protocol 2 keeps the pickle readable from Python 2
            pickle.dump(self,mon_fichier,protocol = 2)
        print("(creation terminee)", end = " / ")
def build_stemed_day_array(df, date_range, stop_words, lang):
    """Lemmatize all tweets of each day in *date_range*.

    Returns one space-joined string of kept lemmas per day.
    """
    # init TreeTagger
    tagger = TreeTagger(TAGLANG=lang, TAGDIR='./TreeTagger')
    daily_docs = []
    # iterating over all days in the given day range
    for day in tqdm(date_range, desc='Stemming Tweets'):
        kept = []
        # all tweets posted within the 24h window starting at `day`
        mask = np.logical_and(df['date'] > day,
                              df['date'] < day + pd.DateOffset(1))
        for raw_tweet in df[mask]['text']:
            cleaned = format_text(raw_tweet)
            # lemmatize only if the tweet has content left after cleaning
            if cleaned == '':
                continue
            for lemma in (entry.split('\t')[2]
                          for entry in tagger.tag_text(cleaned)):
                if len(lemma) < 3:
                    continue
                if lemma.lower() in stop_words:
                    continue
                # +, | and @ are added by TreeTagger for multiple meanings
                # or composed words; ignored to prevent impurities.
                if "+" in lemma or "|" in lemma or "@" in lemma:
                    continue
                kept.append(lemma)
        daily_docs.append(" ".join(kept))
    return daily_docs
def desc2domaine(description_cas, dom_logement=1, dom_famille=9):
    """Classifier: decide whether the case description belongs to family
    law or housing law, by cosine distance to the two domain vectors.

    TODO: replace with a doc2vec-based classifier (nearest neighbour is
    almost always the worst classifier).

    Args:
        description_cas: case description text (French).
        dom_logement: value returned for housing law (default 1).
        dom_famille: value returned for family law (default 9).
    """
    ttroot = os.path.abspath(os.path.join(os.getcwd(), "treetagger-install"))
    tagger = TreeTagger(TAGLANG="fr", TAGDIR=ttroot)
    v = np.zeros(len(mots))
    # TreeTagger lines are 'token\tPOS\tlemma'; keep the lemma column of
    # well-formed triples only.
    lemmes = [ln.split("\t") for ln in tagger.tag_text(description_cas)]
    lemmes = [parts[2] for parts in lemmes if len(parts) == 3]
    # BUG FIX: the original `[i for i in t for i in mots]` reused `i` and
    # simply repeated `mots` len(t) times; the intent is to keep only the
    # lemmas that belong to the vocabulary.
    lemmes = [lemme for lemme in lemmes if lemme in mots]
    # NOTE: if no lemma is in the vocabulary, v stays all-zero and the
    # cosine distances below are undefined (NaN) — same as upstream data
    # quality issue, not handled here.
    nmots = float(len(lemmes))
    for k, val in Counter(lemmes).items():
        v[mots.index(k)] = val / nmots
    dfamille = cosine(v, vec["famille"])
    dlogement = cosine(v, vec["logement"])
    return dom_logement if dlogement < dfamille else dom_famille
def text2vec(description_cas):
    """Convert text to a vector in the space of the Doc2Vec model `d2v`."""
    ttroot = os.path.abspath(os.path.join(os.getcwd(), "treetagger-install"))
    tagger = TreeTagger(TAGLANG="fr", TAGDIR=ttroot)
    # TreeTagger lines are 'token\tPOS\tlemma'; keep lemmas of well-formed
    # triples only.
    lemmes = [ln.split("\t") for ln in tagger.tag_text(description_cas.lower())]
    lemmes = [parts[2] for parts in lemmes if len(parts) == 3]
    # BUG FIX: the original `[i for i in t for i in d2v.wv.index2entity]`
    # reused `i` and returned the whole model vocabulary instead of the
    # document's lemmas.  Keep only lemmas known to the model.
    vocab = set(d2v.wv.index2entity)  # set for O(1) membership tests
    lemmes = [lemme for lemme in lemmes if lemme in vocab]
    return d2v.infer_vector(lemmes)
def tagText(doc, lg):
    """Tag a text with TreeTagger.

    Expects a string; returns a list of tagged items in this format:
    ['je\tPRO:PER\tje', 'suis\tVER:pres\tsuivre|être']

    Args:
        doc: the text to tag (must be non-empty).
        lg: language code passed as TAGLANG to TreeTagger.

    Raises:
        ValueError: if *doc* is empty.  (The original used `assert`,
            which is silently stripped when Python runs with -O.)
    """
    if not doc:
        raise ValueError("Problème : l'élément à tagger est vide")
    tag_options = TreeTagger(TAGLANG=lg,
                             TAGOPT="-token -lemma -sgml -no-unknown")
    tags = tag_options.tag_text(doc)
    return tags
def __init__(self, stem=False):
    """Set up spell-check dictionaries and either a stemmer or a tagger.

    stem -- when True, use NLTK's SpanishStemmer; otherwise use TreeTagger
            (which must be installed under `path` below).
    """
    dictionaries = dicts()
    path = '/home/alangb/TWPP'  # path to TreeTagger installation directory
    # pyenchant spell-check dictionaries for English and Spanish
    self.english_dict = enchant.Dict("en_EN")
    self.spanish_dict = enchant.Dict("es_ES")
    self.ND = dictionaries.norm    # presumably normalization dict — TODO confirm
    self.SD = dictionaries.lemario # Spanish word list (lemario)
    self.PND = dictionaries.names  # presumably proper-noun dict — TODO confirm
    self.stem = stem
    if stem:
        self.stemmer = SpanishStemmer()
    else:
        self.tagger = TreeTagger(TAGLANG='es', TAGDIR=path)
def __init__(self, language, tt_home=None, **kwargs):
    """Build a TreeTagger wrapper that uses this package's own Tokenizer.

    language -- language code, passed as TAGLANG to TreeTagger
    tt_home  -- TreeTagger installation directory (TAGDIR), or None
    kwargs   -- extra options forwarded to treetaggerwrapper.TreeTagger
    """
    self.language = language
    self.tt_home = tt_home
    self.tokenizer = Tokenizer(language)
    self.tagger = TreeTagger(
        TAGLANG=language,
        TAGDIR=tt_home,
        # Explicit TAGOPT: the default has the '-no-unknown' option,
        # which prints the token rather than '<unknown>' for unknown lemmas
        # We'd rather skip unknown lemmas, as they are likely to be wrong tags
        TAGOPT=u'-token -lemma -sgml -quiet',
        # Use our tokenization logic (CHUNKERPROC here)
        CHUNKERPROC=self._tokenizer_wrapper,
        **kwargs)
import nltk

nltk.download('stopwords')  # NOTE(review): network download at import time

from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from pymongo import MongoClient
from treetaggerwrapper import make_tags, TreeTagger
import mysql.connector
import re

# `global` at module level is a no-op; kept as in the original.
global tagger
tagger = TreeTagger(TAGLANG='it')  # shared Italian tagger for this module


def translate_pos(pos):
    """Map a TreeTagger Italian POS tag to a WordNet POS letter.

    Unknown tags are returned unchanged.
    """
    if pos == "NOM":
        return 'n'
    elif pos[:3] == "VER":  # all verb sub-tags (VER:...) map to 'v'
        return 'v'
    elif pos == "ADJ":
        return 'a'
    elif pos == "ADV":
        return 'r'
    else:
        return pos


def extract_tags(text):
    """Tag *text* with TreeTagger (definition continues beyond this chunk)."""
    ret = list()
    # NOTE(review): `question` is undefined here — likely should be `text`;
    # confirm against the rest of the function (truncated in this view).
    tags = tagger.tag_text(question)
#1er tag avant le lower pour repérer les noms propres preTokens = nltk.word_tokenize(document) prePosTokens = pos_tagger.tag(preTokens) tokens = [] for i in range(len(prePosTokens)): if prePosTokens[i][1] == 'NPP': tokens.append(prePosTokens[i][0]) else: tokens.append((prePosTokens[i][0]).lower()) #Lemmatisation avec TreeTagger import treetaggerwrapper from treetaggerwrapper import TreeTagger tagger = TreeTagger(TAGLANG='fr') tags = tagger.tag_text(tokens) lemmas = [t[2] for t in treetaggerwrapper.make_tags(tags)] #Filtre alphanumérique alphaTokens = [t for t in lemmas if re.match("^[A-Za-z -]+$", t)] #Filtre stopwords filteredLemmas = [t for t in alphaTokens if t not in stopFrench] #print(filteredLemmas) filteredText = nltk.Text(filteredLemmas) fdistFiltered = nltk.FreqDist(filteredText) filteredLemmasTaged = {} #dictionnaire for i in range(len(filteredLemmas)): filteredLemmasTaged[filteredLemmas[i]] = pos_tagger.tag(
# Return codes for the tree traversal
EXIT_TREE = 1
REPETE = 2

# NOTE(review): abspath(__file__) is the script file's own path, so these
# concatenations produce e.g. ".../script.pyconfig.json"; this probably
# meant dirname(abspath(__file__)) — confirm before changing.
CONFIG_PATH = abspath(__file__) + "config.json"
path_volume = abspath(__file__) + "_data/"

tree = json.loads(open('tree.json').read())
faces = json.loads(open(path_volume + 'faces.json').read())
config = json.loads(open(CONFIG_PATH).read())

# define the TreeTagger folder (one install directory per supported OS)
my_os = get_os()
if my_os == 'win':
    tagger = TreeTagger(TAGLANG='fr',
                        TAGDIR=join(getcwd(), 'Treetagger', 'TreeTagger_windows'))
elif my_os == 'pi':
    tagger = TreeTagger(TAGLANG='fr',
                        TAGDIR=join(getcwd(), 'Treetagger', 'TreeTagger_pi'))
elif my_os == 'linux':
    tagger = TreeTagger(TAGLANG='fr',
                        TAGDIR=join(getcwd(), 'Treetagger', 'TreeTagger_unix'))
else:
    sys.exit('Système d\'exploitation non compatible.')


def Tree(tree=tree):
    """the main recursive fonction that is responsible of reading the tree and deciding witch node is next This fonction takes the cuurent position in the tree (current node), do the processing and end up with a recursive call
from treetaggerwrapper import TreeTagger
from multiprocessing import Process
"""
simple web service which embeds treetagger
"""

# install treetagger from here:
# https://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/

# Wrapper options: install location on the filesystem and language (Italian).
tagbuildopt = {}
tagbuildopt[
    "TAGDIR"] = '/opt/bot/treetagger'  # location on fs where treetagger is installed (follow instructions on website)
tagbuildopt["TAGLANG"] = "it"
tagger = TreeTagger(**tagbuildopt)


class TreetaggerWebservice(object):
    """CherryPy service exposing TreeTagger over JSON (truncated in this view)."""

    @cherrypy.expose
    @cherrypy.tools.json_in()
    @cherrypy.tools.json_out()
    def treetagger_ws(self):
        # NOTE(review): the JSON body is decoded again with json.loads below,
        # so the client apparently posts a JSON-encoded string.
        data = cherrypy.request.json
        # print(data)
        # print(type(data))
        j = json.loads(data)
        sentence = j['text']
import os

from treetaggerwrapper import TreeTagger

from formatConversor import Tokenizer2treeTagger
from formatConversor import ChunkerMarc2tarsqi
from formatConversor import normalizeXML
from formatConversor import normalizePOS
from ttk_path import TTK_ROOT
from library.tarsqi_constants import PREPROCESSOR
from components.common_modules.component import ComponentWrapper
from components.preprocessing.tokenizer import Tokenize_File

# Shared English TreeTagger instance, loaded from the toolkit's own
# bundled install under components/preprocessing/treetagger.
treetagger_dir = os.path.join(TTK_ROOT, 'components', 'preprocessing',
                              'treetagger')
treetagger = TreeTagger(TAGLANG='en', TAGDIR=treetagger_dir)


class PreprocessorWrapper(ComponentWrapper):

    """Wrapper for the preprocessing components. See ComponentWrapper
    for more details on how component wrappers work.

    Instance variables

       DIR_PRE - directory where the preprocessor code lives

    see ComponentWrapper for other variables."""

    def __init__(self, tag, xmldoc, tarsqi_instance):
        """Calls __init__ of the base class and sets component_name,
        DIR_PRE, CREATION_EXTENSION and RETRIEVAL_EXTENSION."""
        ComponentWrapper.__init__(self, tag, xmldoc, tarsqi_instance)
        self.component_name = PREPROCESSOR
        self.DIR_PRE = os.path.join(TTK_ROOT, 'components', 'preprocessing')
def initialize_treetagger(treetagger_dir):
    """Return the module-level TreeTagger, creating it on first use.

    treetagger_dir -- TreeTagger installation directory (TAGDIR).
    """
    global treetagger
    # Guard clause: reuse the already-created singleton if it exists.
    if treetagger is not None:
        return treetagger
    treetagger = TreeTagger(TAGLANG='en', TAGDIR=treetagger_dir)
    return treetagger
"""
Code by Chuanming and Suhaib (Pretraitement/phrase2conll.py), moved here so
it can be used as a module + small adaptations.
"""
from treetaggerwrapper import TreeTagger, make_tags

TAGGER = TreeTagger(TAGLANG='fr', TAGOPT="-token -lemma -sgml -no-unknown")


def main(phrase):
    """Tag *phrase* with TreeTagger.

    Returns the tagged phrase as 'word/POS/lemma' tokens separated by
    spaces (with a trailing space), e.g. 'je/PRO:PER/je suis/... '.
    """
    tags = TAGGER.tag_text(phrase)
    # TreeTagger emits 'word\tPOS\tlemma'; render each as 'word/POS/lemma '.
    line = "".join(tag.replace("\t", "/") + " " for tag in tags)
    return line


if __name__ == "__main__":
    # BUG FIX: main() was called without its required `phrase` argument,
    # which raised a TypeError; read the sentence from the command line.
    import sys
    main(" ".join(sys.argv[1:]))
def classifyingCV():
    """10-fold cross-validation of a 3-way ensemble (MultinomialNB,
    LogisticRegression and the knowledge-based predictor `kb`) on three
    bag-of-words datasets: 'isear', 'affective' and 'mixed'.

    Python 2 code (print statements).  Assumed row layout — TODO confirm:
    column 0 = gold label, column 1 = raw text, columns 2+ = BoW features.
    Appends confusion-matrix counts to ensemble_confusion[<dataset>].txt.
    """
    # ---------------- ISEAR dataset ----------------
    bow_isear, words_num = processWords('isear')
    dbconfig = read_db_config()
    conn = MySQLConnection(**dbconfig)
    cursor = conn.cursor(buffered=True)
    start = 0
    step = 760  # fold size for isear
    end = start + step
    tt = TreeTagger(TAGLANG='en')
    wna = WNAffect(prefix + 'wordnet1.6/', prefix + 'wordnetAffect/')
    for i in range(0, 10):
        # Pick the contiguous test slice for this fold; train on the rest.
        if i == 0:
            data_train = bow_isear[step:, :]
            data_test = bow_isear[:end, :]
        elif i == 9:
            data_train = bow_isear[:start, :]
            data_test = bow_isear[start:, :]
            # new_test = data_test
        else:
            temp = bow_isear[:start, :]
            data_train = bow_isear[end:, :]
            data_train = np.concatenate((data_train, temp), axis=0)
            data_test = bow_isear[start:end, :]
        start += step
        end += step
        bnb = MultinomialNB(alpha=0.01).fit(data_train[:, 2:], data_train[:, 0])
        lr = linear_model.LogisticRegression(solver='liblinear', n_jobs=3).fit(
            data_train[:, 2:], data_train[:, 0])
        total_test = 0
        true_number = 0
        true_nb = 0
        true_lr = 0
        prediction_nb = []
        prediction_lr = []
        prediction = []
        for item in data_test:
            # Row 1 of score_table accumulates one vote per classifier.
            score_table = np.zeros([2, 3], dtype=int)
            if str(item[1]) != '0':
                print item[1]
                total_test += 1
                # ENSEMBLE
                result_nb = bnb.predict([item[2:]])
                result_lr = lr.predict([item[2:]])
                result_kb = kb.predict(item[1], tt, wna, cursor, 'isear')
                score_table[1][result_nb] += 1
                score_table[1][result_lr] += 1
                if result_kb != 0:
                    score_table[1][result_kb] += 1
                predicted = np.unravel_index(np.argmax(score_table, axis=None),
                                             score_table.shape)
                final_prediction = predicted[1]
                # On a 1-1-1 tie, fall back to logistic regression.
                if score_table[1][predicted[1]] == 1:
                    final_prediction = result_lr
                if final_prediction == item[0]:
                    true_number += 1
                prediction.append(int(final_prediction))
                # SINGLE
                if result_nb == item[0]:
                    true_nb += 1
                if result_lr == item[0]:
                    true_lr += 1
                prediction_nb.append(int(result_nb))
                prediction_lr.append(int(result_lr))
            else:
                print item
        prediction = np.array(prediction)
        prediction_nb = np.array(prediction_nb)
        prediction_lr = np.array(prediction_lr)
        # Last fold: only the first 665 test rows were scored (hard-coded).
        if i == 9:
            nb_score = prf(data_test[:665, 0], prediction_nb, average='binary')
            lr_score = prf(data_test[:665, 0], prediction_lr, average='binary')
            all_score = prf(data_test[:665, 0], prediction, average='binary')
            tn, fp, fn, tp = confusion_matrix(data_test[:665, 0],
                                              prediction).ravel()
        else:
            nb_score = prf(data_test[:, 0], prediction_nb, average='binary')
            lr_score = prf(data_test[:, 0], prediction_lr, average='binary')
            all_score = prf(data_test[:, 0], prediction, average='binary')
            tn, fp, fn, tp = confusion_matrix(data_test[:, 0],
                                              prediction).ravel()
        with open('ensemble_confusion[isear].txt', 'a') as file:
            file.write("\n\nTRUE NEG :: " + str(tn))
            file.write("\nFALSE POS :: " + str(fp))
            file.write("\nFALSE NEG :: " + str(fn))
            file.write("\nTRUE POS :: " + str(tp))
            # (commented-out accuracy/precision/F-score report writes removed)

    # ---------------- AFFECTIVE dataset ----------------
    bow_affective, words_num = processWords('affective')
    start = 0
    step = 123  # fold size for affective
    end = start + step
    tt = TreeTagger(TAGLANG='en')
    wna = WNAffect(prefix + 'wordnet1.6/', prefix + 'wordnetAffect/')
    for i in range(0, 10):
        if i == 0:
            data_train = bow_affective[step:, :]
            data_test = bow_affective[:end, :]
        elif i == 9:
            data_train = bow_affective[:start, :]
            data_test = bow_affective[start:, :]
        else:
            temp = bow_affective[:start, :]
            data_train = bow_affective[end:, :]
            data_train = np.concatenate((data_train, temp), axis=0)
            data_test = bow_affective[start:end, :]
        start += step
        end += step
        bnb = MultinomialNB(alpha=0.01).fit(data_train[:, 2:], data_train[:, 0])
        lr = linear_model.LogisticRegression(solver='liblinear', n_jobs=3).fit(
            data_train[:, 2:], data_train[:, 0])
        total_test = 0
        true_number = 0
        true_nb = 0
        true_lr = 0
        prediction_nb = []
        prediction_lr = []
        prediction = []
        for item in data_test:
            score_table = np.zeros([2, 3], dtype=int)
            if str(item[1]) != '0':
                print item[1]
                total_test += 1
                # ENSEMBLE
                result_nb = bnb.predict([item[2:]])
                result_lr = lr.predict([item[2:]])
                result_kb = kb.predict(item[1], tt, wna, cursor, 'affective')
                score_table[1][result_nb] += 1
                score_table[1][result_lr] += 1
                if result_kb != 0:
                    score_table[1][result_kb] += 1
                predicted = np.unravel_index(np.argmax(score_table, axis=None),
                                             score_table.shape)
                final_prediction = predicted[1]
                if score_table[1][predicted[1]] == 1:
                    final_prediction = result_lr
                if final_prediction == item[0]:
                    true_number += 1
                prediction.append(int(final_prediction))
                # SINGLE
                if result_nb == item[0]:
                    true_nb += 1
                if result_lr == item[0]:
                    true_lr += 1
                prediction_nb.append(int(result_nb))
                prediction_lr.append(int(result_lr))
        prediction = np.array(prediction)
        prediction_nb = np.array(prediction_nb)
        prediction_lr = np.array(prediction_lr)
        # Last fold: only the first 116 test rows were scored (hard-coded).
        if i == 9:
            nb_score = prf(data_test[:116, 0], prediction_nb, average='binary')
            lr_score = prf(data_test[:116, 0], prediction_lr, average='binary')
            all_score = prf(data_test[:116, 0], prediction, average='binary')
            tn, fp, fn, tp = confusion_matrix(data_test[:116, 0],
                                              prediction).ravel()
        else:
            nb_score = prf(data_test[:, 0], prediction_nb, average='binary')
            lr_score = prf(data_test[:, 0], prediction_lr, average='binary')
            all_score = prf(data_test[:, 0], prediction, average='binary')
            tn, fp, fn, tp = confusion_matrix(data_test[:, 0],
                                              prediction).ravel()
        with open('ensemble_confusion[affective].txt', 'a') as file:
            file.write("\n\nTRUE NEG :: " + str(tn))
            file.write("\nFALSE POS :: " + str(fp))
            file.write("\nFALSE NEG :: " + str(fn))
            file.write("\nTRUE POS :: " + str(tp))
            # (commented-out accuracy/precision/F-score report writes removed)

    # ---------------- MIXED dataset ----------------
    bow_mixed, words_num = processWords('mixed')
    start = 0
    step = 840  # fold size for mixed
    end = start + step
    tt = TreeTagger(TAGLANG='en')
    wna = WNAffect(prefix + 'wordnet1.6/', prefix + 'wordnetAffect/')
    for i in range(0, 10):
        if i == 0:
            data_train = bow_mixed[step:, :]
            data_test = bow_mixed[:end, :]
        elif i == 9:
            data_train = bow_mixed[:start, :]
            data_test = bow_mixed[start:, :]
        else:
            temp = bow_mixed[:start, :]
            data_train = bow_mixed[end:, :]
            data_train = np.concatenate((data_train, temp), axis=0)
            data_test = bow_mixed[start:end, :]
        start += step
        end += step
        bnb = MultinomialNB(alpha=0.01).fit(data_train[:, 2:], data_train[:, 0])
        lr = linear_model.LogisticRegression(solver='liblinear', n_jobs=3).fit(
            data_train[:, 2:], data_train[:, 0])
        total_test = 0
        true_number = 0
        true_nb = 0
        true_lr = 0
        prediction_nb = []
        prediction_lr = []
        prediction = []
        for item in data_test:
            score_table = np.zeros([2, 3], dtype=int)
            if str(item[1]) != '0':
                print item[1]
                total_test += 1
                # ENSEMBLE
                result_nb = bnb.predict([item[2:]])
                result_lr = lr.predict([item[2:]])
                result_kb = kb.predict(item[1], tt, wna, cursor, 'mixed')
                score_table[1][result_nb] += 1
                score_table[1][result_lr] += 1
                if result_kb != 0:
                    score_table[1][result_kb] += 1
                predicted = np.unravel_index(np.argmax(score_table, axis=None),
                                             score_table.shape)
                final_prediction = predicted[1]
                if score_table[1][predicted[1]] == 1:
                    final_prediction = result_lr
                if final_prediction == item[0]:
                    true_number += 1
                # SINGLE
                if result_nb == item[0]:
                    true_nb += 1
                if result_lr == item[0]:
                    true_lr += 1
                prediction_nb.append(int(result_nb))
                prediction_lr.append(int(result_lr))
                prediction.append(int(final_prediction))
        prediction = np.array(prediction)
        prediction_nb = np.array(prediction_nb)
        prediction_lr = np.array(prediction_lr)
        # Last fold: only the first 1169 test rows were scored (hard-coded).
        if i == 9:
            nb_score = prf(data_test[:1169, 0], prediction_nb, average='binary')
            lr_score = prf(data_test[:1169, 0], prediction_lr, average='binary')
            all_score = prf(data_test[:1169, 0], prediction, average='binary')
            tn, fp, fn, tp = confusion_matrix(data_test[:1169, 0],
                                              prediction).ravel()
        else:
            nb_score = prf(data_test[:, 0], prediction_nb, average='binary')
            lr_score = prf(data_test[:, 0], prediction_lr, average='binary')
            all_score = prf(data_test[:, 0], prediction, average='binary')
            tn, fp, fn, tp = confusion_matrix(data_test[:, 0],
                                              prediction).ravel()
        with open('ensemble_confusion[mixed].txt', 'a') as file:
            file.write("\n\nTRUE NEG :: " + str(tn))
            file.write("\nFALSE POS :: " + str(fp))
            file.write("\nFALSE NEG :: " + str(fn))
            file.write("\nTRUE POS :: " + str(tp))
            # (commented-out accuracy/precision/F-score report writes removed)

    cursor.close()
    conn.close()
    return
def scraping():
    """If the user did not request a scrape, look up documents for the
    selected country in the database; those documents and their photo
    links are returned.

    If the user requested a scrape, or if there are not enough documents
    for the selected country in the database, configure and run a Reddit
    scrape, tag the resulting submission titles with TreeTagger, and
    analyse the tags to obtain a list of candidate places.  Those places
    are looked up on GeoNames.  The results of that last search are
    loaded into two dictionaries: one to display the photos on the site,
    the other to store the results in the MongoDB database.

    NB: scraping always tries to obtain new photos (absent from MongoDB).
    """
    # Geoscape configuration
    geokey = current_app.config['GEOKEY']
    geoauth = current_app.config['GEOAUTH']
    # Parameters of the Javascript request
    rgnversion = request.args.get('search_version')
    country = request.args.get('country')
    country_code = request.args.get('country_code')
    limit = int(request.args.get('nombre_image'))
    scrape_requested = True if request.args.get(
        'scraping') == 'true' else False
    # Results dictionary for display on the site
    search_res = geo.GeoQuery(geokey, geoauth, country, country_code, 'E')
    dic_results = {
        'head': {
            'total': 0,
            'country': {
                'name': country,
                'lng': search_res.result.lng,
                'lat': search_res.result.lat
            }
        },
        'results': []
    }
    # Loading list for the database
    database_list = []
    if scrape_requested:
        # Only load the img_url fields
        load_arg = {'img_url': 1, '_id': 0}
    else:
        # Load the whole document for display
        load_arg = {
            'scraped_title': 0,
            'location_list': 0,
            'feature_class': 0,
            'testers': 0,
            '_id': 0
        }
    existing_urls = []
    check_db = mongo.Mongo.mongocheck('Resultats_RGN')
    # Initialize the results collection if it does not exist yet
    if not check_db:
        dbstart = mongo.MongoSave([{
            'key':
            'Initialisation de la collection Resultats_RGN.'
        }])
        dbstart.storeindb('Resultats_RGN', img_url='A', search_version='D')
        dbstart.nonunique_index('Resultats_RGN', country='A',
                                search_version='D')
    # Otherwise load matching documents from the database into the results
    else:
        dbfinder = mongo.MongoLoad(
            {
                'search_version': rgnversion,
                'country': country
            }, load_arg)
        for doc in dbfinder.retrieve('Resultats_RGN', limit=limit):
            if not scrape_requested:
                dic_results['head']['total'] += 1
                dic_results['results'].append(doc)
            existing_urls.append('-url:' + doc['img_url'])
    if scrape_requested or dic_results['head']['total'] < limit:
        # Reddit search configuration; profile loaded from praw.ini
        reddit = praw.Reddit('current_user')
        target_sub = reddit.subreddit('EarthPorn')
        query = country if country != 'United States' else 'USA'
        print(
            '\033[92m' + target_sub.display_name + '\033[0m'
            '\nRésultats de recherche pour les soumissions reddit avec:',
            query, '\n')
        # Exclude documents that were already fetched
        user_limit = limit
        if len(query) + len(existing_urls) + sum(
                len(url) for url in existing_urls) <= 512:
            query += (' ' + ' '.join(existing_urls)).rstrip()
            limit -= dic_results['head']['total']
        else:  # 512 characters max in a Reddit query
            limit = 1000  # Max allowed by Reddit
            existing_urls = [url[5:] for url in existing_urls]
        # TreeTagger config.  The Treetagger folder must be in the
        # directory the program is run from.
        if sys.platform.startswith('linux'):
            reddit_tagger = TreeTagger(TAGLANG='en',
                                       TAGDIR=join(getcwd(), 'Treetagger',
                                                   'TreeTagger_unix'))
        elif sys.platform.startswith('win'):
            reddit_tagger = TreeTagger(TAGLANG='en',
                                       TAGDIR=join(getcwd(), 'Treetagger',
                                                   'TreeTagger_windows'))
        else:
            sys.exit('Système d\'exploitation non compatible avec Geoscape.')
        # Search results in the subreddit
        test_posts = target_sub.search(query, limit=limit)
        for post in test_posts:
            try:
                attempt = post.url
            except prawcore.exceptions.NotFound:
                continue  # Problem with the photo; discarded
            if post.url in existing_urls:
                continue  # Already stored in the database; discarded
            if search('\W' + country + '\W',
                      post.title):  # Country as a distinct word
                # Skip at most once any characters between [] or () at the
                # start of the text, and stop at the first [ or (
                res = search('^(?:[\[(].*[\])])?([^\[(]+)', post.title)
                if (res):
                    print(res.group(1))
                    # Tagging: generates a list of triples:
                    # (word=..., pos=..., lemma=...)
                    reddit_tags = make_tags(reddit_tagger.tag_text(
                        res.group(1)),
                                            exclude_nottags=True)
                    # The country name is excluded from candidate places;
                    # re-added only as a last resort
                    country_split = country.casefold().split(' ')
                    size = len(country_split)
                    indexes = []
                    if size > 1:
                        name_tags = [t[0].casefold() for t in reddit_tags]
                        for window in enumerate(windowed(name_tags, size)):
                            if all(window[1][i] == country_split[i]
                                   for i in range(size)):
                                indexes.extend([
                                    i for i in range(window[0],
                                                     window[0] + size)
                                ])
                    for index, tag in enumerate(reddit_tags):
                        if tag[1] == 'NP':  # Proper-noun tag on Windows
                            reddit_tags[index] = (tag[0], 'NP0', tag[2])
                        if tag[0].casefold() == country.casefold(
                        ) or index in indexes:
                            reddit_tags[index] = (tag[0], 'CTY', tag[2])
                    pprint(reddit_tags)
                    # Candidate-place search; the number of skipped words
                    # is stored between places
                    location_list = location_finder(country, rgnversion,
                                                    reddit_tags)
                    print('Lieux trouvés:', end='')
                    print(location_list, '\n')
                    # Geonames
                    date = gmtime(post.created_utc)
                    dic_mongo = {
                        'link': 'https://www.reddit.com' + post.permalink,
                        'img_url': post.url,
                        'search_version': rgnversion,
                        'country': country,
                        'country_code': country_code,
                        'scraped_title': res.group(1).strip(),
                        'text': post.title,
                        'tag_list': reddit_tags,
                        'location_list': location_list,
                        'date': {
                            'year': date.tm_year,
                            'month': date.tm_mon,
                            'day': date.tm_mday,
                            'hour': date.tm_hour,
                            'min': date.tm_min,
                            'sec': date.tm_sec
                        }
                    }
                    try:
                        attempt = post.author.icon_img
                    except prawcore.exceptions.NotFound:
                        pass
                    else:
                        dic_mongo['author'] = {
                            'name': post.author.name,
                            'icon': post.author.icon_img,
                            'profile':
                            'https://www.reddit.com/user/' + post.author.name
                        }
                    """
                    R: recherche standard
                    RF: recherche fuzzy
                    E: recherche exacte
                    EH: recherche exacte sur ensembles humains
                    EN: recherche exacte sur ensembles naturels
                    """
                    placefinder = geo.LocationList(country_code, location_list)
                    geo_res = placefinder.geo_search(geokey, geoauth, 'EN EH',
                                                     'R')  # GeoQuery object
                    # As a last resort, the country itself if it appears
                    # in the title
                    if geo_res.result is None and country in res.group(1):
                        placefinder.reinit(country_code, [country])
                        geo_res = placefinder.geo_search(geokey, geoauth, 'E')
                    if geo_res.result is not None:
                        dic_results['head']['total'] += 1
                        print('Résultat GeoNames:', geo_res.result.address,
                              end='')
                        print('. Après', placefinder.counter, 'requêtes.')
                        dic_mongo['name'] = geo_res.result.address
                        dic_mongo['lng'] = geo_res.result.lng
                        dic_mongo['lat'] = geo_res.result.lat
                        dic_mongo[
                            'feature_class'] = geo_res.result.feature_class
                        dic_mongo['location'] = geo_res.location
                        dic_results['results'].append(dic_mongo)
                        dic_tostore = deepcopy(dic_mongo)
                        database_list.append(dic_tostore)
                        user_limit -= 1
                        if not user_limit:
                            break
        print('\n###############')
        # Load the scrape-generated documents into the database
        documents = mongo.MongoSave(database_list)
        documents.storeindb('Resultats_RGN')
    return jsonify(dic_results)
# requirements : treetaggerwrapper from treetaggerwrapper import TreeTagger, make_tags if 'teampath' in locals() or 'teampath' in globals(): TAGGER = TreeTagger(TAGLANG='fr', TAGOPT="-token -lemma -sgml -no-unknown", TAGDIR="/home/teamlaw/TreeTagger/") else: TAGGER = TreeTagger(TAGLANG='fr', TAGOPT="-token -lemma -sgml -no-unknown") class JurQA: def __init__(self): self.class_ = str() self.subclass_ = str() self.question = Text() self.answer = Text() class Text: def __init__(self): self.text = list() self.lemma = list() self.pos = list() def init_text(self, text): tags = TAGGER.tag_text(text) for tag in tags: try: word, pos, lemma = tag.split("\t") except: