def preprocess(self, text=None, stem=False, fix_pdf=True):
    if text is None:
        text = self.text

    def fix_pdf2txt(texto):
        import re
        # Re-join lines broken mid-sentence and add missing sentence-final dots.
        texto = re.sub(r'\n([^A-Z])', r' \1', texto)
        texto = re.sub(r'([^\.])\n', r'\1.\n', texto)
        return texto

    def tokenizer_fr(text):
        # Courtesy of http://www.fabienpoulard.info/post/2008/03/05/Tokenisation-en-mots-avec-NLTK
        return tok_fr.tokenize(text)

    # Fix newline problems introduced by the pdf-to-txt step
    if fix_pdf:
        text = fix_pdf2txt(text)
    text = text.lower()
    # Tokenization
    self._original_tokens = tokenizer_fr(text)
    self._tokens = self._original_tokens
    # self._tokens = [t for t in self._tokens if len(t) > 1]
    if stem:
        from nltk.stem.snowball import FrenchStemmer
        fr_stemmer = FrenchStemmer()
        self._tokens = [fr_stemmer.stem(t) for t in self._tokens]
    self._concordance_index = nltk.ConcordanceIndex(self._tokens, key=lambda s: s)
def is_french_adjr(word):
    # TODO change adjr tests
    stemmer = FrenchStemmer()
    # Adjectival suffixes, with gender and number inflections
    suffixes = [
        u"ain", u"ains", u"aine", u"aines", u"aire", u"aires",
        u"al", u"aux", u"als", u"ale", u"ales",
        u"el", u"els", u"elle", u"elles",
        u"esque", u"esques", u"estre", u"estres",
        u"eux", u"euse", u"euses",
        u"é", u"és", u"ée", u"ées",
        u"ien", u"iens", u"ienne", u"iennes",
        u"ier", u"iers", u"ière", u"ières",
        u"if", u"ifs", u"ive", u"ives",
        u"il", u"ils", u"in", u"ins", u"ine", u"ines",
        u"ique", u"iques", u"ois", u"oise", u"oises"
    ]
    stem = stemmer.stem(word)
    stem_ending = ""
    # Compare word and stem with é/è folded to e so accents do not break startswith().
    if word.replace(u"é", "e").replace(u"è", "e").startswith(stem.replace(u"é", "e").replace(u"è", "e")):
        stem_ending = word.replace(u"é", "e").replace(u"è", "e").split(
            stem.replace(u"é", "e").replace(u"è", "e"), 1)[1]
    if stem in french_stemmed_adjr:
        return True
    for suffix in suffixes:
        if word[-len(suffix):] == suffix:
            return True
    # TODO change adjr tests
    #if stem_ending in french_adjr_stem_ending_counts:
    #    return True
    return False
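A quick way to exercise the suffix heuristic above (hypothetical call: the module-level french_stemmed_adjr set and the FrenchStemmer import are assumed; an empty set is enough to test the suffix branch alone):

french_stemmed_adjr = set()
print(is_french_adjr(u"national"))  # True: ends with the "al" suffix
print(is_french_adjr(u"maison"))    # False: no adjectival suffix, stem unknown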
def convert_computing_input_to_dictionnary_input(datas):
    # Instantiate the tokenizer and the stemmer
    tokenizer = WordPunctTokenizer()
    stemmer = FrenchStemmer()
    # Load French stopwords
    french_stopwords = set(stopwords.words('french'))
    matchingTable = []
    # Each row of this table is [id, tokens]: 'id' of the advert and
    # 'tokens' the list of tokens in the advert
    tokenTable = []
    for row in datas:
        id = row[0]
        desc = row[2]
        # Get tokens for this row
        tokens = tokenizer.tokenize(str(desc[0]))
        # Filter tokens to remove punctuation
        regex = re.compile(r'\w+')
        tokens = filter(regex.search, tokens)
        # Remove stopwords and reduce the remaining tokens to their stems
        tokens = [stemmer.stem(token) for token in tokens
                  if token.lower() not in french_stopwords]
        # Remove duplicate entries and sort
        tokens = list(set(tokens))
        tokens.sort()
        # Add the new row ([id, tokens]) to the global table
        tokenTable.append([id, tokens])

    # Transform each row of tokens into a 0/1 vector matching tokenDictionnary.
    # tokenTable[1:] skips the title row of the original file.
    for row in tokenTable[1:]:
        id = row[0].split(";")[0]
        advertVec = np.zeros(len(tokenDictionnary))
        for elm in row[1]:
            # The original wrote advertVec[-1] = 0 for unknown tokens, which
            # clobbered the last component; only known tokens are set here.
            if elm in tokenDictionnary:
                advertVec[tokenDictionnary.index(elm)] = 1
        matchingTable.append([id, advertVec])
    return tokenTable, matchingTable
def __init__(self, label = "", role = "", ner = ""): self.label = label self.role = role self.ner = ner ##print(repr(label)) stemmer = FrenchStemmer() self.lemma = stemmer.stem(label)
def new_dico(file):
    stemmer = FrenchStemmer()
    input_ = "../dico/" + file
    output_ = "dic_with_roots/" + file
    fs = open(input_, 'r')
    fd = open(output_, 'w')
    k = 0
    lines = fs.readlines()
    for line in lines:
        txt = line.split(" ")
        if not txt:  # split() returns a list; the original compared it to ''
            break
        for w in txt:
            # Remember capitalisation so it can be restored after stemming
            if w.istitle():
                k = 1
            else:
                k = 0
            w = w.decode("utf-8")  # Python 2 bytes -> unicode
            w = ''.join(u for u in w if u in string.ascii_letters)
            w = enleve_accents(w)
            w = stemmer.stem(w) + " "
            w = w.encode("utf-8")  # the original discarded this result
            if k:
                w = w[0].upper() + w[1:]
            fd.write(w)
    fs.close()
    fd.close()
def normalize_text(string):
    """Preprocess a text string and return a normalized form of the text."""
    if isinstance(string, float):
        return ""
    # lowercase and strip leading/trailing whitespace
    s = string.strip().lower()
    # remove accents
    s = ''.join(c for c in unicodedata.normalize('NFD', s)
                if unicodedata.category(c) != 'Mn')
    # remove punctuation
    s = re.sub("[" + punctuation + "]", " ", s)
    # remove uninformative words, stopwords and non-alpha or short words
    words_to_remove = ["les", "une", "des", "nos", "ils", "elle", "elles",
                       "nan", "null"]
    stop_words = list(stopwords.words("french"))
    remove_list = words_to_remove + stop_words
    s = " ".join(word for word in s.split()
                 if word.isalpha() and word not in remove_list and len(word) > 2)
    # stem the remaining words
    stemmer = FrenchStemmer()
    stem_words = [stemmer.stem(w) for w in s.split()]
    s = " ".join(stem_words)
    return s
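A minimal sketch of the behaviour, assuming the module-level imports used across these snippets (unicodedata, re, string.punctuation, NLTK stopwords, FrenchStemmer); the exact stems depend on the Snowball rules:

print(normalize_text("Les Grandes Maisons anciennes"))  # e.g. "grand maison ancien"
print(normalize_text(float("nan")))                     # "" (floats map to the empty string)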
def get_lem(text):
    stemmer = FrenchStemmer()
    text_tokens = word_tokenize(text)
    text = ""
    for word in text_tokens:
        text += " " + stemmer.stem(word)
    return text
def stemmingFR(self):
    ps = FrenchStemmer()
    # Input document
    Input = open(self.filename, "r")
    elagage = Input.read()
    # Output document
    Output = open(self.filename + "-out3.txt", "a")
    # Stemming: write one stem per line
    for w in elagage.split():
        Output.write(ps.stem(w))
        Output.write("\n")
    self.label.configure(text=self.filename)
    self.texte = Entry(self, width=20, font="Arial 14", fg="green", justify='center')
    self.texte.insert(END, "Succès du Stemming")
    self.texte.grid(padx=16, pady=16)
    self.texte = Entry(self, width=50, font="Arial 14", fg="blue", justify='center')
    self.texte.insert(
        END, "Vous trouvez votre fichier résultat sous le même répertoire")
    self.texte.grid(padx=16, pady=16)
def __init__(self):
    # Load text_en_fr; if it's missing, generate it
    if not os.path.isfile("text_en_fr.csv"):
        self.df = self.generate_csv_from_en_fr_text("text_en_fr.csv")
    else:
        self.df = pd.read_csv("text_en_fr.csv")
    # Load stopwords
    f = open("sorted_data/stopwords", "r")
    stopwords_en = f.read().split("\n")
    stopwords_en.pop(-1)  # drop the empty entry after the trailing newline
    self.stopwords = stopwords.words('english') + stopwords.words('french') + stopwords_en
    # Load lemmatizer for en and stemmer for fr
    self.lemmatizer_en = WordNetLemmatizer()
    self.stemmer_fr = FrenchStemmer()
    # Preprocess text
    self.X, self.Y = self.preprocess(self.df["en"].values.tolist()[:2],
                                     self.df["fr"].values.tolist()[:2])
    self.train_model_en_to_fr("model_translate.h5")
def process_text(text, stem=False):
    """
    Lowercases, removes stopwords and accents, and stems the tokens if stem=True.
    Used with df.apply() to create a new column on a dataframe.
    """
    text_clean = []
    for sen in text:
        # Keep the dots for the date_uniformizer
        sen = unidecode.unidecode(
            sen.replace("’", " ").replace(",", " ").replace(";", " ").lower())
        sen = sen.replace("/ ", "/")  # some dates are in "DD/ MM/ yyyy" format
        tokens = sen.split()
        if stem:
            from nltk.stem.snowball import FrenchStemmer
            stemmer = FrenchStemmer()
            tokens_no_stpwrd = [stemmer.stem(tok) for tok in tokens
                                if tok not in stop_words]
        else:
            tokens_no_stpwrd = [tok for tok in tokens if tok not in stop_words]
        # Drop isolated single letters
        no_letters = re.sub(' [a-z] ', " ", " ".join(tokens_no_stpwrd))
        text_clean.append(no_letters)
    return text_clean
def preprocess(text):
    result = []
    stopwords = get_stopswords()
    stemmer = FrenchStemmer()
    for token in simple_preprocess(text):
        if token not in stopwords and len(token) > 3:
            result.append(stemmer.stem(token))
    return result
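Hypothetical usage, assuming gensim's simple_preprocess is imported and the module's own get_stopswords() helper returns a French stopword collection; only tokens longer than three characters survive and they come back stemmed:

tokens = preprocess(u"Les informations importantes du document")
print(tokens)  # e.g. ['inform', 'import', 'document']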
def __init__(self, stop_words=None, remove_non_words=False):
    self.st = FrenchStemmer()
    if stop_words is None:
        self.stopwords = set(stopwords.words('french'))
    else:
        self.stopwords = stop_words
    self.words = set(words.words())
    self.remove_non_words = remove_non_words
def racinize_all_concept(concept):
    concept_tiers = []
    stemmer = FrenchStemmer()
    for i in range(0, len(concept)):
        temp = concept[i][0].lower()
        temp2 = stemmer.stem(temp)
        concept_tiers.append((temp2, concept[i][1]))
    return concept_tiers
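A sketch of the expected input shape, (label, tag) pairs; only the label is lowercased and stemmed, the tag is carried through unchanged:

pairs = [(u"Maisons", "NOUN"), (u"Rapidement", "ADV")]
print(racinize_all_concept(pairs))  # e.g. [(u'maison', 'NOUN'), (u'rapid', 'ADV')]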
def lemmatize(token):
    """Approximate lemmatization via the French Snowball stemmer.

    Arguments:
        token {string} -- token to stem
    """
    stemmer = FrenchStemmer()
    return stemmer.stem(token)
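Usage is a one-liner; note that despite the name, the output is a Snowball stem rather than a dictionary lemma:

print(lemmatize("voudrais"))  # a stem such as 'voudr'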
def stemArticle(self, doc):
    stemmer_fr = FrenchStemmer()
    stemmer_en = EnglishStemmer()
    stemmedArticle = [str(stemmer_fr.stem(w)) for w in doc]
    stemmedArticle = [str(stemmer_en.stem(w)) for w in stemmedArticle]
    return stemmedArticle
def stem_words(words):
    stemmed_words = []
    stemmer = FrenchStemmer()
    for word in words:
        stemmed_word = stemmer.stem(word)
        stemmed_words.append(stemmed_word)
    stemmed_words.sort()
    return stemmed_words
def stem(sentence):
    # Instantiate the stemmer
    stemmer = FrenchStemmer()
    stem = ''
    for word in nltk.word_tokenize(sentence):
        stem += ' ' + stemmer.stem(word)
    #print(stem)  # for debugging
    return stem
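Hypothetical call; note that the accumulator leaves a leading space in the result, and the exact stems depend on the Snowball rules:

print(stem(u"Je mange des pommes rouges"))  # e.g. ' je mang des pomm roug'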
def stemming_Function(filtered_words):
    stemmed_words = []  # empty list to hold our stemmed words
    stemmer = FrenchStemmer()  # stemmer object of the FrenchStemmer class
    for word in filtered_words:
        stemmed_word = stemmer.stem(word)  # stem the word
        stemmed_words.append(stemmed_word)  # add it to our stemmed word list
    freqdist = nltk.FreqDist(stemmed_words)
    return freqdist
def __init__(self, raw_data_path='data/df_stats.csv',
             output_filepath='data/cleaned_preprocessed_campaigns.csv',
             joi_output='data/joi.csv'):
    self.raw_data_path = raw_data_path
    self.output_filepath = output_filepath
    self.joi_output = joi_output
    self.legacy_columns = ['id', 'title', 'category', 'country', 'name',
                           'description', 'job_type', 'job_board_id',
                           'budgetmax', 'creation']
    self.new_columns = ["job_board_name", 'amount_action_0', 'amount_action_1',
                        'amount_action_2', 'amount_action_3', 'amount_action_4',
                        'total_cost', 'true_cpc', 'taux_conversion',
                        "taux_conversion_pondere", "volume_conversion",
                        'creation_an', 'creation_mois', 'creation_jour', 'weekday']
    self.jobboard_name_for_id = {
        31: "AdformProgrammaticFR", 75: "AdformProgrammaticGermany",
        73: "AdformProgrammaticNL", 74: "AdformProgrammaticSwitzerland",
        87: "AdformProgrammaticUK", 76: "AdformProgrammaticUS",
        24: "Adwords", 96: "AdwordsFR", 102: "adwords-Switzerland",
        101: "AdwordsUS", 4: "Adzuna", 52: "Adzuna US", 169: "APEC",
        12: "capital", 25: "CV Library", 59: "DoubleclickFR",
        58: "DoubleclickUK", 99: "Facebook-Austria", 77: "FacebookFR",
        79: "FacebookGermany", 81: "Facebook-Netherlands",
        80: "Facebookswitzerland", 16: "FaceBookUK", 78: "FacebookUS",
        98: "Gigajob-Austria", 35: "GigaJobFR", 68: "Gigajob- Germany",
        67: "GigaJob - Netherlands", 69: "Gigajob- switzerland",
        34: "GigaJobUK", 54: "GigaJob US", 10: "Github",
        168: "GoogleJobDiscovery", 1: "Indeed", 146: "Jobbird-Austria",
        147: "Jobbird-Belgium", 156: "jobbird-Canada", 151: "Jobbird-France",
        152: "Jobbird-Germany", 148: "Jobbird-India",
        145: "Jobbird-Netherlands", 150: "Jobbird-Newzealand",
        153: "Jobbird-Spain", 155: "jobbird-Switzerland",
        149: "Jobbird-Turkey", 143: "Jobbird-UK", 144: "jobbird-US",
        154: "jobboard-Switzerland", 14: "Jobijoba", 9: "Jobintree",
        36: "JobisJob", 57: "JobisJob US", 97: "Joblift-Austria",
        131: "joblift-Belgium", 133: "joblift-Canada", 40: "Joblift FR",
        65: "Joblift - Germany", 159: "joblift-Germany-d.jobmonitor.com",
        158: "joblift-Germany-Muenchener", 136: "joblift-India",
        61: "Joblift-Netherlands", 135: "joblift-newzealand",
        132: "joblift-Spain", 66: "Joblift - Switzerland",
        134: "joblift-Turkey", 37: "JobLift UK", 50: "Joblift US",
        3: "Jobrapido", 161: "jobrapidoGermany-Jobmonitor",
        162: "jobrapidoGermany-muenchener",
        60: "jobrapidoProgrammaticTrendingJobs", 53: "JobRapido US",
        13: "Jobtome", 157: "JobtomeGermany - de.jobmonitor.com",
        88: "JobtomeGermany - muenchener", 165: "Jobtomeprogrammatic",
        39: "Jobtome UK", 56: "Jobtome US", 166: "kudos", 26: "LApec",
        7: "Leboncoin", 19: "Leboncoin Marque employeur",
        100: "Linkedin-Austria", 82: "LinkedinFR", 83: "LinkedinGermany",
        85: "Linkedinnetherlands", 84: "Linkedinswitzerland",
        8: "LinkedinUK", 86: "LinkedinUS", 18: "LinkUp", 47: "LoadTestBoard",
        167: "Match2one", 163: "Meteojob", 48: "Monster",
        43: "MyJobHelper FR", 71: "Myjobhelper-Germany",
        70: "MyJobHelper - Netherlands", 72: "MyJobHelper - switzerland",
        41: "MyJobHelper UK", 55: "MyJobHelper US", 28: "Name", 42: "Name",
        33: "Neuvoo", 90: "NeuvooAustria-Jobleads",
        137: "NeuvooBelgium-jobleads", 139: "NeuvooCanada-Jobleads",
        93: "NeuvooFrance-Jobleads", 89: "NeuvooGermanyjobleads",
        95: "NeuvooGermany-Jobmonitor", 160: "NeuvooGermany-muenchener",
        94: "NeuvooHolland-Jobleads", 140: "NeuvooIndia-Jobleads",
        142: "NeuvooNewzealand-jobleads", 138: "NeuvooSpain-jobleads",
        91: "Neuvooswitzerland-Jobleads", 141: "NeuvooTurkey-jobleads",
        92: "NeuvooUK-Jobleads", 51: "NeuvooUSJobleads",
        45: "Nominal Technology", 2: "Optioncarriere",
        164: "ProgrammaticAppnexus", 113: "restorationmedia-UK",
        112: "restorationmedia-US", 104: "ResultsGeneration-UK",
        103: "Resultsgeneration -US", 27: "Sites gratuits TP",
        11: "Stackoverflow", 5: "Test", 29: "[test] Job board 31671",
        30: "[test] Job board 73347", 115: "Trendingjobs-UK",
        114: "Trendingjobs-US", 6: "Trovit", 63: "Trovit- Germany",
        62: "Trovit - Netherlands", 64: "Trovit- Switzerland",
        38: "Trovit UK", 49: "Trovit US", 17: "Twitter", 15: "Vivastreet",
        120: "xpat-Austria", 125: "xpat-Belgium", 128: "xpat-Canada",
        119: "xpat-France", 121: "xpat-Germany", 126: "xpat-India",
        118: "xpat-Netherlands", 130: "xpat-newzealand", 127: "xpat-Spain",
        122: "xpat-Switzerland", 129: "xpat-Turkey", 117: "xpat-UK",
        116: "xpat-US", 109: "Yahoo-Austria", 108: "Yahoo-France",
        107: "Yahoo-Germany", 110: "Yahoo-Netherlands",
        111: "Yahoo-Switzerland", 106: "Yahoo-UK", 105: "Yahoo-US",
        46: "ZipRecruiter-France", 124: "ZipRecruiter-UK",
        123: "ZipRecruiter-US"}
    self.stemmer = FrenchStemmer()
    self._set_stopwords()
def racinize_all_negationeur(negationeur):
    # All "racinisation" (stemming) helpers are used to handle word variations.
    # Note: the original signature took a `concept` parameter but the body
    # iterated over `negationeur`; the parameter is named accordingly here.
    nega_tiers = []
    stemmer = FrenchStemmer()
    for i in range(0, len(negationeur)):
        temp = negationeur[i].lower()
        temp2 = stemmer.stem(temp)
        nega_tiers.append(temp2)
    return nega_tiers
def stem_words(words):
    # stemming words
    stemmed_words = []  # empty list to hold our stemmed words
    stemmer = FrenchStemmer()  # stemmer object of the FrenchStemmer class
    for word in words:
        stemmed_word = stemmer.stem(word)  # stem the word
        stemmed_words.append(stemmed_word)  # add it to our stemmed word list
    stemmed_words.sort()  # sort the stemmed words
    return stemmed_words
def stemWords(listWords):
    '''Stemming (racinisation)'''
    stemmedWords = list()
    stemmer = FrenchStemmer()
    for word in removeStopwords(listWords):
        stemmedWord = stemmer.stem(word)
        stemmedWords.append(stemmedWord)
    stemmedWords.sort()
    return stemmedWords
def __init__(self, ignoreWords: list = ['?', '!'], verbose: bool = False,
             forceSave: bool = False):
    self.roots = []
    self.ruleList = []
    self.corpus = []
    self.ignoreWords = ignoreWords
    self.verbose = verbose
    self.forceSave = forceSave
    self.stemmer = FrenchStemmer()
    self.rules = None
    self.model = None
def stem_words(words):
    # stemming words
    stemmed_words = []
    stemmer = FrenchStemmer()  # create a stemmer object of the FrenchStemmer class
    for word in words:
        stemmed_word = stemmer.stem(word)  # stem the word
        stemmed_words.append(stemmed_word)
    return stemmed_words
def stem_words(words):
    '''stems the word list using the French Stemmer'''
    stemmed_words = []  # empty list to hold our stemmed words
    stemmer = FrenchStemmer()  # stemmer object of the FrenchStemmer class
    for word in words:
        stemmed_word = stemmer.stem(word)  # stem the word
        stemmed_words.append(stemmed_word)  # add it to our stemmed word list
    stemmed_words.sort()  # sort the stemmed words
    return stemmed_words
def stem(msg, stopwords):
    stemmer = FrenchStemmer()
    lem = []
    # Split on whitespace and common punctuation. Inside a character class
    # the '|' characters of the original pattern were literal separators,
    # so they are dropped here.
    words = re.split(r"[ ,.;!?\"'-]", msg)
    for word in words:
        if word:
            if word[0] != "#" and word not in stopwords:
                lem.append(stemmer.stem(word))
            elif word[0] == "#":
                lem.append(word)  # keep hashtags verbatim
    return lem
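A hedged example; stopword matching here is case-sensitive, and hashtags pass through unstemmed:

msg = u"J'adore #Paris, vraiment"
print(stem(msg, {u"vraiment"}))  # e.g. [u'j', u'ador', u'#Paris']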
def stem_words(self, words):
    """Stem words in a list of tokenized words."""
    if self._lang == 'fr':
        stemmer = FrenchStemmer()
    else:
        stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems
def racinize_all_qualifieurs(qualifieurs):
    iznogoud = []
    stemmer = FrenchStemmer()
    for i in range(0, len(qualifieurs)):
        temp = qualifieurs[i].lower()
        if temp == 'isolées':
            temp2 = 'isole'  # hand-picked override for this word
        else:
            temp2 = stemmer.stem(temp)
        iznogoud.append(temp2)
    return iznogoud
def nettoyage(self, document):
    """Clean the dataset (tokenize, lowercase, filter) + stemming."""
    document = self.tokenize(document)
    document = [word.lower() for word in document
                if len(word) > 2 and not word.isnumeric()
                and word not in self.stop_words]
    if self.stem:
        stem = FrenchStemmer()
        document = [stem.stem(word) for word in document]
    return document
def stem_wordsfr(words):
    '''stems the word list using the French Stemmer'''
    stemmed_wordsfr = []  # empty list to hold our stemmed words
    stemmerfr = FrenchStemmer()  # stemmer object of the FrenchStemmer class
    for word in words:
        stemmed_wordfr = stemmerfr.stem(word)  # stem the word
        stemmed_wordsfr.append(stemmed_wordfr)  # add it to our stemmed word list
    #stemmed_wordsfr.sort()  # sort the stemmed words
    return stemmed_wordsfr
def __init__(self, f, keywords):
    s = f.read()
    self.keywords = keywords
    self.file = s
    self.sentences = sent_tokenize(s)
    self.parser = StanfordParser(
        "stanford-parser-full-2014-08-27/stanford-parser",
        "stanford-parser-full-2014-08-27/stanford-parser-3.4.1-models")
    self.tagger = st.StanfordPOSTagger(
        "stanford-postagger-full-2014-08-27/models/french.tagger",
        "stanford-postagger-full-2014-08-27/stanford-postagger.jar")
    self.ner = st.StanfordNERTagger(
        "stanford-ner-2014-08-27/classifiers/english.all.3class.distsim.crf.ser.gz",
        "stanford-ner-2014-08-27/stanford-ner.jar")
    self.trees = []
    for sent in self.sentences:
        try:
            self.trees.append(self.parser.raw_parse(sent))
        except OSError:
            self.trees.append([])
    self.words = self.word_tokenize_without_punc(s)
    self.stemmer = FrenchStemmer()
    self.stems = [self.stemmer.stem(w) for w in self.words]
    self.words_sentences = [self.word_tokenize_without_punc(s)
                            for s in self.sentences]
    self.tags = self.tagger.tag(self.words)
    self.tags_sentences = [self.tagger.tag([w for w in self.words_sentences[i]])
                           for i in range(len(self.sentences))]
    self.entities = self.ner.tag(self.words)
    self.entities_sentences = [self.ner.tag([w for w in self.words_sentences[i]])
                               for i in range(len(self.sentences))]
    self.left_subject = defaultdict(lambda: 0)
    self.left_compl = defaultdict(lambda: 0)
    self.left_neg_subject = defaultdict(lambda: 0)
    self.left_neg_compl = defaultdict(lambda: 0)
    self.right_subject = defaultdict(lambda: 0)
    self.right_compl = defaultdict(lambda: 0)
    self.right_neg_subject = defaultdict(lambda: 0)
    self.right_neg_compl = defaultdict(lambda: 0)
    self.left_ref = 0
    self.right_ref = 0
    self.trees_leaves = []
    for e in self.trees:
        res = []
        extract_leaves(list(e)[0], res)
        self.trees_leaves.append(tuple_to_dict(res))
    self.extract_keywords()
def vectorize_descreption():
    fs = FrenchStemmer()
    df = return_features_model()
    df['Descposte'] = [fs.stem(k) for k in df['Descposte']]
    tfidf = TfidfVectorizer()
    # Fit once and reuse the resulting matrix (the original fitted twice).
    tfidf_matrix = tfidf.fit_transform(df['Descposte'])
    tfidf_col = pd.DataFrame(tfidf_matrix.todense(),
                             columns=tfidf.get_feature_names())
    df = df.reset_index()
    df_final = pd.merge(df.drop(columns=drop_columns, axis=1), tfidf_col,
                        right_index=True, left_index=True)
    return df_final
def __init__(self, mode='build', DB_file=None, doc_files=None, trace=False):
    self.mode = mode
    self.DB_file = DB_file
    self.doc_list = []
    doc_to_read = []
    for root, dirs, files in os.walk(doc_files, topdown=False):
        for file_name in files:
            doc_to_read.append(os.path.join(root, file_name.encode('utf-8')))
    for doc_file in doc_to_read:
        doc = Doc(doc_file)
        self.doc_list.append(doc)
    self.trace = trace
    self.requete = []
    self.DB = Data_Base()
    self.stemmer = FrenchStemmer()
    if mode == 'build':
        # build the database, then dump it to DB_file
        print 'Building Data Base...'
        self.build_DB()
        #print self.DB
    elif mode == 'search':
        # load the database
        self.load_DB()
        print self.DB.word2Word_struct
def __init__(self, translation_dict, lemmatized=False):
    self.english_lemmatizer = WordNetLemmatizer()
    self.french_stemmer = FrenchStemmer()
    if not lemmatized:
        stemmed_dict = self._get_lemmatized_dict(translation_dict)
        self.stemmed_dict = stemmed_dict
    self.translation_dict = translation_dict
def lemmatize_or_stem(language, terms):
    # Note: because the first condition matches every non-English language,
    # the 'french' branch below is unreachable as written.
    if language != 'english' or (language == 'english' and ENGLISH_FREELING):
        # TEMPORARY: experimenting with English FreeLing.
        # Use FreeLing.
        if language == 'spanish':
            analyzeProcess = subprocess.Popen(
                ["analyze", "-f", "/usr/local/share/freeling/config/es.cfg"],
                stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        elif language == 'portugese':
            analyzeProcess = subprocess.Popen(
                ["analyze", "-f", "/usr/local/share/freeling/config/pt.cfg"],
                stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        elif language == 'italian':
            analyzeProcess = subprocess.Popen(
                ["analyze", "-f", "/usr/local/share/freeling/config/it.cfg"],
                stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        elif language == 'english':
            analyzeProcess = subprocess.Popen(
                ["analyze", "-f", "/usr/local/share/freeling/config/en.cfg"],
                stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        terms = map(lambda term: term.encode('utf-8'), terms)
        analyzeProcess.stdin.write(' '.join(terms))
        stdout, stderr = analyzeProcess.communicate()
        # Parse the FreeLing output: the lemma is always the second word of each line.
        terms = list()
        lines = stdout.split('\n')
        for line in lines:
            items = line.split(' ')
            if len(items) == 4:
                lemma = items[1]
                tag = items[2]
                # ATTN: the tagset differs between Spanish and English, but
                # NP, F, Z and W mean the same thing in both tagsets.
                # Remove proper nouns, punctuation, numbers, and dates/times.
                if not (tag[0:2] == 'NP' or tag[0] == 'F' or tag[0] == 'Z'
                        or tag[0] == 'W' or tag[0:3] == 'POS'):
                    # For English, numbers also need to be removed.
                    include = True
                    for num in NUMBERS:
                        if num in lemma:
                            include = False
                    if include:
                        terms.append(lemma)
        terms = map(lambda term: term.decode('utf-8'), terms)
    elif language == 'english' and not ENGLISH_FREELING:
        lem = WordNetLemmatizer()
        terms = map(lambda term: lem.lemmatize(term), terms)
    elif language == 'french':
        from nltk.stem.snowball import FrenchStemmer
        stemmer = FrenchStemmer()
        terms = map(lambda term: stemmer.stem(term), terms)
        terms = map(lambda term: term.decode('utf-8'), terms)
    return terms
def __init__(self, mode='build', DB_file=None, doc_files=None, trace=False):
    self.mode = mode
    self.DB_file = DB_file
    self.doc_list = []
    self.trace = trace
    self.requete = []
    self.DB = Data_Base()
    self.stemmer = FrenchStemmer()
    self.requeteFin = []
    self.idDoc2tfIdf = {}
    if mode == 'build':
        # build the database, then dump it to DB_file
        print 'Building Data Base...'
        self.build_DB(doc_files)
        print 'Building completed'
    elif mode == 'search':
        # load the database
        self.load_DB()
    self.word2nbOccDsDB = {}
def __init__(self, corpus_file, word_file):
    self.user = False
    if word_file:
        self.user = True
    self.stemmer = FrenchStemmer()
    self.text_into_sentences = data.load("tokenizers/punkt/french.pickle")
    curr_path = os.path.dirname(os.path.abspath(__file__))
    os.environ['STANFORD_PARSER'] = curr_path + "/stanford-parser-full-2015-04-20"
    os.environ['STANFORD_MODELS'] = curr_path + "/stanford-parser-full-2015-04-20"
    self.parser = stanford.StanfordParser(model_path=curr_path + "/frenchFactored.ser.gz")
    self.stpwords = stopwords.words("french")
    self.read_corpus(corpus_file)
    self.known_words_list = []
    self.ok_words_lis = []
    if self.user:
        self.read_known_words(word_file)
    self.stemmed_known_words = map(self.stemmer.stem,
                                   [w for w in self.known_words_list
                                    if w not in self.stpwords])
    self.ok_words_list = [w for w in self.ok_words_lis if w not in self.stpwords]
    self.sentences = []
    self.word_count = 0
    self.syll_count = 0
    self.num_of_sentences = 0
    self.word_per_sentence = 0.0
    self.syllables_per_word = 0.0
    self.num_of_words_with_more_than_six_chars = 0
    self.sixratio = 0.0
    self.num_of_words_with_more_than_three_syll = 0
    self.threeratio = 0.0
    self.depth_sum = 0
    self.avgdepth = 0.0
    self.known_words = 0
    self.knownratio = 0.0
    self.NP_sum = 0
    self.NPratio = 0.0
    self.VP_sum = 0
    self.VPratio = 0.0
    self.SBAR_sum = 0
    self.SBARratio = 0.0
    self.score_with_words = 0.0
    self.score_general = 0.0
    self.flesch_kincaid_score = 0.0
class DirectTranslate:
    """Word-by-word direct translator.

    Usage:
        translator = DirectTranslate(translation_dict)
        for sentence in file:
            print translator.translate(sentence, delims=",' ", remove='')
    """

    def __init__(self, translation_dict, lemmatized=False):
        self.english_lemmatizer = WordNetLemmatizer()
        self.french_stemmer = FrenchStemmer()
        if not lemmatized:
            stemmed_dict = self._get_lemmatized_dict(translation_dict)
            self.stemmed_dict = stemmed_dict
        self.translation_dict = translation_dict

    def _get_lemmatized_dict(self, dict):
        result = {}
        for french_word, english_translation_list in dict.iteritems():
            french_stem = self.french_stemmer.stem(french_word)
            english_translations = [self.english_lemmatizer.lemmatize(word)
                                    for word in english_translation_list]
            # NOTE: This may or may not be the best strategy. If two distinct
            # French words in the initial dict have the same stem,
            # it appends the two lists of translations.
            # TODO: Reconsider.
            # TODO: Consider removing duplicates from this new list. But need
            # to preserve order.
            if french_stem not in result:
                result[french_stem] = english_translations
            else:
                result[french_stem].extend(english_translations)
        return result

    def _get_preprocessed_sentence(self, french_sentence):
        """Apply any preprocessing rules here.

        Args:
            french_sentence: string; the sentence in French
        Returns:
            The sentence with all preprocessing rules applied.
        """
        return unicode(french_sentence)

    def _get_postprocessed_sentence(self, english_sentence):
        """Apply any postprocessing rules here.

        Args:
            english_sentence: string; an English sentence
        Returns:
            The sentence with all postprocessing rules applied.
        """
        return english_sentence

    # TODO: Add code to keep commas. Translate them into a word.
    def translate(self, sentence, delims=",' ", remove=''):
        sentence = self._get_preprocessed_sentence(sentence)
        tokens = TranslateUtils.get_list_of_words(sentence, delims, remove)
        translated_list = []
        for token in tokens:
            stemmed_token = self.french_stemmer.stem(token).lower()
            if stemmed_token in self.stemmed_dict:
                possible_translations = self.stemmed_dict[stemmed_token]
                if possible_translations:
                    # Use the first translation in the list
                    translation = possible_translations[0]
                    translated_list.append(translation)
            elif token in self.translation_dict:
                possible_translations = self.translation_dict[token]
                if possible_translations:
                    # Use the first translation in the list
                    translation = possible_translations[0]
                    translated_list.append(translation)
        translation = ' '.join(translated_list)
        translation = self._get_postprocessed_sentence(translation)
        return translation
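A minimal sketch with a toy dictionary (Python 2, matching the class's iteritems/unicode usage; TranslateUtils.get_list_of_words is assumed to be available from the surrounding module and to split on the given delimiters):

translator = DirectTranslate({u"chat": ["cat"], u"maison": ["house"]})
# "le" has no entry and is skipped; "chat" matches via its stem.
print translator.translate(u"le chat", delims=",' ", remove='')  # "cat"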
# Stopwords
from nltk.corpus import stopwords
import nltk

# load the French stopwords
french_stopwords = set(stopwords.words('french'))
print french_stopwords

# Punctuation and catalogue-specific HTML fragments to strip.
# (The original list was missing a comma after '--', which silently
# concatenated it with '</div>'.)
chars = ['.', '/', "'", '"', '?', '!', '#', '$', '%', '^', '&', '*',
         '(', ')', ' - ', '_', '+', '=', '@', ':', '\\', ',', ';', '~', '`',
         '<', '>', '|', '[', ']', '{', '}', '–', '“', '»', '«', '°', '’', '--',
         '</div>', '<div', 'class', 'class="tt14-prodpres-txt',
         'tt14-prodpres-res">', '<b>', '</b>']

from nltk.stem.snowball import FrenchStemmer
stemmer = FrenchStemmer()
stemmer.stem('voudrais')

def cleaningDocumentList(documentList):
    # remove French stopwords
    texts = [[word for word in document.lower().replace("'", '').split()
              if word not in french_stopwords]
             for document in documentList]
    # remove special catalogue characters
    texts = [[word for word in text if word not in chars] for text in texts]
    # remove French accents
    texts = [[unicodedata.normalize('NFD', unicode(word, 'utf-8')).encode('ascii', 'ignore')
              for word in text] for text in texts]
    # remove general special characters
    texts = [[re.sub(r'[. ?!=+ & | , " :; ⋆ $ %()<> &\[\]/_]', r'', word)
              for word in text] for text in texts]
    # remove small words
    texts = [[re.sub(r'\b\w{1,3}\b', '', word) for word in text] for text in texts]
    # stemming
    texts = [[stemmer.stem(word) for word in text] for text in texts]
    return texts  # assumed: the excerpt ended without an explicit return
def read_doc(self, docfile):
    """
    Reads the document in docfile and fills the per-field dictionaries
    (title / first paragraph / body) with the document's tokens.
    Also counts the number of words.
    """
    stemmer = FrenchStemmer()
    flux = open(docfile)
    line = flux.readline()
    position = 0
    title = True
    first = True
    while line != '':
        liste = line.split()
        if title == True and len(liste) > 0:
            # fill the title dictionary
            self.full_title = line
            title = False
            for each in liste:
                each = each.lower()
                if '\'' in each:
                    strings = self.splitAccent(each)
                    strings[0] += '\''
                    self.nb_word += len(strings)
                    for word in strings:
                        word = stemmer.stem(word.decode('iso-8859-1'))
                        if word not in self.word2pos_list_title:
                            self.word2pos_list_title[word] = []
                        self.word2pos_list_title[word].append(position)
                        position += 1
                else:
                    self.nb_word += 1
                    each = stemmer.stem(each.decode('iso-8859-1'))
                    if each not in self.word2pos_list_title:
                        self.word2pos_list_title[each] = []
                    self.word2pos_list_title[each].append(position)
                    position += 1
            line = flux.readline()
            liste = line.split()
        if first == True and title == False and liste != []:
            # fill the first-paragraph dictionary
            first = False
            for each in liste:
                each = each.lower()
                if '\'' in each:
                    strings = self.splitAccent(each)
                    strings[0] += '\''
                    self.nb_word += len(strings)
                    for word in strings:
                        word = stemmer.stem(word.decode('iso-8859-1'))
                        if word not in self.word2pos_list_first:
                            self.word2pos_list_first[word] = []
                        self.word2pos_list_first[word].append(position)
                        position += 1
                else:
                    self.nb_word += 1
                    each = stemmer.stem(each.decode('iso-8859-1'))
                    if each not in self.word2pos_list_first:
                        self.word2pos_list_first[each] = []
                    self.word2pos_list_first[each].append(position)
                    position += 1
            line = flux.readline()
            liste = line.split()
        if first == False and title == False and liste != []:
            # fill the body dictionary
            for each in liste:
                each = each.lower()
                if '\'' in each:
                    strings = self.splitAccent(each)
                    strings[0] += '\''
                    self.nb_word += len(strings)
                    for word in strings:
                        word = stemmer.stem(word.decode('iso-8859-1'))
                        if word not in self.word2pos_list_body:
                            self.word2pos_list_body[word] = []
                        self.word2pos_list_body[word].append(position)
                        position += 1
                else:
                    self.nb_word += 1
                    each = stemmer.stem(each.decode('iso-8859-1'))
                    if each not in self.word2pos_list_body:
                        self.word2pos_list_body[each] = []
                    self.word2pos_list_body[each].append(position)
                    position += 1
        line = flux.readline()
# coding: utf-8
import string
from nltk.stem.snowball import FrenchStemmer
from preprocess import Enleve_Accents
import urllib

url = "http://www.pallier.org/ressources/dicofr/liste.de.mots.francais.frgut.txt"
file_name = "dico.txt"

print "downloading the french dictionary from http://www.pallier.org/"
urllib.urlretrieve(url, "FrenchDictionary.txt")

stemmer = FrenchStemmer()
fs = open("FrenchDictionary.txt", 'r')
fd = open("StemmedFrenchDictionary.txt", 'w')
while 1:
    txt = fs.readline()
    if txt == '':
        break
    if txt[0] != '':
        txt = txt.lower()
        txt = ''.join(u for u in txt if u in string.ascii_letters)
        txt = Enleve_Accents(txt)
        txt = stemmer.stem(txt) + "\n"
        fd.write(txt)

import os
# Shell one-liner that deduplicates adjacent stems. The closing quotes were
# missing in the original excerpt; the script is presumably run via os.system.
script = """
(cat StemmedFrenchDictionary.txt|uniq>monfichier.tmp) && mv -f monfichier.tmp FinalDictionary.txt
"""
class Level:
    def __init__(self, corpus_file, word_file):
        self.user = False
        if word_file:
            self.user = True
        self.stemmer = FrenchStemmer()
        self.text_into_sentences = data.load("tokenizers/punkt/french.pickle")
        curr_path = os.path.dirname(os.path.abspath(__file__))
        os.environ['STANFORD_PARSER'] = curr_path + "/stanford-parser-full-2015-04-20"
        os.environ['STANFORD_MODELS'] = curr_path + "/stanford-parser-full-2015-04-20"
        self.parser = stanford.StanfordParser(model_path=curr_path + "/frenchFactored.ser.gz")
        self.stpwords = stopwords.words("french")
        self.read_corpus(corpus_file)
        self.known_words_list = []
        self.ok_words_lis = []
        if self.user:
            self.read_known_words(word_file)
        self.stemmed_known_words = map(self.stemmer.stem,
                                       [w for w in self.known_words_list
                                        if w not in self.stpwords])
        self.ok_words_list = [w for w in self.ok_words_lis if w not in self.stpwords]
        self.sentences = []
        self.word_count = 0
        self.syll_count = 0
        self.num_of_sentences = 0
        self.word_per_sentence = 0.0
        self.syllables_per_word = 0.0
        self.num_of_words_with_more_than_six_chars = 0
        self.sixratio = 0.0
        self.num_of_words_with_more_than_three_syll = 0
        self.threeratio = 0.0
        self.depth_sum = 0
        self.avgdepth = 0.0
        self.known_words = 0
        self.knownratio = 0.0
        self.NP_sum = 0
        self.NPratio = 0.0
        self.VP_sum = 0
        self.VPratio = 0.0
        self.SBAR_sum = 0
        self.SBARratio = 0.0
        self.score_with_words = 0.0
        self.score_general = 0.0
        self.flesch_kincaid_score = 0.0

    def sentence_stats(self):
        self.sentences = self.text_into_sentences.tokenize(self.text)
        self.num_of_sentences = len(self.sentences)
        (self.word_count, self.syll_count,
         self.num_of_words_with_more_than_six_chars,
         self.num_of_words_with_more_than_three_syll,
         self.depth_sum, self.NP_sum, self.VP_sum, self.SBAR_sum,
         self.known_words) = [sum(x) for x in
                              zip(*[self.count_words_in_a_sentence(s)
                                    for s in self.sentences])]

    def calculate_stats(self):
        self.word_per_sentence = float(self.word_count) / self.num_of_sentences
        self.syllables_per_word = float(self.syll_count) / self.word_count
        self.sixratio = float(self.num_of_words_with_more_than_six_chars) / self.word_count
        self.threeratio = float(self.num_of_words_with_more_than_three_syll) / self.word_count
        self.avgdepth = float(self.depth_sum) / self.num_of_sentences
        self.knownratio = float(self.known_words) / self.word_count
        self.NPratio = float(self.NP_sum) / self.word_count
        self.VPratio = float(self.VP_sum) / self.word_count
        self.SBARratio = float(self.SBAR_sum) / self.word_count

    def print_stats(self):
        print "#of sentences:", self.num_of_sentences
        print "#of words, #of words per sentence:", self.word_count, " ", self.word_per_sentence
        print "#of syllables, #of syllables per word:", self.syll_count, " ", self.syllables_per_word
        print "#of words with more than 6 characters, percentage to all words:", self.num_of_words_with_more_than_six_chars, " ", self.sixratio
        print "#of words with more than 3 syllables, percentage to all words:", self.num_of_words_with_more_than_three_syll, " ", self.threeratio
        print "average parse tree depth:", self.avgdepth
        print "average # of noun phrases:", self.NPratio
        print "average # of verb phrases:", self.VPratio
        print "average # of SBAR phrases:", self.SBARratio
        print "# of known words, percentage to all words:", self.known_words, " ", self.knownratio
        print "flesch-kincaid score:", self.flesch_kincaid_score
        print "general score without vocabulary:", self.score_general
        if self.user:
            print "score with vocabulary:", self.score_with_words

    def calculate_score(self):
        # Flesch reading-ease formula
        self.flesch_kincaid_score = (206.835 - 1.015 * self.word_per_sentence
                                     - 84.6 * self.syllables_per_word)
        self.score_general = ((self.SBARratio + self.VPratio + self.NPratio
                               + self.threeratio + self.sixratio) / 5) \
            * self.avgdepth / self.flesch_kincaid_score
        self.score_with_words = ((self.SBARratio + self.VPratio + self.NPratio
                                  + self.threeratio + self.sixratio
                                  + (1 - self.knownratio)) / 6) \
            * self.avgdepth / self.flesch_kincaid_score

    def count_words_in_a_sentence(self, sentence):
        known_words = 0
        tokens = word_punckt_tokenizer.tokenize(sentence.lower())
        words = self.normalize_list(tokens)
        word_count = len(words)
        syll_count = sum(Level.syllable_count(word) for word in words)
        num_of_words_with_more_than_six_chars = len(filter(lambda x: len(x) >= 6, words))
        num_of_words_with_more_than_three_syll = len(filter(lambda x: Level.syllable_count(x) >= 3, words))
        parse_tree_depth, num_of_NP, num_of_VP, num_of_SBAR = self.tree_stats(sentence)
        if self.user:
            known_words = sum([1 for w in words
                               if w in self.ok_words_list
                               or self.stemmer.stem(w) in self.stemmed_known_words])
        print sentence
        print words
        return (word_count, syll_count, num_of_words_with_more_than_six_chars,
                num_of_words_with_more_than_three_syll, parse_tree_depth,
                num_of_NP, num_of_VP, num_of_SBAR, known_words)

    def tree_stats(self, sentence):
        depth = 1
        num_of_np = 1
        num_of_vp = 1
        num_of_sbar = 0
        try:
            l = list(self.parser.raw_parse(sentence))[0]
            num_of_np = sum([1 for i in l.subtrees() if i.label() == 'NP'])
            num_of_vp = sum([1 for i in l.subtrees() if i.label() == 'VN'])
            num_of_sbar = sum([1 for i in l.subtrees() if i.label() == 'CS'])
            depth = l.height()
        except:
            pass
        return (depth, num_of_np, num_of_vp, num_of_sbar)

    def normalize_list(self, token_list):
        ss = [w for w in token_list
              if w not in self.stpwords and w not in string.punctuation]
        return ss

    @staticmethod
    def syllable_count(word):
        n = len(word)
        num_of_syll = 0
        i = 0
        while i < n:
            # runs of 4, 3 or 2 vowels count as 2, 1 and 1 syllables respectively
            if i < n - 3 and is_a_vowel(word[i]) and is_a_vowel(word[i + 1]) \
                    and is_a_vowel(word[i + 2]) and is_a_vowel(word[i + 3]):
                num_of_syll += 2
                i += 4
            elif i < n - 2 and is_a_vowel(word[i]) and is_a_vowel(word[i + 1]) \
                    and is_a_vowel(word[i + 2]):
                num_of_syll += 1
                i += 3
            elif i < n - 1 and is_a_vowel(word[i]) and is_a_vowel(word[i + 1]):
                num_of_syll += 1
                i += 2
            elif i < n and is_a_vowel(word[i]):
                num_of_syll += 1
                i += 1
            else:
                i += 1
        return num_of_syll

    def read_corpus(self, filename):
        encodings = ["utf-8", "latin-1", "windows-1250", "windows-1252",
                     "latin-15", "utf-16", "ascii"]
        for e in encodings:
            try:
                fh = codecs.open(filename, "r", encoding=e)
                self.text = fh.read().strip()
                fh.close()
            except UnicodeDecodeError:
                pass
            else:
                break

    def read_known_words(self, filename):
        encodings = ["utf-8", "latin-1", "windows-1250", "windows-1252",
                     "latin-15", "ascii", "utf-16"]
        for e in encodings:
            try:
                fh = codecs.open(filename, "r", encoding=e)
                for line in fh:
                    word, d = line.strip().split()
                    degree = int(d)
                    if degree == 1:
                        self.known_words_list.append(word)
                    else:
                        self.ok_words_lis.append(word)
                fh.close()
            except UnicodeDecodeError:
                pass
            else:
                break
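The vowel-run counter above can be exercised on its own (is_a_vowel is assumed to be a module-level helper that recognizes French vowels):

print Level.syllable_count(u"papillon")  # 3: the vowel groups are a, i, o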
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import FrenchStemmer
import nltk
import sys
import logging

logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
logging.info("running %s" % " ".join(sys.argv))

# Test string (uncommented so the tokenizer calls below have input)
string = u'Jadis, une nuit, je fus un papillon, voltigeant, content de son sort. Puis, je m’éveillai, étant Tchouang-tseu. Qui suis-je en réalité ? Un papillon qui rêve qu’il est Tchouang-tseu ou Tchouang qui s’imagine qu’il fut papillon ?'

# Have fun with tokenizers
tokenizer1 = nltk.data.load('tokenizers/punkt/french.pickle')
tokenizer2 = TreebankWordTokenizer()
french_stopwords = set(stopwords.words('french'))
stemmer = FrenchStemmer()

# See results
tokens1 = tokenizer1.tokenize(string)
tokens2 = tokenizer2.tokenize(string)
tokens3 = [token.encode('utf-8') for token in tokens2
           if token.lower() not in french_stopwords]
tokens4 = [stemmer.stem(token.decode('utf-8')) for token in tokens3]

# Build class to add stemming to a pipeline
class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(CountVectorizer, self).build_analyzer()
        return lambda doc: (stemmer.stem(w) for w in analyzer(doc))
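A hedged usage sketch of the stemming vectorizer above: the overridden analyzer stems every token before counting, so inflected forms of a word collapse into a single feature.

stem_vectorizer = StemmedCountVectorizer()
X = stem_vectorizer.fit_transform([u"les papillons voltigent", u"un papillon content"])
print stem_vectorizer.get_feature_names()  # a stem such as u'papillon' appears once for both forms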
class Search_engine:
    """
    --Search engine--
    DB_file = file containing the database
        if mode == 'build', the database that is built is dumped to DB_file
        if mode == 'search', the database is loaded from DB_file
    doc_files = list of raw documents to add to the database
    DB = database, of class Data_Base
    """

    def __init__(self, mode='build', DB_file=None, doc_files=None, trace=False):
        self.mode = mode
        self.DB_file = DB_file
        self.doc_list = []
        self.trace = trace
        self.requete = []
        self.DB = Data_Base()
        self.stemmer = FrenchStemmer()
        self.requeteFin = []
        self.idDoc2tfIdf = {}
        if mode == 'build':
            # build the database, then dump it to DB_file
            print 'Building Data Base...'
            self.build_DB(doc_files)
            print 'Building completed'
        elif mode == 'search':
            # load the database
            self.load_DB()
        self.word2nbOccDsDB = {}

    def build_DB(self, doc_files):
        """Fills self.DB with the documents found under doc_files."""
        compteur = 0
        doc_name = doc_files + 'doc_' + str(compteur) + '.txt'
        while os.path.exists(doc_name):
            doc = Doc(doc_name)
            self.DB.add_doc(doc)
            compteur += 1
            doc_name = doc_files + 'doc_' + str(compteur) + '.txt'
        print "Number of documents in the Data Base: ", self.DB.nb_doc_total
        #print self.DB.id2nbword
        self.dump_DB()

    def load_DB(self):
        """Loads the contents of self.DB_file into self.DB."""
        print 'Loading Data Base...'
        stream = open(self.DB_file)
        self.DB = cPickle.load(stream)
        stream.close()
        print "Number of documents in the Data Base: ", self.DB.nb_doc_total
        print 'Loading completed'
        return

    def dump_DB(self):
        """Dumps the contents of self.DB into the file self.DB_file."""
        print 'Dumping Data Base...'
        p = cPickle.Pickler(open(self.DB_file, 'wb'))
        p.fast = True
        p.dump(self.DB)
        print 'Dumping completed'

    def parse_requete(self, requete):
        """Parses the user's query and produces a list of stemmed tokens."""
        req_list = re.findall('\w+', requete)
        for word in req_list:
            word = self.stemmer.stem(word.decode('utf-8'))
            self.requete.append(word)
            self.requeteFin.append(word)

    def fuse_lst_rec(self, title_lst, title_head, first_lst, first_head,
                     body_lst, body_head, acc):
        if acc == []:
            acc.append(-1)
        m = max(title_head, first_head, body_head)
        title_head_aux = title_head
        first_head_aux = first_head
        body_head_aux = body_head
        if m == -1:
            acc.reverse()
            acc.pop()  # drop the -1 sentinel
            return acc
        else:
            if m == title_head_aux:
                if title_lst != []:
                    title_head_aux = title_lst.pop()
                else:
                    title_head_aux = -1
            elif m == first_head_aux:
                if first_lst != []:
                    first_head_aux = first_lst.pop()
                else:
                    first_head_aux = -1
            elif m == body_head_aux:
                if body_lst != []:
                    body_head_aux = body_lst.pop()
                else:
                    body_head_aux = -1
            h = acc.pop()
            if h != m:
                acc.append(h)
                acc.append(m)
            else:
                acc.append(h)
            return self.fuse_lst_rec(title_lst, title_head_aux,
                                     first_lst, first_head_aux,
                                     body_lst, body_head_aux, acc)

    def merge_dif_rec(self, lst1, head1, lst2, head2, acc):
        if acc == []:
            acc.append(-1)
        head1_aux = head1
        head2_aux = head2
        if head1_aux == head2_aux:
            acc.append(head1_aux)
            if lst1 == [] or lst2 == []:
                acc.reverse()
                acc.pop()
                return acc
            else:
                head1_aux = lst1.pop()
                head2_aux = lst2.pop()
        elif head1_aux > head2_aux:
            if lst1 == []:
                acc.reverse()
                acc.pop()
                return acc
            else:
                head1_aux = lst1.pop()
        else:
            if lst2 == []:
                acc.reverse()
                acc.pop()
                return acc
            else:
                head2_aux = lst2.pop()
        return self.merge_dif_rec(lst1, head1_aux, lst2, head2_aux, acc)

    def search_bool_word(self, word):
        title_lst = []
        title_head = -1
        first_lst = []
        first_head = -1
        body_lst = []
        body_head = -1
        for doc_id in self.DB.word2Word_struct[word].title:
            title_lst.append(doc_id.doc_id)
        for doc_id in self.DB.word2Word_struct[word].first:
            first_lst.append(doc_id.doc_id)
        for doc_id in self.DB.word2Word_struct[word].body:
            body_lst.append(doc_id.doc_id)
        if title_lst != []:
            title_head = title_lst.pop()
        if first_lst != []:
            first_head = first_lst.pop()
        if body_lst != []:
            body_head = body_lst.pop()
        result = self.fuse_lst_rec(title_lst, title_head, first_lst, first_head,
                                   body_lst, body_head, [])
        self.word2nbOccDsDB[word] = len(result)
        return result

    def search_bool_req(self):
        if self.requete == []:
            return []
        word0 = self.requete.pop()
        lst = self.search_bool_word(word0)
        for word in self.requete:
            if lst == []:
                return []
            lst_aux = self.search_bool_word(word)
            if lst_aux == []:
                return []
            head_lst = lst.pop()
            head_lst_aux = lst_aux.pop()
            lst = self.merge_dif_rec(lst, head_lst, lst_aux, head_lst_aux, [])
        return lst

    def tf_idf(self, doc_id):
        """Computes the TF.IDF of the query for one document."""
        solution = 0
        doc = self.DB.id2doc[doc_id]
        for word in self.requeteFin:
            word_in_title = 0
            word_in_first = 0
            word_in_body = 0
            total_noWords_in_doc = float(doc.nb_word)
            if word in doc.word2pos_list_title:
                word_in_title = len(doc.word2pos_list_title[word])
            if word in doc.word2pos_list_first:
                word_in_first = len(doc.word2pos_list_first[word])
            if word in doc.word2pos_list_body:
                word_in_body = len(doc.word2pos_list_body[word])
            word_in_doc = float(word_in_body + word_in_first + word_in_title)
            no_docs = float(self.DB.nb_doc_total)
            no_docs_with_word = self.word2nbOccDsDB[word]
            solution += float(word_in_doc / total_noWords_in_doc) \
                * math.log1p(no_docs / no_docs_with_word)
        return solution

    def tf_idf_score(self, listDoc_id):
        for doc_id in listDoc_id:
            self.idDoc2tfIdf[doc_id] = self.tf_idf(doc_id)

    def search_rank_req(self, requete, nbResMax):
        self.requete = []
        self.requeteFin = []
        self.parse_requete(requete)
        docsTrouves = self.search_bool_req()
        self.tf_idf_score(docsTrouves)
        self.idDoc2tfIdf = OrderedDict(sorted(self.idDoc2tfIdf.items(),
                                              key=lambda t: t[1], reverse=True))
        keys = self.idDoc2tfIdf.keys()[:nbResMax]
        if len(keys) < 1:
            print 'Nothing found \n'
        i = 1
        for doc in keys:
            print str(i) + '. ' + self.id2docTitle(doc) + 'File: ' + self.id2fileName(doc)
            i += 1
        return keys

    def id2fileName(self, docId):
        return str(self.DB.id2doc[docId].doc_file)

    def id2docTitle(self, docId):
        return str(self.DB.id2doc[docId].full_title)

    def reset(self):
        self.requete = []
        self.requeteFin = []
        self.idDoc2tfIdf = {}
def lemmatize(self, word):
    stemmer = FrenchStemmer()
    return stemmer.stem(word)
def stemm(word):
    stemmer = FrenchStemmer()
    return stemmer.stem(word)
class Search_engine:
    """
    --Search engine--
    DB_file = file containing the database
        if mode == 'build', the database that is built is dumped to DB_file
        if mode == 'search', the database is loaded from DB_file
    doc_files = list of raw documents to add to the database
    DB = database, of class Data_Base
    """

    def __init__(self, mode='build', DB_file=None, doc_files=None, trace=False):
        self.mode = mode
        self.DB_file = DB_file
        self.doc_list = []
        doc_to_read = []
        for root, dirs, files in os.walk(doc_files, topdown=False):
            for file_name in files:
                doc_to_read.append(os.path.join(root, file_name.encode('utf-8')))
        for doc_file in doc_to_read:
            doc = Doc(doc_file)
            self.doc_list.append(doc)
        self.trace = trace
        self.requete = []
        self.DB = Data_Base()
        self.stemmer = FrenchStemmer()
        if mode == 'build':
            # build the database, then dump it to DB_file
            print 'Building Data Base...'
            self.build_DB()
            #print self.DB
        elif mode == 'search':
            # load the database
            self.load_DB()
            print self.DB.word2Word_struct

    def build_DB(self):
        """Fills self.DB with the documents of self.doc_list."""
        for doc in self.doc_list:
            self.DB.add_doc(doc)
        print self.DB.nb_doc_total
        #print self.DB.id2nbword
        self.dump_DB()

    def load_DB(self):
        """Loads the contents of self.DB_file into self.DB."""
        stream = open(self.DB_file)
        self.DB = pickle.load(stream)
        stream.close()
        return

    def dump_DB(self):
        """Dumps the contents of self.DB into the file self.DB_file."""
        print 'Dump data base....'
        stream = open(self.DB_file, 'w')
        pickle.dump(self.DB, stream)
        stream.close()

    def parse_requete(self, requete):
        """Parses the user's query and produces a list of stemmed tokens."""
        req_list = re.findall('\w+', requete)
        for word in req_list:
            word = self.stemmer.stem(word.decode('utf-8'))
            self.requete.append(word)

    def fuse_lst_rec(self, title_lst, title_head, first_lst, first_head,
                     body_lst, body_head, acc):
        if acc == []:
            acc.append(-1)
        m = max(title_head, first_head, body_head)
        title_head_aux = title_head
        first_head_aux = first_head
        body_head_aux = body_head
        if m == -1:
            acc.reverse()
            acc.pop()  # drop the -1 sentinel
            return acc
        if m == title_head_aux:
            if title_lst != []:
                title_head_aux = title_lst.pop()
            else:
                title_head_aux = -1
        if m == first_head_aux:
            if first_lst != []:
                first_head_aux = first_lst.pop()
            else:
                first_head_aux = -1
        if m == body_head_aux:
            if body_lst != []:
                body_head_aux = body_lst.pop()
            else:
                body_head_aux = -1
        h = acc.pop()
        if h != m:
            acc.append(h)
            acc.append(m)
        else:
            acc.append(h)
        # the original dropped this `return`, so the recursion returned None
        return self.fuse_lst_rec(title_lst, title_head_aux,
                                 first_lst, first_head_aux,
                                 body_lst, body_head_aux, acc)

    def merge_dif_rec(self, lst1, head1, lst2, head2, acc):
        if acc == []:
            acc.append(-1)
        head1_aux = head1
        head2_aux = head2
        if head1_aux == head2_aux:
            acc.append(head1_aux)
            if lst1 == [] or lst2 == []:
                acc.reverse()
                acc.pop()
                return acc
            else:
                head1_aux = lst1.pop()
                head2_aux = lst2.pop()
        if head1_aux > head2_aux:
            if lst1 == []:
                acc.reverse()
                acc.pop()
                return acc
            else:
                head1_aux = lst1.pop()
        else:
            if lst2 == []:
                acc.reverse()
                acc.pop()
                return acc
            else:
                head2_aux = lst2.pop()
        # the original dropped this `return`, so the recursion returned None
        return self.merge_dif_rec(lst1, head1_aux, lst2, head2_aux, acc)

    def search_bool_word(self, word):
        title_lst = []
        title_head = -1
        first_lst = []
        first_head = -1
        body_lst = []
        body_head = -1
        print "searching ", word
        if word in self.DB.word2Word_struct:
            print "YES"
            print self.DB.word2Word_struct[word].body
        #word = self.stemmer.stem(word.decode('utf-8'))
        for doc_id in self.DB.word2Word_struct[word].title:
            print "title", str(doc_id.doc_id)
            title_lst.append(doc_id.doc_id)
        for doc_id in self.DB.word2Word_struct[word].first:
            print "first", str(doc_id.doc_id)
            first_lst.append(doc_id.doc_id)
        for doc_id in self.DB.word2Word_struct[word].body:
            print "body", str(doc_id.doc_id)
            body_lst.append(doc_id.doc_id)
        if title_lst != []:
            title_head = title_lst.pop()
        if first_lst != []:
            first_head = first_lst.pop()
        if body_lst != []:
            body_head = body_lst.pop()
        return self.fuse_lst_rec(title_lst, title_head, first_lst, first_head,
                                 body_lst, body_head, [])

    def search_bool_req(self):
        if self.requete == []:
            return []
        #TODO: sort the words by increasing document frequency
        word0 = self.requete.pop()
        lst = self.search_bool_word(word0)
        for word in self.requete:
            if lst == []:
                return []
            lst_aux = self.search_bool_word(word)
            if lst_aux == []:
                return []
            head_lst = lst.pop()
            head_lst_aux = lst_aux.pop()
            lst = self.merge_dif_rec(lst, head_lst, lst_aux, head_lst_aux, [])
        return lst

    def search_rank_req(self):
        #TODO
        return []
def stem_words(self, words):
    stemmer = FrenchStemmer()
    stemmed_words = []
    for word in words:
        stemmed_words.append(stemmer.stem(word))
    return stemmed_words