def parse(self, file_path):
    """Extract the stemmed, stop-word-free tokens of an HTML document.

    The file is parsed with BeautifulSoup, ``<script>`` and ``<style>``
    elements are removed, and the remaining text is split on whitespace and
    lower-cased.  Words for which ``self.isUrl`` is true are skipped; every
    other word is expanded through ``self.cleanWord`` and each resulting
    piece is stemmed with ``FrenchStemmer``.  A stem is dropped when it
    equals the stem of a French stop word or of any one-character token
    found in the document.

    Args:
        file_path: Path of the HTML file to read.

    Returns:
        list: The surviving stems, in document order.
    """
    def _as_text(value):
        # Only byte strings need decoding (Python 2 ``str``); text is kept.
        if isinstance(value, bytes):
            return value.decode("utf-8")
        return value

    # Read-only access is sufficient, and the context manager releases the
    # handle even when parsing raises.
    with open(file_path, "r") as html_file:
        soup = BeautifulSoup(html_file.read(), "html.parser")

    # Remove all script and style elements before extracting the text.
    for element in soup(["script", "style"]):
        element.extract()

    # Splitting on any whitespace yields the same tokens as stripping every
    # line, splitting on spaces and dropping the blanks.
    text = soup.get_text(separator=" ")
    list_of_words = [_as_text(word).lower() for word in text.split()]

    stemmer = FrenchStemmer()

    # Heuristic: rather than listing punctuation characters one by one,
    # every token of length 1 is treated as a stop word.
    single_chars = [word for word in list_of_words if len(word) == 1]
    stop_stems = set()
    for word in single_chars + get_stop_words('fr'):
        stop_stems.add(stemmer.stem(_as_text(word).lower()))

    # Filter the words with the stop list and apply stemming.
    filter_words = []
    for word in list_of_words:
        if self.isUrl(word):
            continue
        for clean_word in self.cleanWord(word):
            stem = stemmer.stem(clean_word)
            if stem not in stop_stems:
                filter_words.append(stem)
    return filter_words
def stem_words(words):
    '''Return the sorted French stems of ``words``.

    Each word is passed through the stemmer twice (the stem is stemmed
    again) before being collected.
    '''
    french = FrenchStemmer()
    return sorted(french.stem(french.stem(item)) for item in words)
def tweet_cleaner(text, my_dict, stem=False):
    """Normalise a tweet into a lower-case, letters-only string.

    Steps: optional French stemming, transliteration with ``unidecode``,
    HTML stripping with BeautifulSoup, lower-casing, the regex replacements
    given by ``my_dict``, removal of @mentions, URLs and a leading ``rt``,
    replacement of every non-letter by a space and whitespace squeezing.

    Args:
        text: The tweet; floats (e.g. NaN cells) are converted with ``str``.
        my_dict: Mapping of regex pattern -> replacement applied to the
            lower-cased text.
        stem: When true, every whitespace-separated token is stemmed first.

    Returns:
        str: The cleaned text, tokens separated by single spaces.
    """
    # Python 2 only: make sure byte strings become unicode.  ``unicode`` does
    # not exist on Python 3, hence the version guard.
    if sys.version_info[0] < 3:
        if not isinstance(text, (unicode, float)):
            try:
                text = unicode(text, 'utf-8')
            except UnicodeDecodeError:
                text = unicode(text, 'latin-1')

    if isinstance(text, float):
        text = str(text)

    if stem:
        stemmer = FrenchStemmer()
        # Stem each token once and drop tokens whose stem is empty.
        stems = (stemmer.stem(token) for token in text.split())
        text = " ".join(s for s in stems if s)

    tok = WordPunctTokenizer()
    pat1 = r'@[A-Za-z0-9_]+'
    pat2 = r'https?://[^ ]+'
    combined_pat = r'|'.join((pat1, pat2))
    www_pat = r'www.[^ ]+'
    rt_path = r'^rt'
    paths = [combined_pat, www_pat, rt_path]

    text = unidecode.unidecode(text)
    cleaned = BeautifulSoup(text, 'lxml').get_text().lower()

    # Apply every replacement on top of the previous one; restarting from the
    # untouched text at each step would keep only the last substitution.
    for word in my_dict:
        cleaned = re.sub(word, my_dict[word], cleaned)

    try:
        bom_removed = cleaned.decode("utf-8-sig").replace(u"\ufffd", "?")
    except (AttributeError, UnicodeError):
        # Text objects without ``decode`` (Python 3) or undecodable input are
        # used as they are.
        bom_removed = cleaned

    for path in paths:
        bom_removed = re.sub(path, '', bom_removed.lower())
    letters_only = re.sub("[^a-zA-Z]", " ", bom_removed)

    # The substitution above leaves runs of spaces; tokenising and joining
    # again squeezes them.
    words = [x for x in tok.tokenize(letters_only) if len(x) >= 1]
    return (" ".join(words)).strip()
def __init__(self, label="", role="", ner=""):
    """Store the label, role and NER tag and derive the label's stem.

    ``self.lemma`` holds the result of ``FrenchStemmer().stem(label)``.
    """
    self.label, self.role, self.ner = label, role, ner
    self.lemma = FrenchStemmer().stem(label)
def new_dico(file):
    """Write a stemmed copy of the dictionary file ``file``.

    Reads ``../dico/<file>``, and for every space-separated word keeps only
    its ASCII letters, passes it through ``enleve_accents``, stems it with
    ``FrenchStemmer`` and writes the stem followed by a space to
    ``dic_with_roots/<file>``.  Words that were title-cased in the input
    get their first letter upper-cased again in the output.

    Args:
        file: Name of the dictionary file, relative to both directories.
    """
    stemmer = FrenchStemmer()
    input_ = "../dico/" + file
    output_ = "dic_with_roots/" + file

    # Context managers close both files even if stemming raises midway.
    with open(input_, 'r') as fs, open(output_, 'w') as fd:
        for line in fs:
            for w in line.split(" "):
                # Remember the capitalisation before the word is rewritten.
                was_title = w.istitle()
                if isinstance(w, bytes):
                    w = w.decode("utf-8")
                w = ''.join(u for u in w if u in string.ascii_letters)
                w = enleve_accents(w)
                w = stemmer.stem(w) + " "
                if was_title:
                    w = w[0].upper() + w[1:]
                fd.write(w)
def preprocess(self, text=None, stem=False, fix_pdf=True):
    """Tokenise (and optionally stem) a text and build its concordance index.

    Args:
        text: Text to process; ``self.text`` is used when it is ``None``.
        stem: When true, tokens are stemmed with ``FrenchStemmer``.
        fix_pdf: When true, newline artefacts of a pdf-to-text conversion
            are repaired first.

    Side effects:
        Sets ``self._original_tokens``, ``self._tokens`` and
        ``self._concordance_index``.
    """
    source = self.text if text is None else text

    # Fix newline problems left by the pdf-to-txt step.
    if fix_pdf:
        import re
        # A newline not followed by a capital letter continues the sentence.
        source = re.sub(r'\n([^A-Z])', r' \1', source)
        # Any remaining newline ends a sentence: make sure it has a period.
        source = re.sub(r'([^\.])\n', r'\1.\n', source)

    # Tokenisation, see
    # http://www.fabienpoulard.info/post/2008/03/05/Tokenisation-en-mots-avec-NLTK
    tokens = tok_fr.tokenize(source.lower())
    self._original_tokens = tokens
    self._tokens = tokens

    if stem:
        from nltk.stem.snowball import FrenchStemmer
        french = FrenchStemmer()
        self._tokens = [french.stem(token) for token in tokens]

    # NOTE(review): the index is assumed to be built whether or not stemming
    # is requested - confirm against the original layout.
    self._concordance_index = nltk.ConcordanceIndex(self._tokens,
                                                    key=lambda s: s)
def normalize_text(string):
    """Return a normalised, stemmed form of ``string``.

    The text is stripped, lower-cased, stripped of accents and punctuation,
    filtered (only alphabetic words longer than two characters that are not
    stop words are kept) and finally stemmed with ``FrenchStemmer``.
    Duplicate stems are kept.

    Args:
        string: The text to normalise; a float (e.g. a NaN cell) yields "".

    Returns:
        str: The stems joined by single spaces.
    """
    if isinstance(string, float):
        return ""

    # Lower-case and remove leading/trailing whitespace.
    s = string.strip().lower()
    # Remove accents: decompose, then drop the combining marks.
    s = ''.join(c for c in unicodedata.normalize('NFD', s)
                if unicodedata.category(c) != 'Mn')
    # Remove punctuation; escaping keeps characters such as ``]``, ``^`` and
    # ``-`` literal inside the character class.
    s = re.sub("[" + re.escape(punctuation) + "]", " ", s)

    # Uninformative words and French stop words, as a set for O(1) lookups.
    words_to_remove = [
        "les", "une", "des", "nos", "ils", "elle", "elles", "nan", "null"
    ]
    remove_set = set(words_to_remove)
    remove_set.update(stopwords.words("french"))

    stemmer = FrenchStemmer()
    kept = [
        word for word in s.split()
        if word.isalpha() and word not in remove_set and len(word) > 2
    ]
    return " ".join(stemmer.stem(word) for word in kept)
def get_lem(text):
    """Return the French stems of the tokens of ``text``.

    Every stem is preceded by a single space, so a non-empty result starts
    with a space.
    """
    french = FrenchStemmer()
    return "".join(" " + french.stem(token) for token in word_tokenize(text))
class HistExtractor:
    """Build stemmed word histograms with a ``CountVectorizer``."""

    def __init__(self):
        """Create the stemmer, the stop list and the vectorizer."""
        self.stemmer = FrenchStemmer()
        self.analyzer = CountVectorizer().build_analyzer()
        # Markup residue and uninformative stems excluded from histograms.
        self.bad_words = [
            "src", 'html', 'ifram', 'allowtransparency', 'analytic', 'class',
            'com', 'hidden', 'lien', 'lightwidget', 'overflow', 'row',
            'script', 'scrolling', 'src', 'widget', "tous", "jour", "blog",
            'width', 'wrapp', "les", "googl", "propos", "list"
        ]
        self.stopwords = nltk.corpus.stopwords.words('french') + self.bad_words

        def stemmed_words(doc):
            # Stop words are removed before stemming.
            return (self.stemmer.stem(w) for w in self.analyzer(doc)
                    if w not in self.stopwords)

        self.cv = CountVectorizer(analyzer=stemmed_words,
                                  stop_words=self.stopwords)

    def get_histogram_from_string(self, x):
        """Return ``{stem: count}`` for the text ``x``.

        Stems listed in ``self.bad_words`` are left out of the result.
        """
        hist = self.cv.fit_transform([x])
        # ``get_feature_names`` was removed from scikit-learn 1.2; prefer the
        # replacement and fall back for older releases.
        get_names = getattr(self.cv, "get_feature_names_out", None)
        if get_names is None:
            get_names = self.cv.get_feature_names
        names = get_names()
        if hasattr(names, "tolist"):
            # Convert the NumPy array of names into plain Python strings.
            names = names.tolist()
        bad = set(self.bad_words)
        return {
            k: int(v)
            for k, v in zip(names, hist.toarray()[0])
            if k not in bad
        }
def stemmingFR(self):
    """Stem the file ``self.filename`` and report success in the GUI.

    Every whitespace-separated word of the input file is stemmed with
    ``FrenchStemmer`` and appended, one stem per line, to
    ``<self.filename>-out3.txt``.  The label and two ``Entry`` widgets are
    then updated to tell the user where the result is.
    """
    ps = FrenchStemmer()

    # Input document; the context manager closes the handle.
    with open(self.filename, "r") as source:
        elagage = source.read()

    # Output document, opened in append mode.  Closing it through the
    # context manager guarantees the stems are flushed to disk.
    with open(self.filename + "-out3.txt", "a") as target:
        for w in elagage.split():
            target.write(ps.stem(w))
            target.write("\n")

    self.label.configure(text=self.filename)
    self.texte = Entry(self, width=20, font="Arial 14", fg="green",
                       justify='center')
    self.texte.insert(END, "Succée de Stemming")
    self.texte.grid(padx=16, pady=16)
    self.texte = Entry(self, width=50, font="Arial 14", fg="blue",
                       justify='center')
    self.texte.insert(
        END, "Vous trouvez votre fichier résulat sous le même répertoire")
    self.texte.grid(padx=16, pady=16)
def text_stemming(self):
    """Stem ``self.text`` in place with the stemmer for ``self.language``.

    The text is tokenised with ``word_tokenize``; the stems are joined with
    single spaces and stored back in ``self.text``.

    Raises:
        ValueError: If ``self.language`` has no associated stemmer.  The
            text is left untouched in that case.
    """
    snowball_languages = ("italian", "german", "spanish", "dutch",
                          "portuguese", "danish")
    if self.language == "french":
        stemmer = FrenchStemmer()
    elif self.language == "english":
        stemmer = PorterStemmer()
    elif self.language in snowball_languages:
        stemmer = SnowballStemmer(self.language)
    elif self.language == "greek":
        stemmer = GreekStemmer()
    elif self.language == "arabic":
        stemmer = ISRIStemmer()
    else:
        # Fail with an explicit error instead of falling through to an
        # unbound ``stemmer`` variable.
        raise ValueError(
            "Unsupported language %r: expected french, english, greek, "
            "arabic, %s" % (self.language, ", ".join(snowball_languages)))
    self.text = ' '.join(
        [stemmer.stem(word) for word in word_tokenize(self.text)])
class FrenchStemTokenizer(object):
    """Callable tokenizer that filters tokens and stems them in French."""

    def __init__(self, stop_words=None, remove_non_words=False):
        """Set up the stemmer and the filters.

        Args:
            stop_words: Container of words to drop; the NLTK French stop
                words are used when it is ``None``.
            remove_non_words: When true, tokens absent from the NLTK
                ``words`` corpus are dropped as well.
        """
        self.st = FrenchStemmer()
        # Identity test: ``== None`` misbehaves with containers that
        # overload equality.
        if stop_words is None:
            self.stopwords = set(stopwords.words('french'))
        else:
            self.stopwords = stop_words
        self.words = set(words.words())
        self.remove_non_words = remove_non_words

    def __call__(self, doc):
        """Return the stems of the tokens of ``doc`` that pass all filters."""
        stems = []
        # Tokenise words and punctuation, then filter in a single pass.
        for word in wordpunct_tokenize(doc):
            if word in self.stopwords:
                continue
            if self.remove_non_words and word not in self.words:
                continue
            # Drop one-character tokens and anything that is not alphabetic.
            if len(word) <= 1 or not word.isalpha():
                continue
            stems.append(self.st.stem(word))
        return stems
def process_text(text, stem=False):
    """Clean every sentence of ``text``.

    Each sentence is lower-cased, transliterated with ``unidecode``, has its
    apostrophes, commas and semicolons replaced by spaces, loses its stop
    words (module-level ``stop_words``) and, when ``stem`` is true, has its
    tokens stemmed.  Isolated single letters between spaces are removed.
    Intended for ``df.apply()`` to create a new dataframe column.

    Args:
        text: Iterable of sentences.
        stem: When true, tokens are stemmed with ``FrenchStemmer``.

    Returns:
        list: One cleaned string per input sentence.
    """
    stemmer = None
    if stem:
        # Build the stemmer once instead of once per sentence.
        from nltk.stem.snowball import FrenchStemmer
        stemmer = FrenchStemmer()

    text_clean = []
    for sen in text:
        # Dots are kept on purpose: the date uniformizer needs them.
        sen = unidecode.unidecode(
            sen.replace("’", " ").replace(",", " ").replace(";", " ").lower())
        # Some dates come in "DD/ MM/ yyyy" format.
        sen = sen.replace("/ ", "/")
        tokens = [tok for tok in sen.split() if tok not in stop_words]
        if stemmer is not None:
            tokens = [stemmer.stem(tok) for tok in tokens]
        no_letters = re.sub(' [a-z] ', " ", " ".join(tokens))
        text_clean.append(no_letters)
    return text_clean
def convert_computing_input_to_dictionnary_input(datas):
    """Turn advert rows into token lists and 0/1 dictionary vectors.

    For every row, ``row[2][0]`` is tokenised, punctuation-only tokens and
    French stop words are removed and the remaining tokens are stemmed,
    de-duplicated and sorted.  Every row except the first (the title row of
    the original file) is then encoded as a vector over the module-level
    ``tokenDictionnary``: position ``i`` is 1 when dictionary token ``i``
    occurs in the advert.

    Args:
        datas: Iterable of rows; ``row[0]`` is the id field and ``row[2]``
            holds the description.

    Returns:
        tuple: ``(tokenTable, matchingTable)`` where ``tokenTable`` rows are
        ``[id, tokens]`` and ``matchingTable`` rows are ``[id, vector]``.
    """
    tokenizer = WordPunctTokenizer()
    stemmer = FrenchStemmer()
    french_stopwords = set(stopwords.words('french'))
    # A token is kept only if it contains at least one word character.
    word_pattern = re.compile(r'\w+')

    tokenTable = []
    for row in datas:
        advert_id = row[0]
        desc = row[2]
        tokens = tokenizer.tokenize(str(desc[0]))
        stems = set(
            stemmer.stem(token) for token in tokens
            if word_pattern.search(token)
            and token.lower() not in french_stopwords)
        tokenTable.append([advert_id, sorted(stems)])

    # Map each dictionary token to its first position, replacing a linear
    # ``list.index`` scan per token.
    token_index = {}
    for position, token in enumerate(tokenDictionnary):
        token_index.setdefault(token, position)
    vector_size = len(tokenDictionnary)

    matchingTable = []
    # tokenTable[1:] skips the title row of the original file.
    for row in tokenTable[1:]:
        advert_id = row[0].split(";")[0]
        advertVec = np.zeros(vector_size)
        for elm in row[1]:
            position = token_index.get(elm)
            # Unknown tokens are ignored; writing to index -1 would clear
            # the slot of the last dictionary token.
            if position is not None:
                advertVec[position] = 1
        matchingTable.append([advert_id, advertVec])

    return tokenTable, matchingTable
def is_french_adjr(word):
    """Guess whether ``word`` is a French adjective.

    The word is accepted when its stem belongs to the module-level
    ``french_stemmed_adjr`` collection or when it ends with one of the
    usual adjectival suffixes (including gender and number inflections).

    Args:
        word: The word to test.

    Returns:
        bool: True when the word looks like an adjective.
    """
    # TODO change adjr tests: the ending left after removing the stem could
    # be looked up in french_adjr_stem_ending_counts.
    # Suffixes with gender and number inflections.
    suffixes = (
        u"ain", u"ains", u"aine", u"aines", u"aire", u"aires", u"al", u"aux",
        u"als", u"ale", u"ales", u"el", u"els", u"elle", u"elles", u"esque",
        u"esques", u"estre", u"estres", u"eux", u"euse", u"euses", u"é",
        u"és", u"ée", u"ées", u"ien", u"iens", u"ienne", u"iennes", u"ier",
        u"iers", u"ière", u"ières", u"if", u"ifs", u"ive", u"ives", u"il",
        u"ils", u"in", u"ins", u"ine", u"ines", u"ique", u"iques", u"ois",
        u"oise", u"oises"
    )
    stem = FrenchStemmer().stem(word)
    if stem in french_stemmed_adjr:
        return True
    # ``endswith`` accepts a tuple, replacing the manual suffix loop.
    return word.endswith(suffixes)
def getListStopwords(self):
    """Return the stemmed stop words stored in the file ``self.path``.

    The file holds comma-separated words (normally on a single line).  Each
    word is stripped of surrounding whitespace, lower-cased and stemmed with
    ``FrenchStemmer``; empty entries are ignored.

    Returns:
        list: The stemmed stop words, in file order.
    """
    stemmer = FrenchStemmer()
    res = []
    # Read-only access is enough; the context manager closes the handle.
    with open(self.path, "r") as stop_file:
        for line in stop_file:
            for word in line.split(','):
                if isinstance(word, bytes):
                    word = word.decode("utf-8")
                # Strip the trailing newline (and stray blanks) so that it
                # does not end up inside the last stop word.
                word = word.strip()
                if word:
                    res.append(stemmer.stem(word.lower()))
    return res
def stem(sentence):
    """Return the French stems of the tokens of ``sentence``.

    Each stem is prefixed with one space, so a non-empty result begins with
    a space.
    """
    french = FrenchStemmer()
    pieces = [' ' + french.stem(token)
              for token in nltk.word_tokenize(sentence)]
    return ''.join(pieces)
def racinize_all_concept(concept):
    """Stem the first element of every entry of ``concept``.

    Returns a list of ``(stem, second_element)`` tuples where the stem is
    the French stem of the lower-cased first element.
    """
    french = FrenchStemmer()
    return [(french.stem(entry[0].lower()), entry[1]) for entry in concept]
def stemArticle(self, doc):
    """Stem every word of ``doc`` in French, then in English.

    Returns the list of resulting stems converted with ``str``.
    """
    french = FrenchStemmer()
    english = EnglishStemmer()
    result = []
    for word in doc:
        french_stem = str(french.stem(word))
        result.append(str(english.stem(french_stem)))
    return result
def lemmatize(token):
    """Return the French stem of ``token``.

    Arguments:
        token {string} -- word to reduce; it is passed to
            ``FrenchStemmer.stem``.
    """
    return FrenchStemmer().stem(token)
def preprocess(text):
    """Return the stems of the informative tokens of ``text``.

    Tokens come from ``simple_preprocess``; those found in
    ``get_stopswords()`` or shorter than four characters are dropped and the
    rest are stemmed with ``FrenchStemmer``.
    """
    excluded = get_stopswords()
    french = FrenchStemmer()
    return [
        french.stem(tok)
        for tok in simple_preprocess(text)
        if tok not in excluded and len(tok) > 3
    ]
def stem_words(words):
    """Return the French stems of ``words`` as a sorted list."""
    french = FrenchStemmer()
    return sorted(french.stem(item) for item in words)
def computeScores(title, desc, cat):
    """Score an item from its title, description and category.

    The title score comes from ``computetitleScore``.  The description is
    stripped of URLs, e-mail addresses, non-word characters and digits; its
    remaining words (longer than one character and absent from the stop
    list in the file ``french``) are stemmed and fed to ``computedescScore``
    and, when ``cat`` is given, ``computecatScore``.  The total is
    ``7 * description + category + title``, plus one when the description
    contains a link, capped at 10.  The result is shown in a message box.

    Args:
        title: Item title; "" yields a title score of 0 (with a warning).
        desc: Item description; "" yields description and category scores
            of 0 (with a warning).
        cat: Category passed to ``computecatScore``, or ``None``.
    """
    if title == "":
        messagebox.showwarning("Empty attribute","Empty title. Title Score automatically set to 0.")
        titlescore = 0
    else:
        titlescore = computetitleScore(title)
        print(titlescore)

    if desc == "":
        messagebox.showwarning("Empty attribute","Empty description. Description and Category Score automatically set to 0.")
        descscore = 0
        links = 0
        categoryScore = 0
    else:
        urls = re.findall(regexURL, desc)
        mails = re.findall(regexMail, desc)
        stripped = re.sub(regexMail, '', re.sub(regexURL, '', desc))
        words = nltk.word_tokenize(re.sub(r'\W+', ' ', stripped))
        words = [''.join([i for i in word if not i.isdigit()]) for word in words]

        # Load both word lists once, as sets, instead of reopening and
        # scanning the files for every token (which also leaked handles).
        with open('french') as stop_file:
            stop_words = set(line.strip() for line in stop_file)
        with open('words') as dictionary_file:
            known_words = set(line.strip() for line in dictionary_file)

        stemmer = FrenchStemmer()
        newwords = []
        for word in words:
            lowered = word.lower()
            if lowered in stop_words or len(word) <= 1:
                continue
            stemmed = unidecode.unidecode(stemmer.stem(lowered))
            # NOTE(review): dictionary words are counted twice and other
            # words once - confirm this weighting against the original
            # layout, which is ambiguous about the second append.
            if lowered in known_words:
                newwords.append(stemmed)
            newwords.append(stemmed)

        links = len(urls) + len(mails)
        descscore = computedescScore(newwords)
        if cat is None:
            categoryScore = 0
        else:
            categoryScore = computecatScore(newwords, cat)

    totScore = 7*descscore + categoryScore + titlescore
    if links > 0:
        totScore += 1
    if totScore > 10:
        totScore = 10

    message = "The total score is "+str(round(totScore,2))+"\nThe description score is "+str(round(descscore,2))+"\nThe title score is "+str(titlescore)+"\nThe classification score is "+str(categoryScore)+"\n"
    if descscore < 0.4:
        message += "You should improve the description\n"
    if titlescore < 1:
        message += "You should improve the title\n"
    if categoryScore < 0.5:
        message += "The description should be more explcit on the category"
    messagebox.showinfo("Results",message)
def runSearch(self, list_keywords, termScoreMethod, documentScoreMethod):
    """Score every document of the collection against a weighted query.

    Multi-word keywords are split and their coefficient is shared equally
    between the resulting words; each word is lower-cased and stemmed with
    ``FrenchStemmer``.  Documents 1..138 (except document 127, which keeps a
    score of 0) are scored with ``self.computeDocumentScore``.

    Args:
        list_keywords: Iterable of ``(keyword, coefficient)`` pairs.
        termScoreMethod: Forwarded to ``self.computeDocumentScore``.
        documentScoreMethod: Forwarded to ``self.computeDocumentScore``.

    Returns:
        list: ``(score, "D<n>.html")`` tuples sorted by decreasing score.
    """
    nb_doc_collection = 138
    skipped_doc = 127
    stemmer = FrenchStemmer()

    list_of_words_request = []
    for keyword, coef in list_keywords:
        parts = keyword.split()
        if not parts:
            continue
        # Spread the coefficient evenly over the words of the keyword.
        share = float(coef) / float(len(parts))
        for part in parts:
            # Only byte strings need decoding (Python 2 ``str``).
            if isinstance(part, bytes):
                part = part.decode("utf-8")
            list_of_words_request.append((stemmer.stem(part.lower()), share))

    # By default every document has a score of zero.
    scoreNameDoc = [(0, "D" + str(i + 1) + ".html")
                    for i in range(nb_doc_collection)]

    for idDoc in range(nb_doc_collection):
        if idDoc + 1 != skipped_doc:
            score = self.computeDocumentScore(idDoc + 1,
                                              list_of_words_request,
                                              termScoreMethod,
                                              documentScoreMethod)
            scoreNameDoc[idDoc] = (score, scoreNameDoc[idDoc][1])

    # Ascending stable sort followed by a reversal; this keeps the tie order
    # callers already observe.
    scoreNameDoc.sort(key=lambda tup: tup[0])
    return scoreNameDoc[::-1]
def stemWords(listWords):
    '''Stemming: return the sorted French stems of the words kept by
    ``removeStopwords``.'''
    french = FrenchStemmer()
    stems = [french.stem(item) for item in removeStopwords(listWords)]
    return sorted(stems)
class FrenchLemmaTokenizer(object):
    """Callable tokenizer that keeps one ``FrenchStemmer`` between calls."""

    def __init__(self):
        self.wnl = FrenchStemmer()

    def __call__(self, s):
        """Return the stems of the purely alphabetic tokens of ``s``."""
        stems = []
        for token in word_tokenize(s):
            if token.isalpha():
                stems.append(self.wnl.stem(token))
        return stems
def stemming_Function(filtered_words):
    """Return an ``nltk.FreqDist`` of the French stems of ``filtered_words``."""
    french = FrenchStemmer()
    return nltk.FreqDist([french.stem(item) for item in filtered_words])
def racinize_all_negationeur(concept):
    """Return the French stems of the lower-cased ``negationeur`` entries.

    Stemming is used so that variations of a word match.

    NOTE(review): the ``concept`` argument is not used; the function reads
    the module-level ``negationeur`` sequence - confirm this is intended.
    """
    french = FrenchStemmer()
    return [french.stem(entry.lower()) for entry in negationeur]
class Preprocessing:
    """Lower-case, tokenise, filter and stem the examples of a data loader.

    After construction ``self.data`` holds the processed examples and
    ``self.responses`` the responses returned by the loader.
    """

    def __init__(self, data_loader: DataLoader):
        """Load the inputs and run the whole pipeline on the examples."""
        self.tokenizer = nltk.RegexpTokenizer(r'\w+')
        self.stemmer = FrenchStemmer()
        self.get_stop_words()

        # TODO compose it
        inputs: InputData = data_loader.load()
        data = inputs.examples
        self.responses = inputs.responses
        data = self.to_lower_case_all(data)
        data = self.tokenize_all_examples(data)
        data = self.remove_stop_words_for_all(data)
        data = self.lemmatize_all(data)
        self.data = data

    def get_stop_words(self):
        """Download the stop-word corpus, load the French list and dump it."""
        # The NLTK resource is called 'stopwords'; 'french' is only the name
        # of one file inside that corpus and cannot be downloaded by itself.
        nltk.download('stopwords')
        self.stop_words = nltk.corpus.stopwords.words('french')
        dump_stop_words(self.stop_words)

    def to_lower_case_one(self, example: str):
        """Return ``example`` in lower case."""
        return example.lower()

    def to_lower_case_all(self, data):
        """Lower-case every example under every key of ``data``."""
        def to_lower_case(examples):
            return [self.to_lower_case_one(ex) for ex in examples]
        return utils.apply_for_each_key(data, to_lower_case)

    def tokenize_one_example(self, example):
        """Split ``example`` into word tokens."""
        return self.tokenizer.tokenize(example)

    def tokenize_all_examples(self, data):
        """Tokenise every example under every key of ``data``."""
        def tokenize(data_list):
            return [self.tokenize_one_example(d) for d in data_list]
        return utils.apply_for_each_key(data, tokenize)

    def remove_stop_words(self, example):
        """Return the tokens of ``example`` that are not stop words."""
        # One set per call gives O(1) lookups without caching state that
        # could go stale if ``self.stop_words`` is reassigned.
        stop_set = set(self.stop_words)
        return [w for w in example if w not in stop_set]

    def remove_stop_words_for_all(self, data):
        """Remove stop words from every example under every key of ``data``."""
        def remove_sw(examples):
            return [self.remove_stop_words(ex) for ex in examples]
        return utils.apply_for_each_key(data, remove_sw)

    def lemmatize(self, example):
        """Return the French stems of the tokens of ``example``."""
        return [self.stemmer.stem(w) for w in example]

    def lemmatize_all(self, data):
        """Stem every example under every key of ``data``."""
        def get_lems(examples):
            return [self.lemmatize(ex) for ex in examples]
        return utils.apply_for_each_key(data, get_lems)

    def process_sentence(self, sentence):
        """Run the full pipeline on a single sentence and return its stems."""
        # TODO compose
        data = self.to_lower_case_one(sentence)
        data = self.tokenize_one_example(data)
        data = self.remove_stop_words(data)
        data = self.lemmatize(data)
        return data
def stem_words(words):
    '''Stem the word list with the French stemmer and return it sorted.'''
    french = FrenchStemmer()
    stems = list(map(french.stem, words))
    stems.sort()
    return stems
def stem_words(words):
    """Return the French stems of ``words``, in input order."""
    # One FrenchStemmer instance is shared by all the words.
    french = FrenchStemmer()
    return [french.stem(item) for item in words]
def nettoyage(self, document):
    """Clean a document and optionally stem it.

    The document is tokenised with ``self.tokenize``; tokens of at most two
    characters, numeric tokens and stop words are dropped and the rest are
    lower-cased.  When ``self.stem`` is true the tokens are then stemmed
    with ``FrenchStemmer``.

    Args:
        document: The raw document, as accepted by ``self.tokenize``.

    Returns:
        list: The cleaned (and possibly stemmed) tokens.
    """
    cleaned = []
    for word in self.tokenize(document):
        if len(word) <= 2 or word.isnumeric():
            continue
        lowered = word.lower()
        # Compare the lower-cased form too, so that capitalised stop words
        # (e.g. at the start of a sentence) are removed as well.
        if word in self.stop_words or lowered in self.stop_words:
            continue
        cleaned.append(lowered)
    if self.stem:
        stemmer = FrenchStemmer()
        cleaned = [stemmer.stem(word) for word in cleaned]
    return cleaned
def lemmatize_or_stem(language, terms):
    """Lemmatize (FreeLing / WordNet) or stem (French) a list of terms.

    * ``french``: terms are stemmed with NLTK's ``FrenchStemmer``.
    * ``english`` with ``ENGLISH_FREELING`` false: terms are lemmatized with
      ``WordNetLemmatizer``.
    * ``spanish``, ``portuguese``, ``italian`` (and ``english`` when
      ``ENGLISH_FREELING`` is true): terms are sent to the FreeLing
      ``analyze`` program; lemmas tagged as proper nouns, punctuation,
      numbers, dates/times or ``POS`` are dropped, as are lemmas containing
      any entry of ``NUMBERS``.

    Args:
        language: Language name.
        terms: Iterable of terms (text or UTF-8 encoded bytes).

    Returns:
        list: The lemmas or stems.

    Raises:
        ValueError: If the language is not supported.
    """
    def _as_text(value):
        return value.decode('utf-8') if isinstance(value, bytes) else value

    # French must be tested first: the FreeLing test below is true for every
    # non-English language and would otherwise shadow this branch.
    if language == 'french':
        from nltk.stem.snowball import FrenchStemmer
        stemmer = FrenchStemmer()
        return [_as_text(stemmer.stem(_as_text(term))) for term in terms]

    if language == 'english' and not ENGLISH_FREELING:
        lem = WordNetLemmatizer()
        return [lem.lemmatize(term) for term in terms]

    # TEMPORARY: experimenting with English FreeLing.
    freeling_configs = {
        'spanish': "/usr/local/share/freeling/config/es.cfg",
        'portuguese': "/usr/local/share/freeling/config/pt.cfg",
        # Historical misspelling, kept so existing callers still work.
        'portugese': "/usr/local/share/freeling/config/pt.cfg",
        'italian': "/usr/local/share/freeling/config/it.cfg",
        'english': "/usr/local/share/freeling/config/en.cfg",
    }
    config = freeling_configs.get(language)
    if config is None:
        raise ValueError("Unsupported language: %r" % (language,))

    analyzeProcess = subprocess.Popen(["analyze", "-f", config],
                                      stdin=subprocess.PIPE,
                                      stdout=subprocess.PIPE)
    payload = b' '.join(
        term if isinstance(term, bytes) else term.encode('utf-8')
        for term in terms)
    # ``communicate`` writes the input, closes stdin and waits for the
    # process to finish.
    stdout, _ = analyzeProcess.communicate(payload)

    # Parse the FreeLing output: the lemma is always the second word of each
    # four-field line and the tag is the third.
    lemmas = []
    for line in stdout.decode('utf-8').split('\n'):
        items = line.split(' ')
        if len(items) != 4:
            continue
        lemma = items[1]
        tag = items[2]
        # ATTN: the tagset is different in Spanish and English.  However,
        # NP, F, Z and W all mean the same thing in both tagsets.
        # Remove proper nouns, punctuation, numbers and dates/times.
        if (tag[0:2] == 'NP' or tag[:1] in ('F', 'Z', 'W')
                or tag[0:3] == 'POS'):
            continue
        # In English, spelled-out numbers have to be removed as well.
        if any(num in lemma for num in NUMBERS):
            continue
        lemmas.append(lemma)
    return lemmas
import re
import unicodedata

from nltk.corpus import stopwords
import nltk
from nltk.stem.snowball import FrenchStemmer

# Load the French stop words.
french_stopwords = set(stopwords.words('french'))
print(french_stopwords)

# Punctuation and catalogue markup removed from the documents.  '--' and
# '</div>' are two separate entries.
chars = ['.', '/', "'", '"', '?', '!', '#', '$', '%', '^', '&', '*',
         '(', ')', ' - ', '_', '+', '=', '@', ':', '\\', ',', ';', '~', '`', '<',
         '>', '|', '[', ']', '{', '}', '–', '“', '»', '«', '°', '’', '--',
         '</div>', '<div', 'class', 'class="tt14-prodpres-txt', 'tt14-prodpres-res">', '<b>', '</b>']

stemmer = FrenchStemmer()
stemmer.stem('voudrais')


def cleaningDocumentList(documentList):
    """Clean and stem every document of ``documentList``.

    Each document is lower-cased, stripped of apostrophes and split on
    whitespace.  French stop words and the entries of ``chars`` are dropped;
    the remaining words lose their accents and special characters, words of
    one to three characters are blanked, and what is left is stemmed.

    Args:
        documentList: Iterable of documents (text or UTF-8 encoded bytes).

    Returns:
        list: One list of non-empty stems per document.
    """
    special = set(chars)
    texts = []
    for document in documentList:
        cleaned = []
        for word in document.lower().replace("'", '').split():
            # Remove French stop words and special catalogue characters.
            if word in french_stopwords or word in special:
                continue
            # Remove French accents.
            if isinstance(word, bytes):
                word = word.decode('utf-8')
            word = unicodedata.normalize('NFD', word).encode(
                'ascii', 'ignore').decode('ascii')
            # Remove general special characters.
            word = re.sub(r'[. ?!=+ & | , " :; ⋆ $ %()<> &\[\]/_]', r'', word)
            # Remove small words.
            word = re.sub(r'\b\w{1,3}\b', '', word)
            # Stem.
            word = stemmer.stem(word)
            # Remove empty strings.
            # NOTE(review): this filter and the return value implement the
            # final step announced by the trailing comment - confirm.
            if word:
                cleaned.append(word)
        texts.append(cleaned)
    return texts
class DirectTranslate:
    """Word-by-word direct translator.

    Usage:
        translator = DirectTranslate(translation_dict)
        for sentence in file:
            print(translator.translate(sentence, delims=",' ", remove=''))
    """

    def __init__(self, translation_dict, lemmatized=False):
        """Store the dictionary and its stemmed counterpart.

        Args:
            translation_dict: Mapping of French word -> list of English
                translations.
            lemmatized: When true, ``translation_dict`` is taken to be
                keyed by stems already and is used as the stemmed
                dictionary without further processing.
        """
        self.english_lemmatizer = WordNetLemmatizer()
        self.french_stemmer = FrenchStemmer()
        if lemmatized:
            # Without this branch ``stemmed_dict`` would never be defined.
            self.stemmed_dict = translation_dict
        else:
            self.stemmed_dict = self._get_lemmatized_dict(translation_dict)
        self.translation_dict = translation_dict

    def _get_lemmatized_dict(self, dict):
        """Return a copy of ``dict`` keyed by French stems.

        The English translations are lemmatized.  When two distinct French
        words share a stem, their translation lists are concatenated in
        iteration order.
        """
        result = {}
        # ``items`` exists on both Python 2 and Python 3 dictionaries.
        for french_word, english_translation_list in dict.items():
            french_stem = self.french_stemmer.stem(french_word)
            english_translations = [
                self.english_lemmatizer.lemmatize(word)
                for word in english_translation_list
            ]
            # NOTE: This may or may not be the best strategy.
            # TODO: Reconsider.
            # TODO: Consider removing duplicates from the merged list while
            # preserving order.
            if french_stem not in result:
                result[french_stem] = english_translations
            else:
                result[french_stem].extend(english_translations)
        return result

    def _get_preprocessed_sentence(self, french_sentence):
        """Apply any preprocessing rules here.

        Args:
            french_sentence: string; the sentence in French.
        Returns:
            The sentence as a text (unicode) string.
        """
        try:
            text_type = unicode  # Python 2
        except NameError:
            text_type = str
        return text_type(french_sentence)

    def _get_postprocessed_sentence(self, english_sentence):
        """Apply any postprocessing rules here.

        Args:
            english_sentence: string; an English sentence.
        Returns:
            The sentence with all postprocessing rules applied.
        """
        return english_sentence

    # TODO: Add code to keep commas. Translate them into a word.
    def translate(self, sentence, delims=",' ", remove=''):
        """Translate ``sentence`` word by word.

        Each token is looked up by stem in the stemmed dictionary, then (if
        the stem is unknown) verbatim in the original dictionary.  The first
        translation of the list is used; tokens without a translation are
        dropped.

        Args:
            sentence: The French sentence.
            delims: Delimiters forwarded to
                ``TranslateUtils.get_list_of_words``.
            remove: Characters to remove, forwarded likewise.

        Returns:
            The translated words joined by single spaces.
        """
        sentence = self._get_preprocessed_sentence(sentence)
        tokens = TranslateUtils.get_list_of_words(sentence, delims, remove)
        translated_list = []
        for token in tokens:
            stemmed_token = self.french_stemmer.stem(token).lower()
            if stemmed_token in self.stemmed_dict:
                possible_translations = self.stemmed_dict[stemmed_token]
            elif token in self.translation_dict:
                possible_translations = self.translation_dict[token]
            else:
                possible_translations = None
            if possible_translations:
                # Use the first translation in the list.
                translated_list.append(possible_translations[0])
        translation = ' '.join(translated_list)
        return self._get_postprocessed_sentence(translation)
class Search_engine:
    """Search engine.

    DB_file = file holding the database:
        with mode = 'build' the database that was built is dumped to DB_file,
        with mode = 'search' the database is loaded from DB_file.
    doc_files = directory of raw documents to add to the database.
    DB = database, an instance of the Data_Base class.
    """

    def __init__(self, mode='build', DB_file=None, doc_files=None, trace=False):
        """Collect the documents, then build or load the database."""
        self.mode = mode
        self.DB_file = DB_file
        self.doc_list = []
        # ``doc_files`` is optional (e.g. in search mode): walking ``None``
        # would raise.
        if doc_files is not None:
            for root, dirs, files in os.walk(doc_files, topdown=False):
                for file_name in files:
                    self.doc_list.append(Doc(os.path.join(root, file_name)))
        self.trace = trace
        self.requete = []
        self.DB = Data_Base()
        self.stemmer = FrenchStemmer()
        if mode == 'build':
            # Build the database, then dump it to DB_file.
            print('Built Data Base...')
            self.build_DB()
        elif mode == 'search':
            # Load the database.
            self.load_DB()
            print(self.DB.word2Word_struct)

    def build_DB(self):
        """Fill ``self.DB`` with the collected documents and dump it."""
        for doc in self.doc_list:
            self.DB.add_doc(doc)
        print(self.DB.nb_doc_total)
        self.dump_DB()

    def load_DB(self):
        """Load the content of the file ``self.DB_file`` into ``self.DB``."""
        # NOTE(review): unpickling executes arbitrary code; DB_file must come
        # from a trusted source.
        with open(self.DB_file, 'rb') as stream:
            self.DB = pickle.load(stream)

    def dump_DB(self):
        """Dump the content of ``self.DB`` into the file ``self.DB_file``."""
        print('Dump data base....')
        # Pickles are binary data: binary mode is required on Python 3 and
        # safe on Python 2.
        with open(self.DB_file, 'wb') as stream:
            pickle.dump(self.DB, stream)

    def parse_requete(self, requete):
        """Parse the user query and append its stemmed tokens to
        ``self.requete``."""
        for word in re.findall(r'\w+', requete):
            if isinstance(word, bytes):
                word = word.decode('utf-8')
            self.requete.append(self.stemmer.stem(word))

    def fuse_lst_rec(self, title_lst, title_head, first_lst, first_head,
                     body_lst, body_head, acc):
        """Return the duplicate-free union of three posting lists.

        Each list is consumed from its end, ``*_head`` being the element
        already popped from it (-1 when the list is exhausted).  With lists
        sorted in ascending order the result is in ascending order.  The
        input lists and ``acc`` are modified in place.
        """
        if acc == []:
            acc.append(-1)  # sentinel, removed before returning
        while True:
            m = max(title_head, first_head, body_head)
            if m == -1:
                # Every list is exhausted.
                break
            if m == title_head:
                title_head = title_lst.pop() if title_lst else -1
            if m == first_head:
                first_head = first_lst.pop() if first_lst else -1
            if m == body_head:
                body_head = body_lst.pop() if body_lst else -1
            if acc[-1] != m:
                acc.append(m)
        acc.reverse()
        acc.pop()
        return acc

    def merge_dif_rec(self, lst1, head1, lst2, head2, acc):
        """Return the intersection of two posting lists.

        Both lists are consumed from their end, ``head1`` and ``head2``
        being the elements already popped.  With lists sorted in ascending
        order the result is in ascending order.  The input lists and ``acc``
        are modified in place.
        """
        if acc == []:
            acc.append(-1)  # sentinel, removed before returning
        while True:
            if head1 == head2:
                acc.append(head1)
                if not lst1 or not lst2:
                    break
                head1 = lst1.pop()
                head2 = lst2.pop()
            elif head1 > head2:
                # Only the list with the larger head may advance; advancing
                # right after a match would skip the next common element.
                if not lst1:
                    break
                head1 = lst1.pop()
            else:
                if not lst2:
                    break
                head2 = lst2.pop()
        acc.reverse()
        acc.pop()
        return acc

    def search_bool_word(self, word):
        """Return the ids of the documents containing ``word`` in their
        title, first paragraph or body ([] for an unknown word)."""
        print("searching  %s" % word)
        if word not in self.DB.word2Word_struct:
            return []
        entry = self.DB.word2Word_struct[word]
        print("YES")
        print(entry.body)

        title_lst = []
        for posting in entry.title:
            print("title %s" % str(posting.doc_id))
            title_lst.append(posting.doc_id)
        first_lst = []
        for posting in entry.first:
            print("first %s" % str(posting.doc_id))
            first_lst.append(posting.doc_id)
        body_lst = []
        for posting in entry.body:
            print("body %s" % str(posting.doc_id))
            body_lst.append(posting.doc_id)

        title_head = title_lst.pop() if title_lst else -1
        first_head = first_lst.pop() if first_lst else -1
        body_head = body_lst.pop() if body_lst else -1
        return self.fuse_lst_rec(title_lst, title_head, first_lst, first_head,
                                 body_lst, body_head, [])

    def search_bool_req(self):
        """Return the ids of the documents containing every query word.

        The last word of ``self.requete`` is removed from the query; the
        other words stay in ``self.requete``.
        """
        if self.requete == []:
            return []
        # TODO add a function sorting the words by increasing number of
        # documents.
        word0 = self.requete.pop()
        lst = self.search_bool_word(word0)
        for word in self.requete:
            if lst == []:
                return []
            lst_aux = self.search_bool_word(word)
            if lst_aux == []:
                return []
            head_lst = lst.pop()
            head_lst_aux = lst_aux.pop()
            lst = self.merge_dif_rec(lst, head_lst, lst_aux, head_lst_aux, [])
        return lst

    def search_rank_req(self):
        """Ranked search (not implemented yet)."""
        # TODO
        return []
def stemm(word):
    """Return the stem of *word* computed by a fresh FrenchStemmer."""
    return FrenchStemmer().stem(word)
def read_doc(self,docfile):
    """Read *docfile* and fill the per-field position dictionaries.

    The first non-blank line is indexed as the title (and kept verbatim in
    ``self.full_title``), the next non-blank line as the first paragraph,
    and every later non-blank line as body text.  Each stemmed token is
    mapped to the list of its positions; positions run continuously across
    the three fields.  ``self.nb_word`` is increased by the number of tokens
    indexed.

    The file is closed when reading ends, including on error.
    """
    stemmer = FrenchStemmer()

    def index_tokens(tokens, index, position):
        # Stem every token of one line into `index`; return the next free position.
        for token in tokens:
            token = token.lower()
            if '\'' in token:
                # Elided forms are split, and the apostrophe is put back on
                # the first piece.
                pieces = self.splitAccent(token)
                pieces[0] += '\''
            else:
                pieces = [token]
            self.nb_word += len(pieces)
            for piece in pieces:
                stem = stemmer.stem(piece.decode('iso-8859-1'))
                if stem not in index:
                    index[stem] = []
                index[stem].append(position)
                position += 1
        return position

    # Fields still waiting for their line, in document order; once both are
    # consumed every remaining line belongs to the body.
    pending = [self.word2pos_list_title, self.word2pos_list_first]
    position = 0
    with open(docfile) as flux:
        for line in flux:
            liste = line.split()
            if not liste:
                continue
            if len(pending) == 2:
                self.full_title = line
            index = pending.pop(0) if pending else self.word2pos_list_body
            position = index_tokens(liste, index, position)
# Download a French word list, stem every entry into
# StemmedFrenchDictionary.txt, then drop adjacent duplicates into
# FinalDictionary.txt.
import os

urllib.urlretrieve(url, "FrenchDictionary.txt")
stemmer = FrenchStemmer()

# Both files are closed (and the output flushed to disk) when this block
# ends, i.e. before `uniq` reads the stemmed file below.
with open("FrenchDictionary.txt", 'r') as fs:
    with open("StemmedFrenchDictionary.txt", 'w') as fd:
        for txt in fs:
            txt = txt.lower()
            txt = ''.join(u for u in txt if u in string.ascii_letters)
            # NOTE(review): the filter above already removed every non-ASCII
            # character, so this call presumably has nothing left to strip --
            # confirm whether it was meant to run before the filter.
            txt = Enleve_Accents(txt)
            fd.write(stemmer.stem(txt) + "\n")

script = """ (cat StemmedFrenchDictionary.txt|uniq>monfichier.tmp) && mv -f monfichier.tmp FinalDictionary.txt """
os.system("bash -c '%s'" % script)
logging.info("running %s" % " ".join(sys.argv))

# Test string
# string = 'Jadis, une nuit, je fus un papillon, voltigeant, content de son sort. Puis, je m’éveillai, étant Tchouang-tseu. Qui suis-je en réalité ? Un papillon qui rêve qu’il est Tchouang-tseu ou Tchouang qui s’imagine qu’il fut papillon ?'

# Have fun with tokenizers
tokenizer1 = nltk.data.load('tokenizers/punkt/french.pickle')
tokenizer2 = TreebankWordTokenizer()
french_stopwords = set(stopwords.words('french'))
stemmer = FrenchStemmer()

# See results
# NOTE(review): `string` must be bound to the text to analyse before this
# point; the sample assignment above is commented out.
tokens1 = tokenizer1.tokenize(string)
tokens2 = tokenizer2.tokenize(string)
tokens3 = [token.encode('utf-8') for token in tokens2
           if token.lower() not in french_stopwords]
tokens4 = [stemmer.stem(token.decode('utf-8')) for token in tokens3]


# Build class to add stemming to the pipeline
class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer stems every token it produces."""

    def build_analyzer(self):
        """Return a callable mapping a document to a generator of stemmed tokens."""
        # super() must name this class, not its parent, to start the lookup
        # at the right place in the MRO.
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: (stemmer.stem(w) for w in analyzer(doc))


analyzer = CountVectorizer().build_analyzer()
# The stemmer is taken from module scope; passing it to the constructor would
# bind it to CountVectorizer's first parameter (`input`).
stem_vectorizer = StemmedCountVectorizer()


def stemming(doc):
    """Return a generator over the stemmed tokens of *doc*."""
    return (stemmer.stem(w) for w in analyzer(doc))
class Search_engine:
    """Search engine: boolean (AND) retrieval with TF-IDF ranking.

    DB_file   = file holding the pickled data base;
                in 'build' mode the freshly built data base is dumped to it,
                in 'search' mode the data base is loaded from it
    doc_files = path prefix of the raw documents to index
                (<prefix>doc_0.txt, <prefix>doc_1.txt, ...)
    DB        = data base, an instance of Data_Base
    """

    def __init__(self, mode='build', DB_file=None, doc_files=None, trace=False):
        self.mode = mode
        self.DB_file = DB_file
        self.doc_list = []
        self.trace = trace
        self.requete = []        # query stems, consumed by search_bool_req
        self.DB = Data_Base()
        self.stemmer = FrenchStemmer()
        self.requeteFin = []     # query stems, kept intact for scoring
        self.idDoc2tfIdf = {}    # doc id -> TF-IDF score of the current query
        # word -> number of documents containing it; set in every mode so
        # that searching never hits a missing attribute.
        self.word2nbOccDsDB = {}
        if mode == 'build':
            # Build the data base, then dump it to DB_file.
            print('Building Data Base...')
            self.build_DB(doc_files)
            print('Building completed')
        elif mode == 'search':
            # Load the data base from DB_file.
            self.load_DB()

    def build_DB(self, doc_files):
        """Index <doc_files>doc_0.txt, doc_1.txt, ... until a file is missing, then dump."""
        compteur = 0
        doc_name = doc_files + 'doc_' + str(compteur) + '.txt'
        while os.path.exists(doc_name):
            self.DB.add_doc(Doc(doc_name))
            compteur += 1
            doc_name = doc_files + 'doc_' + str(compteur) + '.txt'
        print(" ".join(["Number of documents in the Data Base: ", str(self.DB.nb_doc_total)]))
        self.dump_DB()

    def load_DB(self):
        """Load the content of self.DB_file into self.DB."""
        print('Loadind Data Base...')
        # Binary mode mirrors dump_DB.
        # NOTE(review): unpickling executes code embedded in the file; only
        # load data base files from a trusted source.
        with open(self.DB_file, 'rb') as stream:
            self.DB = cPickle.load(stream)
        print(" ".join(["Number of documents in the Data Base: ", str(self.DB.nb_doc_total)]))
        print('Loading completed')

    def dump_DB(self):
        """Dump self.DB into self.DB_file."""
        print('Dumping Data Base...')
        # The context manager closes (and flushes) the file, which the
        # original never did.
        with open(self.DB_file, 'wb') as stream:
            p = cPickle.Pickler(stream)
            p.fast = True
            p.dump(self.DB)
        print('Dumping completed')

    def parse_requete(self, requete):
        """Tokenize and stem the user query.

        Every stem is appended to both self.requete and self.requeteFin.  The
        query is decoded before tokenizing so that ``\\w`` sees whole accented
        words instead of splitting them at their non-ASCII bytes.
        """
        if isinstance(requete, bytes):
            requete = requete.decode('utf-8')
        for word in re.findall(r'\w+', requete, re.UNICODE):
            word = self.stemmer.stem(word)
            self.requete.append(word)
            self.requeteFin.append(word)

    def fuse_lst_rec(self, title_lst, title_head, first_lst, first_head, body_lst, body_head, acc):
        """Merge three posting lists into one list without consecutive duplicates.

        Each ``*_head`` is the value already popped from the end of the
        matching ``*_lst``; -1 marks an exhausted list.  The input lists are
        consumed.  The merged values are collected in ``acc`` (largest head
        first), which is then reversed and returned.

        Implemented as a loop: one recursion level per posting exceeded the
        interpreter's recursion limit on long lists.
        """
        if acc == []:
            acc.append(-1)  # sentinel, removed before returning
        while True:
            m = max(title_head, first_head, body_head)
            if m == -1:
                break
            # Advance a single list per step; a value that is also at the
            # head of another list is met again on the next step and skipped
            # by the duplicate check below.
            if m == title_head:
                title_head = title_lst.pop() if title_lst != [] else -1
            elif m == first_head:
                first_head = first_lst.pop() if first_lst != [] else -1
            elif m == body_head:
                body_head = body_lst.pop() if body_lst != [] else -1
            if acc[-1] != m:
                acc.append(m)
        acc.reverse()
        acc.pop()
        return acc

    def merge_dif_rec(self, lst1, head1, lst2, head2, acc):
        """Return the values present in both posting lists.

        ``head1``/``head2`` are the values already popped from the end of
        ``lst1``/``lst2``.  Both lists are consumed.  Matches are collected in
        ``acc``, which is reversed and returned.  Implemented as a loop for
        the same reason as fuse_lst_rec.
        """
        if acc == []:
            acc.append(-1)  # sentinel, removed before returning
        while True:
            if head1 == head2:
                acc.append(head1)
                if lst1 == [] or lst2 == []:
                    break
                head1 = lst1.pop()
                head2 = lst2.pop()
            elif head1 > head2:
                if lst1 == []:
                    break
                head1 = lst1.pop()
            else:
                if lst2 == []:
                    break
                head2 = lst2.pop()
        acc.reverse()
        acc.pop()
        return acc

    def search_bool_word(self, word):
        """Return the ids of the documents containing ``word`` in any field.

        The number of such documents is recorded in self.word2nbOccDsDB.  A
        word absent from the index yields an empty list instead of KeyError.
        """
        if word not in self.DB.word2Word_struct:
            self.word2nbOccDsDB[word] = 0
            return []
        entry = self.DB.word2Word_struct[word]
        title_lst = [posting.doc_id for posting in entry.title]
        first_lst = [posting.doc_id for posting in entry.first]
        body_lst = [posting.doc_id for posting in entry.body]
        # -1 marks a field with no posting for this word.
        title_head = title_lst.pop() if title_lst != [] else -1
        first_head = first_lst.pop() if first_lst != [] else -1
        body_head = body_lst.pop() if body_lst != [] else -1
        result = self.fuse_lst_rec(title_lst, title_head, first_lst, first_head,
                                   body_lst, body_head, [])
        self.word2nbOccDsDB[word] = len(result)
        return result

    def search_bool_req(self):
        """Return the ids of the documents containing every word of self.requete.

        self.requete loses its last word in the process; self.requeteFin is
        left untouched.
        """
        if self.requete == []:
            return []
        word0 = self.requete.pop()
        lst = self.search_bool_word(word0)
        for word in self.requete:
            if lst == []:
                return []
            lst_aux = self.search_bool_word(word)
            if lst_aux == []:
                return []
            head_lst = lst.pop()
            head_lst_aux = lst_aux.pop()
            lst = self.merge_dif_rec(lst, head_lst, lst_aux, head_lst_aux, [])
        return lst

    def tf_idf(self, doc_id):
        """Return the TF-IDF score of document ``doc_id`` for the current query.

        For every stem of self.requeteFin the term frequency is the number of
        occurrences over the three fields divided by the document's word
        count; it is weighted by log1p(total documents / documents containing
        the stem).  The stems must already have been looked up with
        search_bool_word.
        """
        solution = 0
        doc = self.DB.id2doc[doc_id]
        total_noWords_in_doc = float(doc.nb_word)
        no_docs = float(self.DB.nb_doc_total)
        for word in self.requeteFin:
            word_in_doc = float(
                len(doc.word2pos_list_title[word]) if word in doc.word2pos_list_title else 0
            )
            if word in doc.word2pos_list_first:
                word_in_doc += len(doc.word2pos_list_first[word])
            if word in doc.word2pos_list_body:
                word_in_doc += len(doc.word2pos_list_body[word])
            no_docs_with_word = self.word2nbOccDsDB[word]
            solution += float(word_in_doc / total_noWords_in_doc) * math.log1p(no_docs / no_docs_with_word)
        return solution

    def tf_idf_score(self, listDoc_id):
        """Store the TF-IDF score of every listed document in self.idDoc2tfIdf."""
        for doc_id in listDoc_id:
            self.idDoc2tfIdf[doc_id] = self.tf_idf(doc_id)

    def search_rank_req(self, requete, nbResMax):
        """Run ``requete``, print and return at most ``nbResMax`` doc ids, best first."""
        self.requete = []
        self.requeteFin = []
        # Scores of an earlier query must not leak into this ranking.
        self.idDoc2tfIdf = {}
        self.parse_requete(requete)
        docsTrouves = self.search_bool_req()
        self.tf_idf_score(docsTrouves)
        self.idDoc2tfIdf = OrderedDict(
            sorted(self.idDoc2tfIdf.items(), key=lambda t: t[1], reverse=True))
        keys = list(self.idDoc2tfIdf)[:nbResMax]
        if len(keys) < 1:
            print('Nothing found \n')
        for i, doc in enumerate(keys, 1):
            print(str(i) + '. ' + self.id2docTitle(doc) + 'File: ' + self.id2fileName(doc))
        return keys

    def id2fileName(self, docId):
        """Return the file name of document ``docId`` as a string."""
        return str(self.DB.id2doc[docId].doc_file)

    def id2docTitle(self, docId):
        """Return the full title line of document ``docId`` as a string."""
        return str(self.DB.id2doc[docId].full_title)

    def reset(self):
        """Forget the current query and its scores."""
        self.requete = []
        self.requeteFin = []
        self.idDoc2tfIdf = {}
def lemmatize(self, word):
    """Return the stem of *word* computed by a fresh FrenchStemmer."""
    return FrenchStemmer().stem(word)
def stem_words(self, words):
    """Return the list of FrenchStemmer stems of *words*, in the same order."""
    stem = FrenchStemmer().stem
    return [stem(token) for token in words]
class Level:
    """Compute readability statistics and difficulty scores for a French text.

    The text is read from ``corpus_file``.  When ``word_file`` is given it
    lists the reader's vocabulary, one ``<word> <degree>`` pair per line:
    degree 1 words are matched by stem, all other words by exact form.
    """

    def __init__(self, corpus_file, word_file):
        self.user = bool(word_file)  # True when a vocabulary file was supplied
        self.stemmer = FrenchStemmer()
        self.text_into_sentences = data.load("tokenizers/punkt/french.pickle")
        curr_path = os.path.dirname(os.path.abspath(__file__))
        os.environ['STANFORD_PARSER'] = curr_path + "/stanford-parser-full-2015-04-20"
        os.environ['STANFORD_MODELS'] = curr_path + "/stanford-parser-full-2015-04-20"
        self.parser = stanford.StanfordParser(model_path=curr_path + "/frenchFactored.ser.gz")
        self.stpwords = stopwords.words("french")
        self.read_corpus(corpus_file)
        self.known_words_list = []
        self.ok_words_lis = []
        if self.user:
            self.read_known_words(word_file)
        # Real lists (not lazy map objects) so that repeated membership tests
        # keep working on every Python version.
        self.stemmed_known_words = [self.stemmer.stem(w) for w in self.known_words_list
                                    if w not in self.stpwords]
        self.ok_words_list = [w for w in self.ok_words_lis if w not in self.stpwords]
        self.sentences = []
        self.word_count = 0
        self.syll_count = 0
        self.num_of_sentences = 0
        self.word_per_sentence = 0.0
        self.syllables_per_word = 0.0
        self.num_of_words_with_more_than_six_chars = 0
        self.sixratio = 0.0
        self.num_of_words_with_more_than_three_syll = 0
        self.threeratio = 0.0
        self.depth_sum = 0
        self.avgdepth = 0.0
        self.known_words = 0
        self.knownratio = 0.0
        self.NP_sum = 0
        self.NPratio = 0.0
        self.VP_sum = 0
        self.VPratio = 0.0
        self.SBAR_sum = 0
        self.SBARratio = 0.0
        self.score_with_words = 0.0
        self.score_general = 0.0
        self.flesch_kincaid_score = 0.0

    @staticmethod
    def _ratio(numerator, denominator):
        # Float division that yields 0.0 instead of raising on an empty text.
        return float(numerator) / denominator if denominator else 0.0

    @staticmethod
    def _emit(*items):
        # Print the items separated by single spaces.
        print(" ".join(str(item) for item in items))

    def sentence_stats(self):
        """Split the text into sentences and sum the per-sentence counters."""
        self.sentences = self.text_into_sentences.tokenize(self.text)
        self.num_of_sentences = len(self.sentences)
        per_sentence = [self.count_words_in_a_sentence(s) for s in self.sentences]
        # A text without sentences gives nothing to unpack; every total is 0.
        totals = [sum(column) for column in zip(*per_sentence)] if per_sentence else [0] * 9
        (self.word_count, self.syll_count,
         self.num_of_words_with_more_than_six_chars,
         self.num_of_words_with_more_than_three_syll,
         self.depth_sum, self.NP_sum, self.VP_sum, self.SBAR_sum,
         self.known_words) = totals

    def calculate_stats(self):
        """Derive the averages and ratios from the totals of sentence_stats.

        A ratio whose denominator is zero is reported as 0.0.
        """
        ratio = Level._ratio
        self.word_per_sentence = ratio(self.word_count, self.num_of_sentences)
        self.syllables_per_word = ratio(self.syll_count, self.word_count)
        self.sixratio = ratio(self.num_of_words_with_more_than_six_chars, self.word_count)
        self.threeratio = ratio(self.num_of_words_with_more_than_three_syll, self.word_count)
        self.avgdepth = ratio(self.depth_sum, self.num_of_sentences)
        self.knownratio = ratio(self.known_words, self.word_count)
        self.NPratio = ratio(self.NP_sum, self.word_count)
        self.VPratio = ratio(self.VP_sum, self.word_count)
        self.SBARratio = ratio(self.SBAR_sum, self.word_count)

    def print_stats(self):
        """Print every statistic and score on standard output."""
        emit = Level._emit
        emit("#of sentences:", self.num_of_sentences)
        emit("#of words, #of words per sentence:", self.word_count, " ", self.word_per_sentence)
        emit("#of syllables, #of syllables per word:", self.syll_count, " ", self.syllables_per_word)
        emit("#of words with more than 6 characters, percentage to all words:",
             self.num_of_words_with_more_than_six_chars, " ", self.sixratio)
        emit("#of words with more than 3 syllables, percentage to all words:",
             self.num_of_words_with_more_than_three_syll, " ", self.threeratio)
        emit("average parse tree depth:", self.avgdepth)
        emit("average # of noun phrases:", self.NPratio)
        emit("average # of verb phrases:", self.VPratio)
        emit("average # of SBAR phrases:", self.SBARratio)
        emit("# of known words, percentage to all words:", self.known_words, " ", self.knownratio)
        emit("flesch-kincaid score:", self.flesch_kincaid_score)
        emit("general score without vocabulary:", self.score_general)
        if self.user:
            emit("score with vocabulary:", self.score_with_words)

    def calculate_score(self):
        """Compute the three scores from the ratios of calculate_stats.

        Both custom scores are the mean of the structural ratios, multiplied
        by the average tree depth and divided by flesch_kincaid_score; the
        vocabulary variant adds (1 - knownratio) to the mean.
        """
        self.flesch_kincaid_score = (206.835 - 1.015 * self.word_per_sentence
                                     - 84.6 * self.syllables_per_word)
        structure = (self.SBARratio + self.VPratio + self.NPratio
                     + self.threeratio + self.sixratio)
        self.score_general = (structure / 5) * self.avgdepth / self.flesch_kincaid_score
        self.score_with_words = (((structure + (1 - self.knownratio)) / 6)
                                 * self.avgdepth / self.flesch_kincaid_score)

    def count_words_in_a_sentence(self, sentence):
        """Return the nine counters of one sentence as a tuple.

        Order: words, syllables, words of at least 6 characters, words of at
        least 3 syllables, parse-tree depth, NP count, VN count, CS count,
        known words (0 unless a vocabulary file was supplied).
        """
        known_words = 0
        tokens = word_punckt_tokenizer.tokenize(sentence.lower())
        words = self.normalize_list(tokens)
        word_count = len(words)
        syllables = [Level.syllable_count(word) for word in words]
        syll_count = sum(syllables)
        num_of_words_with_more_than_six_chars = sum(1 for word in words if len(word) >= 6)
        num_of_words_with_more_than_three_syll = sum(1 for count in syllables if count >= 3)
        parse_tree_depth, num_of_NP, num_of_VP, num_of_SBAR = self.tree_stats(sentence)
        if self.user:
            known_words = sum(1 for w in words
                              if w in self.ok_words_list
                              or self.stemmer.stem(w) in self.stemmed_known_words)
        print(sentence)
        print(words)
        return (word_count, syll_count, num_of_words_with_more_than_six_chars,
                num_of_words_with_more_than_three_syll, parse_tree_depth,
                num_of_NP, num_of_VP, num_of_SBAR, known_words)

    def tree_stats(self, sentence):
        """Return (depth, NP count, VN count, CS count) of the sentence's parse tree.

        Best effort: values that could not be computed keep their defaults
        (depth 1, one NP, one VN, no CS).
        """
        depth = 1
        num_of_np = 1
        num_of_vp = 1
        num_of_sbar = 0
        try:
            tree = list(self.parser.raw_parse(sentence))[0]
            labels = [subtree.label() for subtree in tree.subtrees()]
            num_of_np = labels.count('NP')
            num_of_vp = labels.count('VN')
            num_of_sbar = labels.count('CS')
            depth = tree.height()
        except Exception:
            # Deliberately tolerant of parser failures, but no longer swallows
            # KeyboardInterrupt / SystemExit as the bare `except:` did.
            pass
        return (depth, num_of_np, num_of_vp, num_of_sbar)

    def normalize_list(self, token_list):
        """Return the tokens that are neither stop words nor punctuation.

        A token counts as punctuation when all of its characters are in
        string.punctuation, so runs such as "..." are dropped as well.
        """
        punctuation = string.punctuation
        return [w for w in token_list
                if w not in self.stpwords
                and not all(ch in punctuation for ch in w)]

    @staticmethod
    def syllable_count(word):
        """Estimate the number of syllables of *word* from its vowel runs.

        Scanning left to right, each group of 1 to 3 consecutive vowels
        counts as one syllable and a group of 4 as two; longer runs are cut
        into such groups.
        """
        n = len(word)
        num_of_syll = 0
        i = 0
        while i < n:
            if not is_a_vowel(word[i]):
                i += 1
                continue
            run = 1
            while run < 4 and i + run < n and is_a_vowel(word[i + run]):
                run += 1
            num_of_syll += 2 if run == 4 else 1
            i += run
        return num_of_syll

    def read_corpus(self, filename):
        """Read *filename* into self.text, stripped, with the first encoding that decodes it."""
        # NOTE(review): "latin-1" accepts any byte sequence, so the encodings
        # listed after it are never tried.
        encodings = ["utf-8", "latin-1", "windows-1250", "windows-1252", "latin-15", "utf-16", "ascii"]
        for e in encodings:
            try:
                with codecs.open(filename, "r", encoding=e) as fh:
                    self.text = fh.read().strip()
            except UnicodeDecodeError:
                continue
            break

    def read_known_words(self, filename):
        """Load the vocabulary file into known_words_list / ok_words_lis.

        Each non-blank line holds ``<word> <degree>``; degree 1 words go to
        known_words_list, the others to ok_words_lis.  Words are committed
        only once the whole file decoded with one encoding, so a failed
        attempt leaves nothing behind.
        """
        encodings = ["utf-8", "latin-1", "windows-1250", "windows-1252", "latin-15", "ascii", "utf-16"]
        for e in encodings:
            known, acceptable = [], []
            try:
                with codecs.open(filename, "r", encoding=e) as fh:
                    for line in fh:
                        fields = line.split()
                        if not fields:
                            continue  # blank line
                        word, d = fields
                        if int(d) == 1:
                            known.append(word)
                        else:
                            acceptable.append(word)
            except UnicodeDecodeError:
                continue
            self.known_words_list.extend(known)
            self.ok_words_lis.extend(acceptable)
            break