Example #1
    def parse(self, file_path):
        file = open(file_path, "r+")
        #print (type(file.read()))
        soup = BeautifulSoup(file.read(), "html.parser")

        #kill all script and style elements
        for script in soup(["script", "style"]):
            script.extract()  # rip it out

        text = soup.get_text(separator=" ")

        # break into lines and remove leading and trailing space on each
        lines = (line.strip() for line in text.splitlines())
        # break multi-headlines into a line each
        chunks = (phrase.strip() for line in lines
                  for phrase in line.split(" "))
        # drop blank lines
        text = '\n'.join(chunk for chunk in chunks if chunk)

        #convert all str objects to unicode objects (useful when searching for words that are not stop words)
        list_of_words = []
        for word in text.split():
            if isinstance(word, str):
                list_of_words.append(word.decode("utf-8").lower())
            else:
                list_of_words.append(word.lower())

        #get french stop words
        stop_words_french = get_stop_words('fr')
        stemmer = FrenchStemmer()

        #better heuristic: instead of removing characters one by one, drop tokens of length 1
        #Global_stop_words_List=["?",".","!",",","'","|","...",":","–","&","-","€"]+stop_words_french
        Global_stop_words_List = [
            word for word in list_of_words if len(word) == 1
        ] + stop_words_french

        #convert all str objects to unicode objects (useful when searching for words that are not stop words)
        filter_stop_words_list = []
        for word in Global_stop_words_List:
            if isinstance(word, str):
                filter_stop_words_list.append(
                    stemmer.stem(word.decode("utf-8").lower()))
            else:
                filter_stop_words_list.append(stemmer.stem(word.lower()))

        #filter list using stop words and apply stemming operation
        filter_words = []
        for word in list_of_words:
            word = word.lower()
            if not self.isUrl(word):
                list_clean_words = self.cleanWord(word)
                #list_clean_words=[stemmer.stem(cword) for cword in list_of_words]
                for clean_word in list_clean_words:
                    if not (stemmer.stem(clean_word)
                            in filter_stop_words_list):
                        filter_words.append(stemmer.stem(clean_word))

        return filter_words
Example #2
 def stem_words(words):
     '''stems the word list using the French Stemmer'''
     #stemming words
     stemmed_words = []  #declare an empty list to hold our stemmed words
     stemmer = FrenchStemmer()  #create a stemmer object in the FrenchStemmer class
     for word in words:
         stemmed_word = stemmer.stem(stemmer.stem(word))  #stem the word
         stemmed_words.append(stemmed_word)  #add it to our stemmed word list
     stemmed_words.sort()  #sort the stemmed_words
     return stemmed_words
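The snippet above stems each word twice; a quick sketch to check whether the second pass ever changes the result:

from nltk.stem.snowball import FrenchStemmer

stemmer = FrenchStemmer()
for w in ("nationales", "voudrais", "papillon"):
    once = stemmer.stem(w)
    print(w, once, stemmer.stem(once) == once)  # True where a single pass suffices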
Example #3
def tweet_cleaner(text, my_dict, stem=False):

    # fixes encoding problem (MICHELE)
    if sys.version_info[0] < 3:
        if type(text) != unicode and type(text) != float:
            try:
                text = unicode(text, 'utf-8')
            except UnicodeDecodeError:
                text = unicode(text, 'latin-1')
    if type(text) == float:
        text = str(text)

    if stem:
        stemmer = FrenchStemmer()
        tokens = [
            stemmer.stem(t) for t in text.split() if len(stemmer.stem(t)) >= 1
        ]
        text = " ".join(tokens)

    tok = WordPunctTokenizer()

    pat1 = r'@[A-Za-z0-9_]+'
    pat2 = r'https?://[^ ]+'
    combined_pat = r'|'.join((pat1, pat2))
    www_pat = r'www.[^ ]+'
    rt_path = r'^rt'
    paths = [combined_pat, www_pat, rt_path]

    text = unidecode.unidecode(text)
    soup = BeautifulSoup(text, 'lxml')
    souped = soup.get_text()
    lower = souped.lower()

    souped = lower
    for word in my_dict:
        souped = re.sub(word, my_dict[word], souped)

    try:
        bom_removed = souped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except:
        bom_removed = souped

    for path in paths:
        bom_removed = re.sub(path, '', bom_removed.lower())

    letters_only = re.sub("[^a-zA-Z]", " ", bom_removed)

    # The letters_only step two lines above creates unnecessary white space,
    # so tokenize and re-join to strip it out
    words = [x for x in tok.tokenize(letters_only) if len(x) >= 1]
    return (" ".join(words)).strip()
Example #4
 def __init__(self, label = "", role = "", ner = ""):
     self.label = label
     self.role = role
     self.ner = ner
     ##print(repr(label))
     stemmer = FrenchStemmer()
     self.lemma = stemmer.stem(label)
Example #5
def new_dico(file):
	stemmer = FrenchStemmer()
	input_ = "../dico/" + file
	output_ = "dic_with_roots/" + file
	fs=open(input_,'r')
	fd=open(output_,'w')

	k =0
	lines = fs.readlines()
	for line in lines:
		txt = line.split(" ")
		if line.strip() == '':
			break
		for w in txt:
			if(w.istitle()):	
				k = 1
			else:
				k = 0	
			w= w.decode("utf-8")
			w = ''.join(u for u in w if u in string.ascii_letters)
			w=enleve_accents(w)
			w=stemmer.stem(w)+" "
			w = w.encode("utf-8")
			if(k):
				w = w[0].upper() + w[1:]
				fd.write(w)
			else:
				fd.write(w)

	fs.close()
	fd.close()
Example #6
    def preprocess(self, text=None, stem=False, fix_pdf=True):

        if text is None:
            text = self.text

        def fix_pdf2txt(texto):
            import re
            texto = re.sub(r'\n([^A-Z])', r' \1', texto)
            texto = re.sub(r'([^\.])\n', r'\1.\n', texto)
            return texto

        def tokenizer_fr(text):
            # Courtesy of http://www.fabienpoulard.info/post/2008/03/05/Tokenisation-en-mots-avec-NLTK
            # tok_fr is assumed to be a tokenizer object defined at module level
            return tok_fr.tokenize(text)

        # Fix newline problems with pdf to txt step
        if fix_pdf:
            text = fix_pdf2txt(text)

        text = text.lower()

        # Tokenization
        self._original_tokens = tokenizer_fr(text)
        self._tokens = self._original_tokens

        #         self._tokens = [t for t in self._tokens if len(t) > 1]

        if stem:
            from nltk.stem.snowball import FrenchStemmer
            fr_stemmer = FrenchStemmer()
            self._tokens = [fr_stemmer.stem(t) for t in self._tokens]

        self._concordance_index = nltk.ConcordanceIndex(self._tokens, key=lambda s: s)
Example #7
def normalize_text(string):
    """Preprocess text string to return a normalized form of the text.
    """
    if isinstance(string, float):
        return ""
    else:
        # lowering x, removing beginning and ending space
        s = string.strip().lower()

    # removing accents
    s = ''.join((c for c in unicodedata.normalize('NFD', s)
                 if unicodedata.category(c) != 'Mn'))

    # remove punctuation
    s = re.sub("[" + punctuation + "]", " ", s)

    # remove uninformative, stop words and non alpha words
    words_to_remove = [
        "les", "une", "des", "nos", "ils", "elle", "elles", "nan", "null"
    ]
    stop_words = list(stopwords.words("french"))
    remove_list = words_to_remove + stop_words
    s = " ".join([
        word for word in s.split()
        if (word.isalpha() and word not in remove_list and len(word) > 2)
    ])

    # Stemming words and remove duplicates
    stemmer = FrenchStemmer()
    stem_words = [stemmer.stem(w) for w in s.split()]
    s = " ".join(stem_words)

    return s
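A hedged usage sketch for normalize_text, assuming the imports the snippet relies on and that the NLTK stopwords have been downloaded:

import re
import unicodedata
from string import punctuation
from nltk.corpus import stopwords  # requires nltk.download('stopwords')
from nltk.stem.snowball import FrenchStemmer

print(normalize_text("Les Éléphants mangent des feuilles vertes !"))
# prints the lowercased, accent-free, stopword-free, stemmed form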
Example #8
def get_lem(text):
    stemmer = FrenchStemmer()
    text_tokens = word_tokenize(text)
    text = ""
    for word in text_tokens:
        text += " " + stemmer.stem(word)
    return text
Example #9
class HistExtractor:
    def __init__(self):
        self.stemmer = FrenchStemmer()
        self.analyzer = CountVectorizer().build_analyzer()

        self.bad_words = [
            "src", 'html', 'ifram', 'allowtransparency', 'analytic', 'class',
            'com', 'hidden', 'lien', 'lightwidget', 'overflow', 'row',
            'script', 'scrolling', 'src', 'widget', "tous", "jour", "blog",
            'width', 'wrapp', "les", "googl", "propos", "list"
        ]
        self.stopwords = nltk.corpus.stopwords.words('french') + self.bad_words

        def stemmed_words(doc):
            return (self.stemmer.stem(w) for w in self.analyzer(doc)
                    if w not in self.stopwords)

        self.cv = CountVectorizer(analyzer=stemmed_words,
                                  stop_words=self.stopwords)
        # self.cv = CountVectorizer(stop_words=self.stopwords)

    def get_histogram_from_string(self, x):
        hist = self.cv.fit_transform([x])
        dict_result = {
            k: int(v)
            for k, v in zip(self.cv.get_feature_names(),
                            hist.toarray()[0]) if k not in self.bad_words
        }
        return dict_result
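A usage sketch on a toy string; it assumes the NLTK French stopwords are downloaded. Note that CountVectorizer.get_feature_names, used above, was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of get_feature_names_out:

extractor = HistExtractor()
print(extractor.get_histogram_from_string("le petit chat et le petit chien dorment"))
# maps each surviving stem to its count in the string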
Example #10
    def stemmingFR(self):

        ps = FrenchStemmer()
        #Input Document
        Input = open(self.filename, "r")
        elagage = Input.read()

        #Output Document
        Output = open(self.filename + "-out3.txt", "a")

        #Stemming
        for w in elagage.split():
            Output.write(ps.stem(w))
            Output.write("\n")

        Input.close()
        Output.close()

        self.label.configure(text=self.filename)
        self.texte = Entry(self,
                           width=20,
                           font="Arial 14",
                           fg="green",
                           justify='center')
        self.texte.insert(END, "Succès du Stemming")
        self.texte.grid(padx=16, pady=16)

        self.texte = Entry(self,
                           width=50,
                           font="Arial 14",
                           fg="blue",
                           justify='center')
        self.texte.insert(
            END, "Vous trouverez votre fichier résultat sous le même répertoire")
        self.texte.grid(padx=16, pady=16)
Example #11
    def text_stemming(self):
        """
        stem the text
        """
        if self.language == "french":
            stemmer = FrenchStemmer()
        elif self.language == "english":
            stemmer = PorterStemmer()
        elif self.language == "italian":
            stemmer = SnowballStemmer(self.language)
        elif self.language == "german":
            stemmer = SnowballStemmer(self.language)
        elif self.language == "spanish":
            stemmer = SnowballStemmer(self.language)
        elif self.language == "dutch":
            stemmer = SnowballStemmer(self.language)
        elif self.language == "portuguese":
            stemmer = SnowballStemmer(self.language)
        elif self.language == "danish":
            stemmer = SnowballStemmer(self.language)
        elif self.language == "greek":
            stemmer = GreekStemmer()
        elif self.language == "arabic":
            stemmer = ISRIStemmer()
        else:
            raise ValueError(
                "language must be one of: french, english, italian, german, "
                "spanish, dutch, portuguese, danish, greek or arabic")

        self.text = ' '.join(
            [stemmer.stem(word) for word in word_tokenize(self.text)])
Example #12
class FrenchStemTokenizer(object):
    # A French Stemmer Tokenizer
    def __init__(self, stop_words=None, remove_non_words=False):
        self.st = FrenchStemmer()
        if stop_words is None:
            self.stopwords = set(stopwords.words('french'))
        else:
            self.stopwords = stop_words
        self.words = set(words.words())
        self.remove_non_words = remove_non_words

    def __call__(self, doc):
        # tokenize words and punctuation
        word_list = wordpunct_tokenize(doc)
        # remove stopwords
        word_list = [word for word in word_list if word not in self.stopwords]

        # remove non words
        if (self.remove_non_words):
            word_list = [word for word in word_list if word in self.words]
        # remove 1-character words
        word_list = [word for word in word_list if len(word) > 1]
        # remove non alpha
        word_list = [word for word in word_list if word.isalpha()]
        return [self.st.stem(t) for t in word_list]
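A sketch of how a callable tokenizer like this is typically handed to scikit-learn, assuming the NLTK stopwords and words corpora have been downloaded (get_feature_names_out needs scikit-learn 1.0+):

from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(tokenizer=FrenchStemTokenizer())
X = vect.fit_transform(["le chat mange la souris", "les chats mangent les souris"])
print(vect.get_feature_names_out())  # the stemmed vocabulary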
Example #13
def process_text(text, stem=False):
    """ lowercase, removes stopwords, accents and lemmatizes the tokens if stem=True
    used with the df.apply() to create a new column on a dataframe
    """

    text_clean = []
    for sen in text:
        #         sen = unidecode.unidecode(sen.replace("’", " ").replace(","," ").replace("."," ").replace(";"," ").lower())
        sen = unidecode.unidecode(
            sen.replace("’", " ").replace(",", " ").replace(
                ";", " ").lower())  # keep the dots for the date_uniformizer
        sen = sen.replace("/ ", "/")  #some dates are in DD/ MM/ yyyy format
        tokens = sen.split()
        if stem:
            from nltk.stem.snowball import FrenchStemmer
            stemmer = FrenchStemmer()
            tokens_no_stpwrd = [
                stemmer.stem(tok) for tok in tokens if tok not in stop_words
            ]
        else:
            #             tokens_no_stpwrd = [tok for tok in tokens if (tok not in stop_words) & (tok.isalnum())]
            tokens_no_stpwrd = [
                tok for tok in tokens if (tok not in stop_words)
            ]

        no_letters = re.sub(' [a-z] ', " ", " ".join(tokens_no_stpwrd))

        text_clean.append(no_letters)

    return text_clean
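process_text reads a module-level stop_words list that the snippet assumes exists; a usage sketch with a hypothetical minimal stand-in:

import re
import unidecode

stop_words = ["le", "la", "les", "de"]  # hypothetical stand-in for the real list

print(process_text(["Le dossier a été déposé le 01/ 02/ 2020."]))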
Example #14
def convert_computing_input_to_dictionnary_input(datas):
    # Instantiation of the Tokenizer
    tokenizer = WordPunctTokenizer()
    # Instantiate Stemmer
    stemmer = FrenchStemmer()
    # Load french StopWords
    french_stopwords = set(stopwords.words('french'))

    matchingTable = []
    tokenTable = []  # Each row is [id, tokens]: 'id' of the advert and 'tokens' the list of tokens in the advert
    i = 0
    for row in datas:
        id = row[0]
        desc = row[2]
        # Get tokens for this row
        tokens = tokenizer.tokenize(str(desc[0]))
        # Filter tokens to remove punctuation
        regex = re.compile(r'\w+')
        tokens = filter(regex.search, tokens)
        # Filter tokens to remove stopwords and convert tokens to their stemm
        tokens = [
            stemmer.stem(token) for token in tokens
            if token.lower() not in french_stopwords
        ]
        # Remove duplicate entries
        tokens = list(set(tokens))
        # Sort tokens
        tokens.sort()
        # Construct the new row with only the id and the list of tokens
        row = [id, tokens]

        # Add the new Row to the global table
        tokenTable.append(row)
        i += 1

    # Construct the vector for each advert
    rowCol = 0
    rowCols = []
    tabCols = []
    rowInd = 0
    initialRow = np.zeros(len(tokenDictionnary))

    #
    # # Here we transform each row of tokens into row of 0|1 corresponding array, matching the tokenDictionnary
    #
    # tokenTable[1:] to skip the title row, because the original file has a title row
    for row in tokenTable[1:]:
        id = row[0].split(";")[0]
        advertVec = np.zeros(len(tokenDictionnary))
        rowCols = []
        for elm in row[1]:
            # set the component only when the token exists in the dictionary;
            # indexing with -1 would wrongly overwrite the last component
            if elm in tokenDictionnary:
                advertVec[tokenDictionnary.index(elm)] = 1

        composed_row = [id, advertVec]
        matchingTable.append(composed_row)
        rowInd += 1
    return tokenTable, matchingTable
Example #16
def is_french_adjr(word): # TODO change adjr tests
  stemmer = FrenchStemmer()
  # suffixes with gender and number flexions
  suffixes = [
    u"ain", u"ains", u"aine", u"aines",
    u"aire", u"aires",
    u"al", u"aux", u"als", u"ale", u"ales",
    u"el", u"els", u"elle", u"elles",
    u"esque", u"esques",
    u"estre", u"estres",
    u"eux", u"euse", u"euses",
    u"é", u"és", u"ée", u"ées",
    u"ien", u"iens", u"ienne", u"iennes",
    u"ier", u"iers", u"ière", u"ières",
    u"if", u"ifs", u"ive", u"ives",
    u"il", u"ils",
    u"in", u"ins", u"ine", u"ines",
    u"ique", u"iques",
    u"ois", u"oise", u"oises"
  ]
  stem = stemmer.stem(word)
  stem_ending = ""
  if word.replace(u"é", "e").replace(u"è", "e").startswith(stem.replace(u"é", "e").replace(u"è", "e")):
    stem_ending = word.replace(u"é", "e").replace(u"è", "e").split(stem.replace(u"é", "e").replace(u"è", "e"), 1)[1]

  if stem in french_stemmed_adjr:
    return True
  for suffix in suffixes:
    if word[-len(suffix):] == suffix:
      return True
  # TODO change adjr tests
  #if stem_ending in french_adjr_stem_ending_counts:
  #  return True
  return False
Example #17
    def getListStopwords(self):
        stemmer = FrenchStemmer()
        file = open(self.path, "r+")
        list_words = []
        res = []
        indice = 0
        #The file contains only a single line
        for line in file:
            list_words = line.split(',')

        for word in list_words:
            if isinstance(word, str):
                word = stemmer.stem(word.decode("utf-8").lower())
            else:
                word = stemmer.stem(word.lower())
            res.append(word)
        return res
Example #18
def stem(sentence):
    # Instance Stemmer
    stemmer = FrenchStemmer()
    stem = ''
    for word in nltk.word_tokenize(sentence):
        stem += ' ' + stemmer.stem(word)
    #print(stem)  # for debugging
    return stem
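A quick usage sketch (nltk.word_tokenize needs the punkt model, or punkt_tab on recent NLTK releases):

import nltk
nltk.download('punkt', quiet=True)  # tokenizer model behind nltk.word_tokenize
print(stem("Les papillons voltigeaient dans la nuit"))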
Example #19
def racinize_all_concept(concept):
    concept_tiers = []
    stemmer = FrenchStemmer()
    for i in range(0, len(concept)):
        temp = concept[i][0].lower()
        temp2 = stemmer.stem(temp)
        concept_tiers.append((temp2, concept[i][1]))
    return concept_tiers
Example #20
 def stemArticle(self, doc):
     stemmer_fr = FrenchStemmer()
     stemmer_en = EnglishStemmer()
     
     stemmedArticle = [str(stemmer_fr.stem(w)) for w in doc]
     stemmedArticle = [str(stemmer_en.stem(w)) for w in stemmedArticle]   
     
     return stemmedArticle
Example #21
def lemmatize(token):
    """Lemmatize word using a french lemmatizer
    
    Arguments:
        token {string} -- token to lemmatize
    """
    stemmer = FrenchStemmer()
    return stemmer.stem(token)
Example #22
def preprocess(text):
    result = []
    stopwords = get_stopswords()
    stemmer = FrenchStemmer()
    for token in simple_preprocess(text):
        if token not in stopwords and len(token) > 3:
            result.append(stemmer.stem(token))
    return result
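simple_preprocess here is presumably gensim.utils.simple_preprocess; a usage sketch with a hypothetical stand-in for the snippet's get_stopswords helper:

from gensim.utils import simple_preprocess

def get_stopswords():
    # hypothetical stand-in for the snippet's stopword loader
    return {"avec", "dans", "pour", "cette"}

print(preprocess("Les papillons voltigent dans la nuit noire"))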
Example #23
def stem_words(words):
    stemmed_words = []
    stemmer = FrenchStemmer()
    for word in words:
        stemmed_word = stemmer.stem(word)
        stemmed_words.append(stemmed_word)
    stemmed_words.sort()
    return stemmed_words
Example #24
def computeScores(title,desc,cat):
    if title == "":
        messagebox.showwarning("Empty attribute","Empty title. Title Score automatically set to 0.")
        titlescore = 0
    else:
        titlescore = computetitleScore(title)
        print(titlescore)

    if desc == "":
        messagebox.showwarning("Empty attribute","Empty description. Description and Category Score automatically set to 0.")
        descscore = 0
        links = 0
        categoryScore = 0
    else:
        urls = re.findall(regexURL,desc)
        mails = re.findall(regexMail,desc)
        words = nltk.word_tokenize(re.sub('\W+',' ',re.sub(regexMail,'',re.sub(regexURL,'', desc))))
        words = [''.join([i for i in word if not i.isdigit()]) for word in words]
        stemmer = FrenchStemmer()
        # read the stopword and dictionary files once instead of on every iteration
        french_words = set(line.strip() for line in open('french'))
        dict_words = set(line.strip() for line in open('words'))
        newwords = []
        for word in words:
            if word.lower() not in french_words and len(word) > 1:
                # dictionary words are appended twice, doubling their weight
                if word.lower() in dict_words:
                    newwords.append(unidecode.unidecode(stemmer.stem(word.lower())))
                newwords.append(unidecode.unidecode(stemmer.stem(word.lower())))
        links = len(urls) + len(mails)
        descscore = computedescScore(newwords)
        if cat is None:
            categoryScore = 0
        else:
            categoryScore = computecatScore(newwords, cat)
    totScore = 7*descscore+categoryScore+titlescore
    if links > 0:
        totScore+=1
    if totScore > 10:
        totScore=10

    message = "The total score is "+str(round(totScore,2))+"\nThe description score is "+str(round(descscore,2))+"\nThe title score is "+str(titlescore)+"\nThe classification score is "+str(categoryScore)+"\n"
    if descscore < 0.4:
        message+="You should improve the description\n"
    if titlescore < 1:
        message+="You should improve the title\n"
    if categoryScore < 0.5:
        message+="The description should be more explcit on the category"
    messagebox.showinfo("Results",message)
Example #25
    def runSearch(self, list_keywords, termScoreMethod, documentScoreMethod):
        stemmer = FrenchStemmer()
        stop_words_french = get_stop_words('fr')
        stop_words_french = [
            stemmer.stem(word.lower()) for word in stop_words_french
        ]

        list_keywords_split = []

        for word, coef in list_keywords:
            wordList = word.split()
            for w in wordList:
                #split the coefficient by the number of words produced by the split,
                #so the weight is redistributed over the new words
                newCoef = float(coef) / float(len(wordList))
                list_keywords_split.append((w, newCoef))

        list_of_words_request = []
        for word, coef in list_keywords_split:
            #print word
            if isinstance(word, str):
                word = word.decode("utf-8").lower()
            else:
                word = word.lower()
            #print "*************",stemmer.stem(word)
            #if (word in stop_words_french)==False:
            list_of_words_request.append((stemmer.stem(word), coef))

        #for elt in stop_words_french:
        #	print "******************",elt

        # by default, documents score are equal to zero
        scoreNameDoc = [(0, "D" + str(i + 1) + ".html") for i in range(138)]
        nb_doc_collection = 138

        #scoring all documents,
        for idDoc in range(138):
            if idDoc + 1 != 127:
                score = self.computeDocumentScore(idDoc + 1,
                                                  list_of_words_request,
                                                  termScoreMethod,
                                                  documentScoreMethod)
                scoreNameDoc[idDoc] = (score, scoreNameDoc[idDoc][1])
        scoreNameDoc.sort(key=lambda tup: tup[0])
        return scoreNameDoc[::-1]
Example #26
def stemWords(listWords):
    '''stemming (racinisation)'''
    stemmedWords = list()
    stemmer = FrenchStemmer()
    for word in removeStopwords(listWords):
        stemmedWord = stemmer.stem(word)
        stemmedWords.append(stemmedWord)
    stemmedWords.sort()
    return stemmedWords
Example #27
class FrenchLemmaTokenizer(object):
    """
    Basically a function with shared memory between calls: the FrenchStemmer instance (self.wnl)
    """
    def __init__(self):
        self.wnl = FrenchStemmer()

    def __call__(self, s):
        return [self.wnl.stem(t) for t in word_tokenize(s) if t.isalpha()]
Example #28
def stemming_Function(filtered_words):
    stemmed_words = []  #declare an empty list to hold our stemmed words
    stemmer = FrenchStemmer()  #create a stemmer object in the FrenchStemmer class
    for word in filtered_words:
        stemmed_word = stemmer.stem(word)  #stem the word
        stemmed_words.append(stemmed_word)  #add it to our stemmed word list

    freqdist = nltk.FreqDist(stemmed_words)
    return freqdist
Example #29
def racinize_all_negationeur(negationeur):  # stemming is used to handle word variations
    nega_tiers = []
    stemmer = FrenchStemmer()
    for i in range(0, len(negationeur)):
        temp = negationeur[i].lower()
        temp2 = stemmer.stem(temp)
        nega_tiers.append(temp2)
    return nega_tiers
Example #30
class Preprocessing:
    def __init__(self, data_loader: DataLoader):
        self.tokenizer = nltk.RegexpTokenizer(r'\w+')
        self.stemmer = FrenchStemmer()
        self.get_stop_words()
        
        # TODO compose it
        inputs: InputData = data_loader.load()
        data = inputs.examples
        self.responses = inputs.responses
        data = self.to_lower_case_all(data)
        data = self.tokenize_all_examples(data)
        data = self.remove_stop_words_for_all(data)
        data = self.lemmatize_all(data)
        self.data = data
        
    def get_stop_words(self):
        nltk.download('stopwords')  # the French list ships with the 'stopwords' resource
        self.stop_words = nltk.corpus.stopwords.words('french')
        dump_stop_words(self.stop_words)

    def to_lower_case_one(self, example: str):
        return example.lower()
        
    def to_lower_case_all(self, data):
        to_lower_case = lambda examples: [self.to_lower_case_one(ex) for ex in examples]
        return utils.apply_for_each_key(data, to_lower_case)
    
    def tokenize_one_example(self, example):
        return self.tokenizer.tokenize(example)
    
    def tokenize_all_examples(self, data):
        tokenize = lambda data_list: [self.tokenize_one_example(d) for d in data_list]
        return utils.apply_for_each_key(data, tokenize)
    
    def remove_stop_words(self, example):
        return [w for w in example if w not in self.stop_words]
    
    def remove_stop_words_for_all(self, data):
        remove_sw = lambda examples: [self.remove_stop_words(ex) for ex in examples]
        return utils.apply_for_each_key(data, remove_sw)
    
    def lemmatize(self, example):
        return [self.stemmer.stem(w) for w in example]
    
    def lemmatize_all(self, data):
        get_lems = lambda examples: [self.lemmatize(ex) for ex in examples]
        return utils.apply_for_each_key(data, get_lems)
    
    def process_sentence(self, sentence):
        # TODO compose
        data = self.to_lower_case_one(sentence)
        data = self.tokenize_one_example(data)
        data = self.remove_stop_words(data)
        data = self.lemmatize(data)
        return data
Example #31
def stem_words(words):
    '''stems the word list using the French Stemmer'''
    #stemming words
    stemmed_words = [] #declare an empty list to hold our stemmed words
    stemmer = FrenchStemmer() #create a stemmer object in the FrenchStemmer class
    for word in words:
        stemmed_word = stemmer.stem(word) #stem the word
        stemmed_words.append(stemmed_word) #add it to our stemmed word list
    stemmed_words.sort() #sort the stemmed_words
    return stemmed_words
Example #32
def stem_words(words):
    
    #stemming words
    stemmed_words = []
    stemmer = FrenchStemmer()  #create a stemmer object
                               #from the FrenchStemmer class
    for word in words:
        stemmed_word = stemmer.stem(word)  #stem the word
        stemmed_words.append(stemmed_word)
    return stemmed_words
Example #33
    def nettoyage(self, document):
        """
        function to clean the dataset + stemming
        """
        document = self.tokenize(document)
        document = [word.lower() for word in document if len(word) > 2 and not word.isnumeric() and word not in self.stop_words]

        if self.stem:
            stem = FrenchStemmer()
            document = [stem.stem(word) for word in document]
        return document
Example #34
def lemmatize_or_stem(language, terms):
	if language in ('spanish', 'portugese', 'italian') or (language == 'english' and ENGLISH_FREELING): #TEMPORARY: EXPERIMENTING WITH ENGLISH FREELING
		# Use FreeLing (french falls through to the NLTK stemmer below)
		if language == 'spanish':
			analyzeProcess = subprocess.Popen(["analyze", "-f", "/usr/local/share/freeling/config/es.cfg"], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
		elif language == 'portugese':
			analyzeProcess = subprocess.Popen(["analyze", "-f", "/usr/local/share/freeling/config/pt.cfg"], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
		elif language == 'italian':
			analyzeProcess = subprocess.Popen(["analyze", "-f", "/usr/local/share/freeling/config/it.cfg"], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
		elif language == 'english':
			analyzeProcess = subprocess.Popen(["analyze", "-f", "/usr/local/share/freeling/config/en.cfg"], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
		terms = map(lambda term: term.encode('utf-8'), terms)
		analyzeProcess.stdin.write(' '.join(terms))
		stdout, stderr = analyzeProcess.communicate()
		# Parse FreeLing output
		# Lemma is always second word of each line.
		terms = list()
		lines = stdout.split('\n')
		for line in lines:
			items = line.split(' ')
			if len(items) == 4:
				lemma = items[1]
				tag = items[2]
				"""ATTN: TAGSET IS DIFFERENT IN SPANISH AND ENGLISH. However, NP, F, Z, and W
				all mean the same thing in both tagsets."""
				# remove proper nouns, punctuation, numbers, and dates/times
				if not (tag[0:2]=='NP' or tag[0] == 'F' or tag[0] == 'Z' or tag[0] == 'W' or tag[0:3] == 'POS'):
					# if english, need to remove numbers
					include = True
					for num in NUMBERS:
						if num in lemma:
							include = False
					if include:
						terms.append(lemma)
		terms = map(lambda term: term.decode('utf-8'), terms)

	elif (language == 'english' and not ENGLISH_FREELING):
	 	lem = WordNetLemmatizer()
	 	terms = map(lambda term: lem.lemmatize(term), terms )
	elif language == 'french':
		from nltk.stem.snowball import FrenchStemmer
		stemmer = FrenchStemmer()
		terms = map(lambda term: stemmer.stem(term), terms)
	
		terms = map(lambda term: term.decode('utf-8'), terms)
	return terms
Example #35
from nltk.corpus import stopwords
import nltk
# load the French stopwords
french_stopwords = set(stopwords.words('french'))
print french_stopwords

chars = ['.', '/', "'", '"', '?', '!', '#', '$', '%', '^', '&','*',\
 '(', ')', ' - ', '_', '+' ,'=', '@', ':', '\\', ',',';', '~', '`', '<',\
 '>', '|', '[', ']', '{', '}', '–', '“','»', '«', '°', '’', '--',\
 '</div>','<div','class','class="tt14-prodpres-txt' ,'tt14-prodpres-res">','<b>','</b>']



from nltk.stem.snowball import FrenchStemmer
stemmer = FrenchStemmer()
stemmer.stem('voudrais')

def cleaningDocumentList(documentList):
    #remove French stopwords
    texts = [[word for word  in document.lower().replace("'",'').split() if word not in french_stopwords] for document in documentList]
    #remove special catalogue characters
    texts = [[word for word  in text if word not in chars] for text in texts]
    # remove French accents
    texts = [[unicodedata.normalize('NFD',unicode(word,'utf-8')).encode('ascii', 'ignore') for word  in text] for text in texts]
    #remove general special characters 
    texts = [[re.sub(r'[. ?!=+ & | , " :; ⋆ $ %()<> &\[\]/_]',r'',word) for word in text] for text in texts]
    #remove small words
    texts = [[re.sub(r'\b\w{1,3}\b', '', word) for word in text] for text in texts]
    #Stemmer
    texts = [[stemmer.stem(word) for word in text] for text in texts]
    #remove empty string
Example #36
class DirectTranslate:
  """Word-by-word direct translator.
  
  Usage:
  translator = DirectTranslate(translation_dict)
  for sentence in file:
    print translator.translate(sentence, delims=",' ", remove='')
  """
  def __init__(self, translation_dict, lemmatized=False):
    self.english_lemmatizer = WordNetLemmatizer()
    self.french_stemmer = FrenchStemmer()
    if not lemmatized:
      stemmed_dict = self._get_lemmatized_dict(translation_dict)
    else:
      stemmed_dict = translation_dict  # already lemmatized; use as-is
    self.stemmed_dict = stemmed_dict
    self.translation_dict = translation_dict
    
  def _get_lemmatized_dict(self, dict):
    result = {}
    for french_word, english_translation_list in dict.iteritems():
      french_stem = self.french_stemmer.stem(french_word)
      english_translations = [
        self.english_lemmatizer.lemmatize(word) for word in english_translation_list
      ]
      # NOTE: This may or may not be the best strategy.  If two distinct
      # French words in the initial dict have the same stem,
      # it appends the two lists of translations.
      # TODO: Reconsider.
      # TODO: Consider removing duplicates from this new list.  But need to preserve order.
      if french_stem not in result:
        result[french_stem] = english_translations
      else:
        result[french_stem].extend(english_translations)
    return result

  def _get_preprocessed_sentence(self, french_sentence):
    """Apply any preprossing rules here.
    Args:
      french_sentence: string; the sentence in french
    
    Returns:
      The sentence with all preprocessing rules applied.
    """
    return unicode(french_sentence)

  def _get_postprocessed_sentence(self, english_sentence):
    """Apply any postproccessing rules here.
    Args: 
      english_sentence: string; an english sentence
    
    Returns:
      The sentence with all postprocessing rules applied.
    """
    return english_sentence

  # TODO: Add code to keep commas.  Translate them into a word.
  def translate(self, sentence, delims=",' ", remove=''):
    sentence = self._get_preprocessed_sentence(sentence)
    tokens = TranslateUtils.get_list_of_words(sentence, delims, remove)
    translated_list = []
    for token in tokens:
      stemmed_token = self.french_stemmer.stem(token).lower()
      if stemmed_token in self.stemmed_dict:
        possible_translations = self.stemmed_dict[stemmed_token]
        if possible_translations:
          # Use first translation in the list
          translation = possible_translations[0]
          translated_list.append(translation)
      elif token in self.translation_dict:
        possible_translations = self.translation_dict[token]
        if possible_translations:
          # Use first translation in the list
          translation = possible_translations[0]
          translated_list.append(translation)
    translation = ' '.join(translated_list)
    translation = self._get_postprocessed_sentence(translation)
    return translation
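A toy Python 2 usage sketch with a hypothetical French-to-English dictionary; TranslateUtils.get_list_of_words and the WordNet data are assumed to be available from the snippet's own environment:

translation_dict = {u'chat': ['cat'], u'noir': ['black'], u'manger': ['to eat']}
translator = DirectTranslate(translation_dict)
print translator.translate(u'le chat noir mange')  # word-by-word, via stem lookup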
Example #37
class Search_engine:
	"""
		--Moteur de recherche--

		DB_file = fichier contenant la base de donnee
			si mode = build
				la base de donnee construite sera dumpee sur DB_file
			si mode = search
				la base de donnee sera recupere depuis DB_file

		doc_files = liste de documents bruts a integrer a la base de donnee

		DB = base de donnee de la classe Data_base
	"""

	def __init__(self, mode='build', DB_file=None, doc_files=None, trace=False):
		self.mode = mode
		self.DB_file = DB_file
		self.doc_list = []
		doc_to_read=[]
		for root, dirs, files in os.walk(doc_files, topdown=False):
			for file_name in files: 
				doc_to_read.append(os.path.join(root, file_name.encode('utf-8')))
		for doc_file in doc_to_read :
			doc = Doc(doc_file)
			self.doc_list.append(doc)
		self.trace = trace
		self.requete= []
		self.DB = Data_Base()
		self.stemmer=FrenchStemmer()

		if mode == 'build' :
			#build the database, then dump it to DB_file
			print 'Building Data Base...'
			self.build_DB()
			#print self.DB
		elif mode == 'search' :
			#load the database
			self.load_DB()
		print self.DB.word2Word_struct
		
	def build_DB(self):
		"""
			fills self.DB with the documents from self.doc_files
		"""
		#TODO
		for doc in self.doc_list:
				self.DB.add_doc(doc)
		print self.DB.nb_doc_total
		#print self.DB.id2nbword
		self.dump_DB()

	def load_DB(self):
		"""
			loads the contents of the file self.DB_file into self.DB
		"""
		stream = open(self.DB_file)
		self.DB = pickle.load(stream)
		stream.close()
		return

	def dump_DB(self):
		"""
			dumps the contents of self.DB into the file self.DB_file
		"""
		print 'Dump data base....'
		stream = open(self.DB_file, 'w')
		pickle.dump(self.DB, stream)
		stream.close()
		#return 
	
	def parse_requete(self, requete):
		"""
				parse la requete introduite par l'utilisateur et produit une liste de tokens
			"""
		req_list= re.findall( '\w+', requete)
		for word in req_list :
			#print 'avant', word
			word = self.stemmer.stem(word.decode('utf-8'))
			self.requete.append(word)
			#print 'apres', word
		#print "requete (parse) :"
		#for word in self.requete :
			#print word
		#return 
		
	def fuse_lst_rec(self,title_lst,title_head,first_lst,first_head,body_lst,body_head,acc):
		if acc == [] :
			acc.append(-1)
		m = max(title_head,first_head,body_head)
		title_head_aux = title_head
		first_head_aux = first_head
		body_head_aux = body_head		
		if m == -1 :
			acc.reverse()
			acc.pop()
			return acc
		if m == title_head_aux :
			if title_lst != [] :
				title_head_aux = title_lst.pop()
			else :
				title_head_aux = -1
		if m == first_head_aux :
			if first_lst != [] :
				first_head_aux = first_lst.pop()
			else :
				first_head_aux = -1
		if m == body_head_aux :
			if body_lst != [] :
				body_head_aux = body_lst.pop()
			else :
				body_head_aux = -1
		h = acc.pop()
		if h != m :
			acc.append(h)
			acc.append(m)
		else :
			acc.append(h)
		return self.fuse_lst_rec(title_lst,title_head_aux,first_lst,first_head_aux,body_lst,body_head_aux,acc)
		
	def merge_dif_rec(self,lst1,head1,lst2,head2,acc):
		if acc == [] :
			acc.append(-1)
		head1_aux = head1
		head2_aux = head2
		if head1_aux == head2_aux :
			acc.append(head1_aux)
			if lst1 == [] or lst2 == [] :
				acc.reverse()
				acc.pop()
				return acc				
			else :
				head1_aux = lst1.pop()
				head2_aux = lst2.pop()
		if head1_aux > head2_aux :
			if lst1 == [] :
				acc.reverse()
				acc.pop()
				return acc
			else :
				head1_aux = lst1.pop()
		else :
			if lst2 == [] :
				acc.reverse()
				acc.pop()
				return acc
			else :
				head2_aux = lst2.pop()
		return self.merge_dif_rec(lst1,head1_aux,lst2,head2_aux,acc)
					
	def search_bool_word(self,word):
		title_lst = []
		title_head = -1
		first_lst = []
		first_head = -1
		body_lst = []
		body_head = -1
		print "searching ", word
		if word in self.DB.word2Word_struct:
		  print "YES"
		  print self.DB.word2Word_struct[word].body
		#word=self.stemmer.stem(word.decode('utf-8'))
		for doc_id in self.DB.word2Word_struct[word].title :
			print "title" , str(doc_id.doc_id)
			title_lst.append(doc_id.doc_id)
		for doc_id in self.DB.word2Word_struct[word].first :
			print "first" , str(doc_id.doc_id)
			first_lst.append(doc_id.doc_id)
		for doc_id in self.DB.word2Word_struct[word].body :
			print "body" , str(doc_id.doc_id)
			body_lst.append(doc_id.doc_id)
		if title_lst != [] :
			title_head = title_lst.pop()
		if first_lst != [] :
			first_head = first_lst.pop()
		if body_lst != [] :
			body_head = body_lst.pop()
		return self.fuse_lst_rec(title_lst,title_head,first_lst,first_head,body_lst,body_head,[])
		
	def search_bool_req(self):
		#print "requete (search) :"
		#for word in self.requete :
			###print word
		if self.requete == [] :
			return []
		#TODO add a function to sort the words by increasing document count
		word0 = self.requete.pop()
		#print "word (search) :",word0
		lst = self.search_bool_word(word0)
		for word in self.requete :
			#print "word (search) :",word
			# word=self.stemmer.stem(word.decode('utf-8'))
			if lst == [] :
				return []
			lst_aux = self.search_bool_word(word)
			if lst_aux == [] :
				return []
			head_lst = lst.pop()
			head_lst_aux = lst_aux.pop()
			lst = self.merge_dif_rec(lst,head_lst,lst_aux,head_lst_aux,[])
		#print lst
		return lst
		
	def search_rank_req(self):
		#TODO
		return []
Example #38
def stemm(word):
	stemmer = FrenchStemmer()
	return stemmer.stem(word)
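A one-line usage check:

print(stemm("voudrais"))  # likely prints "voudr"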
Example #39
	def read_doc(self, docfile):
		"""
			reads the document from the file docfile and fills each field's
			dictionary of position lists with the document's tokens; also
			counts the number of words
		"""
		stemmer = FrenchStemmer()
		flux = open(docfile)
		line = flux.readline()
		position = 0
		title = True
		first = True
		while line != '':
			liste = line.split()
			if title == True and len(liste) > 0:  # fill the title dictionary
				self.full_title = line
				title = False
				for each in liste:
					each = each.lower()
					if '\'' in each:
						strings = self.splitAccent(each)
						strings[0] += '\''
						self.nb_word += len(strings)
						for word in strings:
							word = stemmer.stem(word.decode('iso-8859-1'))
							if word not in self.word2pos_list_title:
								self.word2pos_list_title[word] = []
							self.word2pos_list_title[word].append(position)
							position += 1
					else:
						self.nb_word += 1
						each = stemmer.stem(each.decode('iso-8859-1'))
						if each not in self.word2pos_list_title:
							self.word2pos_list_title[each] = []
						self.word2pos_list_title[each].append(position)
						position += 1
				line = flux.readline()
				liste = line.split()
			if first == True and title == False and liste != []:  # fill the first-paragraph dictionary
				first = False
				for each in liste:
					each = each.lower()
					if '\'' in each:
						strings = self.splitAccent(each)
						strings[0] += '\''
						self.nb_word += len(strings)
						for word in strings:
							word = stemmer.stem(word.decode('iso-8859-1'))
							if word not in self.word2pos_list_first:
								self.word2pos_list_first[word] = []
							self.word2pos_list_first[word].append(position)
							position += 1
					else:
						self.nb_word += 1
						each = stemmer.stem(each.decode('iso-8859-1'))
						if each not in self.word2pos_list_first:
							self.word2pos_list_first[each] = []
						self.word2pos_list_first[each].append(position)
						position += 1
				line = flux.readline()
				liste = line.split()
			if first == False and title == False and liste != []:  # fill the body dictionary
				for each in liste:
					each = each.lower()
					if '\'' in each:
						strings = self.splitAccent(each)
						strings[0] += '\''
						self.nb_word += len(strings)
						for word in strings:
							word = stemmer.stem(word.decode('iso-8859-1'))
							if word not in self.word2pos_list_body:
								self.word2pos_list_body[word] = []
							self.word2pos_list_body[word].append(position)
							position += 1
					else:
						self.nb_word += 1
						each = stemmer.stem(each.decode('iso-8859-1'))
						if each not in self.word2pos_list_body:
							self.word2pos_list_body[each] = []
						self.word2pos_list_body[each].append(position)
						position += 1
			line = flux.readline()
Example #40
urllib.urlretrieve(url, "FrenchDictionary.txt")
 
 
stemmer = FrenchStemmer()

fs = open("FrenchDictionary.txt", 'r')
fd = open("StemmedFrenchDictionary.txt", 'w')
while 1:
	txt = fs.readline()
	if txt =='':
		break
	if txt.strip() != '':
	
		txt = txt.lower()
		txt = ''.join(u for u in txt if u in string.ascii_letters)
		txt = Enleve_Accents(txt)
		txt = stemmer.stem(txt) + "\n"
		fd.write(txt)
import os
script = """
(cat StemmedFrenchDictionary.txt|uniq>monfichier.tmp) &&  mv -f monfichier.tmp FinalDictionary.txt
"""
os.system("bash -c '%s'" % script)



fs.close()
fd.close()


Example #41
logging.info("running %s" % " ".join(sys.argv))

# Test string #
string = 'Jadis, une nuit, je fus un papillon, voltigeant, content de son sort. Puis, je m’éveillai, étant Tchouang-tseu. Qui suis-je en réalité ? Un papillon qui rêve qu’il est Tchouang-tseu ou Tchouang qui s’imagine qu’il fut papillon ?'

# Have fun with tokenizers
tokenizer1 = nltk.data.load('tokenizers/punkt/french.pickle')
tokenizer2 = TreebankWordTokenizer()
french_stopwords = set(stopwords.words('french'))
stemmer = FrenchStemmer()

# See results
tokens1 = tokenizer1.tokenize(string)
tokens2 = tokenizer2.tokenize(string)
tokens3 = [token.encode('utf-8') for token in tokens2 if token.lower() not in french_stopwords]
tokens4 = [stemmer.stem(token.decode('utf-8')) for token in tokens3]


# Build class to add stemming to the pipeline

class StemmedCountVectorizer(CountVectorizer):

	def build_analyzer(self):
		analyzer = super(StemmedCountVectorizer, self).build_analyzer()
		return lambda doc:(stemmer.stem(w) for w in analyzer(doc))

analyzer = CountVectorizer().build_analyzer()
stem_vectorizer = StemmedCountVectorizer()  # the class uses the module-level stemmer

def stemming(doc):
	return (stemmer.stem(w) for w in analyzer(doc))
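A usage sketch for the stemmed vectorizer on a hypothetical two-document corpus:

docs = [u"le chat mange la souris", u"les chats mangent les souris"]
X = stem_vectorizer.fit_transform(docs)
print(X.shape)  # (2, number of distinct stems in the corpus)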
Example #42
class Search_engine:
	"""
		--Moteur de recherche--

		DB_file = fichier contenant la base de donnee
			si mode = build
				la base de donnee construite sera dumpee sur DB_file
			si mode = search
				la base de donnee sera recupere depuis DB_file

		doc_files = liste de documents bruts a integrer a la base de donnee

		DB = base de donnee de la classe Data_base
	"""

	def __init__(self, mode='build', DB_file=None, doc_files=None, trace=False):
		self.mode = mode
		self.DB_file = DB_file
		self.doc_list = []
		self.trace = trace
		self.requete= []
		self.DB = Data_Base()
		self.stemmer=FrenchStemmer()
		self.requeteFin=[]
		self.idDoc2tfIdf={}

		if mode == 'build' :
			#build the database, then dump it to DB_file
			print 'Building Data Base...'
			self.build_DB(doc_files)
			print 'Building completed'
		elif mode == 'search' :
			#load the database
			self.load_DB()
		self.word2nbOccDsDB={}
		
	def build_DB(self, doc_files):
		"""
			fills self.DB with the documents from self.doc_files
		"""
		compteur=0
		doc_name=doc_files+'doc_'+str(compteur)+'.txt'
		while os.path.exists(doc_name):
		  doc=Doc(doc_name)
		  self.DB.add_doc(doc)
		  compteur+=1
		  doc_name=doc_files+'doc_'+str(compteur)+'.txt'
		print "Number of documents in the Data Base: ", self.DB.nb_doc_total
		#print self.DB.id2nbword
		self.dump_DB()

	def load_DB(self):
		"""
			loads the contents of the file self.DB_file into self.DB
		"""
		print 'Loading Data Base...'
		stream = open(self.DB_file)
		self.DB = cPickle.load(stream)
		stream.close()
		print "Number of documents in the Data Base: ", self.DB.nb_doc_total
		print 'Loading completed'
		return

	def dump_DB(self):
		"""
			dumps the contents of self.DB into the file self.DB_file
		"""
		print 'Dumping Data Base...'
		p=cPickle.Pickler(open(self.DB_file, 'wb'))
		p.fast=True
		p.dump(self.DB)
		print 'Dumping completed'
		#stream.close()
		#return 
	
	def parse_requete(self, requete):
		"""
				parse la requete introduite par l'utilisateur et produit une liste de tokens
			"""
		req_list= re.findall( '\w+', requete)
		for word in req_list :
			word = self.stemmer.stem(word.decode('utf-8'))
			self.requete.append(word)
			self.requeteFin.append(word)
		
	def fuse_lst_rec(self,title_lst,title_head,first_lst,first_head,body_lst,body_head,acc):
		if acc == [] :
			acc.append(-1)
		m = max(title_head,first_head,body_head)
		title_head_aux = title_head
		first_head_aux = first_head
		body_head_aux = body_head		
		if m == -1 :
			acc.reverse()
			a=acc.pop()
			
			return acc
		else:
		  if m == title_head_aux :
			  if title_lst != [] :
				  title_head_aux = title_lst.pop()
			  else :
				  title_head_aux = -1
		  elif m == first_head_aux :
			  if first_lst != [] :
				  first_head_aux = first_lst.pop()
			  else :
				  first_head_aux = -1
		  elif m == body_head_aux :
			  if body_lst != [] :
				  body_head_aux = body_lst.pop()
			  else :
				  body_head_aux = -1
		  h = acc.pop()
		  if h != m :
			  acc.append(h)
			  acc.append(m)
		  else :
			  acc.append(h)
		  return self.fuse_lst_rec(title_lst,title_head_aux,first_lst,first_head_aux,body_lst,body_head_aux,acc)
		
	def merge_dif_rec(self,lst1,head1,lst2,head2,acc):
		if acc == [] :
			acc.append(-1)
		head1_aux = head1
		head2_aux = head2
		if head1_aux == head2_aux :
			acc.append(head1_aux)
			if lst1 == [] or lst2 == [] :
				acc.reverse()
				acc.pop()
				return acc				
			else :
				head1_aux = lst1.pop()
				head2_aux = lst2.pop()
		elif head1_aux > head2_aux :
			if lst1 == [] :
				acc.reverse()
				acc.pop()
				return acc
			else :
				head1_aux = lst1.pop()
		else :
			if lst2 == [] :
				acc.reverse()
				acc.pop()
				return acc
			else :
				head2_aux = lst2.pop()
		return self.merge_dif_rec(lst1,head1_aux,lst2,head2_aux,acc)
					
	def search_bool_word(self,word):
		title_lst = []
		title_head = -1
		first_lst = []
		first_head = -1
		body_lst = []
		body_head = -1
		for doc_id in self.DB.word2Word_struct[word].title :
			#print "title" , str(doc_id.doc_id), str(self.DB.id2doc[doc_id.doc_id].doc_file)
			title_lst.append(doc_id.doc_id)
		for doc_id in self.DB.word2Word_struct[word].first :
			#print "first" , str(doc_id.doc_id), str(self.DB.id2doc[doc_id.doc_id].doc_file)
			first_lst.append(doc_id.doc_id)
		for doc_id in self.DB.word2Word_struct[word].body :
			#print "body" , str(doc_id.doc_id), str(self.DB.id2doc[doc_id.doc_id].doc_file)
			body_lst.append(doc_id.doc_id)
		if title_lst != [] :
			title_head = title_lst.pop()
		if first_lst != [] :
			first_head = first_lst.pop()
		if body_lst != [] :
			body_head = body_lst.pop()
		result=self.fuse_lst_rec(title_lst,title_head,first_lst,first_head,body_lst,body_head,[])
		self.word2nbOccDsDB[word]=len(result)
		return result
		
	def search_bool_req(self):
		if self.requete == [] :
			return []
		word0 = self.requete.pop()
		lst = self.search_bool_word(word0)
		for word in self.requete :
			if lst == [] :
				return []
			lst_aux = self.search_bool_word(word)
		
			if lst_aux == [] :
				return []
			head_lst = lst.pop()
			head_lst_aux = lst_aux.pop()
			lst = self.merge_dif_rec(lst,head_lst,lst_aux,head_lst_aux,[])
		return lst

	def tf_idf(self, doc_id):  # compute the TF.IDF of the query for each doc
		solution= 0
		doc=self.DB.id2doc[doc_id]
		for word in self.requeteFin:
			word_in_title=0
			word_in_first=0
			word_in_body=0
			total_noWords_in_doc = float(doc.nb_word)
			if word in doc.word2pos_list_title:
			  word_in_title=len(doc.word2pos_list_title[word])
			if word in doc.word2pos_list_first:
			  word_in_first=len(doc.word2pos_list_first[word])
			if word in doc.word2pos_list_body:
			  word_in_body=len(doc.word2pos_list_body[word])
			word_in_doc=float(word_in_body+word_in_first+word_in_title)
			no_docs=float(self.DB.nb_doc_total)
			no_docs_with_word=self.word2nbOccDsDB[word]
			solution +=float(word_in_doc/total_noWords_in_doc)*math.log1p(no_docs/no_docs_with_word)
		return solution
	      
	def tf_idf_score(self, listDoc_id):
	  for doc_id in listDoc_id:
	    self.idDoc2tfIdf[doc_id]=self.tf_idf(doc_id)
	
	def search_rank_req(self, requete, nbResMax):
		self.requete=[]
		self.requeteFin=[]
		self.parse_requete(requete)
		docsTrouves=self.search_bool_req()
		self.tf_idf_score(docsTrouves)
		self.idDoc2tfIdf=OrderedDict(sorted(self.idDoc2tfIdf.items(), key=lambda t: t[1], reverse=True))
		
		keys=self.idDoc2tfIdf.keys()[:nbResMax]
		if len(keys)<1:
		  print 'Nothing found \n'
		i=1
		for doc in keys:
		  print str(i)+'. '+self.id2docTitle(doc)+' File: '+self.id2fileName(doc)
		  i+=1
		return keys

	def id2fileName(self, docId):
	  return str(self.DB.id2doc[docId].doc_file)
	
	def id2docTitle(self,docId):
	  return str(self.DB.id2doc[docId].full_title)
	
	def reset(self):
	  self.requete=[]
	  self.requeteFin=[]
	  self.idDoc2tfIdf={}
Example #43
	def lemmatize(self, word):
		stemmer = FrenchStemmer()
		return stemmer.stem(word)
Example #44
 def stem_words(self, words):
     stemmer = FrenchStemmer()
     stemmed_words = []        
     for word in words:
         stemmed_words.append(stemmer.stem(word))
     return stemmed_words
Example #45
class Level:
  def __init__(self, corpus_file, word_file):
    self.user = False
    if word_file: self.user = True
    self.stemmer = FrenchStemmer()
    self.text_into_sentences = data.load("tokenizers/punkt/french.pickle")
    curr_path = os.path.dirname(os.path.abspath(__file__))
    os.environ['STANFORD_PARSER'] = curr_path +"/stanford-parser-full-2015-04-20"
    os.environ['STANFORD_MODELS'] = curr_path + "/stanford-parser-full-2015-04-20"
    self.parser = stanford.StanfordParser(model_path= curr_path+ "/frenchFactored.ser.gz")
    self.stpwords = stopwords.words("french")
    self.read_corpus(corpus_file)
    self.known_words_list = []
    self.ok_words_lis = []
    if self.user: self.read_known_words(word_file)
    self.stemmed_known_words = map(self.stemmer.stem,  [w for w in self.known_words_list if w not in  self.stpwords])
    self.ok_words_list = [w for w in self.ok_words_lis if w not in self.stpwords]

    self.sentences = []
    self.word_count = 0
    self.syll_count = 0
    self.num_of_sentences = 0
    self.word_per_sentence= 0.0
    self.syllables_per_word = 0.0
    self.num_of_words_with_more_than_six_chars = 0
    self.sixratio = 0.0
    self.num_of_words_with_more_than_three_syll = 0
    self.threeratio= 0.0
    self.depth_sum = 0
    self.avgdepth = 0.0
    self.known_words = 0
    self.knownratio = 0.0
    self.NP_sum = 0
    self.NPratio =0.0
    self.VP_sum = 0
    self.VPratio =0.0
    self.SBAR_sum = 0
    self.SBARratio =0.0

    self.score_with_words=0.0
    self.score_general = 0.0
    self.flesch_kincaid_score=0.0

  def sentence_stats(self):
    self.sentences = self.text_into_sentences.tokenize(self.text)
    self.num_of_sentences = len(self.sentences)
    self.word_count, self.syll_count, self.num_of_words_with_more_than_six_chars , self.num_of_words_with_more_than_three_syll, self.depth_sum, self.NP_sum, self.VP_sum, self.SBAR_sum, self.known_words = [sum(x) for x in zip(*[self.count_words_in_a_sentence(s) for s in self.sentences ])]

  def calculate_stats(self): 
    self.word_per_sentence=  float(self.word_count)/self.num_of_sentences
    self.syllables_per_word = float(self.syll_count)/ self.word_count 
    self.sixratio = float(self.num_of_words_with_more_than_six_chars)/ self.word_count 
    self.threeratio = float(self.num_of_words_with_more_than_three_syll)/ self.word_count 
    self.avgdepth = float(self.depth_sum)/ self.num_of_sentences 
    self.knownratio = float(self.known_words)/ self.word_count 
    self.NPratio = float(self.NP_sum)/ self.word_count 
    self.VPratio = float(self.VP_sum)/ self.word_count 
    self.SBARratio = float(self.SBAR_sum)/ self.word_count 

  def  print_stats(self):
    print "#of sentences:", self.num_of_sentences
    print "#of words, #of words per sentence:", self.word_count, "   ", self.word_per_sentence
    print "#of syllables, #of syllables per word:", self.syll_count,"   ", self.syllables_per_word
    print "#of words with more than 6 characters, percentage to all words:",self.num_of_words_with_more_than_six_chars,"  ", self.sixratio 
    print "#of words with more than 3 syllables, percentage to all words:", self.num_of_words_with_more_than_three_syll,"   ", self.threeratio
    print "average parse tree depth:", self.avgdepth
    print "average # of noun phrases:", self.NPratio
    print "average # of verb phrases:", self.VPratio
    print "average # of SBAR phrases:", self.SBARratio
    print "# of known words, percentage to all words:", self.known_words,"   ", self.knownratio
    print "flesch-kincaid score:", self.flesch_kincaid_score
    print "general score without vocabulary:", self.score_general
    if self.user:
      print "score with vocabulary:", self.score_with_words

  def calculate_score(self):
    self.flesch_kincaid_score = 206.835 - 1.015 * self.word_per_sentence - 84.6 * self.syllables_per_word
    self.score_general =((self.SBARratio+self.VPratio+self.NPratio+self.threeratio+self.sixratio)/5)*self.avgdepth/self.flesch_kincaid_score   
    self.score_with_words =((self.SBARratio+self.VPratio+self.NPratio+self.threeratio+self.sixratio+(1-self.knownratio))/6)*self.avgdepth/self.flesch_kincaid_score    
    

  def count_words_in_a_sentence(self, sentence):
    known_words=0
    tokens = word_punckt_tokenizer.tokenize(sentence.lower()) 
    words = self.normalize_list(tokens)
    word_count = len(words)
    syll_count = sum(Level.syllable_count(word) for word in words)
    num_of_words_with_more_than_six_chars = len(filter(lambda(x): len(x) >= 6, words)) 
    num_of_words_with_more_than_three_syll = len(filter(lambda(x): Level.syllable_count(x) >=3, words)) 
    parse_tree_depth, num_of_NP, num_of_VP, num_of_SBAR = self.tree_stats(sentence)
    if self.user:
      known_words = sum([1 for w in words if w in self.ok_words_list or self.stemmer.stem(w) in self.stemmed_known_words ]) 
    print sentence
    print words
    return (word_count, syll_count,  num_of_words_with_more_than_six_chars,num_of_words_with_more_than_three_syll, parse_tree_depth, num_of_NP, num_of_VP, num_of_SBAR, known_words)
  
  def tree_stats(self, sentence):
    depth=1
    num_of_np =1
    num_of_vp = 1
    num_of_sbar = 0 
    try:
      l = list(self.parser.raw_parse(sentence))[0]
      num_of_np = sum( [1 for i in l.subtrees() if i.label() == 'NP'])
      num_of_vp = sum( [1 for i in l.subtrees() if i.label() == 'VN'])
      num_of_sbar = sum( [1 for i in l.subtrees() if i.label() == 'CS'])
      depth = l.height()
    except:
      pass
    return (depth, num_of_np, num_of_vp, num_of_sbar)


  def normalize_list(self, token_list):
    ss  = [w for w in token_list if w not in self.stpwords and w not in string.punctuation]
    return  ss

  @staticmethod
  def syllable_count(word):
    n = len(word)
    num_of_syll = 0
    i=0
    while i < n:
      if i < n-3 and is_a_vowel(word[i]) and is_a_vowel(word[i+1]) and is_a_vowel(word[i+2]) and is_a_vowel(word[i+3]):
        num_of_syll += 2
        i += 4
      elif i < n-2 and is_a_vowel(word[i]) and is_a_vowel(word[i+1]) and is_a_vowel(word[i+2]):
        num_of_syll += 1
        i += 3
      elif i < n-1 and is_a_vowel(word[i]) and is_a_vowel(word[i+1]):
        num_of_syll += 1
        i += 2
      elif i < n and is_a_vowel(word[i]): 
        num_of_syll += 1
        i += 1
      else:
        i += 1
    return num_of_syll

  def read_corpus(self, filename):
    encodings = ["utf-8", "latin-1", "windows-1250", "windows-1252", "iso-8859-15", "utf-16", "ascii"]

    for e in encodings:
      try:
        fh = codecs.open(filename, "r", encoding=e)
        self.text = fh.read().strip()
        fh.close()
      except UnicodeDecodeError:
        pass
      else:
        break
    

    
  def read_known_words(self, filename):
    encodings = ["utf-8", "latin-1", "windows-1250", "windows-1252", "iso-8859-15", "ascii", "utf-16"]

    for e in encodings:
      try:
        fh = codecs.open(filename, "r", encoding=e)
        for line in fh:
          word, d = line.strip().split()
          degree = int(d)
          if degree ==1 :  self.known_words_list.append(word)
          else:   self.ok_words_lis.append(word)
        fh.close()
      except UnicodeDecodeError:
        pass
      else:
        break