Example #1
    def preprocess(self, text=None, stem=False, fix_pdf=True):

        if text is None:
            text = self.text

        def fix_pdf2txt(texto):
            import re
            texto = re.sub(r'\n([^A-Z])', r' \1', texto)
            texto = re.sub(r'([^\.])\n', r'\1.\n', texto)
            return texto

        def tokenizer_fr(text):
            # Courtesy of http://www.fabienpoulard.info/post/2008/03/05/Tokenisation-en-mots-avec-NLTK

            return tok_fr.tokenize(text)

        # Fix newline problems with pdf to txt step
        if fix_pdf:
            text = fix_pdf2txt(text)

        text = text.lower()

        # Tokenization
        self._original_tokens = tokenizer_fr(text)
        self._tokens = self._original_tokens

        #         self._tokens = [t for t in self._tokens if len(t) > 1]

        if stem:
            from nltk.stem.snowball import FrenchStemmer
            fr_stemmer = FrenchStemmer()
            self._tokens = [fr_stemmer.stem(t) for t in self._tokens]

        self._concordance_index = nltk.ConcordanceIndex(self._tokens,
                                                        key=lambda s: s)
Example #2
def is_french_adjr(word): # TODO change adjr tests
  stemmer = FrenchStemmer()
  # suffixes with gender and number inflections
  suffixes = [
    u"ain", u"ains", u"aine", u"aines",
    u"aire", u"aires",
    u"al", u"aux", u"als", u"ale", u"ales",
    u"el", u"els", u"elle", u"elles",
    u"esque", u"esques",
    u"estre", u"estres",
    u"eux", u"euse", u"euses",
    u"é", u"és", u"ée", u"ées",
    u"ien", u"iens", u"ienne", u"iennes",
    u"ier", u"iers", u"ière", u"ières",
    u"if", u"ifs", u"ive", u"ives",
    u"il", u"ils",
    u"in", u"ins", u"ine", u"ines",
    u"ique", u"iques",
    u"ois", u"oise", u"oises"
  ]
  stem = stemmer.stem(word)
  stem_ending = ""
  if word.replace(u"é", "e").replace(u"è", "e").startswith(stem.replace(u"é", "e").replace(u"è", "e")):
    stem_ending = word.replace(u"é", "e").replace(u"è", "e").split(stem.replace(u"é", "e").replace(u"è", "e"), 1)[1]

  if stem in french_stemmed_adjr:
    return True
  for suffix in suffixes:
    if word[-len(suffix):] == suffix:
      return True
  # TODO change adjr tests
  #if stem_ending in french_adjr_stem_ending_counts:
  #  return True
  return False
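A minimal driver sketch for is_french_adjr above. Note that french_stemmed_adjr is an external lookup table in the original project; the empty set below is only a hypothetical stand-in, so the suffix test alone decides here.

from nltk.stem.snowball import FrenchStemmer

french_stemmed_adjr = set()  # hypothetical stand-in for the project's real table

for w in [u"national", u"heureuse", u"maison"]:
    print(u"{}: {}".format(w, is_french_adjr(w)))
# "national" and "heureuse" end in listed adjective suffixes ("al", "euse"); "maison" does not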
Example #3
def convert_computing_input_to_dictionnary_input(datas):
    # Instantiation of the Tokenizer
    tokenizer = WordPunctTokenizer()
    # Instantiate Stemmer
    stemmer = FrenchStemmer()
    # Load french StopWords
    french_stopwords = set(stopwords.words('french'))

    matchingTable = []
    # Each row of tokenTable is [id, tokens]: the 'id' of the advert and 'tokens', the list of tokens in the advert
    tokenTable = []
    for row in datas:
        id = row[0]
        desc = row[2]
        # Get tokens for this row
        tokens = tokenizer.tokenize(str(desc[0]))
        # Filter tokens to remove punctuation
        regex = re.compile(r'\w+')
        tokens = filter(regex.search, tokens)
        # Filter tokens to remove stopwords and convert tokens to their stemm
        tokens = [
            stemmer.stem(token) for token in tokens
            if token.lower() not in french_stopwords
        ]
        # Remove duplicate entries
        tokens = list(set(tokens))
        # Sort tokens
        tokens.sort()
        # Construct the new row with only the id and the list of tokens
        row = [id, tokens]

        # Add the new row to the global table
        tokenTable.append(row)

    # Construct a 0/1 vector for each advert, aligned with tokenDictionnary
    # (see the standalone sketch after this example).
    # tokenTable[1:] skips the title row, because the original file has a title row
    for row in tokenTable[1:]:
        id = row[0].split(";")[0]
        advertVec = np.zeros(len(tokenDictionnary))
        for elm in row[1]:
            # Only flag tokens present in the dictionary; unknown tokens are ignored
            if elm in tokenDictionnary:
                advertVec[tokenDictionnary.index(elm)] = 1

        matchingTable.append([id, advertVec])
    return tokenTable, matchingTable
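A standalone sketch of the 0/1 encoding step above, with a tiny hypothetical tokenDictionnary (in the original it is an external, precomputed vocabulary).

import numpy as np

tokenDictionnary = [u"devel", u"python", u"vend"]   # hypothetical vocabulary of stems
advert_tokens = [u"python", u"devel", u"java"]      # tokens of one advert

advertVec = np.zeros(len(tokenDictionnary))
for elm in advert_tokens:
    if elm in tokenDictionnary:
        advertVec[tokenDictionnary.index(elm)] = 1

print(advertVec)  # the third slot stays 0 because "java" is not in the vocabulary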
Example #4
File: Mot.py Project: Palo-IT/petitext
 def __init__(self, label = "", role = "", ner = ""):
     self.label = label
     self.role = role
     self.ner = ner
     ##print(repr(label))
     stemmer = FrenchStemmer()
     self.lemma = stemmer.stem(label)
Example #5
def new_dico(file):
	stemmer = FrenchStemmer()
	input_ = "../dico/" + file
	output_ = "dic_with_roots/" + file
	fs=open(input_,'r')
	fd=open(output_,'w')

	k =0
	lines = fs.readlines()
	for line in lines:
		txt = line.split(" ")
		if not line.strip():
			continue
		for w in txt:
			if(w.istitle()):	
				k = 1
			else:
				k = 0	
			w= w.decode("utf-8")
			w = ''.join(u for u in w if u in string.ascii_letters)
			w=enleve_accents(w)
			w=stemmer.stem(w)+" "
			w = w.encode("utf-8")
			if(k):
				w = w[0].upper() + w[1:]
				fd.write(w)
			else:
				fd.write(w)

	fs.close()
	fd.close()
Example #7
def normalize_text(string):
    """Preprocess text string to return a normalized form of the text.
    """
    if isinstance(string, float):
        return ""
    else:
        # lowering x, removing beginning and ending space
        s = string.strip().lower()

    # removing accents
    s = ''.join((c for c in unicodedata.normalize('NFD', s)
                 if unicodedata.category(c) != 'Mn'))

    # remove punctuation
    s = re.sub("[" + punctuation + "]", " ", s)

    # remove uninformative, stop words and non alpha words
    words_to_remove = [
        "les", "une", "des", "nos", "ils", "elle", "elles", "nan", "null"
    ]
    stop_words = list(stopwords.words("french"))
    remove_list = words_to_remove + stop_words
    s = " ".join([
        word for word in s.split()
        if (word.isalpha() and word not in remove_list and len(word) > 2)
    ])

    # Stemming words and remove duplicates
    stemmer = FrenchStemmer()
    stem_words = [stemmer.stem(w) for w in s.split()]
    s = " ".join(stem_words)

    return s
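A small driver for normalize_text above. It assumes the module-level imports the function relies on (unicodedata, re, string.punctuation as punctuation, nltk.corpus.stopwords, FrenchStemmer) and that the NLTK French stopword list has been downloaded (nltk.download('stopwords')).

print(normalize_text("Les Économies d'Énergie sont importantes."))
# accents stripped, stopwords and short words dropped, remaining words stemmed
print(normalize_text(float('nan')))  # float inputs (e.g. NaN cells) come back as ""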
Example #8
def get_lem(text):
    stemmer = FrenchStemmer()
    text_tokens = word_tokenize(text)
    text =""
    for word in text_tokens :
        text += " "+stemmer.stem(word)
    return text
Example #9
    def stemmingFR(self):

        ps = FrenchStemmer()
        #Input Document
        Input = open(self.filename, "r")
        elagage = Input.read()

        #Output Document
        Output = open(self.filename + "-out3.txt", "a")

        #Stemming
        for w in elagage.split():
            Output.write(ps.stem(w))
            Output.write("\n")

        self.label.configure(text=self.filename)
        self.texte = Entry(self,
                           width=20,
                           font="Arial 14",
                           fg="green",
                           justify='center')
        self.texte.insert(END, "Succée de Stemming")
        self.texte.grid(padx=16, pady=16)

        self.texte = Entry(self,
                           width=50,
                           font="Arial 14",
                           fg="blue",
                           justify='center')
        self.texte.insert(
            END, "Vous trouverez votre fichier résultat sous le même répertoire")
        self.texte.grid(padx=16, pady=16)
Example #10
    def __init__(self):
        # load text_en_fr
        # if it's missing, we're generating it
        if not os.path.isfile("text_en_fr.csv"):
            self.df = self.generate_csv_from_en_fr_text("text_en_fr.csv")
        else:
            self.df = pd.read_csv("text_en_fr.csv")

        # load stopwords
        f = open("sorted_data/stopwords", "r")
        stopwords_en = f.read().split("\n")
        stopwords_en.pop(-1)

        self.stopwords = stopwords.words('english') + stopwords.words(
            'french') + stopwords_en

        # load lemmatizer for en and fr
        self.lemmatizer_en = WordNetLemmatizer()
        self.stemmer_fr = FrenchStemmer()

        # preprocess text
        self.X, self.Y = self.preprocess(self.df["en"].values.tolist()[:2],
                                         self.df["fr"].values.tolist()[:2])

        self.train_model_en_to_fr("model_translate.h5")
Example #11
def process_text(text, stem=False):
    """ lowercase, removes stopwords, accents and lemmatizes the tokens if stem=True
    used with the df.apply() to create a new column on a dataframe
    """

    text_clean = []
    for sen in text:
        #         sen = unidecode.unidecode(sen.replace("’", " ").replace(","," ").replace("."," ").replace(";"," ").lower())
        sen = unidecode.unidecode(
            sen.replace("’", " ").replace(",", " ").replace(
                ";", " ").lower())  # keep the dots for the date_uniformizer
        sen = sen.replace("/ ", "/")  #some dates are in DD/ MM/ yyyy format
        tokens = sen.split()
        if stem:
            from nltk.stem.snowball import FrenchStemmer
            stemmer = FrenchStemmer()
            tokens_no_stpwrd = [
                stemmer.stem(tok) for tok in tokens if tok not in stop_words
            ]
        else:
            #             tokens_no_stpwrd = [tok for tok in tokens if (tok not in stop_words) & (tok.isalnum())]
            tokens_no_stpwrd = [
                tok for tok in tokens if (tok not in stop_words)
            ]

        no_letters = re.sub(' [a-z] ', " ", " ".join(tokens_no_stpwrd))

        text_clean.append(no_letters)

    return text_clean
Example #12
def preprocess(text):
    result = []
    stopwords = get_stopswords()
    stemmer = FrenchStemmer()
    for token in simple_preprocess(text):
        if token not in stopwords and len(token) > 3:
            result.append(stemmer.stem(token))
    return result
Example #13
 def __init__(self, stop_words=None, remove_non_words=False):
     self.st = FrenchStemmer()
     if stop_words == None:
         self.stopwords = set(stopwords.words('french'))
     else:
         self.stopwords = stop_words
     self.words = set(words.words())
     self.remove_non_words = remove_non_words
Example #14
def racinize_all_concept(concept):
    concept_tiers = []
    stemmer = FrenchStemmer()
    for i in range(0, len(concept)):
        temp = concept[i][0].lower()
        temp2 = stemmer.stem(temp)
        concept_tiers.append((temp2, concept[i][1]))
    return concept_tiers
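A sketch of the expected input/output shape for racinize_all_concept above: concept is assumed to be a list of (word, tag) pairs, where only the word part is lowercased and stemmed and the tag is passed through unchanged.

from nltk.stem.snowball import FrenchStemmer

concept = [(u"Malades", u"NOM"), (u"isolement", u"NOM")]  # hypothetical (word, tag) pairs
print(racinize_all_concept(concept))
# each word is replaced by its stem, e.g. [(u'malad', u'NOM'), (u'isol', u'NOM')]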
Example #15
def lemmatize(token):
    """Stem a word using the French Snowball stemmer
    (note: this performs stemming, not true lemmatization)

    Arguments:
        token {string} -- token to stem
    """
    stemmer = FrenchStemmer()
    return stemmer.stem(token)
Example #16
 def stemArticle(self, doc):
     stemmer_fr = FrenchStemmer()
     stemmer_en = EnglishStemmer()
     
     stemmedArticle = [str(stemmer_fr.stem(w)) for w in doc]
     stemmedArticle = [str(stemmer_en.stem(w)) for w in stemmedArticle]   
     
     return stemmedArticle
Example #17
def stem_words(words):
    stemmed_words = []
    stemmer = FrenchStemmer()
    for word in words:
        stemmed_word = stemmer.stem(word)
        stemmed_words.append(stemmed_word)
    stemmed_words.sort()
    return stemmed_words
Example #18
def stem(sentence):
    # Instance Stemmer
    stemmer = FrenchStemmer()
    stem = ''
    for word in nltk.word_tokenize(sentence):
        stem += ' ' + stemmer.stem(word)
    #print(stem)  # for debugging
    return stem
Example #19
def stemming_Function(filtered_words):
    stemmed_words = []  #declare an empty list to hold our stemmed words
    stemmer = FrenchStemmer()  #create a stemmer object in the FrenchStemmer class
    for word in filtered_words:
        stemmed_word = stemmer.stem(word)  #stem the word
        stemmed_words.append(stemmed_word)  #add it to our stemmed word list

    freqdist = nltk.FreqDist(stemmed_words)
    return freqdist
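A hypothetical call of stemming_Function above: the nltk.FreqDist it returns counts each stem, so the most frequent stems can be read off with most_common().

import nltk
from nltk.stem.snowball import FrenchStemmer

filtered = [u"parlait", u"parler", u"parlons", u"mangez", u"mangerons"]
freqdist = stemming_Function(filtered)
print(freqdist.most_common(2))  # the stem shared by the "parler" forms appears three times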
Example #20
 def __init__(self, raw_data_path = 'data/df_stats.csv', output_filepath='data/cleaned_preprocessed_campaigns.csv', joi_output='data/joi.csv'):
     self.raw_data_path = raw_data_path
     self.output_filepath = output_filepath
     self.joi_output = joi_output
     self.legacy_columns = ['id', 'title', 'category', 'country', 'name', 'description', 'job_type', 'job_board_id', 'budgetmax', 'creation']
     self.new_columns = ["job_board_name", 'amount_action_0', 'amount_action_1', 'amount_action_2', 'amount_action_3', 'amount_action_4', 'total_cost', 'true_cpc', 'taux_conversion', "taux_conversion_pondere", "volume_conversion", 'creation_an', 'creation_mois', 'creation_jour', 'weekday']
     self.jobboard_name_for_id = { 31 : "AdformProgrammaticFR", 75 : "AdformProgrammaticGermany", 73 : "AdformProgrammaticNL", 74 : "AdformProgrammaticSwitzerland", 87 : "AdformProgrammaticUK", 76 : "AdformProgrammaticUS", 24 : "Adwords", 96 : "AdwordsFR", 102: "adwords-Switzerland", 101: "AdwordsUS", 4  : "Adzuna", 52 : "Adzuna US", 169: "APEC", 12 : "capital", 25 : "CV Library", 59 : "DoubleclickFR", 58 : "DoubleclickUK", 99 : "Facebook-Austria", 77 : "FacebookFR", 79 : "FacebookGermany", 81 : "Facebook-Netherlands", 80 : "Facebookswitzerland", 16 : "FaceBookUK", 78 : "FacebookUS", 98 : "Gigajob-Austria", 35 : "GigaJobFR", 68 : "Gigajob- Germany", 67 : "GigaJob - Netherlands", 69 : "Gigajob- switzerland", 34 : "GigaJobUK", 54 : "GigaJob US", 10 : "Github", 168: "GoogleJobDiscovery", 1  : "Indeed", 146: "Jobbird-Austria", 147: "Jobbird-Belgium", 156: "jobbird-Canada", 151: "Jobbird-France", 152: "Jobbird-Germany", 148: "Jobbird-India", 145: "Jobbird-Netherlands", 150: "Jobbird-Newzealand", 153: "Jobbird-Spain", 155: "jobbird-Switzerland", 149: "Jobbird-Turkey", 143: "Jobbird-UK", 144: "jobbird-US", 154: "jobboard-Switzerland", 14 : "Jobijoba", 9  : "Jobintree", 36 : "JobisJob", 57 : "JobisJob US", 97 : "Joblift-Austria", 131: "joblift-Belgium", 133: "joblift-Canada", 40 : "Joblift FR", 65 : "Joblift - Germany", 159: "joblift-Germany-d.jobmonitor.com", 158: "joblift-Germany-Muenchener", 136: "joblift-India", 61 : "Joblift-Netherlands", 135: "joblift-newzealand", 132: "joblift-Spain", 66 : "Joblift - Switzerland", 134: "joblift-Turkey", 37 : "JobLift UK", 50 : "Joblift US", 3  : "Jobrapido", 161: "jobrapidoGermany-Jobmonitor", 162: "jobrapidoGermany-muenchener", 60 : "jobrapidoProgrammaticTrendingJobs", 53 : "JobRapido US", 13 : "Jobtome", 157: "JobtomeGermany - de.jobmonitor.com", 88 : "JobtomeGermany - muenchener", 165: "Jobtomeprogrammatic", 39 : "Jobtome UK", 56 : "Jobtome US", 166: "kudos", 26 : "LApec", 7  : "Leboncoin", 19 : "Leboncoin Marque employeur", 100: "Linkedin-Austria", 82 : "LinkedinFR", 83 : "LinkedinGermany", 85 : "Linkedinnetherlands", 84 : "Linkedinswitzerland", 8  : "LinkedinUK", 86 : "LinkedinUS", 18 : "LinkUp", 47 : "LoadTestBoard", 167: "Match2one", 163: "Meteojob", 48 : "Monster", 43 : "MyJobHelper FR", 71 : "Myjobhelper-Germany", 70 : "MyJobHelper - Netherlands", 72 : "MyJobHelper - switzerland", 41 : "MyJobHelper UK", 55 : "MyJobHelper US", 28 : "Name", 42 : "Name", 33 : "Neuvoo", 90 : "NeuvooAustria-Jobleads", 137: "NeuvooBelgium-jobleads", 139: "NeuvooCanada-Jobleads", 93 : "NeuvooFrance-Jobleads", 89 : "NeuvooGermanyjobleads", 95 : "NeuvooGermany-Jobmonitor", 160: "NeuvooGermany-muenchener", 94 : "NeuvooHolland-Jobleads", 140: "NeuvooIndia-Jobleads", 142: "NeuvooNewzealand-jobleads", 138: "NeuvooSpain-jobleads", 91 : "Neuvooswitzerland-Jobleads", 141: "NeuvooTurkey-jobleads", 92 : "NeuvooUK-Jobleads", 51 : "NeuvooUSJobleads", 45 : "Nominal Technology", 2  : "Optioncarriere", 164: "ProgrammaticAppnexus", 113: "restorationmedia-UK", 112: "restorationmedia-US", 104: "ResultsGeneration-UK", 103: "Resultsgeneration -US", 27 : "Sites gratuits TP", 11 : "Stackoverflow", 5  : "Test", 29 : "[test] Job board 31671", 30 : "[test] Job board 73347", 115: "Trendingjobs-UK", 114: "Trendingjobs-US", 6  : "Trovit", 63 : "Trovit- Germany", 62 : "Trovit - Netherlands", 64 : "Trovit- Switzerland", 38 : "Trovit UK", 49 : "Trovit US", 17 : "Twitter", 15 : "Vivastreet", 120: "xpat-Austria", 125: "xpat-Belgium", 128: "xpat-Canada", 119: "xpat-France", 121: "xpat-Germany", 126: 
"xpat-India", 118: "xpat-Netherlands", 130: "xpat-newzealand", 127: "xpat-Spain", 122: "xpat-Switzerland", 129: "xpat-Turkey", 117: "xpat-UK", 116: "xpat-US", 109: "Yahoo-Austria", 108: "Yahoo-France", 107: "Yahoo-Germany", 110: "Yahoo-Netherlands", 111: "Yahoo-Switzerland", 106: "Yahoo-UK", 105: "Yahoo-US", 46 : "ZipRecruiter-France", 124: "ZipRecruiter-UK", 123: "ZipRecruiter-US"}
     self.stemmer = FrenchStemmer()
     self._set_stopwords()
Example #21
def racinize_all_negationeur(concept):  # stemming (racinisation) is used throughout to handle word variations
    nega_tiers = []
    stemmer = FrenchStemmer()
    for i in range(0, len(negationeur)):
        temp = negationeur[i].lower()
        temp2 = stemmer.stem(temp)
        nega_tiers.append(temp2)
    return nega_tiers
Example #22
def stem_words(words):
    #stemming words
    stemmed_words = [] #declare an empty list to hold our stemmed words
    stemmer = FrenchStemmer() #create a stemmer object in the FrenchStemmer class
    for word in words:
        stemmed_word=stemmer.stem(word) #stem the word
        stemmed_words.append(stemmed_word) #add it to our stemmed word list
    stemmed_words.sort() #sort the stemmed_words
    return stemmed_words
Example #23
def stemWords(listWords):
    '''stemming (racinisation)'''
    stemmedWords = list()
    stemmer = FrenchStemmer()
    for word in removeStopwords(listWords):
        stemmedWord = stemmer.stem(word)
        stemmedWords.append(stemmedWord)
    stemmedWords.sort()
    return stemmedWords
Example #24
 def __init__(self, ignoreWords : list = ['?', '!'], verbose : bool = False, forceSave : bool = False):
     self.roots = []
     self.ruleList = []
     self.corpus = []
     self.ignoreWords = ignoreWords
     self.verbose = verbose
     self.forceSave = forceSave
     self.stemmer = FrenchStemmer()
     self.rules = None
     self.model = None
Example #25
def stem_words(words):
    
    #stemming words
    stemmed_words = [] 
    stemmer = FrenchStemmer()  # create a stemmer object from the FrenchStemmer class
    for word in words:
        stemmed_word=stemmer.stem(word) #stem the word
        stemmed_words.append(stemmed_word) 
    return stemmed_words
Example #26
def stem_words(words):
    '''stems the word list using the French Stemmer'''
    #stemming words
    stemmed_words = [] #declare an empty list to hold our stemmed words
    stemmer = FrenchStemmer() #create a stemmer object in the FrenchStemmer class
    for word in words:
        stemmed_word=stemmer.stem(word) #stem the word
        stemmed_words.append(stemmed_word) #add it to our stemmed word list
    stemmed_words.sort() #sort the stemmed_words
    return stemmed_words
Example #27
def stem(msg, stopwords):
    stemmer = FrenchStemmer()
    lem = []
    words = re.split(r"[ ,.;!?\"'-]", msg)  # split on spaces and common punctuation
    for word in words:
        if word:
            if (word[0] != "#" and word not in stopwords):
                lem.append(stemmer.stem(word))
            elif (word[0] == "#"):
                lem.append(word)
    return lem
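A hypothetical driver for stem() above: hashtags pass through untouched, stopwords are dropped, and the remaining words are stemmed. It assumes re and FrenchStemmer are imported as in the original module.

from nltk.corpus import stopwords as nltk_stopwords

french_stops = set(nltk_stopwords.words('french'))
print(stem("je recommande vivement ce restaurant #Lyon", french_stops))
# -> stems of 'recommande', 'vivement', 'restaurant', plus '#Lyon' kept verbatim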
Example #28
 def stem_words(self, words):
     """Stem words in list of tokenized words"""
     if self._lang == 'fr':
         stemmer = FrenchStemmer()
     else:
         stemmer = LancasterStemmer()
     stems = []
     for word in words:
         stem = stemmer.stem(word)
         stems.append(stem)
     return stems
Example #29
def racinize_all_qualifieurs(qualifieurs):
    iznogoud = []
    stemmer = FrenchStemmer()
    for i in range(0, len(qualifieurs)):
        temp = qualifieurs[i].lower()
        if temp == 'isolées':
            temp2 = 'isole'
        else:
            temp2 = stemmer.stem(temp)
        iznogoud.append(temp2)
    return iznogoud
Example #30
    def nettoyage(self, document):
        """
        function to clean the dataset + stemming
        """
        document = self.tokenize(document)
        document = [word.lower() for word in document if len(word) > 2 and not word.isnumeric() and word not in self.stop_words]

        if self.stem:
            stem = FrenchStemmer()
            document = [stem.stem(word) for word in document]
        return document
Example #31
def stem_wordsfr(words):
    '''stems the word list using the French Stemmer'''
    #stemming words
    stemmed_wordsfr = []  #declare an empty list to hold our stemmed words
    stemmerfr = FrenchStemmer()  #create a stemmer object in the FrenchStemmer class
    for word in words:
        stemmed_wordfr = stemmerfr.stem(word)  #stem the word
        stemmed_wordsfr.append(stemmed_wordfr)  #add it to our stemmed word list
    #stemmed_wordsfr.sort() #sort the stemmed_words
    return stemmed_wordsfr
Example #32
    def __init__(self, f, keywords):
        s = f.read()
        self.keywords = keywords
        self.file = s
        self.sentences = sent_tokenize(s)
        self.parser = StanfordParser(
            "stanford-parser-full-2014-08-27/stanford-parser",
            "stanford-parser-full-2014-08-27/stanford-parser-3.4.1-models")
        self.tagger = st.StanfordPOSTagger(
            "stanford-postagger-full-2014-08-27/models/french.tagger",
            "stanford-postagger-full-2014-08-27/stanford-postagger.jar")
        self.ner = st.StanfordNERTagger(
            "stanford-ner-2014-08-27/classifiers/english.all.3class.distsim.crf.ser.gz",
            "stanford-ner-2014-08-27/stanford-ner.jar")

        self.trees = []
        for sent in self.sentences:
            try:
                self.trees.append(self.parser.raw_parse(sent))
            except OSError:
                self.trees.append([])
        self.words = self.word_tokenize_without_punc(s)
        self.stemmer = FrenchStemmer()
        self.stems = [self.stemmer.stem(w) for w in self.words]
        self.words_sentences = [
            self.word_tokenize_without_punc(s) for s in self.sentences
        ]
        self.tags = self.tagger.tag(self.words)
        self.tags_sentences = [
            self.tagger.tag([w for w in self.words_sentences[i]])
            for i in range(len(self.sentences))
        ]
        self.entities = self.ner.tag(self.words)
        self.entities_sentences = [
            self.ner.tag([w for w in self.words_sentences[i]])
            for i in range(len(self.sentences))
        ]
        self.left_subject = defaultdict(lambda: 0)
        self.left_compl = defaultdict(lambda: 0)
        self.left_neg_subject = defaultdict(lambda: 0)
        self.left_neg_compl = defaultdict(lambda: 0)
        self.right_subject = defaultdict(lambda: 0)
        self.right_compl = defaultdict(lambda: 0)
        self.right_neg_subject = defaultdict(lambda: 0)
        self.right_neg_compl = defaultdict(lambda: 0)
        self.left_ref = 0
        self.right_ref = 0
        self.trees_leaves = []
        for e in self.trees:
            res = []
            extract_leaves(list(e)[0], res)
            self.trees_leaves.append(tuple_to_dict(res))
        self.extract_keywords()
Example #33
def vectorize_descreption():
    fs = FrenchStemmer()
    df = return_features_model()
    df['Descposte'] = [fs.stem(k) for k in df['Descposte']]
    tfidf = TfidfVectorizer()
    # fit_transform once and reuse the resulting matrix
    tfidf_matrix = tfidf.fit_transform(df['Descposte'])
    tfidf_col = pd.DataFrame(tfidf_matrix.todense(),
                             columns=tfidf.get_feature_names())
    df = df.reset_index()
    df_final = pd.merge(df.drop(columns=drop_columns, axis=1),
                        tfidf_col,
                        right_index=True,
                        left_index=True)
    return df_final
Example #34
	def __init__(self, mode='build', DB_file=None, doc_files=None, trace=False):
		self.mode = mode
		self.DB_file = DB_file
		self.doc_list = []
		doc_to_read=[]
		for root, dirs, files in os.walk(doc_files, topdown=False):
			for file_name in files: 
				doc_to_read.append(os.path.join(root, file_name.encode('utf-8')))
		for doc_file in doc_to_read :
			doc = Doc(doc_file)
			self.doc_list.append(doc)
		self.trace = trace
		self.requete= []
		self.DB = Data_Base()
		self.stemmer=FrenchStemmer()

		if mode == 'build' :
			# build the database, then dump it to DB_file
			print 'Built Data Base...'
			self.build_DB()
			#print self.DB
		elif mode == 'search' :
			# load the database
			self.load_DB()
		print self.DB.word2Word_struct
Example #35
 def __init__(self, translation_dict, lemmatized=False):
   self.english_lemmatizer = WordNetLemmatizer()
   self.french_stemmer = FrenchStemmer()
   # when the dict is already lemmatized, use it directly as the stemmed lookup (assumption)
   stemmed_dict = translation_dict
   if not lemmatized:
     stemmed_dict = self._get_lemmatized_dict(translation_dict)
   self.stemmed_dict = stemmed_dict
   self.translation_dict = translation_dict
Example #36
def lemmatize_or_stem(language, terms):
	if language != 'english' or (language == 'english' and ENGLISH_FREELING): #TEMPORARY: EXPERIMENTING WITH ENGLISH FREELING
		# Use FreeLing
		if language == 'spanish':
			analyzeProcess = subprocess.Popen(["analyze", "-f", "/usr/local/share/freeling/config/es.cfg"], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
		elif language == 'portugese':
			analyzeProcess = subprocess.Popen(["analyze", "-f", "/usr/local/share/freeling/config/pt.cfg"], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
		elif language == 'italian':
			analyzeProcess = subprocess.Popen(["analyze", "-f", "/usr/local/share/freeling/config/it.cfg"], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
		elif language == 'english':
			analyzeProcess = subprocess.Popen(["analyze", "-f", "/usr/local/share/freeling/config/en.cfg"], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
		terms = map(lambda term: term.encode('utf-8'), terms)
		analyzeProcess.stdin.write(' '.join(terms))
		stdout, stderr = analyzeProcess.communicate()
		# Parse FreeLing output
		# Lemma is always second word of each line.
		terms = list()
		lines = stdout.split('\n')
		for line in lines:
			items = line.split(' ')
			if len(items) == 4:
				lemma = items[1]
				tag = items[2]
				"""ATTN: TAGSET IS DIFFERENT IN SPANISH AND ENGLISH. However, NP, F, Z, and W
				all mean the same thing in both tagsets."""
				# remove proper nouns, punctuation, numbers, and dates/times
				if not (tag[0:2]=='NP' or tag[0] == 'F' or tag[0] == 'Z' or tag[0] == 'W' or tag[0:3] == 'POS'):
					# if english, need to remove numbers
					include = True
					for num in NUMBERS:
						if num in lemma:
							include = False
					if include:
						terms.append(lemma)
		terms = map(lambda term: term.decode('utf-8'), terms)

	elif (language == 'english' and not ENGLISH_FREELING):
	 	lem = WordNetLemmatizer()
	 	terms = map(lambda term: lem.lemmatize(term), terms )
	elif language == 'french':
		from nltk.stem.snowball import FrenchStemmer
		stemmer = FrenchStemmer()
		terms = map(lambda term: stemmer.stem(term), terms)
	
		terms = map(lambda term: term.decode('utf-8'), terms)
	return terms
Example #37
	def __init__(self, mode='build', DB_file=None, doc_files=None, trace=False):
		self.mode = mode
		self.DB_file = DB_file
		self.doc_list = []
		self.trace = trace
		self.requete= []
		self.DB = Data_Base()
		self.stemmer=FrenchStemmer()
		self.requeteFin=[]
		self.idDoc2tfIdf={}

		if mode == 'build' :
			# build the database, then dump it to DB_file
			print 'Building Data Base...'
			self.build_DB(doc_files)
			print 'Building completed'
		elif mode == 'search' :
			# load the database
			self.load_DB()
		self.word2nbOccDsDB={}
Example #38
  def __init__(self, corpus_file, word_file):
    self.user= False
    if  word_file : self.user=True
    self.stemmer = FrenchStemmer()
    self.text_into_sentences = data.load("tokenizers/punkt/french.pickle")
    curr_path = os.path.dirname(os.path.abspath(__file__))
    os.environ['STANFORD_PARSER'] = curr_path +"/stanford-parser-full-2015-04-20"
    os.environ['STANFORD_MODELS'] = curr_path + "/stanford-parser-full-2015-04-20"
    self.parser = stanford.StanfordParser(model_path= curr_path+ "/frenchFactored.ser.gz")
    self.stpwords = stopwords.words("french")
    self.read_corpus(corpus_file)
    self.known_words_list = []
    self.ok_words_lis = []
    if self.user: self.read_known_words(word_file)
    self.stemmed_known_words = map(self.stemmer.stem,  [w for w in self.known_words_list if w not in  self.stpwords])
    self.ok_words_list = [w for w in self.ok_words_lis if w not in self.stpwords]

    self.sentences = []
    self.word_count = 0
    self.syll_count = 0
    self.num_of_sentences = 0
    self.word_per_sentence= 0.0
    self.syllables_per_word = 0.0
    self.num_of_words_with_more_than_six_chars = 0
    self.sixratio = 0.0
    self.num_of_words_with_more_than_three_syll = 0
    self.threeratio= 0.0
    self.depth_sum = 0
    self.avgdepth = 0.0
    self.known_words = 0
    self.knownratio = 0.0
    self.NP_sum = 0
    self.NPratio =0.0
    self.VP_sum = 0
    self.VPratio =0.0
    self.SBAR_sum = 0
    self.SBARratio =0.0

    self.score_with_words=0.0
    self.score_general = 0.0
    self.flesch_kincaid_score=0.0
Example #39
class DirectTranslate:
  """Word-by-word direct translator.
  
  Usage:
  translator = DirectTranslate(translation_dict)
  for sentence in file:
    print translator.translate(sentence, delims=",' ", remove='')
  """
  def __init__(self, translation_dict, lemmatized=False):
    self.english_lemmatizer = WordNetLemmatizer()
    self.french_stemmer = FrenchStemmer()
    # when the dict is already lemmatized, use it directly as the stemmed lookup (assumption)
    stemmed_dict = translation_dict
    if not lemmatized:
      stemmed_dict = self._get_lemmatized_dict(translation_dict)
    self.stemmed_dict = stemmed_dict
    self.translation_dict = translation_dict
    
  def _get_lemmatized_dict(self, dict):
    result = {}
    for french_word, english_translation_list in dict.iteritems():
      french_stem = self.french_stemmer.stem(french_word)
      english_translations = [
        self.english_lemmatizer.lemmatize(word) for word in english_translation_list
      ]
      # NOTE: This may or may not be the best strategy.  If two distinct
      # French words in the initial dict have the same stem,
      # it appends the two lists of translations.
      # TODO: Reconsider.
      # TODO: Consider removing duplicates from this new list.  But need to preserve order.
      if french_stem not in result:
        result[french_stem] = english_translations
      else:
        result[french_stem].extend(english_translations)
    return result

  def _get_preprocessed_sentence(self, french_sentence):
    """Apply any preprossing rules here.
    Args:
      french_sentence: string; the sentence in french
    
    Returns:
      The sentence with all preprocessing rules applied.
    """
    return unicode(french_sentence)

  def _get_postprocessed_sentence(self, english_sentence):
    """Apply any postproccessing rules here.
    Args: 
      english_sentence: string; an english sentence
    
    Returns:
      The sentence with all postprocessing rules applied.
    """
    return english_sentence

  # TODO: Add code to keep commas.  Translate them into a word.
  def translate(self, sentence, delims=",' ", remove=''):
    sentence = self._get_preprocessed_sentence(sentence)
    tokens = TranslateUtils.get_list_of_words(sentence, delims, remove)
    translated_list = []
    for token in tokens:
      stemmed_token = self.french_stemmer.stem(token).lower()
      if stemmed_token in self.stemmed_dict:
        possible_translations = self.stemmed_dict[stemmed_token]
        if possible_translations:
          # Use first translation in the list
          translation = possible_translations[0]
          translated_list.append(translation)
      elif token in self.translation_dict:
        possible_translations = self.translation_dict[token]
        if possible_translations:
          # Use first translation in the list
          translation = possible_translations[0]
          translated_list.append(translation)
    translation = ' '.join(translated_list)
    translation = self._get_postprocessed_sentence(translation)
    return translation
Example #40
#Stopwords
from nltk.corpus import stopwords
import nltk
# load the French stopwords
french_stopwords = set(stopwords.words('french'))
print french_stopwords

chars = ['.', '/', "'", '"', '?', '!', '#', '$', '%', '^', '&','*',\
 '(', ')', ' - ', '_', '+' ,'=', '@', ':', '\\', ',',';', '~', '`', '<',\
 '>', '|', '[', ']', '{', '}', '–', '“','»', '«', '°', '’', '--',\
 '</div>','<div','class','class="tt14-prodpres-txt' ,'tt14-prodpres-res">','<b>','</b>']



from nltk.stem.snowball import FrenchStemmer
stemmer = FrenchStemmer()
stemmer.stem('voudrais')

def cleaningDocumentList(documentList):
    #remove french stopwords
    texts = [[word for word in document.lower().replace("'",'').split() if word not in french_stopwords] for document in documentList]
    #remove special catalogue characters
    texts = [[word for word in text if word not in chars] for text in texts]
    #remove french accents
    texts = [[unicodedata.normalize('NFD',unicode(word,'utf-8')).encode('ascii', 'ignore') for word in text] for text in texts]
    #remove general special characters
    texts = [[re.sub(r'[. ?!=+ & | , " :; ⋆ $ %()<> &\[\]/_]',r'',word) for word in text] for text in texts]
    #remove small words
    texts = [[re.sub(r'\b\w{1,3}\b', '', word) for word in text] for text in texts]
    #stem with the French Snowball stemmer
    texts = [[stemmer.stem(word) for word in text] for text in texts]
    return texts
Example #41
	def read_doc(self, docfile):
		"""
		Reads the document from docfile and fills each field's dictionary of
		position lists with the document's tokens. Also counts the number of words.
		"""
		stemmer = FrenchStemmer()
		flux = open(docfile)
		line = flux.readline()
		position = 0
		title = True
		first = True
		while line != '':
			liste = line.split()
			if title == True and len(liste) > 0:  # fill the title dictionary
				self.full_title = line
				title = False
				for each in liste:
					each = each.lower()
					if '\'' in each:
						strings = self.splitAccent(each)
						strings[0] += '\''
						self.nb_word += len(strings)
						for word in strings:
							word = stemmer.stem(word.decode('iso-8859-1'))
							if word not in self.word2pos_list_title:
								self.word2pos_list_title[word] = []
							self.word2pos_list_title[word].append(position)
							position += 1
					else:
						self.nb_word += 1
						each = stemmer.stem(each.decode('iso-8859-1'))
						if each not in self.word2pos_list_title:
							self.word2pos_list_title[each] = []
						self.word2pos_list_title[each].append(position)
						position += 1
				line = flux.readline()
				liste = line.split()
			if first == True and title == False and liste != []:  # fill the first-paragraph dictionary
				first = False
				for each in liste:
					each = each.lower()
					if '\'' in each:
						strings = self.splitAccent(each)
						strings[0] += '\''
						self.nb_word += len(strings)
						for word in strings:
							word = stemmer.stem(word.decode('iso-8859-1'))
							if word not in self.word2pos_list_first:
								self.word2pos_list_first[word] = []
							self.word2pos_list_first[word].append(position)
							position += 1
					else:
						self.nb_word += 1
						each = stemmer.stem(each.decode('iso-8859-1'))
						if each not in self.word2pos_list_first:
							self.word2pos_list_first[each] = []
						self.word2pos_list_first[each].append(position)
						position += 1
				line = flux.readline()
				liste = line.split()
			if first == False and title == False and liste != []:  # fill the body dictionary
				for each in liste:
					each = each.lower()
					if '\'' in each:
						strings = self.splitAccent(each)
						strings[0] += '\''
						self.nb_word += len(strings)
						for word in strings:
							word = stemmer.stem(word.decode('iso-8859-1'))
							if word not in self.word2pos_list_body:
								self.word2pos_list_body[word] = []
							self.word2pos_list_body[word].append(position)
							position += 1
					else:
						self.nb_word += 1
						each = stemmer.stem(each.decode('iso-8859-1'))
						if each not in self.word2pos_list_body:
							self.word2pos_list_body[each] = []
						self.word2pos_list_body[each].append(position)
						position += 1
			line = flux.readline()
Example #42
# coding: utf-8
import string
from nltk.stem.snowball import FrenchStemmer
from preprocess import Enleve_Accents
import urllib


url = "http://www.pallier.org/ressources/dicofr/liste.de.mots.francais.frgut.txt"
file_name = "dico.txt"
print "downloading the french dictionary from http://www.pallier.org/"
urllib.urlretrieve(url, "FrenchDictionary.txt")
 
 
stemmer = FrenchStemmer()

fs = open("FrenchDictionary.txt", 'r')
fd = open("StemmedFrenchDictionary.txt", 'w')
while 1:
	txt = fs.readline()
	if txt =='':
		break
	if txt[0] != '':
	
		txt = txt.lower()
		txt = ''.join(u for u in txt if u in string.ascii_letters)
		txt = Enleve_Accents(txt)
		txt = stemmer.stem(txt) + "\n"
		fd.write(txt)
import os
script = """
(cat StemmedFrenchDictionary.txt|uniq>monfichier.tmp) &&  mv -f monfichier.tmp FinalDictionary.txt
Example #43
class Level:
  def __init__(self, corpus_file, word_file):
    self.user= False
    if  word_file : self.user=True
    self.stemmer = FrenchStemmer()
    self.text_into_sentences = data.load("tokenizers/punkt/french.pickle")
    curr_path = os.path.dirname(os.path.abspath(__file__))
    os.environ['STANFORD_PARSER'] = curr_path +"/stanford-parser-full-2015-04-20"
    os.environ['STANFORD_MODELS'] = curr_path + "/stanford-parser-full-2015-04-20"
    self.parser = stanford.StanfordParser(model_path= curr_path+ "/frenchFactored.ser.gz")
    self.stpwords = stopwords.words("french")
    self.read_corpus(corpus_file)
    self.known_words_list = []
    self.ok_words_lis = []
    if self.user: self.read_known_words(word_file)
    self.stemmed_known_words = map(self.stemmer.stem,  [w for w in self.known_words_list if w not in  self.stpwords])
    self.ok_words_list = [w for w in self.ok_words_lis if w not in self.stpwords]

    self.sentences = []
    self.word_count = 0
    self.syll_count = 0
    self.num_of_sentences = 0
    self.word_per_sentence= 0.0
    self.syllables_per_word = 0.0
    self.num_of_words_with_more_than_six_chars = 0
    self.sixratio = 0.0
    self.num_of_words_with_more_than_three_syll = 0
    self.threeratio= 0.0
    self.depth_sum = 0
    self.avgdepth = 0.0
    self.known_words = 0
    self.knownratio = 0.0
    self.NP_sum = 0
    self.NPratio =0.0
    self.VP_sum = 0
    self.VPratio =0.0
    self.SBAR_sum = 0
    self.SBARratio =0.0

    self.score_with_words=0.0
    self.score_general = 0.0
    self.flesch_kincaid_score=0.0

  def sentence_stats(self):
    self.sentences = self.text_into_sentences.tokenize(self.text)
    self.num_of_sentences = len(self.sentences)
    self.word_count, self.syll_count, self.num_of_words_with_more_than_six_chars , self.num_of_words_with_more_than_three_syll, self.depth_sum, self.NP_sum, self.VP_sum, self.SBAR_sum, self.known_words = [sum(x) for x in zip(*[self.count_words_in_a_sentence(s) for s in self.sentences ])]

  def calculate_stats(self): 
    self.word_per_sentence=  float(self.word_count)/self.num_of_sentences
    self.syllables_per_word = float(self.syll_count)/ self.word_count 
    self.sixratio = float(self.num_of_words_with_more_than_six_chars)/ self.word_count 
    self.threeratio = float(self.num_of_words_with_more_than_three_syll)/ self.word_count 
    self.avgdepth = float(self.depth_sum)/ self.num_of_sentences 
    self.knownratio = float(self.known_words)/ self.word_count 
    self.NPratio = float(self.NP_sum)/ self.word_count 
    self.VPratio = float(self.VP_sum)/ self.word_count 
    self.SBARratio = float(self.SBAR_sum)/ self.word_count 

  def  print_stats(self):
    print "#of sentences:", self.num_of_sentences
    print "#of words, #of words per sentence:", self.word_count, "   ", self.word_per_sentence
    print "#of syllables, #of syllables per word:", self.syll_count,"   ", self.syllables_per_word
    print "#of words with more than 6 characters, percentage to all words:",self.num_of_words_with_more_than_six_chars,"  ", self.sixratio 
    print "#of words with more than 3 syllables, percentage to all words:", self.num_of_words_with_more_than_three_syll,"   ", self.threeratio
    print "average parse tree depth:", self.avgdepth
    print "average # of noun phrases:", self.NPratio
    print "average # of verb phrases:", self.VPratio
    print "average # of SBAR phrases:", self.SBARratio
    print "# of known words, percentage to all words:", self.known_words,"   ", self.knownratio
    print "flesch-kincaid score:", self.flesch_kincaid_score
    print "general score without vocabulary:", self.score_general
    if self.user:
      print "score with vocabulary:", self.score_with_words

  def calculate_score(self):
    self.flesch_kincaid_score = 206.835 - 1.015 * self.word_per_sentence - 84.6 * self.syllables_per_word
    self.score_general =((self.SBARratio+self.VPratio+self.NPratio+self.threeratio+self.sixratio)/5)*self.avgdepth/self.flesch_kincaid_score   
    self.score_with_words =((self.SBARratio+self.VPratio+self.NPratio+self.threeratio+self.sixratio+(1-self.knownratio))/6)*self.avgdepth/self.flesch_kincaid_score    
    

  def count_words_in_a_sentence(self, sentence):
    known_words=0
    tokens = word_punckt_tokenizer.tokenize(sentence.lower()) 
    words = self.normalize_list(tokens)
    word_count = len(words)
    syll_count = sum(Level.syllable_count(word) for word in words)
    num_of_words_with_more_than_six_chars = len(filter(lambda(x): len(x) >= 6, words)) 
    num_of_words_with_more_than_three_syll = len(filter(lambda(x): Level.syllable_count(x) >=3, words)) 
    parse_tree_depth, num_of_NP, num_of_VP, num_of_SBAR = self.tree_stats(sentence)
    if self.user:
      known_words = sum([1 for w in words if w in self.ok_words_list or self.stemmer.stem(w) in self.stemmed_known_words ]) 
    print sentence
    print words
    return (word_count, syll_count,  num_of_words_with_more_than_six_chars,num_of_words_with_more_than_three_syll, parse_tree_depth, num_of_NP, num_of_VP, num_of_SBAR, known_words)
  
  def tree_stats(self, sentence):
    depth=1
    num_of_np =1
    num_of_vp = 1
    num_of_sbar = 0 
    try:
      l = list(self.parser.raw_parse(sentence))[0]
      num_of_np = sum( [1 for i in l.subtrees() if i.label() == 'NP'])
      num_of_vp = sum( [1 for i in l.subtrees() if i.label() == 'VN'])
      num_of_sbar = sum( [1 for i in l.subtrees() if i.label() == 'CS'])
      depth = l.height()
    except:
      pass
    return (depth, num_of_np, num_of_vp, num_of_sbar)


  def normalize_list(self, token_list):
    ss  = [w for w in token_list if w not in self.stpwords and w not in string.punctuation]
    return  ss

  @staticmethod
  def syllable_count(word):
    n = len(word)
    num_of_syll = 0
    i=0
    while i < n:
      if i < n-3 and is_a_vowel(word[i]) and is_a_vowel(word[i+1]) and is_a_vowel(word[i+2]) and is_a_vowel(word[i+3]):
        num_of_syll += 2
        i += 4
      elif i < n-2 and is_a_vowel(word[i]) and is_a_vowel(word[i+1]) and is_a_vowel(word[i+2]):
        num_of_syll += 1
        i += 3
      elif i < n-1 and is_a_vowel(word[i]) and is_a_vowel(word[i+1]):
        num_of_syll += 1
        i += 2
      elif i < n and is_a_vowel(word[i]): 
        num_of_syll += 1
        i += 1
      else:
        i += 1
    return num_of_syll

  def read_corpus(self, filename):
    encodings = ["utf-8", "latin-1", "windows-1250", "windows-1252", "latin-15", "utf-16", "ascii"]

    for e in encodings:
      try:
        fh = codecs.open(filename, "r", encoding=e)
        self.text = fh.read().strip()
        fh.close()
      except UnicodeDecodeError:
        pass
      else:
        break
    

    
  def read_known_words(self, filename):
    encodings = ["utf-8", "latin-1", "windows-1250", "windows-1252", "latin-15", "ascii", "utf-16"]

    for e in encodings:
      try:
        fh = codecs.open(filename, "r", encoding=e)
        for line in fh:
          word, d = line.strip().split()
          degree = int(d)
          if degree ==1 :  self.known_words_list.append(word)
          else:   self.ok_words_lis.append(word)
        fh.close()
      except UnicodeDecodeError:
        pass
      else:
        break
Example #44
from sklearn.pipeline import Pipeline

import logging


logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO)
logging.info("running %s" % " ".join(sys.argv))

# Test string #
string = 'Jadis, une nuit, je fus un papillon, voltigeant, content de son sort. Puis, je m’éveillai, étant Tchouang-tseu. Qui suis-je en réalité ? Un papillon qui rêve qu’il est Tchouang-tseu ou Tchouang qui s’imagine qu’il fut papillon ?'

# Have fun with tokenizers
tokenizer1 = nltk.data.load('tokenizers/punkt/french.pickle')
tokenizer2 = TreebankWordTokenizer()
french_stopwords = set(stopwords.words('french'))
stemmer = FrenchStemmer()

# See results
tokens1 = tokenizer1.tokenize(string)
tokens2 = tokenizer2.tokenize(string)
tokens3 = [token.encode('utf-8') for token in tokens2 if token.lower() not in french_stopwords]
tokens4 = [stemmer.stem(token.decode('utf-8')) for token in tokens3]


# Build class to add stem to pipleine

class StemmedCountVectorizer(CountVectorizer):

	def build_analyzer(self):
		# stem every token produced by the default analyzer
		analyzer = super(StemmedCountVectorizer, self).build_analyzer()
		return lambda doc: (stemmer.stem(w) for w in analyzer(doc))
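A hypothetical use of StemmedCountVectorizer above inside the Pipeline already imported at the top of this example; the MultinomialNB classifier and the placeholder training data are illustrative assumptions, not part of the original snippet.

from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vect', StemmedCountVectorizer(strip_accents='unicode', lowercase=True)),
    ('nb', MultinomialNB()),
])
# clf.fit(list_of_french_documents, labels)        # hypothetical training data
# clf.predict(["Un nouveau document à classer"])   # stems are used as features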
Example #45
class Search_engine:
	"""
		--Moteur de recherche--

		DB_file = fichier contenant la base de donnee
			si mode = build
				la base de donnee construite sera dumpee sur DB_file
			si mode = search
				la base de donnee sera recupere depuis DB_file

		doc_files = liste de documents bruts a integrer a la base de donnee

		DB = base de donnee de la classe Data_base
	"""

	def __init__(self, mode='build', DB_file=None, doc_files=None, trace=False):
		self.mode = mode
		self.DB_file = DB_file
		self.doc_list = []
		self.trace = trace
		self.requete= []
		self.DB = Data_Base()
		self.stemmer=FrenchStemmer()
		self.requeteFin=[]
		self.idDoc2tfIdf={}

		if mode == 'build' :
			# build the database, then dump it to DB_file
			print 'Building Data Base...'
			self.build_DB(doc_files)
			print 'Building completed'
		elif mode == 'search' :
			# load the database
			self.load_DB()
		self.word2nbOccDsDB={}
		
	def build_DB(self, doc_files):
		"""
			rempli seld.DB avec les documents de self.doc_files
		"""
		compteur=0
		doc_name=doc_files+'doc_'+str(compteur)+'.txt'
		while os.path.exists(doc_name):
		  doc=Doc(doc_name)
		  self.DB.add_doc(doc)
		  compteur+=1
		  doc_name=doc_files+'doc_'+str(compteur)+'.txt'
		print "Number of documents in the Data Base: ", self.DB.nb_doc_total
		#print self.DB.id2nbword
		self.dump_DB()

	def load_DB(self):
		"""
			charge le contenu du fichier self.DB_file dans self.DB
		"""
		print 'Loadind Data Base...'
		stream = open(self.DB_file)
		self.DB = cPickle.load(stream)
		stream.close()
		print "Number of documents in the Data Base: ", self.DB.nb_doc_total
		print 'Loading completed'
		return

	def dump_DB(self):
		"""
			dump le contenu de self.DB dans le fichier self.DB_file
		"""
		print 'Dumping Data Base...'
		p=cPickle.Pickler(open(self.DB_file, 'wb'))
		p.fast=True
		p.dump(self.DB)
		print 'Dumping completed'
		#stream.close()
		#return 
	
	def parse_requete(self, requete):
		"""
				parse la requete introduite par l'utilisateur et produit une liste de tokens
			"""
		req_list= re.findall( '\w+', requete)
		for word in req_list :
			word = self.stemmer.stem(word.decode('utf-8'))
			self.requete.append(word)
			self.requeteFin.append(word)
		
	def fuse_lst_rec(self,title_lst,title_head,first_lst,first_head,body_lst,body_head,acc):
		if acc == [] :
			acc.append(-1)
		m = max(title_head,first_head,body_head)
		title_head_aux = title_head
		first_head_aux = first_head
		body_head_aux = body_head		
		if m == -1 :
			acc.reverse()
			a=acc.pop()
			
			return acc
		else:
		  if m == title_head_aux :
			  if title_lst != [] :
				  title_head_aux = title_lst.pop()
			  else :
				  title_head_aux = -1
		  elif m == first_head_aux :
			  if first_lst != [] :
				  first_head_aux = first_lst.pop()
			  else :
				  first_head_aux = -1
		  elif m == body_head_aux :
			  if body_lst != [] :
				  body_head_aux = body_lst.pop()
			  else :
				  body_head_aux = -1
		  h = acc.pop()
		  if h != m :
			  acc.append(h)
			  acc.append(m)
		  else :
			  acc.append(h)
		  return self.fuse_lst_rec(title_lst,title_head_aux,first_lst,first_head_aux,body_lst,body_head_aux,acc)
		
	def merge_dif_rec(self,lst1,head1,lst2,head2,acc):
		if acc == [] :
			acc.append(-1)
		head1_aux = head1
		head2_aux = head2
		if head1_aux == head2_aux :
			acc.append(head1_aux)
			if lst1 == [] or lst2 == [] :
				acc.reverse()
				acc.pop()
				return acc				
			else :
				head1_aux = lst1.pop()
				head2_aux = lst2.pop()
		elif head1_aux > head2_aux :
			if lst1 == [] :
				acc.reverse()
				acc.pop()
				return acc
			else :
				head1_aux = lst1.pop()
		else :
			if lst2 == [] :
				acc.reverse()
				acc.pop()
				return acc
			else :
				head2_aux = lst2.pop()
		return self.merge_dif_rec(lst1,head1_aux,lst2,head2_aux,acc)
					
	def search_bool_word(self,word):
		title_lst = []
		title_head = -1
		first_lst = []
		first_head = -1
		body_lst = []
		body_head = -1
		for doc_id in self.DB.word2Word_struct[word].title :
			#print "title" , str(doc_id.doc_id), str(self.DB.id2doc[doc_id.doc_id].doc_file)
			title_lst.append(doc_id.doc_id)
		for doc_id in self.DB.word2Word_struct[word].first :
			#print "first" , str(doc_id.doc_id), str(self.DB.id2doc[doc_id.doc_id].doc_file)
			first_lst.append(doc_id.doc_id)
		for doc_id in self.DB.word2Word_struct[word].body :
			#print "body" , str(doc_id.doc_id), str(self.DB.id2doc[doc_id.doc_id].doc_file)
			body_lst.append(doc_id.doc_id)
		if title_lst != [] :
			title_head = title_lst.pop()
		if first_lst != [] :
			first_head = first_lst.pop()
		if body_lst != [] :
			body_head = body_lst.pop()
		result=self.fuse_lst_rec(title_lst,title_head,first_lst,first_head,body_lst,body_head,[])
		self.word2nbOccDsDB[word]=len(result)
		return result
		
	def search_bool_req(self):
		if self.requete == [] :
			return []
		word0 = self.requete.pop()
		lst = self.search_bool_word(word0)
		for word in self.requete :
			if lst == [] :
				return []
			lst_aux = self.search_bool_word(word)
		
			if lst_aux == [] :
				return []
			head_lst = lst.pop()
			head_lst_aux = lst_aux.pop()
			lst = self.merge_dif_rec(lst,head_lst,lst_aux,head_lst_aux,[])
		return lst

	def tf_idf(self, doc_id):  # compute the query's TF-IDF score for each doc
		solution= 0
		doc=self.DB.id2doc[doc_id]
		for word in self.requeteFin:
			word_in_title=0
			word_in_first=0
			word_in_body=0
			total_noWords_in_doc = float(doc.nb_word)
			if word in doc.word2pos_list_title:
			  word_in_title=len(doc.word2pos_list_title[word])
			if word in doc.word2pos_list_first:
			  word_in_first=len(doc.word2pos_list_first[word])
			if word in doc.word2pos_list_body:
			  word_in_body=len(doc.word2pos_list_body[word])
			word_in_doc=float(word_in_body+word_in_first+word_in_title)
			no_docs=float(self.DB.nb_doc_total)
			no_docs_with_word=self.word2nbOccDsDB[word]
			solution +=float(word_in_doc/total_noWords_in_doc)*math.log1p(no_docs/no_docs_with_word)
		return solution
	      
	def tf_idf_score(self, listDoc_id):
	  for doc_id in listDoc_id:
	    self.idDoc2tfIdf[doc_id]=self.tf_idf(doc_id)
	
	def search_rank_req(self, requete, nbResMax):
		self.requete=[]
		self.requeteFin=[]
		self.parse_requete(requete)
		docsTrouves=self.search_bool_req()
		self.tf_idf_score(docsTrouves)
		self.idDoc2tfIdf=OrderedDict(sorted(self.idDoc2tfIdf.items(), key=lambda t: t[1], reverse=True))
		
		keys=self.idDoc2tfIdf.keys()[:nbResMax]
		if len(keys)<1:
		  print 'Nothing found \n'
		i=1
		for doc in keys:
		  print str(i)+'. '+self.id2docTitle(doc)+'File: '+self.id2fileName(doc)
		  i+=1
		return keys

	def id2fileName(self, docId):
	  return str(self.DB.id2doc[docId].doc_file)
	
	def id2docTitle(self,docId):
	  return str(self.DB.id2doc[docId].full_title)
	
	def reset(self):
	  self.requete=[]
	  self.requeteFin=[]
	  self.idDoc2tfIdf={}
Example #46
	def lemmatize(self, word):
		stemmer = FrenchStemmer()
		return stemmer.stem(word)
Example #47
def stemm(word):
	stemmer = FrenchStemmer()
	return stemmer.stem(word)
Example #48
class Search_engine:
	"""
		--Moteur de recherche--

		DB_file = fichier contenant la base de donnee
			si mode = build
				la base de donnee construite sera dumpee sur DB_file
			si mode = search
				la base de donnee sera recupere depuis DB_file

		doc_files = liste de documents bruts a integrer a la base de donnee

		DB = base de donnee de la classe Data_base
	"""

	def __init__(self, mode='build', DB_file=None, doc_files=None, trace=False):
		self.mode = mode
		self.DB_file = DB_file
		self.doc_list = []
		doc_to_read=[]
		for root, dirs, files in os.walk(doc_files, topdown=False):
			for file_name in files: 
				doc_to_read.append(os.path.join(root, file_name.encode('utf-8')))
		for doc_file in doc_to_read :
			doc = Doc(doc_file)
			self.doc_list.append(doc)
		self.trace = trace
		self.requete= []
		self.DB = Data_Base()
		self.stemmer=FrenchStemmer()

		if mode == 'build' :
			# build the database, then dump it to DB_file
			print 'Built Data Base...'
			self.build_DB()
			#print self.DB
		elif mode == 'search' :
			# load the database
			self.load_DB()
		print self.DB.word2Word_struct
		
	def build_DB(self):
		"""
			rempli seld.DB avec les documents de self.doc_files
		"""
		#TODO
		for doc in self.doc_list:
				self.DB.add_doc(doc)
		print self.DB.nb_doc_total
		#print self.DB.id2nbword
		self.dump_DB()

	def load_DB(self):
		"""
			charge le contenu du fichier self.DB_file dans self.DB
		"""
		stream = open(self.DB_file)
		self.DB = pickle.load(stream)
		stream.close()
		return

	def dump_DB(self):
		"""
			dump le contenu de self.DB dans le fichier self.DB_file
		"""
		print 'Dump data base....'
		stream = open(self.DB_file, 'w')
		pickle.dump(self.DB, stream)
		stream.close()
		#return 
	
	def parse_requete(self, requete):
		"""
				parse la requete introduite par l'utilisateur et produit une liste de tokens
			"""
		req_list= re.findall( '\w+', requete)
		for word in req_list :
			#print 'avant', word
			word = self.stemmer.stem(word.decode('utf-8'))
			self.requete.append(word)
			#print 'apres', word
		#print "requete (parse) :"
		#for word in self.requete :
			#print word
		#return 
		
	def fuse_lst_rec(self,title_lst,title_head,first_lst,first_head,body_lst,body_head,acc):
		if acc == [] :
			acc.append(-1)
		m = max(title_head,first_head,body_head)
		title_head_aux = title_head
		first_head_aux = first_head
		body_head_aux = body_head		
		if m == -1 :
			acc.reverse()
			acc.pop()
			return acc
		if m == title_head_aux :
			if title_lst != [] :
				title_head_aux = title_lst.pop()
			else :
				title_head_aux = -1
		if m == first_head_aux :
			if first_lst != [] :
				first_head_aux = first_lst.pop()
			else :
				first_head_aux = -1
		if m == body_head_aux :
			if body_lst != [] :
				body_head_aux = body_lst.pop()
			else :
				body_head_aux = -1
		h = acc.pop()
		if h != m :
			acc.append(h)
			acc.append(m)
		else :
			acc.append(h)
		return self.fuse_lst_rec(title_lst,title_head_aux,first_lst,first_head_aux,body_lst,body_head_aux,acc)
		
	def merge_dif_rec(self,lst1,head1,lst2,head2,acc):
		if acc == [] :
			acc.append(-1)
		head1_aux = head1
		head2_aux = head2
		if head1_aux == head2_aux :
			acc.append(head1_aux)
			if lst1 == [] or lst2 == [] :
				acc.reverse()
				acc.pop()
				return acc				
			else :
				head1_aux = lst1.pop()
				head2_aux = lst2.pop()
		if head1_aux > head2_aux :
			if lst1 == [] :
				acc.reverse()
				acc.pop()
				return acc
			else :
				head1_aux = lst1.pop()
		else :
			if lst2 == [] :
				acc.reverse()
				acc.pop()
				return acc
			else :
				head2_aux = lst2.pop()
		return self.merge_dif_rec(lst1,head1_aux,lst2,head2_aux,acc)
					
	def search_bool_word(self,word):
		title_lst = []
		title_head = -1
		first_lst = []
		first_head = -1
		body_lst = []
		body_head = -1
		print "searching ", word
		if word in self.DB.word2Word_struct:
		  print "YES"
		  print self.DB.word2Word_struct[word].body
		#word=self.stemmer.stem(word.decode('utf-8'))
		for doc_id in self.DB.word2Word_struct[word].title :
			print "title" , str(doc_id.doc_id)
			title_lst.append(doc_id.doc_id)
		for doc_id in self.DB.word2Word_struct[word].first :
			print "first" , str(doc_id.doc_id)
			first_lst.append(doc_id.doc_id)
		for doc_id in self.DB.word2Word_struct[word].body :
			print "body" , str(doc_id.doc_id)
			body_lst.append(doc_id.doc_id)
		if title_lst != [] :
			title_head = title_lst.pop()
		if first_lst != [] :
			first_head = first_lst.pop()
		if body_lst != [] :
			body_head = body_lst.pop()
		return self.fuse_lst_rec(title_lst,title_head,first_lst,first_head,body_lst,body_head,[])
		
	def search_bool_req(self):
		#print "requete (search) :"
		#for word in self.requete :
			###print word
		if self.requete == [] :
			return []
		#TODO add a function to sort the words by increasing number of documents
		word0 = self.requete.pop()
		#print "word (search) :",word0
		lst = self.search_bool_word(word0)
		for word in self.requete :
			#print "word (search) :",word
			# word=self.stemmer.stem(word.decode('utf-8'))
			if lst == [] :
				return []
			lst_aux = self.search_bool_word(word)
			if lst_aux == [] :
				return []
			head_lst = lst.pop()
			head_lst_aux = lst_aux.pop()
			lst = self.merge_dif_rec(lst,head_lst,lst_aux,head_lst_aux,[])
		#print lst
		return lst
		
	def search_rank_req(self):
		#TODO
		return []
Example #49
 def stem_words(self, words):
     stemmer = FrenchStemmer()
     stemmed_words = []        
     for word in words:
         stemmed_words.append(stemmer.stem(word))
     return stemmed_words