Example #1
    def preprocessing(self):
        self.df = pd.read_csv('static/models/resampled_comments_1.csv')
        self.comments = self.df[['comment', 'rating', 'sentiment']].copy()
        self.comments['comment'] = self.comments['comment'].map(
            lambda x: x.lower())

        tokenizer = RegexpTokenizer(r'''\w'|\w+|[^\w\s]''')
        token = self.comments.apply(
            lambda row: tokenizer.tokenize(row['comment']), axis=1)

        stop_words = set(stopwords.words('french'))
        stop_token = token.apply(
            lambda x: [item for item in x if item not in stop_words])

        stemmer = SnowballStemmer(language='french')
        stemm = stop_token.apply(lambda x: [stemmer.stem(y) for y in x])

        lemmatizer = FrenchLefffLemmatizer()
        lemm = stemm.apply(lambda x: [lemmatizer.lemmatize(y) for y in x])

        lemm = lemm.apply(' '.join)

        self.comments['lemmatiser_com'] = lemm
        data = self.comments[['comment', 'lemmatiser_com', 'sentiment']]

        self.df = pd.DataFrame(data)
        return self.df
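For reference, a minimal sketch of the imports the snippet above relies on (the FrenchLefffLemmatizer path follows the french-lefff-lemmatizer package; treat the exact module paths as an assumption):

import pandas as pd
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer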
Example #2
def lemma(liste):
    lemmatizer = FrenchLefffLemmatizer()
    listepleine = []
    for u in liste:
        v = lemmatizer.lemmatize(u)
        listepleine.append(v)
    return listepleine
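A quick usage sketch for the helper above (assumes the lemmatizer's default POS, which is the noun setting, is acceptable):

tokens = ["chats", "maisons", "mangent"]
print(lemma(tokens))  # lemmatizes each token with the lemmatizer's default POS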
Example #3
class FrenchLemmatizer(object):
    name = 'lefff'

    SPACY_WORDNET_DIC = {'ADJ': 'a', 'ADV': 'r', 'NOUN': 'n', 'VERB': 'v'}

    def __init__(self):
        # register your new attribute token._.lefff_lemma
        Token.set_extension('lefff_lemma', default=None)

        self.lemmatizer = FrenchLefffLemmatizer()

    def __call__(self, doc):
        for token in doc:
            wn_pos = self.SPACY_WORDNET_DIC.get(token.pos_, token.pos_)
            lemma = self.lemmatizer.lemmatize(token.text, wn_pos)

            # TODO: return only ONE lemma
            if not isinstance(lemma, list):
                token._.lefff_lemma = lemma
            elif len(lemma) == 1:
                token._.lefff_lemma = lemma[0][0]
            else:
                # not lemmatized word or None or empty?
                token._.lefff_lemma = ''
        return doc
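A minimal usage sketch for the pipeline component above, assuming spaCy 2.x (spaCy 3 registers components differently) and an installed French model such as fr_core_news_sm:

import spacy

nlp = spacy.load('fr_core_news_sm')
# run after the tagger so token.pos_ is populated before lemmatization
nlp.add_pipe(FrenchLemmatizer(), after='tagger')

doc = nlp("Les chats mangeaient les souris.")
print([(token.text, token._.lefff_lemma) for token in doc])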
Example #4
    def lemmatize_verbs(self, words):
        """Lemmatize verbs in list of tokenized words"""
        if self._lang == 'fr':
            lemmatizer = FrenchLefffLemmatizer()
        else:
            lemmatizer = WordNetLemmatizer()
        lemmas = []
        for word in words:
            lemma = lemmatizer.lemmatize(word, pos='v')
            lemmas.append(lemma)
        return lemmas
Example #5
def simpleFilter(sentence):
    filtered_sent = []
    
    lemmatizer = None
    if targetLang == "french":
        lemmatizer = FrenchLefffLemmatizer()
    else:
        lemmatizer = WordNetLemmatizer()
    
    
    stop_words = set(stopwords.words(targetLang))
    words = word_tokenize(sentence)

    for w in words:
        if w not in stop_words:
            filtered_sent.append(lemmatizer.lemmatize(w))

    return filtered_sent
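A minimal usage sketch (assumes targetLang is a module-level variable and that the NLTK punkt and stopwords corpora have been downloaded):

import nltk
nltk.download('punkt')
nltk.download('stopwords')

targetLang = "french"
print(simpleFilter("Les chats mangeaient des souris dans le jardin."))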
Example #6
        def process_input(self,
                          input_description,
                          stem=False,
                          undesired_words=[]):
            '''
            Description:

                Apply the processing steps to an input text.

            Parameters:

                - input_description (String): the text to process.
                - stem (Boolean): whether the model should use stemmed descriptions.
                - undesired_words (List[String]): additional words to filter out during processing.
            '''

            tokenizer_ = RegexpTokenizer(r'[^\d\W]+')
            stemmer_ = FrenchStemmer()
            lemmatizer_ = FrenchLefffLemmatizer()
            stop_words_ = stopwords.words('french')

            tokens = [t.lower() for t in tokenizer_.tokenize(input_description)]
            tokens = [
                t for t in tokens
                if t not in stop_words_ and len(t) >= 3 and t not in undesired_words
            ]
            tokens = [unidecode(lemmatizer_.lemmatize(t)) for t in tokens]

            if stem:
                return ' '.join(stemmer_.stem(t) for t in tokens)

            return ' '.join(tokens)
Example #7
class ClusteringArticles(object):
    def __init__(self, articles):
        self.articles = articles
        self.stemmer = FrenchStemmer()
        self.lemmatizer = FrenchLefffLemmatizer()
        self.liste_french = [
            "demain",
            "iii",
            "ii",
            "reuters",
            "lundi",
            "mardi",
            "mercredi",
            "jeudi",
            "vendredi",
            "samedi",
            "dimanche",
            #                            "janvier", "février", "mars", "avril", "mai", "juin", "juillet", "août", "septembre", "octobre", "novembre", "décembre",
            "fin",
            "afp",
            "déjà",
            "ok",
            "ca",
            "cas",
            "a",
            "abord",
            "absolument",
            "afin",
            "ah",
            "ai",
            "aie",
            "aient",
            "aies",
            "ailleurs",
            "ainsi",
            "ait",
            "allaient",
            "allo",
            "allons",
            "allô",
            "alors",
            "anterieur",
            "anterieure",
            "anterieures",
            "apres",
            "après",
            "as",
            "assez",
            "attendu",
            "au",
            "aucun",
            "aucune",
            "aucuns",
            "aujourd",
            "aujourd'hui",
            "aupres",
            "auquel",
            "aura",
            "aurai",
            "auraient",
            "aurais",
            "aurait",
            "auras",
            "aurez",
            "auriez",
            "aurions",
            "aurons",
            "auront",
            "aussi",
            "autre",
            "autrefois",
            "autrement",
            "autres",
            "autrui",
            "aux",
            "auxquelles",
            "auxquels",
            "avaient",
            "avais",
            "avait",
            "avant",
            "avec",
            "avez",
            "aviez",
            "avions",
            "avoir",
            "avons",
            "ayant",
            "ayez",
            "ayons",
            "b",
            "bah",
            "bas",
            "basee",
            "bat",
            "beau",
            "beaucoup",
            "bien",
            "bigre",
            "bon",
            "boum",
            "bravo",
            "brrr",
            "c",
            "car",
            "ce",
            "ceci",
            "cela",
            "celle",
            "celle-ci",
            "celle-là",
            "celles",
            "celles-ci",
            "celles-là",
            "celui",
            "celui-ci",
            "celui-là",
            "celà",
            "cent",
            "cependant",
            "certain",
            "certaine",
            "certaines",
            "certains",
            "certes",
            "ces",
            "cet",
            "cette",
            "ceux",
            "ceux-ci",
            "ceux-là",
            "chacun",
            "chacune",
            "chaque",
            "cher",
            "chers",
            "chez",
            "chiche",
            "chut",
            "chère",
            "chères",
            "ci",
            "cinq",
            "cinquantaine",
            "cinquante",
            "cinquantième",
            "cinquième",
            "clac",
            "clic",
            "combien",
            "comme",
            "comment",
            "comparable",
            "comparables",
            "compris",
            "concernant",
            "contre",
            "couic",
            "crac",
            "d",
            "da",
            "dans",
            "de",
            "debout",
            "dedans",
            "dehors",
            "deja",
            "delà",
            "depuis",
            "dernier",
            "derniere",
            "derriere",
            "derrière",
            "des",
            "desormais",
            "desquelles",
            "desquels",
            "dessous",
            "dessus",
            "deux",
            "deuxième",
            "deuxièmement",
            "devant",
            "devers",
            "devra",
            "devrait",
            "different",
            "differentes",
            "differents",
            "différent",
            "différente",
            "différentes",
            "différents",
            "dire",
            "directe",
            "directement",
            "dit",
            "dite",
            "dits",
            "divers",
            "diverse",
            "diverses",
            "dix",
            "dix-huit",
            "dix-neuf",
            "dix-sept",
            "dixième",
            "doit",
            "doivent",
            "donc",
            "dont",
            "dos",
            "douze",
            "douzième",
            "dring",
            "droite",
            "du",
            "duquel",
            "durant",
            "dès",
            "début",
            "désormais",
            "e",
            "effet",
            "egale",
            "egalement",
            "egales",
            "eh",
            "elle",
            "elle-même",
            "elles",
            "elles-mêmes",
            "en",
            "encore",
            "enfin",
            "entre",
            "envers",
            "environ",
            "es",
            "essai",
            "est",
            "et",
            "etant",
            "etc",
            "etre",
            "eu",
            "eue",
            "eues",
            "euh",
            "eurent",
            "eus",
            "eusse",
            "eussent",
            "eusses",
            "eussiez",
            "eussions",
            "eut",
            "eux",
            "eux-mêmes",
            "exactement",
            "excepté",
            "extenso",
            "exterieur",
            "eûmes",
            "eût",
            "eûtes",
            "f",
            "fais",
            "faisaient",
            "faisant",
            "fait",
            "faites",
            "façon",
            "feront",
            "fi",
            "flac",
            "floc",
            "fois",
            "font",
            "force",
            "furent",
            "fus",
            "fusse",
            "fussent",
            "fusses",
            "fussiez",
            "fussions",
            "fut",
            "fûmes",
            "fût",
            "fûtes",
            "g",
            "gens",
            "h",
            "ha",
            "haut",
            "hein",
            "hem",
            "hep",
            "hi",
            "ho",
            "holà",
            "hop",
            "hormis",
            "hors",
            "hou",
            "houp",
            "hue",
            "hui",
            "huit",
            "huitième",
            "hum",
            "hurrah",
            "hé",
            "hélas",
            "i",
            "ici",
            "il",
            "ils",
            "importe",
            "j",
            "je",
            "jusqu",
            "jusque",
            "juste",
            "k",
            "l",
            "la",
            "laisser",
            "laquelle",
            "las",
            "le",
            "lequel",
            "les",
            "lesquelles",
            "lesquels",
            "leur",
            "leurs",
            "longtemps",
            "lors",
            "lorsque",
            "lui",
            "lui-meme",
            "lui-même",
            "là",
            "lès",
            "m",
            "ma",
            "maint",
            "maintenant",
            "mais",
            "malgre",
            "malgré",
            "maximale",
            "me",
            "meme",
            "memes",
            "merci",
            "mes",
            "mien",
            "mienne",
            "miennes",
            "miens",
            "mille",
            "mince",
            "mine",
            "minimale",
            "moi",
            "moi-meme",
            "moi-même",
            "moindres",
            "moins",
            "mon",
            "mot",
            "moyennant",
            "multiple",
            "multiples",
            "même",
            "mêmes",
            "n",
            "na",
            "naturel",
            "naturelle",
            "naturelles",
            "ne",
            "neanmoins",
            "necessaire",
            "necessairement",
            "neuf",
            "neuvième",
            "ni",
            "nombreuses",
            "nombreux",
            "nommés",
            "non",
            "nos",
            "notamment",
            "notre",
            "nous",
            "nous-mêmes",
            "nouveau",
            "nouveaux",
            "nul",
            "néanmoins",
            "nôtre",
            "nôtres",
            "o",
            "oh",
            "ohé",
            "ollé",
            "olé",
            "on",
            "ont",
            "onze",
            "onzième",
            "ore",
            "ou",
            "ouf",
            "ouias",
            "oust",
            "ouste",
            "outre",
            "ouvert",
            "ouverte",
            "ouverts",
            "o|",
            "où",
            "p",
            "paf",
            "pan",
            "par",
            "parce",
            "parfois",
            "parle",
            "parlent",
            "parler",
            "parmi",
            "parole",
            "parseme",
            "partant",
            "particulier",
            "particulière",
            "particulièrement",
            "pas",
            "passé",
            "pendant",
            "pense",
            "permet",
            "personne",
            "personnes",
            "peu",
            "peut",
            "peuvent",
            "peux",
            "pff",
            "pfft",
            "pfut",
            "pif",
            "pire",
            "pièce",
            "plein",
            "plouf",
            "plupart",
            "plus",
            "plusieurs",
            "plutôt",
            "possessif",
            "possessifs",
            "possible",
            "possibles",
            "pouah",
            "pour",
            "pourquoi",
            "pourrais",
            "pourrait",
            "pouvait",
            "prealable",
            "precisement",
            "premier",
            "première",
            "premièrement",
            "pres",
            "probable",
            "probante",
            "procedant",
            "proche",
            "près",
            "psitt",
            "pu",
            "puis",
            "puisque",
            "pur",
            "pure",
            "q",
            "qu",
            "quand",
            "quant",
            "quant-à-soi",
            "quanta",
            "quarante",
            "quatorze",
            "quatre",
            "quatre-vingt",
            "quatrième",
            "quatrièmement",
            "que",
            "quel",
            "quelconque",
            "quelle",
            "quelles",
            "quelqu'un",
            "quelque",
            "quelques",
            "quels",
            "qui",
            "quiconque",
            "quinze",
            "quoi",
            "quoique",
            "r",
            "rare",
            "rarement",
            "rares",
            "relative",
            "relativement",
            "remarquable",
            "rend",
            "rendre",
            "restant",
            "reste",
            "restent",
            "restrictif",
            "retour",
            "revoici",
            "revoilà",
            "rien",
            "s",
            "sa",
            "sacrebleu",
            "sait",
            "sans",
            "sapristi",
            "sauf",
            "se",
            "sein",
            "seize",
            "selon",
            "semblable",
            "semblaient",
            "semble",
            "semblent",
            "sent",
            "sept",
            "septième",
            "sera",
            "serai",
            "seraient",
            "serais",
            "serait",
            "seras",
            "serez",
            "seriez",
            "serions",
            "serons",
            "seront",
            "ses",
            "seul",
            "seule",
            "seulement",
            "si",
            "sien",
            "sienne",
            "siennes",
            "siens",
            "sinon",
            "six",
            "sixième",
            "soi",
            "soi-même",
            "soient",
            "sois",
            "soit",
            "soixante",
            "sommes",
            "son",
            "sont",
            "sous",
            "souvent",
            "soyez",
            "soyons",
            "specifique",
            "specifiques",
            "speculatif",
            "stop",
            "strictement",
            "subtiles",
            "suffisant",
            "suffisante",
            "suffit",
            "suis",
            "suit",
            "suivant",
            "suivante",
            "suivantes",
            "suivants",
            "suivre",
            "sujet",
            "superpose",
            "sur",
            "surtout",
            "t",
            "ta",
            "tac",
            "tandis",
            "tant",
            "tardive",
            "te",
            "tel",
            "telle",
            "tellement",
            "telles",
            "tels",
            "tenant",
            "tend",
            "tenir",
            "tente",
            "tes",
            "tic",
            "tien",
            "tienne",
            "tiennes",
            "tiens",
            "toc",
            "toi",
            "toi-même",
            "ton",
            "touchant",
            "toujours",
            "tous",
            "tout",
            "toute",
            "toutefois",
            "toutes",
            "treize",
            "trente",
            "tres",
            "trois",
            "troisième",
            "troisièmement",
            "trop",
            "très",
            "tsoin",
            "tsouin",
            "tu",
            "té",
            "u",
            "un",
            "une",
            "unes",
            "uniformement",
            "unique",
            "uniques",
            "uns",
            "v",
            "va",
            "vais",
            "valeur",
            "vas",
            "vers",
            "via",
            "vif",
            "vifs",
            "vingt",
            "vivat",
            "vive",
            "vives",
            "vlan",
            "voici",
            "voie",
            "voient",
            "voilà",
            "vont",
            "vos",
            "votre",
            "vous",
            "vous-mêmes",
            "vu",
            "vé",
            "vôtre",
            "vôtres",
            "w",
            "x",
            "y",
            "z",
            "zut",
            "à",
            "â",
            "ça",
            "ès",
            "étaient",
            "étais",
            "était",
            "étant",
            "état",
            "étiez",
            "étions",
            "été",
            "étée",
            "étées",
            "étés",
            "êtes",
            "être",
            "ô"
        ]

    def main_article_clustering(self):
        self.clean_articles()
        cluster_words = self.clustering_Tf_Itf()
        #        self.match_general_cluster(cluster_words)
        return self.articles

    def clean_articles(self):
        def clean_articles2(x):
            liste_para = x[0].split("\r\n")
            end = re.sub("'\([^)]*\)", "",
                         str(liste_para[-1]).replace(str(x[1]), "")).strip()
            article = "\r\n".join([x for x in liste_para[:-1] if x != ''] +
                                  [end])
            return article

        self.articles = self.articles.loc[~pd.isnull(self.articles["article"])]
        self.articles = self.articles.loc[~pd.isnull(self.
                                                     articles["restricted"])]
        self.articles = self.articles.loc[
            self.articles["article"].apply(lambda x: len(x)) > 750]
        keep_index = self.articles["article"].apply(
            lambda x: False if "L'essentiel de l'actu " in x else True)
        self.articles = self.articles.loc[keep_index]
        self.articles["article"] = self.articles[["article", "auteur"]].apply(
            lambda x: clean_articles2(x), axis=1)

    def tokenize(self, text):
        text = text.replace("'", " ")
        text = re.sub(r'\S*@\S*\s?', '', text.strip(),
                      flags=re.MULTILINE)  # remove email
        text = re.sub(r'\@\S*\s?', '', text.strip(),
                      flags=re.MULTILINE)  # remove Twitter handles
        text = re.sub(r'http\S+', '', text,
                      flags=re.MULTILINE)  # remove web addresses
        text = re.sub(
            r'\(Crédits :.+\)\r\n', ' ', text,
            flags=re.MULTILINE)  # remove credits from start of article
        text = re.sub(
            r'\r\n.+\r\nVoir les réactions', '', text,
            flags=re.MULTILINE)  # remove the "Voir les réactions" block
        text = text.replace("/ REUTERS", "").replace("/ AFP", "")
        s = text.split("(Reuters) - ", 1)
        if len(s) > 1:
            text = s[1]
        s = text.split(" - ", 1)
        if len(s[0]) < 35:
            text = s[1]
        text = re.sub(r'\r\npar .+\r\n', '', text, flags=re.MULTILINE)
        text = re.sub(r'\r\n\(.+\)', ' ', text, flags=re.MULTILINE)
        text = re.sub(r'\r\nÀ LIRE AUSSI\r\n.+\r\n',
                      ' ',
                      text,
                      flags=re.MULTILINE)
        text = re.sub(r'\r\nLIRE AUSSI\r\n.+\r\n',
                      ' ',
                      text,
                      flags=re.MULTILINE)
        text = re.sub(r'\r\nLIRE AUSSI >>.+\r\n',
                      ' ',
                      text,
                      flags=re.MULTILINE)
        text = re.sub(r'\r\n» LIRE AUSSI -.+\r\n',
                      ' ',
                      text,
                      flags=re.MULTILINE)
        text = re.sub(r'\r\nLE FIGARO. -.+ - ', ' ', text, flags=re.MULTILINE)
        text = re.sub(r'www.\S+', '', text,
                      flags=re.MULTILINE)  # remove web addresses
        text = re.sub(r'\r\n» Vous pouvez également suivre.+.',
                      '',
                      text,
                      flags=re.MULTILINE)
        text = re.sub(r'\r\nLIRE NOTRE DOSSIER COMPLET\r\n.+\r\n',
                      '',
                      text,
                      flags=re.MULTILINE)

        text = text.replace("\r\nLIRE AUSSI :\r\n»", "")
        text = text.replace("(Reuters)",
                            "").replace("Article réservé aux abonnés", " ")
        text = text.translate({ord(ch): None for ch in '0123456789'})
        text = text.translate({
            ord(ch): " "
            for ch in '-•“’!"#$%&()*+,./:;<=>?@[\\]^_`{|}~«»–…‘'
        })  # replace punctuation with spaces
        text = re.sub(r' +', ' ', text,
                      flags=re.MULTILINE)  # collapse repeated spaces
        text = re.sub(r' \b[a-zA-Z]\b ', ' ', text,
                      flags=re.MULTILINE)  # drop single-letter words
        tokens = nltk.word_tokenize(text.lower(), language='french')

        tokens = " ".join([
            self.lemmetizer.lemmatize(self.lemmetizer.lemmatize(word, "v"),
                                      "n") for word in tokens
        ])
        return tokens

    def clustering_Tf_Itf(self, thresh=0.6, nwords=100):
        '''
        Home-made clustering method:
            - keep the nwords most important words per document (after TF-IDF)
            - group articles sharing at least thresh of common weight (% of importance in common between articles)
            - articles left alone in a cluster end up in cluster -1
        '''

        articles = self.articles.copy()

        # =============================================================================
        #         #### 1) cluster articles close in words
        # =============================================================================
        clusters, tfs = self.intersect_cluster(articles,
                                               nwords=70,
                                               thresh=0.55)
        index_cluster = []
        for key, value in clusters.items():
            for index_art in value:
                index_cluster.append([index_art, key])

        index_cluster = pd.DataFrame(index_cluster).sort_values(0)
        articles["article_cluster"] = index_cluster[1].tolist()
        cluster_unique = articles["article_cluster"].value_counts()[
            articles["article_cluster"].value_counts() == 1].index

        idx = articles["article_cluster"].isin(cluster_unique)
        m = metrics.silhouette_score(tfs.toarray()[~idx],
                                     articles.loc[~idx]["article_cluster"],
                                     metric='cosine')
        print("first step clustering {0}, {1}".format(m, len(cluster_unique)))

        # =============================================================================
        #         #### 2) cluster clusters
        # =============================================================================
        articles["article"] = articles["titre"] + " " + articles[
            "titre"] + " " + articles["titre"] + " " + articles["article"]
        article_cluster = {}
        for cluster in articles["article_cluster"].value_counts().sort_index(
        ).index:
            sub_articles = articles.loc[articles["article_cluster"] == cluster,
                                        "article"].tolist()
            a = ""
            for art in sub_articles:
                a += " " + art
            article_cluster[cluster] = a
        article_cluster = pd.DataFrame.from_dict(article_cluster,
                                                 orient="index").sort_index()
        article_cluster.columns = ["article"]

        clusters2, tfs2 = self.intersect_cluster(article_cluster, 70, 0.37)
        articles["cluster"] = 0
        for key, value in clusters2.items():
            value_cluster = article_cluster.iloc[value].index.tolist()
            articles.loc[articles["article_cluster"].isin(value_cluster),
                         "cluster"] = articles.loc[
                             articles["article_cluster"].isin(value_cluster),
                             "article_cluster"].iloc[0]

        # =============================================================================
        #         #### 3) get main words from cluster to merge with past clusters
        # =============================================================================
        article_cluster = {}
        liste_cluster = []
        for cluster in articles["cluster"].value_counts().sort_index().index:
            sub_articles = articles.loc[articles["cluster"] == cluster,
                                        "article"].tolist()
            a = ""
            for art in sub_articles:
                a += " " + art
            liste_cluster.append(cluster)
            article_cluster[cluster] = a
        article_cluster = pd.DataFrame.from_dict(article_cluster,
                                                 orient="index").sort_index()
        article_cluster.columns = ["article"]

        liste_words, article_words, tfs2 = self.weight_words(article_cluster,
                                                             nwords=70)
        cluster_words = {}
        for i, words in enumerate(article_words):
            cluster_words[liste_cluster[i]] = words

        with open(
                os.environ["DIR_PATH"] +
                "/data/continuous_run/clusters/dayly_cluster/{0}.json".format(
                    datetime.now().strftime("%Y-%m-%d")), "w") as f:
            json.dump(cluster_words, f, ensure_ascii=False, indent=2)

        # =============================================================================
        #         ### 4) finish : lonely cluster into -1
        # =============================================================================
        cluster_unique = articles["cluster"].value_counts()[
            articles["cluster"].value_counts() == 1].index
        articles["cluster"] = np.where(
            articles["cluster"].isin(cluster_unique), -1, articles["cluster"])

        idx = articles["cluster"] != -1
        m = metrics.silhouette_score(tfs.toarray()[idx],
                                     articles.loc[idx]["cluster"],
                                     metric='cosine')
        minus = articles["cluster"].value_counts().iloc[0]
        print("clustering {0} , {1}".format(m, minus))

        self.articles["cluster"] = articles["cluster"].tolist()
        self.articles["granular_cluster"] = articles["article_cluster"].tolist(
        )

        return cluster_words

    def match_general_cluster(self, cluster_words, thresh=0.37):

        if not os.path.isfile(
                os.environ["DIR_PATH"] +
                "/data/continuous_run/clusters/general_cluster_words.json"):
            with open(
                    os.environ["DIR_PATH"] +
                    "/data/continuous_run/clusters/general_cluster_words.json",
                    "w") as f:
                json.dump(cluster_words, f, ensure_ascii=False, indent=2)
            return 0

        with open(
                os.environ["DIR_PATH"] +
                "/data/continuous_run/clusters/general_cluster_words.json",
                "r") as read_file:
            general_cluster_words = json.load(read_file)

        max_cluster = max([int(x) for x in general_cluster_words.keys()])

        rematch_cluster = {}
        additionnal_dico = {}
        for new_cluster, new_words in tqdm.tqdm(cluster_words.items()):
            max_score = 0
            for cluster, words in general_cluster_words.items():
                intersect_words = list(
                    set(new_words.keys()).intersection(set(words.keys())))
                score = sum([new_words[x] + words[x]
                             for x in intersect_words]) * 2 / (
                                 sum(new_words.values()) + sum(words.values()))
                if score >= thresh and score > max_score:
                    max_score = score
                    rematch_cluster[new_cluster] = cluster

            ### if below the threshold, create a new cluster
            if max_score < thresh:
                additionnal_dico[str(max_cluster + 1)] = new_words
                rematch_cluster[new_cluster] = str(max_cluster + 1)
                max_cluster += 1

        general_cluster_words.update(additionnal_dico)
        with open(
                os.environ["DIR_PATH"] +
                "/data/continuous_run/clusters/general_cluster_words.json",
                "w") as f:
            json.dump(general_cluster_words, f, ensure_ascii=False, indent=2)

        for key, value in rematch_cluster.items():
            self.articles.loc[self.articles["cluster"] == key,
                              "cluster"] = value

    def intersect_cluster(self, articles, nwords, thresh):

        liste_words, article_words, tfs = self.weight_words(articles, nwords)

        clusters = {}
        index_articles = list(range(len(article_words)))
        cluster = 0
        while len(index_articles) > 1:
            j = index_articles[0]
            clusters[str(cluster)] = []
            for k in list(index_articles):  # iterate over a copy; items are removed inside the loop
                intersect_words = list(
                    set(article_words[k].keys()).intersection(
                        set(article_words[j].keys())))
                score = sum([
                    article_words[j][x] + article_words[k][x]
                    for x in intersect_words
                ]) * 2 / (sum(article_words[j].values()) +
                          sum(article_words[k].values()))
                if score >= thresh:
                    clusters[str(cluster)].append(k)
                    index_articles.remove(k)
            cluster += 1

        if len(index_articles) == 1:
            clusters[str(cluster)] = [index_articles[0]]

        return clusters, tfs

    def sort_coo(self, coo_matrix):
        tuples = zip(coo_matrix.col, coo_matrix.data)
        return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

    def extract_topn_from_vector(self, feature_names, sorted_items, topn=10):
        """get the feature names and tf-idf score of top n items"""

        #use only topn items from vector
        sorted_items = sorted_items[:topn]

        score_vals = []
        feature_vals = []

        # word index and corresponding tf-idf score
        for idx, score in sorted_items:

            #keep track of feature name and its corresponding score
            score_vals.append(round(score, 3))
            feature_vals.append(feature_names[idx])

        #create a tuples of feature,score
        #results = zip(feature_vals,score_vals)
        results = {}
        for idx in range(len(feature_vals)):
            results[feature_vals[idx]] = score_vals[idx]

        return results

    def weight_words(self, articles, nwords=40):

        tfidf = TfidfVectorizer(stop_words=self.liste_french,
                                preprocessor=self.tokenize,
                                min_df=2,
                                max_df=0.35,
                                ngram_range=(1, 2))
        tfs = tfidf.fit_transform(articles["article"].tolist())

        liste_words = []
        article_words = []
        for i, art in enumerate(articles["article"].tolist()):
            sorted_items = self.sort_coo(tfs[i].tocoo())
            article_words.append(
                self.extract_topn_from_vector(tfidf.get_feature_names(),
                                              sorted_items, nwords))
            liste_words += article_words[i].keys()

        return liste_words, article_words, tfs

    def db_scan_on_top_k(self, articles):

        liste_words, article_words, tfs = self.weight_words(articles, 100)
        liste_words = list(set(liste_words))

        index_col_keep = []
        for word in liste_words:
            index_col_keep.append(liste_words.index(word))

        tfs = tfs[:, index_col_keep]
        clusterer = hdbscan.HDBSCAN(min_cluster_size=2)
        cluster_labels = clusterer.fit_predict(tfs.toarray())
        articles["article_cluster"] = cluster_labels

        idx = articles["article_cluster"] != -1
        m = metrics.silhouette_score(tfs.toarray()[idx],
                                     articles.loc[idx]["article_cluster"],
                                     metric='cosine')
        print("KPIS: silhouette {0}, shape {1}, -1 {2}".format(
            m, articles["article_cluster"].value_counts().shape[0],
            articles["article_cluster"].value_counts().iloc[0]))

        return articles, tfs

    def clustering_Tf_ItfV1(self):
        tfidf = TfidfVectorizer(preprocessor=self.tokenize,
                                stop_words=self.liste_french,
                                ngram_range=(1, 2),
                                max_features=2000,
                                max_df=0.25,
                                min_df=3)

        tfs = tfidf.fit_transform(self.articles["article"].tolist())
        # HDBSCAN / -1 = no cluster
        clusterer = hdbscan.HDBSCAN(min_cluster_size=2)
        cluster_labels = clusterer.fit_predict(tfs.toarray())
        self.articles["cluster"] = cluster_labels

        ### get most used words per cluster detected
        self.articles["main_words"] = ""
        for i in self.articles["cluster"].value_counts().iloc[1:].index:
            try:
                keeping_words, d = self.generate_text(tfidf, tfs, i,
                                                      self.articles)
                self.articles.loc[self.articles["cluster"] == i,
                                  "main_words"] = keeping_words
            except Exception:
                self.articles.loc[self.articles["cluster"] == i,
                                  "main_words"] = ""
                pass

        ### silhouette
        idx = self.articles["cluster"] != -1
        m = metrics.silhouette_score(tfs.toarray()[idx],
                                     self.articles.loc[idx]["cluster"],
                                     metric='cosine')
        print("KPIS: silhouette {0}, shape {1}, -1 {2}".format(
            m, self.articles["cluster"].value_counts().shape[0],
            self.articles["cluster"].value_counts().iloc[0]))
Example #8
    def build(self, folder_path, num_cluster):
        start_time = time.perf_counter()
        if not isinstance(self.language_regonizer, FunctionType):
            print("ERROR: language_regonizer not set or wrong type")
            return 0
        if not isinstance(self.sentence_tokenizer, FunctionType):
            print("ERROR: sentence_tokenizer not set")
            return 0
        if not isinstance(self.theme_cluster, FunctionType):
            print("ERROR: theme_cluster not set")
            return 0
        if not isinstance(self.indexer, FunctionType):
            print("ERROR: indexer not set")
            return 0
        if not isinstance(self.phrase_extractor, FunctionType):
            print("ERROR: phrase_extractor not set")
            return 0
        if not isinstance(self.quantizator, FunctionType):
            print("ERROR: quantizator not set")
            return 0
        if not isinstance(self.phrase_quantizator, FunctionType):
            print("ERROR: phrase_quantizator not set")
            return 0

        sentence_index = -1
        sentences_store = []
        inversed_index = {}
        inversed_index_fr = {}
        punctuations = set(string.punctuation)
        porter_stemmer = PorterStemmer()
        lemmatizer = WordNetLemmatizer()
        fr_lemmatizer = FrenchLefffLemmatizer()
        translator = Translator()

        for file_name in os.listdir(folder_path):
            if not file_name.startswith('.'):   # skip hidden files
                doc = read_txt(folder_path + '/' + file_name)
                print("[INFO] reading " + file_name + " to build model")
                # split the doc into paragraphs; assume a single language per paragraph
                paragraphs = doc.split('\n')
                for parag in paragraphs:
                    language = self.language_regonizer(parag)
                    # sentence tokenize
                    lst_sentence = self.sentence_tokenizer(parag, language)
                    # save original sentence to memory
                    sentences_store.append(lst_sentence)

                    for sentence in lst_sentence:
                        sentence_index += 1
                        # case fold
                        sentence = sentence.lower()
                        # extract phrases
                        phrases = self.phrase_extractor(sentence)
                        # remove these phrases from sentence
                        for p in phrases:
                            sentence = sentence.replace(p, '')

                        # remove all punctuation
                        removed_punc = ''.join(s for s in sentence if s not in punctuations)
                        # remove all digits
                        removed_digit = re.sub(r'\d+', '', removed_punc)
                        # tokenize
                        word_tokens = nltk.word_tokenize(removed_digit)
                        # filter stop words
                        removed_stopwords = [t for t in word_tokens if t not in stopwords.words(language)]
                        # stemming or lemmatization
                        for t in removed_stopwords:
                            if language == 'english':
                                # token = porter_stemmer.stem(t)
                                pos_token = pos_word(t)
                                if pos_token != 0:
                                    token = lemmatizer.lemmatize(t, pos=pos_token)
                                    self.indexer(token, sentence_index, inversed_index)
                            elif language == 'french':
                                token = fr_lemmatizer.lemmatize(t)
                                # index the lemmatized word
                                self.indexer(token, sentence_index, inversed_index_fr)
                        # index for phrase
                        for t in phrases:
                            if language == 'english':
                                self.indexer(t, sentence_index, inversed_index)
                            elif language == 'french':
                                self.indexer(t, sentence_index, inversed_index_fr)


        # persist the index
        save_pickle(inversed_index, 'out/index.pickle')
        save_txt(str_of(inversed_index), 'out/index.txt')

        save_pickle(inversed_index_fr, 'out/index_fr.pickle')
        save_txt(str_of(inversed_index_fr), 'out/index_fr.txt')
        for i, content in enumerate(sentences_store):
            sentences_store[i] = {"_id": str(i), "content": content}
            insert("corpus", sentences_store[i])

        # get all tokens
        lst_tokens = list(inversed_index.keys())
        lst_tokens_fr = list(inversed_index_fr.keys())
        # a hashmap to quick match between word and vec
        self.word_vec_map = {}
        # list of vector(list)
        matrix = []
        for token in lst_tokens:
            if ' ' not in token:   # token is a word
                if IF_DEBUG:
                    print("[INFO] " + token + " is a word")

                temp_vector = self.quantizator(token)
                if len(temp_vector) > 0:
                    matrix.append(temp_vector)
                    self.word_vec_map[token] = temp_vector

            else:   # token is a phrase
                if IF_DEBUG:
                    print("[INFO] " + token + "is a phrase")

                temp_vector = self.phrase_quantizator(token, self.quantizator)
                if temp_vector != 0:
                    matrix.append(temp_vector)
                    self.word_vec_map[token] = temp_vector

        for token in lst_tokens_fr:
            if ' ' not in token:
                after_trans = translator.translate(token, dest='en')
                # print(after_trans.text)
                #### translate to English, then quantize the translated token
                temp_vector = self.quantizator(after_trans.text)
                if len(temp_vector) > 0:
                    matrix.append(temp_vector)
                    self.word_vec_map[token] = temp_vector
            else:
                #### translate to English
                after_trans = translator.translate(token, dest='en')
                temp_vector = self.phrase_quantizator(after_trans.text, self.quantizator)
                if temp_vector != 0:
                    matrix.append(temp_vector)
                    self.word_vec_map[token] = temp_vector

        # theme cluster
        self.theme_clustered = self.theme_cluster(num_cluster, matrix, list(self.word_vec_map.keys()), self.word_vec_map)
        elapsed = time.perf_counter() - start_time
        print("========== END ========")
        print(self.theme_clustered['clusters'])
        print("\n")

        print(self.theme_clustered['centers'])
        print('\n')

        print(self.theme_clustered['representative'])
        print('\n')
        print("RUNNING TIME:" + str(elapsed) + " sec" )