Пример #1
0
    def generate(self, edgeCount, tfidf=False, window_size=0, degree=False, closeness=False, groups=False):
        """Build a word-pair graph from the "illness" text and return it as node-link data.

        The text following "HISTORY OF PRESENT ILLNESS :" is extracted from
        ``./data/smokingRecords.xml`` into an ``illness`` column, vectorized
        into word pairs, and the pairs returned by ``self.get_first_n_words``
        become the weighted edges of a graph.

        Args:
            edgeCount: Forwarded to ``self.get_first_n_words``; presumably the
                number of top-ranked word pairs (edges) to keep -- confirm
                against that helper.
            tfidf: If true, rank pairs with ``TfidfVectorizer``; otherwise
                with ``CountVectorizer``.
            window_size: ``0`` selects adjacent-token bigrams
                (``ngram_range=(2, 2)``). Any other value is forwarded to
                ``self.custom_analyser``. Accepts anything ``int()`` accepts,
                so the string ``"0"`` is treated like ``0``.
            degree: If true, store ``nx.degree_centrality`` on each node as
                ``"degree_centrality"``.
            closeness: If true, store ``nx.closeness_centrality`` on each node
                as ``"closeness_centrality"``.
            groups: If true, store the ``community.best_partition`` result on
                each node as ``"group"``.

        Returns:
            The result of ``json_graph.node_link_data`` for the built graph.

        Raises:
            ValueError, TypeError: If ``window_size`` cannot be converted
                with ``int()``.
        """
        # Coerce once, up front. Previously the raw value was compared with 0
        # but only cast to int inside the windowed branch, so a string "0"
        # silently took the windowed path with a window of 0.
        window = int(window_size)

        parser = XMLDataframeParser()
        raw_text = parser.getText("./data/smokingRecords.xml")
        parser.addFeatureFromText(raw_text, "HISTORY OF PRESENT ILLNESS :", "", True, True, "illness")
        df_xml = parser.removeEmptyEntries(parser.getDataframe(), "illness")
        normalizer = Normalizer()

        # Both vectorizers take the same tokenizer/analyzer/ngram_range
        # keywords, so only the class differs between the two weightings.
        vectorizer_cls = TfidfVectorizer if tfidf else CountVectorizer
        if window == 0:
            # The vectorizer builds bigrams itself from normalized tokens.
            vectorizer = vectorizer_cls(
                tokenizer=lambda doc: normalizer.normalize(doc, True, False),
                ngram_range=(2, 2),
            )
            documents = df_xml.illness
        else:
            # A callable analyzer replaces the vectorizer's own preprocessing
            # and tokenization, so the documents are normalized beforehand.
            vectorizer = vectorizer_cls(
                analyzer=lambda doc: self.custom_analyser(doc, 2, window),
            )
            documents = normalizer.normalizeArray(df_xml.illness, True, False)
        mostFreq2Grams = self.get_first_n_words(vectorizer, documents, edgeCount)

        df_graph = self.create_dataframe(mostFreq2Grams)
        GF = nx.from_pandas_edgelist(df_graph, 'Node1', 'Node2', ["Weight"])

        if degree:
            # Attach degree centrality to every node.
            nx.set_node_attributes(GF, nx.degree_centrality(GF), "degree_centrality")

        if closeness:
            # Attach closeness centrality to every node.
            nx.set_node_attributes(GF, nx.closeness_centrality(GF), "closeness_centrality")

        if groups:
            # Attach the partition (community) id to every node.
            nx.set_node_attributes(GF, community.best_partition(GF), "group")

        return json_graph.node_link_data(GF)