def generate(self, edgeCount, tfidf=False, window_size=0, degree=False, closeness=False, groups=False):
    """Build a word co-occurrence graph from the smoking-records XML corpus.

    Parses ``./data/smokingRecords.xml``, extracts the "HISTORY OF PRESENT
    ILLNESS" feature, counts the most frequent 2-grams, and returns the
    resulting graph in node-link JSON form.

    Parameters
    ----------
    edgeCount : int
        Number of top-weighted 2-gram edges to keep.
    tfidf : bool
        Use TF-IDF weighting instead of raw counts.
    window_size : int
        0 → adjacent-bigram tokenization; >0 → custom sliding-window analyser.
    degree, closeness : bool
        Attach degree / closeness centrality as node attributes.
    groups : bool
        Attach Louvain community partition as a ``group`` node attribute.

    Returns
    -------
    dict
        node-link data payload (``json_graph.node_link_data``).
    """
    parser = XMLDataframeParser()
    text = parser.getText("./data/smokingRecords.xml")
    parser.addFeatureFromText(text, "HISTORY OF PRESENT ILLNESS :", "", True, True, "illness")
    df = parser.getDataframe()
    df_xml = parser.removeEmptyEntries(df, "illness")
    normalizer = Normalizer()

    # The tfidf flag only selects the vectorizer class; window_size only
    # selects tokenization strategy and corpus preprocessing. Selecting each
    # independently replaces the previous four duplicated branches.
    vectorizer_cls = TfidfVectorizer if tfidf else CountVectorizer
    if window_size == 0:
        # Adjacent bigrams: normalization happens inside the tokenizer.
        vectorizer = vectorizer_cls(
            tokenizer=lambda text: normalizer.normalize(text, True, False),
            ngram_range=(2, 2),
        )
        corpus = df_xml.illness
    else:
        # Sliding-window bigrams: corpus is normalized up front, the custom
        # analyser pairs words up to `window_size` apart.
        vectorizer = vectorizer_cls(
            analyzer=lambda text: self.custom_analyser(text, 2, int(window_size)),
        )
        corpus = normalizer.normalizeArray(df_xml.illness, True, False)
    mostFreq2Grams = self.get_first_n_words(vectorizer, corpus, edgeCount)

    df_graph = self.create_dataframe(mostFreq2Grams)
    GF = nx.from_pandas_edgelist(df_graph, 'Node1', 'Node2', ["Weight"])

    if degree:
        # calculate degree centrality
        nx.set_node_attributes(GF, nx.degree_centrality(GF), "degree_centrality")
    if closeness:
        # calculate closeness centrality
        nx.set_node_attributes(GF, nx.closeness_centrality(GF), "closeness_centrality")
    if groups:
        # calculate partitions (Louvain)
        nx.set_node_attributes(GF, community.best_partition(GF), "group")

    return json_graph.node_link_data(GF)