def phase2(self, basic_events):
    print('Phase 2...')

    # create the event graph (directed) and the redundancy graph (undirected)
    self.event_graph = nx.DiGraph(name='Event graph')
    self.redundancy_graph = nx.Graph(name='Redundancy graph')
    refined_events = []
    for basic_event in basic_events:
        main_word = basic_event[2]
        candidate_words = self.corpus.cooccurring_words(basic_event, self.rel_words_per_event)
        main_word_freq = self.corpus.global_freq[self.corpus.vocabulary[main_word], :].toarray()
        main_word_freq = main_word_freq[0, :]
        related_words = []

        # identify candidate words based on co-occurrence
        if candidate_words is not None:
            for candidate_word in candidate_words:
                candidate_word_freq = self.corpus.global_freq[self.corpus.vocabulary[candidate_word], :].toarray()
                candidate_word_freq = candidate_word_freq[0, :]

                # compute correlation and filter according to theta
                weight = (st.erdem_correlation(main_word_freq, candidate_word_freq) + 1) / 2
                if weight >= self.theta:
                    related_words.append((candidate_word, weight))
            # if len(related_words) > 1:  # removed, along with one level of
            # indentation: the lines below used to sit inside the
            # "if candidate_words is not None:" branch
        else:
            print("no related words")

        refined_event = (basic_event[0], basic_event[1], main_word, related_words, basic_event[3])
        # check if this event is distinct from those already stored in the event graph
        if self.update_graphs(refined_event):
            refined_events.append(refined_event)
        else:
            print("Different main word but same related words")

    # merge redundant events and save the result
    self.events = self.merge_redundant_events(refined_events)
    return self.events
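# A minimal sketch of what st.erdem_correlation is assumed to compute: the
# first-difference temporal correlation of Erdem et al., which measures how
# similarly two frequency series move between consecutive time slices and
# yields a coefficient in [-1, 1]. The name erdem_correlation_sketch and the
# zero-variance guard are illustrative assumptions, not the source's
# implementation; phase2 rescales the coefficient to [0, 1] via
# (coefficient + 1) / 2 before filtering against theta.
import math

import numpy as np


def erdem_correlation_sketch(series_1, series_2):
    # first differences capture the slice-to-slice movement of each series
    diff_1 = np.diff(np.asarray(series_1, dtype=float))
    diff_2 = np.diff(np.asarray(series_2, dtype=float))
    a_12 = float(np.dot(diff_1, diff_2))
    a_1 = float(np.dot(diff_1, diff_1))
    a_2 = float(np.dot(diff_2, diff_2))
    if a_1 == 0.0 or a_2 == 0.0:
        return 0.0  # a flat series carries no co-movement information
    return a_12 / math.sqrt(a_1 * a_2)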
def phase2(self, basic_events):
    print('Phase 2...')

    # sort the events detected during phase 1 according to their magnitude of impact
    basic_events.sort(key=lambda tup: tup[0], reverse=True)

    # create the event graph (directed) and the redundancy graph (undirected)
    self.event_graph = nx.DiGraph(name='Event graph')
    self.redundancy_graph = nx.Graph(name='Redundancy graph')
    i = 0
    unique_events = 0
    refined_events = []

    # phase 2 goes on until the top k (distinct) events have been identified
    while unique_events < self.k and i < len(basic_events):
        basic_event = basic_events[i]
        main_word = basic_event[2]
        candidate_words = self.corpus.cooccurring_words(basic_event, self.p)
        main_word_freq = self.corpus.global_freq[self.corpus.vocabulary[main_word], :].toarray()
        main_word_freq = main_word_freq[0, :]
        related_words = []

        # identify candidate words based on co-occurrence
        if candidate_words is not None:
            for candidate_word in candidate_words:
                candidate_word_freq = self.corpus.global_freq[self.corpus.vocabulary[candidate_word], :].toarray()
                candidate_word_freq = candidate_word_freq[0, :]

                # compute correlation and filter according to theta
                weight = (st.erdem_correlation(main_word_freq, candidate_word_freq) + 1) / 2
                if weight >= self.theta:
                    related_words.append((candidate_word, weight))
            if len(related_words) > 1:
                refined_event = (basic_event[0], basic_event[1], main_word, related_words, basic_event[3])
                # check if this event is distinct from those already stored in the event graph
                if self.update_graphs(refined_event):
                    refined_events.append(refined_event)
                    unique_events += 1
        i += 1

    # merge redundant events and save the result
    self.events = self.merge_redundant_events(refined_events)
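# A hypothetical end-to-end call, assuming a detector object that exposes the
# two phases. The name run_detection, the phase1 entry point, and the event
# tuple layout are illustrative assumptions; only phase2 above comes from the
# source.
def run_detection(detector):
    basic_events = detector.phase1()  # hypothetical phase-1 entry point
    detector.phase2(basic_events)     # refine, deduplicate, and store the top events
    for event in detector.events:
        # assumed layout mirrors refined_event:
        # (magnitude, interval, main word, related words, anomaly series)
        print(event[2], [word for word, weight in event[3]])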