def execute(self, data_path, numSeriatedTerms=None):
    """Run the seriation pipeline end to end.

    Builds SaliencyAPI, SimilarityAPI and SeriationAPI objects from
    ``data_path``, calls ``read()`` on the saliency and similarity objects,
    then invokes ``reshape()``, ``compute()`` and finally
    ``self.seriation.write()``, logging each stage at INFO level.

    Args:
        data_path: Passed unchanged to the three API constructors; must not
            be None (checked with ``assert``).
        numSeriatedTerms: Value forwarded to ``compute()``. When None,
            ``ComputeSeriation.DEFAULT_NUM_SERIATED_TERMS`` is used.
    """
    assert data_path is not None
    if numSeriatedTerms is None:
        numSeriatedTerms = ComputeSeriation.DEFAULT_NUM_SERIATED_TERMS

    info = self.logger.info
    separator = '--------------------------------------------------------------------------------'

    info(separator)
    info('Computing term seriation...')
    info(' data_path = %s', data_path)
    info(' number_of_seriated_terms = %d', numSeriatedTerms)

    info('Connecting to data...')
    self.saliency = SaliencyAPI(data_path)
    self.similarity = SimilarityAPI(data_path)
    self.seriation = SeriationAPI(data_path)

    info('Reading data from disk...')
    self.saliency.read()
    self.similarity.read()

    info('Reshaping saliency data...')
    self.reshape()

    info('Computing seriation...')
    self.compute(numSeriatedTerms)

    info('Writing data to disk...')
    self.seriation.write()

    info(separator)
def execute(self, data_path, numSeriatedTerms=None):
    """Drive one complete seriation run and log its progress.

    The stages, in order: construct the Saliency/Similarity/Seriation API
    objects from ``data_path``; ``read()`` the saliency and similarity
    objects; ``reshape()``; ``compute(numSeriatedTerms)``; and
    ``self.seriation.write()``.

    Args:
        data_path: Handed to each API constructor; asserted to be non-None.
        numSeriatedTerms: Forwarded to ``compute()``; falls back to
            ``ComputeSeriation.DEFAULT_NUM_SERIATED_TERMS`` when None.
    """
    assert data_path is not None
    if numSeriatedTerms is None:
        numSeriatedTerms = ComputeSeriation.DEFAULT_NUM_SERIATED_TERMS

    log = self.logger.info
    rule = '--------------------------------------------------------------------------------'

    # Header: announce the task and echo the effective parameters.
    log(rule)
    log('Computing term seriation...')
    log(' data_path = %s', data_path)
    log(' number_of_seriated_terms = %d', numSeriatedTerms)

    # Stage 1: bind the data accessors.
    log('Connecting to data...')
    self.saliency = SaliencyAPI(data_path)
    self.similarity = SimilarityAPI(data_path)
    self.seriation = SeriationAPI(data_path)

    # Stage 2: load the two inputs.
    log('Reading data from disk...')
    self.saliency.read()
    self.similarity.read()

    # Stage 3: turn the saliency records into per-term lookup tables.
    log('Reshaping saliency data...')
    self.reshape()

    # Stage 4: the seriation itself.
    log('Computing seriation...')
    self.compute(numSeriatedTerms)

    # Stage 5: persist the result.
    log('Writing data to disk...')
    self.seriation.write()

    log(rule)
def execute(self, data_path, sliding_window_size=None):
    """Run the similarity pipeline end to end.

    Builds TokensAPI and SimilarityAPI objects from ``data_path``, reads the
    tokens, gathers document-level and sliding-window co-occurrence counts
    plus unigram/bigram counts, derives three G2 tables via ``getG2Stats``,
    combines them and calls ``self.similarity.write()``. Each stage is
    logged at INFO level.

    Args:
        data_path: Passed unchanged to the API constructors; must not be
            None (checked with ``assert``).
        sliding_window_size: Forwarded to
            ``computeSlidingWindowCooccurrence``. When None,
            ``ComputeSimilarity.DEFAULT_SLIDING_WINDOW_SIZE`` is used.
    """
    assert data_path is not None
    if sliding_window_size is None:
        sliding_window_size = ComputeSimilarity.DEFAULT_SLIDING_WINDOW_SIZE

    info = self.logger.info
    separator = '--------------------------------------------------------------------------------'

    info(separator)
    info('Computing term similarity...')
    info(' data_path = %s', data_path)
    info(' sliding_window_size = %d', sliding_window_size)

    info('Connecting to data...')
    self.tokens = TokensAPI(data_path)
    self.similarity = SimilarityAPI(data_path)

    info('Reading data from disk...')
    self.tokens.read()

    info('Computing document co-occurrence...')
    self.computeDocumentCooccurrence()

    info('Computing sliding-window co-occurrence...')
    self.computeSlidingWindowCooccurrence(sliding_window_size)

    info('Counting total number of tokens, unigrams, and bigrams in the corpus...')
    self.computeTokenCounts()

    info('Computing document co-occurrence likelihood...')
    self.similarity.document_g2 = self.getG2Stats(
        self.document_count,
        self.similarity.document_occurrence,
        self.similarity.document_cooccurrence)

    info('Computing sliding-window co-occurrence likelihood...')
    self.similarity.window_g2 = self.getG2Stats(
        self.window_count,
        self.similarity.window_occurrence,
        self.similarity.window_cooccurrence)

    info('Computing collocation likelihood...')
    self.similarity.collocation_g2 = self.getG2Stats(
        self.token_count,
        self.similarity.unigram_counts,
        self.similarity.bigram_counts)

    # Logs its own 'Combining similarity matrices...' line.
    self.combineSimilarityMatrices()

    info('Writing data to disk...')
    self.similarity.write()

    info(separator)
class ComputeSimilarity(object):
    """
    Similarity measures.

    Compute term similarity based on co-occurrence and collocation likelihoods.
    """

    DEFAULT_SLIDING_WINDOW_SIZE = 10
    MAX_FREQ = 100.0

    def __init__(self, logging_level):
        """Configure the 'ComputeSimilarity' logger to write to stderr.

        Args:
            logging_level: Level applied to the logger and its handler(s).
        """
        self.logger = logging.getLogger('ComputeSimilarity')
        self.logger.setLevel(logging_level)
        # logging.getLogger() returns the same logger object for a given
        # name, so unconditionally adding a handler here would emit every
        # message once per instance ever created. Attach a handler only once.
        if not self.logger.handlers:
            self.logger.addHandler(logging.StreamHandler(sys.stderr))
        for handler in self.logger.handlers:
            handler.setLevel(logging_level)

    def execute(self, data_path, sliding_window_size=None):
        """Run the whole similarity pipeline and write the result.

        Args:
            data_path: Passed to the TokensAPI / SimilarityAPI constructors;
                must not be None.
            sliding_window_size: Forwarded to
                ``computeSlidingWindowCooccurrence``; defaults to
                ``DEFAULT_SLIDING_WINDOW_SIZE`` when None.
        """
        assert data_path is not None
        if sliding_window_size is None:
            sliding_window_size = ComputeSimilarity.DEFAULT_SLIDING_WINDOW_SIZE

        self.logger.info('--------------------------------------------------------------------------------')
        self.logger.info('Computing term similarity...')
        self.logger.info(' data_path = %s', data_path)
        self.logger.info(' sliding_window_size = %d', sliding_window_size)

        self.logger.info('Connecting to data...')
        self.tokens = TokensAPI(data_path)
        self.similarity = SimilarityAPI(data_path)

        self.logger.info('Reading data from disk...')
        self.tokens.read()

        self.logger.info('Computing document co-occurrence...')
        self.computeDocumentCooccurrence()

        self.logger.info('Computing sliding-window co-occurrence...')
        self.computeSlidingWindowCooccurrence(sliding_window_size)

        self.logger.info('Counting total number of tokens, unigrams, and bigrams in the corpus...')
        self.computeTokenCounts()

        self.logger.info('Computing document co-occurrence likelihood...')
        self.similarity.document_g2 = self.getG2Stats(
            self.document_count,
            self.similarity.document_occurrence,
            self.similarity.document_cooccurrence)

        self.logger.info('Computing sliding-window co-occurrence likelihood...')
        self.similarity.window_g2 = self.getG2Stats(
            self.window_count,
            self.similarity.window_occurrence,
            self.similarity.window_cooccurrence)

        self.logger.info('Computing collocation likelihood...')
        self.similarity.collocation_g2 = self.getG2Stats(
            self.token_count,
            self.similarity.unigram_counts,
            self.similarity.bigram_counts)

        self.combineSimilarityMatrices()

        self.logger.info('Writing data to disk...')
        self.similarity.write()

        self.logger.info('--------------------------------------------------------------------------------')

    def incrementCount(self, occurrence, key):
        """Add one to ``occurrence[key]``, creating the entry at 1 if absent."""
        occurrence[key] = occurrence.get(key, 0) + 1

    def _tallyTokenSet(self, tokenSet, occurrence, cooccurrence):
        """Count each distinct token once and each unordered pair once.

        Pairs are keyed ``(smaller, larger)``. Sorting the distinct tokens
        and pairing each with the tokens after it yields exactly the pairs
        with ``aToken < bToken`` without scanning the full cross product.
        """
        orderedTokens = sorted(tokenSet)
        for index, aToken in enumerate(orderedTokens):
            self.incrementCount(occurrence, aToken)
            for bToken in orderedTokens[index + 1:]:
                self.incrementCount(cooccurrence, (aToken, bToken))

    def computeDocumentCooccurrence(self):
        """Count, per document, which tokens and token pairs appear.

        Reads ``self.tokens.data`` (docID -> token sequence). Sets
        ``self.document_count`` and stores the tallies on
        ``self.similarity.document_occurrence`` /
        ``self.similarity.document_cooccurrence``.
        """
        document_count = 0
        occurrence = {}
        cooccurrence = {}
        # .items() (rather than iteritems) works on both Python 2 and 3.
        for docID, docTokens in self.tokens.data.items():
            self.logger.debug(' %s (%d tokens)', docID, len(docTokens))
            document_count += 1
            self._tallyTokenSet(frozenset(docTokens), occurrence, cooccurrence)

        self.document_count = document_count
        self.similarity.document_occurrence = occurrence
        self.similarity.document_cooccurrence = cooccurrence

    def computeSlidingWindowCooccurrence(self, sliding_window_size):
        """Count, per sliding window, which tokens and token pairs appear.

        Windows come from ``getSlidingWindowTokens``. Sets
        ``self.window_count`` and stores the tallies on
        ``self.similarity.window_occurrence`` /
        ``self.similarity.window_cooccurrence``.
        """
        window_count = 0
        occurrence = {}
        cooccurrence = {}
        for docID, docTokens in self.tokens.data.items():
            allWindowTokens = self.getSlidingWindowTokens(docTokens, sliding_window_size)
            self.logger.debug(' %s (%d tokens, %d windows)', docID, len(docTokens), len(allWindowTokens))
            for windowTokens in allWindowTokens:
                window_count += 1
                self._tallyTokenSet(frozenset(windowTokens), occurrence, cooccurrence)

        self.window_count = window_count
        self.similarity.window_occurrence = occurrence
        self.similarity.window_cooccurrence = cooccurrence

    def getSlidingWindowTokens(self, tokens, sliding_window_size):
        """Return the list of token windows for one document.

        For every index from ``-sliding_window_size`` up to (but excluding)
        ``len(tokens) + sliding_window_size`` the window is
        ``tokens[index - size : index + size]`` clamped to the sequence
        bounds, so windows near either end are shorter (the first is empty).
        """
        tokenCount = len(tokens)
        allWindows = []
        for index in range(-sliding_window_size, tokenCount + sliding_window_size):
            start = max(0, index - sliding_window_size)
            stop = min(tokenCount, index + sliding_window_size)
            allWindows.append(tokens[start:stop])
        return allWindows

    def computeTokenCounts(self):
        """Count tokens, unigrams and adjacent-token bigrams over all documents.

        Sets ``self.token_count`` and stores the tallies on
        ``self.similarity.unigram_counts`` / ``self.similarity.bigram_counts``.
        Bigrams are ordered ``(previous, current)`` and never span documents.
        """
        token_count = 0
        unigram_counts = {}
        bigram_counts = {}
        for docTokens in self.tokens.data.values():
            token_count += len(docTokens)
            prevToken = None
            for currToken in docTokens:
                self.incrementCount(unigram_counts, currToken)
                if prevToken is not None:
                    self.incrementCount(bigram_counts, (prevToken, currToken))
                prevToken = currToken

        self.token_count = token_count
        self.similarity.unigram_counts = unigram_counts
        self.similarity.bigram_counts = bigram_counts

    def getBinomial(self, B_given_A, any_given_A, B_given_notA, any_given_notA):
        """Return ``2 * (a*ln(a/E1) + b*ln(b/E2))`` for the given counts.

        ``a``/``b`` are the B counts with and without A, ``c``/``d`` the
        corresponding totals, and ``E1``/``E2`` the counts expected if B were
        spread proportionally over ``c`` and ``d``. Terms with a zero
        observed count contribute nothing.

        Returns 0.0 when both totals are zero (no observations at all)
        instead of dividing by zero.
        """
        assert B_given_A >= 0
        assert B_given_notA >= 0
        assert any_given_A >= B_given_A
        assert any_given_notA >= B_given_notA

        a = float(B_given_A)
        b = float(B_given_notA)
        c = float(any_given_A)
        d = float(any_given_notA)
        total = c + d
        if total == 0:
            return 0.0
        E1 = c * (a + b) / total
        E2 = d * (a + b) / total

        g2a = 0
        g2b = 0
        if a > 0:
            g2a = a * math.log(a / E1)
        if b > 0:
            g2b = b * math.log(b / E2)
        return 2 * (g2a + g2b)

    def getG2(self, freq_all, freq_ab, freq_a, freq_b):
        """Translate joint/marginal frequencies into ``getBinomial`` arguments.

        Args:
            freq_all: Total number of observations.
            freq_ab: Observations containing both a and b.
            freq_a: Observations containing a.
            freq_b: Observations containing b.
        """
        assert freq_all >= freq_a
        assert freq_all >= freq_b
        assert freq_a >= freq_ab
        assert freq_b >= freq_ab
        assert freq_all >= 0
        assert freq_ab >= 0
        assert freq_a >= 0
        assert freq_b >= 0

        B_given_A = freq_ab
        B_given_notA = freq_b - freq_ab
        any_given_A = freq_a
        any_given_notA = freq_all - freq_a
        return self.getBinomial(B_given_A, any_given_A, B_given_notA, any_given_notA)

    def getG2Stats(self, max_count, occurrence, cooccurrence):
        """Compute a G2 score for every sufficiently frequent pair.

        Args:
            max_count: Total number of observations (documents, windows or
                tokens, depending on the caller).
            occurrence: token -> count.
            cooccurrence: (firstToken, secondToken) -> joint count.

        Returns:
            dict mapping each pair whose two tokens both exceed one
            occurrence per ``MAX_FREQ`` observations to its ``getG2`` value.
        """
        g2_stats = {}
        if not cooccurrence:
            # Nothing to score; also avoids dividing by a zero max_count.
            return g2_stats

        freq_all = max_count
        # Loop-invariant: rescales counts so that freq_all maps to MAX_FREQ.
        scale = ComputeSimilarity.MAX_FREQ / freq_all
        for key, freq_ab in cooccurrence.items():
            (firstToken, secondToken) = key
            freq_a = occurrence[firstToken]
            freq_b = occurrence[secondToken]
            # NOTE(review): only the frequency filter uses rescaled counts;
            # G2 itself is computed from the raw counts - confirm intended.
            if freq_a * scale > 1.0 and freq_b * scale > 1.0:
                g2_stats[key] = self.getG2(freq_all, freq_ab, freq_a, freq_b)
        return g2_stats

    def combineSimilarityMatrices(self):
        """Sum the three G2 tables into ``self.similarity.combined_g2``.

        Document and window scores are stored under the ordered key
        ``(smaller, larger)`` and apply to both orientations of a pair;
        collocation scores apply only to the exact ``(first, second)``
        orientation. Only strictly positive totals are kept.
        """
        self.logger.info('Combining similarity matrices...')
        document_g2 = self.similarity.document_g2
        window_g2 = self.similarity.window_g2
        collocation_g2 = self.similarity.collocation_g2

        # Every orientation of every pair that has a score in any table.
        keys = set()
        for symmetric_g2 in (document_g2, window_g2):
            for (firstToken, secondToken) in symmetric_g2:
                keys.add((firstToken, secondToken))
                keys.add((secondToken, firstToken))
        keys.update(collocation_g2)

        combined_g2 = {}
        for key in keys:
            (firstToken, secondToken) = key
            if firstToken < secondToken:
                orderedKey = key
            else:
                orderedKey = (secondToken, firstToken)
            score = 0.0
            score += document_g2.get(orderedKey, 0.0)
            score += window_g2.get(orderedKey, 0.0)
            score += collocation_g2.get(key, 0.0)
            if score > 0.0:
                combined_g2[key] = score
        self.similarity.combined_g2 = combined_g2
class ComputeSeriation(object):
    """Seriation algorithm.

    Re-order words to promote the legibility of multi-word phrases and
    reveal the clustering of related terms.

    As output, the algorithm produces a list of seriated terms and its
    'ranking' (i.e., the iteration in which a term was seriated).
    """

    DEFAULT_NUM_SERIATED_TERMS = 100

    def __init__(self, logging_level):
        """Configure the 'ComputeSeriation' logger to write to stderr.

        Args:
            logging_level: Level applied to the logger and its handler(s).
        """
        self.logger = logging.getLogger('ComputeSeriation')
        self.logger.setLevel(logging_level)
        # logging.getLogger() returns the same logger object for a given
        # name; attach a handler only once so repeated instantiation does
        # not duplicate every log line.
        if not self.logger.handlers:
            self.logger.addHandler(logging.StreamHandler(sys.stderr))
        for handler in self.logger.handlers:
            handler.setLevel(logging_level)

    def execute(self, data_path, numSeriatedTerms=None):
        """Run the whole seriation pipeline and write the result.

        Args:
            data_path: Passed to the SaliencyAPI / SimilarityAPI /
                SeriationAPI constructors; must not be None.
            numSeriatedTerms: Forwarded to ``compute``; defaults to
                ``DEFAULT_NUM_SERIATED_TERMS`` when None.
        """
        assert data_path is not None
        if numSeriatedTerms is None:
            numSeriatedTerms = ComputeSeriation.DEFAULT_NUM_SERIATED_TERMS

        self.logger.info('--------------------------------------------------------------------------------')
        self.logger.info('Computing term seriation...')
        self.logger.info(' data_path = %s', data_path)
        self.logger.info(' number_of_seriated_terms = %d', numSeriatedTerms)

        self.logger.info('Connecting to data...')
        self.saliency = SaliencyAPI(data_path)
        self.similarity = SimilarityAPI(data_path)
        self.seriation = SeriationAPI(data_path)

        self.logger.info('Reading data from disk...')
        self.saliency.read()
        self.similarity.read()

        self.logger.info('Reshaping saliency data...')
        self.reshape()

        self.logger.info('Computing seriation...')
        self.compute(numSeriatedTerms)

        self.logger.info('Writing data to disk...')
        self.seriation.write()

        self.logger.info('--------------------------------------------------------------------------------')

    def reshape(self):
        """Unpack ``self.saliency.term_info`` into per-term lookup tables.

        Populates ``orderedTermList`` (terms in input order) and the dicts
        ``termSaliency``, ``termFreqs``, ``termDistinct``, ``termRank`` and
        ``termVisibility``; also sets ``candidateSize`` to 100.
        """
        self.candidateSize = 100
        self.orderedTermList = []
        self.termSaliency = {}
        self.termFreqs = {}
        self.termDistinct = {}
        self.termRank = {}
        self.termVisibility = {}

        for element in self.saliency.term_info:
            term = element['term']
            self.orderedTermList.append(term)
            self.termSaliency[term] = element['saliency']
            self.termFreqs[term] = element['frequency']
            self.termDistinct[term] = element['distinctiveness']
            self.termRank[term] = element['rank']
            self.termVisibility[term] = element['visibility']

    def compute(self, numSeriatedTerms):
        """Seriate up to ``numSeriatedTerms`` terms, one per iteration.

        Results are stored on ``self.seriation``: ``term_ordering`` is the
        seriated sequence and ``term_iter_index`` lists the terms in the
        order they were placed. Placed terms are removed from
        ``self.orderedTermList``.

        Args:
            numSeriatedTerms: Maximum number of terms to place. The loop
                also stops once every candidate has been placed.
        """
        start_time = time.time()
        candidateTerms = self.orderedTermList
        self.seriation.term_ordering = []
        self.seriation.term_iter_index = []
        self.buffers = [0, 0]

        preBest = []
        postBest = []

        # One term is placed per iteration, so never run more iterations
        # than there are candidates. `iteration` must be the integer index:
        # both the `== 1` test below and getEnergyChange's first-iteration
        # branch depend on it.
        numIterations = min(numSeriatedTerms, len(candidateTerms))
        for iteration in range(numIterations):
            self.logger.debug('Iteration no. %d', iteration)

            # 0 is the "no term placed yet" marker understood by
            # getBestEnergies.
            addedTerm = 0
            if self.seriation.term_iter_index:
                addedTerm = self.seriation.term_iter_index[-1]
            if iteration == 1:
                (preBest, postBest) = self.initBestEnergies(addedTerm, candidateTerms)
            (preBest, postBest, self.bestEnergies) = self.getBestEnergies(
                preBest, postBest, addedTerm)

            placedBefore = len(self.seriation.term_iter_index)
            (candidateTerms, self.seriation.term_ordering,
             self.seriation.term_iter_index, self.buffers) = self.iterate_eff(
                candidateTerms, self.seriation.term_ordering,
                self.seriation.term_iter_index, self.buffers,
                self.bestEnergies, iteration)
            if len(self.seriation.term_iter_index) == placedBefore:
                # iterate_eff found no eligible candidate; later iterations
                # would see the same state, so stop.
                break

            self.logger.debug('---------------')

        seriation_time = time.time() - start_time
        self.logger.debug("seriation time: " + str(seriation_time))

    #-------------------------------------------------------------------------------#
    # Helper Functions

    def initBestEnergies(self, firstTerm, candidateTerms):
        """Score every candidate against the first placed term.

        Returns:
            ``(preBest, postBest)``: parallel lists of ``(candidate, score)``
            where the pre score is ``combined_g2[(candidate, firstTerm)]``
            and the post score is ``combined_g2[(firstTerm, candidate)]``
            (0 when the pair has no entry).
        """
        combined_g2 = self.similarity.combined_g2
        preBest = []
        postBest = []
        for candidate in candidateTerms:
            preBest.append((candidate, combined_g2.get((candidate, firstTerm), 0)))
            postBest.append((candidate, combined_g2.get((firstTerm, candidate), 0)))
        return (preBest, postBest)

    def getBestEnergies(self, preBest, postBest, addedTerm):
        """Update the per-candidate best scores after ``addedTerm`` was placed.

        ``preBest`` and ``postBest`` are updated in place: each entry is
        raised to the candidate's score against ``addedTerm`` when that is
        higher, and the entry for ``addedTerm`` itself is deleted.

        Returns:
            ``(preBest, postBest, bestEnergies)`` where ``bestEnergies`` is
            a list of ``(term, pre + post)`` sorted by descending score, or
            an empty list when ``addedTerm`` is the marker value 0.
        """
        if addedTerm == 0:
            return (preBest, postBest, [])

        combined_g2 = self.similarity.combined_g2
        remove_index = -1
        for index, (term, bestPre) in enumerate(preBest):
            if term == addedTerm:
                remove_index = index

            preScore = combined_g2.get((term, addedTerm))
            if preScore is not None and preScore > bestPre:
                preBest[index] = (term, preScore)

            postScore = combined_g2.get((addedTerm, term))
            if postScore is not None and postScore > postBest[index][1]:
                postBest[index] = (term, postScore)

        # The placed term is no longer a candidate.
        if remove_index != -1:
            del preBest[remove_index]
            del postBest[remove_index]

        bestEnergies = [(pre[0], pre[1] + post[1]) for pre, post in zip(preBest, postBest)]
        return (preBest, postBest, sorted(bestEnergies, key=itemgetter(1), reverse=True))

    def iterate_eff(self, candidateTerms, term_ordering, term_iter_index,
                    buffers, bestEnergies, iteration_no):
        """Place the candidate/position pair with the largest energy change.

        Candidates are taken from ``bestEnergies`` (already sorted best
        first) when it is non-empty, otherwise from ``candidateTerms``. Only
        candidates whose rank is at most
        ``len(term_ordering) + self.candidateSize`` are evaluated. The
        chosen term is removed from ``candidateTerms``, inserted into
        ``term_ordering``, appended to ``term_iter_index``, and ``buffers``
        is updated with the bonds around the insertion point. All four
        lists are modified in place and also returned.

        If no candidate is eligible, a warning is logged and the arguments
        are returned unchanged.
        """
        maxEnergyChange = float('-inf')
        maxTerm = None
        maxPosition = 0

        if bestEnergies:
            bestEnergy_terms = [x[0] for x in bestEnergies]
        else:
            bestEnergy_terms = candidateTerms

        numPositions = len(term_ordering) + 1
        rankLimit = len(term_ordering) + self.candidateSize
        # The early-termination bound uses the buffer of the last slot.
        lastBuffer = buffers[numPositions - 1]

        for candidate_index, candidate in enumerate(bestEnergy_terms):
            if self.termRank[candidate] <= rankLimit:
                for position in range(numPositions):
                    current_energy_change = self.getEnergyChange(
                        candidate, position, term_ordering,
                        buffers[position], iteration_no)
                    if current_energy_change > maxEnergyChange:
                        maxEnergyChange = current_energy_change
                        maxTerm = candidate
                        maxPosition = position

            # Early termination: the remaining candidates are sorted by
            # decreasing bound, so once the best change found reaches the
            # bound nothing later can beat it.
            if bestEnergies and candidate_index < len(bestEnergy_terms) - 1:
                if maxEnergyChange >= 2 * (bestEnergies[candidate_index][1] + lastBuffer):
                    self.logger.debug('#-------- breaking out early ---------#')
                    self.logger.debug('candidates checked: %d', candidate_index + 1)
                    break

        if maxTerm is None:
            self.logger.warning(
                'No eligible candidate term in iteration %s; nothing seriated.',
                iteration_no)
            return (candidateTerms, term_ordering, term_iter_index, buffers)

        self.logger.debug('change in energy: %s', maxEnergyChange)
        self.logger.debug('maxTerm: %s', maxTerm)
        self.logger.debug('maxPosition: %s', maxPosition)

        candidateTerms.remove(maxTerm)

        # Update buffers with the bond(s) created around the insertion point.
        combined_g2 = self.similarity.combined_g2
        if len(term_ordering) == 0:
            pass  # first term: no neighbours, buffers stay as they are
        elif maxPosition >= len(term_ordering):
            buffers.insert(len(buffers) - 1,
                           combined_g2.get((term_ordering[-1], maxTerm), 0))
        elif maxPosition == 0:
            buffers.insert(1, combined_g2.get((maxTerm, term_ordering[0]), 0))
        else:
            buffers[maxPosition] = combined_g2.get(
                (term_ordering[maxPosition - 1], maxTerm), 0)
            buffers.insert(maxPosition + 1,
                           combined_g2.get((maxTerm, term_ordering[maxPosition]), 0))

        # Update term ordering and ranking (insert past the end appends).
        term_ordering.insert(maxPosition, maxTerm)
        term_iter_index.append(maxTerm)

        return (candidateTerms, term_ordering, term_iter_index, buffers)

    def getEnergyChange(self, candidateTerm, position, term_list, currentBuffer, iteration_no):
        """Return the energy change of inserting ``candidateTerm`` at ``position``.

        In iteration 0 the value is ``0.001 * frequency * saliency`` of the
        candidate. Afterwards it is
        ``2 * (prevBond + postBond - currentBuffer)`` where ``prevBond`` is
        ``combined_g2[(previous term, candidate)]`` and ``postBond`` is
        ``combined_g2[(candidate, next term)]`` (0 when absent).
        """
        # First iteration only: nothing is placed yet, so rank by the
        # candidate's own frequency and saliency.
        if iteration_no == 0:
            current_freq = self.termFreqs.get(candidateTerm, 0.0)
            current_saliency = self.termSaliency.get(candidateTerm, 0.0)
            return 0.001 * current_freq * current_saliency

        combined_g2 = self.similarity.combined_g2
        prevBond = 0.0
        postBond = 0.0

        # Bond with the previous term.
        if position > 0:
            prev_term = term_list[position - 1]
            prevBond = combined_g2.get((prev_term, candidateTerm), 0.0)

        # Bond with the next term. Look up the same (candidate, next) key
        # that is read; testing the reversed key could raise KeyError or
        # miss an existing bond.
        if position < len(term_list):
            next_term = term_list[position]
            postBond = combined_g2.get((candidateTerm, next_term), 0.0)

        return 2 * (prevBond + postBond - currentBuffer)
class ComputeSeriation(object):
    """Seriation algorithm.

    Re-order words to promote the legibility of multi-word phrases and
    reveal the clustering of related terms.

    As output, the algorithm produces a list of seriated terms and its
    'ranking' (i.e., the iteration in which a term was seriated).
    """

    DEFAULT_NUM_SERIATED_TERMS = 100

    def __init__(self, logging_level):
        """Configure the 'ComputeSeriation' logger to write to stderr.

        Args:
            logging_level: Level applied to the logger and its handler(s).
        """
        self.logger = logging.getLogger('ComputeSeriation')
        self.logger.setLevel(logging_level)
        # logging.getLogger() hands back one shared logger per name, so a
        # handler must be attached only once; otherwise each additional
        # instance would repeat every log line.
        if not self.logger.handlers:
            self.logger.addHandler(logging.StreamHandler(sys.stderr))
        for handler in self.logger.handlers:
            handler.setLevel(logging_level)

    def execute(self, data_path, numSeriatedTerms=None):
        """Run the whole seriation pipeline and write the result.

        Args:
            data_path: Passed to the SaliencyAPI / SimilarityAPI /
                SeriationAPI constructors; must not be None.
            numSeriatedTerms: Forwarded to ``compute``; defaults to
                ``DEFAULT_NUM_SERIATED_TERMS`` when None.
        """
        assert data_path is not None
        if numSeriatedTerms is None:
            numSeriatedTerms = ComputeSeriation.DEFAULT_NUM_SERIATED_TERMS

        self.logger.info('--------------------------------------------------------------------------------')
        self.logger.info('Computing term seriation...')
        self.logger.info(' data_path = %s', data_path)
        self.logger.info(' number_of_seriated_terms = %d', numSeriatedTerms)

        self.logger.info('Connecting to data...')
        self.saliency = SaliencyAPI(data_path)
        self.similarity = SimilarityAPI(data_path)
        self.seriation = SeriationAPI(data_path)

        self.logger.info('Reading data from disk...')
        self.saliency.read()
        self.similarity.read()

        self.logger.info('Reshaping saliency data...')
        self.reshape()

        self.logger.info('Computing seriation...')
        self.compute(numSeriatedTerms)

        self.logger.info('Writing data to disk...')
        self.seriation.write()

        self.logger.info('--------------------------------------------------------------------------------')

    def reshape(self):
        """Unpack ``self.saliency.term_info`` into per-term lookup tables.

        Populates ``orderedTermList`` (terms in input order) and the dicts
        ``termSaliency``, ``termFreqs``, ``termDistinct``, ``termRank`` and
        ``termVisibility``; also sets ``candidateSize`` to 100.
        """
        self.candidateSize = 100
        self.orderedTermList = []
        self.termSaliency = {}
        self.termFreqs = {}
        self.termDistinct = {}
        self.termRank = {}
        self.termVisibility = {}

        for element in self.saliency.term_info:
            term = element['term']
            self.orderedTermList.append(term)
            self.termSaliency[term] = element['saliency']
            self.termFreqs[term] = element['frequency']
            self.termDistinct[term] = element['distinctiveness']
            self.termRank[term] = element['rank']
            self.termVisibility[term] = element['visibility']

    def compute(self, numSeriatedTerms):
        """Seriate up to ``numSeriatedTerms`` terms, one per iteration.

        Results are stored on ``self.seriation``: ``term_ordering`` is the
        seriated sequence and ``term_iter_index`` lists the terms in the
        order they were placed. Placed terms are removed from
        ``self.orderedTermList``.

        Args:
            numSeriatedTerms: Maximum number of terms to place. The loop
                also stops once every candidate has been placed, so asking
                for more terms than exist is not an error.
        """
        start_time = time.time()
        candidateTerms = self.orderedTermList
        self.seriation.term_ordering = []
        self.seriation.term_iter_index = []
        self.buffers = [0, 0]

        preBest = []
        postBest = []

        # One term is placed per iteration; running past the number of
        # candidates would leave iterate_eff with nothing to choose from.
        numIterations = min(numSeriatedTerms, len(candidateTerms))
        for iteration in range(numIterations):
            self.logger.debug('Iteration no. %d', iteration)

            # 0 is the "no term placed yet" marker understood by
            # getBestEnergies.
            addedTerm = 0
            if self.seriation.term_iter_index:
                addedTerm = self.seriation.term_iter_index[-1]
            if iteration == 1:
                (preBest, postBest) = self.initBestEnergies(addedTerm, candidateTerms)
            (preBest, postBest, self.bestEnergies) = self.getBestEnergies(
                preBest, postBest, addedTerm)

            placedBefore = len(self.seriation.term_iter_index)
            (candidateTerms, self.seriation.term_ordering,
             self.seriation.term_iter_index, self.buffers) = self.iterate_eff(
                candidateTerms, self.seriation.term_ordering,
                self.seriation.term_iter_index, self.buffers,
                self.bestEnergies, iteration)
            if len(self.seriation.term_iter_index) == placedBefore:
                # iterate_eff found no eligible candidate; later iterations
                # would see the same state, so stop.
                break

            self.logger.debug('---------------')

        seriation_time = time.time() - start_time
        self.logger.debug("seriation time: " + str(seriation_time))

    #-------------------------------------------------------------------------------#
    # Helper Functions

    def initBestEnergies(self, firstTerm, candidateTerms):
        """Score every candidate against the first placed term.

        Returns:
            ``(preBest, postBest)``: parallel lists of ``(candidate, score)``
            where the pre score is ``combined_g2[(candidate, firstTerm)]``
            and the post score is ``combined_g2[(firstTerm, candidate)]``
            (0 when the pair has no entry).
        """
        combined_g2 = self.similarity.combined_g2
        preBest = []
        postBest = []
        for candidate in candidateTerms:
            preBest.append((candidate, combined_g2.get((candidate, firstTerm), 0)))
            postBest.append((candidate, combined_g2.get((firstTerm, candidate), 0)))
        return (preBest, postBest)

    def getBestEnergies(self, preBest, postBest, addedTerm):
        """Update the per-candidate best scores after ``addedTerm`` was placed.

        ``preBest`` and ``postBest`` are updated in place: each entry is
        raised to the candidate's score against ``addedTerm`` when that is
        higher, and the entry for ``addedTerm`` itself is deleted.

        Returns:
            ``(preBest, postBest, bestEnergies)`` where ``bestEnergies`` is
            a list of ``(term, pre + post)`` sorted by descending score, or
            an empty list when ``addedTerm`` is the marker value 0.
        """
        if addedTerm == 0:
            return (preBest, postBest, [])

        combined_g2 = self.similarity.combined_g2
        remove_index = -1
        for index, (term, bestPre) in enumerate(preBest):
            if term == addedTerm:
                remove_index = index

            preScore = combined_g2.get((term, addedTerm))
            if preScore is not None and preScore > bestPre:
                preBest[index] = (term, preScore)

            postScore = combined_g2.get((addedTerm, term))
            if postScore is not None and postScore > postBest[index][1]:
                postBest[index] = (term, postScore)

        # The placed term is no longer a candidate.
        if remove_index != -1:
            del preBest[remove_index]
            del postBest[remove_index]

        bestEnergies = [(pre[0], pre[1] + post[1]) for pre, post in zip(preBest, postBest)]
        return (preBest, postBest, sorted(bestEnergies, key=itemgetter(1), reverse=True))

    def iterate_eff(self, candidateTerms, term_ordering, term_iter_index,
                    buffers, bestEnergies, iteration_no):
        """Place the candidate/position pair with the largest energy change.

        Candidates are taken from ``bestEnergies`` (already sorted best
        first) when it is non-empty, otherwise from ``candidateTerms``. Only
        candidates whose rank is at most
        ``len(term_ordering) + self.candidateSize`` are evaluated. The
        chosen term is removed from ``candidateTerms``, inserted into
        ``term_ordering``, appended to ``term_iter_index``, and ``buffers``
        is updated with the bonds around the insertion point. All four
        lists are modified in place and also returned.

        If no candidate is eligible, a warning is logged and the arguments
        are returned unchanged.
        """
        # Start below every possible score: an insertion whose energy
        # change is zero or negative must still be selectable, otherwise no
        # term is chosen and the removal below has nothing valid to remove.
        maxEnergyChange = float('-inf')
        maxTerm = None
        maxPosition = 0

        if bestEnergies:
            bestEnergy_terms = [x[0] for x in bestEnergies]
        else:
            bestEnergy_terms = candidateTerms

        numPositions = len(term_ordering) + 1
        rankLimit = len(term_ordering) + self.candidateSize
        # The early-termination bound uses the buffer of the last slot.
        lastBuffer = buffers[numPositions - 1]

        for candidate_index, candidate in enumerate(bestEnergy_terms):
            if self.termRank[candidate] <= rankLimit:
                for position in range(numPositions):
                    current_energy_change = self.getEnergyChange(
                        candidate, position, term_ordering,
                        buffers[position], iteration_no)
                    if current_energy_change > maxEnergyChange:
                        maxEnergyChange = current_energy_change
                        maxTerm = candidate
                        maxPosition = position

            # Early termination: the remaining candidates are sorted by
            # decreasing bound, so once the best change found reaches the
            # bound nothing later can beat it.
            if bestEnergies and candidate_index < len(bestEnergy_terms) - 1:
                if maxEnergyChange >= 2 * (bestEnergies[candidate_index][1] + lastBuffer):
                    self.logger.debug('#-------- breaking out early ---------#')
                    self.logger.debug('candidates checked: %d', candidate_index + 1)
                    break

        if maxTerm is None:
            self.logger.warning(
                'No eligible candidate term in iteration %s; nothing seriated.',
                iteration_no)
            return (candidateTerms, term_ordering, term_iter_index, buffers)

        self.logger.debug('change in energy: %s', maxEnergyChange)
        self.logger.debug('maxTerm: %s', maxTerm)
        self.logger.debug('maxPosition: %s', maxPosition)

        candidateTerms.remove(maxTerm)

        # Update buffers with the bond(s) created around the insertion point.
        combined_g2 = self.similarity.combined_g2
        if len(term_ordering) == 0:
            pass  # first term: no neighbours, buffers stay as they are
        elif maxPosition >= len(term_ordering):
            buffers.insert(len(buffers) - 1,
                           combined_g2.get((term_ordering[-1], maxTerm), 0))
        elif maxPosition == 0:
            buffers.insert(1, combined_g2.get((maxTerm, term_ordering[0]), 0))
        else:
            buffers[maxPosition] = combined_g2.get(
                (term_ordering[maxPosition - 1], maxTerm), 0)
            buffers.insert(maxPosition + 1,
                           combined_g2.get((maxTerm, term_ordering[maxPosition]), 0))

        # Update term ordering and ranking (insert past the end appends).
        term_ordering.insert(maxPosition, maxTerm)
        term_iter_index.append(maxTerm)

        return (candidateTerms, term_ordering, term_iter_index, buffers)

    def getEnergyChange(self, candidateTerm, position, term_list, currentBuffer, iteration_no):
        """Return the energy change of inserting ``candidateTerm`` at ``position``.

        In iteration 0 the value is ``0.001 * frequency * saliency`` of the
        candidate. Afterwards it is
        ``2 * (prevBond + postBond - currentBuffer)`` where ``prevBond`` is
        ``combined_g2[(previous term, candidate)]`` and ``postBond`` is
        ``combined_g2[(candidate, next term)]`` (0 when absent).
        """
        # First iteration only: nothing is placed yet, so rank by the
        # candidate's own frequency and saliency.
        if iteration_no == 0:
            current_freq = self.termFreqs.get(candidateTerm, 0.0)
            current_saliency = self.termSaliency.get(candidateTerm, 0.0)
            return 0.001 * current_freq * current_saliency

        combined_g2 = self.similarity.combined_g2
        prevBond = 0.0
        postBond = 0.0

        # Bond with the previous term.
        if position > 0:
            prev_term = term_list[position - 1]
            prevBond = combined_g2.get((prev_term, candidateTerm), 0.0)

        # Bond with the next term. Look up the same (candidate, next) key
        # that is read; testing the reversed key could raise KeyError or
        # miss an existing bond.
        if position < len(term_list):
            next_term = term_list[position]
            postBond = combined_g2.get((candidateTerm, next_term), 0.0)

        return 2 * (prevBond + postBond - currentBuffer)