def execute(self, data_path, numSeriatedTerms=None):
    """Run the term-seriation pipeline end to end.

    Connects the saliency, similarity and seriation APIs to ``data_path``,
    reads the saliency and similarity data, reshapes it, computes the
    seriation and writes the seriation data back out. Progress is reported
    through ``self.logger`` at INFO level.

    Args:
        data_path: Location handed to each of the data APIs. Must not be
            ``None``.
        numSeriatedTerms: Number of terms passed to ``self.compute``. When
            ``None``, ``ComputeSeriation.DEFAULT_NUM_SERIATED_TERMS`` is
            used.

    Raises:
        AssertionError: If ``data_path`` is ``None``.
    """
    assert data_path is not None
    if numSeriatedTerms is None:
        numSeriatedTerms = ComputeSeriation.DEFAULT_NUM_SERIATED_TERMS

    log = self.logger.info
    separator = '--------------------------------------------------------------------------------'

    # Banner and run parameters.
    log(separator)
    log('Computing term seriation...')
    log(' data_path = %s', data_path)
    log(' number_of_seriated_terms = %d', numSeriatedTerms)

    # Inputs (saliency, similarity) and output (seriation) share one path.
    log('Connecting to data...')
    self.saliency = SaliencyAPI(data_path)
    self.similarity = SimilarityAPI(data_path)
    self.seriation = SeriationAPI(data_path)

    log('Reading data from disk...')
    self.saliency.read()
    self.similarity.read()

    log('Reshaping saliency data...')
    self.reshape()

    log('Computing seriation...')
    self.compute(numSeriatedTerms)

    log('Writing data to disk...')
    self.seriation.write()
    log(separator)
def execute(self, data_path, sliding_window_size=None):
    """Run the term-similarity pipeline end to end.

    Connects the tokens and similarity APIs to ``data_path``, reads the
    tokens, computes document and sliding-window co-occurrence plus token
    counts, derives three G2 statistics via ``self.getG2Stats``, combines
    the similarity matrices and writes the similarity data back out.
    Progress is reported through ``self.logger`` at INFO level.

    Args:
        data_path: Location handed to each of the data APIs. Must not be
            ``None``.
        sliding_window_size: Window size passed to
            ``self.computeSlidingWindowCooccurrence``. When ``None``,
            ``ComputeSimilarity.DEFAULT_SLIDING_WINDOW_SIZE`` is used.

    Raises:
        AssertionError: If ``data_path`` is ``None``.
    """
    assert data_path is not None
    if sliding_window_size is None:
        sliding_window_size = ComputeSimilarity.DEFAULT_SLIDING_WINDOW_SIZE

    log = self.logger.info
    separator = '--------------------------------------------------------------------------------'

    # Banner and run parameters.
    log(separator)
    log('Computing term similarity...')
    log(' data_path = %s', data_path)
    log(' sliding_window_size = %d', sliding_window_size)

    log('Connecting to data...')
    self.tokens = TokensAPI(data_path)
    self.similarity = SimilarityAPI(data_path)

    log('Reading data from disk...')
    self.tokens.read()

    log('Computing document co-occurrence...')
    self.computeDocumentCooccurrence()

    log('Computing sliding-window co-occurrence...')
    self.computeSlidingWindowCooccurrence(sliding_window_size)

    log('Counting total number of tokens, unigrams, and bigrams in the corpus...')
    self.computeTokenCounts()

    # Each G2 result is computed first and only then stored on
    # self.similarity, matching the evaluation order of a direct assignment.
    log('Computing document co-occurrence likelihood...')
    document_g2 = self.getG2Stats(
        self.document_count,
        self.similarity.document_occurrence,
        self.similarity.document_cooccurrence,
    )
    self.similarity.document_g2 = document_g2

    log('Computing sliding-window co-occurrence likelihood...')
    window_g2 = self.getG2Stats(
        self.window_count,
        self.similarity.window_occurrence,
        self.similarity.window_cooccurrence,
    )
    self.similarity.window_g2 = window_g2

    log('Computing collocation likelihood...')
    collocation_g2 = self.getG2Stats(
        self.token_count,
        self.similarity.unigram_counts,
        self.similarity.bigram_counts,
    )
    self.similarity.collocation_g2 = collocation_g2

    self.combineSimilarityMatrices()

    log('Writing data to disk...')
    self.similarity.write()
    log(separator)