示例#1
0
 def run(self):
     """Train an Online-VB model and write per-mini-batch and final outputs.

     Builds ``Online_VB.OnlineVB`` from the values in ``self.settings``, then
     makes ``self.settings['iter_train']`` passes over ``self.train_file``.
     On each pass the file is consumed mini-batch by mini-batch (requested
     with ``self.settings['batch_size']``) until an empty batch is returned.
     After every mini-batch the sparsity of ``theta``, the perplexity on
     ``self.test_data`` and the top ``self.tops`` words per topic are computed
     and handed to ``utilities.write_file`` together with the timings.
     When training ends, the settings and the final ``_lambda`` are written
     to ``setting.txt`` and ``beta_final.dat`` inside ``self.model_folder``.
     """
     # Initialize the algorithm
     print('initialize the algorithm ...')
     online_vb = Online_VB.OnlineVB(
         self.settings['num_docs'], self.settings['num_terms'],
         self.settings['num_topics'], self.settings['alpha'],
         self.settings['eta'], self.settings['tau0'],
         self.settings['kappa'], self.settings['conv_infer'],
         self.settings['iter_infer'])
     # Start
     print('start!!!')
     i = 0
     while i < self.settings['iter_train']:
         i += 1
         print('\n***iter_train:%d***\n' % (i))
         # Re-open the training file for every pass; ``with`` guarantees the
         # handle is closed even if a training or evaluation step raises.
         with open(self.train_file, 'r') as datafp:
             j = 0
             while True:
                 j += 1
                 (wordids, wordcts) = \
                     utilities.read_minibatch_list_frequencies(
                         datafp, self.settings['batch_size'])
                 # Stop condition: an empty mini-batch ends this pass
                 if len(wordids) == 0:
                     break
                 print('---num_minibatch:%d---' % (j))
                 (time_e, time_m, theta) = online_vb.static_online(
                     self.settings['batch_size'], wordids, wordcts)
                 # Compute sparsity
                 sparsity = utilities.compute_sparsity(
                     theta, theta.shape[0], theta.shape[1], 't')
                 # Compute perplexities
                 LD2 = utilities.compute_perplexities_vb(
                     online_vb._lambda, self.settings['alpha'],
                     self.settings['eta'], self.settings['iter_infer'],
                     self.test_data)
                 # Search top words of each topic
                 list_tops = utilities.list_top(online_vb._lambda, self.tops)
                 # Write files
                 utilities.write_file(i, j, online_vb._lambda, time_e,
                                      time_m, theta, sparsity, LD2,
                                      list_tops, self.tops,
                                      self.model_folder)
     # Write settings
     print('write setting ...')
     file_name = '%s/setting.txt' % (self.model_folder)
     utilities.write_setting(self.settings, file_name)
     # Write final model to file
     print('write final model ...')
     file_name = '%s/beta_final.dat' % (self.model_folder)
     utilities.write_topics(online_vb._lambda, file_name)
     # Finish
     print('done!!!')
示例#2
0
 def run(self):
     """Train a New2Online-OPE model and write per-mini-batch and final outputs.

     Builds ``New2Online_OPE.New2OnlineOPE`` from the values in
     ``self.settings``, then makes ``self.settings['iter_train']`` passes over
     ``self.train_file``. On each pass the file is consumed mini-batch by
     mini-batch (requested with ``self.settings['batch_size']``) until an
     empty batch is returned. After every mini-batch the sparsity of
     ``theta``, the perplexity on ``self.test_data`` and the top ``self.tops``
     words per topic are computed and handed to ``utilities.write_file``
     together with the timings. When training ends, the settings and the
     final ``_lambda`` are written to ``setting.txt`` and ``beta_final.dat``
     inside ``self.model_folder``.
     """
     # Initialize the algorithm
     print('initialize the algorithm ...')
     new2_online_ope = New2Online_OPE.New2OnlineOPE(
         self.settings['num_docs'], self.settings['num_terms'],
         self.settings['num_topics'], self.settings['alpha'],
         self.settings['eta'], self.settings['tau0'],
         self.settings['kappa'], self.settings['iter_infer'])
     # Start
     print('start!!!')
     i = 0
     while i < self.settings['iter_train']:
         i += 1
         print('\n***iter_train:%d***\n' % (i))
         # Re-open the training file for every pass; ``with`` guarantees the
         # handle is closed even if a training or evaluation step raises.
         with open(self.train_file, 'r') as datafp:
             j = 0
             while True:
                 j += 1
                 (wordids, wordcts) = \
                     utilities.read_minibatch_list_frequencies(
                         datafp, self.settings['batch_size'])
                 # Stop condition: an empty mini-batch ends this pass
                 if len(wordids) == 0:
                     break
                 print('---num_minibatch:%d---' % (j))
                 # Unlike the other drivers, this learner's static_online
                 # is called without the batch size.
                 (time_e, time_m, theta) = new2_online_ope.static_online(
                     wordids, wordcts)
                 # Compute sparsity
                 sparsity = utilities.compute_sparsity(
                     theta, theta.shape[0], theta.shape[1], 't')
                 # Compute perplexities
                 LD2 = utilities.compute_perplexities_vb(
                     new2_online_ope._lambda, self.settings['alpha'],
                     self.settings['eta'], self.settings['iter_infer'],
                     self.test_data)
                 # Search top words of each topic
                 list_tops = utilities.list_top(new2_online_ope._lambda,
                                                self.tops)
                 # Write files
                 utilities.write_file(i, j, new2_online_ope._lambda, time_e,
                                      time_m, theta, sparsity, LD2,
                                      list_tops, self.tops,
                                      self.model_folder)
     # Write settings
     print('write setting ...')
     file_name = '%s/setting.txt' % (self.model_folder)
     utilities.write_setting(self.settings, file_name)
     # Write final model to file
     print('write final model ...')
     file_name = '%s/beta_final.dat' % (self.model_folder)
     utilities.write_topics(new2_online_ope._lambda, file_name)
     # Finish
     print('done!!!')
示例#3
0
def main():
    """Command-line driver that trains an Online-OPE model.

    Expects exactly four command-line arguments::

        python run_Online_OPE.py [train file] [setting file] [model folder] [test data folder]

    With any other argument count the usage line is printed and the process
    exits with status 1. Otherwise the model folder is recreated empty, the
    settings are read and copied into it, the perplexity data is loaded from
    the test data folder, and the learner makes ``ddict['iter_train']``
    passes over the training file. After every mini-batch the sparsity,
    perplexity and top words are computed and written with
    ``utilities.write_file``; the final ``_lambda`` is written to
    ``beta_final.dat`` in the model folder.
    """
    # Check input
    if len(sys.argv) != 5:
        print("usage: python run_Online_OPE.py [train file] [setting file] [model folder] [test data folder]")
        # Non-zero status so calling scripts can detect the usage error;
        # sys.exit also works when the site-provided exit() is unavailable.
        sys.exit(1)
    # Get environment variables
    train_file = sys.argv[1]
    setting_file = sys.argv[2]
    model_folder = sys.argv[3]
    test_data_folder = sys.argv[4]
    tops = 10  # number of top words listed per topic (fixed, not read from argv)
    # Start from an empty model folder: an existing one is deleted first
    if os.path.exists(model_folder):
        shutil.rmtree(model_folder)
    os.makedirs(model_folder)
    # Read settings
    print('reading setting ...')
    ddict = utilities.read_setting(setting_file)
    print('write setting ...')
    file_name = '%s/setting.txt' % (model_folder)
    utilities.write_setting(ddict, file_name)
    # Read data for computing perplexities
    print('read data for computing perplexities ...')
    (wordids_1, wordcts_1, wordids_2, wordcts_2) = (
        utilities.read_data_for_perpl(test_data_folder))
    # Initialize the algorithm
    print('initialize the algorithm ...')
    online_ope = Online_OPE.OnlineOPE(
        ddict['num_docs'], ddict['num_terms'], ddict['num_topics'],
        ddict['alpha'], ddict['eta'], ddict['tau0'], ddict['kappa'],
        ddict['iter_infer'])
    # Start
    print('start!!!')
    i = 0
    while i < ddict['iter_train']:
        i += 1
        print('\n***iter_train:%d***\n' % (i))
        # Re-open the training file for every pass; ``with`` guarantees the
        # handle is closed even if a training or evaluation step raises.
        with open(train_file, 'r') as datafp:
            j = 0
            while True:
                j += 1
                (wordids, wordcts) = utilities.read_minibatch_list_frequencies(
                    datafp, ddict['batch_size'])
                # Stop condition: an empty mini-batch ends this pass
                if len(wordids) == 0:
                    break
                print('---num_minibatch:%d---' % (j))
                (time_e, time_m, theta) = online_ope.static_online(
                    ddict['batch_size'], wordids, wordcts)
                # Compute sparsity
                sparsity = utilities.compute_sparsity(
                    theta, theta.shape[0], theta.shape[1], 't')
                # Compute perplexities
                LD2 = utilities.compute_perplexities_vb(
                    online_ope._lambda, ddict['alpha'], ddict['eta'],
                    ddict['iter_infer'], wordids_1, wordcts_1, wordids_2,
                    wordcts_2)
                # Search top words of each topic
                list_tops = utilities.list_top(online_ope._lambda, tops)
                # Write files
                utilities.write_file(i, j, online_ope._lambda, time_e, time_m,
                                     theta, sparsity, LD2, list_tops, tops,
                                     model_folder)
    # Write final model to file
    file_name = '%s/beta_final.dat' % (model_folder)
    utilities.write_topics(online_ope._lambda, file_name)
    # Finish
    print('done!!!')
示例#4
0
def main():
    """Command-line driver that trains a Streaming-OPE model.

    Expects exactly four command-line arguments::

        python run_Streaming_OPE.py [train file] [setting file] [model folder] [test data folder]

    With any other argument count the usage line is printed and the process
    exits with status 1. Otherwise the model folder is recreated empty, the
    settings are read and copied into it, the perplexity data is loaded from
    the test data folder, and the learner makes ``ddict['iter_train']``
    passes over the training file. After every mini-batch the sparsity,
    perplexity and top words are computed and written with
    ``utilities.write_file``; the final ``_lambda`` is written to
    ``beta_final.dat`` in the model folder.
    """
    # Check input
    if len(sys.argv) != 5:
        print("usage: python run_Streaming_OPE.py [train file] [setting file] [model folder] [test data folder]")
        # Non-zero status so calling scripts can detect the usage error;
        # sys.exit also works when the site-provided exit() is unavailable.
        sys.exit(1)
    # Get environment variables
    train_file = sys.argv[1]
    setting_file = sys.argv[2]
    model_folder = sys.argv[3]
    test_data_folder = sys.argv[4]
    tops = 10  # number of top words listed per topic (fixed, not read from argv)
    # Start from an empty model folder: an existing one is deleted first
    if os.path.exists(model_folder):
        shutil.rmtree(model_folder)
    os.makedirs(model_folder)
    # Read settings
    print('reading setting ...')
    ddict = utilities.read_setting(setting_file)
    print('write setting ...')
    file_name = '%s/setting.txt' % (model_folder)
    utilities.write_setting(ddict, file_name)
    # Read data for computing perplexities
    print('read data for computing perplexities ...')
    (wordids_1, wordcts_1, wordids_2, wordcts_2) = (
        utilities.read_data_for_perpl(test_data_folder))
    # Initialize the algorithm
    print('initialize the algorithm ...')
    streaming_ope = Streaming_OPE.StreamingOPE(
        ddict['num_terms'], ddict['num_topics'], ddict['alpha'],
        ddict['eta'], ddict['iter_infer'])
    # Start
    print('start!!!')
    i = 0
    while i < ddict['iter_train']:
        i += 1
        print('\n***iter_train:%d***\n' % (i))
        # Re-open the training file for every pass; ``with`` guarantees the
        # handle is closed even if a training or evaluation step raises.
        with open(train_file, 'r') as datafp:
            j = 0
            while True:
                j += 1
                (wordids, wordcts) = utilities.read_minibatch_list_frequencies(
                    datafp, ddict['batch_size'])
                # Stop condition: an empty mini-batch ends this pass
                if len(wordids) == 0:
                    break
                print('---num_minibatch:%d---' % (j))
                (time_e, time_m, theta) = streaming_ope.static_online(
                    ddict['batch_size'], wordids, wordcts)
                # Compute sparsity
                sparsity = utilities.compute_sparsity(
                    theta, theta.shape[0], theta.shape[1], 't')
                # Compute perplexities
                LD2 = utilities.compute_perplexities_vb(
                    streaming_ope._lambda, ddict['alpha'], ddict['eta'],
                    ddict['iter_infer'], wordids_1, wordcts_1, wordids_2,
                    wordcts_2)
                # Search top words of each topic
                list_tops = utilities.list_top(streaming_ope._lambda, tops)
                # Write files
                utilities.write_file(i, j, streaming_ope._lambda, time_e,
                                     time_m, theta, sparsity, LD2, list_tops,
                                     tops, model_folder)
    # Write final model to file
    file_name = '%s/beta_final.dat' % (model_folder)
    utilities.write_topics(streaming_ope._lambda, file_name)
    # Finish
    print('done!!!')