Example 1
# Standard-library and project-local modules used by this script
import sys
import os
import shutil

import utilities
import ML_FW


def main():
    # Check input
    if len(sys.argv) != 5:
        print "usage: python run_ML_FW.py [train file] [setting file] [model folder] [test data folder]"
        exit()
    # Get environment variables
    train_file = sys.argv[1]
    setting_file = sys.argv[2]
    model_folder = sys.argv[3]
    test_data_folder = sys.argv[4]
    tops = 10  # int(sys.argv[5])
    # Remove any existing model folder and create a fresh, empty one
    if os.path.exists(model_folder):
        shutil.rmtree(model_folder)
    os.makedirs(model_folder)
    # Read settings
    print 'reading setting ...'
    ddict = utilities.read_setting(setting_file)
    print 'write setting ...'
    file_name = '%s/setting.txt'%(model_folder)
    utilities.write_setting(ddict, file_name)
    # Read data for computing perplexities
    print 'read data for computing perplexities ...'
    (wordids_1, wordcts_1, wordids_2, wordcts_2) = \
        utilities.read_data_for_perpl(test_data_folder)
    # Initialize the algorithm
    print 'initialize the algorithm ...'
    ml_fw = ML_FW.MLFW(ddict['num_terms'], ddict['num_topics'], ddict['tau0'], ddict['kappa'], ddict['iter_infer'])
    # Start
    print 'start!!!'
    i = 0
    while i < ddict['iter_train']:
        i += 1
        print '\n***iter_train:%d***\n' % (i)
        datafp = open(train_file, 'r')
        j = 0
        while True:
            j += 1
            (wordids, wordcts) = utilities.read_minibatch_list_frequencies(datafp, ddict['batch_size'])
            # Stop condition
            if len(wordids) == 0:
                break
            #
            print '---num_minibatch:%d---' % (j)
            (time_e, time_m, theta) = ml_fw.static_online(ddict['batch_size'], wordids, wordcts)
            # Compute sparsity
            sparsity = utilities.compute_sparsity(theta, theta.shape[0], theta.shape[1], 't')
            # Compute perplexities
            LD2 = utilities.compute_perplexities_fw(ml_fw.beta, ddict['iter_infer'],
                                                    wordids_1, wordcts_1, wordids_2, wordcts_2)
            # Search for the top words of each topic
            list_tops = utilities.list_top(ml_fw.beta, tops)
            # Write files
            utilities.write_file(i, j, ml_fw.beta, time_e, time_m, theta, sparsity, LD2, list_tops, tops, model_folder)
        datafp.close()
    # Write final model to file
    file_name = '%s/beta_final.dat'%(model_folder)
    utilities.write_topics(ml_fw.beta, file_name)
    # Finish
    print 'done!!!'
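The read_minibatch_list_frequencies helper above returns, for each document in the minibatch, a list of word ids and a matching list of counts. A minimal reader with the same return shape, assuming the common bag-of-words line format "N id1:cnt1 id2:cnt2 ..." (the corpus format actually used by the project is not shown here), might look like this:

def read_minibatch_sketch(fp, batch_size):
    # Returns (wordids, wordcts): two parallel lists with one entry per document.
    wordids, wordcts = [], []
    for _ in range(batch_size):
        line = fp.readline()
        if not line:
            break
        tokens = line.split()[1:]  # drop the leading unique-term count
        wordids.append([int(tok.split(':')[0]) for tok in tokens])
        wordcts.append([int(tok.split(':')[1]) for tok in tokens])
    return (wordids, wordcts)

An empty pair of lists signals end of file, which is exactly the stop condition checked in the training loop above.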
Example 2
File: run.py  Project: tuvvkstn/OPE
# Standard-library and project-local modules used by this script
import sys
import os
import shutil

import utilities
import run_New1ML_OPE
import run_New2ML_OPE
import run_New1Online_OPE
import run_New2Online_OPE
import run_New1Streaming_OPE
import run_New2Streaming_OPE


def main():
    # Check input
    if len(sys.argv) != 6:
        print "usage: python run.py [method name] [train file] [setting file] [model folder] [test data folder]"
        exit()
    # Get environment variables
    method_name = sys.argv[1]
    train_file = sys.argv[2]
    setting_file = sys.argv[3]
    model_folder = sys.argv[4]
    test_data_folder = sys.argv[5]
    tops = 10  # int(sys.argv[5])
    # Remove any existing model folder and create a fresh, empty one
    if os.path.exists(model_folder):
        shutil.rmtree(model_folder)
    os.makedirs(model_folder)
    # Read settings
    print "reading setting ..."
    settings = utilities.read_setting(setting_file)
    # Read data for computing perplexities
    print "read data for computing perplexities ..."
    test_data = utilities.read_data_for_perpl(test_data_folder)
    # Check method and run algorithm
    methods = ["new1ml-ope", "new2ml-ope", "new1online-ope", "new2online-ope", "new1streaming-ope", "new2streaming-ope"]
    method_low = method_name.lower()
    if method_low == "new1ml-ope":
        run_new1_mlope = run_New1ML_OPE.runNew1MLOPE(train_file, settings, model_folder, test_data, tops)
        run_new1_mlope.run()
    elif method_low == "new2ml-ope":
        run_new2_mlope = run_New2ML_OPE.runNew2MLOPE(train_file, settings, model_folder, test_data, tops)
        run_new2_mlope.run()
    elif method_low == "new1online-ope":
        run_new1_onlineope = run_New1Online_OPE.runNew1OnlineOPE(train_file, settings, model_folder, test_data, tops)
        run_new1_onlineope.run()
    elif method_low == "new2online-ope":
        run_new2onlineope = run_New2Online_OPE.runNew2OnlineOPE(train_file, settings, model_folder, test_data, tops)
        run_new2onlineope.run()
    elif method_low == "new1streaming-ope":
        run_new1_streamingope = run_New1Streaming_OPE.runNew1StreamingOPE(
            train_file, settings, model_folder, test_data, tops
        )
        run_new1_streamingope.run()
    elif method_low == "new2streaming-ope":
        run_new2streamingope = run_New2Streaming_OPE.runNew2StreamingOPE(
            train_file, settings, model_folder, test_data, tops
        )
        run_new2streamingope.run()
    else:
        print "\nunknown method name: %s\n" % (method_name)
        print "list of methods:"
        for method in methods:
            print "\t\t%s" % (method)
        exit()
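The if/elif chain above maps the lowercase method name to one runner class; every branch builds the runner with the same five arguments and calls run(). A table-driven sketch of the same dispatch (assuming the run_New* modules imported above and the constructor signature shown in the branches) keeps the mapping in one place:

def dispatch_sketch(method_name, train_file, settings, model_folder, test_data, tops):
    # Map each supported method name to its runner class.
    RUNNERS = {
        "new1ml-ope": run_New1ML_OPE.runNew1MLOPE,
        "new2ml-ope": run_New2ML_OPE.runNew2MLOPE,
        "new1online-ope": run_New1Online_OPE.runNew1OnlineOPE,
        "new2online-ope": run_New2Online_OPE.runNew2OnlineOPE,
        "new1streaming-ope": run_New1Streaming_OPE.runNew1StreamingOPE,
        "new2streaming-ope": run_New2Streaming_OPE.runNew2StreamingOPE,
    }
    runner_class = RUNNERS.get(method_name.lower())
    if runner_class is None:
        raise SystemExit("unknown method name: %s (choose one of: %s)"
                         % (method_name, ", ".join(sorted(RUNNERS))))
    runner_class(train_file, settings, model_folder, test_data, tops).run()

The behaviour is the same; adding a new method then only requires one new dictionary entry.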
Example 3
# Standard-library and project-local modules used by this script
import sys
import os
import shutil

import utilities
import run_New1ML_OPE
import run_New2ML_OPE
import run_New1Online_OPE
import run_New2Online_OPE
import run_New1Streaming_OPE
import run_New2Streaming_OPE


def main():
    # Check input
    if len(sys.argv) != 6:
        print "usage: python run.py [method name] [train file] [setting file] [model folder] [test data folder]"
        exit()
    # Get environment variables
    method_name = sys.argv[1]
    train_file = sys.argv[2]
    setting_file = sys.argv[3]
    model_folder = sys.argv[4]
    test_data_folder = sys.argv[5]
    tops = 10  # int(sys.argv[5])
    # Remove any existing model folder and create a fresh, empty one
    if os.path.exists(model_folder):
        shutil.rmtree(model_folder)
    os.makedirs(model_folder)
    # Read settings
    print 'reading setting ...'
    settings = utilities.read_setting(setting_file)
    # Read data for computing perplexities
    print 'read data for computing perplexities ...'
    test_data = utilities.read_data_for_perpl(test_data_folder)
    # Check method and run algorithm
    methods = ['new1ml-ope', 'new2ml-ope', 'new1online-ope', 'new2online-ope', 'new1streaming-ope', 'new2streaming-ope']
    method_low = method_name.lower()
    if method_low == 'new1ml-ope':
        run_new1_mlope = run_New1ML_OPE.runNew1MLOPE(train_file, settings, model_folder, test_data, tops)
        run_new1_mlope.run()
    elif method_low == 'new2ml-ope':
        run_new2_mlope = run_New2ML_OPE.runNew2MLOPE(train_file, settings, model_folder, test_data, tops)
        run_new2_mlope.run()
    elif method_low == 'new1online-ope':
        run_new1_onlineope = run_New1Online_OPE.runNew1OnlineOPE(train_file, settings, model_folder, test_data, tops)
        run_new1_onlineope.run()
    elif method_low == 'new2online-ope':
        run_new2onlineope = run_New2Online_OPE.runNew2OnlineOPE(train_file, settings, model_folder, test_data, tops)
        run_new2onlineope.run()
    elif method_low == 'new1streaming-ope':
        run_new1_streamingope = run_New1Streaming_OPE.runNew1StreamingOPE(train_file, settings, model_folder, test_data, tops)
        run_new1_streamingope.run()
    elif method_low == 'new2streaming-ope':
        run_new2streamingope = run_New2Streaming_OPE.runNew2StreamingOPE(train_file, settings, model_folder, test_data, tops)
        run_new2streamingope.run()
    else:
        print '\nunknown method name: %s\n' % (method_name)
        print 'list of methods:'
        for method in methods:
            print '\t\t%s' % (method)
        exit()
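For reference, a typical invocation of this script (with hypothetical file and folder names) would be:

    python run.py new1online-ope data/train.txt settings.txt models/new1online-ope data/test/

The method name is matched case-insensitively against the list above, and the model folder is recreated from scratch on every run.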
Example 4
import sys
import Streaming
import utilities as util
import os
# python2 run_Streaming.py [file train] [file setting] [folder model] [folder data test] [prior]
# check input
if len(sys.argv) != 6:
    print "Usage: python2 run_Streaming.py [file train] [file setting] [folder model] [folder data test] [prior]"
    exit()
filetrain = sys.argv[1]
filesetting = sys.argv[2]
folder = sys.argv[3]
filetest = sys.argv[4]
fileprior = sys.argv[5]
setting = util.read_setting(filesetting)
n_tests = 1  #
folder = "%s-TPS-%s-%s-%s" % (folder, setting['sigma'], setting['batch_size'],
                              setting['n_topics'])
print folder
if not os.path.exists(folder):
    os.makedirs(folder)
else:
    print "Folder already exists"
    exit()

ft = open(filetrain, 'r')
util.write_setting(folder, setting)

strm = Streaming.Streaming(fileprior, setting['alpha'], setting['n_topics'],
                           setting['n_terms'], setting['n_infer'],
                           setting['learning_rate'], setting['sigma'])
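All of the runner scripts in this section rely on a read_setting helper that turns a settings file into a dict of named values (alpha, n_topics, n_terms, batch_size, learning_rate, sigma, and so on). A minimal sketch of such a parser, assuming a plain "key: value" text format (the exact file format these projects use is not shown here):

def read_setting_sketch(path):
    # Parse lines like "n_topics: 100" into a dict; numeric values become
    # int or float, anything else stays a string; blank and '#' lines are skipped.
    setting = {}
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            key, _, value = line.partition(':')
            key, value = key.strip(), value.strip()
            try:
                setting[key] = int(value)
            except ValueError:
                try:
                    setting[key] = float(value)
                except ValueError:
                    setting[key] = value
    return setting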
Example 5
# Standard-library and project-local modules used by this script
import sys
import os
import shutil

import utilities
import Online_FW


def main():
    # Check input
    if len(sys.argv) != 5:
        print "usage: python run_Online_FW.py [train file] [setting file] [model folder] [test data folder]"
        exit()
    # Get environment variables
    train_file = sys.argv[1]
    setting_file = sys.argv[2]
    model_folder = sys.argv[3]
    test_data_folder = sys.argv[4]
    tops = 10  # int(sys.argv[5])
    # Remove any existing model folder and create a fresh, empty one
    if os.path.exists(model_folder):
        shutil.rmtree(model_folder)
    os.makedirs(model_folder)
    # Read settings
    print "reading setting ..."
    ddict = utilities.read_setting(setting_file)
    print "write setting ..."
    file_name = "%s/setting.txt" % (model_folder)
    utilities.write_setting(ddict, file_name)
    # Read data for computing perplexities
    print "read data for computing perplexities ..."
    (wordids_1, wordcts_1, wordids_2, wordcts_2) = utilities.read_data_for_perpl(test_data_folder)
    # Initialize the algorithm
    print "initialize the algorithm ..."
    online_fw = Online_FW.OnlineFW(
        ddict["num_docs"],
        ddict["num_terms"],
        ddict["num_topics"],
        ddict["eta"],
        ddict["tau0"],
        ddict["kappa"],
        ddict["iter_infer"],
    )
    # Start
    print "start!!!"
    i = 0
    while i < ddict["iter_train"]:
        i += 1
        print "\n***iter_train:%d***\n" % (i)
        datafp = open(train_file, "r")
        j = 0
        while True:
            j += 1
            (wordids, wordcts) = utilities.read_minibatch_list_frequencies(datafp, ddict["batch_size"])
            # Stop condition
            if len(wordids) == 0:
                break
            #
            print "---num_minibatch:%d---" % (j)
            (time_e, time_m, theta) = online_fw.static_online(ddict["batch_size"], wordids, wordcts)
            # Compute sparsity
            sparsity = utilities.compute_sparsity(theta, theta.shape[0], theta.shape[1], "t")
            # Compute perplexities
            LD2 = utilities.compute_perplexities_fw(
                online_fw._lambda, ddict["iter_infer"], wordids_1, wordcts_1, wordids_2, wordcts_2
            )
            # Search for the top words of each topic
            list_tops = utilities.list_top(online_fw._lambda, tops)
            # Write files
            utilities.write_file(
                i, j, online_fw._lambda, time_e, time_m, theta, sparsity, LD2, list_tops, tops, model_folder
            )
        datafp.close()
    # Write final model to file
    file_name = "%s/lambda_final.dat" % (model_folder)
    utilities.write_topics(online_fw._lambda, file_name)
    # Finish
    print "done!!!"
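compute_sparsity above summarizes how concentrated the per-document topic proportions theta are for the current minibatch. A plausible reading of that helper (a sketch under the assumption that it measures the fraction of near-zero entries; the project's actual implementation, and the meaning of its 't' flag, may differ):

import numpy as np

def compute_sparsity_sketch(theta, num_docs, num_topics, threshold=1e-10):
    # Fraction of (document, topic) entries that are effectively zero,
    # averaged over the minibatch; higher values mean sparser document mixtures.
    near_zero = np.sum(theta < threshold)
    return float(near_zero) / (num_docs * num_topics)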
Example 6
# Standard-library and project-local modules used by this script
import sys
import os
import shutil

import utilities
import ML_OPE


def main():
    # Check input
    if len(sys.argv) != 5:
        print(
            "usage: python run_ML_OPE.py [train file] [setting file] [model folder] [test data folder]"
        )
        exit()
    # Get environment variables
    train_file = sys.argv[1]
    setting_file = sys.argv[2]
    model_folder = sys.argv[3]
    test_data_folder = sys.argv[4]
    # Remove any existing model folder and create a fresh, empty one
    if os.path.exists(model_folder):
        shutil.rmtree(model_folder)
    os.makedirs(model_folder)
    # Read settings
    print('reading setting ...')
    ddict = utilities.read_setting(setting_file)
    print('write setting ...')
    file_name = '%s/setting.txt' % (model_folder)
    utilities.write_setting(ddict, file_name)
    # Read data for computing perplexities
    print('read data for computing perplexities ...')
    (wordids_1, wordcts_1, wordids_2, wordcts_2) = \
        utilities.read_data_for_perpl(test_data_folder)
    # ============================================= TILL HERE OKAY [0] =============================================
    # Initialize the algorithm
    print('initialize the algorithm ...')
    ml_ope = ML_OPE.MLOPE(ddict['num_terms'], ddict['num_topics'],
                          ddict['alpha'], ddict['tau0'], ddict['kappa'],
                          ddict['iter_infer'])

    # Start
    print('start!!!')
    i = 0
    list_tops = []
    while i < ddict['iter_train']:
        i += 1
        print('\n***iter_train:%d***\n' % (i))
        datafp = open(train_file, 'r')
        j = 0
        while True:
            j += 1
            (wordids, wordcts) = utilities.read_minibatch_list_frequencies(
                datafp, ddict['batch_size'])
            # Stop condition
            if len(wordids) == 0:
                break
            #
            print('---num_minibatch:%d---' % (j))
            (time_e, time_m,
             theta) = ml_ope.static_online(ddict['batch_size'], wordids,
                                           wordcts)
            # ========================= TILL HERE OKAY [1] ======================================
            # Compute sparsity
            sparsity = utilities.compute_sparsity(theta, theta.shape[0],
                                                  theta.shape[1], 't')
            # print(sparsity)        # for Testing Sparsity of 1st theta
            # print(theta[0,:])      # for Testing Sparsity of 1st theta
            # Compute perplexities (disabled here; LD2 is written out as None)
            # LD2 = utilities.compute_perplexities_vb(ml_ope.beta, ddict['alpha'], ddict['eta'], ddict['iter_infer'],
            #                                         wordids_1, wordcts_1, wordids_2, wordcts_2)
            LD2 = None

            # Saving previous list_tops for diff_list_tops() below
            prev_list_tops = list_tops

            # Search for the top words of each topic
            list_tops = utilities.list_top(ml_ope.beta, ddict['tops'])

            # TODO: add [last 25% avg diff count] to new file to compare later with other settings
            # Calculate and print difference between old and current list_tops
            utilities.diff_list_tops(list_tops, prev_list_tops, i)

            # Write files
            utilities.write_file(i, j, ml_ope.beta, time_e, time_m, theta,
                                 sparsity, LD2, list_tops, model_folder)
        datafp.close()
    # Write final model to file
    file_name = '%s/beta_final.dat' % (model_folder)
    utilities.write_topics(ml_ope.beta, file_name)
    # Finish
    print('done!!!')
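list_top and diff_list_tops above are used to track how the highest-probability words of each topic change between minibatches. A rough sketch of both helpers, assuming beta is a topics-by-terms numpy array and each entry of list_tops is a list of word indices (the project's actual implementations may differ):

import numpy as np

def list_top_sketch(beta, tops):
    # Indices of the `tops` largest entries in each topic row, highest first.
    return [list(np.argsort(row)[::-1][:tops]) for row in beta]

def diff_list_tops_sketch(list_tops, prev_list_tops, iteration):
    # Per topic, count how many top words changed since the previous call.
    if not prev_list_tops:
        return
    changed = [len(set(cur) ^ set(prev)) // 2
               for cur, prev in zip(list_tops, prev_list_tops)]
    print('iter %d: top-word changes per topic: %s' % (iteration, changed))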
Example 7
# Standard-library and project-local modules used by this script
import sys
import os
import shutil

import utilities
import ML_FW


def main():
    # Check input
    if len(sys.argv) != 5:
        print "usage: python run_ML_FW.py [train file] [setting file] [model folder] [test data folder]"
        exit()
    # Get environment variables
    train_file = sys.argv[1]
    setting_file = sys.argv[2]
    model_folder = sys.argv[3]
    test_data_folder = sys.argv[4]
    tops = 10  # int(sys.argv[5])
    # Remove any existing model folder and create a fresh, empty one
    if os.path.exists(model_folder):
        shutil.rmtree(model_folder)
    os.makedirs(model_folder)
    # Read settings
    print 'reading setting ...'
    ddict = utilities.read_setting(setting_file)
    print 'write setting ...'
    file_name = '%s/setting.txt' % (model_folder)
    utilities.write_setting(ddict, file_name)
    # Read data for computing perplexities
    print 'read data for computing perplexities ...'
    (wordids_1, wordcts_1, wordids_2, wordcts_2) = \
        utilities.read_data_for_perpl(test_data_folder)
    # Initialize the algorithm
    print 'initialize the algorithm ...'
    ml_fw = ML_FW.MLFW(ddict['num_terms'], ddict['num_topics'], ddict['tau0'],
                       ddict['kappa'], ddict['iter_infer'])
    # Start
    print 'start!!!'
    i = 0
    while i < ddict['iter_train']:
        i += 1
        print '\n***iter_train:%d***\n' % (i)
        datafp = open(train_file, 'r')
        j = 0
        while True:
            j += 1
            (wordids, wordcts) = utilities.read_minibatch_list_frequencies(
                datafp, ddict['batch_size'])
            # Stop condition
            if len(wordids) == 0:
                break
            #
            print '---num_minibatch:%d---' % (j)
            (time_e, time_m,
             theta) = ml_fw.static_online(ddict['batch_size'], wordids,
                                          wordcts)
            # Compute sparsity
            sparsity = utilities.compute_sparsity(theta, theta.shape[0],
                                                  theta.shape[1], 't')
            # Compute perplexities
            LD2 = utilities.compute_perplexities_fw(ml_fw.beta, ddict['iter_infer'],
                                                    wordids_1, wordcts_1, wordids_2, wordcts_2)
            # Search for the top words of each topic
            list_tops = utilities.list_top(ml_fw.beta, tops)
            # Write files
            utilities.write_file(i, j, ml_fw.beta, time_e, time_m, theta,
                                 sparsity, LD2, list_tops, tops, model_folder)
        datafp.close()
    # Write final model to file
    file_name = '%s/beta_final.dat' % (model_folder)
    utilities.write_topics(ml_fw.beta, file_name)
    # Finish
    print 'done!!!'
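write_topics stores the learned topic-word matrix (beta here, lambda in the Online_FW example) as a plain-text file once training finishes. A minimal sketch of that step, assuming one topic per row with space-separated weights (the real helper's output format may differ):

import numpy as np

def write_topics_sketch(beta, file_name):
    # One line per topic; each line lists the weight of every vocabulary term.
    np.savetxt(file_name, beta, fmt='%.10f')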