Exemplo n.º 1
0
def _score_fixed_test_set(docs, lda_alpha, lda_beta):
  """Sum the held-out scores of `docs` under the given LDA parameters.

  For every document this calls `hdp.lda_e_step` and then
  `hdp.lda_e_step_split`, in that order, and accumulates their results.

  Args:
    docs: iterable of documents accepted by the two `hdp.lda_e_step*` calls.
    lda_alpha: first element of the pair returned by `hdp_to_lda()`.
    lda_beta: second element of the pair returned by `hdp_to_lda()`.

  Returns:
    A tuple `(score, score_split, word_count_split)`: the summed first
    return value of `lda_e_step`, the summed first return value of
    `lda_e_step_split`, and the summed second return value of
    `lda_e_step_split`.
  """
  score = 0.0
  score_split = 0.0
  word_count_split = 0
  for doc in docs:
    (doc_likelihood, _) = hdp.lda_e_step(doc, lda_alpha, lda_beta)
    score += doc_likelihood
    (doc_likelihood, count, _) = hdp.lda_e_step_split(doc, lda_alpha, lda_beta)
    score_split += doc_likelihood
    word_count_split += count
  return (score, score_split, word_count_split)


def _dump_model(model, path):
  """Pickle `model` to `path` with the highest protocol and close the file.

  Protocol -1 selects a binary pickle format, so the file is opened in
  binary mode ("wb"); a text-mode handle can corrupt the stream on
  platforms that translate newlines.
  """
  with open(path, "wb") as model_file:
    cPickle.dump(model, model_file, -1)


def run_hdp():
  """Train an HDP on the training file and log held-out scores per pass.

  Configuration comes from `parse_args()`.  Everything is written under
  `<directory>/corpus-<corpus_name>`:

    * `options.dat`   - one "name value" line per parsed option
    * `log.dat`       - iteration, time, document count, likelihood
    * `test-log.dat`  - scores of the fixed test set after every pass
    * `doc_count-<n>.topics` / `doc_count-<n>.model` - per-pass snapshots
    * `final.topics` / `final.model` - the model after the last pass

  The loop stops once `max_iter` passes have run (`-1` disables that cap)
  or once the accumulated `time.clock()` time reaches `max_time`.
  """
  options = parse_args()
  random.seed(options.random_seed)
  # Only the path of the training data is kept; it is handed to
  # em_on_large_data on every pass.
  c_train_filename = options.data_path

  c_test = read_data(options.test_data_path)
  c_test_word_count = sum([doc.total for doc in c_test.docs])

  if options.test_data_path_in_folds is not None:
    # NOTE(review): the fold corpora are only consumed by the disabled
    # per-fold prediction code further down; they are still loaded here so
    # that re-enabling that code keeps working.
    fold_filenames = glob(options.test_data_path_in_folds)
    fold_filenames.sort()
    num_folds = len(fold_filenames) // 2
    # After sorting, files are consumed in pairs: the odd index of each
    # pair is read as the "train" part, the even index as the "test" part.
    c_test_train_folds = [read_data(fold_filenames[2 * i + 1]) for i in range(num_folds)]
    c_test_test_folds = [read_data(fold_filenames[2 * i]) for i in range(num_folds)]

  result_directory = "%s/corpus-%s" % (options.directory, options.corpus_name)
  print("creating directory %s" % result_directory)
  if not os.path.isdir(result_directory):
    os.makedirs(result_directory)

  with open("%s/options.dat" % result_directory, "w") as options_file:
    for opt, value in options.__dict__.items():
      options_file.write(str(opt) + " " + str(value) + "\n")

  print("creating hdp instance.")

  bhdp_hp = hdp.hdp_hyperparameter(options.alpha, options.alpha, options.gamma, options.gamma, False)

  bhdp = hdp.hdp(options.T, options.K, options.D, options.W, options.eta, bhdp_hp)
  #bhdp.seed_init(c_train)

  print("setting up counters and log files.")
  iteration = 0  # named `iteration` so the builtin `iter` is not shadowed
  total_time = 0.0
  total_doc_count = 0

  old_likelihood = 0.0
  converge = 1.0

  row_format = "%d %d %d %.5f %d %.5f %d\n"

  log_file = open("%s/log.dat" % result_directory, "w")
  test_log_file = None
  try:
    log_file.write("iteration time doc.count likelihood\n")

    test_log_file = open("%s/test-log.dat" % result_directory, "w")
    test_log_file.write("iteration time doc.count score word.count score.split word.count.split\n")

    while (options.max_iter == -1 or iteration < options.max_iter) and total_time < options.max_time:
      t0 = time.clock()
      # Run one step iteration.
      likelihood = bhdp.em_on_large_data(c_train_filename, options.var_converge, fresh=(iteration == 0))
      if iteration > 0:
        # Relative change with respect to the previous pass.
        converge = (likelihood - old_likelihood)/abs(old_likelihood)
      old_likelihood = likelihood
      print("iter = %d, likelihood = %f, converge = %f" % (iteration, likelihood, converge))
      if converge < 0:
        print("warning, likelihood is decreasing!")

      # Only the training call above is timed; snapshots and scoring below
      # do not count towards max_time.
      total_time += time.clock() - t0

      iteration += 1  # increase the iteration counter
      total_doc_count += options.D

      log_file.write("%d %d %d %.5f\n" % (iteration, total_time, total_doc_count, likelihood))
      log_file.flush()

      bhdp.save_topics('%s/doc_count-%d.topics' % (result_directory, total_doc_count))
      _dump_model(bhdp, '%s/doc_count-%d.model' % (result_directory, total_doc_count))

      print("\tworking on predictions.")
      (lda_alpha, lda_beta) = bhdp.hdp_to_lda()

      # prediction on the fixed test set
      print("\tworking on fixed test data.")
      (test_score, test_score_split, c_test_word_count_split) = \
          _score_fixed_test_set(c_test.docs, lda_alpha, lda_beta)

      test_log_file.write(row_format % (iteration, total_time,
                          total_doc_count, test_score, c_test_word_count,
                          test_score_split, c_test_word_count_split))
      test_log_file.flush()

      # prediction on the test set in the folds (currently disabled)
      # print "\tworking on test data in folds."
      # test_folds_log_file = file("%s/doc_count-%d.test.folds" % (result_directory, total_doc_count), "w")
      # test_folds_log_file.write("fold doc.id word count score\n")
      # for i in range(num_folds):
      #   train_data = c_test_train_folds[i]
      #   test_data = c_test_test_folds[i]
      #   for (doc_id, train_doc, test_doc) in izip(range(train_data.num_docs), train_data.docs, test_data.docs):
      #     if test_doc.total > 0:
      #       (likelihood, gamma) = hdp.lda_e_step(train_doc, lda_alpha, lda_beta)
      #       theta = gamma/np.sum(gamma)
      #       lda_betad = lda_beta[:, test_doc.words]
      #       log_predicts = np.log(np.dot(theta, lda_betad))
      #       log_info = "\n".join(["%d %d %d %d %.5f" % (i, doc_id, word, word_count, f) for (word, word_count, f) in izip(test_doc.words, test_doc.counts, log_predicts)])
      #       test_folds_log_file.write(log_info + "\n")
      # test_folds_log_file.close()

    log_file.close()

    print("Saving the final model and topics.")
    bhdp.save_topics('%s/final.topics' % result_directory)
    _dump_model(bhdp, '%s/final.model' % result_directory)

    (lda_alpha, lda_beta) = bhdp.hdp_to_lda()

    # Score the final model on the fixed test set and append one more row.
    print("\tworking on fixed test data.")
    (test_score, test_score_split, c_test_word_count_split) = \
        _score_fixed_test_set(c_test.docs, lda_alpha, lda_beta)

    test_log_file.write(row_format % (iteration, total_time,
                        total_doc_count, test_score, c_test_word_count,
                        test_score_split, c_test_word_count_split))
    test_log_file.flush()
  finally:
    # Release both log handles even when training or scoring raises;
    # closing an already closed file is a no-op.
    log_file.close()
    if test_log_file is not None:
      test_log_file.close()
Exemplo n.º 2
0
def run_hdp():
  """Train a labelled HDP with repeated `em` calls, then classify test docs.

  Configuration comes from `parse_args()`.  The options are dumped to
  `<directory>/corpus-<corpus_name>/options.dat`.  After training, a second
  `hdp.hdp` instance runs `doc_inference` on every test document, the
  resulting matrix is converted with `utils.converttoliblinear`, and the
  model returned by the last `em` call is used to predict labels.  The
  first value returned by `evaluations` is printed as the test accuracy.
  """
  options = parse_args()
  random.seed(options.random_seed)

  # Training corpus together with its label vector and size.
  c_train_filename = options.data_path
  c_train = read_data(c_train_filename)
  trlabels = utils.read_1D_array('trdataSLDALabels.txt')
  trsz = utils.read_1D_array('trdatasz.txt')[0]

  # Test corpus.
  c_test = read_data(options.test_data_path)
  c_test_word_count = sum(doc.total for doc in c_test.docs)
  # NOTE(review): the test labels and test size are read from the same two
  # files as the training ones - confirm this is intended.
  testlabels = utils.read_1D_array('trdataSLDALabels.txt')
  testsz = utils.read_1D_array('trdatasz.txt')[0]

  # Create (or truncate) the two local-sticks files.
  for sticks_filename in ("localsticks_a.txt", "localsticks_b.txt"):
    open(sticks_filename, "w").close()

  result_directory = "%s/corpus-%s" % (options.directory, options.corpus_name)
  print("creating directory %s" % result_directory)
  if not os.path.isdir(result_directory):
    os.makedirs(result_directory)

  # One "name value" line per parsed option.
  options_file = open("%s/options.dat" % result_directory, "w")
  for opt, value in options.__dict__.items():
    options_file.write(" ".join([str(opt), str(value)]) + "\n")
  options_file.close()

  print("creating hdp instance for training.")
  bhdp_hp = hdp.hdp_hyperparameter(options.alpha, options.alpha,
                                   options.gamma, options.gamma, True)
  bhdp = hdp.hdp(options.T, options.K, options.D, options.W, options.eta,
                 trsz, bhdp_hp)

  print("setting up counters and log files.")
  iteration = 0
  total_time = 0.0
  total_doc_count = 0

  likelihood = 0.0
  old_likelihood = 0.0
  converge = 1.0

  # NOTE(review): with max_iter == -1 this loop has no other exit condition.
  while True:
    if options.max_iter != -1 and iteration >= options.max_iter:
      break
    t0 = time.clock()
    # Run one step iteration.
    print("EM iteration starts!")
    first_pass = (iteration == 0)
    likelihood, trmodel = bhdp.em(c_train, trlabels, options.var_converge,
                                  trsz, fresh=first_pass)
    if not first_pass:
      # Relative change with respect to the previous pass.
      converge = (likelihood - old_likelihood) / abs(old_likelihood)
    old_likelihood = likelihood
    print("iter = %d, likelihood = %f, converge = %f" % (iteration, likelihood, converge))
    if converge < 0:
      print("warning, likelihood is decreasing!")
    total_time += time.clock() - t0
    iteration += 1
    total_doc_count += options.D

  # NOTE(review): blocks until a line is read from stdin before the test
  # phase starts - looks like a manual pause; confirm it is still wanted.
  raw_input()

  # Prediction on the fixed test data.
  print("\tworking on fixed test data.")
  print("creating hdp instance for test.")
  bhdp_hp_test = hdp.hdp_hyperparameter(options.alpha, options.alpha,
                                        options.gamma, options.gamma, False)
  bhdp_test = hdp.hdp(options.T, options.K, options.D, options.W,
                      options.eta, testsz, bhdp_hp_test)

  Elogbeta = dirichlet_expectation(bhdp.m_beta)  # the topics
  Elogsticks_1st = expect_log_sticks(bhdp.m_var_sticks)  # global sticks
  m_var_zeta = np.zeros((testsz, options.K))
  for docnum, doc in enumerate(c_test.docs):
    doc_score, m_var_zeta = bhdp_test.doc_inference(
        doc, docnum, Elogbeta, Elogsticks_1st, options.var_converge,
        m_var_zeta)
    likelihood += doc_score

  x = utils.converttoliblinear(m_var_zeta)
  y = testlabels
  p_label, p_acc, p_val = predict(y, x, trmodel)
  ACC, MSE, SCC = evaluations(y, p_label)
  print("test accuracy: %f" % ACC)
Exemplo n.º 3
0
def _score_fixed_test_set(docs, lda_alpha, lda_beta):
  """Sum the held-out scores of `docs` under the given LDA parameters.

  For every document this calls `hdp.lda_e_step` and then
  `hdp.lda_e_step_split`, in that order, and accumulates their results.

  Args:
    docs: iterable of documents accepted by the two `hdp.lda_e_step*` calls.
    lda_alpha: first element of the pair returned by `hdp_to_lda()`.
    lda_beta: second element of the pair returned by `hdp_to_lda()`.

  Returns:
    A tuple `(score, score_split, word_count_split)`: the summed first
    return value of `lda_e_step`, the summed first return value of
    `lda_e_step_split`, and the summed second return value of
    `lda_e_step_split`.
  """
  score = 0.0
  score_split = 0.0
  word_count_split = 0
  for doc in docs:
    (doc_likelihood, _) = hdp.lda_e_step(doc, lda_alpha, lda_beta)
    score += doc_likelihood
    (doc_likelihood, count, _) = hdp.lda_e_step_split(doc, lda_alpha, lda_beta)
    score_split += doc_likelihood
    word_count_split += count
  return (score, score_split, word_count_split)


def _dump_model(model, path):
  """Pickle `model` to `path` with the highest protocol and close the file.

  Protocol -1 selects a binary pickle format, so the file is opened in
  binary mode ("wb"); a text-mode handle can corrupt the stream on
  platforms that translate newlines.
  """
  with open(path, "wb") as model_file:
    cPickle.dump(model, model_file, -1)


def run_hdp():
  """Train an HDP on the training file and log held-out scores per pass.

  Configuration comes from `parse_args()`.  Everything is written under
  `<directory>/corpus-<corpus_name>`:

    * `options.dat`   - one "name value" line per parsed option
    * `log.dat`       - iteration, time, document count, likelihood
    * `test-log.dat`  - scores of the fixed test set after every pass
    * `doc_count-<n>.topics` / `doc_count-<n>.model` - per-pass snapshots
    * `final.topics` / `final.model` - the model after the last pass

  The loop stops once `max_iter` passes have run (`-1` disables that cap)
  or once the accumulated `time.clock()` time reaches `max_time`.
  """
  options = parse_args()
  random.seed(options.random_seed)
  # Only the path of the training data is kept; it is handed to
  # em_on_large_data on every pass.
  c_train_filename = options.data_path

  c_test = read_data(options.test_data_path)
  c_test_word_count = sum([doc.total for doc in c_test.docs])

  if options.test_data_path_in_folds is not None:
    # NOTE(review): the fold corpora are only consumed by the disabled
    # per-fold prediction code further down; they are still loaded here so
    # that re-enabling that code keeps working.
    fold_filenames = glob(options.test_data_path_in_folds)
    fold_filenames.sort()
    num_folds = len(fold_filenames) // 2
    # After sorting, files are consumed in pairs: the odd index of each
    # pair is read as the "train" part, the even index as the "test" part.
    c_test_train_folds = [read_data(fold_filenames[2 * i + 1]) for i in range(num_folds)]
    c_test_test_folds = [read_data(fold_filenames[2 * i]) for i in range(num_folds)]

  result_directory = "%s/corpus-%s" % (options.directory, options.corpus_name)
  print("creating directory %s" % result_directory)
  if not os.path.isdir(result_directory):
    os.makedirs(result_directory)

  with open("%s/options.dat" % result_directory, "w") as options_file:
    for opt, value in options.__dict__.items():
      options_file.write(str(opt) + " " + str(value) + "\n")

  print("creating hdp instance.")

  bhdp_hp = hdp.hdp_hyperparameter(options.alpha, options.alpha, options.gamma, options.gamma, False)

  bhdp = hdp.hdp(options.T, options.K, options.D, options.W, options.eta, bhdp_hp)
  #bhdp.seed_init(c_train)

  print("setting up counters and log files.")
  iteration = 0  # named `iteration` so the builtin `iter` is not shadowed
  total_time = 0.0
  total_doc_count = 0

  old_likelihood = 0.0
  converge = 1.0

  row_format = "%d %d %d %.5f %d %.5f %d\n"

  log_file = open("%s/log.dat" % result_directory, "w")
  test_log_file = None
  try:
    log_file.write("iteration time doc.count likelihood\n")

    test_log_file = open("%s/test-log.dat" % result_directory, "w")
    test_log_file.write("iteration time doc.count score word.count score.split word.count.split\n")

    while (options.max_iter == -1 or iteration < options.max_iter) and total_time < options.max_time:
      t0 = time.clock()
      # Run one step iteration.
      likelihood = bhdp.em_on_large_data(c_train_filename, options.var_converge, fresh=(iteration == 0))
      if iteration > 0:
        # Relative change with respect to the previous pass.
        converge = (likelihood - old_likelihood)/abs(old_likelihood)
      old_likelihood = likelihood
      print("iter = %d, likelihood = %f, converge = %f" % (iteration, likelihood, converge))
      if converge < 0:
        print("warning, likelihood is decreasing!")

      # Only the training call above is timed; snapshots and scoring below
      # do not count towards max_time.
      total_time += time.clock() - t0

      iteration += 1  # increase the iteration counter
      total_doc_count += options.D

      log_file.write("%d %d %d %.5f\n" % (iteration, total_time, total_doc_count, likelihood))
      log_file.flush()

      bhdp.save_topics('%s/doc_count-%d.topics' % (result_directory, total_doc_count))
      _dump_model(bhdp, '%s/doc_count-%d.model' % (result_directory, total_doc_count))

      print("\tworking on predictions.")
      (lda_alpha, lda_beta) = bhdp.hdp_to_lda()

      # prediction on the fixed test set
      print("\tworking on fixed test data.")
      (test_score, test_score_split, c_test_word_count_split) = \
          _score_fixed_test_set(c_test.docs, lda_alpha, lda_beta)

      test_log_file.write(row_format % (iteration, total_time,
                          total_doc_count, test_score, c_test_word_count,
                          test_score_split, c_test_word_count_split))
      test_log_file.flush()

      # prediction on the test set in the folds (currently disabled)
      # print "\tworking on test data in folds."
      # test_folds_log_file = file("%s/doc_count-%d.test.folds" % (result_directory, total_doc_count), "w")
      # test_folds_log_file.write("fold doc.id word count score\n")
      # for i in range(num_folds):
      #   train_data = c_test_train_folds[i]
      #   test_data = c_test_test_folds[i]
      #   for (doc_id, train_doc, test_doc) in izip(range(train_data.num_docs), train_data.docs, test_data.docs):
      #     if test_doc.total > 0:
      #       (likelihood, gamma) = hdp.lda_e_step(train_doc, lda_alpha, lda_beta)
      #       theta = gamma/np.sum(gamma)
      #       lda_betad = lda_beta[:, test_doc.words]
      #       log_predicts = np.log(np.dot(theta, lda_betad))
      #       log_info = "\n".join(["%d %d %d %d %.5f" % (i, doc_id, word, word_count, f) for (word, word_count, f) in izip(test_doc.words, test_doc.counts, log_predicts)])
      #       test_folds_log_file.write(log_info + "\n")
      # test_folds_log_file.close()

    log_file.close()

    print("Saving the final model and topics.")
    bhdp.save_topics('%s/final.topics' % result_directory)
    _dump_model(bhdp, '%s/final.model' % result_directory)

    (lda_alpha, lda_beta) = bhdp.hdp_to_lda()

    # Score the final model on the fixed test set and append one more row.
    print("\tworking on fixed test data.")
    (test_score, test_score_split, c_test_word_count_split) = \
        _score_fixed_test_set(c_test.docs, lda_alpha, lda_beta)

    test_log_file.write(row_format % (iteration, total_time,
                        total_doc_count, test_score, c_test_word_count,
                        test_score_split, c_test_word_count_split))
    test_log_file.flush()
  finally:
    # Release both log handles even when training or scoring raises;
    # closing an already closed file is a no-op.
    log_file.close()
    if test_log_file is not None:
      test_log_file.close()
Exemplo n.º 4
0
def run_hdp():
    """Fit a labelled HDP via `em`, then predict labels for the test corpus.

    Settings are taken from `parse_args()` and echoed to
    `<directory>/corpus-<corpus_name>/options.dat`.  Once the training loop
    ends, a separate `hdp.hdp` instance infers `m_var_zeta` for every test
    document; that matrix is passed through `utils.converttoliblinear` and
    classified with the model returned by the last `em` call.  The first
    value returned by `evaluations` is printed as the test accuracy.
    """
    opts = parse_args()
    random.seed(opts.random_seed)

    # --- training inputs ---------------------------------------------------
    train_path = opts.data_path
    train_corpus = read_data(train_path)
    train_labels = utils.read_1D_array('trdataSLDALabels.txt')
    train_size = utils.read_1D_array('trdatasz.txt')[0]

    # --- test inputs -------------------------------------------------------
    test_corpus = read_data(opts.test_data_path)
    test_word_total = sum(doc.total for doc in test_corpus.docs)
    # NOTE(review): labels and size for the test set come from the very same
    # files as the training ones - verify that this is not a copy/paste slip.
    test_labels = utils.read_1D_array('trdataSLDALabels.txt')
    test_size = utils.read_1D_array('trdatasz.txt')[0]

    # Start with empty local-sticks files.
    for sticks_name in ("localsticks_a.txt", "localsticks_b.txt"):
        open(sticks_name, "w").close()

    result_directory = "%s/corpus-%s" % (opts.directory, opts.corpus_name)
    print("creating directory %s" % result_directory)
    if not os.path.isdir(result_directory):
        os.makedirs(result_directory)

    # Record every parsed option as a "name value" line.
    options_file = open("%s/options.dat" % result_directory, "w")
    for name, setting in opts.__dict__.items():
        options_file.write(str(name) + " " + str(setting) + "\n")
    options_file.close()

    print("creating hdp instance for training.")
    train_hp = hdp.hdp_hyperparameter(opts.alpha, opts.alpha, opts.gamma,
                                      opts.gamma, True)
    trainer = hdp.hdp(opts.T, opts.K, opts.D, opts.W, opts.eta, train_size,
                      train_hp)

    print("setting up counters and log files.")
    step = 0
    elapsed = 0.0
    docs_seen = 0

    likelihood = 0.0
    previous_likelihood = 0.0
    converge = 1.0

    # NOTE(review): when max_iter is -1 nothing else terminates this loop.
    while opts.max_iter == -1 or step < opts.max_iter:
        started = time.clock()
        # Run one step iteration.
        print("EM iteration starts!")
        likelihood, trmodel = trainer.em(train_corpus, train_labels,
                                         opts.var_converge, train_size,
                                         fresh=(step == 0))
        if step > 0:
            # Relative change with respect to the previous pass.
            delta = likelihood - previous_likelihood
            converge = delta / abs(previous_likelihood)
        previous_likelihood = likelihood
        print("iter = %d, likelihood = %f, converge = %f" % (step, likelihood, converge))
        if converge < 0:
            print("warning, likelihood is decreasing!")
        elapsed += time.clock() - started
        step += 1
        docs_seen += opts.D

    # NOTE(review): waits for a line on stdin before testing begins; this
    # looks like a manual pause - confirm it should stay.
    raw_input()

    # --- prediction on the fixed test data ----------------------------------
    print("\tworking on fixed test data.")
    print("creating hdp instance for test.")
    test_hp = hdp.hdp_hyperparameter(opts.alpha, opts.alpha, opts.gamma,
                                     opts.gamma, False)
    tester = hdp.hdp(opts.T, opts.K, opts.D, opts.W, opts.eta, test_size,
                     test_hp)

    Elogbeta = dirichlet_expectation(trainer.m_beta)  # the topics
    Elogsticks_1st = expect_log_sticks(trainer.m_var_sticks)  # global sticks
    m_var_zeta = np.zeros((test_size, opts.K))
    for doc_index, doc in enumerate(test_corpus.docs):
        doc_score, m_var_zeta = tester.doc_inference(doc, doc_index,
                                                     Elogbeta,
                                                     Elogsticks_1st,
                                                     opts.var_converge,
                                                     m_var_zeta)
        likelihood += doc_score

    features = utils.converttoliblinear(m_var_zeta)
    p_label, p_acc, p_val = predict(test_labels, features, trmodel)
    ACC, MSE, SCC = evaluations(test_labels, p_label)
    print("test accuracy: %f" % ACC)