Example #1
def main():
    course_dir = '/usr/local/data/cs465/'

    if len(sys.argv) < 5 or (sys.argv[1] == 'TRAIN' and len(sys.argv) != 5):
        print("""
Prints the log-probability of each file under a smoothed n-gram model.

Usage:   {} TRAIN smoother lexicon trainpath
         {} TEST smoother lexicon trainpath files...
Example: {} TRAIN add0.01 {}hw-lm/lexicons/words-10.txt switchboard-small
         {} TEST add0.01 {}hw-lm/lexicons/words-10.txt switchboard-small {}hw-lm/speech/sample*

Possible values for smoother: uniform, add1, backoff_add1, backoff_wb, loglinear1
  (the \"1\" in add1/backoff_add1 can be replaced with any real lambda >= 0
   the \"1\" in loglinear1 can be replaced with any C >= 0 )
lexicon is the location of the word vector file, which is only used in the loglinear model
trainpath is the location of the training corpus
  (the search path for this includes "{}")
""".format(sys.argv[0], sys.argv[0], sys.argv[0], course_dir, sys.argv[0],
           course_dir, course_dir, Probs.DEFAULT_TRAINING_DIR))
        sys.exit(1)

    mode = sys.argv[1]
    argv = sys.argv[2:]

    smoother = argv.pop(0)
    lexicon = argv.pop(0)
    train_file = argv.pop(0)

    if mode == 'TRAIN':
        lm = Probs.LanguageModel()
        lm.set_smoother(smoother)
        lm.read_vectors(lexicon)
        lm.train(train_file)
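        # Save the trained model under a filename derived from (smoother,
        # lexicon, trainpath) so the TEST branch below can reload it by passing
        # the same three arguments.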
        lm.save(get_model_filename(smoother, lexicon, train_file))
    elif mode == 'TEST':
        if not argv:
            print("warning: no input files specified")
        lm = Probs.LanguageModel.load(
            get_model_filename(smoother, lexicon, train_file))
        # We use natural log for our internal computations and that's
        # the kind of log-probability that fileLogProb returns.
        # But we'd like to print a value in bits: so we convert
        # log base e to log base 2 at print time, by dividing by log(2).
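        # For example, a file whose log-probability is -693.1 nats comes out as
        # -693.1 / 0.6931 = -1000.0 bits.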

        total_cross_entropy = 0.
        for testfile in argv:
            ce = lm.filelogprob(testfile) / math.log(2)
            #print("{:g}\t{}".format(ce, testfile))
            total_cross_entropy -= ce
        print(total_cross_entropy)
        print(sum([lm.num_tokens(testfile) for testfile in argv]))
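        # Dividing the two numbers printed above (total bits, then total token
        # count) gives the overall cross-entropy in bits per token.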
        #print('Overall cross-entropy:\t{0:.5f}'.format(total_cross_entropy/sum([lm.num_tokens(testfile) for testfile in argv])))
    else:
        sys.exit(-1)
Example #2
def main():
    course_dir = '/usr/local/data/cs465/'
    argv = sys.argv[1:]

    if len(argv) < 2:
        print """
Prints the log-probability of each file under a smoothed n-gram model.

Usage:   %s smoother lexicon trainpath files...
Example: %s add0.01 %shw-lm/lexicons/words-10.txt switchboard-small %shw-lm/speech/sample*

Possible values for smoother: uniform, add1, backoff_add1, backoff_wb, loglinear1
  (the \"1\" in add1/backoff_add1 can be replaced with any real lambda >= 0
   the \"1\" in loglinear1 can be replaced with any C >= 0 )
lexicon is the location of the word vector file, which is only used in the loglinear model
trainpath is the location of the training corpus
  (the search path for this includes "%s")
""" % (sys.argv[0], sys.argv[0], course_dir, course_dir,
        Probs.DEFAULT_TRAINING_DIR)
        sys.exit(1)

    smoother = argv.pop(0)
    lexicon = argv.pop(0)
    train_file = argv.pop(0)

    if not argv:
        print "warning: no input files specified"

    lm = Probs.LanguageModel()
    lm.set_smoother(smoother)
    lm.read_vectors(lexicon)
    lm.train(train_file)

    # We use natural log for our internal computations and that's
    # the kind of log-probability that fileLogProb returns.
    # But we'd like to print a value in bits: so we convert
    # log base e to log base 2 at print time, by dividing by log(2).

    for testfile in argv:
        print "%g\t%s" % (lm.filelogprob(testfile) / math.log(2), testfile)
Example #3
def main():
  course_dir = '/usr/local/data/cs465/'

  if len(sys.argv) < 6 or (sys.argv[1] == 'TRAIN' and len(sys.argv) != 6):
    # The usage message below is commented out; still exit on bad arguments.
    sys.exit(1)

#     print("""
# Prints the log-probability of each file under a smoothed n-gram model.
#
# Usage:   {} TRAIN smoother lexicon trainpath
#          {} TEST smoother lexicon trainpath files...
# Example: {} TRAIN add0.01 {}hw-lm/lexicons/words-10.txt switchboard-small
#          {} TEST add0.01 {}hw-lm/lexicons/words-10.txt switchboard-small {}hw-lm/speech/sample*
#
# Possible values for smoother: uniform, add1, backoff_add1, backoff_wb, loglinear1
#   (the \"1\" in add1/backoff_add1 can be replaced with any real lambda >= 0
#    the \"1\" in loglinear1 can be replaced with any C >= 0 )
# lexicon is the location of the word vector file, which is only used in the loglinear model
# trainpath is the location of the training corpus
#   (the search path for this includes "{}")
# """.format(sys.argv[0], sys.argv[0], sys.argv[0], course_dir, sys.argv[0], course_dir, course_dir, Probs.DEFAULT_TRAINING_DIR))

  mode = sys.argv[1]
  argv = sys.argv[2:]

  smoother = argv.pop(0)
  lexicon = argv.pop(0)
  train_file1 = argv.pop(0)
  train_file2 = argv.pop(0)
  epochs = 10
  if mode == 'TRAIN':

    #Train Model1
    lm1 = Probs.LanguageModel()
    # Comment out the following line when you want a cross-entropy reading.
    lm1.set_vocab_size(train_file1, train_file2)
    lm1.set_smoother(smoother)
    lm1.read_vectors(lexicon)
    lm1.train(train_file1,epochs)
    lm1.save(get_model_filename(smoother, lexicon, train_file1))

    #Train Model2
    lm2 = Probs.LanguageModel()
    # Comment out the following line when you want a cross-entropy reading.
    lm2.set_vocab_size(train_file1, train_file2)
    lm2.set_smoother(smoother)
    lm2.read_vectors(lexicon)
    lm2.train(train_file2, epochs)
    lm2.save(get_model_filename(smoother, lexicon, train_file2))
  elif mode == 'TEST':
    if not argv:
      print("warning: no input files specified")
    
    priorprob_corpus1 = float(argv.pop(0))

    #Load parameters of the trained models
    lm1 = Probs.LanguageModel.load(get_model_filename(smoother, lexicon, train_file1))
    lm2 = Probs.LanguageModel.load(get_model_filename(smoother, lexicon, train_file2))

    # We use natural log for our internal computations and that's
    # the kind of log-probability that fileLogProb returns.
    # But we'd like to print a value in bits: so we convert
    # log base e to log base 2 at print time, by dividing by log(2).

    #Class counters to keep track of number of predictions in each class
    class1_counter = 0
    class2_counter = 0
    #Counter of wrong predictions for evaluation
    wrong_predictions = 0
    total_cross_entropy1 = 0.
    total_cross_entropy2 = 0.
    total_cross_entropy = 0.
    files_length_accuracy = defaultdict(list)
    #Loop for predicting each dev/test file
    for testfile in argv:
      ce1 =  lm1.filelogprob(testfile) / math.log(2)
#      print("#{:g}\t{}".format(ce1, testfile))
      #Number of tokens in the test file used for averaging probability
      token_count = lm1.num_tokens(testfile)
      #Compute posterior probability for class 1
      map1 = ((math.log(priorprob_corpus1) + lm1.filelogprob(testfile)) / math.log(2) ) / token_count


      #Compute posterior probability for class 2
      map2 = ((math.log(1 - priorprob_corpus1) +  lm2.filelogprob(testfile)) / math.log(2)) / token_count
      ce2 =  lm2.filelogprob(testfile) / math.log(2)
#      print("#{:g}\t{}".format(ce2, testfile))

      total_cross_entropy1 -= ce1
      total_cross_entropy2 -= ce2

      #Compare probabilities for prediction
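      # map1 and map2 are divided by the same token_count, so the comparison
      # below is the usual MAP rule: pick the class with the larger
      # log prior + log likelihood.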
      if map1 > map2:
          print(train_file1,"\t",testfile)
          class1_counter += 1
          prediction, filelength = evaluate(testfile, 'english')
          wrong_predictions += prediction
      else:
          print(train_file2, "\t", testfile)
          class2_counter += 1
          prediction, filelength = evaluate(testfile, 'spanish')
          wrong_predictions += prediction
    
      #files_length_accuracy[filelength].append(1-prediction)

    #Print Outputs for Class 1
    print(class1_counter, "files were more probably", train_file1,
          "({:.2f}%)".format(100 * class1_counter / (class1_counter + class2_counter)))
    #Print Outputs for Class 2
    print(class2_counter, "files were more probably", train_file2,
          "({:.2f}%)".format(100 * class2_counter / (class1_counter + class2_counter)))
    print("#", wrong_predictions, "Error Rate:",
          "({:.2f}%)".format(100 * wrong_predictions / (class1_counter + class2_counter)))
    
    #filename = 'P3_{}_{}_{}_{}_data.txt'.format(smoother, basename(lexicon), basename(train_file1), basename(train_file2))
    #f = open(filename, "w")
    #for key, val in  files_length_accuracy.items():
    #    print("#File of length ", key," were ", 100*sum(val)/len(val), "% accurate.")
    #    f.write(str(key)+" "+str(100*sum(val)/len(val))+"\n")
    #f.close()


    # for p1,p2 in zip(ce1_list, ce2_list):
    #     if p1> p2:

    
    total_cross_entropy = total_cross_entropy1 + total_cross_entropy2
#    print('#Overall cross-entropy:\t{0:.5f}'.format(total_cross_entropy1/sum([lm1.num_tokens(testfile) for testfile in argv])))
#    print('#Overall cross-entropy:\t{0:.5f}'.format(total_cross_entropy2/sum([lm2.num_tokens(testfile) for testfile in argv])))
    print('#Overall cross-entropy:\t{0:.5f}'.format(0.5*total_cross_entropy/sum([lm1.num_tokens(testfile) for testfile in argv])))

  else:
    sys.exit(-1)

if __name__ == "__main__":
  main()
Example #4
def main():
    course_dir = '/usr/local/data/cs465/'

    if len(sys.argv) < 6 or (sys.argv[1] == 'TRAIN' and len(sys.argv) != 6):
        print("""
Prints the log-probability of each file under a smoothed n-gram model.

Usage:   {} TRAIN smoother lexicon trainpath
         {} TEST smoother lexicon trainpath files...
Example: {} TRAIN add0.01 {}hw-lm/lexicons/words-10.txt switchboard-small
         {} TEST add0.01 {}hw-lm/lexicons/words-10.txt switchboard-small {}hw-lm/speech/sample*

Possible values for smoother: uniform, add1, backoff_add1, backoff_wb, loglinear1
  (the \"1\" in add1/backoff_add1 can be replaced with any real lambda >= 0
   the \"1\" in loglinear1 can be replaced with any C >= 0 )
lexicon is the location of the word vector file, which is only used in the loglinear model
trainpath is the location of the training corpus
  (the search path for this includes "{}")
""".format(sys.argv[0], sys.argv[0], sys.argv[0], course_dir, sys.argv[0],
           course_dir, course_dir, Probs.DEFAULT_TRAINING_DIR))
        sys.exit(1)

    mode = sys.argv[1]
    argv = sys.argv[2:]

    smoother = argv.pop(0)
    lexicon = argv.pop(0)
    train_file1 = argv.pop(0)
    train_file2 = argv.pop(0)

    if mode == 'TRAIN':

        lm = Probs.LanguageModel()
        lm.set_vocab_size(train_file1, train_file2)
        lm.set_smoother(smoother)
        lm.read_vectors(lexicon)
        lm.train(train_file1)
        lm.save(get_model_filename(smoother, lexicon, train_file1))

        lm.train(train_file2)
        lm.save(get_model_filename(smoother, lexicon, train_file2))

    elif mode == 'TEST':
        if len(argv) < 2:
            print("warning: not enough")
        lm1 = Probs.LanguageModel.load(
            get_model_filename(smoother, lexicon, train_file1))
        lm2 = Probs.LanguageModel.load(
            get_model_filename(smoother, lexicon, train_file2))
        # We use natural log for our internal computations and that's
        # the kind of log-probability that fileLogProb returns.
        # But we'd like to print a value in bits: so we convert
        # log base e to log base 2 at print time, by dividing by log(2).
        prior_gen = argv.pop(0)
        prior_gen = float(prior_gen)

        #file_len_acc = open("len_acc.txt","w")
        total_cross_1 = 0.
        total_cross_2 = 0.

        sum_acc1 = 0.
        sum_acc2 = 0.

        count_1 = 0
        count_2 = 0
        file_count = 0
        for testfile in argv:
            file_count += 1

            log_prior_1 = math.log(prior_gen, 2)
            ce1 = lm1.filelogprob(testfile) / math.log(2)
            log_posterior_1 = ce1 + log_prior_1

            log_prior_2 = math.log(1 - prior_gen, 2)
            ce2 = lm2.filelogprob(testfile) / math.log(2)
            log_posterior_2 = ce2 + log_prior_2

            total_cross_1 -= log_posterior_1
            total_cross_2 -= log_posterior_2

            if log_posterior_1 > log_posterior_2:
                print(train_file1 + "\t" + testfile)
                count_1 += 1
            else:
                print(train_file2 + "\t" + testfile)
                count_2 += 1

            #filename_spt = testfile.split("/")
            #length = filename_spt[2].split(".")[1]

            CON = max(0 - log_posterior_1, 0 - log_posterior_2)
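            # CON = -min(log_posterior_1, log_posterior_2): adding it to both
            # scores before exponentiating leaves the ratio p1/(p1+p2) unchanged
            # while keeping the exponents non-negative, so the smaller term
            # becomes exactly 1 instead of underflowing to 0.  The except branch
            # falls back to a hard 0/1 count if the larger term overflows.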
            try:
                p1 = pow(2, log_posterior_1 + CON)
                p2 = pow(2, log_posterior_2 + CON)

                acc1 = p1 / (p1 + p2)
                acc2 = p2 / (p1 + p2)

                #print(acc1)
                #print(acc2)

                sum_acc1 += acc1
                sum_acc2 += acc2

            except Exception as e:
                #print(e)

                if log_posterior_1 > log_posterior_2:
                    sum_acc1 += 1

                else:
                    sum_acc2 += 1

        setname = testfile.split("/")[1]

        #if setname == "english":
        #   print(sum_acc1)
        #  print(total_cross_1)

        # elif setname == "spanish":
        #    print(sum_acc2)
        #   print(total_cross_2)

        # print(file_count)
        # print(sum([lm1.num_tokens(testfile) for testfile in argv]))

        #if filename_spt[1] == train_file1:

        #file_len_acc.write(length+" "+str(log_posterior_1)+"\n")

        #elif filename_spt[1] == train_file2:

        #file_len_acc.write(length+" "+str(log_posterior_2)+"\n")

        #file_len_acc.close()

        prob1 = round((float(count_1) / float(count_1 + count_2)) * 100, 2)
        prob2 = round((float(count_2) / float(count_1 + count_2)) * 100, 2)

        print(
            str(count_1) + " files were more probably " + train_file1 + " (" +
            str(prob1) + "%)")
        print(
            str(count_2) + " files were more probably " + train_file2 + " (" +
            str(prob2) + "%)")
    else:
        sys.exit(-1)
Example #5
def main():
  course_dir = '/usr/local/data/cs465/'

  if len(sys.argv) < 5 or (sys.argv[1] == 'TRAIN' and len(sys.argv) != 6):
    print("""
Prints the log-probability of each file under a smoothed n-gram model.

Usage:   {} TRAIN smoother lexicon trainpath
         {} TEST smoother lexicon trainpath files...
Example: {} TRAIN add0.01 {}hw-lm/lexicons/words-10.txt switchboard-small
         {} TEST add0.01 {}hw-lm/lexicons/words-10.txt switchboard-small {}hw-lm/speech/sample*

Possible values for smoother: uniform, add1, backoff_add1, backoff_wb, loglinear1
  (the \"1\" in add1/backoff_add1 can be replaced with any real lambda >= 0
   the \"1\" in loglinear1 can be replaced with any C >= 0 )
lexicon is the location of the word vector file, which is only used in the loglinear model
trainpath is the location of the training corpus
  (the search path for this includes "{}")
""".format(sys.argv[0], sys.argv[0], sys.argv[0], course_dir, sys.argv[0], course_dir, course_dir, Probs.DEFAULT_TRAINING_DIR))
    sys.exit(1)

  mode = sys.argv[1]
  argv = sys.argv[2:]

  smoother = argv.pop(0)
  lexicon = argv.pop(0)
  train_file1 = argv.pop(0)
  train_file2 = argv.pop(0)

  if mode == 'TRAIN':
    lm = Probs.LanguageModel()
    lm.set_vocab_size(train_file1, train_file2)
    lm.set_smoother(smoother)
    lm.read_vectors(lexicon)
    lm.train(train_file1)
    lm.save(get_model_filename(smoother, lexicon, train_file1))
    lm.train(train_file2)
    lm.save(get_model_filename(smoother, lexicon, train_file2))
  elif mode == 'TEST':
    if not argv:
      print("warning: no input files specified")
    lm1 = Probs.LanguageModel.load(get_model_filename(smoother, lexicon, train_file1))
    lm2 = Probs.LanguageModel.load(get_model_filename(smoother, lexicon, train_file2))
    prior_lm1 = float(argv.pop(0))
    assert 0 <= prior_lm1 <= 1
    prior_lm2 = 1 - prior_lm1
    # We use natural log for our internal computations and that's
    # the kind of log-probability that fileLogProb returns.  
    # But we'd like to print a value in bits: so we convert
    # log base e to log base 2 at print time, by dividing by log(2).

    lm1_type = train_file1.split('/')[-1].split('.')[0]
    lm2_type = train_file2.split('/')[-1].split('.')[0]
    
    file_correct = {}
    file_total = {}
    
    for testfile in argv:
      file_length = testfile.split("/")[-1].split(".")[1]
      file_type = testfile.split("/")[-1].split(".")[0]
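      # Assumes test files are named like "<class>.<length>.<id>", so the first
      # dot-separated field is the true class and the second is a length bucket.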

      lm1_ce = (math.log(prior_lm1) + lm1.filelogprob(testfile)) / math.log(2)
      lm2_ce = (math.log(prior_lm2) + lm2.filelogprob(testfile)) / math.log(2)

      file_total[file_length] = file_total.get(file_length, 0) + 1

      if lm1_ce > lm2_ce:
          if file_type == lm1_type:
              file_correct[file_length] = file_correct.get(file_length, 0) + 1
      else:
          if file_type == lm2_type:
              file_correct[file_length] = file_correct.get(file_length, 0) + 1
  
    accuracies = {}
    for key, value in file_total.items():
        accuracies[key] = file_correct[key] / float(value)
    print(accuracies)
  else:
    sys.exit(-1)
Example #6
def main():
    course_dir = '/usr/local/data/cs465/'

    if len(sys.argv) < 5 or (sys.argv[1] == 'TRAIN' and len(sys.argv) != 6):
        print("""
Prints the log-probability of each file under a smoothed n-gram model.

Usage:   {} TRAIN smoother lexicon trainpath
         {} TEST smoother lexicon trainpath files...
Example: {} TRAIN add0.01 {}hw-lm/lexicons/words-10.txt switchboard-small
         {} TEST add0.01 {}hw-lm/lexicons/words-10.txt switchboard-small {}hw-lm/speech/sample*

Possible values for smoother: uniform, add1, backoff_add1, backoff_wb, loglinear1
  (the \"1\" in add1/backoff_add1 can be replaced with any real lambda >= 0
   the \"1\" in loglinear1 can be replaced with any C >= 0 )
lexicon is the location of the word vector file, which is only used in the loglinear model
trainpath is the location of the training corpus
  (the search path for this includes "{}")
""".format(sys.argv[0], sys.argv[0], sys.argv[0], course_dir, sys.argv[0],
           course_dir, course_dir, Probs.DEFAULT_TRAINING_DIR))
        sys.exit(1)

    mode = sys.argv[1]
    argv = sys.argv[2:]

    smoother = argv.pop(0)
    lexicon = argv.pop(0)
    train_file1 = argv.pop(0)
    train_file2 = argv.pop(0)

    if mode == 'TRAIN':
        lm = Probs.LanguageModel()
        lm.set_vocab_size(train_file1, train_file2)
        lm.set_smoother(smoother)
        lm.read_vectors(lexicon)
        lm.train(train_file1)
        lm.save(get_model_filename(smoother, lexicon, train_file1))
        lm.train(train_file2)
        lm.save(get_model_filename(smoother, lexicon, train_file2))
    elif mode == 'TEST':
        if not argv:
            print("warning: no input files specified")
        lm1 = Probs.LanguageModel.load(
            get_model_filename(smoother, lexicon, train_file1))
        lm2 = Probs.LanguageModel.load(
            get_model_filename(smoother, lexicon, train_file2))
        prior_lm1 = float(argv.pop(0))
        assert 0 <= prior_lm1 <= 1
        prior_lm2 = 1 - prior_lm1
        # We use natural log for our internal computations and that's
        # the kind of log-probability that fileLogProb returns.
        # But we'd like to print a value in bits: so we convert
        # log base e to log base 2 at print time, by dividing by log(2).

        lm1_count = 0
        lm2_count = 0

        train_file1 = basename(train_file1)
        train_file2 = basename(train_file2)

        for testfile in argv:
            lm1_ce = (math.log(prior_lm1) +
                      lm1.filelogprob(testfile)) / math.log(2)
            lm2_ce = (math.log(prior_lm2) +
                      lm2.filelogprob(testfile)) / math.log(2)
            if lm1_ce > lm2_ce:
                lm1_count += 1
                print(train_file1 + '\t' + testfile)
            else:
                lm2_count += 1
                print(train_file2 + '\t' + testfile)
        print("{0:d} files were more probably {1:s} ({2:.2f}%)".format(lm1_count, train_file1, \
          float(100.0 * lm1_count / (lm1_count + lm2_count))))
        print("{0:d} files were more probably {1:s} ({2:.2f}%)".format(lm2_count, train_file2, \
          float(100.0 * lm2_count / (lm1_count + lm2_count))))
    else:
        sys.exit(-1)
Example #7
def main():
    course_dir = '/usr/local/data/cs465/'

    if len(sys.argv) < 5:
        print("""
Prints the log-probability of each file under a smoothed n-gram model.

Usage:   {} smoother lexicon trainpath files...
Example: {} add0.01 {}hw-lm/lexicons/words-10.txt switchboard-small {}hw-lm/speech/sample*

Possible values for smoother: uniform, add1, backoff_add1, backoff_wb, loglinear1
  (the \"1\" in add1/backoff_add1 can be replaced with any real lambda >= 0
   the \"1\" in loglinear1 can be replaced with any C >= 0 )
lexicon is the location of the word vector file, which is only used in the loglinear model
trainpath is the location of the training corpus
  (the search path for this includes "{}")
""".format(sys.argv[0], sys.argv[0], sys.argv[0], course_dir, sys.argv[0],
           course_dir, course_dir, Probs.DEFAULT_TRAINING_DIR))
        sys.exit(1)

    argv = sys.argv[1:]

    smoother = argv.pop(0)
    lexicon = argv.pop(0)
    train_file = argv.pop(0)

    if not argv:
        print("warning: no input files specified")

    lm = Probs.LanguageModel()
    lm.set_smoother(smoother)
    lm.read_vectors(lexicon)
    lm.train(train_file)

    total_words = 0
    total_error = 0.0
    for testfile in argv:
        f = open(testfile)
        line = f.readline()  # read and skip the file's first line
        sequences = []

        # Read the remaining lines; each is assumed to hold
        #   error_rate  log_p(u|w)  word_count  <s> w1 ... wn </s>
        line = f.readline()
        while line:
            w_list = []
            items = line.split()
            line = f.readline()
            error_rate = float(items[0])
            log_p_uw = float(items[1])
            words = int(items[2])
            for i in range(3, words + 5):
                w_list.append(items[i])
            w_list = w_list[1:-1]
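            # Drop the first and last tokens (assumed to be sentence-boundary
            # markers); the trigram loop below adds Probs.BOS and Probs.EOS
            # itself.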

            # log probability computation
            # trigram model
            log_prob = 0.0
            x, y = Probs.BOS, Probs.BOS
            for z in w_list:
                log_prob += math.log(lm.prob(x, y, z))
                x = y
                y = z
            log_prob += math.log(lm.prob(x, y, Probs.EOS))

            # bigram model
            #y = Probs.BOS
            #for z in w_list:
            #  log_prob += math.log(lm.prob_bigram(y, z))
            #  y = z
            #log_prob += math.log(lm.prob_bigram(y, Probs.EOS))

            # unigram model
            #for z in w_list:
            #  log_prob += math.log(lm.prob_unigram(z))

            sequences.append(
                (error_rate, words, log_p_uw + log_prob / math.log(2)))
        f.close()

        # Pick the best match, the one with highest probability
        best_match = max(sequences, key=lambda item: item[2])
        total_error += best_match[0] * best_match[1]
        total_words += best_match[1]
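        # best_match is (error_rate, word_count, score); weighting each file's
        # error rate by its word count makes the final OVERALL line a
        # length-weighted average error rate.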
        print('{0}\t{1}'.format(best_match[0], testfile))

    print('{0:0.03f}\t{1}'.format(total_error / total_words, "OVERALL"))