def main(args):
  logging.basicConfig(level=LOGGING_LEVEL, format="DEBUG: %(message)s")

  if len(args) < 3 or len(args) > 4:
    print('usage: %s training-file dev-file [output-dir]' % args[0])
    print('       output-dir is optional, default is "%s"' % OUTPUT_DIR_DEFAULT)
    sys.exit(1)

  training_filename = args[1]
  dev_filename = args[2]
  output_dir = args[3] if len(args) == 4 else OUTPUT_DIR_DEFAULT

  logging.debug('Training models...')

  # train all the models!
  unigram_model = Unigram(training_filename)
  logging.debug('Done training unigram model')
  bigram_model = Bigram(training_filename)
  logging.debug('Done training bigram model')
  trigram_model = Trigram(training_filename)
  logging.debug('Done training trigram model')

  with open(dev_filename, 'r') as dev_file:
    dev_words = [line.strip() for line in dev_file]

  # write predictions out to disk
  unigram_model.write_probability_list(dev_words, get_output_filename(output_dir, dev_filename, 'unigram'))
  logging.debug('Wrote dev set predictions using unigram model')
  bigram_model.write_probability_list(dev_words, get_output_filename(output_dir, dev_filename, 'bigram'))
  logging.debug('Wrote dev set predictions using bigram model')
  trigram_model.write_probability_list(dev_words, get_output_filename(output_dir, dev_filename, 'trigram'))
  logging.debug('Wrote dev set predictions using trigram model')
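# Note: main() above calls a helper get_output_filename() that is not shown in
# this snippet. A minimal sketch, assuming it simply joins the output directory,
# the dev-file basename, and the model name (the naming scheme is an assumption):
import os

def get_output_filename(output_dir, dev_filename, model_name):
  base = os.path.splitext(os.path.basename(dev_filename))[0]
  return os.path.join(output_dir, '%s.%s.probs' % (base, model_name))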
    def find_unigrams(self):
        print("finding unigrams")
        print("input path", self.input_path)
        u = Unigram()
        u.set_input_path(self.input_path)
        u.set_output_path(self.unigram_output_path)
        u.find_unigram()
        self.no_of_words = u.no_of_unigrams
        print("self.no_of_words", self.no_of_words)
        self.words = u.ranked_list
        print("self.words", len(self.words))
Example #3
class Author(object):

    __name = ""
    __unigram = Unigram()
    __bigram = Bigram()
    __trigram = Trigram()

    # Constructor.
    def __init__(self, name):
        self.__name = name
        self.__unigram = Unigram()
        self.__bigram = Bigram()
        self.__trigram = Trigram()

    # Getters.
    def getUnigram(self):
        return self.__unigram

    def getBigram(self):
        return self.__bigram

    def getTrigram(self):
        return self.__trigram

    def getName(self):
        return self.__name

    # Caller method: updates the frequency counts in the unigram, bigram, and trigram models.
    def counterCaller(self, separated_line):
        self.__unigram.counter(separated_line)
        self.__bigram.counter(separated_line)
        self.__trigram.counter(separated_line)

    # Caller method: generates new text from the unigram, bigram, and trigram models.
    def generatorCaller(self, uni_list, bi_list, tri_list):
        self.__unigram.generator(uni_list)
        self.__bigram.generator(bi_list)
        self.__trigram.generator(tri_list)
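# Hedged usage sketch (not from the original source): feed tokenised lines to an
# Author instance to accumulate its n-gram counts. The corpus file name and the
# whitespace tokenisation are assumptions.
author = Author("sample_author")
with open("sample_author.txt") as corpus:
    for line in corpus:
        author.counterCaller(line.split())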
Example #4
def unigram(train_sentences, test_sentences):
    #  convert the sentences to tagged samples, since sentence structure is not needed here
    train = sentences_to_samples(train_sentences)
    test = sentences_to_samples(test_sentences)

    unigram_HMM = Unigram(train)
    unigram_HMM.train()

    #  initialisation of lists of samples containing known and unknown words
    test_known_words, test_unknown_words = divide_test_to_known_and_unknown_samples(
        train_sentences, test_sentences)

    #  evaluation of the accuracy for each case
    print("Accuracy rate for unknown words: ",
          unigram_HMM.get_accuracy_rate(np.array(test_unknown_words)))
    print("Accuracy rate for known words: ",
          unigram_HMM.get_accuracy_rate(np.array(test_known_words)))
    print("Total accuracy rate: ",
          unigram_HMM.get_accuracy_rate(np.array(test)))
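# unigram() above depends on sentences_to_samples(), which is not shown. A
# minimal sketch, assuming each sentence is already a sequence of (word, tag)
# pairs that only needs to be flattened into one list of samples:
def sentences_to_samples(sentences):
    return [sample for sentence in sentences for sample in sentence]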
Example #5
    def __init__(self, name):
        self.__name = name
        self.__unigram = Unigram()
        self.__bigram = Bigram()
        self.__trigram = Trigram()
Example #6
print(os.cpu_count())
Config.num_threads = os.cpu_count()

Config.epsilon = args.epsilon
Config.learning_rate = args.lr
Config.lamb = args.lamb
Config.t = args.t

data = IOModule()
data_set = data.read_file(Config.train_data)
valid_set = data.read_file(Config.validate_data)
test_set = data.read_file(Config.test_data)

if args.model == 'unigram':
    model = Unigram(data_set, valid_set, test_set)
elif args.model == 'ngram':
    model = BiTrigram(data_set, valid_set, test_set)
elif args.model == 'custom':
    model = CustomModel(data_set, valid_set, test_set)
elif args.model == 'best':
    model1 = BiTrigram(data_set, valid_set, test_set)
    model2 = CustomModel(data_set, valid_set, test_set)
    model = Model(data_set, valid_set, test_set)
    model.combine_features_from_models(model1, model2)

model.generate_input_matrix()

model.gradient_ascent()

model.plot_output(args.model)
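# This script reads its settings from an args object that is not defined in the
# excerpt. A hedged sketch of a matching argparse setup; the flag names, types,
# and defaults are assumptions:
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--model', choices=['unigram', 'ngram', 'custom', 'best'], default='unigram')
parser.add_argument('--epsilon', type=float, default=1e-4)
parser.add_argument('--lr', type=float, default=0.1)
parser.add_argument('--lamb', type=float, default=0.0)
parser.add_argument('--t', type=int, default=1)
args = parser.parse_args()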
Example #7
# Strip the error annotations: delete every token from a "targ=" marker
# through the closing "</ERR>" tag.
i = 0
while i < len(origWords):
    if origWords[i].startswith("targ="):
        while origWords[i] != "</ERR>":
            del origWords[i]
        del origWords[i]
        i += 1
    else:
        i += 1




### Build the unigram and bigram models.
unigram = Unigram(correctWords)
bigram = Bigram(correctWords, unigram)
# lettersMap counts character unigrams and bigrams over the correct words,
# plus a word-boundary marker and boundary+first-letter pairs.
lettersMap = {}

lettersMap[EditDistance.WORDBOUNDARY] = len(correctWords)
for word in correctWords:
    for i in range(len(word)):
        lettersMap[word[i]] = lettersMap.get(word[i], 0) + 1
    lettersMap[EditDistance.WORDBOUNDARY + word[0]] = lettersMap.get(EditDistance.WORDBOUNDARY + word[0], 0) + 1
    for i in range(len(word) - 1):
        lettersMap[word[i] + word[i + 1]] = lettersMap.get(word[i] + word[i + 1], 0) + 1

# Build the edit-distance counts: they are stored in a hashmap keyed by tuples such as ('ins', 'a', 'ab') -> 22
changeMap = {}
wrongWordsSet = {origWords[i] for i in correctionIndexes}
start = time.time()
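# Illustrative sketch of the changeMap layout described above (the exact
# semantics are an assumption): a key such as ('ins', 'a', 'ab') would count how
# often the context 'a' in the correct word was typed as 'ab', i.e. a 'b' was
# inserted after 'a'.
def record_change(change_map, kind, correct, typed):
    key = (kind, correct, typed)
    change_map[key] = change_map.get(key, 0) + 1

record_change(changeMap, 'ins', 'a', 'ab')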