def seq_prob(w_all, n, n_grams, n_min_1_grams):
    """Score a word sequence with an n-gram model.

    The sequence is cut into n-grams by ``parse_ngrams``; every n-gram is a
    whitespace-separated string (it is ``split()`` before scoring).  For
    ``n == 1`` each n-gram is scored with ``rel_prob``; for any other ``n``
    it is scored with ``cond_prob`` given its history, i.e. all of its
    tokens except the last one.  The individual scores are combined with
    ``product``.

    Args:
        w_all: The sequence to score, handed unchanged to ``parse_ngrams``
            (presumably a list of tokens -- confirm against the caller).
        n: Order of the n-gram model.
        n_grams: n-gram table forwarded to ``rel_prob`` / ``cond_prob``
            (presumably a ``Counter`` of n-gram strings -- confirm).
        n_min_1_grams: (n-1)-gram table forwarded to ``cond_prob``.

    Returns:
        Whatever ``product`` yields for the list of per-n-gram scores.
    """
    parsed_n_grams = parse_ngrams(w_all, n)

    # Compare by value: ``n is 1`` tested object identity and only worked
    # because CPython happens to cache small integers.
    if n == 1:
        return product(
            [rel_prob(ng.split(), n_grams) for ng in parsed_n_grams])

    return product([
        cond_prob(ng.split(), ng.split()[0:-1], n_grams, n_min_1_grams)
        for ng in parsed_n_grams
    ])
def seq_prob_add1(w_all, n, n_grams, n_min_1_grams, unigrams):
    """Score a word sequence, applying add-1 smoothing to the bi-gram model.

    The sequence is cut into n-grams by ``parse_ngrams``.  For ``n == 1``
    each n-gram is scored with the unsmoothed ``rel_prob``; for any other
    ``n`` it is scored with ``cond_prob_add1`` given its history (all of
    its tokens except the last one).  The individual scores are combined
    with ``product``.

    Args:
        w_all: The sequence to score, handed unchanged to ``parse_ngrams``.
        n: Order of the n-gram model.
        n_grams: n-gram table forwarded to ``rel_prob`` / ``cond_prob_add1``.
        n_min_1_grams: (n-1)-gram table forwarded to ``cond_prob_add1``.
        unigrams: Forwarded to ``cond_prob_add1`` (presumably the vocabulary
            size used by the smoothing term -- confirm against that helper).

    Returns:
        Whatever ``product`` yields for the list of per-n-gram scores.
    """
    parsed_n_grams = parse_ngrams(w_all, n)

    # Compare by value: ``n is 1`` tested object identity and only worked
    # because CPython happens to cache small integers.
    if n == 1:
        return product(
            [rel_prob(ng.split(), n_grams) for ng in parsed_n_grams])

    return product([
        cond_prob_add1(ng.split(), ng.split()[0:-1], n_grams, n_min_1_grams,
                       unigrams) for ng in parsed_n_grams
    ])
def seq_prob(w_all, n, n_grams, n_min_1_grams):
    """Score a word sequence with an n-gram model, padding it first.

    ``n - 1`` ``'START'`` markers are put in front of the words and
    ``n - 1`` ``'STOP'`` markers behind them before the sequence is cut
    into n-grams by ``parse_ngrams``.  For ``n == 1`` (no padding) each
    n-gram is scored with ``rel_prob``; otherwise it is scored with
    ``cond_prob`` given its history (all of its tokens except the last).
    The individual scores are combined with ``product``.

    Args:
        w_all: Words of the sequence.  The argument is NOT modified; the
            padding is applied to a copy so that scoring the same list
            twice does not pad it twice.
        n: Order of the n-gram model.
        n_grams: n-gram table forwarded to ``rel_prob`` / ``cond_prob``.
        n_min_1_grams: (n-1)-gram table forwarded to ``cond_prob``.

    Returns:
        Whatever ``product`` yields for the list of per-n-gram scores.
    """
    m = n - 1

    # Add START and STOP markers on a fresh list.  A non-positive ``m``
    # yields empty padding, matching a loop that never runs.
    padded = ['START'] * m + list(w_all) + ['STOP'] * m

    parsed_n_grams = parse_ngrams(padded, n)

    # Compare by value: ``n is 1`` tested object identity and only worked
    # because CPython happens to cache small integers.
    if n == 1:
        return product(
            [rel_prob(ng.split(), n_grams) for ng in parsed_n_grams])

    return product([
        cond_prob(ng.split(), ng.split()[0:-1], n_grams, n_min_1_grams)
        for ng in parsed_n_grams
    ])
def seq_prob_gt(w_all, n, n_grams, n_min_1_grams, N, unigrams):
    """Compute the sequence probability after Good-Turing has been performed.

    The sequence is cut into n-grams by ``parse_ngrams``.  N-grams that are
    not keys of ``n_grams`` each contribute the same factor
    ``N[1] / (N[0] * len(n_grams))``; n-grams that are present are scored
    with ``cond_prob_gt`` given their history (all tokens except the last)
    and combined with ``product``.

    Args:
        w_all: The sequence to score, handed unchanged to ``parse_ngrams``.
        n: Order of the n-gram model.
        n_grams: Table of known n-grams; membership decides seen vs. unseen.
        n_min_1_grams: (n-1)-gram table forwarded to ``cond_prob_gt``.
        N: Indexable with ``0`` and ``1`` (presumably the Good-Turing
            frequency-of-frequencies table -- confirm against the caller).
        unigrams: Not used by this function.

    Returns:
        The unseen-mass factor alone when every n-gram is unseen, otherwise
        that factor (or ``1``) multiplied by the ``product`` of the scores
        of the seen n-grams.
    """
    grams = parse_ngrams(w_all, n)
    unseen_total = sum(1 for gram in grams if gram not in n_grams)

    scale = 1
    if unseen_total:
        per_unseen = float(N[1]) / (N[0] * len(n_grams))
        scale = per_unseen**unseen_total
        # Nothing left to score when no n-gram was seen in training.
        if unseen_total == len(grams):
            return scale

    seen_scores = []
    for gram in grams:
        if gram in n_grams:
            tokens = gram.split()
            seen_scores.append(
                cond_prob_gt(tokens, tokens[0:-1], n_grams, n_min_1_grams))

    return scale * product(seen_scores)
# INPUT CHECKS
# All three arguments are required and only bigrams are supported.  The
# order is compared by value (`!=`); `is not 2` tested object identity.
if not args.train_file or not args.n or not args.test_file or args.n != 2:
    parser.print_help()
    exit(
        'Missing required arguments or n is not 2 (assignment is for bigrams)'
    )

# split and flatten array
# sentences is list of sentences that start with START and end with STOP
sentences = get_sentences(
    add_start_stop(args.train_file, args.n if not args.m else 1))
test_sentences = get_sentences(
    add_start_stop(args.test_file, args.n if not args.m else 1))

# Count every n-gram and (n-1)-gram over all training sentences.
n_grams = Counter(
    chain.from_iterable(parse_ngrams(sen, args.n) for sen in sentences))
n_min_1_grams = Counter(
    chain.from_iterable(parse_ngrams(sen, args.n - 1) for sen in sentences))

# Number of distinct (n-1)-grams; handed on under the name `unigrams`.
unigrams = len(n_min_1_grams)

probs = calc_probabilities_seq_file(test_sentences, args.n, n_grams,
                                    n_min_1_grams, unigrams, args.smoothing)

# Share of scored entries with a non-zero probability.  An empty result
# reports 0.0 instead of dying with a ZeroDivisionError.
nonzero = sum(1 for key in probs if probs[key] != 0)
percentagenonzero = 100 * float(nonzero) / len(probs) if probs else 0.0

print('{} % of {} have a nonzero probability'.format(
    percentagenonzero, len(probs)))
print('{} most likely sentences:'.format(args.m))
print_ngrams(sort_ngrams_bidirectional(probs, True), args.m)
# NOTE(review): everything below sits on ONE physical line (whitespace was
# collapsed), so it is left untouched.  In order it contains:
#   1. the tail of a printing loop whose header is not visible here;
#   2. the `__main__` entry point of "Assignment A, Step 1": parses
#      -corpus / -n / -m, counts n-grams with Counter, prints the sum of
#      the counts, sorts them and hands them to print_ngrams with args.m;
#   3. what looks like archive (tar) header residue for `a1_step2.py`
#      followed by that file's coding line and module docstring -- TODO
#      confirm and strip the residue when the file is restored.
# NOTE(review): `idx is m` compares identity, not value; `idx == m` is
# presumably what is meant -- it can silently fail for larger values of m.
# NOTE(review): the `print '...'` statements are Python 2 syntax.
if idx is m: break idx += 1 print '{} {}'.format(word, freq) if __name__ == "__main__": # here code for program parser = ArgumentParser(description='Assignment A, Step 1') parser.add_argument('-corpus', dest ='input_file', type=str, help='Path to corpus file') parser.add_argument('-n', dest='n', type=int, help='Length of word-sequences to process (n-grams)') parser.add_argument('-m', dest='m', type=int, default=None, help='Number of n-grams to show in output') args = parser.parse_args() lines = read_words(args.input_file) n_grams_frequency = Counter(parse_ngrams(lines, args.n)) freq_sum = sum(n_grams_frequency.values()) print 'sum: {}'.format(freq_sum) # sort n_grams by value in descending order n_grams_frequency = sort_ngrams(n_grams_frequency) print_ngrams(n_grams_frequency, args.m) a1_step2.py000644 000765 000024 00000014531 12470142143 013515 0ustar00markusstaff000000 000000 # -*- coding: utf-8 -*- """ Created on Tue Feb 10 11:36:38 2015 @author: markus """
# Scoring permutations is only supported for bigrams.  The order is
# compared by value (`!=`); `is not 2` tested object identity.
if args.scored_perms and args.n != 2:
    exit('n must be 2 when using permutations')

# Refuse to run when no action was requested at all.
if not args.cond_file and not args.m and not args.seq_file and not args.scored_perms:
    parser.print_help()
    exit('What shall I do?')

# split and flatten array
# sentences is list of sentences that start with START and end with STOP
sentences = get_sentences(
    add_start_stop(args.input_file, args.n if not args.m else 1))

# Count every n-gram and (n-1)-gram over all sentences.
n_grams = Counter(
    itertools.chain.from_iterable(
        parse_ngrams(sen, args.n) for sen in sentences))
n_min_1_grams = Counter(
    itertools.chain.from_iterable(
        parse_ngrams(sen, args.n - 1) for sen in sentences))

# when n=1 n_min_1_grams would become a dict instead of a Counter. To keep
# stuff consistent...
if not n_min_1_grams:
    n_min_1_grams = Counter()

# if wished, print the m most frequent n-grams
if args.m:
    print('n-grams:')
    print_ngrams(sort_ngrams(n_grams), args.m)