def load_sequences(filename, do_stem=False):
    """Read whitespace-tokenised sequences from a file and integer-encode them.

    Each input line is one sequence; tokens are separated by whitespace.

    Args:
        filename: File name (or names) handed to ``fileinput.input``.
        do_stem: When true, tokens for which ``isStopWord`` is true are
            dropped and the remaining tokens are replaced by ``stem(token)``
            before encoding.

    Returns:
        A ``(words, word_codes, coded_seqs)`` tuple:

        * ``words`` -- sorted list of the distinct raw tokens; never stemmed.
        * ``word_codes`` -- dict mapping each observable (the stemmed token
          when ``do_stem`` is true, the raw token otherwise) to its integer
          code.
        * ``coded_seqs`` -- list of numpy arrays, one per non-empty sequence,
          holding the codes of that sequence's observables in order.
    """
    lines = fileinput.input(filename)
    try:
        split_lines = [line.split() for line in lines]
    finally:
        # Always release the file handle and fileinput's module-level state,
        # even when reading fails part-way through.
        fileinput.close()

    # Sort so that word order and the integer codes are reproducible across
    # runs; plain set iteration order varies with string hash randomisation.
    words = sorted({w for seq in split_lines for w in seq})

    if do_stem:
        # Stem each distinct non-stop word once instead of once per token
        # (assumes stem/isStopWord are deterministic).
        stem_of = {w: stem(w) for w in words if not isStopWord(w)}
        seqs = [[stem_of[w] for w in seq if w in stem_of]
                for seq in split_lines]
        observables = sorted(set(stem_of.values()))
    else:
        seqs = split_lines
        observables = words

    word_codes = {w: code for code, w in enumerate(observables)}
    # Sequences that are empty (blank lines, or only stop words) are dropped.
    coded_seqs = [np.array([word_codes[w] for w in seq])
                  for seq in seqs if seq]
    return words, word_codes, coded_seqs
def output_results(prefix, states, stemmed, iterations, seed, words,
                   word_codes, hmm, i):
    """Write per-word emission probabilities of ``hmm`` to a text file.

    The file name is built from ``prefix`` and the run parameters
    (``states``, ``stemmed``, ``iterations``, ``seed``) plus ``i``.  Each
    output line holds a word followed by the entries of
    ``hmm.emit_probs[:, code]`` for that word's code, space separated.

    When ``stemmed`` is true, words for which ``isStopWord`` is true are
    skipped and the code is looked up under ``stem(word)``; otherwise the
    word itself is the lookup key.

    Returns:
        The name of the file that was written.
    """
    name_template = '{}_states-{}_stemmed-{}_iters-{}_seed-{}_{}.txt'
    filename = name_template.format(prefix, states, stemmed, iterations,
                                    seed, i)

    with open(filename, 'w') as out:
        for word in words:
            # Stop words have no code when the model was trained on stems.
            if stemmed and isStopWord(word):
                continue
            key = stem(word) if stemmed else word
            column = word_codes[key]
            probs = ' '.join(str(p) for p in hmm.emit_probs[:, column])
            out.write(word + ' ' + probs)
            out.write('\n')

    return filename
help='Run HMM on stemmed words') parser.add_argument('-i', default=10000, type=int, help='Maximum number of iterations of EM to do') parser.add_argument('--seed', default=False, action='store_true', help='Emission probabilities are seeded with a modded co-occurrence matrix') parser.add_argument('--out', default=None, type=str, metavar='file prefix', help='Output intermittent data in files prefixed with this argument. By default, final results are printed to stdout.') args = parser.parse_args() log('Reading sequences') words, word_codes, coded_seqs = load_sequences(args.f, args.s) log('{} words, {} sequences, {} observables'.format(len(words), len(coded_seqs), len(word_codes))) log('Generating initial HMM') if args.seed: emit_probs = make_modded_cooccurrence(args.n, len(word_codes), coded_seqs) init_hmm = random_hmm(args.n, len(words), emit_probs = emit_probs) else: init_hmm = random_hmm(args.n, len(words)) log('Running EM') out_func = lambda hmm, i : output_results(args.out, args.n, args.s, args.i, args.seed, words, word_codes, hmm, i) if args.out else None final_hmm = maximize_expectation(init_hmm, coded_seqs, max_iters = args.i, print_nll = True, out_func = out_func) if not args.out: log('Writing results') for w in words: if not args.s or not isStopWord(w): i = word_codes[stem(w)] if args.s else word_codes[w] print(w + ' ' + ' '.join(str(x) for x in final_hmm.emit_probs[:, i]))