Example #1
File: loader.py  Project: mike-n-7/CharPOS
def get_batches(scts):
	batch_xs, batch_ys = empty_batch()
	ex_cnt = 0
	
	if OOV_ONLY:
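		# Build the set of in-vocabulary tokens seen in the training
		# sections, so tokens already known from training can be skipped below.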
		iv_set = set()
		def add_iv(f):
			for sent in ptb.sents(f):
				for tok in sent:
					iv_set.add(tok)
		common.for_all_in_ptb_scts(TRAIN_SCTS, add_iv)
	
	for sct in scts:
		print "Section " + sct 
		fs = [f for f in ptb.fileids() if f.startswith("WSJ/" + sct)]
		for f in fs:
			print "  File " + f + "...",
			# For each word in the sentences of the file,
			# create an example and add it to the batch.
			for sent in ptb.tagged_sents(f):
				for i in range(len(sent)):
					# Ignore "None" tags (not overt lingustic elements)
					if sent[i][1] == "-NONE-":
						continue
					
					# In OOV-only mode, skip tokens already seen in the training sections
					if OOV_ONLY and sent[i][0] in iv_set:
						continue
						
					x, y = get_example(sent, i)
					batch_xs[ex_cnt] = x
					batch_ys[ex_cnt] = y
					
					# If we reach enough examples to form a batch, yield it now,
					# then start a new batch.
					ex_cnt += 1
					if ex_cnt == BATCH_SIZE:
						yield (batch_xs, batch_ys)
						batch_xs, batch_ys = empty_batch()
						ex_cnt = 0
	
	# If an incomplete batch remains at the end, pad it with empty
	# examples and yield it.
	if ex_cnt != 0:
		while ex_cnt < BATCH_SIZE:
			x, y = empty_example()
			batch_xs[ex_cnt] = x
			batch_ys[ex_cnt] = y
			ex_cnt += 1
		yield (batch_xs, batch_ys)
	
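A minimal usage sketch, not part of the project: it assumes TRAIN_SCTS is the list of WSJ training section ids used above and train_step is a hypothetical consumer of each batch.

for batch_xs, batch_ys in get_batches(TRAIN_SCTS):
	# every yielded batch holds exactly BATCH_SIZE examples; the final batch
	# is padded with empty examples when the data runs out mid-batch
	train_step(batch_xs, batch_ys)  # hypothetical training/evaluation step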
Example #2
def make_word_model(scts):
	super_model = {tag: {} for tag in common.OPEN_CLASSES | common.CLOSED_CLASSES}
	
	def parse_file(f):
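		# Tally each word under its POS tag: open classes accumulate
		# per-word counts, closed classes record the observed members.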
		for word, tag in ptb.tagged_words(f):
			if tag in common.OPEN_CLASSES:
				add_counts(word, super_model[tag])
			elif tag in common.CLOSED_CLASSES:
				observe_closed(word, super_model[tag])
	
	common.for_all_in_ptb_scts(scts, parse_file)
	
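	# Convert the raw per-tag counts into probability distributions.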
	for tag, model in super_model.iteritems():
		if tag in common.OPEN_CLASSES:
			# smooth(model, 1)
			open_as_probs(model)
		else:
			counts_to_probs(tag, super_model)
		
	return super_model
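A hypothetical usage sketch, not part of the project: it assumes TRAIN_SCTS names the training sections (as in Example #1) and that the tag "NN" is in common.OPEN_CLASSES.

word_model = make_word_model(TRAIN_SCTS)
# each tag maps to its own word model: open-class counts are converted by
# open_as_probs, closed-class observations by counts_to_probs
noun_probs = word_model["NN"]  # hypothetical lookup of the common-noun model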