Exemplo n.º 1
0
def extract(rows, target_postags, target_structures, target_word=None, mongodb=True, VERBOSE=True):
	"""Print the dependencies of each anchor word that match the target structures.

	For every row, the raw dependency string and parse tree are read, anchor
	words are collected with `extract_anchors`, and each anchor's dependencies
	are filtered with `_filter_deps_by_rel`. Matching dependencies are printed
	to stdout; nothing is stored or returned.

	Args:
		rows: iterable of parsed sentences. When `mongodb` is true, each entry
			is indexed by the keys 'id', 'sent', 'pos', 'tree' and 'dep';
			otherwise each entry is a 5-item sequence
			(sid, sent, pos, raw_tree, raw_dep).
		target_postags: iterable of POS-tag strings, passed to
			`extract_anchors` as `targets`.
		target_structures: sequence of items whose first two elements are a
			string and a str()-able value; passed to `_filter_deps_by_rel`
			as `targets`.
		target_word: optional word; when truthy, only anchors equal to it
			(case-insensitively) are processed.
		mongodb: selects how each entry of `rows` is unpacked (see `rows`).
		VERBOSE: when true, print a header line for every anchor (skipped or
			matched). Matched dependency lines are printed regardless.

	Returns:
		None.
	"""
	# Each print() call below receives exactly one pre-formatted string, so it
	# emits the same text whether print is a statement (Python 2) or a
	# function (Python 3).
	print('anchor pos tags: %s' % color.render(', '.join(target_postags), 'lc'))
	structures = ', '.join([x[0] + ':' + str(x[1]) for x in target_structures])
	print('structures: %s' % color.render(structures, 'lc'))
	print('=' * 60)

	# Lowercase the filter word once instead of once per anchor.
	wanted = target_word.lower() if target_word else None

	for entry in rows:

		## extract rows
		if mongodb:
			_sid, _sent, _pos, raw_tree, raw_dep = (
				entry['id'], entry['sent'], entry['pos'], entry['tree'], entry['dep'])
		else:
			_sid, _sent, _pos, raw_tree, raw_dep = entry

		# read dependency and tree objs; rows without dependencies are skipped
		deps = dependency.read(raw_dep, return_type=dict)
		if not deps:
			continue
		tree = Tree(raw_tree)

		# collect anchor words according to the pre-specified pos tags
		## anchors: [(u'is', u'VBZ', 8), (u"'ve", u'VBP', 5), (u'do', u'VBP', 7), ...]
		anchors = extract_anchors(deps, tree, targets=target_postags)

		for (word, tag, idx) in anchors:

			## skip anchors other than the target word, if one was specified
			if wanted is not None and word.lower() != wanted:
				if VERBOSE:
					# '[x]' marks an anchor that was skipped
					print(color.render('(ancher[x]) ' + word + '-' + str(idx) + ' #' + tag, 'b'))
				continue

			## extract dependency relations which match the target structures
			rdeps = _filter_deps_by_rel(deps, anchor=(word, idx), targets=target_structures)
			if not rdeps:
				continue

			if VERBOSE:
				# '[v]' marks an anchor with at least one matching dependency
				print(color.render('(anchor[v]) ' + word + '-' + str(idx) + ' #' + tag, 'g'))

			# Convert every dependency before printing any of them, so a
			# failing conversion leaves no partial output for this anchor.
			T = [_transform_to_tuple(dep) for dep in rdeps]
			for (rel, (l, li), (r, ri)) in T:
				pair = '( ' + l + '-' + str(li) + ', ' + r + '-' + str(ri) + ' )'
				print('   %s %s' % (color.render(rel, 'r'), color.render(pair, 'y')))

	print('=' * 60)
Exemplo n.º 2
0
def extract_and_save(rows, target_postags, target_structures, det_db_cfg, target_word=None, mongodb=True):
	"""Extract anchor words with matching dependencies and store them in MongoDB.

	For every row, the raw dependency string and parse tree are read, anchor
	words are collected with `extract_anchors`, and each anchor's dependencies
	are filtered with `_filter_deps_by_rel`. Every anchor with at least one
	matching dependency is printed and inserted into the configured
	collection as a document with the keys 'sid', 'word', 'pos', 'idx',
	'deps' and 'lemma'. After all rows are processed, tab-separated counters
	are written to 'stat.log' in the current working directory.

	Args:
		rows: iterable of parsed sentences. When `mongodb` is true, each entry
			is indexed by the keys 'id', 'sent', 'pos', 'tree' and 'dep';
			otherwise each entry is a 5-item sequence
			(sid, sent, pos, raw_tree, raw_dep).
		target_postags: iterable of POS-tag strings, passed to
			`extract_anchors` as `targets`.
		target_structures: sequence of items whose first two elements are a
			string and a str()-able value; passed to `_filter_deps_by_rel`
			as `targets`.
		det_db_cfg: mapping with the keys 'server_addr', 'db' and
			'collection' describing where the documents are inserted.
		target_word: optional word; when truthy, only anchors equal to it
			(case-insensitively) are processed.
		mongodb: selects how each entry of `rows` is unpacked (see `rows`).

	Returns:
		None.
	"""
	lmtzr = WordNetLemmatizer()

	# Each print() call below receives exactly one pre-formatted string, so it
	# emits the same text whether print is a statement (Python 2) or a
	# function (Python 3).
	print('anchor pos tags: %s' % color.render(', '.join(target_postags), 'lc'))
	structures = ', '.join([x[0] + ':' + str(x[1]) for x in target_structures])
	print('structures: %s' % color.render(structures, 'lc'))
	print('=' * 60)

	# NOTE(review): pymongo.Connection (removed in PyMongo 3.0) and
	# Collection.insert (removed in PyMongo 4.0) are legacy APIs; switch to
	# MongoClient / insert_one when the driver is upgraded.
	mc = pymongo.Connection(det_db_cfg['server_addr'])

	sent_cnt, total_word_cnt, anchor_word_cnt, anchor_word_structure_cnt = 0, 0, 0, 0

	# Lowercase the filter word once instead of once per anchor.
	wanted = target_word.lower() if target_word else None

	# try/finally guarantees the connection is closed even when a row fails.
	try:
		db = mc[det_db_cfg['db']]
		co = db[det_db_cfg['collection']]

		for entry in rows:

			## extract rows
			if mongodb:
				sid, _sent, _pos, raw_tree, raw_dep = (
					entry['id'], entry['sent'], entry['pos'], entry['tree'], entry['dep'])
			else:
				sid, _sent, _pos, raw_tree, raw_dep = entry

			# read dependency and tree objs; rows without dependencies are
			# skipped and do not count towards the statistics
			deps = dependency.read(raw_dep, return_type=dict)
			if not deps:
				continue
			tree = Tree(raw_tree)

			# collect anchor words according to the pre-specified pos tags
			## anchors: [(u'is', u'VBZ', 8), (u"'ve", u'VBP', 5), (u'do', u'VBP', 7), ...]
			anchors = extract_anchors(deps, tree, targets=target_postags)

			## for stat
			sent_cnt += 1
			total_word_cnt += len(tree.pos())
			anchor_word_cnt += len(anchors)

			for (word, tag, idx) in anchors:

				## skip anchors other than the target word, if one was specified
				if wanted is not None and word.lower() != wanted:
					continue

				## extract dependency relations which match the target structures
				rdeps = _filter_deps_by_rel(deps, anchor=(word, idx), targets=target_structures)
				if not rdeps:
					continue

				print(color.render('(anchor[v]) ' + word + '-' + str(idx) + ' #' + tag, 'g'))

				# Convert every dependency before printing any of them, so a
				# failing conversion leaves no partial output for this anchor.
				T = [_transform_to_tuple(dep) for dep in rdeps]
				for (rel, (l, li), (r, ri)) in T:
					pair = '( ' + l + '-' + str(li) + ', ' + r + '-' + str(ri) + ' )'
					print('   %s %s' % (color.render(rel, 'r'), color.render(pair, 'y')))

				lemma = lmtzr.lemmatize(word, _getWordNetPOS(tag))

				# generate mongo obj
				mongo_obj = {
					'sid': sid,			# sentence id
					'word': word,		# anchor word
					'pos': tag,			# pos tag of word
					'idx': idx,			# word index
					'deps': rdeps,		# related deps
					'lemma': lemma,		# word lemma
				}
				co.insert(mongo_obj)

				anchor_word_structure_cnt += 1
	finally:
		mc.close()

	print('=' * 60)
	print('write statistic log')
	with open('stat.log', 'w') as fw:
		fw.write('total sent' + '\t' + str(sent_cnt) + '\n')
		fw.write('total word' + '\t' + str(total_word_cnt) + '\n')
		fw.write('anchor word' + '\t' + str(anchor_word_cnt) + '\n')
		fw.write('anchor word with structures' + '\t' + str(anchor_word_structure_cnt) + '\n')