#Imports used below; NERTest is imported locally where needed and VectorSum is
#assumed to be defined elsewhere in this module
import io
import os
import sys
import datetime

import psycopg2

def writeplaintext(wordref, stan_path, outxml, xml):
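	"""Rebuild a document's plain text from its token dictionary, write it to a
	temp file, and re-run the Stanford NER tagger on it; returns refreshed word
	and toponym dictionaries keyed by token index."""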
	import NERTest as NER
	#Join tokens in index order (dict iteration order is arbitrary in Python 2)
	plaintext = ' '.join(wordref[x] for x in sorted(wordref))
	filename = "/home/grant/devel/TopoCluster/data/tempplain_" + xml
	op = io.open(filename, 'w', encoding='utf-8')
	op.write(unicode(plaintext))
	op.close()
	NER.calc3(stan_path, filename, outxml)
	toporef2, wordref2 = NER.readnerxml2(outxml)
	return wordref2, toporef2
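
#Example call (hypothetical paths):
#  wordref2, toporef2 = writeplaintext(wordref, '/path/to/stanford-ner', 'ner_doc1.xml', 'doc1.xml')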
def calc(in_domain_stat_tbl, out_domain_stat_tbl, test_xml, conn_info, gtbl, window, percentile,
		 main_topo_weight, other_topo_weight, other_word_weight, country_tbl, region_tbl,
		 state_tbl, geonames_tbl, in_corp_lamb, out_corp_lamb, results_file, stan_path):
	print "In Domain Local Statistics Table Name: ", in_domain_stat_tbl
	print "Out of domain Local Statistics Table Name: ", out_domain_stat_tbl
	print "Directory Path containing plain text files to be parsed: ", test_xml
	print "DB connection info: ", conn_info
	print "Grid table used: ", gtbl
	print "Window size", window
	print "Percentile: ", percentile
	print "Main Toponym weight: ", main_topo_weight
	print "Other Toponym weight: ", other_topo_weight
	print "Other Word weight: ", other_word_weight

	print "Country table name: ", country_tbl
	print "Region table name: ", region_tbl
	print "State table name: ", state_tbl
	print "Out of Domain Lambda", out_corp_lamb
	print "In Domain Lambda", in_corp_lamb

	#Open the database connection; psycopg2.connect raises an exception on failure,
	#so reaching the print below means the connection succeeded
	conn = psycopg2.connect(conn_info)
	print "Connection Success"

	#These words and characters are skipped during evaluation and Gi* vector summing
	stopwords = set(['.', ',', '(', ')', '-', '--', u'\u2010', u'\u2011', u'\u2012', u'\u2013', '=', ';', ':', "'", '"', '$',
					'the', 'a', 'an', 'that', 'this',
					'to', 'be', 'have', 'has', 'is', 'are', 'was', 'am', "'s",
					'and', 'or', 'but',
					'by', 'of', 'from', 'in', 'after', 'on', 'for', 'TO',
					'I', 'me', 'he', 'him', 'she', 'her', 'we', 'us', 'you', 'your', 'yours', 'they', 'them', 'their', 'it', 'its'])
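	#Membership tests against this set are case-sensitive, which is why 'TO'
	#appears alongside 'to'; add tokens here to exclude them from the Gi* sums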

	#Initialize a cursor for DB connection
	cur = conn.cursor()

	#Check that the out-of-domain statistics tables were set up properly
	try:
		ood_table_list = ['enwiki20130102_ner_final_atoi', 'enwiki20130102_ner_final_jtos', 'enwiki20130102_ner_final_ttoz', 'enwiki20130102_ner_final_other']
		for tb in ood_table_list:
			SQL_check = "select * FROM %s LIMIT 1;" % (tb)
			cur.execute(SQL_check)
			result = cur.fetchall()
			if len(result) > 0:
				print tb, "table loaded correctly"
	except psycopg2.Error:
		print "Out of domain tables were not loaded correctly"
		print "Exiting..."
		sys.exit()

	#If an in-domain statistics table was provided, check that it loaded as well
	try:
		if in_domain_stat_tbl != "None":
			SQL_check = "select * FROM %s LIMIT 1;" % (in_domain_stat_tbl)
			cur.execute(SQL_check)
			result = cur.fetchall()
			if len(result) > 0:
				print in_domain_stat_tbl, "table loaded correctly"
	except psycopg2.Error:
		print "In domain table provided was not loaded correctly:", in_domain_stat_tbl
		print "Exiting..."
		sys.exit()



	#Initialize a dictionary that links grid-table gid values to latitude/longitude pairs
	SQL2 = "SELECT gid, ST_Y(geog::geometry), ST_X(geog::geometry) from %s ;" % gtbl
	cur.execute(SQL2)
	lat_long_lookup = dict([(int(g[0]), [g[1],g[2]]) for g in cur.fetchall()])
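	#e.g. lat_long_lookup[gid] -> [lat, lon]; for point geometries ST_Y is the
	#latitude and ST_X the longitude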
	#print len(lat_long_lookup)

	point_total_correct = 0
	poly_total_correct = 0

	start_time = datetime.datetime.now()

	total_topo = 0

	#test_xml should always be a directory name; the system currently only supports
	#a directory argument, though that may change in the future
	if os.path.isdir(test_xml):
		print "Reading as directory"
		files = os.listdir(test_xml)
		point_bigerror = []
		poly_bigerror = []
		point_dist_list = []
		poly_dist_list = []
		point_error_sum = 0.0
		poly_error_sum = 0.0
		error_sum2 = 0.0
		poly_dist = 0.0
		m = 0

		#These queries pull all the alternate names from the geonames, country, state, and
		#region tables; alternate names are used in later steps to improve gazetteer matching
		SQL1 = "SELECT p1.gid, p1.name, p1.name_long, p1.geonames_gid, p1.altnames FROM %s as p1 ;" % country_tbl
		SQL2 = "SELECT p1.gid, p1.name, p1.name_long, p1.geonames_gid, p1.altnames FROM %s as p1 ;" % region_tbl
		SQL3 = "SELECT p1.gid, p1.name, p1.geonames_gid, p1.altnames FROM %s as p1 ;" % state_tbl
		SQL4 = "SELECT p1.gid, p1.name, p1.asciiname, p1.alternames FROM %s as p1 where p1.featurecode = 'PPLC' or p1.featurecode = 'PPLA' or p1.featurecode = 'PPLA2' or p1.featurecode = 'PPL';" % geonames_tbl
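		#The featurecode filter keeps GeoNames populated places: PPLC (capital),
		#PPLA/PPLA2 (seats of first/second-order admin divisions), and PPL (populated place)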

		cur.execute(SQL1)

		cntry_alt = {}
		for row in cur.fetchall():
			alist = [row[1], row[2]]
			if row[4] is not None:
				alist.extend(row[4].split(','))
			#print alist
			for w in alist:
				cntry_alt.setdefault(w, set()).add(row[0])
			#cntry_alt.setdefault(row[0], list()).append(alist)

		cur.execute(SQL3)

		state_alt = {}
		for row in cur.fetchall():
			alist = [row[1], row[2]]
			if row[3] is not None:
				alist.extend(row[3].split(','))
			#print alist
			for w in alist:
				state_alt.setdefault(w, set()).add(row[0])
			#state_alt.setdefault(row[0], list()).append(alist)

		cur.execute(SQL2)

		region_alt = {}
		for row in cur.fetchall():
			alist = [row[1], row[2]]
			#print row
			if len(row) > 3 and row[4] is not None:
				alist.extend(row[4].split(','))
			#print alist
			for w in alist:
				region_alt.setdefault(w, set()).add(row[0])
			#region_alt.setdefault(row[0], list()).append(alist)

		cur.execute(SQL4)
		pplc_alt = {}
		for row in cur.fetchall():
			alist = [row[1], row[2]]
			#print row
			if len(row) > 3 and row[3] is not None:
				alist.extend(row[3].split(','))
			for w in alist:
				pplc_alt.setdefault(w, set()).add(row[0])
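		#Each *_alt dictionary maps one name variant to the set of gids that can
		#carry it, e.g. pplc_alt.get(u'NYC') would hold the gid(s) of any populated
		#place listing NYC among its alternate names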


		print "Done Creating Alt Names"
		print "Length of PPL AltNames: ", len(pplc_alt)

		#Import script that issues commands to Stanford NER
		import NERTest as NER
		predictions = []
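		#Each entry VectorSum appends to predictions is a 9-tuple matching the CSV
		#header below: (toponym, source file, token index, georef source, table,
		#gid, table toponym, centroid lat, centroid lon)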

		#Loop through every plaintext file in directory
		for plaintext in files:
			m += 1
			print plaintext
			filename = os.path.join(test_xml, plaintext)
			outxml = "ner_" + plaintext
			#Catch errors from the Stanford NER, which doesn't always succeed in parsing a file
			try:
				#NER.calc(stan_path, filename, outxml)
				NER.calc2(stan_path, filename, outxml)
				toporef, wordref = NER.readnerxml(outxml)
				os.remove(outxml)
			except Exception:
				print "Problem using the Stanford Parser for this file, skipping"
				print plaintext
				toporef = {}
				wordref = {}


			print "Files left to go: ", len(files) - m
			print "Total Toponyms ", total_topo
			#Vector Sum Function (Performs actual Disambiguation)
			#wordref = other words dictionary with key = token index : value = word (at this point)
			#toporef = toponym dictionary with key = token index : value = word (at this point)
			#total_topo = total number of toponyms currently georeferenced
			predictions, total_topo = VectorSum(wordref, toporef, total_topo, cur, lat_long_lookup,  
				percentile, window, stopwords, main_topo_weight, other_topo_weight, other_word_weight, plaintext, predictions, country_tbl, region_tbl, state_tbl,
				 geonames_tbl, cntry_alt, region_alt, state_alt, pplc_alt, in_domain_stat_tbl, in_corp_lamb, out_corp_lamb)
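			#predictions accumulates across files, so each per-file results file below
			#also repeats every prediction from earlier files in this run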
			with io.open(results_file+plaintext, 'w', encoding='utf-8') as w:
				w.write(u"NER_Toponym,Source_File,Token_index,GeoRefSource,Table,gid,Table_Toponym,Centroid_Lat,Centroid_Long\r\n")
				for p in predictions:
					#Normalize the toponym to unicode; the source encoding varies by document
					toponym = p[0].decode('utf-8') if isinstance(p[0], str) else p[0]
					#Likewise for the toponym name pulled from the matched table
					table_toponym = p[6].decode('utf-8') if isinstance(p[6], str) else p[6]
					w.write(toponym+u','+p[1]+u','+unicode(p[2])+u','+p[3]+u','+p[4]+u','+unicode(p[5])+u','+table_toponym+u','+unicode(p[7])+u','+unicode(p[8])+'\r\n')

		'''print "=============Vector Sum================"
		print "Total Toponyms: ", total_topo
		print "Window: ", window
		print "Percentile: ", percentile
		print "Main Topo weight:", main_topo_weight
		print "Other Topo weight:", other_topo_weight
		print "Other word weight:", other_word_weight
		#Write all toponym resolution results to results file
		with io.open(results_file, 'w', encoding='utf-8') as w:
			w.write(u"=============TopoCluster Run Settings================" + '\r\n')
			w.write(u"In Domain Local Statistics Table Name: " + unicode(in_domain_stat_tbl) + '\r\n')
			w.write(u"Out of domain Local Statistics Table Name: " + unicode(out_domain_stat_tbl) + '\r\n')
			w.write(u"Test XML directory/file path: " + test_xml + '\r\n')
			w.write(u"In Domain Corp Lambda: " + unicode(in_corp_lamb) + '\r\n')
			w.write(u"Out Domain Corp Lambda: " + unicode(out_corp_lamb) + '\r\n')
			w.write(u"Window: " + unicode(window) + '\r\n')
			w.write(u"Total Toponyms: " + str(total_topo) + '\r\n')
			w.write(u"Main Topo Weight:"+ unicode(main_topo_weight) + '\r\n')
			w.write(u"Other Topo Weight:"+ unicode(other_topo_weight) + '\r\n')
			w.write(u"Other Word Weight:"+ unicode(other_word_weight) + '\r\n')
			w.write(u"=====================================================" + '\r\n')
			w.write(u"NER_Toponym,Source_File,Token_index,GeoRefSource,Table,gid,Table_Toponym,Centroid_Lat,Centroid_Long\r\n")
			for p in predictions:
				#The encoding of the toponym can change based on the document being read
				if isinstance(p[0], str):
					toponym = p[0].decode('utf-8')
				if isinstance(p[0], unicode):
					toponym = p[0].encode('utf-8').decode('utf-8')
				#The encoding of the toponym name from the table can change based on the table results were pulled from
				if isinstance(p[6], str):
					table_toponym = p[6].decode('utf-8')
				if isinstance(p[6], unicode):
					table_toponym = p[6].encode('utf-8').decode('utf-8')
				w.write(toponym+u','+p[1]+u','+unicode(p[2])+u','+p[3]+u','+p[4]+u','+unicode(p[5])+u','+table_toponym+u','+unicode(p[7])+u','+unicode(p[8])+'\r\n')'''
	else:
		print test_xml, "is not a directory; a directory of plain text files is required"
		sys.exit()

	conn.close()

	end_time = datetime.datetime.now()

	print "Total Toponyms: ", total_topo
	print "Check File @ ", results_file + plaintext
	print "Total Time: ", end_time - start_time