Example #1
#Module-level imports used by this excerpt; NERTest is a project-local
#module and VectorSum is assumed to be defined elsewhere in this file.
import datetime
import io
import os
import sys

import psycopg2


def writeplaintext(wordref, stan_path, outxml, xml):
	import NERTest as NER
	#Join tokens in token-index order; iterating the dict directly gives an
	#arbitrary order in Python 2
	plaintext = u' '.join(wordref[x] for x in sorted(wordref))
	filename = "/home/grant/devel/TopoCluster/data/tempplain_" + xml
	with io.open(filename, 'w', encoding='utf-8') as op:
		op.write(plaintext)
	NER.calc(stan_path, filename, outxml)
	toporef2, wordref2 = NER.readnerxml(outxml)
	return wordref2, toporef2
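
#Hypothetical usage sketch for writeplaintext (file names assumed, not from
#this module): rebuild plain text from a token dict, re-run Stanford NER over
#it, and read back the re-parsed word and toponym dictionaries.
#	wordref2, toporef2 = writeplaintext(wordref, '/opt/stanford-ner',
#		'ner_doc1.xml', 'doc1.xml')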
def calc(in_domain_stat_tbl, out_domain_stat_tbl, test_xml, conn_info, gtbl, window, percentile,
		 main_topo_weight, other_topo_weight, other_word_weight, country_tbl, region_tbl,
		 state_tbl, geonames_tbl, in_corp_lamb, out_corp_lamb, results_file, stan_path):
	print "In Domain Local Statistics Table Name: ", in_domain_stat_tbl
	print "Out of domain Local Statistics Table Name: ", out_domain_stat_tbl
	print "Directory Path containing plain text files to be parsed: ", test_xml
	print "DB connection info: ", conn_info
	print "Grid table used: ", gtbl
	print "Window size", window
	print "Percentile: ", percentile
	print "Main Toponym weight: ", main_topo_weight
	print "Other Toponym weight: ", other_topo_weight
	print "Other Word weight: ", other_word_weight

	print "Country table name: ", country_tbl
	print "Region table name: ", region_tbl
	print "State table name: ", state_tbl
	print "Out of Domain Lambda", out_corp_lamb
	print "In Domain Lambda", in_corp_lamb

	#Test the connection to the database
	conn = psycopg2.connect(conn_info)
	print "Connection Success"

	#These words and characters will not be evaluated or be used for Gi* vector summing
	stopwords = set(['.',',','(',')','-', '--', u'\u2010', u'\u2011', u'\u2012', u'\u2013','=',";",':',"'",'"','$','the','a','an','that','this',
					'to', 'be', 'have', 'has', 'is', 'are', 'was', 'am', "'s",
					'and', 'or','but',
					'by', 'of', 'from','in','after','on','for', 'TO',
					'I', 'me', 'he', 'him', 'she', 'her', 'we', 'us', 'you', 'your', 'yours', 'they', 'them', 'their', 'it', 'its'])
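	#Sketch of how the screen is applied downstream (an assumption; the actual
	#filtering happens inside VectorSum), e.g. keeping only tokens that can
	#contribute to the Gi* vector sum:
	#	content_words = dict((i, tok) for i, tok in wordref.items()
	#						 if tok not in stopwords)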

	#Initialize a cursor for DB connection
	cur = conn.cursor()

	#Check to see if database tables were set up properly
	try:
		ood_table_list = ['enwiki20130102_ner_final_atoi', 'enwiki20130102_ner_final_jtos', 'enwiki20130102_ner_final_ttoz', 'enwiki20130102_ner_final_other']
		for tb in ood_table_list:
			SQL_check = "select * FROM %s LIMIT 1;" % (tb)
			cur.execute(SQL_check)
			result = cur.fetchall()
			if len(result) > 0:	
				print tb, "table loaded correctly"
	except:
		print "Out of domain tables were not loaded correctly"
		print "Exiting..."
		sys.exit()

	try:
		if in_domain_stat_tbl != "None":
			SQL_check = "select * FROM %s LIMIT 1;" % (in_domain_stat_tbl)
			cur.execute(SQL_check)
			result = cur.fetchall()
			if len(result) > 0:
				print in_domain_stat_tbl, "table loaded correctly"
	except:
		print "In domain table provided was not loaded correctly:", in_domain_stat_tbl
		print "Exiting..."
		sys.exit()



	#Initialize a dictionary that links GlobalGrid gid values to latitude/longitude pairs
	SQL2 = "SELECT gid, ST_Y(geog::geometry), ST_X(geog::geometry) from %s ;" % gtbl
	cur.execute(SQL2)
	lat_long_lookup = dict([(int(g[0]), [g[1], g[2]]) for g in cur.fetchall()])
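	#Mapping a winning grid cell back to coordinates is then a plain dict
	#probe, e.g. (sketch; gid is any key present in the grid table):
	#	lat, lon = lat_long_lookup[gid]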
	#print len(lat_long_lookup)

	point_total_correct = 0
	poly_total_correct = 0

	start_time = datetime.datetime.now()

	total_topo = 0

	#test_xml should always be a directory name; the system currently only supports a directory argument, though that may change in the future.
	if os.path.isdir(test_xml):
		print "Reading as directory"
		files = os.listdir(test_xml)
		point_bigerror = []
		poly_bigerror = []
		point_dist_list = []
		poly_dist_list = []
		point_error_sum = 0.0
		poly_error_sum = 0.0
		error_sum2 = 0.0
		poly_dist = 0.0
		m = 0

		#These queries pull all the alternate names from the geonames, country, state, and region tables. Alternate names are used in later steps to enhance gazetteer matching.
		SQL1 = "SELECT p1.gid, p1.name, p1.name_long, p1.geonames_gid, p1.altnames FROM %s as p1 ;" % country_tbl
		SQL2 = "SELECT p1.gid, p1.name, p1.name_long, p1.geonames_gid, p1.altnames FROM %s as p1 ;" % region_tbl
		SQL3 = "SELECT p1.gid, p1.name, p1.geonames_gid, p1.altnames FROM %s as p1 ;" % state_tbl
		SQL4 = "SELECT p1.gid, p1.name, p1.asciiname, p1.alternames FROM %s as p1 where p1.featurecode = 'PPLC' or p1.featurecode = 'PPLA' or p1.featurecode = 'PPLA2' or p1.featurecode = 'PPL';" % geonames_tbl

		cur.execute(SQL1)

		cntry_alt = {}
		for row in cur.fetchall():
			alist = [row[1], row[2]]
			if row[4] is not None:
				alist.extend(row[4].split(','))
			#print alist
			for w in alist:
				cntry_alt.setdefault(w, set()).add(row[0])
			#cntry_alt.setdefault(row[0], list()).append(alist)

		cur.execute(SQL3)

		state_alt = {}
		for row in cur.fetchall():
			#row[2] is geonames_gid, not a name, so only the name column seeds the list
			alist = [row[1]]
			if row[3] is not None:
				alist.extend(row[3].split(','))
			#print alist
			for w in alist:
				state_alt.setdefault(w, set()).add(row[0])
			#state_alt.setdefault(row[0], list()).append(alist)

		cur.execute(SQL2)

		region_alt = {}
		for row in cur.fetchall():
			alist = [row[1], row[2]]
			#print row
			if row[4] is not None:
				alist.extend(row[4].split(','))
			#print alist
			for w in alist:
				region_alt.setdefault(w, set()).add(row[0])
			#region_alt.setdefault(row[0], list()).append(alist)

		cur.execute(SQL4)
		pplc_alt = {}
		for row in cur.fetchall():
			alist = [row[1], row[2]]
			#print row
			if row[3] is not None:
				alist.extend(row[3].split(','))
			for w in alist:
				pplc_alt.setdefault(w, set()).add(row[0])


		print "Done Creating Alt Names"
		print "Length of PPL AltNames: ", len(pplc_alt)

		#Import script that issues commands to Stanford NER
		import NERTest as NER
		predictions = []

		#Loop through every plaintext file in directory
		for plaintext in files:
			m += 1
			print plaintext
			filename = os.path.join(test_xml, plaintext)
			outxml = "ner_" + plaintext
			#Catch errors from the Stanford NER; it doesn't always succeed in parsing files.
			try:
				#NER.calc(stan_path, filename, outxml) 
				NER.calc2(stan_path, filename, outxml)
				toporef, wordref = NER.readnerxml(outxml)
				os.remove(outxml)
			except:
				print "Problem using the Stanford Parser for this file, skipping"
				print plaintext
				toporef = {}
				wordref = {}


			print "Files left to go: ", len(files) - m
			print "Total Toponyms ", total_topo
			#Vector Sum Function (Performs actual Disambiguation)
			#wordref = other words dictionary with key = token index : value = word (at this point)
			#toporef = toponym dictionary with key = token index : value = word (at this point)
			#total_topo = total number of toponyms currently georeferenced
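			#Illustrative shapes only (values invented): wordref = {0: u'Flooding',
			#1: u'near'}, toporef = {2: u'Cairo'}; keys are token positions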
			predictions, total_topo = VectorSum(wordref, toporef, total_topo, cur, lat_long_lookup,  
				percentile, window, stopwords, main_topo_weight, other_topo_weight, other_word_weight, plaintext, predictions, country_tbl, region_tbl, state_tbl,
				 geonames_tbl, cntry_alt, region_alt, state_alt, pplc_alt, in_domain_stat_tbl, in_corp_lamb, out_corp_lamb)
			#predictions accumulates across files, so each per-file results file contains all predictions made so far
			with io.open(results_file+plaintext, 'w', encoding='utf-8') as w:
				w.write(u"NER_Toponym,Source_File,Token_index,GeoRefSource,Table,gid,Table_Toponym,Centroid_Lat,Centroid_Long\r\n")
				for p in predictions:
					#Normalize the NER toponym to unicode; documents can yield either type
					if isinstance(p[0], str):
						toponym = p[0].decode('utf-8')
					else:
						toponym = p[0]
					#Normalize the gazetteer-table toponym the same way
					if isinstance(p[6], str):
						table_toponym = p[6].decode('utf-8')
					else:
						table_toponym = p[6]
					w.write(toponym+u','+p[1]+u','+unicode(p[2])+u','+p[3]+u','+p[4]+u','+unicode(p[5])+u','+table_toponym+u','+unicode(p[7])+u','+unicode(p[8])+u'\r\n')
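			#Example of one written row (all field values invented for illustration):
			#	Cairo,sample_doc.txt,2,OutDomain,geonames,12345,Cairo,30.05,31.25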

		'''print "=============Vector Sum================"
		print "Total Toponyms: ", total_topo
		print "Window: ", window
		print "Percentile: ", percentile
		print "Main Topo weight:", main_topo_weight
		print "Other Topo weight:", other_topo_weight
		print "Other word weight:", other_word_weight
		#Write all toponym resolution results to results file
		with io.open(results_file, 'w', encoding='utf-8') as w:
			w.write(u"=============TopoCluster Run Settings================" + '\r\n')
			w.write(u"In Domain Local Statistics Table Name: " + unicode(in_domain_stat_tbl) + '\r\n')
			w.write(u"Out of domain Local Statistics Table Name: " + unicode(out_domain_stat_tbl) + '\r\n')
			w.write(u"Test XML directory/file path: " + test_xml + '\r\n')
			w.write(u"In Domain Corp Lambda: " + unicode(in_corp_lamb) + '\r\n')
			w.write(u"Out Domain Corp Lambda: " + unicode(out_corp_lamb) + '\r\n')
			w.write(u"Window: " + unicode(window) + '\r\n')
			w.write(u"Total Toponyms: " + str(total_topo) + '\r\n')
			w.write(u"Main Topo Weight:"+ unicode(main_topo_weight) + '\r\n')
			w.write(u"Other Topo Weight:"+ unicode(other_topo_weight) + '\r\n')
			w.write(u"Other Word Weight:"+ unicode(other_word_weight) + '\r\n')
			w.write(u"=====================================================" + '\r\n')
			w.write(u"NER_Toponym,Source_File,Token_index,GeoRefSource,Table,gid,Table_Toponym,Centroid_Lat,Centroid_Long\r\n")
			for p in predictions:
				#The encoding of the toponym can change based on the document being read
				if isinstance(p[0], str):
					toponym = p[0].decode('utf-8')
				if isinstance(p[0], unicode):
					toponym = p[0].encode('utf-8').decode('utf-8')
				#The encoding of the toponym name from the table can change based on the table results were pulled from
				if isinstance(p[6], str):
					table_toponym = p[6].decode('utf-8')
				if isinstance(p[6], unicode):
					table_toponym = p[6].encode('utf-8').decode('utf-8')
				w.write(toponym+u','+p[1]+u','+unicode(p[2])+u','+p[3]+u','+p[4]+u','+unicode(p[5])+u','+table_toponym+u','+unicode(p[7])+u','+unicode(p[8])+'\r\n')'''
		conn.close()

	end_time = datetime.datetime.now()

	print "Total Toponyms: ", total_topo
	print "Check File @ ", results_file+plaintext
	print "Total Time: ", end_time - start_time