def writeplaintext(wordref, stan_path, outxml, xml):
    """Write the tokens in *wordref* to a temp plain-text file, run the
    Stanford NER pipeline over it, and return the re-parsed dictionaries.

    wordref   -- dict mapping token index -> token string
    stan_path -- path to the Stanford NER installation
    outxml    -- path the NER step writes its XML output to
    xml       -- suffix used to build a unique temp-file name

    Returns (wordref2, toporef2) as produced by NERTest.readnerxml2.
    """
    import NERTest as NER
    # NOTE(review): Python 2 dict iteration order is arbitrary; this assumes
    # iteration order matches token order -- confirm wordref keys are ordered
    # (e.g. sequential ints) before relying on the output token sequence.
    plaintext = ''.join([wordref[x] + ' ' for x in wordref])
    filename = "/home/grant/devel/TopoCluster/data/tempplain_" + xml
    # Context manager guarantees the handle is closed even if write() raises
    # (the original leaked the handle on an exception between open and close).
    with io.open(filename, 'w', encoding='utf-8') as op:
        op.write(unicode(plaintext))
    NER.calc3(stan_path, filename, outxml)
    toporef2, wordref2 = NER.readnerxml2(outxml)
    return wordref2, toporef2
def calc(in_domain_stat_tbl, out_domain_stat_tbl, test_xml, conn_info, gtbl, window, percentile, main_topo_weight, other_topo_weight, other_word_weight, country_tbl, region_tbl, state_tbl, geonames_tbl, in_corp_lamb, out_corp_lamb, results_file, stan_path): print "In Domain Local Statistics Table Name: ", in_domain_stat_tbl print "Out of domain Local Statistics Table Name: ", out_domain_stat_tbl print "Directory Path containing plain text files to be parsed: ", test_xml print "DB connection info: ", conn_info print "Grid table used: ", gtbl print "Window size", window print "Percentile: ", percentile print "Main Toponym weight: ", main_topo_weight print "Other Toponym weight: ", other_topo_weight print "Other Word weight: ", other_word_weight print "Country table name: ", country_tbl print "Region table name: ", region_tbl print "State table name: ", state_tbl print "Out of Domain Lambda", out_corp_lamb print "In Domain Lambda", in_corp_lamb #Test the connection to the database conn = psycopg2.connect(conn_info) print "Connection Success" #These words and characters will not be evaluated or be used for Gi* vector summing stopwords = set(['.',',','(',')','-', '--', u'\u2010', u'\u2011', u'\u2012', u'\u2013','=',";",':',"'",'"','$','the','a','an','that','this', 'to', 'be', 'have', 'has', 'is', 'are', 'was', 'am', "'s", 'and', 'or','but', 'by', 'of', 'from','in','after','on','for', 'to', 'TO', 'I', 'me', 'he', 'him', 'she', 'her', 'we', 'us', 'you', 'your', 'yours' 'they', 'them', 'their', 'it', 'its']) #Initialize a cursor for DB connection cur = conn.cursor() #Check to see if database tables were set up properly try: ood_table_list = ['enwiki20130102_ner_final_atoi', 'enwiki20130102_ner_final_jtos', 'enwiki20130102_ner_final_ttoz', 'enwiki20130102_ner_final_other'] for tb in ood_table_list: SQL_check = "select * FROM %s LIMIT 1;" % (tb) cur.execute(SQL_check) result = cur.fetchall() if len(result) > 0: print tb, "table loaded correctly" except: print "Out of domain 
tables were not loaded correctly" print "Exiting..." sys.exit() try: if in_domain_stat_tbl != "None": SQL_check = "select * FROM %s LIMIT 1;" % (out_domain_stat_tbl) cur.execute(SQL_check) result = cur.fetchall() if len(result) > 0: print out_domain_stat_tbl, "table loaded correctly" except: print "In domain table provided was not loaded correctly:", in_domain_stat_tbl print "Exiting..." sys.exit() #Intitialize a Ditionary that links GlobalGrid gid values to Latitude/Longitudes lat_long_lookup = {} SQL2 = "SELECT gid, ST_Y(geog::geometry), ST_X(geog::geometry) from %s ;" % gtbl cur.execute(SQL2) lat_long_lookup = dict([(int(g[0]), [g[1],g[2]]) for g in cur.fetchall()]) #print len(lat_long_lookup) point_total_correct = 0 poly_total_correct = 0 start_time = datetime.datetime.now() total_topo = 0 #Test xml should always be a directory name. System currently only supports directory as an argument, though may change in the future. if os.path.isdir(test_xml) == True: print "Reading as directory" files = os.listdir(test_xml) point_bigerror = [] poly_bigerror = [] point_dist_list = [] poly_dist_list = [] point_error_sum = 0.0 poly_error_sum = 0.0 error_sum2 = 0.0 poly_dist = 0.0 m = 0 #These queries are designed to pull all the alternate names from the geonames, country, state, and region tables. 
Alternate names are used in later steps to enhance gazetteer matching SQL1 = "SELECT p1.gid, p1.name, p1.name_long, p1.geonames_gid, p1.altnames FROM %s as p1 ;" % country_tbl SQl2 = "SELECT p1.gid, p1.name, p1.name_long, p1.geonames_gid, p1.altnames FROM %s as p1 ;" % region_tbl SQL3 = "SELECT p1.gid, p1.name, p1.geonames_gid, p1.altnames FROM %s as p1 ;" % state_tbl SQL4 = "SELECT p1.gid, p1.name, p1.asciiname, p1.alternames FROM %s as p1 where p1.featurecode = 'PPLC' or p1.featurecode = 'PPLA' or p1.featurecode = 'PPLA2' or p1.featurecode = 'PPL';" % geonames_tbl cur.execute(SQL1) cntry_alt = {} for row in cur.fetchall(): alist = [row[1], row[2]] if row[4] is not None: alist.extend(row[4].split(',')) #print alist for w in alist: cntry_alt.setdefault(w, set()).add(row[0]) #cntry_alt.setdefault(row[0], list()).append(alist) cur.execute(SQL3) state_alt = {} for row in cur.fetchall(): alist = [row[1], row[2]] if row[3] is not None: alist.extend(row[3].split(',')) #print alist for w in alist: state_alt.setdefault(w, set()).add(row[0]) #state_alt.setdefault(row[0], list()).append(alist) cur.execute(SQL2) region_alt = {} for row in cur.fetchall(): alist = [row[1], row[2]] #print row if len(row) > 3 and row[4] is not None: alist.extend(row[4].split(',')) #print alist for w in alist: region_alt.setdefault(w, set()).add(row[0]) #region_alt.setdefault(row[0], list()).append(alist) cur.execute(SQL4) pplc_alt = {} for row in cur.fetchall(): alist = [row[1], row[2]] #print row if len(row) > 3 and row[3] is not None: alist.extend(row[3].split(',')) for w in alist: pplc_alt.setdefault(w, set()).add(row[0]) print "Done Creating Alt Names" print "Length of PPL AltNames: ", len(pplc_alt) #Import script that issues commands to Stanford NER import NERTest as NER predictions = [] #Loop through every plaintext file in directory for plaintext in files: m += 1 print plaintext filename = os.path.join(test_xml, plaintext) outxml = "ner_" + plaintext #Catch errors from the Stanford NER. 
Doesn't always succeed in parsing files. try: #NER.calc(stan_path, filename, outxml) NER.calc2(stan_path, filename, outxml) toporef, wordref = NER.readnerxml(outxml) os.remove(outxml) except: print "Problem using the Stanford Parser for this file, skipping" print plaintext toporef = {} wordref = {} print "Files left to go: ", len(files) - m print "Total Toponyms ", total_topo #Vector Sum Function (Performs actual Disambiguation) #wordref = other words dictionary with key = token index : value = word (at this point) #toporef = toponym dictionary with key = token index : value = word (at this point) #total_topo = total number of toponyms currently georeferenced predictions, total_topo = VectorSum(wordref, toporef, total_topo, cur, lat_long_lookup, percentile, window, stopwords, main_topo_weight, other_topo_weight, other_word_weight, plaintext, predictions, country_tbl, region_tbl, state_tbl, geonames_tbl, cntry_alt, region_alt, state_alt, pplc_alt, in_domain_stat_tbl, in_corp_lamb, out_corp_lamb) with io.open(results_file+plaintext, 'w', encoding='utf-8') as w: w.write(u"NER_Toponym,Source_File,Token_index,GeoRefSource,Table,gid,Table_Toponym,Centroid_Lat,Centroid_Long\r\n") for p in predictions: #The encoding of the toponym can change based on the document being read if isinstance(p[0], str): toponym = p[0].decode('utf-8') if isinstance(p[0], unicode): toponym = p[0].encode('utf-8').decode('utf-8') #The encoding of the toponym name from the table can change based on the table results were pulled from if isinstance(p[6], str): table_toponym = p[6].decode('utf-8') if isinstance(p[6], unicode): table_toponym = p[6].encode('utf-8').decode('utf-8') w.write(toponym+u','+p[1]+u','+unicode(p[2])+u','+p[3]+u','+p[4]+u','+unicode(p[5])+u','+table_toponym+u','+unicode(p[7])+u','+unicode(p[8])+'\r\n') '''print "=============Vector Sum================" print "Total Toponyms: ", total_topo print "Window: ", window print "Percentile: ", percentile print "Main Topo weight:", 
main_topo_weight print "Other Topo weight:", other_topo_weight print "Other word weight:", other_word_weight #Write all toponym resolution results to results file with io.open(results_file, 'w', encoding='utf-8') as w: w.write(u"=============TopoCluster Run Settings================" + '\r\n') w.write(u"In Domain Local Statistics Table Name: " + unicode(in_domain_stat_tbl) + '\r\n') w.write(u"Out of domain Local Statistics Table Name: " + unicode(out_domain_stat_tbl) + '\r\n') w.write(u"Test XML directory/file path: " + test_xml + '\r\n') w.write(u"In Domain Corp Lambda: " + unicode(in_corp_lamb) + '\r\n') w.write(u"Out Domain Corp Lambda: " + unicode(out_corp_lamb) + '\r\n') w.write(u"Window: " + unicode(window) + '\r\n') w.write(u"Total Toponyms: " + str(total_topo) + '\r\n') w.write(u"Main Topo Weight:"+ unicode(main_topo_weight) + '\r\n') w.write(u"Other Topo Weight:"+ unicode(other_topo_weight) + '\r\n') w.write(u"Other Word Weight:"+ unicode(other_word_weight) + '\r\n') w.write(u"=====================================================" + '\r\n') w.write(u"NER_Toponym,Source_File,Token_index,GeoRefSource,Table,gid,Table_Toponym,Centroid_Lat,Centroid_Long\r\n") for p in predictions: #The encoding of the toponym can change based on the document being read if isinstance(p[0], str): toponym = p[0].decode('utf-8') if isinstance(p[0], unicode): toponym = p[0].encode('utf-8').decode('utf-8') #The encoding of the toponym name from the table can change based on the table results were pulled from if isinstance(p[6], str): table_toponym = p[6].decode('utf-8') if isinstance(p[6], unicode): table_toponym = p[6].encode('utf-8').decode('utf-8') w.write(toponym+u','+p[1]+u','+unicode(p[2])+u','+p[3]+u','+p[4]+u','+unicode(p[5])+u','+table_toponym+u','+unicode(p[7])+u','+unicode(p[8])+'\r\n')''' conn.close() end_time = datetime.datetime.now() print total_topo print "Check File @ ", results_file+plaintext print "Total Time: ", end_time - start_time