def __init__(self, dbpath="./Geonames_dump.sql", min_popln=0, min_length=1):
    """Initialize the geocoder.

    Args:
        dbpath: path to the GeoNames SQL dump backing the gazetteer.
        min_popln: minimum population a candidate place must have.
        min_length: minimum character length for an entity mention.
    """
    # Thresholds first, then the (potentially expensive) gazetteer handle.
    self.min_popln = min_popln
    self.min_length = min_length
    self.gazetteer = GeoNames(dbpath)
def __init__(self, db, nerKeyMap=None, spacy=False):
    """Initialize the geocoder with a NER-label -> internal-type mapping.

    Args:
        db: path/handle for the GeoNames gazetteer.
        nerKeyMap: optional partial mapping from NER tags to the internal
            types (LOCATION/ORGANIZATION/NATIONALITY/OTHER/PERSON); missing
            keys are filled with the defaults.
        spacy: when True, also map spaCy tag names (GPE/NORP/ORG/LOC).
    """
    self.gazetteer = GeoNames(db, confMethod='Uniform', escore=False)
    DEFAULT_NER_MAP = {
        'LOCATION': 'LOCATION',
        'ORGANIZATION': 'ORGANIZATION',
        'NATIONALITY': 'NATIONALITY',
        'OTHER': 'OTHER',
        'PERSON': 'PERSON'
    }
    if nerKeyMap is None:
        nerKeyMap = dict(DEFAULT_NER_MAP)
    else:
        # Fix: copy before filling defaults so the caller's dict is not
        # mutated as a side effect of constructing this object.
        nerKeyMap = dict(nerKeyMap)
        for key in DEFAULT_NER_MAP:
            if key not in nerKeyMap:
                nerKeyMap[key] = DEFAULT_NER_MAP[key]
    if spacy is True:
        nerKeyMap['GPE'] = 'LOCATION'
        nerKeyMap['NORP'] = 'NATIONALITY'
        nerKeyMap['ORG'] = 'ORGANIZATION'
        nerKeyMap['LOC'] = 'LOCATION'
    self.nerKeyMap = nerKeyMap
    # Relative weight of each entity type when ranking geocoding candidates.
    self.weightage = {
        "LOCATION": 1.0,
        "NATIONALITY": 0.75,
        "ORGANIZATION": 0.5,
        "OTHER": 0.0,
        "PERSON": 0.0
    }
def __init__(self, db, min_popln=0, min_length=1):
    """Initialize the geocoder.

    Args:
        db: path/handle for the GeoNames gazetteer.
        min_popln: minimum population a candidate place must have.
        min_length: minimum character length for an entity mention.
    """
    self.min_popln = min_popln
    self.min_length = min_length
    self.gazetteer = GeoNames(db)
    # Relative importance of each named-entity type when ranking candidates.
    weights = [("LOCATION", 1.0), ("NATIONALITY", 0.75),
               ("ORGANIZATION", 0.5), ("OTHER", 0.2)]
    self.weightage = dict(weights)
def __init__(self, dbpath="./Geonames_dump.sql", min_popln=0, coverageLength=10):
    """Initialize the text geocoder.

    Args:
        dbpath: path to the GeoNames SQL dump backing the gazetteer.
        min_popln: minimum population a candidate place must have.
        coverageLength: number of characters around an entity mention used
            when grouping nearby location mentions.
    """
    self.coverageLength = coverageLength
    # Fix: honor the dbpath argument; it was previously ignored and the
    # gazetteer was always opened on the hard-coded "./Geonames_dump.sql".
    self.gazetteer = GeoNames(dbpath)
    self.min_popln = min_popln
def __init__(self, db, min_popln=0, min_length=1, model="./geoModels/rf_geo.pkl"):
    """Initialize the supervised geocoder and load its trained model.

    Args:
        db: path/handle for the GeoNames gazetteer.
        min_popln: minimum population a candidate place must have.
        min_length: minimum character length for an entity mention.
        model: path to the pickled (transformer, classifier) model pair.
    """
    self.gazetteer = GeoNames(db)
    self.min_popln = min_popln
    self.min_length = min_length
    # Relative weight of each entity type when ranking candidates.
    self.weightage = dict(LOCATION=1.0, NATIONALITY=0.75,
                          ORGANIZATION=0.5, OTHER=0.0)
    # NOTE(review): pickle.load on an external file — only load trusted models.
    with open(model, "rb") as model_file:
        self.model = pickle.load(model_file)
class BaseGeo(object):
    """Frequency/score based geocoder over a GeoNames gazetteer.

    Extracts location-like entities from an enriched document, expands each
    through the gazetteer, and ranks candidate realizations by reinforced
    city/admin1/country scores.
    """

    def __init__(self, db, min_popln=0, min_length=1):
        self.gazetteer = GeoNames(db)
        self.min_popln = min_popln
        self.min_length = min_length
        # Relative weight of each named-entity type when picking the final point.
        self.weightage = {
            "LOCATION": 1.0,
            "NATIONALITY": 0.75,
            "ORGANIZATION": 0.5,
            "OTHER": 0.2
        }

    def geocode(self, doc=None, loclist=None, **kwargs):
        """Geocode a document and/or an explicit list of location strings.

        Returns the (lmap, best_geo_point) pair from geocode_fromList.
        NOTE(review): doc is dereferenced for the URL below even when None —
        callers appear to always pass a doc; confirm before passing only loclist.
        """
        locTexts = []
        if doc is not None:
            # Get all location entities from document with atleast min_length characters
            locTexts += [(numstrip.sub("", l['expr'].lower()).strip(), l['neType'])
                         for l in doc["BasisEnrichment"]["entities"]
                         if ((l["neType"] in ("LOCATION", "NATIONALITY")) and
                             len(l['expr']) >= self.min_length)]
        if loclist is not None:
            locTexts += [l.lower() for l in loclist]
        # Seed the result set with locations inferred from the article URL.
        results = self.get_locations_fromURL((doc["url"] if doc.get("url", "")
                                              else doc.get("link", "")))
        return self.geocode_fromList(locTexts, results, **kwargs)

    def geocode_fromList(self, locTexts, results=None, min_popln=None, **kwargs):
        """Expand each mention through the gazetteer and pick the best point.

        locTexts items are either plain strings (treated as LOCATION) or
        (text, neType) tuples. Repeated mentions bump .frequency; unseen
        mentions are split on commas and each piece queried separately.
        Returns (lmap, geo_point-dict) — geo_point is {} when nothing scored.
        """
        if results is None:
            results = {}
        if min_popln is None:
            min_popln = self.min_popln
        itype = {}  # mention text -> named-entity type (drives weightage)
        for l in locTexts:
            if l == "":
                continue
            if isinstance(l, tuple):
                itype[l[0]] = l[1]
                l = l[0]
            else:
                itype[l] = 'LOCATION'
            try:
                if l in results:
                    results[l].frequency += 1
                else:
                    # Unseen mention: treat comma-separated pieces independently.
                    for sub in l.split(","):
                        sub = sub.strip()
                        if sub in results:
                            results[sub].frequency += 1
                        else:
                            itype[sub] = itype[l]
                            try:
                                query = self.gazetteer.query(sub, min_popln=min_popln,
                                                             **kwargs)
                                results[sub] = LocationDistribution(query)
                            except UnicodeDecodeError:
                                # Debug hook left in place intentionally.
                                ipdb.set_trace()
                            results[sub].frequency = 1
            except UnicodeDecodeError:
                log.exception("Unable to make query for string - {}".format(encode(l)))
        scores = self.score(results)
        # Best realization of each mention by reinforced score (Py2 dict views).
        custom_max = lambda x: max(x.viewvalues(), key=lambda y: y['score'])
        lrank = self.get_locRanks(scores, results)
        lmap = {l: custom_max(lrank[l]) for l in lrank if not lrank[l] == {}}
        # Weight each mention's score by its entity type before the final argmax.
        total_weight = sum([self.weightage[itype.get(key, 'OTHER')] for key in lmap])
        return lmap, max(lmap.items(),
                         key=lambda x: x[1]['score'] *
                         self.weightage[itype.get(x[0], 'OTHER')] /
                         total_weight)[1]['geo_point'] if scores else {}

    def get_locations_fromURL(self, url):
        """
        Parse URL to get URL COUNTRY and also URL SUBJECT like
        taiwan in 'cnn.com/taiwan/protest.html'
        Params:
            url - a web url
        Returns:
            Dict of locations obtained from URL
        """
        results = {}
        urlinfo = urlparse(url)
        if urlinfo.netloc != "":
            urlsubject = urlinfo.path.split("/", 2)[1]
            urlcountry = urlinfo.netloc.rsplit(".", 1)[-1]
            # Find URL DOMAIN Country from 2 letter iso-code
            if len(urlcountry.strip()) == 2:
                urlcountry = self.gazetteer.get_country(urlcountry.upper())
                if urlcountry != []:
                    urlcountry = urlcountry[0]
                    urlcountry.confidence = 1.0
                    results["URL-DOMAIN_{}".format(urlcountry)] = LocationDistribution(urlcountry)
                    results["URL-DOMAIN_{}".format(urlcountry)].frequency = 1
            # Heuristic length bounds for a plausible place name in the path.
            if 5 < len(urlsubject) < 20:
                usubj_q = self.gazetteer.query(urlsubject, 15000)
                if usubj_q:
                    results["URL-SUBJECT_{}".format(urlsubject)] = LocationDistribution(usubj_q)
                    results["URL-SUBJECT_{}".format(urlsubject)].frequency = 1
        return results

    def annotate(self, doc, **kwargs):
        """
        Attach embersGeoCode to document
        """
        try:
            lmap, gp = self.geocode(doc=doc, **kwargs)
        except UnicodeDecodeError as e:
            # Only decode errors are tolerated here; other errors propagate.
            log.exception("unable to geocode:{}".format(str(e)))
            lmap, gp = {}, {}
        doc['embersGeoCode'] = gp
        doc["location_distribution"] = lmap
        return doc

    def update(self, l, scoresheet):
        # Accumulate the frequency-weighted city/admin1/country masses of one
        # LocationDistribution into the shared scoresheet.
        for s in l.city:
            scoresheet[s] += l.city[s] * l.frequency
        for s in l.admin1:
            scoresheet[s] += l.admin1[s] * l.frequency
        for s in l.country:
            scoresheet[s] += l.country[s] * l.frequency

    def score(self, results):
        """Return mention-normalized scores for every candidate place string."""
        scoresheet = defaultdict(float)
        num_mentions = float(sum((l.frequency for l in results.values())))
        _ = [self.update(item, scoresheet) for item in results.viewvalues()]
        for s in scoresheet:
            scoresheet[s] /= num_mentions
        return scoresheet

    def get_realization_score(self, l, scores):
        """Reinforce each realization's score with its admin1/country scores."""
        lscore_map = {}
        for lstr, r in l.realizations.viewitems():
            base_score = scores[lstr]
            if not isempty(r.city):
                # City: add its state and country evidence before confidence.
                l_adminstr = '/'.join([r.country, r.admin1, ''])
                base_score = (base_score + scores[l_adminstr] +
                              scores[r.country + "//"]) * r.confidence
            elif not isempty(r.admin1):
                # Admin1: add country evidence.
                base_score = (base_score + scores[r.country + "//"]) * r.confidence
            elif r.ltype == "country":
                # do nothing
                pass
            else:
                base_score = base_score * r.confidence
            lscore_map[lstr] = {'score': base_score, 'geo_point': r.__dict__}
        return lscore_map

    def get_locRanks(self, scores, loc_cand):
        """
        Each city score needs to be re-inforced with the corresponding
        state and country scores to get the actual meaning of that name.
        For example, several mentions of cities within virginia would have
        given virginia state a high score. Now this high score has to be
        brought back to lower levels to decide on meaning of each name/city.
        """
        loc_rankmap = {}
        for locpt in loc_cand:
            loc_rankmap[locpt] = self.get_realization_score(loc_cand[locpt], scores)
        return loc_rankmap
class SupervisedGeo(object):
    """Classifier-backed geocoder.

    Builds positional/frequency features for every candidate realization of
    every mention, then lets a pickled (transformer, classifier) pair pick
    the most likely realization. Also resolves PERSON mentions against the
    countries realized by unambiguous location mentions.
    """

    def __init__(self, db, min_popln=0, min_length=1, model="./geoModels/rf_geo.pkl"):
        self.gazetteer = GeoNames(db)
        self.min_popln = min_popln
        self.min_length = min_length
        # Entity types with weight > 0 are considered location evidence.
        self.weightage = {
            "LOCATION": 1.0,
            "NATIONALITY": 0.75,
            "ORGANIZATION": 0.5,
            "OTHER": 0.0
        }
        # self.model[0] is the feature transformer, self.model[1] the classifier.
        with open(model, "rb") as inf:
            self.model = pickle.load(inf)

    def _build_data(self, doc=None, loclist=None, eKey='BasisEnrichment', **kwargs):
        """Collect (text, type, relative-offset) tuples and person mentions.

        Offsets are the midpoint of the entity's "start:end" span normalized
        by document token length. NOTE(review): doclength is only bound when
        doc is not None — a loclist-only call would raise; confirm callers.
        """
        locTexts, persons = [], []
        NAMED_ENTITY_TYPES_TO_CHECK = [
            key for key in self.weightage if self.weightage[key] > 0
        ]
        if doc is not None:
            doclength = len(doc[eKey]['tokens'])
            locTexts += [
                (numstrip.sub("", l['expr'].lower()).strip(), l['neType'],
                 (sum([int(_) for _ in l['offset'].split(":")])) / (2.0 * doclength))
                for l in doc[eKey]["entities"]
                if ((l["neType"] in NAMED_ENTITY_TYPES_TO_CHECK) and
                    len(l['expr']) >= self.min_length)
            ]
            persons = [
                (numstrip.sub("", l['expr'].lower()).strip(),
                 (sum([int(_) for _ in l['offset'].split(":")])) / (2.0 * doclength))
                for l in doc[eKey]["entities"]
                if ((l["neType"] == "PERSON") and len(l['expr']) >= self.min_length)
            ]
        if loclist is not None:
            locTexts += [l.lower() for l in loclist]
        return self._esquery_fromList(locTexts, persons, doclength=doclength, **kwargs)

    def _esquery_fromList(self, locTexts, persons, results=None, min_popln=None,
                          **kwargs):
        """Expand mentions via the gazetteer and build positional matrices.

        Returns (results, freqsheet, locTexts, meta_entInfo, offset_diffmat,
        persons_res, selco) where offset_diffmat is the pairwise difference
        matrix of all mention offsets and selco the list of country codes
        realized by unambiguous mentions.
        """
        if results is None:
            results = {}
        if min_popln is None:
            min_popln = self.min_popln
        meta_entInfo = {}         # mention -> offsets/neType/indexes bookkeeping
        realized_countries = []   # country codes from single-realization mentions
        idx = 0
        offsetmat = []
        for entitem in locTexts:
            querytext, enttype, offset = entitem
            if isempty(querytext):
                continue
            if querytext in results:
                results[querytext].frequency += 1
                meta_entInfo[querytext]["offsets"].append(offset)
                meta_entInfo[querytext]["neType"] = (enttype)
                meta_entInfo[querytext]["indexes"].append(idx)
                offsetmat.append(offset)
            else:
                # Comma-separated pieces are tracked separately; each piece gets
                # a slightly shifted offset so positions stay distinguishable.
                for subidx, substr in enumerate(querytext.split(",")):
                    substr = substr.strip()
                    if substr in results:
                        results[substr].frequency += 1
                        meta_entInfo[substr]["offsets"].append(
                            offset + float(subidx) / kwargs['doclength'])
                        meta_entInfo[substr]["neType"] = (enttype)
                        meta_entInfo[substr]["indexes"].append(idx + subidx)
                        offsetmat.append(offset + float(subidx) / kwargs['doclength'])
                        continue
                    if substr not in meta_entInfo:
                        meta_entInfo[substr] = {
                            "offsets": [offset + float(subidx) / kwargs['doclength']],
                            "neType": enttype,
                            "indexes": [idx + subidx]
                        }
                        offsetmat.append(offset + float(subidx) / kwargs['doclength'])
                    else:
                        meta_entInfo[substr]["offsets"].append(
                            offset + float(subidx) / kwargs['doclength'])
                        meta_entInfo[substr]["neType"] = (enttype)
                        meta_entInfo[substr]["indexes"].append(idx + subidx)
                        offsetmat.append(offset + float(subidx) / kwargs['doclength'])
                    ld = self._queryitem(substr, meta_entInfo[substr]["neType"])
                    # Non-LOCATION types with no gazetteer hit are dropped.
                    if meta_entInfo[substr]["neType"] != "LOCATION" and ld.isempty():
                        continue
                    results[substr] = ld
                    if len(results[substr].realizations) == 1:
                        # Unambiguous mention: remember its country as evidence.
                        realized_countries.append(
                            list(results[substr].realizations.values())[0]
                            ['countryCode'].lower())
                    results[substr].frequency = 1
                idx += subidx
            idx += 1
        offsetmat = np.array(offsetmat)
        # Pairwise offset differences via broadcasting: D[i, j] = off[i] - off[j].
        offset_diffmat = offsetmat[:, np.newaxis] - offsetmat
        selco = realized_countries
        persons_res = {}
        for entitem in persons:
            querytext, offset = entitem
            if querytext not in persons_res:
                diffs = offsetmat - offset
                persons_res[querytext] = {
                    # Person names are looked up restricted to realized countries.
                    "expansions": self._queryitem(querytext, "LOCATION",
                                                  countryCode=selco),
                    "offset": diffs,
                    "freq": 1
                }
            else:
                persons_res[querytext]["freq"] += 1
        if not isempty(selco):
            results = self.fuzzyquery(results, countryFilter=selco)
        freqsheet = self.score(results, meta_entInfo)
        return results, freqsheet, locTexts, meta_entInfo, offset_diffmat, \
            persons_res, selco

    def _queryitem(self, item, itemtype, **kwargs):
        """Gazetteer lookup; non-LOCATION types fall back country -> admin1."""
        if itemtype == "LOCATION":
            res = self.gazetteer.query(item, **kwargs)
        else:
            res = self.gazetteer.query(item, fuzzy='AUTO', featureCode='pcli',
                                       operator='or')
            if res == []:
                res = self.gazetteer.query(item, featureCode='adm1', operator='or')
        return LocationDistribution(res)

    def fuzzyquery(self, locmap, countryFilter=[]):
        """Retry ambiguous mentions with a fuzzy, country-filtered query.

        countryFilter is only read, never mutated, so the mutable default is
        harmless here.
        """
        for loc in locmap:
            if len(locmap[loc].realizations) != 1:
                freq = locmap[loc].frequency
                subres = self.gazetteer.query(loc, countryCode=countryFilter,
                                              fuzzy='AUTO')
                if subres != []:
                    locmap[loc] = LocationDistribution(
                        subres + locmap[loc].realizations.values())
                    locmap[loc].frequency = freq
        return locmap

    def score(self, results, metaInfo):
        """Build a per-level (city/admin1/country) frequency + index sheet."""
        scoresheet = defaultdict(lambda: defaultdict(lambda: {
            "freq": 0.0,
            "offs_idx": []
        }))
        num_mentions = float(sum((l.frequency for l in results.values())))

        def update(key, l):
            offs = metaInfo[key]["indexes"]
            for s in l.city:
                scoresheet["city"][s]['freq'] += l.frequency
                scoresheet["city"][s]['offs_idx'] += (offs)
            for s in l.admin1:
                scoresheet["admin1"][s]["freq"] += l.frequency
                scoresheet["admin1"][s]['offs_idx'] += (offs)
            for s in l.country:
                scoresheet["country"][s]["freq"] += l.frequency
                scoresheet["country"][s]['offs_idx'] += (offs)

        _ = [update(key, val) for key, val in results.viewitems()]
        for typ in scoresheet:
            for s in scoresheet[typ]:
                scoresheet[typ][s]['freq'] /= num_mentions
            # Freeze the defaultdicts so later lookups can KeyError cleanly.
            scoresheet[typ].default_factory = None
        scoresheet.default_factory = None
        return scoresheet

    def geocode(self, doc, enrichmentKeys=['BasisEnrichment'], **kwargs):
        """
        Attach embersGeoCode to document
        """
        eKey = None
        for key in enrichmentKeys:
            if key in doc and doc[key]:
                eKey = key
        if eKey is None:
            return doc
        all_exp_locs, freqsheet, loctexts, metaInfo, offsdiffmat, persons_res, \
            selco = self._build_data(doc)
        if "events" in doc:
            self._expand_events(doc)
        locdist = {}
        clfdata = {}
        for loc in all_exp_locs:
            x, names = self.build_featuremat(all_exp_locs[loc], offsdiffmat,
                                             freqsheet)
            if x != []:
                clfdata[loc] = zip(names, x)
                # Probability of the positive class for every realization.
                ypred = self.model[1].predict_proba(
                    self.model[0].transform(x))[:, 1]
                prob, final_nm = max(zip(ypred, names), key=lambda lx: lx[0])
                locdist[loc] = {
                    "conf": prob,
                    "details": all_exp_locs[loc].realizations[final_nm].__dict__
                }
        person_dist = {}
        for loc in persons_res:
            exps = persons_res[loc]["expansions"]
            x, names = [], []
            for real in exps.realizations:
                d1 = self.build_persmat(exps.realizations[real],
                                        persons_res[loc], freqsheet)
                x.append(d1)
                names.append(real)
            if x != []:
                clfdata[loc] = zip(names, x)
                # Hard predictions here (vs probabilities above).
                ypred = self.model[1].predict(self.model[0].transform(x))
                pred, nm = max(zip(ypred, names), key=lambda lx: lx[0])
                if pred is True:
                    person_dist[loc] = exps.realizations[nm].__dict__
        true_geos = self.matchwithGSRLocs(doc, all_exp_locs, persons_res,
                                          offsdiffmat, freqsheet)
        doc['true_geos'] = true_geos
        doc['location_distribution'] = locdist
        doc['person_dist'] = person_dist
        doc['geo_debug'] = {"selco": selco, "clfdata": clfdata}
        return doc

    def calc_offset_stats(self, indices, diffmat):
        """Summary stats of pairwise offset distances for the given indices.

        Returns (mean_abs, min_abs, closest_before, closest_after); each term
        defaults to 1 when there is no corresponding neighbour.
        """
        tril = np.tril(diffmat[indices])
        ntril = tril[np.nonzero(tril)]
        abstril = np.abs(ntril)
        if abstril.shape[0] == 0:
            return 1, 1, 1, 1
        abs_minval = np.min(abstril)
        medval = np.mean(abstril)
        try:
            before_closest = np.min(ntril[ntril > 0])
        except:
            before_closest = 1
        try:
            after_closest = abs(np.max(ntril[ntril < 0]))
        except:
            after_closest = 1
        return medval, abs_minval, before_closest, after_closest

    def _single_build_featuremat(self, realization, diffmat, freqsheet):
        """Feature dict for one realization of a location mention."""
        country = realization.country
        admin = "/".join([country, realization.admin1])
        city = "/".join(
            [admin, getattr(realization, "admin2", "") or realization.city])
        featureCode = realization.featureCode
        offs = freqsheet["country"][country + "//"]["offs_idx"]
        co_offset = self.calc_offset_stats(np.ix_(offs, offs), diffmat)
        try:
            offs = freqsheet["admin1"][admin + "/"]["offs_idx"]
            st_offset = self.calc_offset_stats(np.ix_(offs, offs), diffmat)
        except:
            st_offset = [0, 0, 0, 0]
        # adm1/pcli realizations are not cities; skip city-level features.
        if realization.featureCode[:3] not in ("adm1", "pcli"):
            try:
                offs = freqsheet["city"][city]["offs_idx"]
                ci_offset = self.calc_offset_stats(np.ix_(offs, offs), diffmat)
                cifreq = freqsheet["city"][city]["freq"]
            except:
                ci_offset = [1, 1, 1, 1]
                cifreq = 0
        else:
            ci_offset = [0, 0, 0, 0]
            cifreq = 0
        return {
            "country": freqsheet["country"][country + "//"]["freq"],
            "state": freqsheet.get("admin1", {}).get(admin + "/", {}).get("freq", 0),
            "city": cifreq,
            "poplnConf": realization.poplnConf,
            "co_Offmean": co_offset[0],
            "co_Offmin": co_offset[1],
            "co_prev": co_offset[2],
            "co_after": co_offset[3],
            "st_offmean": st_offset[0],
            "st_offmin": st_offset[1],
            "st_prev": st_offset[2],
            "st_after": st_offset[3],
            "ci_offmean": ci_offset[0],
            "ci_offmin": ci_offset[1],
            "ci_prev": ci_offset[2],
            "ci_after": ci_offset[3]
        }

    def build_persmat(self, realization, meta_info, freqsheet):
        """Feature dict for one realization of a PERSON mention."""
        country = realization.country
        admin = "/".join([country, realization.admin1])
        city = "/".join(
            [admin, getattr(realization, "admin2", "") or realization.city])
        featureCode = realization.featureCode
        co_offset = self.calc_offset_stats(
            freqsheet["country"][country + "//"]["offs_idx"],
            meta_info['offset'])
        if (admin + "/") in freqsheet["admin1"]:
            st_offset = self.calc_offset_stats(
                freqsheet["admin1"][admin + "/"]["offs_idx"],
                meta_info["offset"])
            st_freq = freqsheet["admin1"][admin + "/"]["freq"]
        else:
            st_offset = [1, 1, 1, 1]
            st_freq = meta_info['freq']
        if realization.featureCode[:3] not in ("adm1", "pcli"):
            if city in freqsheet.get("city", {}):
                ci_offset = self.calc_offset_stats(
                    freqsheet["city"][city]["offs_idx"], meta_info["offset"])
                cifreq = freqsheet["city"][city]["freq"]
            else:
                ci_offset = [1, 1, 1, 1]
                cifreq = meta_info["freq"]
        else:
            ci_offset = [0, 0, 0, 0]
            cifreq = 0
        return {
            "country": freqsheet["country"][country + "//"]["freq"],
            "state": st_freq,
            "city": cifreq,
            "poplnConf": realization.poplnConf,
            "co_Offmean": co_offset[0],
            "co_Offmin": co_offset[1],
            "co_prev": co_offset[2],
            "co_after": co_offset[3],
            "st_offmean": st_offset[0],
            "st_offmin": st_offset[1],
            "st_prev": st_offset[2],
            "st_after": st_offset[3],
            "ci_offmean": ci_offset[0],
            "ci_offmin": ci_offset[1],
            "ci_prev": ci_offset[2],
            "ci_after": ci_offset[3]
        }

    def build_featuremat(self, loc, *args):
        """Feature matrix + realization labels for every expansion of loc."""
        xmat = []
        lbls = []
        for real in loc.realizations:
            x = self._single_build_featuremat(loc.realizations[real], *args)
            lbls.append(real)
            xmat.append(x)
        return xmat, lbls

    def _expand_events(self, doc):
        # Best-effort enrichment of GSR events with full gazetteer records;
        # lookup failures are deliberately ignored.
        for evt in doc["events"]:
            if "expanded_loc" in evt:
                continue
            try:
                loc = self.gazetteer.get_locInfo(country=evt['Country'],
                                                 admin=evt['State'],
                                                 city=evt["City"])
                evt['expanded_loc'] = loc
            except Exception as e:
                pass
        return

    def matchwithGSRLocs(self, doc, all_exp_locs, persons_res, offsdiffmat,
                         freqsheet):
        """Match candidate realizations against the document's GSR event locs.

        NOTE(review): matched_locs is never added to, so remaininglocs always
        equals locstrings — looks like incomplete bookkeeping; confirm intent.
        """
        locstrings = set()
        for evt in doc['events']:
            estr = u"/".join([evt['Country'], evt['State'], evt['City']])
            locstrings.add(estr)
            if "expanded_loc" in evt:
                for loc in evt['expanded_loc']:
                    gp = GeoPoint(**loc)
                    lstr = "/".join([
                        gp.country, gp.admin1,
                        (getattr(gp, "admin2", "") or gp.city)
                    ])
                    locstrings.add(lstr)
        matched_locs = set()
        true_geos = {'persons': {}, 'locations': {}}
        for loc in all_exp_locs:
            for x in all_exp_locs[loc].realizations:
                if x in locstrings:
                    true_geos['locations'][loc] = all_exp_locs[
                        loc].realizations[x].__dict__
        remaininglocs = locstrings - matched_locs
        for loc in persons_res:
            for x in persons_res[loc]["expansions"].realizations:
                if x in remaininglocs:
                    true_geos['persons'][loc] = persons_res[loc][
                        "expansions"].realizations[x].__dict__
        return true_geos
class BaseGeo(object):
    """Simple frequency-based geocoder over a GeoNames gazetteer.

    Extracts LOCATION entities from an enriched document, expands them via
    the gazetteer, and returns the highest-scoring geo point.
    """

    def __init__(self, dbpath="./Geonames_dump.sql", min_popln=0, min_length=1):
        self.gazetteer = GeoNames(dbpath)
        self.min_popln = min_popln
        self.min_length = min_length

    def geocode(self, doc=None, loclist=None):
        """Geocode a document and/or an explicit list of location strings.

        NOTE(review): doc is dereferenced for its URL below even when None —
        callers appear to always pass a doc; confirm before relying on loclist
        alone.
        """
        locTexts = []
        if doc is not None:
            # Get all location entities from document with atleast min_length characters
            locTexts += [
                l['expr'].lower() for l in doc["BasisEnrichment"]["entities"]
                if ((l["neType"] == "LOCATION") and
                    len(l['expr']) >= self.min_length)
            ]
        if loclist is not None:
            locTexts += [l.lower() for l in loclist]
        results = self.get_locations_fromURL(
            (doc["url"] if doc.get("url", "") else doc.get("link", "")))
        return self.geocode_fromList(locTexts, results)

    def geocode_fromList(self, locTexts, results=None, min_popln=None):
        """Expand each mention, score the candidates, and pick the best point.

        Unseen mentions are queried whole first; only when that fails are the
        comma-separated pieces queried individually.
        Returns (lmap, geo_point-dict) — geo_point is {} when nothing scored.
        """
        if results is None:
            results = {}
        if min_popln is None:
            min_popln = self.min_popln
        for l in locTexts:
            try:
                if l in results:
                    results[l].frequency += 1
                else:
                    q = self.gazetteer.query(l, min_popln=min_popln)
                    if not q:
                        # Fall back to comma-separated pieces.
                        for sub in l.split(","):
                            sub = sub.strip()
                            if sub in results:
                                results[sub].frequency += 1
                            else:
                                results[sub] = LocationDistribution(
                                    self.gazetteer.query(sub,
                                                         min_popln=min_popln))
                                results[sub].frequency = 1
                    else:
                        results[l] = LocationDistribution(q)
                        results[l].frequency = 1
            # Fix: was a bare `except:` which also swallowed SystemExit and
            # KeyboardInterrupt; narrowed to Exception, logging unchanged.
            except Exception:
                log.exception("Unable to make query for string - {}".format(
                    encode(l)))
        scores = self.score(results)
        # Best realization of each mention by reinforced score (Py2 dict views).
        custom_max = lambda x: max(x.viewvalues(), key=lambda y: y['score'])
        lrank = self.get_locRanks(scores, results)
        lmap = {l: custom_max(lrank[l]) for l in lrank if not lrank[l] == {}}
        return lmap, max(
            lmap.values(),
            key=lambda x: x['score'])['geo_point'] if scores else {}

    def get_locations_fromURL(self, url):
        """
        Parse URL to get URL COUNTRY and also URL SUBJECT like
        taiwan in 'cnn.com/taiwan/protest.html'
        Params:
            url - a web url
        Returns:
            Dict of locations obtained from URL
        """
        results = {}
        urlinfo = urlparse(url)
        if urlinfo.netloc != "":
            urlsubject = urlinfo.path.split("/", 2)[1]
            urlcountry = urlinfo.netloc.rsplit(".", 1)[-1]
            # Find URL DOMAIN Country from 2 letter iso-code
            if len(urlcountry.strip()) == 2:
                urlcountry = self.gazetteer.get_country(urlcountry.upper())
                if urlcountry != []:
                    urlcountry = urlcountry[0]
                    urlcountry.confidence = 1.0
                    results["URL-DOMAIN_{}".format(
                        urlcountry)] = LocationDistribution(urlcountry)
                    results["URL-DOMAIN_{}".format(urlcountry)].frequency = 1
            # Heuristic length bounds for a plausible place name in the path.
            if self.min_length < len(urlsubject) < 20:
                usubj_q = self.gazetteer.query(urlsubject, 15000)
                if usubj_q:
                    results["URL-SUBJECT_{}".format(
                        urlsubject)] = LocationDistribution(usubj_q)
                    results["URL-SUBJECT_{}".format(urlsubject)].frequency = 1
        return results

    def annotate(self, doc):
        """
        Attach embersGeoCode to document
        """
        try:
            lmap, gp = self.geocode(doc=doc)
        # Fix: `except Exception, e` (Py2-only syntax) -> `as e`, consistent
        # with the other classes in this file and valid on Python 2.6+/3.
        except Exception as e:
            log.exception("unable to geocode:{}".format(str(e)))
            lmap, gp = {}, {}
        doc['embersGeoCode'] = gp
        doc["location_distribution"] = lmap
        return doc
class TextGeo(object):
    """Geocoder that groups nearby location mentions before resolution.

    Mentions whose character spans fall within coverageLength of each other
    are queried together so that their candidate expansions can be
    intersected (same country/admin1) for disambiguation.
    """

    def __init__(self, dbpath="./Geonames_dump.sql", min_popln=0,
                 coverageLength=10):
        """
        Description
        """
        self.coverageLength = coverageLength
        # Fix: honor the dbpath argument; it was previously ignored and the
        # gazetteer was always opened on the hard-coded "./Geonames_dump.sql".
        self.gazetteer = GeoNames(dbpath)
        self.min_popln = min_popln

    def geocode(self, doc):
        """Return (lmap, egeo): per-mention best realization and top score."""

        def getEntityDetails(entity):
            """
            return entity string, starting offset, coverage start and end points
            """
            start, end = entity['offset'].split(":")
            start, end = int(start), int(end)
            return (entity['expr'], start, start - self.coverageLength,
                    end + self.coverageLength)

        urlinfo = urlparse(doc["url"])
        loc_results = {}
        locTexts = [
            getEntityDetails(l) for l in doc["BasisEnrichment"]['entities']
            if l['neType'] == 'LOCATION'
        ]
        if urlinfo.netloc != "":
            urlsubject = urlinfo.path.split("/", 2)[1]
            urlcountry = urlinfo.netloc.rsplit(".", 1)[-1]
            # Two-letter TLD -> country of the publishing domain.
            if len(urlcountry.strip()) == 2:
                urlcountry = self.gazetteer.get_country(urlcountry.upper())
                if urlcountry != []:
                    urlcountry = urlcountry[0]
                    urlcountry.confidence = 1.0
                    loc_results["url"] = LocationDistribution(urlcountry)
                    loc_results["url"].frequency = 1
            if len(urlsubject) < 20:
                # Sentinel offsets: the URL subject belongs to no text span.
                locTexts.insert(0, (urlsubject, -1, -1, -1))
        loc_results.update(self.query_gazetteer(self.group(locTexts)))
        scores = self.score(loc_results)
        custom_max = lambda x: max(x.realizations.viewvalues(),
                                   key=lambda z: scores[z.__str__()])
        lmap = {
            l: custom_max(loc_results[l]['geo-point'])
            for l in loc_results
            if not loc_results[l]['geo-point'].isEmpty()
        }
        egeo = {}
        if scores:
            egeo = scores[max(scores, key=lambda x: scores[x])]
        return lmap, egeo

    def score(self, results):
        """Accumulate frequency-weighted city/admin1/country masses."""
        scoresheet = defaultdict(float)

        def update(item):
            l = item['geo-point']
            freq = item['frequency']
            for s in l.city:
                scoresheet[s] += l.city[s] * freq
            for s in l.admin1:
                scoresheet[s] += l.admin1[s] * freq
            for s in l.country:
                scoresheet[s] += l.country[s] * freq

        [update(item) for item in results.viewvalues()]
        return scoresheet

    def query_gazetteer(self, lgroups):
        """Query each mention group and intersect expansions within a group.

        NOTE(review): query_gp returns the already-wrapped gp_map entry (a
        dict) when the same text repeats across groups, while fresh queries
        return a raw result list — confirm this mixed shape is intended.
        """
        gp_map = {}
        query_gp = lambda x: self.gazetteer.query(
            x) if x not in gp_map else gp_map[x]
        for grp in lgroups:
            imap = {txt: query_gp(txt) for txt in grp}
            imap = self.get_geoPoints_intersection(imap)
            for l in imap:
                if l in gp_map:
                    gp_map[l]['frequency'] += 1
                else:
                    gp_map[l] = {'geo-point': imap[l], 'frequency': 1}
        for l in gp_map:
            gp_map[l]['geo-point'] = LocationDistribution(
                gp_map[l]['geo-point'])
        return gp_map

    def group(self, loc):
        """Group mention tuples whose spans overlap within coverageLength.

        loc items are (expr, start, coverage_start, coverage_end) tuples as
        produced by getEntityDetails; a new group starts whenever the next
        mention's start lies beyond the current mention's coverage end.
        """
        groups = []
        i = 0
        while i < len(loc):
            grp = [loc[i][0]]
            for j, l in enumerate(loc[i + 1:]):
                if l[1] <= loc[i][-1]:
                    grp.append(l[0])
                    i += 1
                else:
                    groups.append(grp)
                    i += 1
                    grp = [loc[i][0]]
                    break
            else:
                groups.append(grp)
                i += 1
        return groups

    def get_geoPoints_intersection(self, gps):
        """Keep only expansions consistent with the group's common country
        (and, when possible, common admin1)."""
        try:
            selcountry = set.intersection(
                *[set([l.country]) for name in gps for l in gps[name]])
        except:
            selcountry = None
        if not selcountry:
            return gps
        selcountry = selcountry.pop()
        filtered_gps = [
            set([encode('/'.join([l.country, l.admin1, ""]))])
            for name in gps for l in gps[name] if l.country == selcountry
        ]
        sel_admin1 = set.intersection(*filtered_gps)
        if not sel_admin1:
            return {
                name: [l for l in gps[name] if l.country == selcountry]
                for name in gps
            }
        sel_admin1 = sel_admin1.pop()
        ns = {}
        for l in gps:
            # Prefer the admin1 point itself, else cities inside it.
            t_admin = [gp for gp in gps[l] if gp.__str__() == sel_admin1]
            if t_admin != []:
                ns[l] = t_admin
                continue
            t_cand = [
                gp for gp in gps[l]
                if encode("/".join([gp.country, gp.admin1, ""])) == sel_admin1
            ]
            ns[l] = t_cand
        return ns
class BaseGeo(object):
    """Frequency/score based geocoder that also resolves PERSON mentions.

    Location-like entities are expanded through the gazetteer; mentions that
    resolve to a single realization pin down candidate countries, which are
    then used to fuzzily re-query ambiguous mentions and person names.
    """

    def __init__(self, db, min_popln=0, min_length=1):
        self.gazetteer = GeoNames(db)
        self.min_popln = min_popln
        self.min_length = min_length
        # Relative weight of each named-entity type when picking the final point.
        self.weightage = {
            "LOCATION": 1.0,
            "NATIONALITY": 0.75,
            "ORGANIZATION": 0.5,
            "OTHER": 0.0
        }

    def geocode(self, doc=None, loclist=None, eKey='BasisEnrichment', **kwargs):
        """Collect weighted entity mentions and delegate to geocode_fromList.

        NOTE(review): `persons` is only bound inside the `doc is not None`
        branch, and doc is dereferenced for its URL below — callers appear to
        always pass a doc; confirm before relying on loclist alone.
        """
        locTexts = []
        NAMED_ENTITY_TYPES_TO_CHECK = [
            key for key in self.weightage if self.weightage[key] > 0
        ]
        if doc is not None:
            # Get all location entities from document with atleast min_length characters
            locTexts += [(numstrip.sub("", l['expr'].lower()).strip(),
                          l['neType'])
                         for l in doc[eKey]["entities"]
                         if ((l["neType"] in NAMED_ENTITY_TYPES_TO_CHECK) and
                             len(l['expr']) >= self.min_length)]
            persons = [(numstrip.sub("", l['expr'].lower()).strip(),
                        l['neType'])
                       for l in doc[eKey]["entities"]
                       if ((l["neType"] == "PERSON") and
                           len(l['expr']) >= self.min_length)]
        if loclist is not None:
            locTexts += [l.lower() for l in loclist]
        # Seed the result set with locations inferred from the article URL.
        results = self.get_locations_fromURL(
            (doc["url"] if doc.get("url", "") else doc.get("link", "")))
        return self.geocode_fromList(locTexts, persons, results, **kwargs)

    def geocode_fromList(self, locTexts, persons, results=None, min_popln=None,
                         **kwargs):
        """Expand mentions, re-query ambiguous ones by realized country,
        fold in person names, and pick the best weighted geo point.

        Returns (lmap, geo_point-dict) — geo_point is {} when nothing scored.
        """
        if results is None:
            results = {}
        if min_popln is None:
            min_popln = self.min_popln
        itype = {}                # mention text -> named-entity type
        realized_countries = []   # countries from single-realization mentions
        for l in locTexts:
            if l == "":
                continue
            if isinstance(l, tuple):
                itype[l[0]] = l[1]
                l = l[0]
            else:
                itype[l] = 'LOCATION'
            try:
                if l in results:
                    results[l].frequency += 1
                else:
                    # Unseen mention: treat comma-separated pieces independently.
                    for sub in l.split(","):
                        sub = sub.strip()
                        if sub in results:
                            results[sub].frequency += 1
                        else:
                            itype[sub] = itype[l]
                            try:
                                results[sub] = self._queryitem(sub, itype[sub])
                                if len(results[sub].realizations) == 1:
                                    realized_countries.append(
                                        list(
                                            results[sub].realizations.values())
                                        [0]['countryCode'].lower())
                            except UnicodeDecodeError:
                                # Debug hook left in place intentionally.
                                ipdb.set_trace()
                            results[sub].frequency = 1
            except UnicodeDecodeError:
                log.exception("Unable to make query for string - {}".format(
                    encode(l)))
        selco = list(set(realized_countries))
        if selco not in (None, "", []):
            # Retry ambiguous mentions restricted to the realized countries.
            results = self.fuzzyquery(results, countryFilter=selco)
        persons_res = {}
        for entitem in persons:
            querytext, _ = entitem
            if querytext not in persons_res:
                persons_res[querytext] = {
                    "expansions": self._queryitem(querytext, "LOCATION",
                                                  countryCode=selco),
                    "freq": 1
                }
                if querytext not in results:
                    results[querytext] = persons_res[querytext]['expansions']
                    results[querytext].frequency = 1
            else:
                persons_res[querytext]["freq"] += 1
                results[querytext].frequency += 1
        scores = self.score(results)
        # Best realization of each mention by reinforced score (Py2 dict views).
        custom_max = lambda x: max(x.viewvalues(), key=lambda y: y['score'])
        lrank = self.get_locRanks(scores, results)
        lmap = {l: custom_max(lrank[l]) for l in lrank if not lrank[l] == {}}
        # Weight each mention's score by its entity type before the final argmax.
        total_weight = sum(
            [self.weightage[itype.get(key, 'OTHER')] for key in lmap])
        return lmap, max(lmap.items(),
                         key=lambda x: x[1]['score'] * self.weightage[
                             itype.get(x[0], 'OTHER')] / total_weight
                         )[1]['geo_point'] if scores else {}

    def _queryitem(self, item, itemtype, **kwargs):
        """Gazetteer lookup; non-LOCATION types fall back country -> admin1."""
        if itemtype == "LOCATION":
            res = self.gazetteer.query(item, **kwargs)
        else:
            res = self.gazetteer.query(item, fuzzy='AUTO', featureCode='pcli',
                                       operator='or')
            if res == []:
                res = self.gazetteer.query(item, featureCode='adm1',
                                           operator='or')
        return LocationDistribution(res)

    def get_locations_fromURL(self, url):
        """
        Parse URL to get URL COUNTRY and also URL SUBJECT like
        taiwan in 'cnn.com/taiwan/protest.html'
        Params:
            url - a web url
        Returns:
            Dict of locations obtained from URL
        """
        results = {}
        urlinfo = urlparse(url)
        if urlinfo.netloc != "":
            urlsubject = urlinfo.path.split("/", 2)[1]
            urlcountry = urlinfo.netloc.rsplit(".", 1)[-1]
            # Find URL DOMAIN Country from 2 letter iso-code
            if len(urlcountry.strip()) == 2:
                urlcountry = self.gazetteer.get_country(urlcountry.upper())
                if urlcountry != []:
                    urlcountry = urlcountry[0]
                    urlcountry.confidence = 1.0
                    results["URL-DOMAIN_{}".format(
                        urlcountry)] = LocationDistribution(urlcountry)
                    results["URL-DOMAIN_{}".format(urlcountry)].frequency = 1
            # Heuristic length bounds for a plausible place name in the path.
            if 5 < len(urlsubject) < 20:
                usubj_q = self.gazetteer.query(urlsubject, 15000)
                if usubj_q:
                    results["URL-SUBJECT_{}".format(
                        urlsubject)] = LocationDistribution(usubj_q)
                    results["URL-SUBJECT_{}".format(urlsubject)].frequency = 1
        return results

    def fuzzyquery(self, locmap, countryFilter=[]):
        """Retry ambiguous mentions with a fuzzy, country-filtered query.

        countryFilter is only read, never mutated, so the mutable default is
        harmless here.
        """
        for loc in locmap:
            if len(locmap[loc].realizations) != 1:
                freq = locmap[loc].frequency
                subres = self.gazetteer.query(loc, countryCode=countryFilter,
                                              fuzzy='AUTO')
                if subres != []:
                    pts = subres + locmap[loc].realizations.values()
                    ldist = self.gazetteer._get_loc_confidence(
                        pts, self.min_popln)
                    locmap[loc] = LocationDistribution(ldist)
                    locmap[loc].frequency = freq
        return locmap

    def annotate(self, doc, enrichmentKeys=['BasisEnrichment'], **kwargs):
        """
        Attach embersGeoCode to document
        """
        eKey = None
        for key in enrichmentKeys:
            if key in doc and doc[key]:
                eKey = key
        if eKey is None:
            return doc
        try:
            lmap, gp = self.geocode(doc=doc, eKey=eKey, **kwargs)
        except UnicodeDecodeError as e:
            # Only decode errors are tolerated here; other errors propagate.
            log.exception("unable to geocode:{}".format(str(e)))
            lmap, gp = {}, {}
        doc['embersGeoCode'] = gp
        doc["location_distribution"] = lmap
        return doc

    def score(self, results):
        """Return mention-normalized scores for every candidate place string."""
        scoresheet = defaultdict(float)
        num_mentions = float(sum((l.frequency for l in results.values())))

        def update(l):
            for s in l.city:
                scoresheet[s] += l.city[s] * l.frequency
            for s in l.admin1:
                scoresheet[s] += l.admin1[s] * l.frequency
            for s in l.country:
                scoresheet[s] += l.country[s] * l.frequency

        _ = [update(item) for item in results.viewvalues()]
        for s in scoresheet:
            scoresheet[s] /= num_mentions
        return scoresheet

    def get_locRanks(self, scores, loc_cand):
        """
        Each city score needs to be re-inforced with the corresponding
        state and country scores to get the actual meaning of that name.
        For example, several mentions of cities within virginia would have
        given virginia state a high score. Now this high score has to be
        brought back to lower levels to decide on meaning of each name/city.
        """
        loc_rankmap = {}

        def get_realization_score(l):
            lscore_map = {}
            for lstr, r in l.realizations.viewitems():
                base_score = scores[lstr]
                if not isempty(r.city):
                    # City: add its state and country evidence before confidence.
                    l_adminstr = '/'.join([r.country, r.admin1, ''])
                    base_score = (base_score + scores[l_adminstr] +
                                  scores[r.country + "//"]) * r.confidence
                elif not isempty(r.admin1):
                    # Admin1: add country evidence.
                    base_score = (base_score +
                                  scores[r.country + "//"]) * r.confidence
                elif r.ltype == "country":
                    # do nothing
                    pass
                else:
                    base_score = base_score * r.confidence
                lscore_map[lstr] = {
                    'score': base_score,
                    'geo_point': r.__dict__
                }
            return lscore_map

        for locpt in loc_cand:
            loc_rankmap[locpt] = get_realization_score(loc_cand[locpt])
        return loc_rankmap
class SteinerGeo():
    """Geocoder that disambiguates place names via a Steiner-tree approximation.

    Each named-entity mention becomes a terminal node; candidate gazetteer
    expansions link terminals to city/admin1/country nodes and a shared sink
    "E". The approximate Steiner tree over this graph selects one expansion
    per mention, and the terminal with the highest ego-network degree is the
    document's geographic focus.
    """

    def __init__(self, db, nerKeyMap=None, spacy=False):
        """
        db        -- path/handle for the GeoNames gazetteer backend.
        nerKeyMap -- optional mapping of NER tag -> canonical category; any
                     missing canonical keys are filled in from the defaults
                     (note: a caller-supplied dict is mutated in place).
        spacy     -- if True, also map spaCy tag names (GPE/NORP/ORG/LOC)
                     onto the canonical categories.
        """
        self.gazetteer = GeoNames(db, confMethod='Uniform', escore=False)
        DEFAULT_NER_MAP = {
            'LOCATION': 'LOCATION',
            'ORGANIZATION': 'ORGANIZATION',
            'NATIONALITY': 'NATIONALITY',
            'OTHER': 'OTHER',
            'PERSON': 'PERSON'
        }
        if nerKeyMap is None:
            nerKeyMap = DEFAULT_NER_MAP
        else:
            # Fill gaps without overriding caller-provided mappings.
            for key in DEFAULT_NER_MAP:
                if key not in nerKeyMap:
                    nerKeyMap[key] = DEFAULT_NER_MAP[key]
        if spacy is True:
            nerKeyMap['GPE'] = 'LOCATION'
            nerKeyMap['NORP'] = 'NATIONALITY'
            nerKeyMap['ORG'] = 'ORGANIZATION'
            nerKeyMap['LOC'] = 'LOCATION'
        self.nerKeyMap = nerKeyMap
        # Relative importance per canonical category; zero-weight categories
        # are ignored during entity collection in geocode().
        self.weightage = {
            "LOCATION": 1.0,
            "NATIONALITY": 0.75,
            "ORGANIZATION": 0.5,
            "OTHER": 0.0,
            "PERSON": 0.0
        }

    def geocode(self, doc):
        """Collect entity mentions from `doc`, expand them against the
        gazetteer, and run the Steiner-tree disambiguation.

        Returns (G, locdist, focus): the pruned graph, the per-mention
        expansion map, and the geographic-focus terminal (or []).
        """
        entities = defaultdict(list)
        # Only NER tags whose canonical category carries positive weight.
        NAMED_ENTITY_TYPES_TO_CHECK = [
            key for key in self.nerKeyMap
            if self.weightage[self.nerKeyMap[key]] > 0
        ]
        # Side-effect comprehension: bucket each qualifying mention (digits
        # stripped, split on commas) under its canonical category.
        _ = [
            entities[self.nerKeyMap[l['neType']]].extend(
                (x.strip() for x in numstrip.sub("", l['expr']).split(",")))
            for l in doc['BasisEnrichment']['entities']
            if (len(l['expr']) > 2) and (
                l['neType'] in NAMED_ENTITY_TYPES_TO_CHECK)
        ]
        idmap = {}
        # Country codes seen via unambiguous resolutions; later used to
        # constrain fuzzy re-queries.
        cc = set()
        for loc in entities['LOCATION']:
            loc = loc.lower()
            if loc in idmap:
                idmap[loc]['count'] += 1
            else:
                expansions = self.gazetteer.query(loc)
                resolved = False
                if len(expansions) == 1:
                    # A single candidate counts as resolved and pins a country.
                    resolved = True
                    cc.add(expansions[0].countryCode.lower())
                idmap[loc] = {
                    'expansions': {exp.geonameid: exp for exp in expansions},
                    'resolved': resolved,
                    'count': 1
                }
        # check if any organization is talking about a country
        organization_checklist = {}
        for org in (entities['ORGANIZATION'] + entities.get('NATIONALITY', [])):
            # Skip all-caps acronyms (e.g. "UN", "NATO") — too ambiguous.
            if org.isupper():
                continue
            org = org.lower()
            # featureCode='pcli' restricts the fuzzy match to country entries.
            country = self.gazetteer.query(org,
                                           fuzzy='AUTO',
                                           featureCode='pcli',
                                           operator='or')
            if country:
                cc.add(country[0].countryCode.lower())
                if org in idmap:
                    idmap[org]['count'] += 1
                else:
                    idmap[org] = {
                        'expansions': {exp.geonameid: exp for exp in country},
                        'resolved': True,
                        'count': 1
                    }
            else:
                # Not a country; remember it for a constrained re-query later.
                if org in organization_checklist:
                    organization_checklist[org] += 1
                else:
                    organization_checklist[org] = 1
        locdist = idmap
        if cc:
            # Re-query unresolved names restricted to the observed countries.
            locdist = self.fuzzyquery(idmap, organization_checklist, tuple(cc))
        #self.locdist = locdist
        #return locdist
        G, focus = self.steiner_tree_approx(locdist)
        return G, locdist, focus

    def annotate(self, doc):
        """Run geocode() on `doc` and attach the results in place.

        Writes doc['location_distribution'] (mention -> chosen expansion's
        attribute dict, read off the terminal's single Steiner-tree neighbor)
        and doc['embersGeoCode'] (the focus mention's entry, or {}).
        """
        #stG, locdist
        stG, locdist, focus = self.geocode(doc)
        doc['location_distribution'] = {
            # Terminal nodes are named unidecode(mention + "T0"); the chosen
            # expansion is the terminal's neighbor in the pruned tree.
            # NOTE(review): `.__next__()` assumes exactly one neighbor and is
            # Python-3 syntax within an otherwise Python-2 file — confirm.
            loc: locdist[loc]['expansions'][stG.neighbors(
                unidecode(loc + u"T0")).__next__()].__dict__
            for loc in locdist if locdist[loc]['expansions']
        }
        if focus:
            # focus[0] is the terminal node name; strip the "T0" suffix to
            # recover the mention key.
            doc['embersGeoCode'] = doc['location_distribution'][focus[0][:-2]]
        else:
            doc['embersGeoCode'] = {}
        self.graph = stG
        return doc

    def steiner_tree_approx(self, locationMap):
        """Build the mention/expansion graph and take its approximate
        Steiner tree over the terminal nodes.

        Returns (G, focus) where G is the directed graph restricted to the
        Steiner-tree nodes and focus is the top (terminal, ego-degree) pair,
        or [] when the graph is empty or has no terminals.
        """
        G = nx.DiGraph()
        # "E" is a shared sink connecting every country, keeping the graph
        # connected so one Steiner tree spans all mentions.
        terminalNodes = ["E"]
        for loc in locationMap:
            for rl in locationMap[loc]['expansions'].values():
                #eW = (2 - FEATURE_WEIGHTS.get(rl.featureCode, 0.00))
                # Unknown feature codes get a high (unattractive) weight of 12.
                eW = FEATURE_WEIGHTS.get(rl.featureCode, 12)
                nodename = unidecode(loc + u"T0")
                if rl.ltype == 'country':
                    edges = [(nodename, rl.geonameid, eW),
                             (rl.geonameid, rl.country, eW),
                             (rl.country, 'E', eW)]
                elif rl.ltype == 'admin1':
                    edges = [(nodename, rl.geonameid, eW),
                             (rl.geonameid, rl.admin1, eW),
                             (rl.admin1, rl.country, eW),
                             (rl.country, 'E', eW)]
                else:
                    #edges = [(loc + "T0", rl.geonameid, eW), (rl.geonameid, rl.name, eW), (rl.name, rl.admin1, eW), (rl.admin1, rl.country, eW), (rl.country, 'E', eW)]
                    edges = [(nodename, rl.geonameid, eW),
                             (rl.geonameid, rl.admin1, eW),
                             (rl.admin1, rl.country, eW),
                             (rl.country, 'E', eW)]
                G.add_weighted_edges_from(edges)
                terminalNodes.append(nodename)
        if G.number_of_nodes() == 0:
            return G, []
        # Steiner tree is defined for undirected graphs; approximate it, then
        # cut the original directed graph down to the tree's nodes.
        stG = approximation.steiner_tree(G.to_undirected(), terminalNodes)

        def ego_nw_degree(degree, node):
            # Total degree of everything reachable from `node` — a proxy for
            # how central the mention's resolution is in the document.
            return sum((degree(p) for p in nx.descendants(G, node)))

        G = G.subgraph(stG)
        degree = G.degree()
        # NOTE(review): sorted(..., reverse=True) with no key sorts the
        # (terminal-name, degree) tuples by NAME first, not by degree —
        # likely intended key=itemgetter(1); confirm before relying on focus.
        geofocus = sorted([(t, ego_nw_degree(degree, t))
                           for t in terminalNodes[1:]],
                          reverse=True)
        return G, geofocus[0] if geofocus else []

    def fuzzyquery(self, locmap, orgChecklist, countryFilter=[]):
        """Re-query unresolved mentions and leftover organizations with the
        observed-country constraint, updating `locmap` in place.

        NOTE(review): `countryFilter=[]` is a mutable default argument
        (never mutated here, but fragile); callers pass a tuple of codes.
        """
        for loc in locmap:
            if locmap[loc]['resolved'] is False:
                subres = self.gazetteer.query(loc,
                                              countryCode=countryFilter,
                                              fuzzy='AUTO')
                new_exp = {res.geonameid: res for res in subres}
                if new_exp:
                    # locmap[loc]['expansions'].update(new_exp)
                    # NOTE(review): this REPLACES the prior expansions rather
                    # than merging (the update() call above was disabled).
                    locmap[loc]['expansions'] = (new_exp)
        for org in orgChecklist:
            subres = self.gazetteer.query(org, countryCode=countryFilter)
            locmap[org] = {
                "expansions": {res.geonameid: res for res in subres},
                "resolved": len(subres) == 1,
                "count": orgChecklist[org]
            }
        return locmap
class PrepareTraining(object):
    """Builds feature matrices for training a geocoding ranker.

    Expands entity mentions against the gazetteer, records token-offset
    statistics between mentions, matches candidate expansions against GSR
    ground-truth event locations, and emits per-realization feature dicts
    plus boolean labels.
    """

    def __init__(self, db, min_popln=0, min_length=1):
        """
        db         -- GeoNames gazetteer source.
        min_popln  -- minimum population filter for queries.
        min_length -- minimum character length for a mention to be considered.
        """
        self.gazetteer = GeoNames(db)
        self.min_popln = min_popln
        self.min_length = min_length
        # Zero-weight categories are skipped when collecting entities.
        self.weightage = {
            "LOCATION": 1.0,
            "NATIONALITY": 0.75,
            "ORGANIZATION": 0.5,
            "OTHER": 0.0
        }

    def geocode(self, doc=None, loclist=None, eKey='BasisEnrichment', **kwargs):
        """Collect (text, neType, normalized-offset) triples for weighted
        entity types, plus (text, offset) pairs for PERSON mentions, and
        delegate to geocode_fromList().

        The offset is the midpoint of the entity's token span divided by the
        document length, i.e. a position in [0, 1].

        NOTE(review): `doclength` is only bound when `doc` is not None; a
        loclist-only call would raise NameError at the return — confirm
        callers always pass a doc.
        """
        locTexts, persons = [], []
        NAMED_ENTITY_TYPES_TO_CHECK = [
            key for key in self.weightage if self.weightage[key] > 0
        ]
        if doc is not None:
            doclength = len(doc[eKey]['tokens'])
            locTexts += [
                (numstrip.sub("", l['expr'].lower()).strip(), l['neType'],
                 (sum([int(_) for _ in l['offset'].split(":")])) /
                 (2.0 * doclength)) for l in doc[eKey]["entities"]
                if ((l["neType"] in NAMED_ENTITY_TYPES_TO_CHECK) and
                    len(l['expr']) >= self.min_length)
            ]
            persons = [
                (numstrip.sub("", l['expr'].lower()).strip(),
                 (sum([int(_) for _ in l['offset'].split(":")])) /
                 (2.0 * doclength)) for l in doc[eKey]["entities"]
                if ((l["neType"] == "PERSON") and
                    len(l['expr']) >= self.min_length)
            ]
        if loclist is not None:
            locTexts += [l.lower() for l in loclist]
        return self.geocode_fromList(locTexts, persons, doclength=doclength,
                                     **kwargs)

    def geocode_fromList(self, locTexts, persons, results=None,
                         min_popln=None, **kwargs):
        """Expand each mention, track offsets/indexes per surface form, pick
        a dominant country, and score everything.

        Returns (results, freqsheet, locTexts, meta_entInfo, offset_diffmat,
        persons_res); offset_diffmat is the pairwise offset-difference matrix
        used later for proximity features.
        """
        if results is None:
            results = {}
        if min_popln is None:
            min_popln = self.min_popln
        # Per-surface-form bookkeeping: offsets, NER type, mention indexes.
        meta_entInfo = {}
        # Country codes of unambiguously resolved mentions.
        realized_countries = []
        idx = 0
        offsetmat = []
        for entitem in locTexts:
            querytext, enttype, offset = entitem
            if isempty(querytext):
                continue
            if querytext in results:
                # Repeat mention of an already-expanded form.
                results[querytext].frequency += 1
                meta_entInfo[querytext]["offsets"].append(offset)
                meta_entInfo[querytext]["neType"] = (enttype)
                meta_entInfo[querytext]["indexes"].append(idx)
                offsetmat.append(offset)
            else:
                # Comma-separated mentions are split into sub-mentions; each
                # sub-mention gets a slightly shifted offset so they remain
                # distinguishable in the difference matrix.
                for subidx, substr in enumerate(querytext.split(",")):
                    substr = substr.strip()
                    if substr in results:
                        results[substr].frequency += 1
                        meta_entInfo[substr]["offsets"].append(
                            offset + float(subidx) / kwargs['doclength'])
                        meta_entInfo[substr]["neType"] = (enttype)
                        meta_entInfo[substr]["indexes"].append(idx + subidx)
                        offsetmat.append(offset +
                                         float(subidx) / kwargs['doclength'])
                        continue
                    if substr not in meta_entInfo:
                        meta_entInfo[substr] = {
                            "offsets":
                            [offset + float(subidx) / kwargs['doclength']],
                            "neType": enttype,
                            "indexes": [idx + subidx]
                        }
                        offsetmat.append(offset +
                                         float(subidx) / kwargs['doclength'])
                    else:
                        meta_entInfo[substr]["offsets"].append(
                            offset + float(subidx) / kwargs['doclength'])
                        meta_entInfo[substr]["neType"] = (enttype)
                        meta_entInfo[substr]["indexes"].append(idx + subidx)
                        offsetmat.append(offset +
                                         float(subidx) / kwargs['doclength'])
                    ld = self._queryitem(substr, meta_entInfo[substr]["neType"])
                    # Non-LOCATION types are only kept when the gazetteer
                    # actually found something.
                    if meta_entInfo[substr][
                            "neType"] != "LOCATION" and ld.isempty():
                        continue
                    results[substr] = ld
                    if len(results[substr].realizations) == 1:
                        realized_countries.append(
                            list(results[substr].realizations.values())[0]
                            ['countryCode'].lower())
                    results[substr].frequency = 1
                # Account for the extra sub-mentions consumed above.
                idx += subidx
            idx += 1
        offsetmat = np.array(offsetmat)
        # Pairwise differences: offset_diffmat[i, j] = offset_i - offset_j.
        offset_diffmat = offsetmat[:, np.newaxis] - offsetmat
        realized_countries = Counter(realized_countries)
        co_realized = float(sum(realized_countries.values()))
        # NOTE(review): this majority-vote list is dead code — it is
        # unconditionally overwritten by the most_common() result below.
        selco = [
            kl for kl, vl in realized_countries.viewitems()
            if float(vl / co_realized) >= 0.5
        ]
        try:
            selco = realized_countries.most_common(n=1)[0][0]
        except:
            # NOTE(review): bare except; most_common() on an empty Counter
            # returns [] so this guards the [0] indexing.
            selco = []
        persons_res = {}
        for entitem in persons:
            querytext, offset = entitem
            if querytext not in persons_res:
                # Offset of every location mention relative to this person.
                diffs = offsetmat - offset
                persons_res[querytext] = {
                    "expansions":
                    self._queryitem(querytext, "LOCATION", countryCode=selco),
                    "offset": diffs,
                    "freq": 1
                }
            else:
                persons_res[querytext]["freq"] += 1
        if not isempty(selco):
            results = self.fuzzyquery(results, countryFilter=selco)
        freqsheet = self.score(results, meta_entInfo)
        return (results, freqsheet, locTexts, meta_entInfo, offset_diffmat,
                persons_res)

    def _queryitem(self, item, itemtype, **kwargs):
        """Query the gazetteer for `item`; LOCATIONs get a plain query,
        everything else is tried as a country (pcli) then as an admin1
        region. Always returns a LocationDistribution (possibly empty).
        """
        if itemtype == "LOCATION":
            res = self.gazetteer.query(item, **kwargs)
        else:
            res = self.gazetteer.query(item,
                                       fuzzy='AUTO',
                                       featureCode='pcli',
                                       operator='or')
            if res == []:
                res = self.gazetteer.query(item,
                                           featureCode='adm1',
                                           operator='or')
        return LocationDistribution(res)

    def fuzzyquery(self, locmap, countryFilter=[]):
        """Fuzzy re-query of still-ambiguous names restricted to
        `countryFilter`, replacing the distribution but keeping frequency.

        NOTE(review): mutable default argument `countryFilter=[]` (never
        mutated here, but fragile).
        """
        for loc in locmap:
            if len(locmap[loc].realizations) != 1:
                freq = locmap[loc].frequency
                subres = self.gazetteer.query(loc,
                                              countryCode=countryFilter,
                                              fuzzy='AUTO')
                if subres != []:
                    locmap[loc] = LocationDistribution(subres)
                    locmap[loc].frequency = freq
        return locmap

    def score(self, results, metaInfo):
        """Aggregate, per administrative level (city/admin1/country), the
        mention frequency and the mention indexes contributing to each key.

        Frequencies are normalized by the total mention count; default
        factories are cleared so later missing-key lookups raise KeyError
        instead of silently inserting entries.
        """
        scoresheet = defaultdict(lambda: defaultdict(lambda: {
            "freq": 0.0,
            "offs_idx": []
        }))
        num_mentions = float(sum((l.frequency for l in results.values())))

        def update(key, l):
            offs = metaInfo[key]["indexes"]
            for s in l.city:
                scoresheet["city"][s]['freq'] += l.frequency
                scoresheet["city"][s]['offs_idx'] += (offs)
            for s in l.admin1:
                scoresheet["admin1"][s]["freq"] += l.frequency
                scoresheet["admin1"][s]['offs_idx'] += (offs)
            for s in l.country:
                scoresheet["country"][s]["freq"] += l.frequency
                scoresheet["country"][s]['offs_idx'] += (offs)

        # Side-effect comprehension (Python-2 style dict view iteration).
        _ = [update(key, val) for key, val in results.viewitems()]
        for typ in scoresheet:
            for s in scoresheet[typ]:
                scoresheet[typ][s]['freq'] /= num_mentions
            scoresheet[typ].default_factory = None
        scoresheet.default_factory = None
        return scoresheet

    def _builddoc(self, doc, enrichmentKeys=['BasisEnrichment'], **kwargs):
        """
        Attach embersGeoCode to document.

        Geocodes the document, matches candidate expansions against the
        document's GSR events, and stores the matched mention indexes on
        doc['match_indexes']. Returns the (mutated) document.
        """
        eKey = None
        for key in enrichmentKeys:
            if key in doc and doc[key]:
                eKey = key
        if eKey is None:
            return doc
        (all_exp_locs, freqsheet, loctexts, metaInfo, offsdiffmat,
         persons_res) = self.geocode(doc)
        label_locs, Xmat, Ymat, pers_data, idxes = self.matchwithGSRLocs(
            doc, all_exp_locs, persons_res, offsdiffmat, freqsheet, metaInfo)
        doc['match_indexes'] = idxes
        # Xmat, Ymat = [], []
        # for loc in all_exp_locs:
        #     if all_exp_locs[loc].haslabel is True:
        #         for r in all_exp_locs[loc].realizations.values():
        #             Ymat.append(r.label)
        #             Xmat.append(self.build_featuremat(r, offsdiffmat, freqsheet))
        # label_locs, freqsheet, loctexts, metaInfo, offsdiffmat,
        return doc
        # return Xmat, Ymat, pers_data

    def build_trainingdata(self, docs):
        """Concatenate per-document feature/label matrices.

        NOTE(review): _builddoc() currently returns the document, not an
        (x, y) pair — this unpacking would fail as written; confirm which
        return statement in _builddoc is intended to be live.
        """
        xmat, ymat = [], []
        for doc in docs:
            x, y = self._builddoc(doc)
            xmat += x
            ymat += y
        return xmat, ymat

    def calc_offset_stats(self, indices, diffmat):
        """Summarize pairwise offset differences for a set of mentions.

        Returns (mean |diff|, min |diff|, closest-preceding diff,
        closest-following diff); all 1s when there are no nonzero pairs.
        NOTE(review): despite the name `medval`, np.mean (not median) is used.
        """
        # Lower triangle only, so each unordered pair is counted once.
        tril = np.tril(diffmat[indices])
        ntril = tril[np.nonzero(tril)]
        abstril = np.abs(ntril)
        if abstril.shape[0] == 0:
            return 1, 1, 1, 1
        abs_minval = np.min(abstril)
        medval = np.mean(abstril)
        try:
            before_closest = np.min(ntril[ntril > 0])
        except:
            before_closest = 1
        try:
            after_closest = abs(np.max(ntril[ntril < 0]))
        except:
            after_closest = 1
        return medval, abs_minval, before_closest, after_closest

    def _single_build_featuremat(self, realization, diffmat, freqsheet):
        """Feature dict for one candidate realization: normalized country/
        state/city frequencies, population confidence, and offset statistics
        at each level. Key formats ("country//", "admin/") must match score().
        """
        country = realization.country
        admin = "/".join([country, realization.admin1])
        city = "/".join(
            [admin, getattr(realization, "admin2", "") or realization.city])
        featureCode = realization.featureCode
        offs = freqsheet["country"][country + "//"]["offs_idx"]
        # np.ix_ builds the cross-product index grid for the sub-matrix.
        co_offset = self.calc_offset_stats(np.ix_(offs, offs), diffmat)
        offs = freqsheet["admin1"][admin + "/"]["offs_idx"]
        st_offset = self.calc_offset_stats(np.ix_(offs, offs), diffmat)
        if realization.featureCode[:3] not in ("adm1", "pcli"):
            # NOTE(review): featureCode[:3] can never equal the 4-char
            # strings "adm1"/"pcli" — likely meant [:4]; confirm.
            try:
                offs = freqsheet["city"][city]["offs_idx"]
                ci_offset = self.calc_offset_stats(np.ix_(offs, offs), diffmat)
                cifreq = freqsheet["city"][city]["freq"]
            except:
                ci_offset = [1, 1, 1, 1]
                cifreq = 0
        else:
            ci_offset = [0, 0, 0, 0]
            cifreq = 0
        return {
            "country": freqsheet["country"][country + "//"]["freq"],
            "state": freqsheet["admin1"][admin + "/"]["freq"],
            "city": cifreq,
            "poplnConf": realization.poplnConf,
            "co_Offmean": co_offset[0],
            "co_Offmin": co_offset[1],
            "co_prev": co_offset[2],
            "co_after": co_offset[3],
            "st_offmean": st_offset[0],
            "st_offmin": st_offset[1],
            "st_prev": st_offset[2],
            "st_after": st_offset[3],
            "ci_offmean": ci_offset[0],
            "ci_offmin": ci_offset[1],
            "ci_prev": ci_offset[2],
            "ci_after": ci_offset[3]
        }

    def build_persmat(self, realization, meta_info, freqsheet):
        """Feature dict for a PERSON-derived candidate: same layout as
        _single_build_featuremat, but offsets come from the person's
        precomputed relative-offset vector and missing levels fall back to
        the person's own frequency.
        """
        country = realization.country
        admin = "/".join([country, realization.admin1])
        city = "/".join(
            [admin, getattr(realization, "admin2", "") or realization.city])
        featureCode = realization.featureCode
        co_offset = self.calc_offset_stats(
            freqsheet["country"][country + "//"]["offs_idx"],
            meta_info['offset'])
        if (admin + "/") in freqsheet["admin1"]:
            st_offset = self.calc_offset_stats(
                freqsheet["admin1"][admin + "/"]["offs_idx"],
                meta_info["offset"])
            st_freq = freqsheet["admin1"][admin + "/"]["freq"]
        else:
            st_offset = [1, 1, 1, 1]
            st_freq = meta_info['freq']
        if realization.featureCode[:3] not in ("adm1", "pcli"):
            # NOTE(review): same [:3]-vs-4-char-literal comparison as in
            # _single_build_featuremat — confirm intent.
            if city in freqsheet["city"]:
                ci_offset = self.calc_offset_stats(
                    freqsheet["city"][city]["offs_idx"], meta_info["offset"])
                cifreq = freqsheet["city"][city]["freq"]
            else:
                ci_offset = [1, 1, 1, 1]
                cifreq = meta_info["freq"]
        else:
            ci_offset = [0, 0, 0, 0]
            cifreq = 0
        return {
            "country": freqsheet["country"][country + "//"]["freq"],
            "state": st_freq,
            "city": cifreq,
            "poplnConf": realization.poplnConf,
            "co_Offmean": co_offset[0],
            "co_Offmin": co_offset[1],
            "co_prev": co_offset[2],
            "co_after": co_offset[3],
            "st_offmean": st_offset[0],
            "st_offmin": st_offset[1],
            "st_prev": st_offset[2],
            "st_after": st_offset[3],
            "ci_offmean": ci_offset[0],
            "ci_offmin": ci_offset[1],
            "ci_prev": ci_offset[2],
            "ci_after": ci_offset[3]
        }

    #self.build_persmat(persons_res[loc].realizations[x])
    def build_featuremat(self, loc, *args):
        """Build (features, label) rows for every realization of one
        mention's LocationDistribution.
        """
        xmat, ymat = [], []
        for real in loc.realizations:
            x = self._single_build_featuremat(loc.realizations[real], *args)
            y = loc.realizations[real].label
            xmat.append(x)
            ymat.append(y)
        return xmat, ymat

    def matchwithGSRLocs(self, doc, all_exp_locs, persons_res, offsdiffmat,
                         freqsheet, metaInfo):
        """Label candidate expansions against GSR ground-truth locations.

        Builds the set of "country/state/city" strings from doc['events']
        (optionally expanding them through the gazetteer), marks each
        realization True/False by membership, and assembles feature/label
        matrices for labeled mentions plus PERSON candidates.

        Returns (all_exp_locs, xmat, ymat, pers_data, matched_idx).
        """
        locstrings = set()
        matched_idx = []
        for evt in doc['events']:
            estr = u"/".join(
                [evt['Country'], evt['State'], evt['City'].lower()])
            locstrings.add(estr.lower())
            if 'expanded_loc' in evt:
                try:
                    loc = self.gazetteer.get_locInfo(country=evt['Country'],
                                                     admin=evt['State'],
                                                     city=evt["City"])
                    evt['expanded_loc'] = loc
                except Exception as e:
                    # Best-effort expansion; keep whatever was there.
                    pass
            if "expanded_loc" in evt:
                for loc in evt['expanded_loc']:
                    gp = GeoPoint(**loc)
                    lstr = "/".join([
                        gp.country, gp.admin1,
                        (getattr(gp, "admin2", "") or gp.city)
                    ])
                    locstrings.add(lstr.lower())
        matched_locs = set()
        xmat, ymat = [], []
        for loc in all_exp_locs:
            all_exp_locs[loc].haslabel = False
            for x in all_exp_locs[loc].realizations:
                if x.lower() in locstrings:
                    all_exp_locs[loc].realizations[x].label = True
                    all_exp_locs[loc].haslabel = True
                    matched_locs.add(x.lower())
                    matched_idx.append([loc, metaInfo[loc]['indexes']])
                else:
                    all_exp_locs[loc].realizations[x].label = False
            # Only mentions with at least one true realization contribute
            # training rows.
            if all_exp_locs[loc].haslabel:
                x, y = self.build_featuremat(all_exp_locs[loc], offsdiffmat,
                                             freqsheet)
                xmat.append(x)
                ymat.append(y)
        # GSR locations not matched by any mention may still be matched via
        # PERSON-derived candidates below.
        remaininglocs = locstrings - matched_locs
        pers_data = [[], []]
        for loc in persons_res:
            persons_res[loc]['expansions'].haslabel = False
            for x in persons_res[loc]["expansions"].realizations:
                d1 = self.build_persmat(
                    persons_res[loc]["expansions"].realizations[x],
                    persons_res[loc], freqsheet)
                pers_data[0].append(d1)
                if x.lower() in remaininglocs:
                    persons_res[loc]["expansions"].haslabel = True
                    persons_res[loc]["expansions"].realizations[x].label = True
                    matched_idx.append([loc, None])
                    pers_data[1].append(True)
                else:
                    pers_data[1].append(False)
        return all_exp_locs, xmat, ymat, pers_data, matched_idx