Exemplo n.º 1
0
    def get_locations_fromURL(self, url):
        """
        Extract location hints from a URL.

        Two hints are looked up: the country implied by a two-letter
        top-level domain, and the URL SUBJECT, i.e. the first path segment
        such as taiwan in 'cnn.com/taiwan/protest.html'.

        Params:
            url - a web url

        Returns:
            Dict of locations obtained from URL, keyed by
            'URL-DOMAIN_<country>' and/or 'URL-SUBJECT_<subject>'. The dict
            is empty when the URL has no network location or nothing matched.
        """
        results = {}
        urlinfo = urlparse(url)
        if urlinfo.netloc == "":
            return results

        # A URL without a path ('http://cnn.com') splits into a single
        # element, so guard the index instead of raising IndexError.
        path_parts = urlinfo.path.split("/", 2)
        urlsubject = path_parts[1] if len(path_parts) > 1 else ""
        urlcountry = urlinfo.netloc.rsplit(".", 1)[-1]

        # Find URL DOMAIN Country from 2 letter iso-code
        if len(urlcountry.strip()) == 2:
            countries = self.gazetteer.get_country(urlcountry.upper())
            if countries != []:
                country = countries[0]
                country.confidence = 1.0
                domain_key = "URL-DOMAIN_{}".format(country)
                results[domain_key] = LocationDistribution(country)
                results[domain_key].frequency = 1

        # Only subjects of 6 to 19 characters are looked up.
        if 5 < len(urlsubject) < 20:
            usubj_q = self.gazetteer.query(urlsubject, 15000)
            if usubj_q:
                subject_key = "URL-SUBJECT_{}".format(urlsubject)
                results[subject_key] = LocationDistribution(usubj_q)
                results[subject_key].frequency = 1
        return results
Exemplo n.º 2
0
 def fuzzyquery(self, locmap, countryFilter=None):
     """
     Re-query entries of locmap that do not have exactly one realization.

     Each such entry is looked up again in the gazetteer with fuzzy='AUTO';
     when the fuzzy query returns results, the entry is replaced by a new
     LocationDistribution that keeps the original frequency.

     Params:
         locmap - dict mapping location text to a distribution object
                  exposing `realizations` and `frequency`
         countryFilter - value passed as countryCode to the gazetteer;
                         defaults to a fresh empty list per call

     Returns:
         The same locmap object, updated in place.
     """
     # A literal [] default would be one list shared by every call and
     # handed to the gazetteer, so build a new one each time instead.
     if countryFilter is None:
         countryFilter = []

     for loc in locmap:
         if len(locmap[loc].realizations) == 1:
             continue
         freq = locmap[loc].frequency
         subres = self.gazetteer.query(loc,
                                       countryCode=countryFilter,
                                       fuzzy='AUTO')
         if subres != []:
             locmap[loc] = LocationDistribution(subres)
             locmap[loc].frequency = freq
     return locmap
Exemplo n.º 3
0
 def fuzzyquery(self, locmap, countryFilter=None):
     """
     Re-query entries of locmap that do not have exactly one realization.

     Each such entry is looked up again in the gazetteer with fuzzy='AUTO'.
     When the fuzzy query returns results, they are combined with the
     entry's existing realizations, passed through the gazetteer's
     _get_loc_confidence together with self.min_popln, and the entry is
     replaced by a new LocationDistribution that keeps the original
     frequency.

     Params:
         locmap - dict mapping location text to a distribution object
                  exposing `realizations` (a dict) and `frequency`
         countryFilter - value passed as countryCode to the gazetteer;
                         defaults to a fresh empty list per call

     Returns:
         The same locmap object, updated in place.
     """
     # A literal [] default would be one list shared by every call and
     # handed to the gazetteer, so build a new one each time instead.
     if countryFilter is None:
         countryFilter = []

     for loc in locmap:
         if len(locmap[loc].realizations) == 1:
             continue
         freq = locmap[loc].frequency
         subres = self.gazetteer.query(loc,
                                       countryCode=countryFilter,
                                       fuzzy='AUTO')
         if subres != []:
             # list() keeps the concatenation valid when values() returns
             # a view rather than a list.
             pts = subres + list(locmap[loc].realizations.values())
             ldist = self.gazetteer._get_loc_confidence(
                 pts, self.min_popln)
             locmap[loc] = LocationDistribution(ldist)
             locmap[loc].frequency = freq
     return locmap
Exemplo n.º 4
0
    def geocode_fromList(self, locTexts, results=None, min_popln=None, **kwargs):
        """
        Geocode a list of location mentions and pick the best overall point.

        Params:
            locTexts - iterable of location strings or (text, type) tuples;
                       plain strings get the type 'LOCATION' and empty
                       strings are skipped
            results - optional dict of text -> LocationDistribution that is
                      updated in place; a new dict is used when omitted
            min_popln - min_popln value passed to the gazetteer query;
                        defaults to self.min_popln
            **kwargs - forwarded to self.gazetteer.query

        Returns:
            Tuple (lmap, geo_point). lmap maps each ranked text to its
            best-scoring entry. geo_point is the 'geo_point' of the entry
            with the highest score weighted by self.weightage for the
            text's type, or {} when nothing was scored or ranked.
        """
        if results is None:
            results = {}

        if min_popln is None:
            min_popln = self.min_popln

        itype = {}
        for l in locTexts:
            if l == "":
                continue
            if isinstance(l, tuple):
                itype[l[0]] = l[1]
                l = l[0]
            else:
                itype[l] = 'LOCATION'
            try:
                if l in results:
                    results[l].frequency += 1
                    continue
                for sub in l.split(","):
                    sub = sub.strip()
                    if sub in results:
                        results[sub].frequency += 1
                        continue
                    # Comma-separated parts inherit the type of the full text.
                    itype[sub] = itype[l]
                    try:
                        query = self.gazetteer.query(sub, min_popln=min_popln, **kwargs)
                        distribution = LocationDistribution(query)
                    except UnicodeDecodeError:
                        # Log and skip this part rather than dropping into a
                        # debugger and then failing on the missing entry.
                        log.exception("Unable to make query for string - {}".format(encode(sub)))
                        continue
                    distribution.frequency = 1
                    results[sub] = distribution
            except UnicodeDecodeError:
                log.exception("Unable to make query for string - {}".format(encode(l)))

        scores = self.score(results)
        lrank = self.get_locRanks(scores, results)
        lmap = {}
        for text in lrank:
            if lrank[text] == {}:
                continue
            lmap[text] = max(lrank[text].values(), key=lambda y: y['score'])

        # max() raises on an empty sequence, so bail out when nothing ranked.
        if not scores or not lmap:
            return lmap, {}

        total_weight = sum([self.weightage[itype.get(key, 'OTHER')] for key in lmap])

        def weighted_score(item):
            text, ranked = item
            return ranked['score'] * self.weightage[itype.get(text, 'OTHER')] / total_weight

        best = max(lmap.items(), key=weighted_score)
        return lmap, best[1]['geo_point']
Exemplo n.º 5
0
    def geocode(self, doc):
        """
        Geocode a document from its URL and its LOCATION entities.

        Params:
            doc - dict with a "url" string and
                  doc["BasisEnrichment"]['entities'], a list of entity dicts
                  carrying 'neType', 'expr' and an 'offset' of the form
                  'start:end'

        Returns:
            Tuple (lmap, egeo). lmap maps each resolved text with a
            non-empty distribution to its highest-scoring realization.
            egeo is the largest value in the mapping returned by
            self.score, or {} when that mapping is empty.
        """
        def getEntityDetails(entity):
            """
            return entity string, starting offset, coverage start point,
            coverage end point
            """
            start, end = entity['offset'].split(":")
            start, end = int(start), int(end)
            return (entity['expr'], start, start - self.coverageLength,
                    end + self.coverageLength)

        urlinfo = urlparse(doc["url"])
        loc_results = {}
        locTexts = [
            getEntityDetails(l) for l in doc["BasisEnrichment"]['entities']
            if l['neType'] == 'LOCATION'
        ]
        if urlinfo.netloc != "":
            # A URL without a path ('http://cnn.com') splits into a single
            # element, so guard the index instead of raising IndexError.
            path_parts = urlinfo.path.split("/", 2)
            urlsubject = path_parts[1] if len(path_parts) > 1 else ""
            urlcountry = urlinfo.netloc.rsplit(".", 1)[-1]
            if len(urlcountry.strip()) == 2:
                countries = self.gazetteer.get_country(urlcountry.upper())
                if countries != []:
                    country = countries[0]
                    country.confidence = 1.0
                    loc_results["url"] = LocationDistribution(country)
                    loc_results["url"].frequency = 1
            if len(urlsubject) < 20:
                # The URL subject has no text offsets, hence the -1 markers.
                locTexts.insert(0, (urlsubject, -1, -1, -1))

        loc_results.update(self.query_gazetteer(self.group(locTexts)))

        scores = self.score(loc_results)

        def best_realization(distribution):
            return max(distribution.realizations.values(),
                       key=lambda point: scores[point.__str__()])

        lmap = {
            l: best_realization(loc_results[l]['geo-point'])
            for l in loc_results if not loc_results[l]['geo-point'].isEmpty()
        }
        egeo = {}
        if scores:
            egeo = scores[max(scores, key=lambda name: scores[name])]
        return lmap, egeo
Exemplo n.º 6
0
    def _queryitem(self, item, itemtype, **kwargs):
        """
        Query the gazetteer for item and wrap the result.

        Items of type "LOCATION" are queried with the caller's keyword
        arguments. Any other type ignores kwargs: it is first queried with
        fuzzy matching restricted to featureCode 'pcli', and if that yields
        an empty list it is queried again with featureCode 'adm1'.

        Returns:
            LocationDistribution built from the query result.
        """
        lookup = self.gazetteer.query
        if itemtype == "LOCATION":
            return LocationDistribution(lookup(item, **kwargs))

        matches = lookup(item, fuzzy='AUTO', featureCode='pcli', operator='or')
        if matches == []:
            # Nothing found for 'pcli'; retry with 'adm1'.
            matches = lookup(item, featureCode='adm1', operator='or')
        return LocationDistribution(matches)
Exemplo n.º 7
0
    def query_gazetteer(self, lgroups):
        """
        Resolve groups of location texts against the gazetteer.

        For every group, each text is looked up, the lookups are narrowed
        with self.get_geoPoints_intersection, and the surviving texts are
        recorded together with a count of the groups they appeared in.

        Params:
            lgroups - iterable of groups, each an iterable of location texts

        Returns:
            Dict mapping text to {'geo-point': LocationDistribution,
            'frequency': int}.
        """
        gp_map = {}
        for grp in lgroups:
            candidates = {}
            for txt in grp:
                # NOTE(review): a text already in gp_map is reused as its
                # whole {'geo-point', 'frequency'} record, not as a query
                # result - confirm get_geoPoints_intersection accepts both.
                if txt in gp_map:
                    candidates[txt] = gp_map[txt]
                else:
                    candidates[txt] = self.gazetteer.query(txt)

            resolved = self.get_geoPoints_intersection(candidates)
            for name in resolved:
                if name not in gp_map:
                    gp_map[name] = {'geo-point': resolved[name], 'frequency': 1}
                else:
                    gp_map[name]['frequency'] += 1

        # Wrap the raw geo-points only once all groups are processed.
        for record in gp_map.values():
            record['geo-point'] = LocationDistribution(record['geo-point'])

        return gp_map
Exemplo n.º 8
0
    def geocode_fromList(self, locTexts, results=None, min_popln=None):
        """
        Geocode a list of location strings and pick the best overall point.

        Each text is queried as a whole first; only when that query returns
        nothing is it split on commas and each part queried separately.

        Params:
            locTexts - iterable of location strings
            results - optional dict of text -> LocationDistribution that is
                      updated in place; a new dict is used when omitted
            min_popln - min_popln value passed to the gazetteer query;
                        defaults to self.min_popln

        Returns:
            Tuple (lmap, geo_point). lmap maps each ranked text to its
            best-scoring entry. geo_point is the 'geo_point' of the
            highest-scoring entry in lmap, or {} when nothing was scored or
            ranked.
        """
        if results is None:
            results = {}

        if min_popln is None:
            min_popln = self.min_popln

        for l in locTexts:
            try:
                if l in results:
                    results[l].frequency += 1
                    continue

                q = self.gazetteer.query(l, min_popln=min_popln)
                if q:
                    results[l] = LocationDistribution(q)
                    results[l].frequency = 1
                    continue

                # No match for the full text: try its comma-separated parts.
                for sub in l.split(","):
                    sub = sub.strip()
                    if sub in results:
                        results[sub].frequency += 1
                    else:
                        results[sub] = LocationDistribution(
                            self.gazetteer.query(sub,
                                                 min_popln=min_popln))
                        results[sub].frequency = 1
            except Exception:
                # Best effort per text, but do not swallow KeyboardInterrupt
                # or SystemExit the way a bare except would.
                log.exception("Unable to make query for string - {}".format(
                    encode(l)))

        scores = self.score(results)
        lrank = self.get_locRanks(scores, results)
        lmap = {}
        for text in lrank:
            if lrank[text] == {}:
                continue
            lmap[text] = max(lrank[text].values(), key=lambda y: y['score'])

        # max() raises on an empty sequence, so bail out when nothing ranked.
        if not scores or not lmap:
            return lmap, {}

        best = max(lmap.values(), key=lambda x: x['score'])
        return lmap, best['geo_point']
Exemplo n.º 9
0
# Benchmark: time the same gazetteer queries through a thread pool and a
# gevent pool, then time wrapping the results in LocationDistribution.
# Timings use time.time() (wall clock). time.clock() was removed in
# Python 3.8 and reports CPU time on Unix, which leaves out the time the
# pools spend waiting on queries.
print("gpool created")
tp = ThreadPool(10)
print("tpool created")


#mp = (10)
#print "mpool created"
def t(l):
    """Query the gazetteer for one location text with min_popln=0."""
    return gn.query(l, min_popln=0)


##strt = time.time()
##s = mp.map(t, loclist)
##end = time.time()
##print "multiprocessing pool: {}".format(end-strt)

strt = time.time()
s = tp.map(t, loclist)
end = time.time()
print("Threadpool processing: {}".format(end - strt))

strt = time.time()
s = gp.map(t, loclist)
end = time.time()
print("gevent pool processing: {}".format(end - strt))

# Only the gevent pool's results are wrapped; they overwrite the thread
# pool's results above.
strt = time.time()
s = [LocationDistribution(l) for l in s]
end = time.time()
print("LD time:{}".format(end - strt))