예제 #1
    def searchAllSource(self, query, timeout=None):
        """
        Build a lazy source of FactualPlace results for a search-all query.

        query   -- search query object; this method reads its `kinds`,
                   `coordinates` and `query_string` attributes.
        timeout -- upper bound handed to the pool join when the location-aware
                   and location-free searches are run side by side.

        Returns `self.emptySource` when the query is restricted to kinds this
        source does not provide; otherwise a `generatorSource` wrapping a
        generator that yields one `FactualPlace` per raw Factual result.
        """
        if query.kinds is not None and len(query.kinds) > 0 and len(self.kinds.intersection(query.kinds)) == 0:
            logs.debug('Skipping %s (kinds: %s)' % (self.sourceName, query.kinds))
            return self.emptySource

        logs.debug('Searching %s...' % self.sourceName)

        def gen():
            try:
                # Shared accumulator filled in by getFactualSearch().
                raw_results = []

                def getFactualSearch(q, useLocation=False):
                    # Only pass coordinates when asked to and when available.
                    if useLocation and q.coordinates is not None:
                        results = self.__factual.search(q.query_string, coordinates=q.coordinates)
                    else:
                        results = self.__factual.search(q.query_string)
                    for result in results:
                        raw_results.append(result)

                if query.coordinates is not None:
                    # Issue the plain and the location-aware search together.
                    pool = Pool(2)
                    pool.spawn(getFactualSearch, query, False)
                    pool.spawn(getFactualSearch, query, True)
                    # NOTE(review): assumes Pool is gevent-style (spawn/join);
                    # without waiting here raw_results may still be empty.
                    pool.join(timeout=timeout)
                else:
                    # The helper returns nothing and appends to raw_results, so
                    # its return value must not be assigned back to the list.
                    getFactualSearch(query)

                for result in raw_results:
                    yield FactualPlace(data=result)
            except GeneratorExit:
                # The consumer closed the generator early; stop quietly.
                pass
        return generatorSource(gen(), constructor=FactualSearchAll)
예제 #2
 def validate(self, results):
     """
         Validates the search result set to ensure that there are no obvious
         duplicate results.

         Returns True if all results are unique within a fuzzy margin of error
         or False otherwise.
     """
     # Materialise the proxies as a list: they are indexed and iterated
     # repeatedly below (a lazy map object would not support that).
     proxies = [self.__stamped.proxyFromEntity(entity) for entity in results]

     # ensure that no result resolves definitively to any other result in the result set
     for i, proxy in enumerate(proxies):
         def dedup():
             # Candidates are all *other* proxies of the same kind.
             for j, proxy2 in enumerate(proxies):
                 if i != j and proxy.kind == proxy2.kind:
                     yield proxy2

         dups = self.__resolver.resolve(proxy, generatorSource(dedup()), count=1)

         if len(dups) > 0 and dups[0][0]['resolved']:
             return False

     # Maps each source-id key to the set of values already encountered.
     seen = defaultdict(set)

     # ensure that there are no obvious duplicate results without using the resolver
     for i, result in enumerate(results):
         keys = [k for k in result.sources if k.endswith('_id')]

         # ensure that the same source id doesn't appear twice in the result set
         # (source ids are supposed to be unique)
         for key in keys:
             value = str(result[key])

             if value in seen[key]:
                 return False

             # Record the id so that a later result carrying it is detected.
             seen[key].add(value)

         # Compare against later results only; earlier pairs were already checked.
         for j in range(i + 1, len(results)):
             result2 = results[j]

             if self._eq(result.kind, result2.kind) and self._eq(result.title, result2.title):
                 if len(result.types.intersection(result2.types)) > 0:
                     utils.log("!" * 80)
                     utils.log("dupe encountered: %s\n%s" % (result, result2))
                     utils.log("!" * 80)
                     return False

     return True
예제 #3
 def placeSource(self, query):
     """
         Returns a generatorSource that lazily yields one FactualPlace for each
         result of a Factual search on query.name.
     """
     def gen():
         try:
             results = self.__factual.search(query.name)
             for result in results:
                 yield FactualPlace(data=result)
         except GeneratorExit:
             # The consumer closed the generator early; stop quietly.
             pass
     return generatorSource(gen())
예제 #4
    def search(self,
               query,
               coords   = None,
               full     = True,
               local    = False,
               kinds    = None,
               types    = None,
               offset   = 0,
               limit    = 10):
        """
        Search all configured sources and merge their results into a single
        deduplicated list ordered by score.

        query  -- raw query, wrapped in a QuerySearchAll together with coords,
                  kinds, types and local.
        coords, kinds, types, local -- forwarded to QuerySearchAll; results are
                  afterwards filtered against query.kinds and query.types.
        full   -- when false, only the source named 'stamped' is consulted.
        offset -- forwarded to the per-source search helper.
        limit  -- maximum number of results to return; a falsy value means
                  "as many as were found".

        Returns a list of result entries in the order they were chosen. Each
        entry is indexed as entry[0]['total'] (its score) and entry[1] (the
        result object exposing .target, .name and .source).

        NOTE(review): `query` is read before it is assigned in the body, so it
        has been restored as the first parameter -- confirm against callers.
        """
        before  = time.time()
        query   = QuerySearchAll(query, coords, kinds, types, local)
        pool    = Pool(len(self._sources))
        results = []
        timeout = 6.5

        # NOTE: order is important here; e.g., we want to give precedence to
        # certain third-party APIs to begin their requests before others.
        for source in self._sources:
            if not full and source.sourceName != 'stamped':
                # ignore any external sources if full search is disabled
                continue

            # TODO: Make sure timeout gets passed through to source member functions.
            pool.spawn(self.__search_helper, query, limit, offset, source, results, timeout=timeout)

        # NOTE(review): assumes Pool is gevent-style (spawn/join); the helpers
        # fill `results`, so wait for them before reading it.
        pool.join(timeout=timeout)

        # Group the results that survive kind/type filtering by source name.
        all_results = {}
        total = 0

        for source_name, result in results:
            target = result[1].target

            if query.kinds is not None and target.kind not in query.kinds:
                logs.debug("Filtered out %s (kind=%s)" %
                          (result[1].name, target.kind))
                continue

            if query.types is not None and len(query.types.intersection(target.types)) == 0:
                logs.debug("Filtered out %s (types=%s)" %
                          (result[1].name, target.types))
                continue

            all_results.setdefault(source_name, []).append(result)
            total += 1

        for source_name, source_results in all_results.items():
            all_results[source_name] = sortedResults(source_results)

        print("\n\n\nGenerated %d results in %f seconds from: %s\n\n\n" % (
            total, time.time() - before, ' '.join([ '%s:%s' % (k, len(v)) for k,v in all_results.items()])))

        before2 = time.time()
        chosen  = []
        limit   = max(0, min(total, limit if limit else total))

        # Repeatedly take the highest scoring head among the per-source lists.
        while len(chosen) < limit:
            best_name = None
            best = None

            # Iterate over a snapshot because exhausted sources are removed.
            for name, source_results in list(all_results.items()):
                if len(source_results) == 0:
                    del all_results[name]
                    continue

                cur_best = source_results[0]

                if best is None or cur_best[0]['total'] > best[0]['total']:
                    best = cur_best
                    best_name = name
                elif _verbose:
                    print("skipped %s with value %s" % (name, cur_best[0]['total']))

            if best is None:
                # Every source is exhausted; duplicates may have left us short
                # of `limit`, so stop instead of spinning forever.
                break

            del all_results[best_name][0]

            if _verbose:
                print("Chose %s with value %s" % (best_name, best[0]['total']))

            cur = best[1]

            def dedup():
                # Only already chosen entries with identical types are candidates.
                for entry in chosen:
                    target = entry[1].target
                    if target.types == cur.target.types:
                        yield target

            dups = self.__resolver.resolve(cur.target, generatorSource(dedup()), count=1)

            if len(dups) > 0 and dups[0][0]['resolved']:
                if _verbose:
                    print("Discarded %s:%s as a duplicate to %s:%s" %
                          (cur.source, cur.name, dups[0][1].source, dups[0][1].name))
                    print(formatResults(dups[0:1], verbose=True))
            else:
                # Useful debugging aid if you find dupes in the search results:
                # if len(best) == 2 and len(dups) > 0:
                #     print("COMPARED %s:%s with %s:%s" %
                #           (cur.source, cur.name, dups[0][1].source, dups[0][1].name))
                #     print(formatResults(dups[0:2], verbose=True))
                chosen.append(best)

        if _verbose:
            print("\n\n\nDeduped %d results in %f seconds\n\n\n" % (total - len(chosen), time.time() - before2))

        return chosen