def searchAllSource(self, query, timeout=None):
    """
        Build a lazy source of FactualPlace results for a search-all query.

        query   -- query object; this method reads its `kinds`, `coordinates`
                   and `query_string` attributes.
        timeout -- upper bound (passed to `pool.join`) on how long to wait for
                   the two concurrent searches that are issued when the query
                   has coordinates. It is not applied to the single search
                   issued for a query without coordinates.

        Returns `self.emptySource` when the query restricts `kinds` to a set
        that does not intersect `self.kinds`; otherwise returns a
        `generatorSource` (constructed with `FactualSearchAll`) wrapping a
        generator that yields one `FactualPlace` per raw search result.
    """
    if query.kinds is not None and len(query.kinds) > 0 and len(self.kinds.intersection(query.kinds)) == 0:
        logs.debug('Skipping %s (kinds: %s)' % (self.sourceName, query.kinds))
        return self.emptySource

    logs.debug('Searching %s...' % self.sourceName)

    def gen():
        try:
            # Shared accumulator: getFactualSearch appends into this list
            # rather than returning its results.
            raw_results = []

            def getFactualSearch(q, useLocation=False):
                if useLocation and q.coordinates is not None:
                    results = self.__factual.search(q.query_string, coordinates=q.coordinates)
                else:
                    results = self.__factual.search(q.query_string)

                for result in results:
                    raw_results.append(result)

            if query.coordinates is not None:
                # Run a location-agnostic and a location-aware search side by
                # side; both feed the same accumulator.
                pool = Pool(2)
                pool.spawn(getFactualSearch, query, False)
                pool.spawn(getFactualSearch, query, True)
                pool.join(timeout=timeout)
            else:
                # getFactualSearch returns None, so its return value must NOT
                # be assigned to raw_results (doing so discarded every result
                # for queries without coordinates).
                getFactualSearch(query)

            for result in raw_results:
                yield FactualPlace(data=result)
        except GeneratorExit:
            # The consumer closed the generator early; stop quietly.
            pass

    return generatorSource(gen(), constructor=FactualSearchAll)
def validate(self, results):
    """
        Checks a search result set for obvious duplicate entries.

        Two passes are made over `results`:
          1. every result is converted to a proxy and resolved against the
             other proxies of the same kind; a definitive resolution means
             the set contains a duplicate.
          2. without the resolver, the set is rejected if a source id
             repeats or if two results share kind, title and at least one
             type.

        Returns True when no duplicate is detected and False otherwise.
    """
    proxies = [self.__stamped.proxyFromEntity(entity) for entity in results]

    # Pass 1: no result may resolve definitively to any other result.
    for index, proxy in enumerate(proxies):
        def same_kind_others():
            for other_index, other in enumerate(proxies):
                if index != other_index and proxy.kind == other.kind:
                    yield other

        matches = self.__resolver.resolve(proxy, generatorSource(same_kind_others()), count=1)

        if len(matches) > 0 and matches[0][0]['resolved']:
            return False

    # Pass 2: cheap structural checks that do not involve the resolver.
    seen_ids = defaultdict(set)

    for index, result in enumerate(results):
        # A given source id must appear at most once in the whole result set
        # (source ids are supposed to be unique).
        for key in result.sources:
            if not key.endswith('_id'):
                continue

            value = str(result[key])
            known_values = seen_ids[key]

            if value in known_values:
                return False

            known_values.add(value)

        # Compare against every later result; earlier pairs were already
        # covered when the earlier result was the outer one.
        for later in results[index + 1:]:
            if not self._eq(result.kind, later.kind):
                continue

            if not self._eq(result.title, later.title):
                continue

            if len(result.types.intersection(later.types)) == 0:
                continue

            utils.log("")
            utils.log("!" * 80)
            utils.log("dupe encountered: %s\n%s" % (result, later))
            utils.log("!" * 80)
            utils.log("")
            return False

    return True
def placeSource(self, query):
    """
        Build a lazy source of FactualPlace objects for `query.name`.

        The search against `self.__factual` is not issued until the returned
        source starts consuming the underlying generator.
    """
    def places():
        try:
            for record in self.__factual.search(query.name):
                yield FactualPlace(data=record)
        except GeneratorExit:
            # The consumer closed the generator early; stop quietly.
            pass

    return generatorSource(places())
def search(self, query, coords = None, full = True, local = False,
           kinds = None, types = None, offset = 0, limit = 10):
    """
        Search every configured source concurrently and return the
        best-scoring, de-duplicated results.

        query  -- raw query; wrapped in a QuerySearchAll together with
                  coords, kinds, types and local.
        full   -- when false, only the source named 'stamped' is queried.
        offset -- forwarded untouched to the per-source search helper.
        limit  -- forwarded to the per-source search helper and also used as
                  the cap on returned results (a falsy limit means "no cap").

        Each collected entry is indexed as entry[0] (a mapping holding a
        'total' score) and entry[1] (an object exposing target, name and
        source). The returned list contains such entries, highest 'total'
        first across sources, with entries that the resolver marks as
        duplicates of an already chosen entry left out.
    """
    started = time.time()
    query = QuerySearchAll(query, coords, kinds, types, local)

    timeout = 6.5
    results = []
    pool = Pool(len(self._sources))

    # NOTE: order is important here; e.g., we want to give precedence to
    # certain third-party APIs to begin their requests before others.
    for source in self._sources:
        # external sources are only consulted when full search is enabled
        if full or source.sourceName == 'stamped':
            # TODO: Make sure timeout gets passed through to source member functions.
            pool.spawn(self.__search_helper, query, limit, offset, source, results, timeout=timeout)

    pool.join(timeout=timeout)

    # Bucket the collected entries per source, dropping those that do not
    # satisfy the kind / type restrictions of the query.
    all_results = {}
    total = 0

    for source_name, result in results:
        if query.kinds is not None and result[1].target.kind not in query.kinds:
            logs.debug("Filtered out %s (kind=%s)" % (result[1].name, result[1].target.kind))
            continue

        if query.types is not None and len(query.types.intersection(result[1].target.types)) == 0:
            logs.debug("Filtered out %s (types=%s)" % (result[1].name, result[1].target.types))
            continue

        all_results.setdefault(source_name, []).append(result)
        total += 1

    for source_name, bucket in all_results.iteritems():
        all_results[source_name] = sortedResults(bucket)

    print("\n\n\nGenerated %d results in %f seconds from: %s\n\n\n" % (
        total,
        time.time() - started,
        ' '.join([ '%s:%s' % (name, len(bucket)) for name, bucket in all_results.iteritems() ])
    ))

    dedup_started = time.time()
    picked = []
    max_results = max(0, min(total, limit if limit else total))

    # Repeatedly take the best head entry across all per-source buckets.
    while len(picked) < max_results:
        best = None
        best_name = None

        # iterate over a snapshot because exhausted buckets are removed
        for name, bucket in list(all_results.iteritems()):
            if len(bucket) == 0:
                del all_results[name]
                continue

            head = bucket[0]

            if best is None or head[0]['total'] > best[0]['total']:
                best = head
                best_name = name
            elif _verbose:
                print("skipped %s with value %s" % (name, head[0]['total']))

        if best is None:
            # every bucket is exhausted
            break

        del all_results[best_name][0]

        if _verbose:
            print("Chose %s with value %s" % (best_name, best[0]['total']))

        candidate = best[1]

        def same_type_picks():
            # only already-picked targets with identical types are compared
            for entry in picked:
                target = entry[1].target

                if target.types == candidate.target.types:
                    yield target

        dups = self.__resolver.resolve(candidate.target, generatorSource(same_type_picks()), count=1)
        is_duplicate = len(dups) > 0 and dups[0][0]['resolved']

        if not is_duplicate:
            picked.append(best)
        elif _verbose:
            print("Discarded %s:%s as a duplicate to %s:%s" %
                  (candidate.source, candidate.name, dups[0][1].source, dups[0][1].name))
            print(formatResults(dups[0:1], verbose=True))

        # useful debugging aid if you find dupes in the search results:
        #
        #   if len(best) == 2 and len(dups) > 0:
        #       print("COMPARED %s:%s with %s:%s" %
        #             (candidate.source, candidate.name, dups[0][1].source, dups[0][1].name))
        #       print(formatResults(dups[0:2], verbose=True))

    if _verbose:
        print("\n\n\nDeduped %d results in %f seconds\n\n\n" % (total - len(picked), time.time() - dedup_started))

    return picked