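# The original module's imports are not part of this excerpt. At a minimum the class below relies on the
# standard-library and gevent imports sketched here; the project-internal names it references (Constants, logs,
# utils, the *Source classes, MongoEntityCollection, MongoEntityStatsCollection, SearchResultDeduper,
# EntityProxyContainer, helpers like total_seconds, logTimingData, logClusterData, logSourceResultsData, and the
# MIN_* / SEARCH_TIMEOUT constants) are assumed to come from the surrounding codebase.
import datetime
import math

import gevent
from gevent import GreenletExit

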
class EntitySearch(object):
    def __registerSource(self, source, **categoriesToPriorities):
        self.__all_sources.append(source)
        for (category, priority) in categoriesToPriorities.items():
            if category not in Constants.categories:
                raise Exception("unrecognized category: %s" % category)
            self.__categories_to_sources_and_priorities[category].append((source, priority))

    def __init__(self):
        allCategories = Constants.categories
        self.__all_sources = []
        self.__entity_collection = MongoEntityCollection()
        self.__stats_collection = MongoEntityStatsCollection()
        # Within each category, we have a number of sources and each is assigned a priority. The priority is used to
        # determine how long to wait for results from that source.
        self.__categories_to_sources_and_priorities = {}
        for category in allCategories:
            self.__categories_to_sources_and_priorities[category] = []

        self.__registerSource(StampedSource(), music=3, film=3, book=3, app=3, place=3)
        self.__registerSource(iTunesSource(), music=10, film=10, book=3, app=10)
        # TODO: Enable film for Amazon. Amazon film results blend TV and movies and have better retrieval than
        # iTunes. On the other hand, they're pretty dreadful -- no clear distinction between TV and movies, no
        # clear distinction between individual movies and box sets, etc.
        self.__registerSource(AmazonSource(), music=5, book=10)
        self.__registerSource(FactualSource(), place=8)
        self.__registerSource(GooglePlacesSource(), place=8)
        self.__registerSource(RdioSource(), music=8)
        self.__registerSource(SpotifySource(), music=8)
        self.__registerSource(TMDBSource(), film=8)
        self.__registerSource(TheTVDBSource(), film=8)
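        # For illustration: after the registrations above, the 'music' entry of
        # __categories_to_sources_and_priorities is roughly [(StampedSource, 3), (iTunesSource, 10),
        # (AmazonSource, 5), (RdioSource, 8), (SpotifySource, 8)]; __terminateWaiting later uses these
        # priorities to decide how long to keep waiting on each source.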

    def __terminateWaiting(self, pool, start_time, category, resultsDict):
        logTimingData('IN TERMINATE WAITING')
        sources_to_priorities = dict(self.__categories_to_sources_and_priorities[category])
        total_value_received = 0
        total_potential_value_outstanding = sum(sources_to_priorities.values())
        sources_seen = set()
        while True:
            try:
                elapsed_seconds = total_seconds(datetime.datetime.now() - start_time)

                if elapsed_seconds >= 7:
                    logs.warning('Search completely timed out at 7s!')
                    pool.kill()
                    return

                for (source, results) in resultsDict.items():
                    if source in sources_seen:
                        continue
                    logTimingData('JUST NOW SEEING SOURCE: ' + source.sourceName)
                    sources_seen.add(source)
                    # If a source returns at least 5 results, we assume we got a good result set from it. If it
                    # returns less, we're more inclined to wait for straggling sources.
                    total_value_received += sources_to_priorities[source] * min(5, len(results)) / 5.0
                    logTimingData('DECREMENTING OUTSTANDING BY ' + str(sources_to_priorities[source]) + ' FOR SOURCE ' + source.sourceName)
                    total_potential_value_outstanding -= sources_to_priorities[source]
                logTimingData('AT %f seconds elapsed, TOTAL VALUE RECEIVED IS %f, TOTAL OUTSTANDING IS %f' % (
                        elapsed_seconds, total_value_received, total_potential_value_outstanding
                    ))
            except Exception:
                logs.warning('TERMINATE_WAITING: unexpected error while tallying source results')
                logs.report()
                raise

            if total_potential_value_outstanding <= 0:
                logTimingData('ALL SOURCES DONE')
                return

            if total_value_received:
                marginal_value_of_outstanding_sources = total_potential_value_outstanding / total_value_received
                # Comes out to:
                #   0.08 for 1s
                #   0.25 for 1.5s
                #   0.79 for 2s
                #   2.51 for 2.5s
                #   7.94 for 3s
                # So we'll ditch that 4th remaining source for music around 1.5s; we'll ditch the second source for
                # something like Places around 2s; we'll ditch any lingering source around 3s if we've received
                # anything.
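                # Worked example (illustrative, using the music priorities registered in __init__: Stamped=3,
                # iTunes=10, Amazon=5, Rdio=8, Spotify=8, total 34): if every source except Amazon has returned
                # 5+ results, value received is 29 and value outstanding is 5, so the marginal value is
                # 5/29 ~= 0.17 and we bail on Amazon once 10 ** (t - 2.1) exceeds that, just after t = 1.3s.
                # A priority-10 source that returned only 3 results would contribute 10 * 3/5 = 6 instead of 10.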
                min_marginal_value = 10 ** (elapsed_seconds - 2.1)
                if min_marginal_value > marginal_value_of_outstanding_sources:
                    sources_not_seen = [
                        source.sourceName for source in sources_to_priorities.keys() if source not in sources_seen
                    ]
                    if sources_not_seen:
                        # This is interesting information whether we want the full timing data logged or not.
                        log_template = 'QUITTING EARLY: At %f seconds elapsed, bailing on sources [%s] because with ' + \
                            'value received %f, value outstanding %f, marginal value %f, min marginal value %f'
                        logs.debug(log_template % (
                            elapsed_seconds, ', '.join(sources_not_seen), total_value_received,
                            total_potential_value_outstanding, marginal_value_of_outstanding_sources, min_marginal_value
                        ))
                    pool.kill()
                    return

            gevent.sleep(0.01)

    def __searchSource(self, source, queryCategory, queryText, resultsDict, timesDict, **queryParams):
        try:
            # Note that the timing here is not 100% legit because gevent won't interrupt code except on I/O, but it's good
            # enough to give a solid idea.
            before = datetime.datetime.now()
            if shouldLogRawSourceResults:
                queryParams['logRawResults'] = True
            results = source.searchLite(queryCategory, queryText, **queryParams)

            after = datetime.datetime.now()
            # First level of filtering on data quality score -- results that are really horrendous get dropped entirely
            # pre-clustering.
            filteredResults = [result for result in results if result.dataQuality >= MIN_RESULT_DATA_QUALITY_TO_CLUSTER]
            timesDict[source] = after - before
            logs.debug("GOT RESULTS FROM SOURCE %s IN ELAPSED TIME %s -- COUNT: %d, AFTER FILTERING: %d" % (
                source.sourceName, str(after - before), len(results), len(filteredResults)
            ))
            resultsDict[source] = filteredResults
        except GreenletExit:
            pass
        except:
            logs.report()
            resultsDict[source] = []

    def search(self, category, text, timeout=SEARCH_TIMEOUT, limit=10, coords=None):
        if not isinstance(text, unicode):
            text = text.decode('utf-8')
        if category not in Constants.categories:
            raise Exception("unrecognized category: (%s)" % category)

        start = datetime.datetime.now()
        results = {}
        times = {}
        pool = utils.LoggingThreadPool(len(self.__categories_to_sources_and_priorities))

        def termWaiting():
            logs.debug('in termWaiting')
            try:
                return self.__terminateWaiting(pool, datetime.datetime.now(), category, results)
            except Exception:
                logs.report()
            logs.debug('done with termWaiting')

        logs.debug("SHOULD_DISABLE_TIMEOUT IS " + str(shouldDisableTimeout))
        if not shouldDisableTimeout:
            logTimingData('SPAWNING TERMINATE WAITING')
            #pool.spawn(self.__terminateWaiting, pool, datetime.datetime.now(), category, results)
            pool.spawn(termWaiting)

        for (source, priority) in self.__categories_to_sources_and_priorities[category]:
            # TODO: Handing the exact same timeout down to the inner call is probably wrong because we end up in this
            # situation where outer pools and inner pools are using the same timeout and possibly the outer pool will
            # nix the whole thing before the inner pool cancels out, which is what we'd prefer so that it's handled
            # more gracefully.
            pool.spawn(self.__searchSource, source, category, text, results, times, timeout=timeout, coords=coords)


        logTimingData("TIME CHECK ISSUED ALL QUERIES AT " + str(datetime.datetime.now()))
        pool.join()
        logTimingData("TIME CHECK GOT ALL RESPONSES AT " + str(datetime.datetime.now()))

        logTimingData('TIMES: ' + (', '.join(['%s took %s' % (source.sourceName, str(times[source])) for source in times])))
        for source in self.__all_sources:
            if source in results and results[source]:
                logSourceResultsData("\nRESULTS FROM SOURCE " + source.sourceName + " TIME ELAPSED: " + str(times[source]) + "\n\n")
                for result in results[source]:
                    logSourceResultsData(utils.normalize(repr(result)))

        beforeDeduping = datetime.datetime.now()
        dedupedResults = SearchResultDeduper().dedupeResults(category, results.values())
        afterDeduping = datetime.datetime.now()
        logTimingData("DEDUPING TOOK " + str(afterDeduping - beforeDeduping))
        logTimingData("TIME CHECK DONE AT:" + str(datetime.datetime.now()))
        logTimingData("ELAPSED:" + str(afterDeduping - start))

        logClusterData("\n\nDEDUPED RESULTS\n\n")
        for dedupedResult in dedupedResults[:limit]:
            logClusterData("\n\n%s\n\n" % str(dedupedResult))

        return dedupedResults[:limit]


    def __getEntityIdForCluster(self, cluster):
        idsFromClusteredEntities = []
        fastResolveQueries = []
        for result in cluster.results:
            if result.dataQuality < MIN_RESULT_DATA_QUALITY_TO_INCLUDE:
                continue
            if result.resolverObject.source == 'stamped':
                idsFromClusteredEntities.append(result.resolverObject.key)
            else:
                fastResolveQueries.append((result.resolverObject.source, result.resolverObject.key))

        fastResolvedIds = filter(None, self.__stampedSource.resolve_fast_batch(fastResolveQueries)) if fastResolveQueries else []

        allIds = idsFromClusteredEntities + fastResolvedIds
        if len(idsFromClusteredEntities) > 2:
            logs.warning('Search results directly clustered multiple StampedSource results: [%s]' %
                         ', '.join(str(entityId) for entityId in idsFromClusteredEntities))
        elif len(allIds) > 2:
            logs.warning('Search results indirectly clustered multiple entity IDs together: [%s]' %
                         ', '.join(str(entityId) for entityId in allIds))
        if not allIds:
            return None
        return allIds[0]


    def __proxyToEntity(self, cluster):
        # Additional level of filtering -- some things get clustered (for the purpose of boosting certain cluster
        # scores) but never included in the final result because we're not 100% sure that the data is good enough to show
        # users.
        filteredResults = [r for r in cluster.results if r.dataQuality >= MIN_RESULT_DATA_QUALITY_TO_INCLUDE]
        # So this is ugly, but it's pretty common for two listings to have the same or virtually the same data quality
        # and using relevance as a tie-breaker is really helpful.
        filteredResults.sort(key=lambda r: (r.dataQuality + (r.relevance / 10.0), r.resolverObject.source, r.resolverObject.key), reverse=True)
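        # Illustrative tie-break with the key above: two results with dataQuality 0.80 but relevance 0.5 vs. 0.3
        # sort with keys 0.85 vs. 0.83, so the more relevant listing comes first.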
        # TODO PRELAUNCH: Only use the best result from each source.
        entity = EntityProxyContainer().addAllProxies(result.resolverObject for result in filteredResults).buildEntity()
        for result in filteredResults:
            entity.addThirdPartyId(result.resolverObject.source, result.resolverObject.key)
        return entity

    @utils.lazyProperty
    def __stampedSource(self):
        return StampedSource()


    def __buildEntity(self, entityId):
        entity = self.__entity_collection.getEntity(entityId)
        entity._maybeRegenerateThirdPartyIds()
        return entity


    def rescoreFinalResults(self, entityAndClusterList):
        def isTempEntity(entity):
            return entity.entity_id is None
        realEntityIds = [ entity.entity_id for (entity, cluster) in entityAndClusterList if not isTempEntity(entity) ]
        entityStats = self.__stats_collection.getStatsForEntities(realEntityIds)
        statsByEntityId = dict([(stats.entity_id, stats) for stats in entityStats])

        def scoreEntityAndCluster((entity, cluster)):
            if isTempEntity(entity):
                dataScore = cluster.dataQuality
            else:
                numStamps = 0
                if entity.entity_id in statsByEntityId:
                    numStamps = statsByEntityId[entity.entity_id].num_stamps
                dataScore = 1.1 + math.log(numStamps+1, 50)
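                # For scale (base-50 log): 0 stamps gives dataScore 1.1, 49 stamps gives 2.1,
                # and 2499 stamps gives 3.1.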

            # TODO: Possibly distinguish even more about which of these have rich data. There are some types of data
            # that don't affect dataQuality because they don't make us less certain about the state of a cluster, but
            # they make user interactions with it more positive -- pictures, preview URLs, etc. We should factor
            # these in here.
            return dataScore * cluster.relevance

        entityAndClusterList.sort(key=scoreEntityAndCluster, reverse=True)


    def searchEntitiesAndClusters(self, category, text, timeout=SEARCH_TIMEOUT, limit=10, coords=None):
        clusters = self.search(category, text, timeout=timeout, limit=limit, coords=coords)
        searchDoneTime = datetime.datetime.now()
        entityResults = []

        entityIdsToNewClusterIdxs = {}
        entitiesAndClusters = []
        for cluster in clusters:
            # TODO: make use of nemesis ids here.
            entityId = self.__getEntityIdForCluster(cluster)
            if not entityId:
                # One more layer of filtering here -- clusters that don't overall hit our quality minimum get
                # dropped. We never drop clusters that resolve to entities for this reason.
                if cluster.dataQuality >= MIN_CLUSTER_DATA_QUALITY:
                    entitiesAndClusters.append((self.__proxyToEntity(cluster), cluster))
                else:
                    logClusterData('DROPPING CLUSTER for poor data quality:\n%s' % cluster)

            # TODO PRELAUNCH: Make sure that the type we get from fast_resolve == the type we get from
            # StampedSourceObject.key, or else using these as keys in a map together won't work.
            elif entityId not in entityIdsToNewClusterIdxs:
                entityIdsToNewClusterIdxs[entityId] = len(entitiesAndClusters)
                entitiesAndClusters.append((self.__buildEntity(entityId), cluster))
            else:
                originalIndex = entityIdsToNewClusterIdxs[entityId]
                (_, originalCluster) = entitiesAndClusters[originalIndex]
                # We're not actually augmenting the result at all here; the result is the unadulterated entity. We won't
                # show an entity augmented with other third-party IDs we've attached in search results because it will
                # create inconsistency for the entity show page and we don't know if they will definitely be attached.
                # The point of the grok is entirely to boost the rank of the cluster (and thus of the entity).
                # TODO PRELAUNCH: Consider overriding this for sparse or user-created entities.
                # TODO: Debug check to see if the two are definitely not a match according to our clustering logic.
                originalCluster.grok(cluster)

        # TODO: Reorder according to final scores that incorporate dataQuality and a richness score (presence of stamps,
        # presence of enriched entity, etc.)

        convertedToEntitiesTime = datetime.datetime.now()
        logTimingData('CONVERTING TO ENTITIES TOOK: %s' % (convertedToEntitiesTime - searchDoneTime))

        self.rescoreFinalResults(entitiesAndClusters)
        rescoredTime = datetime.datetime.now()
        logTimingData('RESCORING TOOK: %s' % (rescoredTime - convertedToEntitiesTime))

        return entitiesAndClusters


    def searchEntities(self, *args, **kwargs):
        return [entity for entity, _ in self.searchEntitiesAndClusters(*args, **kwargs)]
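

# A minimal usage sketch, not part of the original module. It assumes the project-internal imports used by
# EntitySearch (Constants, the *Source classes, the Mongo collections, logging helpers) are available in the
# environment, and that 'music' is one of Constants.categories; adjust the category and query text as needed.
if __name__ == '__main__':
    searcher = EntitySearch()
    for entity in searcher.searchEntities('music', u'radiohead', limit=5):
        print repr(entity)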