示例#1
0
    def __init__(self, config, filesdb, entitiesdb, profiler, sphinx):
        self.config = config
        self.filesdb = filesdb
        self.entitiesdb = entitiesdb
        self.profiler = profiler
        self.sphinx = sphinx

        # logs de informacion
        self.bot_events = defaultdict(int)

        # actualiza información de servidores
        self.update_servers()

        self.maintenance = EventManager()
        self.maintenance.start()
        self.maintenance.interval(config["SERVICE_SEARCH_PROFILE_INTERVAL"], self.save_profile_info)
        self.maintenance.interval(config["SERVERS_REFRESH_INTERVAL"], self.update_servers)
示例#2
0
class SearchProxy:
    def __init__(self, config, filesdb, entitiesdb, profiler, sphinx):
        self.config = config
        self.filesdb = filesdb
        self.entitiesdb = entitiesdb
        self.profiler = profiler
        self.sphinx = sphinx

        # logs de informacion
        self.bot_events = defaultdict(int)

        # actualiza información de servidores
        self.update_servers()

        self.maintenance = EventManager()
        self.maintenance.start()
        self.maintenance.interval(config["SERVICE_SEARCH_PROFILE_INTERVAL"], self.save_profile_info)
        self.maintenance.interval(config["SERVERS_REFRESH_INTERVAL"], self.update_servers)

    def save_profile_info(self):
        # informacion de accesos de bots
        profiling_info, self.bot_events = self.bot_events, defaultdict(int)

        # guarda información
        self.profiler.save_data(profiling_info)

    def update_sources(self):
        # obtiene los origenes
        self.sources = {int(s["_id"]): s for s in self.filesdb.get_sources(blocked=None)}

        # actualiza origenes bloqueados
        self.blocked_sources = [sid for sid, s in self.sources.iteritems() if "crbl" in s and int(s["crbl"])!=0]
        self.sphinx.update_blocked_sources(self.blocked_sources)

        # calcula pesos para origenes individuales por calidad
        self.sources_weights = _update_source_weights(self.sources, self.blocked_sources)

        # listado de origenes ordenados por cantidad de ficheros
        sources = self.stats["src"]
        sources_relevance = sorted(((sources[sid], sid, s["g"]) for sid, s in self.sources.iteritems() if sid not in self.blocked_sources), reverse=True)
        self.sources_relevance_streaming = [self.sources[sid]["d"]
                                                for (count,sid,group) in sources_relevance if "s" in group]
        self.sources_relevance_download = [self.sources[sid]["d"]
                                                for (count,sid,group) in sources_relevance if ("w" in group or "f" in group)]
        self.sources_relevance_p2p = ["Torrent","eD2k","Gnutella"]

    def update_servers(self):
        # obtiene los servidores activos para busquedas
        self.servers = {str(int(server["_id"])):(str(server["sp"]), int(server["spp"])) for server in self.filesdb.get_servers() if "sp" in server and server["sp"]}

        self.servers_set = set(self.servers.iterkeys())

        # estadisticas de busqueda por servidor
        new_servers_stats = {server_id:self.filesdb.get_server_stats(int(server_id)) for server_id in self.servers.iterkeys()}

        # estadisticas combinadas para todos los servidores
        new_stats = {"sg":defaultdict(int), # subgrupos
                      "rc":defaultdict(int), "ra":defaultdict(int), "rv":defaultdict(int), "rd":{}, "rM":defaultdict(int), # por rating
                      "src":defaultdict(int), "src_rc":defaultdict(int), "src_ra":defaultdict(int), "src_rv":defaultdict(int), "src_rd":defaultdict(int) # por rating y origen
                      }

        # evita multiples accesos a mismas claves globales
        subgroups = new_stats["sg"]
        rating_count = new_stats["rc"]
        rating_average = new_stats["ra"]
        rating_variance = new_stats["rv"]
        rating_maximum = new_stats["rM"]
        rating_deviation = new_stats["rd"]

        # recorre la información de cada servidor
        for server, server_stats in new_servers_stats.iteritems():

            # evita multiples accesos a mismas claves para el servidor
            server_rating_count = server_stats["rc"]
            server_rating_average = server_stats["ra"]
            server_rating_pow_average = server_stats["rpa"]
            server_rating_maximum = server_stats["rM"]

            # recorre los subgrupos del servidor
            for sg, count in server_stats["sg"].iteritems():
                subgroups[sg] += count
                if sg in server_rating_count:
                    rating_count[sg] += server_rating_count[sg]                                 # numero de entradas con rating
                    rating_average[sg] += server_rating_average[sg] * server_rating_count[sg]   # valor medio del rating (a falta de normalizar)
                    rating_variance[sg] += server_rating_pow_average[sg]                        # varianza del rating (a falta de normalizar)
                    rating_maximum[sg] = max(rating_maximum[sg], server_rating_maximum)         # maximo rating

        # actualiza valores dependientes de valores totales que no se pueden calcular en el bucle anterior
        for sg, count in rating_count.iteritems():
            rating_average[sg] /= count
            rating_variance[sg] /= count
            rating_deviation[sg] = sqrt(rating_variance[sg])

        # estadisticas por origenes
        sources = new_stats["src"]
        new_sources_rating_count = new_stats["src_rc"]
        new_sources_rating_average = new_stats["src_ra"]
        new_sources_rating_variance = new_stats["src_rv"]
        new_sources_rating_standard_deviation = new_stats["src_rd"]

        # recorre subgrupos agrupando por origenes
        for sg, count in subgroups.iteritems():
            src = get_src(sg)
            sources[src] += count
            if sg in rating_count:
                new_sources_rating_count[src] += rating_count[sg]
                new_sources_rating_average[src] += rating_average[sg]*rating_count[sg]
                new_sources_rating_variance[src] += rating_variance[sg]*rating_count[sg]

        # actualiza valores dependientes de valores totales que no se pueden calcular en el bucle anterior
        for src, count in new_sources_rating_count.iteritems():
            new_sources_rating_average[src] /= count
            new_sources_rating_variance[src] /= count
            new_sources_rating_standard_deviation[src] = sqrt(new_sources_rating_variance[src])

        self.servers_stats = new_servers_stats
        self.stats = new_stats
        self.sources_rating_count = new_sources_rating_count
        self.sources_rating_average = new_sources_rating_average
        self.sources_rating_variance = new_sources_rating_variance
        self.sources_rating_standard_deviation = new_sources_rating_standard_deviation
        self.sources_rating_count = new_sources_rating_count
        self.sources_rating_average = new_sources_rating_average
        self.sources_rating_variance = new_sources_rating_variance
        self.sources_rating_standard_deviation = new_sources_rating_standard_deviation

        # pesos para los origenes por tipo
        self.update_sources()

    def log_bot_event(self, bot, result):
        self.bot_events[("bot_" if result else "bot_no_") + bot] += 1