def search():
    ''' Performs a file search. '''
    # TODO: security on param
    # if nothing was searched, redirect to the home page
    query = request.args.get("q", None)
    if not query:
        flash("write_something")
        return redirect(url_for("index.home"))

    page = int(request.args.get("page", 1))
    g.title = query + " - " + g.title

    results = {"total_found": 0, "total": 0, "time": 0}
    didyoumean = None
    tags = None
    if 0 < page < 101:
        # fetch the tags and the "did you mean" suggestion
        taming = taming_search(current_app.config, query, request.args.get("type", None),
                               contextg=g._get_current_object())

        # fetch the results and build the pagination
        profiler.checkpoint(opening=["sphinx"])
        results = search_files(query, request.args, page) or results
        ids = get_ids(results)
        profiler.checkpoint(opening=["mongo"], closing=["sphinx"])
        files_dict = {mid2hex(file_data["_id"]): fill_data(file_data, False, query)
                      for file_data in get_files(ids)}
        profiler.checkpoint(opening=["visited"], closing=["mongo"])
        save_visited(files_dict.values())
        profiler.checkpoint(closing=["visited"])
        files = (files_dict[bin2hex(file_id[0])]
                 for file_id in ids if bin2hex(file_id[0]) in files_dict)

        # retrieve the taming results
        try:
            tags = taming.next()
            didyoumean = taming.next()
        except: # taming may fail or time out; render without suggestions
            pass
    else:
        files = ()

    return render_template('files/search.html',
        results=results,
        search=request.args["q"].split(" "),
        files=files,
        pagination=Pagination(page, 10, min(results["total_found"], 1000)),
        didyoumean=didyoumean,
        tags=tags)
def search():
    ''' Performs a file search. '''
    # TODO: security on param
    # if nothing was searched, redirect to the home page
    query = request.args.get("q", None)
    if not query:
        flash("write_something")
        return redirect(url_for("index.home"))

    # avoid errors when page is not a number
    page = request.args.get("page", "1")
    if page.isdigit():
        page = int(page)
    else:
        abort(404)

    g.title = "%s - %s" % (query, g.title)

    results = {"total_found": 0, "total": 0, "time": 0}
    didyoumean = None
    tags = None
    if 0 < page < 101:
        # fetch the tags and the "did you mean" suggestion
        tags, dym = taming_search(query, request.args.get("type", None))

        # fetch the results and build the pagination
        profiler.checkpoint(opening=["sphinx"])
        results = search_files(query, request.args, page) or results
        ids = get_ids(results)
        profiler.checkpoint(opening=["mongo"], closing=["sphinx"])
        files_dict = {mid2hex(file_data["_id"]): fill_data(file_data, False, query)
                      for file_data in get_files(ids)}
        profiler.checkpoint(opening=["visited"], closing=["mongo"])
        save_visited(files_dict.values())
        profiler.checkpoint(closing=["visited"])
        files = ({"file": files_dict[bin2hex(file_id[0])], "search": file_id}
                 for file_id in ids if bin2hex(file_id[0]) in files_dict)

        # retrieve the taming results
        try:
            tags = tags.next()
            didyoumean = dym.next()
        except: # taming may fail or time out; render without suggestions
            pass
    else:
        files = ()

    return render_template('files/search.html',
        results=results,
        search=request.args["q"].split(" "),
        files=files,
        pagination=Pagination(page, 10, min(results["total_found"], 1000)),
        didyoumean=didyoumean,
        tags=tags)
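# Both versions of search() above cap pagination with Pagination(page, 10,
# min(results["total_found"], 1000)): 10 items per page, at most 1000 counted
# results. The Pagination helper is defined elsewhere; this is only an
# illustrative sketch of the interface those calls assume, not the project's
# actual class.
from math import ceil

class PaginationSketch(object):
    ''' Illustrative stand-in: current page, items per page, total item count. '''
    def __init__(self, page, per_page, total_count):
        self.page = page
        self.per_page = per_page
        self.total_count = total_count

    @property
    def pages(self):
        # number of pages needed to show every counted item
        return int(ceil(self.total_count / float(self.per_page)))

    @property
    def has_next(self):
        return self.page < self.pages

# PaginationSketch(1, 10, 1000).pages == 100, which matches the `0 < page < 101`
# guard: pages beyond 100 can never hold results.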
def process_search_results(s=None, query=None, category=None, not_category=None, title=None,
                           zone="", last_items=[], skip=None, limit=70, max_limit=50,
                           ignore_ids=[], show_order=True, results_template="results.html",
                           details=False):
    files = []
    files_text = []
    files_dict = None
    results = None
    ids = []
    must_cache = True
    if not title:
        title = (None, 2, False)

    if s:
        ids = [result for result in ((bin2hex(fileid), server, sphinxid, weight, sg)
                   for (fileid, server, sphinxid, weight, sg)
                   in s.get_results((1.0, 0.1), last_items=last_items,
                                    skip=skip * max_limit if skip else None,
                                    min_results=limit, max_results=limit, extra_browse=limit,
                                    weight_processor=weight_processor, tree_visitor=tree_visitor,
                                    restart_if_skip=True))
               if result[0] not in ignore_ids]

        # don't use all ids
        del ids[int(max_limit * 1.1):]

        results_entities = list(set(int(aid[4]) >> 32 for aid in ids if int(aid[4]) >> 32))
        ntts = {int(ntt["_id"]): ntt for ntt in entitiesdb.get_entities(results_entities)} if results_entities else {}
        stats = s.get_stats()
        canonical_query = stats["ct"]
        if canonical_query:
            # remove category and not-category from the canonical search
            canonical_query_parts = [part for part in canonical_query.split("_")
                                     if not ((not_category and part == u"-(" + not_category + ")")
                                             or (category and part == u"(" + category + ")"))]
            canonical_query = "_".join(canonical_query_parts) if any(
                len(part) >= WORD_SEARCH_MIN_LEN or part in NGRAM_CHARS
                for part in canonical_query_parts) else ""

        sure = stats["s"]
        if (not sure) or ("total_sure" in stats and not stats["total_sure"]):
            g.must_cache = 0
            cache.cacheme = False
    else:
        sure = True
        canonical_query = ""

    # don't run blocked searches
    if canonical_query:
        # if the exact query is in the underage list, nothing is shown
        safe_phrase = canonical_query.replace("_", " ").strip()
        # if it is merely included, it also has to match misconduct
        prepared_phrase = blacklists.prepare_phrase(safe_phrase)
        if (blacklists["underage"].exact(safe_phrase)
                or prepared_phrase in blacklists["forbidden"]
                or prepared_phrase in blacklists["searchblocked"]
                or (prepared_phrase in blacklists["misconduct"]
                    and prepared_phrase in blacklists["underage"])):
            g.blacklisted_content = "Search"
            if not g.show_blacklisted_content and g.page_type in {SEARCH_PAGE_TYPE, CATEGORY_PAGE_TYPE}:
                abort(404)

    # if the canonical query is empty, results only matter for searches with a null query (rankings)
    if (g.show_blacklisted_content or not g.blacklisted_content) and (canonical_query or not query):
        if ids:
            files_dict = {str(f["_id"]): prepare_data(f, text=query, ntts=ntts, details=details,
                                                      current_category=category)
                          for f in get_files(ids, s)}

            if not g.search_bot:
                save_visited(files_dict.values())

            # sort results and add search information
            position = 0
            for search_result in ids:
                fid = search_result[0]
                if fid in files_dict and files_dict[fid]:
                    afile = files_dict[fid]
                    afile["search"] = search_result
                    files.append(afile)
                    files_text.append(afile["view"]["nfn"])

                    featured_weight = (afile['view']["rating5"]
                                       + (10 if 'images_server' in afile['view']
                                               or 'thumbnail' in afile['view'] else 0))
                    g.featured.append((-featured_weight, position, afile))
                    position += 1

        results = render_template(results_template, files=files[:max_limit or limit],
                                  list_title=title[0] or query or category,
                                  title_level=title[1], title_class=title[2],
                                  zone=zone, show_order=show_order)

        count = min(len(files), max_limit or limit)
        search_info = {"time": max(stats["t"].itervalues()) if stats["t"] else 0,
                       "total_found": stats["cs"], "count": count,
                       "next": False if ("end" in stats and stats["end"]) or skip >= 10 else (skip or 0) + 1,
                       "files_text": files_text, "canonical_query": canonical_query, "sure": sure}
    else:
        search_info = {"time": 0, "total_found": 0, "count": 0, "next": False,
                       "files_text": [], "canonical_query": "-", "sure": sure}

    # try to avoid memory problems
    del files

    return results, search_info
def store_files(self, asearch, timeout, timeout_fallback):
    groups = self.filters_state["g"]
    subgroups = self.text_state["sg"]

    for server, query, sphinx_results, messages in self.proxy.browse_results(asearch, timeout, timeout_fallback):
        # handle results from simple queries and multiqueries the same way
        if not sphinx_results:
            logging.error("Error in search thread: '%s'" % messages[1])
            continue
        elif "matches" in sphinx_results:
            sphinx_results = [sphinx_results]

        # check whether this is a summary query
        if query["sf"]:
            main = [(True, False), (False, True)]
        elif query["st"]:
            main = [(True, True)]
        else:
            main = False

        # values are valid by default
        valid = True

        # add results to the corresponding subgroup
        for result in sphinx_results:
            if not result:
                logging.error("No response received from the search server.")
                continue
            elif result["error"]:
                logging.error("Search error (server %d): %s" % (server, result["error"]))
                continue
            elif result["warning"]:
                # the results are used, but flagged as invalid for future runs
                valid = False
                logging.error("Search warnings (server %d): %s" % (server, result["warning"]))

            total = 0
            for r in result["matches"]:
                # compute the subgroup and the file id
                sg = str(r["attrs"]["g"])
                fid = bin2hex(struct.pack('III', r["attrs"]["uri1"], r["attrs"]["uri2"], r["attrs"]["uri3"]))
                g = get_group(sg)
                g2 = get_group2(sg)
                weight = r["attrs"]["vrw"]
                count = r["attrs"]["@count"]
                if not main:
                    first = query["g"][g]
                total += count

                # store the file in groups and subgroups
                if not fid in subgroups[sg]["f"]:
                    filtrable_info = {"z": r["attrs"]["z"], "e": r["attrs"]["e"]}
                    subgroups[sg]["f"][fid] = (weight, server, filtrable_info, r["id"])
                    # if it passes the filters
                    if self.satisfies_filters(sg, filtrable_info):
                        heapq.heappush(groups[g]["g2"][g2]["sg"][sg]["h"], (-weight, fid))

                # update group and subgroup totals
                if main:
                    if main[0][0]: # store in text_state
                        subgroups[sg]["c"][server] = count
                        subgroups[sg]["z"][0] = max(subgroups[sg]["z"][0], r["attrs"]["zm"])
                        subgroups[sg]["z"][1] = min(subgroups[sg]["z"][1], r["attrs"]["zx"])
                    if main[0][1]: # store in filters_state
                        groups[g]["g2"][g2]["sg"][sg]["c"][server] = count

                # update the last valid record obtained for the group on this server
                if valid:
                    if main:
                        groups[g]["g2"][g2]["sg"][sg]["lv"][server] = max(1, groups[g]["g2"][g2]["sg"][sg]["lv"][server])
                    else:
                        groups[g]["g2"][g2]["sg"][sg]["lv"][server] = max(first + count, groups[g]["g2"][g2]["sg"][sg]["lv"][server])

            # absolute totals
            if main:
                if main[0][0]: # store in text_state
                    self.text_state["c"][server] = total
                    if valid and server in self.text_state["i"]:
                        self.text_state["i"].remove(server)
                    elif not valid and server not in self.text_state["i"]:
                        self.text_state["i"].append(server)
                    self.text_state["t"][server] = result["time"]
                if main[0][1]: # store in filters_state
                    self.filters_state["c"][server] = total
                    if valid and server in self.filters_state["i"]:
                        self.filters_state["i"].remove(server)
                    elif not valid and server not in self.filters_state["i"]:
                        self.filters_state["i"].append(server)
                    self.filters_state["t"][server] = result["time"]
                main.pop()
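# store_files() rebuilds each 12-byte file id from three 32-bit Sphinx attributes
# and keeps a best-first heap per subgroup by pushing negated weights (heapq is a
# min-heap, so the most relevant file pops first). A self-contained sketch of both
# tricks; bin2hex_sketch is a hypothetical stand-in for the project's bin2hex helper.
import binascii
import heapq
import struct

def bin2hex_sketch(raw):
    # stand-in for the project's bin2hex helper
    return binascii.hexlify(raw)

uri1, uri2, uri3 = 0x12345678, 0x9ABCDEF0, 0x0F0F0F0F
fid = bin2hex_sketch(struct.pack('III', uri1, uri2, uri3))
# 12 packed bytes -> a 24-character hex id, the same shape as ids like
# "2a6a52f7ad943af97f57ee79" seen elsewhere in this file

heap = []
for weight, name in ((0.4, "a"), (0.9, "b"), (0.1, "c")):
    heapq.heappush(heap, (-weight, name))
# heapq.heappop(heap) -> (-0.9, "b"): the highest weight comes out first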
search_results["files"] = render_template('files/file.html',files=search_results["files"]) return jsonify(search_results) def search_files(query,filters,min_results=0,max_results=30,download=None,last_items=[],query_time=None,extra_wait_time=500, async=False, max_extra_searches=4, non_group=False, order=None, weight_processor=None, tree_visitor=None): ''' Realiza una búsqueda de archivos ''' if not last_items and min_results==0: min_results=5 # obtener los resultados profiler_data={} profiler.checkpoint(profiler_data,opening=["sphinx"]) s = searchd.search(query, filters=filters, start=not bool(last_items), group=True, no_group=non_group, order=order) ids = [(bin2hex(fileid), server, sphinxid, weight, sg) for (fileid, server, sphinxid, weight, sg) in s.get_results((1.4, 0.1), last_items=last_items, min_results=min_results, max_results=max_results, extra_browse=0 if max_results>30 else None, weight_processor=weight_processor, tree_visitor=tree_visitor)] stats = s.get_stats() profiler.checkpoint(profiler_data,opening=["entities"], closing=["sphinx"]) results_entities = list(set(int(aid[4])>>32 for aid in ids if int(aid[4])>>32)) ntts = {int(ntt["_id"]):ntt for ntt in entitiesdb.get_entities(results_entities)} if results_entities else {} profiler.checkpoint(profiler_data, closing=["entities"]) '''# trae entidades relacionadas if ntts: rel_ids = list(set(eid for ntt in ntts.itervalues() for eids in ntt["r"].itervalues() if "r" in ntt for eid in eids)) ntts.update({int(ntt["_id"]):ntt for ntt in entitiesdb.get_entities(rel_ids, None, (False, [u"episode"]))}) ''' result = {"time": max(stats["t"].itervalues()) if stats["t"] else 0, "total_found": stats["cs"]}
def process_search_results(s=None, query=None, category=None, not_category=None, title=None,
                           zone="", last_items=[], skip=None, limit=70, max_limit=50,
                           ignore_ids=[], show_order=True):
    files = []
    files_text = []
    files_dict = None
    results = None
    must_cache = True
    if not title:
        title = (None, 2, False)

    if s:
        ids = [result for result in ((bin2hex(fileid), server, sphinxid, weight, sg)
                   for (fileid, server, sphinxid, weight, sg)
                   in s.get_results((3.0, 0.1), last_items=last_items,
                                    skip=skip * 100 if skip else None,
                                    min_results=limit, max_results=limit, extra_browse=limit,
                                    weight_processor=weight_processor, tree_visitor=tree_visitor))
               if result[0] not in ignore_ids]

        results_entities = list(set(int(aid[4]) >> 32 for aid in ids if int(aid[4]) >> 32))
        ntts = {int(ntt["_id"]): ntt for ntt in entitiesdb.get_entities(results_entities)} if results_entities else {}
        stats = s.get_stats()
        canonical_query = stats["ct"]
        if canonical_query:
            # remove category and not-category from the canonical search
            canonical_query_parts = [part for part in canonical_query.split("_")
                                     if not ((not_category and part == u"-(" + not_category + ")")
                                             or (category and part == u"(" + category + ")"))]
            canonical_query = ("_".join(canonical_query_parts)
                               if any(len(part) >= WORD_SEARCH_MIN_LEN or part in NGRAM_CHARS
                                      for part in canonical_query_parts) else "")

        sure = stats["s"]
        if (not sure) or ("total_sure" in stats and not stats["total_sure"]):
            g.must_cache = 0
            cache.cacheme = False
    else:
        sure = True
        canonical_query = ""

    # ~ ids = [("2a6a52f7ad943af97f57ee79", "1", 0, 0, 0), ("fd83615ca1e57647491b3744", "1", 0, 0, 0)]
    # ~ ids = ["2a6a52f7ad943af97f57ee79", "fd83615ca1e57647491b3744"]
    data_filtered = get_data_filtered()
    ids = [item[0] for item in data_filtered]
    filtered = {item[0]: {"query": item[1], "blocked": item[2]} for item in data_filtered}
    stats = {"cs": 133}
    ntts = {}

    # don't run blocked searches
    if canonical_query:
        prepared_phrase = blacklists.prepare_phrase(canonical_query.replace("_", " "))
        if (prepared_phrase in blacklists["forbidden"]
                or prepared_phrase in blacklists["searchblocked"]
                or (prepared_phrase in blacklists["misconduct"]
                    and prepared_phrase in blacklists["underage"])):
            g.blacklisted_content = True

    # if the canonical query is empty, results only matter for searches with a null query (rankings)
    if (g.show_blacklisted_content or not g.blacklisted_content) and (canonical_query or not query):
        if ids:
            files_dict = {str(f["_id"]): prepare_data(f, text=query, ntts=ntts)
                          for f in filesdb.get_files(ids, s, 1)}

            # sort results and add search information
            position = 0
            for search_result in ids:
                fid = search_result
                if fid in files_dict and files_dict[fid]:
                    afile = files_dict[fid]
                    afile["search"] = search_result
                    files.append(afile)
                    files_text.append(afile["view"]["nfn"])
                    afile["view"]["blocked"] = filtered[str(afile["file"]["_id"])]["blocked"]
                    afile["view"]["query"] = filtered[str(afile["file"]["_id"])]["query"]

                    featured_weight = afile["view"]["rating"] + (
                        10 if "images_server" in afile["view"] or "thumbnail" in afile["view"]["md"] else 0)
                    g.featured.append((-featured_weight, position, afile))
                    position -= 1

        results = render_template("filters_results.html", files=files,
                                  list_title=title[0] or query or category,
                                  title_level=title[1], title_class=title[2],
                                  zone=zone, show_order=show_order)

        count = min(len(files), max_limit or limit)
        search_info = {"time": 0, "total_found": stats["cs"], "count": count, "next": False,
                       "files_text": files_text, "canonical_query": canonical_query, "sure": sure}
    else:
        search_info = {"time": 0, "total_found": 0, "count": 0, "next": False,
                       "files_text": [], "canonical_query": "-", "sure": sure}

    # try to avoid memory problems
    del files

    return results, search_info
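# A hypothetical illustration of how a view could consume the (results, search_info)
# pair returned by either process_search_results() version above. The view name,
# template name, and the single-argument searchd.search() call are assumptions for
# illustration only, not code from this project.
def example_view(query):
    s = searchd.search(query)  # assumed search session, as used by the functions above
    results, search_info = process_search_results(s, query, zone="example")
    return render_template("search_page.html",
                           results=results,                    # pre-rendered HTML fragment
                           total=search_info["total_found"],
                           next_page=search_info["next"])      # False when there is no next page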