def process( app ):
    query = app.query
    obj_id = int( query.parms["id"] )
    if not app.user.can_read( obj_id ):
        raise errors.PrivilegeError( "%d cannot read %d" % (app.user.id, obj_id) )
    obj = db_object.DBObject.create_typed_object( app=app, object_id=obj_id )
    result = {}
    if type(obj)==entry.Entry:
        if query.parms["method"] == "create_draft":
            draft = obj.create_draft()
            result = { "succeeded" : True,
                       "draft" : get_module.get(app=app, object_ids=[draft.id], recursive=(True,True))[0] }
        else:
            raise errors.ParameterError( "Unsupported method for type" )
    elif type(obj)==entry.Draft:
        if query.parms["method"] == "publish":
            entry_id = obj.publish()
            result = { "succeeded" : True,
                       "entry" : get_module.get(app=app, object_ids=[entry_id], recursive=(True,True))[0] }
        elif query.parms["method"] == "merge_to_parent":
            entry_id = obj.merge_to_parent()
            result = { "succeeded" : True,
                       "entry" : get_module.get(app=app, object_ids=[entry_id], recursive=(True,True))[0] }
        else:
            raise errors.ParameterError( "Unsupported method for type" )
    else:
        raise errors.ParameterError( "Object with unsupported type" )
    app.response.output = json.dumps( result )
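# The handler above dispatches on two query parameters, "id" and "method" (both taken from the
# code; the surrounding URL routing is not part of this listing):
#
#   method=create_draft       valid for Entry objects, returns {"succeeded": true, "draft": {...}}
#   method=publish            valid for Draft objects, returns {"succeeded": true, "entry": {...}}
#   method=merge_to_parent    valid for Draft objects, returns {"succeeded": true, "entry": {...}}
#
# Any other type/method combination raises errors.ParameterError; an id the current user cannot
# read raises errors.PrivilegeError before the object is even instantiated.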
def run( self ):
    self.app.open_db()  # re-open the app DB for this thread...
    while not self.quitting:
        objs = get_module.get( self.app, limit=10 )
        message = json.dumps( {'succeeded': True, 'objects': objs} )
        self.websocket.send( message )
        time.sleep(30)
    self.app.close_db( commit=True )  # also commits pending changes
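# A minimal sketch of how the push thread above might be embedded. This is an assumption for
# illustration only: the real constructor and shutdown mechanism are not part of this listing;
# only self.app, self.quitting and self.websocket are taken from run() above.
import threading

class PushThread( threading.Thread ):
    def __init__( self, app, websocket ):
        super().__init__( daemon=True )
        self.app = app              # per-thread application context with its own DB handle
        self.websocket = websocket  # open websocket connection to one client
        self.quitting = False       # polled by run() before every push

    def stop( self ):
        # run() only checks the flag once per cycle, so shutdown may lag by one sleep interval
        self.quitting = True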
def process( app ):
    query = app.query
    response = app.response
    session = app.session
    target_ids = [int(x) for x in query.parms["id"].split(",")]
    object_list = get_module.get( app, object_ids=target_ids )
    metainfo_list = []
    for target_id in target_ids:
        if app.user.can_read( target_id ):
            target_obj = files.File( app, object_id=target_id )
            metainfo_list.append( target_obj.identify() )
        else:
            raise errors.PrivilegeError()
    for metainfo in metainfo_list:
        for obj in object_list:
            if obj["id"] == metainfo["id"]:
                obj.update( metainfo )
    response.output = json.dumps( {"succeeded" : True, "objects" : object_list} )
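# The handler above merges the identify() metadata into the object dicts returned by
# get_module.get, matching them by "id". A response for a video object therefore looks
# roughly like this (the exact fields depend on the object type and on what identify()
# returns; see the identify handler further below):
#
#   {"succeeded": true, "objects": [ {"id": 42, ..., "mplayer": {"id": {"video": {...}, ...}}} ]}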
def process( app ):
    query = app.query
    response = app.response
    session = app.session
    source_id = int( query.parms["id"] )
    mode = query.parms["mode"] if "mode" in query.parms else "convert"
    if app.user.can_read( source_id ):
        source_obj = files.File( app, object_id=source_id )
        if re.match( r"^video/.*", source_obj.media_type ):
            new_poster_offset = float(query.parms["poster_offset"]) if "poster_offset" in query.parms else None
            new_poster_id = int(query.parms["poster_id"]) if "poster_id" in query.parms else None
            if new_poster_id and app.user.can_write( source_id ) and app.user.can_read( new_poster_id ):
                new_poster_obj = files.File( app, object_id=new_poster_id )
            else:
                new_poster_obj = None
            # To determine the sizes and bitrates of the substitute objects, we first identify the original object:
            source_size = source_obj.get_size()
            source_meta = source_obj.identify()
            source_width = int( source_meta["mplayer"]["id"]["video"]["width"] )
            source_rate = round(source_size*8/float(source_meta["mplayer"]["id"]["length"])/1000)
            results = []
            class ConversionDescription:
                def __init__( self, role, width, media_type, rate=0, condition=lambda x: True ):
                    self.role = role
                    self.width = width
                    self.media_type = media_type
                    self.rate = rate
                    self.condition = condition
                def applies( self ):
                    return self.condition(self)
                def __eq__( self, other ):
                    # rate is not part of equality, because this is used to compare conversion candidates
                    # with already existing substitutes and bitrate is deemed too specific for that purpose;
                    # we just assume rates are ok/equal for a given size
                    return ( (self.role, self.width, self.media_type, self.applies())
                             == (other.role, other.width, other.media_type, other.applies()) )
                def __str__( self ):
                    return( "(role=%s, width=%dpx, media_type=%s, rate=%dk, applies=%s)"
                            % (self.role, self.width, self.media_type, self.rate, str(self.applies())) )
            # We want a poster image, an mp4 substitute at source_width for non-mp4 sources
            # and a scaled-down mp4 substitute for big sources:
            missing_conversions = [
                ConversionDescription( role="poster", width=min(1280, source_width), media_type="image/jpeg" ),
                ConversionDescription( role="compatible", width=source_width, media_type="video/mp4", rate=source_rate,
                                       condition = lambda self: source_obj.media_type != "video/mp4" ),
                ConversionDescription( role="compatible", width=min(1280, source_width), media_type="video/mp4", rate=min(2000, source_rate),
                                       condition = lambda self: (self.width < (source_width*0.8)) and (self.rate < (source_rate*0.8)) ),
            ]
            c = app.db.cursor()
            # First we have to check whether the requested object is itself already a substitute object, ...
            c.execute( """select original_id from substitutes where substitute_id=?""", [source_obj.id] )
            if c.fetchone():
                # ... because no further substitute objects should be generated for substitute objects.
                missing_conversions = []
            c.execute( """select s.substitute_id, s.type, s.size, s.priority, sobj.id
                          from substitutes s left join objects sobj on sobj.id=s.substitute_id
                          where s.original_id=?""", [source_obj.id] )
            for row in c:
                substitute = { "substitute_id" : int(row[0]), "type" : row[1], "size" : int(row[2]), "priority" : int(row[3]) }
                sobj_id = row[4]
                if sobj_id==None:
                    # Clean up zombie substitutes (FIXME: should DBObject.delete() take care of this?):
                    del_c = app.db.cursor()
                    del_c.execute( """delete from substitutes where substitute_id=?""", [substitute["substitute_id"]] )
                else:
                    substitute_obj = db_object.DBObject( app, object_id=substitute["substitute_id"] )
                    conversion = ConversionDescription( role=substitute["type"], width=substitute["size"], media_type=substitute_obj.media_type )
                    if conversion in missing_conversions:
                        if substitute["type"]=="poster" and (new_poster_offset or new_poster_obj):
                            # Remove the existing poster substitute, since it is going to be redefined:
                            del_c = app.db.cursor()
                            del_c.execute( """delete from substitutes where original_id=? and substitute_id=?""",
                                           [source_obj.id, substitute_obj.id] )
                        else:
                            missing_conversions.remove( conversion )
                            results.append( substitute )
                    else:
                        results.append( substitute )
            error_list = []
            if mode == "convert":
                # Create all missing objects immediately and without data, to avoid duplicate conversions:
                new_objects = []
                for conversion in [x for x in missing_conversions if x.applies()]:
                    # Privilege escalation so that not only the owner of the target object can run this code:
                    app_old_user = app.user
                    app.user = user.get_admin_user(app)
                    existing_object_id = None
                    if conversion.role=="poster" and new_poster_obj:
                        existing_object_id = new_poster_obj.id
                    new_obj = files.File( app, object_id=existing_object_id, parent_id=source_obj.id, media_type=conversion.media_type )
                    if not existing_object_id:
                        new_obj.conversion = conversion
                        new_objects.append( new_obj )
                    substitute = { "substitute_id" : new_obj.id, "type" : conversion.role, "size" : conversion.width, "priority" : None }
                    results.append( substitute )
                    app.user = app_old_user
                    c = app.db.cursor()
                    c.execute( """insert into substitutes (original_id, substitute_id, type, size) values(?,?,?,?)""",
                               [source_obj.id, new_obj.id, conversion.role, conversion.width] )
                # Run the conversions for the newly created objects:
                for new_obj in new_objects:
                    # this may take a long time, so we have to commit first:
                    app.db.commit()
                    base_type, sub_type = new_obj.media_type.split("/")
                    new_tmp_name = new_obj.storage_path+".tmp."+sub_type
                    if( re.match(r"^video/.*", new_obj.media_type) ):
                        # Convert to the new width while keeping the aspect ratio:
                        # http://stackoverflow.com/questions/8218363/maintaining-ffmpeg-aspect-ratio
                        p = subprocess.Popen( ["ffmpeg", "-y", "-i", source_obj.storage_path,
                                               "-vf", "scale=%d:trunc(ow/a/2)*2" % (new_obj.conversion.width),
                                               "-r", "25", "-b", "%dk" % new_obj.conversion.rate,
                                               "-qmin", "0", "-strict", "-2", new_tmp_name],
                                              stdout=subprocess.PIPE, stderr=subprocess.PIPE )
                    elif( new_obj.conversion.role=="poster" and new_obj.media_type == "image/jpeg" ):
                        # Extract a preview image at the requested time index (default: 3 s):
                        p = subprocess.Popen( ["ffmpeg", "-y", "-i", source_obj.storage_path,
                                               "-vf", "scale=%d:trunc(ow/a/2)*2" % (new_obj.conversion.width),
                                               "-ss", str(new_poster_offset if new_poster_offset else 3),
                                               "-vframes", "1", new_tmp_name],
                                              stdout=subprocess.PIPE, stderr=subprocess.PIPE )
                    else:
                        raise NotImplementedError( "missing operation for conversion: %s" % (str(new_obj.conversion)) )
                    stdout, stderr = p.communicate()
                    if p.returncode!=0:
                        try:
                            # FIXME: move the delete logic into DBObject and the file cleanup into files.File:
                            # Privilege escalation so that not only the owner of the target object can run this code:
                            app_old_user = app.user
                            app.user = user.get_admin_user(app)
                            db_object.DBObject.delete_in( app, [new_obj.id] )
                            app.user = app_old_user
                            os.remove( new_tmp_name )
                            c = app.db.cursor()
                            c.execute( """delete from substitutes where original_id=? and substitute_id=?""",
                                       [source_obj.id, new_obj.id] )
                            results = [x for x in results if x["substitute_id"]!=new_obj.id]
                        except Exception as e:
                            error_list.append( e )
                        errmsg = stderr.decode().split("\n")[-1]
                        error_list.append( errors.InternalProgramError(errmsg) )
                    else:
                        os.rename( new_tmp_name, new_obj.storage_path )
            # Error handling:
            if error_list:
                msg = ""
                for error in error_list:
                    if msg:
                        msg += "; "
                    msg += str(error)
                raise errors.InternalProgramError( msg )
            else:
                for result in results:
                    result["substitute_object"] = get_module.get(app, object_ids=[result["substitute_id"]])
                response.output = json.dumps( {"succeeded": True, "substitutes": results} )
        elif re.match( r"^audio/.*", source_obj.media_type ):
            # TODO: add safe conversions
            results = []
            response.output = json.dumps( {"succeeded": True, "substitutes": results} )
        else:
            raise NotImplementedError( "unsupported media type: "+source_obj.media_type )
    else:
        raise errors.PrivilegeError()
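# Worked example (illustrative numbers only) for the width/bitrate selection above: a 1920 px wide
# source of 50 MB and 120 s duration yields roughly 3333 kbit/s, so the scaled-down substitute is
# planned at 1280 px and capped at 2000 kbit/s; its condition (width and rate both below 80 % of
# the source values) holds, so it applies.
def _example_conversion_plan():
    source_size = 50 * 1000 * 1000   # bytes (hypothetical)
    source_length = 120.0            # seconds (hypothetical)
    source_width = 1920              # pixels (hypothetical)
    source_rate = round( source_size * 8 / source_length / 1000 )                        # -> 3333 kbit/s
    scaled_width = min( 1280, source_width )                                              # -> 1280 px
    scaled_rate = min( 2000, source_rate )                                                # -> 2000 kbit/s
    applies = (scaled_width < source_width * 0.8) and (scaled_rate < source_rate * 0.8)   # -> True
    return scaled_width, scaled_rate, applies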
def process(app):
    query = app.query
    response = app.response
    session = app.session
    target_id = int(query.parms["id"])
    mode = query.parms["mode"] if "mode" in query.parms else "convert"
    if app.user.can_read(target_id):
        target_obj = files.File(app, object_id=target_id)
        if re.match(r"^video/.*", target_obj.media_type):
            new_poster_offset = float(query.parms["poster_offset"]) if "poster_offset" in query.parms else None
            new_poster_id = int(query.parms["poster_id"]) if "poster_id" in query.parms else None
            if new_poster_id and app.user.can_write(target_id) and app.user.can_read(new_poster_id):
                new_poster_obj = files.File(app, object_id=new_poster_id)
            else:
                new_poster_obj = None
            # We need at least webm (Firefox, Chrome) and mp4 (Safari, IE, Chrome) for reasonably
            # good client coverage.
            # Search the direct child objects for them:
            results = []
            missing_conversions = [
                ("poster", 480, "image/jpeg"),
                ("compatible", 480, "video/mp4"),
                ("compatible", 480, "video/webm"),
            ]
            c = app.db.cursor()
            # First we have to check whether the requested object is itself already a substitute object, ...
            c.execute("""select original_id from substitutes where substitute_id=?""", [target_obj.id])
            if c.fetchone():
                # ... because no further substitute objects should be generated for substitute objects.
                missing_conversions = []
            c.execute(
                """select s.substitute_id, s.type, s.size, s.priority, sobj.id
                   from substitutes s left join objects sobj on sobj.id=s.substitute_id
                   where s.original_id=?""",
                [target_obj.id],
            )
            for row in c:
                substitute = {
                    "substitute_id": int(row[0]),
                    "type": row[1],
                    "size": int(row[2]),
                    "priority": int(row[3]),
                }
                sobj_id = row[4]
                if sobj_id == None:
                    # Clean up zombie substitutes (FIXME: should DBObject.delete() take care of this?):
                    del_c = app.db.cursor()
                    del_c.execute("""delete from substitutes where substitute_id=?""", [substitute["substitute_id"]])
                    app.db.commit()
                else:
                    substitute_obj = db_object.DBObject(app, object_id=substitute["substitute_id"])
                    conversion = (substitute["type"], substitute["size"], substitute_obj.media_type)
                    if conversion in missing_conversions:
                        if substitute["type"] == "poster" and (new_poster_offset or new_poster_obj):
                            # Remove the existing poster substitute, since it is going to be redefined:
                            del_c = app.db.cursor()
                            del_c.execute(
                                """delete from substitutes where original_id=? and substitute_id=?""",
                                [target_obj.id, substitute_obj.id],
                            )
                            app.db.commit()
                        else:
                            missing_conversions.remove(conversion)
                            results.append(substitute)
            error_list = []
            if mode == "convert":
                # Create all missing objects immediately and without data, to avoid duplicate conversions:
                new_objects = []
                for conversion in missing_conversions:
                    conversion_type, conversion_size, new_media_type = conversion
                    # Privilege escalation so that not only the owner of the target object can run this code:
                    app_old_user = app.user
                    app.user = user.get_admin_user(app)
                    existing_object_id = None
                    if conversion_type == "poster" and new_poster_obj:
                        existing_object_id = new_poster_obj.id
                    new_obj = files.File(
                        app, object_id=existing_object_id, parent_id=target_obj.id, media_type=new_media_type
                    )
                    if not existing_object_id:
                        new_obj.conversion = conversion
                        new_objects.append(new_obj)
                    substitute = {
                        "substitute_id": new_obj.id,
                        "type": conversion_type,
                        "size": conversion_size,
                        "priority": None,
                    }
                    results.append(substitute)
                    app.user = app_old_user
                    c = app.db.cursor()
                    c.execute(
                        """insert into substitutes (original_id, substitute_id, type, size) values(?,?,?,?)""",
                        [target_obj.id, new_obj.id, conversion_type, conversion_size],
                    )
                    app.db.commit()
                # Run the conversions for the newly created objects:
                for new_obj in new_objects:
                    conversion_type, conversion_size, ignored = new_obj.conversion
                    base_type, sub_type = new_obj.media_type.split("/")
                    new_tmp_name = new_obj.storage_path + ".tmp." + sub_type
                    if re.match(r"^video/.*", new_obj.media_type):
                        # Convert with a conservative width of 480 px while keeping the aspect ratio:
                        # http://stackoverflow.com/questions/8218363/maintaining-ffmpeg-aspect-ratio
                        p = subprocess.Popen(
                            [
                                "ffmpeg", "-y",
                                "-i", target_obj.storage_path,
                                "-vf", "scale=%d:trunc(ow/a/2)*2" % (conversion_size),
                                "-r", "25",
                                "-b", "1000k",
                                "-qmin", "0",
                                "-strict", "-2",
                                new_tmp_name,
                            ],
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE,
                        )
                    elif conversion_type == "poster" and new_obj.media_type == "image/jpeg":
                        # Extract a preview image at time index 3 s (TODO: make the time index freely configurable?):
                        p = subprocess.Popen(
                            [
                                "ffmpeg", "-y",
                                "-i", target_obj.storage_path,
                                "-vf", "scale=%d:trunc(ow/a/2)*2" % (conversion_size),
                                "-ss", str(new_poster_offset if new_poster_offset else 3),
                                "-vframes", "1",
                                new_tmp_name,
                            ],
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE,
                        )
                    else:
                        raise NotImplementedError("missing operation for conversion: " + str(new_obj.conversion))
                    stdout, stderr = p.communicate()
                    if p.returncode != 0:
                        try:
                            # FIXME: move the delete logic into DBObject and the file cleanup into files.File:
                            # Privilege escalation so that not only the owner of the target object can run this code:
                            app_old_user = app.user
                            app.user = user.get_admin_user(app)
                            delete_module.delete_in(app, [new_obj.id])
                            app.user = app_old_user
                            os.remove(new_tmp_name)
                            c = app.db.cursor()
                            c.execute(
                                """delete from substitutes where original_id=? and substitute_id=?""",
                                [target_obj.id, new_obj.id],
                            )
                            app.db.commit()
                            results = [x for x in results if x["substitute_id"] != new_obj.id]
                        except Exception as e:
                            error_list.append(e)
                        errmsg = stderr.decode().split("\n")[-1]
                        error_list.append(errors.InternalProgramError(errmsg))
                    else:
                        os.rename(new_tmp_name, new_obj.storage_path)
            # Error handling:
            if error_list:
                msg = ""
                for error in error_list:
                    if msg:
                        msg += "; "
                    msg += str(error)
                raise errors.InternalProgramError(msg)
            else:
                for result in results:
                    result["substitute_object"] = get_module.get(app, object_ids=[result["substitute_id"]])
                response.output = json.dumps({"succeeded": True, "substitutes": results})
        elif re.match(r"^audio/.*", target_obj.media_type):
            raise NotImplementedError("unsupported media type: " + target_obj.media_type)
        else:
            raise NotImplementedError("unsupported media type: " + target_obj.media_type)
    else:
        raise errors.PrivilegeError()
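# Note on the scale filter used by both conversion handlers: "scale=%d:trunc(ow/a/2)*2" fixes the
# output width and derives the height from the output width divided by the aspect ratio, truncated
# to an even number of pixels (most encoders require even dimensions). A 1920x1080 source scaled to
# a width of 480 therefore becomes 480x270.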
def process( app ):
    query = app.query
    response = app.response
    session = app.session
    target_ids = [int(x) for x in query.parms["id"].split(",")]
    object_list = get_module.get( app, object_ids=target_ids )
    metainfo_list = []
    for target_id in target_ids:
        if app.user.can_read( target_id ):
            target_obj = files.File( app, object_id=target_id )
            if re.match( r"^video/.*", target_obj.media_type ) or re.match( r"^audio/.*", target_obj.media_type ):
                mplayer_id = {}
                p = subprocess.Popen( ["mplayer", "-identify", "-frames", "0", "-ao", "null", "-vo", "null", target_obj.storage_path],
                                      stdout=subprocess.PIPE, stderr=subprocess.PIPE )
                stdout, stderr = p.communicate()
                if p.returncode!=0:
                    errmsg = stderr.decode()
                    raise errors.InternalProgramError( errmsg )
                else:
                    for line in stdout.decode().split("\n"):
                        if line.startswith("ID_") and not line.startswith("ID_FILENAME"):
                            parts = line.split("=")
                            key = parts[0].lower()
                            value = "=".join(parts[1:])
                            populate_dict( mplayer_id, key, value, delim="_" )
                metainfo_list.append( {"id" : target_id, "mplayer" : mplayer_id} )
            elif re.match( r"^image/.*", target_obj.media_type ):
                exiv2_data = { "summary" : {} }
                image_info = {}  # substructure for permanently available metadata (e.g. width, height)
                p = subprocess.Popen( ["exiv2", target_obj.storage_path], stdout=subprocess.PIPE, stderr=subprocess.PIPE )
                stdout, stderr = p.communicate()
                if p.returncode not in (0, 253):
                    errmsg = stderr.decode()
                    raise errors.InternalProgramError( errmsg )
                else:
                    for line in stdout.split(b"\n"):
                        try:
                            line = line.decode()
                        except UnicodeDecodeError:
                            continue
                        result = re.findall( "([^:]+):(.*)", line )
                        try:
                            key, value = result[0]
                        except IndexError:
                            continue
                        key = key.strip().replace(" ","_")
                        if( key in ["File_name"] ):
                            continue
                        value = value.strip()
                        exiv2_data[ "summary" ][ key ] = value
                        if( key=="Image_size" ):
                            x, y = value.split("x")
                            x = int(x.strip())
                            y = int(y.strip())
                            image_info["width"] = x    # .image.width
                            image_info["height"] = y   # .image.height
                p = subprocess.Popen( ["exiv2", "-pa", target_obj.storage_path], stdout=subprocess.PIPE, stderr=subprocess.PIPE )
                stdout, stderr = p.communicate()
                if p.returncode not in (0, 253):
                    errmsg = stderr.decode()
                    raise errors.InternalProgramError( errmsg )
                else:
                    for line in stdout.decode().split("\n"):
                        result = re.findall( "([^ ]+)[ ]+([^ ]+)[ ]+([^ ]+)[ ]+([^ ].*)", line )
                        try:
                            key, tag_type, count, value = result[0]
                        except IndexError:
                            continue
                        populate_dict( exiv2_data, key, value )
                metainfo_list.append( {"id" : target_id, "exiv2" : exiv2_data, "image" : image_info} )
            else:
                raise NotImplementedError( "unsupported media type: "+target_obj.media_type )
        else:
            raise errors.PrivilegeError()
    for metainfo in metainfo_list:
        for obj in object_list:
            if obj["id"] == metainfo["id"]:
                obj.update( metainfo )
    response.output = json.dumps( {"succeeded" : True, "objects" : object_list} )
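# populate_dict() is used above but not defined in this listing. A minimal sketch of what it
# presumably does, inferred from how its results are consumed (e.g. meta["mplayer"]["id"]["video"]["width"]
# in the conversion handler): split the key on a delimiter and store the value in a nested dict.
# The real implementation may differ, e.g. in its default delimiter or in type handling.
def populate_dict( target, key, value, delim="." ):
    parts = key.split( delim )
    node = target
    for part in parts[:-1]:
        node = node.setdefault( part, {} )   # descend, creating intermediate dicts as needed
    node[ parts[-1] ] = value

# Example: populate_dict( d, "id_video_width", "1920", delim="_" )  ->  d["id"]["video"]["width"] == "1920"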
def search( app, search_phrase, result_types=[], min_weight=0, order_by=None, order_reverse=True,
            range_offset=0, range_limit=None, recursive=(False,False), max_phrase_word_dist=3 ):
    q = app.query
    # 1.) Parse the search expression and initialize data structures:
    phrase_parts = []
    current_word = ""
    in_phrase = False
    phrase_start_char = None
    for i,c in enumerate( search_phrase ):
        if c in " \t\r\n" and not in_phrase:
            if current_word:
                phrase_parts.append( current_word )
                current_word = ""
        elif c in ['"',"'"] and not in_phrase:
            in_phrase = True
            phrase_start_char = c
            current_word += c
        elif in_phrase and c==phrase_start_char:
            in_phrase = False
            phrase_start_char = None
            current_word += c
        else:
            current_word += c
    if current_word:
        phrase_parts.append( current_word )
    search_words = []
    for part in phrase_parts:
        match = re.fullmatch( r"([+-]*)((?:[\w]+:)*)(.+)", part, re.DOTALL )
        _word = match.group(3)
        phrase_match = re.fullmatch( r'"([^"]*)"?(?:\[([0-9]+)\])?', _word )
        if not phrase_match:
            phrase_match = re.fullmatch( r"'([^']*)'?(?:\[([0-9]+)\])?", _word )
        if phrase_match:
            _word = None
            _phrase = phrase_match.group(1)
            try:
                _phrase_max_word_dist = int(phrase_match.group(2))
            except TypeError:
                _phrase_max_word_dist = max_phrase_word_dist
        else:
            _phrase = None
            _phrase_max_word_dist = max_phrase_word_dist
        word = { "weight" : match.group(1),
                 "type" : match.group(2),
                 "word" : _word,
                 "phrase" : _phrase,
                 "phrase_max_word_dist" : _phrase_max_word_dist,
                 "raw_word" : part }
        search_words.append( word )
    raw_results = {}
    c = app.db.cursor()
    search_word_rows = {}
    search_word_hits = {}  # counts hits per search word in the filtered end result
    # 2.) Look up the individual search terms (with optional type binding) in the word index and
    #     cache the matched object ids together with the search term weight:
    for i, search_word in enumerate(search_words):
        # Parse the optional weight prefix made of '-' and '+': a more positive prefix rates hits of
        # the search word higher, a more negative prefix rates hits of the word's exclusion set higher:
        weight_prefix = search_word["weight"]
        word_weight = sum( [(lambda x: 10 if x=='+' else -10)(c) for c in weight_prefix] ) + (10 if not weight_prefix else 0)
        # Parse the optional type selector of the form <[type1:[type2:[...]]]word>:
        word_types = search_word["type"].split(":")[:-1]
        type_query = ""
        type_names = []
        for j, word_type in enumerate(word_types):
            if j:
                type_query += " or "
            if word_type in search_type_alias:
                type_query += "o.type like ?"
                type_names.append( search_type_alias[word_type] )
            else:
                type_query += "k0.scan_source=?"
                type_names.append( word_type )
        if type_query:
            type_query = "and (" + type_query + ")"
        search_word_rows[ search_word["raw_word"] ] = 0
        search_word_hits[ search_word["raw_word"] ] = 0
        if search_word["word"]:
            word = search_word["word"]
            c.execute( """select object_id, word, pos, scan_source, o.type
                          from keywords k0 inner join objects o on o.id=object_id
                          where word like ? %(type_query)s
                          order by object_id, pos""" % locals(),
                       [word]+type_names )
        elif search_word["phrase"]:
            phrase = search_word["phrase"]
            phrase_max_word_dist = search_word["phrase_max_word_dist"]
            phrase_words = phrase.split()
            phrase_joins = []
            phrase_queries = []
            for i,phrase_word in enumerate(phrase_words):
                if i>0:
                    prev_i = i-1
                    phrase_joins.append( """ inner join keywords k%(i)d
                                             on k0.object_id=k%(i)d.object_id
                                             and k0.scan_source=k%(i)d.scan_source
                                             and abs(k%(i)d.pos-k%(prev_i)d.pos)<=%(phrase_max_word_dist)d""" % locals() )
                    phrase_queries.append( "and k%(i)d.word like ?" % locals() )
                else:
                    phrase_queries.append( "k%(i)d.word like ?" % locals() )
            s_phrase_joins = "\n".join( phrase_joins )
            s_phrase_queries = "\n".join( phrase_queries )
            c.execute( """select k0.object_id, '', k0.pos, k0.scan_source, o.type
                          from keywords k0 inner join objects o on o.id=k0.object_id
                          %(s_phrase_joins)s
                          where %(s_phrase_queries)s %(type_query)s
                          order by k0.object_id, k0.pos""" % locals(),
                       phrase_words+type_names )
        for row in c:
            search_word_rows[ search_word["raw_word"] ] += 1
            object_id, result_word, pos, scan_source, object_type = row
            hit = { "object_id" : object_id,
                    "result_word" : result_word,
                    "pos" : pos,
                    "scan_source" : scan_source,
                    "object_type" : object_type,
                    "search_word" : search_word["raw_word"],
                    "keyword" : search_word["word"],
                    "weight" : word_weight,
                    "extra_reasons" : { "valid_types" : [], "associated_to" : [] } }
            if object_id in raw_results:
                raw_results[object_id].append( hit )
            else:
                raw_results[object_id] = [ hit ]
    # 3.) Do an access check, filter the hit list accordingly and extend it where necessary with
    #     parent and child objects of a matching type, so that e.g. blog entries are found for
    #     plain/text objects matching the full-text search, or posts are found via matching user names:
    filtered_results = {}
    for result_id in raw_results:
        for hit in raw_results[result_id]:
            object_id = hit["object_id"]
            object_type = hit["object_type"]
            search_word = hit["search_word"]
            if app.user.can_read( object_id ):
                direct_hit = False
                if object_type in result_types or "file" in result_types and files.File.supports(app, object_type) or not result_types:
                    c = app.db.cursor()
                    # First check whether the found object is a substitute object, because
                    # substitute objects should not be returned as hits.
                    c.execute( """select original_id from substitutes where substitute_id=?""", [object_id] )
                    if c.fetchone()==None:
                        direct_hit = True
                        hit["extra_reasons"]["valid_types"].append( object_type )
                        if object_id in filtered_results:
                            filtered_results[object_id].append( hit )
                        else:
                            filtered_results[object_id] = [ hit ]
                        search_word_hits[search_word] += 1
                if not direct_hit:
                    obj = db_object.DBObject( app, object_id )
                    matching_associates = obj.resolve_parents( parent_type_set=set(result_types) ) + obj.resolve_children( child_type_set=set(result_types) )
                    for alt_obj_id in matching_associates:
                        if app.user.can_read( alt_obj_id ):
                            c = app.db.cursor()
                            # First check whether the found object is a substitute object, because
                            # substitute objects should not be returned as hits.
                            c.execute( """select original_id from substitutes where substitute_id=?""", [alt_obj_id] )
                            if c.fetchone()==None:
                                hit["extra_reasons"]["associated_to"].append( alt_obj_id )
                                if alt_obj_id in filtered_results:
                                    filtered_results[alt_obj_id].append( hit )
                                else:
                                    filtered_results[alt_obj_id] = [ hit ]
                                search_word_hits[search_word] += 1
    # 4.) Sort the hits
    if order_by=="weight" or min_weight!=None:
        # a) Relevance sorting/filtering, where:
        #    - the number of matching search terms amplifies the weight: len(filtered_results[x])
        #    - the total number of hits of all matching search terms attenuates it: /sum(...)
        sort_key = lambda x: ( (1+sum([h["weight"] for h in filtered_results[x]]))
                               * len(filtered_results[x])
                               / max(1,sum([search_word_hits[sw] for sw in set([h["search_word"] for h in filtered_results[x]])])) )
        hit_weights = [(hit_id,sort_key(hit_id)) for hit_id in filtered_results]
        if order_by=="weight":
            hit_weights = sorted( hit_weights, key=lambda x: x[1], reverse=order_reverse )
        # b) Filter hits by minimum weight, if defined:
        if min_weight!=None:
            hit_weights = [x for x in hit_weights if x[1]>min_weight]
    else:
        hit_weights = [(hit_id,0) for hit_id in filtered_results]
    hit_id_list = [x[0] for x in hit_weights]
    if order_by in ("id","ctime","mtime"):
        # c) Sort by timestamp as efficiently as possible in SQL, if requested:
        order_dir = "desc" if order_reverse else "asc"
        hit_id_list_string = ",".join( [str(x) for x in hit_id_list] )
        c.execute( """select id, ctime, mtime from objects
                      where id in (%(hit_id_list_string)s)
                      order by %(order_by)s %(order_dir)s""" % locals() )
        hit_id_list = [row[0] for row in c]
    # 7.) Trim the pre-sorted object-id list, if requested:
    hit_id_list = hit_id_list[range_offset:None if range_limit==None else range_offset+range_limit]
    # 8.) Optionally do a recursive lookup of parent and child objects of the reduced hit list:
    hitlist = []
    if hit_id_list:
        hitlist = get.get( app, object_ids=hit_id_list, recursive=recursive, access_errors=False )
    # 9.) JSON-encode the result:
    result = {
        # "hit_weights" : hit_weights,
        # "reasons" : {},
        "hitlist" : hitlist,
        "search_word_rows" : search_word_rows,
        "search_word_hits" : search_word_hits,
    }
    # for hit_id,hit_weight in hit_weights:
    #     result["reasons"][hit_id] = filtered_results[hit_id]
    app.response.output = json.dumps( result )
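# Query syntax accepted by the parser above (examples; the concrete type names depend on the
# entries of search_type_alias and on the scan_source values present in the keyword index, so
# "image" below is only a placeholder):
#
#   holiday photos           two independent terms with default weight
#   ++holiday -work          each '+' adds weight to a term's hits, each '-' subtracts weight
#   image:holiday            bind the term to an object type alias or to a scan_source
#   "summer holiday"         quoted phrase; the words must occur close to each other
#   "summer holiday"[5]      same, but allow up to 5 positions between consecutive phrase words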
def search( app, search_phrase, result_types=[], min_weight=0, order_by=None, order_reverse=True,
            range_offset=0, range_limit=None, recursive=(False,False), max_phrase_word_dist=3,
            exact_includes=True, exact_excludes=True ):
    q = app.query
    # 1.) Parse the search expression and initialize data structures:
    phrase_parts = []
    current_word = ""
    in_phrase = False
    phrase_start_char = None
    for i,c in enumerate( search_phrase ):
        if c in " \t\r\n" and not in_phrase:
            if current_word:
                phrase_parts.append( current_word )
                current_word = ""
        elif c in ['"',"'"] and not in_phrase:
            in_phrase = True
            phrase_start_char = c
            current_word += c
        elif in_phrase and c==phrase_start_char:
            in_phrase = False
            phrase_start_char = None
            current_word += c
        else:
            current_word += c
    if current_word:
        phrase_parts.append( current_word )
    search_words = []
    for part in phrase_parts:
        match = re.fullmatch( r"([?]?)([+-]*)((?:[\w]+:)*)(.+)", part, re.DOTALL )
        optional = match.group(1)=="?"
        weight_prefix = match.group(2)
        word_weight = sum( [(lambda x: 10 if x=='+' else -10)(c) for c in weight_prefix] ) + (10 if weight_prefix=="" else 0)
        _word = match.group(4)
        phrase_match = re.fullmatch( r'([?]?)"([^"]*)"?(?:\[([0-9]+)\])?', _word )
        if not phrase_match:
            phrase_match = re.fullmatch( r"([?]?)'([^']*)'?(?:\[([0-9]+)\])?", _word )
        if phrase_match:
            _word = None
            _phrase = phrase_match.group(2)
            try:
                _phrase_max_word_dist = int(phrase_match.group(3))
            except TypeError:
                _phrase_max_word_dist = max_phrase_word_dist
        else:
            _phrase = None
            _phrase_max_word_dist = max_phrase_word_dist
        word = { "optional" : optional,
                 "weight" : word_weight,
                 "type" : match.group(3),
                 "word" : _word,
                 "phrase" : _phrase,
                 "phrase_max_word_dist" : _phrase_max_word_dist,
                 "raw_word" : part }
        search_words.append( word )
    raw_results = {}
    c = app.db.cursor()
    search_word_rows = {}
    search_word_hits = {}  # counts hits per search word in the filtered end result
    # 2.) Look up the individual search terms (with optional type binding) in the word index and
    #     cache the matched object ids together with the search term weight:
    for i, search_word in enumerate(search_words):
        # The optional weight prefix made of '-' and '+' was already parsed above: a more positive prefix
        # rates hits of the search word higher, a more negative prefix rates hits of its exclusion set higher:
        word_weight = search_word["weight"]
        # Parse the optional type selector of the form <[type1:[type2:[...]]]word>:
        word_types = search_word["type"].split(":")[:-1]
        type_query = ""
        type_names = []
        time_col = None
        for j, word_type in enumerate(word_types):
            if word_type in search_type_alias:
                if len(type_names):
                    type_query += " or "
                type_query += "o.type like ?"
                type_names.append( search_type_alias[word_type] )
            elif word_type in time_types:
                time_col = word_type
            else:
                if len(type_names):
                    type_query += " or "
                type_query += "k0.scan_source=?"
                type_names.append( word_type )
        if type_query:
            type_query = "and (" + type_query + ")"
        search_word_rows[ search_word["raw_word"] ] = 0
        search_word_hits[ search_word["raw_word"] ] = 0
        if time_col:
            # time queries have the form ctime:201707 ("created some time in july 2017")
            # or mtime:<2017 ("last modified before 2017")
            time_query_string = search_word["word"] if search_word["word"] else search_word["phrase"]
            # lib.application replaces some xml control characters with entities, we have to fix that now:
            time_query_string = time_query_string.replace("&amp;","&").replace("&gt;",">").replace("&lt;","<")
            time_query_string = re.sub( "[-:. ]", "", time_query_string )  # eg: 2017-07-23 15:38 -> 201707231538
            match = re.fullmatch( r"(<|>|<=|>=|=)?([0-9]+)", time_query_string )
            if not match:
                raise errors.ParameterError( "Illegal time query, try something like ctime:2017-01" )
            time_op = match.group(1) or "="
            time_string = match.group(2)
            attr_list = ["year"]
            if len(time_string)>=6:  attr_list.append( "month" )
            if len(time_string)>=8:  attr_list.append( "day" )
            if len(time_string)>=10: attr_list.append( "hour" )
            if len(time_string)>=12: attr_list.append( "minute" )
            if len(time_string)>=14: attr_list.append( "second" )
            time_pattern_dict = { "year":"%Y", "month":"%m", "day":"%d", "hour":"%H", "minute":"%M", "second":"%S" }
            # generate time parse pattern, eg.: %Y%m%d
            time_pattern = "".join( [time_pattern_dict[x] for x in attr_list] )
            time_range_begin = datetime.datetime.strptime( time_string, time_pattern )
            time_range_after = None
            if time_op == "=":
                # for time range equality comparisons we need to determine the end of the range:
                while not time_range_after:
                    try:
                        # try to generate the first date not matching the time string (but 20171232 or 201713 would be illegal...)
                        last_attr = attr_list.pop()
                        ref_time = datetime.datetime.strptime( time_string, time_pattern )
                        time_range_after = ref_time.replace( **{last_attr:getattr(time_range_begin,last_attr)+1} )
                    except ValueError:
                        # for illegal cases try again one level higher, eg: 20171232 -> 201713 -> 2018
                        time_string = time_string[:-2]
                        time_pattern = time_pattern[:-2]
                c.execute( """select object_id, o.%(time_col)s, pos, scan_source, o.type
                              from keywords k0 inner join objects o on o.id=object_id
                              where %(time_col)s>=? and %(time_col)s<? %(type_query)s
                              order by object_id, pos""" % locals(),
                           [time_range_begin.timestamp(), time_range_after.timestamp()] + type_names )
            else:
                c.execute( """select object_id, o.%(time_col)s, pos, scan_source, o.type
                              from keywords k0 inner join objects o on o.id=object_id
                              where %(time_col)s %(time_op)s ? %(type_query)s
                              order by object_id, pos""" % locals(),
                           [time_range_begin.timestamp()] + type_names )
        elif search_word["word"]:
            word = search_word["word"]
            c.execute( """select object_id, word, pos, scan_source, o.type
                          from keywords k0 inner join objects o on o.id=object_id
                          where word like ? %(type_query)s
                          order by object_id, pos""" % locals(),
                       [word]+type_names )
        elif search_word["phrase"]:
            phrase = search_word["phrase"]
            phrase_max_word_dist = search_word["phrase_max_word_dist"]
            phrase_words = phrase.split()
            phrase_joins = []
            phrase_queries = []
            result_phrase_field_string = ""
            for i,phrase_word in enumerate(phrase_words):
                if i>0:
                    prev_i = i-1
                    phrase_joins.append( """ inner join keywords k%(i)d
                                             on k0.object_id=k%(i)d.object_id
                                             and k0.scan_source=k%(i)d.scan_source
                                             and abs(k%(i)d.pos-k%(prev_i)d.pos)<=%(phrase_max_word_dist)d""" % locals() )
                    phrase_queries.append( "and k%(i)d.word like ?" % locals() )
                    result_phrase_field_string += "||' '||k%(i)d.word" % locals()
                else:
                    phrase_queries.append( "k%(i)d.word like ?" % locals() )
                    result_phrase_field_string += "k%(i)d.word" % locals()
            s_phrase_joins = "\n".join( phrase_joins )
            s_phrase_queries = "\n".join( phrase_queries )
            c.execute( """select k0.object_id, %(result_phrase_field_string)s, k0.pos, k0.scan_source, o.type
                          from keywords k0 inner join objects o on o.id=k0.object_id
                          %(s_phrase_joins)s
                          where %(s_phrase_queries)s %(type_query)s
                          order by k0.object_id, k0.pos""" % locals(),
                       phrase_words+type_names )
        for row in c:
            search_word_rows[ search_word["raw_word"] ] += 1
            object_id, result_word, pos, scan_source, object_type = row
            if not type(word_weight)==int: raise errors.StateError(type(word_weight))
            if type(search_word["word"]) not in (str,type(None)): raise errors.StateError(type(search_word["word"]))
            if type(result_word) not in (str,int): raise errors.StateError(type(result_word))
            if not type(pos)==int: raise errors.StateError(type(pos))
            if not type(scan_source)==str: raise errors.StateError(type(scan_source))
            if not type(search_word["raw_word"])==str: raise errors.StateError(type(search_word["raw_word"]))
            hit = { "object_id" : object_id,
                    "object_type" : object_type,
                    "reasons" : { ( object_id,                 # object_id
                                    word_weight,               # weight
                                    search_word["word"],       # keyword
                                    result_word,               # result_word
                                    scan_source,               # scan_source
                                    search_word["raw_word"],   # raw_word
                                  ) },
                    "associated_to" : set() }
            if object_id in raw_results:
                raw_results[object_id]["reasons"] = raw_results[object_id]["reasons"].union( hit["reasons"] )
            else:
                raw_results[object_id] = hit
    # 3.) Do an access check, filter the hit list accordingly and extend it where necessary with
    #     parent and child objects of a matching type, so that e.g. blog entries are found for
    #     plain/text objects matching the full-text search, or posts are found via matching user names:
    filtered_results = {}
    for object_id in raw_results:
        hit = raw_results[ object_id ]
        object_type = hit["object_type"]
        if app.user.can_read( object_id ):
            direct_hit = False
            if object_type in result_types or "file" in result_types and files.File.supports(app, object_type) or not result_types:
                c = app.db.cursor()
                # First check whether the found object is a substitute object, because
                # substitute objects should not be returned as hits.
                c.execute( """select original_id from substitutes where substitute_id=?""", [object_id] )
                if c.fetchone()==None:
                    direct_hit = True
                    if object_id not in filtered_results:
                        filtered_results[object_id] = hit
                    else:
                        # Merge existing results reason set:
                        # this is not going to happen as long as we iterate over raw_results and that is a dictionary, but who knows...
                        filtered_results[object_id]["reasons"] = filtered_results[object_id]["reasons"].union( hit["reasons"] )
                    for reason in hit["reasons"]:
                        object_id, weight, keyword, result_word, scan_source, raw_word = reason
                        search_word_hits[ raw_word ] += 1
            if not direct_hit:
                obj = db_object.DBObject( app, object_id )
                matching_associates = obj.resolve_parents( parent_type_set=set(result_types) ) + obj.resolve_children( child_type_set=set(result_types) )
                for alt_obj_id in matching_associates:
                    if app.user.can_read( alt_obj_id ):
                        c = app.db.cursor()
                        # First check whether the found object is a substitute object, because
                        # substitute objects should not be returned as hits.
c.execute( """select original_id from substitutes where substitute_id=?""", [alt_obj_id] ) if c.fetchone()==None: hit["associated_to"].add( object_id ) if alt_obj_id not in filtered_results: filtered_results[alt_obj_id] = hit else: # Merge existing results reason set: filtered_results[alt_obj_id]["reasons"] = filtered_results[alt_obj_id]["reasons"].union( hit["reasons"] ) for reason in hit["reasons"]: object_id, weight, keyword, result_word, scan_source, raw_word = reason search_word_hits[ raw_word ] += 1 # 4.) Treffer sortieren if order_by=="weight" or min_weight!=None or exact_includes or exact_excludes: # a) Relevanzsortierung/Relevanzfilterung, wobei die: # - gewichtete Anzahl treffender Suchbegriffe verstärkend wirkt: weighted_reason_sum # - Gesamtzahl der Treffer aller treffenden Suchbegriffe abschwächend wirken: search_word_hit_sum def sort_key( object_id ): hit = filtered_results[ object_id ] weighted_reason_sum = 0 search_word_hit_sum = 0 all_positive_terms_found = True no_negative_terms_found = True for reason in hit["reasons"]: object_id, weight, keyword, result_word, scan_source, raw_word = reason weighted_reason_sum += 1*weight search_word_hit_sum += search_word_hits[ raw_word ] no_negative_terms_found = no_negative_terms_found and weight>=0 if exact_includes: for search_word in search_words: if not search_word["optional"] and search_word["weight"]>=0: positive_term_found = False for reason in hit["reasons"]: object_id, weight, keyword, result_word, scan_source, raw_word = reason if search_word["raw_word"]==raw_word: positive_term_found = True break all_positive_terms_found = all_positive_terms_found and positive_term_found if all_positive_terms_found==False: break hit_weight = weighted_reason_sum / (1+search_word_hit_sum) return (hit_weight, all_positive_terms_found, no_negative_terms_found) hit_weights = [(hit_id,sort_key(hit_id)) for hit_id in filtered_results] if order_by=="weight": hit_weights = sorted( hit_weights, key=lambda x: x[1], reverse=order_reverse ) # b) Exclude hits, if below min_weight, when defined: if min_weight!=None: hit_weights = [x for x in hit_weights if x[1][0]>min_weight] # c) Exclude hits not matching all positive search terms if required if exact_includes: hit_weights = [x for x in hit_weights if x[1][1]==True] # d) Exclude hits matching at least one negative search term if required if exact_excludes: hit_weights = [x for x in hit_weights if x[1][2]==True] else: hit_weights = [(hit_id,0) for hit_id in filtered_results] hit_id_list = [x[0] for x in hit_weights] if order_by in ("id","ctime","mtime"): # c) Möglichst effiziente SQL-Sortierung nach Zeitstempel durchführen, falls gewünscht: order_dir = "desc" if order_reverse else "asc" hit_id_list_string = ",".join( [str(x) for x in hit_id_list] ) c.execute( """select id, ctime, mtime from objects where id in (%(hit_id_list_string)s) order by %(order_by)s %(order_dir)s""" % locals() ) hit_id_list = [row[0] for row in c] # 7.) Vorsortierte Objekt-ID-Liste beschneiden, falls gefordert: hit_id_list = hit_id_list[range_offset:None if range_limit==None else range_offset+range_limit] # 8.) Ggf. 
rekursiver Lookup von Eltern- und Kind-Objekten der reduzierten Trefferliste: hitlist = [] if hit_id_list: hitlist = get.get( app, object_ids=hit_id_list, recursive=recursive, access_errors=False ) for hit in hitlist: hit["reasons"] = list( filtered_results[hit["id"]]["reasons"] ) #hit["associated_to"] = get.get( app, object_ids=list(filtered_results[hit["id"]]["associated_to"]), recursive=(False,False), access_errors=False ) hit["weight"] = [x[1] for x in hit_weights if x[0]==hit["id"]][0] # 9.) Ergebnis JSON-kodieren: result = { "hitlist" : hitlist, "search_word_rows" : search_word_rows, "search_word_hits" : search_word_hits, } app.response.output = json.dumps( result )
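# Additional syntax supported by this newer search() variant (examples; "ctime"/"mtime" are the
# timestamp columns queried above and are expected to be listed in time_types):
#
#   ?holiday                 optional term: hits need not contain it even when exact_includes is set
#   ctime:2017-07            objects created some time in July 2017 (the range follows the precision given)
#   mtime:<2017              objects last modified before 2017 (operators <, >, <=, >=, = are accepted)
#   beach -crowded           with exact_excludes, hits matching a negative term are dropped entirely;
#                            with exact_includes, hits must match every non-optional positive term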