Пример #1
0
def process( app ):
	"""Execute a workflow action on an entry or draft object.

	Expects query.parms["id"] (object id) and query.parms["method"].
	Supported methods: "create_draft" on Entry objects; "publish" and
	"merge_to_parent" on Draft objects. Writes a JSON-encoded result to
	app.response.output.

	Raises errors.PrivilegeError when the user may not read the object and
	errors.ParameterError for unsupported type/method combinations.
	"""
	query = app.query
	obj_id = query.parms["id"]
	if not app.user.can_read( obj_id ):
		raise errors.PrivilegeError( "%d cannot read %d" % (app.user.id, obj_id) )
	obj = db_object.DBObject.create_typed_object( app=app, object_id=obj_id )
	result = {}
	# dispatch strictly on the concrete type (intentionally not isinstance,
	# so Entry and Draft get distinct method sets):
	if type(obj)==entry.Entry:
		method = query.parms["method"]
		if method != "create_draft":
			raise errors.ParameterError( "Unsupported method for type" )
		new_draft = obj.create_draft()
		fetched = get_module.get(app=app, object_ids=[new_draft.id], recursive=(True,True))
		result = { "succeeded" : True, "draft" : fetched[0] }
	elif type(obj)==entry.Draft:
		method = query.parms["method"]
		if method == "publish":
			entry_id = obj.publish()
		elif method == "merge_to_parent":
			entry_id = obj.merge_to_parent()
		else:
			raise errors.ParameterError( "Unsupported method for type" )
		fetched = get_module.get(app=app, object_ids=[entry_id], recursive=(True,True))
		result = { "succeeded" : True, "entry" : fetched[0] }
	else:
		raise errors.ParameterError( "Object with unsupported type" )
	app.response.output = json.dumps( result )
Пример #2
0
	def run( self ):
		"""Thread main loop: push the 10 most recent objects over the websocket
		every 30 seconds until self.quitting is set, then close this thread's
		DB connection (committing pending changes)."""
		self.app.open_db() # re-open the app DB for this thread...
		while not self.quitting:
			objs = get_module.get( self.app, limit=10 )
			# NOTE(review): str() of a dict yields Python repr (single quotes),
			# not JSON — confirm the websocket client really expects this format.
			message = str( {'succeeded': True, 'objects': objs} )
			self.websocket.send( message )
			time.sleep(30)
		self.app.close_db( commit=True ) # also commits pending changes
Пример #3
0
def process( app ):
	"""Identify one or more file objects and return them with merged metadata.

	query.parms["id"] is a comma-separated list of object ids. Each object is
	identified via files.File.identify() and the resulting metainfo is merged
	into the corresponding entry of the fetched object list. The combined
	result is written as JSON to app.response.output.

	Raises errors.PrivilegeError when any requested object is not readable.
	"""
	query = app.query
	response = app.response
	session = app.session
	target_ids = [int(x) for x in query.parms["id"].split(",")]
	object_list = get_module.get( app, object_ids=target_ids )
	metainfo_list = []
	for tid in target_ids:
		# abort on the first unreadable object:
		if not app.user.can_read( tid ):
			raise errors.PrivilegeError()
		metainfo_list.append( files.File( app, object_id=tid ).identify() )
	# merge each metainfo dict into every object entry with a matching id:
	for info in metainfo_list:
		for entry_obj in object_list:
			if entry_obj["id"] == info["id"]:
				entry_obj.update( info )
	response.output = json.dumps( {"succeeded" : True, "objects" : object_list} )
Пример #4
0
def process( app ):
	"""Ensure web-compatible substitute objects (poster image, mp4 renditions)
	exist for a video source object and return them as JSON.

	Query parameters:
	  id            -- source object id (required)
	  mode          -- "convert" (default) actually creates missing substitutes;
	                   any other value only reports the existing ones
	  poster_offset -- optional float (seconds) for a new poster frame
	  poster_id     -- optional object id to use as custom poster image

	Raises errors.PrivilegeError when the user may not read the source object,
	NotImplementedError for unsupported media types and
	errors.InternalProgramError when a conversion fails.
	"""
	query = app.query
	response = app.response
	session = app.session
	source_id = int( query.parms["id"] )
	mode = query.parms["mode"] if "mode" in query.parms else "convert"
	if app.user.can_read( source_id ):
		source_obj = files.File( app, object_id=source_id )
		if re.match( r"^video/.*", source_obj.media_type ):
			new_poster_offset = float(query.parms["poster_offset"]) if "poster_offset" in query.parms else None
			new_poster_id = int(query.parms["poster_id"]) if "poster_id" in query.parms else None
			# a custom poster needs write access on the source and read access on the poster object:
			if new_poster_id and app.user.can_write( source_id ) and app.user.can_read( new_poster_id ):
				new_poster_obj = files.File( app, object_id=new_poster_id )
			else:
				new_poster_obj = None
			# to determine sizes and bitrates of the substitute objects we first identify the original object:
			source_size = source_obj.get_size()
			source_meta = source_obj.identify()
			source_width = int( source_meta["mplayer"]["id"]["video"]["width"] )
			# overall source bitrate in kbit/s derived from file size and duration:
			source_rate = round(source_size*8/float(source_meta["mplayer"]["id"]["length"])/1000)
			results = []
			class ConversionDescription:
				# Describes one desired substitute: role, target width, media type,
				# target bitrate (kbit/s) and an applicability condition.
				def __init__( self, role, width, media_type, rate=0, condition=lambda x: True ):
					self.role = role
					self.width = width
					self.media_type = media_type
					self.rate = rate
					self.condition = condition
				def applies( self ):
					return self.condition(self)
				def __eq__( self, other ):
					# rate is not part of equality, because this is used to compare conversion candidates with already existing substitutes 
					# and bitrate is deemed to specific for that purpose; we just assume rates are ok/equal for a given size
					return (	(self.role, self.width, self.media_type, self.applies()) ==
								(other.role, other.width, other.media_type, other.applies()) )
				def __str__( self ):
					return( "(role=%s, width=%dpx, media_type=%s, rate=%dk, applies=%s)" % (self.role, self.width, self.media_type, self.rate, str(self.applies())) )
			# we want a poster image, an mp4-substitute in source_width for non-mp4 sources and a scaled down mp4-substitute for big sources:
			missing_conversions = [
				ConversionDescription( role="poster", width=min(1280, source_width), media_type="image/jpeg" ),
				ConversionDescription( role="compatible", width=source_width, media_type="video/mp4", rate=source_rate,
						  condition = lambda self:  source_obj.media_type != "video/mp4" ),
				ConversionDescription( role="compatible", width=min(1280, source_width), media_type="video/mp4", rate=min(2000, source_rate),
						  condition = lambda self: (self.width < (source_width*0.8)) and (self.rate < (source_rate*0.8)) ),
			]
			c = app.db.cursor()
			# First check whether the requested object is itself already a substitute object, ...
			c.execute( """select original_id from substitutes where substitute_id=?""", [source_obj.id] )
			if c.fetchone():
				# ... because no further substitutes should be generated for substitute objects.
				missing_conversions = []
			c.execute( """select s.substitute_id, s.type, s.size, s.priority, sobj.id from substitutes s
							left join objects sobj on sobj.id=s.substitute_id
							where s.original_id=?""", [source_obj.id] )
			for row in c:
				substitute = { "substitute_id" : int(row[0]), "type" : row[1], "size" : int(row[2]), "priority" : int(row[3]) }
				sobj_id = row[4]
				if sobj_id==None:
					# clean up zombie substitutes whose object no longer exists
					# (FIXME: should DBObject.delete() handle this?):
					del_c = app.db.cursor()
					del_c.execute( """delete from substitutes where substitute_id=?""", [substitute["substitute_id"]] )
				else:
					substitute_obj = db_object.DBObject( app, object_id=substitute["substitute_id"] )
					conversion = ConversionDescription( role=substitute["type"], width=substitute["size"], media_type=substitute_obj.media_type )
					if conversion in missing_conversions:
						# NOTE(review): a poster_offset of exactly 0.0 is falsy and would
						# not trigger poster replacement here — confirm this is intended.
						if substitute["type"]=="poster" and (new_poster_offset or new_poster_obj):
							# remove the existing poster substitute, since it is to be redefined:
							del_c = app.db.cursor()
							del_c.execute( """delete from substitutes where original_id=? and substitute_id=?""", [source_obj.id,substitute_obj.id] )
						else:
							missing_conversions.remove( conversion )
							results.append( substitute )
					else:
						results.append( substitute )
			error_list = []
			if mode == "convert":
				# create all missing objects immediately, without data, to avoid duplicate conversions:
				new_objects = []
				for conversion in [x for x in missing_conversions if x.applies()]:
					# privilege escalation so that not only the owner of the target object can run this code:
					app_old_user = app.user
					app.user = user.get_admin_user(app)
					existing_object_id = None
					if conversion.role=="poster" and new_poster_obj:
						existing_object_id = new_poster_obj.id
					new_obj = files.File( app, object_id=existing_object_id, parent_id=source_obj.id, media_type=conversion.media_type )
					if not existing_object_id:
						new_obj.conversion = conversion;
						new_objects.append( new_obj )
					substitute = { "substitute_id" : new_obj.id, "type" : conversion.role, "size" : conversion.width, "priority" : None }
					results.append( substitute )
					app.user = app_old_user
					c = app.db.cursor()
					c.execute( """insert into substitutes (original_id, substitute_id, type, size) values(?,?,?,?)""", 
														[source_obj.id, new_obj.id, conversion.role, conversion.width] )
				# run the conversion jobs for the created objects:
				for new_obj in new_objects:
					# this may take a long time, so we have to commit first:
					app.db.commit()
					base_type, sub_type = new_obj.media_type.split("/")
					# write to a temp file first; only rename into place on success:
					new_tmp_name = new_obj.storage_path+".tmp."+sub_type
					if( re.match(r"^video/.*", new_obj.media_type) ):
						# conversion with changed width while preserving the aspect ratio:
						# http://stackoverflow.com/questions/8218363/maintaining-ffmpeg-aspect-ratio
						p = subprocess.Popen( ["ffmpeg", "-y", "-i", source_obj.storage_path, "-vf", "scale=%d:trunc(ow/a/2)*2" % (new_obj.conversion.width), 
													"-r", "25", "-b", "%dk" % new_obj.conversion.rate, "-qmin", "0", "-strict", "-2", new_tmp_name],
												stdout=subprocess.PIPE, stderr=subprocess.PIPE )
					elif( new_obj.conversion.role=="poster" and new_obj.media_type == "image/jpeg" ):
						# extract a poster frame at time index 3s (or the requested offset):
						p = subprocess.Popen( ["ffmpeg", "-y", "-i", source_obj.storage_path, "-vf", "scale=%d:trunc(ow/a/2)*2" % (new_obj.conversion.width), 
													"-ss", str(new_poster_offset if new_poster_offset else 3), "-vframes", "1", new_tmp_name],
												stdout=subprocess.PIPE, stderr=subprocess.PIPE )
					else:
						raise NotImplementedError( "missing operation for conversion: %s" % (str(new_obj.conversion)) )
					stdout, stderr = p.communicate()
					if p.returncode!=0:
						# conversion failed: roll back the pre-created object and its substitute row
						try:
							# FIXME: factor the delete logic out into DBObject and file cleanup into files.File:
							# privilege escalation so that not only the owner of the target object can run this code:
							app_old_user = app.user
							app.user = user.get_admin_user(app)
							db_object.DBObject.delete_in( app, [new_obj.id] )
							app.user = app_old_user
							os.remove( new_tmp_name )
							c = app.db.cursor()
							c.execute( """delete from substitutes where original_id=? and substitute_id=?""", [source_obj.id, new_obj.id] )
							results = [x for x in results if x["substitute_id"]!=new_obj.id]
						except Exception as e:
							error_list.append( e )
						# report the last line of ffmpeg's stderr as the error message:
						errmsg = stderr.decode().split("\n")[-1]
						error_list.append( errors.InternalProgramError(errmsg) )
					else:
						os.rename( new_tmp_name, new_obj.storage_path )
			# error handling: collect all failures into one combined error message
			if error_list:
				msg = ""
				for error in error_list:
					if msg:
						msg += "; "
					msg += str(error)
				raise errors.InternalProgramError( msg )
			else:
				for result in results:
					result["substitute_object"] = get_module.get(app, object_ids=[result["substitute_id"]])
				response.output = json.dumps( {"succeeded": True,
										"substitutes": results} )
		elif re.match( r"^audio/.*", source_obj.media_type ):
			# TODO: add safe conversions
			results = []
			response.output = json.dumps( {"succeeded": True,
									"substitutes": results} )
		else:
			raise NotImplementedError( "unsupported media type: "+source_obj.media_type )
	else:
		raise errors.PrivilegeError()
Пример #5
0
def process(app):
    """Ensure fixed 480px substitute objects (poster jpeg, mp4, webm) exist
    for a video object and return them as JSON.

    Query parameters:
      id            -- target object id (required)
      mode          -- "convert" (default) actually creates missing substitutes;
                       any other value only reports the existing ones
      poster_offset -- optional float (seconds) for a new poster frame
      poster_id     -- optional object id to use as custom poster image

    Raises errors.PrivilegeError when the user may not read the target object,
    NotImplementedError for non-video media types and
    errors.InternalProgramError when a conversion fails.
    """
    query = app.query
    response = app.response
    session = app.session
    target_id = int(query.parms["id"])
    mode = query.parms["mode"] if "mode" in query.parms else "convert"
    if app.user.can_read(target_id):
        target_obj = files.File(app, object_id=target_id)
        if re.match(r"^video/.*", target_obj.media_type):
            new_poster_offset = float(query.parms["poster_offset"]) if "poster_offset" in query.parms else None
            new_poster_id = int(query.parms["poster_id"]) if "poster_id" in query.parms else None
            # a custom poster needs write access on the target and read access on the poster object:
            if new_poster_id and app.user.can_write(target_id) and app.user.can_read(new_poster_id):
                new_poster_obj = files.File(app, object_id=new_poster_id)
            else:
                new_poster_obj = None
            # We need at least webm (Firefox, Chrome) and mp4 (Safari, IE, Chrome)
            # for reasonably good client coverage.
            # Search the direct child objects for them:
            results = []
            # desired substitutes as (role, width, media_type) tuples:
            missing_conversions = [
                ("poster", 480, "image/jpeg"),
                ("compatible", 480, "video/mp4"),
                ("compatible", 480, "video/webm"),
            ]
            c = app.db.cursor()
            # First check whether the requested object is itself already a substitute object, ...
            c.execute("""select original_id from substitutes where substitute_id=?""", [target_obj.id])
            if c.fetchone():
                # ... because no further substitutes should be generated for substitute objects.
                missing_conversions = []
            c.execute(
                """select s.substitute_id, s.type, s.size, s.priority, sobj.id from substitutes s
							left join objects sobj on sobj.id=s.substitute_id
							where s.original_id=?""",
                [target_obj.id],
            )
            for row in c:
                substitute = {
                    "substitute_id": int(row[0]),
                    "type": row[1],
                    "size": int(row[2]),
                    "priority": int(row[3]),
                }
                sobj_id = row[4]
                if sobj_id == None:
                    # clean up zombie substitutes whose object no longer exists
                    # (FIXME: should DBObject.delete() handle this?):
                    del_c = app.db.cursor()
                    del_c.execute("""delete from substitutes where substitute_id=?""", [substitute["substitute_id"]])
                    app.db.commit()
                else:
                    substitute_obj = db_object.DBObject(app, object_id=substitute["substitute_id"])
                    conversion = (substitute["type"], substitute["size"], substitute_obj.media_type)
                    # NOTE(review): substitutes whose tuple is NOT in missing_conversions
                    # are silently dropped from results here — confirm that is intended.
                    if conversion in missing_conversions:
                        if substitute["type"] == "poster" and (new_poster_offset or new_poster_obj):
                            # remove the existing poster substitute, since it is to be redefined:
                            del_c = app.db.cursor()
                            del_c.execute(
                                """delete from substitutes where original_id=? and substitute_id=?""",
                                [target_obj.id, substitute_obj.id],
                            )
                            app.db.commit()
                        else:
                            missing_conversions.remove(conversion)
                            results.append(substitute)
            error_list = []
            if mode == "convert":
                # create all missing objects immediately, without data, to avoid duplicate conversions:
                new_objects = []
                for conversion in missing_conversions:
                    conversion_type, conversion_size, new_media_type = conversion
                    # privilege escalation so that not only the owner of the target object can run this code:
                    app_old_user = app.user
                    app.user = user.get_admin_user(app)
                    existing_object_id = None
                    if conversion_type == "poster" and new_poster_obj:
                        existing_object_id = new_poster_obj.id
                    new_obj = files.File(
                        app, object_id=existing_object_id, parent_id=target_obj.id, media_type=new_media_type
                    )
                    if not existing_object_id:
                        new_obj.conversion = conversion
                        new_objects.append(new_obj)
                    substitute = {
                        "substitute_id": new_obj.id,
                        "type": conversion_type,
                        "size": conversion_size,
                        "priority": None,
                    }
                    results.append(substitute)
                    app.user = app_old_user
                    c = app.db.cursor()
                    c.execute(
                        """insert into substitutes (original_id, substitute_id, type, size) values(?,?,?,?)""",
                        [target_obj.id, new_obj.id, conversion_type, conversion_size],
                    )
                    app.db.commit()
                # run the conversion jobs for the created objects:
                for new_obj in new_objects:
                    conversion_type, conversion_size, ignored = new_obj.conversion
                    base_type, sub_type = new_obj.media_type.split("/")
                    # write to a temp file first; only rename into place on success:
                    new_tmp_name = new_obj.storage_path + ".tmp." + sub_type
                    if re.match(r"^video/.*", new_obj.media_type):
                        # conversion with a conservative width of 480px while preserving the aspect ratio:
                        # http://stackoverflow.com/questions/8218363/maintaining-ffmpeg-aspect-ratio
                        p = subprocess.Popen(
                            [
                                "ffmpeg",
                                "-y",
                                "-i",
                                target_obj.storage_path,
                                "-vf",
                                "scale=%d:trunc(ow/a/2)*2" % (conversion_size),
                                "-r",
                                "25",
                                "-b",
                                "1000k",
                                "-qmin",
                                "0",
                                "-strict",
                                "-2",
                                new_tmp_name,
                            ],
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE,
                        )
                    elif conversion_type == "poster" and new_obj.media_type == "image/jpeg":
                        # extract a poster frame at time index 3s (TODO: make the time index freely configurable?):
                        p = subprocess.Popen(
                            [
                                "ffmpeg",
                                "-y",
                                "-i",
                                target_obj.storage_path,
                                "-vf",
                                "scale=%d:trunc(ow/a/2)*2" % (conversion_size),
                                "-ss",
                                str(new_poster_offset if new_poster_offset else 3),
                                "-vframes",
                                "1",
                                new_tmp_name,
                            ],
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE,
                        )
                    else:
                        raise NotImplementedError("missing operation for conversion: " + str(new_obj.conversion))
                    stdout, stderr = p.communicate()
                    if p.returncode != 0:
                        # conversion failed: roll back the pre-created object and its substitute row
                        try:
                            # FIXME: factor the delete logic out into DBObject and file cleanup into files.File:
                            # privilege escalation so that not only the owner of the target object can run this code:
                            app_old_user = app.user
                            app.user = user.get_admin_user(app)
                            delete_module.delete_in(app, [new_obj.id])
                            app.user = app_old_user
                            os.remove(new_tmp_name)
                            c = app.db.cursor()
                            c.execute(
                                """delete from substitutes where original_id=? and substitute_id=?""",
                                [target_obj.id, new_obj.id],
                            )
                            app.db.commit()
                            results = [x for x in results if x["substitute_id"] != new_obj.id]
                        except Exception as e:
                            error_list.append(e)
                        # report the last line of ffmpeg's stderr as the error message:
                        errmsg = stderr.decode().split("\n")[-1]
                        error_list.append(errors.InternalProgramError(errmsg))
                    else:
                        os.rename(new_tmp_name, new_obj.storage_path)
            # error handling: collect all failures into one combined error message
            if error_list:
                msg = ""
                for error in error_list:
                    if msg:
                        msg += "; "
                    msg += str(error)
                raise errors.InternalProgramError(msg)
            else:
                for result in results:
                    result["substitute_object"] = get_module.get(app, object_ids=[result["substitute_id"]])
                response.output = json.dumps({"succeeded": True, "substitutes": results})
        elif re.match(r"^audio/.*", target_obj.media_type):
            # NOTE(review): this branch duplicates the else branch — presumably a
            # placeholder for audio-specific substitutes; confirm intent.
            raise NotImplementedError("unsupported media type: " + target_obj.media_type)
        else:
            raise NotImplementedError("unsupported media type: " + target_obj.media_type)
    else:
        raise errors.PrivilegeError()
Пример #6
0
def process( app ):
	"""Identify media objects via external tools and return them with the
	gathered metadata as JSON.

	query.parms["id"] is a comma-separated list of object ids. Video/audio
	objects are probed with "mplayer -identify"; images with "exiv2" (summary
	and -pa tag listing). The extracted metadata is merged into the fetched
	object list and written to app.response.output.

	Raises errors.PrivilegeError when any object is not readable,
	errors.InternalProgramError when an external tool fails and
	NotImplementedError for unsupported media types.
	"""
	query = app.query
	response = app.response
	session = app.session
	target_ids = [int(x) for x in query.parms["id"].split(",")]
	object_list = get_module.get( app, object_ids=target_ids )
	metainfo_list = []
	for target_id in target_ids:
		if app.user.can_read( target_id ):
			target_obj = files.File( app, object_id=target_id )
			if re.match( r"^video/.*", target_obj.media_type ) or re.match( r"^audio/.*", target_obj.media_type ):
				mplayer_id = {}
				# probe without decoding any frames or producing output:
				p = subprocess.Popen( ["mplayer", "-identify", "-frames" , "0", "-ao", "null", "-vo", "null", 
										target_obj.storage_path],
										stdout=subprocess.PIPE, stderr=subprocess.PIPE )
				stdout, stderr = p.communicate()
				if p.returncode!=0:
					errmsg = stderr.decode()
					raise errors.InternalProgramError( errmsg )
				else:
					# collect ID_* lines (except the filename) into a nested dict,
					# splitting keys on "_":
					for line in stdout.decode().split("\n"):
						if line.startswith("ID_") and not line.startswith("ID_FILENAME"):
							parts = line.split("=")
							key = parts[0].lower()
							value = "=".join(parts[1:])
							populate_dict( mplayer_id, key, value, delim="_" )
					metainfo_list.append( {"id" : target_id, "mplayer" : mplayer_id} )
			elif re.match( r"^image/.*", target_obj.media_type ):
				exiv2_data = { "summary" : {} }
				image_info = {} # substructure for permanently available metadata (e.g. width, height)
				p = subprocess.Popen( ["exiv2", target_obj.storage_path],
										stdout=subprocess.PIPE, stderr=subprocess.PIPE )
				stdout, stderr = p.communicate()
				# NOTE(review): exit code 253 is tolerated as non-fatal here
				# (presumably exiv2's "no metadata" case) — confirm.
				if p.returncode not in (0, 253):
					errmsg = stderr.decode()
					raise errors.InternalProgramError( errmsg )
				else:
					# parse "Key: value" summary lines, skipping undecodable ones:
					for line in stdout.split(b"\n"):
						try:
							line = line.decode()
						except UnicodeDecodeError:
							continue
						result = re.findall( "([^:]+):(.*)", line )
						try:
							key, value = result[0]
						except IndexError:
							continue
						key = key.strip().replace(" ","_")
						if( key in ["File_name"] ):
							continue
						value = value.strip()
						exiv2_data[ "summary" ][ key ] = value
						if( key=="Image_size" ):
							# "WxH" -> numeric width/height:
							x, y = value.split("x")
							x=int(x.strip())
							y=int(y.strip())
							image_info["width"] = x #.image.width
							image_info["height"] = y #.image.height
				# second pass: full tag listing (-pa) into a nested dict:
				p = subprocess.Popen( ["exiv2", "-pa", target_obj.storage_path],
										stdout=subprocess.PIPE, stderr=subprocess.PIPE )
				stdout, stderr = p.communicate()
				if p.returncode not in (0, 253):
					errmsg = stderr.decode()
					raise errors.InternalProgramError( errmsg )
				else:
					# columns: key, type, count, value (NOTE(review): "type" shadows
					# the builtin here — harmless locally but worth renaming):
					for line in stdout.decode().split("\n"):
						result = re.findall( "([^ ]+)[ ]+([^ ]+)[ ]+([^ ]+)[ ]+([^ ].*)", line )
						try:
							key, type, count, value = result[0]
						except IndexError:
							continue
						populate_dict( exiv2_data, key, value )
				metainfo_list.append( {"id" : target_id, "exiv2":exiv2_data, "image":image_info} )
			else:
				raise NotImplementedError( "unsupported media type: "+target_obj.media_type )
		else:
			raise errors.PrivilegeError()
	# merge the gathered metadata into the matching object entries:
	for metainfo in metainfo_list:
		for obj in object_list:
			if obj["id"] == metainfo["id"]:
				obj.update( metainfo )
	response.output = json.dumps( {"succeeded" : True, "objects" : object_list} )
Пример #7
0
def search( app, search_phrase, result_types=[], min_weight=0, order_by=None, order_reverse=True, range_offset=0, range_limit=None, recursive=(False,False), max_phrase_word_dist=3 ):
	"""Full-text search over the keyword index; writes a JSON result to
	app.response.output.

	search_phrase supports whitespace-separated terms with an optional weight
	prefix of '+'/'-', an optional type selector ("type1:type2:word") and
	quoted phrases with an optional maximum word distance ('"a b"[5]').

	Parameters:
	  result_types         -- restrict hits to these object types ("file" matches
	                          any files.File-supported type); empty = all types
	                          (NOTE(review): mutable default argument — safe only
	                          as long as it is never mutated; confirm)
	  min_weight           -- drop hits at or below this relevance weight (None disables)
	  order_by             -- "weight", "id", "ctime" or "mtime"
	  order_reverse        -- descending order when True
	  range_offset/limit   -- slice of the sorted hit list to return
	  recursive            -- (parents, children) flags passed to get.get()
	  max_phrase_word_dist -- default maximum distance between phrase words
	"""
	q = app.query
	
	# 1.) parse the search expression and initialize data structures:
	phrase_parts = []
	current_word = ""
	in_phrase = False
	phrase_start_char = None
	# split on whitespace, but keep quoted phrases (single or double) together:
	for i,c in enumerate( search_phrase ):
		if c in " \t\r\n" and not in_phrase:
			if current_word:
				phrase_parts.append( current_word )
				current_word = ""
		elif c in ['"',"'"] and not in_phrase:
			in_phrase=True
			phrase_start_char = c
			current_word += c
		elif in_phrase and c==phrase_start_char:
			in_phrase=False
			phrase_start_char = None
			current_word += c
		else:
			current_word += c
	if current_word:
		phrase_parts.append( current_word )
			
	search_words = []
	for part in phrase_parts:
		# groups: 1=weight prefix, 2=type selector, 3=word or quoted phrase
		match = re.fullmatch( "([+-]*)((?:[\w]+:)*)(.+)", part, re.DOTALL )
		_word = match.group(3)
		phrase_match = re.fullmatch( '"([^"]*)"?(?:\[([0-9]+)\])?', _word )
		if not phrase_match:
			phrase_match = re.fullmatch( "'([^']*)'?(?:\[([0-9]+)\])?", _word )
		if phrase_match:
			_word = None
			_phrase = phrase_match.group(1)
			try:
				_phrase_max_word_dist = int(phrase_match.group(2))
			except TypeError:
				# no explicit "[n]" distance given:
				_phrase_max_word_dist = max_phrase_word_dist
		else:
			_phrase = None
			_phrase_max_word_dist = max_phrase_word_dist
		word = {
			"weight" : match.group(1),
			"type" : match.group(2),
			"word" : _word,
			"phrase" : _phrase,
			"phrase_max_word_dist" : _phrase_max_word_dist,
			"raw_word" : part
		}
		search_words.append( word )
	raw_results = {}
	c = app.db.cursor()
	search_word_rows = {}
	search_word_hits = {} # counts hits per search word in the filtered final result
	
	# 2.) look up each search term (with optional type binding) in the word index and
	#     cache the matched object ids together with the term weight:
	for i, search_word in enumerate(search_words):
		# parse the optional weight prefix of '-' and '+', where a more positive prefix
		# ranks hits of the word higher and a more negative prefix ranks hits of the
		# word's exclusion set higher:
		weight_prefix = search_word["weight"]
		word_weight = sum( [(lambda x: 10 if x=='+' else -10)(c) for c in weight_prefix] ) + (10 if not weight_prefix else 0)
		# parse the optional type selector of the form <[type1:[type2:[...]]]word>:
		word_types = search_word["type"].split(":")[:-1]
		type_query = ""
		type_names = []
		for j, word_type in enumerate(word_types):
			if j:
				type_query += " or "
			if word_type in search_type_alias:
				type_query += "o.type like ?"
				type_names.append( search_type_alias[word_type] )
			else:
				type_query += "k0.scan_source=?"
				type_names.append( word_type )
		if type_query:
			type_query = "and (" + type_query + ")"
		search_word_rows[ search_word["raw_word"] ] = 0
		search_word_hits[ search_word["raw_word"] ] = 0
		# NOTE: the %(...)s interpolation below only injects locally-built SQL
		# fragments and integer loop indices; user values go through "?" params.
		if search_word["word"]:
			word = search_word["word"]
			c.execute( """select object_id, word, pos, scan_source, o.type from keywords k0
							inner join objects o on o.id=object_id
							where word like ? %(type_query)s order by object_id, pos""" % locals(), [word]+type_names )
		elif search_word["phrase"]:
			phrase = search_word["phrase"]
			phrase_max_word_dist = search_word["phrase_max_word_dist"]
			phrase_words = phrase.split()
			phrase_joins = []
			phrase_queries = []
			# self-join the keyword table once per additional phrase word, constrained
			# to the same object/scan_source and a bounded position distance:
			for i,phrase_word in enumerate(phrase_words):
				if i>0:
					prev_i = i-1
					phrase_joins.append( """
						inner join keywords k%(i)d 
							on k0.object_id=k%(i)d.object_id 
							and k0.scan_source=k%(i)d.scan_source 
							and abs(k%(i)d.pos-k%(prev_i)d.pos)<=%(phrase_max_word_dist)d""" % locals() )
					phrase_queries.append( "and k%(i)d.word like ?" % locals() )
				else:
					phrase_queries.append( "k%(i)d.word like ?" % locals() )
			s_phrase_joins = "\n".join( phrase_joins )
			s_phrase_queries = "\n".join( phrase_queries )
			c.execute( """select k0.object_id, '', k0.pos, k0.scan_source, o.type from keywords k0 
							inner join objects o on o.id=k0.object_id
							%(s_phrase_joins)s
							where %(s_phrase_queries)s %(type_query)s order by k0.object_id, k0.pos""" % locals(), phrase_words+type_names )
		for row in c:
			search_word_rows[ search_word["raw_word"] ] += 1
			object_id, result_word, pos, scan_source, object_type = row
			hit = {
				"object_id" : object_id,
				"result_word" : result_word,
				"pos" : pos,
				"scan_source" : scan_source,
				"object_type" : object_type,
				"search_word" : search_word["raw_word"],
				# NOTE(review): for phrase matches "word" still holds a stale value
				# from an earlier word-branch iteration (or the parse dict) — confirm
				# "keyword" is meaningful in that case.
				"keyword" : word,
				"weight" : word_weight,
				"extra_reasons" : { "valid_types" : [], "associated_to" : [] }
			}
			if object_id in raw_results:
				raw_results[object_id].append( hit )
			else:
				raw_results[object_id] = [ hit ]
	
	# 3.) perform an access check, filter the hit list accordingly and optionally
	#     extend it with parent and child objects of a matching type, so that e.g.
	#     blog entries are found for matching plain/text objects or posts for
	#     matching user names:
	filtered_results = {}
	for result_id in raw_results:
		for hit in raw_results[result_id]:
			object_id = hit["object_id"]
			object_type = hit["object_type"]
			search_word = hit["search_word"]
			if app.user.can_read( object_id ):
				direct_hit = False
				if object_type in result_types or "file" in result_types and files.File.supports(app, object_type) or not result_types:
					c = app.db.cursor()
					# First check whether the found object is a substitute object, because
					# substitute objects should not be returned as hits.
					c.execute( """select original_id from substitutes where substitute_id=?""", [object_id] )
					if c.fetchone()==None:
						direct_hit = True
						hit["extra_reasons"]["valid_types"].append( object_type )
						if object_id in filtered_results:
							filtered_results[object_id].append( hit )
						else:
							filtered_results[object_id] = [ hit ]
						search_word_hits[search_word] += 1
				if not direct_hit:
					obj = db_object.DBObject( app, object_id )
					matching_associates = obj.resolve_parents( parent_type_set=set(result_types) ) + obj.resolve_children( child_type_set=set(result_types) )
					for alt_obj_id in matching_associates:
						if app.user.can_read( alt_obj_id ):
							c = app.db.cursor()
							# First check whether the found object is a substitute object, because
							# substitute objects should not be returned as hits.
							c.execute( """select original_id from substitutes where substitute_id=?""", [alt_obj_id] )
							if c.fetchone()==None:
								hit["extra_reasons"]["associated_to"].append( alt_obj_id )
								if alt_obj_id in filtered_results:
									filtered_results[alt_obj_id].append( hit )
								else:
									filtered_results[alt_obj_id] = [ hit ]
								search_word_hits[search_word] += 1
	
	# 4.) sort hits
	if order_by=="weight" or min_weight!=None:
		# a) relevance sorting/filtering, where:
		# - the number of matching search terms amplifies: len(filtered_results[x])
		# - the total hit count of all matching search terms attenuates: /sum(...)
		sort_key = lambda x: (1+sum([h["weight"] for h in filtered_results[x]])) * len(filtered_results[x]) / max(1,sum([search_word_hits[sw] for sw in set([h["search_word"] for h in filtered_results[x]])]))
		hit_weights = [(hit_id,sort_key(hit_id)) for hit_id in filtered_results]
		if order_by=="weight":
			hit_weights = sorted( hit_weights, key=lambda x: x[1], reverse=order_reverse )
		# b) filter hits by minimum weight, if defined:
		if min_weight!=None:
			hit_weights = [x for x in hit_weights if x[1]>min_weight]
	else:
		hit_weights = [(hit_id,0) for hit_id in filtered_results]
	hit_id_list = [x[0] for x in hit_weights]
	if order_by in ("id","ctime","mtime"):
		# c) sort by timestamp in SQL as efficiently as possible, if requested
		#    (order_by is safe to interpolate: restricted by the tuple check above;
		#    the id list contains only integers):
		order_dir = "desc" if order_reverse else "asc"
		hit_id_list_string = ",".join( [str(x) for x in hit_id_list] )
		c.execute( """select id, ctime, mtime from objects where id in (%(hit_id_list_string)s) order by %(order_by)s %(order_dir)s""" % locals() )
		hit_id_list = [row[0] for row in c]
	
	# 7.) trim the pre-sorted object id list, if requested:
	hit_id_list = hit_id_list[range_offset:None if range_limit==None else range_offset+range_limit]
	
	# 8.) optionally do a recursive lookup of parent and child objects of the reduced hit list:
	hitlist = []
	if hit_id_list:
		hitlist = get.get( app, object_ids=hit_id_list, recursive=recursive, access_errors=False )
	
	# 9.) JSON-encode the result:
	result = {
#		"hit_weights" : hit_weights,
#		"reasons" : {},
		"hitlist" : hitlist,
		"search_word_rows" : search_word_rows,
		"search_word_hits" : search_word_hits,
	}
#	for hit_id,hit_weight in hit_weights:
#		result["reasons"][hit_id] = filtered_results[hit_id]
	app.response.output = json.dumps( result )
Пример #8
0
def search( app, search_phrase, result_types=None, min_weight=0, order_by=None, order_reverse=True, range_offset=0, 
		   range_limit=None, recursive=(False,False), max_phrase_word_dist=3, exact_includes=True, exact_excludes=True ):
	"""Full-text search over the keyword index and write the JSON result to app.response.output.

	The search phrase is parsed into weighted terms and quoted phrases. Supported syntax:
	  - '+'/'-' prefixes raise/lower a term's weight (each step is +/-10; no prefix = 10)
	  - '?' marks a term as optional (not required by exact_includes)
	  - 'type1:type2:word' restricts a term to object types / scan sources
	  - 'ctime:...' / 'mtime:...' time queries, e.g. ctime:2017-07 or mtime:<2017
	  - quoted phrases with optional max word distance: "foo bar"[2]

	Parameters:
		app -- application context providing db, query, user and response
		search_phrase -- raw query string
		result_types -- accepted result object types; None or [] means "all types"
		min_weight -- drop hits whose relevance weight is <= min_weight (None = no filter)
		order_by -- "weight" for relevance sort, or "id"/"ctime"/"mtime" for SQL sort
		order_reverse -- reverse the sort direction
		range_offset, range_limit -- slice applied to the sorted hit id list
		recursive -- (parents, children) flags passed through to get.get
		max_phrase_word_dist -- default max word distance inside quoted phrases
		exact_includes -- drop hits not matching ALL non-optional positive terms
		exact_excludes -- drop hits matching at least one negative term
	Raises:
		errors.ParameterError -- on malformed time queries
		errors.StateError -- on unexpected value types coming back from the keyword index
	"""
	# result_types formerly defaulted to a shared mutable []; it is only read here,
	# but we normalise a None default to a fresh list to avoid the pitfall entirely.
	if result_types is None:
		result_types = []
	q = app.query
	
	# 1.) Parse the search expression and initialise data structures:
	phrase_parts = []
	current_word = ""
	in_phrase = False
	phrase_start_char = None
	for i,c in enumerate( search_phrase ):
		if c in " \t\r\n" and not in_phrase:
			# whitespace terminates the current word unless we are inside a quoted phrase
			if current_word:
				phrase_parts.append( current_word )
				current_word = ""
		elif c in ['"',"'"] and not in_phrase:
			in_phrase=True
			phrase_start_char = c
			current_word += c
		elif in_phrase and c==phrase_start_char:
			# a phrase is closed only by the same quote character that opened it
			in_phrase=False
			phrase_start_char = None
			current_word += c
		else:
			current_word += c
	if current_word:
		phrase_parts.append( current_word )
			
	search_words = []
	for part in phrase_parts:
		# raw strings: \w, \[ and \] are invalid escapes in normal string literals
		match = re.fullmatch( r"([?]?)([+-]*)((?:[\w]+:)*)(.+)", part, re.DOTALL )
		optional = match.group(1)=="?"
		weight_prefix = match.group(2)
		# each '+' adds 10, each '-' subtracts 10; an unprefixed term weighs 10
		word_weight = sum( [(lambda x: 10 if x=='+' else -10)(c) for c in weight_prefix] ) + (10 if weight_prefix=="" else 0)
		_word = match.group(4)
		phrase_match = re.fullmatch( r'([?]?)"([^"]*)"?(?:\[([0-9]+)\])?', _word )
		if not phrase_match:
			phrase_match = re.fullmatch( r"([?]?)'([^']*)'?(?:\[([0-9]+)\])?", _word )
		if phrase_match:
			_word = None
			_phrase = phrase_match.group(2)
			try:
				_phrase_max_word_dist = int(phrase_match.group(3))
			except TypeError:
				# no explicit [n] word distance given -> fall back to the default
				_phrase_max_word_dist = max_phrase_word_dist
		else:
			_phrase = None
			_phrase_max_word_dist = max_phrase_word_dist
		word = {
			"optional" : optional,
			"weight" : word_weight,
			"type" : match.group(3),
			"word" : _word,
			"phrase" : _phrase,
			"phrase_max_word_dist" : _phrase_max_word_dist,
			"raw_word" : part
		}
		search_words.append( word )
	raw_results = {}
	c = app.db.cursor()
	search_word_rows = {}
	search_word_hits = {} # counts hits per search word in the filtered final result
	
	# 2.) Look up the individual search terms (with optional type binding) in the word
	#     index and cache the hit object ids together with the term weight:
	for i, search_word in enumerate(search_words):
		# the optional '-'/'+' weight prefix was parsed above; a more positive prefix boosts
		# hits of the term, a more negative prefix boosts the term's exclusion set:
		word_weight = search_word["weight"]
		# parse the optional type selector of the form <[type1:[type2:[...]]]word>:
		word_types = search_word["type"].split(":")[:-1]
		type_query = ""
		type_names = []
		time_col = None
		for j, word_type in enumerate(word_types):
			if word_type in search_type_alias:
				if len(type_names):
					type_query += " or "
				type_query += "o.type like ?"
				type_names.append( search_type_alias[word_type] )
			elif word_type in time_types:
				time_col = word_type
			else:
				if len(type_names):
					type_query += " or "
				type_query += "k0.scan_source=?"
				type_names.append( word_type )
		if type_query:
			type_query = "and (" + type_query + ")"
		search_word_rows[ search_word["raw_word"] ] = 0
		search_word_hits[ search_word["raw_word"] ] = 0
		if time_col:
			# time queries have the form ctime:201707 ("created some time in july 2017") or mtime:<2017 ("last modified before 2017")
			time_query_string = search_word["word"] if search_word["word"] else search_word["phrase"]
			# lib.application replaces some xml control characters with entities, we have to fix that now:
			time_query_string = time_query_string.replace("&amp;","&").replace("&gt;",">").replace("&lt;","<")
			time_query_string = re.sub( "[-:. ]","", time_query_string ) # eg: 2017-07-23 15:38 -> 201707231538
			match = re.fullmatch( r"(<|>|<=|>=|=)?([0-9]+)", time_query_string )
			if not match:
				raise errors.ParameterError( "Illegal time query, try something like ctime:2017-01" )
			time_op = match.group(1) or "="
			time_string = match.group(2)
			# the digit count of the time string determines its granularity:
			attr_list = ["year"]
			if len(time_string)>=6:
				attr_list.append( "month" )
			if len(time_string)>=8:
				attr_list.append( "day" )
			if len(time_string)>=10:
				attr_list.append( "hour" )
			if len(time_string)>=12:
				attr_list.append( "minute" )
			if len(time_string)>=14:
				attr_list.append( "second" )
			time_pattern_dict = { "year":"%Y", "month":"%m", "day":"%d", "hour":"%H", "minute":"%M", "second":"%S" }
			# generate time parse pattern, eg.: %Y%m%d
			time_pattern = "".join( [time_pattern_dict[x] for x in attr_list] )
			time_range_begin = datetime.datetime.strptime( time_string, time_pattern )
			time_range_after = None
			if time_op == "=":
				# for time range equality comparisons we need to determine the end of the range:
				while not time_range_after:
					try:
						# try to generate the first date not matching the time string (but 20171232 or 201713 would be illegal...)
						last_attr = attr_list.pop()
						ref_time = datetime.datetime.strptime( time_string, time_pattern )
						time_range_after = ref_time.replace( **{last_attr:getattr(time_range_begin,last_attr)+1} )
					except ValueError:
						# for illegal cases try again one level higher, eg: 20171232 -> 201713 -> 2018
						time_string = time_string[:-2]
						time_pattern = time_pattern[:-2]
				c.execute( """select object_id, o.%(time_col)s, pos, scan_source, o.type from keywords k0
								inner join objects o on o.id=object_id
								where %(time_col)s>=? and %(time_col)s<? %(type_query)s order by object_id, pos""" % locals(), 
							[time_range_begin.timestamp(), time_range_after.timestamp()] + type_names )
			else:
				c.execute( """select object_id, o.%(time_col)s, pos, scan_source, o.type from keywords k0
								inner join objects o on o.id=object_id
								where %(time_col)s %(time_op)s ? %(type_query)s order by object_id, pos""" % locals(), 
							[time_range_begin.timestamp()] + type_names )
		elif search_word["word"]:
			word = search_word["word"]
			c.execute( """select object_id, word, pos, scan_source, o.type from keywords k0
							inner join objects o on o.id=object_id
							where word like ? %(type_query)s order by object_id, pos""" % locals(), [word]+type_names )
		elif search_word["phrase"]:
			# phrase search: self-join the keyword table once per phrase word and
			# require the words' positions to be within phrase_max_word_dist of each other
			phrase = search_word["phrase"]
			phrase_max_word_dist = search_word["phrase_max_word_dist"]
			phrase_words = phrase.split()
			phrase_joins = []
			phrase_queries = []
			result_phrase_field_string = ""
			for i,phrase_word in enumerate(phrase_words):
				if i>0:
					prev_i = i-1
					phrase_joins.append( """
						inner join keywords k%(i)d 
							on k0.object_id=k%(i)d.object_id 
							and k0.scan_source=k%(i)d.scan_source 
							and abs(k%(i)d.pos-k%(prev_i)d.pos)<=%(phrase_max_word_dist)d""" % locals() )
					phrase_queries.append( "and k%(i)d.word like ?" % locals() )
					result_phrase_field_string += "||' '||k%(i)d.word" % locals()
				else:
					phrase_queries.append( "k%(i)d.word like ?" % locals() )
					result_phrase_field_string += "k%(i)d.word" % locals()
			s_phrase_joins = "\n".join( phrase_joins )
			s_phrase_queries = "\n".join( phrase_queries )
			c.execute( """select k0.object_id, %(result_phrase_field_string)s, k0.pos, k0.scan_source, o.type from keywords k0 
							inner join objects o on o.id=k0.object_id
							%(s_phrase_joins)s
							where %(s_phrase_queries)s %(type_query)s order by k0.object_id, k0.pos""" % locals(), phrase_words+type_names )
		for row in c:
			search_word_rows[ search_word["raw_word"] ] += 1
			object_id, result_word, pos, scan_source, object_type = row
			# strict sanity checks on the index row types (deliberately type(), not isinstance):
			if not type(word_weight)==int:
				raise errors.StateError(type(word_weight))
			if type(search_word["word"]) not in (str,type(None)):
				raise errors.StateError(type(search_word["word"]))
			if type(result_word) not in (str,int):
				raise errors.StateError(type(result_word))
			if not type(pos)==int:
				raise errors.StateError(type(pos))
			if not type(scan_source)==str:
				raise errors.StateError(type(scan_source))
			if not type(search_word["raw_word"])==str:
				raise errors.StateError(type(search_word["raw_word"]))
			hit = {
				"object_id" : object_id,
				"object_type" : object_type,
				"reasons" : { (
					object_id, #"object_id"
					word_weight, #"weight"
					search_word["word"], #"keyword"
					result_word, #"result_word"
					scan_source, #"scan_source"
					search_word["raw_word"], #"raw_word"
				) },
				"associated_to" : set()
			}
			if object_id in raw_results:
				raw_results[object_id]["reasons"] = raw_results[object_id]["reasons"].union( hit["reasons"] )
			else:
				raw_results[object_id] = hit
	
	# 3.) Perform an access check, filter the hit list accordingly and, if requested,
	#     extend the hit list by parent and child objects of a matching type, so that e.g.
	#     blog entries are found for plain/text objects matching the full-text search, or
	#     posts are found for matching user names:
	filtered_results = {}
	for object_id in raw_results:
		hit = raw_results[ object_id ]
		object_type = hit["object_type"]
		if app.user.can_read( object_id ):
			direct_hit = False
			if object_type in result_types or "file" in result_types and files.File.supports(app, object_type) or not result_types:
				c = app.db.cursor()
				# First check whether the found object is a substitute object, because
				# substitute objects must not be returned as hits.
				c.execute( """select original_id from substitutes where substitute_id=?""", [object_id] )
				if c.fetchone()==None:
					direct_hit = True
					if object_id not in filtered_results:
						filtered_results[object_id] = hit
					else:
						# Merge existing results reason set:
						# this is not going to happen as long as we iterate over raw_results and that is a dictionary, but who knows...
						filtered_results[object_id]["reasons"] = filtered_results[object_id]["reasons"].union( hit["reasons"] )
					for reason in hit["reasons"]:
						object_id, weight, keyword, result_word, scan_source, raw_word = reason
						search_word_hits[ raw_word ] += 1
			if not direct_hit:
				# no direct hit: look for readable parent/child objects of an accepted type
				obj = db_object.DBObject( app, object_id )
				matching_associates = obj.resolve_parents( parent_type_set=set(result_types) ) + obj.resolve_children( child_type_set=set(result_types) )
				for alt_obj_id in matching_associates:
					if app.user.can_read( alt_obj_id ):
						c = app.db.cursor()
						# First check whether the found object is a substitute object, because
						# substitute objects must not be returned as hits.
						c.execute( """select original_id from substitutes where substitute_id=?""", [alt_obj_id] )
						if c.fetchone()==None:
							hit["associated_to"].add( object_id )
							if alt_obj_id not in filtered_results:
								filtered_results[alt_obj_id] = hit
							else:
								# Merge existing results reason set:
								filtered_results[alt_obj_id]["reasons"] = filtered_results[alt_obj_id]["reasons"].union( hit["reasons"] )
							for reason in hit["reasons"]:
								object_id, weight, keyword, result_word, scan_source, raw_word = reason
								search_word_hits[ raw_word ] += 1
	
	# 4.) Sort the hits
	if order_by=="weight" or min_weight!=None or exact_includes or exact_excludes:
		# a) Relevance sorting/filtering, where the:
		# - weighted number of matching search terms amplifies: weighted_reason_sum
		# - total hit count over all matching search terms attenuates: search_word_hit_sum
		def sort_key( object_id ):
			hit = filtered_results[ object_id ]
			weighted_reason_sum = 0
			search_word_hit_sum = 0
			all_positive_terms_found = True
			no_negative_terms_found = True
			for reason in hit["reasons"]:
				object_id, weight, keyword, result_word, scan_source, raw_word = reason
				weighted_reason_sum += 1*weight
				search_word_hit_sum += search_word_hits[ raw_word ]
				no_negative_terms_found = no_negative_terms_found and weight>=0
			if exact_includes:
				# require every non-optional positive term to appear among the hit's reasons
				for search_word in search_words:
					if not search_word["optional"] and search_word["weight"]>=0:
						positive_term_found = False
						for reason in hit["reasons"]:
							object_id, weight, keyword, result_word, scan_source, raw_word = reason
							if search_word["raw_word"]==raw_word:
								positive_term_found = True
								break
						all_positive_terms_found = all_positive_terms_found and positive_term_found
						if all_positive_terms_found==False:
							break
			hit_weight = weighted_reason_sum / (1+search_word_hit_sum)
			return (hit_weight, all_positive_terms_found, no_negative_terms_found)
		hit_weights = [(hit_id,sort_key(hit_id)) for hit_id in filtered_results]
		if order_by=="weight":
			hit_weights = sorted( hit_weights, key=lambda x: x[1], reverse=order_reverse )
		# b) Exclude hits, if below min_weight, when defined:
		if min_weight!=None:
			hit_weights = [x for x in hit_weights if x[1][0]>min_weight]
		# c) Exclude hits not matching all positive search terms if required
		if exact_includes:
			hit_weights = [x for x in hit_weights if x[1][1]==True]
		# d) Exclude hits matching at least one negative search term if required
		if exact_excludes:
			hit_weights = [x for x in hit_weights if x[1][2]==True]
	else:
		hit_weights = [(hit_id,0) for hit_id in filtered_results]
	hit_id_list = [x[0] for x in hit_weights]
	if order_by in ("id","ctime","mtime"):
		# c) Sort by timestamp/id in SQL (as efficiently as possible), if requested;
		#    safe to interpolate: hit_id_list contains ints, order_by/order_dir are whitelisted above
		order_dir = "desc" if order_reverse else "asc"
		hit_id_list_string = ",".join( [str(x) for x in hit_id_list] )
		c.execute( """select id, ctime, mtime from objects where id in (%(hit_id_list_string)s) order by %(order_by)s %(order_dir)s""" % locals() )
		hit_id_list = [row[0] for row in c]
	
	# 7.) Slice the pre-sorted object id list, if requested:
	hit_id_list = hit_id_list[range_offset:None if range_limit==None else range_offset+range_limit]
	
	# 8.) Optionally do a recursive lookup of parent and child objects of the reduced hit list:
	hitlist = []
	if hit_id_list:
		hitlist = get.get( app, object_ids=hit_id_list, recursive=recursive, access_errors=False )
	# hit ids are unique, so a dict gives us O(1) weight lookups instead of
	# scanning hit_weights once per hit (previously O(n^2)):
	weight_by_id = dict( hit_weights )
	for hit in hitlist:
		hit["reasons"] = list( filtered_results[hit["id"]]["reasons"] )
		hit["weight"] = weight_by_id[ hit["id"] ]
	
	# 9.) JSON-encode the result:
	result = {
		"hitlist" : hitlist,
		"search_word_rows" : search_word_rows,
		"search_word_hits" : search_word_hits,
	}
	app.response.output = json.dumps( result )