Python UnveillanceDocument.loadAsset 예제들, lib.Worker.Models.uv_document.UnveillanceDocument.loadAsset Python 예제들

예제 #1

0

파일 보기

파일: ner_entity_extractor.py 프로젝트: harlo/CompassAnnex

def extractNEREntities(task):
	task_tag = "NER ENTITY EXTRACTION"
	print "\n\n************** %s [START] ******************\n" % task_tag
	print "TOKENIZING TEXT DOCUMENT at %s" % task.doc_id
	task.setStatus(302)

	from lib.Worker.Models.uv_document import UnveillanceDocument

	from conf import DEBUG
	from vars import ASSET_TAGS

	doc = UnveillanceDocument(_id=task.doc_id)
	if doc is None:
		print "DOC IS NONE"
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		task.fail()
		return

	from json import loads

	try:
		texts = loads(doc.loadAsset("doc_texts.json"))
	except Exception as e:
		print "ERROR GETTING DOC-TEXTS: %s" % e
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		task.fail()
		return

	import ner, os
	from conf import getConfig
	from lib.Core.Utils.funcs import cleanLine

	st = ner.SocketNER(host='localhost', port=getConfig("nlp_server.port"))
	entities = {}

	for i, page in enumerate(texts):
		if page is None: continue

		lemmas = st.get_entities(cleanLine(page))
		if len(lemmas.keys()) == 0: continue

		for lemma_type in lemmas.keys():
			entities = updateEntities(entities, lemmas[lemma_type], lemma_type, i)

		#if DEBUG and i > 25: break
		
	if len(entities.keys()) > 0:
		ner_entity_path = doc.addAsset(entities, "stanford-ner_entities.json", as_literal=False,
			description="Entities as per Stanford-NER Tagger (via NLTK)",
			tags=[ASSET_TAGS['STANFORD_NER_ENTITIES'], ASSET_TAGS['CP_ENTITIES']])

		if ner_entity_path is not None:
			doc.addFile(ner_entity_path, None, sync=True)

	doc.addCompletedTask(task.task_path)
	task.routeNext()
	print "\n\n************** %s [END] ******************\n" % task_tag
	task.finish()

예제 #2

0

파일 보기

파일: preprocess_nlp.py 프로젝트: frnsys/UnveillanceAnnex

def preprocessNLP(task):
	task_tag = "TEXT NLP PREPROCESSING"
	print "\n\n************** %s [START] ******************\n" % task_tag
	print "nlp preprocessing text at %s" % task.doc_id
	task.setStatus(302)
	
	import re
	from json import loads
	
	from lib.Worker.Models.uv_document import UnveillanceDocument
	from lib.Core.Utils.funcs import cleanAndSplitLine
	
	from conf import DEBUG
	from vars import ASSET_TAGS
	
	document = UnveillanceDocument(_id=task.doc_id)
	if document is None: 
		print "DOC IS NONE"
		task.fail()
		return
		
	#	1. get all the words (bag of words)
	try:
		texts = loads(document.loadAsset("doc_texts.json"))
	except Exception as e:
		print "ERROR GETTING DOC-TEXTS: %s" % e
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		task.fail()
		return
	
	word_groups = [cleanAndSplitLine(text) for text in texts if text is not None]
	word_groups = [wg for wg in word_groups if len(wg) > 0]
	bag_of_words = sum(word_groups, [])
	document.addAsset(bag_of_words, "bag_of_words.txt", as_literal=False,
		description="bag of words", tags=ASSET_TAGS['BOW'])
	
	#	2. get keywords, weighted and parsable by gensim
	once_words = set(word for word in set(bag_of_words) if bag_of_words.count(word) == 1)
	key_words = [word for word in bag_of_words if word not in once_words]
	
	if len(key_words) > 0:	
		document.addAsset(key_words, "key_words_gensim.txt", as_literal=False,
			description="keywords, as list, and parsable by gensim",
			tags=ASSET_TAGS['KW'])

	document.addCompletedTask(task.task_path)
	task.routeNext()
	task.finish()
	print "\n\n************** %s [END] ******************\n" % task_tag

예제 #3

0

파일 보기

파일: j3mify.py 프로젝트: harlo/InformaAnnex

def j3mify(uv_task):
	task_tag = "J3MIFYING"
	print "\n\n************** %s [START] ******************\n" % task_tag
	print "j3mifying asset at %s" % uv_task.doc_id
	uv_task.setStatus(302)
	
	import os
	from lib.Worker.Models.uv_document import UnveillanceDocument
	
	from conf import DEBUG
	from vars import ASSET_TAGS
	
	media = UnveillanceDocument(_id=uv_task.doc_id)
	if media is None:
		print "DOC IS NONE"
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		uv_task.fail()
		return

	j3m = media.loadAsset(uv_task.j3m_name)
	if j3m is None:
		error_message = "J3M IS NONE"
		print error_message
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		uv_task.fail(message=error_message)
		return
	
	import json
	print "JSSON HERE:"
	try:
		print type(j3m)
		j3m = json.loads(j3m)
	except Exception as e:
		print "\n\n************** J3MIFYING [WARN] ******************\n"
		print e
		print "json load once fail. trying again"

		print j3m

	if type(j3m) in [str, unicode]:
		try:
			j3m = json.loads(j3m)
		except Exception as e:
			print "\n\n************** J3MIFYING [WARN] ******************\n"
			print e
			print "json loads twice fail."

	print type(j3m)

	try:
		j3m_sig = j3m['signature']
	except Exception as e:
		print "NO SIGNATURE TO EXTRACT"
		print "\n\n************** J3MIFYING [ERROR] ******************\n"
		uv_task.fail(status=412, message="No Signature in J3M.")
		return
	
	media.addAsset(j3m_sig, "j3m.sig", tags=[ASSET_TAGS['SIG']],
		description="The j3m's signature")
	
	media.addFile(
		media.addAsset(j3m['j3m'], "j3m.json", tags=[ASSET_TAGS['J3M']], description="The j3m itself.", as_literal=False), 
		None, sync=True)

	media.addCompletedTask(uv_task.task_path)
	
	uv_task.j3m_name = "j3m.json"
	uv_task.save()
	
	uv_task.routeNext()	
	uv_task.finish()
	print "\n\n************** %s [END] ******************\n" % task_tag

예제 #4

0

파일 보기

파일: evaluate_text.py 프로젝트: unveillance/UnveillanceAnnex

def evaluateText(task):
	task_tag = "TEXT EVALUATION"
	print "\n\n************** %s [START] ******************\n" % task_tag
	print "evaluating text at %s" % task.doc_id
	task.setStatus(302)
	
	from lib.Worker.Models.uv_document import UnveillanceDocument
	from conf import DEBUG
	from vars import MIME_TYPE_TASKS
	
	document = UnveillanceDocument(_id=task.doc_id)	
	"""
		limited choices: json, pgp, or txt
	"""

	if hasattr(task, "text_file"):
		content = document.loadAsset(task.text_file)
	else:
		content = document.loadFile(document.file_name)	
	
	if content is None:
		print "no text to evaluate :("
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		task.fail()
		return
	
	new_mime_type = None
	import json
	try:
		json_txt = json.loads(content)
		new_mime_type = "application/json"
		
		print "THIS IS JSON"
	except Exception as e:
		print "NOT JSON: %s" % e
	
	task_path = None	
	if new_mime_type is not None:
		document.mime_type = new_mime_type
		document.save()
		
		if document.mime_type in MIME_TYPE_TASKS.keys():
			task_path = MIME_TYPE_TASKS[document.mime_type][0]
	else:
		try:
			from lib.Core.Utils.funcs import cleanLine
			from vars import ASSET_TAGS
			
			txt_json = []
			txt_pages = []
			line_count = 0
			
			# this is arbitrary
			MAX_LINES_PER_PAGE = 80
			
			for line in content.splitlines():
				txt_pages.append(cleanLine(line))
				line_count += 1
				
				if line_count == MAX_LINES_PER_PAGE:
					txt_json.append(" ".join(txt_pages))
					txt_pages = []
					line_count = 0

			txt_json.append(" ".join(txt_pages))

			document.total_pages = len(txt_json)
			document.save()
						
			asset_path = document.addAsset(txt_json, "doc_texts.json", as_literal=False,
				description="jsonified text of original document, segment by segment",
				tags=[ASSET_TAGS['TXT_JSON']])

			from lib.Worker.Models.uv_text import UnveillanceText
			uv_text = UnveillanceText(inflate={
				'media_id' : document._id,
				'searchable_text' : txt_json,
				'file_name' : asset_path
			})
			
			document.text_id = uv_text._id
			document.save()
		except Exception as e: 
			if DEBUG:
				print "ERROR HERE GENERATING DOC TEXTS:"
				print e
	
	document.addCompletedTask(task.task_path)
	task.finish()
	task.routeNext()
	print "\n\n************** %s [END] ******************\n" % task_tag

예제 #5

0

파일 보기

파일: massage_j3m.py 프로젝트: harlo/InformaAnnex

def massageJ3M(task):
	task_tag = "MASSAGING J3M"
	
	print "\n\n************** %s [START] ******************\n" % task_tag
	print "massaging j3m at %s" % task.doc_id
	task.setStatus(302)
		
	from lib.Worker.Models.uv_document import UnveillanceDocument
	
	from conf import DEBUG
	from vars import ASSET_TAGS
	
	media = UnveillanceDocument(_id=task.doc_id)
	if media is None:
		print "DOC IS NONE"
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		task.fail()
		return
	
	if hasattr(task, "j3m_name"):
		j3m_name = task.j3m_name
	else:
		j3m_name = "j3m.json"

	j3m = media.loadAsset(j3m_name)
	if j3m is None:
		print "J3M IS NONE"
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		task.fail()
		return
	
	from json import loads
	
	try:
		j3m = loads(j3m)
	except Exception as e:
		print "J3M IS INVALID"
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		task.fail(status=412)
		return
	
	try:
		media.date_created = j3m['genealogy']['dateCreated']
		media.saveFields("date_created")
	except KeyError as e:
		print "J3M HAS NO DATE CREATED: %s" % e
		print "\n\n************** %s [WARN] ******************\n" % task_tag
	
	from hashlib import sha1
	try:
		j3m['public_hash'] = sha1("".join(
			[j3m['genealogy']['createdOnDevice'],
			"".join(j3m['genealogy']['hashes'])])).hexdigest()
	except KeyError as e:
		if DEBUG:
			print "no key %s" % e
		
		pass

	if 'data' in j3m.keys():
		try:
			location = j3m['data']['exif']['location']
			j3m['data']['exif'].update({
				'location' : [location[1], location[0]]
			})
		except KeyError as e:
			if DEBUG:
				print "no key %s" % e
			
			pass
		
		try:
			if type(j3m['data']['sensorCapture']) is list: pass
		except KeyError as e:
			if DEBUG: print "no key %s" % e
			pass

		if 'sensorCapture' in j3m['data'].keys():
			for playback in j3m['data']['sensorCapture']:
				if 'gps_coords' in playback['sensorPlayback'].keys():
					try:
						gps = str(playback['sensorPlayback']['gps_coords'])[1:-1].split(",")
						if DEBUG:
							print "REPLACING %s as geopoint" % gps
							print type(gps)
					
						playback['sensorPlayback'].update({
							'gps_coords' : [float(gps[1]), float(gps[0])]
						})
					except Exception as e:
						if DEBUG: print e
						pass
				
				if 'regionLocationData' in playback['sensorPlayback'].keys():
					try:
						gps = str(playback['sensorPlayback']['regionLocationData']['gps_coords'])
						gps = gps[1:-1].split(",")

						if DEBUG:
							print "REPLACING %s as geopoint" % gps
							
						playback['sensorPlayback']['regionLocationData'].update({
							'gps_coords' : [float(gps[1]), float(gps[0])]
						})
					except Exception as e:
						if DEBUG: print e
						pass
				
				if 'visibleWifiNetworks' in playback['sensorPlayback'].keys():
					try:
						for i,b in enumerate(playback['sensorPlayback']['visibleWifiNetworks']):
							playback['sensorPlayback']['visibleWifiNetworks'][i].update({
								'bt_hash' : sha1(b['bssid']).hexdigest()
							})
					except Exception as e:
						if DEBUG: print e
						pass
		
		import os, json
		from conf import getConfig
		from lib.Core.Utils.funcs import b64decode
		from lib.Worker.Utils.funcs import getFileType, unGzipBinary

		searchable_text = []
		
		if 'userAppendedData' in j3m['data'].keys():
			try:
				with open(os.path.join(getConfig('informacam.forms_root'), "forms.json"), 'rb') as F:
				
					form_data = json.loads(F.read())['forms']
					
					for udata in j3m['data']['userAppendedData']:
						for aForms in udata['associatedForms']:
							st_keys = aForms['answerData'].keys()
						
							for f in form_data:
								if f['namespace'] == aForms['namespace']:
									try:
										for mapping in f['mapping']:
											try:
												group = mapping.keys()[0]
												key = aForms['answerData'][group].split(" ")
											
												for m in mapping[group]:
													if m.keys()[0] in key:
														key[key.index(m.keys()[0])] = m[m.keys()[0]]
												aForms['answerData'][group] = " ".join(key)
											except KeyError as e:
												if DEBUG: print "no key %s" % e
												pass
									except KeyError as e:
										if DEBUG: print "no key %s" % e
										pass
								
									try:
										idx = 0
										for audio in f['audio_form_data']:
											try:
												while audio in st_keys: st_keys.remove(audio)
											except Exception as e: pass
										
											try:
												audio_data = b64decode(
													aForms['answerData'][audio])
											
												if audio_data is None:
													if DEBUG: print "could not unb64 audio"
													continue
											
												if getFileType(audio_data, as_buffer=True) != MIME_TYPES['gzip']:
													if DEBUG: print "audio is not gzipped"
													continue
													
												audio_f = "audio_%d.3gp" % idx
												idx += 1
											
												media.addAsset(unGzipBinary(audio_data), audio_f,
													tags=[ASSET_TAGS['A_3GP']],
													description="3gp audio file from form")
											
												'''
												new_task=UnveillanceTask(inflate={
													'task_path' : "Media.convert.audioConvert",
													'doc_id' : media._id,
													'formats' : ["3gp", "wav"],
													'src_file' : "audio_%d.3gp" % idx,
													'queue' : task.queue
												})
												new_task.run()
												'''
												
												aForms['answerData'][audio] = "audio_%d.wav"
											except KeyError as e:
												if DEBUG: print "no key %s" % e
												pass
									except KeyError as e:
										if DEBUG: print "no key %s" % e
										pass
						
							if len(st_keys) > 0:
								for key in st_keys:
									searchable_text.append(aForms['answerData'][key])
							
			except KeyError as e:
				if DEBUG: print "no key %s" % e
				pass
			except IOError as e:
				print "\n\n************** %s [WARN] ******************\n" % task_tag
				if DEBUG: print "no forms to go over: %s" % e
			except ValueError as e:
				print "\n\n************** %s [WARN] ******************\n" % task_tag
				if DEBUG: print "for some reason, forms.json is not legible?\n%s" % e
											
	if media.addAsset(j3m, "j3m.json", as_literal=False) is False:
		print "J3M COULD NOT BE ADDED"
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		task.fail()
		return

	from lib.Worker.Models.ic_j3m import InformaCamJ3M
	
	j3m['media_id'] = media._id
	if len(searchable_text) > 0:
		j3m['searchable_text'] = searchable_text

	j3m = InformaCamJ3M(inflate=j3m)
	
	print "\n\n***NEW J3M CREATED***\n\n" 
	j3m.save()
	
	media.j3m_id = j3m._id
	media.save()
	media.addCompletedTask(task.task_path)
	
	task.routeNext()
	task.finish()
	print "\n\n************** %s [END] ******************\n" % task_tag

예제 #6

0

파일 보기

파일: verify_signature.py 프로젝트: harlo/InformaAnnex

def verifySignature(task):
	task_tag = "VERIFYING SIGNATURE"
	print "\n\n************** %s [START] ******************\n" % task_tag
	print "image preprocessing at %s" % task.doc_id
	task.setStatus(302)
		
	from lib.Worker.Models.uv_document import UnveillanceDocument
	
	from conf import DEBUG
	from vars import ASSET_TAGS
	
	media = UnveillanceDocument(_id=task.doc_id)
	if media is None:
		print "DOC IS NONE"
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		task.fail()
		return
	
	sig = media.getAsset("j3m.sig", return_only="path")
	j3m = media.getAsset("j3m.json", return_only="path")
	
	if DEBUG:
		print "j3m path: %s, sig path: %s" % (j3m, sig)
	
	if sig is None or j3m is None:
		err_msg = "NO SIGNATURE or J3M"
		print err_msg
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		task.fail(message=err_msg)
		return
	
	import gnupg
	from conf import getConfig
	
	try:
		gpg = gnupg.GPG(homedir=getConfig('gpg_homedir'))
	except Exception as e:
		print "ERROR INITING GPG"
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		task.fail()
		return
	
	media.j3m_verified = False
	verified = gpg.verify_file(j3m, sig_file=sig)
	
	if DEBUG:
		print "verified fingerprint: %s" % verified.fingerprint
	
	if verified.fingerprint is not None:
		from json import loads
		
		supplied_fingerprint = str(loads(
			media.loadAsset("j3m.json"))['genealogy']['createdOnDevice'])
		
		if verified.fingerprint.upper() == supplied_fingerprint.upper():
			if DEBUG:
				print "SIGNATURE VALID for %s" % verified.fingerprint.upper()
			
			media.j3m_verified = True
	
	media.save()
	media.addCompletedTask(task.task_path)
	
	task.routeNext()
	task.finish()
	print "\n\n************** %s [END] ******************\n" % task_tag

예제 #7

0

파일 보기

파일: locate_j3m.py 프로젝트: unveillance/InformaAnnex

def locate_j3m(uv_task):
	task_tag = "PULLING J3M"
	print "\n\n************** %s [START] ******************\n" % task_tag
	print "pulling j3m at %s" % uv_task.doc_id
	uv_task.setStatus(302)
		
	from lib.Worker.Models.uv_document import UnveillanceDocument
	
	from conf import DEBUG, ANNEX_DIR
	from vars import ASSET_TAGS
	
	media = UnveillanceDocument(_id=uv_task.doc_id)
	if media is None:
		print "DOC IS NONE"
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		uv_task.fail()
		return

	from lib.Worker.Utils.funcs import getFileType
	from vars import MIME_TYPES, MIME_TYPE_MAP

	ic_j3m_txt = media.loadAsset("j3m_raw.txt")
	ic_j3m_txt_mime_type = getFileType(ic_j3m_txt, as_buffer=True, force_json=True)
	inflate = {}

	print "J3M MIME TYPE SNIFFED: %s" % ic_j3m_txt_mime_type

	if ic_j3m_txt_mime_type != MIME_TYPES['json']:
		import os
		from lib.Core.Utils.funcs import b64decode
		
		un_b64 = b64decode(ic_j3m_txt)
	
		if un_b64 is not None:
			un_b64_mime_type = getFileType(un_b64, as_buffer=True)
			if un_b64_mime_type in [MIME_TYPES['pgp'], MIME_TYPES['gzip']]:
				if DEBUG:
					print "MIME TYPE: %s" % un_b64_mime_type
				
				asset_path = "j3m_raw.%s" % MIME_TYPE_MAP[un_b64_mime_type]
				media.addAsset(un_b64, asset_path)
				
				if DEBUG:
					print "\n\nPGP KEY FILE PATH: %s\n\n" % asset_path
				
				gz = media.addAsset(None, "j3m_raw.gz", tags=[ASSET_TAGS['OB_M']], 
					description="j3m data extracted from obscura marker")
				
				if un_b64_mime_type == MIME_TYPES['pgp']:
					uv_task.put_next([
						"PGP.decrypt.decrypt",
						"J3M.j3mify.parse_zipped_j3m"
					])

					inflate.update({
						'pgp_file' : os.path.join(media.base_path, asset_path),
						'save_as' : gz
					})
					
					was_encrypted = True
					
				elif un_b64_mime_type in MIME_TYPES['gzip']:
					uv_task.put_next("J3M.j3mify.parse_zipped_j3m")
				
	else:
		import os
		from fabric.api import settings, local

		with settings(warn_only=True):
			src_j3m = os.path.join(ANNEX_DIR, media.base_path, "j3m_raw.txt")
			dest_j3m = os.path.join(ANNEX_DIR, media.base_path, "j3m_raw.json")

			local("mv %s %s" % (src_j3m, dest_j3m))

		print "PUTTING J3M FROM HERE!!!! WAS JSON! (%s -> %s)" % (src_j3m, dest_j3m)
		media.addAsset(None, "j3m_raw.json")

		uv_task.put_next([
			"J3M.j3mify.j3mify",
			"PGP.verify_signature.verifySignature",
			"J3M.massage_j3m.massageJ3M",
			"J3M.verify_visual_content.verifyVisualContent",
			"J3M.notarize.notarize_media"
		])
		
		inflate.update({'j3m_name' : "j3m_raw.json"})

	media.addCompletedTask(uv_task.task_path)
	uv_task.routeNext(inflate=inflate)
	uv_task.finish()
	print "\n\n************** %s [END] ******************\n" % task_tag

예제 #8

0

파일 보기

파일: topic_modeler.py 프로젝트: harlo/CompassAnnex

def createGensimObjects(task):
	task_tag = "GENSIM TOPIC EXTRACTION"
	print "\n\n************** %s [START] ******************\n" % task_tag
	print "USING TEXT DOCUMENT at %s" % task.doc_id
	task.setStatus(302)

	from lib.Worker.Models.uv_document import UnveillanceDocument

	from conf import DEBUG
	from vars import ASSET_TAGS

	doc = UnveillanceDocument(_id=task.doc_id)
	if doc is None:
		print "DOC IS NONE"
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		task.fail()
		return

	from json import loads

	try:
		texts = loads(doc.loadAsset("doc_texts.json"))
	except Exception as e:
		print "ERROR GETTING DOC-TEXTS: %s" % e
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		task.fail()
		return

	if len(texts) == 0:
		print "THERE ARE NO TEXTS HERE ANYWAY!"
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		task.fail()
		return 

	import logging, os, bz2
	from json import loads
	from gensim import corpora

	from lib.Core.Utils.funcs import cleanLine
	from conf import getConfig, ANNEX_DIR

	logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

	try:
		wiki_dictionary = corpora.Dictionary.load_from_text(os.path.join(
			getConfig('compass.gensim.training_data'), 'wiki_en_wordids.txt'))
		wiki_corpus = corpora.MmCorpus(bz2.BZ2File(os.path.join(
			getConfig('compass.gensim.training_data'), 'wiki_en_tfidf.mm.bz2')))
	except Exception as e:
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		error_msg = "having trouble loading gensim dictionary and corpus from wiki dump: (error type %s)" % type(e)

		print error_msg
		print e
		
		task.fail(message=error_msg)
		return

	from gensim import models

	wiki_log_entropy_file = os.path.join(getConfig('compass.gensim.training_data'), 'wiki_en_log_entropy.model')
	if not os.path.exists(wiki_log_entropy_file):
		print "\n\n************** %s [WARN] ******************\n" % task_tag
		print "no pre-prepared log entropy model.  going to generate this here, now.  might take a minute..."
		
		logent_transformation = models.LogEntropyModel(wiki_corpus, id2word=wiki_dictionary)
		logent_transformation.save(wiki_log_entropy_file)
	else:
		logent_transformation = models.LogEntropyModel.load(wiki_log_entropy_file)

	tokenize_function = corpora.wikicorpus.tokenize

	doc_corpus = [wiki_dictionary.doc2bow(tokenize_function(cleanLine(page).lower())) for page in texts]
	doc_corpus = logent_transformation[doc_corpus]

	wiki_tfidf_file = os.path.join(getConfig('compass.gensim.training_data'), 'wiki_en_tfidf.tfidf_model')
	if not os.path.exists(wiki_tfidf_file):
		print "\n\n************** %s [WARN] ******************\n" % task_tag
		print "no pre-prepared tfidf model.  going to generate this here, now.  might take a minute..."
		
		wiki_tfidf = models.TfidfModel(wiki_corpus)
		wiki_tfidf.save(wiki_tfidf_file)
	else:
		wiki_tfidf = models.TfidfModel.load(wiki_tfidf_file)

	doc_tfidf = wiki_tfidf[doc_corpus]

	num_topics = 35
	lsi = models.LsiModel(corpus=doc_tfidf, id2word=wiki_dictionary, num_topics=num_topics)

	topics = []
	t_lambda = lambda x : [float(x[0]), x[1]]
	for t_group in [t.split("+") for t in [str(topic) for topic in lsi.print_topics(num_topics)]]:
		topics.append([t_lambda(t.strip().replace('\"','').split("*")) for t in t_group])

	lsi_topics = {
		"topics" : topics,
		"doc_comprehension" : []
	}

	doc_lsi = lsi[doc_tfidf]

	for d in doc_lsi:
		lsi_topics['doc_comprehension'].append(d)

	topic_path = doc.addAsset(lsi_topics, "%s_topics.json" % doc.file_name, as_literal=False,
		description="Gensim Topics dump (from LSI Model)", tags=[ASSET_TAGS["GM_TOPICS"]])

	doc.addCompletedTask(task.task_path)
	task.routeNext()
	print "\n\n************** %s [END] ******************\n" % task_tag
	task.finish()

예제 #9

0

파일 보기

파일: compile_metadata.py 프로젝트: frnsys/UnveillanceAnnex

def compileMetadata(task):
	task_tag = "COMPILING METADATA"
	print "\n\n************** %s [START] ******************\n" %  task_tag
	print "compiling metadata for %s" % task.doc_id
	task.setStatus(302)
	
	from lib.Worker.Models.uv_document import UnveillanceDocument
	from conf import DEBUG
	
	document = UnveillanceDocument(_id=task.doc_id)
	if document is None:
		err = "DOC IS NONE"
		print err
		print "\n\n************** %s [ERROR] ******************\n" % task_tag

		task.fail(message=err)
		return
	
	metadata = document.loadAsset(task.md_file)
	if metadata is None:
		print "NO METADATA FILE"
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		return
	
	import csv, re
	from Levenshtein import ratio
	from string import letters
	from vars import METADATA_ASPECTS, ASSET_TAGS
	
	numbers = str("".join([str(i) for i in range(0,10)]))
	missing_value = "NaN"
	
	labels = ["_id"]
	values = [document._id]
	
	try:
		for mda in METADATA_ASPECTS[task.md_namespace]:
			labels.append(mda['label'])
			if hasattr(task, "md_rx"):
				pattern = re.compile(task.md_rx % (mda['tag_position'], mda['label']))
			else:
				pattern = re.compile(mda['tag_position'])
				
			if DEBUG: print pattern.pattern
			
			value = missing_value
			ideal = mda['ideal']
			
			if mda['ideal'] is None:
				if mda['type'] == "str":
					ideal = letters + numbers
				elif mda['type'] == "int":
					ideal = int(numbers)
			
			print "IDEAL FOR TAG: %s" % ideal
			
			for line in metadata.splitlines():
				match = re.findall(pattern, line.strip())
				if len(match) == 1:
					if DEBUG: print "VALUE FOUND: %s (%s)" % (match[0], type(match[0]))
					
					if mda['type'] == "str":
						try:
							value = "%.9f" % ratio(ideal, str(value.replace("\"", '')))
						except TypeError as e:
							if DEBUG: print e
							value = 0

					elif mda['type'] == "int":
						try:
							value = ideal/float(match[0].replace("\"", ''))
						except ZeroDivisionError as e:
							if DEBUG: print e
							value = 0
					break
					
			if value == missing_value:
				if mda['ideal'] is None: value = 1
				else: value = 0
			
			values.append(value)
		
		if hasattr(task, 'md_extras'):
			for key, value in task.md_extras.iteritems():
				labels.append(key)
				values.append(value)
		"""
		if DEBUG:
			print "labels %s" % labels
			print "values %s" % values
		"""

		from cStringIO import StringIO
		
		md_csv_file = StringIO()
		md_csv = csv.writer(md_csv_file, 
			delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)

		md_csv.writerow(labels)
		md_csv.writerow(values)
		
		md_asset = document.addAsset(md_csv_file.getvalue(), "file_metadata.csv", 
			tags=[ASSET_TAGS["F_MD"]], 
			description="CSV representation of %s" % task.md_file)
		
		md_csv_file.close()	
		
		if md_asset is None or not document.addFile(md_asset, None):
			print "Could not save the Metadata"
			print "\n\n************** %s [ERROR] ******************\n" % task_tag
			task.fail()
			return
		
		document.addCompletedTask(task.task_path)
		
		from lib.Worker.Utils.funcs import routeNextTask
		routeNextTask(task, document)
		
		task.finish()
		print "\n\n************** %s [END] ******************\n" % task_tag
			
	except KeyError as e:
		if DEBUG: print e
		print "No metadata aspects for %s" % task.md_namespace
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		task.fail()
		return

예제 #10

0

파일 보기

파일: verify_visual_content.py 프로젝트: harlo/InformaAnnex

def verifyVisualContent(task):
	task_tag = "VERIFYING VISUAL CONTENT"
	print "\n\n************** %s [START] ******************\n" % task_tag
	print "image preprocessing at %s" % task.doc_id
	task.setStatus(302)
		
	from lib.Worker.Models.uv_document import UnveillanceDocument
	
	from conf import DEBUG
	from vars import ASSET_TAGS
	
	media = UnveillanceDocument(_id=task.doc_id)
	if media is None:
		print "DOC IS NONE"
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		task.fail()
		return
	
	j3m = media.loadAsset("j3m.json")
	if j3m is None:
		print "NO J3M AT ALL"
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		task.fail()
		return
	
	import os
	from json import loads
	from subprocess import Popen, PIPE
	from conf import ANNEX_DIR, getConfig
	from vars import MIME_TYPES
	
	try:
		supplied_hashes = loads(j3m)['genealogy']['hashes']
	except KeyError as e:
		print "NO HASHES"
		print "\n\n************** %s [WARNING] ******************\n" % task_tag
		task.finish()
		return
		
	media.media_verified = False
	
	if media.mime_type == MIME_TYPES['image']:
		cmd = ["java", "-jar", 
			os.path.join(getConfig('jpeg_tools_dir'), "JavaMediaHasher.jar"),
			os.path.join(ANNEX_DIR, media.file_name)]
	elif media.mime_type == MIME_TYPES['video']:
		cmd = ["ffmpeg", "-y", "-i",
			os.path.join(ANNEX_DIR, media.file_name), "-vcodec", "copy",
			"-an", "-f", "md5", "-"]
		
	p = Popen(cmd, stdout=PIPE, close_fds=True)
	verified_hash = p.stdout.readline().strip().replace("MD5=", "")
	p.stdout.close()
	
	if type(supplied_hashes) is list:
		for hash in supplied_hashes:
			if type(hash) is unicode:
				hash = str(hash)
			
			if hash == verified_hash:
				media.media_verified = True
	
	media.save()
	media.addCompletedTask(task.task_path)

	task.routeNext()
	task.finish()
	print "\n\n************** %s [END] ******************\n" % task_tag

예제 #11

0

파일 보기

파일: verify_visual_content.py 프로젝트: unveillance/InformaAnnex

def verifyVisualContent(task):
	task_tag = "VERIFYING VISUAL CONTENT"
	print "\n\n************** %s [START] ******************\n" % task_tag
	print "image preprocessing at %s" % task.doc_id
	task.setStatus(302)
		
	from lib.Worker.Models.uv_document import UnveillanceDocument
	
	from conf import DEBUG
	from vars import ASSET_TAGS
	
	media = UnveillanceDocument(_id=task.doc_id)
	if media is None:
		print "DOC IS NONE"
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		task.fail()
		return
	
	j3m = media.loadAsset("j3m.json")
	if j3m is None:
		print "NO J3M AT ALL"
		print "\n\n************** %s [ERROR] ******************\n" % task_tag
		task.fail()
		return
	
	
	from json import loads
	
	from vars import MIME_TYPES
	
	try:
		supplied_hashes = loads(j3m)['genealogy']['hashes']
	except KeyError as e:
		print "NO HASHES"
		print "\n\n************** %s [WARNING] ******************\n" % task_tag
		task.finish()
		return
		
	media.media_verified = False
	if not hasattr(media, "verified_hash"):	
		if media.mime_type == MIME_TYPES['image']:
			from lib.Worker.Models.ic_image import InformaCamImage
			media = InformaCamImage(_id=media._id)
			media.get_image_hash()
		elif media.mime_type == MIME_TYPES['video']:
			from lib.Worker.Models.ic_video import InformaCamVideo
			media = InformaCamVideo(_id=media._id)
			media.get_video_hash()
	
	if type(supplied_hashes) is list:
		for hash in supplied_hashes:
			if type(hash) is unicode:
				hash = str(hash)
			
			if hash == media.verified_hash:
				media.media_verified = True
	
	media.saveFields("media_verified")
	media.addCompletedTask(task.task_path)

	task.routeNext()
	task.finish()
	print "\n\n************** %s [END] ******************\n" % task_tag