def extractNEREntities(task):
    """Extract named entities from each page of a document's text.

    Loads the "doc_texts.json" asset of the document at ``task.doc_id``,
    sends every non-None page (passed through ``cleanLine``) to a
    ``ner.SocketNER`` on localhost at the configured "nlp_server.port", and
    folds the results into one collection with ``updateEntities``.  A
    non-empty collection is stored as the "stanford-ner_entities.json" asset
    and, when a path comes back, registered through ``addFile``.

    The task fails if the document or its texts cannot be loaded; otherwise
    it is recorded as completed, routed onwards and finished.
    """
    task_tag = "NER ENTITY EXTRACTION"
    print("\n\n************** %s [START] ******************\n" % task_tag)
    print("TOKENIZING TEXT DOCUMENT at %s" % task.doc_id)
    task.setStatus(302)

    from lib.Worker.Models.uv_document import UnveillanceDocument

    from conf import DEBUG
    from vars import ASSET_TAGS

    doc = UnveillanceDocument(_id=task.doc_id)
    if doc is None:
        print("DOC IS NONE")
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        task.fail()
        return

    from json import loads

    try:
        texts = loads(doc.loadAsset("doc_texts.json"))
    except Exception as e:
        print("ERROR GETTING DOC-TEXTS: %s" % e)
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        task.fail()
        return

    import ner
    import os
    from conf import getConfig
    from lib.Core.Utils.funcs import cleanLine

    tagger = ner.SocketNER(host='localhost', port=getConfig("nlp_server.port"))
    entities = {}

    for page_no, page in enumerate(texts):
        if page is None:
            continue

        # one entry per entity type found on this page; an empty result
        # simply contributes nothing
        found = tagger.get_entities(cleanLine(page))
        for entity_type in found.keys():
            entities = updateEntities(
                entities, found[entity_type], entity_type, page_no)

    if entities.keys():
        ner_entity_path = doc.addAsset(
            entities,
            "stanford-ner_entities.json",
            as_literal=False,
            description="Entities as per Stanford-NER Tagger (via NLTK)",
            tags=[ASSET_TAGS['STANFORD_NER_ENTITIES'], ASSET_TAGS['CP_ENTITIES']])

        if ner_entity_path is not None:
            doc.addFile(ner_entity_path, None, sync=True)

    doc.addCompletedTask(task.task_path)
    task.routeNext()
    print("\n\n************** %s [END] ******************\n" % task_tag)
    task.finish()
def preprocessNLP(task):
    """Build the bag-of-words and keyword assets for a text document.

    Loads the "doc_texts.json" asset of the document at ``task.doc_id``,
    splits every non-None text with ``cleanAndSplitLine`` and concatenates
    the results into one bag of words, stored as "bag_of_words.txt".  Words
    that occur more than once in the bag (in their original order, with
    repeats) are stored as "key_words_gensim.txt" when there are any.

    The task fails if the document or its texts cannot be loaded; otherwise
    it is recorded as completed, routed onwards and finished.
    """
    task_tag = "TEXT NLP PREPROCESSING"
    print("\n\n************** %s [START] ******************\n" % task_tag)
    print("nlp preprocessing text at %s" % task.doc_id)
    task.setStatus(302)

    from collections import Counter
    from itertools import chain
    from json import loads

    from lib.Worker.Models.uv_document import UnveillanceDocument
    from lib.Core.Utils.funcs import cleanAndSplitLine

    from vars import ASSET_TAGS

    document = UnveillanceDocument(_id=task.doc_id)
    if document is None:
        print("DOC IS NONE")
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        task.fail()
        return

    # 1. get all the words (bag of words)
    try:
        texts = loads(document.loadAsset("doc_texts.json"))
    except Exception as e:
        print("ERROR GETTING DOC-TEXTS: %s" % e)
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        task.fail()
        return

    word_groups = [cleanAndSplitLine(text) for text in texts if text is not None]

    # chain flattens in linear time (sum(lists, []) re-copies the accumulator
    # on every step); empty groups contribute nothing
    bag_of_words = list(chain.from_iterable(word_groups))

    document.addAsset(bag_of_words, "bag_of_words.txt", as_literal=False,
        description="bag of words", tags=ASSET_TAGS['BOW'])

    # 2. get keywords, weighted and parsable by gensim
    # a single counting pass replaces one full list scan per distinct word
    word_counts = Counter(bag_of_words)
    key_words = [word for word in bag_of_words if word_counts[word] > 1]

    if len(key_words) > 0:
        document.addAsset(key_words, "key_words_gensim.txt", as_literal=False,
            description="keywords, as list, and parsable by gensim",
            tags=ASSET_TAGS['KW'])

    document.addCompletedTask(task.task_path)
    task.routeNext()
    task.finish()
    print("\n\n************** %s [END] ******************\n" % task_tag)
def j3mify(uv_task):
    """Split a raw J3M asset into its signature and its payload.

    Loads the asset named ``uv_task.j3m_name`` from the document at
    ``uv_task.doc_id`` and parses it as JSON; if the value is still a string
    afterwards, parsing is attempted a second time.  The 'signature' entry is
    stored as the "j3m.sig" asset and the 'j3m' entry as "j3m.json", which is
    also registered through ``addFile``.  On success ``uv_task.j3m_name`` is
    set to "j3m.json" and saved before the task is routed and finished.

    The task fails when the document or asset is missing, and fails with
    status 412 when no signature can be read.
    """
    task_tag = "J3MIFYING"
    print("\n\n************** %s [START] ******************\n" % task_tag)
    print("j3mifying asset at %s" % uv_task.doc_id)
    uv_task.setStatus(302)

    import os
    from lib.Worker.Models.uv_document import UnveillanceDocument

    from conf import DEBUG
    from vars import ASSET_TAGS

    media = UnveillanceDocument(_id=uv_task.doc_id)
    if media is None:
        print("DOC IS NONE")
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        uv_task.fail()
        return

    parsed = media.loadAsset(uv_task.j3m_name)
    if parsed is None:
        error_message = "J3M IS NONE"
        print(error_message)
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        uv_task.fail(message=error_message)
        return

    import json
    print("JSSON HERE:")

    try:
        print(type(parsed))
        parsed = json.loads(parsed)
    except Exception as e:
        print("\n\n************** J3MIFYING [WARN] ******************\n")
        print(e)
        print("json load once fail. trying again")

    print(parsed)

    # still a string: either the first attempt failed or the JSON itself
    # decoded to a string, so try once more
    if type(parsed) in (str, unicode):
        try:
            parsed = json.loads(parsed)
        except Exception as e:
            print("\n\n************** J3MIFYING [WARN] ******************\n")
            print(e)
            print("json loads twice fail.")

    print(type(parsed))

    try:
        signature = parsed['signature']
    except Exception:
        print("NO SIGNATURE TO EXTRACT")
        print("\n\n************** J3MIFYING [ERROR] ******************\n")
        uv_task.fail(status=412, message="No Signature in J3M.")
        return

    media.addAsset(signature, "j3m.sig", tags=[ASSET_TAGS['SIG']],
        description="The j3m's signature")

    j3m_asset = media.addAsset(parsed['j3m'], "j3m.json",
        tags=[ASSET_TAGS['J3M']], description="The j3m itself.",
        as_literal=False)
    media.addFile(j3m_asset, None, sync=True)

    media.addCompletedTask(uv_task.task_path)

    # later tasks read the extracted payload rather than the raw asset
    uv_task.j3m_name = "j3m.json"
    uv_task.save()

    uv_task.routeNext()
    uv_task.finish()
    print("\n\n************** %s [END] ******************\n" % task_tag)
def _join_lines_into_pages(lines, lines_per_page):
    """Group ``lines`` into space-joined pages of at most ``lines_per_page``.

    At least one page is always returned (an empty string when there are no
    lines), and no empty page is emitted when the number of lines is an exact
    multiple of ``lines_per_page``.
    """
    pages = [" ".join(lines[start:start + lines_per_page])
             for start in range(0, len(lines), lines_per_page)]
    return pages or [""]


def evaluateText(task):
    """Decide whether a text document is JSON, or else page it as plain text.

    The content comes from the asset ``task.text_file`` when the task has
    that attribute, otherwise from the document's own file.  If it parses as
    JSON the document's mime type is set to "application/json" and saved.
    Otherwise every line is cleaned with ``cleanLine``, the lines are grouped
    into pages, and the pages are stored as the "doc_texts.json" asset and as
    an ``UnveillanceText`` whose id is saved on the document.

    The task fails when the document or its content is missing.  Errors while
    generating the pages are only printed (when DEBUG is set); the task is
    still recorded as completed, finished and routed onwards.
    """
    task_tag = "TEXT EVALUATION"
    print("\n\n************** %s [START] ******************\n" % task_tag)
    print("evaluating text at %s" % task.doc_id)
    task.setStatus(302)

    from lib.Worker.Models.uv_document import UnveillanceDocument

    from conf import DEBUG

    document = UnveillanceDocument(_id=task.doc_id)
    if document is None:
        print("DOC IS NONE")
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        task.fail()
        return

    # limited choices: json, pgp, or txt
    if hasattr(task, "text_file"):
        content = document.loadAsset(task.text_file)
    else:
        content = document.loadFile(document.file_name)

    if content is None:
        print("no text to evaluate :(")
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        task.fail()
        return

    import json

    try:
        json.loads(content)
        is_json = True
        print("THIS IS JSON")
    except Exception as e:
        is_json = False
        print("NOT JSON: %s" % e)

    if is_json:
        # the document is only re-typed here; what runs next is left to
        # task.routeNext() below
        document.mime_type = "application/json"
        document.save()
    else:
        try:
            from lib.Core.Utils.funcs import cleanLine
            from vars import ASSET_TAGS

            # this is arbitrary
            MAX_LINES_PER_PAGE = 80

            cleaned = [cleanLine(line) for line in content.splitlines()]
            txt_json = _join_lines_into_pages(cleaned, MAX_LINES_PER_PAGE)

            document.total_pages = len(txt_json)
            document.save()

            asset_path = document.addAsset(txt_json, "doc_texts.json",
                as_literal=False,
                description="jsonified text of original document, segment by segment",
                tags=[ASSET_TAGS['TXT_JSON']])

            from lib.Worker.Models.uv_text import UnveillanceText
            uv_text = UnveillanceText(inflate={
                'media_id' : document._id,
                'searchable_text' : txt_json,
                'file_name' : asset_path
            })

            document.text_id = uv_text._id
            document.save()
        except Exception as e:
            # best effort: the task still completes without doc texts
            if DEBUG:
                print("ERROR HERE GENERATING DOC TEXTS:")
                print(e)

    document.addCompletedTask(task.task_path)
    task.finish()
    task.routeNext()
    print("\n\n************** %s [END] ******************\n" % task_tag)
def _replace_geopoint(container, debug=False):
    """Rewrite ``container['gps_coords']`` in place with its two values swapped.

    The stored value is stringified, its first and last characters are
    dropped and the remainder is split on commas; the result is saved as
    ``[float(second), float(first)]``.
    """
    gps = str(container['gps_coords'])[1:-1].split(",")
    if debug:
        print("REPLACING %s as geopoint" % gps)

    container.update({'gps_coords' : [float(gps[1]), float(gps[0])]})


def _massage_sensor_playback(sensor_playback, debug=False):
    """Normalise one 'sensorPlayback' dict in place.

    Swaps the geopoints stored under 'gps_coords' and under
    'regionLocationData', and adds a 'bt_hash' (SHA-1 hex digest of the
    'bssid') to every entry of 'visibleWifiNetworks'.  Each step is best
    effort: on any error the step is abandoned and the error is printed when
    ``debug`` is set.
    """
    from hashlib import sha1

    if 'gps_coords' in sensor_playback:
        try:
            _replace_geopoint(sensor_playback, debug)
        except Exception as e:
            if debug:
                print(e)

    if 'regionLocationData' in sensor_playback:
        try:
            _replace_geopoint(sensor_playback['regionLocationData'], debug)
        except Exception as e:
            if debug:
                print(e)

    if 'visibleWifiNetworks' in sensor_playback:
        try:
            for network in sensor_playback['visibleWifiNetworks']:
                network.update({'bt_hash' : sha1(network['bssid']).hexdigest()})
        except Exception as e:
            if debug:
                print(e)


def _apply_form_mappings(form, answer_data, debug=False):
    """Replace mapped answer codes in ``answer_data`` using ``form['mapping']``.

    Every mapping is a dict whose first key names an answer group; the answer
    for that group is split on spaces and each word that equals the first key
    of one of the group's entries is replaced by that entry's value.
    """
    try:
        mappings = form['mapping']
    except KeyError as e:
        if debug:
            print("no key %s" % e)
        return

    for mapping in mappings:
        try:
            group = list(mapping)[0]
            words = answer_data[group].split(" ")

            for choice in mapping[group]:
                code = list(choice)[0]
                if code in words:
                    words[words.index(code)] = choice[code]

            answer_data[group] = " ".join(words)
        except KeyError as e:
            if debug:
                print("no key %s" % e)


def _extract_form_audio(media, form, answer_data, text_keys, audio_counter,
        debug=False):
    """Store the audio answers of a form as 3gp assets on ``media``.

    For every field listed in ``form['audio_form_data']`` the field is
    removed from ``text_keys`` and its answer is base64-decoded.  Answers
    that decode and sniff as gzip are un-gzipped and added as
    "audio_<n>.3gp", with ``<n>`` drawn from ``audio_counter``; the answer is
    then replaced by the matching "audio_<n>.wav" name.
    """
    from vars import ASSET_TAGS, MIME_TYPES
    from lib.Core.Utils.funcs import b64decode
    from lib.Worker.Utils.funcs import getFileType, unGzipBinary

    try:
        audio_fields = form['audio_form_data']
    except KeyError as e:
        if debug:
            print("no key %s" % e)
        return

    for audio in audio_fields:
        # audio answers must not end up in the searchable text
        while audio in text_keys:
            text_keys.remove(audio)

        try:
            audio_data = b64decode(answer_data[audio])
            if audio_data is None:
                if debug:
                    print("could not unb64 audio")
                continue

            if getFileType(audio_data, as_buffer=True) != MIME_TYPES['gzip']:
                if debug:
                    print("audio is not gzipped")
                continue

            audio_idx = next(audio_counter)
            media.addAsset(unGzipBinary(audio_data),
                "audio_%d.3gp" % audio_idx,
                tags=[ASSET_TAGS['A_3GP']],
                description="3gp audio file from form")

            # NOTE(review): nothing in this function produces the .wav; the
            # conversion task that used to be spawned here is disabled.
            answer_data[audio] = "audio_%d.wav" % audio_idx
        except KeyError as e:
            if debug:
                print("no key %s" % e)


def _massage_user_forms(media, user_appended_data, task_tag, debug=False):
    """Resolve user-appended form answers in place; return searchable answers.

    Reads "forms.json" from the configured 'informacam.forms_root' and, for
    every answered form, applies the mappings and audio extraction of each
    form definition with the same namespace.  The answers of all fields that
    were not audio fields are returned as a list.  Missing keys, an
    unreadable forms file or invalid JSON end the pass early and whatever was
    collected so far is returned.
    """
    import json
    import os
    from itertools import count

    from conf import getConfig

    searchable_text = []

    try:
        forms_path = os.path.join(getConfig('informacam.forms_root'), "forms.json")
        with open(forms_path, 'rb') as forms_file:
            form_data = json.loads(forms_file.read())['forms']

        # one counter for the whole J3M so audio asset names never collide
        audio_counter = count()

        for udata in user_appended_data:
            for answered_form in udata['associatedForms']:
                answer_data = answered_form['answerData']
                text_keys = list(answer_data.keys())

                for form in form_data:
                    if form['namespace'] != answered_form['namespace']:
                        continue

                    _apply_form_mappings(form, answer_data, debug)
                    _extract_form_audio(media, form, answer_data, text_keys,
                        audio_counter, debug)

                for key in text_keys:
                    searchable_text.append(answer_data[key])
    except KeyError as e:
        if debug:
            print("no key %s" % e)
    except IOError as e:
        print("\n\n************** %s [WARN] ******************\n" % task_tag)
        if debug:
            print("no forms to go over: %s" % e)
    except ValueError as e:
        print("\n\n************** %s [WARN] ******************\n" % task_tag)
        if debug:
            print("for some reason, forms.json is not legible?\n%s" % e)

    return searchable_text


def massageJ3M(task):
    """Normalise a document's J3M and save it as an ``InformaCamJ3M``.

    Loads the asset ``task.j3m_name`` (default "j3m.json") from the document
    at ``task.doc_id`` and parses it as JSON.  Then, each step being skipped
    when the keys it needs are absent:

    * copies genealogy/dateCreated onto the document as ``date_created``;
    * adds 'public_hash', the SHA-1 of createdOnDevice plus the joined hashes;
    * swaps the two values of data/exif/location;
    * normalises every data/sensorCapture entry's 'sensorPlayback';
    * resolves data/userAppendedData form answers and collects the
      searchable ones.

    The result is written back as the "j3m.json" asset, inflated into an
    ``InformaCamJ3M`` (with 'media_id' and, if any, 'searchable_text'), and
    its id is saved on the document as ``j3m_id``.

    The task fails when the document or J3M is missing or cannot be stored,
    and fails with status 412 when the J3M is not valid JSON.
    """
    task_tag = "MASSAGING J3M"
    print("\n\n************** %s [START] ******************\n" % task_tag)
    print("massaging j3m at %s" % task.doc_id)
    task.setStatus(302)

    from lib.Worker.Models.uv_document import UnveillanceDocument

    from conf import DEBUG

    media = UnveillanceDocument(_id=task.doc_id)
    if media is None:
        print("DOC IS NONE")
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        task.fail()
        return

    if hasattr(task, "j3m_name"):
        j3m_name = task.j3m_name
    else:
        j3m_name = "j3m.json"

    j3m = media.loadAsset(j3m_name)
    if j3m is None:
        print("J3M IS NONE")
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        task.fail()
        return

    from json import loads

    try:
        j3m = loads(j3m)
    except Exception as e:
        print("J3M IS INVALID")
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        task.fail(status=412)
        return

    try:
        media.date_created = j3m['genealogy']['dateCreated']
        media.saveFields("date_created")
    except KeyError as e:
        print("J3M HAS NO DATE CREATED: %s" % e)
        print("\n\n************** %s [WARN] ******************\n" % task_tag)

    from hashlib import sha1

    try:
        j3m['public_hash'] = sha1("".join([
            j3m['genealogy']['createdOnDevice'],
            "".join(j3m['genealogy']['hashes'])
        ])).hexdigest()
    except KeyError as e:
        if DEBUG:
            print("no key %s" % e)

    # defined up front so a J3M without 'data' still reaches the save below
    searchable_text = []

    data = j3m.get('data')
    if isinstance(data, dict):
        try:
            location = data['exif']['location']
            data['exif'].update({ 'location' : [location[1], location[0]] })
        except KeyError as e:
            if DEBUG:
                print("no key %s" % e)

        sensor_capture = data.get('sensorCapture')
        if isinstance(sensor_capture, list):
            for playback in sensor_capture:
                if not isinstance(playback, dict):
                    continue

                sensor_playback = playback.get('sensorPlayback')
                if isinstance(sensor_playback, dict):
                    _massage_sensor_playback(sensor_playback, DEBUG)
        elif DEBUG and sensor_capture is None:
            print("no key %s" % "'sensorCapture'")

        if 'userAppendedData' in data:
            searchable_text = _massage_user_forms(
                media, data['userAppendedData'], task_tag, DEBUG)

    if media.addAsset(j3m, "j3m.json", as_literal=False) is False:
        print("J3M COULD NOT BE ADDED")
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        task.fail()
        return

    from lib.Worker.Models.ic_j3m import InformaCamJ3M

    j3m['media_id'] = media._id
    if len(searchable_text) > 0:
        j3m['searchable_text'] = searchable_text

    j3m = InformaCamJ3M(inflate=j3m)

    print("\n\n***NEW J3M CREATED***\n\n")
    j3m.save()

    media.j3m_id = j3m._id
    media.save()
    media.addCompletedTask(task.task_path)

    task.routeNext()
    task.finish()
    print("\n\n************** %s [END] ******************\n" % task_tag)
def verifySignature(task):
    """Check a document's J3M against its detached signature with GPG.

    Looks up the paths of the "j3m.sig" and "j3m.json" assets of the document
    at ``task.doc_id`` and verifies the J3M file against the signature using
    a ``gnupg.GPG`` opened on the configured 'gpg_homedir'.  The document's
    ``j3m_verified`` is set to True only when the verification yields a
    fingerprint that equals, case-insensitively, the J3M's
    genealogy/createdOnDevice value; otherwise it is saved as False.

    The task fails when the document or either asset is missing, or when GPG
    cannot be initialised.
    """
    task_tag = "VERIFYING SIGNATURE"
    print("\n\n************** %s [START] ******************\n" % task_tag)
    print("image preprocessing at %s" % task.doc_id)
    task.setStatus(302)

    from lib.Worker.Models.uv_document import UnveillanceDocument

    from conf import DEBUG
    from vars import ASSET_TAGS

    media = UnveillanceDocument(_id=task.doc_id)
    if media is None:
        print("DOC IS NONE")
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        task.fail()
        return

    sig_path = media.getAsset("j3m.sig", return_only="path")
    j3m_path = media.getAsset("j3m.json", return_only="path")

    if DEBUG:
        print("j3m path: %s, sig path: %s" % (j3m_path, sig_path))

    if sig_path is None or j3m_path is None:
        err_msg = "NO SIGNATURE or J3M"
        print(err_msg)
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        task.fail(message=err_msg)
        return

    import gnupg
    from conf import getConfig

    try:
        gpg = gnupg.GPG(homedir=getConfig('gpg_homedir'))
    except Exception:
        print("ERROR INITING GPG")
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        task.fail()
        return

    # unverified unless the fingerprints below agree
    media.j3m_verified = False
    verified = gpg.verify_file(j3m_path, sig_file=sig_path)

    if DEBUG:
        print("verified fingerprint: %s" % verified.fingerprint)

    if verified.fingerprint is not None:
        from json import loads

        j3m = loads(media.loadAsset("j3m.json"))
        supplied_fingerprint = str(j3m['genealogy']['createdOnDevice'])

        if supplied_fingerprint.upper() == verified.fingerprint.upper():
            if DEBUG:
                print("SIGNATURE VALID for %s" % verified.fingerprint.upper())

            media.j3m_verified = True

    media.save()
    media.addCompletedTask(task.task_path)
    task.routeNext()
    task.finish()
    print("\n\n************** %s [END] ******************\n" % task_tag)
def locate_j3m(uv_task):
    """Work out what the raw J3M asset contains and queue the matching tasks.

    Loads the "j3m_raw.txt" asset of the document at ``uv_task.doc_id`` and
    sniffs its type with ``getFileType``:

    * JSON: the file is renamed to "j3m_raw.json" under the document's
      directory in ANNEX_DIR, registered as an asset, and the j3mify, verify,
      massage and notarize tasks are queued with 'j3m_name' passed on.
    * otherwise the text is base64-decoded; PGP or gzip payloads are stored
      as "j3m_raw.<ext>" and either the decrypt and unzip tasks (PGP, with
      'pgp_file' and 'save_as' passed on) or the unzip task (gzip) is queued.
    * anything else queues nothing.

    The task fails when the document or the raw asset is missing; otherwise
    it is recorded as completed, routed onwards and finished.
    """
    task_tag = "PULLING J3M"
    print("\n\n************** %s [START] ******************\n" % task_tag)
    print("pulling j3m at %s" % uv_task.doc_id)
    uv_task.setStatus(302)

    import os

    from lib.Worker.Models.uv_document import UnveillanceDocument

    from conf import DEBUG, ANNEX_DIR
    from vars import ASSET_TAGS

    media = UnveillanceDocument(_id=uv_task.doc_id)
    if media is None:
        print("DOC IS NONE")
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        uv_task.fail()
        return

    from lib.Worker.Utils.funcs import getFileType
    from vars import MIME_TYPES, MIME_TYPE_MAP

    ic_j3m_txt = media.loadAsset("j3m_raw.txt")
    if ic_j3m_txt is None:
        error_message = "J3M IS NONE"
        print(error_message)
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        uv_task.fail(message=error_message)
        return

    ic_j3m_txt_mime_type = getFileType(ic_j3m_txt, as_buffer=True, force_json=True)
    inflate = {}

    print("J3M MIME TYPE SNIFFED: %s" % ic_j3m_txt_mime_type)

    if ic_j3m_txt_mime_type != MIME_TYPES['json']:
        from lib.Core.Utils.funcs import b64decode

        un_b64 = b64decode(ic_j3m_txt)

        if un_b64 is not None:
            un_b64_mime_type = getFileType(un_b64, as_buffer=True)

            if un_b64_mime_type in [MIME_TYPES['pgp'], MIME_TYPES['gzip']]:
                if DEBUG:
                    print("MIME TYPE: %s" % un_b64_mime_type)

                asset_path = "j3m_raw.%s" % MIME_TYPE_MAP[un_b64_mime_type]
                media.addAsset(un_b64, asset_path)

                if DEBUG:
                    print("\n\nPGP KEY FILE PATH: %s\n\n" % asset_path)

                gz = media.addAsset(None, "j3m_raw.gz",
                    tags=[ASSET_TAGS['OB_M']],
                    description="j3m data extracted from obscura marker")

                if un_b64_mime_type == MIME_TYPES['pgp']:
                    uv_task.put_next([
                        "PGP.decrypt.decrypt",
                        "J3M.j3mify.parse_zipped_j3m"
                    ])

                    inflate.update({
                        'pgp_file' : os.path.join(media.base_path, asset_path),
                        'save_as' : gz
                    })

                elif un_b64_mime_type == MIME_TYPES['gzip']:
                    uv_task.put_next("J3M.j3mify.parse_zipped_j3m")
    else:
        import shutil

        src_j3m = os.path.join(ANNEX_DIR, media.base_path, "j3m_raw.txt")
        dest_j3m = os.path.join(ANNEX_DIR, media.base_path, "j3m_raw.json")

        # rename without a shell so paths with spaces or metacharacters are
        # safe; as before, a failed move is reported but does not abort
        try:
            shutil.move(src_j3m, dest_j3m)
        except EnvironmentError as e:
            print("\n\n************** %s [WARN] ******************\n" % task_tag)
            print(e)

        print("PUTTING J3M FROM HERE!!!! WAS JSON! (%s -> %s)" % (src_j3m, dest_j3m))
        media.addAsset(None, "j3m_raw.json")

        uv_task.put_next([
            "J3M.j3mify.j3mify",
            "PGP.verify_signature.verifySignature",
            "J3M.massage_j3m.massageJ3M",
            "J3M.verify_visual_content.verifyVisualContent",
            "J3M.notarize.notarize_media"
        ])

        inflate.update({'j3m_name' : "j3m_raw.json"})

    media.addCompletedTask(uv_task.task_path)
    uv_task.routeNext(inflate=inflate)
    uv_task.finish()
    print("\n\n************** %s [END] ******************\n" % task_tag)
def createGensimObjects(task):
    """Extract LSI topics for a text document and store them as an asset.

    Loads the "doc_texts.json" asset of the document at ``task.doc_id``,
    turns each page into a bag of words against the wiki dictionary found in
    the configured 'compass.gensim.training_data' directory, and passes the
    corpus through a log-entropy model and then a tf-idf model (each loaded
    from that directory, or built from the wiki corpus and saved there when
    absent).  A 35-topic ``LsiModel`` is trained on the result; its printed
    topics, parsed into ``[weight, term]`` pairs, and the per-page LSI
    vectors are stored as "<file_name>_topics.json".

    The task fails when the document or texts are missing or empty, or when
    the wiki dictionary or corpus cannot be loaded.
    """
    task_tag = "GENSIM TOPIC EXTRACTION"
    print("\n\n************** %s [START] ******************\n" % task_tag)
    print("USING TEXT DOCUMENT at %s" % task.doc_id)
    task.setStatus(302)

    from lib.Worker.Models.uv_document import UnveillanceDocument

    from conf import DEBUG
    from vars import ASSET_TAGS

    doc = UnveillanceDocument(_id=task.doc_id)
    if doc is None:
        print("DOC IS NONE")
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        task.fail()
        return

    from json import loads

    try:
        texts = loads(doc.loadAsset("doc_texts.json"))
    except Exception as e:
        print("ERROR GETTING DOC-TEXTS: %s" % e)
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        task.fail()
        return

    if len(texts) == 0:
        print("THERE ARE NO TEXTS HERE ANYWAY!")
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        task.fail()
        return

    import logging
    import os
    import bz2
    from json import loads
    from gensim import corpora
    from lib.Core.Utils.funcs import cleanLine
    from conf import getConfig, ANNEX_DIR

    logging.basicConfig(
        format='%(asctime)s : %(levelname)s : %(message)s',
        level=logging.INFO)

    def training_file(name):
        # resolved on every call, at the point where the path is needed
        return os.path.join(getConfig('compass.gensim.training_data'), name)

    try:
        wiki_dictionary = corpora.Dictionary.load_from_text(
            training_file('wiki_en_wordids.txt'))
        wiki_corpus = corpora.MmCorpus(
            bz2.BZ2File(training_file('wiki_en_tfidf.mm.bz2')))
    except Exception as e:
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        error_msg = "having trouble loading gensim dictionary and corpus from wiki dump: (error type %s)" % type(e)

        print(error_msg)
        print(e)

        task.fail(message=error_msg)
        return

    from gensim import models

    wiki_log_entropy_file = training_file('wiki_en_log_entropy.model')
    if os.path.exists(wiki_log_entropy_file):
        logent_transformation = models.LogEntropyModel.load(wiki_log_entropy_file)
    else:
        print("\n\n************** %s [WARN] ******************\n" % task_tag)
        print("no pre-prepared log entropy model.  going to generate this here, now.  might take a minute...")

        logent_transformation = models.LogEntropyModel(
            wiki_corpus, id2word=wiki_dictionary)
        logent_transformation.save(wiki_log_entropy_file)

    tokenize_function = corpora.wikicorpus.tokenize

    doc_corpus = []
    for page in texts:
        tokens = tokenize_function(cleanLine(page).lower())
        doc_corpus.append(wiki_dictionary.doc2bow(tokens))

    doc_corpus = logent_transformation[doc_corpus]

    wiki_tfidf_file = training_file('wiki_en_tfidf.tfidf_model')
    if os.path.exists(wiki_tfidf_file):
        wiki_tfidf = models.TfidfModel.load(wiki_tfidf_file)
    else:
        print("\n\n************** %s [WARN] ******************\n" % task_tag)
        print("no pre-prepared tfidf model.  going to generate this here, now.  might take a minute...")

        wiki_tfidf = models.TfidfModel(wiki_corpus)
        wiki_tfidf.save(wiki_tfidf_file)

    doc_tfidf = wiki_tfidf[doc_corpus]

    num_topics = 35
    lsi = models.LsiModel(corpus=doc_tfidf, id2word=wiki_dictionary,
        num_topics=num_topics)

    # each printed topic is split on "+" into terms, and each term on "*"
    # into its weight and (unquoted) word
    topics = []
    for printed_topic in [str(topic) for topic in lsi.print_topics(num_topics)]:
        terms = []
        for term in printed_topic.split("+"):
            pieces = term.strip().replace('\"', '').split("*")
            terms.append([float(pieces[0]), pieces[1]])

        topics.append(terms)

    lsi_topics = {
        "topics" : topics,
        "doc_comprehension" : []
    }

    lsi_topics['doc_comprehension'].extend(lsi[doc_tfidf])

    doc.addAsset(lsi_topics, "%s_topics.json" % doc.file_name,
        as_literal=False,
        description="Gensim Topics dump (from LSI Model)",
        tags=[ASSET_TAGS["GM_TOPICS"]])

    doc.addCompletedTask(task.task_path)
    task.routeNext()
    print("\n\n************** %s [END] ******************\n" % task_tag)
    task.finish()
def compileMetadata(task):
    """Score a document's metadata against the configured aspects as CSV.

    Loads the asset ``task.md_file`` from the document at ``task.doc_id`` and,
    for every aspect in ``METADATA_ASPECTS[task.md_namespace]``, searches the
    metadata line by line with the aspect's pattern (``task.md_rx`` filled
    with the aspect's 'tag_position' and 'label' when the task has it,
    otherwise 'tag_position' alone).  The first line with exactly one match
    is scored:

    * type "str": Levenshtein ratio between the ideal and the matched text,
      formatted to nine decimals (0 on TypeError);
    * type "int": the ideal divided by the matched number (0 when the match
      is zero or not a number).

    An aspect without an 'ideal' uses all letters and digits (str) or the
    integer 123456789 (int).  An aspect that is never matched scores 1 when
    it has no ideal and 0 otherwise.  Any ``task.md_extras`` are appended as
    additional columns.  Labels and values are written as a two-row CSV to
    the "file_metadata.csv" asset, which is registered through ``addFile``.

    The task fails when the document or metadata is missing, the namespace
    (or a required aspect key) is unknown, or the CSV cannot be saved.
    """
    task_tag = "COMPILING METADATA"
    print("\n\n************** %s [START] ******************\n" % task_tag)
    print("compiling metadata for %s" % task.doc_id)
    task.setStatus(302)

    from lib.Worker.Models.uv_document import UnveillanceDocument

    from conf import DEBUG

    document = UnveillanceDocument(_id=task.doc_id)
    if document is None:
        err = "DOC IS NONE"
        print(err)
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        task.fail(message=err)
        return

    metadata = document.loadAsset(task.md_file)
    if metadata is None:
        err = "NO METADATA FILE"
        print(err)
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        # without this the task would stay at the in-progress status forever
        task.fail(message=err)
        return

    import csv
    import re
    from Levenshtein import ratio
    from string import digits, letters
    from vars import METADATA_ASPECTS, ASSET_TAGS

    numbers = digits
    missing_value = "NaN"

    labels = ["_id"]
    values = [document._id]

    try:
        for mda in METADATA_ASPECTS[task.md_namespace]:
            labels.append(mda['label'])

            if hasattr(task, "md_rx"):
                pattern = re.compile(task.md_rx % (mda['tag_position'], mda['label']))
            else:
                pattern = re.compile(mda['tag_position'])

            if DEBUG:
                print(pattern.pattern)

            value = missing_value
            ideal = mda['ideal']

            if mda['ideal'] is None:
                if mda['type'] == "str":
                    ideal = letters + numbers
                elif mda['type'] == "int":
                    ideal = int(numbers)

            print("IDEAL FOR TAG: %s" % ideal)

            for line in metadata.splitlines():
                match = pattern.findall(line.strip())
                if len(match) != 1:
                    continue

                found = match[0]
                if DEBUG:
                    print("VALUE FOUND: %s (%s)" % (found, type(found)))

                if mda['type'] == "str":
                    try:
                        # score the text that was actually matched
                        value = "%.9f" % ratio(ideal, str(found.replace("\"", '')))
                    except TypeError as e:
                        if DEBUG:
                            print(e)
                        value = 0

                elif mda['type'] == "int":
                    try:
                        value = ideal / float(found.replace("\"", ''))
                    except (ZeroDivisionError, ValueError) as e:
                        if DEBUG:
                            print(e)
                        value = 0

                # only the first line with a single match counts
                break

            if value == missing_value:
                if mda['ideal'] is None:
                    value = 1
                else:
                    value = 0

            values.append(value)

        if hasattr(task, 'md_extras'):
            for key, extra in task.md_extras.items():
                labels.append(key)
                values.append(extra)

        from cStringIO import StringIO

        md_csv_file = StringIO()
        try:
            md_csv = csv.writer(md_csv_file,
                delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)

            md_csv.writerow(labels)
            md_csv.writerow(values)

            md_asset = document.addAsset(md_csv_file.getvalue(),
                "file_metadata.csv",
                tags=[ASSET_TAGS["F_MD"]],
                description="CSV representation of %s" % task.md_file)
        finally:
            md_csv_file.close()

        if md_asset is None or not document.addFile(md_asset, None):
            print("Could not save the Metadata")
            print("\n\n************** %s [ERROR] ******************\n" % task_tag)
            task.fail()
            return

        document.addCompletedTask(task.task_path)

        from lib.Worker.Utils.funcs import routeNextTask
        routeNextTask(task, document)

        task.finish()
        print("\n\n************** %s [END] ******************\n" % task_tag)

    except KeyError as e:
        if DEBUG:
            print(e)
        print("No metadata aspects for %s" % task.md_namespace)
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        task.fail()
        return
def verifyVisualContent(task):
    """Check a media file's content hash against the hashes in its J3M.

    Reads genealogy/hashes from the "j3m.json" asset of the document at
    ``task.doc_id`` and computes the file's hash with an external tool:
    JavaMediaHasher.jar (from the configured 'jpeg_tools_dir') for images, or
    ffmpeg's md5 muxer over the copied video stream for videos.  The first
    line of the tool's output, with any "MD5=" removed, is the computed hash.
    The document's ``media_verified`` is saved as True when the supplied
    hashes are a list containing that hash, and as False otherwise, including
    when the mime type is neither image nor video or the tool prints nothing.

    The task fails when the document or J3M is missing, and finishes without
    routing when the J3M carries no hashes.
    """
    task_tag = "VERIFYING VISUAL CONTENT"
    print("\n\n************** %s [START] ******************\n" % task_tag)
    print("image preprocessing at %s" % task.doc_id)
    task.setStatus(302)

    from lib.Worker.Models.uv_document import UnveillanceDocument

    from conf import DEBUG
    from vars import ASSET_TAGS

    media = UnveillanceDocument(_id=task.doc_id)
    if media is None:
        print("DOC IS NONE")
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        task.fail()
        return

    j3m = media.loadAsset("j3m.json")
    if j3m is None:
        print("NO J3M AT ALL")
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        task.fail()
        return

    import os
    from json import loads
    from subprocess import Popen, PIPE

    from conf import ANNEX_DIR, getConfig
    from vars import MIME_TYPES

    try:
        supplied_hashes = loads(j3m)['genealogy']['hashes']
    except KeyError as e:
        print("NO HASHES")
        print("\n\n************** %s [WARNING] ******************\n" % task_tag)
        task.finish()
        return

    media.media_verified = False

    if media.mime_type == MIME_TYPES['image']:
        cmd = ["java", "-jar",
            os.path.join(getConfig('jpeg_tools_dir'), "JavaMediaHasher.jar"),
            os.path.join(ANNEX_DIR, media.file_name)]
    elif media.mime_type == MIME_TYPES['video']:
        cmd = ["ffmpeg", "-y", "-i", os.path.join(ANNEX_DIR, media.file_name),
            "-vcodec", "copy", "-an", "-f", "md5", "-"]
    else:
        cmd = None

    verified_hash = None
    if cmd is None:
        print("\n\n************** %s [WARNING] ******************\n" % task_tag)
        print("cannot hash media of mime type %s" % media.mime_type)
    else:
        p = Popen(cmd, stdout=PIPE, close_fds=True)
        # communicate() drains stdout and waits, so the child is reaped
        output = p.communicate()[0]
        output_lines = output.splitlines() if output else []
        if output_lines:
            verified_hash = output_lines[0].strip().replace("MD5=", "")

    # an empty computed hash must never count as a match
    if verified_hash and type(supplied_hashes) is list:
        for supplied in supplied_hashes:
            if type(supplied) is unicode:
                supplied = str(supplied)

            if supplied == verified_hash:
                media.media_verified = True

    media.save()
    media.addCompletedTask(task.task_path)
    task.routeNext()
    task.finish()
    print("\n\n************** %s [END] ******************\n" % task_tag)
def verifyVisualContent(task):
    """Check a media document's stored content hash against its J3M hashes.

    Reads genealogy/hashes from the "j3m.json" asset of the document at
    ``task.doc_id``.  When the document has no ``verified_hash`` yet, it is
    re-opened as an ``InformaCamImage`` or ``InformaCamVideo`` (by mime type)
    and asked to compute one.  ``media_verified`` is then saved as True when
    the supplied hashes are a list containing the document's
    ``verified_hash``, and as False otherwise, including when no hash is
    available.

    The task fails when the document or J3M is missing, and finishes without
    routing when the J3M carries no hashes.
    """
    task_tag = "VERIFYING VISUAL CONTENT"
    print("\n\n************** %s [START] ******************\n" % task_tag)
    print("image preprocessing at %s" % task.doc_id)
    task.setStatus(302)

    from lib.Worker.Models.uv_document import UnveillanceDocument

    from conf import DEBUG
    from vars import ASSET_TAGS

    media = UnveillanceDocument(_id=task.doc_id)
    if media is None:
        print("DOC IS NONE")
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        task.fail()
        return

    j3m = media.loadAsset("j3m.json")
    if j3m is None:
        print("NO J3M AT ALL")
        print("\n\n************** %s [ERROR] ******************\n" % task_tag)
        task.fail()
        return

    from json import loads
    from vars import MIME_TYPES

    try:
        supplied_hashes = loads(j3m)['genealogy']['hashes']
    except KeyError as e:
        print("NO HASHES")
        print("\n\n************** %s [WARNING] ******************\n" % task_tag)
        task.finish()
        return

    if not hasattr(media, "verified_hash"):
        if media.mime_type == MIME_TYPES['image']:
            from lib.Worker.Models.ic_image import InformaCamImage

            media = InformaCamImage(_id=media._id)
            media.get_image_hash()

        elif media.mime_type == MIME_TYPES['video']:
            from lib.Worker.Models.ic_video import InformaCamVideo

            media = InformaCamVideo(_id=media._id)
            media.get_video_hash()

    # set the default only now: `media` may have just been replaced by a
    # fresh instance, which would not carry a flag set on the old object
    media.media_verified = False

    verified_hash = getattr(media, "verified_hash", None)
    if verified_hash is not None and type(supplied_hashes) is list:
        for supplied in supplied_hashes:
            if type(supplied) is unicode:
                supplied = str(supplied)

            if supplied == verified_hash:
                media.media_verified = True

    media.saveFields("media_verified")
    media.addCompletedTask(task.task_path)
    task.routeNext()
    task.finish()
    print("\n\n************** %s [END] ******************\n" % task_tag)