def getRankedAddresses(*args, **kwargs):
    """Return the top-ranked email addresses, each with a small activity graph."""
    tangelo.content_type("application/json")
    tangelo.log("getRankedAddresses(args: %s kwargs: %s)" % (str(args), str(kwargs)))
    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**kwargs)
    # TODO - reminder no 'qs' here set to ''
    # qs = parseParamTextQuery(**kwargs)
    qs = ''

    # TODO this needs to come from UI
    size = size if size > 500 else 2500

    ranked = get_ranked_email_address_from_email_addrs_index(data_set_id, start_datetime, end_datetime, size)
    top_address_list = []
    for addr in ranked["emails"]:
        # addr is a row: [address, ..., received(5), sent(6), attachments(7)]
        graph = es_get_all_email_by_address(data_set_id, addr[0], qs, start_datetime, end_datetime, size)
        entry = {
            "address_search_url_path": addr[0],
            "parameters": kwargs,
            "search_results": {
                "mail_sent_count": addr[6],
                "mail_received_count": addr[5],
                "mail_attachment_count": addr[7],
                "query_matched_count": graph["query_hits"],
                "associated_count": len(graph["graph"]["nodes"]),
            },
            "TEMPORARY_GRAPH": graph,
        }
        top_address_list.append(entry)
    return {"top_address_list": top_address_list}
def search_email_by_community(*args, **param_args):
    """Search emails belonging to the community named by path arg 0."""
    tangelo.content_type("application/json")
    tangelo.log("search_email_by_community(args: %s kwargs: %s)" % (str(args), str(param_args)))
    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**param_args)
    community = nth(args, 0, '')
    # TODO: set from UI
    size = param_args.get('size', 2500)

    # Guard clauses: both the data set and community are mandatory.
    if not data_set_id:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing data_set_id")
    if not community:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing sender")

    email_addrs = parseParam_email_addr(**param_args)
    qs = parseParamTextQuery(**param_args)
    return es_get_all_email_by_community(data_set_id, community, email_addrs, qs,
                                         start_datetime, end_datetime, size)
def topic_list(*args, **kwargs):
    """Return the topic categories for the data set named in the parameters."""
    selected_category = nth(args, 0, 'all')  # path arg; currently unused below
    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**kwargs)
    tangelo.content_type("application/json")
    return get_categories(data_set_id)
def stream_next(key):
    """Advance the stream registered under *key* and return its next value."""
    if key not in streams:
        tangelo.http_status(404, "No Such Key")
        return {"error": "Stream key does not correspond to an active stream",
                "stream": key}

    # Grab the stream in preparation for running it, then attempt to run it
    # via its next() method: a value means keep going; StopIteration means
    # the stream is exhausted; anything else is an error.
    active = streams[key]
    try:
        return active.next()
    except StopIteration:
        del streams[key]
        tangelo.http_status(204, "Stream Finished")
        return "OK"
    except:
        del streams[key]
        tangelo.http_status(500, "Streaming Service Exception")
        tangelo.content_type("application/json")
        return tangelo.util.traceback_report(
            error="Caught exception while executing stream service",
            stream=key)
def search_email_by_topic(*args, **param_args):
    """Search emails matching a topic; requires data_set_id and topic_index params."""
    tangelo.content_type("application/json")
    tangelo.log("search_email_by_topic(args: %s kwargs: %s)" % (str(args), str(param_args)))
    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**param_args)
    # TODO: set from UI
    size = param_args.get('size', 2500)

    if not data_set_id:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing data_set_id")
    if not param_args.get("topic_index"):
        return tangelo.HTTPStatusCode(400, "invalid service call - missing topic_index")

    topic = parseParamTopic(**param_args)
    email_addrs = parseParam_email_addr(**param_args)
    qs = parseParamTextQuery(**param_args)
    return es_get_all_email_by_topic(data_set_id,
                                     topic=topic,
                                     email_addrs=email_addrs,
                                     qs=qs,
                                     start_datetime=start_datetime,
                                     end_datetime=end_datetime,
                                     size=size)
def getAttachFileType(*args, **kwargs):
    """Return the top attachment file types for a data set.

    Path arg 0 optionally names an attachment type (defaults to 'all').
    Per-account breakdown is not implemented yet, so the all-accounts and
    per-account paths return the same data-set-wide result.
    """
    tangelo.content_type("application/json")
    tangelo.log("getAttachFileType(args: %s kwargs: %s)" % (str(args), str(kwargs)))
    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**kwargs)
    top_count = int(size)
    attach_type = urllib.unquote(nth(args, 0, ''))
    if not attach_type:
        attach_type = 'all'  # hack for now
    email_address_list = parseParamEmailAddress(**kwargs)
    # TODO: implement populating the attachment file-types under individual
    # email-accounts. Both branches of the original if/else made this exact
    # same call, so the duplicate is hoisted here until per-account support
    # lands (email_address_list is retained for that future use).
    file_types = get_top_attachment_types(data_set_id,
                                          date_bounds=(start_datetime, end_datetime),
                                          num_top_attachments=top_count)[:top_count]
    result = {
        "account_id": data_set_id,
        "data_set_id": data_set_id,
        "account_start_datetime": start_datetime,
        "account_end_datetime": end_datetime,
        "types": file_types,
    }
    return result
def get_graph_for_entity(*args, **kwargs):
    """Build the email graph (plus attachments) for the requested entity."""
    tangelo.content_type("application/json")
    tangelo.log("entity.get_graph_for_entity(args: %s kwargs: %s)" % (str(args), str(kwargs)))
    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**kwargs)
    email_address_list = parseParamEmailAddress(**kwargs)
    entity_dict = parseParamEntity(**kwargs)
    # TODO set from UI
    size = size if size > 500 else 2500
    qs = parseParamTextQuery(**kwargs)
    date_bounds = (start_datetime, end_datetime)

    # Main email query -> graph.
    query = _build_email_query(email_addrs=email_address_list, qs=qs,
                               entity=entity_dict, date_bounds=date_bounds)
    tangelo.log("entity.get_graph_for_entity(query: %s)" % (query))
    results = _query_emails(data_set_id, size, query)
    graph = _build_graph_for_emails(data_set_id, results["hits"], results["total"])

    # Get attachments for community
    query = _build_email_query(email_addrs=email_address_list, qs=qs,
                               entity=entity_dict, date_bounds=date_bounds,
                               attachments_only=True)
    tangelo.log("entity.get_graph_by_entity(attachment-query: %s)" % (query))
    graph["attachments"] = _query_email_attachments(data_set_id, size, query)
    return graph
def getEmail(*args):
    """Return one email (plus its entities) by id taken from path arg 0."""
    email_id = urllib.unquote(nth(args, 0, ''))
    if not email_id:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing id")
    tangelo.content_type("application/json")
    return {"email": queryEmail(email_id),
            "entities": queryEntity(email_id)}
def last_save(*args):
    """Return the most recently created auto-save session, or an empty one."""
    tangelo.content_type("application/json")
    candidates = list(glob.iglob('{}/*.json'.format(auto_save_dir)))
    if not candidates:
        return {'trainings': []}
    newest = max(candidates, key=os.path.getctime)
    return slurp(newest)
def queryEmail(email):
    """Fetch a single email row by id; returns [] when nothing matches."""
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), stmt_email_by_id, email) as qry:
            tangelo.log("node-vals: %s" % qry.stmt)
            row = qry.cursor().fetchone()
            tangelo.content_type("application/json")
            return row if row else []
def server_save(*args, **kwargs):
    """Persist the posted 'data' payload under the posted 'name' in user_save_dir."""
    print("inside Server save")
    target = kwargs.get('name')
    payload = kwargs.get('data')
    spit("{}/{}".format(user_save_dir, target), json.dumps(payload))
    tangelo.content_type("application/json")
    return {'saved': target}
def getExportable(*args):
    """List ids/subjects of all emails flagged exportable."""
    stmt = (" SELECT id, subject FROM email WHERE exportable='true' ")
    tangelo.content_type("application/json")
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), stmt) as qry:
            rows = [[str(col) for col in record] for record in qry.cursor()]
            return {"emails": rows}
def get_topics_by_query(*args, **kwargs):
    """Cluster emails matching the query into topics (default algorithm: lingo)."""
    tangelo.content_type("application/json")
    algorithm = kwargs.get('algorithm', 'lingo')
    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**kwargs)
    email_address_list = parseParamEmailAddress(**kwargs)
    # TODO -------------------------------------------------------------------------
    # TODO REMEMBER TO EVALUATE QUERY TERMS -- VERY IMPORTANT for good clustering!
    # TODO -------------------------------------------------------------------------
    query_terms = ''
    # TODO set from UI
    analysis_field = kwargs.get("analysis_field", "_source.body")
    # TODO set from UI
    num_returned = 20
    clusters = get_dynamic_clusters(data_set_id, "emails",
                                    email_addrs=email_address_list,
                                    query_terms=query_terms,
                                    topic_score=None,
                                    entity={},
                                    date_bounds=(start_datetime, end_datetime),
                                    cluster_fields=[analysis_field],
                                    cluster_title_fields=["_source.subject"],
                                    algorithm=algorithm,
                                    max_doc_pool_size=500)
    return {"topics": clusters[:num_returned]}
def search_email_by_conversation(*path_args, **param_args):
    """Return the conversation around a document between a sender and recipient."""
    tangelo.content_type("application/json")
    tangelo.log("search.search_email_by_conversation(path_args[%s] %s)"
                % (len(path_args), str(path_args)))
    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**param_args)
    # TODO: set from UI
    size = param_args.get('size', 2500)

    # parse the sender address and the recipient address
    sender_list = parseParamEmailSender(**param_args)
    cherrypy.log("\tsender_list: %s)" % str(sender_list))
    recipient_list = parseParamEmailRecipient(**param_args)
    cherrypy.log("\trecipient_list: %s)" % str(recipient_list))

    document_uid = parseParamDocumentUID(**param_args)
    cherrypy.log("\tdocument_uid: %s)" % str(document_uid))
    document_datetime = parseParamDocumentDatetime(**param_args)
    cherrypy.log("\tdocument_datetime: %s)" % str(document_datetime))
    if not document_datetime:
        return tangelo.HTTPStatusCode(
            400, "invalid service call - missing mandatory param 'document_datetime'")

    sender_address, recipient_address = parseParamAllSenderAllRecipient(**param_args)
    # Half the window on each side of the anchor document.
    return es_get_conversation(data_set_id, sender_address, recipient_address,
                               start_datetime, end_datetime, size / 2,
                               document_uid, document_datetime)
def auto_save(*args, **kwargs):
    """Write the posted session state to a timestamped auto-save file."""
    cherrypy.log("saved")
    stamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    fname = "session_{}.json".format(stamp)
    spit("{}/{}".format(auto_save_dir, fname), json.dumps(kwargs))
    # Keep the auto-save directory from growing without bound.
    remove_old_files()
    tangelo.content_type("application/json")
    return {'saved': fname}
def get_graph_for_entity(*args, **kwargs):
    """Query emails for an entity and return the resulting graph with attachments."""
    tangelo.content_type("application/json")
    tangelo.log("entity.get_graph_for_entity(args: %s kwargs: %s)" % (str(args), str(kwargs)))
    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**kwargs)
    email_address_list = parseParamEmailAddress(**kwargs)
    entity_dict = parseParamEntity(**kwargs)
    # TODO set from UI
    size = size if size > 500 else 2500
    qs = parseParamTextQuery(**kwargs)

    email_query = _build_email_query(email_addrs=email_address_list, qs=qs,
                                     entity=entity_dict,
                                     date_bounds=(start_datetime, end_datetime))
    tangelo.log("entity.get_graph_for_entity(query: %s)" % (email_query))
    results = _query_emails(data_set_id, size, email_query)
    graph = _build_graph_for_emails(data_set_id, results["hits"], results["total"])

    # Get attachments for community
    attachment_query = _build_email_query(email_addrs=email_address_list, qs=qs,
                                          entity=entity_dict,
                                          date_bounds=(start_datetime, end_datetime),
                                          attachments_only=True)
    tangelo.log("entity.get_graph_by_entity(attachment-query: %s)" % (attachment_query))
    attachments = _query_email_attachments(data_set_id, size, attachment_query)
    graph["attachments"] = attachments
    return graph
def getEmail(*args):
    """Fetch a single email and its extracted entities by id (path arg 0)."""
    requested = urllib.unquote(nth(args, 0, ''))
    if not requested:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing id")
    tangelo.content_type("application/json")
    return {
        "email": queryEmail(requested),
        "entities": queryEntity(requested),
    }
def getRankedAddresses(*args, **kwargs):
    """List the highest-ranked addresses with per-address search graphs attached."""
    tangelo.content_type("application/json")
    tangelo.log("getRankedAddresses(args: %s kwargs: %s)" % (str(args), str(kwargs)))
    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**kwargs)
    # TODO - reminder no 'qs' here set to ''
    # qs = parseParamTextQuery(**kwargs)
    qs = ''
    # TODO this needs to come from UI
    size = size if size > 500 else 2500

    ranked_addresses = get_ranked_email_address_from_email_addrs_index(
        data_set_id, start_datetime, end_datetime, size)

    def describe(row):
        # row layout: [address, ..., received(5), sent(6), attachments(7)]
        graph = es_get_all_email_by_address(data_set_id, row[0], qs,
                                            start_datetime, end_datetime, size)
        return {
            "address_search_url_path": row[0],
            "parameters": kwargs,
            "search_results": {
                "mail_sent_count": row[6],
                "mail_received_count": row[5],
                "mail_attachment_count": row[7],
                "query_matched_count": graph["query_hits"],
                "associated_count": len(graph["graph"]["nodes"]),
            },
            "TEMPORARY_GRAPH": graph,
        }

    return {"top_address_list": [describe(row) for row in ranked_addresses["emails"]]}
def searchStarred(*args, **kwargs):
    """Build the email graph (plus attachments) restricted to starred emails."""
    tangelo.log("email.searchStarred(args: %s kwargs: %s)" % (str(args), str(kwargs)))
    tangelo.content_type("application/json")
    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**kwargs)
    size = size if size > 500 else 2500
    # TODO set from UI
    query_terms = ''
    email_address_list = []
    bounds = (start_datetime, end_datetime)

    query = _build_email_query(email_addrs=email_address_list, qs=query_terms,
                               date_bounds=bounds, starred=True)
    tangelo.log("email.searchStarred(query: %s)" % (query))
    results = _query_emails(data_set_id, size, query)
    graph = _build_graph_for_emails(data_set_id, results["hits"], results["total"])

    # Get attachments for community
    query = _build_email_query(email_addrs=email_address_list, qs=query_terms,
                               date_bounds=bounds, attachments_only=True,
                               starred=True)
    tangelo.log("email.searchStarred(attachment-query: %s)" % (query))
    graph["attachments"] = _query_email_attachments(data_set_id, size, query)
    return graph
def last_save(*args):
    """Load the newest auto-saved session file, if any exist."""
    tangelo.content_type("application/json")
    saved_files = list(glob.iglob('{}/*.json'.format(auto_save_dir)))
    if len(saved_files) > 0:
        latest = max(saved_files, key=os.path.getctime)
        return slurp(latest)
    return {'trainings': []}
def searchStarred(*args, **kwargs):
    """Return the graph of starred emails together with their attachments."""
    tangelo.log("email.searchStarred(args: %s kwargs: %s)" % (str(args), str(kwargs)))
    tangelo.content_type("application/json")
    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**kwargs)
    size = size if size > 500 else 2500
    # TODO set from UI
    query_terms = ''
    email_address_list = []

    starred_query = _build_email_query(email_addrs=email_address_list,
                                       qs=query_terms,
                                       date_bounds=(start_datetime, end_datetime),
                                       starred=True)
    tangelo.log("email.searchStarred(query: %s)" % (starred_query))
    results = _query_emails(data_set_id, size, starred_query)
    graph = _build_graph_for_emails(data_set_id, results["hits"], results["total"])

    # Get attachments for community
    attachment_query = _build_email_query(email_addrs=email_address_list,
                                          qs=query_terms,
                                          date_bounds=(start_datetime, end_datetime),
                                          attachments_only=True,
                                          starred=True)
    tangelo.log("email.searchStarred(attachment-query: %s)" % (attachment_query))
    attachments = _query_email_attachments(data_set_id, size, attachment_query)
    graph["attachments"] = attachments
    return graph
def stream_next(key):
    """Pull the next value from the registered stream *key*, retiring it on end/error."""
    if key not in streams:
        tangelo.http_status(404, "No Such Key")
        return {"error": "Stream key does not correspond to an active stream",
                "stream": key}

    # Grab the stream in preparation for running it. next() either yields a
    # result (keep streaming), raises StopIteration (finished), or raises
    # something else (error: log a traceback under a generated error code).
    generator = streams[key]
    try:
        return generator.next()
    except StopIteration:
        del streams[key]
        tangelo.http_status(204, "Stream Finished")
        return "OK"
    except:
        del streams[key]
        tangelo.http_status(500, "Streaming Service Exception")
        tangelo.content_type("application/json")
        error_code = tangelo.util.generate_error_code()
        tangelo.util.log_traceback(
            "STREAM", error_code,
            "Offending stream key: %s" % (key),
            "Uncaught executing executing service %s" % (tangelo.request_path))
        return tangelo.util.error_report(error_code)
def spacy_save(*args, **kwargs):
    """Convert posted training data to spaCy format and save it under 'name'."""
    print("inside spacy save")
    target = kwargs.get('name')
    raw = kwargs.get('data')
    converted = modify_output(raw)
    spit("{}/{}".format(user_save_dir, target), json.dumps(converted))
    tangelo.content_type("application/json")
    return {'saved': target, 'data': converted}
def train_spacy(*args, **kwargs):
    """Convert posted training data to spaCy format and train the entity extractor.

    Mirrors spacy_save's parameter handling: 'data' carries the training
    payload and 'name' the save label echoed back to the caller.
    """
    print("inside training spacy")
    # BUG FIX: the original returned {'saved': f} but never defined f,
    # so every call ended in a NameError. Read 'name' as spacy_save does.
    f = kwargs.get('name')
    data = kwargs.get('data')
    data_spacy_format = modify_output(data)
    train_entity_extractor(data_spacy_format)
    # spit("{}/{}".format(user_save_dir, f), json.dumps(data_spacy_format))
    tangelo.content_type("application/json")
    return {'saved': f}
def export_emails_archive(data_set_id, email_ids=["f9c9c59a-7fe8-11e5-bb05-08002705cb99"]):
    """Stream a .tar.gz archive of the given emails (and their attachments).

    Each email becomes a directory named by its id containing a JSON dump of
    the document plus any base64-decoded attachments.
    NOTE(review): the mutable default list for email_ids is shared across
    calls; it appears to be a development-time sample id — confirm callers
    always pass explicit ids.
    """
    cherrypy.log("email.get_attachments_sender(index=%s, attachment_id=%s)" % (data_set_id, email_ids))
    if not data_set_id:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing index")

    # if not email:
    #     return tangelo.HTTPStatusCode(400, "invalid service call - missing attachment_id")
    # elasticsearch.exceptions.ConnectionTimeout: ConnectionTimeout caused by - ReadTimeoutError(HTTPConnectionPool(host='10.1.70.143', port=9200): Read timed out. (read timeout=10))
    # NOTE(review): hard-coded ES host; the generous timeout works around the
    # read timeouts documented above.
    es = Elasticsearch([{"host" : "10.1.70.143", "port" : 9200}], request_timeout=60)

    # TODO can implement with multiple doc_types and combine attachments in
    emails = es.mget(index=data_set_id, doc_type="emails", body={"docs":[{"_id":id} for id in email_ids]})

    # TODO filename
    filename= "export.tar.gz"
    tangelo.content_type("application/x-gzip")
    header("Content-Disposition", 'attachment; filename="{}"'.format(filename))

    # Build the whole archive in memory, then return its bytes.
    string_buffer = cStringIO.StringIO()
    tar = tarfile.open(mode='w:gz', fileobj=string_buffer)

    # Add each email to the tar
    for email_source in emails["docs"]:
        email = email_source["_source"]

        # Directory entry named after the email id.
        tarinfo_parent= tarfile.TarInfo(name = email["id"])
        tarinfo_parent.type = tarfile.DIRTYPE
        tarinfo_parent.mode = 0755
        tarinfo_parent.mtime = time.time()
        tar.addfile(tarinfo_parent)

        # The email document itself as <id>/<id>.json.
        tarinfo = tarfile.TarInfo(email["id"]+"/"+email["id"]+".json")
        # TODO -- email transformation
        data_string = json.dumps(email)
        fobj = cStringIO.StringIO(data_string)
        tarinfo.size = len(data_string)
        tarinfo.mode = 0644
        tarinfo.mtime = time.time()
        tar.addfile(tarinfo, fobj)

        # Get the attachments
        if email["attachments"]:
            attachments = es.mget(index=data_set_id, doc_type="attachments", body={"docs":[{"_id":attch["guid"]} for attch in email["attachments"]]})
            for attachment_source in attachments["docs"]:
                attachment = attachment_source["_source"]
                # NOTE(review): this rebinds the outer `filename` used for the
                # Content-Disposition header above; harmless only because the
                # header was already emitted.
                filename = attachment["filename"]
                attch_data = str(base64.b64decode(attachment["contents64"]))
                tarinfo_attch = tarfile.TarInfo(email["id"]+"/"+filename)
                tarinfo_attch.size = len(attch_data)
                tarinfo_attch.mode = 0644
                tarinfo_attch.mtime = time.time()
                tar.addfile(tarinfo_attch, cStringIO.StringIO(attch_data))
    tar.close()
    return string_buffer.getvalue()
def getDomains(*args):
    """Count email addresses per domain (text after '@')."""
    stmt = (
        "SELECT SUBSTRING_INDEX(email_addr, '@', -1) as eml, count(1) from email_addr group by eml"
    )
    tangelo.content_type("application/json")
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), stmt) as qry:
            domains = [[str(col) for col in record] for record in qry.cursor()]
            return {"domains": domains}
def getCommunities(*args, **kwargs):
    """Return the top communities for the data set within the date bounds."""
    tangelo.log("getCommunities(args: %s kwargs: %s)" % (str(args), str(kwargs)))
    tangelo.content_type("application/json")
    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**kwargs)
    #top_count = int(urllib.unquote(nth(args, 0, "40")))
    top_count = int(size)
    communities = get_top_communities(data_set_id,
                                      date_bounds=(start_datetime, end_datetime),
                                      num_communities=top_count)[:top_count]
    return {"communities": communities}
def setSelectedDataSet(*args):
    """Select the active data set (path arg 0) and warm its email-address cache."""
    tangelo.content_type("application/json")
    data_set_id=urllib.unquote(nth(args, 0, ''))
    if not data_set_id:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing data_set_id")
    # NOTE(review): resp is never used; the call is kept for its cache-warming
    # side effect.
    resp = initialize_email_addr_cache(data_set_id)
    # NOTE(review): this binds a function-local name only. If a module-level
    # _current_data_set_selected was intended, a `global` statement is
    # missing — confirm against the rest of the module.
    _current_data_set_selected = data_set_id
    return _index_record(data_set_id)
def getRankedEmails(*args, **kwargs):
    """Return ranked email addresses from the email_addrs index."""
    tangelo.content_type("application/json")
    tangelo.log("getRankedEmails(args: %s kwargs: %s)" % (str(args), str(kwargs)))
    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**kwargs)
    return get_ranked_email_address_from_email_addrs_index(data_set_id,
                                                           start_datetime,
                                                           end_datetime,
                                                           size)
def getDomains(*args):
    """Group email addresses by domain and return per-domain counts."""
    stmt = (
        "SELECT SUBSTRING_INDEX(email_addr, '@', -1) as eml, count(1) from email_addr group by eml"
    )
    tangelo.content_type("application/json")
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), stmt) as qry:
            results = []
            for row in qry.cursor():
                results.append([str(value) for value in row])
            return {"domains": results}
def getExportable(*args):
    """Return id/subject pairs for every email marked exportable."""
    stmt = (" SELECT id, subject FROM email WHERE exportable='true' ")
    tangelo.content_type("application/json")
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), stmt) as qry:
            exportable = []
            for row in qry.cursor():
                exportable.append([str(value) for value in row])
            return {"emails": exportable}
def download(data):
    """Kick off a background IMAP-style download of a user's mailbox.

    Returns immediately with {"id": user}; progress is appended to a
    per-user logfile under work_dir by the worker thread.
    """
    user = data.get("user")
    if not user:
        return tangelo.HTTPStatusCode(400, "invalid service call missing user")
    passwd = data.get("pass")
    limit = data.get("limit", "2000")
    logfile = "{}/{}.log".format(work_dir, user)
    # Truncate (third arg True) and start the per-user log.
    spit(logfile, "[Start] {}\n".format(user), True)
    cherrypy.log("logfile: {}".format(logfile))
    def download_thread():
        # Worker body: login, rebuild the user's folder, download up to
        # `limit` messages, logging progress and errors along the way.
        try:
            cherrypy.log("Thread Start User: {}".format(user))
            try:
                session = newman_email.login(user, passwd, logfile)
                fldr = "{}/emails/{}".format(webroot, user)
                cherrypy.log("Login User: {}".format(user))
                # Start from a clean folder each run.
                if os.path.exists(fldr):
                    rmrf(fldr)
                mkdir(fldr)
                spit("{}/output.csv".format(fldr), newman_email.headerrow() + "\n")
                mkdir(fldr + "/emails")
                newman_email.download(session, user, fldr, int(limit), logfile)
                spit(logfile, "[Completed Download] {}\n".format(user))
            except Exception as ex:
                spit(logfile, "[Error] {}\n".format(ex))
                cherrypy.log("[Error] {}\n".format(ex))
            except:
                # NOTE(review): unreachable in practice — the Exception clause
                # above catches everything but exits like SystemExit; also
                # sys.exc_info()[0] is the exception *class*, so .replace()
                # would itself raise. Left as-is pending confirmation.
                spit(logfile, "[Error]")
                cherrypy.log("[Error]")
                error_info = sys.exc_info()[0]
                cherrypy.log(error_info)
                spit(logfile, "[Error] {}\n".format(error_info.replace('\n', ' ')))
            finally:
                # NOTE(review): if login() itself raised, `session` is unbound
                # here and this line raises NameError (caught by the outer
                # except) — confirm intended.
                newman_email.close_session(session)
        except:
            # NOTE(review): same class-vs-message issue as above.
            error_info = sys.exc_info()[0]
            cherrypy.log(error_info)
            spit(logfile, "[Error] {}\n".format(error_info.replace('\n', ' ')))
    thr = threading.Thread(target=download_thread, args=())
    thr.start()
    tangelo.content_type("application/json")
    return {"id": user}
def stream_start(url, kwargs):
    """Resolve *url* to a streaming service, invoke its stream(), and register
    the resulting generator under a freshly generated key.

    Returns {"key": ...} on success, or an error payload with an appropriate
    HTTP status on any failure along the way.
    """
    content = tangelo.server.analyze_url(url).content
    if content is None or content.type != Content.Service:
        tangelo.http_status(500, "Error Opening Streaming Service")
        return {"error": "could not open streaming service"}
    else:
        # Extract the path to the service and the list of positional
        # arguments.
        module_path = content.path
        pargs = content.pargs
        # Get the service module.
        try:
            service = modules.get(module_path)
        except:
            tangelo.http_status(501, "Error Importing Streaming Service")
            tangelo.content_type("application/json")
            return tangelo.util.traceback_report(
                error="Could not import module %s" % (module_path))
        else:
            # Check for a "stream" function inside the module.
            if "stream" not in dir(service):
                tangelo.http_status(400, "Non-Streaming Service")
                return {
                    "error": "The requested streaming service does not implement a 'stream()' function"
                }
            else:
                # Call the stream function and capture its result.
                try:
                    stream = service.stream(*pargs, **kwargs)
                except Exception:
                    result = tangelo.util.traceback_report(
                        error=
                        "Caught exception during streaming service execution",
                        module=tangelo.request_path())
                    tangelo.log_warning(
                        "STREAM", "Could not execute service %s:\n%s" %
                        (tangelo.request_path(), "\n".join(
                            result["traceback"])))
                    tangelo.http_status(500, "Streaming Service Raised Exception")
                    tangelo.content_type("application/json")
                    return result
                else:
                    # Generate a key corresponding to this object.
                    key = tangelo.util.generate_key(streams)
                    # Log the object in the streaming table.
                    streams[key] = stream
                    # Create an object describing the logging of the generator object.
                    return {"key": key}
def getRollup(*args):
    """Look up the rollup id for the entity given as path arg 0."""
    entity_id = urllib.unquote(nth(args, 0, ""))
    if not entity_id:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing id")
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), stmt_entity_rollup_id, entity_id) as qry:
            row = qry.cursor().fetchone()
            tangelo.content_type("application/json")
            return {"rollupId": row}
def tables(*args):
    """List Impala tables ending in 'tracks_comms_joined', suffix stripped.

    Returns a JSON string of {'tables': [...]} (the handler serializes
    explicitly with json.dumps).
    """
    with impalaopen(":".join(settings.IMPALA)) as curr:
        curr.execute("show tables")
        tangelo.content_type("application/json")
        # BUG FIX: each cursor row is a tuple whose first column is the table
        # name. The original sliced the *tuple* (row[:-20]) — which yields an
        # empty tuple — while the filter correctly inspected row[0]. Strip
        # the 20-char suffix from the name itself.
        return json.dumps({
            'tables': [
                row[0][:-20]
                for row in curr
                if row[0].endswith("tracks_comms_joined")
            ]
        })
def plugin(self, *path, **args):
    """Dispatch a plugin-backed analysis after refreshing the plugin registry."""
    if self.plugins:
        # A bad plugin configuration aborts the request with a plain-text 400.
        problem = self.plugins.refresh()
        if problem is not None:
            tangelo.content_type("text/plain")
            tangelo.http_status(400, "Bad Plugin Configuration")
            return problem
    return self.execute_analysis(args)
def setExportMany(data):
    """Mark a batch of emails exportable (or not) and echo the affected ids."""
    emails = data.get('emails', [])
    flag = 'true' if data.get('exportable', True) else 'false'
    stmt = (" UPDATE email SET exportable=%s WHERE id = %s ")
    with newman_connector() as cnx:
        # One parameterized UPDATE per email id.
        for email_id in emails:
            with execute_nonquery(cnx.conn(), stmt, flag, email_id) as qry:
                pass
    tangelo.content_type("application/json")
    return {'exported': emails}
def setSelectedDataSet(*args):
    """Activate the data set named by path arg 0 and warm its address cache."""
    tangelo.content_type("application/json")
    data_set_id = urllib.unquote(nth(args, 0, ''))
    if not data_set_id:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing data_set_id")
    # Called for its cache-warming side effect; the response is unused.
    resp = initialize_email_addr_cache(data_set_id)
    # NOTE(review): binds a local name only — confirm whether a module-level
    # global was intended here.
    _current_data_set_selected = data_set_id
    return _index_record(data_set_id)
def topic_list(*args):
    """Return the topic rows for a category (path arg 0, default 'all')."""
    category = nth(args, 0, 'all')
    with newman_connector() as read_cnx:
        stmt = (" select idx, value, docs from topic_category "
                " where category_id = %s "
                " order by idx ")
        with execute_query(read_cnx.conn(), stmt, category) as qry:
            rows = list(qry.cursor())
            tangelo.content_type("application/json")
            return {"categories": rows}
def getAttachCount(*args, **kwargs):
    """Return weekly attachment-activity histograms, per account or data-set-wide."""
    tangelo.content_type("application/json")
    tangelo.log("getAttachCount(args: %s kwargs: %s)" % (str(args), str(kwargs)))
    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**kwargs)
    attach_type = urllib.unquote(nth(args, 0, ''))
    if not attach_type:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing attach_type")
    attach_type = 'all'  # hack for now
    email_address_list = parseParamEmailAddress(**kwargs)

    if not email_address_list:
        # No accounts requested: one data-set-wide activity series.
        activity = get_total_attachment_activity(data_set_id, data_set_id,
                                                 query_function=attachment_histogram,
                                                 sender_email_addr="",
                                                 start=start_datetime,
                                                 end=end_datetime,
                                                 interval="week")
        account_entries = [{
            "account_id": data_set_id,
            "data_set_id": data_set_id,
            "account_start_datetime": start_datetime,
            "account_end_datetime": end_datetime,
            "activities": activity,
        }]
    else:
        # One entry per requested email account.
        account_entries = []
        for account_id in email_address_list:
            account_entries.append({
                "account_id": account_id,
                "data_set_id": data_set_id,
                "account_start_datetime": start_datetime,
                "account_end_datetime": end_datetime,
                "activities": get_emailer_attachment_activity(data_set_id,
                                                              account_id,
                                                              (start_datetime, end_datetime),
                                                              interval="week"),
            })
    return {"account_activity_list": account_entries}
def download(data):
    """Start a background download of *user*'s mailbox; returns immediately.

    Progress and errors are appended to work_dir/<user>.log by the thread.
    """
    user = data.get("user")
    if not user:
        return tangelo.HTTPStatusCode(400, "invalid service call missing user")
    passwd = data.get("pass")
    limit = data.get("limit", "2000")
    logfile = "{}/{}.log".format(work_dir, user)
    # Third arg True truncates any previous log for this user.
    spit(logfile, "[Start] {}\n".format(user), True)
    cherrypy.log("logfile: {}".format(logfile))
    def download_thread():
        # Worker: login, recreate the user's email folder, pull up to `limit`
        # messages, then close the session.
        try:
            cherrypy.log("Thread Start User: {}".format(user))
            try:
                session = newman_email.login(user, passwd, logfile)
                fldr = "{}/emails/{}".format(webroot, user)
                cherrypy.log("Login User: {}".format(user))
                if os.path.exists(fldr):
                    rmrf(fldr)
                mkdir(fldr)
                spit("{}/output.csv".format(fldr), newman_email.headerrow() + "\n")
                mkdir(fldr + "/emails")
                newman_email.download(session, user, fldr, int(limit), logfile)
                spit(logfile, "[Completed Download] {}\n".format(user))
            except Exception as ex:
                spit(logfile, "[Error] {}\n".format(ex))
                cherrypy.log("[Error] {}\n".format(ex))
            except:
                # NOTE(review): effectively unreachable after the Exception
                # clause; also sys.exc_info()[0] is a class, so .replace()
                # would raise — confirm before relying on this path.
                spit(logfile, "[Error]")
                cherrypy.log("[Error]")
                error_info = sys.exc_info()[0]
                cherrypy.log(error_info)
                spit(logfile, "[Error] {}\n".format(error_info.replace('\n', ' ')))
            finally:
                # NOTE(review): `session` is unbound if login() raised;
                # NameError here is swallowed by the outer except.
                newman_email.close_session(session)
        except:
            # NOTE(review): same class-vs-message concern as above.
            error_info = sys.exc_info()[0]
            cherrypy.log(error_info)
            spit(logfile, "[Error] {}\n".format(error_info.replace('\n', ' ')))
    thr = threading.Thread(target=download_thread, args=())
    thr.start()
    tangelo.content_type("application/json")
    return { "id" : user }
def getRollup(*args):
    """Resolve an entity (path arg 0) to its rollup id."""
    requested = urllib.unquote(nth(args, 0, ''))
    if not requested:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing id")
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), stmt_entity_rollup_id, requested) as qry:
            rollup = qry.cursor().fetchone()
            tangelo.content_type("application/json")
            return {"rollupId": rollup}
def exif_emails(*args, **kwargs):
    """Return emails carrying EXIF geo data for the selected data set."""
    tangelo.log("geo.exif_emails(args: %s kwargs: %s)" % (str(args), str(kwargs)))
    tangelo.content_type("application/json")
    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**kwargs)
    # Parsed but not forwarded: es_get_exif_emails currently takes only the
    # data set id and size (date bounds and qs are ignored).
    qs = parseParamTextQuery(**kwargs)
    return es_get_exif_emails(data_set_id, size)
def getRankedEmails(*args):
    """Return all ranked email addresses ordered by descending rank (SQL back end)."""
    tangelo.content_type("application/json")
    stmt = (" select email_addr, community, community_id, group_id, rank, total_received, total_sent "
            " from email_addr "
            " where rank > 0 "
            " order by cast(rank as decimal(4,4)) desc")
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), stmt) as qry:
            ranked = [[str(col) for col in record] for record in qry.cursor()]
            return {"emails": ranked}
def ingest(data):
    """Start a background rebuild + ingest run; returns the log name immediately.

    The worker first runs bin/rebuild_all.sh, then bin/ingest.sh with the
    requested config, writing stdout/stderr to tee/err files and status
    markers ([Started]/[Running]/[Error]/[Complete]) to the status log.
    """
    cfg = "{}/conf/{}".format(base_dir, data.get('conf', 'target.cfg'))
    logname = "ingest_{}".format(fmtNow())
    teefile = "{}/{}.tee.log".format(work_dir, logname)
    errfile = "{}/{}.err.log".format(work_dir, logname)
    logfile = "{}/{}.status.log".format(work_dir, logname)
    cherrypy.log("Ingest config: {}".format(cfg))
    cherrypy.log("Ingest logfile: {}".format(logfile))
    def ingest_thread():
        cherrypy.log("Ingest Started:")
        try:
            cherrypy.log("started: {}".format(fmtNow()))
            spit(logfile, "[Started] {} \n".format(fmtNow()))

            # Phase 1: full rebuild; abort the run on a non-zero exit.
            args = ["./bin/rebuild_all.sh"]
            cherrypy.log("running: {}".format(" ".join(args)))
            spit(logfile, "[Running] {} \n".format(" ".join(args)))
            with open(teefile, 'w') as t, open(errfile, 'w') as e:
                kwargs = {'stdout': t, 'stderr': e, 'cwd': base_dir }
                rebuildp = subprocess.Popen(args, **kwargs)
                out, err = rebuildp.communicate()
            cherrypy.log("rebuild complete: {}".format(fmtNow()))
            rtn = rebuildp.returncode
            if rtn != 0:
                spit(logfile, "[Error] rebuild return with non-zero code: {} \n".format(rtn))
                return

            # Phase 2: the actual ingest with the chosen config.
            # NOTE(review): reopening teefile/errfile with 'w' truncates the
            # rebuild output captured above — confirm whether that is intended.
            args = ["./bin/ingest.sh", cfg]
            cherrypy.log("running ingest: {}".format(" ".join(args)))
            spit(logfile, "[Running] {} \n".format(" ".join(args)))
            with open(teefile, 'w') as t, open(errfile, 'w') as e:
                kwargs = {'stdout': t, 'stderr': e, 'cwd': base_dir, 'bufsize' : 1 }
                subp = subprocess.Popen(args, **kwargs)
                out, err = subp.communicate()
            cherrypy.log("complete: {}".format(fmtNow()))
            rtn = subp.returncode
            if rtn != 0:
                spit(logfile, "[Error] return with non-zero code: {} \n".format(rtn))
            else:
                spit(logfile, "[Complete]")
        except:
            # NOTE(review): sys.exc_info()[0] is the exception class, so
            # .replace() here would itself raise — confirm.
            error_info = sys.exc_info()[0]
            cherrypy.log(error_info)
            spit(logfile, "[Error] {}\n".format(error_info.replace('\n', ' ')))
    thr = threading.Thread(target=ingest_thread, args=())
    thr.start()
    tangelo.content_type("application/json")
    return {'log' : logname }
def setExportMany(data):
    """Set the exportable flag on each posted email id; returns the id list."""
    email_ids = data.get('emails', [])
    export_value = 'true' if data.get('exportable', True) else 'false'
    stmt = (" UPDATE email SET exportable=%s WHERE id = %s ")
    with newman_connector() as cnx:
        for one_id in email_ids:
            # Parameterized per-row update; the context manager handles cleanup.
            with execute_nonquery(cnx.conn(), stmt, export_value, one_id) as qry:
                pass
    tangelo.content_type("application/json")
    return {'exported': email_ids}
def getRankedEmails(*args):
    """Fetch ranked email addresses (rank > 0) sorted best-first from SQL."""
    tangelo.content_type("application/json")
    stmt = (
        " select email_addr, community, community_id, group_id, rank, total_received, total_sent "
        " from email_addr "
        " where rank > 0 "
        " order by cast(rank as decimal(4,4)) desc"
    )
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), stmt) as qry:
            rows = []
            for record in qry.cursor():
                rows.append([str(value) for value in record])
            return {"emails": rows}
def topic_list(*args):
    """List topics (idx, value, docs) for the category given as path arg 0."""
    which_category = nth(args, 0, 'all')
    with newman_connector() as read_cnx:
        stmt = (
            " select idx, value, docs from topic_category "
            " where category_id = %s "
            " order by idx "
        )
        with execute_query(read_cnx.conn(), stmt, which_category) as qry:
            categories = [row for row in qry.cursor()]
            tangelo.content_type("application/json")
            return {"categories": categories}
def getTopRollup(*args):
    """Return the top-N rollup entities, N taken from the first path argument.

    Returns:
        {'entities': [...]} on success, or an HTTP 400 status when the
        limit is missing or not a plain positive integer.
    """
    amt = urllib.unquote(nth(args, 0, ""))
    if not amt:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing id")
    # Security fix: the limit is spliced directly into the SQL text, so
    # reject anything that is not a plain integer to prevent SQL injection
    # via the URL path segment.
    if not amt.isdigit():
        return tangelo.HTTPStatusCode(400, "invalid service call - missing id")
    stmt = stmt_top_rollup_entities + ("limit {0}".format(amt))
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), stmt) as qry:
            rtn = [r for r in qry.cursor()]
    tangelo.content_type("application/json")
    return {"entities": rtn}
def getAllAttachmentBySender(*args, **kwargs):
    """Return attachments sent by the address in the first positional path arg.

    Requires a data_set_id (via the datetime/query params) and a sender
    address; answers HTTP 400 when either is missing.
    """
    tangelo.log("getAttachmentsSender(args: %s kwargs: %s)" % (str(args), str(kwargs)))
    data_set_id, start_dt, end_dt, size = parseParamDatetime(**kwargs)
    sender_addr = nth(args, 0, '')

    # Guard clauses: both identifiers are mandatory.
    if not data_set_id:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing data_set_id")
    if not sender_addr:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing sender")

    tangelo.content_type("application/json")
    return get_attachments_by_sender(data_set_id, sender_addr, start_dt, end_dt, size)
def extract_pst(*args, **kwargs):
    """Start a background extraction + ingest of an uploaded mail archive.

    Expects kwargs 'ingest-id', 'file' and optionally 'type' (default 'pst').
    Spawns ./bin/ingest.sh in a thread, logging progress markers to a
    per-run status log, and reloads the email_addr cache on success.

    Returns:
        {'log': <logname>} immediately; work continues in the thread.

    Raises:
        TypeError: when ingest-id, type, or file is missing.
    """
    cherrypy.log("search.extract_pst(kwargs[%s] %s)" % (len(kwargs), str(kwargs)))
    ingest_id = kwargs.get("ingest-id")
    ingest_file = kwargs.get("file")
    # Renamed from `type` to avoid shadowing the builtin.
    ingest_type = kwargs.get("type", "pst")
    if not ingest_id or not ingest_type or not ingest_file:
        raise TypeError("Encountered a 'None' value for 'email', 'type', or 'ingest_file!'")

    # Add the prefix for the newman indexes
    ingest_id = index_prefix + ingest_id
    logname = "pst_{}".format(fmtNow())
    ingester_log = "{}/{}.ingester.log".format(work_dir, logname)
    service_status_log = "{}/{}.status.log".format(work_dir, logname)
    spit(service_status_log, "[Start] email address={}\n".format(ingest_id), True)

    def extract_thread():
        try:
            args = ["./bin/ingest.sh", ingest_id, ingest_parent_dir, ingest_file, ingest_type]
            cherrypy.log("running pst: {}".format(" ".join(args)))
            spit(service_status_log, "[Running] {} \n".format(" ".join(args)))
            # stdout and stderr share one log; bufsize=1 keeps it line-buffered.
            with open(ingester_log, 'w') as t:
                popen_kwargs = {'stdout': t, 'stderr': t, 'cwd': base_dir, 'bufsize': 1}
                subp = subprocess.Popen(args, **popen_kwargs)
                subp.communicate()
            cherrypy.log("complete: {}".format(fmtNow()))
            rtn = subp.returncode
            if rtn != 0:
                spit(service_status_log, "[Error] return with non-zero code: {} \n".format(rtn))
            else:
                spit(service_status_log, "[Done Ingesting data. Reloading the email_addr cache.]")
                initialize_email_addr_cache(ingest_id, update=True)
                spit(service_status_log, "[Complete.]")
        except Exception as exc:
            # Bug fix: the original used sys.exc_info()[0] — the exception
            # *class* — and called .replace() on it, which raised a secondary
            # AttributeError and swallowed the real failure message.
            error_info = "{}: {}".format(type(exc).__name__, exc)
            cherrypy.log(error_info)
            spit(service_status_log, "[Error] {}\n".format(error_info.replace('\n', ' ')))

    thr = threading.Thread(target=extract_thread, args=())
    thr.start()
    tangelo.content_type("application/json")
    return {'log': logname}
def getEmail(*args, **kwargs):
    """Fetch a single email document; its id is the last positional path segment.

    Answers HTTP 400 when the id segment is empty.
    """
    tangelo.log("getEmail(args: %s kwargs: %s)" % (str(args), str(kwargs)))
    tangelo.content_type("application/json")
    data_set_id, _start_dt, _end_dt, _size = parseParamDatetime(**kwargs)
    text_query = parseParamTextQuery(**kwargs)
    email_id = args[-1]
    if not email_id:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing email_id")
    return get_email(data_set_id, email_id, text_query)
def setStarred(*args, **kwargs):
    """Set or clear the starred flag on one email.

    The email id is the last positional path segment; the flag comes from
    the 'starred' query parameter. Answers HTTP 400 when the id is empty.
    """
    tangelo.log("setStarred(args: %s kwargs: %s)" % (str(args), str(kwargs)))
    tangelo.content_type("application/json")
    data_set_id, _start_dt, _end_dt, _size = parseParamDatetime(**kwargs)
    email_id = args[-1]
    if not email_id:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing email_id")
    starred_flag = parseParamStarred(**kwargs)
    # set_starred operates on a list of ids; we pass a singleton.
    return set_starred(data_set_id, [email_id], starred_flag)
def execute_analysis(self, query_args):
    """Analyze the request URL and serve it.

    Handles redirect/plugin-list directives, optional htaccess updates,
    and then serves the resolved content: a static file, a directory
    listing, a service invocation, or a 403/404 error.

    Args:
        query_args: dict of query-string parameters forwarded to services.

    Raises:
        cherrypy.HTTPRedirect / InternalRedirect / HTTPError for the
        corresponding outcomes; RuntimeError on impossible analysis states.
    """
    # Hide the identity/version number of the server technology in the
    # response headers.
    cherrypy.response.headers["Server"] = ""

    # Analyze the URL.
    analysis = analyze_url(cherrypy.request.path_info)
    directive = analysis.directive
    content = analysis.content

    # If any "directives" were found (i.e., redirections) perform them here.
    if directive is not None:
        if directive.type == Directive.HTTPRedirect:
            raise cherrypy.HTTPRedirect(analysis.directive.argument)
        elif directive.type == Directive.InternalRedirect:
            raise cherrypy.InternalRedirect(analysis.directive.argument)
        elif directive.type == Directive.ListPlugins:
            tangelo.content_type("application/json")
            plugin_list = self.plugins.plugin_list() if self.plugins else []
            return json.dumps(plugin_list)
        else:
            raise RuntimeError("fatal internal error: illegal directive type code %d" % (analysis.directive.type))

    # If content was actually found at the URL, perform any htaccess updates
    # now.
    #
    # Bug fix: the original expression
    #     self.auth_update and content is None or content.type != Content.NotFound
    # parsed as (A and B) or C, so whenever content was None the C operand
    # dereferenced content.type and raised AttributeError. The parentheses
    # below restore the intended short-circuit evaluation.
    do_auth = self.auth_update and (content is None or content.type != Content.NotFound)
    if do_auth:
        self.auth_update.update(analysis.reqpathcomp, analysis.pathcomp)

    # Serve content here, either by serving a static file, generating a
    # directory listing, executing a service, or barring the client entry.
    if content is not None:
        if content.type == Content.File:
            if content.path is not None:
                return cherrypy.lib.static.serve_file(content.path)
            else:
                raise cherrypy.HTTPError("403 Forbidden", "The requested path is forbidden")
        elif content.type == Content.Directory:
            if content.path is not None:
                return Tangelo.dirlisting(content.path, cherrypy.request.path_info)
            else:
                raise cherrypy.HTTPError("403 Forbidden", "Listing of this directory has been disabled")
        elif content.type == Content.Service:
            cherrypy.thread_data.pluginpath = analysis.plugin_path
            return self.invoke_service(content.path, *content.pargs, **query_args)
        elif content.type == Content.NotFound:
            raise cherrypy.HTTPError("404 Not Found", "The path '%s' was not found" % (content.path))
        else:
            raise RuntimeError("fatal error: illegal content type code %d" % (content.type))
    else:
        raise RuntimeError("fatal internal error: analyze_url() returned analysis without directive or content")
def getTarget(*args):
    """Return the email address whose mailbox is under analysis.

    Looks up the configured 'target' option in the email_addr table.
    TODO: read from file or config (carried over from original).

    Returns:
        {'email': [[email_addr, community, community_id, group_id,
                    total_received, total_sent, rank]]} with stringified values.
    """
    target_addr = getOpt('target')
    stmt = (
        " select e.email_addr, e.community, e.community_id, e.group_id, e.total_received, e.total_sent, e.rank "
        " from email_addr e "
        " where e.email_addr = %s "
    )
    tangelo.content_type("application/json")
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), stmt, target_addr) as qry:
            rows = [[str(col) for col in record] for record in qry.cursor()]
            return {"email": rows}
def stream_start(url, kwargs):
    """Instantiate a streaming service's generator and register it.

    Resolves the service module for `url`, verifies it exposes a stream()
    function, calls it with the URL's positional args plus `kwargs`, and
    stores the resulting generator in the global `streams` table.

    Returns:
        {'key': <new stream key>} on success, or an error report dict
        (with the HTTP status already set) on failure.
    """
    content = tangelo.server.analyze_url(url).content

    # Guard: the URL must resolve to a service.
    if content is None or content.type != Content.Service:
        tangelo.http_status(500, "Error Opening Streaming Service")
        return {"error": "could not open streaming service"}

    # Extract the path to the service and the list of positional
    # arguments.
    module_path = content.path
    pargs = content.pargs

    # Get the service module.
    try:
        service = modules.get(module_path)
    except:
        tangelo.http_status(500, "Error Importing Streaming Service")
        tangelo.content_type("application/json")
        error_code = tangelo.util.generate_error_code()
        tangelo.util.log_traceback("STREAM", error_code, "Could not import module %s" % (tangelo.request_path()))
        return tangelo.util.error_report(error_code)

    # Guard: the module must implement a stream() entry point.
    if "stream" not in dir(service):
        tangelo.http_status(400, "Non-Streaming Service")
        return {"error": "The requested streaming service does not implement a 'stream()' function"}

    # Call the stream function and capture its result.
    try:
        stream = service.stream(*pargs, **kwargs)
    except Exception:
        tangelo.http_status(500, "Streaming Service Raised Exception")
        tangelo.content_type("application/json")
        error_code = tangelo.util.generate_error_code()
        tangelo.util.log_traceback("STREAM", error_code, "Could not execute service %s" % (tangelo.request_path()))
        return tangelo.util.error_report(error_code)

    # Generate a key corresponding to this object and log the object in the
    # streaming table.
    key = tangelo.util.generate_key(streams)
    streams[key] = stream
    return {"key": key}
def getAttachmentsSender(*args):
    """Return every email with attachments sent from the given address.

    The sender address is the (URL-encoded) first positional path arg;
    answers HTTP 400 when it is missing.

    Returns:
        {'sender': <address>, 'email_attachments': [[...], ...]} with
        string values UTF-8 encoded and other values stringified.
    """
    sender = urllib.unquote(nth(args, 0, ''))
    if not sender:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing id")

    tangelo.content_type("application/json")
    stmt = (
        " select id, dir, datetime, from_addr, tos, ccs, bccs, subject, attach, bodysize "
        " from email "
        " where from_addr = %s and attach != '' "
    )
    with newman_connector() as read_cnx:
        with execute_query(read_cnx.conn(), stmt, sender) as qry:
            rows = [
                [cell.encode('utf-8') if isinstance(cell, basestring) else str(cell)
                 for cell in record]
                for record in qry.cursor()
            ]
            return {"sender": sender, "email_attachments": rows}