def export_edges(index): body = { "query": { "filtered": { "query": { "bool": { "must": [{ "match_all": {} }] } }, "filter": { "bool": { "must": [{ "exists": { "field": "senders" } }], "should": [{ "exists": { "field": "tos" } }, { "exists": { "field": "ccs" } }, { "exists": { "field": "bccs" } }] } } } } } def rcvrs(fields={}): return fields.get("tos", []) + fields.get("ccs", []) + fields.get( "bccs", []) count = es().count(index=index, doc_type="emails", body=body)["count"] # TODO add batch processing addrs = es().search(index=index, doc_type="emails", size=count, from_=0, fields=["senders", "tos", "ccs", "bccs"], body=body) edges = reduce(operator.add, [[{ "from": hit["fields"]["senders"][0], "to": rcvr } for rcvr in rcvrs(hit["fields"])] for hit in addrs["hits"]["hits"]]) text_file = open("/home/elliot/big_graph.json", "w") text_file.write(json.dumps({"edges": edges})) text_file.close()
def _index_record(index): tangelo.log("datasource._index_record(index: %s)" % (str(index))) email_docs_count = es().count( index=index, doc_type="emails", body={"query": { "bool": { "must": [{ "match_all": {} }] } }})["count"] emails_addrs_count = es().count( index=index, doc_type="email_address", body={"query": { "bool": { "must": [{ "match_all": {} }] } }})["count"] emails_attch_count = es().count( index=index, doc_type="attachments", body={"query": { "bool": { "must": [{ "match_all": {} }] } }})["count"] #TODO: still need to re-work the absolute date-time bounds and the suggested date-time bounds bounds = get_datetime_bounds(index) return { 'data_set_id': index, 'data_set_label': index, 'data_set_document_count': email_docs_count, 'data_set_node_count': emails_addrs_count, 'data_set_attachment_count': emails_attch_count, 'data_set_datetime_min': bounds[0], 'data_set_datetime_max': bounds[1], 'start_datetime_selected': bounds[0], 'end_datetime_selected': bounds[1] }
def es_get_sender_locations(data_set_id, size):
    tangelo.log("es_geo.es_get_sender_locations()")
    emails_resp = es().search(index=data_set_id,
                              doc_type="emails",
                              size=size,
                              body=_geo_xoip_query())
    tangelo.log("es_geo.es_get_sender_locations(total document hits = %s)" %
                emails_resp["hits"]["total"])
    docs = [_map_geo_response(hit["_source"])
            for hit in emails_resp["hits"]["hits"]]
    return {"total": emails_resp["hits"]["total"], "XOIP_locations": docs}
def _cluster_carrot2(index, type, email_addrs=[], query_terms='', topic_score=None, entity={}, date_bounds=None, cluster_fields=["_source.body"], cluster_title_fields=["_source.subject"], algorithm="lingo", max_doc_pool_size=500): query = _build_email_query(email_addrs=email_addrs, qs=query_terms, entity=entity, date_bounds=date_bounds) carrot_query = { "search_request": { "query": query["query"], "size": max_doc_pool_size }, "algorithm": algorithm, "max_hits": 0, "query_hint": query_terms, "field_mapping": { "title": cluster_title_fields, "content": cluster_fields } } resp = es().transport.perform_request( "POST", "/{}/{}/_search_with_clusters".format(index, type), {}, body=carrot_query) total_docs = min(resp[1]["hits"]["total"], max_doc_pool_size) return resp
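# Note: the /_search_with_clusters endpoint used above is exposed by the
# elasticsearch-carrot2 plugin rather than core Elasticsearch; it wraps a
# normal search request and clusters the hits server-side. perform_request
# here returns a (status, body) pair, which is why the hit total is read
# from resp[1] rather than from resp directly.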
def es_get_exif_emails(data_set_id, size):
    tangelo.log("es_geo.es_get_exif_emails()")
    emails_resp = es().search(index=data_set_id,
                              doc_type="emails",
                              size=size,
                              body=_geo_exif_query())
    tangelo.log("es_geo.es_get_exif_emails(total document hits = %s)" %
                emails_resp["hits"]["total"])
    docs = [hit["_source"] for hit in emails_resp["hits"]["hits"]]
    return {"total": emails_resp["hits"]["total"], "exif_docs": docs}
def get_datetime_bounds(index, type="emails"): resp = es().search(index=index, doc_type=type, body={"aggregations": _date_aggs()}) now = strftime("%Y-%m-%d", gmtime()) min = resp["aggregations"]["min_date"].get("value_as_string", default_min_timeline_bound()) max = resp["aggregations"]["max_date"].get("value_as_string", default_max_timeline_bound()) # Average avg = resp["aggregations"]["avg_date"].get("value_as_string", None) # Estimated median pct = resp["aggregations"]["pct_date"]["values"].get( "50.0_as_string", None) if not pct: return (min if min >= "1970" else "1970-01-01", max if max <= now else now) avg_datetime = parse(pct) delta = timedelta(**{ default_timeline_interval(index): int(default_timeline_span(index)) / 2 }) return ((avg_datetime - delta).strftime("%Y-%m-%d"), (avg_datetime + delta).strftime("%Y-%m-%d"))
def get_attachments_by_sender(data_set_id, sender, start_datetime, end_datetime, size):
    # fields = ["id", "dir", "datetime", "from", "tos", "ccs", "bccs", "subject", "attach", "bodysize"]
    # fields = ["id", "datetime", "senders", "tos", "ccs", "bccs", "subject", "attachments.filename"]
    # body = {"filter": {"exists": {"field": "attachments"}}, "query": {"match": {"senders": sender}}}
    body = _build_email_query(sender_addrs=[sender],
                              date_bounds=(start_datetime, end_datetime),
                              attachments_only=True)
    tangelo.log("get_attachments_by_sender.Query %s" % body)
    attachments_resp = es().search(index=data_set_id,
                                   doc_type="emails",
                                   size=size,
                                   body=body)
    email_attachments = []
    for attachment_item in attachments_resp["hits"]["hits"]:
        _source = attachment_item["_source"]
        # Default senders to [""] so the [0] lookup cannot raise IndexError
        # when the field is missing.
        attachment_entry = [_source["id"],
                            "PLACEHOLDER",
                            _source["datetime"],
                            _source.get("senders", [""])[0],
                            ';'.join(_source.get("tos", "")),
                            ';'.join(_source.get("ccs", "")),
                            ';'.join(_source.get("bccs", "")),
                            _source.get("subject", "")]
        for attachment in _source["attachments"]:
            l = list(attachment_entry)
            l[1] = attachment["guid"]
            l.append(attachment["filename"])
            l.append(0)
            email_attachments.append(l)
    return {"sender": sender, "email_attachments": email_attachments}
def count(index, type="emails", start="2000-01-01", end="now"): # TODO apply filter to query not to body filter = {"range" : {"datetime" : { "gte": start, "lte": end }}} all_query = {"bool":{"must":[{"match_all":{}}]}} count = es().count(index=index, doc_type=type, body={"query" : all_query}) return count["count"]
def initialize_email_addr_cache(index, update=False): if index in _EMAIL_ADDR_CACHE and not update: tangelo.log("APPLICATION CACHE -- index=%s" % index) return _EMAIL_ADDR_CACHE_LOCK.acquire() try: tangelo.log("INITIALIZING CACHE -- index=%s" % index) global _EMAIL_ADDR_CACHE _email_addr_cache_fields = [ "community", "community_id", "addr", "received_count", "sent_count", "attachments_count" ] body = {"query": {"match_all": {}}} num = count(index, "email_address") print num addrs = es().search(index=index, doc_type="email_address", size=num, fields=_email_addr_cache_fields, body=body) addr_index = { f["addr"][0]: f for f in [hit["fields"] for hit in addrs["hits"]["hits"]] } _EMAIL_ADDR_CACHE[index] = addr_index tangelo.log("done: %s" % num) finally: _EMAIL_ADDR_CACHE_LOCK.release() tangelo.log("INITIALIZING CACHE COMPLETE! -- index=%s" % index) return {"acknowledge": "ok"}
def _get_attachment_info_from_email_address(index, email_address, date_time=None): query_email_addr = { "query": { "filtered": { "query": _query_all, "filter": { "bool": { "must": [{ "term": { "addr": email_address } }] } } } } } resp = es().search(index=index, doc_type="email_address", body=query_email_addr) # tangelo.log("getRankedEmails(resp: %s)" % (resp)) return resp
def count(index, type="emails", start="2000-01-01", end="now"): # TODO apply filter to query not to body filter = {"range": {"datetime": {"gte": start, "lte": end}}} all_query = {"bool": {"must": [{"match_all": {}}]}} count = es().count(index=index, doc_type=type, body={"query": all_query}) return count["count"]
def _query_email_attachments(index, size, emails_query): tangelo.log("_query_email_attachments.Query %s" % emails_query) attachments_resp = es().search(index=index, doc_type="emails", size=size, body=emails_query) email_attachments = [] for attachment_item in attachments_resp["hits"]["hits"]: _source = attachment_item["_source"] attachment_entry = [ _source["id"], "PLACEHOLDER", _source["datetime"], _source.get("senders", ""), ';'.join(_source.get("tos", "")), ';'.join(_source.get("ccs", "")), ';'.join(_source.get("bccs", "")), _source.get("subject", "") ] for attachment in _source["attachments"]: l = list(attachment_entry) l[1] = attachment["guid"] l.append(attachment["filename"]) l.append(0) email_attachments.append(l) return email_attachments
def get_entity_histogram(index, type, email_addrs=[], qs='', topic_score=None, date_bounds=None, entity_agg_size=10): tangelo.log("===================================================") body = entity_histogram_query(email_addrs=email_addrs, qs=qs, topic_score=topic_score, date_bounds=date_bounds, entity_agg_size=entity_agg_size) tangelo.log("get_entity_histogram: query = %s" % body) resp = es().search(index=index, doc_type=type, body=body) return sorted([ dict(d, **{"type": "location"}) for d in resp["aggregations"] ["filtered_entity_agg"]["location"]["buckets"] ] + [ dict(d, **{"type": "organization"}) for d in resp["aggregations"] ["filtered_entity_agg"]["organization"]["buckets"] ] + [ dict(d, **{"type": "person"}) for d in resp["aggregations"] ["filtered_entity_agg"]["person"]["buckets"] ] + [ dict(d, **{"type": "misc"}) for d in resp["aggregations"]["filtered_entity_agg"]["misc"]["buckets"] ], key=lambda d: d["doc_count"], reverse=True)
def get_top_phone_numbers(index, email_address='', qs='', date_bounds=('1970-01-01', 'now'), size=50): body = phone_numbers_agg(email_address, qs, date_bounds, size=size) resp = es().search(index=index, doc_type="emails", body=body) return resp["aggregations"]["phone_numbers_agg"]["buckets"]
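# phone_numbers_agg() is defined elsewhere. As a sketch, a minimal body that
# would satisfy the response parsing above (a top-level terms-style agg named
# "phone_numbers_agg") might look like this -- the "phone_numbers" field name
# is an assumption, and the email_address/qs/date_bounds filtering is omitted:
def _phone_numbers_agg_sketch(size=50):
    return {
        "size": 0,
        "aggs": {
            "phone_numbers_agg": {
                "terms": {"field": "phone_numbers", "size": size}
            }
        }
    }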
def get_total_attachment_activity(index, account_id, query_function, **kwargs): body = query_function(**kwargs) resp = es().search(index=index, doc_type="attachments", body=body) return [ _map_attachments(index, account_id, attachments) for attachments in zip(resp["aggregations"]["attachments_filter_agg"] ["attachments_over_time"]["buckets"]) ]
def get_lda_clusters(index): resp = es().search(index=index, doc_type='lda-clustering', body=_lda_clusters) # return [{"index":hit["_source"]["idx"],"score":hit["sort"][0],"cluster": [term["term"] for term in hit["_source"]["topic"]]} for hit in resp["hits"]["hits"]] return [{ "idx": hit["_source"]["idx"], "cluster": [term["term"] for term in hit["_source"]["topic"]] } for hit in resp["hits"]["hits"]]
def agg_cluster_counts(index): count = es().count(index=index, doc_type='lda-clustering', body={"query": { "bool": { "must": [{ "match_all": {} }] } }})["count"] # print count query = _cluster_lda(count, email_addrs=[], query_terms='', entity_dict=[], date_bounds=None) # print query resp = es().search(index=index, doc_type='emails', body=query) return {k: v["doc_count"] for k, v in resp["aggregations"].iteritems()}
def get_daily_activity(index, account_id, type, query_function, **kwargs): resp = es().search(index=index, doc_type=type, request_cache="false", body=query_function(**kwargs)) return [ _map_activity(index, account_id, sent_rcvd) for sent_rcvd in zip( resp["aggregations"]["sent_agg"]["sent_emails_over_time"] ["buckets"], resp["aggregations"]["rcvr_agg"] ["rcvd_emails_over_time"]["buckets"]) ]
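# Each bucket zipped above is a standard Elasticsearch date_histogram bucket,
# e.g. {"key_as_string": "2001-08-06T00:00:00.000Z", "key": 997056000000,
# "doc_count": 42}, so _map_activity receives one (sent, received) bucket
# pair per interval.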
def get_top_attachment_types(index, email_addrs=[], query_terms='', topic_score=None, entity={}, date_bounds=None, num_top_attachments=20):
    aggs = {"attachment_type_agg": {"terms": {"field": "extension", "size": num_top_attachments}}}
    query = filtered_agg_query(email_addrs=email_addrs,
                               query_terms=query_terms,
                               topic_score=topic_score,
                               date_bounds=date_bounds,
                               entity=entity,
                               aggs=aggs,
                               name="attachment")
    tangelo.log("Query %s" % query)
    attch_agg_resp = es().search(index=index, doc_type='attachments', size=0, body=query)
    types = [[attch_type["key"], int(attch_type["doc_count"])]
             for attch_type in attch_agg_resp["aggregations"]["attachment_filtered_agg"]["attachment_type_agg"]["buckets"]]
    total = sum(attch_type[1] for attch_type in types)
    types = [[attch_type[0], attch_type[1], "{0:.2f}".format(round(100.0 * attch_type[1] / total, 2))]
             for attch_type in types]
    return types
def _index_record(index): tangelo.log("datasource._index_record(index: %s)" % (str(index))) email_docs_count = es().count(index=index, doc_type="emails", body={"query" : {"bool":{"must":[{"match_all":{}}]}}})["count"] emails_addrs_count = es().count(index=index, doc_type="email_address", body={"query" : {"bool":{"must":[{"match_all":{}}]}}})["count"] emails_attch_count = es().count(index=index, doc_type="attachments", body={"query" : {"bool":{"must":[{"match_all":{}}]}}})["count"] #TODO: still need to re-work the absolute date-time bounds and the suggested date-time bounds bounds = get_datetime_bounds(index) return {'data_set_id':index, 'data_set_label':index, 'data_set_document_count' : email_docs_count, 'data_set_node_count' : emails_addrs_count, 'data_set_attachment_count' : emails_attch_count, 'data_set_datetime_min' : bounds[0], 'data_set_datetime_max' : bounds[1], 'start_datetime_selected' : bounds[0], 'end_datetime_selected' : bounds[1] }
def _get_attachment_info_from_email_address(index, email_address, date_time=None): query_email_addr = {"query":{"filtered" : { "query" : _query_all, "filter" : {"bool":{ "must":[ {"term" : { "addr" : email_address}} ] }}}}} resp = es().search(index=index, doc_type="email_address", body=query_email_addr) # tangelo.log("getRankedEmails(resp: %s)" % (resp)) return resp
def get_emailer_attachment_activity(index, email_address, date_bounds, interval="week"): body = attachment_histogram_from_emails(email_address, date_bounds, interval) resp = es().search(index=index, doc_type="email_address", body=body) return [ _map_attachments(index, email_address, attachments) for attachments in zip(resp["aggregations"]["emailer_attach_agg"] ["sent_attachments_over_time"]["buckets"]) ]
def _search_ranked_email_addrs(index, start, end, size): graph_body = { "fields": _graph_fields, "sort": _sort_email_addrs_by_total, "query": _query_all } # tangelo.log("getRankedEmails(query: %s)" % (graph_body)) resp = es().search(index=index, doc_type="email_address", size=size, body=graph_body) # tangelo.log("getRankedEmails(resp: %s)" % (resp)) return resp
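# _graph_fields, _sort_email_addrs_by_total, and _query_all are module-level
# constants defined elsewhere. Assumed shapes, consistent with how they are
# used in this file (the "received_count" totals field is hypothetical,
# though it does appear in the email_address cache fields below):
_query_all_sketch = {"bool": {"must": [{"match_all": {}}]}}
_sort_email_addrs_by_total_sketch = [{"received_count": {"order": "desc"}}]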
def get_top_communities(index, query_terms='', topic_score=None, entity={}, date_bounds=None, num_communities=20):
    # TODO fix - hack until we can do date filtering on the email_address
    date_bounds = None  # TODO fix
    aggs = {"community_agg": {"terms": {"field": "community", "size": num_communities}}}
    query = filtered_agg_query(topic_score=topic_score,
                               date_bounds=date_bounds,
                               entity=entity,
                               aggs=aggs,
                               name="community")
    tangelo.log("Query %s" % query)
    communities_agg = es().search(index=index, doc_type='email_address', size=0, body=query)
    # total_other = communities_agg["aggregations"]["community_agg"]["doc_count_error_upper_bound"]
    communities = [[community["key"], int(community["doc_count"])]
                   for community in communities_agg["aggregations"]["community_filtered_agg"]["community_agg"]["buckets"]]
    total = sum(community[1] for community in communities)
    communities = [[community[0], community[1], "{0:.2f}".format(round(100.0 * community[1] / total, 2))]
                   for community in communities]
    return communities
def get_top_domains(index, email_addrs=[], query_terms='', topic_score=None, entity={}, date_bounds=None, num_domains=20):
    # TODO fix - hack until we can do date filtering on the email_address
    date_bounds = None  # TODO fix
    aggs = {"domain_agg": {"terms": {"field": "domain", "size": num_domains}}}
    query = filtered_agg_query(email_addrs=email_addrs,
                               query_terms=query_terms,
                               topic_score=topic_score,
                               date_bounds=date_bounds,
                               entity=entity,
                               aggs=aggs,
                               name="domain")
    tangelo.log("Query %s" % query)
    domains_agg = es().search(index=index, doc_type='email_address', size=0, body=query)
    # total_other = domains_agg["aggregations"]["domain_agg"]["doc_count_error_upper_bound"]
    domains = [[domain["key"], int(domain["doc_count"])]
               for domain in domains_agg["aggregations"]["domain_filtered_agg"]["domain_agg"]["buckets"]]
    total = sum(domain[1] for domain in domains)
    domains = [[domain[0], domain[1], "{0:.2f}".format(round(100.0 * domain[1] / total, 2))]
               for domain in domains]
    return domains
def _query_emails(index, size, emails_query, additional_fields=[]): emails_resp = es().search(index=index, doc_type="emails", size=size, fields=get_graph_row_fields() + additional_fields, body=emails_query) tangelo.log("es_query_utils._query_emails(total document hits = %s)" % emails_resp["hits"]["total"]) return { "total": emails_resp["hits"]["total"], "hits": [_map_emails(hit["fields"]) for hit in emails_resp["hits"]["hits"]] }
def get_email_activity(index, data_set_id, account_id=None, date_bounds=None, interval="week"): body = actor_histogram([] if not account_id else [account_id], date_bounds, interval) tangelo.log("get_email_activity(query body: %s )" % body) resp = es().search(index=index, doc_type="emails", request_cache="false", body=body) id = data_set_id if not account_id else account_id return [ _map_activity(index, id, sent_rcvd) for sent_rcvd in zip( resp["aggregations"]["sent_agg"]["emails_over_time"]["buckets"], resp["aggregations"]["rcvr_agg"]["emails_over_time"]["buckets"]) ]
def _cluster_carrot2(index, type, email_addrs=[], query_terms='', topic_score=None, entity={}, date_bounds=None, cluster_fields=["_source.body"], cluster_title_fields=["_source.subject"], algorithm="lingo", max_doc_pool_size=500): query = _build_email_query(email_addrs=email_addrs, qs=query_terms, entity=entity, date_bounds=date_bounds) carrot_query = { "search_request": { "query": query["query"], "size": max_doc_pool_size }, "algorithm":algorithm, "max_hits": 0, "query_hint": query_terms, "field_mapping": { "title": cluster_title_fields, "content": cluster_fields } } resp = es().transport.perform_request("POST", "/{}/{}/_search_with_clusters".format(index,type), {}, body=carrot_query) total_docs = min(resp[1]["hits"]["total"], max_doc_pool_size) return resp
def get_ranked_email_address(data_set_id, query_terms='', topic_score=None, entity={}, date_bounds=None, num_top_hits=30):
    body = {
        "aggs": {
            "filtered_addrs_agg": {
                "filter": _build_filter(qs=query_terms,
                                        topic=topic_score,
                                        entity_dict=entity,
                                        date_bounds=date_bounds),
                "aggs": {
                    "top_addrs_agg": {
                        "terms": {"field": "addrs", "size": num_top_hits}
                    }
                }
            }
        },
        "size": 0
    }
    resp = es().search(index=data_set_id, doc_type="emails", body=body)
    total_docs = resp["aggregations"]["filtered_addrs_agg"]["doc_count"]
    email_addrs = [map_email_filtered(get_cached_email_addr(data_set_id, email_addr["key"]),
                                      email_addr["doc_count"], total_docs)
                   for email_addr in resp["aggregations"]["filtered_addrs_agg"]["top_addrs_agg"]["buckets"]]
    return {"emails": email_addrs}
def get_attachment_by_id(*args, **kwargs):
    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**kwargs)
    attachment_id = nth(args, 0, '')
    if not attachment_id:
        attachment_id = parseParamAttachmentGUID(**kwargs)
    cherrypy.log("email.get_attachments_sender(index=%s, attachment_id=%s)" % (data_set_id, attachment_id))

    if not data_set_id:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing index")
    if not attachment_id:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing attachment_id")

    attachment = es().get(index=data_set_id, doc_type="attachments", id=attachment_id)
    if not attachment:
        return tangelo.HTTPStatusCode(400, "no attachments found for (index=%s, attachment_id=%s)" % (data_set_id, attachment_id))

    attachment = attachment["_source"]
    ext = attachment["extension"]
    filename = attachment["filename"]
    mime_type = mimetypes.guess_type(filename)[0]
    if not mime_type:
        tangelo.content_type("application/x-download")
        header("Content-Disposition", 'attachment; filename="{}"'.format(filename))
    else:
        tangelo.content_type(mime_type)
        header("Content-Disposition", 'inline; filename="{}"'.format(filename))

    content = attachment["contents64"]
    bytes = base64.b64decode(content)
    # dump(bytes, filename)
    as_str = str(bytes)
    tangelo.log(str(len(as_str)), "Uploading Attachment - length = ")
    return as_str
def _query_email_attachments(index, size, emails_query): tangelo.log("_query_email_attachments.Query %s"%emails_query) attachments_resp = es().search(index=index, doc_type="emails", size=size, body=emails_query) email_attachments = [] for attachment_item in attachments_resp["hits"]["hits"]: _source = attachment_item["_source"] attachment_entry = [_source["id"], "PLACEHOLDER", _source["datetime"], _source.get("senders",""), ';'.join(_source.get("tos","")), ';'.join(_source.get("ccs","")), ';'.join(_source.get("bccs","")), _source.get("subject","")] for attachment in _source["attachments"]: l = list(attachment_entry) l[1] = attachment["guid"] l.append(attachment["filename"]) l.append(0) email_attachments.append(l) return email_attachments
def export_emails_archive(data_set_id, email_ids=[]): cherrypy.log("email.get_attachments_sender(index=%s, attachment_id=%s)" % (data_set_id, email_ids)) if not data_set_id: return tangelo.HTTPStatusCode(400, "invalid service call - missing index") # TODO can implement with multiple doc_types and combine attachments in emails = es().mget(index=data_set_id, doc_type="emails", body={"docs": [{ "_id": id } for id in email_ids]}) topics = get_categories(data_set_id) # TODO filename filename = "export.tar.gz" tangelo.content_type("application/x-gzip") header("Content-Disposition", 'attachment; filename="{}"'.format(filename)) string_buffer = cStringIO.StringIO() tar = tarfile.open(mode='w:gz', fileobj=string_buffer) # Add each email to the tar for email_source in emails["docs"]: email = email_source["_source"] tarinfo_parent = tarfile.TarInfo(name=email["id"]) tarinfo_parent.type = tarfile.DIRTYPE tarinfo_parent.mode = 0755 tarinfo_parent.mtime = time.time() tar.addfile(tarinfo_parent) # Add raw document tarinfo = tarfile.TarInfo(email["id"] + "/" + email["id"] + ".json") data_string = json.dumps(email) fobj = cStringIO.StringIO(data_string) tarinfo.size = len(data_string) tarinfo.mode = 0644 tarinfo.mtime = time.time() tar.addfile(tarinfo, fobj) # Add txt document tarinfo = tarfile.TarInfo(email["id"] + "/" + email["id"] + ".txt") data_string = prettyprint_email_as_text(email) fobj = cStringIO.StringIO(data_string) tarinfo.size = len(data_string) tarinfo.mode = 0644 tarinfo.mtime = time.time() tar.addfile(tarinfo, fobj) # Add html document tarinfo = tarfile.TarInfo(email["id"] + "/" + email["id"] + ".html") data_string = prettyprint_email_as_html_template(email, topics) fobj = cStringIO.StringIO(data_string) tarinfo.size = len(data_string) tarinfo.mode = 0644 tarinfo.mtime = time.time() tar.addfile(tarinfo, fobj) # Get the attachments if email["attachments"]: attachments = es().mget(index=data_set_id, doc_type="attachments", body={ "docs": [{ "_id": attch["guid"] } for attch in email["attachments"]] }) for attachment_source in attachments["docs"]: attachment = attachment_source["_source"] filename = attachment["filename"] attch_data = str(base64.b64decode(attachment["contents64"])) tarinfo_attch = tarfile.TarInfo(email["id"] + "/" + filename) tarinfo_attch.size = len(attch_data) tarinfo_attch.mode = 0644 tarinfo_attch.mtime = time.time() tar.addfile(tarinfo_attch, cStringIO.StringIO(attch_data)) tar.close() return string_buffer.getvalue()
if __name__ == "__main__":
    # TODO move into method
    topics = get_categories("sample")
    email_ids = ["e65c3704-7fe8-11e5-bb05-08002705cb99"]
    # email_ids = ["f326dd04-7fe8-11e5-bb05-08002705cb99"]

    # TODO can implement with multiple doc_types and combine attachments in
    emails = es().mget(index="sample",
                       doc_type="emails",
                       body={"docs": [{"_id": id} for id in email_ids]})

    data_string = prettyprint_email_as_html_template(
        emails["docs"][0]["_source"], topics)

    with open("/tmp/output.html", "w") as text_file:
        text_file.write(data_string)

    email_id = "f9c9c59a-7fe8-11e5-bb05-08002705cb99"
    # export_emails_archive("sample", [email_id])
    # export_attachments("sample", '*****@*****.**', 'jpg', ("2001-08-01", "2001-08-30"))

    print "export done"
attachment = attachment_source["_source"] filename = attachment["filename"] attch_data = str(base64.b64decode(attachment["contents64"])) tarinfo_attch = tarfile.TarInfo(email["id"]+"/"+filename) tarinfo_attch.size = len(attch_data) tarinfo_attch.mode = 0644 tarinfo_attch.mtime = time.time() tar.addfile(tarinfo_attch, cStringIO.StringIO(attch_data)) tar.close() return string_buffer.getvalue() if __name__ == "__main__": # TODO move into method topics = get_categories("sample") email_ids = ["e65c3704-7fe8-11e5-bb05-08002705cb99"] # email_ids = ["f326dd04-7fe8-11e5-bb05-08002705cb99"] # TODO can implement with multiple doc_types and combine attachments in emails = es().mget(index="sample", doc_type="emails", body={"docs":[{"_id":id} for id in email_ids]}) data_string = prettyprint_email_as_html_template(emails["docs"][0]["_source"], topics) with open("/tmp/output.html", "w") as text_file: text_file.write(data_string) email_id = "f9c9c59a-7fe8-11e5-bb05-08002705cb99" # export_emails_archive("sample", [email_id]) # export_attachments("sample", '*****@*****.**', 'jpg', ("2001-08-01", "2001-08-30")) print "export done"
def export_attachments(data_set_id, sender='', attachment_extension='jpg', date_bounds=None): print( "email.get_attachments_sender(index=%s, sender=%s, attachment_type=%s, date_bounds=%s)" % (data_set_id, sender, attachment_extension, date_bounds)) if not data_set_id: print "invalid service call - missing index" return 1 # TODO get accurate count -- this is not strictly needed as attachments will be accessed as inner docs on the email_address max_inner_attachments_returned = 100000 # Get all attachments by extension rows = [] body = _attch_nested__ext_query( sender, attachment_extension, date_bounds, max_inner_attachments_returned=max_inner_attachments_returned) print body addresses_count = es().count(index=data_set_id, doc_type="email_address", body=body)["count"] print "total addresses: " + str(addresses_count) addresses = es().search(index=data_set_id, doc_type="email_address", body=body, size=addresses_count) for address in addresses["hits"]["hits"]: rows += [[ address["_source"]["addr"], attachment["_source"]["guid"], attachment["_source"]["filename"], attachment["_source"]["datetime"] ] for attachment in address["inner_hits"]["sender_attachments"]["hits"] ["hits"]] print "total attachments: " + str(len(rows)) # Start tar tar = tarfile.open(mode='w:gz', name="/tmp/big-export.tar.gz") csv_string_buffer = cStringIO.StringIO() csv_file = csv.writer(csv_string_buffer) # Add all rows to attachment csv csv_file.writerows(rows) tarinfo = tarfile.TarInfo("attachments.csv") tarinfo.size = csv_string_buffer.tell() tarinfo.mode = 0644 tarinfo.mtime = time.time() csv_string_buffer.seek(0) tar.addfile(tarinfo, csv_string_buffer) # This is the buffer size of how many attachments to pull from ES at each iteration num_returned = 3 index = 0 # Paging while index < len(rows): # Get num_returned attachments from ES attachments = es().mget(index=data_set_id, doc_type="attachments", body={ "docs": [{ "_id": row[1] } for row in rows[index:index + num_returned]] }) index += num_returned # Add all attachments to the archive for attachment_source in attachments["docs"]: attachment = attachment_source["_source"] filename = attachment["filename"] attch_data = str(base64.b64decode(attachment["contents64"])) tarinfo_attch = tarfile.TarInfo(attachment["guid"] + "/" + filename) tarinfo_attch.size = len(attch_data) tarinfo_attch.mode = 0644 tarinfo_attch.mtime = time.time() tar.addfile(tarinfo_attch, cStringIO.StringIO(attch_data)) tar.close()
def get_lda_clusters(index): resp = es().search(index=index, doc_type='lda-clustering', body=_lda_clusters) # return [{"index":hit["_source"]["idx"],"score":hit["sort"][0],"cluster": [term["term"] for term in hit["_source"]["topic"]]} for hit in resp["hits"]["hits"]] return [{"idx":hit["_source"]["idx"],"cluster": [term["term"] for term in hit["_source"]["topic"]]} for hit in resp["hits"]["hits"]]
def export_emails_archive(data_set_id, email_ids=[]): cherrypy.log("email.get_attachments_sender(index=%s, attachment_id=%s)" % (data_set_id, email_ids)) if not data_set_id: return tangelo.HTTPStatusCode(400, "invalid service call - missing index") # TODO can implement with multiple doc_types and combine attachments in emails = es().mget(index=data_set_id, doc_type="emails", body={"docs":[{"_id":id} for id in email_ids]}) topics = get_categories(data_set_id) # TODO filename filename= "export.tar.gz" tangelo.content_type("application/x-gzip") header("Content-Disposition", 'attachment; filename="{}"'.format(filename)) string_buffer = cStringIO.StringIO() tar = tarfile.open(mode='w:gz', fileobj=string_buffer) # Add each email to the tar for email_source in emails["docs"]: email = email_source["_source"] tarinfo_parent= tarfile.TarInfo(name = email["id"]) tarinfo_parent.type = tarfile.DIRTYPE tarinfo_parent.mode = 0755 tarinfo_parent.mtime = time.time() tar.addfile(tarinfo_parent) # Add raw document tarinfo = tarfile.TarInfo(email["id"]+"/"+email["id"]+".json") data_string = json.dumps(email) fobj = cStringIO.StringIO(data_string) tarinfo.size = len(data_string) tarinfo.mode = 0644 tarinfo.mtime = time.time() tar.addfile(tarinfo, fobj) # Add txt document tarinfo = tarfile.TarInfo(email["id"]+"/"+email["id"]+".txt") data_string = prettyprint_email_as_text(email) fobj = cStringIO.StringIO(data_string) tarinfo.size = len(data_string) tarinfo.mode = 0644 tarinfo.mtime = time.time() tar.addfile(tarinfo, fobj) # Add html document tarinfo = tarfile.TarInfo(email["id"]+"/"+email["id"]+".html") data_string = prettyprint_email_as_html_template(email, topics) fobj = cStringIO.StringIO(data_string) tarinfo.size = len(data_string) tarinfo.mode = 0644 tarinfo.mtime = time.time() tar.addfile(tarinfo, fobj) # Get the attachments if email["attachments"]: attachments = es().mget(index=data_set_id, doc_type="attachments", body={"docs":[{"_id":attch["guid"]} for attch in email["attachments"]]}) for attachment_source in attachments["docs"]: attachment = attachment_source["_source"] filename = attachment["filename"] attch_data = str(base64.b64decode(attachment["contents64"])) tarinfo_attch = tarfile.TarInfo(email["id"]+"/"+filename) tarinfo_attch.size = len(attch_data) tarinfo_attch.mode = 0644 tarinfo_attch.mtime = time.time() tar.addfile(tarinfo_attch, cStringIO.StringIO(attch_data)) tar.close() return string_buffer.getvalue()
def get_total_daily_activity(index, type, query_function, **kwargs): resp = es().search(index=index, doc_type=type, body=query_function(**kwargs)) return resp["aggregations"]["filter_agg"]["emails_over_time"]["buckets"]
def set_starred(index, ids=[], starred=True):
    body = {"doc": {"starred": starred}}
    for id in ids:
        response = es().update(index, doc_type="emails", id=id, body=body)
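# set_starred() issues one partial-update request per id; for long id lists a
# single bulk request would be cheaper. Example call, reusing an email id that
# appears in the __main__ block above:
#
#     set_starred("sample", ids=["f9c9c59a-7fe8-11e5-bb05-08002705cb99"], starred=True)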
def get_email(index, email_id, qs=None):
    # fields=["id","datetime","senders","senders_line","tos_line","ccs_line","bccs_line","subject","body","attachments.filename","entities.entity_organization","entities.entity_location","entities.entity_person","entities.entity_misc"]
    # email = es().get(index, doc_type="emails", id=email_id, fields=fields)
    source = ''
    body = '_DEFAULT_'
    subject = '_DEFAULT_'
    highlighted_attachments = {}

    if not qs:
        email = es().get(index, doc_type="emails", id=email_id)
        source = email["_source"]
        body = source["body"]
        subject = source["subject"]
        body_translated = source.get("body_translated", '')
        subject_translated = source.get("subject_translated", '')
        body_lang = source.get("body_lang", 'en')
    else:
        query = email_highlighting_query(email_id, highlight_query_string=qs)
        tangelo.log("es_email.get_email(highlighting-query: %s )" % (query))
        email = es().search(index=index, doc_type='emails', body=query)
        source = email["hits"]["hits"][0]["_source"]
        body_lang = source.get("body_lang", 'en')
        highlight = email["hits"]["hits"][0].get("highlight", {})
        body = highlight.get('body', [source.get('body', '')])[0]
        body_translated = highlight.get('body_translated', [source.get('body_translated', '')])[0]
        subject_translated = highlight.get('subject_translated', [source.get('subject_translated', '')])[0]
        subject = highlight.get('subject', [source['subject']])[0]
        # TODO highlighting attachments need to return content and further test this method
        highlighted_attachments = _find_attachment_highlighting(highlight, source.get("attachments", [""]))

    body = _format_html(body)
    body_translated = _format_html(body_translated)
    subject = _format_html(subject)

    topic_scores = []
    if source["topic_scores"]:
        topic_scores = [[topic[0], topic[1], str(source["topic_scores"]["idx_" + str(topic[0])])]
                        for topic in get_categories(index)["categories"]]

    email = [source["id"],
             # TODO REMOVE unused fields
             "DEPRECATED",
             source.get("datetime", ""),
             "false",
             "".join(source["senders"]),
             ["".join(source["tos_line"]), ";".join(source["tos"])],
             ["".join(source["ccs_line"]), ";".join(source["ccs"])],
             ["".join(source["bccs_line"]), ";".join(source["bccs"])],
             subject,
             # Wrap in <pre>
             "<pre>" + body + "</pre>",
             [[f["guid"], f["filename"]] for f in source.get("attachments", [""])],
             source.get("starred", False),
             highlighted_attachments]

    entities = []
    for type in ["person", "location", "organization", "misc"]:
        if "body_entities" in source["entities"] and ("entity_" + type) in source["entities"]["body_entities"]:
            entities += [[source["id"][0] + "_entity_" + str(i), type, i, val]
                         for i, val in enumerate(source["entities"]["body_entities"].get("entity_" + type, []),
                                                 len(entities))]

    resp = {"email_contents": {"email": email,
                               "entities": entities,
                               "lda_topic_scores": topic_scores}}

    # Only add translated text if the language is not English
    if body_lang and body_lang != 'en':
        email_translated = [source["id"],
                            # TODO REMOVE unused fields
                            "DEPRECATED",
                            source.get("datetime", ""),
                            "false",
                            "".join(source["senders"]),
                            ["".join(source["tos_line"]), ";".join(source["tos"])],
                            ["".join(source["ccs_line"]), ";".join(source["ccs"])],
                            ["".join(source["bccs_line"]), ";".join(source["bccs"])],
                            subject_translated,
                            # Wrap in <pre>
                            "<pre>" + body_translated + "</pre>",
                            [[f["guid"], f["filename"]] for f in source.get("attachments", [""])],
                            source.get("starred", False),
                            highlighted_attachments]

        entities_translated = []
        for type in ["person", "location", "organization", "misc"]:
            if "body_entities_translated" in source["entities"] and ("entity_" + type) in source["entities"]["body_entities_translated"]:
                entities_translated += [[source["id"][0] + "_entity_" + str(i), type, i, val]
                                        for i, val in enumerate(source["entities"]["body_entities_translated"].get("entity_" + type, []),
                                                                len(entities_translated))]

        resp["email_contents_translated"] = {"email": email_translated,
                                             "entities": entities_translated,
                                             "lda_topic_scores": topic_scores,
                                             "original_lang": body_lang}

    return resp