Example #1
def agg_cluster_counts(index):
    count = es().count(index=index, doc_type='lda-clustering', body={"query" : {"bool":{"must":[{"match_all":{}}]}}})["count"]
    # print count
    query = _cluster_lda(count, email_addrs=[], query_terms='', entity_dict=[], date_bounds=None)
    # print query
    resp = es().search(index=index, doc_type='emails', body=query)
    return {k: v["doc_count"] for k, v in resp["aggregations"].iteritems()}
Example #2
def export_attachments(data_set_id, sender='', attachment_extension='jpg', date_bounds=None):
    print("email.get_attachments_sender(index=%s, sender=%s, attachment_type=%s, date_bounds=%s)" % (data_set_id, sender, attachment_extension, date_bounds))
    if not data_set_id:
        print "invalid service call - missing index"
        return 1

    # TODO get accurate count -- this is not strictly needed as attachments will be accessed as inner docs on the email_address
    max_inner_attachments_returned = 100000

    # Get all attachments by extension
    rows = []
    body = _attch_nested__ext_query(sender, attachment_extension, date_bounds, max_inner_attachments_returned=max_inner_attachments_returned)
    print body
    addresses_count = es().count(index=data_set_id, doc_type="email_address", body=body)["count"]
    print "total addresses: " + str(addresses_count)
    addresses = es().search(index=data_set_id, doc_type="email_address", body=body, size=addresses_count)
    for address in addresses["hits"]["hits"]:
        rows += [[address["_source"]["addr"], attachment["_source"]["guid"], attachment["_source"]["filename"], attachment["_source"]["datetime"]] for attachment in address["inner_hits"]["sender_attachments"]["hits"]["hits"]]

    print "total attachments: " + str(len(rows))

    # Start tar
    tar = tarfile.open(mode='w:gz', name="/tmp/big-export.tar.gz")
    csv_string_buffer = cStringIO.StringIO()
    csv_file = csv.writer(csv_string_buffer)

    # Add all rows to attachment csv
    csv_file.writerows(rows)
    tarinfo = tarfile.TarInfo("attachments.csv")

    tarinfo.size = csv_string_buffer.tell()
    tarinfo.mode = 0644
    tarinfo.mtime = time.time()
    csv_string_buffer.seek(0)

    tar.addfile(tarinfo, csv_string_buffer)


    # Number of attachments to pull from ES on each iteration (paging buffer size)
    num_returned = 3
    index = 0
    # Paging
    while index < len(rows):
        # Get num_returned attachments from ES
        attachments = es().mget(index=data_set_id, doc_type="attachments", body={"docs":[{"_id":row[1]} for row in rows[index: index+num_returned]]})
        index += num_returned

        # Add all attachments to the archive
        for attachment_source in attachments["docs"]:
            attachment = attachment_source["_source"]
            filename = attachment["filename"]
            attch_data = str(base64.b64decode(attachment["contents64"]))

            tarinfo_attch = tarfile.TarInfo(attachment["guid"]+"/"+filename)
            tarinfo_attch.size = len(attch_data)
            tarinfo_attch.mode = 0644
            tarinfo_attch.mtime = time.time()
            tar.addfile(tarinfo_attch, cStringIO.StringIO(attch_data))

    tar.close()
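
The while-loop above is a simple paging scheme: attachment bodies are pulled from ES in small mget batches so the full result set never has to sit in memory at once. A minimal sketch of the same pattern, assuming a generic client `es_client`, an index name, and a flat list of attachment ids:

def iter_docs_in_batches(es_client, index_name, doc_ids, batch_size=3):
    # Yield attachment documents batch_size at a time via the mget API,
    # mirroring the paging loop in export_attachments above.
    for start in range(0, len(doc_ids), batch_size):
        batch = doc_ids[start:start + batch_size]
        resp = es_client.mget(index=index_name,
                              doc_type="attachments",
                              body={"docs": [{"_id": _id} for _id in batch]})
        for doc in resp["docs"]:
            yield doc["_source"]
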
Example #3
def export_edges(index):
    body = {
        "query": {
            "filtered": {
                "query": {"bool":{"must":[{"match_all":{}}]}},
                "filter": {
                    "bool": {
                        "must": [ { "exists": { "field": "senders"}}],
                        "should" :[
                            { "exists": { "field": "tos"}},
                            { "exists": { "field": "ccs"}},
                            { "exists": { "field": "bccs"}}
                        ]
                    }
                }
            }
        }
    }
    def rcvrs(fields={}):
        return fields.get("tos", []) + fields.get("ccs", []) + fields.get("bccs", [])

    count = es().count(index=index, doc_type="emails", body=body)["count"]
    # TODO add batch processing
    addrs = es().search(index=index, doc_type="emails", size=count, from_=0, fields=["senders", "tos", "ccs", "bccs"], body=body)

    edges = reduce(operator.add, [[{"from": hit["fields"]["senders"][0], "to": rcvr} for rcvr in rcvrs(hit["fields"])] for hit in addrs["hits"]["hits"]])

    text_file = open("/home/elliot/big_graph.json", "w")
    text_file.write(json.dumps({"edges" : edges}))
    text_file.close()
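
The `reduce(operator.add, ...)` call flattens the per-email edge lists into a single list, but each `+` copies the accumulator, so it is quadratic in the number of edges. `itertools.chain.from_iterable` gives the same result in one linear pass, as this small sketch shows:

import itertools
import operator
from functools import reduce  # built in on Python 2, needs the import on Python 3

nested = [[1, 2], [3], [4, 5]]
flat_reduce = reduce(operator.add, nested)                # repeated list copies
flat_chain = list(itertools.chain.from_iterable(nested))  # single linear pass
assert flat_reduce == flat_chain == [1, 2, 3, 4, 5]
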
Example #4
def export_edges(index):
    body = {
        "query": {
            "filtered": {
                "query": {
                    "bool": {
                        "must": [{
                            "match_all": {}
                        }]
                    }
                },
                "filter": {
                    "bool": {
                        "must": [{
                            "exists": {
                                "field": "senders"
                            }
                        }],
                        "should": [{
                            "exists": {
                                "field": "tos"
                            }
                        }, {
                            "exists": {
                                "field": "ccs"
                            }
                        }, {
                            "exists": {
                                "field": "bccs"
                            }
                        }]
                    }
                }
            }
        }
    }

    def rcvrs(fields={}):
        return fields.get("tos", []) + fields.get("ccs", []) + fields.get(
            "bccs", [])

    count = es().count(index=index, doc_type="emails", body=body)["count"]
    # TODO add batch processing
    addrs = es().search(index=index,
                        doc_type="emails",
                        size=count,
                        from_=0,
                        fields=["senders", "tos", "ccs", "bccs"],
                        body=body)

    edges = reduce(operator.add, [[{
        "from": hit["fields"]["senders"][0],
        "to": rcvr
    } for rcvr in rcvrs(hit["fields"])] for hit in addrs["hits"]["hits"]])

    text_file = open("/home/elliot/big_graph.json", "w")
    text_file.write(json.dumps({"edges": edges}))
    text_file.close()
Example #5
def _index_record(index):
    tangelo.log("datasource._index_record(index: %s)" % (str(index)))

    email_docs_count = es().count(
        index=index,
        doc_type="emails",
        body={"query": {
            "bool": {
                "must": [{
                    "match_all": {}
                }]
            }
        }})["count"]
    emails_addrs_count = es().count(
        index=index,
        doc_type="email_address",
        body={"query": {
            "bool": {
                "must": [{
                    "match_all": {}
                }]
            }
        }})["count"]
    emails_attch_count = es().count(
        index=index,
        doc_type="attachments",
        body={"query": {
            "bool": {
                "must": [{
                    "match_all": {}
                }]
            }
        }})["count"]

    #TODO: still need to re-work the absolute date-time bounds and the suggested date-time bounds
    bounds = get_datetime_bounds(index)

    return {
        'data_set_id': index,
        'data_set_label': index,
        'data_set_document_count': email_docs_count,
        'data_set_node_count': emails_addrs_count,
        'data_set_attachment_count': emails_attch_count,
        'data_set_datetime_min': bounds[0],
        'data_set_datetime_max': bounds[1],
        'start_datetime_selected': bounds[0],
        'end_datetime_selected': bounds[1]
    }
Example #6
def _search_ranked_email_addrs(index, start, end, size):
    graph_body = {"fields": _graph_fields, "sort": _sort_email_addrs_by_total, "query": _query_all}
    # tangelo.log("getRankedEmails(query: %s)" % (graph_body))

    resp = es().search(index=index, doc_type="email_address", size=size, body=graph_body)
    # tangelo.log("getRankedEmails(resp: %s)" % (resp))
    return resp
Example #7
def es_get_sender_locations(data_set_id, size):
    tangelo.log("es_geo.es_get_sender_locations()" )

    emails_resp = es().search(index=data_set_id, doc_type="emails", size=size, body=_geo_xoip_query())
    tangelo.log("es_geo.es_get_sender_locations(total document hits = %s)" % emails_resp["hits"]["total"])
    docs = [_map_geo_response(hit["_source"]) for hit in emails_resp["hits"]["hits"]]
    return {"total": emails_resp["hits"]["total"], "XOIP_locations": docs}
Example #8
def _cluster_carrot2(index,
                     type,
                     email_addrs=[],
                     query_terms='',
                     topic_score=None,
                     entity={},
                     date_bounds=None,
                     cluster_fields=["_source.body"],
                     cluster_title_fields=["_source.subject"],
                     algorithm="lingo",
                     max_doc_pool_size=500):
    query = _build_email_query(email_addrs=email_addrs,
                               qs=query_terms,
                               entity=entity,
                               date_bounds=date_bounds)
    carrot_query = {
        "search_request": {
            "query": query["query"],
            "size": max_doc_pool_size
        },
        "algorithm": algorithm,
        "max_hits": 0,
        "query_hint": query_terms,
        "field_mapping": {
            "title": cluster_title_fields,
            "content": cluster_fields
        }
    }

    resp = es().transport.perform_request(
        "POST",
        "/{}/{}/_search_with_clusters".format(index, type), {},
        body=carrot_query)
    total_docs = min(resp[1]["hits"]["total"], max_doc_pool_size)
    return resp
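
`transport.perform_request` is the client's raw-HTTP escape hatch, used here because `_search_with_clusters` is a carrot2 plugin endpoint with no high-level helper. On the client version these examples appear to target (elasticsearch-py 1.x), it returns a `(status, body)` tuple, which is why the hit total is read from `resp[1]`. A generic sketch against a built-in endpoint:

# Sketch: raw request through the transport; the positional {} is the params dict.
status, body = es().transport.perform_request("GET", "/_cluster/health", {})
# body["status"] is "green", "yellow", or "red"
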
Example #9
def es_get_exif_emails(data_set_id, size):
    tangelo.log("es_geo.es_get_exif_emails()" )

    emails_resp = es().search(index=data_set_id, doc_type="emails", size=size, body=_geo_exif_query())
    tangelo.log("es_geo.es_get_exif_emails(total document hits = %s)" % emails_resp["hits"]["total"])
    docs = [hit["_source"] for hit in emails_resp["hits"]["hits"]]
    return {"total":emails_resp["hits"]["total"], "exif_docs" : docs}
Example #10
def get_datetime_bounds(index, type="emails"):
    resp = es().search(index=index,
                       doc_type=type,
                       body={"aggregations": _date_aggs()})

    now = strftime("%Y-%m-%d", gmtime())
    min = resp["aggregations"]["min_date"].get("value_as_string",
                                               default_min_timeline_bound())
    max = resp["aggregations"]["max_date"].get("value_as_string",
                                               default_max_timeline_bound())

    # Average
    avg = resp["aggregations"]["avg_date"].get("value_as_string", None)
    # Estimated median
    pct = resp["aggregations"]["pct_date"]["values"].get(
        "50.0_as_string", None)

    if not pct:
        return (min if min >= "1970" else "1970-01-01",
                max if max <= now else now)

    avg_datetime = parse(pct)

    delta = timedelta(**{
        default_timeline_interval(index):
        int(default_timeline_span(index)) / 2
    })

    return ((avg_datetime - delta).strftime("%Y-%m-%d"),
            (avg_datetime + delta).strftime("%Y-%m-%d"))
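
The `timedelta(**{...})` construction builds its keyword argument dynamically, so `default_timeline_interval(index)` may name any unit `timedelta` accepts ("days", "weeks", and so on). A worked example with assumed helper values of "weeks" and 12:

from datetime import timedelta
from dateutil.parser import parse  # assuming the parse() above is dateutil's

interval, span = "weeks", 12                    # assumed helper return values
delta = timedelta(**{interval: int(span) / 2})  # timedelta(weeks=6)
center = parse("2001-08-15")
# (center - delta).strftime("%Y-%m-%d")  -> "2001-07-04"
# (center + delta).strftime("%Y-%m-%d")  -> "2001-09-26"
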
Example #11
def get_attachments_by_sender(data_set_id, sender, start_datetime, end_datetime, size):

    # fields= ["id", "dir", "datetime", "from", "tos", "ccs", "bccs", "subject", "attach", "bodysize"]
    # fields= ["id", "datetime", "senders", "tos", "ccs", "bccs", "subject", "attachments.filename"]
    # body={"filter":{"exists":{"field":"attachments"}}, "query":{"match":{"senders":sender}}}

    body = _build_email_query(sender_addrs=[sender], date_bounds=(start_datetime, end_datetime), attachments_only=True)
    tangelo.log("get_attachments_by_sender.Query %s"%body)

    attachments_resp = es().search(index=data_set_id, doc_type="emails", size=size, body=body)

    email_attachments = []
    for attachment_item in attachments_resp["hits"]["hits"]:
        _source = attachment_item["_source"]
        attachment_entry = [_source["id"],
                            "PLACEHOLDER",
                            _source["datetime"],
                            _source.get("senders","")[0],
                            ';'.join(_source.get("tos","")),
                            ';'.join(_source.get("ccs","")),
                            ';'.join(_source.get("bccs","")),
                            _source.get("subject","")]
        for attachment in _source["attachments"]:
            l = list(attachment_entry)
            l[1] = attachment["guid"]
            l.append(attachment["filename"])
            l.append(0)
            email_attachments.append(l)
    return {"sender":sender, "email_attachments":email_attachments}
Example #12
def initialize_email_addr_cache(index, update=False):

    if index in _EMAIL_ADDR_CACHE and not update:
        tangelo.log("APPLICATION CACHE -- index=%s"% index)
        return

    _EMAIL_ADDR_CACHE_LOCK.acquire()
    try:
        tangelo.log("INITIALIZING CACHE -- index=%s"% index)
        global _EMAIL_ADDR_CACHE
        _email_addr_cache_fields = ["community", "community_id", "addr", "received_count", "sent_count", "attachments_count"]

        body = {"query": {"match_all": {}}}

        num = count(index, "email_address")
        print num
        addrs = es().search(index=index, doc_type="email_address", size=num, fields=_email_addr_cache_fields, body=body)
        addr_index = {f["addr"][0]: f for f in [hit["fields"] for hit in addrs["hits"]["hits"]]}
        _EMAIL_ADDR_CACHE[index] = addr_index
        tangelo.log("done: %s"% num)
    finally:
        _EMAIL_ADDR_CACHE_LOCK.release()
        tangelo.log("INITIALIZING CACHE COMPLETE! -- index=%s"% index)

    return {"acknowledge" : "ok"}
Example #13
def count(index, type="emails", start="2000-01-01", end="now"):
    # TODO apply filter to query not to body
    filter = {"range" : {"datetime" : { "gte": start, "lte": end }}}
    all_query = {"bool":{"must":[{"match_all":{}}]}}
    count = es().count(index=index, doc_type=type, body={"query" : all_query})

    return count["count"]
Example #14
def initialize_email_addr_cache(index, update=False):

    if index in _EMAIL_ADDR_CACHE and not update:
        tangelo.log("APPLICATION CACHE -- index=%s" % index)
        return

    _EMAIL_ADDR_CACHE_LOCK.acquire()
    try:
        tangelo.log("INITIALIZING CACHE -- index=%s" % index)
        global _EMAIL_ADDR_CACHE
        _email_addr_cache_fields = [
            "community", "community_id", "addr", "received_count",
            "sent_count", "attachments_count"
        ]

        body = {"query": {"match_all": {}}}

        num = count(index, "email_address")
        print num
        addrs = es().search(index=index,
                            doc_type="email_address",
                            size=num,
                            fields=_email_addr_cache_fields,
                            body=body)
        addr_index = {
            f["addr"][0]: f
            for f in [hit["fields"] for hit in addrs["hits"]["hits"]]
        }
        _EMAIL_ADDR_CACHE[index] = addr_index
        tangelo.log("done: %s" % num)
    finally:
        _EMAIL_ADDR_CACHE_LOCK.release()
        tangelo.log("INITIALIZING CACHE COMPLETE! -- index=%s" % index)

    return {"acknowledge": "ok"}
Example #15
def _get_attachment_info_from_email_address(index,
                                            email_address,
                                            date_time=None):
    query_email_addr = {
        "query": {
            "filtered": {
                "query": _query_all,
                "filter": {
                    "bool": {
                        "must": [{
                            "term": {
                                "addr": email_address
                            }
                        }]
                    }
                }
            }
        }
    }

    resp = es().search(index=index,
                       doc_type="email_address",
                       body=query_email_addr)
    # tangelo.log("getRankedEmails(resp: %s)" % (resp))
    return resp
Example #16
def count(index, type="emails", start="2000-01-01", end="now"):
    # TODO apply filter to query not to body
    filter = {"range": {"datetime": {"gte": start, "lte": end}}}
    all_query = {"bool": {"must": [{"match_all": {}}]}}
    count = es().count(index=index, doc_type=type, body={"query": all_query})

    return count["count"]
Example #17
def _query_email_attachments(index, size, emails_query):
    tangelo.log("_query_email_attachments.Query %s" % emails_query)

    attachments_resp = es().search(index=index,
                                   doc_type="emails",
                                   size=size,
                                   body=emails_query)

    email_attachments = []
    for attachment_item in attachments_resp["hits"]["hits"]:
        _source = attachment_item["_source"]
        attachment_entry = [
            _source["id"], "PLACEHOLDER", _source["datetime"],
            _source.get("senders", ""), ';'.join(_source.get("tos", "")),
            ';'.join(_source.get("ccs", "")), ';'.join(_source.get("bccs",
                                                                   "")),
            _source.get("subject", "")
        ]
        for attachment in _source["attachments"]:
            l = list(attachment_entry)
            l[1] = attachment["guid"]
            l.append(attachment["filename"])
            l.append(0)
            email_attachments.append(l)
    return email_attachments
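
Each email contributes one row per attachment, and `list(attachment_entry)` takes a shallow copy so that assigning `l[1]` on one row does not clobber the others. In miniature:

entry = ["id-1", "PLACEHOLDER", "2001-08-15"]
rows = []
for guid in ("g1", "g2"):
    row = list(entry)  # copy; without it every row would be the same list object
    row[1] = guid
    rows.append(row)
# rows == [["id-1", "g1", "2001-08-15"], ["id-1", "g2", "2001-08-15"]]
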
Example #18
def get_entity_histogram(index,
                         type,
                         email_addrs=[],
                         qs='',
                         topic_score=None,
                         date_bounds=None,
                         entity_agg_size=10):
    tangelo.log("===================================================")
    body = entity_histogram_query(email_addrs=email_addrs,
                                  qs=qs,
                                  topic_score=topic_score,
                                  date_bounds=date_bounds,
                                  entity_agg_size=entity_agg_size)

    tangelo.log("get_entity_histogram: query = %s" % body)

    resp = es().search(index=index, doc_type=type, body=body)
    return sorted([
        dict(d, **{"type": "location"}) for d in resp["aggregations"]
        ["filtered_entity_agg"]["location"]["buckets"]
    ] + [
        dict(d, **{"type": "organization"}) for d in resp["aggregations"]
        ["filtered_entity_agg"]["organization"]["buckets"]
    ] + [
        dict(d, **{"type": "person"}) for d in resp["aggregations"]
        ["filtered_entity_agg"]["person"]["buckets"]
    ] + [
        dict(d, **{"type": "misc"})
        for d in resp["aggregations"]["filtered_entity_agg"]["misc"]["buckets"]
    ],
                  key=lambda d: d["doc_count"],
                  reverse=True)
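
`dict(d, **{"type": t})` copies each aggregation bucket and tags it with its entity type before the four lists are concatenated and sorted by `doc_count`. The same idiom in miniature:

buckets = {"person": [{"key": "alice", "doc_count": 7}],
           "location": [{"key": "paris", "doc_count": 3}]}
merged = sorted([dict(b, **{"type": t}) for t, bs in buckets.items() for b in bs],
                key=lambda d: d["doc_count"], reverse=True)
# merged == [{"key": "alice", "doc_count": 7, "type": "person"},
#            {"key": "paris", "doc_count": 3, "type": "location"}]
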
Example #19
def get_top_phone_numbers(index,
                          email_address='',
                          qs='',
                          date_bounds=('1970-01-01', 'now'),
                          size=50):
    body = phone_numbers_agg(email_address, qs, date_bounds, size=size)
    resp = es().search(index=index, doc_type="emails", body=body)
    return resp["aggregations"]["phone_numbers_agg"]["buckets"]
Example #20
def get_total_attachment_activity(index, account_id, query_function, **kwargs):
    body = query_function(**kwargs)
    resp = es().search(index=index, doc_type="attachments", body=body)
    return [
        _map_attachments(index, account_id, attachments)
        for attachments in zip(resp["aggregations"]["attachments_filter_agg"]
                               ["attachments_over_time"]["buckets"])
    ]
Example #21
def get_lda_clusters(index):
    resp = es().search(index=index,
                       doc_type='lda-clustering',
                       body=_lda_clusters)
    # return [{"index":hit["_source"]["idx"],"score":hit["sort"][0],"cluster": [term["term"] for term in hit["_source"]["topic"]]} for hit in resp["hits"]["hits"]]
    return [{
        "idx": hit["_source"]["idx"],
        "cluster": [term["term"] for term in hit["_source"]["topic"]]
    } for hit in resp["hits"]["hits"]]
Example #22
def agg_cluster_counts(index):
    count = es().count(index=index,
                       doc_type='lda-clustering',
                       body={"query": {
                           "bool": {
                               "must": [{
                                   "match_all": {}
                               }]
                           }
                       }})["count"]
    # print count
    query = _cluster_lda(count,
                         email_addrs=[],
                         query_terms='',
                         entity_dict=[],
                         date_bounds=None)
    # print query
    resp = es().search(index=index, doc_type='emails', body=query)
    return {k: v["doc_count"] for k, v in resp["aggregations"].iteritems()}
Example #23
def get_daily_activity(index, account_id, type, query_function, **kwargs):
    resp = es().search(index=index,
                       doc_type=type,
                       request_cache="false",
                       body=query_function(**kwargs))
    return [
        _map_activity(index, account_id, sent_rcvd) for sent_rcvd in zip(
            resp["aggregations"]["sent_agg"]["sent_emails_over_time"]
            ["buckets"], resp["aggregations"]["rcvr_agg"]
            ["rcvd_emails_over_time"]["buckets"])
    ]
Example #24
def get_top_attachment_types(index, email_addrs=[], query_terms='', topic_score=None, entity={}, date_bounds=None, num_top_attachments=20):
    aggs = { "attachment_type_agg" : { "terms" : { "field" : "extension", "size" : num_top_attachments }}}
    query = filtered_agg_query(email_addrs=email_addrs, query_terms=query_terms, topic_score=topic_score, date_bounds=date_bounds, entity=entity, aggs=aggs, name="attachment")
    tangelo.log("Query %s"%query)

    attch_agg_resp = es().search(index=index, doc_type='attachments', size=0, body=query)

    types = [[attch_type["key"], int(attch_type["doc_count"])] for attch_type in attch_agg_resp["aggregations"]["attachment_filtered_agg"]["attachment_type_agg"]["buckets"]]
    total = sum(t[1] for t in types)
    types = [[attch_type[0], attch_type[1], "{0:.2f}".format(round(100.0 * attch_type[1] / total, 2))] for attch_type in types]
    return types
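
The last step converts raw bucket counts into percentage strings, with `100.0 *` forcing float division under Python 2. With made-up buckets:

types = [["jpg", 60], ["pdf", 30], ["doc", 10]]  # [extension, doc_count]
total = sum(t[1] for t in types)
rows = [[name, n, "{0:.2f}".format(round(100.0 * n / total, 2))]
        for name, n in types]
# rows == [["jpg", 60, "60.00"], ["pdf", 30, "30.00"], ["doc", 10, "10.00"]]
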
Example #25
def _index_record(index):
    tangelo.log("datasource._index_record(index: %s)" % (str(index)))

    email_docs_count = es().count(index=index, doc_type="emails", body={"query" : {"bool":{"must":[{"match_all":{}}]}}})["count"]
    emails_addrs_count = es().count(index=index, doc_type="email_address", body={"query" : {"bool":{"must":[{"match_all":{}}]}}})["count"]
    emails_attch_count = es().count(index=index, doc_type="attachments", body={"query" : {"bool":{"must":[{"match_all":{}}]}}})["count"]


    #TODO: still need to re-work the absolute date-time bounds and the suggested date-time bounds
    bounds = get_datetime_bounds(index)

    return {'data_set_id':index,
           'data_set_label':index,
           'data_set_document_count' : email_docs_count,
           'data_set_node_count' : emails_addrs_count,
           'data_set_attachment_count' : emails_attch_count,
           'data_set_datetime_min' : bounds[0],
           'data_set_datetime_max' : bounds[1],
           'start_datetime_selected' : bounds[0],
           'end_datetime_selected' : bounds[1]
           }
Example #26
def _get_attachment_info_from_email_address(index, email_address, date_time=None):
    query_email_addr =  {"query":{"filtered" : {
        "query" : _query_all,
        "filter" : {"bool":{
            "must":[
                {"term" : { "addr" : email_address}}
            ]
        }}}}}

    resp = es().search(index=index, doc_type="email_address", body=query_email_addr)
    # tangelo.log("getRankedEmails(resp: %s)" % (resp))
    return resp
Example #27
def get_emailer_attachment_activity(index,
                                    email_address,
                                    date_bounds,
                                    interval="week"):
    body = attachment_histogram_from_emails(email_address, date_bounds,
                                            interval)
    resp = es().search(index=index, doc_type="email_address", body=body)
    return [
        _map_attachments(index, email_address, attachments)
        for attachments in zip(resp["aggregations"]["emailer_attach_agg"]
                               ["sent_attachments_over_time"]["buckets"])
    ]
Example #28
def _search_ranked_email_addrs(index, start, end, size):
    graph_body = {
        "fields": _graph_fields,
        "sort": _sort_email_addrs_by_total,
        "query": _query_all
    }
    # tangelo.log("getRankedEmails(query: %s)" % (graph_body))

    resp = es().search(index=index,
                       doc_type="email_address",
                       size=size,
                       body=graph_body)
    # tangelo.log("getRankedEmails(resp: %s)" % (resp))
    return resp
Example #29
def get_top_communities(index, query_terms='', topic_score=None, entity={}, date_bounds=None, num_communities=20):
    # TODO fix -hack until we can do date filtering on the email_address
    date_bounds = None
    # TODO fix

    aggs = { "community_agg" : { "terms" : { "field" : "community", "size" : num_communities }}}
    query = filtered_agg_query(topic_score=topic_score, date_bounds=date_bounds, entity=entity, aggs=aggs, name="community")
    tangelo.log("Query %s"%query)

    communities_agg = es().search(index=index, doc_type='email_address', size=0, body=query)
    # total_other = communities_agg["aggregations"]["community_agg"]["doc_count_error_upper_bound"]
    communities = [[community["key"], int(community["doc_count"])] for community in communities_agg["aggregations"]["community_filtered_agg"]["community_agg"]["buckets"]]
    total = sum(community[1] for community in communities)
    communities = [[community[0], community[1], "{0:.2f}".format(round(100.0 * community[1] / total, 2))] for community in communities]
    return communities
Example #30
def get_top_domains(index, email_addrs=[], query_terms='', topic_score=None, entity={}, date_bounds=None, num_domains=20):
    # TODO fix -hack until we can do date filtering on the email_address
    date_bounds = None
    # TODO fix

    aggs = { "domain_agg" : { "terms" : { "field" : "domain", "size" : num_domains }}}
    query = filtered_agg_query(email_addrs=email_addrs, query_terms=query_terms, topic_score=topic_score, date_bounds=date_bounds, entity=entity, aggs=aggs, name="domain")
    tangelo.log("Query %s"%query)

    domains_agg = es().search(index=index, doc_type='email_address', size=0, body=query)
    # total_other = domains_agg["aggregations"]["domain_agg"]["doc_count_error_upper_bound"]
    domains = [[domain["key"], int(domain["doc_count"])] for domain in domains_agg["aggregations"]["domain_filtered_agg"]["domain_agg"]["buckets"]]
    total = sum(domain[1] for domain in domains)
    domains = [[domain[0], domain[1], "{0:.2f}".format(round(100.0 * domain[1] / total, 2))] for domain in domains]
    return domains
Example #31
def _query_emails(index, size, emails_query, additional_fields=[]):
    emails_resp = es().search(index=index,
                              doc_type="emails",
                              size=size,
                              fields=get_graph_row_fields() +
                              additional_fields,
                              body=emails_query)
    tangelo.log("es_query_utils._query_emails(total document hits = %s)" %
                emails_resp["hits"]["total"])

    return {
        "total": emails_resp["hits"]["total"],
        "hits":
        [_map_emails(hit["fields"]) for hit in emails_resp["hits"]["hits"]]
    }
Example #32
def get_email_activity(index,
                       data_set_id,
                       account_id=None,
                       date_bounds=None,
                       interval="week"):
    body = actor_histogram([] if not account_id else [account_id], date_bounds,
                           interval)
    tangelo.log("get_email_activity(query body: %s )" % body)

    resp = es().search(index=index,
                       doc_type="emails",
                       request_cache="false",
                       body=body)
    id = data_set_id if not account_id else account_id
    return [
        _map_activity(index, id, sent_rcvd) for sent_rcvd in zip(
            resp["aggregations"]["sent_agg"]["emails_over_time"]["buckets"],
            resp["aggregations"]["rcvr_agg"]["emails_over_time"]["buckets"])
    ]
Example #33
def _cluster_carrot2(index, type, email_addrs=[], query_terms='', topic_score=None, entity={}, date_bounds=None, cluster_fields=["_source.body"], cluster_title_fields=["_source.subject"], algorithm="lingo", max_doc_pool_size=500):
    query = _build_email_query(email_addrs=email_addrs, qs=query_terms,  entity=entity, date_bounds=date_bounds)
    carrot_query = {
        "search_request": {
            "query": query["query"],
            "size": max_doc_pool_size
        },
        "algorithm":algorithm,
        "max_hits": 0,
        "query_hint": query_terms,
        "field_mapping": {
            "title": cluster_title_fields,
            "content": cluster_fields
        }
    }

    resp = es().transport.perform_request("POST", "/{}/{}/_search_with_clusters".format(index,type), {}, body=carrot_query)
    total_docs = min(resp[1]["hits"]["total"], max_doc_pool_size)
    return resp
Example #34
def get_ranked_email_address(data_set_id, query_terms='', topic_score=None, entity={}, date_bounds=None, num_top_hits=30):
    body = {
        "aggs" : {
            "filtered_addrs_agg" : {
                "filter" : _build_filter(qs=query_terms, topic=topic_score, entity_dict=entity, date_bounds=date_bounds),
                "aggs": {
                    "top_addrs_agg" : {
                        "terms" : {"field" : "addrs", "size": num_top_hits}
                    }
                }
            }
        },
        "size":0}

    resp = es().search(index=data_set_id, doc_type="emails", body=body)

    total_docs = resp["aggregations"]["filtered_addrs_agg"]["doc_count"]
    email_addrs = [map_email_filtered(get_cached_email_addr(data_set_id, email_addr["key"]), email_addr["doc_count"], total_docs) for email_addr in resp["aggregations"]["filtered_addrs_agg"]["top_addrs_agg"]["buckets"]]
    return {"emails": email_addrs}
Example #35
def get_attachment_by_id(*args, **kwargs):

    data_set_id, start_datetime, end_datetime, size = parseParamDatetime(**kwargs)

    attachment_id = nth(args, 0, '')

    if not attachment_id:
        attachment_id = parseParamAttachmentGUID(**kwargs)

    cherrypy.log("email.get_attachments_sender(index=%s, attachment_id=%s)" % (data_set_id, attachment_id))
    if not data_set_id:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing index")
    if not attachment_id:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing attachment_id")

    attachment = es().get(index=data_set_id, doc_type="attachments", id=attachment_id)

    if not attachment:
        return tangelo.HTTPStatusCode(400, "no attachments found for (index=%s, attachment_id=%s)" % (data_set_id, attachment_id))

    attachment = attachment["_source"]
    ext = attachment["extension"]
    filename = attachment["filename"]

    mime_type = mimetypes.guess_type(filename)[0]

    if not mime_type:
        tangelo.content_type("application/x-download")
        header("Content-Disposition", 'attachment; filename="{}"'.format(filename))
    else:
        tangelo.content_type(mime_type)
        header("Content-Disposition", 'inline; filename="{}"'.format(filename))

    content = attachment["contents64"]
    bytes = base64.b64decode(content)
    # dump(bytes, filename)

    as_str = str(bytes)
    tangelo.log(str(len(as_str)), "Uploading Attachment - length = ")

    return as_str
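
The MIME guess decides between `inline` (render in the browser) and `attachment` (force a download) in the Content-Disposition header. `mimetypes.guess_type` works purely from the filename extension:

import mimetypes

mimetypes.guess_type("report.pdf")[0]   # "application/pdf" -> served inline
mimetypes.guess_type("photo.jpg")[0]    # "image/jpeg"      -> served inline
mimetypes.guess_type("data.xyz123")[0]  # None -> falls back to x-download
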
Example #36
def _query_email_attachments(index, size, emails_query):
    tangelo.log("_query_email_attachments.Query %s"%emails_query)

    attachments_resp = es().search(index=index, doc_type="emails", size=size, body=emails_query)

    email_attachments = []
    for attachment_item in attachments_resp["hits"]["hits"]:
        _source = attachment_item["_source"]
        attachment_entry = [_source["id"],
                             "PLACEHOLDER",
                             _source["datetime"],
                             _source.get("senders",""),
                             ';'.join(_source.get("tos","")),
                             ';'.join(_source.get("ccs","")),
                             ';'.join(_source.get("bccs","")),
                             _source.get("subject","")]
        for attachment in _source["attachments"]:
            l = list(attachment_entry)
            l[1] = attachment["guid"]
            l.append(attachment["filename"])
            l.append(0)
            email_attachments.append(l)
    return email_attachments
Example #37
def export_emails_archive(data_set_id, email_ids=[]):
    cherrypy.log("email.get_attachments_sender(index=%s, attachment_id=%s)" %
                 (data_set_id, email_ids))
    if not data_set_id:
        return tangelo.HTTPStatusCode(400,
                                      "invalid service call - missing index")

    # TODO can implement with multiple doc_types and combine attachments in
    emails = es().mget(index=data_set_id,
                       doc_type="emails",
                       body={"docs": [{
                           "_id": id
                       } for id in email_ids]})
    topics = get_categories(data_set_id)

    # TODO filename
    filename = "export.tar.gz"
    tangelo.content_type("application/x-gzip")
    header("Content-Disposition", 'attachment; filename="{}"'.format(filename))

    string_buffer = cStringIO.StringIO()
    tar = tarfile.open(mode='w:gz', fileobj=string_buffer)

    # Add each email to the tar
    for email_source in emails["docs"]:

        email = email_source["_source"]

        tarinfo_parent = tarfile.TarInfo(name=email["id"])
        tarinfo_parent.type = tarfile.DIRTYPE
        tarinfo_parent.mode = 0755
        tarinfo_parent.mtime = time.time()
        tar.addfile(tarinfo_parent)

        # Add raw document
        tarinfo = tarfile.TarInfo(email["id"] + "/" + email["id"] + ".json")
        data_string = json.dumps(email)
        fobj = cStringIO.StringIO(data_string)

        tarinfo.size = len(data_string)
        tarinfo.mode = 0644
        tarinfo.mtime = time.time()
        tar.addfile(tarinfo, fobj)

        # Add txt document
        tarinfo = tarfile.TarInfo(email["id"] + "/" + email["id"] + ".txt")

        data_string = prettyprint_email_as_text(email)
        fobj = cStringIO.StringIO(data_string)

        tarinfo.size = len(data_string)
        tarinfo.mode = 0644
        tarinfo.mtime = time.time()
        tar.addfile(tarinfo, fobj)

        # Add html document
        tarinfo = tarfile.TarInfo(email["id"] + "/" + email["id"] + ".html")

        data_string = prettyprint_email_as_html_template(email, topics)
        fobj = cStringIO.StringIO(data_string)

        tarinfo.size = len(data_string)
        tarinfo.mode = 0644
        tarinfo.mtime = time.time()
        tar.addfile(tarinfo, fobj)

        # Get the attachments
        if email["attachments"]:
            attachments = es().mget(index=data_set_id,
                                    doc_type="attachments",
                                    body={
                                        "docs": [{
                                            "_id": attch["guid"]
                                        } for attch in email["attachments"]]
                                    })
            for attachment_source in attachments["docs"]:
                attachment = attachment_source["_source"]
                filename = attachment["filename"]
                attch_data = str(base64.b64decode(attachment["contents64"]))

                tarinfo_attch = tarfile.TarInfo(email["id"] + "/" + filename)
                tarinfo_attch.size = len(attch_data)
                tarinfo_attch.mode = 0644
                tarinfo_attch.mtime = time.time()
                tar.addfile(tarinfo_attch, cStringIO.StringIO(attch_data))
    tar.close()

    return string_buffer.getvalue()
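
The archive is built entirely in memory: a `cStringIO` buffer stands in for the file, and each `TarInfo` carries an explicit `size` so `addfile` knows how many bytes to read from the supplied file object. A minimal self-contained sketch of the pattern (Python 2, matching these examples; `io.BytesIO` and `0o644` are the Python 3 spellings):

import tarfile
import time
import cStringIO

def make_archive_sketch():
    buf = cStringIO.StringIO()
    tar = tarfile.open(mode='w:gz', fileobj=buf)

    data = "hello, archive\n"
    info = tarfile.TarInfo("greeting.txt")
    info.size = len(data)  # must be set before addfile, which reads exactly size bytes
    info.mode = 0644
    info.mtime = time.time()
    tar.addfile(info, cStringIO.StringIO(data))

    tar.close()            # flushes the gzip trailer
    return buf.getvalue()  # raw .tar.gz bytes, ready to stream to a client
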
Example #38
                # (continuation: tail of export_emails_archive; see Example #37 for the full function)
                tarinfo_attch.mode = 0644
                tarinfo_attch.mtime = time.time()
                tar.addfile(tarinfo_attch, cStringIO.StringIO(attch_data))
    tar.close()

    return string_buffer.getvalue()


if __name__ == "__main__":
    # TODO move into method
    topics = get_categories("sample")
    email_ids = ["e65c3704-7fe8-11e5-bb05-08002705cb99"]
    # email_ids = ["f326dd04-7fe8-11e5-bb05-08002705cb99"]

    # TODO can implement with multiple doc_types and combine attachments in
    emails = es().mget(index="sample",
                       doc_type="emails",
                       body={"docs": [{
                           "_id": id
                       } for id in email_ids]})

    data_string = prettyprint_email_as_html_template(
        emails["docs"][0]["_source"], topics)
    with open("/tmp/output.html", "w") as text_file:
        text_file.write(data_string)

    email_id = "f9c9c59a-7fe8-11e5-bb05-08002705cb99"
    # export_emails_archive("sample", [email_id])
    # export_attachments("sample", '*****@*****.**', 'jpg', ("2001-08-01", "2001-08-30"))
    print "export done"
Example #39
                # (continuation: tail of export_emails_archive; see Example #42 for the full function)
                attachment = attachment_source["_source"]
                filename = attachment["filename"]
                attch_data = str(base64.b64decode(attachment["contents64"]))

                tarinfo_attch = tarfile.TarInfo(email["id"]+"/"+filename)
                tarinfo_attch.size = len(attch_data)
                tarinfo_attch.mode = 0644
                tarinfo_attch.mtime = time.time()
                tar.addfile(tarinfo_attch, cStringIO.StringIO(attch_data))
    tar.close()

    return string_buffer.getvalue()

if __name__ == "__main__":
    # TODO move into method
    topics = get_categories("sample")
    email_ids = ["e65c3704-7fe8-11e5-bb05-08002705cb99"]
    # email_ids = ["f326dd04-7fe8-11e5-bb05-08002705cb99"]

    # TODO can implement with multiple doc_types and combine attachments in
    emails = es().mget(index="sample", doc_type="emails", body={"docs":[{"_id":id} for id in email_ids]})

    data_string = prettyprint_email_as_html_template(emails["docs"][0]["_source"], topics)
    with open("/tmp/output.html", "w") as text_file:
        text_file.write(data_string)

    email_id = "f9c9c59a-7fe8-11e5-bb05-08002705cb99"
    # export_emails_archive("sample", [email_id])
    # export_attachments("sample", '*****@*****.**', 'jpg', ("2001-08-01", "2001-08-30"))
    print "export done"
Example #40
def export_attachments(data_set_id,
                       sender='',
                       attachment_extension='jpg',
                       date_bounds=None):
    print(
        "email.get_attachments_sender(index=%s, sender=%s, attachment_type=%s, date_bounds=%s)"
        % (data_set_id, sender, attachment_extension, date_bounds))
    if not data_set_id:
        print "invalid service call - missing index"
        return 1

    # TODO get accurate count -- this is not strictly needed as attachments will be accessed as inner docs on the email_address
    max_inner_attachments_returned = 100000

    # Get all attachments by extension
    rows = []
    body = _attch_nested__ext_query(
        sender,
        attachment_extension,
        date_bounds,
        max_inner_attachments_returned=max_inner_attachments_returned)
    print body
    addresses_count = es().count(index=data_set_id,
                                 doc_type="email_address",
                                 body=body)["count"]
    print "total addresses: " + str(addresses_count)
    addresses = es().search(index=data_set_id,
                            doc_type="email_address",
                            body=body,
                            size=addresses_count)
    for address in addresses["hits"]["hits"]:
        rows += [[
            address["_source"]["addr"], attachment["_source"]["guid"],
            attachment["_source"]["filename"],
            attachment["_source"]["datetime"]
        ] for attachment in address["inner_hits"]["sender_attachments"]["hits"]
                 ["hits"]]

    print "total attachments: " + str(len(rows))

    # Start tar
    tar = tarfile.open(mode='w:gz', name="/tmp/big-export.tar.gz")
    csv_string_buffer = cStringIO.StringIO()
    csv_file = csv.writer(csv_string_buffer)

    # Add all rows to attachment csv
    csv_file.writerows(rows)
    tarinfo = tarfile.TarInfo("attachments.csv")

    tarinfo.size = csv_string_buffer.tell()
    tarinfo.mode = 0644
    tarinfo.mtime = time.time()
    csv_string_buffer.seek(0)

    tar.addfile(tarinfo, csv_string_buffer)

    # This is the buffer size of how many attachments to pull from ES at each iteration
    num_returned = 3
    index = 0
    # Paging
    while index < len(rows):
        # Get num_returned attachments from ES
        attachments = es().mget(index=data_set_id,
                                doc_type="attachments",
                                body={
                                    "docs": [{
                                        "_id": row[1]
                                    } for row in rows[index:index +
                                                      num_returned]]
                                })
        index += num_returned

        # Add all attachments to the archive
        for attachment_source in attachments["docs"]:
            attachment = attachment_source["_source"]
            filename = attachment["filename"]
            attch_data = str(base64.b64decode(attachment["contents64"]))

            tarinfo_attch = tarfile.TarInfo(attachment["guid"] + "/" +
                                            filename)
            tarinfo_attch.size = len(attch_data)
            tarinfo_attch.mode = 0644
            tarinfo_attch.mtime = time.time()
            tar.addfile(tarinfo_attch, cStringIO.StringIO(attch_data))

    tar.close()
Example #41
def get_lda_clusters(index):
    resp = es().search(index=index, doc_type='lda-clustering', body=_lda_clusters)
    # return [{"index":hit["_source"]["idx"],"score":hit["sort"][0],"cluster": [term["term"] for term in hit["_source"]["topic"]]} for hit in resp["hits"]["hits"]]
    return [{"idx":hit["_source"]["idx"],"cluster": [term["term"] for term in hit["_source"]["topic"]]} for hit in resp["hits"]["hits"]]
Example #42
def export_emails_archive(data_set_id, email_ids=[]):
    cherrypy.log("email.get_attachments_sender(index=%s, attachment_id=%s)" % (data_set_id, email_ids))
    if not data_set_id:
        return tangelo.HTTPStatusCode(400, "invalid service call - missing index")

    # TODO can implement with multiple doc_types and combine attachments in
    emails = es().mget(index=data_set_id, doc_type="emails", body={"docs":[{"_id":id} for id in email_ids]})
    topics = get_categories(data_set_id)


    # TODO filename
    filename= "export.tar.gz"
    tangelo.content_type("application/x-gzip")
    header("Content-Disposition", 'attachment; filename="{}"'.format(filename))

    string_buffer = cStringIO.StringIO()
    tar = tarfile.open(mode='w:gz', fileobj=string_buffer)

    # Add each email to the tar
    for email_source in emails["docs"]:

        email = email_source["_source"]

        tarinfo_parent = tarfile.TarInfo(name=email["id"])
        tarinfo_parent.type = tarfile.DIRTYPE
        tarinfo_parent.mode = 0755
        tarinfo_parent.mtime = time.time()
        tar.addfile(tarinfo_parent)

        # Add raw document
        tarinfo = tarfile.TarInfo(email["id"]+"/"+email["id"]+".json")
        data_string = json.dumps(email)
        fobj = cStringIO.StringIO(data_string)

        tarinfo.size = len(data_string)
        tarinfo.mode = 0644
        tarinfo.mtime = time.time()
        tar.addfile(tarinfo, fobj)

        # Add txt document
        tarinfo = tarfile.TarInfo(email["id"]+"/"+email["id"]+".txt")

        data_string = prettyprint_email_as_text(email)
        fobj = cStringIO.StringIO(data_string)

        tarinfo.size = len(data_string)
        tarinfo.mode = 0644
        tarinfo.mtime = time.time()
        tar.addfile(tarinfo, fobj)


        # Add html document
        tarinfo = tarfile.TarInfo(email["id"]+"/"+email["id"]+".html")

        data_string = prettyprint_email_as_html_template(email, topics)
        fobj = cStringIO.StringIO(data_string)

        tarinfo.size = len(data_string)
        tarinfo.mode = 0644
        tarinfo.mtime = time.time()
        tar.addfile(tarinfo, fobj)

        # Get the attachments
        if email["attachments"]:
            attachments = es().mget(index=data_set_id, doc_type="attachments", body={"docs":[{"_id":attch["guid"]} for attch in email["attachments"]]})
            for attachment_source in attachments["docs"]:
                attachment = attachment_source["_source"]
                filename = attachment["filename"]
                attch_data = str(base64.b64decode(attachment["contents64"]))

                tarinfo_attch = tarfile.TarInfo(email["id"]+"/"+filename)
                tarinfo_attch.size = len(attch_data)
                tarinfo_attch.mode = 0644
                tarinfo_attch.mtime = time.time()
                tar.addfile(tarinfo_attch, cStringIO.StringIO(attch_data))
    tar.close()

    return string_buffer.getvalue()
Example #43
def get_total_daily_activity(index, type, query_function, **kwargs):
    resp = es().search(index=index,
                       doc_type=type,
                       body=query_function(**kwargs))
    return resp["aggregations"]["filter_agg"]["emails_over_time"]["buckets"]
Example #44
def get_top_phone_numbers(index, email_address='', qs='', date_bounds=('1970-01-01', 'now'), size=50):
    body = phone_numbers_agg(email_address, qs, date_bounds, size=size)
    resp = es().search(index=index, doc_type="emails", body=body)
    return resp["aggregations"]["phone_numbers_agg"]["buckets"]
Example #45
def set_starred(index, ids=[], starred=True):
    body = { "doc" : { "starred" : starred }}
    for id in ids:
        response = es().update(index, doc_type="emails", id=id, body=body)
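
Each id above costs one round trip to ES; the bulk API batches the same partial-document updates into a single request. A sketch against the same client:

def set_starred_bulk(index, ids=[], starred=True):
    # Sketch: one bulk request instead of len(ids) individual update calls.
    actions = []
    for _id in ids:
        actions.append({"update": {"_index": index, "_type": "emails", "_id": _id}})
        actions.append({"doc": {"starred": starred}})
    return es().bulk(body=actions)
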
Example #46
def get_email(index, email_id, qs=None):

    # fields=["id","datetime","senders","senders_line","tos_line","ccs_line","bccs_line","subject","body","attachments.filename","entities.entity_organization","entities.entity_location","entities.entity_person","entities.entity_misc"]
    # email = es().get(index, doc_type="emails", id=email_id, fields=fields)

    source = ''
    body = '_DEFAULT_'
    subject = '_DEFAULT_'
    highlighted_attachments = {}

    if not qs:
        email = es().get(index, doc_type="emails", id=email_id)
        source = email["_source"]
        body = source["body"]
        subject = source["subject"]
        body_translated = source.get("body_translated",'')
        subject_translated = source.get("subject_translated",'')
        body_lang = source.get("body_lang",'en')
    else:
        query = email_highlighting_query(email_id, highlight_query_string=qs)
        tangelo.log("es_email.get_email(highlighting-query: %s )" % (query))

        email = es().search(index=index, doc_type='emails', body=query)
        source = email["hits"]["hits"][0]["_source"]
        body_lang = source.get("body_lang",'en')
        highlight = email["hits"]["hits"][0].get("highlight", {})

        body = highlight.get('body', [source.get('body','')])[0]
        body_translated = highlight.get('body_translated', [source.get('body_translated','')])[0]

        subject_translated = highlight.get('subject_translated', [source.get('subject_translated','')])[0]
        subject = highlight.get('subject', [source['subject']])[0]
        # TODO highlighting attachments need to return content and further test this method
        highlighted_attachments = _find_attachment_highlighting(highlight, source.get("attachments", [""]))

    body = _format_html(body)
    body_translated = _format_html(body_translated)
    subject = _format_html(subject)

    topic_scores = []
    if source["topic_scores"]:
        topic_scores = [[topic[0], topic[1], str(source["topic_scores"]["idx_" + str(topic[0])])] for topic in get_categories(index)["categories"]]

    email = [source["id"],
             # TODO REMOVE unused fields
             "DEPRECATED",
             source.get("datetime",""),
             "false",
             "".join(source["senders"]),
             ["".join(source["tos_line"]), ";".join(source["tos"])],
             ["".join(source["ccs_line"]), ";".join(source["ccs"])],
             ["".join(source["bccs_line"]), ";".join(source["bccs"])],
             subject,
             # Wrap in <pre>
             "<pre>"+body+"</pre>",
             [[f["guid"],f["filename"]] for f in source.get("attachments", [""])],
             source.get("starred", False),
             highlighted_attachments
             ]
    entities = []
    for type in ["person","location","organization","misc"]:
        if "body_entities" in source["entities"] and ("entity_"+type) in source["entities"]["body_entities"]:
            entities += [ [source["id"][0]+"_entity_"+str(i), type,     i, val] for i,val in enumerate(source["entities"]["body_entities"].get("entity_"+type, []), len(entities))]

    resp = {"email_contents" : { "email" : email, "entities": entities, "lda_topic_scores":topic_scores}}

    # only add translated text if the language is not english
    if body_lang and not body_lang == 'en':
        email_translated = [source["id"],
                 # TODO REMOVE unused fields
                 "DEPRECATED",
                 source.get("datetime",""),
                 "false",
                 "".join(source["senders"]),
                 ["".join(source["tos_line"]), ";".join(source["tos"])],
                 ["".join(source["ccs_line"]), ";".join(source["ccs"])],
                 ["".join(source["bccs_line"]), ";".join(source["bccs"])],
                 subject_translated,
                 # Wrap in <pre>
                 "<pre>"+body_translated+"</pre>",
                 [[f["guid"],f["filename"]] for f in source.get("attachments", [""])],
                 source.get("starred", False),
                 highlighted_attachments
                 ]
        entities_translated = []
        for type in ["person","location","organization","misc"]:
            if "body_entities_translated" in source["entities"] and ("entity_"+type) in source["entities"]["body_entities_translated"]:
                entities_translated += [ [source["id"][0]+"_entity_"+str(i), type, i, val] for i,val in enumerate(source["entities"]["body_entities_translated"].get("entity_"+type, []), len(entities_translated))]

        resp["email_contents_translated"] = { "email" : email_translated, "entities": entities_translated, "lda_topic_scores":topic_scores, "original_lang": body_lang}

    return resp
Example #47
def _query_emails(index, size, emails_query, additional_fields=[]):
    emails_resp = es().search(index=index, doc_type="emails", size=size, fields=get_graph_row_fields() + additional_fields, body=emails_query)
    tangelo.log("es_query_utils._query_emails(total document hits = %s)" % emails_resp["hits"]["total"])

    return {"total":emails_resp["hits"]["total"], "hits":[_map_emails(hit["fields"])for hit in emails_resp["hits"]["hits"]]}