def _get_all_fields(app, app_type=None):
    """
    Retrieve all possible fields in an application.

    Runs a ``terms`` aggregation over the ``_field_names`` meta-field
    (requesting at most 100 buckets) and collects the key of every bucket.

    :param app: [string] application name (e.g. xdata_v3), used as the index
    :param app_type: [string] application type (e.g. logs), used as doc_type
    :return: [list] list of strings representing the field names. If the
        search raises, the error text is appended to the list instead of
        the exception propagating to the caller.
    """
    fields = []
    query = {
        "aggs": {
            "fields": {
                "terms": {
                    "field": "_field_names",
                    "size": 100
                }
            }
        }
    }

    try:
        response = es.search(index=app, doc_type=app_type, body=query)
        buckets = response['aggregations']['fields']['buckets']
        fields.extend(tag['key'] for tag in buckets)
    except TransportError as e:
        # Transport errors carry a structured ``info`` payload.
        fields.append(str(e.info))
    except Exception as e:
        fields.append(str(e))
    return fields
    def segment(app, app_type=None, params=''):
        """
        Fetch documents from an application, optionally matching one field.

        Recognised keys of ``params``:

        * ``q`` -- ``"<field>:<value>"``; turned into a ``match`` query on
          that field. When absent or empty, a ``match_all`` query is used.
        * ``fields`` -- comma-separated string (or list) of source fields to
          include in each hit. When absent, ``_source`` is not restricted.
        * ``size`` -- maximum number of hits; defaults to 10.

        :param app: [string] application name, used as the index
        :param app_type: [string] application type, used as doc_type
        :param params: [dict] request parameters described above
        :return: the search response passed through ``jsonify``
        :raises IndexError: if ``q`` is given but contains no ``:``
        """
        # The declared default is an empty string, which has no ``.get``;
        # treat any falsy value as "no parameters".
        params = params or {}
        q = params.get("q")
        fields = params.get("fields")
        size = params.get("size") or 10

        query = {'size': size}

        if q:
            # Split on the first colon only, so values that themselves
            # contain ':' are kept intact.
            res = q.split(":", 1)
            key = res[0]
            val = res[1]
            query['query'] = {"match": {key: val}}
        else:
            query['query'] = {"match_all": {}}

        if fields:
            if not isinstance(fields, (list, tuple)):
                fields = fields.split(",")
            query['_source'] = {"include": list(fields)}

        response = es.search(index=app, doc_type=app_type, body=query)

        return jsonify(response)
def _get_all_fields(app, app_type=None):
    """
    Retrieve all possible fields in an application.

    Runs a ``terms`` aggregation over the ``_field_names`` meta-field
    (requesting at most 100 buckets) and collects the key of every bucket.

    :param app: [string] application name (e.g. xdata_v3), used as the index
    :param app_type: [string] application type (e.g. logs), used as doc_type
    :return: [list] list of strings representing the field names. If the
        search raises, the error text is appended to the list instead of
        the exception propagating to the caller.
    """
    d = list()
    query = {
        "aggs": {
            "fields": {
                "terms": {
                    "field": "_field_names",
                    "size": 100
                }
            }
        }
    }

    try:
        response = es.search(index=app, doc_type=app_type, body=query)
        for tag in response['aggregations']['fields']['buckets']:
            d.append(tag['key'])
    except TransportError as e:
        # Transport errors carry a structured ``info`` payload.
        d.append(str(e.info))
    except Exception as e:
        d.append(str(e))
    return d
    # NOTE(review): this definition is incomplete as it stands. The parameter
    # list stops after ``app``, the docstring below has no quote delimiters,
    # and the ``es.search(`` call is never closed. The missing parameters and
    # their defaults cannot be recovered from this block, so the code is left
    # untouched and only annotated.
    def search(app,
        # NOTE(review): the text below documents app_type, filters, size and
        # sort_field, none of which appear in the visible signature.
        Perform a search query.

        :param app: [string] application id (e.g. "xdata_v3")
        :param app_type: [string] name of the application type.
            If None all application types are searched.
        :param filters: [list of strings] list of filters for a query.
        :param size: [int] maximum number of hits that should be returned
        :param sort_field: [string] sorting field.
            Currently supported fields: "timestamp", "date"
        :return: [dict] dictionary with processed results.
            If STOUT is enabled, STOUT data will be merged with final result.

        # Need some query builder...
        # The request body is an empty dict; no filtering or sorting is
        # expressed in it.
        query = {}
        # NOTE(review): the remaining keyword arguments and the closing
        # parenthesis of this call are missing.
        log_result = es.search(index=app,

        # The STOUT merge is disabled here, so the search response is
        # returned as-is despite what the :return: text above says.
        # stout_result = Stout.getSessions()
        # data = merged_results(log_result, stout_result)
        return log_result
	# NOTE(review): this definition is incomplete as it stands. The parameter
	# list is never closed, the docstring below has no quote delimiters, and
	# the body reads app_type, size and query, none of which are defined in
	# the visible code. Left untouched and only annotated.
	def search (app,
				# NOTE(review): a call expression as a default is evaluated
				# once, so every call that omits ``filters`` shares one list.
				filters=list (),
		Perform a search query.

		:param app: [string] application id (e.g. "xdata_v3")
		:param app_type: [string] name of the application type. If None all application types are searched.
		:param filters: [list of strings] list of filters for a query. 
		:param size: [int] maximum number of hits that should be returned
		:param sort_field: [string] sorting field. Currently supported fields: "timestamp", "date"
		:return: [dict] dictionary with processed results. If STOUT is enabled, STOUT data will be merged with final result.

		# Need some query builder...
		# ``filters`` is forwarded as the ``fields`` argument of the search.
		# NOTE(review): ``query`` is not assigned anywhere in this block.
		log_result = es.search (index=app, doc_type=app_type, body=query, fields=filters, size=size)

		# Fetch the STOUT sessions unconditionally; no "STOUT enabled" check
		# is visible here.
		stout_result = Stout.getSessions ()

		# Combine the search response with the STOUT sessions and return the
		# combined value.
		data = merged_results (log_result, stout_result)
		return data
    def histogram(app, app_type=None, q=""):
        """
        Bucket the values of a field into fixed-width intervals.

        Only works on numerical data.

        :param app: [string] application name, used as the index
        :param app_type: [string] application type, used as doc_type
        :param q: [dict] request parameters; ``field`` names the field to
            aggregate (an empty string is used when it is missing)
        :return: ``jsonify`` of a dict mapping each bucket key to its
            document count. If the search raises, an ``error`` entry is
            added to the dict instead of the exception propagating.
        """
        # The declared default is an empty string, which has no ``.get``;
        # treat any falsy value as "no parameters".
        q = q or {}
        field = q.get("field") or ""

        interval = 50  # fixed bucket width
        query = {
            "aggs": {
                "hist_agg": {
                    "histogram": {
                        "field": field,
                        "interval": interval
                    }
                }
            }
        }

        d = {}
        try:
            response = es.search(index=app, doc_type=app_type, body=query)
            for tag in response['aggregations']['hist_agg']['buckets']:
                d[tag['key']] = tag['doc_count']
        except TransportError as e:
            d['error'] = e.info
        except Exception as e:
            d['error'] = str(e)
        return jsonify(d)
	def histogram(app, app_type=None, q=""):
		"""
		Bucket the values of a field into fixed-width intervals.

		Only works on numerical data.

		:param app: [string] application name, used as the index
		:param app_type: [string] application type, used as doc_type
		:param q: [dict] request parameters; ``field`` names the field to
			aggregate (an empty string is used when it is missing)
		:return: ``jsonify`` of a dict mapping each bucket key to its
			document count. If the search raises, an ``error`` entry is
			added to the dict instead of the exception propagating.
		"""
		# The declared default is an empty string, which has no ``.get``;
		# treat any falsy value as "no parameters".
		q = q or {}
		field = q.get("field") or ""

		interval = 50  # fixed bucket width
		query = {
			"aggs": {
				"hist_agg": {
					"histogram": {
						"field": field,
						"interval": interval
					}
				}
			}
		}

		d = {}
		try:
			response = es.search(index=app, doc_type=app_type, body=query)
			for tag in response['aggregations']['hist_agg']['buckets']:
				d[tag['key']] = tag['doc_count']
		except TransportError as e:
			d['error'] = e.info
		except Exception as e:
			d['error'] = str(e)
		return jsonify(d)
    def unique_terms(app, app_type=None, q=""):
        """
        Aggregate the number of unique terms in a field.
        Missing values are counted and marked as "N/A".

        .. todo::

                Need to incorporate QueryBuilder library instead of
                manually generating queries.

        Recognised keys of ``q``:

        * ``field`` -- field to search against for unique values
          (an empty string is used when it is missing)
        * ``size`` -- the top size terms returned in the result.
          Default value is 10000.
        * ``min_hits`` -- return terms which have been found in min_hits
          or more documents. Default value is 0.

        :param app: [string] application name
        :param app_type: [string] application type
        :param q: [dict] request parameters described above
        :return: ``jsonify`` of a dict mapping each term to its document
            count. If the search raises, an ``error`` entry is added to
            the dict instead of the exception propagating.
        """
        # The declared default is an empty string, which has no ``.get``;
        # treat any falsy value as "no parameters".
        q = q or {}
        field = q.get("field") or ""
        size = q.get("size") or 10000
        min_hits = q.get("min_hits") or 0

        # Existing debug output, kept; the parenthesised form is valid on
        # both Python 2 and Python 3.
        print(field)
        query = {
            "aggs": {
                "terms_agg": {
                    "terms": {
                        "field": field,
                        "size": size,
                        "min_doc_count": min_hits,
                        "missing": "N/A"
                    }
                }
            }
        }

        d = {}
        try:
            response = es.search(index=app, doc_type=app_type, body=query)
            for tag in response['aggregations']['terms_agg']['buckets']:
                d[tag['key']] = tag['doc_count']
        except TransportError as e:
            d['error'] = e.info
        except Exception as e:
            d['error'] = str(e)
        return jsonify(d)
	def unique_terms(app, app_type=None, q=""):
		"""
		Aggregate the number of unique terms in a field.
		Missing values are counted and marked as "N/A".

		.. todo::

			Need to incorporate QueryBuilder library instead of manually
			generating queries.

		Recognised keys of ``q``:

		* ``field`` -- field to search against for unique values
		  (an empty string is used when it is missing)
		* ``size`` -- the top size terms returned in the result.
		  Default value is 10000.
		* ``min_hits`` -- return terms which have been found in min_hits
		  or more documents. Default value is 0.

		:param app: [string] application name
		:param app_type: [string] application type
		:param q: [dict] request parameters described above
		:return: ``jsonify`` of a dict mapping each term to its document
			count. If the search raises, an ``error`` entry is added to
			the dict instead of the exception propagating.
		"""
		# The declared default is an empty string, which has no ``.get``;
		# treat any falsy value as "no parameters".
		q = q or {}
		field = q.get("field") or ""
		size = q.get("size") or 10000
		min_hits = q.get("min_hits") or 0

		# Existing debug output, kept; the parenthesised form is valid on
		# both Python 2 and Python 3.
		print(field)
		query = {
			"aggs": {
				"terms_agg": {
					"terms": {
						"field": field,
						"size": size,
						"min_doc_count": min_hits,
						"missing": "N/A"
					}
				}
			}
		}

		d = {}
		try:
			response = es.search(index=app, doc_type=app_type, body=query)
			for tag in response['aggregations']['terms_agg']['buckets']:
				d[tag['key']] = tag['doc_count']
		except TransportError as e:
			d['error'] = e.info
		except Exception as e:
			d['error'] = str(e)
		return jsonify(d)
    def terms(app, app_type=None, q=''):
        """
        Group by field (find all elements).

        Builds a ``terms`` aggregation on the requested field and, inside
        each group, a ``top_hits`` sub-aggregation whose ``_source`` is
        limited to the requested segment.

        Recognised keys of ``q``:

        * ``field`` -- field to group by (empty string when missing)
        * ``seg`` -- source field pattern to return for each hit;
          defaults to ``"*"``
        * ``size`` -- maximum number of keys (unique field values);
          defaults to 10000
        * ``numhits`` -- number of logs returned per group; defaults to 10

        :param app: [string] application name, used as the index
        :param app_type: [string] application type, used as doc_type
        :param q: [dict] request parameters described above
        :return: the search response passed through ``jsonify``. Errors
            raised by the search are not caught here.
        """
        # The declared default is an empty string, which has no ``.get``;
        # treat any falsy value as "no parameters".
        q = q or {}
        field = q.get("field") or ""
        seg = q.get("seg") or "*"
        size = q.get("size") or 10000
        numhits = q.get("numhits") or 10

        query = {
            "aggs": {
                "count_by_type": {
                    "terms": {
                        "field": field,
                        "size": size  # maximum number of keys (unique fields)
                    },
                    "aggs": {
                        "top": {  # arbitrary name
                            "top_hits": {
                                "size": numhits,  # number of logs in subgroup
                                "_source": {
                                    # segment on fields - return only
                                    # subgroup based on field
                                    "include": [seg]
                                }
                            }
                        }
                    }
                }
            }
        }

        response = es.search(index=app, doc_type=app_type, body=query)
        return jsonify(response)
	def terms(app, app_type=None, q=''):
		"""
		Group by field (find all elements).

		Builds a ``terms`` aggregation on the requested field and, inside
		each group, a ``top_hits`` sub-aggregation whose ``_source`` is
		limited to the requested segment.

		Recognised keys of ``q``:

		* ``field`` -- field to group by (empty string when missing)
		* ``seg`` -- source field pattern to return for each hit;
		  defaults to ``"*"``
		* ``size`` -- maximum number of keys (unique field values);
		  defaults to 10000
		* ``numhits`` -- number of logs returned per group; defaults to 10

		:param app: [string] application name, used as the index
		:param app_type: [string] application type, used as doc_type
		:param q: [dict] request parameters described above
		:return: the search response passed through ``jsonify``. Errors
			raised by the search are not caught here.
		"""
		# The declared default is an empty string, which has no ``.get``;
		# treat any falsy value as "no parameters".
		q = q or {}
		field = q.get("field") or ""
		seg = q.get("seg") or "*"
		size = q.get("size") or 10000
		numhits = q.get("numhits") or 10

		query = {
			"aggs": {
				"count_by_type": {
					"terms": {
						"field": field,
						"size": size  # maximum number of keys (unique fields)
					},
					"aggs": {
						"top": {  # arbitrary name
							"top_hits": {
								"size": numhits,  # number of logs in subgroup
								"_source": {
									# segment on fields - return only
									# subgroup based on field
									"include": [seg]
								}
							}
						}
					}
				}
			}
		}

		response = es.search(index=app, doc_type=app_type, body=query)
		return jsonify(response)
    def get_applications():
        """
        Fetch all the registered applications in Distill.

        Lists the cluster's indexes and, for each one, counts documents per
        ``_type`` (at most 100 types are requested per index).

        .. note:: Private indexes starting with a period are not included
                  in the result set

        :return: [dict] dictionary mapping each index name to a dict of
            ``{type: document count}``. If a request raises, an ``error``
            entry is added to the dictionary instead of the exception
            propagating; indexes processed before the failure are kept.
        """
        doc = {}
        query = {
            "aggs": {
                "count_by_type": {
                    "terms": {
                        "field": "_type",
                        "size": 100
                    }
                }
            }
        }

        try:
            cluster_status = es.cat.indices(h=["index"], pri=False)

            for line in cluster_status.splitlines():
                idx = line.rstrip()

                # Skip blank lines (an empty index name would not address a
                # single index) and private indexes (like .kibana or .stout).
                if not idx or idx.startswith('.'):
                    continue

                response = es.search(index=idx, body=query)
                buckets = response["aggregations"]["count_by_type"]["buckets"]
                doc[idx] = {tag['key']: tag['doc_count'] for tag in buckets}
        except TransportError as e:
            doc['error'] = e.info
        except Exception as e:
            doc['error'] = str(e)
        return doc
	def get_applications():
		"""
		Fetch all the registered applications in Distill.

		Lists the cluster's indexes and, for each one, counts documents per
		``_type`` (at most 100 types are requested per index).

		.. note:: Private indexes starting with a period are not included
			in the result set

		:return: [dict] dictionary mapping each index name to a dict of
			``{type: document count}``. If a request raises, an ``error``
			entry is added to the dictionary instead of the exception
			propagating; indexes processed before the failure are kept.
		"""
		doc = {}
		query = {
			"aggs": {
				"count_by_type": {
					"terms": {
						"field": "_type",
						"size": 100
					}
				}
			}
		}

		try:
			cluster_status = es.cat.indices(h=["index"], pri=False)

			for line in cluster_status.splitlines():
				idx = line.rstrip()

				# Skip blank lines (an empty index name would not address a
				# single index) and private indexes (like .kibana or .stout).
				if not idx or idx.startswith('.'):
					continue

				response = es.search(index=idx, body=query)
				buckets = response["aggregations"]["count_by_type"]["buckets"]
				doc[idx] = {tag['key']: tag['doc_count'] for tag in buckets}
		except TransportError as e:
			doc['error'] = e.info
		except Exception as e:
			doc['error'] = str(e)
		return doc
	def segment(app, app_type=None, params=''):
		"""
		Fetch documents from an application, optionally matching one field.

		Recognised keys of ``params``:

		* ``q`` -- ``"<field>:<value>"``; turned into a ``match`` query on
		  that field. When absent or empty, a ``match_all`` query is used.
		* ``fields`` -- comma-separated string (or list) of source fields to
		  include in each hit. When absent, ``_source`` is not restricted.
		* ``size`` -- maximum number of hits; defaults to 10.

		:param app: [string] application name, used as the index
		:param app_type: [string] application type, used as doc_type
		:param params: [dict] request parameters described above
		:return: the search response passed through ``jsonify``
		:raises IndexError: if ``q`` is given but contains no ``:``
		"""
		# The declared default is an empty string, which has no ``.get``;
		# treat any falsy value as "no parameters".
		params = params or {}
		q = params.get("q")
		fields = params.get("fields")
		size = params.get("size") or 10

		query = {'size': size}

		if q:
			# Split on the first colon only, so values that themselves
			# contain ':' are kept intact.
			res = q.split(":", 1)
			key = res[0]
			val = res[1]
			query['query'] = {"match": {key: val}}
		else:
			query['query'] = {"match_all": {}}

		if fields:
			if not isinstance(fields, (list, tuple)):
				fields = fields.split(",")
			query['_source'] = {"include": list(fields)}

		response = es.search(index=app, doc_type=app_type, body=query)

		return jsonify(response)
    # NOTE(review): many lines of this function are missing from the file as
    # it stands (truncated signature, unquoted docstring, unclosed literals,
    # calls without their arguments, branches without their ``else``). The
    # notes below mark each gap that is visible in the code; the code itself
    # is left untouched.
    def generate_graph(app,
                       # NOTE(review): list literal as a default is a single
                       # object shared by all calls; it is only read below.
                       time_range=['now-1h', 'now'],
        # NOTE(review): the parameter list is cut off here. The body reads
        # app_type, log_type, events, targets and size, none of which are
        # assigned in the visible code, so they are presumably parameters.
        Return all elements from an application, possible matching against
        a specific event type (e.g. click, mouseover, etc)
        # @TODO ref_url filter

        # Event types excluded from the search; used as the bool query's
        # "must_not" clause further down.
        # NOTE(review): the closing braces/bracket of this literal are missing.
        must_not_query = [{
            "term": {
                "type": "mousedown"
        }, {
            "term": {
                "type": "mouseup"

        # Restricts the search to documents whose logType equals log_type;
        # used as the bool query's "filter" clause further down.
        # NOTE(review): the opening brace of the list item and all closers
        # are missing.
        filter_query = [
                "term": {
                    "logType": log_type

        # Filtering
        # NOTE(review): both lists stay empty in the visible code and the
        # query keys that would use them are commented out below.
        should_query = []
        must_query = []

        # Include these events in the request
        # NOTE(review): include_events is built but not used afterwards in
        # the visible code.
        if events:
            include_events = {"terms": {"type": events}}

        # targets is indexed as a pair: element 0 feeds the "include" terms
        # clause, element 1 is iterated as the targets to remove.
        target_in = targets[0]
        target_out = targets[1]

        # NOTE(review): include_targets is built but not used afterwards in
        # the visible code.
        if target_in:
            include_targets = {"terms": {"target": target_in}}


        # Remove these elementids from result set
        # NOTE(review): each term clause is assigned to ``res`` and then
        # overwritten by the next iteration; nothing collects them here.
        for target in target_out:
            res = {"term": {"target": target}}

        # Finish off should query
        # must_query.append({"bool": {"should": should_query}})

        # Sort By Time
        sort_query = [{"clientTime": {"order": "asc"}}]

        # Timestamp range - date math
        # NOTE(review): the literal is unclosed, and timestamp_query is not
        # referenced by the main query in the visible code.
        timestamp_query = {
            "range": {
                "@timestamp": {
                    "gte": time_range[0],
                    "lte": time_range[1]

        # Aggregations requested alongside the search, keyed by name.
        agg_query = dict()

        # Get all unique sessions
        session_query = {"terms": {"field": "sessionID", "min_doc_count": 1}}

        agg_query['sessions'] = session_query

        # Generating all top targets and breakdowns by type, including path_length
        # NOTE(review): closing braces between the nested levels are missing.
        target_query = {
            "terms": {
                "field": "target",
                "min_doc_count": 1,
                "size": size
            "aggs": {
                "events": {
                    "terms": {
                        "field": "type",
                        "min_doc_count": 1,
                        "size": size
                "top_target": {
                    "top_hits": {
                        "script_fields": {
                            "path_length": {
                                "script": {
                                    "lang": "painless",
                                    "inline": "doc['path'].length;"
                        "size": 1

        agg_query['targets'] = target_query

        # Main query
        # NOTE(review): closing braces between the nested levels are missing.
        query = {
            "sort": sort_query,
            "query": {
                "bool": {
                    # "must": must_query,
                    # "should": should_query,
                    "filter": filter_query,
                    "must_not": must_not_query,
                    # "minimum_should_match": len(should_query) - 1
            "_source": {
                "includes": ['*'],
            # Script field computed per hit; read back below as
            # elem['fields']['path_length'].
            "script_fields": {
                "path_length": {
                    "script": {
                        "lang": "painless",
                        "inline": "doc['path'].length;"
            "aggregations": agg_query

        # return query
        # Process Aggregate Results
        # size=0: this first request is only for the aggregation buckets.
        response = es.search(app, doc_type=app_type, body=query, size=0)
        # Only want to look at aggregations
        sessions = response['aggregations']['sessions']['buckets']
        # allSessions = { x['key']: [] for x in sessions }
        # intervalSessions = { x['key']: [] for x in sessions }

        # Deal with bar chart
        allTargets = response['aggregations']['targets']['buckets']

        # Re-execute query to get all hits
        # NOTE(review): the arguments after ``es`` and the closing parenthesis
        # are missing. ``iter`` also shadows the builtin of the same name.
        iter = helpers.scan(es,

        # Hits grouped by their sessionID.
        allSessions = dict()
        # Store all hits in the user's bucket.
        for elem in iter:
            data = elem['_source']
            # Copy the script-field value onto the document itself.
            data['pathLength'] = elem['fields']['path_length'][0]
            if 'sessionID' in data:
                sessionID = data['sessionID']
                # NOTE(review): as written, a key is only assigned when it is
                # already present, so an empty dict never gains entries;
                # presumably an append / else pair lost its lines - confirm.
                if sessionID in allSessions:
                    allSessions[sessionID] = [data]

        # This fixed sequence/interval logging that what was produced in
        # UserALE.js v 0.2.0
        # Possible to remove self-loops here as well (html->html->html->window) := (html->window)
        intervalSessions = dict()
        for sessionID in allSessions:
            data = allSessions[sessionID]
            newData = []
            intervalLog = []
            # Consecutive (current, following) log pairs of this session.
            pairs = zip(data, data[1:])

            # NOTE(review): ``next`` shadows the builtin of the same name.
            for curr, next in pairs:
                target1 = curr['target']
                event1 = curr['type']
                target2 = next['target']
                event2 = next['type']
                if target1 != target2:  # ignore self-loops
                    targetChange = int(True)
                    eventChange = int(False)
                    if event1 != event2:
                        eventChange = int(True)

                    # Starting over no matter what
                    # Based off of curr, update the log
                    curr['targetChange'] = targetChange
                    curr['typeChange'] = eventChange
                    curr['intervalCount'] = len(
                        intervalLog)  # some number maybe 0
                    # NOTE(review): the subtraction's right-hand side and the
                    # ``else:`` line are missing, so the backslash continues
                    # into the next assignment. ``intervalLog[-1:]`` is a
                    # slice (a list), which cannot be indexed by a string key.
                    if len(intervalLog) >= 2:
                        # Calculate duration
                        curr['duration'] = intervalLog[-1:]['clientTime'] - \
                        curr['duration'] = 0
                    intervalLog = []
                # else:
                #     # They are the same
                #     targetChange = int(False)
                #     eventChange = int(False)
                #     if event1 != event2:
                #         eventChange = int(True)
                #         # starting over
                #         curr['targetChange'] = targetChange
                #         curr['typeChange'] = eventChange
                #         curr['intervalCount'] = len(intervalLog)
                #         # if len(intervalLog) >= 2:
                #         #     # Calculate duration
                #         #     curr['duration'] = intervalLog[-1:]['clientTime'] - \
                #         #                        intervalLog[0]['clientTime']
                #         # else:
                #         #     curr['duration'] = 0
                #         newData.append(curr)
                #         intervalLog = []
                #     else:
                #         # increase counter
                #         intervalLog.append(curr)
            # NOTE(review): nothing appends to newData in the visible code, so
            # every session maps to an empty list here.
            intervalSessions[sessionID] = newData

        # return intervalSessions
        newSessions = []

        # Generate all edges tied to a user
        # [ edge list, edge list, ... ]
        # NOTE(review): ``pairwise`` is not defined in this block, and the
        # result is not added to newSessions in the visible code.
        for k, v in intervalSessions.items():
            pairs = pairwise(v)  # list of edges for a user

        # Node Map
        node_list = []  # Need to keep 0-based index for sankey diagram
        links = []  # Aggregate sequence list
        node_map = []  # Final node map {"name": "foo", "id": 0"}

        # Align the sequences
        # Step i of the alignment holds the i-th edge of every session, padded
        # with None for sessions that have fewer edges.
        # NOTE(review): izip_longest exists only in Python 2's itertools
        # (Python 3 names it zip_longest).
        alignment = itertools.izip_longest(*newSessions)
        # Node name -> index into node_map, for the source side and the
        # target side of the current step.
        src_ids = {}
        target_ids = {}

        for i, step in enumerate(alignment):
            # print(i)
            c = collections.Counter()
            visitedLinks = []
            # visitedLinksUnique = set([])
            nodenames = set([])

            for edge in step:  # for a single step look at all links
                if edge:
                    node1 = edge[0]
                    node2 = edge[1]
                    session = node1['sessionID']
                    nodename1 = node1['target']
                    nodename2 = node2['target']

                    seqID = '%s->%s' % (nodename1, nodename2)

                    if nodename1 != nodename2:  #double check again for self-loops
                        # NOTE(review): the link literal is truncated - only a
                        # dangling conditional tail remains - and nothing adds
                        # the link to visitedLinks in the visible code. The
                        # code below reads the keys sequenceID, sourceName and
                        # targetName from each link.
                        link = {
                            if node1['path'] is not None else 0,

            # Done with visits in a step. Now calculate counts
            counts = collections.Counter(k['sequenceID'] for k in visitedLinks
                                         if k.get('sequenceID'))
            # print(counts)
            # One link per distinct sequenceID (the last one seen wins).
            visitedLinksUnique = {v['sequenceID']: v
                                  for v in visitedLinks}.values()
            # print(visitedLinksUnique)

            # Visit unique links and generate src/targetid
            # First step: register both the source and target node of every
            # link.
            if len(node_map) == 0:
                for link in visitedLinksUnique:
                    # Add all sources
                    if link['sourceName'] not in src_ids:
                        node_map.append({"name": link['sourceName']})
                        src_ids[link['sourceName']] = len(node_map) - 1

                    # Add all targets
                    if link['targetName'] not in target_ids:
                        node_map.append({"name": link['targetName']})
                        target_ids[link['targetName']] = len(node_map) - 1

                # NOTE(review): the lines below sit inside the ``if`` branch as
                # written, which would discard the ids just assigned; they
                # presumably belonged to an ``else:`` whose line is missing -
                # confirm.
                src_ids = target_ids  # sources were previous targets
                target_ids = {}
                for link in visitedLinksUnique:
                    # Add all sources
                    # if link['sourceName'] not in src_ids.values():
                    #     node_map.append(link['sourceName'])
                    #     src_ids[len(node_map)-1] = link['sourceName']

                    # Add all targets
                    if link['targetName'] not in target_ids:
                        node_map.append({"name": link['targetName']})
                        target_ids[link['targetName']] = len(node_map) - 1

            # NOTE(review): the links are annotated here but never appended to
            # ``links`` in the visible code.
            for link in visitedLinksUnique:
                # Perform lookup for ids
                # Perform lookup for counts
                link['source'] = src_ids[link['sourceName']]
                link['target'] = target_ids[link['targetName']]
                link['value'] = counts[link['sequenceID']]


        # for step in alignment:
        #     # step through every users sequence
        #     c = collections.Counter()
        #     visitedLinks = []
        #     nodenames = set([])
        #     # Process all the edges
        #     for edge in step:
        #         if edge:
        #             node1 = edge[0]
        #             node2 = edge[1]
        #             nodename1 = node1['target']
        #             nodename2 = node2['target']
        #             # Add src and targetids
        #             nodenames.add(nodename1)
        #             nodenames.add(nodename2)
        #             # Generate sequence ID
        #             seqID = '%s->%s' % (nodename1, nodename2)
        #             # @todo Ensure src and target are not the same (self-loop)
        #             if nodename1 != nodename2:
        #                 link = {
        #                     'sequenceID': seqID,
        #                     'sourceName': nodename1,
        #                     'targetName': nodename2,
        #                     'type': node1['type'],
        #                     # 'duration': node1['duration'],
        #                     'pathLength': len(node1['path']),
        #                     'targetChange': node1['targetChange'],
        #                     'typeChange': node1['typeChange']
        #                 }
        #                 visitedLinks.append(link)
        #     # How many users visited a sequence at this step
        #     counts = collections.Counter(k['sequenceID'] for k in visitedLinks if k.get('sequenceID'))
        #     # print(counts)
        #     # Append into growing node_list
        #     map(lambda x: node_list.append(x), nodenames)
        #     # map(lambda x: node_map.append({ "name": x}
        #     #                                 "id": len(node_list) - 1 - node_list[::-1].index(x)}), nodenames)
        #     map(lambda x: node_map.append({ "name": x}), nodenames)
        #                                     # "id": len(node_list) - 1 - node_list[::-1].index(x)}), nodenames)
        #     for v in visitedLinks:
        #         # Pass through and update count, also generate src and target id
        #         v['value'] = counts[v['sequenceID']]
        #         # Last occurence is the src and target id
        #         v['source'] = len(node_list) -1 - node_list[::-1].index(v['sourceName'])
        #         v['target'] = len(node_list) -1 - node_list[::-1].index(v['targetName'])
        #         links.append(v)

        # Save everything
        # Result keys: 'histogram' (from the targets aggregation buckets),
        # 'nodes', 'links' and 'sessions' (the sessions aggregation buckets).
        # NOTE(review): ``generate_bargraph`` is not defined in this block.
        res = dict()
        res['histogram'] = generate_bargraph(allTargets)
        # res['sankey'] = {
        #     # 'sessions': sessions,
        #     'links': links,
        #     'nodes': node_map
        # }

        res['nodes'] = node_map
        res['links'] = links
        res['sessions'] = sessions
        # with open('sankey.json', 'w') as outfile:
        #     json.dump(res, outfile, sort_keys=False, indent=4)

        # with open('data.txt', 'w') as outfile:
        #     json.dump(intervalSessions, outfile, indent=4, sort_keys=False)
        # with open('query.json', 'w') as outfile:
        #     json.dump(query, outfile, indent=4, sort_keys=False)
        # Iterate first to get nodes
        # pairs = pairwise(iter)
        # nodes = []
        # links = []

        # for p in pairs:
        #     node1 = p[0]['_source']
        #     node2 = p[1]['_source']

        #     # Append nodes to list
        #     nodes.append(node1['target'])
        #     nodes.append(node2['target'])

        # Iterate again to get edges
        # pairs = pairwise(iter2)

        # srcID = targetID = None
        # for p in pairs:
        #     node1 = p[0]['_source']
        #     node2 = p[1]['_source']
        #     # Append nodes to list
        #     nodes.append(node1['target'])
        #     # nodes.append(node2['target'])
        #     srcID = len(nodes) - 1
        #     targetID = len(nodes)
        #     # if (node1['target'] != node2['target']):
        #     # Append links to list (remove self-loops)
        #     link = {
        #         'sourceID': srcID,
        #         'targetID': targetID,
        #         'sourceName': node1['target'],
        #         'targetName': node2['target'],
        #         'type': node1['type'],
        #         'duration': node1['duration'],
        #         'value': node1['count'],
        #         'pathLength': len(node1['path']),
        #         'targetChange': int(node1['targetChange']),
        #         'typeChange': int(node1['typeChange'])
        #     }
        #     links.append(link)
        # # Get all unique nodes
        # # node_names = np.unique(nodes).tolist()
        # node_list = []
        # for indx, name in enumerate(nodes):
        #     n = {'id': indx, 'name': name}
        #     node_list.append(n)
        # # Remove self-loops
        # newLinks = []
        # for indx, elem in enumerate(links):
        #     srcID = elem['sourceID']
        #     targetID = elem['targetID']
        #     if srcID != targetID:
        #         newLinks.append(elem)

        return res