Example #1
    def __init__(self):
        self._config = DEFAULT_REPORT_CONFIGURATION

        self.es = MetricsElasticSearch()
        self.es.connect()

        self.logger = logging.getLogger('metrics_reporting_service.' + __name__)
        self.logger.setLevel(logging.DEBUG)

        # create file handler which logs even debug messages
        fh = logging.FileHandler('./reports/reports.log')
        fh.setLevel(logging.DEBUG)

        # create console handler with a higher log level
        ch = logging.StreamHandler()
        ch.setLevel(logging.ERROR)

        # create formatter and add it to the handlers
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        fh.setFormatter(formatter)
        ch.setFormatter(formatter)

        # add the handlers to the logger
        self.logger.addHandler(fh)
        self.logger.addHandler(ch)
    async def _update_tags(tag_label, read_event_entry, session):
        logger = getESSyncLogger(name="es_eventlog")
        metrics_elastic_search = MetricsElasticSearch()
        metrics_elastic_search.connect()
        eventSuccessCount = 0
        eventFailCount = 0
        writeNotNeeded = 0
        entry_id = None

        if "_id" in read_event_entry:
            entry_id = read_event_entry["_id"]

        try:
            tags_array = []

            if entry_id is None:
                logger.error("Cannot update entry without ID")
                return eventSuccessCount, eventFailCount, writeNotNeeded

            if "_index" in read_event_entry:
                read_event_entry_index = read_event_entry["_index"]
            else:
                logger.error("Cannot update entry: " + entry_id)
                return eventSuccessCount, eventFailCount, writeNotNeeded

            # set up the URL
            index_update_url = "http://localhost:9200/%s/_doc/%s/_update" % (
                read_event_entry_index, entry_id)

            headers = {}
            headers["Content-Type"] = "application/json"

            if (tag_label is not None):

                index_update_body = {
                    "script": {
                        "source": "ctx._source.tags.add(params.tag)",
                        "lang": "painless",
                        "params": {
                            "tag": tag_label
                        }
                    }
                }

                async with session.post(index_update_url,
                                        data=json.dumps(index_update_body),
                                        headers=headers) as response:
                    response_text = await response.text()
                    if response.status == 200:
                        eventSuccessCount += 1

        except Exception as e:
            eventFailCount += 1
            logger.error(e)
            logger.info("Error occurred for entry id: " + str(entry_id))
            logger.info("%s %s %s", eventSuccessCount, eventFailCount,
                        writeNotNeeded)
        return eventSuccessCount, eventFailCount, writeNotNeeded
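A minimal driver sketch for _update_tags, assuming it is importable as a coroutine; the event document, index name, and tag label below are illustrative placeholders, not real index contents.

import asyncio
import aiohttp

async def tag_one_event():
    # hypothetical search hit; real entries come from an eventlog-* query
    read_event_entry = {"_id": "example-event-id", "_index": "eventlog-0"}
    async with aiohttp.ClientSession() as session:
        return await _update_tags("example-tag", read_event_entry, session)

# asyncio.run(tag_one_event())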
Example #3
    def get_unique_pids(self, start_date, end_date, node, doi=False):
        """
        Queries ES for the given time period and returns the set of pids
        :param start_date:
        :param end_date:
        :param node:
        :return: List of unique pids for the given time range
        """
        metrics_elastic_search = MetricsElasticSearch()
        metrics_elastic_search.connect()
        pid_list = []
        unique_pids = []
        query = [
            {
                "terms": {
                    "formatType": [
                        "METADATA"
                    ]
                }
            },
            {

                "term": {"event.key": "read"}
            },
            {
                "term": {"nodeId": node}
            },
            {
                "exists": {
                    "field": "sessionId"
                }
            }
        ]

        # Just search for DOI string in to send it to the HUB
        if doi:
            DOIWildcard = {"wildcard": {"pid.key": "*doi*"}}
            query.append(DOIWildcard)

        fields = "pid"
        results, total = metrics_elastic_search.getSearches(
            limit=1000000,
            q=query,
            date_start=datetime.strptime(start_date, '%m/%d/%Y'),
            date_end=datetime.strptime(end_date, '%m/%d/%Y'),
            fields=fields)

        for i in range(total):
            pid_list.append(results[i]["pid"])

        for i in pid_list:
            if i not in unique_pids:
                unique_pids.append(i)

        return (unique_pids)
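A hedged usage sketch; it assumes this method lives on the MetricsReporter class shown in Example #14, and the node identifier and date range below are placeholders.

reporter = MetricsReporter()
doi_pids = reporter.get_unique_pids("01/01/2021", "01/31/2021", "urn:node:KNB", doi=True)
print(len(doi_pids), "unique DOI pids found")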
def updatePortalEpungePIDs(seriesId, portal_DIF):
    """
    Removes the seriesID from events for PIDs that are no longer part of the portal
    :param seriesId:
    :param portal_DIF:
    :return:
    """
    epunge_list = []
    current_portal_PIDs = []
    logger = getESSyncLogger(name="es_eventlog")
    t_start = time.time()

    logger.info("Beginning updatePortalEpungePIDs")

    metrics_elastic_search = MetricsElasticSearch()
    metrics_elastic_search.connect()

    # set up the query
    query = {
        "term": {
            "portalIdentifier.keyword": seriesId
        },
    }

    try:
        results = metrics_elastic_search.getSearches(index="eventlog-*",
                                                     q=query,
                                                     fields="pid",
                                                     limit=9999999)
        if results is not None:
            current_portal_PIDs = results[0]

    except Exception as e:
        logger.error("Exception occured while retrieving expunge PIDs")

    for id in current_portal_PIDs:
        if id not in portal_DIF:
            epunge_list.append(id)

    t_delta = time.time() - t_start
    logger.info("Completed check for " + seriesId)
    logger.info("Length of expunge list " + str(len(epunge_list)))
    logger.info('updatePortalEpungePIDs:t1=%.4f', t_delta)

    return epunge_list
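One plausible way to apply the expunge list is to hand it to the async updateRecords coroutine from Example #13 with operation="remove"; a hedged sketch, assuming an aiohttp session:

import asyncio
import aiohttp

async def expunge_series(seriesId, portal_DIF):
    # remove the seriesId from events whose PIDs are no longer in the portal
    epunge_list = updatePortalEpungePIDs(seriesId, portal_DIF)
    async with aiohttp.ClientSession() as session:
        return await updateRecords(seriesId=seriesId,
                                   pid_sub_list=epunge_list,
                                   session=session,
                                   operation="remove")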
def getPIDRecords(pid_sub_list, seriesId, operation):
    """
    Queries ES and retrieves records from the index for a given PID
    :return:
    """
    metrics_elastic_search = MetricsElasticSearch()
    metrics_elastic_search.connect()
    logger = getESSyncLogger(name="es_eventlog")

    # set up the query
    query = {"terms": {"pid.key": pid_sub_list}}

    must_not_query = None
    if operation == "add":
        must_not_query = {"terms": {"portalIdentifier.keyword": [seriesId]}}

    return metrics_elastic_search.getRawSearches(index="eventlog-*",
                                                 q=query,
                                                 must_not_q=must_not_query,
                                                 limit=9999999)
def getAdminSubjects():
    """
    Returns events recorded for the DataONE admin subjects from the ES index
    :return:
    """
    metrics_elastic_search = MetricsElasticSearch()
    metrics_elastic_search.connect()
    logger = getESSyncLogger(name="es_eventlog")
    logger.info("Getting the admin subjects")

    query = [{
        "exists": {
            "field": "sessionId"
        }
    }, {
        "terms": {
            "subject.key": DATAONE_ADMIN_SUBJECTS
        }
    }]

    return metrics_elastic_search.getRawSearches(index="eventlog-*",
                                                 q=query,
                                                 limit=1000000)
    def getMetricsPerRepository(self, nodeId):
        """
        Retrieves the metrics stats per repository
        Uses NodeID as repository ID
        :param nodeId: Repository identifier to look up the metrics in the ES
        :return:
            Formatted Metrics Response object in JSON format
        """

        # Basic init for required objects
        t_start = time.time()
        metrics_elastic_search = MetricsElasticSearch()
        metrics_elastic_search.connect()

        t_delta = time.time() - t_start
        self.logger.debug('getMetricsPerRepository:t1=%.4f', t_delta)

        # defining the ES search and aggregation body for Repository profile
        search_body = [{
            "term": {
                "event.key": "read"
            }
        }, {
            "term": {
                "nodeId": nodeId
            }
        }, {
            "exists": {
                "field": "sessionId"
            }
        }, {
            "terms": {
                "formatType": ["DATA", "METADATA"]
            }
        }]

        aggregation_body = {
            "pid_list": {
                "composite": {
                    "size":
                    100,
                    "sources": [{
                        "format": {
                            "terms": {
                                "field": "formatType"
                            }
                        }
                    }, {
                        "month": {
                            "date_histogram": {
                                "field": "dateLogged",
                                "interval": "month"
                            }
                        }
                    }]
                }
            }
        }
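        # iterate_composite_aggregations is assumed to page through the composite buckets
        # (following each response's "after_key") until every (formatType, month) combination
        # in the requested date range has been consumed.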

        # we have events in ES from July 2012
        # TODO : Set this to the initial repository onboarding date
        # i.e. the first dataset submission date for that repository
        start_date = "07/01/2012"
        end_date = datetime.today().strftime('%m/%d/%Y')

        # Query the ES with the designed Search and Aggregation body
        # uses the start_date and the end_date for the time range of data retrieval
        data = metrics_elastic_search.iterate_composite_aggregations(
            search_query=search_body,
            aggregation_query=aggregation_body,
            start_date=datetime.strptime(start_date, '%m/%d/%Y'),
            end_date=datetime.strptime(end_date, '%m/%d/%Y'))

        t_delta = time.time() - t_start
        self.logger.debug('getMetricsPerRepository:t3=%.4f', t_delta)
        return (self.formatMetricsPerRepository(data, nodeId, start_date,
                                                end_date))
    def getSummaryMetricsPerCatalog(self, requestPIDArray, a_type):
        """
        Queries the Elastic Search and retrieves the summary metrics for a given DataCatalog pid Array.
        This information is used to populate the DataCatalog and Search pages.
        :param requestPIDArray: Array of PIDs of datasets on DataCatalog page or Search page
        :return:
        """
        t_0 = time.time()
        self.logger.debug("enter getSummaryMetricsPerCatalog")
        catalogPIDs = {}
        combinedPIDs = []
        for i in requestPIDArray:
            if i not in catalogPIDs:
                catalogPIDs[i] = [i]

        self.catalogPIDs = catalogPIDs

        return_dict = {}

        self.logger.debug("getSummaryMetricsPerCatalog #004")
        if a_type == "catalog":
            return_dict = pid_resolution.getResolvePIDs(catalogPIDs)
        elif a_type == "package":
            return_dict = pid_resolution.getObsolescenceChain(catalogPIDs)
        #    PIDs = self.resolvePackagePIDs([PID, ], req_session=req_session)
        #return_dict[PID] = PIDs

        #for pid in catalogPIDs:
        #    self.logger.debug("getSummaryMetricsPerCatalog #004.5 pid=%s", pid)
        #    self.resolveCatalogPID(return_dict, a_type, pid, req_session=req_session)
        self.logger.debug("getSummaryMetricsPerCatalog #005: %s",
                          str(return_dict))

        for i in catalogPIDs:
            catalogPIDs[i] = return_dict[i]

        for i in catalogPIDs:
            combinedPIDs.extend(catalogPIDs[i])

        aggregatedPIDs = {}
        for i in catalogPIDs:
            aggregatedPIDs[i] = {
                "filters": {
                    "filters": {
                        "pid.key": {
                            "terms": {
                                "pid.key": catalogPIDs[i]
                            }
                        }
                    }
                }
            }
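        # Each aggregatedPIDs entry is a "filters" sub-aggregation matching any PID in that
        # catalog PID's resolved family, so event counts roll up to the catalog-level identifier.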

        # Setting the query for the data catalog page
        metrics_elastic_search = MetricsElasticSearch()
        metrics_elastic_search.connect()
        search_body = [{
            "term": {
                "event.key": "read"
            }
        }, {
            "terms": {
                "pid.key": combinedPIDs
            }
        }, {
            "exists": {
                "field": "sessionId"
            }
        }, {
            "terms": {
                "formatType": ["DATA", "METADATA"]
            }
        }]
        aggregation_body = {
            "pid_list": {
                "composite": {
                    "sources": [{
                        "format": {
                            "terms": {
                                "field": "formatType"
                            }
                        }
                    }]
                },
                "aggs": aggregatedPIDs
            }
        }

        start_date = "01/01/2012"
        end_date = datetime.today().strftime('%m/%d/%Y')

        data = metrics_elastic_search.iterate_composite_aggregations(
            search_query=search_body,
            aggregation_query=aggregation_body,
            start_date=datetime.strptime(start_date, '%m/%d/%Y'),
            end_date=datetime.strptime(end_date, '%m/%d/%Y'))

        # return {}, {}
        # return data, return_dict
        self.logger.debug("exit getSummaryMetricsPerCatalog, duration=%fsec",
                          time.time() - t_0)
        return (self.formatDataPerCatalog(data, catalogPIDs))
    def getSummaryMetricsPerDataset(self, PIDs):
        """
        Queries the Elastic Search and retrieves the summary metrics for a given dataset.
        This information is used to populate the dataset landing pages.
        :param PIDs:
        :return: A dictionary containing lists of all the facets specified in the metrics_request
        """
        t_start = time.time()
        metrics_elastic_search = MetricsElasticSearch()
        metrics_elastic_search.connect()
        PIDDict = pid_resolution.getResolvePIDs(PIDs)
        PIDs = PIDDict[PIDs[0]]
        t_delta = time.time() - t_start
        self.logger.debug('getSummaryMetricsPerDataset:t1=%.4f', t_delta)

        obsoletes_dict = pid_resolution.getObsolescenceChain(PIDs, max_depth=1)

        t_delta = time.time() - t_start
        self.logger.debug('getSummaryMetricsPerDataset:t2=%.4f', t_delta)

        aggregatedPIDs = {}
        for i in PIDs:
            aggregatedPIDs[i] = {
                "filters": {
                    "filters": {
                        "pid.key": {
                            "term": {
                                "pid.key": i
                            }
                        }
                    }
                }
            }

        search_body = [{
            "term": {
                "event.key": "read"
            }
        }, {
            "terms": {
                "pid.key": PIDs
            }
        }, {
            "exists": {
                "field": "sessionId"
            }
        }, {
            "terms": {
                "formatType": ["DATA", "METADATA"]
            }
        }]
        aggregation_body = {
            "pid_list": {
                "composite": {
                    "size":
                    100,
                    "sources": [{
                        "country": {
                            "terms": {
                                "field": "geoip.country_code2.keyword",
                                "missing_bucket": "true"
                            }
                        }
                    }, {
                        "format": {
                            "terms": {
                                "field": "formatType"
                            }
                        }
                    }]
                },
                "aggs": aggregatedPIDs
            },
            "package_pid_list": {
                "composite": {
                    "sources": [{
                        "format": {
                            "terms": {
                                "field": "formatType"
                            }
                        }
                    }]
                },
                "aggs": aggregatedPIDs
            }
        }
        # pid = self.response["metricsRequest"]["filterBy"][0]["values"]
        self.response["metricsRequest"]["filterBy"][0]["values"] = PIDs
        self.request["filterBy"][0]["values"] = self.response[
            "metricsRequest"]["filterBy"][0]["values"]

        start_date = "01/01/2000"
        end_date = datetime.today().strftime('%m/%d/%Y')

        if len(self.response["metricsRequest"]["filterBy"]) > 1:
            date_filter = self.response["metricsRequest"]["filterBy"][1]
            if (date_filter["filterType"] == "month"
                    and date_filter["interpretAs"] == "range"):
                start_date = date_filter["values"][0]
                end_date = date_filter["values"][1]

            monthObject = {
                "month": {
                    "date_histogram": {
                        "field": "dateLogged",
                        "interval": "month"
                    }
                }
            }
            aggregation_body["pid_list"]["composite"]["sources"].append(
                monthObject)
        data = metrics_elastic_search.iterate_composite_aggregations(
            search_query=search_body,
            aggregation_query=aggregation_body,
            start_date=datetime.strptime(start_date, '%m/%d/%Y'),
            end_date=datetime.strptime(end_date, '%m/%d/%Y'))

        obsoletesDictionary = {k: str(v) for k, v in obsoletes_dict.items()}

        t_delta = time.time() - t_start
        self.logger.debug('getSummaryMetricsPerDataset:t3=%.4f', t_delta)
        return (self.formatDataPerDataset(data, PIDs, obsoletesDictionary))
Example #10
def getAsyncPortalDatasetIdentifierFamilyByBatches(portal_pids):
    """
    Resolves the Portal Dataset Identifier Family asynchronously.
    """
    # Basic init for required objects
    _L, t_0 = _getLogger()
    metrics_elastic_search = MetricsElasticSearch()
    metrics_elastic_search.connect()

    t_delta = time.time() - t_0
    _L.debug('getAsyncPortalDatasetIdentifierFamily:t1=%.4f', t_delta)
    batch_size = BATCH_SIZE
    datasetIdentifierFamily = set()
    datasetIdentifierFamilyList = []
    datasetIdentifierFamily.update(portal_pids)
    totalDatasetIdentifierFamilySize = 0

    def _fetch(batch_portal_pids):

        results = {}
        search_query = {
            "bool": {
                "must": [{
                    "terms": {
                        "PID.keyword": batch_portal_pids
                    }
                }]
            }
        }
        # Try searching the identifiers index for the datasetIdentifierFamily
        results = metrics_elastic_search.getDatasetIdentifierFamily(
            search_query=search_query,
            index="identifiers-2",
            max_limit=1000000)

        return results

    async def _work(portal_pids):
        with concurrent.futures.ThreadPoolExecutor(
                max_workers=CONCURRENT_REQUESTS) as executor:
            loop = asyncio.get_event_loop()
            tasks = []

            for num in range(0, len(portal_pids), batch_size):
                _L.info("retrieving batch : " + str(num) + " to " +
                        str(num + batch_size))
                batch_portal_pids = portal_pids[num:num + batch_size]
                tasks.append(
                    loop.run_in_executor(executor, _fetch, batch_portal_pids))

            for response in await asyncio.gather(*tasks):
                _L.info(response)
                datasetIdentifierFamily_results.append(response)
                for i in response[0]:
                    datasetIdentifierFamily.update(
                        i["datasetIdentifierFamily"])

    _L.debug("Enter")
    datasetIdentifierFamily_results = []
    # In a multithreading environment such as under gunicorn, the new thread created by
    # gevent may not provide an event loop. Create a new one if necessary.
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError as e:
        _L.info("Creating new event loop.")
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

    future = asyncio.ensure_future(_work(portal_pids))
    loop.run_until_complete(future)
    _L.debug("elapsed:%fsec", time.time() - t_0)

    for response in datasetIdentifierFamily_results:
        totalDatasetIdentifierFamilySize += response[1]

    return (datasetIdentifierFamily, totalDatasetIdentifierFamilySize)
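A hedged usage sketch; the portal PIDs below are placeholders, and BATCH_SIZE / CONCURRENT_REQUESTS are assumed to be module-level constants.

# hypothetical portal collection PIDs
portal_pids = ["doi:10.5063/EXAMPLE/1", "doi:10.5063/EXAMPLE/2"]
family, family_size = getAsyncPortalDatasetIdentifierFamilyByBatches(portal_pids)
print("resolved identifier family of size", family_size)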
Example #11
    def get_report_datasets(self, start_date, end_date, unique_pids, node):
        """
        Builds the list of dataset report objects for the given node and time period.
        :param start_date:
        :param end_date:
        :param unique_pids:
        :param node:
        :return: List of dataset objects for the report
        """
        metrics_elastic_search = MetricsElasticSearch()
        metrics_elastic_search.connect()
        report_datasets = []
        pid_list = []


        count = 0
        nodeName = self.resolve_MN(node)
        self.logger.debug("Processing " + str(len(unique_pids)) + " datasets for node " + node)
        for pid in unique_pids:
            count = count + 1
            if (count % 100 == 0) or (count == 1) or (count == len(unique_pids)):
                self.logger.debug(str(count) + " of "  + str(len(unique_pids)))


            dataset = {}
            solr_response = self.query_solr(pid)
            if(solr_response["response"]["numFound"] > 0):

                if ("title" in (i for i in solr_response["response"]["docs"][0])):
                    dataset["dataset-title"] = solr_response["response"]["docs"][0]["title"]
                else:
                    dataset["dataset-title"] = ""

                if ("authoritativeMN" in (i for i in solr_response["response"]["docs"][0])):
                    dataset["publisher"] = self.resolve_MN(solr_response["response"]["docs"][0]["authoritativeMN"])
                else:
                    dataset["publisher"].append(
                        {"type": "urn", "value": nodeName})

                if ("authoritativeMN" in (i for i in solr_response["response"]["docs"][0])):
                    dataset["publisher-id"] = []
                    dataset["publisher-id"].append({"type":"urn", "value" :solr_response["response"]["docs"][0]["authoritativeMN"]})
                else:
                    dataset["publisher-id"].append(
                        {"type": "urn", "value": node})

                dataset["platform"] = "DataONE"

                if ("origin" in (i for i in solr_response["response"]["docs"][0])):
                    contributors = []
                    for i in solr_response["response"]["docs"][0]["origin"]:
                        contributors.append({"type": "name", "value": i})
                    dataset["dataset-contributors"] = contributors

                if ("datePublished" in (i for i in solr_response["response"]["docs"][0])):
                    dataset["dataset-dates"] = []
                    dataset["dataset-dates"].append({"type": "pub-date", "value" :solr_response["response"]["docs"][0]["datePublished"][:10]})
                else:
                    dataset["dataset-dates"] = []
                    dataset["dataset-dates"].append({"type": "pub-date", "value" :solr_response["response"]["docs"][0]["dateUploaded"][:10]})

                if "doi" in pid:
                    dataset["dataset-id"] = [{"type": "doi", "value": pid}]
                else:
                    continue
                    # dataset["dataset-id"] = [{"type": "other-id", "value": pid}]

                dataset["yop"] = dataset["dataset-dates"][0]["value"][:4]

                if ("dataUrl" in (i for i in solr_response["response"]["docs"][0])):
                    dataset["uri"] = solr_response["response"]["docs"][0]["dataUrl"]

                dataset["data-type"] = "dataset"

                dataset["performance"] = []
                performance = {}

                performance["period"] = {}
                performance["period"]["begin-date"] = (datetime.strptime(start_date,'%m/%d/%Y')).strftime('%Y-%m-%d')
                performance["period"]["end-date"] = (datetime.strptime(end_date,'%m/%d/%Y')).strftime('%Y-%m-%d')

                instance = []
                pid_list = []
                pid_list.append(pid)
                pid_list = self.resolvePIDs(pid_list)

                report_instances = self.generate_instances(start_date, end_date, pid_list)

                if ("METADATA" in report_instances):
                    total_dataset_investigation = {"count": report_instances["METADATA"]["total_investigations"],
                                                   "access-method": "regular",
                                                   "metric-type": "total-dataset-investigations",
                                                   "country-counts": report_instances["METADATA"]["country_total_investigations"]}

                    unique_dataset_investigation = {"count": report_instances["METADATA"]["unique_investigations"],
                                                    "access-method": "regular",
                                                    "metric-type": "unique-dataset-investigations",
                                                    "country-counts": report_instances["METADATA"]["country_unique_investigations"]}
                    instance.append(total_dataset_investigation)
                    instance.append(unique_dataset_investigation)


                if("DATA" in report_instances):
                    total_dataset_requests = {"count": report_instances["DATA"]["total_requests"],
                                              "access-method": "regular",
                                              "metric-type": "total-dataset-requests",
                                              "country-counts": report_instances["DATA"]["country_total_requests"]}

                    unique_dataset_requests = {"count": report_instances["DATA"]["unique_requests"],
                                               "access-method": "regular",
                                               "metric-type": "unique-dataset-requests",
                                               "country-counts": report_instances["DATA"]["country_unique_requests"]}
                    instance.append(total_dataset_requests)
                    instance.append(unique_dataset_requests)

                performance["instance"] = instance

                dataset["performance"].append(performance)

            else:
                continue

            report_datasets.append(dataset)

        return (report_datasets)
Example #12
    def generate_instances(self, start_date, end_date, pid_list):
        """

        :param start_date:
        :param end_date:
        :return:
        """
        metrics_elastic_search = MetricsElasticSearch()
        metrics_elastic_search.connect()
        report_instances = {}
        search_body = [
            {
                "terms": {
                    "pid.key": pid_list
                }
            },
            {
                "terms": {
                    "formatType": [
                        "METADATA",
                        "DATA"
                    ]
                }
            },
            {
                "term": {"event.key": "read"}
            },
            {
                "exists": {
                    "field": "sessionId"
                }
            }
        ]
        aggregation_body = {
            "pid_list": {
                "composite": {
                    "size": 100,
                    "sources": [
                        {
                            "session": {
                                "terms": {
                                    "field": "sessionId"
                                }
                            }
                        },
                        {
                            "country": {
                                "terms": {
                                    "field": "geoip.country_code2.keyword"
                                }
                            }
                        },
                        {
                            "format": {
                                "terms": {
                                    "field": "formatType"
                                }
                            }
                        }
                    ]
                }
            }
        }
        data = metrics_elastic_search.iterate_composite_aggregations(
            search_query=search_body,
            aggregation_query=aggregation_body,
            start_date=datetime.strptime(start_date, '%m/%d/%Y'),
            end_date=datetime.strptime(end_date, '%m/%d/%Y'))


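        # Each composite bucket key combines sessionId, country, and formatType; "doc_count" is
        # the number of read events in that bucket. Unique counters increment once per bucket
        # (i.e. once per session), totals accumulate doc_count, and DATA buckets also increment
        # the METADATA investigation counters alongside the DATA request counters.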
        for i in data["aggregations"]["pid_list"]["buckets"]:
            if(i["key"]["format"] == "METADATA"):
                if "METADATA" in report_instances:
                    report_instances["METADATA"]["unique_investigations"] = report_instances["METADATA"]["unique_investigations"] + 1
                    report_instances["METADATA"]["total_investigations"] = report_instances["METADATA"][
                                                                                "total_investigations"] + i["doc_count"]
                    if (i["key"]["country"]).lower() in report_instances["METADATA"]["country_unique_investigations"]:
                        report_instances["METADATA"]["country_unique_investigations"][(i["key"]["country"]).lower()] = \
                            report_instances["METADATA"]["country_unique_investigations"][(i["key"]["country"]).lower()] + 1
                        report_instances["METADATA"]["country_total_investigations"][(i["key"]["country"]).lower()] = \
                        report_instances["METADATA"]["country_total_investigations"][(i["key"]["country"]).lower()] + i["doc_count"]
                    else:
                        report_instances["METADATA"]["country_unique_investigations"][(i["key"]["country"]).lower()] = 1
                        report_instances["METADATA"]["country_total_investigations"][(i["key"]["country"]).lower()] = i["doc_count"]
                else:
                    report_instances["METADATA"] = {
                        "unique_investigations" : 1,
                        "total_investigations": i["doc_count"],
                        "country_unique_investigations": {
                            (i["key"]["country"]).lower(): 1
                        },
                        "country_total_investigations": {
                            (i["key"]["country"]).lower(): i["doc_count"]
                        }
                    }
            if (i["key"]["format"] == "DATA"):
                if "DATA" in report_instances:
                    report_instances["METADATA"]["unique_investigations"] = report_instances["METADATA"][
                                                                                "unique_investigations"] + 1
                    report_instances["METADATA"]["total_investigations"] = report_instances["METADATA"][
                                                                               "total_investigations"] + i["doc_count"]
                    report_instances["DATA"]["unique_requests"] = report_instances["DATA"][
                                                                                "unique_requests"] + 1
                    report_instances["DATA"]["total_requests"] = report_instances["DATA"][
                                                                               "total_requests"] + i["doc_count"]
                    if (i["key"]["country"]).lower() in report_instances["DATA"]["country_unique_requests"]:
                        report_instances["METADATA"]["country_unique_investigations"][(i["key"]["country"]).lower()] = \
                            report_instances["METADATA"]["country_unique_investigations"][(i["key"]["country"]).lower()] + 1
                        report_instances["METADATA"]["country_total_investigations"][(i["key"]["country"]).lower()] = \
                            report_instances["METADATA"]["country_total_investigations"][(i["key"]["country"]).lower()] + i[
                                "doc_count"]
                        report_instances["DATA"]["country_unique_requests"][(i["key"]["country"]).lower()] = \
                            report_instances["DATA"]["country_unique_requests"][(i["key"]["country"]).lower()] + 1
                        report_instances["DATA"]["country_total_requests"][(i["key"]["country"]).lower()] = \
                            report_instances["DATA"]["country_total_requests"][(i["key"]["country"]).lower()] + i[
                                "doc_count"]
                    else:
                        if (i["key"]["country"]).lower() in report_instances["METADATA"]["country_unique_investigations"]:
                            report_instances["METADATA"]["country_unique_investigations"][(i["key"]["country"]).lower()] = \
                                report_instances["METADATA"]["country_unique_investigations"][(i["key"]["country"]).lower()] + 1
                            report_instances["METADATA"]["country_total_investigations"][(i["key"]["country"]).lower()] = \
                                report_instances["METADATA"]["country_total_investigations"][(i["key"]["country"]).lower()] + i[
                                    "doc_count"]
                        else:
                            report_instances["METADATA"]["country_unique_investigations"][(i["key"]["country"]).lower()] = 1
                            report_instances["METADATA"]["country_total_investigations"][(i["key"]["country"]).lower()] = i[
                                "doc_count"]
                        report_instances["DATA"]["country_unique_requests"][(i["key"]["country"]).lower()] = 1
                        report_instances["DATA"]["country_total_requests"][(i["key"]["country"]).lower()] = i[
                            "doc_count"]
                else:
                    if "METADATA" in report_instances:
                        report_instances["METADATA"]["unique_investigations"] = report_instances["METADATA"][
                                                                                    "unique_investigations"] + 1
                        report_instances["METADATA"]["total_investigations"] = report_instances["METADATA"][
                                                                                   "total_investigations"] + i[
                                                                                   "doc_count"]
                        if (i["key"]["country"]).lower() in report_instances["METADATA"]["country_unique_investigations"]:
                            report_instances["METADATA"]["country_unique_investigations"][(i["key"]["country"]).lower()] = \
                                report_instances["METADATA"]["country_unique_investigations"][(i["key"]["country"]).lower()] + 1
                            report_instances["METADATA"]["country_total_investigations"][(i["key"]["country"]).lower()] = \
                                report_instances["METADATA"]["country_total_investigations"][(i["key"]["country"]).lower()] + i[
                                    "doc_count"]
                        else:
                            report_instances["METADATA"]["country_unique_investigations"][(i["key"]["country"]).lower()] = 1
                            report_instances["METADATA"]["country_total_investigations"][(i["key"]["country"]).lower()] = i[
                                "doc_count"]
                    else:
                        report_instances["METADATA"] = {
                            "unique_investigations": 1,
                            "total_investigations": i["doc_count"],
                            "country_unique_investigations": {
                                (i["key"]["country"]).lower(): 1
                            },
                            "country_total_investigations": {
                                (i["key"]["country"]).lower(): i["doc_count"]
                            }
                        }
                    report_instances["DATA"] = {
                        "unique_requests": 1,
                        "total_requests": i["doc_count"],
                        "country_unique_requests": {
                            (i["key"]["country"]).lower(): 1
                        },
                        "country_total_requests": {
                            (i["key"]["country"]).lower(): i["doc_count"]
                        }
                    }
        return report_instances
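For reference, a hedged illustration of the structure generate_instances returns; the counts and country codes below are made up.

example_report_instances = {
    "METADATA": {
        "unique_investigations": 12,
        "total_investigations": 40,
        "country_unique_investigations": {"us": 7, "de": 5},
        "country_total_investigations": {"us": 25, "de": 15},
    },
    "DATA": {
        "unique_requests": 4,
        "total_requests": 9,
        "country_unique_requests": {"us": 4},
        "country_total_requests": {"us": 9},
    },
}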
Example #13
async def updateRecords(seriesId="",
                        pid_sub_list=None,
                        session=None,
                        operation="add"):
    """
    Updates the record and writes it down to the ES index
    :return:
    """
    logger = getESSyncLogger(name="es_eventlog")
    metrics_elastic_search = MetricsElasticSearch()
    metrics_elastic_search.connect()
    eventSuccessCount = 0
    eventFailCount = 0

    if pid_sub_list is None:
        pid_sub_list = []
    if session is None:
        # create the HTTP session lazily so it is bound to the running event loop
        session = ClientSession()

    data = getPIDRecords(pid_sub_list, seriesId, operation)
    total_hits = 0

    index_update_seriesId = {
        "script": {
            "source": "ctx._source.portalIdentifier.add(params.tag)",
            "lang": "painless",
            "params": {
                "tag": seriesId
            }
        }
    }

    index_add_seriesId = {
        "script": "ctx._source.portalIdentifier = ['%s']" % (seriesId)
    }

    index_remove_seriesId = {
        "script": {
            "source":
            "if (ctx._source.portalIdentifier.contains(params.tag)) { ctx._source.portalIdentifier.remove(ctx._source.portalIdentifier.indexOf(params.tag)) }",
            "lang": "painless",
            "params": {
                "tag": seriesId
            }
        }
    }

    headers = {}
    headers["Content-Type"] = "application/x-ndjson"

    if data is not None:
        total_hits = data[1]

    if total_hits > 0:
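        # The _bulk API expects newline-delimited JSON: an action line ("update" with _id/_index)
        # followed by the script or partial document for that entry. Records are batched 500 at a
        # time to keep each request body small.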

        for data_index in range(0, len(data[0]), 500):
            try:
                bulk_update_body = ""
                for read_event_entry in itertools.islice(
                        data[0], data_index, data_index + 500):

                    if "_id" in read_event_entry:
                        entry_id = read_event_entry["_id"]

                    if "_index" in read_event_entry:
                        read_event_entry_index = read_event_entry["_index"]
                    else:
                        logger.error("Cannot update entry: " + entry_id)
                        continue

                    update_body_syntax = {
                        "update": {
                            "_id": entry_id,
                            "_type": "_doc",
                            "_index": read_event_entry_index,
                            "retry_on_conflict": 3
                        }
                    }
                    bulk_update_body += json.dumps(update_body_syntax) + "\n"

                    index_update_url = "http://localhost:9200/_bulk"
                    if operation == "add":
                        if "portalIdentifier" in read_event_entry["_source"]:
                            bulk_update_body += json.dumps(
                                index_update_seriesId) + "\n"
                        else:
                            bulk_update_body += json.dumps(
                                index_add_seriesId) + "\n"
                    elif operation == "remove":
                        bulk_update_body += json.dumps(
                            index_remove_seriesId) + "\n"

                if seriesId and bulk_update_body:
                    bulk_update_body += "\n"
                    async with session.post(index_update_url,
                                            data=bulk_update_body,
                                            headers=headers,
                                            timeout=120) as response:
                        response_text = await response.text()
                        if response.status == 200:
                            response_data = json.loads(response_text)
                            if "items" in response_data:
                                for item in response_data["items"]:
                                    if item["update"]["status"] == 200:
                                        eventSuccessCount += 1
                                    else:
                                        logger.error(entry_id + " - " +
                                                     response_text)
                        else:
                            logger.error(entry_id + " - " + response_text)

            except TimeoutError as e:
                logger.error("Timeout Error for entry_ID : " + entry_id)
            except Exception as e:
                logger.error("Exception occured for entry_ID : " + entry_id)

    return total_hits, eventSuccessCount
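A hedged driver sketch for updateRecords; the seriesId and PID list are placeholders, and the coroutine is assumed to run inside a fresh aiohttp session.

import asyncio
import aiohttp

async def tag_portal_events(seriesId, portal_pids):
    # tag all read events for these PIDs with the portal's seriesId
    async with aiohttp.ClientSession() as session:
        return await updateRecords(seriesId=seriesId,
                                   pid_sub_list=portal_pids,
                                   session=session,
                                   operation="add")

# total_hits, successes = asyncio.run(
#     tag_portal_events("urn:uuid:example-series", ["doi:10.5063/EXAMPLE/1"]))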
Example #14
class MetricsReporter(object):

    def __init__(self):
        self._config = DEFAULT_REPORT_CONFIGURATION

        self.es = MetricsElasticSearch()
        self.es.connect()

        self.logger = logging.getLogger('metrics_reporting_service.' + __name__)
        self.logger.setLevel(logging.DEBUG)

        # create file handler which logs even debug messages
        fh = logging.FileHandler('./reports/reports.log')
        fh.setLevel(logging.DEBUG)

        # create console handler with a higher log level
        ch = logging.StreamHandler()
        ch.setLevel(logging.ERROR)

        # create formatter and add it to the handlers
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        fh.setFormatter(formatter)
        ch.setFormatter(formatter)

        # add the handlers to the logger
        self.logger.addHandler(fh)
        self.logger.addHandler(ch)


    def report_handler(self, start_date, end_date, node, doi_dict, perform_put=None):
        """
        Creates a Report JSON object, dumps it to a file and sends the report to the Hub.
        This is a handler function that manages the entire work flow
        :param start_date:
        :param end_date:
        :param node:
        :param doi_dict:
        :return: None
        """
        json_object = {}


        large_report = len(doi_dict) > 2000

        json_object["report-header"] = self.get_report_header(start_date, end_date, node, large_report)
        report_datasets = self.get_async_report_datasets(start_date, end_date, node, doi_dict)

        if report_datasets:
            json_object["report-datasets"] = report_datasets

            with open('./reports/' + ("DSR-D1-" + (datetime.strptime(end_date,'%m/%d/%Y')).strftime('%Y-%m-%d'))+ "-" + node+'.json', 'w') as outfile:
                json.dump(json_object, outfile, indent=2, ensure_ascii=False)

            response = self.send_reports(start_date, end_date, node, perform_put, compressed=large_report)

            return response

        return None


    def get_report_header(self, start_date, end_date, node, large_report):
        """
        Generates a unique report header
        :param start_date:
        :param end_date:
        :return: Dictionary report header object.
        """
        report_header = {}
        report_header["report-name"] = self._config["report_name"]
        report_header["report-id"] = "dsr"
        report_header["release"] = self._config["release"]
        report_header["reporting-period"] = {}
        report_header["reporting-period"]["begin-date"] = (datetime.strptime(start_date,'%m/%d/%Y')).strftime('%Y-%m-%d')
        report_header["reporting-period"]["end-date"] = (datetime.strptime(end_date, '%m/%d/%Y')).strftime(
            '%Y-%m-%d')
        report_header["created"] = datetime.now().strftime('%Y-%m-%d')
        report_header["created-by"] = node
        report_header["report-filters"] = []
        report_header["report-attributes"] = []

        if large_report:
            report_header["exceptions"] = [
                {
                    "code": 69,
                    "severity": "warning",
                    "message": "Report is compressed using gzip",
                    "help-url": "https://github.com/datacite/sashimi",
                    "data": "usage data needs to be uncompressed"
                }
            ]
        else:
            report_header["exceptions"] = []

        return (report_header)


    def generate_instances(self, start_date, end_date, pid_list):
        """

        :param start_date:
        :param end_date:
        :return:
        """
        report_instances = {}
        search_body = [
            {
                "terms": {
                    "pid.key": pid_list
                }
            },
            {
                "terms": {
                    "formatType": [
                        "METADATA",
                        "DATA"
                    ]
                }
            },
            {
                "term": {"event.key": "read"}
            },
            {
                "exists": {
                    "field": "sessionId"
                }
            }
        ]
        aggregation_body = {
            "pid_list": {
                "composite": {
                    "size": 100,
                    "sources": [
                        {
                            "session": {
                                "terms": {
                                    "field": "sessionId"
                                }
                            }
                        },
                        {
                            "country": {
                                "terms": {
                                    "field": "geoip.country_code2.keyword",
                                    "missing_bucket":"true"
                                }
                            }
                        },
                        {
                            "format": {
                                "terms": {
                                    "field": "formatType"
                                }
                            }
                        }
                    ]
                }
            }
        }
        data = self.es.iterate_composite_aggregations(
            search_query=search_body,
            aggregation_query=aggregation_body,
            start_date=datetime.strptime(start_date, '%m/%d/%Y'),
            end_date=datetime.strptime(end_date, '%m/%d/%Y'))


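        # Same bucket-counting logic as the standalone generate_instances above: unique counters
        # increment once per (sessionId, country, formatType) bucket, totals accumulate doc_count,
        # and DATA buckets also feed the METADATA investigation counters. Missing countries are
        # normalised to "n/a" before counting.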
        for i in data["aggregations"]["pid_list"]["buckets"]:
            if i["key"]["country"] is None:
                i["key"]["country"] = "n/a"
            if(i["key"]["format"] == "METADATA"):
                if "METADATA" in report_instances:
                    report_instances["METADATA"]["unique_investigations"] = report_instances["METADATA"]["unique_investigations"] + 1
                    report_instances["METADATA"]["total_investigations"] = report_instances["METADATA"][
                                                                                "total_investigations"] + i["doc_count"]
                    if (i["key"]["country"]).lower() in report_instances["METADATA"]["country_unique_investigations"]:
                        report_instances["METADATA"]["country_unique_investigations"][(i["key"]["country"]).lower()] = \
                            report_instances["METADATA"]["country_unique_investigations"][(i["key"]["country"]).lower()] + 1
                        report_instances["METADATA"]["country_total_investigations"][(i["key"]["country"]).lower()] = \
                        report_instances["METADATA"]["country_total_investigations"][(i["key"]["country"]).lower()] + i["doc_count"]
                    else:
                        report_instances["METADATA"]["country_unique_investigations"][(i["key"]["country"]).lower()] = 1
                        report_instances["METADATA"]["country_total_investigations"][(i["key"]["country"]).lower()] = i["doc_count"]
                else:
                    report_instances["METADATA"] = {
                        "unique_investigations" : 1,
                        "total_investigations": i["doc_count"],
                        "country_unique_investigations": {
                            (i["key"]["country"]).lower(): 1
                        },
                        "country_total_investigations": {
                            (i["key"]["country"]).lower(): i["doc_count"]
                        }
                    }
            if (i["key"]["format"] == "DATA"):
                if "DATA" in report_instances:
                    report_instances["METADATA"]["unique_investigations"] = report_instances["METADATA"][
                                                                                "unique_investigations"] + 1
                    report_instances["METADATA"]["total_investigations"] = report_instances["METADATA"][
                                                                               "total_investigations"] + i["doc_count"]
                    report_instances["DATA"]["unique_requests"] = report_instances["DATA"][
                                                                                "unique_requests"] + 1
                    report_instances["DATA"]["total_requests"] = report_instances["DATA"][
                                                                               "total_requests"] + i["doc_count"]
                    if (i["key"]["country"]).lower() in report_instances["DATA"]["country_unique_requests"]:
                        report_instances["METADATA"]["country_unique_investigations"][(i["key"]["country"]).lower()] = \
                            report_instances["METADATA"]["country_unique_investigations"][(i["key"]["country"]).lower()] + 1
                        report_instances["METADATA"]["country_total_investigations"][(i["key"]["country"]).lower()] = \
                            report_instances["METADATA"]["country_total_investigations"][(i["key"]["country"]).lower()] + i[
                                "doc_count"]
                        report_instances["DATA"]["country_unique_requests"][(i["key"]["country"]).lower()] = \
                            report_instances["DATA"]["country_unique_requests"][(i["key"]["country"]).lower()] + 1
                        report_instances["DATA"]["country_total_requests"][(i["key"]["country"]).lower()] = \
                            report_instances["DATA"]["country_total_requests"][(i["key"]["country"]).lower()] + i[
                                "doc_count"]
                    else:
                        if (i["key"]["country"]).lower() in report_instances["METADATA"]["country_unique_investigations"]:
                            report_instances["METADATA"]["country_unique_investigations"][(i["key"]["country"]).lower()] = \
                                report_instances["METADATA"]["country_unique_investigations"][(i["key"]["country"]).lower()] + 1
                            report_instances["METADATA"]["country_total_investigations"][(i["key"]["country"]).lower()] = \
                                report_instances["METADATA"]["country_total_investigations"][(i["key"]["country"]).lower()] + i[
                                    "doc_count"]
                        else:
                            report_instances["METADATA"]["country_unique_investigations"][(i["key"]["country"]).lower()] = 1
                            report_instances["METADATA"]["country_total_investigations"][(i["key"]["country"]).lower()] = i[
                                "doc_count"]
                        report_instances["DATA"]["country_unique_requests"][(i["key"]["country"]).lower()] = 1
                        report_instances["DATA"]["country_total_requests"][(i["key"]["country"]).lower()] = i[
                            "doc_count"]
                else:
                    if "METADATA" in report_instances:
                        report_instances["METADATA"]["unique_investigations"] = report_instances["METADATA"][
                                                                                    "unique_investigations"] + 1
                        report_instances["METADATA"]["total_investigations"] = report_instances["METADATA"][
                                                                                   "total_investigations"] + i[
                                                                                   "doc_count"]
                        if (i["key"]["country"]).lower() in report_instances["METADATA"]["country_unique_investigations"]:
                            report_instances["METADATA"]["country_unique_investigations"][(i["key"]["country"]).lower()] = \
                                report_instances["METADATA"]["country_unique_investigations"][(i["key"]["country"]).lower()] + 1
                            report_instances["METADATA"]["country_total_investigations"][(i["key"]["country"]).lower()] = \
                                report_instances["METADATA"]["country_total_investigations"][(i["key"]["country"]).lower()] + i[
                                    "doc_count"]
                        else:
                            report_instances["METADATA"]["country_unique_investigations"][(i["key"]["country"]).lower()] = 1
                            report_instances["METADATA"]["country_total_investigations"][(i["key"]["country"]).lower()] = i[
                                "doc_count"]
                    else:
                        report_instances["METADATA"] = {
                            "unique_investigations": 1,
                            "total_investigations": i["doc_count"],
                            "country_unique_investigations": {
                                (i["key"]["country"]).lower(): 1
                            },
                            "country_total_investigations": {
                                (i["key"]["country"]).lower(): i["doc_count"]
                            }
                        }
                    report_instances["DATA"] = {
                        "unique_requests": 1,
                        "total_requests": i["doc_count"],
                        "country_unique_requests": {
                            (i["key"]["country"]).lower(): 1
                        },
                        "country_total_requests": {
                            (i["key"]["country"]).lower(): i["doc_count"]
                        }
                    }
        return report_instances


    def get_async_report_datasets(self, start_date, end_date, node, doi_dict):
        """
        Generates asynchronous dataset instances.

        :param start_date:
            Start date of the report
        :param end_date:
            End date of the report
        :param node:
            Node identifier for the logs from ES
        :param doi_dict:
            Dictionary object with dois and their datasetIdentifierFamily

        :return: list object
            list of dataset instances as defined in SUSHI format
        """

        time_beg = time.time()

        count = 0
        mn_dict = self.get_MN_Dict()
        nodeName = mn_dict[node]

        def _get_single_dataset_instance(self, doi, pid_list):
            """
            Generates a single instance of dataset object

            :param self:
                The self object
            :param doi:
                The doi for the dataset
            :param pid_list:
                `datasetIdentifierFamily` for this doi

            :return: dictionary object
                a single dataset instance as defined in SUSHI format
            """

            dataset = {}
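            # Use the first PID in the dataset identifier family to look up the
            # dataset metadata in Solr.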
            pid = pid_list[0]
            solr_response = self.query_solr(pid)
            if (solr_response["response"]["numFound"] > 0):

                if ("title" in (i for i in solr_response["response"]["docs"][0])):
                    dataset["dataset-title"] = solr_response["response"]["docs"][0]["title"]
                else:
                    return None

                if ("authoritativeMN" in (i for i in solr_response["response"]["docs"][0])):
                    dataset["publisher"] = mn_dict[solr_response["response"]["docs"][0]["authoritativeMN"]]
                else:
                    dataset["publisher"].append(
                        {"type": "urn", "value": nodeName})

                if ("authoritativeMN" in (i for i in solr_response["response"]["docs"][0])):
                    dataset["publisher-id"] = []
                    dataset["publisher-id"].append(
                        {"type": "urn", "value": solr_response["response"]["docs"][0]["authoritativeMN"]})
                else:
                    dataset["publisher-id"].append(
                        {"type": "urn", "value": node})

                dataset["platform"] = "DataONE"

                if ("origin" in (i for i in solr_response["response"]["docs"][0])):
                    contributors = []
                    for i in solr_response["response"]["docs"][0]["origin"]:
                        contributors.append({"type": "name", "value": i})
                    dataset["dataset-contributors"] = contributors

                if ("datePublished" in (i for i in solr_response["response"]["docs"][0])):
                    dataset["dataset-dates"] = []
                    dataset["dataset-dates"].append(
                        {"type": "pub-date", "value": solr_response["response"]["docs"][0]["datePublished"][:10]})
                else:
                    dataset["dataset-dates"] = []
                    dataset["dataset-dates"].append(
                        {"type": "pub-date", "value": solr_response["response"]["docs"][0]["dateUploaded"][:10]})

                if doi:
                    dataset["dataset-id"] = [{"type": "doi", "value": doi}]
                else:
                    return None
                    # dataset["dataset-id"] = [{"type": "other-id", "value": pid}]

                dataset["yop"] = dataset["dataset-dates"][0]["value"][:4]

                if ("dataUrl" in (i for i in solr_response["response"]["docs"][0])):
                    dataset["uri"] = solr_response["response"]["docs"][0]["dataUrl"]

                dataset["data-type"] = "dataset"

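                # Assemble the SUSHI "performance" block: a single reporting period with
                # total/unique dataset-investigation and dataset-request counts, each
                # broken down by country.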
                dataset["performance"] = []
                performance = {}

                performance["period"] = {}
                performance["period"]["begin-date"] = (datetime.strptime(start_date, '%m/%d/%Y')).strftime('%Y-%m-%d')
                performance["period"]["end-date"] = (datetime.strptime(end_date, '%m/%d/%Y')).strftime('%Y-%m-%d')

                instance = []

                report_instances = self.generate_instances(start_date, end_date, pid_list)

                if ("METADATA" in report_instances):
                    total_dataset_investigation = {"count": report_instances["METADATA"]["total_investigations"],
                                                   "access-method": "regular",
                                                   "metric-type": "total-dataset-investigations",
                                                   "country-counts": report_instances["METADATA"][
                                                       "country_total_investigations"]}

                    unique_dataset_investigation = {"count": report_instances["METADATA"]["unique_investigations"],
                                                    "access-method": "regular",
                                                    "metric-type": "unique-dataset-investigations",
                                                    "country-counts": report_instances["METADATA"][
                                                        "country_unique_investigations"]}
                    instance.append(total_dataset_investigation)
                    instance.append(unique_dataset_investigation)

                if ("DATA" in report_instances):
                    total_dataset_requests = {"count": report_instances["DATA"]["total_requests"],
                                              "access-method": "regular",
                                              "metric-type": "total-dataset-requests",
                                              "country-counts": report_instances["DATA"]["country_total_requests"]}

                    unique_dataset_requests = {"count": report_instances["DATA"]["unique_requests"],
                                               "access-method": "regular",
                                               "metric-type": "unique-dataset-requests",
                                               "country-counts": report_instances["DATA"]["country_unique_requests"]}
                    instance.append(total_dataset_requests)
                    instance.append(unique_dataset_requests)

                for i in instance:
                    if "n/a" in i["country-counts"]:
                        i["country-counts"].pop("n/a", None)

                performance["instance"] = instance

                dataset["performance"].append(performance)

            else:
                return None

            return dataset

        async def _work_get_all_datasets_instances(self, doi_dict):
            """
            For all the PIDs in the doi_dict, create async jobs and execute them concurrently

            :param self:
                Class object
            :param doi_dict:
                Dictionary of `doi` as key and `datasetIdentifierFamily` as value

            :return:
                None
            """

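            # The Solr and ES lookups are blocking, so each dataset instance is built in a
            # thread-pool worker via run_in_executor and the futures are awaited together.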
            with concurrent.futures.ThreadPoolExecutor(max_workers=CONCURRENT_REQUESTS) as executor:
                self.logger.info("self.async init : %f sec", time.time() - time_beg)
                loop = asyncio.get_event_loop()
                tasks = []

                for pid, pid_list in doi_dict.items():
                    tasks.append(loop.run_in_executor(executor, _get_single_dataset_instance, self, pid, pid_list))

                processed = 0
                for response in await asyncio.gather(*tasks):
                    processed += 1
                    if processed % 100 == 0:
                        self.logger.info("Processed %d of %d datasets", processed, len(tasks))
                    if response is not None:
                        report_datasets.append(response)

                self.logger.info("self.async end : %f sec", time.time() - time_beg)

        report_datasets = []

        self.logger.info("self.get_es_unique_dois : %f sec", time.time() - time_beg)
        self.logger.debug("Processing " + str(len(doi_dict)) + " datasets for node " + node)

        # In a multithreading environment such as under gunicorn, the new thread created by
        # gevent may not provide an event loop. Create a new one if necessary.
        try:
            loop = asyncio.get_event_loop()
        except RuntimeError:
            self.logger.info("Creating new event loop.")
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)

        future = asyncio.ensure_future(_work_get_all_datasets_instances(self, doi_dict))
        loop.run_until_complete(future)

        if len(report_datasets) < 1:
            return None

        self.logger.info("total : %f sec", time.time() - time_beg)

        return report_datasets


    def send_reports(self, start_date, end_date, node, perform_put=None, compressed=False):
        """
        Sends reports to the Hub at the specified Hub report url in the config parameters.

        The DataCite hub limits the number of dataset instances that it can ingest.
        If the reports are too large, it returns errors while ingesting them.

        To handle large reports, the `compressed` parameter should be set to True.

        :param: start_date
            String object representing the beginning of the report period

        :param: end_date
            String object representing the end of the report period

        :param: node
            The node to which the report belongs

        :param: perform_put
            Identifier of an existing hub report; when supplied, the report is updated
            with a PUT request instead of being created with a POST

        :param: compressed
            A boolean parameter indicating whether to send a gzip-compressed report

        :return: response
            An HTTP response object representing the status of the submitted report
        """
        s = requests.session()

        name = "./reports/DSR-D1-" + (datetime.strptime(end_date,'%m/%d/%Y')).strftime('%Y-%m-%d')+ "-" + node

        if compressed:

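            # gzip path: read the report JSON written earlier, keep a local copy of the
            # payload on disk, and send the gzip-compressed body to the hub.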
            s.headers.update(
                {'Authorization': "Bearer " + self._config["auth_token"], 'Content-Type': 'application/gzip',
                 'Accept': 'gzip', 'Content-Encoding': 'gzip'})

            with open(name + ".json", 'r') as content_file:
                # JSON large object data
                jlob = content_file.read()

                # JSON large object bytes
                jlob = jlob.encode("utf-8")

                with open(name + ".gzip", mode="wb") as f:
                    f.write(jlob)

            if perform_put:
                self.logger.info("Performing PUT")
                response = s.put(self._config["report_url"] + "/" + perform_put, data=gzip.compress(jlob))

            response = s.post(self._config["report_url"], data=gzip.compress(jlob))

        else:
            s.headers.update(
                {'Authorization': "Bearer " + self._config["auth_token"], 'Content-Type': 'application/json',
                 'Accept': 'application/json'})

            with open(name + '.json', 'r') as content_file:
                content = content_file.read()

            if perform_put:
                self.logger.info("Performing PUT")
                response = s.put(self._config["report_url"] + "/" + perform_put, data=content.encode("utf-8"))
            else:
                response = s.post(self._config["report_url"], data=content.encode("utf-8"))

        self.logger.info(response)
        self.logger.info(str(response.status_code) + " " + response.reason)
        self.logger.info("Headers: " + str(response.headers))
        self.logger.info("Content: " + str((response.content).decode("utf-8")))

        return response
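    # Usage sketch (hypothetical: `reporter` stands in for an instance of this class and
    # "urn:node:KNB" for a node identifier; assumes the report JSON for the period was
    # already written by report_handler):
    #
    #   reporter.send_reports("01/01/2014", "01/31/2014", "urn:node:KNB", compressed=True)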


    def query_solr(self, PID):
        """
        Queries the Solr end-point for metadata given the PID.
        :param PID:
        :return: JSON Object containing the metadata fields queried from Solr
        """

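        # Mount a retrying HTTPAdapter so transient connection failures to the Solr
        # endpoint are retried with backoff.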
        session = requests.Session()
        retry = Retry(connect=3, backoff_factor=0.5)
        adapter = HTTPAdapter(max_retries=retry)
        session.mount('http://', adapter)
        session.mount('https://', adapter)

        queryString = 'q=id:"' + PID + '"&fl=origin,title,datePublished,dateUploaded,authoritativeMN,dataUrl&wt=json'
        response = session.get(url = self._config["solr_query_url"], params = queryString)

        return response.json()


    def scheduler(self):
        """
        Sends monthly reports to the hub for each node returned by get_MN_Dict, iterating
        over the hard-coded date range in this method's body.
        Typically run only once to backfill historical reports.
        :return: None
        """
        mn_dict = self.get_MN_Dict()
        for node, nodeName in mn_dict.items():
            self.logger.debug("Running job for Node: " + node)

            date = datetime(2012, 6, 30)
            stopDate = datetime(2014, 12, 31)

            jobs_done = self.get_jobs_done(node)

            count = 0
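            # Walk forward one calendar month at a time: prevDate is the first day of the
            # month and date its last day; each pass builds and submits one monthly report.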
            while (date.strftime('%Y-%m-%d') != stopDate.strftime('%Y-%m-%d')):
                count = count + 1

                prevDate = date + timedelta(days=1)
                date = self.last_day_of_month(prevDate)

                start_date, end_date = prevDate.strftime('%m/%d/%Y'), date.strftime('%m/%d/%Y')

                perform_put = None
                job_done_date = prevDate.strftime('%Y-%m-%d')
                if job_done_date in jobs_done:
                    perform_put = jobs_done[job_done_date]


                original_doi_dict = self.get_es_unique_dois(start_date, end_date, nodeId=node)

                doi_dict = {k: original_doi_dict[k] for k in list(original_doi_dict)[:100]}

                if (len(doi_dict) > 0):
                    self.logger.info("Job " + " : " + start_date + " to " + end_date)

                    response = self.report_handler(start_date, end_date, node, doi_dict, perform_put)

                    if response is None:
                        self.logger.info(
                            "Skipping job for " + node + " " + start_date + " to " + end_date + " - no datasets to submit!")
                        continue

                    logentry = "Node " + node + " : " + start_date + " to " + end_date + " === " + str(response.status_code)

                    self.logger.info(logentry)

                    if response.status_code != 201:

                        logentry = "Node " + node + " : " + start_date + " to " + end_date + " === " \
                                   + str(response.status_code)
                        self.logger.error(logentry)
                        self.logger.error(str(response.status_code) + " " + response.reason)
                        self.logger.error("Headers: " + str(response.headers))
                        self.logger.error("Content: " + str((response.content).decode("utf-8")))
                else:
                    self.logger.info(
                        "Skipping job for " + node + " " + start_date + " to " + end_date + " - length of PIDS : " + str(
                            len(doi_dict)))


    def last_day_of_month(self, date):
        """
        Returns the last day of the month for report generation

        :param date:
            A date object to get the last date of that month

        :return: date object
            Last day of the month for the date instance supplied in the parameter
        """
        if date.month == 12:
            return date.replace(day=31)
        return date.replace(month=date.month + 1, day=1) - timedelta(days=1)


    def get_MN_Dict(self, mn = True):
        """
        Retrieves node identifiers from the https://cn.dataone.org/cn/v2/node/ endpoint.
        Used to send the reports for the different MNs.

        :param mn: when True (the default), restrict the dictionary to Member Nodes

        :return: Dictionary of node identifiers
            Key - node identifier
            Value - Full name of the node

        """
        node_url = "https://cn.dataone.org/cn/v2/node/"
        resp = requests.get(node_url, stream=True)
        root = ElementTree.fromstring(resp.content)
        mn_dict = dict()

        for child in root:
            if mn and child.get('type') != "mn":
                continue
            identifier = child.find('identifier').text
            name = child.find('name').text
            mn_dict[identifier] = name

        return mn_dict


    def get_es_unique_dois(self, start_date, end_date, nodeId = None):
        """

        Finds the DOIs from the eventlog index and returns a dictionary with the DOI as the key
        and its corresponding dataset identifier family as the value

        :param start_date: begin date for search
        :param end_date: end date for search
        :param nodeId: Node ID for the query term

        :return: dictionary object

        """

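        # Two wildcard filters are used in turn: one matching DOIs recorded in the
        # seriesId field, and one matching DOIs recorded in the pid field.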
        seriesIdWildCard = {
            "seriesId": {
                "value": "*doi*"
            }
        }

        PIDWildCard = {
            "pid.key": {
                "value": "*doi*"
            }
        }

        doi_dict = {}

        search_body = [
            {
                "term": {"event.key": "read"}
            },
            {
                "exists": {
                    "field": "sessionId"
                }
            },
            {
                "terms": {
                    "formatType": [
                        "DATA",
                        "METADATA"
                    ]
                }
            },
            {
                "wildcard": seriesIdWildCard
            }
        ]

        if nodeId:
            nodeQuery = {
                "term": {
                    "nodeId" : nodeId
                }
            }
            search_body.append(nodeQuery)

        fields = ["pid", "seriesId"]

        results, total1 = self.es.getSearches(
            q=search_body, index='eventlog-1', limit=1000000, fields=fields,
            date_start=datetime.strptime(start_date, '%m/%d/%Y'),
            date_end=datetime.strptime(end_date, '%m/%d/%Y'))

        for result in results:
            if result["seriesId"] not in doi_dict:
                doi_dict[result["seriesId"]] = []
                doi_dict[result["seriesId"]].append(result["pid"])

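        # Second pass: swap the wildcard clause to catch DOIs recorded in the pid field
        # rather than the seriesId.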
        search_body[3]["wildcard"] = PIDWildCard

        results, total2 = self.es.getSearches(
            q=search_body, index='eventlog-1', limit=1000000, fields=fields,
            date_start=datetime.strptime(start_date, '%m/%d/%Y'),
            date_end=datetime.strptime(end_date, '%m/%d/%Y'))

        for result in results:
            if result["pid"] not in doi_dict:
                doi_dict[result["pid"]] = []
                doi_dict[result["pid"]].append(result["pid"])

        # query identifiers index only if there is anything to query!
        if len(doi_dict) > 0:
            return self.get_doi_dict_dataset_identifier_family(doi_dict)

        return {}


    def get_doi_dict_dataset_identifier_family(self, doi_dict):
        """
        Gets the dataset_identifier_family from the identifiers index for every key in the doi_dict

        :param: doi_dict
            A dictionary of the DOIs with the doi as the key and the resolved_pids a.k.a
            the dataset_identifier_family as the value

        :return: dictionary object
        """


        def _get_dataset_identifier_family(pid):
            """

            Retrieves the dataset_identifier_family

            :param pid: The PID of interest

            :return: a dictionary object

            """

            result = {}
            result[pid] = []
            result[pid].append(pid)

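            # Exact match on the PID keyword field in the identifiers index.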
            query_body = {
                "bool": {
                    "should": [
                        {
                            "term": {
                                "PID.keyword": pid
                            }
                        }
                    ]
                }
            }

            data = self.es.getDatasetIdentifierFamily(search_query=query_body, max_limit=1)

            # Parse only if there are existing records found in the `identifiers index`
            try:
                if data[1] > 0:
                    result[pid].extend(data[0][0]["datasetIdentifierFamily"])
            except (IndexError, KeyError):
                pass

            return result


        async def work_get_identifier_family(doi_dict):
            """

            Creates async task to query ES, executes those tasks and returns results to
            the parent function

            :param doi_dict:

            :return:

            """

            with concurrent.futures.ThreadPoolExecutor(max_workers=CONCURRENT_REQUESTS) as executor:
                loop = asyncio.get_event_loop()
                tasks = []

                # for every pid in the dict
                # create a new task and add it to the task list
                for an_id in doi_dict:
                    tasks.append(loop.run_in_executor(executor, _get_dataset_identifier_family, an_id))

                # wait for the response to complete the tasks
                for response in await asyncio.gather(*tasks):
                    for response_key,response_val in response.items():
                        results[response_key] = response_val

            return

        results = {}

        # In a multithreading environment such as under gunicorn, the new thread created by
        # gevent may not provide an event loop. Create a new one if necessary.
        try:
            loop = asyncio.get_event_loop()
        except RuntimeError:
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)

        future = asyncio.ensure_future(work_get_identifier_family(doi_dict))

        # wait for the work to complete
        loop.run_until_complete(future)

        return results


    def get_jobs_done(self, node):
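        """
        Retrieves the reports already submitted to the hub for the given node.

        :param node: node identifier used as the `created-by` filter on the hub query

        :return: dictionary mapping each report's reporting-period begin-date to its
            hub report id (used to decide whether to PUT an update or POST a new report)
        """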
        jobs_done = {}

        session = requests.Session()
        retry = Retry(connect=3, backoff_factor=0.5)
        adapter = HTTPAdapter(max_retries=retry)
        session.mount('https://', adapter)

        resp = session.get(self._config["report_url"] + "?created-by=" + node)

        data = json.loads(resp.content.decode('utf-8'))

        try:
            if data["meta"]["total"] > 0:
                for i in data["reports"]:
                    jobs_done[i["report-header"]["reporting-period"]["begin-date"]] \
                        = i["id"]
        except (KeyError, TypeError):
            return {}

        return jobs_done