Exemplo n.º 1
0
    def set_branch_info(self, config=None):
        """Record branch-specific settings from the given config object.

        Caches the config, mongo client and det queue collection, email
        helper, branch name and component versions on the instance.

        Args
            config: branch config object (required despite the default;
                passing None raises an Exception)
        Returns
            the same config object, unchanged
        """
        if config is None:
            raise Exception('no branch config found')

        self.config = config
        self.assets_dir = os.path.join(config.source_dir, 'geo-hpc/assets')

        self.client = config.client
        self.c_queue = self.client.asdf.det
        self.email = GeoEmail(config)

        self.branch = config.name
        self.msr_version = config.versions['msr']
        self.extract_version = config.versions['extracts']

        return config
Exemplo n.º 2
0
    def set_branch_info(self, config=None):
        """get branch info from config collection

        Caches branch-specific settings (asset path, mongo client and
        det queue collection, email helper, versions) on the instance.

        Args
            config: branch config object (required despite the default;
                None raises an Exception)
        Returns
            the same config object
        """
        if config is None:
            raise Exception('no branch config found')

        # static assets (templates, pdfs) live under the branch source tree
        self.assets_dir = os.path.join(config.source_dir, 'geo-hpc/assets')

        self.config = config

        self.client = config.client

        # det request queue collection
        self.c_queue = self.client.asdf.det

        self.email = GeoEmail(config)

        self.branch = config.name
        self.msr_version = config.versions['msr']
        self.extract_version = config.versions['extracts']
        return config
Exemplo n.º 3
0
class QueueToolBox():
    """utilty functions for processing requests in queue

    old cachetools descrip:
    Accepts request object and checks if all extracts have been processed
    """
    def __init__(self):
        # connection/config attributes are populated later by set_branch_info
        self.client = None

        # det request queue collection
        self.c_queue = None
        self.email = None

        self.config = None
        self.branch = None
        self.msr_version = None
        self.extract_version = None

        # resolution used when building msr request objects
        self.msr_resolution = 0.05

        # NOTE(review): self.branch is always None at this point (it is
        # only assigned by set_branch_info, which runs after __init__),
        # so self.dev is always " " and never " (dev) " -- confirm intent
        self.dev = " (dev) " if self.branch == "develop" else " "


    # def quit(self, rid, status, message):
    #     """exit function used for errors
    #     """
    #     self.update_status(rid, int(status))
    #     sys.exit(">> det processing error ("+str(status)+"): \n\t\t" +
    #               str(message))


    def set_branch_info(self, config=None):
        """Store branch settings pulled from the supplied config object.

        Raises an Exception when called without a config; otherwise
        populates the instance attributes and hands the config back so
        calls can be chained.
        """
        if config is None:
            raise Exception('no branch config found')

        # asset templates live under the branch source tree
        self.assets_dir = os.path.join(config.source_dir, 'geo-hpc/assets')
        self.config = config
        self.client = config.client
        self.c_queue = self.client.asdf.det
        self.email = GeoEmail(config)
        self.branch = config.name
        self.msr_version = config.versions['msr']
        self.extract_version = config.versions['extracts']
        return config


    def check_id(self, rid):
        """Look up the queue document for a request id.

        Args
            rid (str): request id
        Returns
            (dict) request object, or None when no document matches
        """
        return self.c_queue.find_one({"_id": ObjectId(rid)})


    def get_requests(self, status, limit=0):
        """get requests from queue

        Args
            status (int): status value to match
            limit (int): max number of requests to return (0 = unlimited)

        Returns
            list of request documents, highest priority first and then
            oldest submission first; empty list when none match
        """
        search = self.c_queue.find({
            "status": status
        }).sort([("priority", -1), ("stage.0.time", 1)]).limit(limit)

        # an exhausted cursor yields an empty list anyway, so the previous
        # count() round trip and empty-result branch were redundant
        return list(search)


    def get_status(self, rid):
        """get status of request.

        Args
            rid (str): request id
        Returns
            (int) status value of the request document
        Raises
            re-raises any lookup failure after printing the request id
            (a missing document surfaces as a TypeError, since find_one
            returns None)
        """
        try:
            # check document with request id exists
            search = self.c_queue.find_one({"_id": ObjectId(rid)})
            status = search['status']
            return status
        except Exception as e:
            print "error retrieving status of request (id: {0})".format(rid)
            raise e


    def update_status(self, rid, status, is_prep=False):
        """ update status of request

        Args
            rid (str): request id
            status (int): new status code; must be a key of valid_stages
            is_prep (bool): when True, also stamp stage.1.time and attach
                the column info html to the request's info field
        Raises
            KeyError for a status code not in valid_stages; re-raises any
            mongo update failure after printing the request id and status
        """
        # statuses that also record a timestamp map to a stage field
        # NOTE(review): "2" -> stage.2.time but "1" -> stage.3.time looks
        # inverted -- confirm against the stage schema
        valid_stages = {
            "-2": None,
            "-1": None,
            "0": None,
            "2": "stage.2.time",
            "1": "stage.3.time"
        }

        ctime = int(time.time())

        updates = {
            "status": long(status)
        }

        stage = valid_stages[str(status)]
        if stage is not None:
            updates[stage] = ctime


        if is_prep:
            updates['stage.1.time'] = ctime

            # use a context manager so the template file handle is closed
            # (previously left open)
            with open(self.assets_dir + '/templates/column_info.txt') as f:
                column_info_html = f.read().replace('\n', '<br/>')

            updates['info'] = [column_info_html]

            # Example push/prepend for info field if needed for manual updates
            # https://docs.mongodb.com/manual/reference/operator/update/push/
            # https://stackoverflow.com/questions/10131957/can-you-have-mongo-push-prepend-instead-of-append

        try:
            # update request document
            self.c_queue.update({"_id": ObjectId(rid)},
                                {"$set": updates})

        except Exception as e:
            # fixed: closing paren was missing from the message text
            print ('error updating status of request '
                   '(id: {0}, status: {1})').format(rid, status)
            raise e


    def notify_received(self, request_id, email):
        """send email that request was received
        """
        mail_to = email

        mail_subject = ("AidData GeoQuery{0}- "
                        "Request {1}.. Received").format(self.dev, request_id[:7])

        mail_message = (
            """
            Thanks for using GeoQuery. This is an automated email to let you
            know that we received your request and will process your data as
            soon as we can. We will send another email when your data is ready.

            You can view the status of this data request here:
            http://{0}/query/#!/status/{1}

            You can also keep track of all of the data requests that you've
            submitted with this email address here:
            http://{0}/query/#!/requests/{2}

            Thank you,
            \tAidData's GeoQuery Team
            """).format(self.config.det['download_server'], request_id, mail_to)

        mail_message = textwrap.dedent(mail_message)

        mail_status = self.email.send_email(mail_to, mail_subject, mail_message)

        if not mail_status[0]:
            print mail_status[1]
            update_status = self.update_status(request_id, -2)
            raise mail_status[2]


    def notify_completed(self, request_id, email):
        """send email that request was completed

        Args
            request_id (str): request id (first 7 chars used in subject)
            email (str): recipient address
        Raises
            re-raises the email error (after marking the request status
            as -2) when sending fails
        """
        mail_to = email

        mail_subject = ("AidData GeoQuery{0}- "
                        "Request {1}.. Completed").format(self.dev, request_id[:7])

        mail_message = (
            """
            Thanks again for using GeoQuery. This is another automated email to let you know that
            your data is ready.

            You can review your request, and download the results and documentation here:
            \thttp://{0}/query/#!/status/{1}

            Or download the results directly (this link will always be available):
            \thttp://{0}/data/geoquery_results/{1}/{1}.zip

            You can also view all your current and previous requests using:
            \thttp://{0}/query/#!/requests/{2}

            Also, one quick reminder about citations. Don't forget to cite both AidData's GeoQuery
            tool as well as each dataset you selected within GeoQuery. All citations can be found
            in the Documentation PDF at the link above. Here's the correct citation for GeoQuery:

                Goodman, S., BenYishay, A., Lv, Z., & Runfola, D. (2019).
                GeoQuery: Integrating HPC systems and public web-based
                geospatial data tools. Computers & Geosciences, 122, 103-112.

            Thank you in advance for citing us when you publish your research.
            This helps us to demonstrate how GeoQuery is making a difference
            as a freely available public good.

            Thank you,
            \tAidData's GeoQuery Team
            """).format(self.config.det['download_server'], request_id, mail_to)

        mail_message = textwrap.dedent(mail_message)

        mail_status = self.email.send_email(mail_to, mail_subject, mail_message)

        if not mail_status[0]:
            print mail_status[1]
            # mark request as errored; return value is not used
            update_status = self.update_status(request_id, -2)
            raise mail_status[2]


    def notify_comments(self, email):
        """send email asking for information about their work using geoquery

        Args
            email (str): recipient address
        Raises
            re-raises the email error when sending fails (unlike the
            other notify methods, no request status is updated here)
        """
        mail_to = email

        mail_subject = ("Was GeoQuery{0} helpful?").format(self.dev)

        mail_message = (
            """
            Hello there!

            We would like to hear about your experience using AidData's GeoQuery tool. Would you
            please respond to this email with a couple sentences about how GeoQuery has helped you?

            We are able to make GeoQuery freely available thanks to the generosity of donors and
            open source data providers. These people love to hear about new research enabled by
            GeoQuery, and what kind of difference this research is making in the world.

            Also, we love feedback of all kinds. If something did not go the way you expected, we
            want to hear about that too.

            Thanks!
            \tAidData's GeoQuery Team
            """)

        mail_message = textwrap.dedent(mail_message)

        mail_status = self.email.send_email(mail_to, mail_subject, mail_message)

        if not mail_status[0]:
            print mail_status[1]
            raise mail_status[2]


# =============================================================================
# =============================================================================
# =============================================================================


    def check_request(self, request, dry_run=False):
        """check entire request object for cache

        Walks the request's release_data (msr) items and raster_data
        items, checks each against existing msr/extract results, queues
        any missing work (unless dry_run is True), and collects paths of
        completed extracts for merging.

        Args
            request (dict): request document
            dry_run (bool): when True, only count missing items; nothing
                is added to the msr or extract queues
        Returns
            tuple (missing item count, merge list of
            (extract_path, group name, msr_id or None) tuples)
        """
        merge_list = []
        extract_count = 0
        msr_count = 0

        # id used for field names in results
        msr_id = 1

        print "\nchecking aid data (dry=run = {})...".format(dry_run)
        for ix, raw_data in enumerate(request['release_data']):

            if not raw_data:
                continue

            # # mongo was defaulting to 32bit vals for numbers on insert
            # # making float should prevent that
            # if 'total_commitments' in raw_data['filters']:
            #     raw_data['filters']['total_commitments'] = [
            #         float(i) for i in raw_data['filters']['total_commitments']]
            # if 'total_disbursements' in raw_data['filters']:
            #     raw_data['filters']['total_disbursements'] = [
            #         float(i) for i in raw_data['filters']['total_disbursements']]


            # remove filters without actual fields
            tmp_filters = {}
            if len(raw_data['filters']) > 0:
                tmp_filters = {
                    fk: fv
                    for fk, fv in raw_data['filters'].iteritems()
                    if not any([fvx in ['All', 'None', None] for fvx in fv])
                }



            # msr request object format
            data = {
                'dataset': raw_data['dataset'],
                'type': 'release',
                'resolution': self.msr_resolution,
                'version': self.msr_version,
                'filters': tmp_filters
            }

            # get hash of msr request object
            data_hash = json_sha1_hash(data)

            # add hash to request
            # NOTE(review): this writes the hash when it is missing OR
            # already equal to the new value, but never when it differs,
            # so a stale hash is never refreshed -- confirm whether the
            # second condition should be `raw_data['hash'] != data_hash`
            if not 'hash' in raw_data or raw_data['hash'] == data_hash:
                self.c_queue.update(
                    { "_id": ObjectId(request['_id']) },
                    { "$set": { 'release_data.'+str(ix)+'.hash' : data_hash}}
                )


            print ''
            print '\t{0}'.format(data_hash)
            print '\t{0}'.format(data)
            print '\t----------'

            msr_item = MSRItem(self.config,
                               data_hash,
                               data)

            # check if msr exists in queue and is completed
            msr_exists, msr_completed = msr_item.exists()

            print '\tmsr exists: {0}'.format(msr_exists)
            print '\tmsr completed: {0}'.format(msr_completed)

            # explicit == True: completed flags may presumably also be
            # the string "Error" (as seen for extracts below), which
            # must not take this branch -- TODO confirm for msr items
            if msr_completed == True:

                tmp_extract_type = 'reliability'
                # if data["dataset"].startswith(('worldbank', 'globalenvironmentfacility')):
                #     tmp_extract_type = 'sum'

                msr_ex_item = ExtractItem(self.config,
                                          request["boundary"]["name"],
                                          data["dataset"],
                                          data["dataset"] + '_' + data_hash,
                                          tmp_extract_type,
                                          data_hash,
                                          self.extract_version)

                msr_ex_exists, msr_ex_completed = msr_ex_item.exists()

                print '\tmsr extract exists: {0}'.format(msr_ex_exists)
                print '\tmsr extract completed: {0}'.format(msr_ex_completed)

                if msr_ex_completed == "Error":
                    extract_count += 1

                elif not msr_ex_completed:
                    extract_count += 1
                    if not dry_run:
                        # add to extract queue
                        msr_ex_item.add_to_queue("msr")

                else:
                    # add to merge list
                    merge_list.append(
                        (msr_ex_item.extract_path, 'release_data', msr_id))

            else:
                msr_count += 1
                extract_count += 1
                if not dry_run:
                    # add to msr tracker
                    msr_item.add_to_queue()


            msr_id += 1


        print "\nchecking raster data (dry=run = {})...".format(dry_run)
        for data in request["raster_data"]:

            if not data:
                continue

            name = data['name']

            for i in data["files"]:

                for extract_type in data["options"]["extract_types"]:

                    print ''
                    print '\tdataset: {0}'.format(name)
                    print '\tfile: {0}'.format(i['name'])
                    print '\textract type: {0}'.format(extract_type)
                    print ''

                    # temporal component is whatever follows the dataset
                    # name prefix in the file name
                    temporal = i["name"][len(data["name"])+1:]

                    extract_item = ExtractItem(self.config,
                                               request["boundary"]["name"],
                                               data["name"],
                                               i["name"],
                                               extract_type,
                                               temporal,
                                               self.extract_version)

                    # check if extract exists in queue and is completed
                    extract_exists, extract_completed = extract_item.exists()

                    print '\textract exists: {0}'.format(extract_exists)
                    print '\textract completed: {0}'.format(extract_completed)
                    print '\t--------------------'

                    # increment count if extract is not completed
                    # (whether it exists in queue or not)
                    if extract_completed == "Error" or not extract_completed:
                        extract_count += 1

                        # add to extract queue if it does not already
                        # exist in queue
                        if not dry_run:
                            extract_item.add_to_queue("raster")

                    else:
                        # add to merge list
                        merge_list.append(
                            (extract_item.extract_path, 'raster_data', None))


        print ''
        print 'missing msr count: {0}'.format(msr_count)
        print 'missing extract count: {0}'.format(extract_count)
        print ''

        missing_items = extract_count + msr_count

        return missing_items, merge_list


# =============================================================================
# =============================================================================
# =============================================================================


    def build_output(self, request, merge_list, branch):
        """build output

        merge extracts, generate documentation, update status,
            cleanup working directory, send final email
        """
        request_id = str(request['_id'])
        request['_id'] = request_id

        results_dir = os.path.join(self.config.branch_dir, "outputs/det/results")

        request_dir = os.path.join(results_dir, request_id)

        shutil.rmtree(request_dir, ignore_errors=True)

        merge_output = os.path.join(request_dir,
                                    "{0}_results.csv".format(request_id))

        # merge cached results if all are available
        merge_status = merge_file_list(merge_list, merge_output)

        if not merge_status:
            raise Exception('\tWarning: no extracts merged for '
                            'request_id = {0}').format(request_id)
        else:
            print '\tMerge completed for {0}'.format(request_id)


        # generate documentation
        doc_output = os.path.join(request_dir,
                                  "{0}_documentation.pdf".format(request_id))
        doc = DocBuilder(self.config, request, doc_output, self.config.det['download_server'])
        bd_status = doc.build_doc()
        # print bd_status


        # output request doc as json
        print "creating request json"
        rdoc_path = os.path.join(request_dir, "request_details.json")
        rdoc_file = open(rdoc_path, "w")
        json.dump(request, rdoc_file, indent=4)
        rdoc_file.close()


        geo_pdf_src = self.assets_dir + "/other/GeoQuery_Goodman2019.pdf"
        geo_pdf_dst = os.path.join(request_dir, "GeoQuery_Goodman2019.pdf")
        shutil.copyfile(geo_pdf_src, geo_pdf_dst)

        if not 'boundaries/gadm' in request['boundary']['path']:
            bnd_src = request['boundary']['path']
            bnd_dst = os.path.join(request_dir, os.path.basename(request['boundary']['path']))
            shutil.copyfile(bnd_src, bnd_dst)


        # # make msr json folder in request_dir
        # msr_jsons_dir = os.path.join(request_dir, 'msr_jsons')
        # make_dir(msr_jsons_dir)

        # # copy all msr jsons into msr json folder
        # for i in request['release_data']:
        #     tmp_dataset = i['dataset']
        #     tmp_hash = i['hash']

        #     src = "{0}/outputs/msr/done/{1}/{2}/summary.json".format(
        #         self.config.branch_dir, tmp_dataset, tmp_hash)
        #     dst = os.path.join(msr_jsons_dir, "{0}_{1}.json".format(
        #         tmp_dataset, tmp_hash))

        #     shutil.copyfile(src, dst)


        # make msr aid folder in request_dir
        msr_aid_dir = os.path.join(request_dir, 'raw_aid_data')
        make_dir(msr_aid_dir)

        # copy all aid csv into msr aid folder
        for i in request['release_data']:
            tmp_dataset = i['dataset']
            tmp_hash = i['hash']

            src = "{0}/outputs/msr/done/{1}/{2}/project_locations.csv".format(
                self.config.branch_dir, tmp_dataset, tmp_hash)
            dst = os.path.join(msr_aid_dir, "{0}_{1}.csv".format(
                tmp_dataset, tmp_hash))
            try:
                shutil.copyfile(src, dst)
            except:
                pass


        # make zip of request dir
        # shutil.make_archive(request_dir, "zip", request_dir)
        make_zipfile(request_dir, request_dir)

        # move zip of request dir into request dir
        shutil.move(request_dir + ".zip", request_dir)


        # remove unzipped files which do not need direct access
        os.remove(geo_pdf_dst)
        # shutil.rmtree(msr_jsons_dir)

        # set permissions
        os.chmod(request_dir, 0775)
        for ro, di, fi in os.walk(request_dir):
            for d in di:
                os.chmod(os.path.join(ro, d), 0775)
            for f in fi:
                os.chmod(os.path.join(ro, f), 0664)
Exemplo n.º 4
0
from config_utility import BranchConfig

# NOTE(review): `branch` is not defined in this fragment -- presumably
# assigned earlier in the full script; confirm before reuse
config = BranchConfig(branch=branch)
config.test_connection()

# -----------------------------------------------------------------------------

import time
from datetime import datetime

from bson.objectid import ObjectId

from email_utility import GeoEmail

email = GeoEmail(config)

dev_email = "*****@*****.**"
reply_to = "Geo Dev <{0}>".format(dev_email)

# non-zero connection_status means mongodb could not be reached
if config.connection_status != 0:

    print("Could not connect to mongodb (branch: {0})".format(branch))

    # NOTE(review): subject contains a {0} placeholder but .format is
    # never applied in this fragment -- confirm it is formatted later
    subject = "Geo ({0}) - Error : mongodb connection"

    message = ("""
        Could not connect to mongodb (branch: {0})

        """).format(branch)
Exemplo n.º 5
0
            We are able to make GeoQuery freely available thanks to the generosity of donors and
            open source data providers. These people love to hear about new research enabled by
            GeoQuery, and what kind of difference this research is making in the world.

            Also, we love feedback of all kinds. If something did not go the way you expected, we
            want to hear about that too.

            Thanks!
            \tAidData's GeoQuery Team
            """).format(email_list_str)


        mail_message = textwrap.dedent(mail_message)

        email = GeoEmail(config)

        mail_status = email.send_email(mail_to, mail_subject, mail_message)

        if not mail_status[0]:
            print mail_status[1]
            raise mail_status[2]


    # email any users who pass above filtering with request for comments
    # add "comments_requested" = 1 flag to all of their existing requests
    for ix, user_info in valid_df.iterrows():

        user_email = user_info["email"]

        if mode == "auto" and ix >= email_limit:
Exemplo n.º 6
0
class QueueToolBox():
    """utilty functions for processing requests in queue

    old cachetools descrip:
    Accepts request object and checks if all extracts have been processed
    """
    def __init__(self):
        # connection/config attributes are populated later by set_branch_info
        self.client = None

        # det request queue collection
        self.c_queue = None
        self.email = None

        self.config = None
        self.branch = None
        self.msr_version = None
        self.extract_version = None

        # resolution used when building msr request objects
        self.msr_resolution = 0.05

        # NOTE(review): self.branch is always None here (only assigned
        # later by set_branch_info), so self.dev is always " " -- confirm
        self.dev = " (dev) " if self.branch == "develop" else " "

    # def quit(self, rid, status, message):
    #     """exit function used for errors
    #     """
    #     self.update_status(rid, int(status))
    #     sys.exit(">> det processing error ("+str(status)+"): \n\t\t" +
    #               str(message))

    def set_branch_info(self, config=None):
        """get branch info from config collection

        Caches branch-specific settings (asset path, mongo client and
        det queue collection, email helper, versions) on the instance.

        Args
            config: branch config object (required despite the default;
                None raises an Exception)
        Returns
            the same config object
        """
        if config is None:
            raise Exception('no branch config found')

        # static assets (templates, pdfs) live under the branch source tree
        self.assets_dir = os.path.join(config.source_dir, 'geo-hpc/assets')

        self.config = config

        self.client = config.client

        # det request queue collection
        self.c_queue = self.client.asdf.det

        self.email = GeoEmail(config)

        self.branch = config.name
        self.msr_version = config.versions['msr']
        self.extract_version = config.versions['extracts']
        return config

    def check_id(self, rid):
        """verify request with given id exists

        Args
            rid (str): request id
        Returns
            (dict) request object or None when no document matches
        """
        # check document with request id exists
        search = self.c_queue.find_one({"_id": ObjectId(rid)})
        return search

    def get_requests(self, status, limit=0):
        """get requests from queue

        Args
            status (int): status value to match
            limit (int): max number of requests to return (0 = unlimited)

        Returns
            list of request documents, highest priority first and then
            oldest submission first; empty list when none match
        """
        search = self.c_queue.find({
            "status": status
        }).sort([("priority", -1), ("stage.0.time", 1)]).limit(limit)

        # an exhausted cursor yields an empty list anyway, so the previous
        # count() round trip and empty-result branch were redundant
        return list(search)

    def get_status(self, rid):
        """get status of request.

        Args
            rid (str): request id
        Returns
            (int) status value of the request document
        Raises
            re-raises any lookup failure after printing the request id
            (a missing document surfaces as a TypeError, since find_one
            returns None)
        """
        try:
            # check document with request id exists
            search = self.c_queue.find_one({"_id": ObjectId(rid)})
            status = search['status']
            return status
        except Exception as e:
            print "error retrieving status of request (id: {0})".format(rid)
            raise e

    def update_status(self, rid, status, is_prep=False):
        """ update status of request

        Args
            rid (str): request id
            status (int): new status code; must be a key of valid_stages
            is_prep (bool): when True, also stamp stage.1.time and attach
                the column info html to the request's info field
        Raises
            KeyError for a status code not in valid_stages; re-raises any
            mongo update failure after printing the request id and status
        """
        # statuses that also record a timestamp map to a stage field
        # NOTE(review): "2" -> stage.2.time but "1" -> stage.3.time looks
        # inverted -- confirm against the stage schema
        valid_stages = {
            "-2": None,
            "-1": None,
            "0": None,
            "2": "stage.2.time",
            "1": "stage.3.time"
        }

        ctime = int(time.time())

        updates = {"status": long(status)}

        stage = valid_stages[str(status)]
        if stage is not None:
            updates[stage] = ctime

        if is_prep:
            updates['stage.1.time'] = ctime

            # use a context manager so the template file handle is closed
            # (previously left open)
            with open(self.assets_dir + '/templates/column_info.txt') as f:
                column_info_html = f.read().replace('\n', '<br/>')
            updates['info'] = [column_info_html]

            # Example push/prepend for info field if needed for manual updates
            # https://docs.mongodb.com/manual/reference/operator/update/push/
            # https://stackoverflow.com/questions/10131957/can-you-have-mongo-push-prepend-instead-of-append

        try:
            # update request document
            self.c_queue.update({"_id": ObjectId(rid)}, {"$set": updates})

        except Exception as e:
            # fixed: closing paren was missing from the message string
            print('error updating status of request '
                  '(id: {0}, status: {1})'.format(rid, status))
            raise e

    def notify_received(self, request_id, email):
        """send email that request was received

        Args
            request_id (str): request id (first 7 chars used in subject)
            email (str): recipient address
        Raises
            re-raises the email error (after marking the request status
            as -2) when sending fails
        """
        mail_to = email

        mail_subject = ("AidData GeoQuery{0}- "
                        "Request {1}.. Received").format(
                            self.dev, request_id[:7])

        mail_message = ("Your request has been received. \n"
                        "You will receive an additional email when the"
                        " request has been completed. \n\n"
                        "The status of your request can be viewed using"
                        " the following link: \n"
                        "http://{0}/query/#!/status/{1}\n\n"
                        "You can also view all your current and previous "
                        "requests using: \n"
                        "http://{0}/query/#!/requests/{2}\n\n").format(
                            self.config.det['download_server'], request_id,
                            mail_to)

        mail_status = self.email.send_email(mail_to, mail_subject,
                                            mail_message)

        if not mail_status[0]:
            print mail_status[1]
            # mark request as errored; return value is not used
            update_status = self.update_status(request_id, -2)
            raise mail_status[2]

    def notify_completed(self, request_id, email):
        """send email that request was completed

        Args
            request_id (str): request id (first 7 chars used in subject)
            email (str): recipient address
        Raises
            re-raises the email error (after marking the request status
            as -2) when sending fails
        """
        mail_to = email

        mail_subject = ("AidData GeoQuery{0}- "
                        "Request {1}.. Completed").format(
                            self.dev, request_id[:7])

        mail_message = ("""
            Your request has been completed.

            You can review your request and download the results using the following page:
            \thttp://{0}/query/#!/status/{1}

            or download the results directly (this link will always be available):
            \thttp://{0}/data/geoquery_results/{1}/{1}.zip


            You can also view all your current and previous requests using:
            \thttp://{0}/query/#!/requests/{2}


            If you have not done so before or if you would like to provide additional feedback,
            please fill out this brief survey regarding your experience with geo(query).
            \thttps://goo.gl/4WZ46M


            Citations enable continued support and development of this tool.
            As a part of your use of this tool, you have agreed to cite geo(query)
            in any derived products (including academic publications, reports, or other works).
                \tGoodman, S., BenYishay, A., Runfola, D., 2016. Overview of the geo Framework.
                \tAidData. Available online at http://geo.aiddata.org. DOI: 10.13140/RG.2.2.28363.59686


            Thank you,
            \tThe AidData Team
            """).format(self.config.det['download_server'], request_id,
                        mail_to)

        # NOTE(review): unlike the other variant of this method, the
        # message is not passed through textwrap.dedent here -- confirm
        mail_status = self.email.send_email(mail_to, mail_subject,
                                            mail_message)

        if not mail_status[0]:
            print mail_status[1]
            # mark request as errored; return value is not used
            update_status = self.update_status(request_id, -2)
            raise mail_status[2]

# =============================================================================
# =============================================================================
# =============================================================================

    def check_request(self, request, dry_run=False):
        """check entire request object for cache
        """
        merge_list = []
        extract_count = 0
        msr_count = 0

        # id used for field names in results
        msr_id = 1

        print "\nchecking aid data..."
        for ix, raw_data in enumerate(request['release_data']):

            # # mongo was defaulting to 32bit vals for numbers on insert
            # # making float should prevent that
            # if 'total_commitments' in raw_data['filters']:
            #     raw_data['filters']['total_commitments'] = [
            #         float(i) for i in raw_data['filters']['total_commitments']]
            # if 'total_disbursements' in raw_data['filters']:
            #     raw_data['filters']['total_disbursements'] = [
            #         float(i) for i in raw_data['filters']['total_disbursements']]

            # remove filters without actual fields
            tmp_filters = {}
            if len(raw_data['filters']) > 0:
                tmp_filters = {
                    fk: fv
                    for fk, fv in raw_data['filters'].iteritems()
                    if not any([fvx in ['All', 'None', None] for fvx in fv])
                }

            # msr request object format
            data = {
                'dataset': raw_data['dataset'],
                'type': 'release',
                'resolution': self.msr_resolution,
                'version': self.msr_version,
                'filters': tmp_filters
            }

            # get hash of msr request object
            data_hash = json_sha1_hash(data)

            # add hash to request
            if not 'hash' in raw_data or raw_data['hash'] == data_hash:
                self.c_queue.update(
                    {"_id": ObjectId(request['_id'])},
                    {"$set": {
                        'release_data.' + str(ix) + '.hash': data_hash
                    }})

            print ''
            print '\t{0}'.format(data_hash)
            print '\t{0}'.format(data)
            print '\t----------'

            msr_item = MSRItem(self.config, data_hash, data)

            # check if msr exists in queue and is completed
            msr_exists, msr_completed = msr_item.exists()

            print '\tmsr exists: {0}'.format(msr_exists)
            print '\tmsr completed: {0}'.format(msr_completed)

            if msr_completed == True:

                tmp_extract_type = 'reliability'
                # if data["dataset"].startswith(('worldbank', 'globalenvironmentfacility')):
                #     tmp_extract_type = 'sum'

                msr_ex_item = ExtractItem(self.config,
                                          request["boundary"]["name"],
                                          data["dataset"],
                                          data["dataset"] + '_' + data_hash,
                                          tmp_extract_type, data_hash,
                                          self.extract_version)

                msr_ex_exists, msr_ex_completed = msr_ex_item.exists()

                print '\tmsr extract exists: {0}'.format(msr_ex_exists)
                print '\tmsr extract completed: {0}'.format(msr_ex_completed)

                if msr_ex_completed == "Error":
                    extract_count += 1

                elif not msr_ex_completed:
                    extract_count += 1
                    if not dry_run:
                        # add to extract queue
                        msr_ex_item.add_to_queue("msr")

                else:
                    # add to merge list
                    merge_list.append(
                        (msr_ex_item.extract_path, 'release_data', msr_id))

            else:
                msr_count += 1
                extract_count += 1
                if not dry_run:
                    # add to msr tracker
                    msr_item.add_to_queue()

            msr_id += 1

        print "\nchecking external data..."
        for data in request["raster_data"]:
            name = data['name']

            for i in data["files"]:

                for extract_type in data["options"]["extract_types"]:

                    print ''
                    print '\tdataset: {0}'.format(name)
                    print '\tfile: {0}'.format(i['name'])
                    print '\textract type: {0}'.format(extract_type)
                    print ''

                    temporal = i["name"][len(data["name"]) + 1:]

                    extract_item = ExtractItem(self.config,
                                               request["boundary"]["name"],
                                               data["name"], i["name"],
                                               extract_type, temporal,
                                               self.extract_version)

                    # check if extract exists in queue and is completed
                    extract_exists, extract_completed = extract_item.exists()

                    print '\textract exists: {0}'.format(extract_exists)
                    print '\textract completed: {0}'.format(extract_completed)
                    print '\t--------------------'

                    # incremenet count if extract is not completed
                    # (whether it exists in queue or not)
                    if extract_completed == "Error":
                        extract_count += 1

                    elif not extract_completed:
                        extract_count += 1

                        # add to extract queue if it does not already
                        # exist in queue
                        if not dry_run:
                            extract_item.add_to_queue("raster")

                    else:
                        # add to merge list
                        merge_list.append(
                            (extract_item.extract_path, 'raster_data', None))

        print ''
        print 'missing msr count: {0}'.format(msr_count)
        print 'missing extract count: {0}'.format(extract_count)
        print ''

        missing_items = extract_count + msr_count

        return missing_items, merge_list

# =============================================================================
# =============================================================================
# =============================================================================

    def build_output(self, request, merge_list, branch):
        """build output

        merge extracts, generate documentation, update status,
            cleanup working directory, send final email
        """
        request_id = str(request['_id'])
        request['_id'] = request_id

        results_dir = os.path.join(self.config.branch_dir,
                                   "outputs/det/results")

        request_dir = os.path.join(results_dir, request_id)

        shutil.rmtree(request_dir, ignore_errors=True)

        merge_output = os.path.join(request_dir,
                                    "{0}_results.csv".format(request_id))

        # merge cached results if all are available
        merge_status = merge_file_list(merge_list, merge_output)

        if not merge_status:
            raise Exception('\tWarning: no extracts merged for '
                            'request_id = {0}').format(request_id)
        else:
            print '\tMerge completed for {0}'.format(request_id)

        # generate documentation
        doc_output = os.path.join(request_dir,
                                  "{0}_documentation.pdf".format(request_id))
        doc = DocBuilder(self.config, request, doc_output,
                         self.config.det['download_server'])
        bd_status = doc.build_doc()
        # print bd_status

        # output request doc as json
        print "creating request json"
        rdoc_path = os.path.join(request_dir, "request_details.json")
        rdoc_file = open(rdoc_path, "w")
        json.dump(request, rdoc_file, indent=4)
        rdoc_file.close()

        geo_pdf_src = self.assets_dir + "/other/IntroducingtheAidDataGeoFramework.pdf"
        geo_pdf_dst = os.path.join(request_dir,
                                   "IntroducingtheAidDataGeoFramework.pdf")
        shutil.copyfile(geo_pdf_src, geo_pdf_dst)

        if not 'boundaries/gadm' in request['boundary']['path']:
            bnd_src = request['boundary']['path']
            bnd_dst = os.path.join(
                request_dir, os.path.basename(request['boundary']['path']))
            shutil.copyfile(bnd_src, bnd_dst)

        # # make msr json folder in request_dir
        # msr_jsons_dir = os.path.join(request_dir, 'msr_jsons')
        # make_dir(msr_jsons_dir)

        # # copy all msr jsons into msr json folder
        # for i in request['release_data']:
        #     tmp_dataset = i['dataset']
        #     tmp_hash = i['hash']

        #     src = "{0}/outputs/msr/done/{1}/{2}/summary.json".format(
        #         self.config.branch_dir, tmp_dataset, tmp_hash)
        #     dst = os.path.join(msr_jsons_dir, "{0}_{1}.json".format(
        #         tmp_dataset, tmp_hash))

        #     shutil.copyfile(src, dst)

        # make msr aid folder in request_dir
        msr_aid_dir = os.path.join(request_dir, 'raw_aid_data')
        make_dir(msr_aid_dir)

        # copy all aid csv into msr aid folder
        for i in request['release_data']:
            tmp_dataset = i['dataset']
            tmp_hash = i['hash']

            src = "{0}/outputs/msr/done/{1}/{2}/project_locations.csv".format(
                self.config.branch_dir, tmp_dataset, tmp_hash)
            dst = os.path.join(msr_aid_dir,
                               "{0}_{1}.csv".format(tmp_dataset, tmp_hash))
            try:
                shutil.copyfile(src, dst)
            except:
                pass

        # make zip of request dir
        shutil.make_archive(request_dir, "zip", request_dir)

        # move zip of request dir into request dir
        shutil.move(request_dir + ".zip", request_dir)

        # remove unzipped files which do not need direct access
        os.remove(geo_pdf_dst)
        # shutil.rmtree(msr_jsons_dir)

        # set permissions
        os.chmod(request_dir, 0775)
        for ro, di, fi in os.walk(request_dir):
            for d in di:
                os.chmod(os.path.join(ro, d), 0775)
            for f in fi:
                os.chmod(os.path.join(ro, f), 0664)
Exemplo n.º 7
0
class QueueToolBox():
    """Utility functions for processing requests in the det queue.

    old cachetools descrip:
    Accepts request object and checks if all extracts have been processed
    """
    def __init__(self):
        # mongo client and det queue collection; populated by
        # set_branch_info once a branch config is available
        self.client = None

        self.c_queue = None
        self.email = None

        self.config = None
        self.branch = None
        self.msr_version = None
        self.extract_version = None

        # fixed resolution used for all msr requests
        self.msr_resolution = 0.05

        # marker inserted into email subjects on the develop branch.
        # bug fix: self.branch is always None during __init__, so the
        # original conditional here could never yield " (dev) "; the
        # real value is now set in set_branch_info.
        self.dev = " "


    def set_branch_info(self, config=None):
        """get branch info from config collection

        Args
            config: active branch config object
        Returns
            the config object passed in
        Raises
            Exception: if no config is given
        """
        if config is None:
            raise Exception('no branch config found')

        self.assets_dir = os.path.join(config.source_dir, 'geo-hpc/assets')

        self.config = config

        self.client = config.client

        self.c_queue = self.client.asdf.det

        self.email = GeoEmail(config)

        self.branch = config.name
        self.msr_version = config.versions['msr']
        self.extract_version = config.versions['extracts']

        # recompute the dev subject marker now that the branch is known
        self.dev = " (dev) " if self.branch == "develop" else " "

        return config


    def check_id(self, rid):
        """verify request with given id exists

        Args
            rid (str): request id
        Returns
            (dict) request object or None
        """
        # check document with request id exists
        return self.c_queue.find_one({"_id": ObjectId(rid)})


    def get_requests(self, status, limit=0):
        """get requests from queue

        Args
            status (int): status code to match
            limit (int): max number of requests to return (0 = no limit)

        Returns
            list of request objects (possibly empty)
        """
        # highest priority first; oldest submission first within a
        # priority level
        search = self.c_queue.find({
            "status": status
        }).sort([("priority", -1), ("stage.0.time", 1)]).limit(limit)

        # materializing the cursor yields [] when nothing matched,
        # making the original count() pre-check unnecessary
        return list(search)


    def get_status(self, rid):
        """get status of request.

        Args
            rid (str): request id
        Returns
            (int) request status
        Raises
            re-raises any lookup failure after logging the request id
        """
        try:
            # check document with request id exists
            search = self.c_queue.find_one({"_id": ObjectId(rid)})
            status = search['status']
            return status
        except Exception as e:
            print "error retrieving status of request (id: {0})".format(rid)
            raise e


    def update_status(self, rid, status, is_prep=False):
        """update status (and stage timestamps) of request

        Args
            rid (str): request id
            status (int): new status code (must be a key of valid_stages)
            is_prep (bool): also stamp stage.1 time and attach column
                info html to the request
        """
        # maps status code to the stage timestamp it completes
        # NOTE(review): status 1 stamps stage.3 while status 2 stamps
        # stage.2 — looks intentional but worth confirming
        valid_stages = {
            "-2": None,
            "-1": None,
            "0": None,
            "2": "stage.2.time",
            "1": "stage.3.time"
        }

        ctime = int(time.time())

        updates = {
            "status": long(status)
        }

        stage = valid_stages[str(status)]
        if stage is not None:
            updates[stage] = ctime


        if is_prep:
            updates['stage.1.time'] = ctime

            # column info template is surfaced to users with results
            column_info_html = ''.join(
                open(self.assets_dir + '/templates/column_info.txt')
            ).replace('\n', '<br/>')

            updates['info'] = [column_info_html]

            # Example push/prepend for info field if needed for manual updates
            # https://docs.mongodb.com/manual/reference/operator/update/push/
            # https://stackoverflow.com/questions/10131957/can-you-have-mongo-push-prepend-instead-of-append

        try:
            # update request document
            self.c_queue.update({"_id": ObjectId(rid)},
                                {"$set": updates})

        except Exception as e:
            # bug fix: message was missing its closing paren
            print ('error updating status of request '
                   '(id: {0}, status: {1})').format(rid, status)
            raise e


    def notify_received(self, request_id, email):
        """send email that request was received

        Args
            request_id (str): request id
            email (str): recipient address
        Raises
            re-raises the mail error after setting request status to -2
        """
        mail_to = email

        mail_subject = ("AidData GeoQuery{0}- "
                        "Request {1}.. Received").format(self.dev, request_id[:7])

        mail_message = (
            """
            Thanks for using GeoQuery. This is an automated email to let you
            know that we received your request and will process your data as
            soon as we can. We will send another email when your data is ready.

            You can view the status of this data request here:
            http://{0}/query/#!/status/{1}

            You can also keep track of all of the data requests that you've
            submitted with this email address here:
            http://{0}/query/#!/requests/{2}

            Thank you,
            \tAidData's GeoQuery Team
            """).format(self.config.det['download_server'], request_id, mail_to)

        mail_message = textwrap.dedent(mail_message)

        # mail_status = (success flag, log message, exception)
        mail_status = self.email.send_email(mail_to, mail_subject, mail_message)

        if not mail_status[0]:
            print mail_status[1]
            # flag the request as errored before propagating
            self.update_status(request_id, -2)
            raise mail_status[2]


    def notify_completed(self, request_id, email):
        """send email that request was completed

        Args
            request_id (str): request id
            email (str): recipient address
        Raises
            re-raises the mail error after setting request status to -2
        """
        mail_to = email

        mail_subject = ("AidData GeoQuery{0}- "
                        "Request {1}.. Completed").format(self.dev, request_id[:7])

        mail_message = (
            """
            Thanks again for using GeoQuery. This is another automated email to let you know that
            your data is ready.

            You can review your request, and download the results and documentation here:
            \thttp://{0}/query/#!/status/{1}

            Or download the results directly (this link will always be available):
            \thttp://{0}/data/geoquery_results/{1}/{1}.zip

            You can also view all your current and previous requests using:
            \thttp://{0}/query/#!/requests/{2}

            Also, one quick reminder about citations. Don't forget to cite both AidData's GeoQuery
            tool as well as each dataset you selected within GeoQuery. All citations can be found
            in the Documentation PDF at the link above. Here's the correct citation for GeoQuery:

                Goodman, S., BenYishay, A., Lv, Z., & Runfola, D. (2019).
                GeoQuery: Integrating HPC systems and public web-based
                geospatial data tools. Computers & Geosciences, 122, 103-112.

            Thank you in advance for citing us when you publish your research.
            This helps us to demonstrate how GeoQuery is making a difference
            as a freely available public good.

            Thank you,
            \tAidData's GeoQuery Team
            """).format(self.config.det['download_server'], request_id, mail_to)

        mail_message = textwrap.dedent(mail_message)

        mail_status = self.email.send_email(mail_to, mail_subject, mail_message)

        if not mail_status[0]:
            print mail_status[1]
            # flag the request as errored before propagating
            self.update_status(request_id, -2)
            raise mail_status[2]


    def notify_comments(self, email):
        """send email asking for information about their work using geoquery

        Args
            email (str): recipient address
        Raises
            re-raises the underlying mail error on failure
        """
        mail_to = email

        mail_subject = ("Was GeoQuery{0} helpful?").format(self.dev)

        mail_message = (
            """
            Hello there!

            We would like to hear about your experience using AidData's GeoQuery tool. Would you
            please respond to this email with a couple sentences about how GeoQuery has helped you?

            We are able to make GeoQuery freely available thanks to the generosity of donors and
            open source data providers. These people love to hear about new research enabled by
            GeoQuery, and what kind of difference this research is making in the world.

            Also, we love feedback of all kinds. If something did not go the way you expected, we
            want to hear about that too.

            Thanks!
            \tAidData's GeoQuery Team
            """)

        mail_message = textwrap.dedent(mail_message)

        mail_status = self.email.send_email(mail_to, mail_subject, mail_message)

        if not mail_status[0]:
            print mail_status[1]
            raise mail_status[2]


# =============================================================================
# =============================================================================
# =============================================================================


    def check_request(self, request, dry_run=False):
        """check entire request object for cache

        Walks the request's msr (release) and raster datasets, queuing
        any missing msr builds / extract jobs (unless dry_run) and
        collecting paths of completed extracts for merging.

        Args
            request (dict): request document from the det queue
            dry_run (bool): when True only count missing items; do not
                add anything to the msr/extract queues
        Returns
            tuple (count of missing items,
                   merge list of (csv path, data group, msr id) tuples)
        """
        merge_list = []
        extract_count = 0
        msr_count = 0

        # id used for field names in results
        msr_id = 1

        print "\nchecking aid data (dry_run = {})...".format(dry_run)
        for ix, raw_data in enumerate(request['release_data']):

            # drop filters whose values are placeholders ('All'/'None')
            # rather than actual field selections
            tmp_filters = {}
            if len(raw_data['filters']) > 0:
                tmp_filters = {
                    fk: fv
                    for fk, fv in raw_data['filters'].iteritems()
                    if not any([fvx in ['All', 'None', None] for fvx in fv])
                }

            # msr request object format
            data = {
                'dataset': raw_data['dataset'],
                'type': 'release',
                'resolution': self.msr_resolution,
                'version': self.msr_version,
                'filters': tmp_filters
            }

            # get hash of msr request object
            data_hash = json_sha1_hash(data)

            # store hash on the request when missing or stale
            # (bug fix: original compared with == and therefore only
            # rewrote the hash when it already matched, leaving a stale
            # hash in place whenever the request data changed)
            if 'hash' not in raw_data or raw_data['hash'] != data_hash:
                self.c_queue.update(
                    {"_id": ObjectId(request['_id'])},
                    {"$set": {
                        'release_data.' + str(ix) + '.hash': data_hash
                    }})

            print ''
            print '\t{0}'.format(data_hash)
            print '\t{0}'.format(data)
            print '\t----------'

            msr_item = MSRItem(self.config, data_hash, data)

            # check if msr exists in queue and is completed
            msr_exists, msr_completed = msr_item.exists()

            print '\tmsr exists: {0}'.format(msr_exists)
            print '\tmsr completed: {0}'.format(msr_completed)

            if msr_completed == True:

                tmp_extract_type = 'reliability'

                msr_ex_item = ExtractItem(self.config,
                                          request["boundary"]["name"],
                                          data["dataset"],
                                          data["dataset"] + '_' + data_hash,
                                          tmp_extract_type,
                                          data_hash,
                                          self.extract_version)

                msr_ex_exists, msr_ex_completed = msr_ex_item.exists()

                print '\tmsr extract exists: {0}'.format(msr_ex_exists)
                print '\tmsr extract completed: {0}'.format(msr_ex_completed)

                if msr_ex_completed == "Error":
                    # errored extracts count as missing but are not
                    # automatically requeued
                    extract_count += 1

                elif not msr_ex_completed:
                    extract_count += 1
                    if not dry_run:
                        # add to extract queue
                        msr_ex_item.add_to_queue("msr")

                else:
                    # add to merge list
                    merge_list.append(
                        (msr_ex_item.extract_path, 'release_data', msr_id))

            else:
                # msr itself is missing, so both the msr build and its
                # extract are outstanding
                msr_count += 1
                extract_count += 1
                if not dry_run:
                    # add to msr tracker
                    msr_item.add_to_queue()

            msr_id += 1


        print "\nchecking raster data (dry_run = {})...".format(dry_run)
        for data in request["raster_data"]:
            name = data['name']

            for i in data["files"]:

                for extract_type in data["options"]["extract_types"]:

                    print ''
                    print '\tdataset: {0}'.format(name)
                    print '\tfile: {0}'.format(i['name'])
                    print '\textract type: {0}'.format(extract_type)
                    print ''

                    # temporal component is the file name suffix after
                    # the dataset name and separator
                    temporal = i["name"][len(data["name"]) + 1:]

                    extract_item = ExtractItem(self.config,
                                               request["boundary"]["name"],
                                               data["name"],
                                               i["name"],
                                               extract_type,
                                               temporal,
                                               self.extract_version)

                    # check if extract exists in queue and is completed
                    extract_exists, extract_completed = extract_item.exists()

                    print '\textract exists: {0}'.format(extract_exists)
                    print '\textract completed: {0}'.format(extract_completed)
                    print '\t--------------------'

                    # increment count if extract is not completed
                    # (whether it exists in queue or not)
                    if extract_completed == "Error":
                        # consistency fix: match the msr handling above —
                        # errored extracts are counted but not requeued
                        # (original requeued errored raster extracts on
                        # every pass)
                        extract_count += 1

                    elif not extract_completed:
                        extract_count += 1

                        # add to extract queue if it does not already
                        # exist in queue
                        if not dry_run:
                            extract_item.add_to_queue("raster")

                    else:
                        # add to merge list
                        merge_list.append(
                            (extract_item.extract_path, 'raster_data', None))


        print ''
        print 'missing msr count: {0}'.format(msr_count)
        print 'missing extract count: {0}'.format(extract_count)
        print ''

        missing_items = extract_count + msr_count

        return missing_items, merge_list


# =============================================================================
# =============================================================================
# =============================================================================


    def build_output(self, request, merge_list, branch):
        """build output

        merge extracts, generate documentation, update status,
            cleanup working directory, send final email

        Args
            request (dict): request document from the det queue
            merge_list (list): (csv path, data group, msr id) tuples
                produced by check_request
            branch (str): unused; kept for interface compatibility
        Raises
            Exception: if no extracts could be merged
        """
        request_id = str(request['_id'])
        # ObjectId is not json serializable; replace with its string
        # form before the request document is dumped below
        request['_id'] = request_id

        results_dir = os.path.join(self.config.branch_dir,
                                   "outputs/det/results")

        request_dir = os.path.join(results_dir, request_id)

        # start from a clean directory in case of a previous partial run
        shutil.rmtree(request_dir, ignore_errors=True)

        merge_output = os.path.join(request_dir,
                                    "{0}_results.csv".format(request_id))

        # merge cached results if all are available
        merge_status = merge_file_list(merge_list, merge_output)

        if not merge_status:
            # bug fix: format the message, not the Exception instance
            # (the original `raise Exception(...).format(...)` raised
            # AttributeError instead of the intended error)
            raise Exception('\tWarning: no extracts merged for '
                            'request_id = {0}'.format(request_id))
        else:
            print '\tMerge completed for {0}'.format(request_id)


        # generate documentation
        doc_output = os.path.join(request_dir,
                                  "{0}_documentation.pdf".format(request_id))
        doc = DocBuilder(self.config, request, doc_output,
                         self.config.det['download_server'])
        doc.build_doc()


        # output request doc as json
        print "creating request json"
        rdoc_path = os.path.join(request_dir, "request_details.json")
        with open(rdoc_path, "w") as rdoc_file:
            json.dump(request, rdoc_file, indent=4)


        # include geoquery citation pdf (zipped copy only; removed below)
        geo_pdf_src = self.assets_dir + "/other/GeoQuery_Goodman2019.pdf"
        geo_pdf_dst = os.path.join(request_dir, "GeoQuery_Goodman2019.pdf")
        shutil.copyfile(geo_pdf_src, geo_pdf_dst)

        # include a copy of custom (non gadm) boundary files
        if 'boundaries/gadm' not in request['boundary']['path']:
            bnd_src = request['boundary']['path']
            bnd_dst = os.path.join(
                request_dir, os.path.basename(request['boundary']['path']))
            shutil.copyfile(bnd_src, bnd_dst)


        # make msr aid folder in request_dir
        msr_aid_dir = os.path.join(request_dir, 'raw_aid_data')
        make_dir(msr_aid_dir)

        # copy all aid csv into msr aid folder
        for i in request['release_data']:
            tmp_dataset = i['dataset']
            tmp_hash = i['hash']

            src = "{0}/outputs/msr/done/{1}/{2}/project_locations.csv".format(
                self.config.branch_dir, tmp_dataset, tmp_hash)
            dst = os.path.join(msr_aid_dir, "{0}_{1}.csv".format(
                tmp_dataset, tmp_hash))
            try:
                shutil.copyfile(src, dst)
            except (IOError, OSError):
                # best effort: some msr outputs may not include a
                # project_locations csv — TODO confirm with msr pipeline
                pass


        # make zip of request dir
        shutil.make_archive(request_dir, "zip", request_dir)

        # move zip of request dir into request dir
        shutil.move(request_dir + ".zip", request_dir)


        # remove unzipped files which do not need direct access
        os.remove(geo_pdf_dst)

        # set permissions (group writable dirs, group readable files)
        os.chmod(request_dir, 0775)
        for ro, di, fi in os.walk(request_dir):
            for d in di:
                os.chmod(os.path.join(ro, d), 0775)
            for f in fi:
                os.chmod(os.path.join(ro, f), 0664)
Exemplo n.º 8
0
config = BranchConfig(branch=branch)
config.test_connection()

# -----------------------------------------------------------------------------


import time
from datetime import datetime

from bson.objectid import ObjectId

from email_utility import GeoEmail


email = GeoEmail(config)

dev_email = "*****@*****.**"
reply_to = "Geo Dev <{0}>".format(dev_email)


if config.connection_status != 0:

    print "Could not connect to mongodb (branch: {0})".format(branch)

    subject = "Geo ({0}) - Error : mongodb connection"

    message = (
        """
        Could not connect to mongodb (branch: {0})
Exemplo n.º 9
0
            please respond to this email with a couple sentences about how GeoQuery has helped you?

            We are able to make GeoQuery freely available thanks to the generosity of donors and
            open source data providers. These people love to hear about new research enabled by
            GeoQuery, and what kind of difference this research is making in the world.

            Also, we love feedback of all kinds. If something did not go the way you expected, we
            want to hear about that too.

            Thanks!
            \tAidData's GeoQuery Team
            """).format(email_list_str)

        mail_message = textwrap.dedent(mail_message)

        email = GeoEmail(config)

        mail_status = email.send_email(mail_to, mail_subject, mail_message)

        if not mail_status[0]:
            print mail_status[1]
            raise mail_status[2]

    # email any users who pass above filtering with request for comments
    # add "comments_requested" = 1 flag to all of their existing requests
    for ix, user_info in valid_df.iterrows():

        user_email = user_info["email"]

        if mode == "auto" and ix >= email_limit:
            print "\n Warning: maximum emails reached. Exiting."