Example #1
    def call_web_service(self, request):

        sdlog.debug("SYDPROXY-100",
                    "Search-API call started (%s)." % request.get_url())

        try:
            response = sdnetutils.call_web_service(
                request.get_url(), timeout=sdconst.SEARCH_API_HTTP_TIMEOUT
            )  # returns Response object
        except:

            # if an exception occurs in the sdnetutils.call_web_service() method, all
            # previous calls to this method inside this paginated call are also
            # cancelled

            # we reset the offset so the paginated call can be restarted from the beginning next time
            # (maybe overkill as the offset is reinitialized when entering the 'call_web_service__PAGINATION()' func)
            request.offset = 0

            raise

        sdlog.info(
            "SYDPROXY-100",
            "Search-API call completed (returned-files-count=%i,match-count=%i,url=%s)."
            % (response.count(), response.num_found, request.get_url()))

        return response
Example #2
    def _reload(self, force=False):
        """Internal method to reload the dictionary of endpoints if the file has changed since it was last read"""

        if self.filepath:
            modtime = file_modification_datetime(self.filepath)
            if force or modtime > self.modtime:
                sdlog.debug("SDDMGLOB-014", "Loading endpoints from: %s, last modified: %s" % (self.filepath, modtime))
                self.modtime = modtime
                endpoints = {}

                # read XML file
                with open(self.filepath, "r") as myfile:
                    xml = myfile.read().replace("\n", "")

                # <endpoints xmlns="http://www.esgf.org/whitelist">
                root = fromstring(xml)
                # <endpoint name="esg#jpl" gridftp="esg-datanode.jpl.nasa.gov:2811" />
                for endpoint in root.findall("{%s}endpoint" % "http://www.esgf.org/whitelist"):
                    gridftp = endpoint.attrib["gridftp"]
                    name = endpoint.attrib["name"]
                    path_out = endpoint.attrib.get("path_out", None)
                    path_in = endpoint.attrib.get("path_in", None)
                    endpoints[gridftp] = Endpoint(name, path_out=path_out, path_in=path_in)
                    sdlog.debug("SDDMGLOB-018", "Using Globus endpoint %s : %s (%s --> %s)" % (gridftp, name, path_out, path_in))

                # switch the dictionary of endpoints after reading
                self.endpoints = endpoints
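
For context, here is a minimal, self-contained sketch of the XML parsing performed by _reload() above. The element and attribute names are taken from the comments in the example itself; the sample endpoint values are the ones quoted in those comments:

from xml.etree.ElementTree import fromstring

NS = "http://www.esgf.org/whitelist"

# tiny in-memory stand-in for the endpoints whitelist file
xml = ('<endpoints xmlns="%s">'
       '<endpoint name="esg#jpl" gridftp="esg-datanode.jpl.nasa.gov:2811" />'
       '</endpoints>') % NS

root = fromstring(xml)
for endpoint in root.findall("{%s}endpoint" % NS):
    # 'name' and 'gridftp' are mandatory; 'path_out'/'path_in' are optional
    print("%s -> %s (path_out=%s, path_in=%s)" % (
        endpoint.attrib["gridftp"], endpoint.attrib["name"],
        endpoint.attrib.get("path_out"), endpoint.attrib.get("path_in")))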
Example #3
def run(o,attached_parameters):
    """This func adds some parameters to the result of a query.
    
    Note
        The idea is to keep some parameters around by making them jump
        over the search call (e.g. Search-API call, SQL call, ...), from
        the 'query pipeline' to the 'file pipeline'.
    """
    assert isinstance(attached_parameters, dict)

    sdlog.debug("SYDADDAP-620","Add attached_parameters..")

    if isinstance(o,sdtypes.Metadata):

        po=sdpipelineprocessing.ProcessingObject(add_attached_parameters,attached_parameters)
        o=sdpipelineprocessing.run_pipeline(o,po)

    elif isinstance(o,sdtypes.Response):

        # no need to process chunk by chunk here, as a Response only contains a small amount of data (< sdconst.SEARCH_API_CHUNKSIZE)

        files=add_attached_parameters(o.get_files(),attached_parameters)
        o.set_files(files)

    else:
        assert False

    sdlog.debug("SYDADDAP-628","attached_parameters added")

    return o
Example #4
def main_loop():
    import sdapp, sdlog, sdtaskscheduler # must be here because of double-fork
    import sddb # this is to create database objects if not done already

    sdlog.info('SDDAEMON-001',"Daemon starting ...")

    try:
        sdtaskscheduler.event_loop()
    except SDException, e:
        level=sdconfig.config.get('log','verbosity_level')

        if level=='debug':
            # We log everything in debug mode no matter the exception type

            sdlog.debug('SDDAEMON-008',"Exception occurred (%s)"%str(e))
        else:
            if isinstance(e,SDException):
                # In this case, we only print the exception code, as the errmsg
                # is likely to be there already (i.e. the low-level func should have
                # logged information about this exception).
                # The primary reason for this is to have a clear log entry
                # when authentication fails (e.g. ESGF is down or the openid is incorrect)

                sdlog.info('SDDAEMON-010',"Exception occurred (%s)"%str(e.code))
            else:
                # This case should not occur, so we log everything to help debugging

                sdlog.info('SDDAEMON-012',"Exception occurred (%s)"%str(e))
Example #5
def main_loop():
    import sdapp, sdlog, sdtaskscheduler  # must be here because of double-fork
    import sddb  # this is to create database objects if not done already

    sdlog.info('SDDAEMON-001', "Daemon starting ...")

    try:
        sdtaskscheduler.event_loop()
    except SDException, e:
        level = sdconfig.config.get('log', 'verbosity_level')

        if level == 'debug':
            # We log everything in debug mode no matter the exception type

            sdlog.debug('SDDAEMON-008', "Exception occurred (%s)" % str(e))
        else:
            if isinstance(e, SDException):
                # In this case, we only print the exception code, as the errmsg
                # is likely to be there already (i.e. the low-level func should have
                # logged information about this exception).
                # The primary reason for this is to have a clear log entry
                # when authentication fails (e.g. ESGF is down or the openid is incorrect)

                sdlog.info('SDDAEMON-010',
                           "Exception occurred (%s)" % str(e.code))
            else:
                # This case should not occur, so we log everything to help debugging

                sdlog.info('SDDAEMON-012', "Exception occurred (%s)" % str(e))
Example #6
def map_to_globus(url):
    parsed_url = urlparse(url)

    # 'globus' scheme
    if parsed_url.scheme == "globus":
        slash_index = parsed_url.path.find("/")
        src_endpoint = parsed_url.path[0:slash_index]
        src_path = parsed_url.path[slash_index:]
        return src_endpoint, src_path, src_path

    # 'gridftp' scheme
    hostname = parsed_url.netloc
    src_endpoint = None
    src_path = re.sub("/+", "/", parsed_url.path)
    path = src_path
    if hostname in globus_endpoints:
        src_endpoint = globus_endpoints[hostname].name
        path_out = globus_endpoints[hostname].path_out
        path_in = globus_endpoints[hostname].path_in
        if path_out:
            # strip the GridFTP path prefix (the result must be re-assigned)
            src_path = src_path.replace(path_out, "", 1)
        if path_in:
            # prepend the Globus endpoint path prefix
            src_path = path_in + src_path
    sdlog.debug("SDDMGLOB-024", "Mapped url %s to %s%s" % (url, src_endpoint, src_path))
    return src_endpoint, src_path, path
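
To make the path mapping above concrete, here is a small standalone sketch of the rewrite applied to a gridftp URL. The strip-path_out / prepend-path_in behaviour mirrors the (corrected) code above, but the endpoint paths and the URL are made-up values, not real whitelist entries:

import re
from urlparse import urlparse  # Python 2; on Python 3: from urllib.parse import urlparse

# hypothetical whitelist entry for the host below
path_out = "/data"        # prefix exposed by the GridFTP server
path_in = "/esgf/data"    # prefix expected by the Globus endpoint

url = "gsiftp://esg-datanode.jpl.nasa.gov:2811//data/cmip5/some_file.nc"
src_path = re.sub("/+", "/", urlparse(url).path)  # /data/cmip5/some_file.nc
src_path = src_path.replace(path_out, "", 1)      # /cmip5/some_file.nc
src_path = path_in + src_path                     # /esgf/data/cmip5/some_file.nc
print(src_path)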
Example #7
def run(o, attached_parameters):
    """This func adds some parameters to the result of a query.
    
    Note
        The idea is to keep some parameters around by making them jump
        over the search call (e.g. Search-API call, SQL call, ...), from
        the 'query pipeline' to the 'file pipeline'.
    """
    assert isinstance(attached_parameters, dict)

    sdlog.debug("SYDADDAP-620", "Add attached_parameters..")

    if isinstance(o, sdtypes.Metadata):

        po = sdpipelineprocessing.ProcessingObject(add_attached_parameters,
                                                   attached_parameters)
        o = sdpipelineprocessing.run_pipeline(o, po)

    elif isinstance(o, sdtypes.Response):

        # no need to process chunk by chunk here, as a Response only contains a small amount of data (< sdconst.SEARCH_API_CHUNKSIZE)

        files = add_attached_parameters(o.get_files(), attached_parameters)
        o.set_files(files)

    else:
        assert False

    sdlog.debug("SYDADDAP-628", "attached_parameters added")

    return o
Example #8
    def _reload(self, force=False):
        '''Internal method to reload the dictionary of endpoints if the file has changed since it was last read'''

        if self.filepath: # only if endpoints file exists
            modtime = file_modification_datetime(self.filepath)
            if force or modtime > self.modtime:
                sdlog.debug("SDDMGLOB-014", "Loading endpoints from: %s, last modified: %s" % (self.filepath, modtime))
                self.modtime = modtime
                endpoints = {}

                # read XML file
                with open(self.filepath, "r") as myfile:
                    xml=myfile.read().replace('\n', '')

                # <endpoints xmlns="http://www.esgf.org/whitelist">
                root = fromstring(xml)
                # <endpoint name="esg#jpl" gridftp="esg-datanode.jpl.nasa.gov:2811" />
                for endpoint in root.findall("{%s}endpoint" % NS):
                    gridftp = endpoint.attrib['gridftp']
                    name = endpoint.attrib['name']                   # mandatory attribute
                    path_out = endpoint.attrib.get('path_out', None) # optional attribute
                    path_in = endpoint.attrib.get('path_in', None)   # optional attribute
                    endpoints[ gridftp ] = Endpoint(name, path_out=path_out, path_in=path_in)
                    sdlog.debug("SDDMGLOB-018", "Using Globus endpoint %s : %s (%s --> %s)"  % (gridftp, name, path_out, path_in))

                # switch the dictionary of endpoints after reading
                self.endpoints = endpoints
Example #9
def part_cleanup(paths):
    """Remove empty files and folders."""

    sdlog.info("SYNCLEAN-018", "Cleanup begin")

    paths = sorted(
        paths, reverse=True
    )  # maybe overkill (the idea is that reverse order may allow the removal of empty siblings, but as every path to be removed goes through an os.removedirs call it should work anyway)

    for p in paths:
        sdlog.info("SYNCLEAN-060",
                   "Check for empty file and directory in %s" % p)

        # remove empty files
        sdlog.debug("SYNCLEAN-120", "Remove empty files (%s)" % (p, ))
        remove_empty_files(p)

        # remove empty directories starting from leaves
        sdlog.debug("SYNCLEAN-140", "Remove empty dirs (%s)" % (p, ))
        try:
            os.removedirs(p)
        except OSError:
            pass  # Neutralize the exception (needed as removedirs raises an exception at the first non-empty dir).

    # as the loop above may also remove the 'data' folder (when all data have been removed), we re-create 'data' if missing
    if not os.path.isdir(sdconfig.data_folder):
        os.makedirs(sdconfig.data_folder)

    sdlog.info("SYNCLEAN-020", "Cleanup done.")
Example #10
def part_cleanup(paths):
    """Remove empty files and folders."""

    sdlog.info("SYNCLEAN-018","Cleanup begin")

    paths=sorted(paths, reverse=True) # maybe overkill (the idea is that reverse order may allow the removal of empty siblings, but as every path to be removed goes through an os.removedirs call it should work anyway)

    for p in paths:
        sdlog.info("SYNCLEAN-060","Check for empty file and directory in %s"%p)

        # remove empty files
        sdlog.debug("SYNCLEAN-120","Remove empty files (%s)"%(p,))
        remove_empty_files(p)

        # remove empty directories starting from leaves
        sdlog.debug("SYNCLEAN-140","Remove empty dirs (%s)"%(p,))
        try:
            os.removedirs(p)
        except OSError:
            pass # Neutralize the exception (needed as removedirs raises an exception at the first non-empty dir).

    # as the loop above may also remove the 'data' folder (when all data have been removed), we re-create 'data' if missing
    if not os.path.isdir(sdconfig.data_folder):
        os.makedirs(sdconfig.data_folder)

    sdlog.info("SYNCLEAN-020","Cleanup done.")
Example #11
def run(**kw):
    files=kw.get('files')
    check_type(files)
    check_fields(files)
    files=sdreducerow.run(files)
    files=sdremoveaggregation.run(files)
    files=sdprotocol.run(files)
    files=sdtimefilter.run(files)
    files=sdprepare_dataset_attr.run(files)
    #files=sdcheck_dataset_template.run(files)
    files=sdreducecol.run(files)
    files=sdprepare_file_attr.run(files)
    files=sdlocalpath.run(files)

    for f in files:
        sdlog.debug("SDFIPIPE-004","%s"%f['url'],stdout=True)

    files=sdshrink.run(files)

    for f in files:
        sdlog.debug("SDFIPIPE-005","%s"%f['url'],stdout=True)

    #files=sdonemgf_post.run(files) # BEWARE: this module does not respect the 'KISS' principle (it updates a global value by altering the syndac console session context). You can disable it to keep things simple (it's only there for tuning purposes).
    files=sdcomplete.run(files)

    files=sdstatusfilter.run(files)

    return files
Example #12
def run_helper(queries):
    """
    notes
      - "queries" is non-threadsafe (i.e. not a Queue), but doesn't matter as threads do not use it
    """
    total_query_to_process=len(queries)

    sdlog.debug("SDPROXMT-003","%d search-API queries to process (max_thread_per_host=%d,timeout=%d)"%(total_query_to_process,max_thread_per_host,sdconst.SEARCH_API_HTTP_TIMEOUT))

    while True:
        if sdconfig.proxymt_progress_stat:
            sdlog.info("SDPROXMT-033","threads per host: %s"%",".join(['%s=%s'%(host,len(searchAPIServices[host]['threadlist'])) for host in searchAPIServices.keys()]))

        if len(queries)>0:
            distribute_queries(queries)
        else:
            # leave the loop only if all threads completed
            if all_threads_completed():
                break

        # remove completed threads from list
        for host in searchAPIServices.keys():
            li=[]
            for t in searchAPIServices[host]['threadlist']:
                if t.is_alive():
                    li.append(t)
            searchAPIServices[host]['threadlist']=li

        # log
        total_query_already_processed = total_query_to_process - len(queries)
        if total_query_to_process > 0: # display progress only when there are queries to process
            if len(queries) > 0: # display progress only while queries are still waiting
                sdlog.info("SDPROXMT-004","total_queries=%d, running_or_done_queries=%d, waiting_queries=%d"%(total_query_to_process,total_query_already_processed,len(queries)))

        # if all services are busy, we sleep to limit loop speed
        # (note that all the code around the "sleep" call is to detect system overload)
        sleep_time=10
        warning_threshold=5 # threshold so we don't emit a warning for every small load exceedance
        befo=time.time()
        time.sleep(sleep_time)
        afte=time.time()
        diff=afte-befo
        if diff>sleep_time+warning_threshold:
            sdlog.warning("SDPROXMT-005","WARNING: system overload detected (sleep takes %d second to complete)."%diff)

    # retrieve result from output queue
    metadata=sdtypes.Metadata()
    while not __result_queue.empty():
        success=__result_queue.get(False) # retrieve result from ONE successful search-API call
        success.connect() # TAGKLK434L3K34K
        metadata.slurp(success) # warning: success is modified here

    # retrieve error from output queue and insert them into a list
    errors=[]
    while not __error_queue.empty():
        query=__error_queue.get(False)
        errors.append(query)

    return (metadata,errors)
Example #13
def run(stream=None,
        path=None,
        parameter=None,
        index_host=None,
        post_pipeline_mode='file',
        dry_run=False):

    if parameter is None:
        parameter = []

    queries = sdpipeline.build_queries(stream=stream,
                                       path=path,
                                       parameter=parameter,
                                       index_host=index_host,
                                       parallel=False,
                                       load_default=False)

    if len(queries) < 1:
        raise SDException("SDQSEARC-001", "No query to process")

    progress = sdsqueries.get_scalar(
        queries, 'progress', False, type_=bool
    )  # we cast here as progress can be str (set from a parameter) or bool (set programmatically)
    searchapi_host = sdsqueries.get_scalar(queries, 'searchapi_host')

    if dry_run:
        for query in queries:
            request = sdtypes.Request(url=query['url'], pagination=False)

            print '%s' % request.get_url()

            # debug
            #print 'Url: %s'%request.get_url()
            #print 'Attached parameters: %s'%query.get('attached_parameters')

        return sdtypes.Response()
    else:
        try:
            if progress:
                sdtools.print_stderr(
                    sdi18n.m0003(searchapi_host)
                )  # waiting message => TODO: move into ProgressThread class
                ProgressThread.start(
                    sleep=0.1,
                    running_message='',
                    end_message='Search completed.')  # spinner start

            mqr = process_queries(queries)
            metadata = mqr.to_metadata()

            sdlog.debug("SDQSEARC-002", "files-count=%d" % metadata.count())
            metadata = sdpipeline.post_pipeline(metadata, post_pipeline_mode)
            sdlog.debug("SDQSEARC-004", "files-count=%d" % metadata.count())

            return metadata
        finally:
            if progress:
                ProgressThread.stop()  # spinner stop
Example #14
def start_new_thread(host,url):
    sdlog.debug("SDPROXMT-002","Starting new search-API thread (%s)"%host)

    service=searchAPIServices[host]["iSearchAPIProxy"]

    th=MetadataThread(host,service,url,__result_queue,__error_queue)
    th.setDaemon(True)
    th.start()

    return th
Example #15
def is_nearestpost_enabled(metadata):
    result=False

    sdlog.debug("SSHRINKT-001","Check if nearestpost is enabled..")

    if sdconfig.nearest_schedule=='post' and nearest_flag_set_on_all_files(metadata):
        result=True
    else:
        result=False

    sdlog.debug("SSHRINKT-002","nearestpost is %s"%result)

    return result
Example #16
def run(**kw):
    files=kw.get('files')
    check_type(files)
    check_fields(files)
    files=sdreducerow.run(files)
    files=sdremoveaggregation.run(files)
    files=sdprotocol.run(files)
    files=sdtimefilter.run(files)
    files=sdprepare_dataset_attr.run(files)
    #files=sdcheck_dataset_template.run(files)

    # we no longer reduce the number of columns here
    #
    # Notes
    #     - not reducing the number of columns here may slightly diminish
    #       performance (memory, cpu). But as we do need that information (e.g.
    #       description, variable_long_name, facets..), we have no choice.
    #     - we need to keep that information even if it is not essential,
    #       as we will soon need it to provide more descriptive information to
    #       the user (e.g. description, variable_long_name..)
    #     - we need to keep all facets so the user can build a custom local path
    #       (see the local_path_custom_transform() func for more info)
    #     - we now remove those columns downstream (but only for the 'dump' action)
    #
    #files=sdreducecol.run(files)

    files=sdprepare_file_attr.run(files)
    files=sdlocalpath.run(files)

    for f in files:
        sdlog.debug("SDFIPIPE-004","%s"%f['url'],stdout=True)

    files=sdshrink.run(files)

    for f in files:
        sdlog.debug("SDFIPIPE-005","%s"%f['url'],stdout=True)

    
    # EXT_FILE_POST
    #
    # load extensions here
    #
    # TODO


    files=sdcomplete.run(files)

    files=sdstatusfilter.run(files)

    return files
Example #17
def is_nearestpost_enabled(metadata):
    result = False

    sdlog.debug("SSHRINKT-001", "Check if nearestpost is enabled..")

    if sdconfig.nearest_schedule == 'post' and nearest_flag_set_on_all_files(
            metadata):
        result = True
    else:
        result = False

    sdlog.debug("SSHRINKT-002", "nearestpost is %s" % result)

    return result
Example #18
def get_urls(file_functional_id):
    """returns a prioritized list of [url,protocol] where each url can supply the specified file"""

    try:
        result = sdquicksearch.run(parameter=[
            'limit=4',
            'fields=%s' % url_fields, 'type=File',
            'instance_id=%s' % file_functional_id
        ],
                                   post_pipeline_mode=None)
    except Exception as e:
        sdlog.debug("SDNEXTUR-015",
                    "exception %s.  instance_id=%s" % (e, file_functional_id))
        raise e

    li = result.get_files()
    sdlog.info(
        "SDNEXTUR-016",
        "sdquicksearch returned %s sets of file urls: %s" % (len(li), li))
    if li == []:
        # No urls found. Try again, but wildcard the file id. (That leads to a string search on all
        # fields for the wildcarded file id, rather than a match of the instance_id field only.)
        result = sdquicksearch.run(parameter=[
            'limit=4',
            'fields=%s' % url_fields, 'type=File',
            'instance_id=%s' % file_functional_id + '*'
        ],
                                   post_pipeline_mode=None)
        li = result.get_files()
        sdlog.info(
            "SDNEXTUR-017",
            "sdquicksearch 2nd call %s sets of file urls: %s" % (len(li), li))
    # result looks like
    # [ {protocol11:url11, protocol12:url12, attached_parameters:dict, score:number, type:'File',
    #    size:number} }, {[another dict of the same format}, {another dict},... ]
    # with no more than limit=4 items in the list, and no more than three protocols.
    # We'll return something like urlps = [ [url1,protocol1], [url2,protocol2],... ]
    # The return value could be an empty list.
    # Note: These nested lists are ugly; it's just a quick way to code something up.

    urlps = []
    for dic in li:
        urlps += [[dic[key], key] for key in dic.keys()
                  if key.find('url_') >= 0 and dic[key].find('//None') < 0]
        # ... protocol keys are one of 'url_opendap', 'url_http', 'url_gridftp'
        # The search for //None bypasses an issue with the SOLR lookup where there is no
        # url_gridftp possibility.

    return prioritize_urlps(urlps)
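
prioritize_urlps() itself is not shown in this example. As a rough, hypothetical sketch only (the preference order below is an assumption, not the project's actual policy), a prioritizer over the [url, protocol] pairs built above could simply sort them by protocol:

def prioritize_urlps_sketch(urlps, preference=('url_http', 'url_gridftp', 'url_opendap')):
    # urlps is a list of [url, protocol_key] pairs; unknown keys sort last
    def rank(urlp):
        protocol = urlp[1]
        return preference.index(protocol) if protocol in preference else len(preference)
    return sorted(urlps, key=rank)

# usage with made-up urls:
# prioritize_urlps_sketch([['gsiftp://host/f.nc', 'url_gridftp'], ['http://host/f.nc', 'url_http']])
# -> [['http://host/f.nc', 'url_http'], ['gsiftp://host/f.nc', 'url_gridftp']]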
Example #19
def map_to_globus(url):
    parsed_url = urlparse.urlparse(url)
    hostname = parsed_url.netloc
    src_endpoint = None
    src_path = re.sub('/+', '/', parsed_url.path)
    path = src_path
    if hostname in globus_endpoints:
        src_endpoint = globus_endpoints[hostname].name
        path_out = globus_endpoints[hostname].path_out
        path_in = globus_endpoints[hostname].path_in
        if path_out:
            # strip the GridFTP path prefix (the result must be re-assigned)
            src_path = src_path.replace(path_out, '', 1)
        if path_in:
            # prepend the Globus endpoint path prefix
            src_path = path_in + src_path
    sdlog.debug("SDDMGLOB-024", "Mapped url %s to %s%s" % (url, src_endpoint, src_path))
    return src_endpoint, src_path, path
Example #20
def map_to_globus(url):
    parsed_url = urlparse.urlparse(url)
    hostname = parsed_url.netloc
    src_endpoint = None
    src_path = re.sub('/+', '/', parsed_url.path)
    path = src_path
    if hostname in globus_endpoints:
        src_endpoint = globus_endpoints[hostname].name
        path_out = globus_endpoints[hostname].path_out
        path_in = globus_endpoints[hostname].path_in
        if path_out:
            # strip the GridFTP path prefix (the result must be re-assigned)
            src_path = src_path.replace(path_out, '', 1)
        if path_in:
            # prepend the Globus endpoint path prefix
            src_path = path_in + src_path
    sdlog.debug("SDDMGLOB-024",
                "Mapped url %s to %s%s" % (url, src_endpoint, src_path))
    return src_endpoint, src_path, path
Example #21
    def run(self,url=None,attached_parameters=None):
        """Execute one search query (as pagination is used, it can result in many HTTP queries)."""

        if attached_parameters is None:
            attached_parameters={}

        request=sdtypes.Request(url=url,pagination=True)
        final_url=request.get_url()

        sdlog.debug("SYDPROXY-490","paginated call started (url=%s)"%final_url)

        try:
            paginated_response=self.call_web_service__PAGINATION(request)
        except Exception,e:
            sdlog.error("SYDPROXY-400","Error occurs during search-API paginated call (url=%s)"%(final_url,))
            sdlog.error("SYDPROXY-410","%s"%(str(e),))
            raise
Example #22
def run(files):
    for file in files:
        protocol = sdpostpipelineutils.get_attached_parameter(
            file, 'protocol', sdconst.TRANSFER_PROTOCOL_HTTP)

        if protocol not in sdconst.TRANSFER_PROTOCOLS:
            raise SDException("SYNPROTO-004",
                              "Incorrect protocol (%s)" % protocol)

        if protocol == sdconst.TRANSFER_PROTOCOL_GLOBUS:
            if 'url_globus' in file:
                file['url'] = file['url_globus']
            elif 'url_gridftp' in file:
                file['url'] = file['url_gridftp']
            elif 'url_http' in file:
                sdlog.warning('SYNPROTO-005',
                              'Fallback to http as globus url is missing')
                file['url'] = file['url_http']

        elif protocol == sdconst.TRANSFER_PROTOCOL_GRIDFTP:
            if 'url_gridftp' in file:
                file['url'] = file['url_gridftp']
            elif 'url_http' in file:
                sdlog.debug(
                    'SYNPROTO-002',
                    'Fallback to http as gridftp url is missing (%s)' %
                    file["title"])
                file['url'] = file['url_http']

        elif protocol == sdconst.TRANSFER_PROTOCOL_HTTP:
            if 'url_http' in file:
                file['url'] = file['url_http']
            elif 'url_gridftp' in file:
                sdlog.warning('SYNPROTO-001',
                              'Fallback to gridftp as http url is missing')
                file['url'] = file['url_gridftp']

        else:
            raise SDException("SYNPROTO-003",
                              "Incorrect protocol (%s)" % protocol)

        sdtools.remove_dict_items(
            file, ['url_globus', 'url_gridftp', 'url_http', 'url_opendap'])

    return files
Example #23
def run(stream=None,path=None,parameter=None,index_host=None,post_pipeline_mode='file',dry_run=False):

    if parameter is None:
        parameter=[]

    queries=sdpipeline.build_queries(stream=stream,path=path,parameter=parameter,index_host=index_host,parallel=False,load_default=False)

    if len(queries)<1:
        raise SDException("SDQSEARC-001","No query to process")

    progress=sdsqueries.get_scalar(queries,'progress',False,type_=bool) # we cast here as progress can be str (set from a parameter) or bool (set programmatically)
    searchapi_host=sdsqueries.get_scalar(queries,'searchapi_host')


    if dry_run:
        for query in queries:
            request=sdtypes.Request(url=query['url'],pagination=False)

            print '%s'%request.get_url()

            # debug
            #print 'Url: %s'%request.get_url()
            #print 'Attached parameters: %s'%query.get('attached_parameters')

        return sdtypes.Response()
    else:
        try:
            if progress:
                sdtools.print_stderr(sdi18n.m0003(searchapi_host)) # waiting message => TODO: move into ProgressThread class
                ProgressThread.start(sleep=0.1,running_message='',end_message='Search completed.') # spinner start

            mqr=process_queries(queries)
            metadata=mqr.to_metadata()

            sdlog.debug("SDQSEARC-002","files-count=%d"%metadata.count())
            metadata=sdpipeline.post_pipeline(metadata,post_pipeline_mode)
            sdlog.debug("SDQSEARC-004","files-count=%d"%metadata.count())

            return metadata
        finally:
            if progress:
                ProgressThread.stop() # spinner stop
Example #24
def run(files):
    for file in files:
        protocol=sdpostpipelineutils.get_attached_parameter(file,'protocol',sdconst.TRANSFER_PROTOCOL_HTTP)

        if protocol not in sdconst.TRANSFER_PROTOCOLS:
            raise SDException("SYNPROTO-004","Incorrect protocol (%s)"%protocol)

        if 'url_gridftp' in file and 'url_http' in file:

            if protocol==sdconst.TRANSFER_PROTOCOL_GRIDFTP:
                file['url']=file['url_gridftp']
            elif protocol==sdconst.TRANSFER_PROTOCOL_HTTP:
                file['url']=file['url_http']
            else:
                raise SDException("SYNPROTO-003","Incorrect protocol (%s)"%protocol)

        elif 'url_gridftp' in file:
            # only gridftp

            if protocol==sdconst.TRANSFER_PROTOCOL_HTTP:
                sdlog.warning('SYNPROTO-001','Fallback to gridftp as http url is missing')

            file['url']=file['url_gridftp']

        elif 'url_http' in file:
            # only http
    
            if protocol==sdconst.TRANSFER_PROTOCOL_GRIDFTP:
                sdlog.debug('SYNPROTO-002','Fallback to http as gridftp url is missing (%s)'%file["title"])

            file['url']=file['url_http']

        else:
            # no url available to download the file
            # (should not be here as sdremoverow takes care of those cases)

            assert False


        sdtools.remove_dict_items(file,['url_gridftp', 'url_http', 'url_opendap'])

    return files
Example #25
    def run(self, url=None, attached_parameters=None):
        """Execute one search query (as pagination is used, it can result in many HTTP queries)."""

        if attached_parameters is None:
            attached_parameters = {}

        request = sdtypes.Request(url=url, pagination=True)
        final_url = request.get_url()

        sdlog.debug("SYDPROXY-490",
                    "paginated call started (url=%s)" % final_url)

        try:
            paginated_response = self.call_web_service__PAGINATION(request)
        except Exception, e:
            sdlog.error(
                "SYDPROXY-400",
                "Error occurs during search-API paginated call (url=%s)" %
                (final_url, ))
            sdlog.error("SYDPROXY-410", "%s" % (str(e), ))
            raise
Example #26
    def call_web_service(self,request):

        sdlog.debug("SYDPROXY-100","Search-API call started (%s)."%request.get_url())

        try:
            response=sdnetutils.call_web_service(request.get_url(),timeout=sdconst.SEARCH_API_HTTP_TIMEOUT) # returns Response object
        except:

            # if an exception occurs in the sdnetutils.call_web_service() method, all
            # previous calls to this method inside this paginated call are also
            # cancelled

            # we reset the offset so the paginated call can be restarted from the beginning next time
            # (maybe overkill as the offset is reinitialized when entering the 'call_web_service__PAGINATION()' func)
            request.offset=0

            raise

        sdlog.info("SYDPROXY-100","Search-API call completed (returned-files-count=%i,match-count=%i,url=%s)."%(response.count(),response.num_found,request.get_url()))

        return response
Example #27
def run_pipeline(metadata,po,io_mode=sdconst.PROCESSING_FETCH_MODE_GENERATOR):
    """
    Note
        Beware: the metadata input argument is modified in this func!
        (make a copy before calling this func if you want
        to keep the original data)
    """

    # alias
    f=po.f
    args=po.args
    kwargs=po.kwargs

    sdlog.debug("SYNDPIPR-001","Start chunk loop (files-count=%d)"%metadata.count())

    if io_mode=='no_chunk':

        # way 0: load-all-in-memory (no chunk).
        files=f(metadata.get_files(),*args,**kwargs)
        metadata.set_files(files)

    elif io_mode=='generator':

        # way 1: chunk-by-chunk (using a second store)
        new_metadata=sdtypes.Metadata()
        for chunk in metadata.get_chunks(io_mode):

            sdlog.debug("SYNDPIPR-002","Process chunk")

            chunk=f(chunk,*args,**kwargs)
            new_metadata.add_files(chunk)

        metadata=new_metadata # note: metadata's old value gets removed here (destructor is called). This is to enforce that this function IS destructive with its input argument (see the func comment for more info).

    elif io_mode=='pagination':

        # way 2: chunk-by-chunk (updating store on-the-fly)
        for chunk in metadata.get_chunks(io_mode):
            chunk=f(chunk,*args,**kwargs)
            metadata.update(chunk) # TODO: check if 'size' is handled here

    elif io_mode=='experimental':

        # use 'ALTER TABLE foo RENAME TO bar' here

        pass

    else:
        assert False

    sdlog.debug("SYNDPIPR-003","Chunk loop completed (files-count=%d)"%metadata.count())

    return metadata
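
As an illustration of the 'generator' mode above (process chunk by chunk into a second store), here is a tiny standalone sketch using plain lists and a trivial processing function as stand-ins for sdtypes.Metadata and the ProcessingObject callable; only the control flow mirrors run_pipeline():

def uppercase_titles(chunk):
    # stand-in processing function: transforms one chunk of file dicts
    return [dict(f, title=f['title'].upper()) for f in chunk]

def get_chunks(files, chunksize=2):
    # stand-in for metadata.get_chunks(): yield the files chunk by chunk
    for i in range(0, len(files), chunksize):
        yield files[i:i + chunksize]

files = [{'title': 'tas.nc'}, {'title': 'pr.nc'}, {'title': 'psl.nc'}]

new_files = []  # the "second store"
for chunk in get_chunks(files):
    new_files.extend(uppercase_titles(chunk))

print(new_files)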
Example #28
class SearchAPIProxy():
    def __init__(self, **kw):
        pass

    def run(self, url=None, attached_parameters=None):
        """Execute one search query (as pagination is used, it can result in many HTTP queries)."""

        if attached_parameters is None:
            attached_parameters = {}

        request = sdtypes.Request(url=url, pagination=True)
        final_url = request.get_url()

        sdlog.debug("SYDPROXY-490",
                    "paginated call started (url=%s)" % final_url)

        try:
            paginated_response = self.call_web_service__PAGINATION(request)
        except Exception, e:
            sdlog.error(
                "SYDPROXY-400",
                "Error occurs during search-API paginated call (url=%s)" %
                (final_url, ))
            sdlog.error("SYDPROXY-410", "%s" % (str(e), ))
            raise

        sdlog.debug(
            "SYDPROXY-001",
            "paginated call completed (call-duration=%i, files-count=%i, url=%s)"
            % (paginated_response.call_duration, paginated_response.count(),
               final_url))

        if attached_parameters.get('verbose', False):
            sdtools.print_stderr("Url: %s" % final_url)
            sdtools.print_stderr("Duration: %s" %
                                 paginated_response.call_duration)
            sdtools.print_stderr("")

        md = paginated_response.to_metadata(
        )  # we cast to remove pagination related code and have a lighter object

        md = sdaddap.run(md, attached_parameters)

        return md
Example #29
                        '|'
                    )[0]  # keep only first field (i.e. keep only the file url)
                    protocol = item.split('|')[-1]

                    if protocol.upper() == "HTTPSERVER":
                        l__dict['url_http'] = url
                    elif protocol.upper() == "GRIDFTP":
                        l__dict['url_gridftp'] = url
                    elif protocol.upper() == "OPENDAP":
                        l__dict['url_opendap'] = url
            else:
                l__dict[attr_name] = attr_value

        l__files.append(l__dict)

    sdlog.debug("SYNDJSON-014", "files-count=%d" % len(l__files))

    return {
        'files': l__files,
        'num_found': l__num_found,
        'num_result': len(l__files)
    }


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', '--file', required=True)
    args = parser.parse_args()

    # read search-api output sample
    with open(args.file, 'r') as fh:
Example #30
def transfers_end():

    _, _, access_token = api_client.goauth.get_access_token(username=globus_username, password=globus_password)
    api = api_client.TransferAPIClient(username=globus_username, goauth=access_token)

    for task_id in globus_tasks:

        code, reason, data = api.task(task_id, fields="status")
        status = data['status']

        sdlog.debug("SDDMGLOB-016", "Checking the status of Globus transfer tasks, id: %s, status: %s" % (task_id, status))
        for item in globus_tasks[task_id]['items']:
            tr = item['tr']
            if status == "SUCCEEDED":

                assert tr.size is not None

                if int(tr.size) != os.path.getsize(tr.get_full_local_path()):
                    sdlog.error("SDDMGLOB-002","size don't match (remote_size=%i,local_size=%i,local_path=%s)"%(int(tr.size),os.path.getsize(tr.get_full_local_path()),tr.get_full_local_path()))

                # retrieve local and remote checksum
                checksum_type=tr.checksum_type if tr.checksum_type is not None else 'md5'
                local_checksum=sdutils.compute_checksum(tr.get_full_local_path(),checksum_type)
                remote_checksum=tr.checksum # retrieve remote checksum

                if remote_checksum is not None:
                    # remote checksum exists

                    # compare local and remote checksum
                    if remote_checksum==local_checksum:
                        # checksum is ok

                        tr.status = sdconst.TRANSFER_STATUS_DONE
                    else:
                        # checksum is not ok

                        if incorrect_checksum_action=="remove":
                            tr.status=sdconst.TRANSFER_STATUS_ERROR
                            tr.error_msg="File corruption detected: local checksum doesn't match remote checksum"

                            # remove file from local repository
                            sdlog.error("SDDMGLOB-155","checksum don't match: remove local file (local_checksum=%s,remote_checksum=%s,local_path=%s)"%(local_checksum,remote_checksum,tr.get_full_local_path()))
                            try:
                                os.remove(tr.get_full_local_path())
                            except Exception,e:
                                sdlog.error("SDDMGLOB-158","error occurs while removing local file (%s)"%tr.get_full_local_path())

                        elif incorrect_checksum_action=="keep":
                            sdlog.info("SDDMGLOB-157","local checksum doesn't match remote checksum (%s)"%tr.get_full_local_path())
                            
                            tr.status=sdconst.TRANSFER_STATUS_DONE

                        else:
                            raise FatalException("SDDMGLOB-507","incorrect value (%s)"%incorrect_checksum_action)
                else:
                    # remote checksum is missing
                    # NOTE: we DON'T store the local checksum ('file' table contains only the REMOTE checksum)

                    tr.status = sdconst.TRANSFER_STATUS_DONE

                if tr.status == sdconst.TRANSFER_STATUS_DONE:
                    tr.end_date=sdtime.now() # WARNING: this is not the real end-of-transfer date but the date when we asked the globus scheduler whether the transfer is done.
                    tr.error_msg=""
                    sdlog.info("SDDMGLOB-101", "Transfer done (%s)" % str(tr))

            elif status == "FAILED":
                tr.status = sdconst.TRANSFER_STATUS_ERROR
                tr.error_msg = "Error occurs during download."

                sdlog.info("SDDMGLOB-101", "Transfer failed (%s)" % str(tr))

                # Remove local file if exists
                if os.path.isfile(tr.get_full_local_path()):
                    try:
                        os.remove(tr.get_full_local_path())
                    except Exception,e:
                        sdlog.error("SDDMGLOB-528","Error occurs during file suppression (%s,%s)"%(tr.get_full_local_path(),str(e)))
Example #31
        #
        # TODO: maybe always enable this
        #
        sdtrace.log_exception()

        # debug
        #
        # if the error is not due to a network error (e.g. internet connection
        # problem), raise the original exception below and set the debug mode
        # to see the stacktrace.
        #
        #raise

        raise SDException('SDNETUTI-008','Network error (see log for details)') # we raise a new 'network error' exception here because, most of the time, an 'xml parsing error' is due to a 'network error'.

    sdlog.debug("SDNETUTI-044","files-count=%d"%len(di.get('files')))

    return sdtypes.Response(call_duration=elapsed_time,lowmem=lowmem,**di) # RAM storage is ok here as one response is limited by SEARCH_API_CHUNKSIZE

def call_param_web_service(url,timeout):
    buf=HTTP_GET(url,timeout)

    buf=fix_encoding(buf)

    try:
        params=search_api_parser.parse_parameters(buf)
    except Exception as e:

        # If we are here, it's likely that there is a problem with the internet connection
        # (e.g. we are behind an HTTP proxy and have no authorization to use it)
Example #32
File: sdxml.py  Project: Prodiguer/synda
                                l__dict[l__name].append(l__value)

                    elif arr_n.tag=="float":
                        # type not used for now

                        """
                        sample:

                        <arr name="score"><float name="score">1.9600565</float></arr>
                        """

                        pass

        l__files.append(l__dict)

    sdlog.debug("SYNDAXML-014","files-count=%d"%len(l__files))

    return {'files':l__files,'num_found':l__num_found,'num_result':len(l__files)}

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-f','--file',required=True)
    args = parser.parse_args()

    # read search-api output sample
    with open(args.file, 'r') as fh:
        buffer=fh.read()

    #result=parse_parameters(buffer)
    result=parse_metadata(buffer)
Example #33
        #
        sdtrace.log_exception()

        # debug
        #
        # if the error is not due to a network error (e.g. internet connection
        # problem), raise the original exception below and set the debug mode
        # to see the stacktrace.
        #
        #raise

        raise SDException(
            'SDNETUTI-008', 'Network error (see log for details)'
        )  # we raise a new 'network error' exception here because, most of the time, an 'xml parsing error' is due to a 'network error'.

    sdlog.debug("SDNETUTI-044", "files-count=%d" % len(di.get('files')))

    return sdtypes.Response(
        call_duration=elapsed_time, lowmem=lowmem, **di
    )  # RAM storage is ok here as one response is limited by SEARCH_API_CHUNKSIZE


def call_param_web_service(url, timeout):
    buf = HTTP_GET(url, timeout)

    buf = fix_encoding(buf)

    try:
        params = search_api_parser.parse_parameters(buf)
    except Exception as e:
Example #34
def transfers_end():

    _, _, access_token = api_client.goauth.get_access_token(
        username=globus_username, password=globus_password)
    api = api_client.TransferAPIClient(username=globus_username,
                                       goauth=access_token)

    for task_id in globus_tasks:

        code, reason, data = api.task(task_id, fields="status")
        status = data['status']

        sdlog.debug(
            "SDDMGLOB-016",
            "Checking the status of Globus transfer tasks, id: %s, status: %s"
            % (task_id, status))
        for item in globus_tasks[task_id]['items']:
            tr = item['tr']
            if status == "SUCCEEDED":

                assert tr.size is not None

                if int(tr.size) != os.path.getsize(tr.get_full_local_path()):
                    sdlog.error(
                        "SDDMGLOB-002",
                        "size don't match (remote_size=%i,local_size=%i,local_path=%s)"
                        % (int(tr.size),
                           os.path.getsize(tr.get_full_local_path()),
                           tr.get_full_local_path()))

                # retrieve local and remote checksum
                checksum_type = tr.checksum_type if tr.checksum_type is not None else sdconst.CHECKSUM_TYPE_MD5
                local_checksum = sdutils.compute_checksum(
                    tr.get_full_local_path(), checksum_type)
                remote_checksum = tr.checksum  # retrieve remote checksum

                if remote_checksum is not None:
                    # remote checksum exists

                    # compare local and remote checksum
                    if remote_checksum == local_checksum:
                        # checksum is ok

                        tr.status = sdconst.TRANSFER_STATUS_DONE
                    else:
                        # checksum is not ok

                        if incorrect_checksum_action == "remove":
                            tr.status = sdconst.TRANSFER_STATUS_ERROR
                            tr.priority -= 1
                            tr.error_msg = "File corruption detected: local checksum doesn't match remote checksum"

                            # remove file from local repository
                            sdlog.error(
                                "SDDMGLOB-155",
                                "checksum don't match: remove local file (local_checksum=%s,remote_checksum=%s,local_path=%s)"
                                % (local_checksum, remote_checksum,
                                   tr.get_full_local_path()))
                            try:
                                os.remove(tr.get_full_local_path())
                            except Exception, e:
                                sdlog.error(
                                    "SDDMGLOB-158",
                                    "error occurs while removing local file (%s)"
                                    % tr.get_full_local_path())

                        elif incorrect_checksum_action == "keep":
                            sdlog.info(
                                "SDDMGLOB-157",
                                "local checksum doesn't match remote checksum (%s)"
                                % tr.get_full_local_path())

                            tr.status = sdconst.TRANSFER_STATUS_DONE

                        else:
                            raise FatalException(
                                "SDDMGLOB-507", "incorrect value (%s)" %
                                incorrect_checksum_action)
                else:
                    # remote checksum is missing
                    # NOTE: we DON'T store the local checksum ('file' table contains only the REMOTE checksum)

                    tr.status = sdconst.TRANSFER_STATUS_DONE

                if tr.status == sdconst.TRANSFER_STATUS_DONE:
                    tr.end_date = sdtime.now(
                    )  # WARNING: this is not the real end-of-transfer date but the date when we asked the globus scheduler whether the transfer is done.
                    tr.error_msg = ""
                    sdlog.info("SDDMGLOB-101", "Transfer done (%s)" % str(tr))

            elif status == "FAILED":
                tr.status = sdconst.TRANSFER_STATUS_ERROR
                tr.priority -= 1
                tr.error_msg = "Error occurs during download."

                sdlog.info("SDDMGLOB-101", "Transfer failed (%s)" % str(tr))

                # Remove local file if exists
                if os.path.isfile(tr.get_full_local_path()):
                    try:
                        os.remove(tr.get_full_local_path())
                    except Exception, e:
                        sdlog.error(
                            "SDDMGLOB-528",
                            "Error occurs during file suppression (%s,%s)" %
                            (tr.get_full_local_path(), str(e)))
Example #35
            break

        if quit == 1:
            if can_leave(
            ):  # wait until all threads finish and until everything has been processed on the database I/O queue
                sdlog.info("SDTSCHED-001",
                           "eot_queue orders processing completed",
                           stderr=False)
                sdlog.info("SDTSCHED-003",
                           "Running transfer processing completed",
                           stderr=False)
                break

        time.sleep(main_loop_sleep)

        sdlog.debug("SDTSCHED-400", "end of event loop")

    print
    sdlog.info("SDTSCHED-901", "Scheduler successfully stopped", stderr=True)


# module init.

quit = 0  # 0 => start, 1 => stop
scheduler_state = 0  # 0 => stopped, 1 => running, 2 => starting
main_loop_sleep = 9
sdlog.set_default_logger(sdconst.LOGGER_CONSUMER)

if sdconfig.prevent_daemon_and_ihm:
    if os.path.isfile(sdconfig.ihm_pid_file):
        sdlog.info("SDTSCHED-014",