예제 #1
0
def get_files(bfo, distinguish_main_and_additional_files=True, include_subformat_icons=False):
    """
    Returns the files available for the given record.
    Returned structure is a tuple (parsed_urls, old_versions, additionals):
     - parsed_urls: contains categorized URLS (see details below)
     - old_versions: set to True if we can have access to old versions
     - additionals: set to True if we have other documents than the 'main' document

     Parameter 'include_subformat_icons' decides if subformat
     considered as icons should be returned

    'parsed_urls' is a dictionary in the form::
        {'main_urls' : {'Main'      : [('http://CFG_SITE_URL/CFG_SITE_RECORD/1/files/aFile.pdf', 'aFile', 'PDF'),
                                       ('http://CFG_SITE_URL/CFG_SITE_RECORD/1/files/aFile.gif', 'aFile', 'GIF')],
                        'Additional': [('http://CFG_SITE_URL/CFG_SITE_RECORD/1/files/bFile.pdf', 'bFile', 'PDF')]},

         'other_urls': [('http://externalurl.com/aFile.pdf', 'Fulltext'),      # url(8564_u), description(8564_z/y)
                        ('http://externalurl.com/bFile.pdf', 'Fulltext')],

         'cern_urls' : [('http://cern.ch/aFile.pdf', 'Fulltext'),              # url(8564_u), description(8564_z/y)
                        ('http://cern.ch/bFile.pdf', 'Fulltext')],
        }

    Some notes about returned structure:
        - key 'cern_urls' is only available on CERN site
        - keys in main_url dictionaries are defined by the BibDoc.
        - older versions are not part of the parsed urls
        - returns only main files when possible, that is when doctypes
          make a distinction between 'Main' files and other
          files. Otherwise returns all the files as main. This is only
          enabled if distinguish_main_and_additional_files is set to True
    """

    _ = gettext_set_language(bfo.lang)

    urls = bfo.fields("8564_")
    bibarchive = BibRecDocs(bfo.recID)

    old_versions = False # We can provide link to older files. Will be
                         # set to True if older files are found.
    additionals = False  # We have additional files. Will be set to
                         # True if additional files are found.

    # Prepare object to return
    parsed_urls = {'main_urls':{},    # Urls hosted by Invenio (bibdocs)
                  'others_urls':[]    # External urls
                  }
    if CFG_CERN_SITE:
        parsed_urls['cern_urls'] = [] # cern.ch urls

    # Doctypes can of any type, but when there is one file marked as
    # 'Main', we consider that there is a distinction between "main"
    # and "additional" files. Otherwise they will all be considered
    # equally as main files
    distinct_main_and_additional_files = False
    if len(bibarchive.list_bibdocs(doctype='Main')) > 0 and \
           distinguish_main_and_additional_files:
        distinct_main_and_additional_files = True
    # Parse URLs
    for complete_url in urls:
        if 'u' in complete_url:
            url = complete_url['u']
            (dummy, host, path, dummy, params, dummy) = urlparse(url)
            subformat = complete_url.get('x', '')
            filename = urllib.unquote(basename(path))
            name = file_strip_ext(filename)
            url_format = filename[len(name):]
            if url_format.startswith('.'):
                url_format = url_format[1:]
            if compose_format(url_format, subformat) in _CFG_NORMALIZED_BIBFORMAT_HIDDEN_FILE_FORMATS:
                ## This format should be hidden.
                continue

            descr = filename
            if 'y' in complete_url:
                descr = complete_url['y']
                if descr == 'Fulltext':
                    descr = _("Fulltext")
            if not url.startswith(CFG_SITE_URL): # Not a bibdoc?
                if not descr: # For not bibdoc let's have a description
                    # Display the URL in full:
                    descr = url
                if CFG_CERN_SITE and 'cern.ch' in host and \
                       ('/setlink?' in url or \
                        'cms' in host or \
                        'documents.cern.ch' in url or \
                        'doc.cern.ch' in url or \
                        'preprints.cern.ch' in url):
                    url_params_dict = dict([part.split('=') for part in params.split('&') if len(part.split('=')) == 2])
                    if 'categ' in url_params_dict and \
                           (url_params_dict['categ'].split('.', 1)[0] in cern_arxiv_categories) and \
                           'id' in url_params_dict:
                        # Old arXiv links, used to be handled by
                        # setlink. Provide direct links to arXiv
                        for file_format, label in [('pdf', "PDF")]:#,
                            #('ps', "PS"),
                            #('e-print', "Source (generally TeX or LaTeX)"),
                            #('abs', "Abstract")]:
                            url = "http://arxiv.org/%(format)s/%(category)s/%(id)s" % \
                                  {'format': file_format,
                                   'category': url_params_dict['categ'],
                                   'id': url_params_dict['id']}
                            parsed_urls['others_urls'].append((url, "%s/%s %s" % \
                                                               (url_params_dict['categ'],
                                                                url_params_dict['id'],
                                                                label)))
                else:
                    parsed_urls['others_urls'].append((url, descr)) # external url
            else: # It's a bibdoc!
                assigned = False
                for doc in bibarchive.list_bibdocs():
                    if int(doc.get_latest_version()) > 1:
                        old_versions = True
                    if True in [f.get_full_name().startswith(filename) \
                                    for f in doc.list_all_files()]:
                        assigned = True
                        if not include_subformat_icons and \
                               CFG_BIBDOCFILE_ICON_SUBFORMAT_RE.match(subformat):
                            # This is an icon and we want to skip it
                            continue
                        if not doc.get_doctype(bfo.recID) == 'Main' and \
                               distinct_main_and_additional_files == True:
                            # In that case we record that there are
                            # additional files, but don't add them to
                            # returned structure.
                            additionals = True
                        else:
                            if not descr:
                                descr = _('Fulltext')
                            if descr not in parsed_urls['main_urls']:
                                parsed_urls['main_urls'][descr] = []
                            params_dict = parse_qs(params)
                            if 'subformat' in params_dict:
                                url_format += ' (%s)' % params_dict['subformat'][0]
                            parsed_urls['main_urls'][descr].append((url, name, url_format))
                if not assigned: # Url is not a bibdoc :-S
                    if not descr:
                        descr = filename
                    parsed_urls['others_urls'].append((url, descr)) # Let's put it in a general other url
    return (parsed_urls, old_versions, additionals)
예제 #2
0
def process_batch_job(batch_job_file):
    """ Processes a batch job description dictionary

    @param batch_job_file: a fullpath to a batch job file
    @type batch_job_file: string
    @return: 1 if the process was successful, 0 if not
    @rtype; int
    """
    from invenio.legacy.bibdocfile.cli import cli_fix_marc

    def upload_marcxml_file(marcxml):
        """ Creates a temporary marcxml file and sends it to bibupload
        """
        xml_filename = 'bibencode_'+ str(batch_job['recid']) + '_' + str(uuid.uuid4()) + '.xml'
        xml_filename = os.path.join(invenio.config.CFG_TMPSHAREDDIR, xml_filename)
        xml_file = file(xml_filename, 'w')
        xml_file.write(marcxml)
        xml_file.close()
        targs = ['-c', xml_filename]
        task_low_level_submission('bibupload', 'bibencode', *targs)

    #---------#
    # GENERAL #
    #---------#

    _task_write_message("----------- Handling Master -----------")

    ## Check the validity of the batch file here
    batch_job = json_decode_file(batch_job_file)

    ## Sanitise batch description and raise errrors
    batch_job = sanitise_batch_job(batch_job)

    ## Check if the record exists
    # if record_exists(batch_job['recid']) < 1:
    #     raise Exception("Record not found")

    recdoc = BibRecDocs(batch_job['recid'])

    #--------------------#
    # UPDATE FROM MASTER #
    #--------------------#

    ## We want to add new stuff to the video's record, using the master as input
    if getval(batch_job, 'update_from_master'):
        found_master = False
        bibdocs = recdoc.list_bibdocs()
        for bibdoc in bibdocs:
            bibdocfiles = bibdoc.list_all_files()
            for bibdocfile in bibdocfiles:
                comment = bibdocfile.get_comment()
                description = bibdocfile.get_description()
                subformat = bibdocfile.get_subformat()
                m_comment = getval(batch_job, 'bibdoc_master_comment', comment)
                m_description = getval(batch_job, 'bibdoc_master_description', description)
                m_subformat = getval(batch_job, 'bibdoc_master_subformat', subformat)
                if (comment == m_comment and
                    description == m_description and
                    subformat == m_subformat):
                    found_master = True
                    batch_job['input'] = bibdocfile.get_full_path()
                    ## Get the aspect of the from the record
                    try:
                        ## Assumes pbcore metadata mapping
                        batch_job['aspect'] = get_fieldvalues(124, CFG_BIBENCODE_ASPECT_RATIO_MARC_FIELD)[0]
                    except IndexError:
                        pass
                    break
            if found_master:
                break
        if not found_master:
            _task_write_message("Video master for record %d not found"
                          % batch_job['recid'])
            task_update_progress("Video master for record %d not found"
                                 % batch_job['recid'])
            ## Maybe send an email?
            return 1

    ## Clean the job to do no upscaling etc
    if getval(batch_job, 'assure_quality'):
        batch_job = clean_job_for_quality(batch_job)

    global _BATCH_STEPS
    _BATCH_STEPS = len(batch_job['jobs'])

    ## Generate the docname from the input filename's name or given name
    bibdoc_video_docname, bibdoc_video_extension = decompose_file(batch_job['input'])[1:]
    if not bibdoc_video_extension or getval(batch_job, 'bibdoc_master_extension'):
        bibdoc_video_extension = getval(batch_job, 'bibdoc_master_extension')
    if getval(batch_job, 'bibdoc_master_docname'):
        bibdoc_video_docname = getval(batch_job, 'bibdoc_master_docname')

    write_message("Creating BibDoc for %s" % bibdoc_video_docname)
    ## If the bibdoc exists, receive it
    if bibdoc_video_docname in recdoc.get_bibdoc_names():
        bibdoc_video = recdoc.get_bibdoc(bibdoc_video_docname)
    ## Create a new bibdoc if it does not exist
    else:
        bibdoc_video = recdoc.add_bibdoc(docname=bibdoc_video_docname)

    ## Get the directory auf the newly created bibdoc to copy stuff there
    bibdoc_video_directory = bibdoc_video.get_base_dir()

    #--------#
    # MASTER #
    #--------#
    if not getval(batch_job, 'update_from_master'):
        if getval(batch_job, 'add_master'):
            ## Generate the right name for the master
            ## The master should be hidden first an then renamed
            ## when it is really available
            ## !!! FIX !!!
            _task_write_message("Adding %s master to the BibDoc"
                          % bibdoc_video_docname)
            master_format = compose_format(
                                    bibdoc_video_extension,
                                    getval(batch_job, 'bibdoc_master_subformat', 'master')
                                    )
            ## If a file of the same format is there, something is wrong, remove it!
            ## it might be caused by a previous corrupted submission etc.
            if bibdoc_video.format_already_exists_p(master_format):
                bibdoc_video.delete_file(master_format, 1)
            bibdoc_video.add_file_new_format(
                    batch_job['input'],
                    version=1,
                    description=getval(batch_job, 'bibdoc_master_description'),
                    comment=getval(batch_job, 'bibdoc_master_comment'),
                    docformat=master_format
                    )

    #-----------#
    # JOBS LOOP #
    #-----------#

    return_code = 1
    global _BATCH_STEP

    for job in batch_job['jobs']:

        _task_write_message("----------- Job %s of %s -----------"
                           % (_BATCH_STEP, _BATCH_STEPS))

        ## Try to substitute docname with master docname
        if getval(job, 'bibdoc_docname'):
            job['bibdoc_docname'] = Template(job['bibdoc_docname']).safe_substitute({'bibdoc_master_docname': bibdoc_video_docname})

        #-------------#
        # TRANSCODING #
        #-------------#

        if job['mode'] == 'encode':

            ## Skip the job if assure_quality is not set and marked as fallback
            if not getval(batch_job, 'assure_quality') and getval(job, 'fallback'):
                continue

            if getval(job, 'profile'):
                profile = get_encoding_profile(job['profile'])
            else:
                profile = None
            ## We need an extension defined fot the video container
            bibdoc_video_extension = getval(job, 'extension',
                                            getval(profile, 'extension'))
            if not bibdoc_video_extension:
                raise Exception("No container/extension defined")
            ## Get the docname and subformat
            bibdoc_video_subformat = getval(job, 'bibdoc_subformat')
            bibdoc_slave_video_docname = getval(job, 'bibdoc_docname', bibdoc_video_docname)
            ## The subformat is incompatible with ffmpegs name convention
            ## We do the encoding without and rename it afterwards
            bibdoc_video_fullpath = compose_file(
                                                 bibdoc_video_directory,
                                                 bibdoc_video_extension
                                                 )
            _task_write_message("Transcoding %s to %s;%s" % (bibdoc_slave_video_docname,
                                bibdoc_video_extension,
                                bibdoc_video_subformat))
            ## We encode now directly into the bibdocs directory
            encoding_result = encode_video(
                 input_file=batch_job['input'],
                 output_file=bibdoc_video_fullpath,
                 acodec=getval(job, 'audiocodec'),
                 vcodec=getval(job, 'videocodec'),
                 abitrate=getval(job, 'videobitrate'),
                 vbitrate=getval(job, 'audiobitrate'),
                 resolution=getval(job, 'resolution'),
                 passes=getval(job, 'passes', 1),
                 special=getval(job, 'special'),
                 specialfirst=getval(job, 'specialfirst'),
                 specialsecond=getval(job, 'specialsecond'),
                 metadata=getval(job, 'metadata'),
                 width=getval(job, 'width'),
                 height=getval(job, 'height'),
                 aspect=getval(batch_job, 'aspect'), # Aspect for every job
                 profile=getval(job, 'profile'),
                 update_fnc=_task_update_overall_status,
                 message_fnc=_task_write_message
                 )
            return_code &= encoding_result
            ## only on success
            if  encoding_result:
                ## Rename it, adding the subformat
                os.rename(bibdoc_video_fullpath,
                          compose_file(bibdoc_video_directory,
                                       bibdoc_video_extension,
                                       bibdoc_video_subformat,
                                       1,
                                       bibdoc_slave_video_docname)
                          )
                #bibdoc_video._build_file_list()
                bibdoc_video.touch()
                bibdoc_video._sync_to_db()
                bibdoc_video_format = compose_format(bibdoc_video_extension,
                                                     bibdoc_video_subformat)
                if getval(job, 'bibdoc_comment'):
                    bibdoc_video.set_comment(getval(job, 'bibdoc_comment'),
                                              bibdoc_video_format)
                if getval(job, 'bibdoc_description'):
                    bibdoc_video.set_description(getval(job, 'bibdoc_description'),
                                                 bibdoc_video_format)

        #------------#
        # EXTRACTION #
        #------------#

        # if there are multiple extraction jobs, all the produced files
        # with the same name will be in the same bibdoc! Make sure that
        # you use different subformats or docname templates to avoid
        # conflicts.

        if job['mode'] == 'extract':
            if getval(job, 'profile'):
                profile = get_extract_profile(job['profile'])
            else:
                profile = {}
            bibdoc_frame_subformat = getval(job, 'bibdoc_subformat')
            _task_write_message("Extracting frames to temporary directory")
            tmpdir = invenio.config.CFG_TMPDIR + "/" + str(uuid.uuid4())
            os.mkdir(tmpdir)
            #Move this to the batch description
            bibdoc_frame_docname = getval(job, 'bibdoc_docname', bibdoc_video_docname)
            tmpfname = (tmpdir + "/" + bibdoc_frame_docname + '.'
                        + getval(profile, 'extension',
                        getval(job, 'extension', 'jpg')))
            extraction_result = extract_frames(input_file=batch_job['input'],
                           output_file=tmpfname,
                           size=getval(job, 'size'),
                           positions=getval(job, 'positions'),
                           numberof=getval(job, 'numberof'),
                           width=getval(job, 'width'),
                           height=getval(job, 'height'),
                           aspect=getval(batch_job, 'aspect'),
                           profile=getval(job, 'profile'),
                           update_fnc=_task_update_overall_status,
                           )
            return_code &= extraction_result

            ## only on success:
            if extraction_result:
                ## for every filename in the directorys, create a bibdoc that contains
                ## all sizes of the frame from the two directories
                files = os.listdir(tmpdir)
                for filename in files:
                    ## The docname was altered by BibEncode extract through substitution
                    ## Retrieve it from the filename again
                    bibdoc_frame_docname, bibdoc_frame_extension = os.path.splitext(filename)
                    _task_write_message("Creating new bibdoc for %s" % bibdoc_frame_docname)
                    ## If the bibdoc exists, receive it
                    if bibdoc_frame_docname in recdoc.get_bibdoc_names():
                        bibdoc_frame = recdoc.get_bibdoc(bibdoc_frame_docname)
                    ## Create a new bibdoc if it does not exist
                    else:
                        bibdoc_frame = recdoc.add_bibdoc(docname=bibdoc_frame_docname)

                    ## The filename including path from tmpdir
                    fname = os.path.join(tmpdir, filename)

                    bibdoc_frame_format = compose_format(bibdoc_frame_extension, bibdoc_frame_subformat)
                    ## Same as with the master, if the format allready exists,
                    ## override it, because something went wrong before
                    if bibdoc_frame.format_already_exists_p(bibdoc_frame_format):
                        bibdoc_frame.delete_file(bibdoc_frame_format, 1)
                    _task_write_message("Adding %s jpg;%s to BibDoc"
                                  % (bibdoc_frame_docname,
                                     getval(job, 'bibdoc_subformat')))
                    bibdoc_frame.add_file_new_format(
                                    fname,
                                    version=1,
                                    description=getval(job, 'bibdoc_description'),
                                    comment=getval(job, 'bibdoc_comment'),
                                    docformat=bibdoc_frame_format)
            ## Remove the temporary folders
            _task_write_message("Removing temporary directory")
            shutil.rmtree(tmpdir)

        _BATCH_STEP = _BATCH_STEP + 1

    #-----------------#
    # FIX BIBDOC/MARC #
    #-----------------#

    _task_write_message("----------- Handling MARCXML -----------")

    ## Fix the BibDoc for all the videos previously created
    _task_write_message("Updating BibDoc of %s" % bibdoc_video_docname)
    bibdoc_video._build_file_list()

    ## Fix the MARC
    _task_write_message("Fixing MARC")
    cli_fix_marc({}, [batch_job['recid']], False)

    if getval(batch_job, 'collection'):
        ## Make the record visible by moving in from the collection
        marcxml = ("<record><controlfield tag=\"001\">%d</controlfield>"
                   "<datafield tag=\"980\" ind1=\" \" ind2=\" \">"
                   "<subfield code=\"a\">%s</subfield></datafield></record>"
                   ) % (batch_job['recid'], batch_job['collection'])
        upload_marcxml_file(marcxml)

    #---------------------#
    # ADD MASTER METADATA #
    #---------------------#

    if getval(batch_job, 'add_master_metadata'):
        _task_write_message("Adding master metadata")
        pbcore = pbcore_metadata(input_file = getval(batch_job, 'input'),
                                 pbcoreIdentifier = batch_job['recid'],
                                 aspect_override = getval(batch_job, 'aspect'))
        from invenio.modules.formatter.engines.xslt import format
        marcxml = format(pbcore, CFG_BIBENCODE_PBCORE_MARC_XSLT)
        upload_marcxml_file(marcxml)

    #------------------#
    # ADD MARC SNIPPET #
    #------------------#

    if getval(batch_job, 'marc_snippet'):
        marc_snippet = open(getval(batch_job, 'marc_snippet'))
        marcxml = marc_snippet.read()
        marc_snippet.close()
        upload_marcxml_file(marcxml)

    #--------------#
    # DELETE INPUT #
    #--------------#

    if getval(batch_job, 'delete_input'):
        _task_write_message("Deleting input file")
        # only if successfull
        if not return_code:
            # only if input matches pattern
            if getval(batch_job, 'delete_input_pattern', '') in getval(batch_job, 'input'):
                try:
                    os.remove(getval(batch_job, 'input'))
                except OSError:
                    pass

    #--------------#
    # NOTIFICATION #
    #--------------#

    ## Send Notification emails on errors
    if not return_code:
        if getval(batch_job, 'notify_user'):
            _notify_error_user(getval(batch_job, 'notify_user'),
                               getval(batch_job, 'submission_filename', batch_job['input']),
                               getval(batch_job, 'recid'),
                               getval(batch_job, 'submission_title', ""))
            _task_write_message("Notify user because of an error")
        if getval(batch_job, 'notify_admin'):
            _task_write_message("Notify admin because of an error")
            if type(getval(batch_job, 'notify_admin') == type(str()) ):
                _notify_error_admin(batch_job,
                                    getval(batch_job, 'notify_admin'))

            else:
                _notify_error_admin(batch_job)
    else:
        if getval(batch_job, 'notify_user'):
            _task_write_message("Notify user because of success")
            _notify_success_user(getval(batch_job, 'notify_user'),
                               getval(batch_job, 'submission_filename', batch_job['input']),
                               getval(batch_job, 'recid'),
                               getval(batch_job, 'submission_title', ""))
    return 1
예제 #3
0
def process_batch_job(batch_job_file):
    """ Processes a batch job description dictionary

    @param batch_job_file: a fullpath to a batch job file
    @type batch_job_file: string
    @return: 1 if the process was successful, 0 if not
    @rtype; int
    """
    from invenio.legacy.bibdocfile.cli import cli_fix_marc

    def upload_marcxml_file(marcxml):
        """ Creates a temporary marcxml file and sends it to bibupload
        """
        xml_filename = 'bibencode_'+ str(batch_job['recid']) + '_' + str(uuid.uuid4()) + '.xml'
        xml_filename = os.path.join(invenio.config.CFG_TMPSHAREDDIR, xml_filename)
        xml_file = file(xml_filename, 'w')
        xml_file.write(marcxml)
        xml_file.close()
        targs = ['-c', xml_filename]
        task_low_level_submission('bibupload', 'bibencode', *targs)

    #---------#
    # GENERAL #
    #---------#

    _task_write_message("----------- Handling Master -----------")

    ## Check the validity of the batch file here
    batch_job = json_decode_file(batch_job_file)

    ## Sanitise batch description and raise errrors
    batch_job = sanitise_batch_job(batch_job)

    ## Check if the record exists
    # if record_exists(batch_job['recid']) < 1:
    #     raise Exception("Record not found")

    recdoc = BibRecDocs(batch_job['recid'])

    #--------------------#
    # UPDATE FROM MASTER #
    #--------------------#

    ## We want to add new stuff to the video's record, using the master as input
    if getval(batch_job, 'update_from_master'):
        found_master = False
        bibdocs = recdoc.list_bibdocs()
        for bibdoc in bibdocs:
            bibdocfiles = bibdoc.list_all_files()
            for bibdocfile in bibdocfiles:
                comment = bibdocfile.get_comment()
                description = bibdocfile.get_description()
                subformat = bibdocfile.get_subformat()
                m_comment = getval(batch_job, 'bibdoc_master_comment', comment)
                m_description = getval(batch_job, 'bibdoc_master_description', description)
                m_subformat = getval(batch_job, 'bibdoc_master_subformat', subformat)
                if (comment == m_comment and
                    description == m_description and
                    subformat == m_subformat):
                    found_master = True
                    batch_job['input'] = bibdocfile.get_full_path()
                    ## Get the aspect of the from the record
                    try:
                        ## Assumes pbcore metadata mapping
                        batch_job['aspect'] = get_fieldvalues(124, CFG_BIBENCODE_ASPECT_RATIO_MARC_FIELD)[0]
                    except IndexError:
                        pass
                    break
            if found_master:
                break
        if not found_master:
            _task_write_message("Video master for record %d not found"
                          % batch_job['recid'])
            task_update_progress("Video master for record %d not found"
                                 % batch_job['recid'])
            ## Maybe send an email?
            return 1

    ## Clean the job to do no upscaling etc
    if getval(batch_job, 'assure_quality'):
        batch_job = clean_job_for_quality(batch_job)

    global _BATCH_STEPS
    _BATCH_STEPS = len(batch_job['jobs'])

    ## Generate the docname from the input filename's name or given name
    bibdoc_video_docname, bibdoc_video_extension = decompose_file(batch_job['input'])[1:]
    if not bibdoc_video_extension or getval(batch_job, 'bibdoc_master_extension'):
        bibdoc_video_extension = getval(batch_job, 'bibdoc_master_extension')
    if getval(batch_job, 'bibdoc_master_docname'):
        bibdoc_video_docname = getval(batch_job, 'bibdoc_master_docname')

    write_message("Creating BibDoc for %s" % bibdoc_video_docname)
    ## If the bibdoc exists, receive it
    if bibdoc_video_docname in recdoc.get_bibdoc_names():
        bibdoc_video = recdoc.get_bibdoc(bibdoc_video_docname)
    ## Create a new bibdoc if it does not exist
    else:
        bibdoc_video = recdoc.add_bibdoc(docname=bibdoc_video_docname)

    ## Get the directory auf the newly created bibdoc to copy stuff there
    bibdoc_video_directory = bibdoc_video.get_base_dir()

    #--------#
    # MASTER #
    #--------#
    if not getval(batch_job, 'update_from_master'):
        if getval(batch_job, 'add_master'):
            ## Generate the right name for the master
            ## The master should be hidden first an then renamed
            ## when it is really available
            ## !!! FIX !!!
            _task_write_message("Adding %s master to the BibDoc"
                          % bibdoc_video_docname)
            master_format = compose_format(
                                    bibdoc_video_extension,
                                    getval(batch_job, 'bibdoc_master_subformat', 'master')
                                    )
            ## If a file of the same format is there, something is wrong, remove it!
            ## it might be caused by a previous corrupted submission etc.
            if bibdoc_video.format_already_exists_p(master_format):
                bibdoc_video.delete_file(master_format, 1)
            bibdoc_video.add_file_new_format(
                    batch_job['input'],
                    version=1,
                    description=getval(batch_job, 'bibdoc_master_description'),
                    comment=getval(batch_job, 'bibdoc_master_comment'),
                    docformat=master_format
                    )

    #-----------#
    # JOBS LOOP #
    #-----------#

    return_code = 1
    global _BATCH_STEP

    for job in batch_job['jobs']:

        _task_write_message("----------- Job %s of %s -----------"
                           % (_BATCH_STEP, _BATCH_STEPS))

        ## Try to substitute docname with master docname
        if getval(job, 'bibdoc_docname'):
            job['bibdoc_docname'] = Template(job['bibdoc_docname']).safe_substitute({'bibdoc_master_docname': bibdoc_video_docname})

        #-------------#
        # TRANSCODING #
        #-------------#

        if job['mode'] == 'encode':

            ## Skip the job if assure_quality is not set and marked as fallback
            if not getval(batch_job, 'assure_quality') and getval(job, 'fallback'):
                continue

            if getval(job, 'profile'):
                profile = get_encoding_profile(job['profile'])
            else:
                profile = None
            ## We need an extension defined fot the video container
            bibdoc_video_extension = getval(job, 'extension',
                                            getval(profile, 'extension'))
            if not bibdoc_video_extension:
                raise Exception("No container/extension defined")
            ## Get the docname and subformat
            bibdoc_video_subformat = getval(job, 'bibdoc_subformat')
            bibdoc_slave_video_docname = getval(job, 'bibdoc_docname', bibdoc_video_docname)
            ## The subformat is incompatible with ffmpegs name convention
            ## We do the encoding without and rename it afterwards
            bibdoc_video_fullpath = compose_file(
                                                 bibdoc_video_directory,
                                                 bibdoc_video_extension
                                                 )
            _task_write_message("Transcoding %s to %s;%s" % (bibdoc_slave_video_docname,
                                bibdoc_video_extension,
                                bibdoc_video_subformat))
            ## We encode now directly into the bibdocs directory
            encoding_result = encode_video(
                 input_file=batch_job['input'],
                 output_file=bibdoc_video_fullpath,
                 acodec=getval(job, 'audiocodec'),
                 vcodec=getval(job, 'videocodec'),
                 abitrate=getval(job, 'videobitrate'),
                 vbitrate=getval(job, 'audiobitrate'),
                 resolution=getval(job, 'resolution'),
                 passes=getval(job, 'passes', 1),
                 special=getval(job, 'special'),
                 specialfirst=getval(job, 'specialfirst'),
                 specialsecond=getval(job, 'specialsecond'),
                 metadata=getval(job, 'metadata'),
                 width=getval(job, 'width'),
                 height=getval(job, 'height'),
                 aspect=getval(batch_job, 'aspect'), # Aspect for every job
                 profile=getval(job, 'profile'),
                 update_fnc=_task_update_overall_status,
                 message_fnc=_task_write_message
                 )
            return_code &= encoding_result
            ## only on success
            if  encoding_result:
                ## Rename it, adding the subformat
                os.rename(bibdoc_video_fullpath,
                          compose_file(bibdoc_video_directory,
                                       bibdoc_video_extension,
                                       bibdoc_video_subformat,
                                       1,
                                       bibdoc_slave_video_docname)
                          )
                #bibdoc_video._build_file_list()
                bibdoc_video.touch()
                bibdoc_video._sync_to_db()
                bibdoc_video_format = compose_format(bibdoc_video_extension,
                                                     bibdoc_video_subformat)
                if getval(job, 'bibdoc_comment'):
                    bibdoc_video.set_comment(getval(job, 'bibdoc_comment'),
                                              bibdoc_video_format)
                if getval(job, 'bibdoc_description'):
                    bibdoc_video.set_description(getval(job, 'bibdoc_description'),
                                                 bibdoc_video_format)

        #------------#
        # EXTRACTION #
        #------------#

        # if there are multiple extraction jobs, all the produced files
        # with the same name will be in the same bibdoc! Make sure that
        # you use different subformats or docname templates to avoid
        # conflicts.

        if job['mode'] == 'extract':
            if getval(job, 'profile'):
                profile = get_extract_profile(job['profile'])
            else:
                profile = {}
            bibdoc_frame_subformat = getval(job, 'bibdoc_subformat')
            _task_write_message("Extracting frames to temporary directory")
            tmpdir = invenio.config.CFG_TMPDIR + "/" + str(uuid.uuid4())
            os.mkdir(tmpdir)
            #Move this to the batch description
            bibdoc_frame_docname = getval(job, 'bibdoc_docname', bibdoc_video_docname)
            tmpfname = (tmpdir + "/" + bibdoc_frame_docname + '.'
                        + getval(profile, 'extension',
                        getval(job, 'extension', 'jpg')))
            extraction_result = extract_frames(input_file=batch_job['input'],
                           output_file=tmpfname,
                           size=getval(job, 'size'),
                           positions=getval(job, 'positions'),
                           numberof=getval(job, 'numberof'),
                           width=getval(job, 'width'),
                           height=getval(job, 'height'),
                           aspect=getval(batch_job, 'aspect'),
                           profile=getval(job, 'profile'),
                           update_fnc=_task_update_overall_status,
                           )
            return_code &= extraction_result

            ## only on success:
            if extraction_result:
                ## for every filename in the directorys, create a bibdoc that contains
                ## all sizes of the frame from the two directories
                files = os.listdir(tmpdir)
                for filename in files:
                    ## The docname was altered by BibEncode extract through substitution
                    ## Retrieve it from the filename again
                    bibdoc_frame_docname, bibdoc_frame_extension = os.path.splitext(filename)
                    _task_write_message("Creating new bibdoc for %s" % bibdoc_frame_docname)
                    ## If the bibdoc exists, receive it
                    if bibdoc_frame_docname in recdoc.get_bibdoc_names():
                        bibdoc_frame = recdoc.get_bibdoc(bibdoc_frame_docname)
                    ## Create a new bibdoc if it does not exist
                    else:
                        bibdoc_frame = recdoc.add_bibdoc(docname=bibdoc_frame_docname)

                    ## The filename including path from tmpdir
                    fname = os.path.join(tmpdir, filename)

                    bibdoc_frame_format = compose_format(bibdoc_frame_extension, bibdoc_frame_subformat)
                    ## Same as with the master, if the format allready exists,
                    ## override it, because something went wrong before
                    if bibdoc_frame.format_already_exists_p(bibdoc_frame_format):
                        bibdoc_frame.delete_file(bibdoc_frame_format, 1)
                    _task_write_message("Adding %s jpg;%s to BibDoc"
                                  % (bibdoc_frame_docname,
                                     getval(job, 'bibdoc_subformat')))
                    bibdoc_frame.add_file_new_format(
                                    fname,
                                    version=1,
                                    description=getval(job, 'bibdoc_description'),
                                    comment=getval(job, 'bibdoc_comment'),
                                    docformat=bibdoc_frame_format)
            ## Remove the temporary folders
            _task_write_message("Removing temporary directory")
            shutil.rmtree(tmpdir)

        _BATCH_STEP = _BATCH_STEP + 1

    #-----------------#
    # FIX BIBDOC/MARC #
    #-----------------#

    _task_write_message("----------- Handling MARCXML -----------")

    ## Fix the BibDoc for all the videos previously created
    _task_write_message("Updating BibDoc of %s" % bibdoc_video_docname)
    bibdoc_video._build_file_list()

    ## Fix the MARC
    _task_write_message("Fixing MARC")
    cli_fix_marc({}, [batch_job['recid']], False)

    if getval(batch_job, 'collection'):
        ## Make the record visible by moving in from the collection
        marcxml = ("<record><controlfield tag=\"001\">%d</controlfield>"
                   "<datafield tag=\"980\" ind1=\" \" ind2=\" \">"
                   "<subfield code=\"a\">%s</subfield></datafield></record>"
                   ) % (batch_job['recid'], batch_job['collection'])
        upload_marcxml_file(marcxml)

    #---------------------#
    # ADD MASTER METADATA #
    #---------------------#

    if getval(batch_job, 'add_master_metadata'):
        _task_write_message("Adding master metadata")
        pbcore = pbcore_metadata(input_file = getval(batch_job, 'input'),
                                 pbcoreIdentifier = batch_job['recid'],
                                 aspect_override = getval(batch_job, 'aspect'))
        from invenio_formatter.engines.xslt import format
        marcxml = format(pbcore, CFG_BIBENCODE_PBCORE_MARC_XSLT)
        upload_marcxml_file(marcxml)

    #------------------#
    # ADD MARC SNIPPET #
    #------------------#

    if getval(batch_job, 'marc_snippet'):
        marc_snippet = open(getval(batch_job, 'marc_snippet'))
        marcxml = marc_snippet.read()
        marc_snippet.close()
        upload_marcxml_file(marcxml)

    #--------------#
    # DELETE INPUT #
    #--------------#

    if getval(batch_job, 'delete_input'):
        _task_write_message("Deleting input file")
        # only if successfull
        if not return_code:
            # only if input matches pattern
            if getval(batch_job, 'delete_input_pattern', '') in getval(batch_job, 'input'):
                try:
                    os.remove(getval(batch_job, 'input'))
                except OSError:
                    pass

    #--------------#
    # NOTIFICATION #
    #--------------#

    ## Send Notification emails on errors
    if not return_code:
        if getval(batch_job, 'notify_user'):
            _notify_error_user(getval(batch_job, 'notify_user'),
                               getval(batch_job, 'submission_filename', batch_job['input']),
                               getval(batch_job, 'recid'),
                               getval(batch_job, 'submission_title', ""))
            _task_write_message("Notify user because of an error")
        if getval(batch_job, 'notify_admin'):
            _task_write_message("Notify admin because of an error")
            if type(getval(batch_job, 'notify_admin') == type(str()) ):
                _notify_error_admin(batch_job,
                                    getval(batch_job, 'notify_admin'))

            else:
                _notify_error_admin(batch_job)
    else:
        if getval(batch_job, 'notify_user'):
            _task_write_message("Notify user because of success")
            _notify_success_user(getval(batch_job, 'notify_user'),
                               getval(batch_job, 'submission_filename', batch_job['input']),
                               getval(batch_job, 'recid'),
                               getval(batch_job, 'submission_title', ""))
    return 1
예제 #4
0
def get_files(bfo,
              distinguish_main_and_additional_files=True,
              include_subformat_icons=False):
    """
    Returns the files available for the given record.
    Returned structure is a tuple (parsed_urls, old_versions, additionals):
     - parsed_urls: contains categorized URLS (see details below)
     - old_versions: set to True if we can have access to old versions
     - additionals: set to True if we have other documents than the 'main' document

     Parameter 'include_subformat_icons' decides if subformat
     considered as icons should be returned

    'parsed_urls' is a dictionary in the form::
        {'main_urls' : {'Main'      : [('http://CFG_SITE_URL/CFG_SITE_RECORD/1/files/aFile.pdf', 'aFile', 'PDF'),
                                       ('http://CFG_SITE_URL/CFG_SITE_RECORD/1/files/aFile.gif', 'aFile', 'GIF')],
                        'Additional': [('http://CFG_SITE_URL/CFG_SITE_RECORD/1/files/bFile.pdf', 'bFile', 'PDF')]},

         'other_urls': [('http://externalurl.com/aFile.pdf', 'Fulltext'),      # url(8564_u), description(8564_z/y)
                        ('http://externalurl.com/bFile.pdf', 'Fulltext')],

         'cern_urls' : [('http://cern.ch/aFile.pdf', 'Fulltext'),              # url(8564_u), description(8564_z/y)
                        ('http://cern.ch/bFile.pdf', 'Fulltext')],
        }

    Some notes about returned structure:
        - key 'cern_urls' is only available on CERN site
        - keys in main_url dictionaries are defined by the BibDoc.
        - older versions are not part of the parsed urls
        - returns only main files when possible, that is when doctypes
          make a distinction between 'Main' files and other
          files. Otherwise returns all the files as main. This is only
          enabled if distinguish_main_and_additional_files is set to True
    """
    CFG_SITE_URL = current_app.config['CFG_SITE_URL']
    CFG_CERN_SITE = current_app.config['CFG_CERN_SITE']
    CFG_BIBFORMAT_HIDDEN_FILE_FORMATS = current_app.config[
        'CFG_BIBFORMAT_HIDDEN_FILE_FORMATS']
    _CFG_NORMALIZED_BIBFORMAT_HIDDEN_FILE_FORMATS = set(
        normalize_format(fmt) for fmt in CFG_BIBFORMAT_HIDDEN_FILE_FORMATS)

    _ = gettext_set_language(bfo.lang)

    urls = bfo.fields("8564_")
    bibarchive = BibRecDocs(bfo.recID)

    old_versions = False  # We can provide link to older files. Will be
    # set to True if older files are found.
    additionals = False  # We have additional files. Will be set to
    # True if additional files are found.

    # Prepare object to return
    parsed_urls = {
        'main_urls': {},  # Urls hosted by Invenio (bibdocs)
        'others_urls': []  # External urls
    }
    if CFG_CERN_SITE:
        parsed_urls['cern_urls'] = []  # cern.ch urls

    # Doctypes can of any type, but when there is one file marked as
    # 'Main', we consider that there is a distinction between "main"
    # and "additional" files. Otherwise they will all be considered
    # equally as main files
    distinct_main_and_additional_files = False
    if len(bibarchive.list_bibdocs(doctype='Main')) > 0 and \
           distinguish_main_and_additional_files:
        distinct_main_and_additional_files = True
    # Parse URLs
    for complete_url in urls:
        if complete_url.has_key('u'):
            url = complete_url['u']
            (dummy, host, path, dummy, params, dummy) = urlparse(url)
            subformat = complete_url.get('x', '')
            filename = urllib.unquote(basename(path))
            name = file_strip_ext(filename)
            url_format = filename[len(name):]
            if url_format.startswith('.'):
                url_format = url_format[1:]
            if compose_format(
                    url_format, subformat
            ) in _CFG_NORMALIZED_BIBFORMAT_HIDDEN_FILE_FORMATS:
                ## This format should be hidden.
                continue

            descr = _("Fulltext")
            if complete_url.has_key('y'):
                descr = complete_url['y']
                if descr == 'Fulltext':
                    descr = _("Fulltext")
            if not url.startswith(CFG_SITE_URL):  # Not a bibdoc?
                if not descr:  # For not bibdoc let's have a description
                    # Display the URL in full:
                    descr = url
                if CFG_CERN_SITE and 'cern.ch' in host and \
                       ('/setlink?' in url or \
                        'cms' in host or \
                        'documents.cern.ch' in url or \
                        'doc.cern.ch' in url or \
                        'preprints.cern.ch' in url):
                    url_params_dict = dict([
                        part.split('=') for part in params.split('&')
                        if len(part.split('=')) == 2
                    ])
                    if url_params_dict.has_key('categ') and \
                           (url_params_dict['categ'].split('.', 1)[0] in cern_arxiv_categories) and \
                           url_params_dict.has_key('id'):
                        # Old arXiv links, used to be handled by
                        # setlink. Provide direct links to arXiv
                        for file_format, label in [('pdf', "PDF")]:  #,
                            #('ps', "PS"),
                            #('e-print', "Source (generally TeX or LaTeX)"),
                            #('abs', "Abstract")]:
                            url = "http://arxiv.org/%(format)s/%(category)s/%(id)s" % \
                                  {'format': file_format,
                                   'category': url_params_dict['categ'],
                                   'id': url_params_dict['id']}
                            parsed_urls['others_urls'].append((url, "%s/%s %s" % \
                                                               (url_params_dict['categ'],
                                                                url_params_dict['id'],
                                                                label)))
                else:
                    parsed_urls['others_urls'].append(
                        (url, descr))  # external url
            else:  # It's a bibdoc!
                assigned = False
                for doc in bibarchive.list_bibdocs():
                    if int(doc.get_latest_version()) > 1:
                        old_versions = True
                    if True in [f.get_full_name().startswith(filename) \
                                    for f in doc.list_all_files()]:
                        assigned = True
                        if not include_subformat_icons and \
                               CFG_BIBDOCFILE_ICON_SUBFORMAT_RE.match(subformat):
                            # This is an icon and we want to skip it
                            continue
                        if not doc.get_doctype(bfo.recID) == 'Main' and \
                               distinct_main_and_additional_files == True:
                            # In that case we record that there are
                            # additional files, but don't add them to
                            # returned structure.
                            additionals = True
                        else:
                            if not descr:
                                descr = _('Fulltext')
                            if not parsed_urls['main_urls'].has_key(descr):
                                parsed_urls['main_urls'][descr] = []
                            params_dict = parse_qs(params)
                            if 'subformat' in params_dict:
                                url_format += ' (%s)' % params_dict[
                                    'subformat'][0]
                            parsed_urls['main_urls'][descr].append(
                                (url, name, url_format))
                if not assigned:  # Url is not a bibdoc :-S
                    if not descr:
                        descr = filename
                    parsed_urls['others_urls'].append(
                        (url, descr))  # Let's put it in a general other url
    return (parsed_urls, old_versions, additionals)