Example #1
0
def _insert_blogs(file_path):
    """Queue a BibUpload task inserting the blogs listed in a CSV file.

    @param file_path: file containing the list of blogs to insert
    in the archive. Each blog is represented by its url, [title],
    topic and license.
    E.g: "blogs_to_insert.csv"
    http://blogforever.eu,BlogForever,topic1,license1
    http://blogs.physicstoday.org/,Physicstoday,topic1,license3
    @type file_path: it is a csv file where the elements of
    each row (blog elements) are separated by commas.
    """
    mode = "insert"
    blog_list = _get_blog_list(file_path)

    # Nothing to do: report and abort.
    if not blog_list:
        write_message("There are not blogs to "+ str(mode))
        raise Exception("There are not blogs to "+ str(mode))

    res = _check_input_blogs(blog_list, mode)
    if not res[0]:
        # Validation failed: log the reason and abort.
        write_message(str(res[1]))
        raise Exception(res[1])

    new_blogs_xml = _transform_bloglist_to_marcxml(blog_list, mode)
    xml_file = _write_xml_file(new_blogs_xml, mode)
    task_low_level_submission('bibupload', 'webblog', '-i', xml_file)
Example #2
0
    def bibupload_it(self):
        """Generate MARCXML for every found article and queue a bibupload task.

        Each article's NLM metadata is converted to a MARC record and
        appended to one temporary <collection> file, which is then
        submitted to bibupload in insert/replace mode.  DOIs of articles
        coming from 'ptep_iss' packages are recorded in
        self.doi_package_name_mapping.
        """
        if self.found_articles:
            nlm_parser = NLMParser(self.extract_nations)
            self.logger.debug("Preparing bibupload.")
            fd, name = mkstemp(suffix='.xml', prefix='bibupload_scoap3_',
                               dir=CFG_TMPSHAREDDIR)
            out = fdopen(fd, 'w')
            print("<collection>", file=out)
            for i, path in enumerate(self.found_articles):
                try:
                    print(nlm_parser.get_record(path,
                                                publisher='Oxford',
                                                collection='SCOAP3',
                                                logger=self.logger),
                          file=out)

                    xml_doc = nlm_parser.get_article(path)
                    doi = nlm_parser.get_doi(xml_doc)
                    # Remember the (package, doi) pair for PTEP packages.
                    package_name = [x for x in path.split('/')
                                    if 'ptep_iss' in x]
                    if package_name:
                        self.doi_package_name_mapping.append((package_name[0],
                                                              doi))
                except Exception as err:
                    print(err, file=sys.stderr)
                    raise
                print(path, i + 1, "out of", len(self.found_articles))
            print("</collection>", file=out)
            out.close()
            # BUGFIX: a missing comma merged "-N" "OUP" into the single
            # argument "-NOUP"; pass the option and its value separately
            # (consistent with the other bibupload submissions in this file).
            task_low_level_submission("bibupload", "admin",
                                      "-N", "OUP", "-i", "-r", name)
Example #3
0
 def bibupload_it(self):
     """Generate MARCXML for every found article and queue a bibupload task.

     Each article's NLM metadata is converted to a MARC record, written to
     one temporary <collection> file, and submitted to bibupload in
     insert/replace mode.
     """
     if self.found_articles:
         nlm_parser = NLMParser(self.extract_nations)
         self.logger.debug("Preparing bibupload.")
         fd, name = mkstemp(
             suffix='.xml',
             prefix='bibupload_scoap3_',
             dir=CFG_TMPSHAREDDIR)
         out = fdopen(fd, 'w')
         print("<collection>", file=out)
         for i, path in enumerate(self.found_articles):
             try:
                 print(
                     nlm_parser.get_record(
                         path,
                         publisher='Oxford',
                         collection='SCOAP3',
                         logger=self.logger),
                     file=out)
             except Exception as err:
                 # BUGFIX: was the Python 2 statement `print >> sys.stderr, err`,
                 # a syntax error in the print-function style used everywhere
                 # else in this method.
                 print(err, file=sys.stderr)
                 # BUGFIX: bare `raise` re-raises the original exception with
                 # its traceback instead of a new, empty Exception.
                 raise
             print(path, i + 1, "out of", len(self.found_articles))
         print("</collection>", file=out)
         out.close()
         # BUGFIX: a missing comma merged "-N" "OUP" into "-NOUP"; pass the
         # option and its value as separate arguments.
         task_low_level_submission("bibupload", "admin", "-N",
                                   "OUP", "-i", "-r", name)
def upload_keywords(filename, mode='correct', recids=None):
    """Store extracted keywords in the database via a bibupload task.

    @var filename: fullpath to the file with marc record
    @keyword mode: correct|replace|add|delete
        use correct to add fields if they are different
        replace all fields with fields from the file
        add - add (even duplicate) fields
        delete - delete fields which are inside the file
    @keyword recids: list of record ids, this arg comes from
        the bibclassify daemon and it is used when the recids
        contains one entry (recid) - ie. one individual document
        was processed. We use it to mark the job title so that
        it is possible to query database if the bibclassify
        was run over that document (in case of collections with
        many recids, we simply construct a general title)
    """
    # Map the symbolic mode onto the matching bibupload switch.
    mode_flags = {'correct': '-c',
                  'replace': '-r',
                  'add': '-a',
                  'delete': '-d'}
    if mode not in mode_flags:
        raise Exception('Unknown mode')
    m = mode_flags[mode]

    # let's use the user column to store the information, cause no better
    # alternative in sight...
    user_title = 'bibclassify.upload'
    if recids and len(recids) == 1:
        user_title = 'extract:%d' % recids[0]
    bibtask.task_low_level_submission('bibupload', user_title, '-n', m,
                                      filename)
Example #5
0
def process_dir(dir_name, delete=False):
    """Process every file of a remote FTP directory and batch-upload it.

    @param dir_name: remote FTP directory to walk into
    @param delete: when True, delete each remote file after processing and
        finally remove the remote directory itself
    """
    ftp.cwd(dir_name)
    ls = ftp.nlst()
    local_dir = "/opt/invenio/var/batchupload/metadata/replace/"
    if ls:
        # Create the local METS directory once: it is the same for every
        # file (it used to be re-attempted on each loop iteration).
        try:
            mkdir('/opt/invenio/var/batchupload/mets/' + dir_name)
        except:
            # Directory most likely exists already; best-effort behaviour
            # kept from the original code.
            pass
    for line in ls:
        # BUGFIX: removed the dead `file_name = line.splitlines()[-1]`
        # assignment that was immediately overwritten; the nlst() entry
        # itself is the file name.
        file_name = line
        process_file(dir_name, file_name)
        if delete:
            try:
                ftp.delete(file_name)
            except:
                print "ERROR deleting file " + file_name
    ftp.cwd('..')

    task_low_level_submission('bibupload', 'batchupload', '--replace', local_dir + dir_name + '.xml', '--pre-plugin=bp_pre_ingestion', '--post-plugin=bp_post_ingestion')

    if delete:
        try:
            ftp.rmd(dir_name)
        except:
            print "ERROR deleting dir " + dir_name
Example #6
0
def bibupload(record=None, collection=None, file_prefix="", mode="-c"):
    """
    General purpose function that will write a MARCXML file and call bibupload
    on it.
    """
    if record is None and collection is None:
        return

    (out, path) = open_temp_file(file_prefix)

    if collection is not None:
        out.write("<collection>")
        count = 0
        for marc_rec in collection:
            out.write(record_xml_output(marc_rec))
            count += 1
            if count == MAX_RECORDS:
                # Chunk is full: close it, submit it, start a fresh file.
                out.write("</collection>")
                out.close()
                logger.debug("Submitting bibupload %s -n %s" % (mode, path))
                task_low_level_submission('bibupload', 'openaire', mode,
                                          path, '-n')

                (out, path) = open_temp_file(file_prefix)
                out.write("<collection>")
                count = 0
        out.write("</collection>")
    else:
        out.write(record_xml_output(record))
        count = 1

    out.close()
    if count > 0:
        logger.debug("Submitting bibupload %s -n %s" % (mode, path))
        task_low_level_submission('bibupload', 'openaire', mode, path, '-n')
    def bibupload_it(self):
        """Build MARCXML for the found articles and submit bibupload tasks.

        Articles are split in two groups: regular Elsevier packages
        (submitted in insert/replace mode) and VTEX packages carrying
        PDF/A files (submitted in append mode), each in its own
        temporary <collection> file.
        """
        print self.found_articles
        if self.found_articles:
            if [x for x in self.found_articles if "vtex" not in x]:
                # Regular (non-VTEX) Elsevier articles.
                self.logger.debug("Preparing bibupload.")
                fd, name = mkstemp(suffix='.xml', prefix='bibupload_scoap3_', dir=CFG_TMPSHAREDDIR)
                out = fdopen(fd, 'w')
                print >> out, "<collection>"
                for i, path in enumerate(self.found_articles):
                    if "vtex" not in path:
                        print >> out, self.get_record(path)
                        print path, i + 1, "out of", len(self.found_articles)
                print >> out, "</collection>"
                out.close()
                task_low_level_submission("bibupload", "admin", "-N", "Elsevier", "-i", "-r", name)

            if [x for x in self.found_articles if "vtex" in x]:
            # for VTEX files with PDF/A
                self.logger.debug("Preparing bibupload for PDF/As.")
                fd_vtex, name_vtex = mkstemp(suffix='.xml', prefix='bibupload_scoap3_', dir=CFG_TMPSHAREDDIR)
                out = fdopen(fd_vtex, 'w')
                print >> out, "<collection>"
                # NOTE: enumerate restarts at 0 here; `i` indexes the whole
                # found_articles list, not just the VTEX entries.
                for i, path in enumerate(self.found_articles):
                    if "vtex" in path:
                        print >> out, self.get_pdfa_record(path)
                        print path, i + 1, "out of", len(self.found_articles)
                print >> out, "</collection>"
                out.close()
                task_low_level_submission("bibupload", "admin", "-N", "Elsevier:VTEX", "-a", name_vtex)
Example #8
0
def update_references(recid, overwrite=True):
    """Update references for a record

    First, we extract references from a record.
    Then, we are not updating the record directly but adding a bibupload
    task in -c mode which takes care of updating the record.

    Parameters:
    * recid: the id of the record
    """
    if not overwrite:
        # Refuse to touch a record that already carries references.
        record = get_record(recid)
        if record and record_has_field(record, "999"):
            raise RecordHasReferences(
                "Record has references and overwrite mode is disabled: %s"
                % recid)

    # Curated references must never be regenerated automatically.
    if get_fieldvalues(recid, "999C59"):
        raise RecordHasReferences("Record has been curated: %s" % recid)

    # Parse references
    references_xml = extract_references_from_record_xml(recid)

    # Save new record to file
    temp_fd, temp_path = mkstemp(prefix=CFG_REFEXTRACT_FILENAME,
                                 dir=CFG_TMPSHAREDDIR)
    with os.fdopen(temp_fd, "w") as temp_file:
        temp_file.write(references_xml.encode("utf-8"))

    # Update record
    task_low_level_submission("bibupload", "refextract", "-P", "5", "-c",
                              temp_path)
Example #9
0
    def _run_tasks(obj, dummy_eng):
        """Queue the upload tasklet and webcoll tasks for a sealed deposition."""
        deposition = Deposition(obj)
        sip = deposition.get_latest_sip(sealed=True)

        recid = sip.metadata['recid']
        communities = sip.metadata.get('provisional_communities', [])

        common_args = ['-P5', ]
        # Propagate the workflow's task sequence id when one is set.
        sequenceid = getattr(deposition.workflow_object, 'task_sequence_id',
                             None)
        if sequenceid:
            common_args.extend(['-I', str(sequenceid)])

        tasklet_name = ('bst_openaire_update_upload' if update
                        else 'bst_openaire_new_upload')

        sip.task_ids.append(task_low_level_submission(
            'bibtasklet', 'webdeposit', '-T', tasklet_name,
            '--argument', 'recid=%s' % recid, *common_args))

        # One webcoll run per provisional community.
        for community in communities:
            sip.task_ids.append(task_low_level_submission(
                'webcoll', 'webdeposit', '-c',
                'provisional-user-%s' % community, *common_args))
        deposition.update()
Example #10
0
def _delete_blogs(file_path):
    """Queue a BibUpload task deleting the blogs listed in a CSV file.

    @param file_path: file containing the list of blogs to delete
    from the archive. Each blog is represented just by its url.
    E.g: "blogs_to_delete.csv"
    http://blogforever.eu
    http://blogs.physicstoday.org/
    @type file_path: it is a csv file where each row is the url of
    a blog to delete.
    """
    mode = "delete"
    blog_list = _get_blog_list(file_path)

    # Nothing to do: report and abort.
    if not blog_list:
        write_message("There are not blogs to "+ str(mode))
        raise Exception("There are not blogs to " + str(mode))

    res = _check_input_blogs(blog_list, mode)
    if not res[0]:
        # Validation failed: log the reason and abort.
        write_message(str(res[1]))
        raise Exception(res[1])

    records_to_delete = _get_records_to_delete(blog_list)
    deleted_blogs_xml = _transform_bloglist_to_marcxml(records_to_delete, mode)
    xml_file = _write_xml_file(deleted_blogs_xml, mode)
    task_low_level_submission('bibupload', 'webblog', '-c', xml_file)
Example #11
0
def save_xml_record(recid, uid, xml_record='', to_upload=True, to_merge=False,
                    task_name="bibedit", sequence_id=None):
    """Write XML record to file. Default behaviour is to read the record from
    a BibEdit cache file, filter out the unchanged volatile subfields,
    write it back to an XML file and then pass this file to BibUpload.

    @param xml_record: give XML as string in stead of reading cache file
    @param to_upload: pass the XML file to BibUpload
    @param to_merge: prepare an XML file for BibMerge to use
    @param task_name: when "bibedit", the submitted task is named 'bibedit'
    @param sequence_id: optional task sequence identifier (passed via -I)
    @return: True on success, False when no cached record was found

    """
    if not xml_record:
        # Read record from cache file.
        cache = get_cache_contents(recid, uid)
        if not cache:
            # BUGFIX: a missing cache previously fell through and crashed
            # below with a NameError on `record`; report failure instead.
            return False
        record = cache[2]
        used_changes = cache[4]
        xml_record = record_xml_output(record)
        delete_cache(recid, uid)
        delete_disabled_changes(used_changes)
    else:
        record = create_record(xml_record)[0]

    # clean the record from unfilled volatile fields
    record_strip_empty_volatile_subfields(record)
    record_strip_empty_fields(record)

    # order subfields alphabetically before saving the record
    record_order_subfields(record)

    xml_to_write = wash_for_xml(record_xml_output(record))

    # Write XML file.
    if not to_merge:
        fd, file_path = tempfile.mkstemp(dir=CFG_BIBEDIT_CACHEDIR,
                                         prefix="%s_" % CFG_BIBEDIT_FILENAME,
                                         suffix="_%s_%s.xml" % (recid, uid))
        f = os.fdopen(fd, 'w')
        f.write(xml_to_write)
        f.close()
    else:
        file_path = '%s_%s.xml' % (_get_file_path(recid, uid),
                                   CFG_BIBEDIT_TO_MERGE_SUFFIX)
        xml_file = open(file_path, 'w')
        xml_file.write(xml_to_write)
        xml_file.close()

    user_name = get_user_info(uid)[1]
    if to_upload:
        # Pass XML file to BibUpload.
        args = ['bibupload', user_name, '-P', '5', '-r',
                file_path, '-u', user_name]
        if task_name == "bibedit":
            args.extend(['--name', 'bibedit'])
        if sequence_id:
            args.extend(["-I", sequence_id])
        args.extend(['--email-logs-on-error'])
        task_low_level_submission(*args)
    return True
Example #12
0
def submit_refextract_task(recids):
    """Submit a refextract task if needed"""
    # Skip records we cannot safely extract references from
    # (mostly because they have been curated).
    eligible = [recid for recid in recids
                if check_record_for_refextract(recid)]

    if eligible:
        task_low_level_submission('refextract', NAME, '-i',
                                  ','.join(map(str, eligible)))
def create_ill_record(book_info):
    """
    Create a new ILL record

    @param book_info: book's information
    @type book_info: tuple (title, author, place, publisher, year,
        edition, isbn)

    @return MARC record
    """
    # Local import keeps the fix self-contained; saxutils is stdlib.
    from xml.sax.saxutils import escape

    (title, author, place, publisher, year, edition, isbn) = book_info

    # BUGFIX: escape XML special characters (&, <, >) in every interpolated
    # value; previously a title such as "Tom & Jerry" produced malformed
    # MARCXML.
    escaped = {}
    for key, value in (('isbn', isbn), ('author', author), ('title', title),
                       ('edition', edition), ('place', place),
                       ('publisher', publisher), ('year', year)):
        escaped[key] = escape('%s' % (value,))

    ill_record = """
    <record>
    <datafield tag="020" ind1=" " ind2=" ">
      <subfield code="a">%(isbn)s</subfield>
    </datafield>
    <datafield tag="100" ind1=" " ind2=" ">
      <subfield code="a">%(author)s</subfield>
    </datafield>
    <datafield tag="245" ind1=" " ind2=" ">
      <subfield code="a">%(title)s</subfield>
    </datafield>
    <datafield tag="250" ind1=" " ind2=" ">
      <subfield code="a">%(edition)s</subfield>
    </datafield>
    <datafield tag="260" ind1=" " ind2=" ">
      <subfield code="a">%(place)s</subfield>
      <subfield code="b">%(publisher)s</subfield>
      <subfield code="c">%(year)s</subfield>
    </datafield>
    <datafield tag="980" ind1=" " ind2=" ">
      <subfield code="a">ILLBOOK</subfield>
    </datafield>
  </record>

  """ % escaped

    file_path = '%s/%s_%s.xml' % (CFG_TMPDIR, 'bibcirculation_ill_book',
                                  time.strftime("%Y%m%d_%H%M%S"))

    xml_file = open(file_path, 'w')
    xml_file.write(ill_record)
    xml_file.close()

    # Pass XML file to BibUpload.
    task_low_level_submission('bibupload', 'bibcirculation', '-P', '5', '-i',
                              file_path)

    return ill_record
Example #14
0
def create_ill_record(book_info):
    """
    Create a new ILL record

    @param book_info: book's information
    @type book_info: tuple (title, author, place, publisher, year,
        edition, isbn)

    @return MARC record
    """
    # Local import keeps the fix self-contained; saxutils is stdlib.
    from xml.sax.saxutils import escape

    (title, author, place, publisher, year, edition, isbn) = book_info

    # BUGFIX: escape XML special characters (&, <, >) in every interpolated
    # value; previously a title such as "Tom & Jerry" produced malformed
    # MARCXML.
    escaped = {}
    for key, value in (("isbn", isbn), ("author", author), ("title", title),
                       ("edition", edition), ("place", place),
                       ("publisher", publisher), ("year", year)):
        escaped[key] = escape("%s" % (value,))

    ill_record = """
    <record>
    <datafield tag="020" ind1=" " ind2=" ">
      <subfield code="a">%(isbn)s</subfield>
    </datafield>
    <datafield tag="100" ind1=" " ind2=" ">
      <subfield code="a">%(author)s</subfield>
    </datafield>
    <datafield tag="245" ind1=" " ind2=" ">
      <subfield code="a">%(title)s</subfield>
    </datafield>
    <datafield tag="250" ind1=" " ind2=" ">
      <subfield code="a">%(edition)s</subfield>
    </datafield>
    <datafield tag="260" ind1=" " ind2=" ">
      <subfield code="a">%(place)s</subfield>
      <subfield code="b">%(publisher)s</subfield>
      <subfield code="c">%(year)s</subfield>
    </datafield>
    <datafield tag="980" ind1=" " ind2=" ">
      <subfield code="a">ILLBOOK</subfield>
    </datafield>
  </record>

  """ % escaped

    file_path = "%s/%s_%s.xml" % (CFG_TMPDIR, "bibcirculation_ill_book", time.strftime("%Y%m%d_%H%M%S"))

    xml_file = open(file_path, "w")
    xml_file.write(ill_record)
    xml_file.close()

    # Pass XML file to BibUpload.
    task_low_level_submission("bibupload", "bibcirculation", "-P", "5", "-i", file_path)

    return ill_record
def process_bibcodes_to_delete(extraction_directory, upload_mode):
    """Create the MARCXML flagging the records in BIBCODES_TO_DELETE_LIST
    as deleted, then upload it.

    Each record gets a 970__a subfield holding the bibcode and a 980__c
    subfield set to "DELETED".

    @param extraction_directory: directory of the current extraction run
    @param upload_mode: 'concurrent' (merge in-process via bibupload_merger)
        or 'bibupload' (write a file and submit a bibupload task)
    @return: True (an unsupported upload_mode is only logged, not raised)
    """
    logger.info("In function %s" % (inspect.stack()[0][3],))

    # A single file holds every bibcode to delete: the XML is simple enough
    # that splitting the content into groups is unnecessary.

    # Base object for the XML tree.
    doc = libxml2.newDoc("1.0")
    root = doc.newChild(None, "collection", None)

    # One minimal record per bibcode to delete.
    for bibcode in BIBCODES_TO_DELETE_LIST:
        record = root.newChild(None, 'record', None)
        # Attach the two required datafields.
        d970 = record.newChild(None, 'datafield', None)
        d970.setProp('tag', '970')
        d970.setProp('ind1', '')
        d970.setProp('ind2', '')
        # Subfield 970__a carries the bibcode; '&' must be entity-escaped
        # because the text is inserted verbatim into the node.
        sub = d970.newChild(None, 'subfield', bibcode.replace('&', '&amp;'))
        sub.setProp("code", "a")
        d980 = record.newChild(None, 'datafield', None)
        d980.setProp('tag', '980')
        d980.setProp('ind1', '')
        d980.setProp('ind2', '')
        # Subfield 980__c marks the record as deleted.
        sub = d980.newChild(None, 'subfield', "DELETED")
        sub.setProp("code", "c")

    # Serialize the tree...
    marcxml_string = doc.serialize('UTF-8', 1)
    # ...and free the underlying C document (libxml2 nodes are not managed
    # by Python's garbage collector).
    doc.freeDoc()
    del doc
    # Record the processed bibcodes in the "done bibcodes" file.
    w2f = write_files.WriteFile(extraction_directory, logger)
    w2f.write_done_bibcodes_to_file(BIBCODES_TO_DELETE_LIST)
    del w2f

    if upload_mode == 'concurrent':
        # Transform the XML into bibrecord objects...
        bibrecord_object = [elem[0] for elem in bibrecord.create_records(marcxml_string)]
        # ...and upload the result with option append.
        logger.warning('Upload of records to delete started.')
        bibupload_merger(bibrecord_object, logger, 'append')
        logger.warning('Upload of records to delete ended.')
    elif upload_mode == 'bibupload':
        filepath = os.path.join(settings.BASE_OUTPUT_PATH, extraction_directory, settings.BASE_BIBRECORD_FILES_DIR, settings.BIBCODE_TO_DELETE_OUT_NAME)
        with open(filepath, 'w') as marcxml_to_del_file:
            marcxml_to_del_file.write(marcxml_string)
        task_low_level_submission('bibupload', 'admin', '-a', filepath)
        logger.warning('File "%s" submitted to bibupload.' % filepath)
    else:
        logger.error('Upload mode "%s" not supported! File not uploaded' % upload_mode)
    return True
Example #16
0
def openaire_create_icon(docid=None, recid=None, reformat=True):
    """
    Celery task to create an icon for all documents in a given record or for
    just a specific document.

    @param docid: id of a single BibDoc to process (used when recid is None)
    @param recid: record id; when given, every bibdoc of the record is checked
    @param reformat: when True, submit a bibreformat task for the records a
        new icon was attached to
    """
    if recid:
        docs = BibRecDocs(recid).list_bibdocs()
    else:
        docs = [BibDoc(docid)]

    # Celery task will fail if BibDoc does not exists (on purpose ;-)
    for d in docs:
        logger.debug("Checking document %s" % d)
        if not d.get_icon(subformat_re=re.compile(ICON_SUBFORMAT)):
            logger.debug("Document has no icon")
            for f in d.list_latest_files():
                logger.debug("Checking file %s" % f)
                if not f.is_icon():
                    logger.debug("File not an icon")
                    file_path = f.get_full_path()
                    icon_path = None
                    try:
                        filename = os.path.splitext(
                            os.path.basename(file_path)
                        )[0]
                        logger.info("Creating icon from file %s" % file_path)
                        (icon_dir, icon_name) = create_icon(
                            {'input-file': file_path,
                             'icon-name': "icon-%s" % filename,
                             'multipage-icon': False,
                             'multipage-icon-delay': 0,
                             'icon-scale': ICON_SIZE,
                             'icon-file-format': ICON_FILEFORMAT,
                             'verbosity': 0})
                        icon_path = os.path.join(icon_dir, icon_name)
                    except InvenioWebSubmitIconCreatorError, e:
                        # Icon creation failed: log and alert, but keep
                        # processing the remaining files (icon_path stays
                        # None, so the attach step below is skipped).
                        logger.warning('Icon for file %s could not be created: %s' % (file_path, str(e)))
                        register_exception(
                            prefix='Icon for file %s could not be created: %s' % (file_path, str(e)),
                            alert_admin=False
                        )

                    try:
                        if icon_path and os.path.exists(icon_path):
                            logger.debug("Adding icon %s to document" % icon_path)
                            d.add_icon(icon_path, subformat=ICON_SUBFORMAT)
                            # Reformat every record this document is linked
                            # to so the new icon becomes visible.
                            recid_list = ",".join([str(x['recid']) for x in d.bibrec_links])
                            if reformat:
                                task_low_level_submission('bibreformat', 'openaire', '-i', recid_list)

                    except InvenioBibDocFileError, e:
                        logger.warning('Icon %s for file %s could not be added to document: %s' % (icon_path, f, str(e)))
                        register_exception(
                            prefix='Icon %s for file %s could not be added to document: %s' % (icon_path, f, str(e)),
                            alert_admin=False
                        )
Example #17
0
    def bibupload_it(self):
        """Convert every found article to MARCXML and queue one bibupload task.

        Each article directory is scanned for either a JHEP ``.xml.scoap``
        file (parsed with APPParser) or an EPJC ``_nlm.xml`` file (parsed
        with JATSParser); only the first matching file per directory is
        used.  All records go into a single temporary <collection> file
        submitted to bibupload in insert/replace mode.
        """
        if self.found_articles:
            self.logger.debug("Preparing bibupload.")
            fd, name = mkstemp(suffix='.xml', prefix='bibupload_scoap3_',
                               dir=CFG_TMPSHAREDDIR)
            out = fdopen(fd, 'w')
            print >> out, "<collection>"
            for i, path in enumerate(self.found_articles):
                try:
                    for filename in listdir(path):

                        # Decide the metadata flavour from the file suffix.
                        if filename.endswith(".xml.scoap"):
                            xml_end = True
                        elif filename.endswith("_nlm.xml"):
                            xml_end = False
                        else:
                            continue

                        l_info = '%s is JHCP' if xml_end else '%s is EPJC'
                        lc_info = 'Found %s. Calling SISSA' if xml_end \
                                  else 'Found %s. Calling Springer'
                        publi = 'SISSA' if xml_end else 'Springer'

                        if xml_end:
                            parser = APPParser(extract_nations=
                                               self.extract_nations)
                        else:
                            parser = JATSParser(extract_nations=
                                                self.extract_nations)

                        self.logger.info(l_info % path)
                        self.logger.info(lc_info % filename)
                        rec = parser.get_record(join(path, filename),
                                                publisher=publi,
                                                collection='SCOAP3',
                                                logger=self.logger)

                        xml_doc = parser.get_article(join(path, filename))
                        doi = parser.get_doi(xml_doc)
                        # Remember the (package, doi) pair for scoap3 packages.
                        package_name = [x for x in path.split('/')
                                        if 'scoap3_package' in x]
                        if package_name:
                            doi_name_map = (package_name[0], doi)
                            self.doi_package_name_mapping.append(doi_name_map)

                        print >> out, rec
                        # Only the first matching metadata file is processed.
                        break
                    print path, i + 1, "out of", len(self.found_articles)
                except Exception as err:
                    register_exception(alert_admin=True)
                    # NOTE(review): `filename` is unbound here if listdir()
                    # itself raised before the inner loop started.
                    self.logger.error("Error creating record from: %s \n%s"
                                      % (join(path, filename), err))
            print >> out, "</collection>"
            out.close()
            task_low_level_submission("bibupload", "admin", "-N",
                                      "Springer", "-i", "-r", name)
Example #18
0
 def upload_marcxml_file(marcxml):
     """ Creates a temporary marcxml file and sends it to bibupload

     @param marcxml: MARCXML content to write and submit in correct mode
     """
     xml_filename = "bibencode_" + str(batch_job["recid"]) + "_" + str(uuid.uuid4()) + ".xml"
     xml_filename = os.path.join(invenio.config.CFG_TMPSHAREDDIR, xml_filename)
     # BUGFIX: use open() instead of the deprecated file() builtin
     # (file() was removed in Python 3).
     xml_file = open(xml_filename, "w")
     xml_file.write(marcxml)
     xml_file.close()
     targs = ["-c", xml_filename]
     task_low_level_submission("bibupload", "bibencode", *targs)
 def upload_marcxml_file(marcxml):
     """ Creates a temporary marcxml file and sends it to bibupload

     @param marcxml: MARCXML content to write and submit in correct mode
     """
     xml_filename = 'bibencode_'+ str(batch_job['recid']) + '_' + str(uuid.uuid4()) + '.xml'
     xml_filename = os.path.join(invenio.config.CFG_TMPSHAREDDIR, xml_filename)
     # BUGFIX: use open() instead of the deprecated file() builtin
     # (file() was removed in Python 3).
     xml_file = open(xml_filename, 'w')
     xml_file.write(marcxml)
     xml_file.close()
     targs = ['-c', xml_filename]
     task_low_level_submission('bibupload', 'bibencode', *targs)
def submit_xml(xml):
    """Write *xml* to a shared temporary file and queue a corrective upload."""
    # Save new record to file
    temp_fd, temp_path = mkstemp(prefix='refextract-fixup',
                                 dir=CFG_TMPSHAREDDIR)
    with os.fdopen(temp_fd, 'w') as handle:
        handle.write(xml)

    # Update record
    task_low_level_submission('bibupload', 'refextract-fixup', '-P', '5',
                              '-c', temp_path)
Example #21
0
def save_xml_record(recid, uid, xml_record='', to_upload=True, to_merge=False, spec_name=''):
    """Write XML record to file. Default behaviour is to read the record from
    a BibEdit cache file, filter out the unchanged volatile subfields,
    write it back to an XML file and then pass this file to BibUpload.

    @param xml_record: give XML as string in stead of reading cache file
    @param to_upload: pass the XML file to BibUpload
    @param to_merge: prepare an XML file for BibMerge to use
    @param spec_name: optional task name forwarded to bibupload via -N
    @return: True on success, False when no cached record was found

    """
    if not xml_record:
        # Read record from cache file.
        cache = get_cache_file_contents(recid, uid)
        if not cache:
            # BUGFIX: a missing cache previously fell through and crashed
            # below with a NameError on `record`; report failure instead.
            return False
        record = cache[2]
        used_changes = cache[4]
        # record_strip_empty_fields() is performed for every record below,
        # after removing unfilled volatile fields.
        xml_record = record_xml_output(record)
        delete_cache_file(recid, uid)
        delete_disabled_changes(used_changes)
    else:
        record = create_record(xml_record)[0]

    # clean the record from unfilled volatile fields
    record_strip_empty_volatile_subfields(record)
    record_strip_empty_fields(record)

    # TP: subfield reordering (record_order_subfields) is deliberately
    # skipped here -- we do not want it.

    xml_to_write = wash_for_xml(record_xml_output(record))

    # Write XML file.
    if not to_merge:
        file_path = '%s.xml' % _get_file_path(recid, uid)
    else:
        file_path = '%s_%s.xml' % (_get_file_path(recid, uid),
                                   CFG_BIBEDIT_TO_MERGE_SUFFIX)
    xml_file = open(file_path, 'w')
    xml_file.write(xml_to_write)
    xml_file.close()

    user_name = get_user_info(uid)[1]
    if to_upload:
        # Pass XML file to BibUpload; append the task name only when given
        # (previously the whole call was duplicated in both branches).
        args = ['bibupload', 'bibedit', '-P', '5', '-r',
                file_path, '-u', user_name]
        if spec_name != '':
            args.extend(['-N', spec_name])
        task_low_level_submission(*args)
    return True
Example #22
0
def submit_bibrank_task(to_update, methods, user, priority=3, disable_citation_losses_check=False):
    """Queue a bibrank task for the given record ids.

    @param to_update: iterable of record ids to rank
    @param methods: ranking method(s), passed to bibrank via -w
    @param user: user name the task is submitted under
    @param priority: task priority (-P)
    @param disable_citation_losses_check: when True, add the
        --disable-citation-losses-check flag
    @return: id of the submitted task
    """
    recids = [str(r) for r in to_update]
    # Build the argument list once instead of duplicating the whole call
    # in both branches.
    args = ['bibrank', user, '-w', methods, '-P', str(priority)]
    if disable_citation_losses_check:
        args.append('--disable-citation-losses-check')
    args.extend(['-i', ','.join(recids)])
    return task_low_level_submission(*args)
Example #23
0
def cb_submit_bibupload(bibcatalog_system=None, records=None):
    """Write the given records to a temp file and queue a corrective upload."""
    if not records:
        return

    references_xml = print_records(records)

    # Save new record to file
    fd, path = mkstemp(prefix=CFG_REFEXTRACT_FILENAME, dir=CFG_TMPSHAREDDIR)
    handle = os.fdopen(fd, 'w')
    handle.write(references_xml)
    handle.close()

    # Update record
    task_low_level_submission('bibupload', 'refextract', '-c', path)
def metadata_upload(req, metafile=None, filetype=None, mode=None, exec_date=None,
                    exec_time=None, metafilename=None, ln=CFG_SITE_LANG):
    """
    Metadata web upload service. Get upload parameters and exec bibupload for the given file.
    Finally, write upload history.

    @param req: HTTP request object (the response headers are written to it)
    @param metafile: uploaded form file holding the metadata
    @param filetype: 'marcxml' to use the payload as-is, anything else is
        converted via _transform_input_to_marcxml
    @param mode: bibupload mode switch forwarded verbatim to the task
    @param exec_date: optional scheduled run date (with exec_time, via -t)
    @param exec_time: optional scheduled run time
    @param metafilename: original name of the uploaded file (used in the
        temp file name, the task name and the history entry)
    @param ln: interface language
    @return: tuple (error code, message)
        error code: code that indicates if an error occurred
        message: message describing the error
    """
    # start output:
    req.content_type = "text/html"
    req.send_http_header()

    # write temporary file:
    if filetype == 'marcxml':
        metafile = metafile.value
    else:
        metafile = _transform_input_to_marcxml(file_input=metafile.value)

    user_info = collect_user_info(req)
    tempfile.tempdir = CFG_TMPDIR
    # SECURITY: use NamedTemporaryFile(delete=False) instead of the old
    # tempfile.mktemp() + open(): mktemp() only returns a name, leaving a
    # race window in which another process can create the file first.
    filedesc = tempfile.NamedTemporaryFile(
        mode='w', delete=False, dir=CFG_TMPDIR,
        prefix="batchupload_" + user_info['nickname'] + "_" +
        time.strftime("%Y%m%d%H%M%S", time.localtime()) + "_" +
        metafilename + "_")
    filename = filedesc.name
    filedesc.write(metafile)
    filedesc.close()

    # check if this client can run this file:
    allow = _check_client_can_submit_file(req=req, metafile=metafile, webupload=1, ln=ln)
    if allow[0] != 0:
        return (allow[0], allow[1])

    # run upload command:
    if exec_date:
        date = "\'" + exec_date + ' ' + exec_time + "\'"
        jobid = task_low_level_submission('bibupload', user_info['nickname'], mode, "--name=" + metafilename, "-t", date, filename)
    else:
        jobid = task_low_level_submission('bibupload', user_info['nickname'], mode, "--name=" + metafilename, filename)

    # write batch upload history
    run_sql("""INSERT INTO hstBATCHUPLOAD (user, submitdate,
            filename, execdate, id_schTASK, batch_mode)
            VALUES (%s, NOW(), %s, %s, %s, "metadata")""",
            (user_info['nickname'], metafilename,
            exec_date != "" and (exec_date + ' ' + exec_time)
            or time.strftime("%Y-%m-%d %H:%M:%S"), str(jobid), ))
    return (0, "Task %s queued" % str(jobid))
示例#25
0
    def upload_marcxml(self, marcxml, mode):
        """Upload a MARCXML record to the server.

        Parameters:
          marcxml - *str* the XML to upload.
             mode - *str* the mode to use for the upload.
                    "-i" insert new records
                    "-r" replace existing records
                    "-c" correct fields of records
                    "-a" append fields to records
                    "-ir" insert record or replace if it exists
        """
        if mode not in ("-i", "-r", "-c", "-a", "-ir"):
            raise NameError("Incorrect mode " + str(mode))

        if LOCAL_SITE_URL != self.server_url:
            # Remote server: POST through the batchuploader robot interface.
            payload = urllib.urlencode({'file': marcxml,
                                        'mode': mode})
            return urllib2.urlopen(self.server_url + "/batchuploader/robotupload",
                                   payload)

        # Running locally: dump the XML to a temporary file and submit a
        # bibupload task directly.
        fd, xml_path = tempfile.mkstemp(prefix="upload_%s" % \
                                        time.strftime("%Y%m%d_%H%M%S_",
                                                      time.localtime()))
        handle = os.fdopen(fd, "w")
        handle.write(marcxml)
        handle.close()
        return task_low_level_submission("bibupload", "", mode, xml_path)
示例#26
0
def upload_amendments(records, holdingpen):
    """Upload a modified record.

    Writes the given records into a single temporary MARCXML collection
    file and submits a bibupload task for it.  No-op when uploading is
    disabled via the "no_upload" task option or when there are no records.

    @param records: amended records to upload
    @param holdingpen: if True, send to the holding pen ("-o") for manual
        inspection; otherwise replace the records directly ("-r")
    """
    if task_get_option("no_upload", False) or len(records) == 0:
        return

    # Wrap every record in one MARCXML collection envelope.
    xml = '<collection xmlns="http://www.loc.gov/MARC21/slim">'
    for record in records:
        xml += record_xml_output(record)
    xml += "</collection>"

    tmp_file_fd, tmp_file = mkstemp(
        suffix='.xml',
        prefix="bibcheckfile_%s" % time.strftime("%Y-%m-%d_%H:%M:%S"),
        dir=CFG_TMPSHAREDDIR
    )
    os.write(tmp_file_fd, xml)
    os.close(tmp_file_fd)
    # 0o644 is the same value as the legacy 0644 literal, but the 0o
    # spelling is valid on both Python 2.6+ and Python 3 (bare 0644 is a
    # SyntaxError on Python 3).  World-readable so the bibupload task can
    # pick the file up.
    os.chmod(tmp_file, 0o644)
    if holdingpen:
        flag = "-o"
    else:
        flag = "-r"
    task = task_low_level_submission('bibupload', 'bibcheck', flag, tmp_file)
    write_message("Submitted bibupload task %s" % task)
示例#27
0
def submit_task(to_submit, mode, sequence_id):
    """ calls bibupload with all records to be modified

    @param to_submit: list of xml snippets to be submitted
    @type: list
    @param mode: mode to be used in bibupload
    @type: list
    @param sequence_id: sequence id to be included in the task_id
    @type: str

    @return: id of the submitted task
    @rtype: int
    """
    fd, xml_path = mkstemp(prefix=PREFIX,
                           dir=CFG_TMPSHAREDDIR)
    handle = os.fdopen(fd, 'w')
    # Assemble the full document first, then write it in one go; the
    # resulting bytes are identical to writing the pieces separately.
    pieces = ['<?xml version="1.0" encoding="UTF-8"?>', '<collection>']
    pieces.extend(to_submit)
    pieces.append('</collection>')
    handle.write(''.join(pieces))
    handle.close()

    return task_low_level_submission('bibupload', PREFIX, '-P', '3', '-I',
                                     sequence_id, '-%s' % mode,
                                     xml_path)
    def upload_marcxml(self, marcxml, mode):
        """Upload a MARCXML record to the server.

        Parameters:
          marcxml - *str* the XML to upload.
             mode - *str* the mode to use for the upload.
                    "-i" insert new records
                    "-r" replace existing records
                    "-c" correct fields of records
                    "-a" append fields to records
                    "-ir" insert record or replace if it exists
        """
        if mode not in ("-i", "-r", "-c", "-a", "-ir"):
            raise NameError("Incorrect mode " + str(mode))

        if not self.local:
            # Remote server: POST through the batchuploader robot
            # interface.  We don't use self.browser as batchuploader is
            # protected by IP.
            payload = urllib.urlencode({'file': marcxml,
                                        'mode': mode})
            opener = urllib2.build_opener()
            opener.addheaders = [('User-Agent', CFG_USER_AGENT)]
            return opener.open(self.server_url + "/batchuploader/robotupload",
                               payload,)

        # Running locally: dump the XML to a temporary file and submit a
        # bibupload task directly.
        fd, xml_path = tempfile.mkstemp(prefix="upload_%s" % \
                                        time.strftime("%Y%m%d_%H%M%S_",
                                                      time.localtime()))
        handle = os.fdopen(fd, "w")
        handle.write(marcxml)
        handle.close()
        return task_low_level_submission("bibupload", "", mode, xml_path)
示例#29
0
def upload_to_site(marcxml, yes_i_know, upload_mode="append"):
    """
    makes the appropriate calls to bibupload to get the MARCXML record onto
    the site.

    @param: marcxml (string): the absolute location of the MARCXML that was
        generated by this programme
    @param: yes_i_know (boolean): if true, no confirmation.  if false, prompt.
    @param: upload_mode (string): bibupload mode, passed as a long option
        ("append" -> "--append"); an empty value keeps the historic
        behaviour of passing an empty positional argument instead.

    @output: a new record on the invenio site

    @return: None
    """
    if not yes_i_know:
        wait_for_user(wrap_text_in_a_box("You are going to upload new " + "plots to the server."))
    # Conditional expression instead of the error-prone `and/or` idiom;
    # behaviour is unchanged for every value of upload_mode.
    mode_flag = "--" + upload_mode if upload_mode else ""
    task_low_level_submission("bibupload", "admin", mode_flag, marcxml)
示例#30
0
def Insert_Modify_Record(parameters, curdir, form, user_info=None):
    """
    Modify existing record using 'curdir/recmysql' and BibUpload correct
    mode. The file must therefore already have been created prior to this
    execution of this function, for eg. using "Make_Modify_Record".

    This function gets the output of BibConvert and uploads it into
    the MySQL bibliographical database.

    @raises InvenioWebSubmitFunctionError: when neither 'recmysqlfmt' nor
        'recmysql' exists in curdir
    @return: empty string (websubmit function convention)
    """
    global rn
    sequence_id = bibtask_allocate_sequenceid(curdir)
    # Prefer the formatted record file when both are present.
    if os.path.exists(os.path.join(curdir, "recmysqlfmt")):
        recfile = "recmysqlfmt"
    elif os.path.exists(os.path.join(curdir, "recmysql")):
        recfile = "recmysql"
    else:
        raise InvenioWebSubmitFunctionError("Could not find record file")
    initial_file = os.path.join(curdir, recfile)
    # Copy the record to a uniquely named temp file so a later submission
    # cannot clobber it before the bibupload task has run.
    tmp_fd, final_file = tempfile.mkstemp(dir=CFG_TMPDIR,
                                          prefix="%s_%s" % \
                                          (rn.replace('/', '_'),
                                           time.strftime("%Y-%m-%d_%H:%M:%S")))
    os.close(tmp_fd)
    shutil.copy(initial_file, final_file)
    bibupload_id = task_low_level_submission('bibupload', 'websubmit.Insert_Modify_Record', '-c', final_file, '-P', '3', '-I', str(sequence_id))
    # Close the handle explicitly instead of the original leaky
    # open(...).write(...) one-liner, which relied on refcounting.
    id_file = open(os.path.join(curdir, 'bibupload_id'), 'w')
    try:
        id_file.write(str(bibupload_id))
    finally:
        id_file.close()
    return ""
示例#31
0
def schedule_extraction(recid, taxonomy):
    """Queue a bibclassify task that extracts keywords for one record.

    @param recid: id of the record to process (passed via -i)
    @param taxonomy: name of the taxonomy to use (passed via -k)
    """
    args = ('bibclassify', 'extract:%s' % recid,
            '-k', taxonomy, '-i', '%s' % recid)
    bibtask.task_low_level_submission(*args)
示例#32
0
def iterate_over_old(list, fmt):
    """
    Iterate over list of IDs

    @param list: the list of record IDs to format (note: the parameter
        shadows the builtin `list`; kept as-is for interface compatibility
        with existing callers)
    @param fmt: the output format to use
    @return: tuple (total number of records, time taken to format, time taken to insert)
    """

    n_rec = 0
    n_max = 10000   # flush the accumulated XML every n_max records
    xml_content = ''  # hold the contents
    tbibformat = 0  # time taken up by external call
    tbibupload = 0  # time taken up by external call
    total_rec = 0  # Number of formatted records

    for record in list:

        n_rec = n_rec + 1
        total_rec = total_rec + 1

        message = "Processing record: %d" % (record)
        write_message(message, verbose=9)

        query = "id=%d&of=xm" % (record)

        count = 0

        contents = print_record(record, 'xm')

        # Retry up to 10 times, sleeping 10s between attempts.
        while (contents == "") and (count < 10):
            contents = print_record(record, 'xm')
            count = count + 1
            time.sleep(10)
        # BUGFIX: test the actual result rather than the retry counter.
        # The old check `if count == 10` aborted even when the 10th (last)
        # retry had actually succeeded in fetching the record.
        if contents == "":
            sys.stderr.write(
                "Failed to download %s from %s after 10 attempts... terminating"
                % (query, CFG_SITE_URL))
            sys.exit(0)

        xml_content = xml_content + contents

        if xml_content:

            if n_rec >= n_max:
                # A full chunk is accumulated: format and upload it.
                finalfilename = "%s/rec_fmt_%s.xml" % (
                    CFG_TMPDIR, time.strftime('%Y%m%d_%H%M%S'))
                filename = "%s/bibreformat.xml" % CFG_TMPDIR
                filehandle = open(filename, "w")
                filehandle.write(xml_content)
                filehandle.close()

                ### bibformat external call
                ###
                task_sleep_now_if_required(can_stop_too=True)
                t11 = os.times()[4]
                message = "START bibformat external call"
                write_message(message, verbose=9)
                command = "%s/bibformat otype='%s' < %s/bibreformat.xml > %s 2> %s/bibreformat.err" % (
                    CFG_BINDIR, fmt.upper(), CFG_TMPDIR, finalfilename,
                    CFG_TMPDIR)
                os.system(command)

                t22 = os.times()[4]
                message = "END bibformat external call (time elapsed:%2f)" % (
                    t22 - t11)
                write_message(message, verbose=9)
                task_sleep_now_if_required(can_stop_too=True)
                tbibformat = tbibformat + (t22 - t11)

                ### bibupload external call
                ###

                t11 = os.times()[4]
                message = "START bibupload external call"
                write_message(message, verbose=9)

                task_id = task_low_level_submission('bibupload', 'bibreformat',
                                                    '-f', finalfilename)
                write_message("Task #%s submitted" % task_id)

                t22 = os.times()[4]
                message = "END bibupload external call (time elapsed:%2f)" % (
                    t22 - t11)
                write_message(message, verbose=9)

                tbibupload = tbibupload + (t22 - t11)

                # Reset the chunk accumulator.
                n_rec = 0
                xml_content = ''

    ### Process the last re-formated chunk
    ###
    if n_rec > 0:

        write_message("Processing last record set (%d)" % n_rec, verbose=9)

        finalfilename = "%s/rec_fmt_%s.xml" % (CFG_TMPDIR,
                                               time.strftime('%Y%m%d_%H%M%S'))
        filename = "%s/bibreformat.xml" % CFG_TMPDIR
        filehandle = open(filename, "w")
        filehandle.write(xml_content)
        filehandle.close()

        ### bibformat external call
        ###

        t11 = os.times()[4]
        message = "START bibformat external call"
        write_message(message, verbose=9)

        command = "%s/bibformat otype='%s' < %s/bibreformat.xml > %s 2> %s/bibreformat.err" % (
            CFG_BINDIR, fmt.upper(), CFG_TMPDIR, finalfilename, CFG_TMPDIR)
        os.system(command)

        t22 = os.times()[4]
        message = "END bibformat external call (time elapsed:%2f)" % (t22 -
                                                                      t11)
        write_message(message, verbose=9)

        tbibformat = tbibformat + (t22 - t11)

        ### bibupload external call
        ###

        t11 = os.times()[4]
        message = "START bibupload external call"
        write_message(message, verbose=9)

        task_id = task_low_level_submission('bibupload', 'bibreformat', '-f',
                                            finalfilename)
        write_message("Task #%s submitted" % task_id)

        t22 = os.times()[4]
        message = "END bibupload external call (time elapsed:%2f)" % (t22 -
                                                                      t11)
        write_message(message, verbose=9)

        tbibupload = tbibupload + (t22 - t11)

    return (total_rec, tbibformat, tbibupload)
示例#33
0
    if not reverse_relationship:
        raise InvenioWebSubmitFunctionError(
            "Can not retrieve reverse relationship")

    marcxml = _prepare_marcxml(recid_a, rn_a, recid_b, rn_b,
                               reverse_relationship, direct_relationship)
    fd, name = tempfile.mkstemp(dir=CFG_TMPDIR, prefix="%s_%s" % \
                              (rn_a.replace('/', '_'),
                               time.strftime("%Y-%m-%d_%H:%M:%S")), suffix=".xml")
    try:
        os.write(fd, marcxml)
    finally:
        os.close(fd)

    bibupload_id = task_low_level_submission('bibupload',
                                             'websubmit.Link_Records', '-a',
                                             name, '-P', '3')
    open(join(curdir, 'bibupload_link_record_id'),
         'w').write(str(bibupload_id))
    return ""


def get_recid_and_reportnumber(recid=None,
                               reportnumber=None,
                               keep_original_reportnumber=True):
    """
    Given at least a recid or a reportnumber, this function will look into
    the system for the matching record and will return a normalized
    recid and the primary reportnumber.
    @raises ValueError: in case of no record matched.
    """
示例#34
0
def oairepositoryupdater_task():
    """Main business logic code of oai_archive.

    Recomputes OAI set membership for every record, assigns missing OAI
    identifiers, writes the resulting corrections to temporary MARCXML
    files in chunks of CFG_OAI_REPOSITORY_MARCXML_SIZE records and, unless
    the "no_upload" task option is set, schedules bibupload tasks for them.

    @return: True on completion (including the early "nothing to do" and
        "previous run still pending" exits)
    """
    no_upload = task_get_option("no_upload")
    report = task_get_option("report")

    # Report-only mode: print the repository status and stop.
    if report > 1:
        print_repository_status(verbose=report)
        return True

    # Skip this run entirely while a previous oairepository bibupload
    # task is still waiting in the scheduler queue.
    if run_sql(
            "SELECT id FROM schTASK WHERE proc='bibupload:oairepository' AND status='WAITING'"
    ):
        write_message(
            "Previous requests of oairepository still being elaborated. Let's skip this execution."
        )
        return True

    # Log the set definitions as they are at the start of the run.
    initial_snapshot = {}
    for set_spec in all_set_specs():
        initial_snapshot[set_spec] = get_set_definitions(set_spec)
    write_message("Initial set snapshot: %s" % pformat(initial_snapshot),
                  verbose=2)

    task_update_progress("Fetching records to process")

    # Records that already carry an OAI identifier.
    recids_with_oaiid = search_unit_in_bibxxx(p='*',
                                              f=CFG_OAI_ID_FIELD,
                                              type='e')
    write_message("%s recids have an OAI ID" % len(recids_with_oaiid),
                  verbose=2)

    # Records currently marked as exported via the OAI set field.
    all_current_recids = search_unit_in_bibxxx(p='*',
                                               f=CFG_OAI_SET_FIELD,
                                               type='e')
    no_more_exported_recids = intbitset(all_current_recids)
    write_message("%s recids are currently exported" %
                  (len(all_current_recids)),
                  verbose=2)

    # For every configured set, compare the records that *should* belong
    # to it against the records currently tagged with it, and collect the
    # difference as the set of records needing an update.
    all_affected_recids = intbitset()
    all_should_recids = intbitset()
    recids_for_set = {}
    for set_spec in all_set_specs():
        if not set_spec:
            set_spec = CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC
        should_recids = get_recids_for_set_spec(set_spec)
        recids_for_set[set_spec] = should_recids
        no_more_exported_recids -= should_recids
        all_should_recids |= should_recids
        current_recids = search_unit_in_bibxxx(p=set_spec,
                                               f=CFG_OAI_SET_FIELD,
                                               type='e')
        write_message(
            "%s recids should be in %s. Currently %s are in %s" %
            (len(should_recids), set_spec, len(current_recids), set_spec),
            verbose=2)
        to_add = should_recids - current_recids
        write_message("%s recids should be added to %s" %
                      (len(to_add), set_spec),
                      verbose=2)
        to_remove = current_recids - should_recids
        write_message("%s recids should be removed from %s" %
                      (len(to_remove), set_spec),
                      verbose=2)
        affected_recids = to_add | to_remove
        write_message("%s recids should be hence updated for %s" %
                      (len(affected_recids), set_spec),
                      verbose=2)
        all_affected_recids |= affected_recids

    missing_oaiid = all_should_recids - recids_with_oaiid
    write_message("%s recids are missing an oaiid" % len(missing_oaiid))
    write_message("%s recids should no longer be exported" %
                  len(no_more_exported_recids))

    ## Let's add records with missing OAI ID
    all_affected_recids |= missing_oaiid | no_more_exported_recids
    write_message("%s recids should updated" % (len(all_affected_recids)),
                  verbose=2)

    if not all_affected_recids:
        write_message("Nothing to do!")
        return True

    # Prepare to save results in a tmp file
    (fd, filename) = mkstemp(dir=CFG_TMPSHAREDDIR,
                                  prefix='oairepository_' + \
                                  time.strftime("%Y%m%d_%H%M%S_",
                                                time.localtime()))
    oai_out = os.fdopen(fd, "w")
    oai_out.write("<collection>")

    tot = 0
    # Iterate over the recids
    for i, recid in enumerate(all_affected_recids):
        task_sleep_now_if_required(can_stop_too=True)
        task_update_progress("Done %s out of %s records." % \
                             (i, len(all_affected_recids)))

        write_message("Elaborating recid %s" % recid, verbose=3)
        record = get_record(recid)
        if not record:
            write_message("Record %s seems empty. Let's skip it." % recid,
                          verbose=3)
            continue
        new_record = {}

        # Check if an OAI identifier is already in the record or
        # not.
        assign_oai_id_entry = False
        oai_id_entry = record_get_field_value(record,
                                              tag=CFG_OAI_ID_FIELD[:3],
                                              ind1=CFG_OAI_ID_FIELD[3],
                                              ind2=CFG_OAI_ID_FIELD[4],
                                              code=CFG_OAI_ID_FIELD[5])
        if not oai_id_entry:
            assign_oai_id_entry = True
            oai_id_entry = "oai:%s:%s" % (CFG_OAI_ID_PREFIX, recid)
            write_message("Setting new oai_id %s for record %s" %
                          (oai_id_entry, recid),
                          verbose=3)
        else:
            write_message("Already existing oai_id %s for record %s" %
                          (oai_id_entry, recid),
                          verbose=3)

        # Get the sets to which this record already belongs according
        # to the metadata
        current_oai_sets = set(
            record_get_field_values(record,
                                    tag=CFG_OAI_SET_FIELD[:3],
                                    ind1=CFG_OAI_SET_FIELD[3],
                                    ind2=CFG_OAI_SET_FIELD[4],
                                    code=CFG_OAI_SET_FIELD[5]))
        write_message("Record %s currently belongs to these oai_sets: %s" %
                      (recid, ", ".join(current_oai_sets)),
                      verbose=3)

        current_previous_oai_sets = set(
            record_get_field_values(record,
                                    tag=CFG_OAI_PREVIOUS_SET_FIELD[:3],
                                    ind1=CFG_OAI_PREVIOUS_SET_FIELD[3],
                                    ind2=CFG_OAI_PREVIOUS_SET_FIELD[4],
                                    code=CFG_OAI_PREVIOUS_SET_FIELD[5]))
        write_message(
            "Record %s currently doesn't belong anymore to these oai_sets: %s"
            % (recid, ", ".join(current_previous_oai_sets)),
            verbose=3)

        # Get the sets that should be in this record according to
        # settings
        updated_oai_sets = set(_set
                               for _set, _recids in recids_for_set.iteritems()
                               if recid in _recids)
        write_message("Record %s now belongs to these oai_sets: %s" %
                      (recid, ", ".join(updated_oai_sets)),
                      verbose=3)

        updated_previous_oai_sets = set(
            _set for _set in (current_previous_oai_sets - updated_oai_sets)
            | (current_oai_sets - updated_oai_sets))
        write_message(
            "Record %s now doesn't belong anymore to these oai_sets: %s" %
            (recid, ", ".join(updated_previous_oai_sets)),
            verbose=3)

        # Ok, we have the old sets and the new sets. If they are equal
        # and oai ID does not need to be added, then great, nothing to
        # change . Otherwise apply the new sets.
        if current_oai_sets == updated_oai_sets and not assign_oai_id_entry:
            write_message("Nothing has changed for record %s, let's move on!" %
                          recid,
                          verbose=3)
            continue  # Jump to next recid

        write_message("Something has changed for record %s, let's update it!" %
                      recid,
                      verbose=3)
        # Build a minimal correction record: the 001 controlfield plus one
        # field carrying the OAI id, current sets and previous sets.
        subfields = [(CFG_OAI_ID_FIELD[5], oai_id_entry)]
        for oai_set in updated_oai_sets:
            subfields.append((CFG_OAI_SET_FIELD[5], oai_set))
        for oai_set in updated_previous_oai_sets:
            subfields.append((CFG_OAI_PREVIOUS_SET_FIELD[5], oai_set))

        record_add_field(new_record, tag="001", controlfield_value=str(recid))
        record_add_field(new_record,
                         tag=CFG_OAI_ID_FIELD[:3],
                         ind1=CFG_OAI_ID_FIELD[3],
                         ind2=CFG_OAI_ID_FIELD[4],
                         subfields=subfields)
        oai_out.write(record_xml_output(new_record))
        tot += 1
        # Chunk full: close this file, schedule its upload, then start a
        # fresh temporary file for the next chunk.
        if tot == CFG_OAI_REPOSITORY_MARCXML_SIZE:
            oai_out.write("</collection>")
            oai_out.close()
            write_message("Wrote to file %s" % filename)
            if not no_upload:
                if task_get_option("notimechange"):
                    task_low_level_submission('bibupload', 'oairepository',
                                              '-c', filename, '-n',
                                              '-Noairepository', '-P', '-1')
                else:
                    task_low_level_submission('bibupload', 'oairepository',
                                              '-c', filename,
                                              '-Noairepository', '-P', '-1')
            # Prepare to save results in a tmp file
            (fd, filename) = mkstemp(dir=CFG_TMPSHAREDDIR,
                                        prefix='oairepository_' + \
                                        time.strftime("%Y%m%d_%H%M%S_",
                                                        time.localtime()))
            oai_out = os.fdopen(fd, "w")
            oai_out.write("<collection>")
            tot = 0
            task_sleep_now_if_required(can_stop_too=True)

    # Close and upload the last (possibly partial) chunk; remove the file
    # instead when it contains no records.
    oai_out.write("</collection>")
    oai_out.close()
    write_message("Wrote to file %s" % filename)

    if tot > 0:
        if not no_upload:
            task_sleep_now_if_required(can_stop_too=True)
            if task_get_option("notimechange"):
                task_low_level_submission('bibupload', 'oairepository', '-c',
                                          filename, '-n')
            else:
                task_low_level_submission('bibupload', 'oairepository', '-c',
                                          filename)
    else:
        os.remove(filename)

    return True
示例#35
0
                user = user_info['nickname']
            if not user:
                user = "******"
            # Execute bibupload with the appropiate mode

            task_arguments = ('bibupload', user, "--" + mode,
                              "--priority=" + priority, "-N", "batchupload")

            if exec_date:
                date = '--runtime=' + "\'" + exec_date + ' ' + exec_time + "\'"
                task_arguments += (date, )
            if email_logs_to:
                task_arguments += ("--email-logs-to", email_logs_to)
            task_arguments += (filename, )

            jobid = task_low_level_submission(*task_arguments)

            # write batch upload history
            run_sql(
                """INSERT INTO hstBATCHUPLOAD (user, submitdate,
                    filename, execdate, id_schTASK, batch_mode)
                    VALUES (%s, NOW(), %s, %s, %s, "document")""",
                (user_info['nickname'], docfile, exec_date != "" and
                 (exec_date + ' ' + exec_time)
                 or time.strftime("%Y-%m-%d %H:%M:%S"), str(jobid)))

            # Move file to DONE folder
            done_filename = docfile + "_" + time.strftime(
                "%Y%m%d%H%M%S", time.localtime()) + "_" + str(jobid)
            try:
                os.rename(os.path.join(folder, docfile),
示例#36
0
def _schedule_correct_upload(output_file):
    """Submit a 'bibupload --correct' task (with -n) for *output_file*."""
    task_low_level_submission('bibupload', 'bst_inspire_cds_synchro',
                              '-c', output_file, '-n')
    write_message("Scheduled bibupload --correct %s" % output_file)


def import_recid_list(input_stream=sys.stdin,
                      batch_limit=500,
                      automatic_upload=False):
    """Import identifiers from file, match and generate output files.

    @param input_stream: iterable of '|'-separated rows of the form
        other_id|doi|eprint|recid|system_number[|reportnumber...]
    @param batch_limit: number of matched results per output file
    @param automatic_upload: if True, schedule a bibupload --correct task
        for every output file written
    @return: list of paths of the output files written
    """
    all_recids = get_all_recids()
    output_files = []
    current_batch = []
    current_dupes = []
    i = 0
    for row in input_stream:
        # Strip a single trailing newline only (other whitespace is data).
        if row.endswith('\n'):
            row = row[:-1]
        row = row.split('|')
        if row:
            try:
                other_id, doi, eprint, recid, system_number = row[0], row[
                    1], row[2], row[3], row[4]
            except IndexError:
                # Row has fewer than 5 fields: report and skip it.
                write_message("WARNING: {0} is invalid".format(row),
                              stream=sys.stderr)
                continue
            # Any extra fields are report numbers.
            if len(row) > 5:
                reportnumbers = row[5:]
            else:
                reportnumbers = None
            # Normalise empty strings to None for the matcher.
            if not other_id:
                other_id = None
            if not recid:
                recid = None
            result = add_other_id(other_id, doi, eprint, recid, system_number,
                                  reportnumbers, all_recids)
            if result:
                if isinstance(result, list):
                    # Duplications found
                    current_dupes.append(result)
                    continue
                current_batch.append(result)
                i += 1
                if i % batch_limit == 0:
                    output_file = write_results(current_batch)
                    output_files.append(output_file)
                    if automatic_upload:
                        _schedule_correct_upload(output_file)
                    task_sleep_now_if_required()
                    current_batch = []
    # Flush the final partial batch, if any.
    if len(current_batch) > 0:
        output_file = write_results(current_batch)
        output_files.append(output_file)
        if automatic_upload:
            _schedule_correct_upload(output_file)
    write_message("Matched in total {0} records.".format(i))

    if len(current_dupes) > 0:
        # We have duplications: dump them to a separate report file.
        dupes_output_file = get_temporary_file("cds_duplicates_", ".txt")
        with open(dupes_output_file, "w") as fd:
            fd.write("\n".join([
                "{0}: {1}".format(dupe[0], dupe[1:]) for dupe in current_dupes
            ]))
        write_message(
            "Found {0} possible duplicates which are available here: {1}".
            format(len(current_dupes), dupes_output_file))
    return output_files
示例#37
0
def cli_upload(req,
               file_content=None,
               mode=None,
               callback_url=None,
               nonce=None,
               special_treatment=None,
               priority=0):
    """ Robot interface for uploading MARC files

    @param req: HTTP request object; the plain-text response and status
        code are written back to it
    @param file_content: the MARCXML payload — either a readable object
        (its body is read) or a single uploaded form file
    @param mode: bibupload mode switch; '--insertorreplace' is mapped to
        '-ir', and the result must be one of PERMITTED_MODES
    @param callback_url: optional URL passed to bibupload via --callback-url
    @param nonce: forwarded via --nonce (only when callback_url is given)
    @param special_treatment: forwarded via --special-treatment (only when
        callback_url is given)
    @param priority: bibupload task priority (-P)
    @return: result of writing the final status message to the request
    """
    req.content_type = "text/plain"

    # check IP and useragent:
    if not _get_client_authorized_collections(_get_client_ip(req)):
        msg = "[ERROR] Sorry, client IP %s cannot use the service." % _get_client_ip(
            req)
        _log(msg)
        req.status = HTTP_FORBIDDEN
        return _write(req, msg)
    if not _check_client_useragent(req):
        msg = "[ERROR] Sorry, the %s useragent cannot use the service." % _get_useragent(
            req)
        _log(msg)
        req.status = HTTP_FORBIDDEN
        return _write(req, msg)

    # Validate and normalise the upload mode.
    arg_mode = mode
    if not arg_mode:
        msg = "[ERROR] Please specify upload mode to use."
        _log(msg)
        req.status = HTTP_BAD_REQUEST
        return _write(req, msg)
    if arg_mode == '--insertorreplace':
        arg_mode = '-ir'
    if not arg_mode in PERMITTED_MODES:
        msg = "[ERROR] Invalid upload mode."
        _log(msg)
        req.status = HTTP_BAD_REQUEST
        return _write(req, msg)

    # Extract the MARCXML body from either a readable object or a single
    # uploaded form file.
    arg_file = file_content
    if hasattr(arg_file, 'read'):
        ## We've been passed a readable file, e.g. req
        arg_file = arg_file.read()
        if not arg_file:
            msg = "[ERROR] Please provide a body to your request."
            _log(msg)
            req.status = HTTP_BAD_REQUEST
            return _write(req, msg)
    else:
        if not arg_file:
            msg = "[ERROR] Please specify file body to input."
            _log(msg)
            req.status = HTTP_BAD_REQUEST
            return _write(req, msg)
        if hasattr(arg_file, "filename"):
            arg_file = arg_file.value
        else:
            msg = "[ERROR] 'file' parameter must be a (single) file"
            _log(msg)
            req.status = HTTP_BAD_REQUEST
            return _write(req, msg)

    # write temporary file:
    (fd, filename) = tempfile.mkstemp(prefix="batchupload_" + \
               time.strftime("%Y%m%d%H%M%S", time.localtime()) + "_",
               dir=CFG_TMPSHAREDDIR)

    filedesc = os.fdopen(fd, 'w')
    filedesc.write(arg_file)
    filedesc.close()

    # check if this client can run this file:
    client_ip = _get_client_ip(req)
    permitted_dbcollids = _get_client_authorized_collections(client_ip)
    if '*' not in permitted_dbcollids:  # wildcard
        allow = _check_client_can_submit_file(client_ip, filename, req, 0)
        if not allow:
            msg = "[ERROR] Cannot submit such a file from this IP. (Wrong collection.)"
            _log(msg)
            req.status = HTTP_FORBIDDEN
            return _write(req, msg)

    # check validity of marcxml
    # NOTE(review): the first value returned by run_shell_command is
    # compared against 0, so it appears to be the exit status despite the
    # name 'xmlmarclint_output' — confirm against run_shell_command's
    # definition.
    xmlmarclint_path = CFG_BINDIR + '/xmlmarclint'
    xmlmarclint_output, dummy1, dummy2 = run_shell_command(
        '%s %s' % (xmlmarclint_path, filename))
    if xmlmarclint_output != 0:
        msg = "[ERROR] MARCXML is not valid."
        _log(msg)
        req.status = HTTP_BAD_REQUEST
        return _write(req, msg)
    args = [
        'bibupload', "batchupload", arg_mode, filename, '-P',
        str(priority)
    ]
    # run upload command
    # nonce and special treatment are only forwarded together with a
    # callback URL.
    if callback_url:
        args += ["--callback-url", callback_url]
        if nonce:
            args += ["--nonce", nonce]
        if special_treatment:
            args += ["--special-treatment", special_treatment]
    task_low_level_submission(*args)
    msg = "[INFO] %s" % ' '.join(args)
    _log(msg)
    return _write(req, msg)
示例#38
0
                               direct_relationship,
                               marc_for_a=direct_relationship_MARC,
                               marc_for_b=reverse_relationship_MARC,
                               upload_mode=bibupload_mode,
                               consider_empty_p=consider_empty_p)
    fd, name = tempfile.mkstemp(dir=CFG_TMPDIR, prefix="%s_%s" % \
                              (rn_a.replace('/', '_'),
                               time.strftime("%Y-%m-%d_%H:%M:%S")), suffix=".xml")
    try:
        os.write(fd, marcxml)
    finally:
        os.close(fd)

    sequence_id = bibtask_allocate_sequenceid(curdir)
    bibupload_id = task_low_level_submission('bibupload',
                                             'websubmit.Link_Records',
                                             '--' + bibupload_mode, name, '-P',
                                             '3', '-I', str(sequence_id))
    open(join(curdir, 'bibupload_link_record_id'),
         'w').write(str(bibupload_id))
    return ""


def get_recid_and_reportnumber(recid=None,
                               reportnumber=None,
                               keep_original_reportnumber=True):
    """
    Given at least a recid or a reportnumber, this function will look into
    the system for the matching record and will return a normalized
    recid and the primary reportnumber.
    @raises ValueError: in case of no record matched.
    """
示例#39
0
def launch_task(args):
    """Schedule the job as a new 'bibencode' bibtask.

    Every element of ``args`` is forwarded verbatim to the low-level
    BibSched submission interface; the resulting task id is returned.
    """
    submission_args = ('bibencode', 'bibencode:daemon') + tuple(args)
    return task_low_level_submission(*submission_args)
示例#40
0
def save_xml_record(recid,
                    uid,
                    xml_record='',
                    to_upload=True,
                    to_merge=False,
                    task_name="bibedit",
                    sequence_id=None):
    """Write XML record to file. Default behaviour is to read the record from
    a BibEdit cache file, filter out the unchanged volatile subfields,
    write it back to an XML file and then pass this file to BibUpload.

    @param recid: id of the record to save
    @param uid: id of the user owning the BibEdit cache entry
    @param xml_record: give XML as string in stead of reading cache file
    @param to_upload: pass the XML file to BibUpload
    @param to_merge: prepare an XML file for BibMerge to use
    @param task_name: name under which the BibUpload task is scheduled
    @param sequence_id: optional BibSched sequence id ("-I") for ordering
    @return: True when the record was written (and possibly queued for
        upload), False when there was nothing to save (no XML given and
        no cache entry found)
    """
    if not xml_record:
        # Read record from cache file.
        cache = get_cache_contents(recid, uid)
        if not cache:
            # BUG FIX: previously a missing cache fell through with
            # 'record' unbound, raising UnboundLocalError below.
            return False
        record = cache[2]
        used_changes = cache[4]
        xml_record = record_xml_output(record)
        delete_cache(recid, uid)
        delete_disabled_changes(used_changes)
    else:
        record = create_record(xml_record)[0]

    # clean the record from unfilled volatile fields
    record_strip_empty_volatile_subfields(record)
    record_strip_empty_fields(record)

    # order subfields alphabetically before saving the record
    record_order_subfields(record)

    xml_to_write = wash_for_xml(record_xml_output(record))

    # Write XML file.  'with' guarantees the handle is closed even if
    # the write fails.
    if not to_merge:
        fd, file_path = tempfile.mkstemp(dir=CFG_BIBEDIT_CACHEDIR,
                                         prefix="%s_" % CFG_BIBEDIT_FILENAME,
                                         suffix="_%s_%s.xml" % (recid, uid))
        with os.fdopen(fd, 'w') as f:
            f.write(xml_to_write)
    else:
        file_path = '%s_%s.xml' % (_get_file_path(
            recid, uid), CFG_BIBEDIT_TO_MERGE_SUFFIX)
        with open(file_path, 'w') as xml_file:
            xml_file.write(xml_to_write)

    user_name = get_user_info(uid)[1]
    if to_upload:
        args = [
            'bibupload', user_name, '-P', '5', '-r', file_path, '-u', user_name
        ]
        if task_name == "bibedit":
            args.extend(['--name', 'bibedit'])
        if sequence_id:
            args.extend(["-I", sequence_id])
        args.extend(['--email-logs-on-error'])
        task_low_level_submission(*args)
    return True
示例#41
0
def submit_refextract_task(to_update, user, priority=3):
    """Queue a 'refextract' task for the given record ids.

    @param to_update: iterable of record ids to re-extract references for
    @param user: user name under which the task is scheduled
    @param priority: BibSched priority of the task
    @return: the id of the scheduled task
    """
    id_list = ','.join(str(recid) for recid in to_update)
    return task_low_level_submission('refextract', user,
                                     '-P', str(priority),
                                     '--overwrite',
                                     '--id', id_list)
示例#42
0
def Notify_URL(parameters, curdir, form, user_info=None):
    """
    Access a given URL, and possibly post some content, via a
    scheduled bibtasklet.

    Could be used to notify that a record has been fully integrated.
    (the URL is only accessed once the BibTask created by this
    function runs in BibSched, not when the function is run. The
    BibTask uses a task sequence ID to respect ordering of tasks)

    If the URL is empty, the notification is skipped.

    @param parameters: (dictionary) - contains the following parameter
         strings used by this function:

         + url: (string) - the URL to be contacted by this function
                           (must start with http/https).
                           A value starting with "FILE:" means: read
                           the actual value from the named file in
                           curdir when the function is run.

         + data: (string) - (optional) the data to be posted at the
                            given URL; if empty, the URL is accessed
                            via GET. Supports the "FILE:" prefix.

         + content_type: (string) - (optional) the content-type used
                                    to post data. Default is
                                    'text/plain'. Ignored when no data
                                    is posted.

         + attempt_times: (int) - (optional) how many times to try
                                  contacting the URL before giving up.

         + attempt_sleeptime: (int) - (optional) seconds to sleep
                                      between attempts.

         + admin_emails: (string) - (optional) comma-separated emails
                                    to contact if the URL cannot be
                                    accessed after all attempts.
                                    Supports the "FILE:" prefix.

         + user: (string) - the user used to launch the task (visible
                            in BibSched). Supports the "FILE:" prefix.
    """
    def _resolve(value):
        # "FILE:name" values are read from file 'name' inside curdir.
        if value.startswith('FILE:'):
            return ParamFromFile(os.path.join(curdir, value[5:]))
        return value

    sequence_id = bibtask_allocate_sequenceid(curdir)

    url = parameters["url"]
    data = parameters["data"]
    admin_emails = parameters["admin_emails"]
    content_type = parameters["content_type"]
    attempt_times = parameters["attempt_times"]
    attempt_sleeptime = parameters["attempt_sleeptime"]
    user = parameters["user"]

    # Maybe some params must be read from disk
    url = _resolve(url)
    if not url:
        # Empty URL: nothing to notify.
        return ""
    data = _resolve(data)
    admin_emails = _resolve(admin_emails)
    user = _resolve(user)

    extra_arguments = []
    if data:
        extra_arguments.extend(("-a", "data=%s" % data))
        extra_arguments.extend(
            ("-a", "content_type=%s" % content_type))

    return task_low_level_submission(
        "bibtasklet", user, "-T", "bst_notify_url", "-I", str(sequence_id),
        "-a", "url=%s" % url, "-a", "attempt_times=%s" % attempt_times, "-a",
        "attempt_sleeptime=%s" % attempt_sleeptime, "-a",
        "admin_emails=%s" % admin_emails, *extra_arguments)
示例#43
0
            if result:
                print >> output_file, result
                i += 1
                if i % 1000 == 0:
                    output_file.close()
                    task_low_level_submission('bibupload',
                                              'bst_inspire_cds_synchro', '-a',
                                              output_file.name, '-n')
                    write_message("Scheduled bibupload --append %s" %
                                  output_file.name)
                    task_sleep_now_if_required()
                    output_file = get_out_file()
                    i = 0
    if i > 0:
        output_file.close()
        task_low_level_submission('bibupload', 'bst_inspire_cds_synchro', '-a',
                                  output_file.name, '-n')
        write_message("Scheduled bibupload --append %s" % output_file.name)


def bst_inspire_cds_synchro():
    task_update_progress("Phase 1: extracting IDs for %s" % CFG_OTHER_SITE)
    export_file = open(CFG_EXPORT_FILE + '.part', "w")
    for i, row in enumerate(iter_export_rows()):
        print >> export_file, row
        if i % 100 == 0:
            task_sleep_now_if_required(can_stop_too=True)
    export_file.close()
    shutil.move(CFG_EXPORT_FILE + '.part', CFG_EXPORT_FILE)
    task_sleep_now_if_required(can_stop_too=True)
    if os.path.exists(CFG_IMPORT_FILE):
        task_update_progress("Phase 2: importing IDs from %s" % CFG_OTHER_SITE)
示例#44
0
            }
            filedesc.write(marc_content)
            filedesc.close()
            info[1].append(docfile)
            user = ""
            if req is not None:
                user_info = collect_user_info(req)
                user = user_info['nickname']
            if not user:
                user = "******"
            # Execute bibupload with the appropiate mode
            if exec_date:
                date = '--runtime=' + "\'" + exec_date + ' ' + exec_time + "\'"
                jobid = task_low_level_submission('bibupload', user,
                                                  "--" + mode,
                                                  "--name=" + docfile,
                                                  "--priority=" + priority,
                                                  date, filename)
            else:
                jobid = task_low_level_submission('bibupload', user,
                                                  "--" + mode,
                                                  "--name=" + docfile,
                                                  "--priority=" + priority,
                                                  filename)

            # write batch upload history
            run_sql(
                """INSERT INTO hstBATCHUPLOAD (user, submitdate,
                    filename, execdate, id_schTASK, batch_mode)
                    VALUES (%s, NOW(), %s, %s, %s, "document")""",
                (user_info['nickname'], docfile, exec_date != "" and
示例#45
0
def submit_bibrank_task(to_update, methods, user, priority=3):
    """Queue a 'bibrank' task over the given record ids.

    @param to_update: iterable of record ids to re-rank
    @param methods: ranking method names passed as the '-w' argument
    @param user: user name under which the task is scheduled
    @param priority: BibSched priority of the task
    @return: the id of the scheduled task
    """
    id_list = ','.join(str(recid) for recid in to_update)
    return task_low_level_submission('bibrank', user, '-w', methods, '-P',
                                     str(priority), '-i', id_list)
示例#46
0
def metadata_upload(req,
                    metafile=None,
                    filetype=None,
                    mode=None,
                    exec_date=None,
                    exec_time=None,
                    metafilename=None,
                    ln=CFG_SITE_LANG,
                    priority="1"):
    """
    Metadata web upload service. Get upload parameters and exec bibupload for the given file.
    Finally, write upload history.

    @param req: mod_python request object (HTTP output + submitting user)
    @param metafile: uploaded file field holding the metadata body
    @param filetype: 'marcxml', or another format converted to MARCXML
    @param mode: bibupload mode flag passed through to the task
    @param exec_date: optional scheduled execution date
    @param exec_time: optional scheduled execution time
    @param metafilename: original name of the uploaded file
    @param ln: interface language
    @param priority: BibSched priority for the bibupload task
    @return: tuple (error code, message)
        error code: code that indicates if an error ocurred
        message: message describing the error
    """
    # start output:
    req.content_type = "text/html"
    req.send_http_header()

    error_codes = {'not_authorized': 1, 'invalid_marc': 2}
    # write temporary file:
    if filetype == 'marcxml':
        metafile = metafile.value
    else:
        metafile = _transform_input_to_marcxml(file_input=metafile.value)

    user_info = collect_user_info(req)
    tempfile.tempdir = CFG_TMPSHAREDDIR
    # SECURITY FIX: mkstemp (not the racy, deprecated mktemp) creates
    # the temporary file atomically with owner-only permissions before
    # we write the (possibly sensitive) metadata into it.
    fd, filename = tempfile.mkstemp(prefix="batchupload_" + \
        user_info['nickname'] + "_" + time.strftime("%Y%m%d%H%M%S",
        time.localtime()) + "_")
    filedesc = os.fdopen(fd, 'w')
    try:
        filedesc.write(metafile)
    finally:
        filedesc.close()

    # check if this client can run this file:
    if req is not None:
        allow = _check_client_can_submit_file(req=req,
                                              metafile=metafile,
                                              webupload=1,
                                              ln=ln)
        if allow[0] != 0:
            return (error_codes['not_authorized'], allow[1])

    # check MARCXML validity
    if filetype == 'marcxml':
        # check validity of marcxml
        xmlmarclint_path = CFG_BINDIR + '/xmlmarclint'
        xmlmarclint_output, dummy1, dummy2 = run_shell_command(
            '%s %s' % (xmlmarclint_path, filename))
        if xmlmarclint_output != 0:
            msg = "[ERROR] MARCXML is not valid."
            return (error_codes['invalid_marc'], msg)
    # run upload command:
    if exec_date:
        date = exec_date
        if exec_time:
            date += ' ' + exec_time
        jobid = task_low_level_submission('bibupload', user_info['nickname'],
                                          mode, "--name=" + metafilename,
                                          "--priority=" + priority, "-t", date,
                                          filename)
    else:
        jobid = task_low_level_submission('bibupload', user_info['nickname'],
                                          mode, "--name=" + metafilename,
                                          "--priority=" + priority, filename)

    # write batch upload history
    run_sql(
        """INSERT INTO hstBATCHUPLOAD (user, submitdate,
            filename, execdate, id_schTASK, batch_mode)
            VALUES (%s, NOW(), %s, %s, %s, "metadata")""", (
            user_info['nickname'],
            metafilename,
            exec_date != "" and (exec_date + ' ' + exec_time)
            or time.strftime("%Y-%m-%d %H:%M:%S"),
            str(jobid),
        ))
    return (0, "Task %s queued" % str(jobid))
def bst_scoap3_importer():
    """Import SCOAP3 fulltext links into matching INSPIRE records.

    Fetches the CSV feed from the SCOAP3 repository and, for each row
    matching exactly one INSPIRE record (by arXiv id and/or DOI),
    schedules two bibuploads: one appending brand-new SCOAP3 files
    ("-a") and one correcting records with a new file revision ("-c").
    Empty upload files are removed instead of being submitted.
    """
    task_sleep_now_if_required(can_stop_too=True)
    f = urllib.urlopen('http://repo.scoap3.org/ffts_for_inspire.py/csv')

    fd_update, name_update = mkstemp(suffix='.xml',
                                     prefix='bibupload_scoap3_',
                                     dir=CFG_TMPSHAREDDIR)
    out_update = fdopen(fd_update, 'w')
    fd_new, name_new = mkstemp(suffix='.xml',
                               prefix='bibupload_scoap3_',
                               dir=CFG_TMPSHAREDDIR)
    out_new = fdopen(fd_new, 'w')
    print >> out_update, "<collection>"
    print >> out_new, "<collection>"

    line_count_new = 0  # to avoid empty bibupload
    line_count_update = 0  # to avoid empty bibupload
    f.readline()  ## Let's strip the header line

    for d in f:
        task_sleep_now_if_required(can_stop_too=True)
        # 'file_format' instead of 'type' so the built-in is not shadowed.
        recid, arxiv_id, cr_date, checksum, link, file_format, doi = [
            x.strip() for x in d.split(',')
        ]
        write_message(d.strip())
        if checksum == "None":
            write_message("... no PDF. Skipping")
            continue
        if arxiv_id == "None":
            inspire_record = perform_request_search(p="doi:%s" % (doi, ),
                                                    cc="HEP")
        else:
            inspire_record = perform_request_search(p="037:%s or doi:%s" %
                                                    (arxiv_id, doi),
                                                    cc="HEP")
        if len(inspire_record) > 1:
            write_message(
                "ERROR: more than one INSPIRE record matched %s and %s for SCOAP3 record %s: %s"
                % (arxiv_id, doi, recid, list(inspire_record)),
                stream=sys.stderr)
            continue
        elif not inspire_record:
            write_message(
                "WARNING: no INSPIRE record matched %s or %s for SCOAP3 record %s"
                % (arxiv_id, doi, recid),
                stream=sys.stderr)
            continue
        action = None  # do nothing
        rec = {}
        inspire_record = inspire_record[0]
        record = BibRecDocs(inspire_record)
        for doc in record.list_latest_files():
            if doc.format in ('.pdf', '.pdf;pdfa'):
                if doc.bibdoc.doctype == 'SCOAP3':
                    if doc.checksum == checksum:
                        write_message(
                            "... OK: file alredy attached to INSPIRE record %s (doc.checksum=%s, checksum=%s)"
                            % (inspire_record, doc.checksum, checksum))
                    else:
                        write_message(
                            "... OK: new revision available for INSPIRE record %s (doc.checksum=%s, checksum=%s)"
                            % (inspire_record, doc.checksum, checksum))
                        action = "UPDATE"
                    break
        else:
            # for/else: no SCOAP3 PDF found at all -> append a new file.
            write_message("... OK: need to add new file to INSPIRE record %s" %
                          inspire_record)
            action = "APPEND"
        if action:
            if file_format == '.pdf;pdfa':
                record_add_field(rec,
                                 'FFT',
                                 subfields=[('a', link),
                                            ('n', 'scoap3-fulltext'),
                                            ('f', '.pdf;pdfa'),
                                            ('t', 'SCOAP3'),
                                            ('d', 'Article from SCOAP3')])
            else:
                record_add_field(rec,
                                 'FFT',
                                 subfields=[('a', link),
                                            ('n', 'scoap3-fulltext'),
                                            ('t', 'SCOAP3'),
                                            ('d', 'Article from SCOAP3')])

            record_add_field(rec,
                             '001',
                             controlfield_value=str(inspire_record))
        if action == "UPDATE":
            line_count_update += 1
            print >> out_update, record_xml_output(rec)
        elif action == "APPEND":
            line_count_new += 1
            print >> out_new, record_xml_output(rec)
    print >> out_update, "</collection>"
    print >> out_new, "</collection>"
    out_new.close()
    out_update.close()

    if line_count_new:
        task_id = task_low_level_submission("bibupload", "admin", "-N",
                                            "SCOAP3-import", "-a", name_new)
        write_message("Scheduled bibupload --append %s with ID #%s" %
                      (name_new, task_id))
    else:
        remove(name_new)
    if line_count_update:
        task_id = task_low_level_submission("bibupload", "admin", "-N",
                                            "SCOAP3-import", "-c", name_update)
        # BUG FIX: this log line previously reported name_new here.
        write_message("Scheduled bibupload --correct %s with ID #%s" %
                      (name_update, task_id))
    else:
        remove(name_update)
示例#48
0
def scheduled_send_email(fromaddr,
                         toaddr,
                         subject="",
                         content="",
                         header=None,
                         footer=None,
                         copy_to_admin=0,
                         attempt_times=1,
                         attempt_sleeptime=10,
                         user=None,
                         other_bibtasklet_arguments=None,
                         replytoaddr="",
                         bccaddr="",
                        ):
    """
    Like send_email, but send an email via the bibsched
    infrastructure.
    @param fromaddr: sender
    @type fromaddr: string
    @param toaddr: list of receivers
    @type toaddr: string (comma separated) or list of strings
    @param subject: the subject
    @param content: the body of the message
    @param header: optional header, otherwise default is used
    @param footer: optional footer, otherwise default is used
    @param copy_to_admin: set to 1 in order to send email the admins
    @param attempt_times: try at least n times before giving up sending
    @param attempt_sleeptime: number of seconds to sleep between two attempts
    @param user: the user name to user when scheduling the bibtasklet. If
        None, the sender will be used
    @param other_bibtasklet_arguments: other arguments to append to the list
        of arguments to the call of task_low_level_submission
    @param replytoaddr: [string or list-of-strings] to be used for the
                        reply-to header of the email (if string, then
                        receivers are separated by ',')
    @param bccaddr: [string or list-of-strings] to be used for BCC header of the email
                    (if string, then receivers are separated by ',')
    @return: the scheduled bibtasklet
    """
    from invenio.bibtask import task_low_level_submission
    if not isinstance(toaddr, (unicode, str)):
        toaddr = ','.join(toaddr)
    if not isinstance(replytoaddr, (unicode, str)):
        replytoaddr = ','.join(replytoaddr)

    toaddr = remove_temporary_emails(toaddr)

    if user is None:
        user = fromaddr
    if other_bibtasklet_arguments is None:
        other_bibtasklet_arguments = []
    else:
        # Copy so we never mutate the caller's list in place.
        other_bibtasklet_arguments = list(other_bibtasklet_arguments)
    # PEP 8 idiom fix: 'is not None' instead of 'not ... is None'.
    if header is not None:
        other_bibtasklet_arguments.extend(("-a", "header=%s" % header))
    if footer is not None:
        other_bibtasklet_arguments.extend(("-a", "footer=%s" % footer))
    return task_low_level_submission(
        "bibtasklet", user, "-T", "bst_send_email",
        "-a", "fromaddr=%s" % fromaddr,
        "-a", "toaddr=%s" % toaddr,
        "-a", "replytoaddr=%s" % replytoaddr,
        "-a", "subject=%s" % subject,
        "-a", "content=%s" % content,
        "-a", "copy_to_admin=%s" % copy_to_admin,
        "-a", "attempt_times=%s" % attempt_times,
        "-a", "attempt_sleeptime=%s" % attempt_sleeptime,
        "-a", "bccaddr=%s" % bccaddr,
        *other_bibtasklet_arguments)
示例#49
0
def cli_upload(req, file_content=None, mode=None, callback_url=None):
    """Robot interface for uploading MARC files.

    @param req: mod_python request object
    @param file_content: uploaded file field holding the MARCXML body
    @param mode: bibupload mode flag; must be one of PERMITTED_MODES
    @param callback_url: optional URL bibupload will notify when done
    @return: plain-text status message written back to the client
    """
    req.content_type = "text/plain"
    req.send_http_header()

    # check IP and useragent:
    if not _check_client_ip(req):
        msg = "[ERROR] Sorry, client IP %s cannot use the service." % _get_client_ip(
            req)
        _log(msg)
        return _write(req, msg)
    if not _check_client_useragent(req):
        msg = "[ERROR] Sorry, this useragent cannot use the service."
        _log(msg)
        return _write(req, msg)

    arg_file = file_content
    arg_mode = mode
    if not arg_file:
        msg = "[ERROR] Please specify file body to input."
        _log(msg)
        return _write(req, msg)
    if not arg_mode:
        msg = "[ERROR] Please specify upload mode to use."
        _log(msg)
        return _write(req, msg)
    # PEP 8 idiom fix: 'not in' instead of 'not ... in'.
    if arg_mode not in PERMITTED_MODES:
        msg = "[ERROR] Invalid upload mode."
        _log(msg)
        return _write(req, msg)
    if hasattr(arg_file, "filename"):
        arg_file = arg_file.value
    else:
        msg = "[ERROR] 'file' parameter must be a (single) file"
        _log(msg)
        return _write(req, msg)

    # write temporary file:
    tempfile.tempdir = CFG_TMPSHAREDDIR

    # SECURITY FIX: mkstemp (not the racy, deprecated mktemp) creates
    # the file atomically with owner-only permissions before writing.
    fd, filename = tempfile.mkstemp(prefix="batchupload_" + \
               time.strftime("%Y%m%d%H%M%S", time.localtime()) + "_")

    filedesc = os.fdopen(fd, 'w')
    try:
        filedesc.write(arg_file)
    finally:
        filedesc.close()

    # check if this client can run this file:
    client_ip = _get_client_ip(req)
    permitted_dbcollids = CFG_BATCHUPLOADER_WEB_ROBOT_RIGHTS[client_ip]
    if permitted_dbcollids != ['*']:  # wildcard
        allow = _check_client_can_submit_file(client_ip, filename, req, 0)
        if not allow:
            msg = "[ERROR] Cannot submit such a file from this IP. (Wrong collection.)"
            _log(msg)
            return _write(req, msg)

    # check validity of marcxml
    xmlmarclint_path = CFG_BINDIR + '/xmlmarclint'
    xmlmarclint_output, dummy1, dummy2 = run_shell_command(
        '%s %s' % (xmlmarclint_path, filename))
    if xmlmarclint_output != 0:
        msg = "[ERROR] MARCXML is not valid."
        _log(msg)
        return _write(req, msg)
    # run upload command
    if callback_url:
        task_low_level_submission('bibupload', "batchupload", arg_mode,
                                  filename, "--callback-url", callback_url)
        msg = "[INFO] %s %s %s %s %s" % ('bibupload', arg_mode, filename,
                                         "--callback-url", callback_url)
    else:
        task_low_level_submission('bibupload', "batchupload", arg_mode,
                                  filename)
        msg = "[INFO] %s %s %s" % ('bibupload', arg_mode, filename)
    _log(msg)
    return _write(req, msg)
示例#50
0
def move_drafts_articles_to_ready(journal_name, issue):
    """
    Move draft articles to their final "collection".

    To do so we rely on the convention that an admin-chosen keyword
    must be removed from the metadata.  For each article of the issue
    whose formatted XML still contains that keyword, the keyword is
    stripped and a bibupload correction plus a bibindex reindex are
    scheduled; finally every affected collection is refreshed with
    webcoll.  All tasks share one sequence id so BibSched runs them
    in order.

    @param journal_name: name of the journal whose issue is released
    @param issue: the issue whose draft articles should be published
    """
    # Datafields that update_draft_record_metadata must not touch.
    protected_datafields = ['100', '245', '246', '520', '590', '700']
    keyword_to_remove = get_journal_draft_keyword_to_remove(journal_name)
    collections_to_refresh = {}
    indexes_to_refresh = get_journal_index_to_refresh_on_release(journal_name)
    bibindex_indexes_params = []
    if indexes_to_refresh:
        bibindex_indexes_params = ['-w', ','.join(indexes_to_refresh)]

    categories = get_journal_categories(journal_name, issue)
    # One sequence id shared by all submitted tasks: bibupload runs
    # before bibindex, which runs before the webcoll refreshes.
    task_sequence_id = str(bibtask_allocate_sequenceid())
    for category in categories:
        articles = get_journal_articles(journal_name, issue, category)
        for order, recids in articles.iteritems():
            for recid in recids:
                record_xml = format_record(recid, of='xm')
                if not record_xml:
                    continue
                new_record_xml_path = os.path.join(CFG_TMPSHAREDDIR,
                                                   'webjournal_publish_' + \
                                                   str(recid) + '.xml')
                if os.path.exists(new_record_xml_path):
                    # Do not modify twice: the on-disk file doubles as
                    # a marker that this record was already processed.
                    continue
                record_struc = create_record(record_xml)
                record = record_struc[0]
                new_record = update_draft_record_metadata(
                    record, protected_datafields, keyword_to_remove)
                new_record_xml = print_rec(new_record)
                if new_record_xml.find(keyword_to_remove) >= 0:
                    new_record_xml = new_record_xml.replace(
                        keyword_to_remove, '')
                    # Write to file
                    new_record_xml_file = file(new_record_xml_path, 'w')
                    new_record_xml_file.write(new_record_xml)
                    new_record_xml_file.close()
                    # Submit the corrected record, then reindex it.
                    task_low_level_submission('bibupload', 'WebJournal', '-c',
                                              new_record_xml_path, '-I',
                                              task_sequence_id)
                    task_low_level_submission('bibindex', 'WebJournal', '-i',
                                              str(recid), '-I',
                                              task_sequence_id,
                                              *bibindex_indexes_params)
                    for collection in get_all_collections_of_a_record(recid):
                        collections_to_refresh[collection] = ''

    # Refresh collections (dict keys used as a de-duplicated set).
    collections_to_refresh.update([
        (c, '')
        for c in get_journal_collection_to_refresh_on_release(journal_name)
    ])
    for collection in collections_to_refresh.keys():
        task_low_level_submission('webcoll', 'WebJournal', '-f', '-P', '2',
                                  '-p', '1', '-c', collection, '-I',
                                  task_sequence_id)
示例#51
0
def bst_scoap3_importer():
    """Import from SCOAP3.

    Fetches the CSV feed from the SCOAP3 repository and, for every row
    that matches exactly one INSPIRE record (by arXiv id and/or DOI),
    schedules bibupload corrections attaching or updating the SCOAP3
    fulltext file.  Rows without a PDF, with no INSPIRE match, or with
    an ambiguous match are skipped with a log message.  Empty upload
    files are removed instead of being submitted.
    """
    try:
        request = requests.get(
            'http://repo.scoap3.org/ffts_for_inspire.py/csv', timeout=60)
    except (HTTPError, ConnectionError, Timeout):
        # Best-effort import: log the network failure and give up quietly.
        register_exception()
        return
    task_sleep_now_if_required(can_stop_too=True)

    fd_update, name_update = mkstemp(suffix='.xml',
                                     prefix='bibupload_scoap3_',
                                     dir=CFG_TMPSHAREDDIR)

    out_update = fdopen(fd_update, 'w')
    fd_new, name_new = mkstemp(suffix='.xml',
                               prefix='bibupload_scoap3_',
                               dir=CFG_TMPSHAREDDIR)
    out_new = fdopen(fd_new, 'w')

    print >> out_update, "<collection>"
    print >> out_new, "<collection>"

    line_count_new = 0  # to avoid empty bibupload
    line_count_update = 0  # to avoid empty bibupload

    # We strip the first line.
    for line in request.text.split("\n")[1:]:
        if not line.strip():
            continue
        task_sleep_now_if_required(can_stop_too=True)
        recid, arxiv_id, cr_date, checksum, link, file_format, doi = [
            x.strip() for x in line.split(',')
        ]
        write_message(line.strip())
        if checksum == "None":
            write_message("... no PDF. Skipping")
            continue
        if arxiv_id == "None":
            inspire_record = perform_request_search(p="doi:%s" % (doi, ),
                                                    cc="HEP")
        else:
            inspire_record = perform_request_search(p="037:%s or doi:%s" %
                                                    (arxiv_id, doi),
                                                    cc="HEP")
        if len(inspire_record) > 1:
            write_message(
                "ERROR: more than one INSPIRE record matched %s and %s for SCOAP3 record %s: %s"
                % (arxiv_id, doi, recid, list(inspire_record)),
                stream=sys.stderr)
            continue
        elif not inspire_record:
            write_message(
                "WARNING: no INSPIRE record matched %s or %s for SCOAP3 record %s"
                % (arxiv_id, doi, recid),
                stream=sys.stderr)
            continue
        action = None  # do nothing
        rec = {}
        inspire_record = inspire_record[0]
        record = BibRecDocs(inspire_record)
        for doc in record.list_latest_files('SCOAP3'):
            if doc.format == file_format:
                if doc.checksum == checksum:
                    write_message(
                        "... OK: file alredy attached to INSPIRE record %s (doc.checksum=%s, checksum=%s)"
                        % (inspire_record, doc.checksum, checksum))
                else:
                    # also check all previous version checksums
                    allchecksums = set()
                    for anydoc in record.list_bibdocs(doctype="SCOAP3"):
                        for filev in anydoc.list_all_files():
                            allchecksums.add(filev.checksum)
                    if checksum not in allchecksums:
                        write_message(
                            "... OK: new revision available for INSPIRE record %s (doc.checksum=%s, checksum=%s)"
                            % (inspire_record, doc.checksum, checksum))
                        action = "UPDATE"
                break
        else:
            # for/else: no matching SCOAP3 file at all -> append one.
            write_message("... OK: need to add new file to INSPIRE record %s" %
                          inspire_record)
            action = "APPEND"
        if action:
            if file_format == '.pdf;pdfa':
                record_add_field(rec,
                                 'FFT',
                                 subfields=[('a', link),
                                            ('n', 'scoap3-fulltext'),
                                            ('f', '.pdf;pdfa'),
                                            ('t', 'SCOAP3'),
                                            ('d', 'Article from SCOAP3')])
            else:
                record_add_field(rec,
                                 'FFT',
                                 subfields=[('a', link),
                                            ('n', 'scoap3-fulltext'),
                                            ('t', 'SCOAP3'),
                                            ('d', 'Article from SCOAP3')])

            record_add_field(rec,
                             '001',
                             controlfield_value=str(inspire_record))
        if action == "UPDATE":
            line_count_update += 1
            print >> out_update, record_xml_output(rec)
        elif action == "APPEND":
            line_count_new += 1
            print >> out_new, record_xml_output(rec)
    print >> out_update, "</collection>"
    print >> out_new, "</collection>"
    out_new.close()
    out_update.close()

    if line_count_new:
        # We use correct here instead of append to deal with potential sync issues.
        # Basically BibUpload should handle "new" corrections as "append" if it is not there.
        id = task_low_level_submission("bibupload", "admin", "-N",
                                       "SCOAP3-import", "-c", name_new)
        write_message("Scheduled bibupload --correct %s with ID #%s" %
                      (name_new, id))
    else:
        remove(name_new)
    if line_count_update:
        id = task_low_level_submission("bibupload", "admin", "-N",
                                       "SCOAP3-import", "-c", name_update)
        write_message("Scheduled bibupload --correct %s with ID #%s" %
                      (name_update, id))
    else:
        remove(name_update)
示例#52
0
def _dbdump_run_task_core():
    """
    Run DB dumper core stuff.

    Dump the database content (optionally on a slave DB server) into a
    timestamped SQL file under the configured output directory, then
    prune old dump files so that at most the requested number remain.

    Note: do not use task_can_sleep() stuff here because we don't want
    other tasks to interrupt us while we are dumping the DB content.

    @return: True on success.
    """
    # read params:
    host = CFG_DATABASE_HOST
    port = CFG_DATABASE_PORT
    connection = None
    try:
        if task_get_option('slave') and not task_get_option('dump_on_slave_helper_mode'):
            # Master-side path: verify the slave, detach it, then
            # re-schedule ourselves as a helper task that will perform
            # the actual dump on the slave.
            connection = get_connection_for_dump_on_slave()
            write_message("Dump on slave requested")
            write_message("... checking if slave is well up...")
            check_slave_is_up(connection)
            write_message("... checking if slave is in consistent state...")
            check_slave_is_in_consistent_state(connection)
            write_message("... detaching slave database...")
            detach_slave(connection)
            write_message("... scheduling dump on slave helper...")
            # Forward the relevant CLI options to the helper task,
            # keeping the original argument order.
            helper_arguments = []
            for opt_name, opt_flag in (('number', '--number'),
                                       ('output', '--output'),
                                       ('params', '--params'),
                                       ('ignore_tables', '--ignore-tables')):
                opt_value = task_get_option(opt_name)
                if opt_value:
                    helper_arguments += [opt_flag, str(opt_value)]
            if task_get_option("compress"):
                helper_arguments += ["--compress"]
            if task_get_option("slave"):
                helper_arguments += ["--slave", str(task_get_option("slave"))]
            helper_arguments += ['-N', 'slavehelper', '--dump-on-slave-helper']
            task_id = task_low_level_submission('dbdump', task_get_task_param('user'), '-P4', *helper_arguments)
            write_message("Slave scheduled with ID %s" % task_id)
            task_update_progress("DONE")
            return True
        elif task_get_option('dump_on_slave_helper_mode'):
            # Helper-side path: the slave must already be detached;
            # dump from the slave host instead of the master.
            write_message("Dumping on slave mode")
            connection = get_connection_for_dump_on_slave()
            write_message("... checking if slave is well down...")
            check_slave_is_down(connection)
            host = CFG_DATABASE_SLAVE

        task_update_progress("Reading parameters")
        write_message("Reading parameters started")
        output_dir = task_get_option('output', CFG_LOGDIR)
        output_num = task_get_option('number', 5)
        params = task_get_option('params', None)
        compress = task_get_option('compress', False)
        slave = task_get_option('slave', False)
        ignore_tables = task_get_option('ignore_tables', None)
        # Normalise falsy values (e.g. '') to None; otherwise expand
        # the user-supplied patterns into concrete table names.
        ignore_tables = get_table_names(ignore_tables) if ignore_tables else None

        output_file_suffix = task_get_task_param('task_starting_time')
        output_file_suffix = output_file_suffix.replace(' ', '_') + '.sql'
        if compress:
            output_file_suffix = "%s.gz" % (output_file_suffix,)
        write_message("Reading parameters ended")

        # make dump:
        task_update_progress("Dumping database")
        write_message("Database dump started")

        if slave:
            output_file_prefix = 'slave-%s-dbdump-' % (CFG_DATABASE_NAME,)
        else:
            output_file_prefix = '%s-dbdump-' % (CFG_DATABASE_NAME,)
        output_file = output_file_prefix + output_file_suffix
        dump_path = os.path.join(output_dir, output_file)
        dump_database(dump_path,
                      host=host,
                      port=port,
                      params=params,
                      compress=compress,
                      ignore_tables=ignore_tables)
        write_message("Database dump ended")
    finally:
        # Always reattach the slave if we (the helper) detached-dumped it,
        # even when the dump itself failed.
        if connection and task_get_option('dump_on_slave_helper_mode'):
            write_message("Reattaching slave")
            attach_slave(connection)
    # prune old dump files:
    task_update_progress("Pruning old dump files")
    write_message("Pruning old dump files started")
    _delete_old_dumps(output_dir, output_file_prefix, output_num)
    write_message("Pruning old dump files ended")
    # we are done:
    task_update_progress("Done.")
    return True
示例#53
0
def submit_bibindex_task(to_update, indexes, user, priority=3):
    """Schedule a bibindex task over the given records.

    @param to_update: iterable of record ids to reindex
    @param indexes: index name(s) to update (as accepted by bibindex -w)
    @param user: user name under which the task is submitted
    @param priority: task priority (default 3)
    @return: the id of the scheduled task
    """
    recid_list = ','.join(str(recid) for recid in to_update)
    arguments = ('bibindex', user,
                 '-w', indexes,
                 '-P', str(priority),
                 '-i', recid_list)
    return task_low_level_submission(*arguments)
示例#54
0
def metadata_upload(req,
                    metafile=None,
                    filetype=None,
                    mode=None,
                    exec_date=None,
                    exec_time=None,
                    metafilename=None,
                    ln=CFG_SITE_LANG,
                    priority="1",
                    email_logs_to=None):
    """
    Metadata web upload service. Get upload parameters and exec bibupload for the given file.
    Finally, write upload history.

    @param req: request object; also used for the authorization check
    @param metafile: content of the metadata file to upload
    @param filetype: unused here, kept for interface compatibility
    @param mode: bibupload mode flag (e.g. "-i", "-r")
    @param exec_date: optional scheduled execution date
    @param exec_time: optional scheduled execution time
    @param metafilename: original name of the uploaded file (for history)
    @param ln: interface language
    @param priority: bibupload task priority (string)
    @param email_logs_to: address to email the task logs to
    @return: tuple (error code, message)
        error code: code that indicates if an error ocurred
        message: message describing the error
    """
    # start output:
    req.content_type = "text/html"
    req.send_http_header()

    error_codes = {'not_authorized': 1}

    user_info = collect_user_info(req)
    # Write the metadata content to a shared temporary file so the
    # scheduled bibupload task can pick it up later.
    (fd, filename) = tempfile.mkstemp(prefix="batchupload_" + \
        user_info['nickname'] + "_" + time.strftime("%Y%m%d%H%M%S",
        time.localtime()) + "_", dir=CFG_TMPSHAREDDIR)
    filedesc = os.fdopen(fd, 'w')
    filedesc.write(metafile)
    filedesc.close()

    # check if this client can run this file:
    if req is not None:
        allow = _check_client_can_submit_file(req=req,
                                              metafile=metafile,
                                              webupload=1,
                                              ln=ln)
        if allow[0] != 0:
            return (error_codes['not_authorized'], allow[1])

    # run upload command:
    task_arguments = ('bibupload', user_info['nickname'], mode,
                      "--priority=" + priority, "-N", "batchupload")
    if exec_date:
        date = exec_date
        if exec_time:
            date += ' ' + exec_time
        task_arguments += ("-t", date)
    if email_logs_to:
        task_arguments += ('--email-logs-to', email_logs_to)
    task_arguments += (filename, )
    jobid = task_low_level_submission(*task_arguments)

    # write batch upload history
    # Bug fix: the previous `exec_date != "" and (...) or ...` idiom raised
    # a TypeError whenever exec_date was set but exec_time was None/empty
    # (and also when exec_date was None). Build the timestamp explicitly.
    if exec_date:
        exec_timestamp = exec_date + (' ' + exec_time if exec_time else '')
    else:
        exec_timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
    run_sql(
        """INSERT INTO hstBATCHUPLOAD (user, submitdate,
            filename, execdate, id_schTASK, batch_mode)
            VALUES (%s, NOW(), %s, %s, %s, "metadata")""", (
            user_info['nickname'],
            metafilename,
            exec_timestamp,
            str(jobid),
        ))
    return (0, "Task %s queued" % str(jobid))