示例#1
0
def submission_fulltext_download(obj, eng):
    """Download the PDF supplied with a submission and record it as fulltext.

    The link is read from ``obj.extra_data['submission_pdf']``. Nothing
    happens when it is missing or when ``is_pdf_link`` rejects it. Otherwise
    the file is downloaded into the workflow under the key ``fulltext.pdf``
    and a document entry for that key is (re)created in ``obj.data``.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    :return: URI of the stored file when the download succeeded, else ``None``
    """
    pdf_url = obj.extra_data.get('submission_pdf')
    if not pdf_url or not is_pdf_link(pdf_url):
        return None

    key = secure_filename('fulltext.pdf')
    downloaded = download_file_to_workflow(workflow=obj, name=key, url=pdf_url)
    if not downloaded:
        obj.log.info('Cannot fetch PDF provided by user from %s', pdf_url)
        return None

    # Remove any existing entry with this key before adding the new one.
    remaining = []
    for entry in obj.data.get('documents', ()):
        if entry.get('key') != key:
            remaining.append(entry)
    obj.data['documents'] = remaining

    builder = LiteratureBuilder(
        source=obj.data['acquisition_source']['source'],
        record=obj.data,
    )
    file_url = '/api/files/{bucket}/{key}'.format(
        bucket=obj.files[key].bucket_id,
        key=key,
    )
    builder.add_document(
        key,
        fulltext=True,
        original_url=pdf_url,
        url=file_url,
    )
    obj.data = builder.record

    obj.log.info('PDF provided by user from %s', pdf_url)
    return obj.files[key].file.uri
示例#2
0
def populate_arxiv_document(obj, eng):
    """Add a document entry pointing at the arXiv PDF of the record.

    The URL templates ``ARXIV_PDF_URL`` and ``ARXIV_PDF_URL_ALTERNATIVE``
    from the application config are tried in that order; the first one
    accepted by ``is_pdf_link`` is used. When a rejected URL serves a page
    containing ``NO_PDF_ON_ARXIV`` the task logs it and leaves the record
    untouched.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    :raises DownloadError: if a URL cannot be fetched, or if none of the
        configured URLs serves a PDF
    """
    arxiv_id = LiteratureReader(obj.data).arxiv_id

    for config_key in ('ARXIV_PDF_URL', 'ARXIV_PDF_URL_ALTERNATIVE'):
        url = current_app.config[config_key].format(arxiv_id=arxiv_id)
        serves_pdf = is_pdf_link(url)
        if serves_pdf:
            break
        try:
            no_pdf = NO_PDF_ON_ARXIV in requests.get(url).content
        except requests.exceptions.RequestException:
            raise DownloadError("Error accessing url {url}".format(url=url))
        else:
            if no_pdf:
                obj.log.info('No PDF is available for %s', arxiv_id)
                return

    if not serves_pdf:
        raise DownloadError("{url} is not serving a PDF file.".format(url=url))

    key = secure_filename('{0}.pdf'.format(arxiv_id))

    # Remove any existing entry with this key before adding the new one.
    other_documents = []
    for entry in obj.data.get('documents', ()):
        if entry.get('key') != key:
            other_documents.append(entry)
    obj.data['documents'] = other_documents

    builder = LiteratureBuilder(source='arxiv', record=obj.data)
    builder.add_document(
        key,
        fulltext=True,
        hidden=True,
        material='preprint',
        original_url=url,
        url=url,
    )
    obj.data = builder.record
示例#3
0
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive.

    Looks up the ``<arxiv_id>.tar.gz`` file attached to the workflow, runs
    ``process_tarball`` on it inside a temporary directory, removes the
    figures currently listed in ``obj.data`` (and their files), then stores
    every extracted plot in ``obj.files`` and adds a matching figure entry
    to the record.

    The task returns without changing the record when the tarball is not
    attached, when it is invalid or contains no TeX files, or when the
    extraction raises ``DelegateError``.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    arxiv_id = get_arxiv_id(obj.data)
    filename = secure_filename('{0}.tar.gz'.format(arxiv_id))

    # A plain item lookup raises KeyError for a missing file, so handle it
    # explicitly instead of relying on the truthiness check alone.
    try:
        tarball = obj.files[filename]
    except KeyError:
        obj.log.info('No file named=%s for arxiv_id %s', filename, arxiv_id)
        return

    if not tarball:
        return

    with TemporaryDirectory(prefix='plot_extract') as scratch_space:
        tarball_file = retrieve_uri(tarball.file.uri, outdir=scratch_space)
        try:
            plots = process_tarball(
                tarball_file,
                output_directory=scratch_space,
            )
        except (InvalidTarball, NoTexFilesFound):
            obj.log.info(
                'Invalid tarball %s for arxiv_id %s',
                tarball.file.uri,
                arxiv_id,
            )
            return
        except DelegateError as err:
            obj.log.error(
                'Error extracting plots for %s. Report and skip.',
                arxiv_id,
            )
            current_app.logger.exception(err)
            return

        # Drop the figures already on the record, together with their files.
        if 'figures' in obj.data:
            for figure in obj.data['figures']:
                if figure['key'] in obj.files:
                    del obj.files[figure['key']]
            del obj.data['figures']

        lb = LiteratureBuilder(source='arxiv', record=obj.data)
        for index, plot in enumerate(plots):
            plot_name = os.path.basename(plot.get('url'))
            key = plot_name
            # Prefix the key with the plot index when the name is taken.
            if plot_name in obj.files.keys:
                key = '{number}_{name}'.format(number=index,
                                               name=plot_name)
            with open(plot.get('url')) as plot_file:
                obj.files[key] = plot_file

            lb.add_figure(key=key,
                          caption=''.join(plot.get('captions', [])),
                          label=plot.get('label'),
                          material='preprint',
                          url='/api/files/{bucket}/{key}'.format(
                              bucket=obj.files[key].bucket_id,
                              key=key,
                          ))

        obj.data = lb.record
        obj.log.info('Added {0} plots.'.format(len(plots)))
示例#4
0
def populate_arxiv_document(obj, eng):
    """Add a document entry pointing at the arXiv PDF of the record.

    The URL is built from the ``ARXIV_PDF_URL`` config template. When it is
    not a PDF link and the served page contains ``NO_PDF_ON_ARXIV`` the task
    logs it and leaves the record untouched.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    :raises DownloadError: if the URL does not serve a PDF and the page does
        not contain ``NO_PDF_ON_ARXIV``
    """
    arxiv_id = get_arxiv_id(obj.data)
    pdf_url = current_app.config['ARXIV_PDF_URL'].format(arxiv_id=arxiv_id)

    if not is_pdf_link(pdf_url):
        if NO_PDF_ON_ARXIV not in requests.get(pdf_url).content:
            raise DownloadError(
                "{url} is not serving a PDF file.".format(url=pdf_url))
        obj.log.info('No PDF is available for %s', arxiv_id)
        return

    key = secure_filename('{0}.pdf'.format(arxiv_id))

    # Remove any existing entry with this key before adding the new one.
    other_documents = []
    for entry in obj.data.get('documents', ()):
        if entry.get('key') != key:
            other_documents.append(entry)
    obj.data['documents'] = other_documents

    builder = LiteratureBuilder(source='arxiv', record=obj.data)
    builder.add_document(
        key,
        fulltext=True,
        hidden=True,
        material='preprint',
        original_url=pdf_url,
        url=pdf_url,
    )
    obj.data = builder.record
示例#5
0
def populate_arxiv_document(obj, eng):
    """Add a document entry pointing at the arXiv PDF of the record.

    The URL templates ``ARXIV_PDF_URL`` and ``ARXIV_PDF_URL_ALTERNATIVE``
    from the application config are tried in that order; the first one
    accepted by ``is_pdf_link`` is used. When a rejected URL serves a page
    containing ``NO_PDF_ON_ARXIV`` the task logs it and leaves the record
    untouched.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    :raises DownloadError: if none of the configured URLs serves a PDF
    """
    arxiv_id = get_arxiv_id(obj.data)

    for config_key in ('ARXIV_PDF_URL', 'ARXIV_PDF_URL_ALTERNATIVE'):
        url = current_app.config[config_key].format(arxiv_id=arxiv_id)
        serves_pdf = is_pdf_link(url)
        if serves_pdf:
            break

        page_says_no_pdf = NO_PDF_ON_ARXIV in requests.get(url).content
        if page_says_no_pdf:
            obj.log.info('No PDF is available for %s', arxiv_id)
            return

    if not serves_pdf:
        raise DownloadError("{url} is not serving a PDF file.".format(url=url))

    key = secure_filename('{0}.pdf'.format(arxiv_id))

    # Remove any existing entry with this key before adding the new one.
    other_documents = []
    for entry in obj.data.get('documents', ()):
        if entry.get('key') != key:
            other_documents.append(entry)
    obj.data['documents'] = other_documents

    builder = LiteratureBuilder(source='arxiv', record=obj.data)
    builder.add_document(
        key,
        fulltext=True,
        hidden=True,
        material='preprint',
        original_url=url,
        url=url,
    )
    obj.data = builder.record
示例#6
0
    def add_document_or_figure(
        self,
        metadata,
        stream=None,
        is_document=True,
        file_name=None,
        key=None,
    ):
        """Attach a document or a figure to this record.

        Args:
            metadata(dict): description of the document or figure; it is
                passed as keyword arguments to the builder. The ``key`` and
                ``url`` entries are set on this dict by the method.
            stream(file like object): when given, it is stored in the files
                store under the resolved key.
            is_document(bool): ``True`` to add a document, ``False`` to add
                a figure.
            file_name(str): base name used to derive a unique files key
                when ``key`` is not given.
            key(str): files key to use as is; takes precedence over
                ``file_name``.

        Returns:
            dict: the ``metadata`` dict that was passed in, with ``key`` and
                ``url`` filled in.

        Raises:
            TypeError: if neither ``file_name`` nor ``key`` is passed.
        """
        if not (key or file_name):
            raise TypeError(
                'No file_name and no key passed, at least one of them is '
                'needed.'
            )

        files_key = key or self._get_unique_files_key(
            base_file_name=file_name)

        if stream is not None:
            self.files[files_key] = stream

        builder = LiteratureBuilder(record=self.to_dict())
        metadata['key'] = files_key
        metadata['url'] = '/api/files/{bucket}/{key}'.format(
            bucket=self.files[files_key].bucket_id,
            key=files_key,
        )

        add_entry = builder.add_document if is_document else builder.add_figure
        add_entry(**metadata)

        super(InspireRecord, self).update(builder.record)
        return metadata
示例#7
0
def populate_submission_document(obj, eng):
    """Add a document entry for the PDF link supplied with a submission.

    The link is read from ``obj.extra_data['submission_pdf']``. Nothing
    happens when it is missing or when ``is_pdf_link`` rejects it. Otherwise
    a ``fulltext.pdf`` document entry pointing at the link replaces any
    existing entry with that key.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    pdf_url = obj.extra_data.get('submission_pdf')
    if not pdf_url or not is_pdf_link(pdf_url):
        return

    key = secure_filename('fulltext.pdf')

    # Remove any existing entry with this key before adding the new one.
    other_documents = []
    for entry in obj.data.get('documents', ()):
        if entry.get('key') != key:
            other_documents.append(entry)
    obj.data['documents'] = other_documents

    source = obj.data['acquisition_source']['source']
    builder = LiteratureBuilder(source=source, record=obj.data)
    builder.add_document(
        key,
        fulltext=True,
        url=pdf_url,
        original_url=pdf_url,
    )
    obj.data = builder.record
示例#8
0
def populate_submission_document(obj, eng):
    """Register the user-provided PDF link as the fulltext document.

    Reads ``submission_pdf`` from ``obj.extra_data``; the record is only
    modified when that value is set and ``is_pdf_link`` accepts it. In that
    case any document keyed ``fulltext.pdf`` is dropped and a new entry
    pointing at the link is added through ``LiteratureBuilder``.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    link = obj.extra_data.get('submission_pdf')
    if not (link and is_pdf_link(link)):
        return

    document_key = secure_filename('fulltext.pdf')

    # Keep every document except the one that is about to be re-added.
    kept = []
    for existing in obj.data.get('documents', ()):
        if existing.get('key') == document_key:
            continue
        kept.append(existing)
    obj.data['documents'] = kept

    record_builder = LiteratureBuilder(
        source=obj.data['acquisition_source']['source'],
        record=obj.data,
    )
    record_builder.add_document(
        document_key,
        fulltext=True,
        url=link,
        original_url=link,
    )
    obj.data = record_builder.record
示例#9
0
    def to_hep(self, source):
        """Return this item's record converted to the hep format.

        An ``acquisition_source`` entry (method ``hepcrawl``, current
        timestamp, ``SCRAPY_JOB`` environment variable as submission number)
        is written into ``self.record`` before the conversion.

        Args:
            source(str): string identifying the source for this item
                (ex. 'arXiv').

        Returns:
            the result of ``hep_to_hep`` for items in ``hep`` format, or of
            ``hepcrawl_to_hep`` for items in ``hepcrawl`` format.

        Raises:
            UnknownItemFormat: if the item's ``record_format`` is neither
                ``hep`` nor ``hepcrawl``.
        """
        builder = LiteratureBuilder(source=source)
        builder.add_acquisition_source(
            source=source,
            method='hepcrawl',
            date=datetime.datetime.now().isoformat(),
            submission_number=os.environ.get('SCRAPY_JOB', ''),
        )
        acquisition_source = builder.record['acquisition_source']
        self.record['acquisition_source'] = acquisition_source

        if self.record_format == 'hep':
            hep_record = hep_to_hep(
                hep_record=self.record,
                record_files=self.record_files,
            )
            # Rename ``old_url`` to ``original_url`` unless the latter is
            # already present on the document.
            for document in hep_record.get('documents', []):
                if 'old_url' not in document or 'original_url' in document:
                    continue
                document['original_url'] = document.pop('old_url')
            return hep_record

        if self.record_format == 'hepcrawl':
            normalized = _normalize_hepcrawl_record(
                item=self.record,
                source=source,
            )
            return hepcrawl_to_hep(dict(normalized))

        raise UnknownItemFormat(
            'Unknown ParsedItem::{}'.format(self.record_format))
示例#10
0
def assign_conference(record, conference_ref, cnum):
    """Link a conference to a literature record and return the new data.

    The document type ``conference paper`` is added unless the record is
    already a ``proceedings`` or ``conference paper``. When neither the
    conference reference nor the cnum appears in the record's
    ``publication_info``, a new entry holding both is added. Otherwise the
    existing entries are walked: an entry matching on only one of the two
    values gets the other one filled in, and a warning is logged for every
    matching entry.

    Args:
        record: record to update; read through ``get_value`` and
            ``control_number``.
        conference_ref: ``$ref`` of the conference record.
        cnum: cnum of the conference.

    Returns:
        dict: the builder's record after the changes.

    Raises:
        MissingArgumentError: if ``cnum`` or ``conference_ref`` is empty.
    """
    builder = LiteratureBuilder(record=record)
    if not cnum:
        raise MissingArgumentError("cnum is required.")
    if not conference_ref:
        raise MissingArgumentError("$ref is required.")

    def log_fields():
        # Built lazily so the record is only queried when something is logged.
        return dict(
            recid=record.control_number,
            conference_ref=conference_ref,
            cnum=cnum,
        )

    conference_types = {"proceedings", "conference paper"}
    if not conference_types.intersection(record.get_value("document_type")):
        builder.add_document_type("conference paper")

    ref_is_new = conference_ref not in builder.record.get_value(
        "publication_info.conference_record", [])
    if ref_is_new and cnum not in builder.record.get_value(
            "publication_info.cnum", []):
        builder.add_publication_info(cnum=cnum,
                                     conference_record=conference_ref)
        LOGGER.info("Assigning conference to record", **log_fields())
        return dict(builder.record)

    publication_infos = builder.record.get_value("publication_info")
    for position, entry in enumerate(publication_infos):
        same_ref = conference_ref == entry.get("conference_record", {})
        same_cnum = cnum == entry.get("cnum", "")

        if same_ref:
            if same_cnum:
                LOGGER.warning(
                    "Conference already assigned to record", **log_fields())
            else:
                builder.record["publication_info"][position]["cnum"] = cnum
                LOGGER.warning(
                    "conference ref already assigned to paper without cnum.",
                    **log_fields()
                )
        elif same_cnum:
            target = builder.record["publication_info"][position]
            target["conference_record"] = conference_ref
            LOGGER.warning(
                "conference cnum already assigned to paper without ref.",
                **log_fields()
            )

    return dict(builder.record)
示例#11
0
def process_cds_record(cds_record):
    """Add the CDS identifier of ``cds_record`` to the matching record.

    The record is looked up with ``get_record_for_provided_ids`` using the
    control numbers, eprints, DOIs and report numbers found in the CDS
    metadata. The function logs and returns without changes when no CDS id
    can be extracted, when no record matches, or when the record already
    carries that CDS identifier.

    Args:
        cds_record(dict): a single record from a CDS response.

    Returns:
        None
    """
    identifiers = dict(
        control_numbers=get_value(cds_record, "metadata.other_ids", []),
        arxivs=get_value(cds_record, "metadata.eprints", []),
        dois=get_value(cds_record, "metadata.dois.value", []),
        report_numbers=get_value(
            cds_record, "metadata.report_numbers.value", []),
    )

    # Prefer the top-level id and fall back to the metadata control number.
    cds_id = cds_record.get("id")
    if not cds_id:
        cds_id = get_value(cds_record, "metadata.control_number", [])
    if not cds_id:
        LOGGER.info(
            "Cannot extract CDS id from CDS response",
            cds_data=cds_record,
        )
        return None

    record = get_record_for_provided_ids(
        identifiers["control_numbers"],
        identifiers["arxivs"],
        identifiers["dois"],
        identifiers["report_numbers"],
    )
    if not record:
        LOGGER.warning(
            "Cannot find record with any of the provided IDS",
            **identifiers
        )
        return None
    control_number = record.control_number

    existing_cds_ids = get_values_for_schema(
        record.get("external_system_identifiers", []), "CDS")
    if cds_id in existing_cds_ids:
        LOGGER.info(
            "Correct CDS identifier is already present in the record",
            recid=control_number,
            cds_id=cds_id,
        )
        return None

    builder = LiteratureBuilder(record=record)
    builder.add_external_system_identifier(cds_id, "CDS")
    record.update(dict(builder.record))
示例#12
0
    def add_figures(self, figures):
        """Process the given figures and return their builder entries.

        The file tasks are submitted to a thread pool whose size comes from
        the ``FILES_MAX_UPLOAD_THREADS`` config value (default 5); every
        processed figure is then registered through ``builder.add_figure``.

        Args:
            figures: figure entries to process.

        Returns:
            the ``figures`` value of the builder's record, ``None`` when no
            figure was added to it.
        """
        builder = LiteratureBuilder()
        pool_size = current_app.config.get("FILES_MAX_UPLOAD_THREADS", 5)

        with ThreadPoolExecutor(max_workers=pool_size) as executor:
            tasks = self._generate_and_submit_files_tasks(figures, executor)
            processed = self._process_tasks_results(tasks, figures)

        for figure in processed or ():
            self._update_file_entry(figure, builder.add_figure)

        return builder.record.get("figures")
示例#13
0
def arxiv_fulltext_download(obj, eng):
    """Download the arXiv PDF of the record and register it as a document.

    The URL is built from the ``ARXIV_PDF_URL`` config template. When it is
    not a PDF link and the served page contains ``NO_PDF_ON_ARXIV`` the task
    logs it and stops. Otherwise the PDF is downloaded into the workflow
    and a hidden fulltext ``preprint`` document replaces any existing entry
    with the same key; a failed download is logged as an error.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    :raises DownloadError: if the URL does not serve a PDF and the page does
        not contain ``NO_PDF_ON_ARXIV``
    """
    arxiv_id = get_arxiv_id(obj.data)
    key = secure_filename('{0}.pdf'.format(arxiv_id))
    pdf_url = current_app.config['ARXIV_PDF_URL'].format(arxiv_id=arxiv_id)

    if not is_pdf_link(pdf_url):
        if NO_PDF_ON_ARXIV not in requests.get(pdf_url).content:
            raise DownloadError(
                "{url} is not serving a PDF file.".format(url=pdf_url))
        obj.log.info('No PDF is available for %s', arxiv_id)
        return

    downloaded = download_file_to_workflow(workflow=obj, name=key, url=pdf_url)
    if not downloaded:
        obj.log.error('Cannot retrieve PDF from arXiv for %s', arxiv_id)
        return

    # Remove any existing entry with this key before adding the new one.
    other_documents = []
    for entry in obj.data.get('documents', ()):
        if entry.get('key') != key:
            other_documents.append(entry)
    obj.data['documents'] = other_documents

    builder = LiteratureBuilder(source='arxiv', record=obj.data)
    file_url = '/api/files/{bucket}/{key}'.format(
        bucket=obj.files[key].bucket_id,
        key=key,
    )
    builder.add_document(
        key,
        fulltext=True,
        hidden=True,
        material='preprint',
        original_url=pdf_url,
        url=file_url,
    )
    obj.data = builder.record
    obj.log.info('PDF retrieved from arXiv for %s', arxiv_id)
示例#14
0
    def add_document_or_figure(
        self,
        metadata,
        stream=None,
        is_document=True,
        file_name=None,
        key=None,
    ):
        """Add a document or figure to the record.

        Args:
            metadata(dict): metadata of the document or figure, forwarded
                as keyword arguments to the builder. Its ``key`` and ``url``
                entries are overwritten by this method.
            stream(file like object): if passed, it is saved in the files
                store under the resolved key.
            is_document(bool): whether the metadata describes a document;
                set to ``False`` for a figure.
            file_name(str): name of the file, used to derive a unique key
                for the files store when ``key`` is not passed.
            key(str): if passed, used as the key for the files store and
                ``file_name`` is ignored.

        Returns:
            dict: the same ``metadata`` object, updated with ``key`` and
                ``url``.

        Raises:
            TypeError: if neither ``file_name`` nor ``key`` is passed (one
                of them is required).
        """
        nothing_to_name_the_file = not key and not file_name
        if nothing_to_name_the_file:
            raise TypeError(
                'No file_name and no key passed, at least one of them is '
                'needed.'
            )

        if key:
            store_key = key
        else:
            store_key = self._get_unique_files_key(base_file_name=file_name)

        if stream is not None:
            self.files[store_key] = stream

        builder = LiteratureBuilder(record=self.dumps())
        metadata['key'] = store_key
        bucket_id = self.files[store_key].bucket_id
        metadata['url'] = '/api/files/{bucket}/{key}'.format(
            bucket=bucket_id,
            key=store_key,
        )

        if not is_document:
            builder.add_figure(**metadata)
        else:
            builder.add_document(**metadata)

        super(InspireRecord, self).update(builder.record)
        return metadata
示例#15
0
def test_append_to():
    """Check ``_append_to`` with an empty string and with a real value.

    After appending an empty string the field must be absent from the
    record; after appending ``'value'`` the field must hold ``['value']``.
    """
    builder = LiteratureBuilder("test")

    builder._append_to('test_field', '')
    assert builder.record.get('test_field') is None

    builder._append_to('test_field_2', 'value')
    assert builder.record.get('test_field_2') == ['value']
示例#16
0
    def add_files(self, documents=None, figures=None):
        """Add documents and figures to this record.

        When the ``FEATURE_FLAG_ENABLE_FILES`` config flag is off, the given
        entries are simply appended to the record's ``figures`` and
        ``documents`` lists. Otherwise every entry goes through
        ``_add_file`` and is added to the record unless an entry with the
        same key was already present before the call.

        Args:
            documents (list[dict]): documents to add; each dict is passed
                as keyword arguments to ``_add_file``, for example
                ``{'url': 'http:// or /api/file/bucket_id/file_key'}``.
            figures (list[dict]): figures to add, same shape as
                ``documents``.

        Returns:
            list: the metadata returned by ``_add_file`` for every processed
                entry, documents first; an empty list when the feature flag
                is off.

        Raises:
            TypeError: if neither ``documents`` nor ``figures`` is passed.
        """
        if not (documents or figures):
            raise TypeError("No files passed, at least one is needed")

        if not current_app.config.get("FEATURE_FLAG_ENABLE_FILES", False):
            for field, entries in (("figures", figures),
                                   ("documents", documents)):
                if entries:
                    self.setdefault(field, []).extend(entries)
            return []

        added = []
        builder = LiteratureBuilder(record=self)

        if documents:
            known_document_keys = [
                existing["key"] for existing in self.get("documents", [])
            ]
            for entry in documents:
                metadata = self._add_file(document=True, **entry)
                if metadata["key"] not in known_document_keys:
                    builder.add_document(**metadata)
                added.append(metadata)

        if figures:
            known_figure_keys = [
                existing["key"] for existing in self.get("figures", [])
            ]
            for entry in figures:
                metadata = self._add_file(**entry)
                if metadata["key"] not in known_figure_keys:
                    builder.add_figure(**metadata)
                added.append(metadata)

        # FIXME: this is wrong every time it goes to the ``update`` function
        # which means update refs, pidstore etc..
        super().update(builder.record.dumps())
        return added
示例#17
0
    def add_documents(self, documents):
        """Process the given documents and return their builder entries.

        Documents flagged ``hidden`` are registered directly. The remaining
        ones are first sent through a thread pool whose size comes from the
        ``FILES_MAX_UPLOAD_THREADS`` config value (default 5), and the
        processed results are registered afterwards.

        Args:
            documents: document entries to process; each one is queried
                with ``.get("hidden", False)``.

        Returns:
            the ``documents`` value of the builder's record, ``None`` when
            no document was added to it.
        """
        builder = LiteratureBuilder()
        hidden = [doc for doc in documents if doc.get("hidden", False)]
        visible = [doc for doc in documents if not doc.get("hidden", False)]

        for doc in hidden:
            self._update_file_entry(doc, builder.add_document)

        pool_size = current_app.config.get("FILES_MAX_UPLOAD_THREADS", 5)
        with ThreadPoolExecutor(max_workers=pool_size) as executor:
            tasks = self._generate_and_submit_files_tasks(visible, executor)
            processed = self._process_tasks_results(tasks, visible)

        for doc in processed or ():
            self._update_file_entry(doc, builder.add_document)

        return builder.record.get("documents")
示例#18
0
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive.

    Applies the configured wand resource limits, looks up the
    ``<arxiv_id>.tar.gz`` file attached to the workflow and runs
    ``process_tarball`` on it inside a temporary directory. The figures
    currently listed in ``obj.data`` (and their files) are removed, then
    every extracted plot is stored in ``obj.files`` and added to the record
    as a ``preprint`` figure.

    The task returns without changing the record when the tarball is not
    attached, when it is invalid or contains no TeX files, or when the
    extraction raises ``DelegateError``.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    # Crude way to set memory limits for wand globally.
    wanted_memory = current_app.config.get("WAND_MEMORY_LIMIT")
    if wanted_memory and limits['memory'] != wanted_memory:
        limits['memory'] = wanted_memory
        # Also set the disk limit: if it is not set, data is swapped to disk
        # instead of an exception being thrown. With both limits in place an
        # exception is thrown once they are exceeded, so at least the
        # workflow status is saved.
        limits['disk'] = current_app.config.get("WAND_DISK_LIMIT", 0)

    arxiv_id = LiteratureReader(obj.data).arxiv_id
    archive_name = secure_filename('{0}.tar.gz'.format(arxiv_id))

    try:
        tarball = obj.files[archive_name]
    except KeyError:
        obj.log.info(
            'No file named=%s for arxiv_id %s', archive_name, arxiv_id)
        return

    with TemporaryDirectory(prefix='plot_extract') as workdir, \
            retrieve_uri(tarball.file.uri, outdir=workdir) as local_tarball:
        try:
            plots = process_tarball(local_tarball, output_directory=workdir)
        except (InvalidTarball, NoTexFilesFound):
            obj.log.info(
                'Invalid tarball %s for arxiv_id %s',
                tarball.file.uri,
                arxiv_id,
            )
            return
        except DelegateError as err:
            obj.log.error(
                'Error extracting plots for %s. Report and skip.',
                arxiv_id,
            )
            current_app.logger.exception(err)
            return

        # Drop the figures already on the record, together with their files.
        for old_figure in obj.data.get('figures', ()):
            old_key = old_figure['key']
            if old_key in obj.files:
                del obj.files[old_key]
        obj.data.pop('figures', None)

        builder = LiteratureBuilder(source='arxiv', record=obj.data)
        for position, plot in enumerate(plots):
            plot_path = plot.get('url')
            base_name = os.path.basename(plot_path)
            # Prefix the key with the plot index when the name is taken.
            if base_name in obj.files.keys:
                key = 'w{number}_{name}'.format(
                    number=position,
                    name=base_name,
                )
            else:
                key = base_name

            with open(plot.get('url')) as plot_file:
                obj.files[key] = plot_file

            figure_url = '/api/files/{bucket}/{key}'.format(
                bucket=obj.files[key].bucket_id,
                key=key,
            )
            builder.add_figure(
                key=key,
                caption=''.join(plot.get('captions', [])),
                label=plot.get('label'),
                material='preprint',
                url=figure_url,
            )

        obj.data = builder.record
        obj.log.info('Added {0} plots.'.format(len(plots)))
示例#19
0
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive.

    Looks up the ``<arxiv_id>.tar.gz`` file attached to the workflow, runs
    ``process_tarball`` on it inside a temporary directory, removes the
    figures currently listed in ``obj.data`` (and their files), then stores
    every extracted plot in ``obj.files`` and adds a matching ``preprint``
    figure entry to the record.

    The task returns without changing the record when the tarball is not
    attached, when it is invalid or contains no TeX files, or when the
    extraction raises ``DelegateError``.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    arxiv_id = get_arxiv_id(obj.data)
    filename = secure_filename('{0}.tar.gz'.format(arxiv_id))

    # A plain item lookup raises KeyError for a missing file, so handle it
    # explicitly instead of relying on the truthiness check alone.
    try:
        tarball = obj.files[filename]
    except KeyError:
        obj.log.info('No file named=%s for arxiv_id %s', filename, arxiv_id)
        return

    if not tarball:
        return

    with TemporaryDirectory(prefix='plot_extract') as scratch_space, \
            retrieve_uri(tarball.file.uri, outdir=scratch_space) as tarball_file:
        try:
            plots = process_tarball(
                tarball_file,
                output_directory=scratch_space,
            )
        except (InvalidTarball, NoTexFilesFound):
            obj.log.info(
                'Invalid tarball %s for arxiv_id %s',
                tarball.file.uri,
                arxiv_id,
            )
            return
        except DelegateError as err:
            obj.log.error(
                'Error extracting plots for %s. Report and skip.',
                arxiv_id,
            )
            current_app.logger.exception(err)
            return

        # Drop the figures already on the record, together with their files.
        if 'figures' in obj.data:
            for figure in obj.data['figures']:
                if figure['key'] in obj.files:
                    del obj.files[figure['key']]
            del obj.data['figures']

        lb = LiteratureBuilder(source='arxiv', record=obj.data)
        for index, plot in enumerate(plots):
            plot_name = os.path.basename(plot.get('url'))
            key = plot_name
            # Prefix the key with the plot index when the name is taken.
            if plot_name in obj.files.keys:
                key = 'w{number}_{name}'.format(
                    number=index,
                    name=plot_name,
                )
            with open(plot.get('url')) as plot_file:
                obj.files[key] = plot_file

            lb.add_figure(
                key=key,
                caption=''.join(plot.get('captions', [])),
                label=plot.get('label'),
                material='preprint',
                url='/api/files/{bucket}/{key}'.format(
                    bucket=obj.files[key].bucket_id,
                    key=key,
                )
            )

        obj.data = lb.record
        obj.log.info('Added {0} plots.'.format(len(plots)))