Exemplo n.º 1
0
def submission_fulltext_download(obj, eng):
    submission_pdf = obj.extra_data.get('submission_pdf')
    if submission_pdf and is_pdf_link(submission_pdf):
        filename = secure_filename('fulltext.pdf')
        pdf = download_file_to_workflow(
            workflow=obj,
            name=filename,
            url=submission_pdf,
        )

        if pdf:
            obj.data['documents'] = [
                document for document in obj.data.get('documents', ())
                if document.get('key') != filename
            ]
            lb = LiteratureBuilder(source=obj.data['acquisition_source']['source'], record=obj.data)
            lb.add_document(
                filename,
                fulltext=True,
                original_url=submission_pdf,
                url='/api/files/{bucket}/{key}'.format(bucket=obj.files[filename].bucket_id, key=filename)
            )
            obj.data = lb.record
            obj.log.info('PDF provided by user from %s', submission_pdf)
            return obj.files[filename].file.uri
        else:
            obj.log.info('Cannot fetch PDF provided by user from %s', submission_pdf)
Exemplo n.º 2
0
def populate_arxiv_document(obj, eng):
    arxiv_id = get_arxiv_id(obj.data)
    url = current_app.config['ARXIV_PDF_URL'].format(arxiv_id=arxiv_id)

    if not is_pdf_link(url):
        if NO_PDF_ON_ARXIV in requests.get(url).content:
            obj.log.info('No PDF is available for %s', arxiv_id)
            return
        raise DownloadError("{url} is not serving a PDF file.".format(url=url))

    filename = secure_filename('{0}.pdf'.format(arxiv_id))
    obj.data['documents'] = [
        document for document in obj.data.get('documents', ())
        if document.get('key') != filename
    ]

    lb = LiteratureBuilder(source='arxiv', record=obj.data)
    lb.add_document(
        filename,
        fulltext=True,
        hidden=True,
        material='preprint',
        original_url=url,
        url=url,
    )
    obj.data = lb.record
Exemplo n.º 3
0
def populate_arxiv_document(obj, eng):
    arxiv_id = LiteratureReader(obj.data).arxiv_id

    for conf_name in ('ARXIV_PDF_URL', 'ARXIV_PDF_URL_ALTERNATIVE'):
        url = current_app.config[conf_name].format(arxiv_id=arxiv_id)
        is_valid_pdf_link = is_pdf_link(url)
        if is_valid_pdf_link:
            break
        try:
            if NO_PDF_ON_ARXIV in requests.get(url).content:
                obj.log.info('No PDF is available for %s', arxiv_id)
                return
        except requests.exceptions.RequestException:
            raise DownloadError("Error accessing url {url}".format(url=url))

    if not is_valid_pdf_link:
        raise DownloadError("{url} is not serving a PDF file.".format(url=url))

    filename = secure_filename('{0}.pdf'.format(arxiv_id))
    obj.data['documents'] = [
        document for document in obj.data.get('documents', ())
        if document.get('key') != filename
    ]

    lb = LiteratureBuilder(source='arxiv', record=obj.data)
    lb.add_document(
        filename,
        fulltext=True,
        hidden=True,
        material='preprint',
        original_url=url,
        url=url,
    )
    obj.data = lb.record
Exemplo n.º 4
0
def populate_arxiv_document(obj, eng):
    arxiv_id = get_arxiv_id(obj.data)

    for conf_name in ('ARXIV_PDF_URL', 'ARXIV_PDF_URL_ALTERNATIVE'):
        url = current_app.config[conf_name].format(arxiv_id=arxiv_id)
        is_valid_pdf_link = is_pdf_link(url)
        if is_valid_pdf_link:
            break

        if NO_PDF_ON_ARXIV in requests.get(url).content:
            obj.log.info('No PDF is available for %s', arxiv_id)
            return

    if not is_valid_pdf_link:
        raise DownloadError("{url} is not serving a PDF file.".format(url=url))

    filename = secure_filename('{0}.pdf'.format(arxiv_id))
    obj.data['documents'] = [
        document for document in obj.data.get('documents', ())
        if document.get('key') != filename
    ]

    lb = LiteratureBuilder(source='arxiv', record=obj.data)
    lb.add_document(
        filename,
        fulltext=True,
        hidden=True,
        material='preprint',
        original_url=url,
        url=url,
    )
    obj.data = lb.record
Exemplo n.º 5
0
    def add_document_or_figure(
        self,
        metadata,
        stream=None,
        is_document=True,
        file_name=None,
        key=None,
    ):
        """Add a document or figure to the record.

        Args:

            metadata(dict): metadata of the document or figure, see the schemas
                for more details, will be validated.
            stream(file like object): if passed, will extract the file contents
                from it.
            is_document(bool): if the given information is for a document,
                set to ```False``` for a figure.
            file_name(str): Name of the file, used as a basis of the key for
                the files store.
            key(str): if passed, will use this as the key for the files store
                and ignore ``file_name``, use it to overwrite existing keys.


        Returns:

            dict: metadata of the added document or figure.


        Raises:

            TypeError: if not ``file_name`` nor ``key`` are passed (one of
                them is required).
        """
        if not key and not file_name:
            raise TypeError(
                'No file_name and no key passed, at least one of them is '
                'needed.'
            )

        if not key:
            key = self._get_unique_files_key(base_file_name=file_name)

        if stream is not None:
            self.files[key] = stream

        builder = LiteratureBuilder(record=self.to_dict())
        metadata['key'] = key
        metadata['url'] = '/api/files/{bucket}/{key}'.format(
            bucket=self.files[key].bucket_id,
            key=key,
        )
        if is_document:
            builder.add_document(**metadata)
        else:
            builder.add_figure(**metadata)

        super(InspireRecord, self).update(builder.record)
        return metadata
Exemplo n.º 6
0
    def add_document_or_figure(
        self,
        metadata,
        stream=None,
        is_document=True,
        file_name=None,
        key=None,
    ):
        """Add a document or figure to the record.

        Args:

            metadata(dict): metadata of the document or figure, see the schemas
                for more details, will be validated.
            stream(file like object): if passed, will extract the file contents
                from it.
            is_document(bool): if the given information is for a document,
                set to ```False``` for a figure.
            file_name(str): Name of the file, used as a basis of the key for
                the files store.
            key(str): if passed, will use this as the key for the files store
                and ignore ``file_name``, use it to overwrite existing keys.


        Returns:

            dict: metadata of the added document or figure.


        Raises:

            TypeError: if not ``file_name`` nor ``key`` are passed (one of
                them is required).
        """
        if not key and not file_name:
            raise TypeError(
                'No file_name and no key passed, at least one of them is '
                'needed.'
            )

        if not key:
            key = self._get_unique_files_key(base_file_name=file_name)

        if stream is not None:
            self.files[key] = stream

        builder = LiteratureBuilder(record=self.dumps())
        metadata['key'] = key
        metadata['url'] = '/api/files/{bucket}/{key}'.format(
            bucket=self.files[key].bucket_id,
            key=key,
        )
        if is_document:
            builder.add_document(**metadata)
        else:
            builder.add_figure(**metadata)

        super(InspireRecord, self).update(builder.record)
        return metadata
Exemplo n.º 7
0
    def add_files(self, documents=None, figures=None):
        """Public method for adding documents and figures

        Args:
            documents (list[dict]): List of documents which should be added to this
            record
            figures (list[dict]): List of figures which should be added to this record

            Documents and figures are lists of dicts.
            Most obscure dict which whould be provided for each file is:
            {
                'url': 'http:// or /api/file/bucket_id/file_key'
                'is_document': True or False(default)
            }


        Returns:
             list: list of added keys
        """
        if not documents and not figures:
            raise TypeError("No files passed, at least one is needed")

        if not current_app.config.get("FEATURE_FLAG_ENABLE_FILES", False):
            if figures:
                self.setdefault("figures", []).extend(figures)

            if documents:
                self.setdefault("documents", []).extend(documents)
            return []
        files = []
        builder = LiteratureBuilder(record=self)
        if documents:
            doc_keys = [
                doc_metadata["key"]
                for doc_metadata in self.get("documents", [])
            ]
            for doc in documents:
                metadata = self._add_file(document=True, **doc)
                if metadata["key"] not in doc_keys:
                    builder.add_document(**metadata)
                files.append(metadata)
        if figures:
            fig_keys = [
                fig_metadata["key"]
                for fig_metadata in self.get("figures", [])
            ]
            for fig in figures:
                metadata = self._add_file(**fig)
                if metadata["key"] not in fig_keys:
                    builder.add_figure(**metadata)
                files.append(metadata)
        # FIXME: this is wrong every time it goes to ``update``` function
        # which means update refs, pidstore etc..
        super().update(builder.record.dumps())
        return files
Exemplo n.º 8
0
def populate_submission_document(obj, eng):
    submission_pdf = obj.extra_data.get('submission_pdf')
    if submission_pdf and is_pdf_link(submission_pdf):
        filename = secure_filename('fulltext.pdf')
        obj.data['documents'] = [
            document for document in obj.data.get('documents', ())
            if document.get('key') != filename
        ]
        lb = LiteratureBuilder(source=obj.data['acquisition_source']['source'],
                               record=obj.data)
        lb.add_document(
            filename,
            fulltext=True,
            url=submission_pdf,
            original_url=submission_pdf,
        )
        obj.data = lb.record
Exemplo n.º 9
0
def populate_submission_document(obj, eng):
    submission_pdf = obj.extra_data.get('submission_pdf')
    if submission_pdf and is_pdf_link(submission_pdf):
        filename = secure_filename('fulltext.pdf')
        obj.data['documents'] = [
            document for document in obj.data.get('documents', ())
            if document.get('key') != filename
        ]
        lb = LiteratureBuilder(
            source=obj.data['acquisition_source']['source'], record=obj.data)
        lb.add_document(
            filename,
            fulltext=True,
            url=submission_pdf,
            original_url=submission_pdf,
        )
        obj.data = lb.record
Exemplo n.º 10
0
def arxiv_fulltext_download(obj, eng):
    """Perform the fulltext download step for arXiv records.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    arxiv_id = get_arxiv_id(obj.data)
    filename = secure_filename('{0}.pdf'.format(arxiv_id))
    url = current_app.config['ARXIV_PDF_URL'].format(arxiv_id=arxiv_id)

    if not is_pdf_link(url):
        if NO_PDF_ON_ARXIV in requests.get(url).content:
            obj.log.info('No PDF is available for %s', arxiv_id)
            return
        raise DownloadError("{url} is not serving a PDF file.".format(url=url))

    pdf = download_file_to_workflow(
        workflow=obj,
        name=filename,
        url=url,
    )

    if pdf:
        obj.data['documents'] = [
            document for document in obj.data.get('documents', ())
            if document.get('key') != filename
        ]
        lb = LiteratureBuilder(source='arxiv', record=obj.data)
        lb.add_document(filename,
                        fulltext=True,
                        hidden=True,
                        material='preprint',
                        original_url=url,
                        url='/api/files/{bucket}/{key}'.format(
                            bucket=obj.files[filename].bucket_id,
                            key=filename))
        obj.data = lb.record
        obj.log.info('PDF retrieved from arXiv for %s', arxiv_id)
    else:
        obj.log.error('Cannot retrieve PDF from arXiv for %s', arxiv_id)