def submission_fulltext_download(obj, eng):
    """Download the PDF attached to a submission and register it as fulltext.

    The URL is read from ``obj.extra_data['submission_pdf']``. Nothing is
    done when that entry is missing/empty or ``is_pdf_link`` rejects it.

    Args:
        obj: workflow object; its ``extra_data``, ``data``, ``files`` and
            ``log`` attributes are used.
        eng: not used by this function.

    Returns:
        ``obj.files[<key>].file.uri`` for the downloaded file when
        ``download_file_to_workflow`` returned a truthy value, ``None``
        in every other case.
    """
    pdf_url = obj.extra_data.get('submission_pdf')
    if not pdf_url or not is_pdf_link(pdf_url):
        return

    key = secure_filename('fulltext.pdf')
    downloaded = download_file_to_workflow(
        workflow=obj,
        name=key,
        url=pdf_url,
    )
    if not downloaded:
        obj.log.info('Cannot fetch PDF provided by user from %s', pdf_url)
        return

    # Discard existing document entries stored under the same key before
    # the new entry is added.
    remaining = []
    for document in obj.data.get('documents', ()):
        if document.get('key') != key:
            remaining.append(document)
    obj.data['documents'] = remaining

    builder = LiteratureBuilder(
        source=obj.data['acquisition_source']['source'],
        record=obj.data,
    )
    file_url = '/api/files/{bucket}/{key}'.format(
        bucket=obj.files[key].bucket_id,
        key=key,
    )
    builder.add_document(
        key,
        fulltext=True,
        original_url=pdf_url,
        url=file_url,
    )
    obj.data = builder.record

    obj.log.info('PDF provided by user from %s', pdf_url)
    return obj.files[key].file.uri
def populate_arxiv_document(obj, eng):
    """Add the arXiv PDF link of the record to ``obj.data['documents']``.

    The URL is built from the ``ARXIV_PDF_URL`` application setting and
    the arXiv identifier obtained with ``get_arxiv_id(obj.data)``.

    Args:
        obj: object exposing ``data`` (the record) and ``log``.
        eng: not used by this function.

    Returns:
        None. When the page behind the URL contains ``NO_PDF_ON_ARXIV``
        the record is left untouched and only a log entry is written.

    Raises:
        DownloadError: if ``is_pdf_link`` rejects the URL and the page
            does not contain ``NO_PDF_ON_ARXIV``.
    """
    arxiv_id = get_arxiv_id(obj.data)
    pdf_url = current_app.config['ARXIV_PDF_URL'].format(arxiv_id=arxiv_id)

    if not is_pdf_link(pdf_url):
        if NO_PDF_ON_ARXIV not in requests.get(pdf_url).content:
            raise DownloadError(
                "{url} is not serving a PDF file.".format(url=pdf_url)
            )
        obj.log.info('No PDF is available for %s', arxiv_id)
        return

    key = secure_filename('{0}.pdf'.format(arxiv_id))

    # Discard existing document entries stored under the same key before
    # the new entry is added.
    remaining = []
    for document in obj.data.get('documents', ()):
        if document.get('key') != key:
            remaining.append(document)
    obj.data['documents'] = remaining

    builder = LiteratureBuilder(source='arxiv', record=obj.data)
    builder.add_document(
        key,
        fulltext=True,
        hidden=True,
        material='preprint',
        original_url=pdf_url,
        url=pdf_url,
    )
    obj.data = builder.record
def populate_arxiv_document(obj, eng):
    """Add the arXiv PDF link of the record to ``obj.data['documents']``.

    The URLs built from the ``ARXIV_PDF_URL`` and
    ``ARXIV_PDF_URL_ALTERNATIVE`` application settings are tried in that
    order; the first one accepted by ``is_pdf_link`` is used.

    Args:
        obj: object exposing ``data`` (the record) and ``log``.
        eng: not used by this function.

    Returns:
        None. When a rejected URL serves a page containing
        ``NO_PDF_ON_ARXIV`` the record is left untouched and only a log
        entry is written.

    Raises:
        DownloadError: if fetching a rejected URL raises a ``requests``
            exception, or if none of the URLs is accepted by
            ``is_pdf_link``.
    """
    arxiv_id = LiteratureReader(obj.data).arxiv_id

    for setting in ('ARXIV_PDF_URL', 'ARXIV_PDF_URL_ALTERNATIVE'):
        url = current_app.config[setting].format(arxiv_id=arxiv_id)
        if is_pdf_link(url):
            break
        try:
            page = requests.get(url).content
            if NO_PDF_ON_ARXIV in page:
                obj.log.info('No PDF is available for %s', arxiv_id)
                return
        except requests.exceptions.RequestException:
            raise DownloadError("Error accessing url {url}".format(url=url))
    else:
        # Only reached when no candidate was accepted; ``url`` still holds
        # the last candidate that was tried.
        raise DownloadError("{url} is not serving a PDF file.".format(url=url))

    key = secure_filename('{0}.pdf'.format(arxiv_id))

    # Discard existing document entries stored under the same key before
    # the new entry is added.
    remaining = []
    for document in obj.data.get('documents', ()):
        if document.get('key') != key:
            remaining.append(document)
    obj.data['documents'] = remaining

    builder = LiteratureBuilder(source='arxiv', record=obj.data)
    builder.add_document(
        key,
        fulltext=True,
        hidden=True,
        material='preprint',
        original_url=url,
        url=url,
    )
    obj.data = builder.record
def populate_arxiv_document(obj, eng):
    """Add the arXiv PDF link of the record to ``obj.data['documents']``.

    The URLs built from the ``ARXIV_PDF_URL`` and
    ``ARXIV_PDF_URL_ALTERNATIVE`` application settings are tried in that
    order; the first one accepted by ``is_pdf_link`` is used.

    Args:
        obj: object exposing ``data`` (the record) and ``log``.
        eng: not used by this function.

    Returns:
        None. When a rejected URL serves a page containing
        ``NO_PDF_ON_ARXIV`` the record is left untouched and only a log
        entry is written.

    Raises:
        DownloadError: if none of the URLs is accepted by ``is_pdf_link``.
    """
    arxiv_id = get_arxiv_id(obj.data)

    for setting in ('ARXIV_PDF_URL', 'ARXIV_PDF_URL_ALTERNATIVE'):
        url = current_app.config[setting].format(arxiv_id=arxiv_id)
        if is_pdf_link(url):
            break
        page = requests.get(url).content
        if NO_PDF_ON_ARXIV in page:
            obj.log.info('No PDF is available for %s', arxiv_id)
            return
    else:
        # Only reached when no candidate was accepted; ``url`` still holds
        # the last candidate that was tried.
        raise DownloadError("{url} is not serving a PDF file.".format(url=url))

    key = secure_filename('{0}.pdf'.format(arxiv_id))

    # Discard existing document entries stored under the same key before
    # the new entry is added.
    remaining = []
    for document in obj.data.get('documents', ()):
        if document.get('key') != key:
            remaining.append(document)
    obj.data['documents'] = remaining

    builder = LiteratureBuilder(source='arxiv', record=obj.data)
    builder.add_document(
        key,
        fulltext=True,
        hidden=True,
        material='preprint',
        original_url=url,
        url=url,
    )
    obj.data = builder.record
def add_document_or_figure(
    self,
    metadata,
    stream=None,
    is_document=True,
    file_name=None,
    key=None,
):
    """Attach a document or a figure to the record.

    Args:
        metadata(dict): fields of the document or figure, forwarded as
            keyword arguments to the builder. The dict is modified in
            place: its ``key`` and ``url`` entries are overwritten.
        stream(file like object): when given, it is stored in
            ``self.files`` under the resulting key.
        is_document(bool): ``True`` to add a document, ``False`` to add
            a figure.
        file_name(str): base name handed to
            ``self._get_unique_files_key`` when no ``key`` is given.
        key(str): key to use in ``self.files``; when given,
            ``file_name`` is ignored.

    Returns:
        dict: the ``metadata`` argument, including the ``key`` and
        ``url`` entries set here.

    Raises:
        TypeError: if neither ``file_name`` nor ``key`` is given.
    """
    if not (key or file_name):
        raise TypeError(
            'No file_name and no key passed, at least one of them is needed.'
        )

    key = key or self._get_unique_files_key(base_file_name=file_name)

    if stream is not None:
        self.files[key] = stream

    builder = LiteratureBuilder(record=self.to_dict())

    metadata['key'] = key
    metadata['url'] = '/api/files/{bucket}/{key}'.format(
        bucket=self.files[key].bucket_id,
        key=key,
    )

    add_entry = builder.add_document if is_document else builder.add_figure
    add_entry(**metadata)

    super(InspireRecord, self).update(builder.record)
    return metadata
def add_document_or_figure(
    self,
    metadata,
    stream=None,
    is_document=True,
    file_name=None,
    key=None,
):
    """Attach a document or a figure to the record.

    Args:
        metadata(dict): fields of the document or figure, forwarded as
            keyword arguments to the builder. The dict is modified in
            place: its ``key`` and ``url`` entries are overwritten.
        stream(file like object): when given, it is stored in
            ``self.files`` under the resulting key.
        is_document(bool): ``True`` to add a document, ``False`` to add
            a figure.
        file_name(str): base name handed to
            ``self._get_unique_files_key`` when no ``key`` is given.
        key(str): key to use in ``self.files``; when given,
            ``file_name`` is ignored.

    Returns:
        dict: the ``metadata`` argument, including the ``key`` and
        ``url`` entries set here.

    Raises:
        TypeError: if neither ``file_name`` nor ``key`` is given.
    """
    if not (key or file_name):
        raise TypeError(
            'No file_name and no key passed, at least one of them is needed.'
        )

    key = key or self._get_unique_files_key(base_file_name=file_name)

    if stream is not None:
        self.files[key] = stream

    builder = LiteratureBuilder(record=self.dumps())

    metadata['key'] = key
    metadata['url'] = '/api/files/{bucket}/{key}'.format(
        bucket=self.files[key].bucket_id,
        key=key,
    )

    add_entry = builder.add_document if is_document else builder.add_figure
    add_entry(**metadata)

    super(InspireRecord, self).update(builder.record)
    return metadata
def add_files(self, documents=None, figures=None):
    """Add documents and/or figures to this record.

    Each entry of ``documents`` and ``figures`` is a dict that is
    forwarded as keyword arguments to ``self._add_file``.

    When the ``FEATURE_FLAG_ENABLE_FILES`` application setting is not
    enabled (it defaults to ``False``), the given entries are appended
    unchanged to the record's ``figures`` / ``documents`` lists and
    nothing else happens.

    Args:
        documents (list[dict]): documents to add to this record.
        figures (list[dict]): figures to add to this record.

    Returns:
        list: the values returned by ``self._add_file`` for every given
        file, documents first; an empty list when the feature flag is
        not enabled.

    Raises:
        TypeError: if neither ``documents`` nor ``figures`` is given.
    """
    if not (documents or figures):
        raise TypeError("No files passed, at least one is needed")

    files_enabled = current_app.config.get("FEATURE_FLAG_ENABLE_FILES", False)
    if not files_enabled:
        for field, entries in (("figures", figures), ("documents", documents)):
            if entries:
                self.setdefault(field, []).extend(entries)
        return []

    added = []
    builder = LiteratureBuilder(record=self)

    if documents:
        # Keys already present on the record; a file whose key is listed
        # here is not handed to the builder again.
        known_keys = [entry["key"] for entry in self.get("documents", [])]
        for document in documents:
            metadata = self._add_file(document=True, **document)
            if metadata["key"] not in known_keys:
                builder.add_document(**metadata)
            added.append(metadata)

    if figures:
        known_keys = [entry["key"] for entry in self.get("figures", [])]
        for figure in figures:
            metadata = self._add_file(**figure)
            if metadata["key"] not in known_keys:
                builder.add_figure(**metadata)
            added.append(metadata)

    # FIXME: this is wrong every time it goes to the ``update`` function,
    # which means update refs, pidstore etc..
    super().update(builder.record.dumps())
    return added
def populate_submission_document(obj, eng):
    """Register the PDF link supplied with a submission as a document.

    The URL is read from ``obj.extra_data['submission_pdf']``. Nothing is
    done when that entry is missing/empty or ``is_pdf_link`` rejects it.
    The URL is stored as both ``url`` and ``original_url`` of the new
    document entry.

    Args:
        obj: object exposing ``extra_data`` and ``data`` (the record).
        eng: not used by this function.

    Returns:
        None.
    """
    pdf_url = obj.extra_data.get('submission_pdf')
    if not pdf_url or not is_pdf_link(pdf_url):
        return

    key = secure_filename('fulltext.pdf')

    # Discard existing document entries stored under the same key before
    # the new entry is added.
    remaining = []
    for document in obj.data.get('documents', ()):
        if document.get('key') != key:
            remaining.append(document)
    obj.data['documents'] = remaining

    builder = LiteratureBuilder(
        source=obj.data['acquisition_source']['source'],
        record=obj.data,
    )
    builder.add_document(
        key,
        fulltext=True,
        url=pdf_url,
        original_url=pdf_url,
    )
    obj.data = builder.record
def populate_submission_document(obj, eng):
    """Register the PDF link supplied with a submission as a document.

    The URL is read from ``obj.extra_data['submission_pdf']``. Nothing is
    done when that entry is missing/empty or ``is_pdf_link`` rejects it.
    The URL is stored as both ``url`` and ``original_url`` of the new
    document entry.

    Args:
        obj: object exposing ``extra_data`` and ``data`` (the record).
        eng: not used by this function.

    Returns:
        None.
    """
    link = obj.extra_data.get('submission_pdf')
    if not (link and is_pdf_link(link)):
        return

    document_key = secure_filename('fulltext.pdf')

    # Discard existing document entries stored under the same key before
    # the new entry is added.
    obj.data['documents'] = list(
        entry
        for entry in obj.data.get('documents', ())
        if entry.get('key') != document_key
    )

    source = obj.data['acquisition_source']['source']
    builder = LiteratureBuilder(source=source, record=obj.data)
    builder.add_document(
        document_key,
        fulltext=True,
        url=link,
        original_url=link,
    )
    obj.data = builder.record
def arxiv_fulltext_download(obj, eng):
    """Download the arXiv PDF of a record and register it as a document.

    The URL is built from the ``ARXIV_PDF_URL`` application setting. On a
    successful download the file is added to ``obj.data['documents']`` as
    a hidden fulltext preprint; otherwise an error is logged.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object (not used here)
    :raises DownloadError: if ``is_pdf_link`` rejects the URL and the
        page does not contain ``NO_PDF_ON_ARXIV``
    """
    arxiv_id = get_arxiv_id(obj.data)
    key = secure_filename('{0}.pdf'.format(arxiv_id))
    pdf_url = current_app.config['ARXIV_PDF_URL'].format(arxiv_id=arxiv_id)

    if not is_pdf_link(pdf_url):
        if NO_PDF_ON_ARXIV not in requests.get(pdf_url).content:
            raise DownloadError(
                "{url} is not serving a PDF file.".format(url=pdf_url)
            )
        obj.log.info('No PDF is available for %s', arxiv_id)
        return

    downloaded = download_file_to_workflow(
        workflow=obj,
        name=key,
        url=pdf_url,
    )
    if not downloaded:
        obj.log.error('Cannot retrieve PDF from arXiv for %s', arxiv_id)
        return

    # Discard existing document entries stored under the same key before
    # the new entry is added.
    remaining = []
    for document in obj.data.get('documents', ()):
        if document.get('key') != key:
            remaining.append(document)
    obj.data['documents'] = remaining

    builder = LiteratureBuilder(source='arxiv', record=obj.data)
    file_url = '/api/files/{bucket}/{key}'.format(
        bucket=obj.files[key].bucket_id,
        key=key,
    )
    builder.add_document(
        key,
        fulltext=True,
        hidden=True,
        material='preprint',
        original_url=pdf_url,
        url=file_url,
    )
    obj.data = builder.record

    obj.log.info('PDF retrieved from arXiv for %s', arxiv_id)