def submission_fulltext_download(obj, eng):
    submission_pdf = obj.extra_data.get('submission_pdf')
    if submission_pdf and is_pdf_link(submission_pdf):
        filename = secure_filename('fulltext.pdf')
        pdf = download_file_to_workflow(
            workflow=obj,
            name=filename,
            url=submission_pdf,
        )

        if pdf:
            obj.data['documents'] = [
                document for document in obj.data.get('documents', ())
                if document.get('key') != filename
            ]
            lb = LiteratureBuilder(
                source=obj.data['acquisition_source']['source'],
                record=obj.data,
            )
            lb.add_document(
                filename,
                fulltext=True,
                original_url=submission_pdf,
                url='/api/files/{bucket}/{key}'.format(
                    bucket=obj.files[filename].bucket_id,
                    key=filename,
                ),
            )
            obj.data = lb.record
            obj.log.info('PDF provided by user from %s', submission_pdf)
            return obj.files[filename].file.uri
        else:
            obj.log.info('Cannot fetch PDF provided by user from %s', submission_pdf)
def populate_arxiv_document(obj, eng):
    arxiv_id = LiteratureReader(obj.data).arxiv_id

    for conf_name in ('ARXIV_PDF_URL', 'ARXIV_PDF_URL_ALTERNATIVE'):
        url = current_app.config[conf_name].format(arxiv_id=arxiv_id)
        is_valid_pdf_link = is_pdf_link(url)
        if is_valid_pdf_link:
            break
        try:
            if NO_PDF_ON_ARXIV in requests.get(url).content:
                obj.log.info('No PDF is available for %s', arxiv_id)
                return
        except requests.exceptions.RequestException:
            raise DownloadError("Error accessing url {url}".format(url=url))

    if not is_valid_pdf_link:
        raise DownloadError("{url} is not serving a PDF file.".format(url=url))

    filename = secure_filename('{0}.pdf'.format(arxiv_id))
    obj.data['documents'] = [
        document for document in obj.data.get('documents', ())
        if document.get('key') != filename
    ]
    lb = LiteratureBuilder(source='arxiv', record=obj.data)
    lb.add_document(
        filename,
        fulltext=True,
        hidden=True,
        material='preprint',
        original_url=url,
        url=url,
    )
    obj.data = lb.record
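# For context: populate_arxiv_document above reads two URL templates from the
# Flask config and formats them with the arXiv identifier. A minimal sketch of
# what such config entries could look like; the concrete URLs are an
# assumption for illustration, not taken from this code:
#
#     ARXIV_PDF_URL = 'https://export.arxiv.org/pdf/{arxiv_id}'
#     ARXIV_PDF_URL_ALTERNATIVE = 'https://arxiv.org/pdf/{arxiv_id}'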
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    arxiv_id = get_arxiv_id(obj.data)
    filename = secure_filename('{0}.tar.gz'.format(arxiv_id))
    tarball = obj.files[filename]

    if tarball:
        with TemporaryDirectory(prefix='plot_extract') as scratch_space:
            tarball_file = retrieve_uri(tarball.file.uri, outdir=scratch_space)
            try:
                plots = process_tarball(
                    tarball_file,
                    output_directory=scratch_space,
                )
            except (InvalidTarball, NoTexFilesFound):
                obj.log.info(
                    'Invalid tarball %s for arxiv_id %s',
                    tarball.file.uri,
                    arxiv_id,
                )
                return
            except DelegateError as err:
                obj.log.error(
                    'Error extracting plots for %s. Report and skip.',
                    arxiv_id,
                )
                current_app.logger.exception(err)
                return

            if 'figures' in obj.data:
                for figure in obj.data['figures']:
                    if figure['key'] in obj.files:
                        del obj.files[figure['key']]
                del obj.data['figures']

            lb = LiteratureBuilder(source='arxiv', record=obj.data)
            for index, plot in enumerate(plots):
                plot_name = os.path.basename(plot.get('url'))
                key = plot_name
                if plot_name in obj.files.keys:
                    key = '{number}_{name}'.format(number=index, name=plot_name)
                with open(plot.get('url')) as plot_file:
                    obj.files[key] = plot_file

                lb.add_figure(
                    key=key,
                    caption=''.join(plot.get('captions', [])),
                    label=plot.get('label'),
                    material='preprint',
                    url='/api/files/{bucket}/{key}'.format(
                        bucket=obj.files[key].bucket_id,
                        key=key,
                    ),
                )

            obj.data = lb.record
            obj.log.info('Added {0} plots.'.format(len(plots)))
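# The (obj, eng) signature used above is the standard invenio-workflows task
# interface, so tasks like these are meant to be composed into a workflow
# definition. A minimal sketch under that assumption; the ``Article`` class
# name and the task order are illustrative, not taken from this code:
class Article(object):
    """Hypothetical ingestion workflow chaining the arXiv tasks above."""
    workflow = [
        populate_arxiv_document,  # attach the PDF document metadata
        arxiv_plot_extract,       # extract figures from the source tarball
    ]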
def populate_arxiv_document(obj, eng):
    arxiv_id = get_arxiv_id(obj.data)
    url = current_app.config['ARXIV_PDF_URL'].format(arxiv_id=arxiv_id)

    if not is_pdf_link(url):
        if NO_PDF_ON_ARXIV in requests.get(url).content:
            obj.log.info('No PDF is available for %s', arxiv_id)
            return
        raise DownloadError("{url} is not serving a PDF file.".format(url=url))

    filename = secure_filename('{0}.pdf'.format(arxiv_id))
    obj.data['documents'] = [
        document for document in obj.data.get('documents', ())
        if document.get('key') != filename
    ]
    lb = LiteratureBuilder(source='arxiv', record=obj.data)
    lb.add_document(
        filename,
        fulltext=True,
        hidden=True,
        material='preprint',
        original_url=url,
        url=url,
    )
    obj.data = lb.record
def populate_arxiv_document(obj, eng):
    arxiv_id = get_arxiv_id(obj.data)

    for conf_name in ('ARXIV_PDF_URL', 'ARXIV_PDF_URL_ALTERNATIVE'):
        url = current_app.config[conf_name].format(arxiv_id=arxiv_id)
        is_valid_pdf_link = is_pdf_link(url)
        if is_valid_pdf_link:
            break
        if NO_PDF_ON_ARXIV in requests.get(url).content:
            obj.log.info('No PDF is available for %s', arxiv_id)
            return

    if not is_valid_pdf_link:
        raise DownloadError("{url} is not serving a PDF file.".format(url=url))

    filename = secure_filename('{0}.pdf'.format(arxiv_id))
    obj.data['documents'] = [
        document for document in obj.data.get('documents', ())
        if document.get('key') != filename
    ]
    lb = LiteratureBuilder(source='arxiv', record=obj.data)
    lb.add_document(
        filename,
        fulltext=True,
        hidden=True,
        material='preprint',
        original_url=url,
        url=url,
    )
    obj.data = lb.record
def add_document_or_figure(
    self,
    metadata,
    stream=None,
    is_document=True,
    file_name=None,
    key=None,
):
    """Add a document or figure to the record.

    Args:
        metadata(dict): metadata of the document or figure, see the
            schemas for more details, will be validated.
        stream(file like object): if passed, will extract the file
            contents from it.
        is_document(bool): if the given information is for a document,
            set to ``False`` for a figure.
        file_name(str): name of the file, used as a basis of the key for
            the files store.
        key(str): if passed, will use this as the key for the files store
            and ignore ``file_name``, use it to overwrite existing keys.

    Returns:
        dict: metadata of the added document or figure.

    Raises:
        TypeError: if neither ``file_name`` nor ``key`` is passed (one of
            them is required).
    """
    if not key and not file_name:
        raise TypeError(
            'No file_name and no key passed, at least one of them is '
            'needed.'
        )

    if not key:
        key = self._get_unique_files_key(base_file_name=file_name)

    if stream is not None:
        self.files[key] = stream

    builder = LiteratureBuilder(record=self.to_dict())
    metadata['key'] = key
    metadata['url'] = '/api/files/{bucket}/{key}'.format(
        bucket=self.files[key].bucket_id,
        key=key,
    )
    if is_document:
        builder.add_document(**metadata)
    else:
        builder.add_figure(**metadata)

    super(InspireRecord, self).update(builder.record)
    return metadata
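# A minimal usage sketch for add_document_or_figure above, assuming an
# InspireRecord instance and a local PDF; the path, URL, and metadata values
# are illustrative only. The ``fulltext`` and ``original_url`` metadata keys
# mirror the add_document calls elsewhere in this section:
def attach_fulltext(record):
    with open('/tmp/fulltext.pdf', 'rb') as stream:
        # The key is derived from file_name via _get_unique_files_key when
        # no explicit key is passed.
        return record.add_document_or_figure(
            metadata={
                'fulltext': True,
                'original_url': 'https://example.org/fulltext.pdf',
            },
            stream=stream,
            is_document=True,
            file_name='fulltext.pdf',
        )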
def populate_submission_document(obj, eng):
    submission_pdf = obj.extra_data.get('submission_pdf')
    if submission_pdf and is_pdf_link(submission_pdf):
        filename = secure_filename('fulltext.pdf')
        obj.data['documents'] = [
            document for document in obj.data.get('documents', ())
            if document.get('key') != filename
        ]
        lb = LiteratureBuilder(
            source=obj.data['acquisition_source']['source'],
            record=obj.data,
        )
        lb.add_document(
            filename,
            fulltext=True,
            url=submission_pdf,
            original_url=submission_pdf,
        )
        obj.data = lb.record
def to_hep(self, source):
    """Get an output-ready, HEP-formatted record from the given
    :class:`hepcrawl.utils.ParsedItem`, whatever format its record
    might be.

    Args:
        source(str): string identifying the source for this item
            (ex. 'arXiv').

    Returns:
        hepcrawl.utils.ParsedItem: the new item, with the internal record
            formatted as a hep record.

    Raises:
        UnknownItemFormat: if the source item format is unknown.
    """
    builder = LiteratureBuilder(source=source)

    builder.add_acquisition_source(
        source=source,
        method='hepcrawl',
        date=datetime.datetime.now().isoformat(),
        submission_number=os.environ.get('SCRAPY_JOB', ''),
    )

    self.record['acquisition_source'] = builder.record['acquisition_source']

    if self.record_format == 'hep':
        record = hep_to_hep(
            hep_record=self.record,
            record_files=self.record_files,
        )
        for document in record.get('documents', []):
            if 'old_url' in document and 'original_url' not in document:
                document['original_url'] = document['old_url']
                del document['old_url']
        return record
    elif self.record_format == 'hepcrawl':
        record = _normalize_hepcrawl_record(
            item=self.record,
            source=source,
        )
        return hepcrawl_to_hep(dict(record))
    else:
        raise UnknownItemFormat(
            'Unknown ParsedItem::{}'.format(self.record_format)
        )
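# For reference: to_hep above always stamps an ``acquisition_source`` on the
# record before converting it. Based on the add_acquisition_source call, the
# resulting structure should look roughly like this (the concrete values are
# illustrative assumptions):
#
#     {
#         'source': 'arXiv',
#         'method': 'hepcrawl',
#         'date': '2019-01-01T00:00:00',
#         'submission_number': '12345',
#     }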
def assign_conference(record, conference_ref, cnum):
    builder = LiteratureBuilder(record=record)

    if not cnum:
        raise MissingArgumentError("cnum is required.")
    if not conference_ref:
        raise MissingArgumentError("$ref is required.")

    if not {"proceedings", "conference paper"}.intersection(
        record.get_value("document_type")
    ):
        builder.add_document_type("conference paper")

    if conference_ref not in builder.record.get_value(
        "publication_info.conference_record", []
    ) and cnum not in builder.record.get_value("publication_info.cnum", []):
        builder.add_publication_info(cnum=cnum, conference_record=conference_ref)
        LOGGER.info(
            "Assigning conference to record",
            recid=record.control_number,
            conference_ref=conference_ref,
            cnum=cnum,
        )
    else:
        for idx, publication_info_element in enumerate(
            builder.record.get_value("publication_info")
        ):
            record_conference_ref = publication_info_element.get(
                "conference_record", {}
            )
            record_cnum = publication_info_element.get("cnum", "")
            if conference_ref == record_conference_ref and cnum == record_cnum:
                LOGGER.warning(
                    "Conference already assigned to record",
                    recid=record.control_number,
                    conference_ref=conference_ref,
                    cnum=cnum,
                )
            elif conference_ref == record_conference_ref:
                builder.record["publication_info"][idx]["cnum"] = cnum
                LOGGER.warning(
                    "conference ref already assigned to paper without cnum.",
                    recid=record.control_number,
                    conference_ref=conference_ref,
                    cnum=cnum,
                )
            elif cnum == record_cnum:
                builder.record["publication_info"][idx][
                    "conference_record"
                ] = conference_ref
                LOGGER.warning(
                    "conference cnum already assigned to paper without ref.",
                    recid=record.control_number,
                    conference_ref=conference_ref,
                    cnum=cnum,
                )

    return dict(builder.record)
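# A minimal usage sketch for assign_conference above; ``record`` must expose
# get_value and control_number (an inspirehep record object). Judging by the
# "$ref is required." error and the {} default for conference_record, the
# reference is presumably a JSON-reference dict; all values below are
# illustrative assumptions:
#
#     updated = assign_conference(
#         record,
#         conference_ref={'$ref': 'https://inspirehep.net/api/conferences/1234567'},
#         cnum='C05-09-16.3',
#     )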
def process_cds_record(cds_record):
    control_numbers = get_value(cds_record, "metadata.other_ids", [])
    arxivs = get_value(cds_record, "metadata.eprints", [])
    dois = get_value(cds_record, "metadata.dois.value", [])
    report_numbers = get_value(cds_record, "metadata.report_numbers.value", [])
    cds_id = cds_record.get("id") or get_value(
        cds_record, "metadata.control_number", []
    )
    if not cds_id:
        LOGGER.info(
            "Cannot extract CDS id from CDS response",
            cds_data=cds_record,
        )
        return

    record = get_record_for_provided_ids(
        control_numbers, arxivs, dois, report_numbers
    )
    if not record:
        LOGGER.warning(
            "Cannot find record with any of the provided IDS",
            control_numbers=control_numbers,
            arxivs=arxivs,
            dois=dois,
            report_numbers=report_numbers,
        )
        return None

    control_number = record.control_number
    ids = record.get("external_system_identifiers", [])
    values = get_values_for_schema(ids, "CDS")
    if cds_id in values:
        LOGGER.info(
            "Correct CDS identifier is already present in the record",
            recid=control_number,
            cds_id=cds_id,
        )
        return

    builder = LiteratureBuilder(record=record)
    builder.add_external_system_identifier(cds_id, "CDS")
    data = dict(builder.record)
    record.update(data)
def add_figures(self, figures):
    builder = LiteratureBuilder()
    with ThreadPoolExecutor(
        max_workers=current_app.config.get("FILES_MAX_UPLOAD_THREADS", 5)
    ) as executor:
        tasks = self._generate_and_submit_files_tasks(figures, executor)
        processed_figures = self._process_tasks_results(tasks, figures)
    if processed_figures:
        for document in processed_figures:
            self._update_file_entry(document, builder.add_figure)
    return builder.record.get("figures")
def arxiv_fulltext_download(obj, eng):
    """Perform the fulltext download step for arXiv records.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    arxiv_id = get_arxiv_id(obj.data)
    filename = secure_filename('{0}.pdf'.format(arxiv_id))
    url = current_app.config['ARXIV_PDF_URL'].format(arxiv_id=arxiv_id)

    if not is_pdf_link(url):
        if NO_PDF_ON_ARXIV in requests.get(url).content:
            obj.log.info('No PDF is available for %s', arxiv_id)
            return
        raise DownloadError("{url} is not serving a PDF file.".format(url=url))

    pdf = download_file_to_workflow(
        workflow=obj,
        name=filename,
        url=url,
    )

    if pdf:
        obj.data['documents'] = [
            document for document in obj.data.get('documents', ())
            if document.get('key') != filename
        ]
        lb = LiteratureBuilder(source='arxiv', record=obj.data)
        lb.add_document(
            filename,
            fulltext=True,
            hidden=True,
            material='preprint',
            original_url=url,
            url='/api/files/{bucket}/{key}'.format(
                bucket=obj.files[filename].bucket_id,
                key=filename,
            ),
        )
        obj.data = lb.record
        obj.log.info('PDF retrieved from arXiv for %s', arxiv_id)
    else:
        obj.log.error('Cannot retrieve PDF from arXiv for %s', arxiv_id)
def add_document_or_figure(
    self,
    metadata,
    stream=None,
    is_document=True,
    file_name=None,
    key=None,
):
    """Add a document or figure to the record.

    Args:
        metadata(dict): metadata of the document or figure, see the
            schemas for more details, will be validated.
        stream(file like object): if passed, will extract the file
            contents from it.
        is_document(bool): if the given information is for a document,
            set to ``False`` for a figure.
        file_name(str): name of the file, used as a basis of the key for
            the files store.
        key(str): if passed, will use this as the key for the files store
            and ignore ``file_name``, use it to overwrite existing keys.

    Returns:
        dict: metadata of the added document or figure.

    Raises:
        TypeError: if neither ``file_name`` nor ``key`` is passed (one of
            them is required).
    """
    if not key and not file_name:
        raise TypeError(
            'No file_name and no key passed, at least one of them is '
            'needed.'
        )

    if not key:
        key = self._get_unique_files_key(base_file_name=file_name)

    if stream is not None:
        self.files[key] = stream

    builder = LiteratureBuilder(record=self.dumps())
    metadata['key'] = key
    metadata['url'] = '/api/files/{bucket}/{key}'.format(
        bucket=self.files[key].bucket_id,
        key=key,
    )
    if is_document:
        builder.add_document(**metadata)
    else:
        builder.add_figure(**metadata)

    super(InspireRecord, self).update(builder.record)
    return metadata
def test_append_to():
    formdata = ''
    builder = LiteratureBuilder("test")
    expected_result = None
    builder._append_to('test_field', formdata)
    assert builder.record.get('test_field') is expected_result

    formdata = 'value'
    expected_result = ['value']
    builder._append_to('test_field_2', formdata)
    assert builder.record.get('test_field_2') == expected_result
def add_files(self, documents=None, figures=None):
    """Public method for adding documents and figures.

    Args:
        documents (list[dict]): list of documents which should be added
            to this record.
        figures (list[dict]): list of figures which should be added to
            this record.

    Documents and figures are lists of dicts. The minimal dict that
    should be provided for each file is:
        {
            'url': 'http://... or /api/file/bucket_id/file_key',
            'is_document': True or False (default),
        }

    Returns:
        list: list of added keys
    """
    if not documents and not figures:
        raise TypeError("No files passed, at least one is needed")

    if not current_app.config.get("FEATURE_FLAG_ENABLE_FILES", False):
        if figures:
            self.setdefault("figures", []).extend(figures)
        if documents:
            self.setdefault("documents", []).extend(documents)
        return []

    files = []
    builder = LiteratureBuilder(record=self)
    if documents:
        doc_keys = [
            doc_metadata["key"] for doc_metadata in self.get("documents", [])
        ]
        for doc in documents:
            metadata = self._add_file(document=True, **doc)
            if metadata["key"] not in doc_keys:
                builder.add_document(**metadata)
            files.append(metadata)
    if figures:
        fig_keys = [
            fig_metadata["key"] for fig_metadata in self.get("figures", [])
        ]
        for fig in figures:
            metadata = self._add_file(**fig)
            if metadata["key"] not in fig_keys:
                builder.add_figure(**metadata)
            files.append(metadata)
    # FIXME: this is wrong: every time it goes through the ``update``
    # function, which means updating refs, pidstore, etc.
    super().update(builder.record.dumps())
    return files
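# A minimal usage sketch for add_files above, following the dict shape given
# in its docstring; the URLs are illustrative assumptions. Any extra keys in
# each dict are forwarded to _add_file as keyword arguments:
#
#     added = record.add_files(
#         documents=[{'url': 'https://example.org/files/fulltext.pdf'}],
#         figures=[{'url': '/api/file/<bucket_id>/figure1.png'}],
#     )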
def add_documents(self, documents):
    builder = LiteratureBuilder()
    documents_hidden = list(filter(lambda x: x.get("hidden", False), documents))
    documents_to_process = list(
        filter(lambda x: not x.get("hidden", False), documents)
    )
    for document in documents_hidden:
        self._update_file_entry(document, builder.add_document)
    with ThreadPoolExecutor(
        max_workers=current_app.config.get("FILES_MAX_UPLOAD_THREADS", 5)
    ) as executor:
        tasks = self._generate_and_submit_files_tasks(
            documents_to_process, executor
        )
        processed_documents = self._process_tasks_results(
            tasks, documents_to_process
        )
    if processed_documents:
        for document in processed_documents:
            self._update_file_entry(document, builder.add_document)
    return builder.record.get("documents")
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    # Crude way to set memory limits for wand globally.
    mem_limit = current_app.config.get("WAND_MEMORY_LIMIT")
    if mem_limit and limits['memory'] != mem_limit:
        limits['memory'] = mem_limit
        # This sets the disk limit; if not set, wand will swap data to disk
        # instead of throwing an exception.
        limits['disk'] = current_app.config.get("WAND_DISK_LIMIT", 0)
        # An exception is thrown when the memory and disk limits are
        # exceeded. At least the workflow status will be saved.

    arxiv_id = LiteratureReader(obj.data).arxiv_id
    filename = secure_filename('{0}.tar.gz'.format(arxiv_id))
    try:
        tarball = obj.files[filename]
    except KeyError:
        obj.log.info('No file named=%s for arxiv_id %s', filename, arxiv_id)
        return

    with TemporaryDirectory(prefix='plot_extract') as scratch_space, \
            retrieve_uri(tarball.file.uri, outdir=scratch_space) as tarball_file:
        try:
            plots = process_tarball(
                tarball_file,
                output_directory=scratch_space,
            )
        except (InvalidTarball, NoTexFilesFound):
            obj.log.info(
                'Invalid tarball %s for arxiv_id %s',
                tarball.file.uri,
                arxiv_id,
            )
            return
        except DelegateError as err:
            obj.log.error(
                'Error extracting plots for %s. Report and skip.',
                arxiv_id,
            )
            current_app.logger.exception(err)
            return

        if 'figures' in obj.data:
            for figure in obj.data['figures']:
                if figure['key'] in obj.files:
                    del obj.files[figure['key']]
            del obj.data['figures']

        lb = LiteratureBuilder(source='arxiv', record=obj.data)
        for index, plot in enumerate(plots):
            plot_name = os.path.basename(plot.get('url'))
            key = plot_name
            if plot_name in obj.files.keys:
                key = 'w{number}_{name}'.format(
                    number=index,
                    name=plot_name,
                )
            with open(plot.get('url')) as plot_file:
                obj.files[key] = plot_file

            lb.add_figure(
                key=key,
                caption=''.join(plot.get('captions', [])),
                label=plot.get('label'),
                material='preprint',
                url='/api/files/{bucket}/{key}'.format(
                    bucket=obj.files[key].bucket_id,
                    key=key,
                ),
            )

        obj.data = lb.record
        obj.log.info('Added {0} plots.'.format(len(plots)))
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    arxiv_id = get_arxiv_id(obj.data)
    filename = secure_filename('{0}.tar.gz'.format(arxiv_id))
    tarball = obj.files[filename]

    if tarball:
        with TemporaryDirectory(prefix='plot_extract') as scratch_space, \
                retrieve_uri(tarball.file.uri, outdir=scratch_space) as tarball_file:
            try:
                plots = process_tarball(
                    tarball_file,
                    output_directory=scratch_space,
                )
            except (InvalidTarball, NoTexFilesFound):
                obj.log.info(
                    'Invalid tarball %s for arxiv_id %s',
                    tarball.file.uri,
                    arxiv_id,
                )
                return
            except DelegateError as err:
                obj.log.error(
                    'Error extracting plots for %s. Report and skip.',
                    arxiv_id,
                )
                current_app.logger.exception(err)
                return

            if 'figures' in obj.data:
                for figure in obj.data['figures']:
                    if figure['key'] in obj.files:
                        del obj.files[figure['key']]
                del obj.data['figures']

            lb = LiteratureBuilder(source='arxiv', record=obj.data)
            for index, plot in enumerate(plots):
                plot_name = os.path.basename(plot.get('url'))
                key = plot_name
                if plot_name in obj.files.keys:
                    key = 'w{number}_{name}'.format(
                        number=index,
                        name=plot_name,
                    )
                with open(plot.get('url')) as plot_file:
                    obj.files[key] = plot_file

                lb.add_figure(
                    key=key,
                    caption=''.join(plot.get('captions', [])),
                    label=plot.get('label'),
                    material='preprint',
                    url='/api/files/{bucket}/{key}'.format(
                        bucket=obj.files[key].bucket_id,
                        key=key,
                    ),
                )

            obj.data = lb.record
            obj.log.info('Added {0} plots.'.format(len(plots)))