def arxiv_fft_get(obj, eng):
    """Get FFT from arXiv, if arXiv ID is provided.

    Downloads the arXiv PDF for the deposition's latest (unsealed) SIP into
    a shared temporary file and attaches it to the deposition.

    :param obj: workflow object; ``obj.log`` is used for error reporting.
    :param eng: workflow engine (unused here, required by the workflow API).
    """
    deposition = Deposition(obj)
    sip = deposition.get_latest_sip(sealed=False)
    metadata = sip.metadata
    if metadata.get('arxiv_id'):
        # Template resolves to e.g. "http://arxiv.org/pdf/1111.2222.pdf".
        arxiv_pdf_url = cfg.get("ARXIV_PDF_URL", "http://arxiv.org/pdf/") + \
            "{0}.{1}"
        from invenio.config import CFG_TMPSHAREDDIR
        # "/" in old-style arXiv IDs (e.g. "hep-th/9901001") is not a valid
        # filename character, hence the replacement.
        arxiv_file, arxiv_file_path = mkstemp(
            prefix="%s_" % (metadata['arxiv_id'].replace("/", "_")),
            suffix='.pdf',
            dir=CFG_TMPSHAREDDIR,
        )
        # mkstemp returns an open OS-level handle; close it so only the
        # downloader writes to the file.
        os.close(arxiv_file)
        download_url(url=arxiv_pdf_url.format(metadata['arxiv_id'], "pdf"),
                     content_type="pdf",
                     download_to_file=arxiv_file_path)
        # To get 1111.2222.pdf as filename.
        filename = "{0}.pdf".format(metadata['arxiv_id'].replace("/", "_"))
        try:
            save_deposition_file(deposition, filename, arxiv_file_path)
        except FilenameAlreadyExists:
            obj.log.error("PDF file not saved: filename already exists.")
        except Exception as e:
            # BUGFIX: ``e.message`` does not exist on Python 3 exceptions
            # (and was deprecated in 2.6); ``str(e)`` is the portable form.
            obj.log.error("PDF file not saved: {}.".format(str(e)))
def test_content_type(self):
    """Test simple calls to download_url.

    Checks that an HTML page downloads when ``content_type="html"`` and
    that requesting it as ``"pdf"`` raises ``InvenioFileDownloadError``.
    """
    from invenio_utils.filedownload import (download_url,
                                            InvenioFileDownloadError)
    tmpdoc = download_url("http://duckduckgo.com", content_type="html")
    self.assertTrue(tmpdoc)
    # assertRaises accepts the callable plus its arguments directly; no
    # need for an assigned lambda (PEP 8 E731).
    self.assertRaises(
        InvenioFileDownloadError,
        download_url,
        "http://google.com",
        content_type="pdf",
    )
def output_keywords_for_sources(input_sources,
                                taxonomy_name,
                                output_mode="text",
                                output_limit=None,
                                spires=False,
                                match_mode="full",
                                no_cache=False,
                                with_author_keywords=False,
                                rebuild_cache=False,
                                only_core_tags=False,
                                extract_acronyms=False,
                                **kwargs):
    """Output the keywords for each source in sources.

    Each entry of ``input_sources`` may be a directory (every regular,
    non-hidden file inside is processed), a single file, or — failing
    both — a URL that is downloaded first.

    :param input_sources: iterable of paths or URLs to extract keywords from.
    :param taxonomy_name: taxonomy passed through to keyword extraction.
    :param output_limit: maximum number of keywords; ``None`` means use
        ``cfg['CLASSIFIER_DEFAULT_OUTPUT_NUMBER']``, resolved at call time.
    """
    if output_limit is None:
        # BUGFIX: previously the default was ``cfg[...]`` in the signature,
        # which reads the application config at import time; resolve it
        # lazily instead (consistent with the sibling implementation below).
        output_limit = cfg['CLASSIFIER_DEFAULT_OUTPUT_NUMBER']

    # Inner function which does the job and it would be too much work to
    # refactor the call (and it must be outside the loop, before it did
    # not process multiple files)
    def process_lines():
        if output_mode == "text":
            print("Input file: %s" % source)

        line_nb = len(text_lines)
        word_nb = sum(len(re.findall(r"\S+", line)) for line in text_lines)
        # BUGFIX: the message used %-placeholders with str.format(), which
        # left them unexpanded; logging's lazy %-args expand them properly.
        current_app.logger.info(
            "Remote file has %d lines and %d words.", line_nb, word_nb)

        return get_keywords_from_text(
            text_lines,
            taxonomy_name,
            output_mode=output_mode,
            output_limit=output_limit,
            spires=spires,
            match_mode=match_mode,
            no_cache=no_cache,
            with_author_keywords=with_author_keywords,
            rebuild_cache=rebuild_cache,
            only_core_tags=only_core_tags,
            extract_acronyms=extract_acronyms
        )

    # Get the fulltext for each source.
    for entry in input_sources:
        current_app.logger.info("Trying to read input file %s." % entry)
        text_lines = None
        source = ""
        if os.path.isdir(entry):
            for filename in os.listdir(entry):
                if filename.startswith('.'):
                    continue
                filename = os.path.join(entry, filename)
                if os.path.isfile(filename):
                    text_lines, dummy = get_plaintext_document_body(filename)
                    if text_lines:
                        source = filename
                        process_lines()
        elif os.path.isfile(entry):
            text_lines, dummy = get_plaintext_document_body(entry)
            if text_lines:
                source = os.path.basename(entry)
                process_lines()
        else:
            # Treat as a URL.
            local_file = download_url(entry)
            text_lines, dummy = get_plaintext_document_body(local_file)
            if text_lines:
                source = entry.split("/")[-1]
                process_lines()
def output_keywords_for_sources(input_sources,
                                taxonomy_name,
                                output_mode="text",
                                output_limit=None,
                                spires=False,
                                match_mode="full",
                                no_cache=False,
                                with_author_keywords=False,
                                rebuild_cache=False,
                                only_core_tags=False,
                                extract_acronyms=False,
                                **kwargs):
    """Output the keywords for each source in sources.

    Each entry of ``input_sources`` may be a directory (every regular,
    non-hidden file inside is processed), a single file, or — failing
    both — a URL that is downloaded first.

    :param input_sources: iterable of paths or URLs to extract keywords from.
    :param taxonomy_name: taxonomy passed through to keyword extraction.
    :param output_limit: maximum number of keywords; ``None`` means use
        ``cfg['CLASSIFIER_DEFAULT_OUTPUT_NUMBER']``, resolved at call time.
    """
    if output_limit is None:
        # Resolve the default at call time so the application config does
        # not need to exist when this module is imported.
        output_limit = cfg['CLASSIFIER_DEFAULT_OUTPUT_NUMBER']

    # Inner function which does the job and it would be too much work to
    # refactor the call (and it must be outside the loop, before it did
    # not process multiple files)
    def process_lines():
        if output_mode == "text":
            print("Input file: %s" % source)

        line_nb = len(text_lines)
        word_nb = sum(len(re.findall(r"\S+", line)) for line in text_lines)
        # BUGFIX: the message used %-placeholders with str.format(), which
        # left them unexpanded; logging's lazy %-args expand them properly.
        current_app.logger.info(
            "Remote file has %d lines and %d words.", line_nb, word_nb)

        return get_keywords_from_text(
            text_lines,
            taxonomy_name,
            output_mode=output_mode,
            output_limit=output_limit,
            spires=spires,
            match_mode=match_mode,
            no_cache=no_cache,
            with_author_keywords=with_author_keywords,
            rebuild_cache=rebuild_cache,
            only_core_tags=only_core_tags,
            extract_acronyms=extract_acronyms)

    # Get the fulltext for each source.
    for entry in input_sources:
        current_app.logger.info("Trying to read input file %s." % entry)
        text_lines = None
        source = ""
        if os.path.isdir(entry):
            for filename in os.listdir(entry):
                if filename.startswith('.'):
                    continue
                filename = os.path.join(entry, filename)
                if os.path.isfile(filename):
                    text_lines, dummy = get_plaintext_document_body(filename)
                    if text_lines:
                        source = filename
                        process_lines()
        elif os.path.isfile(entry):
            text_lines, dummy = get_plaintext_document_body(entry)
            if text_lines:
                source = os.path.basename(entry)
                process_lines()
        else:
            # Treat as a URL.
            from invenio_utils.filedownload import download_url
            local_file = download_url(entry)
            text_lines, dummy = get_plaintext_document_body(local_file)
            if text_lines:
                source = entry.split("/")[-1]
                process_lines()