def arxiv_fft_get(obj, eng):
    """Get FFT from arXiv, if arXiv ID is provided."""
    deposition = Deposition(obj)
    sip = deposition.get_latest_sip(sealed=False)
    metadata = sip.metadata

    if 'arxiv_id' in metadata and metadata['arxiv_id']:
        arxiv_pdf_url = cfg.get("ARXIV_PDF_URL",
                                "http://arxiv.org/pdf/") + "{0}.{1}"

        from invenio.config import CFG_TMPSHAREDDIR
        arxiv_file, arxiv_file_path = mkstemp(
            prefix="%s_" % (metadata['arxiv_id'].replace("/", "_")),
            suffix='.pdf',
            dir=CFG_TMPSHAREDDIR,
        )
        os.close(arxiv_file)

        download_url(url=arxiv_pdf_url.format(metadata['arxiv_id'], "pdf"),
                     content_type="pdf",
                     download_to_file=arxiv_file_path)

        # To get 1111.2222.pdf as filename.
        filename = "{0}.pdf".format(metadata['arxiv_id'].replace("/", "_"))
        try:
            try:
                save_deposition_file(deposition, filename, arxiv_file_path)
            except FilenameAlreadyExists:
                obj.log.error("PDF file not saved: filename already exists.")
        except Exception as e:
            # Format the exception itself: e.message is gone in Python 3.
            obj.log.error("PDF file not saved: {0}.".format(e))
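
# The download-to-temp-file pattern above, as a minimal standalone sketch.
# It assumes only `download_url` from invenio.utils.filedownload (used
# throughout this file); the arXiv ID, URL template and temp directory are
# illustrative placeholders, not values taken from the code above.
import os
from tempfile import mkstemp

from invenio.utils.filedownload import download_url


def fetch_arxiv_pdf_to_tmp(arxiv_id, tmp_dir="/tmp"):
    """Download the PDF for ``arxiv_id`` into a fresh temp file, return its path."""
    # Reserve a unique file name first, then close the handle so that
    # download_url can open the path itself.
    fd, path = mkstemp(prefix="%s_" % arxiv_id.replace("/", "_"),
                       suffix=".pdf", dir=tmp_dir)
    os.close(fd)
    return download_url(url="http://arxiv.org/pdf/{0}.pdf".format(arxiv_id),
                        content_type="pdf",
                        download_to_file=path)
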
def old_URL_harvest(from_date, to_date, to_dir, area):
    """
    Grab all the PDFs and tarballs off arXiv between from_date and to_date,
    where from_date and to_date are in YYMM form, and put them in their own
    separate folders inside of to_dir.  Folder hierarchy will be
        to_dir/YYYY/MM/arXiv_id/stuff_downloaded_from_arXiv
    This obeys the old URL format.

    @param: from_date (int): YYMM form of the date where we want to start
        harvesting
    @param: to_date (int): YYMM form of the date where we want to stop
        harvesting
    @param: to_dir (string): the base directory to put all these subdirs in
    @param: area (list): the entry in the HEP_AREAS array for the area we
        are currently downloading

    @output: PDFs and tarballs from arXiv in a hierarchy rooted at to_dir
    @return: None
    """
    yearmonthindex = from_date

    while yearmonthindex < to_date:
        sub_dir = make_useful_directories(yearmonthindex, to_dir)

        for paperindex in range(1, 1000):
            # For whatever reason, we can't count on these things to
            # start at 1 (in HEP_PH from 9403 to CENTURY_END only);
            # they start at 202.
            #if area == HEP_PH and yearmonthindex < ARBITRARY_FROM_INDEX:
            #    paperindex = paperindex + 201

            # Of note: before the URL change happened in 0704, paper
            # numbers only had 3 digits.
            next_to_harvest = '%04d%03d' % (yearmonthindex, paperindex)
            arXiv_id = area[AREA_STRING_INDEX] + next_to_harvest
            individual_dir = make_single_directory(sub_dir, arXiv_id)

            full_url = CFG_PLOTEXTRACTOR_SOURCE_BASE_URL + \
                CFG_PLOTEXTRACTOR_SOURCE_TARBALL_FOLDER + \
                area[URL] + next_to_harvest
            abs_path = os.path.join(
                individual_dir,
                area[AREA_STRING_INDEX] + next_to_harvest)
            if not download_url(url=full_url,
                                content_type='tar',
                                download_to_file=abs_path):
                break

            full_pdf_url = CFG_PLOTEXTRACTOR_SOURCE_BASE_URL + \
                CFG_PLOTEXTRACTOR_SOURCE_PDF_FOLDER + \
                area[URL] + next_to_harvest
            abs_path = os.path.join(
                individual_dir,
                area[AREA_STRING_INDEX] + next_to_harvest + PDF_EXTENSION)
            download_url(url=full_pdf_url,
                         content_type='pdf',
                         download_to_file=abs_path)
            time.sleep(CFG_PLOTEXTRACTOR_DOWNLOAD_TIMEOUT)

        if yearmonthindex % 100 == 12:
            # We reached the end of the year!
            yearmonthindex = yearmonthindex + FIX_FOR_YEAR_END
        yearmonthindex = yearmonthindex + 1
def new_URL_harvest(from_date, from_index, to_dir):
    """
    Grab all the PDFs and tarballs off arXiv from from_date (in YYMM form)
    up to the current month, and put them in their own separate folders
    inside of to_dir.  Folder hierarchy will be
        to_dir/YYYY/MM/arXiv_id/stuff_downloaded_from_arXiv
    This obeys the new URL format.

    @param: from_date (int): YYMM form of the date where we want to start
        harvesting
    @param: from_index (int): the paper number within from_date at which
        to start harvesting
    @param: to_dir (string): the base directory to put all these subdirs in

    @output: PDFs and tarballs from arXiv in a hierarchy rooted at to_dir
    @return: None
    """
    global current_yearmonth
    yearmonthindex = from_date

    while yearmonthindex < current_yearmonth:
        if yearmonthindex == from_date:
            fro = from_index
        else:
            fro = 1

        sub_dir = make_useful_directories(yearmonthindex, to_dir)

        for paperindex in range(fro, 10000):
            # Of note: after the URL change happened in 0704, paper
            # numbers had 4 digits.
            next_to_harvest = '%04d.%04d' % (yearmonthindex, paperindex)
            arXiv_id = ARXIV_HEADER + next_to_harvest
            individual_dir = make_single_directory(sub_dir, arXiv_id)

            full_url = CFG_PLOTEXTRACTOR_SOURCE_BASE_URL + \
                CFG_PLOTEXTRACTOR_SOURCE_TARBALL_FOLDER + \
                next_to_harvest
            abs_path = os.path.join(individual_dir,
                                    ARXIV_HEADER + next_to_harvest)
            if not download_url(url=full_url,
                                content_type='tar',
                                download_to_file=abs_path):
                break

            full_pdf_url = CFG_PLOTEXTRACTOR_SOURCE_BASE_URL + \
                CFG_PLOTEXTRACTOR_SOURCE_PDF_FOLDER + \
                next_to_harvest
            abs_path = os.path.join(
                individual_dir,
                ARXIV_HEADER + next_to_harvest + PDF_EXTENSION)
            download_url(url=full_pdf_url,
                         content_type='pdf',
                         download_to_file=abs_path)
            time.sleep(CFG_PLOTEXTRACTOR_DOWNLOAD_TIMEOUT)  # be nice to remote server

        if yearmonthindex % 100 == 12:
            # We reached the end of the year!
            yearmonthindex = yearmonthindex + FIX_FOR_YEAR_END
        yearmonthindex = yearmonthindex + 1
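
# The YYMM rollover arithmetic shared by both harvest loops above, as a
# hedged sketch.  The value 88 is an assumption about FIX_FOR_YEAR_END
# (it makes yy12 + 88 + 1 == (yy+1)01); real code should use the module's
# own constant.
def next_yearmonth(yearmonthindex, fix_for_year_end=88):
    """Advance a YYMM index by one month, rolling December into January."""
    if yearmonthindex % 100 == 12:
        yearmonthindex += fix_for_year_end  # e.g. 9412 -> 9500
    return yearmonthindex + 1               # e.g. 9500 -> 9501


assert next_yearmonth(9411) == 9412  # ordinary month step
assert next_yearmonth(9412) == 9501  # December rolls into the next year
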
def test_content_type(self):
    """Test simple calls to download_url."""
    from invenio.utils.filedownload import (download_url,
                                            InvenioFileDownloadError)
    tmpdoc = download_url("http://duckduckgo.com", content_type="html")
    self.assertTrue(tmpdoc)
    fun = lambda: download_url("http://google.com", content_type="pdf")
    self.assertRaises(InvenioFileDownloadError, fun)
def parse_and_download(infile, sdir):
    """
    Read the information in the input file and download the corresponding
    tarballs from arXiv.

    @param: infile (string): the name of the file to parse
    @param: sdir (string): where to put the downloaded tarballs
    """
    tarfiles = []
    tardir = os.path.join(sdir, 'tarballs')
    if not os.path.isdir(tardir):
        try:
            os.makedirs(tardir)
        except OSError:
            write_message(sys.exc_info()[0])
            write_message('files will be loose, not in ' + tardir)
            tardir = sdir

    infile = open(infile)
    for line in infile.readlines():
        line = line.strip()
        if line.startswith('http://'):
            # hurray!
            url = line
            filename = url.split('/')[-1]
            abs_path = os.path.join(tardir, filename)
            if not download_url(url=url,
                                content_type='tar',
                                download_to_file=abs_path):
                write_message(filename + ' may already exist')
                write_message(sys.exc_info()[0])
            filename = os.path.join(tardir, filename)
            tarfiles.append(filename)
            time.sleep(CFG_PLOTEXTRACTOR_DOWNLOAD_TIMEOUT)  # be nice!
        elif line.startswith('arXiv'):
            tarfiles.extend(tarballs_by_arXiv_id([line.strip()], sdir))
    infile.close()
    return tarfiles
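
# A hedged usage sketch for parse_and_download: the input file mixes direct
# tarball URLs with bare arXiv IDs, one per line.  The file contents and
# directories below are illustrative placeholders, not values from the
# code above.
def example_parse_and_download():
    with open('/tmp/harvest_list.txt', 'w') as handle:
        handle.write('http://export.arxiv.org/e-print/1111.2222\n')
        handle.write('arXiv:1111.3333\n')
    # URLs are fetched into /tmp/harvest/tarballs/; arXiv IDs are handed
    # off to tarballs_by_arXiv_id.  The return value lists local tarballs.
    return parse_and_download('/tmp/harvest_list.txt', '/tmp/harvest')
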
def output_keywords_for_sources(
        input_sources, taxonomy_name, output_mode="text",
        output_limit=cfg['CLASSIFIER_DEFAULT_OUTPUT_NUMBER'],
        spires=False, match_mode="full", no_cache=False,
        with_author_keywords=False, rebuild_cache=False,
        only_core_tags=False, extract_acronyms=False, **kwargs):
    """Output the keywords for each source in sources."""
    from invenio.legacy.refextract.engine import get_plaintext_document_body

    # Inner function which does the job; it would be too much work to
    # refactor the call.  It must be defined outside the loop: before,
    # it did not process multiple files.
    def process_lines():
        if output_mode == "text":
            print("Input file: %s" % source)

        line_nb = len(text_lines)
        word_nb = 0
        for line in text_lines:
            word_nb += len(re.findall(r"\S+", line))

        current_app.logger.info(
            "Remote file has {0} lines and {1} words.".format(line_nb,
                                                              word_nb))
        return get_keywords_from_text(
            text_lines,
            taxonomy_name,
            output_mode=output_mode,
            output_limit=output_limit,
            spires=spires,
            match_mode=match_mode,
            no_cache=no_cache,
            with_author_keywords=with_author_keywords,
            rebuild_cache=rebuild_cache,
            only_core_tags=only_core_tags,
            extract_acronyms=extract_acronyms
        )

    # Get the fulltext for each source.
    for entry in input_sources:
        current_app.logger.info("Trying to read input file %s." % entry)
        text_lines = None
        source = ""
        if os.path.isdir(entry):
            for filename in os.listdir(entry):
                if filename.startswith('.'):
                    continue
                filename = os.path.join(entry, filename)
                if os.path.isfile(filename):
                    text_lines, dummy = get_plaintext_document_body(filename)
                    if text_lines:
                        source = filename
                        process_lines()
        elif os.path.isfile(entry):
            text_lines, dummy = get_plaintext_document_body(entry)
            if text_lines:
                source = os.path.basename(entry)
                process_lines()
        else:
            # Treat as a URL.
            local_file = download_url(entry)
            text_lines, dummy = get_plaintext_document_body(local_file)
            if text_lines:
                source = entry.split("/")[-1]
                process_lines()
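
# A hedged usage sketch for output_keywords_for_sources.  Each input entry
# may be a directory, a file, or a URL; the taxonomy name and paths below
# are illustrative placeholders, not values from the code above.
output_keywords_for_sources(
    ["/tmp/fulltexts",                        # a directory: every file inside
     "/tmp/paper.pdf",                        # a single file
     "http://arxiv.org/pdf/1111.2222.pdf"],   # anything else is fetched as a URL
    "HEP",                                    # assumed taxonomy name
    output_mode="text",
    with_author_keywords=True,
)
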
def harvest_single(single, to_dir, selection=("tarball", "pdf")):
    """
    If we only want to harvest one id (arXiv or DESY), we can use this.

    @param: single (string): an id from arXiv or DESY
    @param: to_dir (string): where the output should be saved

    @output: the PDF and source tarball (if applicable) of this single record
    @return: (tarball, pdf): the location of the source tarball and PDF,
        None if not found
    """
    if single.find('arXiv') > -1 and \
            'arxiv.org' in CFG_PLOTEXTRACTOR_SOURCE_BASE_URL.lower():
        id_str = re.findall('[a-zA-Z\\-]+/\\d+|\\d+\\.\\d+', single)[0]
        idno = id_str.split('/')
        if len(idno) > 0:
            idno = idno[-1]
        yymm = int(idno[:4])
        yymm_dir = make_useful_directories(yymm, to_dir)
        url_for_file = CFG_PLOTEXTRACTOR_SOURCE_BASE_URL + \
            CFG_PLOTEXTRACTOR_SOURCE_TARBALL_FOLDER + \
            id_str
        # '.pdf' is appended to avoid the arXiv internal redirect from
        # arXivID to arXivID.pdf.
        url_for_pdf = CFG_PLOTEXTRACTOR_SOURCE_BASE_URL + \
            CFG_PLOTEXTRACTOR_SOURCE_PDF_FOLDER + \
            id_str + '.pdf'
        individual_file = 'arXiv:' + id_str.replace('/', '_')
        individual_dir = make_single_directory(yymm_dir, individual_file)
        abs_path = os.path.join(individual_dir, individual_file)
        tarball = abs_path
        pdf = abs_path + '.pdf'
        try:
            if "tarball" in selection:
                write_message('downloading ' + url_for_file +
                              ' to ' + tarball)
                tarball = download_url(url=url_for_file,
                                       content_type='tar',
                                       download_to_file=tarball)
        except InvenioFileDownloadError:
            tarball = None
        try:
            if "pdf" in selection:
                write_message('downloading ' + url_for_pdf + ' to ' + pdf)
                pdf = download_url(url=url_for_pdf,
                                   content_type="pdf",
                                   download_to_file=pdf)
        except InvenioFileDownloadError:
            pdf = None
        return (tarball, pdf)
    elif single.find('arXiv') > -1 and CFG_PLOTEXTRACTOR_SOURCE_BASE_URL != '':
        # hmm... is it a filesystem?
        if CFG_PLOTEXTRACTOR_SOURCE_BASE_URL.startswith('/'):
            if not os.path.exists(CFG_PLOTEXTRACTOR_SOURCE_BASE_URL):
                write_message('PROBLEM WITH CFG_PLOTEXTRACTOR_SOURCE_BASE_URL: '
                              'we cannot find this folder!')
                return (None, None)
            # os.walk yields (dirpath, dirnames, filenames), so take the
            # third element; the original unpacked dirnames as files.
            for root, dummy, files in os.walk(
                    CFG_PLOTEXTRACTOR_SOURCE_BASE_URL):
                for file_name in files:
                    id_no = single.replace('arXiv', '')
                    if file_name.find(id_no) > -1 or \
                            file_name.find(id_no.replace('/', '_')) > -1 or \
                            file_name.find(id_no.replace('_', '/')) > -1 or \
                            file_name.find(id_no.replace(':', '')) > -1:
                        # that's our file! probably.
                        return (os.path.join(root, file_name), None)
            # well, no luck there
            return (None, None)
        # okay... is it... a website?
        elif CFG_PLOTEXTRACTOR_SOURCE_BASE_URL.startswith('http') and \
                "tarball" in selection:
            url_for_file = CFG_PLOTEXTRACTOR_SOURCE_BASE_URL + single
            individual_file = os.path.join(to_dir, single)
            abs_path = os.path.join(to_dir, individual_file)
            try:
                abs_path = download_url(url=url_for_file,
                                        content_type='tar',
                                        download_to_file=abs_path)
            except InvenioFileDownloadError:
                abs_path = None
            return (abs_path, None)
        # well, I don't know what to do with it
        else:
            write_message('unsure how to handle '
                          'CFG_PLOTEXTRACTOR_SOURCE_BASE_URL. please fix the '
                          'harvest_single function in '
                          'miscutil/lib/plotextractor_getter.py')
            return (None, None)
    elif single.find('DESY') > -1 and "pdf" in selection:
        # also okay!
        idno = re.findall('\\d{2,4}-\\d{3}', single)[0]
        year, number = idno.split('-')
        if len(year) < 4:
            if int(year) > 92:
                year = '19' + year
            else:
                year = '20' + year
        year_dir = make_single_directory(to_dir, year)
        desy_dir = make_single_directory(year_dir, 'DESY')
        individual_dir = make_single_directory(desy_dir, number)
        id_no = year[2:] + '-' + number + '.pdf'
        url_for_file = CFG_PLOTEXTRACTOR_DESY_BASE + year + \
            CFG_PLOTEXTRACTOR_DESY_PIECE + id_no
        individual_file = id_no
        abs_path = os.path.join(individual_dir, individual_file)
        write_message('download ' + url_for_file + ' to ' + abs_path)
        try:
            abs_path = download_url(url=url_for_file,
                                    content_type='pdf',
                                    download_to_file=abs_path)
        except InvenioFileDownloadError:
            abs_path = None
        return (None, abs_path)
    write_message('END')
    return (None, None)
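
# A hedged usage sketch for harvest_single: ids are routed on whether they
# contain 'arXiv' or 'DESY', and the return tuple holds local paths (or
# None on failure).  The ids and target directory are illustrative
# placeholders, not values from the code above.
tarball, pdf = harvest_single('arXiv:1111.2222', '/tmp/harvest')
if tarball is None:
    write_message('no source tarball for this record')

# DESY reports have no source tarball, so only the pdf slot can be filled.
dummy, desy_pdf = harvest_single('DESY-11-222', '/tmp/harvest',
                                 selection=("pdf",))
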