def __download_from_springer(url, save_dir, year, is_workshops=False, time_sleep_in_seconds=5, downloader='IDM'):
    downloader = Downloader(downloader)
    for i in range(3):
        try:
            papers_dict = springer.get_paper_name_link_from_url(url)
            break
        except Exception as e:
            print(str(e))
    # total_paper_number = len(papers_dict)
    pbar = tqdm(papers_dict.keys())
    postfix = f'ECCV_{year}'
    if is_workshops:
        postfix = f'ECCV_WS_{year}'
    for name in pbar:
        pbar.set_description(f'Downloading paper {name}')
        if not os.path.exists(os.path.join(save_dir, f'{name}_{postfix}.pdf')):
            downloader.download(
                papers_dict[name],
                os.path.join(save_dir, f'{name}_{postfix}.pdf'),
                time_sleep_in_seconds)
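# Usage sketch (added for illustration, not part of the original source): how the
# ECCV helper above might be called from within its own module. The Springer URL
# and save directory are placeholders.
# __download_from_springer(
#     url='https://link.springer.com/book/<ECCV-proceedings-volume>',
#     save_dir=r'..\ECCV_2020',
#     year=2020,
#     is_workshops=False)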
class Analytics:
    def __init__(self, config):
        self.config = config
        self.downloader = Downloader(config)
        self.parser = Parser(config)
        self.analyser = Analyser(config)

    def generate_report(self):
        if self.config.download:
            self.downloader.download()
        self.parser.parse()
        self.analyser.analyse()
        return self.config
class Analytics:
    def __init__(self, config):
        self.config = config
        self.downloader = Downloader(config)
        self.parser = Parser(config)
        self.analyser = Analyser(config)

    def download_logs(self):
        if self.config.download:
            self.downloader.download()

    def generate_report(self):
        # if self.config.download:
        #     self.downloader.download()
        self.parser.parse()
        self.analyser.analyse()
        return self.config
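# Usage sketch (illustrative only; the configuration object is a hypothetical
# stand-in for whatever the project passes around):
# config = Config(download=True)
# analytics = Analytics(config)
# analytics.download_logs()                    # fetch raw data only when config.download is set
# report_config = analytics.generate_report()  # parse + analyse what was downloaded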
def download_iclr_paper_given_html_file(year, html_path, save_dir, time_step_in_seconds=10, downloader='IDM'):
    """
    download ICLR conference papers given a saved html file (currently only supports 2021)
    :param year: int, ICLR year, currently only supports year >= 2018
    :param html_path: str, html file's full pathname
    :param save_dir: str, paper save path
    :param time_step_in_seconds: int, the interval time between two download requests in seconds
    :param downloader: str, the downloader to use, could be 'IDM' or 'Thunder', defaults to 'IDM'
    :return: True
    """
    downloader = Downloader(downloader=downloader)
    base_url = f'https://openreview.net/group?id=ICLR.cc/{year}'
    content = open(html_path, 'rb').read()
    soup = BeautifulSoup(content, 'html5lib')
    divs = soup.find('div', {'class': 'tabs-container'})
    oral_papers = divs.find('div', {'id': 'oral-presentations'}).find_all('li', {'class': 'note'})
    num_oral_papers = len(oral_papers)
    print('found number of oral papers:', num_oral_papers)
    spotlight_papers = divs.find('div', {'id': 'spotlight-presentations'}).find_all('li', {'class': 'note'})
    num_spotlight_papers = len(spotlight_papers)
    print('found number of spotlight papers:', num_spotlight_papers)
    poster_papers = divs.find('div', {'id': 'poster-presentations'}).find_all('li', {'class': 'note'})
    num_poster_papers = len(poster_papers)
    print('found number of poster papers:', num_poster_papers)
    paper_postfix = f'ICLR_{year}'
    error_log = []

    # oral
    oral_save_dir = os.path.join(save_dir, 'oral')
    print('downloading oral papers...........')
    os.makedirs(oral_save_dir, exist_ok=True)
    for index, paper in enumerate(oral_papers):
        a_hrefs = paper.find_all("a")
        name = slugify(a_hrefs[0].text.strip())
        pdf_name = name + '_' + paper_postfix + '.pdf'
        if not os.path.exists(os.path.join(oral_save_dir, pdf_name)):
            link = a_hrefs[1].get('href')
            link = urllib.parse.urljoin(base_url, link)
            print('Downloading paper {}/{}: {}'.format(index + 1, num_oral_papers, name))
            # try 1 time
            success_flag = False
            for d_iter in range(1):
                try:
                    downloader.download(
                        urls=link,
                        save_path=os.path.join(oral_save_dir, pdf_name),
                        time_sleep_in_seconds=time_step_in_seconds
                    )
                    success_flag = True
                    break
                except Exception as e:
                    print('Error: ' + name + ' - ' + str(e))
                    # time.sleep(time_step_in_seconds)
            if not success_flag:
                error_log.append((name, link))

    # spotlight
    spotlight_save_dir = os.path.join(save_dir, 'spotlight')
    print('downloading spotlight papers...........')
    os.makedirs(spotlight_save_dir, exist_ok=True)
    for index, paper in enumerate(spotlight_papers):
        a_hrefs = paper.find_all("a")
        name = slugify(a_hrefs[0].text.strip())
        pdf_name = name + '_' + paper_postfix + '.pdf'
        if not os.path.exists(os.path.join(spotlight_save_dir, pdf_name)):
            link = a_hrefs[1].get('href')
            link = urllib.parse.urljoin(base_url, link)
            print('Downloading paper {}/{}: {}'.format(index + 1, num_spotlight_papers, name))
            # try 1 time
            success_flag = False
            for d_iter in range(1):
                try:
                    downloader.download(
                        urls=link,
                        save_path=os.path.join(spotlight_save_dir, pdf_name),
                        time_sleep_in_seconds=time_step_in_seconds
                    )
                    success_flag = True
                    break
                except Exception as e:
                    print('Error: ' + name + ' - ' + str(e))
                    # time.sleep(time_step_in_seconds)
            if not success_flag:
                error_log.append((name, link))

    # poster
    poster_save_dir = os.path.join(save_dir, 'poster')
    print('downloading poster papers...........')
    os.makedirs(poster_save_dir, exist_ok=True)
    for index, paper in enumerate(poster_papers):
        a_hrefs = paper.find_all("a")
        name = slugify(a_hrefs[0].text.strip())
        pdf_name = name + '_' + paper_postfix + '.pdf'
        if not os.path.exists(os.path.join(poster_save_dir, pdf_name)):
            link = a_hrefs[1].get('href')
            link = urllib.parse.urljoin(base_url, link)
            print('Downloading paper {}/{}: {}'.format(index + 1, num_poster_papers, name))
            # try 1 time
            success_flag = False
            for d_iter in range(1):
                try:
                    downloader.download(
                        urls=link,
                        save_path=os.path.join(poster_save_dir, pdf_name),
                        time_sleep_in_seconds=time_step_in_seconds
                    )
                    success_flag = True
                    break
                except Exception as e:
                    print('Error: ' + name + ' - ' + str(e))
                    # time.sleep(time_step_in_seconds)
            if not success_flag:
                error_log.append((name, link))

    # 2. write error log
    print('write error log')
    with open('..\\log\\download_err_log.txt', 'w') as f:
        for log in tqdm(error_log):
            for e in log:
                f.write(e)
                f.write('\n')
            f.write('\n')
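# Example invocation (illustrative only): download ICLR 2021 papers from a locally
# saved copy of the OpenReview listing page. The html path and save directory are
# placeholders.
# download_iclr_paper_given_html_file(
#     year=2021,
#     html_path=r'..\html\ICLR_2021.html',
#     save_dir=r'..\ICLR_2021',
#     time_step_in_seconds=10,
#     downloader='IDM')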
def download_iclr_paper(year, save_dir, time_step_in_seconds=5, downloader='IDM', is_use_arxiv_mirror=False):
    """
    download ICLR conference papers for the years 2014, 2015 and 2016
    :param year: int, ICLR year
    :param save_dir: str, paper save path
    :param time_step_in_seconds: int, the interval time between two download requests in seconds
    :param downloader: str, the downloader to use, could be 'IDM' or 'Thunder', defaults to 'IDM'
    :return: True
    """
    downloader = Downloader(downloader=downloader)
    paper_postfix = f'ICLR_{year}'
    if year == 2016:
        base_url = 'https://iclr.cc/archive/www/doku.php%3Fid=iclr2016:main.html'
    elif year == 2015:
        base_url = 'https://iclr.cc/archive/www/doku.php%3Fid=iclr2015:main.html'
    elif year == 2014:
        base_url = 'https://iclr.cc/archive/2014/conference-proceedings/'
    else:
        raise ValueError('the website url is not given for this year!')
    os.makedirs(save_dir, exist_ok=True)
    if year == 2015:
        # oral and poster separated
        oral_save_path = os.path.join(save_dir, 'oral')
        poster_save_path = os.path.join(save_dir, 'poster')
        workshop_save_path = os.path.join(save_dir, 'ws')
        os.makedirs(oral_save_path, exist_ok=True)
        os.makedirs(poster_save_path, exist_ok=True)
        os.makedirs(workshop_save_path, exist_ok=True)
    if os.path.exists(f'..\\urls\\init_url_iclr_{year}.dat'):
        with open(f'..\\urls\\init_url_iclr_{year}.dat', 'rb') as f:
            content = pickle.load(f)
    else:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
        req = urllib.request.Request(url=base_url, headers=headers)
        content = urllib.request.urlopen(req).read()
        with open(f'..\\urls\\init_url_iclr_{year}.dat', 'wb') as f:
            pickle.dump(content, f)
    error_log = []
    soup = BeautifulSoup(content, 'html.parser')
    print('url opened successfully!')
    if year == 2016:
        papers = soup.find('h3', {'id': 'accepted_papers_conference_track'}).findNext('div').find_all('a')
        for paper in tqdm(papers):
            link = paper.get('href')
            if link.startswith('http://arxiv'):
                title = slugify(paper.text)
                pdf_name = f'{title}_{paper_postfix}.pdf'
                try:
                    if not os.path.exists(os.path.join(save_dir, pdf_name)):
                        pdf_link = get_pdf_link_from_arxiv(link, is_use_mirror=is_use_arxiv_mirror)
                        print(f'downloading {title}')
                        downloader.download(
                            urls=pdf_link,
                            save_path=os.path.join(save_dir, pdf_name),
                            time_sleep_in_seconds=time_step_in_seconds
                        )
                except Exception as e:
                    # error_flag = True
                    print('Error: ' + title + ' - ' + str(e))
                    error_log.append((title, link, 'paper download error', str(e)))
        # workshops
        papers = soup.find('h3', {'id': 'workshop_track_posters_may_2nd'}).findNext('div').find_all('a')
        for paper in tqdm(papers):
            link = paper.get('href')
            if link.startswith('http://beta.openreview'):
                title = slugify(paper.text)
                pdf_name = f'{title}_ICLR_WS_{year}.pdf'
                try:
                    if not os.path.exists(os.path.join(save_dir, 'ws', pdf_name)):
                        pdf_link = get_pdf_link_from_openreview(link)
                        print(f'downloading {title}')
                        downloader.download(
                            urls=pdf_link,
                            save_path=os.path.join(save_dir, 'ws', pdf_name),
                            time_sleep_in_seconds=time_step_in_seconds
                        )
                except Exception as e:
                    # error_flag = True
                    print('Error: ' + title + ' - ' + str(e))
                    error_log.append((title, link, 'paper download error', str(e)))
        papers = soup.find('h3', {'id': 'workshop_track_posters_may_3rd'}).findNext('div').find_all('a')
        for paper in tqdm(papers):
            link = paper.get('href')
            if link.startswith('http://beta.openreview'):
                title = slugify(paper.text)
                pdf_name = f'{title}_ICLR_WS_{year}.pdf'
                try:
                    if not os.path.exists(os.path.join(save_dir, 'ws', pdf_name)):
                        pdf_link = get_pdf_link_from_openreview(link)
                        print(f'downloading {title}')
                        downloader.download(
                            urls=pdf_link,
                            save_path=os.path.join(save_dir, 'ws', pdf_name),
                            time_sleep_in_seconds=time_step_in_seconds
                        )
                except Exception as e:
                    # error_flag = True
                    print('Error: ' + title + ' - ' + str(e))
                    error_log.append((title, link, 'paper download error', str(e)))
    elif year == 2015:
        # oral papers
        oral_papers = soup.find('h3', {'id': 'conference_oral_presentations'}).findNext('div').find_all('a')
        for paper in tqdm(oral_papers):
            link = paper.get('href')
            if link.startswith('http://arxiv'):
                title = slugify(paper.text)
                pdf_name = f'{title}_{paper_postfix}.pdf'
                try:
                    if not os.path.exists(os.path.join(oral_save_path, pdf_name)):
                        pdf_link = get_pdf_link_from_arxiv(link, is_use_mirror=is_use_arxiv_mirror)
                        print(f'downloading {title}')
                        downloader.download(
                            urls=pdf_link,
                            save_path=os.path.join(oral_save_path, pdf_name),
                            time_sleep_in_seconds=time_step_in_seconds
                        )
                except Exception as e:
                    # error_flag = True
                    print('Error: ' + title + ' - ' + str(e))
                    error_log.append((title, link, 'paper download error', str(e)))
        # workshop papers
        workshop_papers = soup.find('h3', {'id': 'may_7_workshop_poster_session'}).findNext('div').find_all('a')
        workshop_papers.extend(
            soup.find('h3', {'id': 'may_8_workshop_poster_session'}).findNext('div').find_all('a'))
        for paper in tqdm(workshop_papers):
            link = paper.get('href')
            if link.startswith('http://arxiv'):
                title = slugify(paper.text)
                pdf_name = f'{title}_ICLR_WS_{year}.pdf'
                try:
                    if not os.path.exists(os.path.join(workshop_save_path, pdf_name)):
                        pdf_link = get_pdf_link_from_arxiv(link, is_use_mirror=is_use_arxiv_mirror)
                        print(f'downloading {title}')
                        downloader.download(
                            urls=pdf_link,
                            save_path=os.path.join(workshop_save_path, pdf_name),
                            time_sleep_in_seconds=time_step_in_seconds)
                except Exception as e:
                    # error_flag = True
                    print('Error: ' + title + ' - ' + str(e))
                    error_log.append((title, link, 'paper download error', str(e)))
        # poster papers
        poster_papers = soup.find('h3', {'id': 'may_9_conference_poster_session'}).findNext('div').find_all('a')
        for paper in tqdm(poster_papers):
            link = paper.get('href')
            if link.startswith('http://arxiv'):
                title = slugify(paper.text)
                pdf_name = f'{title}_{paper_postfix}.pdf'
                try:
                    if not os.path.exists(os.path.join(poster_save_path, pdf_name)):
                        pdf_link = get_pdf_link_from_arxiv(link, is_use_mirror=is_use_arxiv_mirror)
                        print(f'downloading {title}')
                        downloader.download(
                            urls=pdf_link,
                            save_path=os.path.join(poster_save_path, pdf_name),
                            time_sleep_in_seconds=time_step_in_seconds)
                except Exception as e:
                    # error_flag = True
                    print('Error: ' + title + ' - ' + str(e))
                    error_log.append((title, link, 'paper download error', str(e)))
    elif year == 2014:
        papers = soup.find('div', {'id': 'sites-canvas-main-content'}).find_all('a')
        for paper in tqdm(papers):
            link = paper.get('href')
            if link.startswith('http://arxiv'):
                title = slugify(paper.text)
                pdf_name = f'{title}_{paper_postfix}.pdf'
                try:
                    if not os.path.exists(os.path.join(save_dir, pdf_name)):
                        pdf_link = get_pdf_link_from_arxiv(link, is_use_mirror=is_use_arxiv_mirror)
                        print(f'downloading {title}')
                        downloader.download(
                            urls=pdf_link,
                            save_path=os.path.join(save_dir, pdf_name),
                            time_sleep_in_seconds=time_step_in_seconds)
                except Exception as e:
                    # error_flag = True
                    print('Error: ' + title + ' - ' + str(e))
                    error_log.append((title, link, 'paper download error', str(e)))
        # workshops
        paper_postfix = f'ICLR_WS_{year}'
        base_url = 'https://sites.google.com/site/representationlearning2014/workshop-proceedings'
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
        req = urllib.request.Request(url=base_url, headers=headers)
        content = urllib.request.urlopen(req).read()
        soup = BeautifulSoup(content, 'html.parser')
        workshop_save_path = os.path.join(save_dir, 'WS')
        os.makedirs(workshop_save_path, exist_ok=True)
        papers = soup.find('div', {'id': 'sites-canvas-main-content'}).find_all('a')
        for paper in tqdm(papers):
            link = paper.get('href')
            if link.startswith('http://arxiv'):
                title = slugify(paper.text)
                pdf_name = f'{title}_{paper_postfix}.pdf'
                try:
                    if not os.path.exists(os.path.join(workshop_save_path, pdf_name)):
                        pdf_link = get_pdf_link_from_arxiv(link, is_use_mirror=is_use_arxiv_mirror)
                        print(f'downloading {title}')
                        downloader.download(
                            urls=pdf_link,
                            save_path=os.path.join(workshop_save_path, pdf_name),
                            time_sleep_in_seconds=time_step_in_seconds)
                except Exception as e:
                    # error_flag = True
                    print('Error: ' + title + ' - ' + str(e))
                    error_log.append((title, link, 'paper download error', str(e)))
    # write error log
    print('write error log')
    with open('..\\log\\download_err_log.txt', 'w') as f:
        for log in tqdm(error_log):
            for e in log:
                if e is not None:
                    f.write(e)
                else:
                    f.write('None')
                f.write('\n')
            f.write('\n')
    return True
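# Example invocation (illustrative only): fetch the ICLR 2015 proceedings, which the
# function above splits into oral/, poster/ and ws/ subfolders. The save directory is
# a placeholder.
# download_iclr_paper(
#     year=2015,
#     save_dir=r'..\ICLR_2015',
#     time_step_in_seconds=5,
#     downloader='IDM',
#     is_use_arxiv_mirror=False)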
def download_iclr_spotlight_papers(save_dir, driver_path, year, base_url=None, time_step_in_seconds=10, downloader='IDM'):
    """
    download ICLR spotlight papers using a Selenium web driver
    :param save_dir: str, paper save path
    :param driver_path: str, 'chromedriver.exe' full pathname
    :param year: int, ICLR year, currently only supports year >= 2018
    :param base_url: str, paper website url
    :param time_step_in_seconds: int, the interval time between two download requests in seconds
    :param downloader: str, the downloader to use, could be 'IDM' or 'Thunder', defaults to 'IDM'
    :return:
    """
    downloader = Downloader(downloader=downloader)
    if base_url is None:
        if year >= 2021:
            base_url = 'https://openreview.net/group?id=ICLR.cc/2021/Conference#spotlight-presentations'
        elif year == 2020:
            base_url = 'https://openreview.net/group?id=ICLR.cc/2020/Conference#accept-spotlight'
        else:
            raise ValueError('the website url is not given for this year!')
    first_poster_index = {'2017': 15}
    paper_postfix = f'ICLR_{year}'
    error_log = []
    driver = webdriver.Chrome(driver_path)
    driver.get(base_url)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    # wait for the select element to become visible
    print('Starting web driver wait...')
    wait = WebDriverWait(driver, 20)
    print('Starting web driver wait... finished')
    res = wait.until(EC.presence_of_element_located((By.ID, "notes")))
    print("Successfully loaded the website! ->", res)
    res = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "note")))
    print("Successfully loaded the website notes! ->", res)
    # parse the results
    if year >= 2021:
        divs = driver.find_elements_by_xpath('//*[@id="spotlight-presentations"]/ul/li')
    elif year == 2020:
        divs = driver.find_elements_by_xpath('//*[@id="accept-spotlight"]/ul/li')
    else:
        divs = driver.find_elements_by_class_name('note')[:first_poster_index[str(year)]]
    num_papers = len(divs)
    print('found number of papers:', num_papers)
    for index, paper in enumerate(divs):
        a_hrefs = paper.find_elements_by_tag_name("a")
        if year >= 2018:
            name = slugify(a_hrefs[0].text.strip())
            link = a_hrefs[1].get_attribute('href')
        else:
            name = slugify(paper.find_element_by_class_name('note_content_title').text)
            link = paper.find_element_by_class_name('note_content_pdf').get_attribute('href')
        print('Downloading paper {}/{}: {}'.format(index + 1, num_papers, name))
        pdf_name = name + '_' + paper_postfix + '.pdf'
        if not os.path.exists(os.path.join(save_dir, pdf_name)):
            # try 1 time
            success_flag = False
            for d_iter in range(1):
                try:
                    downloader.download(
                        urls=link,
                        save_path=os.path.join(save_dir, pdf_name),
                        time_sleep_in_seconds=time_step_in_seconds
                    )
                    success_flag = True
                    break
                except Exception as e:
                    print('Error: ' + name + ' - ' + str(e))
            if not success_flag:
                error_log.append((name, link))
    driver.close()
    # 2. write error log
    print('write error log')
    with open('..\\log\\download_err_log.txt', 'w') as f:
        for log in tqdm(error_log):
            for e in log:
                f.write(e)
                f.write('\n')
            f.write('\n')
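# Example invocation (illustrative only): the spotlight scraper needs a local
# chromedriver binary; its path and the save directory are placeholders.
# download_iclr_spotlight_papers(
#     save_dir=r'..\ICLR_2021\spotlight',
#     driver_path=r'..\chromedriver.exe',
#     year=2021,
#     time_step_in_seconds=10,
#     downloader='IDM')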
def main(argv):
    config = configparser.ConfigParser()
    config_path = os.path.expanduser(os.path.join("~", ".doimgrrc"))
    if os.path.isfile(config_path):
        config.read(config_path)

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='Command line based tool to request DOI data and convert it to BibTex entries.')
    subparsers = parser.add_subparsers()

    parser_search = subparsers.add_parser(
        'search',
        help='Search database for published articles to find relevant DOIs',
        description="""Searches database for published articles. This can be
        used to find a specific DOI or getting information about a
        keyword/topic.""")
    parser_search.add_argument('query', type=str, help='search string')
    parser_search.add_argument(
        '--show-authors', action='store_true',
        default=config.getboolean('search', 'show-authors', fallback=False),
        help='if set additional author information is shown')
    parser_search.add_argument(
        '--show-type', action='store_true',
        default=config.getboolean('search', 'show-type', fallback=False),
        help='if set additional information about the type is shown')
    parser_search.add_argument(
        '--show-publisher', action='store_true',
        default=config.getboolean('search', 'show-publisher', fallback=False),
        help='if set additional information about the publisher is shown')
    parser_search.add_argument(
        '--show-url', action='store_true',
        default=config.getboolean('search', 'show-url', fallback=False),
        help='if set a URL to the document is shown')
    allowed_sort_types = [
        'score', 'updated', 'deposited', 'indexed', 'published'
    ]
    parser_search.add_argument(
        '--sort', type=str, choices=allowed_sort_types,
        default=config.get('search', 'sort', fallback='score'),
        help='sorting of search queries; allowed values are {}'.format(
            ", ".join(allowed_sort_types)),
        metavar='')
    parser_search.add_argument(
        '--order', type=str, choices=['asc', 'desc'],
        default=config.get('search', 'order', fallback='desc'),
        help='ordering of search queries')
    parser_search.add_argument(
        '--year', type=int,
        default=config.getint('search', 'year', fallback=None),
        help='limit the year')
    parser_search.add_argument(
        '--rows', type=int,
        default=config.getint('search', 'rows', fallback=20),
        help='number of rows to load')
    parser_search.add_argument(
        '--color', action="store_true",
        default=config.getboolean('search', 'color', fallback=False),
        help='if set, colored output is used')
    valid_colors = [
        'black', 'cyan', 'magenta', 'yellow', 'blue', 'green', 'red', 'white'
    ]
    parser_search.add_argument(
        '--color-doi', type=str,
        default=config.get('search', 'color-doi', fallback='red'),
        choices=valid_colors, help='color for DOIs')
    parser_search.add_argument(
        '--color-title', type=str,
        default=config.get('search', 'color-title', fallback='green'),
        choices=valid_colors, help='color for titles')
    parser_search.add_argument(
        '--color-more', type=str,
        default=config.get('search', 'color-more', fallback='blue'),
        choices=valid_colors,
        help='color for additional information such as authors, URLs, etc.')
    # receive allowed types via http://api.crossref.org/types
    allowed_types = api.get_valid_types()
    parser_search.add_argument(
        '--type', type=str, choices=allowed_types,
        default=config.get('search', 'type', fallback=None),
        help='selects a single type; allowed values are {}'.format(
            ", ".join(allowed_types)),
        metavar='')
    parser_search.set_defaults(which_parser='search')

    parser_cite = subparsers.add_parser(
        'cite',
        help='Cite article based on DOI in different citation formats',
        description="""Cite articles with a known DOI. Formatting can be done
        using the `style`-parameter and supports hundreds of different
        citation formats. A full list of supported formats can be found in
        the subfolder `API/styles.txt`. The most common ones are `apa` and
        `bibtex`.""")
    parser_cite.add_argument('identifier', type=str, help='DOI identifier')
    parser_cite.add_argument(
        '-s', '--style', type=str,
        default=config.get('cite', 'style', fallback="bibtex"),
        help='Citation style')
    parser_cite.add_argument(
        '-c', '--copy', action='store_true',
        default=config.get('cite', 'copy', fallback=False),
        help="""Copies the result to the system clipboard""")
    parser_cite.set_defaults(which_parser='cite')

    parser_download = subparsers.add_parser(
        'download',
        help='Download articles based on their DOI',
        description="""Downloads articles, if a full text version is provided
        by the authors.""")
    parser_download.add_argument('identifier', type=str, help='DOI identifier')
    parser_download.add_argument(
        '-d', '--destination', type=str,
        default=config.get('download', 'destination', fallback="."),
        help='download destination')
    parser_download.set_defaults(which_parser='download')

    parser_bulk = subparsers.add_parser(
        'bulk',
        help='Mass converting for multiple DOIs listed in a single file.',
        description="""Mass converting for multiple DOIs listed in a single
        file.""")
    parser_bulk.add_argument(
        'input', type=argparse.FileType('r'), help='input file path',
        nargs='?', default=sys.stdin)
    parser_bulk.add_argument(
        'output', type=argparse.FileType('w'), help='output file path',
        nargs='?', default=sys.stdout)
    parser_bulk.add_argument(
        '-s', '--style', type=str,
        default=config.get('bulk', 'style', fallback="bibtex"),
        help='Citation style')
    parser_bulk.set_defaults(which_parser='bulk')

    parser_service = subparsers.add_parser(
        'service',
        help='Provides service functions for the API such as rebuilding the database of valid types and styles',
        description="""Provides service functions for the API such as
        rebuilding the database of valid types and styles""")
    parser_service.add_argument(
        '--rebuild-api-types', action='store_true',
        help='Rebuild the types that are accepted on API requests')
    parser_service.add_argument(
        '--rebuild-api-styles', action='store_true',
        help='Rebuild the styles that are accepted on API requests')
    parser_service.set_defaults(which_parser='service')

    parser.add_argument(
        '-q', '--quiet', action='store_true',
        default=config.getboolean('general', 'quiet', fallback=False),
        help='turns off all unnecessary outputs; use this for scripting')
    parser.add_argument(
        '--log-level', type=str, choices=['info', 'debug'],
        default=config.get('general', 'log-level', fallback="info"),
        help='set the logging level')
    parser.add_argument(
        '--version', action="store_true", help='shows the version of doimgr')

    args = parser.parse_args()

    if args.version:
        print("doimgr version: {}".format(__version__))
        sys.exit()

    # set the logging levels according to the user's choice
    if args.quiet:
        level = logging.CRITICAL
    else:
        level = logging.INFO
        if args.log_level == 'debug':
            level = logging.DEBUG
    logging.basicConfig(level=level)
    logging.debug("doimgr version {}".format(__version__))

    if hasattr(args, 'which_parser'):
        if args.which_parser == 'search':
            logging.debug('Arguments match to perform search')
            req = Request()
            if sys.stdout.isatty():
                # only allow colors when the script's output is not redirected
                req.set_colored_output(args.color, doi=args.color_doi,
                                       title=args.color_title,
                                       more=args.color_more)
            else:
                logging.debug('Colors have been disabled due to detected output redirection')
            results = req.search(
                req.prepare_search_query(args.query, args.sort, args.order,
                                         args.year, args.type, args.rows))
            req.print_search_content(results, args.show_authors,
                                     args.show_type, args.show_publisher,
                                     args.show_url)
        elif args.which_parser == 'cite':
            logging.debug('Arguments match to request single DOI')
            # check if given style is valid
            # this is not done via argparse directly due to the amount of
            # possible parameters
            styles = api.get_valid_styles()
            if args.style not in styles:
                raise ValueError('Given style "{}" is not valid. Aborting.'.format(args.style))
            req = Request()
            result = req.citation(req.prepare_citation_query(args.identifier),
                                  style=args.style)
            req.print_citation(result)
            if args.copy:
                Clipboard.copy_to(result)
        elif args.which_parser == 'download':
            logging.debug('Arguments match to download single DOI')
            try:
                os.makedirs(os.path.expanduser(args.destination))
                logging.debug("Destination dir {} created.".format(args.destination))
            except FileExistsError:
                logging.debug("Destination dir {} already exists".format(args.destination))
            req = Request()
            links = req.get_download_links(args.identifier)
            for link in links:
                url = link.get_url()
                d = Downloader()
                filepath = d.download(
                    url, os.path.expanduser(args.destination),
                    "{}.pdf".format(args.identifier.replace("/", "_")))
                if filepath is not None:
                    logging.info("Saved file as {}".format(filepath))
            if len(links) == 0:
                logging.info("No valid download URLs found. Aborting.")
        elif args.which_parser == 'bulk':
            logging.debug('Arguments match with bulk conversion')
            # check if given style is valid
            # this is not done via argparse directly due to the amount of
            # possible parameters
            styles = api.get_valid_styles()
            if args.style not in styles:
                raise ValueError('Given style "{}" is not valid. Aborting.'.format(args.style))
            b = BulkConverter()
            if args.output == sys.stdout:
                # switch to quiet mode, since we do not want to place
                # unnecessary messages on stdout
                logging.getLogger().setLevel(logging.CRITICAL)
            b.run(args.input, args.output, style=args.style)
        elif args.which_parser == 'service':
            logging.debug('Arguments match with service call')
            if args.rebuild_api_types:
                api.rebuild_valid_identifier(api.TYPE_TYPES)
            if args.rebuild_api_styles:
                api.rebuild_valid_identifier(api.TYPE_STYLES)
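# Typical invocations of the CLI defined above (illustrative; the entry-point name
# `doimgr` is an assumption based on the ~/.doimgrrc config file it reads):
#   doimgr search "neural networks" --rows 5 --show-authors
#   doimgr cite 10.1000/182 --style bibtex --copy
#   doimgr download 10.1000/182 --destination ./papers
#   doimgr bulk dois.txt citations.bib --style apa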
def download_paper(volumn, save_dir, time_step_in_seconds=5, downloader='IDM', url=None, is_use_url=False):
    """
    download all JMLR paper files of the given volume and store them in save_dir
    :param volumn: int, JMLR volume, such as 2019
    :param save_dir: str, paper and supplement material's save path
    :param time_step_in_seconds: int, the interval time between two download requests in seconds
    :param downloader: str, the downloader to use, could be 'IDM' or 'Thunder', defaults to 'IDM'
    :param url: None or str, None means to download the volume's papers.
    :param is_use_url: bool, whether to download papers from 'url'. url couldn't be None when is_use_url is True.
    :return: True
    """
    downloader = Downloader(downloader=downloader)
    # create current dict
    title_list = []
    # paper_dict = dict()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
    }
    if not is_use_url:
        init_url = f'http://jmlr.org/papers/v{volumn}/'
        postfix = f'JMLR_v{volumn}'
        if os.path.exists(f'..\\urls\\init_url_JMLR_v{volumn}.dat'):
            with open(f'..\\urls\\init_url_JMLR_v{volumn}.dat', 'rb') as f:
                content = pickle.load(f)
        else:
            req = urllib.request.Request(url=init_url, headers=headers)
            content = urllib.request.urlopen(req, timeout=10).read()
            # content = open(f'..\\JMLR_{volumn}.html', 'rb').read()
            with open(f'..\\urls\\init_url_JMLR_v{volumn}.dat', 'wb') as f:
                pickle.dump(content, f)
    elif url is not None:
        req = urllib.request.Request(url=url, headers=headers)
        content = urllib.request.urlopen(req, timeout=10).read()
        postfix = f'JMLR'
    else:
        raise ValueError(''''url' could not be None when 'is_use_url'=True!!!''')
    # soup = BeautifulSoup(content, 'html.parser')
    soup = BeautifulSoup(content, 'html5lib')
    # soup = BeautifulSoup(open(r'..\JMLR_2011.html', 'rb'), 'html.parser')
    error_log = []
    os.makedirs(save_dir, exist_ok=True)
    if (not is_use_url) and volumn <= 4:
        paper_list = soup.find('div', {'id': 'content'}).find_all('tr')
    else:
        paper_list = soup.find('div', {'id': 'content'}).find_all('dl')
    # num_download = 5  # number of papers to download
    num_download = len(paper_list)
    for paper in tqdm(zip(paper_list, range(num_download))):
        # get title
        print('\n')
        this_paper = paper[0]
        title = slugify(this_paper.find('dt').text)
        try:
            print('Downloading paper {}/{}: {}'.format(paper[1] + 1, num_download, title))
        except:
            print(title.encode('utf8'))
        title_list.append(title)
        this_paper_main_path = os.path.join(save_dir, f'{title}_{postfix}.pdf'.replace(' ', '_'))
        if os.path.exists(this_paper_main_path):
            continue
        # get abstract page url
        links = this_paper.find_all('a')
        main_link = None
        for link in links:
            if '[pdf]' == link.text or 'pdf' == link.text:
                main_link = urllib.parse.urljoin('http://jmlr.org', link.get('href'))
                break
        # try 1 time
        # error_flag = False
        for d_iter in range(1):
            try:
                # download paper with IDM
                if not os.path.exists(this_paper_main_path) and main_link is not None:
                    downloader.download(
                        urls=main_link,
                        save_path=this_paper_main_path,
                        time_sleep_in_seconds=time_step_in_seconds)
            except Exception as e:
                # error_flag = True
                print('Error: ' + title + ' - ' + str(e))
                error_log.append((title, main_link, 'main paper download error', str(e)))
    # store the results
    # 1. store in the pickle file
    # with open(f'{postfix}_pre.dat', 'wb') as f:
    #     pickle.dump(paper_dict, f)
    # 2. write error log
    print('write error log')
    with open('..\\log\\download_err_log.txt', 'w') as f:
        for log in tqdm(error_log):
            for e in log:
                if e is not None:
                    f.write(e)
                else:
                    f.write('None')
                f.write('\n')
            f.write('\n')
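# Example invocation (illustrative only; volume number and paths are placeholders):
# download_paper(volumn=20, save_dir=r'..\JMLR_v20', time_step_in_seconds=5, downloader='IDM')
# To download from an arbitrary JMLR listing page instead, pass url=... together with is_use_url=True.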
def download_from_csv(postfix, save_dir, csv_file_path, is_download_main_paper=True, is_download_supplement=True,
                      time_step_in_seconds=5, total_paper_number=None, downloader='IDM'):
    """
    download paper and supplement files and save them to save_dir/main_paper and save_dir/supplement respectively
    :param postfix: str, postfix that will be added at the end of papers' titles
    :param save_dir: str, paper and supplement material's save path
    :param csv_file_path: str, the full path to the csv file
    :param is_download_main_paper: bool, True for downloading the main paper
    :param is_download_supplement: bool, True for downloading supplemental material
    :param time_step_in_seconds: int, the interval time between two download requests in seconds
    :param total_paper_number: int, the total number of papers that are going to be downloaded
    :param downloader: str, the downloader to use, could be 'IDM' or 'Thunder', defaults to 'IDM'.
    :return: True
    """
    downloader = Downloader(downloader=downloader)
    if not os.path.exists(csv_file_path):
        raise ValueError(f'ERROR: file not found in {csv_file_path}!!!')

    main_save_path = os.path.join(save_dir, 'main_paper')
    if is_download_main_paper:
        os.makedirs(main_save_path, exist_ok=True)
    if is_download_supplement:
        supplement_save_path = os.path.join(save_dir, 'supplement')
        os.makedirs(supplement_save_path, exist_ok=True)

    error_log = []
    with open(csv_file_path, newline='') as csvfile:
        myreader = csv.DictReader(csvfile, delimiter=',')
        pbar = tqdm(myreader)
        i = 0
        for this_paper in pbar:
            is_grouped = ('group' in this_paper)
            i += 1
            # get title
            if is_grouped:
                group = slugify(this_paper['group'])
            title = slugify(this_paper['title'])
            if total_paper_number is not None:
                pbar.set_description(f'Downloading paper {i}/{total_paper_number}')
            else:
                pbar.set_description(f'Downloading paper {i}')
            this_paper_main_path = os.path.join(main_save_path, f'{title}_{postfix}.pdf')
            if is_grouped:
                this_paper_main_path = os.path.join(main_save_path, group, f'{title}_{postfix}.pdf')
            if is_download_supplement:
                this_paper_supp_path_no_ext = os.path.join(supplement_save_path, f'{title}_{postfix}_supp.')
                if is_grouped:
                    this_paper_supp_path_no_ext = os.path.join(
                        supplement_save_path, group, f'{title}_{postfix}_supp.')
                if '' != this_paper['supplemental link'] and os.path.exists(this_paper_main_path) and \
                        (os.path.exists(this_paper_supp_path_no_ext + 'zip') or
                         os.path.exists(this_paper_supp_path_no_ext + 'pdf')):
                    continue
                elif '' == this_paper['supplemental link'] and os.path.exists(this_paper_main_path):
                    continue
            elif os.path.exists(this_paper_main_path):
                continue
            if 'error' == this_paper['main link']:
                error_log.append((title, 'no MAIN link'))
            elif '' != this_paper['main link']:
                if is_grouped:
                    if is_download_main_paper:
                        os.makedirs(os.path.join(main_save_path, group), exist_ok=True)
                    if is_download_supplement:
                        os.makedirs(os.path.join(supplement_save_path, group), exist_ok=True)
                if is_download_main_paper:
                    try:
                        # download paper with IDM
                        if not os.path.exists(this_paper_main_path):
                            downloader.download(
                                urls=this_paper['main link'].replace(' ', '%20'),
                                save_path=os.path.join(os.getcwd(), this_paper_main_path),
                                time_sleep_in_seconds=time_step_in_seconds)
                    except Exception as e:
                        # error_flag = True
                        print('Error: ' + title + ' - ' + str(e))
                        error_log.append((title, this_paper['main link'], 'main paper download error', str(e)))
                # download supp
                if is_download_supplement:
                    # check whether the supp has already been downloaded
                    if not (os.path.exists(this_paper_supp_path_no_ext + 'zip') or
                            os.path.exists(this_paper_supp_path_no_ext + 'pdf')):
                        if 'error' == this_paper['supplemental link']:
                            error_log.append((title, 'no SUPPLEMENTAL link'))
                        elif '' != this_paper['supplemental link']:
                            supp_type = this_paper['supplemental link'].split('.')[-1]
                            try:
                                downloader.download(
                                    urls=this_paper['supplemental link'],
                                    save_path=os.path.join(os.getcwd(), this_paper_supp_path_no_ext + supp_type),
                                    time_sleep_in_seconds=time_step_in_seconds)
                            except Exception as e:
                                # error_flag = True
                                print('Error: ' + title + ' - ' + str(e))
                                error_log.append((title, this_paper['supplemental link'],
                                                  'supplement download error', str(e)))
    # 2. write error log
    print('write error log')
    with open('..\\log\\download_err_log.txt', 'w') as f:
        for log in tqdm(error_log):
            for e in log:
                if e is not None:
                    f.write(e)
                else:
                    f.write('None')
                f.write('\n')
            f.write('\n')
    return True
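# Example invocation (illustrative only): the csv file is expected to provide at least
# 'title', 'main link' and 'supplemental link' columns (plus an optional 'group'
# column), as read by the DictReader above. The postfix and paths are placeholders.
# download_from_csv(
#     postfix='CVPR_2021',
#     save_dir=r'..\CVPR_2021',
#     csv_file_path=r'..\csv\CVPR_2021.csv',
#     is_download_main_paper=True,
#     is_download_supplement=True,
#     time_step_in_seconds=5)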
def download_paper(year, save_dir, is_download_supplement=True, time_step_in_seconds=5, downloader='IDM'):
    """
    download all ICML paper and supplement files of the given year and save them in save_dir/main_paper and
    save_dir/supplement respectively
    :param year: int, ICML year, such as 2019
    :param save_dir: str, paper and supplement material's save path
    :param is_download_supplement: bool, True for downloading supplemental material
    :param time_step_in_seconds: int, the interval time between two download requests in seconds
    :param downloader: str, the downloader to use, could be 'IDM' or 'Thunder', defaults to 'IDM'
    :return: True
    """
    downloader = Downloader(downloader=downloader)
    ICML_year_dict = {
        2021: 139,
        2020: 119,
        2019: 97,
        2018: 80,
        2017: 70,
        2016: 48,
        2015: 37,
        2014: 32,
        2013: 28
    }
    if year >= 2013:
        init_url = f'http://proceedings.mlr.press/v{ICML_year_dict[year]}/'
    elif year == 2012:
        init_url = 'https://icml.cc/2012/papers.1.html'
    elif year == 2011:
        init_url = 'http://www.icml-2011.org/papers.php'
    elif 2009 == year:
        init_url = 'https://icml.cc/Conferences/2009/abstracts.html'
    elif 2008 == year:
        init_url = 'http://www.machinelearning.org/archive/icml2008/abstracts.shtml'
    elif 2007 == year:
        init_url = 'https://icml.cc/Conferences/2007/paperlist.html'
    elif year in [2006, 2004, 2005]:
        init_url = f'https://icml.cc/Conferences/{year}/proceedings.html'
    elif 2003 == year:
        init_url = 'https://aaai.org/Library/ICML/icml03contents.php'
    else:
        raise ValueError('''the given year's url is unknown!''')
    postfix = f'ICML_{year}'
    if os.path.exists(f'..\\urls\\init_url_icml_{year}.dat'):
        with open(f'..\\urls\\init_url_icml_{year}.dat', 'rb') as f:
            content = pickle.load(f)
    else:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
        }
        req = urllib.request.Request(url=init_url, headers=headers)
        content = urllib.request.urlopen(req).read()
        # content = open(f'..\\ICML_{year}.html', 'rb').read()
        with open(f'..\\urls\\init_url_icml_{year}.dat', 'wb') as f:
            pickle.dump(content, f)
    # soup = BeautifulSoup(content, 'html.parser')
    soup = BeautifulSoup(content, 'html5lib')
    # soup = BeautifulSoup(open(r'..\ICML_2011.html', 'rb'), 'html.parser')
    error_log = []
    if year >= 2013:
        if year in ICML_year_dict.keys():
            volume = f'v{ICML_year_dict[year]}'
        else:
            raise ValueError('''the given year's url is unknown!''')
        pmlr.download_paper_given_volume(
            volume=volume,
            save_dir=save_dir,
            postfix=postfix,
            is_download_supplement=is_download_supplement,
            time_step_in_seconds=time_step_in_seconds,
            downloader=downloader.downloader)
    elif 2012 == year:
        # base_url = f'https://icml.cc/{year}/'
        paper_list_bar = tqdm(soup.find_all('div', {'class': 'paper'}))
        paper_index = 0
        for paper in paper_list_bar:
            paper_index += 1
            title = slugify(paper.find('h2').text)
            link = None
            for a in paper.find_all('a'):
                if 'ICML version (pdf)' == a.text:
                    link = urllib.parse.urljoin(init_url, a.get('href'))
                    break
            if link is not None:
                this_paper_main_path = os.path.join(save_dir, f'{title}_{postfix}.pdf'.replace(' ', '_'))
                paper_list_bar.set_description(f'find paper {paper_index}:{title}')
                if not os.path.exists(this_paper_main_path):
                    paper_list_bar.set_description(f'downloading paper {paper_index}:{title}')
                    downloader.download(
                        urls=link,
                        save_path=this_paper_main_path,
                        time_sleep_in_seconds=time_step_in_seconds)
            else:
                error_log.append((title, 'no main link error'))
    elif 2011 == year:
        paper_list_bar = tqdm(soup.find_all('a'))
        paper_index = 0
        for paper in paper_list_bar:
            h3 = paper.find('h3')
            if h3 is not None:
                title = slugify(h3.text)
                paper_index += 1
            if 'download' == slugify(paper.text.strip()):
                link = paper.get('href')
                link = urllib.parse.urljoin(init_url, link)
                if link is not None:
                    this_paper_main_path = os.path.join(save_dir, f'{title}_{postfix}.pdf'.replace(' ', '_'))
                    paper_list_bar.set_description(f'find paper {paper_index}:{title}')
                    if not os.path.exists(this_paper_main_path):
                        paper_list_bar.set_description(f'downloading paper {paper_index}:{title}')
                        downloader.download(
                            urls=link,
                            save_path=this_paper_main_path,
                            time_sleep_in_seconds=time_step_in_seconds)
                else:
                    error_log.append((title, 'no main link error'))
    elif year in [2009, 2008]:
        if 2009 == year:
            paper_list_bar = tqdm(soup.find('div', {'id': 'right_column'}).find_all(['h3', 'a']))
        elif 2008 == year:
            paper_list_bar = tqdm(soup.find('div', {'class': 'content'}).find_all(['h3', 'a']))
        paper_index = 0
        title = None
        for paper in paper_list_bar:
            if 'h3' == paper.name:
                title = slugify(paper.text)
                paper_index += 1
            elif 'full-paper' == slugify(paper.text.strip()):  # a
                link = paper.get('href')
                if link is not None and title is not None:
                    link = urllib.parse.urljoin(init_url, link)
                    this_paper_main_path = os.path.join(save_dir, f'{title}_{postfix}.pdf')
                    paper_list_bar.set_description(f'find paper {paper_index}:{title}')
                    if not os.path.exists(this_paper_main_path):
                        paper_list_bar.set_description(f'downloading paper {paper_index}:{title}')
                        downloader.download(
                            urls=link,
                            save_path=this_paper_main_path,
                            time_sleep_in_seconds=time_step_in_seconds)
                    title = None
                else:
                    error_log.append((title, 'no main link error'))
    elif year in [2006, 2005]:
        paper_list_bar = tqdm(soup.find_all('a'))
        paper_index = 0
        for paper in paper_list_bar:
            title = slugify(paper.text.strip())
            link = paper.get('href')
            paper_index += 1
            if link is not None and title is not None and ('pdf' == link[-3:] or 'ps' == link[-2:]):
                link = urllib.parse.urljoin(init_url, link)
                this_paper_main_path = os.path.join(save_dir, f'{title}_{postfix}.pdf'.replace(' ', '_'))
                paper_list_bar.set_description(f'find paper {paper_index}:{title}')
                if not os.path.exists(this_paper_main_path):
                    paper_list_bar.set_description(f'downloading paper {paper_index}:{title}')
                    downloader.download(
                        urls=link,
                        save_path=this_paper_main_path,
                        time_sleep_in_seconds=time_step_in_seconds)
    elif 2004 == year:
        paper_index = 0
        paper_list_bar = tqdm(soup.find('table', {'class': 'proceedings'}).find_all('tr'))
        title = None
        for paper in paper_list_bar:
            tr_class = None
            try:
                tr_class = paper.get('class')[0]
            except:
                pass
            if 'proc_2004_title' == tr_class:  # title
                title = slugify(paper.text.strip())
                paper_index += 1
            else:
                for a in paper.find_all('a'):
                    if '[Paper]' == a.text:
                        link = a.get('href')
                        if link is not None and title is not None:
                            link = urllib.parse.urljoin(init_url, link)
                            this_paper_main_path = os.path.join(
                                save_dir, f'{title}_{postfix}.pdf'.replace(' ', '_'))
                            paper_list_bar.set_description(f'find paper {paper_index}:{title}')
                            if not os.path.exists(this_paper_main_path):
                                paper_list_bar.set_description(f'downloading paper {paper_index}:{title}')
                                downloader.download(
                                    urls=link,
                                    save_path=this_paper_main_path,
                                    time_sleep_in_seconds=time_step_in_seconds)
                        break
    elif 2003 == year:
        paper_index = 0
        paper_list_bar = tqdm(soup.find('div', {'id': 'content'}).find_all('p', {'class': 'left'}))
        for paper in paper_list_bar:
            abs_link = None
            title = None
            link = None
            for a in paper.find_all('a'):
                abs_link = urllib.parse.urljoin(init_url, a.get('href'))
                if abs_link is not None:
                    title = slugify(a.text.strip())
                    break
            if title is not None:
                paper_index += 1
                this_paper_main_path = os.path.join(save_dir, f'{title}_{postfix}.pdf'.replace(' ', '_'))
                paper_list_bar.set_description(f'find paper {paper_index}:{title}')
                if not os.path.exists(this_paper_main_path):
                    if abs_link is not None:
                        headers = {
                            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
                        }
                        req = urllib.request.Request(url=abs_link, headers=headers)
                        abs_content = None
                        for i in range(3):
                            try:
                                abs_content = urllib.request.urlopen(req, timeout=10).read()
                                break
                            except Exception as e:
                                if i == 2:
                                    print('error' + title + str(e))
                                    error_log.append((title, abs_link, 'download error', str(e)))
                        if abs_content is None:
                            # skip this paper if the abstract page could not be fetched
                            continue
                        abs_soup = BeautifulSoup(abs_content, 'html5lib')
                        for a in abs_soup.find_all('a'):
                            try:
                                if 'pdf' == a.get('href')[-3:]:
                                    link = urllib.parse.urljoin(abs_link, a.get('href'))
                                    if link is not None:
                                        paper_list_bar.set_description(
                                            f'downloading paper {paper_index}:{title}')
                                        downloader.download(
                                            urls=link,
                                            save_path=this_paper_main_path,
                                            time_sleep_in_seconds=time_step_in_seconds)
                                    break
                            except:
                                pass
    # write error log
    print('write error log')
    with open('..\\log\\download_err_log.txt', 'w') as f:
        for log in tqdm(error_log):
            for e in log:
                if e is not None:
                    f.write(e)
                else:
                    f.write('None')
                f.write('\n')
            f.write('\n')
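# Example invocation (illustrative only): ICML 2019 resolves to PMLR volume 97 via
# ICML_year_dict and is delegated to pmlr.download_paper_given_volume; the save
# directory is a placeholder.
# download_paper(year=2019, save_dir=r'..\ICML_2019', is_download_supplement=True, time_step_in_seconds=5)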
def download_paper_given_volume(volume, save_dir, postfix, is_download_supplement=True, time_step_in_seconds=5,
                                downloader='IDM'):
    """
    download main and supplement papers from PMLR.
    :param volume: str, such as 'v1', 'r1'
    :param save_dir: str, paper and supplement material's save path
    :param postfix: str, the postfix that will be appended to the end of papers' titles
    :param is_download_supplement: bool, True for downloading supplemental material
    :param time_step_in_seconds: int, the interval time between two download requests in seconds
    :param downloader: str, the downloader to use, could be 'IDM' or 'Thunder', defaults to 'IDM'
    :return: True
    """
    downloader = Downloader(downloader=downloader)
    init_url = f'http://proceedings.mlr.press/{volume}/'
    if is_download_supplement:
        main_save_path = os.path.join(save_dir, 'main_paper')
        supplement_save_path = os.path.join(save_dir, 'supplement')
        os.makedirs(main_save_path, exist_ok=True)
        os.makedirs(supplement_save_path, exist_ok=True)
    else:
        main_save_path = save_dir
        os.makedirs(main_save_path, exist_ok=True)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
    }
    req = urllib.request.Request(url=init_url, headers=headers)
    content = urllib.request.urlopen(req).read()
    soup = BeautifulSoup(content, 'html.parser')
    paper_list = soup.find_all('div', {'class': 'paper'})
    error_log = []
    title_list = []
    num_download = len(paper_list)
    pbar = tqdm(zip(paper_list, range(num_download)))
    for paper in pbar:
        # get title
        this_paper = paper[0]
        title = slugify(this_paper.find_all('p', {'class': 'title'})[0].text)
        try:
            pbar.set_description(f'Downloading paper {paper[1] + 1}/{num_download}: {title}')
        except:
            pbar.set_description(f'''Downloading paper {paper[1] + 1}/{num_download}: {title.encode('utf8')}''')
        title_list.append(title)
        this_paper_main_path = os.path.join(main_save_path, f'{title}_{postfix}.pdf')
        if is_download_supplement:
            this_paper_supp_path = os.path.join(supplement_save_path, f'{title}_{postfix}_supp.pdf')
            this_paper_supp_path_no_ext = os.path.join(supplement_save_path, f'{title}_{postfix}_supp.')
            if os.path.exists(this_paper_main_path) and os.path.exists(this_paper_supp_path):
                continue
        else:
            if os.path.exists(this_paper_main_path):
                continue
        # get abstract page url
        links = this_paper.find_all('p', {'class': 'links'})[0].find_all('a')
        supp_link = None
        main_link = None
        for link in links:
            if 'Download PDF' == link.text or 'pdf' == link.text:
                main_link = link.get('href')
            elif is_download_supplement and ('Supplementary PDF' == link.text or
                                             'Supplementary Material' == link.text or
                                             'supplementary' == link.text):
                supp_link = link.get('href')
                if supp_link[-3:] != 'pdf':
                    this_paper_supp_path = this_paper_supp_path_no_ext + supp_link[-3:]
        # try 1 time
        # error_flag = False
        for d_iter in range(1):
            try:
                # download paper with IDM
                if not os.path.exists(this_paper_main_path) and main_link is not None:
                    downloader.download(
                        urls=main_link,
                        save_path=this_paper_main_path,
                        time_sleep_in_seconds=time_step_in_seconds)
            except Exception as e:
                # error_flag = True
                print('Error: ' + title + ' - ' + str(e))
                error_log.append((title, main_link, 'main paper download error', str(e)))
        # download supp
        if is_download_supplement:
            # check whether the supp can be downloaded
            if not os.path.exists(this_paper_supp_path) and supp_link is not None:
                try:
                    downloader.download(
                        urls=supp_link,
                        save_path=this_paper_supp_path,
                        time_sleep_in_seconds=time_step_in_seconds)
                except Exception as e:
                    # error_flag = True
                    print('Error: ' + title + ' - ' + str(e))
                    error_log.append((title, supp_link, 'supplement download error', str(e)))
    # write error log
    print('writing error log...')
    with open('..\\log\\download_err_log.txt', 'w') as f:
        for log in tqdm(error_log):
            for e in log:
                if e is not None:
                    f.write(e)
                else:
                    f.write('None')
                f.write('\n')
            f.write('\n')
    return True
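# Example invocation (illustrative only): volume 'v97' corresponds to ICML 2019 on
# proceedings.mlr.press; the save directory and postfix are placeholders.
# download_paper_given_volume(volume='v97', save_dir=r'..\ICML_2019', postfix='ICML_2019', is_download_supplement=True)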