def __download_from_springer(url, save_dir, year, is_workshops=False, time_sleep_in_seconds=5, downloader='IDM'):
    """Download all ECCV (or ECCV workshop) papers listed on a Springer page.

    :param url: str, Springer page that lists the papers
    :param save_dir: str, directory the PDFs are saved into
    :param year: int, ECCV year, used in the file-name postfix
    :param is_workshops: bool, True for workshop papers (changes the postfix)
    :param time_sleep_in_seconds: int, pause between two download requests
    :param downloader: str, downloader backend name, e.g. 'IDM'
    :raises RuntimeError: when the paper list cannot be fetched after 3 tries
    """
    downloader = Downloader(downloader)
    papers_dict = None
    for i in range(3):  # the page scrape is flaky; retry up to 3 times
        try:
            papers_dict = springer.get_paper_name_link_from_url(url)
            break
        except Exception as e:
            print(str(e))
    if papers_dict is None:
        # Fix: the original fell through with an undefined name after three
        # failed attempts and crashed with NameError below; fail loudly.
        raise RuntimeError(f'could not fetch the paper list from {url}')
    pbar = tqdm(papers_dict.keys())
    postfix = f'ECCV_WS_{year}' if is_workshops else f'ECCV_{year}'
    for name in pbar:
        pbar.set_description(f'Downloading paper {name}')
        save_path = os.path.join(save_dir, f'{name}_{postfix}.pdf')
        if not os.path.exists(save_path):
            downloader.download(papers_dict[name], save_path,
                                time_sleep_in_seconds)
class Analytics:
    """Facade that wires the download/parse/analyse pipeline together."""

    def __init__(self, config):
        self.config = config
        self.downloader = Downloader(config)
        self.parser = Parser(config)
        self.analyser = Analyser(config)

    def generate_report(self):
        """Run the full pipeline and hand back the (possibly updated) config.

        Downloading only happens when the config asks for it; parsing and
        analysis always run.
        """
        wants_download = self.config.download
        if wants_download:
            self.downloader.download()
        self.parser.parse()
        self.analyser.analyse()
        return self.config
def mac_install(self):
    """Download, mount and install VirtualBox on macOS, then clean up.

    Progress is reported through the progress-bar and message callbacks;
    the sequence can be cancelled through ``self.abort``.
    """
    log.info("Downloading virtualbox")
    self.init_progres_bar("Downloading virtualbox...")
    # start download
    path_to_vbox_distr = Dl.get(url=self.config["vbox_download"],
                                dest_path=path.expandvars(
                                    self.config["downloads_path"]),
                                on_update=self.update_progres_bar,
                                abort=self.abort)
    self.finalize_progres_bar()
    self.message("Download completed. Mounting...")
    log.info("Installing virtualbox")
    # Bail out early when the user cancelled during the download.
    if self.abort.is_set():
        log.debug("Vbox intallation aborted.")
        return
    self.on_configuration_in_progress(True)
    self.mount_vbox_distro(path_to_vbox_distr)  # OS SPECIFIC!!!
    # When updating, remove the previously installed version first.
    if self.update:
        self.uninstall_vbox()
        self.message("Virtualbox old version is uninstalled")
    self.message("Mounted")
    self.message("Installing Virtualbox")
    self._install_vbox_dar_win(path_to_vbox_distr)
    self.message("Installed. Unmounting vbox distro")
    self.unmount_vbox_distro(self.config["vbox_distro_mountpoint"])
    self.message("Unmounted. Removing distro")
    self.delete_vbox_distro(path_to_vbox_distr)
    self.message("Distro removed.")
    self.complete(True, "")
def _linux_install_download(self):
    """Fetch the VirtualBox distributable and remember where it landed.

    The resulting path is stored on ``self.path_to_vbox_distr`` for the
    subsequent install step.
    """
    log.debug("Downloading virtualbox")
    self.init_progres_bar("Downloading virtualbox...")
    download_url = self.config["vbox_download"]
    destination = get_full_path(self.config["downloads_path"])
    self.path_to_vbox_distr = Dl.get(url=download_url,
                                     dest_path=destination,
                                     on_update=self.update_progres_bar,
                                     abort=self.abort)
    self.finalize_progres_bar()
    self.message("Download completed. Installing...")
class Analytics:
    """Facade wiring the download/parse/analyse pipeline together."""

    def __init__(self, config):
        self.config = config
        self.downloader = Downloader(config)
        self.parser = Parser(config)
        self.analyser = Analyser(config)

    def download_logs(self):
        """Fetch the raw logs when downloading is enabled in the config."""
        # Fix: removed a stray debug print("here") left over from development.
        if self.config.download:
            self.downloader.download()

    def generate_report(self):
        """Parse the (already downloaded) logs and run the analysis.

        Downloading is handled separately by download_logs(); this method
        deliberately does not download (the old inline download was removed).
        """
        self.parser.parse()
        self.analyser.analyse()
        return self.config
def __init__(self, target):
    """Chapter-importer dialog operating under *target*.

    :param target: chapter node the import is rooted at -- presumably a
        node of the main window's chapter tree; TODO confirm with caller.
    """
    super(DialogImporter, self).__init__()
    loadUi('ui/chapterImporter.ui', self)
    self.root = target            # fixed root of the import
    self.currentChapter = target  # cursor that moves as chapters are added
    self.chapterList = []
    self.downloader = Downloader().get  # bound download function
    self.initUi()
    self.initSignal()
def make_filename(data, num, dl_title=None):
    """Build a file-system-safe (code, title) pair for request *num*.

    :param data: dict, request records keyed by number
    :param num: key into *data*
    :param dl_title: str or None, title of an already-downloaded file; when
        given, discrepancies are appended to the module-level
        ``title_differences`` report
    :return: (found, code, title); ``(False, num, None)`` when *num* is
        not present in *data*
    """
    if num not in data:
        return False, num, None
    req_data = data[num]
    code = Downloader.to_filename(
        f"{req_data['voting_number']} {req_data['card_code']}")
    title = Downloader.to_filename(req_data['voting_title'])
    if dl_title:
        global title_differences
        if req_data['voting_title'] != title or req_data[
                'voting_title'] != dl_title:
            title_differences += f"\n{code}" \
                                 f"\nDownloaded: {dl_title}" \
                                 f"\nExtracted: {title}" \
                                 f"\nReal: {req_data['voting_title']}"
    sound_start = 'Неизвестно'
    if req_data['sound_start']:
        # Map the verbose questionnaire answer to a short tag.
        # Fix: use .get() so an unrecognised answer falls back to
        # 'Неизвестно' instead of raising KeyError.
        sound_start = {
            'Трек запускается сразу после объявления (выход из за кулис под играющий трек)':
                'Сразу',
            'Трек запускается после выхода на сцену (с точки, без реквизита)':
                'С точки',
            'Трек запускается сразу после выноса реквизита (когда на сцене никого)':
                'Стафф,Сразу',
            'Трек запускается после выноса реквизита и выхода на точку':
                'Стафф,ТЧК',
            'Трек содержит превью (выход из за кулис во время превью танца)':
                'Превью'
        }.get(req_data['sound_start'], 'Неизвестно')
    elif req_data['card_code'][0] == 'V':  # Videos: always start immediately
        sound_start = 'Сразу'
    title = f"[{sound_start}] {title}"
    title = Downloader.to_filename(f"{title} №{req_data['№']:d}")
    return True, code, title
def make_filename(data, num, dl_title=None):
    """Build a file-system-safe (code, title) pair for request *num*.

    :param data: dict, request records keyed by number
    :param num: key into *data*
    :param dl_title: str or None, title of an already-downloaded file; when
        given, discrepancies are appended to the module-level
        ``title_differences`` report
    :return: (found, code, title); ``(False, num, None)`` when *num* is
        not present in *data*
    """
    if num not in data:
        return False, num, None
    req_data = data[num]
    code = Downloader.to_filename(
        f"{req_data['voting_number']} {req_data['card_code']}")
    title = Downloader.to_filename(req_data['voting_title'])
    if dl_title:
        global title_differences
        if req_data['voting_title'] != title or req_data[
                'voting_title'] != dl_title:
            title_differences += f"\n{code}" \
                                 f"\nDownloaded: {dl_title}" \
                                 f"\nExtracted: {title}" \
                                 f"\nReal: {req_data['voting_title']}"
    sound_start = 'Неизвестно'
    if req_data['sound_start']:
        # Map the verbose questionnaire answer to a short tag.
        # Fix: use .get() so an unrecognised answer falls back to
        # 'Неизвестно' instead of raising KeyError.
        sound_start = {
            'Трек начинается до выхода на сцену': 'Сразу',
            'Трек начинается до выхода на сцену (выход из за кулис под музыку)':
                'Сразу',
            'Трек начинается после выхода на сцену (без реквизита)':
                'С точки',
            'Трек начинается после выхода на сцену (начало с точки)':
                'С точки',
            'Трек содержит превью (выход из за кулис во время превью танца)':
                'Превью',
            'Трек начинается после выноса реквизита и подготовки': 'Стафф'
        }.get(req_data['sound_start'], 'Неизвестно')
    elif req_data['card_code'][0] == 'V':  # Videos: always start immediately
        sound_start = 'Сразу'
    title = f"[{sound_start}] {title}"
    title = Downloader.to_filename(f"{title} №{req_data['№']:d}")
    return True, code, title
def win_install(self):
    """Download the VirtualBox distributable and install it on Windows."""
    self.init_progres_bar("Downloading virtualbox...")
    # start download
    download_url = self.config["vbox_download"]
    downloads_dir = path.expandvars(self.config["downloads_path"])
    installer_path = Dl.get(url=download_url,
                            dest_path=downloads_dir,
                            on_update=self.update_progres_bar,
                            abort=self.abort)
    self.finalize_progres_bar()
    self.message("Download completed. Installing...")
    self.on_configuration_in_progress(True)
    self._install_vbox_windows(installer_path)
    self.message("Instalation complete!")
    self.complete(True, "")
def saveAs(self):
    # Export the e-book and save it as an EPUB file.
    filePath, fileType = QFileDialog.getSaveFileName(
        parent=self,
        caption="导出Epub电子书",
        filter="Epub Files (*.epub)")  # extension filter; separate multiple filters with double semicolons
    if not filePath:
        # User cancelled the save dialog.
        return
    self.statusBar.hide()
    self.progressBar.setValue(0)
    self.progressBar.show()
    try:
        # Create the EPUB book.
        book = EBook(title=self.bookTitle.text())
        # Attach the downloader (bound download function).
        book.downloader = Downloader().get
        # Add the book's authors (comma-separated field).
        for author in self.bookAuthor.text().split(','):
            if author.strip():
                book.add_author(author.strip())
        # Add the cover and the page stylesheet.
        book.set_cover(self.cover_path)
        if os.path.isfile(self.style_path):
            book.set_css(self.style_path)
        self.progressBar.setValue(5)
        # Add the chapter contents.
        self.outputChapters(chapter=self.epub.root, target=book, root=True)
        # Save to the chosen file.
        book.save_as(filePath)
        self.progressBar.setValue(100)
        QMessageBox.information(self, '保存完毕',
                                '当前书籍内容已保存至以下文件:\r\n' + filePath,
                                QMessageBox.Ok)
    except Exception as e:
        QMessageBox.critical(self, '错误',
                             '保存Epub书籍时出现错误:\r\n' + e.args[0],
                             QMessageBox.Ok)
    self.progressBar.hide()
    self.statusBar.show()
def main():
    """Entry point: wire up services and run the display loop forever.

    Starts the heartbeat and websocket threads, then loops: (re)runs the
    scheduler when the playlist is exhausted/broken and broadcasts slides.
    """
    global scheduler, schedulerThread, downloader
    setup()
    downloader = Downloader()
    scheduler = Scheduler()
    schedulerThread = None
    Pusher()
    t = threading.Thread(target=send_heartbeat)
    t.daemon = True
    t.start()
    t = threading.Thread(target=websocket_server)
    t.daemon = True
    t.start()
    watchdog_thread = threading.Event()
    notify_systemd(watchdog_thread)
    if is_under_voltage():
        browser_template('under_voltage')
        sleep(5)
    logging.debug('Entering infinite loop.')
    while True:
        # Re-run the scheduler when there are no slides, the playlist is
        # about to wrap around, or the scheduler reported a bad state.
        if not scheduler.slides or len(
                scheduler.slides
        ) - 1 == scheduler.index or scheduler.state != scheduler.STATE_OK:
            schedulerThread = threading.Thread(target=run_scheduler)
            schedulerThread.start()
        # Fix: Thread.isAlive() was removed in Python 3.9; use is_alive().
        if not scheduler.slides and schedulerThread.is_alive():
            wait_for_scheduler()
        broadcast_loop(scheduler)
        # Fix: 'scheduler.index is 0' compared object identity, which is
        # implementation-dependent for ints; compare by value instead.
        if scheduler.index == 0 and schedulerThread and \
                schedulerThread.is_alive():
            wait_for_scheduler()
def __init__(self):
    """Main window: load the UI, set defaults, and read the saved config."""
    super(ApplicationWindow, self).__init__()
    loadUi('ui/mainWindow.ui', self)
    # Bundled template resources and the persisted configuration file.
    self.cover_path = 'template/cover.jpg'
    self.style_path = 'template/style.css'
    self.config_path = 'config.json'
    # Default configuration values (overridden by loadConfig below).
    self.config = {
        'httpProxyEnable': False,
        'httpProxy': {
            'http': 'http://127.0.0.1:1080',
            'https': 'http://127.0.0.1:1080'
        }
    }
    self.__downloader = Downloader()
    self.downloader = self.__downloader.get  # bound download function
    self.initUi()
    self.initSignal()
    self.loadConfig(self.config_path)
def download_iclr_spotlight_papers(save_dir, driver_path, year, base_url=None,
                                   time_step_in_seconds=10, downloader='IDM'):
    """
    Download the ICLR spotlight papers of a given year from openreview.net.

    :param save_dir: str, paper save path
    :param driver_path: str, 'chromedriver.exe' full pathname
    :param year: int, iclr year, current only support year >= 2018
    :param base_url: str, paper website url
    :param time_step_in_seconds: int, the interval time between two downlaod
        request in seconds
    :param downloader: str, the downloader to download, could be 'IDM' or
        'Thunder', default to 'IDM'
    :return: None
    """
    downloader = Downloader(downloader=downloader)
    if base_url is None:
        if year >= 2021:
            # Fix: the URL hard-coded 2021 for every year >= 2021; build it
            # from the requested year instead.
            base_url = f'https://openreview.net/group?id=ICLR.cc/{year}/Conference#spotlight-presentations'
        elif year == 2020:
            base_url = 'https://openreview.net/group?id=ICLR.cc/2020/Conference#accept-spotlight'
        else:
            raise ValueError('the website url is not given for this year!')
    first_poster_index = {'2017': 15}
    paper_postfix = f'ICLR_{year}'
    error_log = []
    driver = webdriver.Chrome(driver_path)
    try:
        driver.get(base_url)
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        # wait for the note list to become visible
        print('Starting web driver wait...')
        wait = WebDriverWait(driver, 20)
        print('Starting web driver wait... finished')
        res = wait.until(EC.presence_of_element_located((By.ID, "notes")))
        print("Successful load the website!->", res)
        res = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "note")))
        print("Successful load the website notes!->", res)
        # parse the results; the page layout differs per year
        if year >= 2021:
            divs = driver.find_elements_by_xpath(
                '//*[@id="spotlight-presentations"]/ul/li')
        elif year == 2020:
            divs = driver.find_elements_by_xpath('//*[@id="accept-spotlight"]/ul/li')
        else:
            divs = driver.find_elements_by_class_name(
                'note')[:first_poster_index[str(year)]]
        num_papers = len(divs)
        print('found number of papers:', num_papers)
        for index, paper in enumerate(divs):
            a_hrefs = paper.find_elements_by_tag_name("a")
            if year >= 2018:
                name = slugify(a_hrefs[0].text.strip())
                link = a_hrefs[1].get_attribute('href')
            else:
                name = slugify(
                    paper.find_element_by_class_name('note_content_title').text)
                link = paper.find_element_by_class_name(
                    'note_content_pdf').get_attribute('href')
            print('Downloading paper {}/{}: {}'.format(index + 1, num_papers, name))
            pdf_name = name + '_' + paper_postfix + '.pdf'
            if not os.path.exists(os.path.join(save_dir, pdf_name)):
                success_flag = False
                for d_iter in range(1):  # single attempt; bump range to retry
                    try:
                        downloader.download(
                            urls=link,
                            save_path=os.path.join(save_dir, pdf_name),
                            time_sleep_in_seconds=time_step_in_seconds
                        )
                        success_flag = True
                        break
                    except Exception as e:
                        print('Error: ' + name + ' - ' + str(e))
                if not success_flag:
                    error_log.append((name, link))
    finally:
        # Fix: close the browser even when scraping or downloading raises,
        # so failed runs do not leak a chromedriver process.
        driver.close()
    # write error log
    print('write error log')
    with open('..\\log\\download_err_log.txt', 'w') as f:
        for log in tqdm(error_log):
            for e in log:
                f.write(e)
                f.write('\n')
            f.write('\n')
def main(argv):
    """Parse the command line and dispatch to the doimgr sub-commands.

    Sub-commands: search, cite, download, bulk, service. Defaults for most
    options can be pre-seeded from ~/.doimgrrc.

    :param argv: command-line arguments (the parser reads sys.argv itself)
    """
    config = configparser.ConfigParser()
    config_path = os.path.expanduser(os.path.join("~", ".doimgrrc"))
    if os.path.isfile(config_path):
        config.read(config_path)

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='Command line based tool to request DOI data and convert \
it to BibTex entries.')
    subparsers = parser.add_subparsers()

    # --- search sub-command --------------------------------------------
    parser_search = subparsers.add_parser(
        'search',
        help='Search database for published articles to find relevant DOIs',
        description="""Searches database for published articles. This can be
        used to find a specific DOI or getting information about a
        keyword/topic.""")
    parser_search.add_argument('query', type=str, help='search string')
    parser_search.add_argument(
        '--show-authors',
        action='store_true',
        default=config.getboolean('search', 'show-authors', fallback=False),
        help='if set additional author information is shown')
    parser_search.add_argument(
        '--show-type',
        action='store_true',
        default=config.getboolean('search', 'show-type', fallback=False),
        help='if set additional information about the type is shown')
    parser_search.add_argument(
        '--show-publisher',
        action='store_true',
        default=config.getboolean('search', 'show-publisher', fallback=False),
        help='if set additional information about the publisher is shown')
    parser_search.add_argument('--show-url',
        action='store_true',
        default=config.getboolean('search', 'show-url', fallback=False),
        help='if set a URL to the document is shown')
    allowed_sort_types = [
        'score', 'updated', 'deposited', 'indexed', 'published'
    ]
    parser_search.add_argument('--sort',
        type=str,
        choices=allowed_sort_types,
        default=config.get('search', 'sort', fallback='score'),
        help='sorting of search queries; allowed values are {}'\
            .format(", ".join(allowed_sort_types)),
        metavar='')
    parser_search.add_argument('--order',
        type=str,
        choices=['asc', 'desc'],
        default=config.get('search', 'order', fallback='desc'),
        help='ordering of search queries')
    parser_search.add_argument('--year',
        type=int,
        default=config.getint('search', 'year', fallback=None),
        help='limit the year')
    parser_search.add_argument('--rows',
        type=int,
        default=config.getint('search', 'rows', fallback=20),
        help='number of rows to load')
    parser_search.add_argument('--color',
        action="store_true",
        default=config.getboolean('search', 'color', fallback=False),
        help='if set, colored output is used')
    valid_colors = [
        'black', 'cyan', 'magenta', 'yellow', 'blue', 'green', 'red', 'white'
    ]
    parser_search.add_argument('--color-doi',
        type=str,
        default=config.get('search', 'color-doi', fallback='red'),
        choices=valid_colors,
        help='color for DOIs')
    parser_search.add_argument('--color-title',
        type=str,
        default=config.get('search', 'color-title', fallback='green'),
        choices=valid_colors,
        help='color for titles')
    parser_search.add_argument('--color-more',
        type=str,
        default=config.get('search', 'color-more', fallback='blue'),
        choices=valid_colors,
        help='color for additional information such as \
authors, URLs, etc.')
    # receive allowed types via http://api.crossref.org/types
    allowed_types = api.get_valid_types()
    parser_search.add_argument('--type',
        type=str,
        choices=allowed_types,
        default=config.get('search', 'type', fallback=None),
        help='selects a single type; allowed values are {}'.format(", "\
            .join(allowed_types)),
        metavar='')
    parser_search.set_defaults(which_parser='search')

    # --- cite sub-command ----------------------------------------------
    parser_cite = subparsers.add_parser(
        'cite',
        help='Cite article based on DOI in different citation formats',
        description="""Cite articles with a known DOI. Formatting can be done
        using the `style`-parameter and supports hundreds of different
        citation formats. A full list of supported formats can be found in
        the subfolder `API/styles.txt`. The most common ones are `apa` and
        `bibtex`.""")
    parser_cite.add_argument('identifier', type=str, help='DOI identifier')
    parser_cite.add_argument('-s', '--style',
        type=str,
        default=config.get('cite', 'style', fallback="bibtex"),
        help='Citation style')
    parser_cite.add_argument(
        '-c', '--copy',
        action='store_true',
        # Fix: config.get returns a string, which is truthy even for "false"
        # and thereby forced --copy on; use getboolean for a real bool.
        default=config.getboolean('cite', 'copy', fallback=False),
        help="""Copies the result to the system clipboard""")
    parser_cite.set_defaults(which_parser='cite')

    # --- download sub-command ------------------------------------------
    parser_download = subparsers.add_parser(
        'download',
        help='Download articles based on their DOI',
        description="""Downloads articles, if a full text verison is provided
        by the authors.""")
    parser_download.add_argument('identifier', type=str, help='DOI identifier')
    parser_download.add_argument('-d', '--destination',
        type=str,
        default=config.get('download', 'destination', fallback="."),
        help='download destination')
    parser_download.set_defaults(which_parser='download')

    # --- bulk sub-command ----------------------------------------------
    parser_bulk = subparsers.add_parser(
        'bulk',
        help='Mass converting for multiple DOIs listed in a single file.',
        description=
        """Mass converting for multiple DOIs listed in a single file.""")
    parser_bulk.add_argument('input',
        type=argparse.FileType('r'),
        help='input file path',
        nargs='?',
        default=sys.stdin)
    parser_bulk.add_argument('output',
        type=argparse.FileType('w'),
        help='output file path',
        nargs='?',
        default=sys.stdout)
    parser_bulk.add_argument('-s', '--style',
        type=str,
        default=config.get('bulk', 'style', fallback="bibtex"),
        help='Citation style')
    parser_bulk.set_defaults(which_parser='bulk')

    # --- service sub-command -------------------------------------------
    parser_service = subparsers.add_parser(
        'service',
        help='Provices service functions for the API such as rebuilding the \
database of valid types and styles',
        description="""Provices service functions for the API such as
        rebuilding the database of valid types and styles""")
    parser_service.add_argument(
        '--rebuild-api-types',
        action='store_true',
        help='Rebuild the types, that are accepted on API requests')
    parser_service.add_argument(
        '--rebuild-api-styles',
        action='store_true',
        help='Rebuild the styles, that are accepted on API requests')
    parser_service.set_defaults(which_parser='service')

    # --- global options ------------------------------------------------
    parser.add_argument(
        '-q', '--quiet',
        action='store_true',
        default=config.getboolean('general', 'quiet', fallback=False),
        help='turns off all unnecessary outputs; use this for scripting')
    parser.add_argument('--log-level',
        type=str,
        choices=['info', 'debug'],
        default=config.get('general', 'log-level', fallback="info"),
        help='set the logging level')
    parser.add_argument('--version',
        action="store_true",
        help='shows the version of doimgr')

    args = parser.parse_args()

    if args.version:
        print("doimgr version: {}".format(__version__))
        sys.exit()

    # set the logging levels according to the users choice
    if args.quiet:
        level = logging.CRITICAL
    else:
        level = logging.INFO
        if args.log_level == 'debug':
            level = logging.DEBUG
    logging.basicConfig(level=level)
    logging.debug("doimgr version {}".format(__version__))

    # dispatch on the chosen sub-command
    if hasattr(args, 'which_parser'):
        if args.which_parser == 'search':
            logging.debug('Arguments match to perform search')
            req = Request()
            if sys.stdout.isatty():
                # only allow colors when the script's output is not redirected
                req.set_colored_output(args.color,
                                       doi=args.color_doi,
                                       title=args.color_title,
                                       more=args.color_more)
            else:
                logging.debug('Colors have been disabled due to detected \
reconnect')
            results = req.search(
                req.prepare_search_query(args.query, args.sort, args.order,
                                         args.year, args.type, args.rows))
            req.print_search_content(results, args.show_authors,
                                     args.show_type, args.show_publisher,
                                     args.show_url)
        elif args.which_parser == 'cite':
            logging.debug('Arguments match to request single DOI')
            # check if given style is valid
            # this is not done via argparse directly due to the amount of
            # possible parameters
            styles = api.get_valid_styles()
            if args.style not in styles:
                raise ValueError("Given style \"{}\" is not valid. \
Aborting.".format(args.style))
            req = Request()
            result = req.citation(req.prepare_citation_query(args.identifier),
                                  style=args.style)
            req.print_citation(result)
            if args.copy:
                Clipboard.copy_to(result)
        elif args.which_parser == 'download':
            logging.debug('Arguments match to download single DOI')
            try:
                os.makedirs(os.path.expanduser(args.destination))
                logging.debug("Destination dir {} created.".format(
                    args.destination))
            except FileExistsError:
                logging.debug("Destination dir {} does already exists".format(
                    args.destination))
            req = Request()
            links = req.get_download_links(args.identifier)
            for link in links:
                url = link.get_url()
                d = Downloader()
                filepath = d.download(
                    url, os.path.expanduser(args.destination),
                    "{}.pdf".format(args.identifier.replace("/", "_")))
                if filepath is not None:
                    logging.info("Saved file as {}".format(filepath))
            if len(links) == 0:
                logging.info("No valid download URLs found. Aborting.")
        elif args.which_parser == 'bulk':
            logging.debug('Arguments match with bulk conversion')
            # check if given style valid
            # this is not done via argparse directly due to the amount of
            # possible parameters
            styles = api.get_valid_styles()
            if args.style not in styles:
                raise ValueError("Given style \"{}\" is not valid. \
Aborting.".format(args.style))
            b = BulkConverter()
            if args.output == sys.stdout:
                # switch to quiet mode, since we do not want to place
                # unneccesary messages on stdout
                logging.getLogger().setLevel(logging.CRITICAL)
            b.run(args.input, args.output, style=args.style)
        elif args.which_parser == 'service':
            logging.debug('Arguments match with service call')
            if args.rebuild_api_types:
                api.rebuild_valid_identifier(api.TYPE_TYPES)
            if args.rebuild_api_styles:
                api.rebuild_valid_identifier(api.TYPE_STYLES)
# Files that should appear on stage: every 'file' or 'image' answer of a
# non-disapproved request in a nomination with a positive default duration.
# Fix: dropped the needless f-prefix (the query has no placeholders).
query = """
    SELECT request_id, update_time, list.title as nom, requests.number,
           voting_title, [values].title as file_type, value as file
    FROM [values], requests, list
    WHERE list.id = topic_id AND request_id = requests.id AND
          type IN ('file', 'image') AND list.default_duration > 0 AND
          status != 'disapproved'
    ORDER BY [values].title """


def preprocess(num, name, file_name):
    """Decide whether *file_name* is skipped and compute its target folder.

    :param num: request number, used in the per-request directory name
    :param name: nomination/request title
    :param file_name: name of the downloaded file
    :return: (skip, dir_name, file_name)
    """
    skip_files_with = config['not_scene_files']
    # Fix: any() consumes a generator directly; no need to build a list.
    skip_by_field = any(s in file_name for s in skip_files_with)
    dir_name = f'№{num}. {name}'
    return skip_by_field, dir_name, file_name


d = Downloader(preprocess)
if d.get_lists(db_path, query):
    print('\nDownloading files...')
    d.download_files(folder_path, d.DOWNLOAD_UPDATED_REQUESTS, flat=False)
def download_paper(volumn, save_dir, time_step_in_seconds=5, downloader='IDM',
                   url=None, is_use_url=False):
    """
    download all JMLR paper files given volumn and restore in save_dir
    respectively

    :param volumn: int, JMLR volumn, such as 2019
    :param save_dir: str, paper and supplement material's save path
    :param time_step_in_seconds: int, the interval time between two downlaod
        request in seconds
    :param downloader: str, the downloader to download, could be 'IDM' or
        'Thunder', default to 'IDM'
    :param url: None or str, None means to download volumn papers.
    :param is_use_url: bool, if to download papers from 'url'. url couldn't
        be None when is_use_url is True.
    :return: True
    """
    downloader = Downloader(downloader=downloader)
    title_list = []
    headers = {
        'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
    }
    if not is_use_url:
        init_url = f'http://jmlr.org/papers/v{volumn}/'
        postfix = f'JMLR_v{volumn}'
        # Cache the fetched volume page on disk so re-runs skip the request.
        if os.path.exists(f'..\\urls\\init_url_JMLR_v{volumn}.dat'):
            with open(f'..\\urls\\init_url_JMLR_v{volumn}.dat', 'rb') as f:
                content = pickle.load(f)
        else:
            req = urllib.request.Request(url=init_url, headers=headers)
            content = urllib.request.urlopen(req, timeout=10).read()
            with open(f'..\\urls\\init_url_JMLR_v{volumn}.dat', 'wb') as f:
                pickle.dump(content, f)
    elif url is not None:
        req = urllib.request.Request(url=url, headers=headers)
        content = urllib.request.urlopen(req, timeout=10).read()
        postfix = 'JMLR'
    else:
        raise ValueError(
            ''''url' could not be None when 'is_use_url'=True!!!''')
    soup = BeautifulSoup(content, 'html5lib')
    error_log = []
    os.makedirs(save_dir, exist_ok=True)
    # Early volumes use a table layout; later ones use definition lists.
    if (not is_use_url) and volumn <= 4:
        paper_list = soup.find('div', {'id': 'content'}).find_all('tr')
    else:
        paper_list = soup.find('div', {'id': 'content'}).find_all('dl')
    num_download = len(paper_list)
    for paper in tqdm(zip(paper_list, range(num_download))):
        # get title
        print('\n')
        this_paper = paper[0]
        title = slugify(this_paper.find('dt').text)
        try:
            print('Downloading paper {}/{}: {}'.format(paper[1] + 1,
                                                       num_download, title))
        except UnicodeEncodeError:
            # Fix: narrowed a bare 'except:'; only console-encoding failures
            # are expected when printing an exotic title.
            print(title.encode('utf8'))
        title_list.append(title)
        this_paper_main_path = os.path.join(
            save_dir, f'{title}_{postfix}.pdf'.replace(' ', '_'))
        if os.path.exists(this_paper_main_path):
            continue
        # get the pdf link from the abstract entry
        links = this_paper.find_all('a')
        main_link = None
        for link in links:
            if '[pdf]' == link.text or 'pdf' == link.text:
                main_link = urllib.parse.urljoin('http://jmlr.org',
                                                 link.get('href'))
                break
        for d_iter in range(1):  # single attempt; bump range to retry
            try:
                # download paper with IDM
                if not os.path.exists(
                        this_paper_main_path) and main_link is not None:
                    downloader.download(
                        urls=main_link,
                        save_path=this_paper_main_path,
                        time_sleep_in_seconds=time_step_in_seconds)
            except Exception as e:
                print('Error: ' + title + ' - ' + str(e))
                error_log.append(
                    (title, main_link, 'main paper download error', str(e)))
    # write error log
    print('write error log')
    with open('..\\log\\download_err_log.txt', 'w') as f:
        for log in tqdm(error_log):
            for e in log:
                if e is not None:
                    f.write(e)
                else:
                    f.write('None')
                f.write('\n')
            f.write('\n')
def main(argv):
    """Parse the command line and dispatch to the doimgr sub-commands.

    Sub-commands: search, cite, download, bulk, service. Defaults for most
    options can be pre-seeded from ~/.doimgrrc.

    :param argv: command-line arguments (the parser reads sys.argv itself)
    """
    config = configparser.ConfigParser()
    config_path = os.path.expanduser(os.path.join("~", ".doimgrrc"))
    if os.path.isfile(config_path):
        config.read(config_path)

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='Command line based tool to request DOI data and convert \
it to BibTex entries.')
    subparsers = parser.add_subparsers()

    # --- search sub-command --------------------------------------------
    parser_search = subparsers.add_parser('search',
        help='Search database for published articles to find relevant DOIs',
        description="""Searches database for published articles. This can be
        used to find a specific DOI or getting information about a
        keyword/topic.""")
    parser_search.add_argument('query', type=str, help='search string')
    parser_search.add_argument('--show-authors', action='store_true',
        default=config.getboolean('search', 'show-authors', fallback=False),
        help='if set additional author information is shown')
    parser_search.add_argument('--show-type', action='store_true',
        default=config.getboolean('search', 'show-type', fallback=False),
        help='if set additional information about the type is shown')
    parser_search.add_argument('--show-publisher', action='store_true',
        default=config.getboolean('search', 'show-publisher', fallback=False),
        help='if set additional information about the publisher is shown')
    parser_search.add_argument('--show-url', action='store_true',
        default=config.getboolean('search', 'show-url', fallback=False),
        help='if set a URL to the document is shown')
    allowed_sort_types = ['score', 'updated', 'deposited', 'indexed',
                          'published']
    parser_search.add_argument('--sort', type=str,
        choices=allowed_sort_types,
        default=config.get('search', 'sort', fallback='score'),
        help='sorting of search queries; allowed values are {}'\
            .format(", ".join(allowed_sort_types)),
        metavar='')
    parser_search.add_argument('--order', type=str,
        choices=['asc', 'desc'],
        default=config.get('search', 'order', fallback='desc'),
        help='ordering of search queries')
    parser_search.add_argument('--year', type=int,
        default=config.getint('search', 'year', fallback=None),
        help='limit the year')
    parser_search.add_argument('--rows', type=int,
        default=config.getint('search', 'rows', fallback=20),
        help='number of rows to load')
    parser_search.add_argument('--color', action="store_true",
        default=config.getboolean('search', 'color', fallback=False),
        help='if set, colored output is used')
    valid_colors = ['black', 'cyan', 'magenta', 'yellow', 'blue', 'green',
                    'red', 'white']
    parser_search.add_argument('--color-doi', type=str,
        default=config.get('search', 'color-doi', fallback='red'),
        choices=valid_colors,
        help='color for DOIs')
    parser_search.add_argument('--color-title', type=str,
        default=config.get('search', 'color-title', fallback='green'),
        choices=valid_colors,
        help='color for titles')
    parser_search.add_argument('--color-more', type=str,
        default=config.get('search', 'color-more', fallback='blue'),
        choices=valid_colors,
        help='color for additional information such as \
authors, URLs, etc.')
    # receive allowed types via http://api.crossref.org/types
    allowed_types = api.get_valid_types()
    parser_search.add_argument('--type', type=str,
        choices=allowed_types,
        default=config.get('search', 'type', fallback=None),
        help='selects a single type; allowed values are {}'.format(", "\
            .join(allowed_types)),
        metavar='')
    parser_search.set_defaults(which_parser='search')

    # --- cite sub-command ----------------------------------------------
    parser_cite = subparsers.add_parser('cite',
        help='Cite article based on DOI in different citation formats',
        description="""Cite articles with a known DOI. Formatting can be done
        using the `style`-parameter and supports hundreds of different
        citation formats. A full list of supported formats can be found in
        the subfolder `API/styles.txt`. The most common ones are `apa` and
        `bibtex`.""")
    parser_cite.add_argument('identifier', type=str, help='DOI identifier')
    parser_cite.add_argument('-s', '--style', type=str,
        default=config.get('cite', 'style', fallback="bibtex"),
        help='Citation style')
    parser_cite.add_argument('-c', '--copy', action='store_true',
        # Fix: config.get returns a string, which is truthy even for "false"
        # and thereby forced --copy on; use getboolean for a real bool.
        default=config.getboolean('cite', 'copy', fallback=False),
        help="""Copies the result to the system clipboard""")
    parser_cite.set_defaults(which_parser='cite')

    # --- download sub-command ------------------------------------------
    parser_download = subparsers.add_parser('download',
        help='Download articles based on their DOI',
        description="""Downloads articles, if a full text verison is provided
        by the authors.""")
    parser_download.add_argument('identifier', type=str, help='DOI identifier')
    parser_download.add_argument('-d', '--destination', type=str,
        default=config.get('download', 'destination', fallback="."),
        help='download destination')
    parser_download.set_defaults(which_parser='download')

    # --- bulk sub-command ----------------------------------------------
    parser_bulk = subparsers.add_parser('bulk',
        help='Mass converting for multiple DOIs listed in a single file.',
        description="""Mass converting for multiple DOIs listed in a single
        file.""")
    parser_bulk.add_argument('input', type=argparse.FileType('r'),
        help='input file path', nargs='?', default=sys.stdin)
    parser_bulk.add_argument('output', type=argparse.FileType('w'),
        help='output file path', nargs='?', default=sys.stdout)
    parser_bulk.add_argument('-s', '--style', type=str,
        default=config.get('bulk', 'style', fallback="bibtex"),
        help='Citation style')
    parser_bulk.set_defaults(which_parser='bulk')

    # --- service sub-command -------------------------------------------
    parser_service = subparsers.add_parser('service',
        help='Provices service functions for the API such as rebuilding the \
database of valid types and styles',
        description="""Provices service functions for the API such as
        rebuilding the database of valid types and styles""")
    parser_service.add_argument('--rebuild-api-types', action='store_true',
        help='Rebuild the types, that are accepted on API requests')
    parser_service.add_argument('--rebuild-api-styles', action='store_true',
        help='Rebuild the styles, that are accepted on API requests')
    parser_service.set_defaults(which_parser='service')

    # --- global options ------------------------------------------------
    parser.add_argument('-q', '--quiet', action='store_true',
        default=config.getboolean('general', 'quiet', fallback=False),
        help='turns off all unnecessary outputs; use this for scripting')
    parser.add_argument('--log-level', type=str,
        choices=['info', 'debug'],
        default=config.get('general', 'log-level', fallback="info"),
        help='set the logging level')
    parser.add_argument('--version', action="store_true",
        help='shows the version of doimgr')

    args = parser.parse_args()

    if args.version:
        print("doimgr version: {}".format(__version__))
        sys.exit()

    # set the logging levels according to the users choice
    if args.quiet:
        level = logging.CRITICAL
    else:
        level = logging.INFO
        if args.log_level == 'debug':
            level = logging.DEBUG
    logging.basicConfig(level=level)
    logging.debug("doimgr version {}".format(__version__))

    # dispatch on the chosen sub-command
    if hasattr(args, 'which_parser'):
        if args.which_parser == 'search':
            logging.debug('Arguments match to perform search')
            req = Request()
            if sys.stdout.isatty():
                # only allow colors when the script's output is not redirected
                req.set_colored_output(args.color, doi=args.color_doi,
                                       title=args.color_title,
                                       more=args.color_more)
            else:
                logging.debug('Colors have been disabled due to detected \
reconnect')
            results = req.search(req.prepare_search_query(args.query,
                args.sort, args.order, args.year, args.type, args.rows))
            req.print_search_content(results, args.show_authors,
                                     args.show_type, args.show_publisher,
                                     args.show_url)
        elif args.which_parser == 'cite':
            logging.debug('Arguments match to request single DOI')
            # check if given style is valid
            # this is not done via argparse directly due to the amount of
            # possible parameters
            styles = api.get_valid_styles()
            if args.style not in styles:
                raise ValueError("Given style \"{}\" is not valid. \
Aborting.".format(args.style))
            req = Request()
            result = req.citation(req.prepare_citation_query(args.identifier),
                                  style=args.style)
            req.print_citation(result)
            if args.copy:
                Clipboard.copy_to(result)
        elif args.which_parser == 'download':
            logging.debug('Arguments match to download single DOI')
            try:
                os.makedirs(os.path.expanduser(args.destination))
                logging.debug("Destination dir {} created.".format(
                    args.destination))
            except FileExistsError:
                logging.debug("Destination dir {} does already exists".format(
                    args.destination))
            req = Request()
            links = req.get_download_links(args.identifier)
            for link in links:
                url = link.get_url()
                d = Downloader()
                filepath = d.download(url,
                    os.path.expanduser(args.destination),
                    "{}.pdf".format(args.identifier.replace("/", "_")))
                if filepath is not None:
                    logging.info("Saved file as {}".format(filepath))
            if len(links) == 0:
                logging.info("No valid download URLs found. Aborting.")
        elif args.which_parser == 'bulk':
            logging.debug('Arguments match with bulk conversion')
            # check if given style valid
            # this is not done via argparse directly due to the amount of
            # possible parameters
            styles = api.get_valid_styles()
            if args.style not in styles:
                raise ValueError("Given style \"{}\" is not valid. \
Aborting.".format(args.style))
            b = BulkConverter()
            if args.output == sys.stdout:
                # switch to quiet mode, since we do not want to place
                # unneccesary messages on stdout
                logging.getLogger().setLevel(logging.CRITICAL)
            b.run(args.input, args.output, style=args.style)
        elif args.which_parser == 'service':
            logging.debug('Arguments match with service call')
            if args.rebuild_api_types:
                api.rebuild_valid_identifier(api.TYPE_TYPES)
            if args.rebuild_api_styles:
                api.rebuild_valid_identifier(api.TYPE_STYLES)
def download_from_csv(postfix, save_dir, csv_file_path, is_download_main_paper=True, is_download_supplement=True,
                      time_step_in_seconds=5, total_paper_number=None, downloader='IDM'):
    """
    download paper and supplement files and save them to save_dir/main_paper
    and save_dir/supplement respectively
    :param postfix: str, postfix that will be added at the end of papers' title
    :param save_dir: str, paper and supplement material's save path
    :param csv_file_path: str, the full path to csv file; must contain at least
        'title', 'main link' and 'supplemental link' columns (an optional
        'group' column switches on per-group sub-directories)
    :param is_download_main_paper: bool, True for downloading main paper
    :param is_download_supplement: bool, True for downloading supplemental
        material
    :param time_step_in_seconds: int, the interval time between two downloading
        request in seconds
    :param total_paper_number: int, the total number of papers that is going to
        download; only used for the progress-bar text
    :param downloader: str, the downloader to download, could be 'IDM' or
        'Thunder', default to 'IDM'.
    :return: True
    :raises ValueError: when csv_file_path does not exist
    """
    downloader = Downloader(downloader=downloader)
    if not os.path.exists(csv_file_path):
        raise ValueError(f'ERROR: file not found in {csv_file_path}!!!')
    main_save_path = os.path.join(save_dir, 'main_paper')
    if is_download_main_paper:
        os.makedirs(main_save_path, exist_ok=True)
    if is_download_supplement:
        supplement_save_path = os.path.join(save_dir, 'supplement')
        os.makedirs(supplement_save_path, exist_ok=True)
    # collected (title, link, reason, message) tuples, written to disk at the end
    error_log = []
    with open(csv_file_path, newline='') as csvfile:
        myreader = csv.DictReader(csvfile, delimiter=',')
        pbar = tqdm(myreader)
        i = 0
        for this_paper in pbar:
            # a 'group' column groups papers into per-group sub-directories
            is_grouped = ('group' in this_paper)
            i += 1
            # get title
            if is_grouped:
                group = slugify(this_paper['group'])
            title = slugify(this_paper['title'])
            if total_paper_number is not None:
                pbar.set_description(
                    f'Downloading paper {i}/{total_paper_number}')
            else:
                pbar.set_description(f'Downloading paper {i}')
            this_paper_main_path = os.path.join(main_save_path,
                                                f'{title}_{postfix}.pdf')
            if is_grouped:
                this_paper_main_path = os.path.join(
                    main_save_path, group, f'{title}_{postfix}.pdf')
            # Skip papers that are already fully on disk.  The supplement is
            # saved with an extension taken from its link, so existence is
            # probed with both 'zip' and 'pdf'.
            # NOTE(review): branch nesting reconstructed from whitespace-mangled
            # source — confirm the final `elif` pairs with `if is_download_supplement`.
            if is_download_supplement:
                this_paper_supp_path_no_ext = os.path.join(
                    supplement_save_path, f'{title}_{postfix}_supp.')
                if is_grouped:
                    this_paper_supp_path_no_ext = os.path.join(
                        supplement_save_path, group, f'{title}_{postfix}_supp.')
                if '' != this_paper['supplemental link'] and os.path.exists(this_paper_main_path) and \
                        (os.path.exists(this_paper_supp_path_no_ext + 'zip') or os.path.exists(
                            this_paper_supp_path_no_ext + 'pdf')):
                    continue
                elif '' == this_paper['supplemental link'] and os.path.exists(
                        this_paper_main_path):
                    continue
            elif os.path.exists(this_paper_main_path):
                continue
            # 'error' in the link column marks a paper whose link could not be scraped
            if 'error' == this_paper['main link']:
                error_log.append((title, 'no MAIN link'))
            elif '' != this_paper['main link']:
                if is_grouped:
                    # create the per-group sub-directories lazily
                    if is_download_main_paper:
                        os.makedirs(os.path.join(main_save_path, group),
                                    exist_ok=True)
                    if is_download_supplement:
                        os.makedirs(os.path.join(supplement_save_path, group),
                                    exist_ok=True)
                if is_download_main_paper:
                    try:
                        # download paper with IDM
                        if not os.path.exists(this_paper_main_path):
                            downloader.download(
                                urls=this_paper['main link'].replace(
                                    ' ', '%20'),
                                save_path=os.path.join(os.getcwd(),
                                                       this_paper_main_path),
                                time_sleep_in_seconds=time_step_in_seconds)
                    except Exception as e:
                        # error_flag = True
                        print('Error: ' + title + ' - ' + str(e))
                        error_log.append((title, this_paper['main link'],
                                          'main paper download error', str(e)))
                # download supp
                if is_download_supplement:
                    # check whether the supp can be downloaded
                    if not (os.path.exists(this_paper_supp_path_no_ext + 'zip')
                            or os.path.exists(this_paper_supp_path_no_ext + 'pdf')):
                        if 'error' == this_paper['supplemental link']:
                            error_log.append((title, 'no SUPPLEMENTAL link'))
                        elif '' != this_paper['supplemental link']:
                            # keep the extension of the original link (zip, pdf, ...)
                            supp_type = this_paper['supplemental link'].split(
                                '.')[-1]
                            try:
                                downloader.download(
                                    urls=this_paper['supplemental link'],
                                    save_path=os.path.join(
                                        os.getcwd(),
                                        this_paper_supp_path_no_ext + supp_type),
                                    time_sleep_in_seconds=time_step_in_seconds)
                            except Exception as e:
                                # error_flag = True
                                print('Error: ' + title + ' - ' + str(e))
                                error_log.append(
                                    (title, this_paper['supplemental link'],
                                     'supplement download error', str(e)))
    # 2. write error log
    # NOTE(review): Windows-style relative path; assumes the script is launched
    # from a sibling directory of 'log' — confirm against the callers.
    print('write error log')
    with open('..\\log\\download_err_log.txt', 'w') as f:
        for log in tqdm(error_log):
            for e in log:
                if e is not None:
                    f.write(e)
                else:
                    f.write('None')
                f.write('\n')
            f.write('\n')
    return True
def test_download(self):
    """Download the VirtualBox distribution referenced by the test config."""
    Downloader.get(self.config['vbox_download'])
def download_paper(year, save_dir, is_download_supplement=True, time_step_in_seconds=5, downloader='IDM'):
    """
    download all ICML paper and supplement files given year, restore in
    save_dir/main_paper and save_dir/supplement respectively
    :param year: int, ICML year, such 2019
    :param save_dir: str, paper and supplement material's save path
    :param is_download_supplement: bool, True for downloading supplemental material
    :param time_step_in_seconds: int, the interval time between two downlaod request in seconds
    :param downloader: str, the downloader to download, could be 'IDM' or 'Thunder', default to 'IDM'
    :return: True
    :raises ValueError: when no URL is known for the given year
    """
    downloader = Downloader(downloader=downloader)
    # ICML year -> PMLR volume number (2013 onwards are hosted on PMLR)
    # NOTE(review): years > 2021 fall into the `year >= 2013` branch below but
    # are missing from this dict, so they raise KeyError instead of the
    # intended ValueError — confirm whether newer years should be added.
    ICML_year_dict = {
        2021: 139,
        2020: 119,
        2019: 97,
        2018: 80,
        2017: 70,
        2016: 48,
        2015: 37,
        2014: 32,
        2013: 28
    }
    # one landing page per era of the ICML site
    if year >= 2013:
        init_url = f'http://proceedings.mlr.press/v{ICML_year_dict[year]}/'
    elif year == 2012:
        init_url = 'https://icml.cc/2012/papers.1.html'
    elif year == 2011:
        init_url = 'http://www.icml-2011.org/papers.php'
    elif 2009 == year:
        init_url = 'https://icml.cc/Conferences/2009/abstracts.html'
    elif 2008 == year:
        init_url = 'http://www.machinelearning.org/archive/icml2008/abstracts.shtml'
    elif 2007 == year:
        init_url = 'https://icml.cc/Conferences/2007/paperlist.html'
    elif year in [2006, 2004, 2005]:
        init_url = f'https://icml.cc/Conferences/{year}/proceedings.html'
    elif 2003 == year:
        init_url = 'https://aaai.org/Library/ICML/icml03contents.php'
    else:
        raise ValueError('''the given year's url is unknown !''')
    postfix = f'ICML_{year}'
    # cache the landing page on disk so repeated runs skip the network fetch
    if os.path.exists(f'..\\urls\\init_url_icml_{year}.dat'):
        with open(f'..\\urls\\init_url_icml_{year}.dat', 'rb') as f:
            content = pickle.load(f)
    else:
        headers = {
            'User-Agent':
                'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
        }
        req = urllib.request.Request(url=init_url, headers=headers)
        content = urllib.request.urlopen(req).read()
        # content = open(f'..\\ICML_{year}.html', 'rb').read()
        with open(f'..\\urls\\init_url_icml_{year}.dat', 'wb') as f:
            pickle.dump(content, f)
    # soup = BeautifulSoup(content, 'html.parser')
    soup = BeautifulSoup(content, 'html5lib')
    # soup = BeautifulSoup(open(r'..\ICML_2011.html', 'rb'), 'html.parser')
    error_log = []
    if year >= 2013:
        # PMLR era: delegate to the shared PMLR volume downloader
        if year in ICML_year_dict.keys():
            volume = f'v{ICML_year_dict[year]}'
        else:
            raise ValueError('''the given year's url is unknown !''')
        pmlr.download_paper_given_volume(
            volume=volume,
            save_dir=save_dir,
            postfix=postfix,
            is_download_supplement=is_download_supplement,
            time_step_in_seconds=time_step_in_seconds,
            downloader=downloader.downloader)
    elif 2012 == year:
        # 2012
        # base_url = f'https://icml.cc/{year}/'
        # each paper sits in a <div class="paper"> with an "ICML version (pdf)" anchor
        paper_list_bar = tqdm(soup.find_all('div', {'class': 'paper'}))
        paper_index = 0
        for paper in paper_list_bar:
            paper_index += 1
            title = ''
            title = slugify(paper.find('h2').text)
            link = None
            for a in paper.find_all('a'):
                if 'ICML version (pdf)' == a.text:
                    link = urllib.parse.urljoin(init_url, a.get('href'))
                    break
            if link is not None:
                this_paper_main_path = os.path.join(
                    save_dir, f'{title}_{postfix}.pdf'.replace(' ', '_'))
                paper_list_bar.set_description(
                    f'find paper {paper_index}:{title}')
                if not os.path.exists(this_paper_main_path):
                    paper_list_bar.set_description(
                        f'downloading paper {paper_index}:{title}')
                    downloader.download(
                        urls=link,
                        save_path=this_paper_main_path,
                        time_sleep_in_seconds=time_step_in_seconds)
            else:
                error_log.append((title, 'no main link error'))
    elif 2011 == year:
        # 2011: titles live in <h3> inside anchors; "download" anchors carry the pdf
        paper_list_bar = tqdm(soup.find_all('a'))
        paper_index = 0
        for paper in paper_list_bar:
            h3 = paper.find('h3')
            if h3 is not None:
                title = slugify(h3.text)
                paper_index += 1
            if 'download' == slugify(paper.text.strip()):
                link = paper.get('href')
                link = urllib.parse.urljoin(init_url, link)
                if link is not None:
                    this_paper_main_path = os.path.join(
                        save_dir, f'{title}_{postfix}.pdf'.replace(' ', '_'))
                    paper_list_bar.set_description(
                        f'find paper {paper_index}:{title}')
                    if not os.path.exists(this_paper_main_path):
                        paper_list_bar.set_description(
                            f'downloading paper {paper_index}:{title}')
                        downloader.download(
                            urls=link,
                            save_path=this_paper_main_path,
                            time_sleep_in_seconds=time_step_in_seconds)
                else:
                    error_log.append((title, 'no main link error'))
    elif year in [2009, 2008]:
        # 2008/2009: interleaved <h3> (title) and <a> ("full paper") elements
        if 2009 == year:
            paper_list_bar = tqdm(
                soup.find('div', {
                    'id': 'right_column'
                }).find_all(['h3', 'a']))
        elif 2008 == year:
            paper_list_bar = tqdm(
                soup.find('div', {
                    'class': 'content'
                }).find_all(['h3', 'a']))
        paper_index = 0
        title = None
        for paper in paper_list_bar:
            if 'h3' == paper.name:
                # remember the title until its "full paper" anchor shows up
                title = slugify(paper.text)
                paper_index += 1
            elif 'full-paper' == slugify(paper.text.strip()):  # a
                link = paper.get('href')
                if link is not None and title is not None:
                    link = urllib.parse.urljoin(init_url, link)
                    this_paper_main_path = os.path.join(
                        save_dir, f'{title}_{postfix}.pdf')
                    paper_list_bar.set_description(
                        f'find paper {paper_index}:{title}')
                    if not os.path.exists(this_paper_main_path):
                        paper_list_bar.set_description(
                            f'downloading paper {paper_index}:{title}')
                        downloader.download(
                            urls=link,
                            save_path=this_paper_main_path,
                            time_sleep_in_seconds=time_step_in_seconds)
                    # consume the title so it is not reused for the next anchor
                    title = None
                else:
                    error_log.append((title, 'no main link error'))
    elif year in [2006, 2005]:
        # 2005/2006: anchors whose href ends in .pdf or .ps are papers
        paper_list_bar = tqdm(soup.find_all('a'))
        paper_index = 0
        for paper in paper_list_bar:
            title = slugify(paper.text.strip())
            link = paper.get('href')
            paper_index += 1
            if link is not None and title is not None and (
                    'pdf' == link[-3:] or 'ps' == link[-2:]):
                link = urllib.parse.urljoin(init_url, link)
                this_paper_main_path = os.path.join(
                    save_dir, f'{title}_{postfix}.pdf'.replace(' ', '_'))
                paper_list_bar.set_description(
                    f'find paper {paper_index}:{title}')
                if not os.path.exists(this_paper_main_path):
                    paper_list_bar.set_description(
                        f'downloading paper {paper_index}:{title}')
                    downloader.download(
                        urls=link,
                        save_path=this_paper_main_path,
                        time_sleep_in_seconds=time_step_in_seconds)
    elif 2004 == year:
        # 2004: proceedings table; title rows are class 'proc_2004_title',
        # the following rows carry a '[Paper]' anchor
        paper_index = 0
        paper_list_bar = tqdm(
            soup.find('table', {
                'class': 'proceedings'
            }).find_all('tr'))
        title = None
        for paper in paper_list_bar:
            tr_class = None
            try:
                tr_class = paper.get('class')[0]
            except:
                pass
            if 'proc_2004_title' == tr_class:  # title
                title = slugify(paper.text.strip())
                paper_index += 1
            else:
                for a in paper.find_all('a'):
                    if '[Paper]' == a.text:
                        link = a.get('href')
                        if link is not None and title is not None:
                            link = urllib.parse.urljoin(init_url, link)
                            this_paper_main_path = os.path.join(
                                save_dir,
                                f'{title}_{postfix}.pdf'.replace(' ', '_'))
                            paper_list_bar.set_description(
                                f'find paper {paper_index}:{title}')
                            if not os.path.exists(this_paper_main_path):
                                paper_list_bar.set_description(
                                    f'downloading paper {paper_index}:{title}')
                                downloader.download(
                                    urls=link,
                                    save_path=this_paper_main_path,
                                    time_sleep_in_seconds=time_step_in_seconds)
                        break
    elif 2003 == year:
        # 2003: the index only links abstract pages; fetch each abstract page
        # (3 attempts) and look for a .pdf anchor inside it
        paper_index = 0
        paper_list_bar = tqdm(
            soup.find('div', {
                'id': 'content'
            }).find_all('p', {'class': 'left'}))
        for paper in paper_list_bar:
            abs_link = None
            title = None
            link = None
            for a in paper.find_all('a'):
                abs_link = urllib.parse.urljoin(init_url, a.get('href'))
                if abs_link is not None:
                    title = slugify(a.text.strip())
                    break
            if title is not None:
                paper_index += 1
                this_paper_main_path = os.path.join(
                    save_dir, f'{title}_{postfix}.pdf'.replace(' ', '_'))
                paper_list_bar.set_description(
                    f'find paper {paper_index}:{title}')
                if not os.path.exists(this_paper_main_path):
                    if abs_link is not None:
                        headers = {
                            'User-Agent':
                                'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
                        }
                        req = urllib.request.Request(url=abs_link,
                                                     headers=headers)
                        # NOTE(review): if all 3 attempts fail, `abs_content`
                        # is unbound and the next line raises — confirm
                        # whether that is the intended failure mode.
                        for i in range(3):
                            try:
                                abs_content = urllib.request.urlopen(
                                    req, timeout=10).read()
                                break
                            except Exception as e:
                                if i == 2:
                                    print('error' + title + str(e))
                                    error_log.append(
                                        (title, abs_link, 'download error',
                                         str(e)))
                        abs_soup = BeautifulSoup(abs_content, 'html5lib')
                        for a in abs_soup.find_all('a'):
                            try:
                                if 'pdf' == a.get('href')[-3:]:
                                    link = urllib.parse.urljoin(
                                        abs_link, a.get('href'))
                                    if link is not None:
                                        paper_list_bar.set_description(
                                            f'downloading paper {paper_index}:{title}'
                                        )
                                        downloader.download(
                                            urls=link,
                                            save_path=this_paper_main_path,
                                            time_sleep_in_seconds=
                                            time_step_in_seconds)
                                        break
                            except:
                                pass
    # write error log
    print('write error log')
    with open('..\\log\\download_err_log.txt', 'w') as f:
        for log in tqdm(error_log):
            for e in log:
                if e is not None:
                    f.write(e)
                else:
                    f.write('None')
                f.write('\n')
            f.write('\n')
# Scrape-and-publish pipeline: pull the publication list from the Chambers
# API, scrape/upload any publication the local DB has not seen, then format
# and upload its report.
import logging

from lib.db import DB
from lib.s3Storage import S3Storage
from lib.downloader import Downloader
from lib.formatter import Formatter
from lib.chambersApi import ChambersApi

logging.getLogger().setLevel(logging.INFO)

publications = ChambersApi.get_publications()
db = DB()
downloader = Downloader()
formatter = Formatter()
# bucket + prefix the scraped data and formatted reports are uploaded to
s3 = S3Storage("lcmillsconsulting.com", "guides")
try:
    index_updated = True
    for pub in publications:
        # stage 1: scrape and record publications the DB does not know yet
        if not db.has_publication(pub):
            logging.info(
                f"Found new publication: {pub['description']}, scraping...")
            downloader.scrape(pub)
            s3.upload_data(pub)
            db.add_publication(pub)
            logging.info("Scraped data")
        # stage 2: format and upload the report for unformatted publications
        if not db.is_publication_formatted(pub):
            logging.info("Formatting html")
            formatter.format_publication(pub)
            s3.upload_report(pub)
            db.set_publication_formatted(pub)
            index_updated = True
    # NOTE(review): the except/finally paired with this `try` lies beyond
    # this chunk of the file.
#!/usr/bin/python3 # -*- coding: utf-8 -*- from lib.ebook import EBook from lib.downloader import Downloader from lib.multi_threads import MultiThreads if __name__ == '__main__': # 创建Epub小说对象 book = EBook(author='川原砾', title='Sword Art Online 21 Unital Ring I') # 设置书籍封面 book.set_cover( 'https://ae01.alicdn.com/kf/Hd0ca7616ad0c4b06931df9f8d4406fbfC.jpg') # 设置书籍的络文件下载器 book.downloader = Downloader().get # 读取本地html文件 with open('test.html', mode='r', encoding='utf-8') as f: doc = pq(f.read()) # 将所有超链接改为绝对引用地址 doc.make_links_absolute(base_url=' /') # 通过筛选器选择小说章节 container = doc('div.post-container') tags = container('h2,p') # 读取小说章节内容 title = 'Sword Art Online 21 Unital Ring I' content = pq('<div></div>')
def test_download(self):
    """Download the Islands VM image into the user's Downloads folder and echo the result."""
    target_dir = path.expandvars("%USERPROFILE%\\Downloads\\")
    result = dl.get(
        "https://sourceforge.net/projects/islands-image/files/Islands_vm_v0.0.011.ova/download",
        target_dir)
    print(result)
def _download_listed_papers(papers, link_prefix, resolve_pdf_link, save_path,
                            paper_postfix, downloader, time_step_in_seconds,
                            error_log):
    """Download every anchor in `papers` whose href starts with `link_prefix`.

    :param papers: iterable of bs4 anchor tags; the anchor text is the title
    :param link_prefix: str, only hrefs starting with this prefix are handled
    :param resolve_pdf_link: callable mapping an abstract-page url to a pdf url
    :param save_path: str, directory the pdf is saved into
    :param paper_postfix: str, appended to the slugified title in the filename
    :param downloader: Downloader instance used to fetch the pdf
    :param time_step_in_seconds: int, pause between two download requests
    :param error_log: list, failures are appended as
        (title, link, 'paper download error', message) tuples
    """
    for paper in tqdm(papers):
        link = paper.get('href')
        if not link.startswith(link_prefix):
            continue
        title = slugify(paper.text)
        pdf_name = f'{title}_{paper_postfix}.pdf'
        try:
            # skip papers that are already on disk
            if not os.path.exists(os.path.join(save_path, pdf_name)):
                pdf_link = resolve_pdf_link(link)
                print(f'downloading {title}')
                downloader.download(
                    urls=pdf_link,
                    save_path=os.path.join(save_path, pdf_name),
                    time_sleep_in_seconds=time_step_in_seconds)
        except Exception as e:
            print('Error: ' + title + ' - ' + str(e))
            error_log.append((title, link, 'paper download error', str(e)))


def download_iclr_paper(year, save_dir, time_step_in_seconds=5,
                        downloader='IDM', is_use_arxiv_mirror=False):
    """
    download iclr conference paper for year 2014, 2015 and 2016
    :param year: int, iclr year
    :param save_dir: str, paper save path
    :param time_step_in_seconds: int, the interval time between two downlaod
        request in seconds
    :param downloader: str, the downloader to download, could be 'IDM' or
        'Thunder', default to 'IDM'
    :param is_use_arxiv_mirror: bool, True to resolve arxiv pdf links via the
        mirror site
    :return: True
    :raises ValueError: when the year is not 2014, 2015 or 2016
    """
    downloader = Downloader(downloader=downloader)
    paper_postfix = f'ICLR_{year}'
    if year == 2016:
        base_url = 'https://iclr.cc/archive/www/doku.php%3Fid=iclr2016:main.html'
    elif year == 2015:
        base_url = 'https://iclr.cc/archive/www/doku.php%3Fid=iclr2015:main.html'
    elif year == 2014:
        base_url = 'https://iclr.cc/archive/2014/conference-proceedings/'
    else:
        raise ValueError('the website url is not given for this year!')
    os.makedirs(save_dir, exist_ok=True)
    if year == 2015:
        # oral and poster seperated
        oral_save_path = os.path.join(save_dir, 'oral')
        poster_save_path = os.path.join(save_dir, 'poster')
        workshop_save_path = os.path.join(save_dir, 'ws')
        os.makedirs(oral_save_path, exist_ok=True)
        os.makedirs(poster_save_path, exist_ok=True)
        os.makedirs(workshop_save_path, exist_ok=True)
    # reuse a cached copy of the index page when available
    if os.path.exists(f'..\\urls\\init_url_iclr_{year}.dat'):
        with open(f'..\\urls\\init_url_iclr_{year}.dat', 'rb') as f:
            content = pickle.load(f)
    else:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
        req = urllib.request.Request(url=base_url, headers=headers)
        content = urllib.request.urlopen(req).read()
        with open(f'..\\urls\\init_url_iclr_{year}.dat', 'wb') as f:
            pickle.dump(content, f)
    error_log = []
    soup = BeautifulSoup(content, 'html.parser')
    print('open url successfully!')

    def _resolve_arxiv(link):
        # resolve an arxiv abstract url to its pdf url (optionally via mirror)
        return get_pdf_link_from_arxiv(link, is_use_mirror=is_use_arxiv_mirror)

    if year == 2016:
        # conference track (arxiv-hosted)
        papers = soup.find('h3', {'id': 'accepted_papers_conference_track'}).findNext('div').find_all('a')
        _download_listed_papers(papers, 'http://arxiv', _resolve_arxiv,
                                save_dir, paper_postfix, downloader,
                                time_step_in_seconds, error_log)
        # workshops (openreview-hosted)
        ws_save_path = os.path.join(save_dir, 'ws')
        # FIX: this directory was never created for 2016 (only for 2015)
        os.makedirs(ws_save_path, exist_ok=True)
        for section_id in ('workshop_track_posters_may_2nd',
                           'workshop_track_posters_may_3rd'):
            papers = soup.find('h3', {'id': section_id}).findNext('div').find_all('a')
            _download_listed_papers(papers, 'http://beta.openreview',
                                    get_pdf_link_from_openreview, ws_save_path,
                                    f'ICLR_WS_{year}', downloader,
                                    time_step_in_seconds, error_log)
    elif year == 2015:
        # oral papers
        oral_papers = soup.find('h3', {'id': 'conference_oral_presentations'}).findNext('div').find_all('a')
        _download_listed_papers(oral_papers, 'http://arxiv', _resolve_arxiv,
                                oral_save_path, paper_postfix, downloader,
                                time_step_in_seconds, error_log)
        # workshops papers
        workshop_papers = soup.find('h3', {'id': 'may_7_workshop_poster_session'}).findNext('div').find_all('a')
        # FIX: was .append(), which nested the whole second ResultSet as a
        # single element and broke iteration over it
        workshop_papers.extend(
            soup.find('h3', {'id': 'may_8_workshop_poster_session'}).findNext('div').find_all('a'))
        # FIX: the existence check now probes the ICLR_WS filename that is
        # actually written (it previously checked the conference-postfix name)
        _download_listed_papers(workshop_papers, 'http://arxiv', _resolve_arxiv,
                                workshop_save_path, f'ICLR_WS_{year}',
                                downloader, time_step_in_seconds, error_log)
        # poster papers
        poster_papers = soup.find('h3', {'id': 'may_9_conference_poster_session'}).findNext('div').find_all('a')
        _download_listed_papers(poster_papers, 'http://arxiv', _resolve_arxiv,
                                poster_save_path, paper_postfix, downloader,
                                time_step_in_seconds, error_log)
    elif year == 2014:
        papers = soup.find('div', {'id': 'sites-canvas-main-content'}).find_all('a')
        _download_listed_papers(papers, 'http://arxiv', _resolve_arxiv,
                                save_dir, paper_postfix, downloader,
                                time_step_in_seconds, error_log)
        # workshops live on a separate page
        paper_postfix = f'ICLR_WS_{year}'
        base_url = 'https://sites.google.com/site/representationlearning2014/workshop-proceedings'
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
        req = urllib.request.Request(url=base_url, headers=headers)
        content = urllib.request.urlopen(req).read()
        soup = BeautifulSoup(content, 'html.parser')
        workshop_save_path = os.path.join(save_dir, 'WS')
        os.makedirs(workshop_save_path, exist_ok=True)
        papers = soup.find('div', {'id': 'sites-canvas-main-content'}).find_all('a')
        _download_listed_papers(papers, 'http://arxiv', _resolve_arxiv,
                                workshop_save_path, paper_postfix, downloader,
                                time_step_in_seconds, error_log)
    # write error log
    print('write error log')
    with open('..\\log\\download_err_log.txt', 'w') as f:
        for log in tqdm(error_log):
            for e in log:
                if e is not None:
                    f.write(e)
                else:
                    f.write('None')
                f.write('\n')
            f.write('\n')
    return True
def __init__(self, config):
    """Keep the config and wire up the download/parse/analyse pipeline stages."""
    self.config = config
    # Each stage is constructed from the same configuration object,
    # in pipeline order.
    for attr_name, factory in (("downloader", Downloader),
                               ("parser", Parser),
                               ("analyser", Analyser)):
        setattr(self, attr_name, factory(config))
def _download_openreview_group(papers, group_save_dir, base_url, paper_postfix,
                               downloader, time_step_in_seconds, error_log):
    """Download one presentation group (oral/spotlight/poster) of openreview papers.

    :param papers: list of bs4 <li class="note"> tags; the first <a> is the
        title, the second <a> the pdf link
    :param group_save_dir: str, directory this group's pdfs are saved into
        (created if missing)
    :param base_url: str, base used to absolutize relative pdf hrefs
    :param paper_postfix: str, appended to the slugified title in the filename
    :param downloader: Downloader instance used to fetch the pdf
    :param time_step_in_seconds: int, pause between two download requests
    :param error_log: list, failures are appended as (name, link) tuples
    """
    num_papers = len(papers)
    os.makedirs(group_save_dir, exist_ok=True)
    for index, paper in enumerate(papers):
        a_hrefs = paper.find_all("a")
        name = slugify(a_hrefs[0].text.strip())
        pdf_name = name + '_' + paper_postfix + '.pdf'
        if not os.path.exists(os.path.join(group_save_dir, pdf_name)):
            link = a_hrefs[1].get('href')
            link = urllib.parse.urljoin(base_url, link)
            print('Downloading paper {}/{}: {}'.format(index + 1, num_papers,
                                                       name))
            # try 1 times (loop kept so the retry count is easy to raise)
            success_flag = False
            for d_iter in range(1):
                try:
                    downloader.download(
                        urls=link,
                        save_path=os.path.join(group_save_dir, pdf_name),
                        time_sleep_in_seconds=time_step_in_seconds)
                    success_flag = True
                    break
                except Exception as e:
                    print('Error: ' + name + ' - ' + str(e))
                    # time.sleep(time_step_in_seconds)
            if not success_flag:
                error_log.append((name, link))


def download_iclr_paper_given_html_file(year, html_path, save_dir,
                                        time_step_in_seconds=10,
                                        downloader='IDM'):
    """
    download iclr conference paper given html file (current only support 2021)
    :param year: int, iclr year, current only support year >= 2018
    :param html_path: str, html file's full pathname
    :param save_dir: str, paper save path; oral/spotlight/poster papers go to
        save_dir/oral, save_dir/spotlight and save_dir/poster respectively
    :param time_step_in_seconds: int, the interval time between two downlaod
        request in seconds
    :param downloader: str, the downloader to download, could be 'IDM' or
        'Thunder', default to 'IDM'
    :return: True
    """
    downloader = Downloader(downloader=downloader)
    base_url = f'https://openreview.net/group?id=ICLR.cc/{year}'
    content = open(html_path, 'rb').read()
    soup = BeautifulSoup(content, 'html5lib')
    divs = soup.find('div', {'class': 'tabs-container'})
    # the saved page groups papers into one tab per presentation type
    oral_papers = divs.find('div', {'id': 'oral-presentations'}).find_all(
        'li', {'class': 'note'})
    num_oral_papers = len(oral_papers)
    print('found number of oral papers:', num_oral_papers)
    spotlight_papers = divs.find('div', {'id': 'spotlight-presentations'}).find_all(
        'li', {'class': 'note'})
    num_spotlight_papers = len(spotlight_papers)
    print('found number of spotlight papers:', num_spotlight_papers)
    poster_papers = divs.find('div', {'id': 'poster-presentations'}).find_all(
        'li', {'class': 'note'})
    num_poster_papers = len(poster_papers)
    print('found number of poster papers:', num_poster_papers)
    paper_postfix = f'ICLR_{year}'
    error_log = []
    # oral
    print('downloading oral papers...........')
    _download_openreview_group(oral_papers, os.path.join(save_dir, 'oral'),
                               base_url, paper_postfix, downloader,
                               time_step_in_seconds, error_log)
    # spotlight
    print('downloading spotlight papers...........')
    _download_openreview_group(spotlight_papers,
                               os.path.join(save_dir, 'spotlight'), base_url,
                               paper_postfix, downloader, time_step_in_seconds,
                               error_log)
    # poster
    print('downloading poster papers...........')
    _download_openreview_group(poster_papers, os.path.join(save_dir, 'poster'),
                               base_url, paper_postfix, downloader,
                               time_step_in_seconds, error_log)
    # 2. write error log
    print('write error log')
    with open('..\\log\\download_err_log.txt', 'w') as f:
        for log in tqdm(error_log):
            for e in log:
                f.write(e)
                f.write('\n')
            f.write('\n')
def download_paper_given_volume(volume, save_dir, postfix, is_download_supplement=True, time_step_in_seconds=5,
                                downloader='IDM'):
    """
    download main and supplement papers from PMLR.
    :param volume: str, such as 'v1', 'r1'
    :param save_dir: str, paper and supplement material's save path
    :param postfix: str, the postfix will be appended to the end of papers'
        titles
    :param is_download_supplement: bool, True for downloading supplemental
        material
    :param time_step_in_seconds: int, the interval time between two downlaod
        request in seconds
    :param downloader: str, the downloader to download, could be 'IDM' or
        'Thunder', default to 'IDM'
    :return: True
    """
    downloader = Downloader(downloader=downloader)
    init_url = f'http://proceedings.mlr.press/{volume}/'
    # with supplements, papers are split into two sub-directories;
    # otherwise everything lands directly in save_dir
    if is_download_supplement:
        main_save_path = os.path.join(save_dir, 'main_paper')
        supplement_save_path = os.path.join(save_dir, 'supplement')
        os.makedirs(main_save_path, exist_ok=True)
        os.makedirs(supplement_save_path, exist_ok=True)
    else:
        main_save_path = save_dir
        os.makedirs(main_save_path, exist_ok=True)
    headers = {
        'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
    }
    req = urllib.request.Request(url=init_url, headers=headers)
    content = urllib.request.urlopen(req).read()
    soup = BeautifulSoup(content, 'html.parser')
    # every paper on a PMLR volume page sits in a <div class="paper">
    paper_list = soup.find_all('div', {'class': 'paper'})
    error_log = []
    title_list = []
    num_download = len(paper_list)
    pbar = tqdm(zip(paper_list, range(num_download)))
    for paper in pbar:
        # get title
        this_paper = paper[0]
        title = slugify(this_paper.find_all('p', {'class': 'title'})[0].text)
        try:
            pbar.set_description(
                f'Downloading paper {paper[1] + 1}/{num_download}: {title}')
        except:
            # fall back for console encodings that cannot render the title
            pbar.set_description(
                f'''Downloading paper {paper[1] + 1}/{num_download}: {title.encode('utf8')}'''
            )
        title_list.append(title)
        this_paper_main_path = os.path.join(main_save_path,
                                            f'{title}_{postfix}.pdf')
        # skip papers whose files are already on disk
        if is_download_supplement:
            this_paper_supp_path = os.path.join(supplement_save_path,
                                                f'{title}_{postfix}_supp.pdf')
            this_paper_supp_path_no_ext = os.path.join(
                supplement_save_path, f'{title}_{postfix}_supp.')
            if os.path.exists(this_paper_main_path) and os.path.exists(
                    this_paper_supp_path):
                continue
        else:
            if os.path.exists(this_paper_main_path):
                continue
        # get abstract page url
        links = this_paper.find_all('p', {'class': 'links'})[0].find_all('a')
        supp_link = None
        main_link = None
        for link in links:
            if 'Download PDF' == link.text or 'pdf' == link.text:
                main_link = link.get('href')
            elif is_download_supplement and (
                    'Supplementary PDF' == link.text or
                    'Supplementary Material' == link.text or
                    'supplementary' == link.text):
                supp_link = link.get('href')
                # non-pdf supplements keep their own extension
                # NOTE(review): `[-3:]` assumes a 3-character extension —
                # 4-character ones (e.g. docx) would be truncated; confirm.
                if supp_link[-3:] != 'pdf':
                    this_paper_supp_path = this_paper_supp_path_no_ext + supp_link[
                        -3:]
        # try 1 time
        # error_flag = False
        for d_iter in range(1):
            try:
                # download paper with IDM
                if not os.path.exists(
                        this_paper_main_path) and main_link is not None:
                    downloader.download(
                        urls=main_link,
                        save_path=this_paper_main_path,
                        time_sleep_in_seconds=time_step_in_seconds)
            except Exception as e:
                # error_flag = True
                print('Error: ' + title + ' - ' + str(e))
                error_log.append(
                    (title, main_link, 'main paper download error', str(e)))
        # download supp
        if is_download_supplement:
            # check whether the supp can be downloaded
            if not os.path.exists(
                    this_paper_supp_path) and supp_link is not None:
                try:
                    downloader.download(
                        urls=supp_link,
                        save_path=this_paper_supp_path,
                        time_sleep_in_seconds=time_step_in_seconds)
                except Exception as e:
                    # error_flag = True
                    print('Error: ' + title + ' - ' + str(e))
                    error_log.append(
                        (title, supp_link, 'supplement download error',
                         str(e)))
    # write error log
    print('writing error log...')
    with open('..\\log\\download_err_log.txt', 'w') as f:
        for log in tqdm(error_log):
            for e in log:
                if e is not None:
                    f.write(e)
                else:
                    f.write('None')
                f.write('\n')
            f.write('\n')
    return True