def get_pdf(html):
    """Download every PDF linked from ``html`` into the ``CVPR2015`` folder.

    Links are taken from anchors of the form ``href="<path>.pdf">pdf`` and
    resolved against ``http://www.cv-foundation.org/openaccess/``.  The local
    file name is the part of the link that follows ``papers/``.  Files that
    already exist locally are not downloaded again.

    Args:
        html: Source text of the listing page to scan for PDF links.

    Returns:
        None.  Progress and status are reported through ``prgbar.ProgressBar``.

    Errors raised by ``os.mkdir`` or ``urllib.urlretrieve`` are not caught.
    """
    # NOTE(review): ``urllib.urlretrieve`` is the Python 2 spelling; confirm
    # the target interpreter before porting.
    base_url = 'http://www.cv-foundation.org/openaccess/'
    dir_name = 'CVPR2015'

    # Both patterns are loop-invariant, so compile them once up front.
    link_re = re.compile(r'href="(.+?\.pdf)">pdf')
    name_re = re.compile(r'papers/(.+?\.pdf)')

    pdflist = link_re.findall(html)
    pbar = prgbar.ProgressBar(total=len(pdflist))
    if not os.path.exists(dir_name):
        os.mkdir(dir_name)

    for idx, pdfurl in enumerate(pdflist, start=1):
        url = base_url + pdfurl
        pbar.log(url)
        match = name_re.search(pdfurl)
        if match is None:
            # A link without a ``papers/`` segment used to raise IndexError
            # and abort the whole batch; skip just this one instead.
            pbar.log('Skip: no papers/ segment in ' + pdfurl)
        else:
            filename = dir_name + '/' + match.group(1)
            if os.path.exists(filename):
                pbar.log('Exist')
            else:
                urllib.urlretrieve(url, filename)
        pbar.update(index=idx)
    pbar.finish()
def get_pdf(html, keywords):
    """Download every PDF linked from ``html`` into the ``CVPR2018`` folder.

    Links are taken from anchors of the form ``href="<path>.pdf">pdf`` and
    resolved against ``http://openaccess.thecvf.com/``.  The part of the link
    after ``papers/`` is split on ``_``; the first token and the last three
    tokens are dropped and the remainder is handed to
    ``contrust_paper_name`` to build the local file name.  Files that
    already exist locally are not downloaded again.

    Args:
        html: Source text of the listing page to scan for PDF links.
        keywords: Currently ignored.  It was used by a keyword filter
            (``if keywords in name_list``) that has been disabled; the
            parameter is kept so existing callers keep working.

    Returns:
        None.  Progress and status are reported through ``prgbar.ProgressBar``.

    Errors raised by ``os.mkdir`` or ``urllib.urlretrieve`` are not caught.
    """
    # NOTE(review): ``urllib.urlretrieve`` is the Python 2 spelling; confirm
    # the target interpreter before porting.
    base_url = 'http://openaccess.thecvf.com/'
    dir_name = 'CVPR2018'

    # Both patterns are loop-invariant, so compile them once up front.
    link_re = re.compile(r'href="(.+?\.pdf)">pdf')
    name_re = re.compile(r'papers/(.+?\.pdf)')

    pdflist = link_re.findall(html)
    pbar = prgbar.ProgressBar(total=len(pdflist))
    if not os.path.exists(dir_name):
        os.mkdir(dir_name)

    for idx, pdfurl in enumerate(pdflist, start=1):
        match = name_re.search(pdfurl)
        if match is None:
            # A link without a ``papers/`` segment used to raise IndexError
            # and abort the whole batch; skip just this one instead.
            pbar.log('Skip: no papers/ segment in ' + pdfurl)
        else:
            name_ori = match.group(1)
            # Presumably strips the leading author token and the trailing
            # ``CVPR_2018_paper.pdf`` tokens -- confirm against real links.
            name_list = name_ori.split('_')[1:-3]
            filename = dir_name + '/' + contrust_paper_name(name_list)
            print(filename)
            url = base_url + pdfurl
            pbar.log(url)
            if os.path.exists(filename):
                pbar.log('Exist')
            else:
                urllib.urlretrieve(url, filename)
        pbar.update(index=idx)
    pbar.finish()
def get_pdf(html):
    """Fetch each PDF referenced in ``html`` into the ``COLT2016`` folder.

    Anchors of the form ``href="<name>.pdf">pdf`` are collected, and each
    captured name is appended to ``http://jmlr.org/proceedings/papers/v49/``
    to form the download URL and to ``COLT2016/`` to form the local path.
    A file that is already present locally is left untouched.

    Args:
        html: Source text of the listing page to scan for PDF links.

    Returns:
        None.  Progress and status are reported through ``prgbar.ProgressBar``.
    """
    remote_root = 'http://jmlr.org/proceedings/papers/v49/'
    target_dir = 'COLT2016'

    links = re.compile(r'href="(.+?\.pdf)">pdf').findall(html)

    # The directory is created before the progress bar is constructed.
    if not os.path.exists(target_dir):
        os.mkdir(target_dir)

    progress = prgbar.ProgressBar(total=len(links))
    for position, link in enumerate(links, start=1):
        source = remote_root + link
        destination = target_dir + '/' + link
        progress.log(source)
        if not os.path.exists(destination):
            urllib.urlretrieve(source, destination)
        else:
            progress.log('Exist')
        progress.update(index=position)
    progress.finish()
def get_pdf(html):
    """Fetch each paper referenced in ``html`` into the ``NIPS2014`` folder.

    Every ``href="/paper/<slug>"`` attribute contributes one slug.  The
    download URL is ``http://papers.nips.cc/paper/<slug>.pdf`` and the local
    path is ``NIPS2014/<slug>.pdf``.  A file that is already present locally
    is left untouched.

    Args:
        html: Source text of the listing page to scan for paper links.

    Returns:
        None.  Progress and status are reported through ``prgbar.ProgressBar``.
    """
    remote_root = 'http://papers.nips.cc/paper/'
    target_dir = 'NIPS2014'

    slugs = re.compile(r'href="/paper/(.+?)"').findall(html)

    # The progress bar is constructed before the directory is created.
    progress = prgbar.ProgressBar(total=len(slugs))
    if not os.path.exists(target_dir):
        os.mkdir(target_dir)

    for position, slug in enumerate(slugs, start=1):
        pdf_name = slug + '.pdf'
        source = remote_root + pdf_name
        destination = target_dir + '/' + pdf_name
        progress.log(source)
        if not os.path.exists(destination):
            urllib.urlretrieve(source, destination)
        else:
            progress.log('Exist')
        progress.update(index=position)
    progress.finish()