def get_nips_url(volume_or_year):
    """Resolve a NIPS volume number or year to its (url, volume, year) tuple
    by scraping the volume index on papers.nips.cc."""
    nips_proceedings_repository = 'https://papers.nips.cc'
    webpage_text = load_webpage(nips_proceedings_repository)
    volumes_list = extract_html_tag(webpage_text, 'a')
    nips_pattern = re.compile(
        r'(?:Advances in )?Neural Information Processing Systems '
        r'(?:(?P<volume>\d{1,2}) )?\(NIPS (?P<year>\d{4})\)',
        re.IGNORECASE)
    nips_by_year = {}
    nips_by_volume = {}
    for v in volumes_list:
        extract = nips_pattern.search(v.contents[0])
        if not extract:
            continue
        year = extract.group('year')
        year = year.strip() if year is not None else year
        volume = extract.group('volume')
        volume = volume.strip() if volume is not None else volume
        url = nips_proceedings_repository + v.get('href').strip()
        if year is not None:
            nips_by_year[year] = (url, volume, year)
        if volume is not None:
            nips_by_volume[volume] = (url, volume, year)
    # Look the argument up first as a year, then as a volume number.
    book_url = nips_by_year.get(volume_or_year)
    if book_url is None:
        book_url = nips_by_volume.get(volume_or_year)
    if book_url is None:
        raise Exception(
            'Unknown NIPS volume or year {}'.format(volume_or_year))
    return book_url
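

# Illustrative sketch only (not called by the scraper): shows how the
# nips_pattern used in get_nips_url() splits a proceedings link text into its
# optional volume number and its year. The sample strings are assumptions about
# how papers.nips.cc labels its volumes, not text copied from the live site.
def _demo_nips_pattern():
    import re
    pattern = re.compile(
        r'(?:Advances in )?Neural Information Processing Systems '
        r'(?:(?P<volume>\d{1,2}) )?\(NIPS (?P<year>\d{4})\)',
        re.IGNORECASE)
    samples = [
        'Advances in Neural Information Processing Systems 29 (NIPS 2016)',
        'Neural Information Processing Systems (NIPS 1987)',  # no volume number
    ]
    for s in samples:
        match = pattern.search(s)
        # First sample -> ('29', '2016'); second -> (None, '1987')
        print(match.group('volume'), match.group('year'))
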
def get_nips_proceedings(volume_or_year):
    nips_book_url, volume, year = get_nips_url(volume_or_year)
    meta_filename = '_metadata.pkl'
    proceedings_source = 'nips'
    proceedings_name = year.replace(' ', '_').strip()
    proceedings_dir = os.path.join(ROOT_TEMPDIR, proceedings_source,
                                   proceedings_name)
    meta_file = os.path.join(proceedings_dir, meta_filename)
    if os.path.exists(meta_file):
        print('Pickle found: {}\nReading pickle'.format(meta_file))
        with open(meta_file, 'rb') as pf:
            nips_data = pickle.load(pf)
    else:
        create_global_tempdir()
        create_dir(proceedings_dir)
        webpage_text = load_webpage(nips_book_url)
        nips_data = parse_nips_proceedings(webpage_text, year)
        with open(meta_file, 'wb') as pf:
            pickle.dump(nips_data, pf)
    return nips_data
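

# The three get_*_proceedings() functions share the same cache-or-fetch scheme:
# look for <proceedings_dir>/_metadata.pkl, unpickle it when present, otherwise
# download, parse and pickle the result. A minimal stand-alone sketch of that
# scheme; _cached_fetch, cache_dir and fetch are hypothetical names used only
# for illustration, not part of this module:
def _cached_fetch(cache_dir, key, fetch):
    """Return fetch(key), memoised as a pickle under cache_dir."""
    import os
    import pickle
    meta_file = os.path.join(cache_dir, '_metadata.pkl')
    if os.path.exists(meta_file):
        with open(meta_file, 'rb') as pf:
            return pickle.load(pf)
    os.makedirs(cache_dir, exist_ok=True)
    data = fetch(key)
    with open(meta_file, 'wb') as pf:
        pickle.dump(data, pf)
    return data
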
def get_aaai_proceedings(year):
    # http://www.aaai.org/Library/AAAI/aaai-library.php
    meta_filename = '_metadata.pkl'
    proceedings_source = 'aaai'
    proceedings_name = year.replace(' ', '_').strip()
    proceedings_dir = os.path.join(ROOT_TEMPDIR, proceedings_source,
                                   proceedings_name)
    meta_file = os.path.join(proceedings_dir, meta_filename)
    if os.path.exists(meta_file):
        print('Pickle found: {}\nReading pickle'.format(meta_file))
        with open(meta_file, 'rb') as pf:
            aaai_data = pickle.load(pf)
    else:
        create_global_tempdir()
        create_dir(proceedings_dir)
        aaai_proceedings_url = (
            'http://www.aaai.org/Library/AAAI/aaai{}contents.php'
            .format(proceedings_name[2:4]))
        webpage_text = load_webpage(aaai_proceedings_url)
        aaai_data = parse_aaai_proceedings(webpage_text, year)
        with open(meta_file, 'wb') as pf:
            pickle.dump(aaai_data, pf)
    return aaai_data
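

# get_aaai_proceedings() builds the per-year contents URL from the two-digit
# year, so the proceedings_name[2:4] slice above maps e.g. '2016' to
# .../aaai16contents.php. A quick stand-alone check of that slicing
# (illustrative only, assuming a four-digit year string):
def _demo_aaai_contents_url(year='2016'):
    url = 'http://www.aaai.org/Library/AAAI/aaai{}contents.php'.format(year[2:4])
    print(url)  # -> http://www.aaai.org/Library/AAAI/aaai16contents.php
    return url
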
def parse_old_aaai(paper_url):
    paper_webpage = load_webpage(paper_url)
    paper_soup = soup_up(paper_webpage)
    try:
        paper_soup = paper_soup.find_all('div', {'id': 'abstract'})[0]
    except IndexError:
        print('\nSkipping (unreadable web page): {}'.format(paper_url))
        return None, None, None, None, None
    title_and_url = paper_soup.find_all('h1')[0]
    title_and_url_ = title_and_url.find_all('a')
    if title_and_url_:
        title_and_url = title_and_url_[0]
        # Title
        title = title_and_url.text.strip()
        # PDF url and file name
        pdf_url = title_and_url.get('href')
        pdf_filename = None
        if pdf_url is not None:
            # Resolve the relative PDF link against the paper page: strip the
            # scheme, join the paths, then re-attach 'http:/' or 'https:/'.
            base_dir = os.path.dirname(paper_url)
            https_in = base_dir.startswith('https')
            base_dir = base_dir[7:] if https_in else base_dir[6:]
            pdf_url = os.path.normpath(os.path.join(base_dir, pdf_url))
            pdf_url = ('https:/' if https_in else 'http:/') + pdf_url
            pdf_filename = os.path.basename(pdf_url)
    else:
        # Title
        title = title_and_url.text.strip()
        pdf_url = None
        pdf_filename = None
    paper_p = paper_soup.find_all('p')
    # Track/ info
    info = paper_p[2].contents[-1].encode('utf-8').decode('utf-8')
    # Authors
    authors = paper_p[0].text.strip()
    authors = [a.strip() for a in authors.split(',')]
    # Abstract
    abstract = paper_p[1].text.strip()
    return info, title, authors, pdf_url, pdf_filename
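

# parse_old_aaai() resolves relative PDF links by stripping the URL scheme,
# joining the paths with os.path and re-attaching 'http:/' or 'https:/'. A
# minimal stand-alone sketch of the same idea on made-up URLs (POSIX paths
# assumed); urllib.parse.urljoin gives the same result with the standard
# library:
def _demo_resolve_pdf_url():
    import os
    from urllib.parse import urljoin
    paper_url = 'https://www.aaai.org/Papers/AAAI/2006/AAAI06-001.php'  # hypothetical
    relative_pdf = '../2006/AAAI06-001.pdf'                             # hypothetical
    base_dir = os.path.dirname(paper_url)[7:]  # drop the leading 'https:/'
    resolved = 'https:/' + os.path.normpath(os.path.join(base_dir, relative_pdf))
    # Both print https://www.aaai.org/Papers/AAAI/2006/AAAI06-001.pdf
    print(resolved)
    print(urljoin(paper_url, relative_pdf))
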
def get_nips_paper(url):
    nips_proceedings_repository = 'https://papers.nips.cc'
    pdf_url, pdf_filename, zip_sup_url, zip_sup_filename = None, None, None, None
    paper_page = load_webpage(url)
    paper_page_a = extract_html_tag(paper_page, 'a')
    for a in paper_page_a:
        a_contents = a.contents[0].strip().lower()
        if a_contents == '[pdf]':
            pdf_url = nips_proceedings_repository + a.get('href').strip()
            pdf_filename = os.path.basename(pdf_url)
        elif a_contents == '[supplemental]':
            zip_sup_url = nips_proceedings_repository + a.get('href').strip()
            zip_sup_filename = os.path.basename(zip_sup_url)
    return pdf_url, pdf_filename, zip_sup_url, zip_sup_filename
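

# Illustrative sketch: get_nips_paper() relies on the helper extract_html_tag(),
# which is assumed here to behave roughly like BeautifulSoup's find_all('a').
# This stand-alone snippet shows the anchor-matching logic on a hand-written
# HTML fragment (the hrefs are made up, not real papers.nips.cc paths):
def _demo_nips_paper_links():
    from bs4 import BeautifulSoup
    html = ('<a href="/paper/0000-example.pdf">[PDF]</a> '
            '<a href="/paper/0000-example-supplemental.zip">[Supplemental]</a>')
    for a in BeautifulSoup(html, 'html.parser').find_all('a'):
        label = a.contents[0].strip().lower()
        if label in ('[pdf]', '[supplemental]'):
            print(label, 'https://papers.nips.cc' + a.get('href'))
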
def parse_new_aaai(paper_url):
    paper_webpage = load_webpage(paper_url)
    paper_soup = soup_up(paper_webpage)
    try:
        # Track/ info
        track = paper_soup.find_all('div', {'id': 'breadcrumb'})[0].find_all('a')
        info = track[-2].contents[0].strip()
        # Title
        title = paper_soup.find_all('div', {'id': 'title'})
        title = title[0].contents[0].strip()
        # Authors
        authors = paper_soup.find_all('div', {'id': 'author'})
        authors = [a.strip() for a in authors[0].text.split(',')]
        # Abstract
        abstract = paper_soup.find_all('div', {'id': 'abstract'})
        abstract = abstract[0].find_all('div')[0].text.strip()
        # PDF url and file name
        pdf = paper_soup.find_all('div', {'id': 'paper'})
        pdf_url = None
        pdf_filename = None
        for p in pdf[0].find_all('a'):
            if 'pdf' in p.text.lower():
                pdf_url = p.get('href')
                break
        if pdf_url is not None:
            pdf_url = pdf_url.replace('/view/', '/download/')
            pdf_filename = '{}.pdf'.format('-'.join(pdf_url.split('/')[-2:]))
        return info, title, authors, pdf_url, pdf_filename
    except IndexError:
        print('\nSkipping (unreadable web page): {}'.format(paper_url))
        return None, None, None, None, None
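

# parse_new_aaai() turns a '/view/' paper link into the matching '/download/'
# link and derives a file name from the last two path segments. A quick
# stand-alone illustration on a made-up URL (the path and numeric ids are
# hypothetical):
def _demo_new_aaai_pdf_name():
    pdf_url = 'http://www.aaai.org/ocs/index.php/AAAI/AAAI16/paper/view/12345/67890'
    pdf_url = pdf_url.replace('/view/', '/download/')
    pdf_filename = '{}.pdf'.format('-'.join(pdf_url.split('/')[-2:]))
    print(pdf_url)       # ...paper/download/12345/67890
    print(pdf_filename)  # 12345-67890.pdf
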
def get_mlr_proceedings(volume):
    meta_filename = '_metadata.pkl'
    proceedings_source = 'mlr'
    proceedings_name = volume.replace(' ', '_').strip()
    proceedings_dir = os.path.join(ROOT_TEMPDIR, proceedings_source,
                                   proceedings_name)
    meta_file = os.path.join(proceedings_dir, meta_filename)
    if os.path.exists(meta_file):
        print('Pickle found: {}\nReading pickle'.format(meta_file))
        with open(meta_file, 'rb') as pf:
            mlrp_data = pickle.load(pf)
    else:
        create_global_tempdir()
        create_dir(proceedings_dir)
        mlr_proceedings_url = 'http://proceedings.mlr.press/{}/'.format(volume)
        webpage_text = load_webpage(mlr_proceedings_url)
        mlrp_data = parse_mlr_proceedings(webpage_text, volume)
        with open(meta_file, 'wb') as pf:
            pickle.dump(mlrp_data, pf)
    return mlrp_data
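

# Hedged usage sketch of the three proceedings entry points above. The argument
# values are examples of the expected formats (a NIPS year or volume number, an
# AAAI year, a PMLR volume id), not a claim about which proceedings the sites
# actually expose:
def _demo_proceedings_calls():
    nips_data = get_nips_proceedings('2016')  # a volume number such as '29' also works
    aaai_data = get_aaai_proceedings('2016')
    mlr_data = get_mlr_proceedings('v48')     # proceedings.mlr.press/v48/
    return nips_data, aaai_data, mlr_data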