def get_nips_url(volume_or_year):
    """Resolve a NIPS volume number or year to its (url, volume, year) tuple
    by scraping the volume index on papers.nips.cc."""
    nips_proceedings_repository = 'https://papers.nips.cc'
    webpage_text = load_webpage(nips_proceedings_repository)
    volumes_list = extract_html_tag(webpage_text, 'a')
    nips_pattern = re.compile(
        r'(?:Advances in )?Neural Information Processing Systems '
        r'(?:(?P<volume>\d{1,2}) )?\(NIPS (?P<year>\d{4})\)',
        re.IGNORECASE)
    nips_by_year = {}
    nips_by_volume = {}
    for v in volumes_list:
        extract = nips_pattern.search(v.contents[0])
        if not extract:
            continue
        year = extract.group('year')
        year = year.strip() if year is not None else year
        volume = extract.group('volume')
        volume = volume.strip() if volume is not None else volume
        url = nips_proceedings_repository + v.get('href').strip()
        if year is not None:
            nips_by_year[year] = (url, volume, year)
        if volume is not None:
            nips_by_volume[volume] = (url, volume, year)
    # Look the argument up first as a year, then as a volume number.
    book_url = nips_by_year.get(volume_or_year)
    if book_url is None:
        book_url = nips_by_volume.get(volume_or_year)
    if book_url is None:
        raise Exception(
            'Unknown NIPS volume or year {}'.format(volume_or_year))
    return book_url
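

# Illustrative sketch only (not called by the scraper): shows how the
# nips_pattern used in get_nips_url() splits a proceedings link text into its
# optional volume number and its year. The sample strings are assumptions about
# how papers.nips.cc labels its volumes, not text copied from the live site.
def _demo_nips_pattern():
    import re
    pattern = re.compile(
        r'(?:Advances in )?Neural Information Processing Systems '
        r'(?:(?P<volume>\d{1,2}) )?\(NIPS (?P<year>\d{4})\)',
        re.IGNORECASE)
    samples = [
        'Advances in Neural Information Processing Systems 29 (NIPS 2016)',
        'Neural Information Processing Systems (NIPS 1987)',  # no volume number
    ]
    for s in samples:
        match = pattern.search(s)
        # First sample -> ('29', '2016'); second -> (None, '1987')
        print(match.group('volume'), match.group('year'))
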
def get_nips_proceedings(volume_or_year):
    nips_book_url, volume, year = get_nips_url(volume_or_year)
    meta_filename = '_metadata.pkl'
    proceedings_source = 'nips'
    proceedings_name = year.replace(' ', '_').strip()
    proceedings_dir = os.path.join(ROOT_TEMPDIR, proceedings_source,
                                   proceedings_name)
    meta_file = os.path.join(proceedings_dir, meta_filename)
    if os.path.exists(meta_file):
        print('Pickle found: {}\nReading pickle'.format(meta_file))
        with open(meta_file, 'rb') as pf:
            nips_data = pickle.load(pf)
    else:
        create_global_tempdir()
        create_dir(proceedings_dir)
        webpage_text = load_webpage(nips_book_url)
        nips_data = parse_nips_proceedings(webpage_text, year)
        with open(meta_file, 'wb') as pf:
            pickle.dump(nips_data, pf)
    return nips_data
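

# The three get_*_proceedings() functions share the same cache-or-fetch scheme:
# look for <proceedings_dir>/_metadata.pkl, unpickle it when present, otherwise
# download, parse and pickle the result. A minimal stand-alone sketch of that
# scheme; _cached_fetch, cache_dir and fetch are hypothetical names used only
# for illustration, not part of this module:
def _cached_fetch(cache_dir, key, fetch):
    """Return fetch(key), memoised as a pickle under cache_dir."""
    import os
    import pickle
    meta_file = os.path.join(cache_dir, '_metadata.pkl')
    if os.path.exists(meta_file):
        with open(meta_file, 'rb') as pf:
            return pickle.load(pf)
    os.makedirs(cache_dir, exist_ok=True)
    data = fetch(key)
    with open(meta_file, 'wb') as pf:
        pickle.dump(data, pf)
    return data
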
def get_aaai_proceedings(year):
    # http://www.aaai.org/Library/AAAI/aaai-library.php
    meta_filename = '_metadata.pkl'
    proceedings_source = 'aaai'
    proceedings_name = year.replace(' ', '_').strip()
    proceedings_dir = os.path.join(ROOT_TEMPDIR, proceedings_source,
                                   proceedings_name)
    meta_file = os.path.join(proceedings_dir, meta_filename)
    if os.path.exists(meta_file):
        print('Pickle found: {}\nReading pickle'.format(meta_file))
        with open(meta_file, 'rb') as pf:
            aaai_data = pickle.load(pf)
    else:
        create_global_tempdir()
        create_dir(proceedings_dir)
        aaai_proceedings_url = (
            'http://www.aaai.org/Library/AAAI/aaai{}contents.php'
            .format(proceedings_name[2:4]))
        webpage_text = load_webpage(aaai_proceedings_url)
        aaai_data = parse_aaai_proceedings(webpage_text, year)
        with open(meta_file, 'wb') as pf:
            pickle.dump(aaai_data, pf)
    return aaai_data
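

# get_aaai_proceedings() builds the per-year contents URL from the two-digit
# year, so the proceedings_name[2:4] slice above maps e.g. '2016' to
# .../aaai16contents.php. A quick stand-alone check of that slicing
# (illustrative only, assuming a four-digit year string):
def _demo_aaai_contents_url(year='2016'):
    url = 'http://www.aaai.org/Library/AAAI/aaai{}contents.php'.format(year[2:4])
    print(url)  # -> http://www.aaai.org/Library/AAAI/aaai16contents.php
    return url
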
def parse_old_aaai(paper_url):
    paper_webpage = load_webpage(paper_url)
    paper_soup = soup_up(paper_webpage)
    try:
        paper_soup = paper_soup.find_all('div', {'id': 'abstract'})[0]
    except IndexError:
        print('\nSkipping (unreadable web page): {}'.format(paper_url))
        return None, None, None, None, None
    title_and_url = paper_soup.find_all('h1')[0]
    title_and_url_ = title_and_url.find_all('a')
    if title_and_url_:
        title_and_url = title_and_url_[0]
        # Title
        title = title_and_url.text.strip()
        # PDF url and file name
        pdf_url = title_and_url.get('href')
        pdf_filename = None
        if pdf_url is not None:
            # Resolve the relative PDF link against the paper page: strip the
            # scheme, join the paths, then re-attach 'http:/' or 'https:/'.
            base_dir = os.path.dirname(paper_url)
            https_in = base_dir.startswith('https')
            base_dir = base_dir[7:] if https_in else base_dir[6:]
            pdf_url = os.path.normpath(os.path.join(base_dir, pdf_url))
            pdf_url = ('https:/' if https_in else 'http:/') + pdf_url
            pdf_filename = os.path.basename(pdf_url)
    else:
        # Title
        title = title_and_url.text.strip()
        pdf_url = None
        pdf_filename = None
    paper_p = paper_soup.find_all('p')
    # Track/ info
    info = paper_p[2].contents[-1].encode('utf-8').decode('utf-8')
    # Authors
    authors = paper_p[0].text.strip()
    authors = [a.strip() for a in authors.split(',')]
    # Abstract
    abstract = paper_p[1].text.strip()
    return info, title, authors, pdf_url, pdf_filename
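

# parse_old_aaai() resolves relative PDF links by stripping the URL scheme,
# joining the paths with os.path and re-attaching 'http:/' or 'https:/'. A
# minimal stand-alone sketch of the same idea on made-up URLs (POSIX paths
# assumed); urllib.parse.urljoin gives the same result with the standard
# library:
def _demo_resolve_pdf_url():
    import os
    from urllib.parse import urljoin
    paper_url = 'https://www.aaai.org/Papers/AAAI/2006/AAAI06-001.php'  # hypothetical
    relative_pdf = '../2006/AAAI06-001.pdf'                             # hypothetical
    base_dir = os.path.dirname(paper_url)[7:]  # drop the leading 'https:/'
    resolved = 'https:/' + os.path.normpath(os.path.join(base_dir, relative_pdf))
    # Both print https://www.aaai.org/Papers/AAAI/2006/AAAI06-001.pdf
    print(resolved)
    print(urljoin(paper_url, relative_pdf))
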
def get_nips_paper(url):
    nips_proceedings_repository = 'https://papers.nips.cc'
    pdf_url, pdf_filename, zip_sup_url, zip_sup_filename = None, None, None, None
    paper_page = load_webpage(url)
    paper_page_a = extract_html_tag(paper_page, 'a')
    for a in paper_page_a:
        a_contents = a.contents[0].strip().lower()
        if a_contents == '[pdf]':
            pdf_url = nips_proceedings_repository + a.get('href').strip()
            pdf_filename = os.path.basename(pdf_url)
        elif a_contents == '[supplemental]':
            zip_sup_url = nips_proceedings_repository + a.get('href').strip()
            zip_sup_filename = os.path.basename(zip_sup_url)
    return pdf_url, pdf_filename, zip_sup_url, zip_sup_filename
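

# Illustrative sketch: get_nips_paper() relies on the helper extract_html_tag(),
# which is assumed here to behave roughly like BeautifulSoup's find_all('a').
# This stand-alone snippet shows the anchor-matching logic on a hand-written
# HTML fragment (the hrefs are made up, not real papers.nips.cc paths):
def _demo_nips_paper_links():
    from bs4 import BeautifulSoup
    html = ('<a href="/paper/0000-example.pdf">[PDF]</a> '
            '<a href="/paper/0000-example-supplemental.zip">[Supplemental]</a>')
    for a in BeautifulSoup(html, 'html.parser').find_all('a'):
        label = a.contents[0].strip().lower()
        if label in ('[pdf]', '[supplemental]'):
            print(label, 'https://papers.nips.cc' + a.get('href'))
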
def parse_new_aaai(paper_url):
    paper_webpage = load_webpage(paper_url)
    paper_soup = soup_up(paper_webpage)
    try:
        # Track/ info
        track = paper_soup.find_all('div', {'id': 'breadcrumb'})[0].find_all('a')
        info = track[-2].contents[0].strip()
        # Title
        title = paper_soup.find_all('div', {'id': 'title'})
        title = title[0].contents[0].strip()
        # Authors
        authors = paper_soup.find_all('div', {'id': 'author'})
        authors = [a.strip() for a in authors[0].text.split(',')]
        # Abstract
        abstract = paper_soup.find_all('div', {'id': 'abstract'})
        abstract = abstract[0].find_all('div')[0].text.strip()
        # PDF url and file name
        pdf = paper_soup.find_all('div', {'id': 'paper'})
        pdf_url = None
        pdf_filename = None
        for p in pdf[0].find_all('a'):
            if 'pdf' in p.text.lower():
                pdf_url = p.get('href')
                break
        if pdf_url is not None:
            pdf_url = pdf_url.replace('/view/', '/download/')
            pdf_filename = '{}.pdf'.format('-'.join(pdf_url.split('/')[-2:]))
        return info, title, authors, pdf_url, pdf_filename
    except IndexError:
        print('\nSkipping (unreadable web page): {}'.format(paper_url))
        return None, None, None, None, None
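

# parse_new_aaai() turns a '/view/' paper link into the matching '/download/'
# link and derives a file name from the last two path segments. A quick
# stand-alone illustration on a made-up URL (the path and numeric ids are
# hypothetical):
def _demo_new_aaai_pdf_name():
    pdf_url = 'http://www.aaai.org/ocs/index.php/AAAI/AAAI16/paper/view/12345/67890'
    pdf_url = pdf_url.replace('/view/', '/download/')
    pdf_filename = '{}.pdf'.format('-'.join(pdf_url.split('/')[-2:]))
    print(pdf_url)       # ...paper/download/12345/67890
    print(pdf_filename)  # 12345-67890.pdf
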
def get_mlr_proceedings(volume):
    meta_filename = '_metadata.pkl'
    proceedings_source = 'mlr'
    proceedings_name = volume.replace(' ', '_').strip()
    proceedings_dir = os.path.join(ROOT_TEMPDIR, proceedings_source,
                                   proceedings_name)
    meta_file = os.path.join(proceedings_dir, meta_filename)
    if os.path.exists(meta_file):
        print('Pickle found: {}\nReading pickle'.format(meta_file))
        with open(meta_file, 'rb') as pf:
            mlrp_data = pickle.load(pf)
    else:
        create_global_tempdir()
        create_dir(proceedings_dir)
        mlr_proceedings_url = 'http://proceedings.mlr.press/{}/'.format(volume)
        webpage_text = load_webpage(mlr_proceedings_url)
        mlrp_data = parse_mlr_proceedings(webpage_text, volume)
        with open(meta_file, 'wb') as pf:
            pickle.dump(mlrp_data, pf)
    return mlrp_data
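

# Hedged usage sketch of the three proceedings entry points above. The argument
# values are examples of the expected formats (a NIPS year or volume number, an
# AAAI year, a PMLR volume id), not a claim about which proceedings the sites
# actually expose:
def _demo_proceedings_calls():
    nips_data = get_nips_proceedings('2016')  # a volume number such as '29' also works
    aaai_data = get_aaai_proceedings('2016')
    mlr_data = get_mlr_proceedings('v48')     # proceedings.mlr.press/v48/
    return nips_data, aaai_data, mlr_data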