import re
import json
import gzip
import logging

from bs4 import BeautifulSoup

# SimpleScraper is imported from the local scraper module elsewhere in this repo;
# SeleniumScraper is assumed to live in the same module.
from scraper import SimpleScraper, SeleniumScraper


def huffingtonpost_mostpop(n=5, results=None):
    """
    There is a JSON API:
    http://www.huffpost.com/mapi/v2/us/trending?device=desktop&statsType=rawPageView&statsPlatform=desktop&algo=trending
    """
    if results is None:
        results = []
    scraper = SimpleScraper()
    json_str = scraper.get('http://www.huffpost.com/mapi/v2/us/trending?'
                           'device=desktop&statsType=rawPageView&'
                           'statsPlatform=desktop&algo=trending')
    if not json_str:
        logging.error("Cannot get website")
        return results
    j = json.loads(json_str)
    for e in j['results']['entries']:
        if e['section_name'].lower() == 'politics':
            data = get_record_template('huffingtonpost', 'mostpop-politics')
            data['url'] = e['huffpost_url']
            data['link_text'] = e['label']
            if data not in results:
                results.append(data)
            if len(results) >= n:
                break
    return results
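# The collectors in this file rely on get_record_template(), which is not defined
# or imported in this excerpt. The sketch below shows only the minimal shape the
# rest of the code assumes (a dict with 'url' and 'link_text' slots); any other
# fields of the real template are unknown, so this is an assumption, not the
# original implementation.
def get_record_template(src, src_type):
    return {
        'src': src,            # e.g. 'huffingtonpost'
        'src_type': src_type,  # e.g. 'mostpop-politics'
        'url': None,
        'link_text': None,
    }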
def usatoday_mostpop(n=5, results=None):
    """
    There are only 4 items on the page.
    """
    if results is None:
        results = []
    scraper = SimpleScraper()
    html = scraper.get('http://www.usatoday.com/')
    if not html:
        logging.error("Cannot get website")
        return results
    with open('html/usatoday-mostpop.html', 'w', encoding='utf-8') as f:
        f.write(html)
    soup = BeautifulSoup(html, 'lxml')
    for a in soup.select('.hfwmm-light-list-link')[:n]:
        data = get_record_template('usatoday', 'mostpop')
        data['url'] = 'http://www.usatoday.com' + a['href']
        data['link_text'] = a.text.strip()
        if data not in results:
            results.append(data)
        if len(results) >= n:
            break
    return results
def foxnews_mostpop(section='politics', n=5, results=None):
    """
    There is a JSON API:
    http://www.foxnews.com/feeds/trending/all/feed/json?callback=articles
    http://www.foxnews.com/feeds/trending/politics/feed/json?callback=articles
    """
    if results is None:
        results = []
    scraper = SimpleScraper()
    json_str = scraper.get('http://www.foxnews.com/feeds/trending/{0:s}/feed/json?callback=articles'
                           .format(section))
    if not json_str:
        logging.error("Cannot get website")
        return results
    # The feed is JSONP; strip the articles(...) callback wrapper before parsing.
    m = re.match(r'articles\((.*)\)', json_str, flags=re.S)
    if m:
        json_str = m.group(1)
    j = json.loads(json_str)
    for d in j['response']['docs']:
        data = get_record_template('foxnews', 'mostpop-{0:s}'.format(section))
        data['url'] = d['url'][0]
        data['link_text'] = d['title']
        if data not in results:
            results.append(data)
        if len(results) >= n:
            break
    return results
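# Small illustration of the JSONP unwrapping used above, pulled out as a helper.
# This function is not part of the original code; it just shows the same
# strip-the-callback step in isolation.
def unwrap_jsonp(jsonp_str, callback='articles'):
    m = re.match(r'{0}\((.*)\)'.format(callback), jsonp_str, flags=re.S)
    return json.loads(m.group(1) if m else jsonp_str)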
def washingtonpost_topmost(n=5, results=None):
    if results is None:
        results = []
    scraper = SimpleScraper()
    html = scraper.get('https://www.washingtonpost.com/pb/themost/')
    if not html:
        logging.error("Cannot get website")
        return results
    soup = BeautifulSoup(html, 'html.parser')
    for div in soup.select('.feed-link'):
        if 'feed-title' in div['class']:
            continue
        data = get_record_template('washingtonpost', 'themost-atlantic')
        try:
            data['link_text'] = div.span.text.strip()
        except AttributeError:
            continue
        # The target URL is stashed in the onclick handler, e.g. window.open('<url>', ...).
        onclick = div['onclick']
        m = re.match(r"window\.open\('(.*?)'.*", onclick)
        if m:
            data['url'] = m.group(1)
        if data not in results:
            results.append(data)
        if len(results) >= n:
            break
    return results
def download_webpage(url, filepath, compress=False, selenium=False):
    scraper = SimpleScraper()
    html = scraper.get(url)
    if selenium:
        # Fall back to a Selenium-driven fetch when the plain request fails or
        # only returns a JavaScript redirect stub.
        if not html or html.find('Redirecting to...') != -1:
            scraper = SeleniumScraper()
            html = scraper.get(url)
            scraper.driver.close()
    if not html:
        html = ''
    logging.info("Saving to file {0:s}".format(filepath))
    if compress:
        with gzip.open(filepath, 'wb') as f:
            f.write(bytes(html, 'utf-8'))
    else:
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(html)
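# Hedged usage example for download_webpage(); the URL and file name below are
# placeholders, not part of the original code. With compress=True the page is
# written gzip-compressed, so it has to be read back through gzip as shown.
def _example_download():
    download_webpage('http://www.example.com/', 'example.html.gz', compress=True)
    with gzip.open('example.html.gz', 'rb') as f:
        return f.read().decode('utf-8')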
def nyt_mostviewed(section='national', time_period=1, offset=0, n=5,
                   results=None, api_key=''):
    """
    REF: https://developer.nytimes.com/most_popular_api_v2.json

    https://api.nytimes.com/svc/mostpopular/v2/mostviewed/all-sections/1.json?api_key=e9857cefff754d7dacad8df079a803c0
    https://api.nytimes.com/svc/mostpopular/v2/mostviewed/national/1.json?api_key=e9857cefff754d7dacad8df079a803c0

    Example response:
    {
      "status": "string",
      "copyright": "string",
      "num_results": 0,
      "results": [
        {
          "url": "string",
          "column": "string",
          "section": "string",
          "byline": "string",
          "title": "string",
          "abstract": "string",
          "published_date": "string",
          "source": "string"
        }
      ]
    }
    """
    if results is None:
        results = []
    scraper = SimpleScraper()
    url = ('https://api.nytimes.com/svc/mostpopular/v2/mostviewed/{0}/{1}.json'
           '?api-key={2}'.format(section, time_period, api_key))
    json_str = scraper.get(url)
    if not json_str:
        logging.error("Cannot get website")
        return results
    j = json.loads(json_str)
    for r in j['results']:
        data = get_record_template('nyt', 'mostviewed-{0:s}'.format(section))
        data['url'] = r['url']
        data['link_text'] = r['title']
        if data not in results:
            results.append(data)
        if len(results) >= n:
            break
    return results
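# Hypothetical call to nyt_mostviewed(): NYT_API_KEY is an assumed environment
# variable name, not something the original code reads.
def _example_nyt_top5():
    import os
    return nyt_mostviewed(section='national', time_period=7, n=5,
                          api_key=os.environ.get('NYT_API_KEY', ''))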
def foxnews_feeds(section='national', n=5, results=None):
    if results is None:
        results = []
    scraper = SimpleScraper()
    html = scraper.get('http://feeds.foxnews.com/foxnews/{0:s}'.format(section))
    if not html:
        logging.error("Cannot get website")
        return results
    soup = BeautifulSoup(html, 'lxml')
    for item in soup.select('item'):
        data = get_record_template('foxnews', 'feeds-{0:s}'.format(section))
        data['url'] = item.select('guid')[0].text
        data['link_text'] = item.select('title')[0].text
        if data not in results:
            results.append(data)
        if len(results) >= n:
            break
    return results
def washingtonpost_top_politics(n=5, results=None):
    if results is None:
        results = []
    scraper = SimpleScraper()
    html = scraper.get('https://www.washingtonpost.com/politics/?nid=top_nav_politics')
    if not html:
        logging.error("Cannot get website")
        return results
    soup = BeautifulSoup(html, 'html.parser')
    for a in soup.select('.story-headline h3 a'):
        data = get_record_template('washingtonpost', 'top-politics')
        data['url'] = a['href']
        data['link_text'] = a.text
        if data not in results:
            results.append(data)
        if len(results) >= n:
            break
    return results
def rss_yahoo_news_top_politics(n=5, results=None, src_list='original'):
    if results is None:
        results = []
    sources = {'original': 'yahoo', 'ap': 'ap.org', 'reuters': 'reuters.com'}
    retry = 0
    while retry < 5:
        scraper = SimpleScraper()
        html = scraper.get('https://news.yahoo.com/rss/politics')
        if not html:
            logging.error("Cannot get website")
            return results
        with open('html/yahoo-news-top-politics-{0}.html'.format(src_list), 'w',
                  encoding='utf-8') as f:
            f.write(html)
        soup = BeautifulSoup(html, 'xml')
        for item in soup.select('item'):
            source_url = item.source['url']
            if source_url.find(sources[src_list]) != -1:
                href = item.link.text
                if href.startswith('https://news.yahoo.com/'):
                    data = get_record_template('yahoo', 'mostpop-{0:s}'.format(src_list))
                    data['url'] = href
                    data['link_text'] = item.title.text
                    if data not in results:
                        results.append(data)
                    if len(results) >= n:
                        return results
        retry += 1
        logging.info("Failed to change Yahoo news provider, retry #{0:d}".format(retry))
    return results
def google_news_politics(n=5, results=None):
    if results is None:
        results = []
    scraper = SimpleScraper()
    html = scraper.get('https://news.google.com/?ned=us&topic=po')
    if not html:
        logging.error("Cannot get website")
        return results
    with open('html/google-news-politics.html', 'w', encoding='utf-8') as f:
        f.write(html)
    soup = BeautifulSoup(html, 'lxml')
    for a in soup.select('h2 a.article'):
        data = get_record_template('google', 'mostpop-politics')
        data['url'] = a['href']
        data['link_text'] = a.text
        if data not in results:
            results.append(data)
        if len(results) >= n:
            break
    return results
def wsj_mostpop_politics(n=5, results=None):
    if results is None:
        results = []
    scraper = SimpleScraper()
    html = scraper.get('http://www.wsj.com/news/politics')
    if not html:
        logging.error("Cannot get website")
        return results
    with open('html/wsj-mostpop-politics.html', 'w', encoding='utf-8') as f:
        f.write(html)
    soup = BeautifulSoup(html, 'lxml')
    for a in soup.select('.wsj-popular-list.article .wsj-popular-item .pop-item-link'):
        data = get_record_template('wsj', 'mostpop-politics')
        data['url'] = a['href']
        data['link_text'] = a.text
        if data not in results:
            results.append(data)
        if len(results) >= n:
            break
    return results
def washingtonpost_mostread(section='politics', n=5, results=None):
    if results is None:
        results = []
    scraper = SimpleScraper()
    html = scraper.get('https://www.washingtonpost.com/{0:s}'.format(section))
    if not html:
        logging.error("Cannot get website")
        return results
    soup = BeautifulSoup(html, 'html.parser')
    most_read = soup.find('div', {'id': 'post-most-rr'})
    if most_read is None:
        return results
    for h in most_read.select('div.headline'):
        a = h.parent
        if a.name != 'a':
            continue
        data = get_record_template('washingtonpost', 'mostread-{0:s}'.format(section))
        data['url'] = a['href']
        data['link_text'] = h.text
        if data not in results:
            results.append(data)
        if len(results) >= n:
            break
    return results
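# Sketch of how the collectors above might be combined. Because each function
# appends to (and deduplicates against) the list it is given, one list can be
# threaded through several sources. The CSV output and column handling here are
# assumptions for illustration, not part of the original code.
import csv

def collect_top_politics(n=5):
    results = []
    google_news_politics(n=n, results=results)
    wsj_mostpop_politics(n=n, results=results)
    washingtonpost_mostread(section='politics', n=n, results=results)
    return results

def write_results_csv(rows, path='top_politics.csv'):
    fieldnames = sorted({k for row in rows for k in row})
    with open(path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)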
import os
import sys
import csv

from scraper import SimpleScraper

BASE_URL = 'http://www.presspass.me/journalist'

if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: {0!s} <input file>".format(__file__))
        sys.exit()
    if not os.path.exists('./presspass'):
        os.mkdir('./presspass')
    f = open(sys.argv[1])
    reader = csv.DictReader(f)
    scraper = SimpleScraper()
    i = 0
    count = 0
    for r in reader:
        count += 1
        name = r['twitter.username']
        print(count, "==>", name)
        html = scraper.get(BASE_URL + '/{0!s}'.format(name))
        if html:
            # Write to a separate handle so the input file 'f' stays open.
            with open('presspass/{0!s}.html'.format(name), "wb") as out:
                out.write(str.encode(html))
            i += 1
    print("Found: {0:d}".format(i))
    f.close()
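# Example invocation of the script above (assuming it is saved as presspass.py,
# a name not given in the original; the input CSV must have a 'twitter.username'
# column, as the code expects):
#
#   python presspass.py journalists.csv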
import os
import re

from scraper import SimpleScraper
from bs4 import BeautifulSoup

HOUSE = ['rajyasabha', 'loksabha']
BASE_URL = 'http://www.archive.india.gov.in/govt/'

if __name__ == "__main__":
    scraper = SimpleScraper()
    for h in HOUSE:
        print("House type: {0!s}".format(h))
        if not os.path.exists('./{0!s}'.format(h)):
            os.mkdir('./{0!s}'.format(h))
        html = scraper.get(BASE_URL + '{0!s}.php?alpha=all'.format(h))
        if not html:
            continue
        soup = BeautifulSoup(html, 'lxml')
        i = 0
        for a in soup.find_all("a", href=True):
            link = a['href']
            m = re.match(r".*?mpcode=(\d+)", link)
            if m:
                i += 1
                mpcode = m.group(1)
                print(i, link)
                html2 = scraper.get(BASE_URL + link)
                if html2:
                    with open('{0!s}/detail-{1!s}.html'.format(h, mpcode), "wb") as f:
                        f.write(str.encode(html2))
import os
import re

from scraper import SimpleScraper
from bs4 import BeautifulSoup

HOUSE = ["rajyasabha", "loksabha"]
BASE_URL = "http://www.archive.india.gov.in/govt/"

if __name__ == "__main__":
    scraper = SimpleScraper()
    for h in HOUSE:
        print("House type: {0!s}".format(h))
        if not os.path.exists("./{0!s}".format(h)):
            os.mkdir("./{0!s}".format(h))
        html = scraper.get(BASE_URL + "{0!s}.php?alpha=all".format(h))
        if not html:
            continue
        soup = BeautifulSoup(html, "lxml")
        i = 0
        for a in soup.find_all("a", href=True):
            link = a["href"]
            m = re.match(r".*?mpcode=(\d+)", link)
            if m:
                i += 1
                mpcode = m.group(1)
                print(i, link)
                html2 = scraper.get(BASE_URL + link)
                if html2:
                    with open("{0!s}/detail-{1!s}.html".format(h, mpcode), "wb") as f:
                        f.write(str.encode(html2))
                html3 = scraper.get(BASE_URL + "{0!s}mpbiodata.php?mpcode={1!s}".format(h, mpcode))
                # The original script ends here without using html3; the save
                # below is an assumed completion and the 'biodata-' filename is a guess.
                if html3:
                    with open("{0!s}/biodata-{1!s}.html".format(h, mpcode), "wb") as f:
                        f.write(str.encode(html3))