import re
import json
import gzip
import logging

from bs4 import BeautifulSoup

# SimpleScraper is imported from the local scraper module elsewhere in this repo;
# SeleniumScraper is assumed to live in the same module.
from scraper import SimpleScraper, SeleniumScraper


def huffingtonpost_mostpop(n=5, results=None):
    """
    There is a JSON API:
    http://www.huffpost.com/mapi/v2/us/trending?device=desktop&statsType=rawPageView&statsPlatform=desktop&algo=trending
    """
    if results is None:
        results = []
    scraper = SimpleScraper()
    json_str = scraper.get('http://www.huffpost.com/mapi/v2/us/trending?'
                           'device=desktop&statsType=rawPageView&'
                           'statsPlatform=desktop&algo=trending')
    if not json_str:
        logging.error("Cannot get website")
        return results
    j = json.loads(json_str)
    for e in j['results']['entries']:
        if e['section_name'].lower() == 'politics':
            data = get_record_template('huffingtonpost', 'mostpop-politics')
            data['url'] = e['huffpost_url']
            data['link_text'] = e['label']
            if data not in results:
                results.append(data)
            if len(results) >= n:
                break
    return results
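# The collectors in this file rely on get_record_template(), which is not defined
# or imported in this excerpt. The sketch below shows only the minimal shape the
# rest of the code assumes (a dict with 'url' and 'link_text' slots); any other
# fields of the real template are unknown, so this is an assumption, not the
# original implementation.
def get_record_template(src, src_type):
    return {
        'src': src,            # e.g. 'huffingtonpost'
        'src_type': src_type,  # e.g. 'mostpop-politics'
        'url': None,
        'link_text': None,
    }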
def usatoday_mostpop(n=5, results=None):
    """
    There are only 4 items on the page.
    """
    if results is None:
        results = []
    scraper = SimpleScraper()
    html = scraper.get('http://www.usatoday.com/')
    if not html:
        logging.error("Cannot get website")
        return results
    with open('html/usatoday-mostpop.html', 'w', encoding='utf-8') as f:
        f.write(html)
    soup = BeautifulSoup(html, 'lxml')
    for a in soup.select('.hfwmm-light-list-link')[:n]:
        data = get_record_template('usatoday', 'mostpop')
        data['url'] = 'http://www.usatoday.com' + a['href']
        data['link_text'] = a.text.strip()
        if data not in results:
            results.append(data)
        if len(results) >= n:
            break
    return results
def foxnews_mostpop(section='politics', n=5, results=None):
    """
    There is a JSON API:
    http://www.foxnews.com/feeds/trending/all/feed/json?callback=articles
    http://www.foxnews.com/feeds/trending/politics/feed/json?callback=articles
    """
    if results is None:
        results = []
    scraper = SimpleScraper()
    json_str = scraper.get('http://www.foxnews.com/feeds/trending/{0:s}/feed/json?callback=articles'
                           .format(section))
    if not json_str:
        logging.error("Cannot get website")
        return results
    # The feed is JSONP; strip the articles(...) callback wrapper before parsing.
    m = re.match(r'articles\((.*)\)', json_str, flags=re.S)
    if m:
        json_str = m.group(1)
    j = json.loads(json_str)
    for d in j['response']['docs']:
        data = get_record_template('foxnews', 'mostpop-{0:s}'.format(section))
        data['url'] = d['url'][0]
        data['link_text'] = d['title']
        if data not in results:
            results.append(data)
        if len(results) >= n:
            break
    return results
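# Small illustration of the JSONP unwrapping used above, pulled out as a helper.
# This function is not part of the original code; it just shows the same
# strip-the-callback step in isolation.
def unwrap_jsonp(jsonp_str, callback='articles'):
    m = re.match(r'{0}\((.*)\)'.format(callback), jsonp_str, flags=re.S)
    return json.loads(m.group(1) if m else jsonp_str)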
def washingtonpost_topmost(n=5, results=None):
    if results is None:
        results = []
    scraper = SimpleScraper()
    html = scraper.get('https://www.washingtonpost.com/pb/themost/')
    if not html:
        logging.error("Cannot get website")
        return results
    soup = BeautifulSoup(html, 'html.parser')
    for div in soup.select('.feed-link'):
        if 'feed-title' in div['class']:
            continue
        data = get_record_template('washingtonpost', 'themost-atlantic')
        try:
            data['link_text'] = div.span.text.strip()
        except AttributeError:
            continue
        # The target URL is stashed in the onclick handler, e.g. window.open('<url>', ...).
        onclick = div['onclick']
        m = re.match(r"window\.open\('(.*?)'.*", onclick)
        if m:
            data['url'] = m.group(1)
        if data not in results:
            results.append(data)
        if len(results) >= n:
            break
    return results
def download_webpage(url, filepath, compress=False, selenium=False):
    scraper = SimpleScraper()
    html = scraper.get(url)
    if selenium:
        # Fall back to a Selenium-driven fetch when the plain request fails or
        # only returns a JavaScript redirect stub.
        if not html or html.find('Redirecting to...') != -1:
            scraper = SeleniumScraper()
            html = scraper.get(url)
            scraper.driver.close()
    if not html:
        html = ''
    logging.info("Saving to file {0:s}".format(filepath))
    if compress:
        with gzip.open(filepath, 'wb') as f:
            f.write(bytes(html, 'utf-8'))
    else:
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(html)
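# Hedged usage example for download_webpage(); the URL and file name below are
# placeholders, not part of the original code. With compress=True the page is
# written gzip-compressed, so it has to be read back through gzip as shown.
def _example_download():
    download_webpage('http://www.example.com/', 'example.html.gz', compress=True)
    with gzip.open('example.html.gz', 'rb') as f:
        return f.read().decode('utf-8')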
def nyt_mostviewed(section='national', time_period=1, offset=0, n=5,
                   results=None, api_key=''):
    """
    REF: https://developer.nytimes.com/most_popular_api_v2.json

    https://api.nytimes.com/svc/mostpopular/v2/mostviewed/all-sections/1.json?api_key=e9857cefff754d7dacad8df079a803c0
    https://api.nytimes.com/svc/mostpopular/v2/mostviewed/national/1.json?api_key=e9857cefff754d7dacad8df079a803c0

    Example response:
    {
      "status": "string",
      "copyright": "string",
      "num_results": 0,
      "results": [
        {
          "url": "string",
          "column": "string",
          "section": "string",
          "byline": "string",
          "title": "string",
          "abstract": "string",
          "published_date": "string",
          "source": "string"
        }
      ]
    }
    """
    if results is None:
        results = []
    scraper = SimpleScraper()
    url = ('https://api.nytimes.com/svc/mostpopular/v2/mostviewed/{0}/{1}.json'
           '?api-key={2}'.format(section, time_period, api_key))
    json_str = scraper.get(url)
    if not json_str:
        logging.error("Cannot get website")
        return results
    j = json.loads(json_str)
    for r in j['results']:
        data = get_record_template('nyt', 'mostviewed-{0:s}'.format(section))
        data['url'] = r['url']
        data['link_text'] = r['title']
        if data not in results:
            results.append(data)
        if len(results) >= n:
            break
    return results
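# Hypothetical call to nyt_mostviewed(): NYT_API_KEY is an assumed environment
# variable name, not something the original code reads.
def _example_nyt_top5():
    import os
    return nyt_mostviewed(section='national', time_period=7, n=5,
                          api_key=os.environ.get('NYT_API_KEY', ''))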
def foxnews_feeds(section='national', n=5, results=None):
    if results is None:
        results = []
    scraper = SimpleScraper()
    html = scraper.get('http://feeds.foxnews.com/foxnews/{0:s}'.format(section))
    if not html:
        logging.error("Cannot get website")
        return results
    soup = BeautifulSoup(html, 'lxml')
    for item in soup.select('item'):
        data = get_record_template('foxnews', 'feeds-{0:s}'.format(section))
        data['url'] = item.select('guid')[0].text
        data['link_text'] = item.select('title')[0].text
        if data not in results:
            results.append(data)
        if len(results) >= n:
            break
    return results
def washingtonpost_top_politics(n=5, results=None):
    if results is None:
        results = []
    scraper = SimpleScraper()
    html = scraper.get('https://www.washingtonpost.com/politics/?nid=top_nav_politics')
    if not html:
        logging.error("Cannot get website")
        return results
    soup = BeautifulSoup(html, 'html.parser')
    for a in soup.select('.story-headline h3 a'):
        data = get_record_template('washingtonpost', 'top-politics')
        data['url'] = a['href']
        data['link_text'] = a.text
        if data not in results:
            results.append(data)
        if len(results) >= n:
            break
    return results
def rss_yahoo_news_top_politics(n=5, results=None, src_list='original'):
    if results is None:
        results = []
    sources = {'original': 'yahoo', 'ap': 'ap.org', 'reuters': 'reuters.com'}
    retry = 0
    while retry < 5:
        scraper = SimpleScraper()
        html = scraper.get('https://news.yahoo.com/rss/politics')
        if not html:
            logging.error("Cannot get website")
            return results
        with open('html/yahoo-news-top-politics-{0}.html'.format(src_list), 'w',
                  encoding='utf-8') as f:
            f.write(html)
        soup = BeautifulSoup(html, 'xml')
        for item in soup.select('item'):
            source_url = item.source['url']
            if source_url.find(sources[src_list]) != -1:
                href = item.link.text
                if href.startswith('https://news.yahoo.com/'):
                    data = get_record_template('yahoo', 'mostpop-{0:s}'.format(src_list))
                    data['url'] = href
                    data['link_text'] = item.title.text
                    if data not in results:
                        results.append(data)
                    if len(results) >= n:
                        return results
        retry += 1
        logging.info("Failed to change Yahoo news provider, retry #{0:d}".format(retry))
    return results
def google_news_politics(n=5, results=None):
    if results is None:
        results = []
    scraper = SimpleScraper()
    html = scraper.get('https://news.google.com/?ned=us&topic=po')
    if not html:
        logging.error("Cannot get website")
        return results
    with open('html/google-news-politics.html', 'w', encoding='utf-8') as f:
        f.write(html)
    soup = BeautifulSoup(html, 'lxml')
    for a in soup.select('h2 a.article'):
        data = get_record_template('google', 'mostpop-politics')
        data['url'] = a['href']
        data['link_text'] = a.text
        if data not in results:
            results.append(data)
        if len(results) >= n:
            break
    return results
def wsj_mostpop_politics(n=5, results=None):
    if results is None:
        results = []
    scraper = SimpleScraper()
    html = scraper.get('http://www.wsj.com/news/politics')
    if not html:
        logging.error("Cannot get website")
        return results
    with open('html/wsj-mostpop-politics.html', 'w', encoding='utf-8') as f:
        f.write(html)
    soup = BeautifulSoup(html, 'lxml')
    for a in soup.select('.wsj-popular-list.article .wsj-popular-item .pop-item-link'):
        data = get_record_template('wsj', 'mostpop-politics')
        data['url'] = a['href']
        data['link_text'] = a.text
        if data not in results:
            results.append(data)
        if len(results) >= n:
            break
    return results
def washingtonpost_mostread(section='politics', n=5, results=None):
    if results is None:
        results = []
    scraper = SimpleScraper()
    html = scraper.get('https://www.washingtonpost.com/{0:s}'.format(section))
    if not html:
        logging.error("Cannot get website")
        return results
    soup = BeautifulSoup(html, 'html.parser')
    most_read = soup.find('div', {'id': 'post-most-rr'})
    if most_read is None:
        return results
    for h in most_read.select('div.headline'):
        a = h.parent
        if a.name != 'a':
            continue
        data = get_record_template('washingtonpost', 'mostread-{0:s}'.format(section))
        data['url'] = a['href']
        data['link_text'] = h.text
        if data not in results:
            results.append(data)
        if len(results) >= n:
            break
    return results
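# Sketch of how the collectors above might be combined. Because each function
# appends to (and deduplicates against) the list it is given, one list can be
# threaded through several sources. The CSV output and column handling here are
# assumptions for illustration, not part of the original code.
import csv

def collect_top_politics(n=5):
    results = []
    google_news_politics(n=n, results=results)
    wsj_mostpop_politics(n=n, results=results)
    washingtonpost_mostread(section='politics', n=n, results=results)
    return results

def write_results_csv(rows, path='top_politics.csv'):
    fieldnames = sorted({k for row in rows for k in row})
    with open(path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)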
import os
import sys
import csv

from scraper import SimpleScraper

BASE_URL = 'http://www.presspass.me/journalist'

if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: {0!s} <input file>".format(__file__))
        sys.exit()
    if not os.path.exists('./presspass'):
        os.mkdir('./presspass')
    f = open(sys.argv[1])
    reader = csv.DictReader(f)
    scraper = SimpleScraper()
    i = 0
    count = 0
    for r in reader:
        count += 1
        name = r['twitter.username']
        print(count, "==>", name)
        html = scraper.get(BASE_URL + '/{0!s}'.format(name))
        if html:
            # Write to a separate handle so the input file 'f' stays open.
            with open('presspass/{0!s}.html'.format(name), "wb") as out:
                out.write(str.encode(html))
            i += 1
    print("Found: {0:d}".format(i))
    f.close()
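# Example invocation of the script above (assuming it is saved as presspass.py,
# a name not given in the original; the input CSV must have a 'twitter.username'
# column, as the code expects):
#
#   python presspass.py journalists.csv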
import os
import re

from scraper import SimpleScraper
from bs4 import BeautifulSoup

HOUSE = ['rajyasabha', 'loksabha']
BASE_URL = 'http://www.archive.india.gov.in/govt/'

if __name__ == "__main__":
    scraper = SimpleScraper()
    for h in HOUSE:
        print("House type: {0!s}".format(h))
        if not os.path.exists('./{0!s}'.format(h)):
            os.mkdir('./{0!s}'.format(h))
        html = scraper.get(BASE_URL + '{0!s}.php?alpha=all'.format(h))
        if not html:
            continue
        soup = BeautifulSoup(html, 'lxml')
        i = 0
        for a in soup.find_all("a", href=True):
            link = a['href']
            m = re.match(r".*?mpcode=(\d+)", link)
            if m:
                i += 1
                mpcode = m.group(1)
                print(i, link)
                html2 = scraper.get(BASE_URL + link)
                if html2:
                    with open('{0!s}/detail-{1!s}.html'.format(h, mpcode), "wb") as f:
                        f.write(str.encode(html2))
import os
import re

from scraper import SimpleScraper
from bs4 import BeautifulSoup

HOUSE = ["rajyasabha", "loksabha"]
BASE_URL = "http://www.archive.india.gov.in/govt/"

if __name__ == "__main__":
    scraper = SimpleScraper()
    for h in HOUSE:
        print("House type: {0!s}".format(h))
        if not os.path.exists("./{0!s}".format(h)):
            os.mkdir("./{0!s}".format(h))
        html = scraper.get(BASE_URL + "{0!s}.php?alpha=all".format(h))
        if not html:
            continue
        soup = BeautifulSoup(html, "lxml")
        i = 0
        for a in soup.find_all("a", href=True):
            link = a["href"]
            m = re.match(r".*?mpcode=(\d+)", link)
            if m:
                i += 1
                mpcode = m.group(1)
                print(i, link)
                html2 = scraper.get(BASE_URL + link)
                if html2:
                    with open("{0!s}/detail-{1!s}.html".format(h, mpcode), "wb") as f:
                        f.write(str.encode(html2))
                html3 = scraper.get(BASE_URL + "{0!s}mpbiodata.php?mpcode={1!s}".format(h, mpcode))
                # The original script ends here without using html3; the save
                # below is an assumed completion and the 'biodata-' filename is a guess.
                if html3:
                    with open("{0!s}/biodata-{1!s}.html".format(h, mpcode), "wb") as f:
                        f.write(str.encode(html3))