def cl_news_util(args, cache):
    # Scrape the Reuters front page for article links unless a cached
    # article list from an earlier call was supplied.
    if not cache:
        htmlfile = utils.get_html_file('http://www.reuters.com/')
        parser = REUTERSHTMLParser()
        parser.feed(htmlfile)
        articlelist = parser.articlelist
    else:
        articlelist = cache
    if len(args) > 1:
        if args[1] == '--headlines' or args[1] == '-h':
            utils.reuters_headlines(articlelist)
            return articlelist
        if len(args) > 2:
            if args[1] == '--open' or args[1] == '-o':
                # Open the selected article in the browser.
                index = args[2]
                article = get_reuters_article(articlelist, index)
                utils.go_to_page(article['url'])
                return articlelist
            if args[1] == '--read' or args[1] == '-r':
                # Fetch the selected article and print it in the terminal.
                index = args[2]
                article = get_reuters_article(articlelist, index)
                htmlfile = utils.get_html_file(article['url'])
                abbrevurl = article['url'][28:]
                print '\n' + article['title'] + ' -- ' + abbrevurl
                print '==================\n'
                parser = REUTERSARTICLEParser()
                parser.feed(htmlfile)
                return articlelist
    utils.handle_error('reuters_error')
def cl_news_util(args, cache):
    if not cache:
        htmlfile = utils.get_html_file('http://bigstory.ap.org/')
        parser = APHTMLParser()
        parser.feed(htmlfile)
        articlelist = parser.articlelist
    else:
        articlelist = cache
    if len(args) > 1:
        if args[1] == '--headlines' or args[1] == '-h':
            utils.ap_headlines(articlelist)
            return articlelist
        if len(args) > 2:
            if args[1] == '--open' or args[1] == '-o':
                index = args[2]
                article = get_ap_article(articlelist, index)
                utils.go_to_page(article['url'])
                return articlelist
            if args[1] == '--read' or args[1] == '-r':
                index = args[2]
                article = get_ap_article(articlelist, index)
                htmlfile = utils.get_html_file(article['url'])
                content = re.search(
                    r'<meta name="description" content="(.+?)" />', htmlfile)
                print_article_header(article['title'], content.group(1))
                return articlelist
    utils.handle_error('ap_error')
def cl_news_util(args, cache):
    if not cache:
        htmlfile = utils.get_html_file('https://www.washingtonpost.com')
        htmlfile = htmlfile.decode('utf-8')
        parser = WPHTMLParser()
        parser.feed(htmlfile)
        articlelist = parser.articlelist
    else:
        articlelist = cache
    if len(args) > 1:
        if args[1] == '--headlines' or args[1] == '-h':
            utils.wp_headlines(articlelist)
            return articlelist
        if len(args) > 2:
            if args[1] == '--open' or args[1] == '-o':
                # 1-based command-line argument -> 0-based list index
                index = int(args[2]) - 1
                article = articlelist[index]
                utils.go_to_page(article['url'])
                return articlelist
            if args[1] == '--read' or args[1] == '-r':
                index = int(args[2]) - 1
                article = articlelist[index]
                htmlfile = utils.get_html_file(article['url'])
                htmlfile = htmlfile.decode('utf-8')
                print '\n' + article['title']
                print '==================\n'
                parser = WPARTICLEParser()
                parser.feed(htmlfile)
                return articlelist
    utils.handle_error('wp_error')
def cl_news_util(args, cache):
    if not cache:
        htmlfile = utils.get_html_file('http://www.theguardian.com/us')
        parser = GUARDIANHTMLParser()
        parser.feed(htmlfile)
        articlelist = parser.articlelist
    else:
        articlelist = cache
    if len(args) > 1:
        if args[1] == '--headlines' or args[1] == '-h':
            utils.gu_headlines(articlelist)
            return articlelist
        if len(args) > 2:
            if args[1] == '--open' or args[1] == '-o':
                index = args[2]
                article = get_gu_article(articlelist, index)
                utils.go_to_page(article['url'])
                return articlelist
            if args[1] == '--read' or args[1] == '-r':
                index = args[2]
                article = get_gu_article(articlelist, index)
                htmlfile = utils.get_html_file(article['url'])
                abbrevurl = article['url'][28:]
                print '\n' + article['title'] + ' -- ' + abbrevurl
                print '==================\n'
                htmlfile = htmlfile.decode('utf-8')
                parser = GUARDIANARTICLEParser()
                parser.feed(htmlfile)
                return articlelist
    utils.handle_error('gu_error')
def main():
    a = 'http://bigstory.ap.org/article/f7645d59944d47228f2eb195a35a19a4/'
    htmlfile = utils.get_html_file(
        a + 'get-without-planned-parenthood-one-texas-effort-stumbles')
    content = re.search(r'<meta name="description" content="(.+?)" />', htmlfile)
    print content.group(1)
def cl_news_util(arguments, cache):
    if not cache:
        htmlfile = utils.get_html_file('http://news.ycombinator.com')
        storylinks = re.findall(r'href="(.+)" class="storylink">(.+)</a><span',
                                htmlfile)
    else:
        storylinks = cache
    if len(arguments) > 1:
        if arguments[1] == '--headlines' or arguments[1] == '-h':
            utils.hn_headlines(storylinks)
            return storylinks
        if arguments[1] == '--open' or arguments[1] == '-o':
            if len(arguments) > 2:
                index = int(arguments[2])
                openpage(storylinks, index)
                return storylinks
        if arguments[1] == '--copy' or arguments[1] == '-cp':
            if len(arguments) > 2:
                if cache:
                    # The raw page is not fetched when a cache is supplied,
                    # but --copy needs it.
                    htmlfile = utils.get_html_file('http://news.ycombinator.com')
                utils.copy_file(arguments[2], htmlfile)
                return storylinks
    utils.handle_error('hn_error')
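# A minimal sketch (not part of the module) of how the cache argument is meant
# to round-trip: the first call scrapes news.ycombinator.com and returns the
# storylinks, and later calls pass that value back in so the page is not
# fetched again. The 'hn' argv prefix is an assumption about how a dispatcher
# would name this provider; the function itself never reads arguments[0].
def demo_cache_roundtrip():
    storylinks = cl_news_util(['hn', '--headlines'], False)  # fetch and print
    cl_news_util(['hn', '--open', '1'], storylinks)          # reuse the cache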
def get_article_list():
    htmlfile = utils.get_html_file('http://cnn.com')
    articles = re.findall(r'articleList":\[(.+?)\]', htmlfile)
    articles = re.findall(r'({.+?})', articles[0])
    article_list = []
    for article in articles:
        article_list.append(json.loads(article))
    return article_list
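# A minimal usage sketch (not part of the module): numbering the parsed CNN
# entries the way the other providers' --headlines output does. The 'headline'
# key is an assumption about CNN's embedded articleList JSON; substitute
# whatever key the real payload exposes.
def print_cnn_headlines():
    article_list = get_article_list()
    for i, article in enumerate(article_list, 1):
        print str(i) + '. ' + article.get('headline', '(no headline key)')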
def cl_news_util(args, cache):
    if not cache:
        htmlfile = utils.get_html_file('http://www.aljazeera.com/')
        htmlfile = htmlfile.decode('utf-8')
        parser = AJHTMLParser()
        parser.feed(htmlfile)
        articlelist = parser.articlelist
    else:
        articlelist = cache
    if len(args) > 1:
        if args[1] == '--headlines' or args[1] == '-h':
            utils.aj_headlines(articlelist)
            return articlelist
        if len(args) > 2:
            if args[1] == '--open' or args[1] == '-o':
                index = args[2]
                article = get_aj_article(articlelist, index)
                utils.go_to_page(article['url'])
                return articlelist
            if args[1] == '--read' or args[1] == '-r':
                index = args[2]
                article = get_aj_article(articlelist, index)
                htmlfile = utils.get_html_file('http://www.aljazeera.com/'
                                               + article['url'])
                htmlfile = htmlfile.decode('utf-8')
                print '\n' + article['title']
                print '====================='
                parser = AJARTICLEParser()
                parser.feed(htmlfile)
                return articlelist
    utils.handle_error('aj_error')
def cl_news_util(args, cache):
    if not cache:
        htmlfile = utils.get_html_file('https://www.nytimes.com')
        parser = NYTIMESHTMLParser()
        parser.feed(htmlfile)
        articlelist = parser.articlelist
    else:
        articlelist = cache
    if len(args) > 1:
        if args[1] == '--headlines' or args[1] == '-h':
            utils.nyt_headlines(articlelist)
            return articlelist
        if len(args) > 2:
            # NOT CURRENTLY USED
            # if args[1] == '--open' or args[1] == '-o':
            #     index = args[2]
            #     article = get_nyt_article(articlelist, index)
            #     utils.go_to_page(article['url'])
            #     return articlelist
            if args[1] == '--read' or args[1] == '-r':
                try:
                    index = int(args[2]) - 1
                    url = articlelist[index]['url']
                    article = articlelist[index]
                    # This url call is specific to NYT
                    htmlfile = urllib2.build_opener(
                        urllib2.HTTPCookieProcessor).open(url)
                    htmlfile = htmlfile.read()
                    parser = NYTIMESARTICLEParser()
                    print '=========nyt=========\n'
                    print article['title'] + '\n'
                    print '=====================\n'
                    parser.feed(htmlfile)
                    return articlelist
                except:
                    return
    utils.handle_error('nyt_error')
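# A standalone sketch of the cookie-aware fetch used in the --read branch
# above: nytimes.com may redirect plain urllib2 requests that carry no cookies,
# which is presumably why the opener is built with HTTPCookieProcessor.
# fetch_nyt is a hypothetical helper, not part of the module.
import urllib2

def fetch_nyt(url):
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor)
    return opener.open(url).read()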
def main():
    arguments = sys.argv
    if len(arguments) > 1:
        htmlfile = utils.get_html_file('http://news.ycombinator.com')
        storylinks = re.findall(r'href="(.+)" class="storylink">(.+)</a><span',
                                htmlfile)
        if arguments[1] == '--headlines' or arguments[1] == '-h':
            utils.hn_headlines(storylinks)
            return
        if arguments[1] == '--open' or arguments[1] == '-o':
            if len(arguments) > 2:
                index = int(arguments[2])
                openpage(storylinks, index)
                return
        if arguments[1] == '--copy' or arguments[1] == '-cp':
            if len(arguments) > 2:
                utils.copy_file(arguments[2], htmlfile)
                return
    utils.handle_error('hn_error')
def main():
    # cl_news_util(['gu', '-h'], False)
    htmlfile = utils.get_html_file('https://www.theguardian.com/us-news/2017/mar/14/mosque-obama-visited-trump-travel-ban-muslim')
    parser = GUARDIANARTICLEParser()
    parser.feed(htmlfile)
    print parser.collectdata