def go_yahoo():
    '''Crawl every company from Yahoo Finance.'''
    baseCrawl = BaseCrawl()
    baseExtract = BaseExtract()
    cxn_db = None
    try:
        cxn_db = get_cxn_db()
        cur_db = cxn_db.cursor()
        in_sql = """ insert into yahoo_comps (code_name,crawl_time,return_type,add_time,count)
                     values (%s,now(),%s,now(),1)
                     on duplicate key update crawl_time = now(),return_type = %s,count=count+1 """
        code_names = get_comps()
        logger.info("number of companies is %d" % len(code_names))
        for code in code_names:
            logger.debug("crawl %s, start............ " % code)
            status, headlines_links = go_crawl_headlines(code, baseCrawl, baseExtract)
            cur_db.execute(in_sql, (code, status, status))
            if status == 0 and len(headlines_links) > 0:
                extract_headlines_news(code, headlines_links, cur_db)
            logger.debug("crawl %s, end.............. " % code)
    except Exception:
        logger.error("crawl_page failed, Error: %s" % traceback.format_exc())
    finally:
        if cxn_db:
            cxn_db.close()

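# Helpers such as get_cxn_db(), get_comps() and extract_headlines_news(), as well as
# the module-level logger and the time/traceback imports, live elsewhere in the
# project and are not shown in this section. A minimal sketch of get_cxn_db() is
# given below, assuming MySQLdb (mysqlclient); the host, credentials and database
# name are placeholders, not the project's real settings.
import MySQLdb

def get_cxn_db():
    '''Open a MySQL connection for the crawler (sketch with placeholder settings).'''
    return MySQLdb.connect(host='127.0.0.1', user='crawler', passwd='secret',
                           db='finance_news', charset='utf8')
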
def go_google():
    logger.info("crawl google finance start......")
    source_type = SOURCE_TYPE_GOOGLE
    code_names_file = 'code_name_vol.lst'
    url_prefix = 'https://www.google.com/finance/company_news?q='
    req_referer = 'https://www.google.com/finance'
    finance_homes = ['https://www.google.com/finance',
                     'https://news.google.com/news/section?ned=us&topic=b']
    extracteNewsurls = GoogleExtracteNewsurls()
    go_newsextrac(source_type, code_names_file, url_prefix, req_referer,
                  finance_homes, extracteNewsurls)
    logger.info("crawl google finance end......")

def go_yahoo():
    logger.info("crawl yahoo finance start......")
    source_type = SOURCE_TYPE_YAHOO
    code_names_file = 'code_name_vol.lst'
    url_prefix = 'http://finance.yahoo.com/q/h?s='
    req_referer = 'http://finance.yahoo.com/'
    finance_homes = ['http://finance.yahoo.com/']
    extracteNewsurls = YahooExtracteNewsurls()
    go_newsextrac(source_type, code_names_file, url_prefix, req_referer,
                  finance_homes, extracteNewsurls)
    logger.info("crawl yahoo finance end......")

def go_newsextrac(source_type, code_names_file, url_prefix, req_referer,
                  finance_homes, extracteNewsurls):
    '''Crawl every news link, extract it and store the result.
    @param source_type: crawl source, e.g. yahoo or google
    @param code_names_file: file listing the companies' ticker codes
    @param url_prefix: prefix used to build each company's news URL
    @param req_referer: Referer header sent with requests, to avoid being blocked
    @param finance_homes: home page URLs of the finance channel
    @param extracteNewsurls: source-specific extraction object
    '''
    baseCrawl = BaseCrawl()
    baseExtract = BaseExtract()
    cxn_db = None
    try:
        cxn_db = get_cxn_db()
        cur_db = cxn_db.cursor()
        # build a Bloom filter of URLs that have already been extracted
        initUrlsBloomFilter(cur_db, source_type)
        in_sql = """ insert into com_news_extract_state (url,code_name,crawl_time,return_type,add_time,crawl_count,source_type)
                     values (%s,%s,now(),%s,now(),1,%s)
                     on duplicate key update crawl_time = now(),return_type = %s,crawl_count=crawl_count+1,source_type = %s """
        news_chanel_urls = get_compnewsurls(code_names_file, url_prefix, finance_homes)
        logger.info("number of companies is %d" % len(news_chanel_urls))
        for code, news_chanel_url in news_chanel_urls:
            logger.debug("crawl %s, start............ " % news_chanel_url)
            is_finance_home = (code == FINANCE_HOME)
            status, news_links = extracteNewsurls.extracte(news_chanel_url, is_finance_home,
                                                           baseCrawl, baseExtract, req_referer,
                                                           try_times=1)
            cur_db.execute(in_sql, (news_chanel_url, code, status, source_type, status, source_type))
            if status == 0 and len(news_links) > 0:
                extract_news(code, news_links, source_type, cur_db)
            logger.debug("crawl %s, end.............. " % code)
        logger.info("number of news url is %s" % NEWS_URL_EXTRACTE)
    except Exception:
        logger.error("crawl_page failed, Error: %s" % traceback.format_exc())
    finally:
        if cxn_db:
            cxn_db.close()

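# initUrlsBloomFilter() is referenced above but not defined in this section. The
# sketch below shows one way it could seed a Bloom filter with URLs already stored
# in com_news_extract_state so that previously extracted links can be skipped. The
# pybloom_live dependency and the URLS_BLOOM_FILTER global are assumptions, not the
# project's actual implementation.
from pybloom_live import ScalableBloomFilter

URLS_BLOOM_FILTER = ScalableBloomFilter(initial_capacity=100000, error_rate=0.001)

def initUrlsBloomFilter(cur_db, source_type):
    '''Load already-extracted URLs for this source into the Bloom filter (sketch).'''
    cur_db.execute("select url from com_news_extract_state where source_type = %s",
                   (source_type,))
    for (url,) in cur_db.fetchall():
        URLS_BLOOM_FILTER.add(url)
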
def extracte(self, news_chanel_url, is_finance_home, baseCrawl, baseExtract,
             req_referer, try_times=1):
    '''Extract every news URL from the finance home page or from one company's
    news channel; each page is crawled at most MAX_TRY_TIMES + 1 times.
    @param news_chanel_url: URL to crawl
    @param is_finance_home: whether this is the finance home page, which may need special handling
    @param baseCrawl: wrapped crawl base class
    @param baseExtract: wrapped extraction base class
    @param try_times: retry counter
    @return: crawl status, extracted news links
    '''
    logger.info("crawl %s, %d time" % (news_chanel_url, try_times))
    time.sleep(IDEL_TIME * (try_times - 1))
    status = 2
    html = ''
    news_links = []  # all news links found under this channel; initialised before the try so the final return cannot fail
    try:
        status, html = baseCrawl.crawl_page(news_chanel_url, req_referer=req_referer)
        if status == 200:
            status = 0
            logger.info("crawl %s, success " % news_chanel_url)
            news_links = self.extracteUrls(baseExtract, html, is_finance_home)
        elif status in [301, 302]:
            status = 1
            logger.info("crawl %s, no data, fail" % news_chanel_url)
        else:
            if try_times <= MAX_TRY_TIMES:
                return self.extracte(news_chanel_url, is_finance_home, baseCrawl,
                                     baseExtract, req_referer, try_times + 1)
            status = 2
            logger.error("crawl %s, %d time, fail" % (news_chanel_url, try_times))
    except Exception:
        logger.error("crawl %s, failed, Error: %s" % (news_chanel_url, traceback.format_exc()))
    return status, news_links

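# extracteUrls() is the per-source hook called above, but its body is not shown in
# this section. A minimal sketch for the Yahoo subclass follows, assuming that
# baseExtract.get_bs() returns a BeautifulSoup object (as in go_crawl_headlines) and
# that the URLS_BLOOM_FILTER global from the initUrlsBloomFilter() sketch is used
# for de-duplication; the href filtering is illustrative only.
def extracteUrls(self, baseExtract, html, is_finance_home):
    '''Collect candidate news hrefs from a channel page (sketch).'''
    soup = baseExtract.get_bs(html)
    news_links = []
    for link in soup.findAll('a'):
        href = link.get('href')
        if not href or not href.startswith('http'):
            continue
        if href in URLS_BLOOM_FILTER:  # skip URLs extracted on earlier runs
            continue
        news_links.append(href)
    return news_links
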
def go_crawl_headlines(code_name, baseCrawl, baseExtract, try_times=1):
    '''Crawl the news headlines on each company's Yahoo Finance page;
    each page is crawled at most MAX_TRY_TIMES + 1 times.
    @param code_name: ticker code used to build the request URL, e.g. http://finance.yahoo.com/q/h?s=bidu
    @param baseCrawl: wrapped crawl base class
    @param baseExtract: wrapped extraction base class
    @param try_times: retry counter
    @return: crawl status, extracted news links
    '''
    req_url = 'http://finance.yahoo.com/q/h?s=%s' % code_name
    logger.info("crawl %s, %d time" % (req_url, try_times))
    time.sleep(IDEL_TIME * try_times)
    status, html = baseCrawl.crawl_page(req_url, req_referer=REQ_REFER)
    headlines_links = []  # news links extracted from the headlines block
    if status == 200:
        status = 0
        logger.info("crawl %s, success " % req_url)
        beautifulSoup = baseExtract.get_bs(html)
        content = beautifulSoup.find('div', {'class': 'mod yfi_quote_headline withsky'})
        if content is None:
            logger.info("extract %s is none " % req_url)
        else:
            # only collect links inside the headlines block, not the whole page
            for link in content.findAll('a'):
                headlines_links.append(link.get('href'))
    elif status in [301, 302]:
        status = 1
        logger.info("crawl %s, no data, fail" % req_url)
    else:
        if try_times <= MAX_TRY_TIMES:
            # retry with the original ticker code, not the already-built URL
            return go_crawl_headlines(code_name, baseCrawl, baseExtract, try_times + 1)
        status = 2
        logger.error("crawl %s, %d time, fail" % (req_url, try_times))
    return status, headlines_links

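# A minimal entry point tying the pieces together (a sketch; the project may drive
# these crawlers from a scheduler instead). go_yahoo() and go_google() manage their
# own database connections and logging, so running them in sequence is enough.
if __name__ == '__main__':
    go_yahoo()
    go_google()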