def go_yahoo():
    '''Crawl all companies from Yahoo.'''
    baseCrawl = BaseCrawl()
    baseExtract = BaseExtract()
    cxn_db = None
    try:
        cxn_db = get_cxn_db()
        cur_db = cxn_db.cursor()
        # Upsert: insert a new crawl record for the company, or refresh the
        # crawl time and bump the counter if it was crawled before.
        in_sql = """
            insert into yahoo_comps (code_name,crawl_time,return_type,add_time,count)
            values (%s,now(),%s,now(),1)
            on duplicate key update crawl_time = now(),return_type = %s,count = count+1
        """
        code_names = get_comps()
        logger.info("number of companies is %d" % len(code_names))
        for code in code_names:
            logger.debug("crawl %s, start............" % code)
            status, headlines_links = go_crawl_headlines(code, baseCrawl, baseExtract)
            cur_db.execute(in_sql, (code, status, status))
            # status == 0 means the headline page was fetched successfully.
            if status == 0 and len(headlines_links) > 0:
                extract_headlines_news(code, headlines_links, cur_db)
            logger.debug("crawl %s, end.............." % code)
    except Exception:
        logger.error("crawl_page failed, Error:%s" % traceback.format_exc())
    finally:
        if cxn_db:
            cxn_db.close()
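# go_crawl_headlines() and extract_headlines_news() are defined elsewhere in
# the project. As a rough, self-contained illustration of the contract that
# go_yahoo() relies on -- a (status, links) tuple where status 0 means the
# page was fetched and any non-zero value is stored as return_type -- a sketch
# could look like the following. The URL pattern, the regex and the helper
# name are assumptions for illustration, not the project's actual code.
import re
import urllib2

def sketch_crawl_headlines(code):
    '''Return (status, headline_links) for one ticker; 0 = success.'''
    url = "http://finance.yahoo.com/q/h?s=%s" % code    # assumed URL pattern
    try:
        html = urllib2.urlopen(url, timeout=10).read()
    except Exception:
        return 1, []                                    # non-zero status recorded as return_type
    links = re.findall(r'href="(http[^"]+)"', html)     # crude link extraction, illustration only
    return 0, links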
def go_newsextrac(source_type, code_names_file, url_prefix, req_referer,
                  finance_homes, extracteNewsurls):
    '''Crawl all news links, extract them and store the results.
    @param source_type: crawl source, e.g. yahoo, google, etc.
    @param code_names_file: file with the listed-company ticker codes
    @param req_referer: Referer header sent with requests, to avoid being blocked
    @param url_prefix: prefix used to build each company's news URL
    @param finance_homes: homepage URLs of the finance channels
    @param extracteNewsurls: source-specific extractor object
    '''
    baseCrawl = BaseCrawl()
    baseExtract = BaseExtract()
    cxn_db = None
    try:
        cxn_db = get_cxn_db()
        cur_db = cxn_db.cursor()
        # Build a Bloom filter from the URLs already stored, so known links can be skipped.
        initUrlsBloomFilter(cur_db, source_type)
        # Upsert: record the crawl state per channel URL, or bump the counter on repeats.
        in_sql = """
            insert into com_news_extract_state (url,code_name,crawl_time,return_type,add_time,crawl_count,source_type)
            values (%s,%s,now(),%s,now(),1,%s)
            on duplicate key update crawl_time = now(),return_type = %s,crawl_count = crawl_count+1,source_type = %s
        """
        news_chanel_urls = get_compnewsurls(code_names_file, url_prefix, finance_homes)
        logger.info("number of companies is %d" % len(news_chanel_urls))
        for code, news_chanel_url in news_chanel_urls:
            logger.debug("crawl %s, start............" % news_chanel_url)
            # Finance-channel homepages are flagged so the extractor can treat them differently.
            is_finance_home = (code == FINANCE_HOME)
            status, news_links = extracteNewsurls.extracte(
                news_chanel_url, is_finance_home, baseCrawl, baseExtract,
                req_referer, try_times=1)
            cur_db.execute(in_sql, (news_chanel_url, code, status, source_type,
                                    status, source_type))
            # status == 0 means the channel page was fetched successfully.
            if status == 0 and len(news_links) > 0:
                extract_news(code, news_links, source_type, cur_db)
            logger.debug("crawl %s, end.............." % code)
        logger.info("number of news url is %s" % NEWS_URL_EXTRACTE)
    except Exception:
        logger.error("crawl_page failed, Error:%s" % traceback.format_exc())
    finally:
        if cxn_db:
            cxn_db.close()
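# initUrlsBloomFilter() is defined elsewhere; judging by its call sites and
# the bf.exists() checks in the __main__ block below, it loads the URLs
# already stored for a source into a Bloom filter so previously seen news
# links can be skipped cheaply. A minimal sketch under those assumptions
# follows -- the query, hash scheme and class are illustrative, not the
# project's actual implementation.
import hashlib

class SketchBloomFilter(object):
    def __init__(self, size=1 << 20, hashes=5):
        self.size, self.hashes = size, hashes
        self.bits = bytearray(size // 8 + 1)

    def _positions(self, url):
        for i in range(self.hashes):
            digest = hashlib.md5(("%d:%s" % (i, url)).encode("utf-8")).hexdigest()
            yield int(digest, 16) % self.size

    def add(self, url):
        for pos in self._positions(url):
            self.bits[pos // 8] |= 1 << (pos % 8)

    def exists(self, url):
        return all(self.bits[pos // 8] & (1 << (pos % 8)) for pos in self._positions(url))

def sketch_init_urls_bloom_filter(cur_db, source_type):
    '''Load already-extracted URLs for one source into a Bloom filter (assumed query).'''
    bf = SketchBloomFilter()
    cur_db.execute("select url from com_news_extract_state where source_type = %s", (source_type,))
    for (url,) in cur_db.fetchall():
        bf.add(url)
    return bf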
if __name__ == '__main__':
    cxn_db = None
    try:
        cxn_db = get_cxn_db()
        cur_db = cxn_db.cursor()
        # Build a Bloom filter from the URLs already stored in the database.
        initUrlsBloomFilter(cur_db, 1)
        # print bf.exists('https://mail.yahoo.com/?.intl=us&.lang=en-US&.src=ym')
        # print bf.exists('http://www.latimes.com/la-fi-hy-helmet-safety-20150409-story.html')
        # print bf.exists('http://www.baidu.com')
    except Exception:
        logger.error("crawl_page failed, Error:%s" % traceback.format_exc())
    finally:
        if cxn_db:
            cxn_db.close()
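# The __main__ block above only exercises the Bloom filter. A full Yahoo run
# would presumably wire go_newsextrac() up roughly as sketched in the
# commented call below -- the source_type value, file path, URL prefix,
# referer and extractor class are placeholders, since they are configured
# elsewhere in the project:
#
# extractor = ExtracteYahooNewsurls()                       # hypothetical per-source extractor
# go_newsextrac(source_type=1,                              # assumed id for yahoo
#               code_names_file='comps.txt',                # placeholder path
#               url_prefix='http://finance.yahoo.com/q/h?s=',
#               req_referer='http://finance.yahoo.com/',
#               finance_homes=['http://finance.yahoo.com/'],
#               extracteNewsurls=extractor)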