Example #1

import logging
import traceback

# BaseCrawl, BaseExtract, get_cxn_db, get_comps, go_crawl_headlines and
# extract_headlines_news are module-level helpers from the surrounding
# crawler module; they are not shown in these examples.
logger = logging.getLogger(__name__)


def go_yahoo():
    '''Crawl every company listed on Yahoo'''
    baseCrawl = BaseCrawl()
    baseExtract = BaseExtract()

    cxn_db = None
    try:
        cxn_db = get_cxn_db()
        cur_db = cxn_db.cursor()
        in_sql = """ insert into yahoo_comps (code_name,crawl_time,return_type,add_time,count)
values (%s,now(),%s,now(),1)
on duplicate key update crawl_time = now(),return_type = %s,count=count+1 """
        code_names = get_comps()
        logger.info("number of companys is %d" % len(code_names))
        for code in code_names:
            logger.debug("crawl %s, start............ " % code)
            status, headlines_links = go_crawl_headlines(
                code, baseCrawl, baseExtract)
            cur_db.execute(in_sql, (code, status, status))
            if status == 0 and len(headlines_links) > 0:
                extract_headlines_news(code, headlines_links, cur_db)
            logger.debug("crawl %s, end.............. " % code)

        # make the upserts durable; assumes get_cxn_db() does not enable autocommit
        cxn_db.commit()
    except Exception:
        logger.error("crawl_page failed, Error: %s" % traceback.format_exc())
    finally:
        if cxn_db:
            cxn_db.close()
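
The in_sql above relies on MySQL's "insert ... on duplicate key update" upsert:
the first crawl of a code inserts a row with count = 1, and every later crawl
refreshes crawl_time/return_type and bumps count, which is why status appears
twice in the parameter tuple. A minimal, self-contained sketch of the same
pattern; the pymysql driver and the connection settings are assumptions, since
the original hides them behind get_cxn_db():

import pymysql

# Hypothetical connection settings; the original code hides these in get_cxn_db().
cxn = pymysql.connect(host="localhost", user="crawler", password="secret",
                      database="news", autocommit=False)
try:
    cur = cxn.cursor()
    upsert = """insert into yahoo_comps (code_name,crawl_time,return_type,add_time,count)
                values (%s,now(),%s,now(),1)
                on duplicate key update crawl_time = now(),return_type = %s,count=count+1"""
    # status (here 0) is passed twice: once for values, once for the update branch
    cur.execute(upsert, ("YHOO", 0, 0))
    cxn.commit()
finally:
    cxn.close()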
Example #2

# Imports and module-level helpers as in Example #1.
def go_newsextrac(source_type, code_names_file, url_prefix, req_referer,
                  finance_homes, extracteNewsurls):
    '''Crawl all news links, extract them and store the results
    @param source_type: crawl source, e.g. yahoo, google
    @param code_names_file: file listing the companies' ticker codes
    @param url_prefix: prefix used to build each company's news channel url
    @param req_referer: Referer header sent with requests, to avoid being blocked
    @param finance_homes: homepage urls of the finance channels
    @param extracteNewsurls: source-specific extraction object
    '''
    baseCrawl = BaseCrawl()
    baseExtract = BaseExtract()

    cxn_db = None
    try:
        cxn_db = get_cxn_db()
        cur_db = cxn_db.cursor()
        # build a Bloom filter of urls already seen, so known links can be skipped
        initUrlsBloomFilter(cur_db, source_type)

        in_sql = """ insert into com_news_extract_state (url,code_name,crawl_time,return_type,add_time,crawl_count,source_type)
values (%s,%s,now(),%s,now(),1,%s)
on duplicate key update crawl_time = now(),return_type = %s,crawl_count=crawl_count+1,source_type = %s """

        news_chanel_urls = get_compnewsurls(code_names_file, url_prefix,
                                            finance_homes)

        logger.info("number of companys is %d" % len(news_chanel_urls))
        for code, news_chanel_url in news_chanel_urls:
            logger.debug("crawl %s, start............ " % news_chanel_url)
            is_finance_home = False
            if code == FINANCE_HOME:
                is_finance_home = True
            status, news_links = extracteNewsurls.extracte(news_chanel_url,
                                                           is_finance_home,
                                                           baseCrawl,
                                                           baseExtract,
                                                           req_referer,
                                                           try_times=1)
            cur_db.execute(in_sql, (news_chanel_url, code, status, source_type,
                                    status, source_type))
            if status == 0 and len(news_links) > 0:
                extract_news(code, news_links, source_type, cur_db)
            logger.debug("crawl %s, end.............. " % code)

        logger.info("number of news url is %s" % NEWS_URL_EXTRACTE)

        # make the upserts durable; assumes get_cxn_db() does not enable autocommit
        cxn_db.commit()
    except Exception:
        logger.error("crawl_page failed, Error: %s" % traceback.format_exc())
    finally:
        if cxn_db:
            cxn_db.close()
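
initUrlsBloomFilter (not shown in these examples) seeds a Bloom filter with the
urls already recorded for a source, so the crawler can skip links it has seen
without a database round trip per url. A minimal sketch of that idea; the
UrlBloomFilter class and init_urls_bloom_filter function are hypothetical
stand-ins, and the real seeding query and schema may differ:

import hashlib

class UrlBloomFilter:
    '''Toy Bloom filter: num_hashes probes into a fixed-size bit array.'''

    def __init__(self, size_bits=1 << 20, num_hashes=5):
        self.size = size_bits
        self.num_hashes = num_hashes
        self.bits = bytearray(size_bits // 8)

    def _positions(self, url):
        for i in range(self.num_hashes):
            digest = hashlib.md5(("%d:%s" % (i, url)).encode("utf-8")).hexdigest()
            yield int(digest, 16) % self.size

    def add(self, url):
        for pos in self._positions(url):
            self.bits[pos // 8] |= 1 << (pos % 8)

    def exists(self, url):
        # may return a false positive, never a false negative
        return all(self.bits[pos // 8] & (1 << (pos % 8))
                   for pos in self._positions(url))

def init_urls_bloom_filter(cur_db, source_type):
    '''Seed the filter from urls already in the extract-state table.'''
    bf = UrlBloomFilter()
    cur_db.execute("select url from com_news_extract_state where source_type = %s",
                   (source_type,))
    for (url,) in cur_db.fetchall():
        bf.add(url)
    return bf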
Example #3

# Imports and module-level helpers as in Example #1; initUrlsBloomFilter is the
# same seeding routine used in Example #2.
if __name__ == '__main__':
    cxn_db = None
    try:
        cxn_db = get_cxn_db()
        cur_db = cxn_db.cursor()
        # build a Bloom filter (seeded from the urls already stored for source_type 1)
        initUrlsBloomFilter(cur_db, 1)

#         print bf.exists('https://mail.yahoo.com/?.intl=us&.lang=en-US&.src=ym')
#         print bf.exists('http://www.latimes.com/la-fi-hy-helmet-safety-20150409-story.html')
#         print bf.exists('http://www.baidu.com')

    except Exception:
        logger.error("crawl_page failed, Error: %s" % traceback.format_exc())
    finally:
        if cxn_db:
            cxn_db.close()
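
The commented-out lines above are membership probes against the seeded filter.
With the hypothetical sketch from Example #2, the equivalent check would be:

# Hypothetical usage; the original bf object built by initUrlsBloomFilter is
# not shown in these examples.
bf = init_urls_bloom_filter(cur_db, 1)
print(bf.exists('https://mail.yahoo.com/?.intl=us&.lang=en-US&.src=ym'))
print(bf.exists('http://www.baidu.com'))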