Пример #1
0
def go_yahoo():
    '''Crawl the Yahoo headlines page of every company and record the result.

    For each code returned by get_comps() the page is fetched through
    go_crawl_headlines(), the returned status is upserted into the
    yahoo_comps table and, when the status is 0 and links were found, the
    links are passed to extract_headlines_news().

    An Exception raised anywhere in the run is logged with its traceback
    and ends the run; the database connection is closed in every case.
    '''
    baseCrawl = BaseCrawl()
    baseExtract = BaseExtract()

    cxn_db = None
    try:
        cxn_db = get_cxn_db()
        cur_db = cxn_db.cursor()
        # Upsert one row per company; status is bound twice, once for the
        # insert and once for the "on duplicate key update" branch.
        in_sql = """ insert into yahoo_comps (code_name,crawl_time,return_type,add_time,count)
values (%s,now(),%s,now(),1)
on duplicate key update crawl_time = now(),return_type = %s,count=count+1 """
        code_names = get_comps()
        logger.info("number of companys is %d" % len(code_names))
        for code in code_names:
            logger.debug("crawl %s, start............ " % code)
            status, headlines_links = go_crawl_headlines(
                code, baseCrawl, baseExtract)
            # NOTE(review): no commit is issued in this function; presumably
            # the connection autocommits or a callee commits - confirm.
            cur_db.execute(in_sql, (code, status, status))
            if status == 0 and headlines_links:
                extract_headlines_news(code, headlines_links, cur_db)
            logger.debug("crawl %s, end.............. " % code)

    except Exception:
        # Deliberately not a bare "except:": KeyboardInterrupt and SystemExit
        # must propagate so the crawler can still be stopped.
        logger.error("crawl_page failed ,Error:%s" % traceback.format_exc())
    finally:
        if cxn_db:
            cxn_db.close()
Пример #2
0
def go_yahoo():
    '''Crawl the Yahoo headlines page of all companies and store the outcome.

    Steps, per company code returned by get_comps():
      1. fetch the headlines page with go_crawl_headlines();
      2. upsert the crawl status into the yahoo_comps table;
      3. if the status is 0 and at least one link was returned, hand the
         links to extract_headlines_news().

    The first Exception stops the run and is logged together with its
    traceback. The database connection is always closed on exit.
    '''
    baseCrawl = BaseCrawl()
    baseExtract = BaseExtract()

    cxn_db = None
    try:
        cxn_db = get_cxn_db()
        cur_db = cxn_db.cursor()
        # The same status value feeds both the insert and the
        # "on duplicate key update" part of the statement.
        in_sql = """ insert into yahoo_comps (code_name,crawl_time,return_type,add_time,count)
values (%s,now(),%s,now(),1)
on duplicate key update crawl_time = now(),return_type = %s,count=count+1 """
        code_names = get_comps()
        logger.info("number of companys is %d" % len(code_names))
        for code in code_names:
            logger.debug("crawl %s, start............ " % code)
            status, headlines_links = go_crawl_headlines(
                code, baseCrawl, baseExtract)
            # NOTE(review): nothing here commits the transaction; presumably
            # autocommit is on or a callee commits - confirm.
            cur_db.execute(in_sql, (code, status, status))
            if status == 0 and headlines_links:
                extract_headlines_news(code, headlines_links, cur_db)
            logger.debug("crawl %s, end.............. " % code)

    except Exception:
        # A bare "except:" would also trap KeyboardInterrupt/SystemExit and
        # make the crawl impossible to interrupt cleanly.
        logger.error("crawl_page failed ,Error:%s" % traceback.format_exc())
    finally:
        if cxn_db:
            cxn_db.close()
Пример #3
0
def go_google():
    '''Run the shared go_newsextrac() pipeline with the Google settings.'''
    logger.info("crawl google finance start......")

    origin = SOURCE_TYPE_GOOGLE
    # Passed to go_newsextrac() as its finance_homes argument.
    home_pages = [
        'https://www.google.com/finance',
        'https://news.google.com/news/section?ned=us&topic=b',
    ]
    link_extractor = GoogleExtracteNewsurls()

    go_newsextrac(
        origin,
        'code_name_vol.lst',
        'https://www.google.com/finance/company_news?q=',
        'https://www.google.com/finance',
        home_pages,
        link_extractor,
    )

    logger.info("crawl google finance end......")
Пример #4
0
def go_yahoo():
    '''Run the shared go_newsextrac() pipeline with the Yahoo settings.'''
    logger.info("crawl yahoo finance start......")

    origin = SOURCE_TYPE_YAHOO
    # The Yahoo home page doubles as referer and as the only finance home.
    yahoo_home = 'http://finance.yahoo.com/'
    link_extractor = YahooExtracteNewsurls()

    go_newsextrac(
        origin,
        'code_name_vol.lst',
        'http://finance.yahoo.com/q/h?s=',
        yahoo_home,
        [yahoo_home],
        link_extractor,
    )

    logger.info("crawl yahoo finance end......")
Пример #5
0
def go_newsextrac(source_type, code_names_file, url_prefix, req_referer,
                  finance_homes, extracteNewsurls):
    '''Crawl every news channel page, extract its news links and store them.

    For each (code, url) pair returned by get_compnewsurls() the page is
    processed by extracteNewsurls.extracte(), the resulting status is
    upserted into com_news_extract_state and, when the status is 0 and
    links were found, the links are passed to extract_news().

    An Exception raised anywhere in the run is logged with its traceback
    and ends the run; the database connection is closed in every case.

    @param source_type: crawl source, e.g. yahoo or google
    @param code_names_file: list of the companies' listing codes
    @param url_prefix: prefix used to build each company's news URL
    @param req_referer: referer sent with each request, to avoid being
        blocked
    @param finance_homes: finance channel home page addresses
    @param extracteNewsurls: source-specific extractor object
    '''
    baseCrawl = BaseCrawl()
    baseExtract = BaseExtract()

    cxn_db = None
    try:
        cxn_db = get_cxn_db()
        cur_db = cxn_db.cursor()
        # Build a bloom filter.
        initUrlsBloomFilter(cur_db, source_type)

        # status and source_type are each bound twice: once for the insert
        # and once for the "on duplicate key update" branch.
        in_sql = """ insert into com_news_extract_state (url,code_name,crawl_time,return_type,add_time,crawl_count,source_type)
values (%s,%s,now(),%s,now(),1,%s)
on duplicate key update crawl_time = now(),return_type = %s,crawl_count=crawl_count+1,source_type = %s """

        news_chanel_urls = get_compnewsurls(code_names_file, url_prefix,
                                            finance_homes)

        logger.info("number of companys is %d" % len(news_chanel_urls))
        for code, news_chanel_url in news_chanel_urls:
            logger.debug("crawl %s, start............ " % news_chanel_url)
            # Entries carrying the FINANCE_HOME code are flagged for the
            # extractor instead of being treated as a company page.
            is_finance_home = code == FINANCE_HOME
            status, news_links = extracteNewsurls.extracte(news_chanel_url,
                                                           is_finance_home,
                                                           baseCrawl,
                                                           baseExtract,
                                                           req_referer,
                                                           try_times=1)
            # NOTE(review): no commit is issued in this function; presumably
            # the connection autocommits or a callee commits - confirm.
            cur_db.execute(in_sql, (news_chanel_url, code, status, source_type,
                                    status, source_type))
            if status == 0 and news_links:
                extract_news(code, news_links, source_type, cur_db)
            logger.debug("crawl %s, end.............. " % code)

        logger.info("number of news url is %s" % NEWS_URL_EXTRACTE)

    except Exception:
        # Deliberately not a bare "except:": KeyboardInterrupt and SystemExit
        # must propagate so the crawler can still be stopped.
        logger.error("crawl_page failed ,Error:%s" % traceback.format_exc())
    finally:
        if cxn_db:
            cxn_db.close()
Пример #6
0
def go_yahoo():
    '''Crawl Yahoo Finance by delegating to go_newsextrac().'''
    logger.info("crawl yahoo finance start......")

    # Settings are collected in go_newsextrac()'s positional order.
    yahoo_settings = (
        SOURCE_TYPE_YAHOO,
        'code_name_vol.lst',
        'http://finance.yahoo.com/q/h?s=',
        'http://finance.yahoo.com/',
        ['http://finance.yahoo.com/'],
        YahooExtracteNewsurls(),
    )
    go_newsextrac(*yahoo_settings)

    logger.info("crawl yahoo finance end......")
Пример #7
0
def go_google():
    '''Crawl Google Finance by delegating to go_newsextrac().'''
    logger.info("crawl google finance start......")

    # Settings are collected in go_newsextrac()'s positional order.
    google_settings = (
        SOURCE_TYPE_GOOGLE,
        'code_name_vol.lst',
        'https://www.google.com/finance/company_news?q=',
        'https://www.google.com/finance',
        ['https://www.google.com/finance',
         'https://news.google.com/news/section?ned=us&topic=b'],
        GoogleExtracteNewsurls(),
    )
    go_newsextrac(*google_settings)

    logger.info("crawl google finance end......")
Пример #8
0
def go_newsextrac(source_type,code_names_file,url_prefix,req_referer,finance_homes,extracteNewsurls):
    '''Crawl all news channel pages, extract their news links and store them.

    Steps, per (code, url) pair returned by get_compnewsurls():
      1. run extracteNewsurls.extracte() on the URL;
      2. upsert the returned status into com_news_extract_state;
      3. if the status is 0 and at least one link was returned, hand the
         links to extract_news().

    The first Exception stops the run and is logged together with its
    traceback. The database connection is always closed on exit.

    @param source_type: crawl source, e.g. yahoo or google
    @param code_names_file: list of the companies' listing codes
    @param url_prefix: prefix used to build each company's news URL
    @param req_referer: referer sent with each request, to avoid being
        blocked
    @param finance_homes: finance channel home page addresses
    @param extracteNewsurls: source-specific extractor object
    '''
    baseCrawl = BaseCrawl()
    baseExtract = BaseExtract()

    cxn_db = None
    try:
        cxn_db = get_cxn_db()
        cur_db = cxn_db.cursor()
        # Build a bloom filter.
        initUrlsBloomFilter(cur_db, source_type)

        # The insert and the "on duplicate key update" branch each take
        # their own status and source_type placeholders.
        in_sql = """ insert into com_news_extract_state (url,code_name,crawl_time,return_type,add_time,crawl_count,source_type)
values (%s,%s,now(),%s,now(),1,%s)
on duplicate key update crawl_time = now(),return_type = %s,crawl_count=crawl_count+1,source_type = %s """

        news_chanel_urls = get_compnewsurls(code_names_file, url_prefix, finance_homes)

        logger.info("number of companys is %d" % len(news_chanel_urls))
        for code, news_chanel_url in news_chanel_urls:
            logger.debug("crawl %s, start............ " % news_chanel_url)
            # True only for entries carrying the FINANCE_HOME code.
            is_finance_home = code == FINANCE_HOME
            status, news_links = extracteNewsurls.extracte(
                news_chanel_url, is_finance_home, baseCrawl, baseExtract,
                req_referer, try_times=1)
            # NOTE(review): nothing here commits the transaction; presumably
            # autocommit is on or a callee commits - confirm.
            cur_db.execute(in_sql, (news_chanel_url, code, status, source_type,
                                    status, source_type))
            if status == 0 and news_links:
                extract_news(code, news_links, source_type, cur_db)
            logger.debug("crawl %s, end.............. " % code)

        logger.info("number of news url is %s" % NEWS_URL_EXTRACTE)

    except Exception:
        # A bare "except:" would also trap KeyboardInterrupt/SystemExit and
        # make the crawl impossible to interrupt cleanly.
        logger.error("crawl_page failed ,Error:%s" % traceback.format_exc())
    finally:
        if cxn_db:
            cxn_db.close()
Пример #9
0
    def extracte(self,
                 news_chanel_url,
                 is_finance_home,
                 baseCrawl,
                 baseExtract,
                 req_referer,
                 try_times=1):
        '''Extract all URLs from a channel home page or a company news page.

        Each page is fetched at most MAX_TRY_TIMES + 1 times.

        @param news_chanel_url: URL to crawl
        @param is_finance_home: whether the URL is a finance home page, which
            may need separate handling
        @param baseCrawl: crawling wrapper; its crawl_page() is called
        @param baseExtract: extraction wrapper, forwarded to extracteUrls()
        @param req_referer: referer forwarded to crawl_page()
        @param try_times: number of the current attempt, starting at 1
        @return: (status, news_links). status is 0 for HTTP 200, 1 for
            301/302 and 2 once every attempt has failed. If an exception is
            raised, the status reached so far is returned (2 when
            crawl_page() itself raised) together with an empty list.
        '''

        logger.info("crawl %s, %d time" % (news_chanel_url, try_times))
        # Linear back-off: the first attempt does not wait.
        time.sleep(IDEL_TIME * (try_times - 1))
        status = 2
        # Bound before the try block: previously it was first assigned after
        # crawl_page(), so a failing crawl_page() made the final return raise
        # UnboundLocalError instead of reporting a failure status.
        news_links = []
        try:
            status, html = baseCrawl.crawl_page(news_chanel_url,
                                                req_referer=req_referer)
            if status == 200:
                status = 0
                logger.info("crawl %s, success " % news_chanel_url)
                # All news links found on the channel page.
                news_links = self.extracteUrls(baseExtract, html,
                                               is_finance_home)

            elif status in [301, 302]:
                status = 1
                logger.info("crawl %s, no data,fail" % news_chanel_url)
            else:
                if try_times <= MAX_TRY_TIMES:
                    # Retry; the nested call returns the final result.
                    return self.extracte(news_chanel_url, is_finance_home,
                                         baseCrawl, baseExtract, req_referer,
                                         try_times + 1)
                else:
                    status = 2
                    logger.error("crawl %s, %d time,fail" %
                                 (news_chanel_url, try_times))
        except Exception:
            # Not a bare "except:", so KeyboardInterrupt/SystemExit propagate.
            logger.error("crawl %s, failed ,Error:%s" %
                         (news_chanel_url, traceback.format_exc()))

        return status, news_links
Пример #10
0
def go_crawl_headlines(code_name, baseCrawl, baseExtract, try_times=1):
    '''Crawl the Yahoo Finance headlines page of one company.

    Each page is fetched at most MAX_TRY_TIMES + 1 times.

    @param code_name: company code appended to the headlines URL, giving
        e.g. http://finance.yahoo.com/q/h?s=bidu
    @param baseCrawl: crawling wrapper; its crawl_page() is called
    @param baseExtract: extraction wrapper; its get_bs() is called
    @param try_times: number of the current attempt, starting at 1
    @return: (status, headlines_links). status is 0 for HTTP 200, 1 for
        301/302 and 2 once every attempt has failed.
    '''
    req_url = 'http://finance.yahoo.com/q/h?s=%s' % code_name

    logger.info("crawl %s, %d time" % (req_url, try_times))
    # The wait grows with the attempt number and also precedes the first try.
    time.sleep(IDEL_TIME * try_times)
    status, html = baseCrawl.crawl_page(req_url, req_referer=REQ_REFER)
    # News links to return.
    headlines_links = []
    if status == 200:
        status = 0
        logger.info("crawl %s, success " % req_url)
        beautifulSoup = baseExtract.get_bs(html)
        content = beautifulSoup.find(
            'div', {'class': 'mod yfi_quote_headline withsky'})
        if content is None:
            logger.info("extract %s is none " % req_url)
        else:
            # NOTE(review): links are collected from the whole page, not only
            # from `content`; confirm whether content.findAll('a') was meant.
            for link in beautifulSoup.findAll('a'):
                headlines_links.append(link.get('href'))
    elif status in [301, 302]:
        status = 1
        logger.info("crawl %s, no data,fail" % req_url)
    else:
        if try_times <= MAX_TRY_TIMES:
            # Retry with the company code. Passing req_url here (as before)
            # made the callee wrap the full URL in the URL template again.
            return go_crawl_headlines(code_name, baseCrawl, baseExtract,
                                      try_times + 1)
        else:
            status = 2
            logger.error("crawl %s, %d time,fail" % (req_url, try_times))

    return status, headlines_links
Пример #11
0
def go_crawl_headlines(code_name,baseCrawl,baseExtract,try_times = 1):
    '''Fetch one company's Yahoo Finance headlines page and collect its links.

    Each page is fetched at most MAX_TRY_TIMES + 1 times.

    @param code_name: company code used to build the request URL, e.g.
        "bidu" gives http://finance.yahoo.com/q/h?s=bidu
    @param baseCrawl: crawling wrapper; its crawl_page() is called
    @param baseExtract: extraction wrapper; its get_bs() is called
    @param try_times: number of the current attempt, starting at 1
    @return: (status, headlines_links). status is 0 for HTTP 200, 1 for
        301/302 and 2 once every attempt has failed.
    '''
    req_url = 'http://finance.yahoo.com/q/h?s=%s' % code_name

    logger.info("crawl %s, %d time" % (req_url, try_times))
    # Sleeps before every attempt, longer on each retry.
    time.sleep(IDEL_TIME * try_times)
    status, html = baseCrawl.crawl_page(req_url, req_referer=REQ_REFER)
    # News links to return.
    headlines_links = []
    if status == 200:
        status = 0
        logger.info("crawl %s, success " % req_url)
        beautifulSoup = baseExtract.get_bs(html)
        content = beautifulSoup.find('div', {'class': 'mod yfi_quote_headline withsky'})
        if content is None:
            logger.info("extract %s is none " % req_url)
        else:
            # NOTE(review): the anchors come from the whole document rather
            # than from `content`; confirm whether that is intended.
            for link in beautifulSoup.findAll('a'):
                headlines_links.append(link.get('href'))
    elif status in [301, 302]:
        status = 1
        logger.info("crawl %s, no data,fail" % req_url)
    else:
        if try_times <= MAX_TRY_TIMES:
            # The retry must receive the company code: handing it req_url
            # produced "...?s=http://finance.yahoo.com/q/h?s=<code>".
            return go_crawl_headlines(code_name, baseCrawl, baseExtract, try_times + 1)
        else:
            status = 2
            logger.error("crawl %s, %d time,fail" % (req_url, try_times))

    return status, headlines_links
Пример #12
0
 def extracte(self,news_chanel_url,is_finance_home,baseCrawl,baseExtract,req_referer,try_times = 1):
     '''Collect every URL found on a channel home page or company news page.

     Each page is fetched at most MAX_TRY_TIMES + 1 times.

     @param news_chanel_url: URL to crawl
     @param is_finance_home: whether the URL is a finance home page, which
         may need separate handling
     @param baseCrawl: crawling wrapper; its crawl_page() is called
     @param baseExtract: extraction wrapper, forwarded to extracteUrls()
     @param req_referer: referer forwarded to crawl_page()
     @param try_times: number of the current attempt, starting at 1
     @return: (status, news_links). status is 0 for HTTP 200, 1 for 301/302
         and 2 once every attempt has failed. If an exception is raised, the
         status reached so far is returned (2 when crawl_page() itself
         raised) together with an empty list.
     '''

     logger.info("crawl %s, %d time" % (news_chanel_url, try_times))
     # No wait before the first attempt; retries wait progressively longer.
     time.sleep(IDEL_TIME * (try_times - 1))
     status = 2
     # Must exist before the try block: when crawl_page() raised, the old
     # code reached the final return with news_links unbound and failed with
     # UnboundLocalError.
     news_links = []
     try:
         status, html = baseCrawl.crawl_page(news_chanel_url, req_referer=req_referer)
         if status == 200:
             status = 0
             logger.info("crawl %s, success " % news_chanel_url)
             # All news links found on the channel page.
             news_links = self.extracteUrls(baseExtract, html, is_finance_home)

         elif status in [301, 302]:
             status = 1
             logger.info("crawl %s, no data,fail" % news_chanel_url)
         else:
             if try_times <= MAX_TRY_TIMES:
                 # Retry; the nested call's result is returned unchanged.
                 return self.extracte(news_chanel_url, is_finance_home, baseCrawl, baseExtract, req_referer, try_times + 1)
             else:
                 status = 2
                 logger.error("crawl %s, %d time,fail" % (news_chanel_url, try_times))
     except Exception:
         # Narrower than a bare "except:" so interrupts are not swallowed.
         logger.error("crawl %s, failed ,Error:%s" % (news_chanel_url, traceback.format_exc()))

     return status, news_links