def crawl_page(self, req_url, req_data=None, req_referer=None):
    '''Common page-fetch method of the BaseCrawl crawler.'''
    response = None
    try:
        # req = self.__get_req(req_url, req_data, req_referer)
        debug_handler = urllib2.HTTPHandler(debuglevel=0)
        opener = urllib2.build_opener(debug_handler, RedirctHandler)
        # copy the shared default headers so a per-request Referer does not
        # leak into later requests
        headers = dict(Headers)
        if req_referer:
            headers['Referer'] = req_referer
        # attach the headers to the opener (the original appended the dict to
        # opener.handlers, which has no effect)
        opener.addheaders = headers.items()
        response = opener.open(req_url, timeout=20)
        # response = urllib2.urlopen(req, timeout=1000)
        html = ''
        status = response.getcode()
        if status == 200:
            html = response.read()
        return status, html
    except urllib2.URLError as e:
        if hasattr(e, 'code'):
            logger.error("URLError, code is :%s" % e.code)
            return e.code, ''
        elif hasattr(e, 'reason'):
            logger.error("URLError, reason is :%s" % e.reason)
            return e.reason, ''
    except:
        logger.error("crawl_page failed ,Error:%s" % traceback.format_exc())
        return -100, ''
    finally:
        if response:
            response.close()
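# A minimal usage sketch (not part of the original crawler) showing how
# crawl_page is meant to be called; the URL and referer below are examples only.
def _example_crawl_page():
    crawler = BaseCrawl()
    status, html = crawler.crawl_page('http://finance.yahoo.com/q/h?s=bidu',
                                      req_referer='http://finance.yahoo.com/')
    if status == 200:
        # a successful fetch returns the raw HTTP status and the page body
        print(len(html))
    else:
        print('fetch failed, status: %s' % status)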
def get_compnewsurls(code_names_file, url_prefix, finance_homes=None):
    '''Build the news-channel URLs for the listed companies.
    @param code_names_file: file listing the companies' ticker codes
    @param url_prefix: prefix used to build each company's news URL
    @param finance_homes: homepage URLs of the finance channels
    '''
    if code_names_file is None or url_prefix is None:
        logger.error(' code_names_file or url_prefix is None')
        return []
    news_chanel_urls = []
    if finance_homes and len(finance_homes) > 0:
        for f_home in finance_homes:
            news_chanel_urls.append((FINANCE_HOME, f_home))
    try:
        # the company list lives under <parent of this module's directory>/data/
        path = os.path.realpath(__file__)
        path = os.path.dirname(path)
        lspath = os.path.split(path)
        if lspath[0] and lspath[1]:
            code_names_file = lspath[0] + '/data/' + code_names_file
        for code_name in open(code_names_file):
            if code_name and len(code_name) > 0:
                # tab-separated line, ticker code in the first column
                cols = code_name.split('\t')
                if cols and len(cols) > 1:
                    news_chanel_urls.append((cols[0], url_prefix + cols[0]))
    except:
        logger.error("get_compnewsurls failed ,Error:%s" % traceback.format_exc())
    return news_chanel_urls
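# A hedged sketch of how get_compnewsurls is called. The company list is a
# tab-separated file (ticker code in the first column) that the function looks
# up under the module's parent directory in data/<file name>; the finance
# homepage URL below is illustrative only.
def _example_get_compnewsurls():
    urls = get_compnewsurls('code_name_vol.lst',
                            'http://finance.yahoo.com/q/h?s=',
                            finance_homes=['http://finance.yahoo.com/'])
    # each entry is (code, news_channel_url), plus one (FINANCE_HOME, home_url)
    # per finance homepage, e.g. ('bidu', 'http://finance.yahoo.com/q/h?s=bidu')
    return urls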
def go_yahoo():
    '''Crawl every listed company from Yahoo Finance.'''
    baseCrawl = BaseCrawl()
    baseExtract = BaseExtract()
    cxn_db = None
    try:
        cxn_db = get_cxn_db()
        cur_db = cxn_db.cursor()
        in_sql = """
            insert into yahoo_comps (code_name,crawl_time,return_type,add_time,count)
            values (%s,now(),%s,now(),1)
            on duplicate key update crawl_time = now(),return_type = %s,count=count+1
        """
        code_names = get_comps()
        logger.info("number of companies is %d" % len(code_names))
        for code in code_names:
            logger.debug("crawl %s, start............ " % code)
            status, headlines_links = go_crawl_headlines(code, baseCrawl, baseExtract)
            cur_db.execute(in_sql, (code, status, status))
            if status == 0 and len(headlines_links) > 0:
                extract_headlines_news(code, headlines_links, cur_db)
            logger.debug("crawl %s, end.............. " % code)
    except:
        logger.error("go_yahoo failed ,Error:%s" % traceback.format_exc())
    finally:
        if cxn_db:
            cxn_db.close()
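# A possible entry point (a sketch; the original module's main guard is not
# shown in this excerpt): running the module crawls every company from get_comps.
if __name__ == '__main__':
    go_yahoo()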
def go_newsextrac(source_type, code_names_file, url_prefix, req_referer,
                  finance_homes, extracteNewsurls):
    '''Crawl all news links, extract them and store the results.
    @param source_type: crawl source, e.g. yahoo, google
    @param code_names_file: file listing the companies' ticker codes
    @param url_prefix: prefix used to build each company's news URL
    @param req_referer: Referer sent with each request, to avoid being blocked
    @param finance_homes: homepage URLs of the finance channels
    @param extracteNewsurls: source-specific extractor object
    '''
    baseCrawl = BaseCrawl()
    baseExtract = BaseExtract()
    cxn_db = None
    try:
        cxn_db = get_cxn_db()
        cur_db = cxn_db.cursor()
        # build a Bloom filter of the URLs that have already been extracted
        initUrlsBloomFilter(cur_db, source_type)
        in_sql = """
            insert into com_news_extract_state
            (url,code_name,crawl_time,return_type,add_time,crawl_count,source_type)
            values (%s,%s,now(),%s,now(),1,%s)
            on duplicate key update crawl_time = now(),return_type = %s,
            crawl_count=crawl_count+1,source_type = %s
        """
        news_chanel_urls = get_compnewsurls(code_names_file, url_prefix, finance_homes)
        logger.info("number of companies is %d" % len(news_chanel_urls))
        for code, news_chanel_url in news_chanel_urls:
            logger.debug("crawl %s, start............ " % news_chanel_url)
            is_finance_home = False
            if code == FINANCE_HOME:
                is_finance_home = True
            status, news_links = extracteNewsurls.extracte(
                news_chanel_url, is_finance_home, baseCrawl, baseExtract,
                req_referer, try_times=1)
            cur_db.execute(in_sql, (news_chanel_url, code, status, source_type,
                                    status, source_type))
            if status == 0 and len(news_links) > 0:
                extract_news(code, news_links, source_type, cur_db)
            logger.debug("crawl %s, end.............. " % code)
        logger.info("number of news url is %s" % NEWS_URL_EXTRACTE)
    except:
        logger.error("go_newsextrac failed ,Error:%s" % traceback.format_exc())
    finally:
        if cxn_db:
            cxn_db.close()
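# A hedged example of wiring go_newsextrac together. ExtracteNewsurls stands in
# for whatever source-specific class (defined elsewhere) provides the extracte()
# method; every argument value below is illustrative only.
def _example_go_newsextrac():
    go_newsextrac(source_type='yahoo',
                  code_names_file='code_name_vol.lst',
                  url_prefix='http://finance.yahoo.com/q/h?s=',
                  req_referer='http://finance.yahoo.com/',
                  finance_homes=['http://finance.yahoo.com/'],
                  extracteNewsurls=ExtracteNewsurls())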
def get_cxn_db():
    '''Get a database connection.'''
    try:
        return MySQLdb.connect(host=DB_HOST, user=DB_USER, passwd=DB_PASSWD,
                               db=DB_NAME, port=DB_PORT, charset='UTF8')
    except:
        logger.error('create mysql connect error' + ',Except: ' + traceback.format_exc())
        return None
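# A minimal sketch of how get_cxn_db is used elsewhere in this module: check
# for None, work through a cursor, and always close the connection. The query
# is illustrative (yahoo_comps is the status table written by go_yahoo).
def _example_db_roundtrip():
    cxn_db = get_cxn_db()
    if cxn_db is None:
        return
    try:
        cur_db = cxn_db.cursor()
        cur_db.execute('select count(*) from yahoo_comps')
        print(cur_db.fetchone())
    finally:
        cxn_db.close()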
def extracte(self, news_chanel_url, is_finance_home, baseCrawl, baseExtract,
             req_referer, try_times=1):
    '''Extract all URLs from the finance homepage or from one company's news
    channel; each page is fetched at most MAX_TRY_TIMES + 1 times.
    @param news_chanel_url: URL to crawl
    @param is_finance_home: whether this is the finance homepage (may need special handling)
    @param baseCrawl: shared crawler helper
    @param baseExtract: shared extraction helper
    @param try_times: current attempt number
    @return: crawl status, extracted news links
    '''
    logger.info("crawl %s, %d time" % (news_chanel_url, try_times))
    time.sleep(IDEL_TIME * (try_times - 1))
    status = 2
    html = ''
    # initialise before the try block so the final return cannot hit an
    # unbound local if crawl_page raises
    news_links = []
    try:
        status, html = baseCrawl.crawl_page(news_chanel_url, req_referer=req_referer)
        if status == 200:
            status = 0
            logger.info("crawl %s, success " % news_chanel_url)
            # all news links found under this channel
            news_links = self.extracteUrls(baseExtract, html, is_finance_home)
        elif status in [301, 302]:
            status = 1
            logger.info("crawl %s, no data,fail" % news_chanel_url)
        else:
            if try_times <= MAX_TRY_TIMES:
                return self.extracte(news_chanel_url, is_finance_home, baseCrawl,
                                     baseExtract, req_referer, try_times + 1)
            else:
                status = 2
                logger.error("crawl %s, %d time,fail" % (news_chanel_url, try_times))
    except:
        logger.error("crawl %s, failed ,Error:%s" % (news_chanel_url, traceback.format_exc()))
    return status, news_links
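# The recursion above implements "retry up to MAX_TRY_TIMES extra times with a
# linearly growing pause". A sketch of the same policy as a flat loop, using a
# hypothetical fetch(url) callable, shown only to make the control flow explicit.
def _retry_fetch(fetch, url):
    for try_times in range(1, MAX_TRY_TIMES + 2):
        # no pause on the first attempt, then IDEL_TIME, 2*IDEL_TIME, ...
        time.sleep(IDEL_TIME * (try_times - 1))
        status, html = fetch(url)
        if status == 200:
            return 0, html
        if status in (301, 302):
            return 1, ''
    return 2, ''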
def get_comps():
    '''Read the full list of listed companies from file.'''
    code_names = []
    try:
        code_names_file = '../data/code_name_vol.lst'
        path = os.path.realpath(__file__)
        path = os.path.dirname(path)
        lspath = os.path.split(path)
        if lspath[0] and lspath[1]:
            code_names_file = lspath[0] + '/data/code_name_vol.lst'
        for code_name in open(code_names_file):
            if code_name and len(code_name) > 0:
                cols = code_name.split('\t')
                if cols and len(cols) > 1:
                    code_names.append(cols[0])
    except:
        logger.error("get_comps failed ,Error:%s" % traceback.format_exc())
    return code_names
def go_crawl_headlines(code_name, baseCrawl, baseExtract, try_times=1):
    '''Crawl the Yahoo Finance headlines page of one company; each page is
    fetched at most MAX_TRY_TIMES + 1 times.
    @param code_name: ticker code; the page crawled is http://finance.yahoo.com/q/h?s=<code_name>
    @param baseCrawl: shared crawler helper
    @param baseExtract: shared extraction helper
    @param try_times: current attempt number
    @return: crawl status, extracted news links
    '''
    req_url = 'http://finance.yahoo.com/q/h?s=%s' % code_name
    logger.info("crawl %s, %d time" % (req_url, try_times))
    time.sleep(IDEL_TIME * try_times)
    status, html = baseCrawl.crawl_page(req_url, req_referer=REQ_REFER)
    # news links found on the page
    headlines_links = []
    if status == 200:
        status = 0
        logger.info("crawl %s, success " % req_url)
        beautifulSoup = baseExtract.get_bs(html)
        content = beautifulSoup.find('div', {'class': 'mod yfi_quote_headline withsky'})
        if content is None:
            logger.info("extract %s is none " % req_url)
        else:
            # only collect anchors inside the headline block (the original
            # iterated over every anchor on the page)
            for link in content.findAll('a'):
                headlines_links.append(link.get('href'))
    elif status in [301, 302]:
        status = 1
        logger.info("crawl %s, no data,fail" % req_url)
    else:
        if try_times <= MAX_TRY_TIMES:
            # retry with the ticker code, not the already-built URL
            return go_crawl_headlines(code_name, baseCrawl, baseExtract, try_times + 1)
        else:
            status = 2
            logger.error("crawl %s, %d time,fail" % (req_url, try_times))
    return status, headlines_links
def extract_news(code, news_links, crawl_source, cursor):
    '''Extract the news articles and run NLP over them.
    @param code: ticker code of the listed company
    @param news_links: news links to extract
    @param crawl_source: crawl source, e.g. yahoo, google
    @param cursor: database cursor
    '''
    in_sql = """
        INSERT INTO news_extract_content(url_md5,url,code_name,newspaper_title,newspaper_text,
            newspaper_authors,newspaper_summary,newspaper_keywords,boilerpipe_article,
            boilerpipe_articlesentences,boilerpipe_keepeverything,boilerpipe_largestcontent,
            boilerpipe_numwordsrules,boilerpipe_canola,up_time,add_time,extract_count,crawl_source)
        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,now(),now(),1,%s)
        on duplicate key update code_name = %s,newspaper_title = %s,newspaper_text = %s,
            newspaper_authors = %s,newspaper_summary = %s,newspaper_keywords = %s,
            boilerpipe_article = %s,boilerpipe_articlesentences = %s,boilerpipe_keepeverything = %s,
            boilerpipe_largestcontent = %s,boilerpipe_numwordsrules = %s,boilerpipe_canola = %s,
            up_time = now(),extract_count=extract_count+1,crawl_source = %s
    """
    for link in news_links:
        # URLs shorter than 30 characters are usually not news links;
        # crude, but simple and dependable
        if link is None or len(link) <= 30:
            continue
        # skip URLs that have already been crawled (Bloom filter)
        if link in bf:
            continue
        try:
            global NEWS_URL_EXTRACTE
            NEWS_URL_EXTRACTE += 1
            url_md5 = hashlib.md5(link).hexdigest()
            # first, extract with newspaper
            newspaper_title = ''
            newspaper_text = ''
            newspaper_authors = ''
            newspaper_summary = ''
            newspaper_keywords = ''
            article = Article(link)
            article.download()
            html = article.html
            if html is None or len(html) == 0:
                continue
            article.parse()
            if article.text and len(article.text) > 0:
                newspaper_title = article.title
                newspaper_text = article.text
                newspaper_authors = article.authors
                if newspaper_authors and len(newspaper_authors) > 0:
                    newspaper_authors = ','.join(newspaper_authors)
                else:
                    newspaper_authors = ''
                article.nlp()
                newspaper_summary = article.summary
                newspaper_keywords = article.keywords
                if newspaper_keywords and len(newspaper_keywords) > 0:
                    newspaper_keywords = ','.join(newspaper_keywords)
                else:
                    newspaper_keywords = ''
            # then run the boilerpipe extractors
            extractor = Extractor(extractor='ArticleExtractor', html=html)
            boilerpipe_article = extractor.getText()
            extractor = Extractor(extractor='ArticleSentencesExtractor', html=html)
            boilerpipe_articlesentences = extractor.getText()
            extractor = Extractor(extractor='KeepEverythingExtractor', html=html)
            boilerpipe_keepeverything = extractor.getText()
            extractor = Extractor(extractor='LargestContentExtractor', html=html)
            boilerpipe_largestcontent = extractor.getText()
            extractor = Extractor(extractor='NumWordsRulesExtractor', html=html)
            boilerpipe_numwordsrules = extractor.getText()
            extractor = Extractor(extractor='CanolaExtractor', html=html)
            boilerpipe_canola = extractor.getText()
            # parameters for the INSERT ... ON DUPLICATE KEY UPDATE above
            content = (url_md5, link, code, newspaper_title, newspaper_text,
                       newspaper_authors, newspaper_summary, newspaper_keywords,
                       boilerpipe_article, boilerpipe_articlesentences,
                       boilerpipe_keepeverything, boilerpipe_largestcontent,
                       boilerpipe_numwordsrules, boilerpipe_canola, crawl_source,
                       code, newspaper_title, newspaper_text, newspaper_authors,
                       newspaper_summary, newspaper_keywords, boilerpipe_article,
                       boilerpipe_articlesentences, boilerpipe_keepeverything,
                       boilerpipe_largestcontent, boilerpipe_numwordsrules,
                       boilerpipe_canola, crawl_source)
            cursor.execute(in_sql, content)
        except:
            logger.error("extract_news failed ,Error:%s" % traceback.format_exc())
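# The six boilerpipe calls above follow one pattern; a compact sketch of the
# same extraction as a loop (same Extractor API, same order of results).
# Purely illustrative; the original keeps the calls explicit.
def _boilerpipe_variants(html):
    names = ['ArticleExtractor', 'ArticleSentencesExtractor',
             'KeepEverythingExtractor', 'LargestContentExtractor',
             'NumWordsRulesExtractor', 'CanolaExtractor']
    return [Extractor(extractor=name, html=html).getText() for name in names]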
class BaseExtract(object):
    '''Base page extractor.'''

    def get_bs(self, markup, parser='lxml', page_encoding=None):
        '''Get a configured BeautifulSoup instance.
        @param markup: page content
        @param parser: parser to use; lxml replaces the default html.parser
            and is very fast (preference: lxml > html.parser > html5lib)
        @param page_encoding: page encoding
        '''
        return BeautifulSoup(markup, features=parser, from_encoding=page_encoding)
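# A short sketch of BaseExtract.get_bs in use, mirroring go_crawl_headlines:
# build the soup with the lxml parser, find the headline block and pull out its
# anchors. The CSS class is taken from the code above; still only an example.
def _example_get_bs(html):
    soup = BaseExtract().get_bs(html)
    block = soup.find('div', {'class': 'mod yfi_quote_headline withsky'})
    if block is None:
        return []
    return [a.get('href') for a in block.findAll('a')]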
def extract_headlines_news(code, headlines_links, cursor):
    '''Extract and parse the Yahoo headline links.'''
    in_sql = """
        INSERT INTO yahoo_comp_news(url_md5,url,code_name,newspaper_title,newspaper_text,
            newspaper_authors,newspaper_summary,newspaper_keywords,boilerpipe_article,
            boilerpipe_articlesentences,boilerpipe_keepeverything,boilerpipe_largestcontent,
            boilerpipe_numwordsrules,boilerpipe_canola,up_time,add_time,count)
        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,now(),now(),1)
        on duplicate key update code_name = %s,newspaper_title = %s,newspaper_text = %s,
            newspaper_authors = %s,newspaper_summary = %s,newspaper_keywords = %s,
            boilerpipe_article = %s,boilerpipe_articlesentences = %s,boilerpipe_keepeverything = %s,
            boilerpipe_largestcontent = %s,boilerpipe_numwordsrules = %s,boilerpipe_canola = %s,
            up_time = now(),count=count+1
    """
    for link in headlines_links:
        # URLs shorter than 35 characters are usually not news links
        if link is None or len(link) <= 35:
            continue
        try:
            url_md5 = hashlib.md5(link).hexdigest()
            # first, extract with newspaper
            newspaper_title = ''
            newspaper_text = ''
            newspaper_authors = ''
            newspaper_summary = ''
            newspaper_keywords = ''
            article = Article(link)
            article.download()
            html = article.html
            if html is None or len(html) == 0:
                continue
            article.parse()
            if article.text and len(article.text) > 0:
                newspaper_title = article.title
                newspaper_text = article.text
                newspaper_authors = article.authors
                if newspaper_authors and len(newspaper_authors) > 0:
                    newspaper_authors = ','.join(newspaper_authors)
                else:
                    newspaper_authors = ''
                article.nlp()
                newspaper_summary = article.summary
                newspaper_keywords = article.keywords
                if newspaper_keywords and len(newspaper_keywords) > 0:
                    newspaper_keywords = ','.join(newspaper_keywords)
                else:
                    newspaper_keywords = ''
            # then run the boilerpipe extractors
            extractor = Extractor(extractor='ArticleExtractor', html=html)
            boilerpipe_article = extractor.getText()
            extractor = Extractor(extractor='ArticleSentencesExtractor', html=html)
            boilerpipe_articlesentences = extractor.getText()
            extractor = Extractor(extractor='KeepEverythingExtractor', html=html)
            boilerpipe_keepeverything = extractor.getText()
            extractor = Extractor(extractor='LargestContentExtractor', html=html)
            boilerpipe_largestcontent = extractor.getText()
            extractor = Extractor(extractor='NumWordsRulesExtractor', html=html)
            boilerpipe_numwordsrules = extractor.getText()
            extractor = Extractor(extractor='CanolaExtractor', html=html)
            boilerpipe_canola = extractor.getText()
            # parameters for the INSERT ... ON DUPLICATE KEY UPDATE above
            content = (url_md5, link, code, newspaper_title, newspaper_text,
                       newspaper_authors, newspaper_summary, newspaper_keywords,
                       boilerpipe_article, boilerpipe_articlesentences,
                       boilerpipe_keepeverything, boilerpipe_largestcontent,
                       boilerpipe_numwordsrules, boilerpipe_canola,
                       code, newspaper_title, newspaper_text, newspaper_authors,
                       newspaper_summary, newspaper_keywords, boilerpipe_article,
                       boilerpipe_articlesentences, boilerpipe_keepeverything,
                       boilerpipe_largestcontent, boilerpipe_numwordsrules,
                       boilerpipe_canola)
            cursor.execute(in_sql, content)
        except:
            logger.error("extract_headlines_news failed ,Error:%s" % traceback.format_exc())