Пример #1
0
 def insertDb(self,index,results):
     print '[+] 新浪科技新闻...开始插入:',len(results)
     conn = MySQLdb.connect(host='localhost',user='******',passwd='',port=3306,charset='utf8')
     cur = conn.cursor()
     conn.select_db('newslab')
     if index == 1:
         for result in results:
             title = result[1]
             summery = result[3]
             coverurl = Toolkit.getImageUrl(result[2])
             timeStr = Toolkit.timeStrConv(result[4]); 
             source = u"新浪科技"
             website = result[0]
             md5Str = hashlib.md5(website).hexdigest()
             try:
                 cur.execute("insert ignore into news(title,summary,coverurl,time,source,website,md5) values('%s','%s','%s','%s','%s','%s','%s')" % (title,summery,coverurl,timeStr,source,website,md5Str))
             except Exception,e:
                 print "[-] "+website+"插入失败",e
Пример #2
0
 def getContent(self,url):
      if url.find('163.com') >= 0:
     ##网易新闻
         print '[+]网易新闻:'+url
         html = urllib.urlopen(url).read().decode('gbk').encode('utf-8')
         netEaseReg = re.compile(r'<div id="endText" class="end-text">([\s\S]*?)<div class="sharecommend-wrap clearfix">');
         result = netEaseReg.findall(html)
         if len(result)>0:
             print Toolkit.filterHtmlTag(result[0]).strip()
             return Toolkit.filterHtmlTag(result[0]).strip()
      elif url.find('qq.com') >= 0:
         print "[+]腾讯新闻:"+url
         html = urllib.urlopen(url).read().decode("gbk").encode("utf-8")
         tencentReg = re.compile(r'<P align=center>([\s\S]*?)</div>');
         result = tencentReg.findall(html)
         if len(result)>0:
             print Toolkit.filterHtmlTag(result[0])
             return Toolkit.filterHtmlTag(result[0])
Пример #3
0
     print '[+] 网易国内新闻...数据插入:',len(results)
     for result in results:               
         title = result[1].strip()
         coverurl = Toolkit.getImageUrl(result[2])
         summary = Toolkit.filterHtmlTag(result[3]).strip()
         source = "网易国内新闻"
         timeStr = result[4]
         website = result[0]
         md5Str = hashlib.md5(website).hexdigest()
         try:
             cur.execute("insert ignore into news_domestic(title,summary,coverurl,time,source,website,md5) values('%s','%s','%s','%s','%s','%s','%s')" % (title,summary,coverurl,timeStr,source,website,md5Str))
         except Exception,e:
             print "[-] "+website+"插入失败",e
 elif index == 3:
      print '[+] 网易社会新闻...数据插入:',len(results)
      for result in results:               
         title = result[1].strip()
         coverurl = Toolkit.getImageUrl(result[2])
         summary = Toolkit.filterHtmlTag(result[3]).strip()
         source = "网易社会新闻"
         timeStr = result[4]
         website = result[0]
         md5Str = hashlib.md5(website).hexdigest()
         try:
             cur.execute("insert ignore into news_social(title,summary,coverurl,time,source,website,md5) values('%s','%s','%s','%s','%s','%s','%s')" % (title,summary,coverurl,timeStr,source,website,md5Str))
         except Exception,e:
             print "[-] "+website+"插入失败",e
 conn.commit();
 cur.close();
 conn.close()  
 print '[+] 网易科技新闻...结束'      
Пример #4
0
        title = result[2]
        summary = Toolkit.filterHtmlTag(result[3])
        coverurl =  str(Toolkit.getImageUrl(result[0]))
        source = u'腾讯国内'
        website = "http://news.qq.com"+result[1]
        md5Str = hashlib.md5(website).hexdigest()
        try:
            cur.execute("insert ignore into news_domestic(title,summary,coverurl,source,website,md5) values('%s','%s','%s','%s','%s','%s')" % (title,summary,coverurl,source,website,md5Str))
        except Exception,e:
            print "[-] "+website+" "+title+summary+" 插入失败",e
      print '[+] 腾讯国内新闻...插入结束'
 elif index == 3:
      print '[+] 腾讯社会新闻...开始插入:',len(results)
      for result in results:
        title = result[2]
        summary = Toolkit.filterHtmlTag(result[3])
        coverurl =  str(Toolkit.getImageUrl(result[0]))
        source = u'腾讯国内'
        website = "http://news.qq.com"+result[1]
        md5Str = hashlib.md5(website).hexdigest()
        try:
            cur.execute("insert ignore into news_social(title,summary,coverurl,source,website,md5) values('%s','%s','%s','%s','%s','%s')" % (title,summary,coverurl,source,website,md5Str))
        except Exception,e:
            print "[-] "+website+" "+title+summary+" 插入失败",e
      print '[+] 腾讯社会新闻...插入结束'   
         
 conn.commit();
 cur.close();
 conn.close() 
 print '[+] 腾讯新闻...结束'