Пример #1
0
 def _parseHeadinfo(self, doc):
     """Build a FigureItem from a profile page plus the account info page.

     ``doc`` is scanned for ``<script>FM.view(...)</script>`` payloads; the
     payload whose ``ns`` is ``pl.header.head.index`` supplies the header
     HTML from which the counters, name, verification text, intro and tags
     are read. The registration date comes from a second page fetched
     through ``self.remoteReader``.

     Returns the populated FigureItem when ``fg.isValid()`` is true;
     otherwise prints a diagnostic and returns None.

     Raises:
         Exception: no payload with ns ``pl.header.head.index`` was found.
         ValueError: a payload is not valid JSON (from ``json.loads``).
         AttributeError: one of the follow/fans/weibo masks does not match
             the header HTML (``re.search`` returned None).
     """
     fg = FigureItem()
     strimdata = ''

     # Every FM.view(...) call carries one JSON payload.
     scripts = re.findall(r'<script>FM\.view\((.*)\);?</script>', doc)
     if not scripts:
         print('_fetch_manload: raw doc parse error')
     jdiclst = [json.loads(i) for i in scripts]

     for jdic in jdiclst:
         if 'ns' in jdic and jdic['ns'] == 'pl.header.head.index':
             strimdata = jdic['html']
             d = PyQuery(strimdata)
             break
     else:
         # for/else: reached only when no payload triggered the break,
         # so ``d`` is guaranteed to be bound past this point.
         raise Exception('_parseHeadinfo error')

     info = self.remoteReader.getDoc(
         self.remoteReader.makeUrl_hostinfo(self.uid))
     # Lazy quantifier: take the first date following the label. The greedy
     # form backtracked from the end of the page and captured the *last*
     # date in the document. Expected date shape: 2012-07-06.
     m = re.search(r'注册时间[\s\S]+?(\d{4})-(\d{2})-(\d{2})', info)
     if m:
         t = time.mktime(time.strptime('%s-%s-%s' % m.groups(), '%Y-%m-%d'))
     else:
         t = 0  # registration date not found

     fg.uid = self.uid
     fg.domainid = self.remoteReader.domain
     fg.establish = t
     fg.follow = re.search(self.followmask, strimdata).group(1)
     fg.fans = re.search(self.fansmask, strimdata).group(1)
     fg.weibo = re.search(self.weibomask, strimdata).group(1)

     # Two header layouts are probed; the first non-empty text wins.
     text1 = d('span').filter('.name').text()
     text2 = d('strong').filter('.W_f20.W_Yahei').text()
     fg.name = text1 if text1 else text2

     try:
         fg.verify = d('.pf_verified_info').contents()[0]
     except Exception:
         # Best effort: a missing verification node leaves the field empty.
         fg.verify = ''

     fg.intro = d('.pf_intro').text()

     for i in d('.layer_menulist_tags').items('a'):
         fg.tags.append(i.text())

     if not fg.isValid():
         print('    - Thread {0} weibo figure info not enough'.format(self.no))
         return None
     return fg
Пример #2
0
class FigureFetcher(Fetcher):
    """Fetcher that takes uids from a queue and records one FigureItem each.

    For every queue entry the profile page is downloaded, parsed into
    ``self.figure`` by ``_parseHeadinfo`` and handed to
    ``self.localReader.record``.
    """

    def __init__(self, queue):
        """Store the work queue and create the database handle.

        ``queue`` must provide ``empty()`` and ``get()``; each entry is
        indexable and its first element is convertible to ``int`` (the uid).
        """
        super(FigureFetcher, self).__init__()
        self.localReader = FigureDatabase()
        self.figure = FigureItem()
        self.q = queue

    def run(self):
        """Process queue entries until ``self.q.empty()`` reports true."""
        while not self.q.empty():
            uid = int(self.q.get()[0])
            doc = self.remoteReader.getDoc(
                self.remoteReader.makeUrl_hostweibo(uid))
            # Start from a fresh item for every uid: _parseHeadinfo appends
            # to ``tags``, so reusing one object would carry tags over from
            # the previous account.
            self.figure = FigureItem()
            self._parseHeadinfo(doc)
            self.remoteReader.finishFetching()
            # Record the item that was just populated. The previous code
            # recorded a separate, never-filled FigureItem.
            self.localReader.record(self.figure)

    def _parseHeadinfo(self, doc):
        """Fill ``self.figure`` from a profile page plus the info page.

        ``doc`` is scanned for ``<script>FM.view(...)</script>`` payloads;
        the payload whose ``ns`` is ``pl.header.head.index`` supplies the
        header HTML from which counters, name, verification text, intro and
        tags are read. Prints a diagnostic when the resulting item fails
        ``isValid()``. Returns None.

        Raises:
            Exception: no payload with ns ``pl.header.head.index`` was found.
            ValueError: a payload is not valid JSON (from ``json.loads``).
            AttributeError: one of the follow/fans/weibo masks does not
                match the header HTML (``re.search`` returned None).
        """
        strimdata = ''

        # Every FM.view(...) call carries one JSON payload.
        scripts = re.findall(r'<script>FM\.view\((.*)\);?</script>', doc)
        if not scripts:
            print('_fetch_manload: raw doc parse error')
        jdiclst = [json.loads(i) for i in scripts]

        for jdic in jdiclst:
            if 'ns' in jdic and jdic['ns'] == 'pl.header.head.index':
                strimdata = jdic['html']
                d = PyQuery(strimdata)
                break
        else:
            # for/else: reached only when no payload triggered the break,
            # so ``d`` is guaranteed to be bound past this point.
            raise Exception('_parseHeadinfo error')

        # NOTE(review): self.uid and self.remoteReader.uid are not assigned
        # anywhere in this class; presumably Fetcher / remoteReader keep them
        # in sync with the uid dequeued in run() - confirm.
        info = self.remoteReader.getDoc(
            self.remoteReader.makeUrl_hostinfo(self.uid))
        # Lazy quantifier: take the first date following the label. The
        # greedy form backtracked from the end of the page and captured the
        # *last* date in the document. Expected date shape: 2012-07-06.
        m = re.search(r'注册时间[\s\S]+?(\d{4})-(\d{2})-(\d{2})', info)
        if m:
            t = time.mktime(
                time.strptime('%s-%s-%s' % m.groups(), '%Y-%m-%d'))
        else:
            t = 0  # registration date not found

        self.figure.uid = self.remoteReader.uid
        self.figure.domainid = self.remoteReader.domain
        self.figure.establish = t
        self.figure.follow = re.search(self.followmask, strimdata).group(1)
        self.figure.fans = re.search(self.fansmask, strimdata).group(1)
        self.figure.weibo = re.search(self.weibomask, strimdata).group(1)

        # Two header layouts are probed; the first non-empty text wins.
        text1 = d('span').filter('.name').text()
        text2 = d('strong').filter('.W_f20.W_Yahei').text()
        self.figure.name = text1 if text1 else text2

        try:
            self.figure.verify = d('.pf_verified_info').contents()[0]
        except Exception:
            # Best effort: a missing verification node leaves the field empty.
            self.figure.verify = ''

        self.figure.intro = d('.pf_intro').text()

        for i in d('.layer_menulist_tags').items('a'):
            self.figure.tags.append(i.text())

        if not self.figure.isValid():
            print('    - Thread {0} weibo figure info not enough'.format(self.no))