Пример #1
0
 def itemCast(self, row):
     """Build a FigureItem from the first seven columns of ``row``.

     Columns 0 to 6 are copied, in order, onto the ``uid``, ``domainid``,
     ``name``, ``follow``, ``fans``, ``weibo`` and ``establish`` attributes.
     """
     item = FigureItem()
     columns = ('uid', 'domainid', 'name', 'follow', 'fans', 'weibo',
                'establish')
     for position, attribute in enumerate(columns):
         # Index explicitly so a row with fewer columns still raises.
         setattr(item, attribute, row[position])
     return item
Пример #2
0
    def test_parseWeiboLst(self, uid):
        """Parse the saved page ``../BigVs/<uid>`` into a FigureItem.

        The file content is decoded as UTF-8 and queried with PyQuery.
        Returns the filled FigureItem; when the file does not exist a
        message is printed and None is returned.
        """
        path = '../BigVs/' + str(uid)
        if not os.path.exists(path):
            print('file not exists')
            return None

        with open(path, 'r') as handle:
            rawdoc = handle.read()

        page = PyQuery(rawdoc.decode('utf-8'))
        figure = FigureItem()

        def counter_text(kind):
            # Text of the <strong> elements whose node-type attribute
            # equals ``kind``.
            return page('strong').filter(
                lambda i, this: PyQuery(this).attr('node-type') == kind
            ).text()

        figure.follow = counter_text('follow')
        figure.fans = counter_text('fans')
        figure.weibo = counter_text('weibo')

        figure.name = page('span').filter('.name').text()
        verified_nodes = page('.pf_verified_info').contents()
        figure.verify = verified_nodes[0]
        figure.intro = page('.pf_intro').text()

        for anchor in page('.layer_menulist_tags').items('a'):
            figure.tags.append(anchor.text())

        return figure
Пример #3
0
 def run(self):
     """Fetch, parse and record one figure for every uid left in the queue.

     For each queue entry the first element is converted to an int uid,
     the host page is downloaded through ``self.remoteReader`` and handed
     to ``_parseHeadinfo``.  The item passed to ``self.localReader.record``
     is the one ``_parseHeadinfo`` returns or, when it returns None, the
     ``self.figure`` instance prepared for this uid.
     """
     while not self.q.empty():
         uid = int(self.q.get()[0])
         # _parseHeadinfo looks the current uid up on the instance.
         self.uid = uid
         # Start from an empty item so nothing parsed for a previous uid
         # is carried over into this record.
         self.figure = FigureItem()
         doc = self.remoteReader.getDoc(
             self.remoteReader.makeUrl_hostweibo(uid))
         parsed = self._parseHeadinfo(doc)
         self.remoteReader.finishFetching()
         self.localReader.record(self.figure if parsed is None else parsed)
Пример #4
0
 def itemCast(self, row):
     """Convert a seven-column ``row`` into a FigureItem.

     ``row[0]`` .. ``row[6]`` become ``uid``, ``domainid``, ``name``,
     ``follow``, ``fans``, ``weibo`` and ``establish`` respectively.
     """
     figure = FigureItem()
     field_names = 'uid domainid name follow fans weibo establish'.split()
     for column, field in enumerate(field_names):
         # Explicit indexing keeps the failure on rows that are too short.
         setattr(figure, field, row[column])
     return figure
Пример #5
0
   def __init__(self, queue=None, no=0, skip=False):
       """Prepare a fetcher bound to ``queue``.

       ``queue``, ``no`` and ``skip`` are stored unchanged as ``self.q``,
       ``self.no`` and ``self.skip``; ``self.repeat`` starts at 0.
       NOTE(review): ``no`` looks like a worker number and ``skip`` like a
       behaviour flag -- confirm against the code that reads them.
       """
       super(WeiboFetcher, self).__init__()

       # Helper state first: masks, the remote page reader, a blank figure.
       self.initRemask()
       self.remoteReader = Page()
       self.figure = FigureItem()

       # Constructor arguments and counters.
       self.q, self.no, self.skip = queue, no, skip
       self.repeat = 0
Пример #6
0
 def test_parseWeiboLst(self, uid):
     """Load ``../BigVs/<uid>`` from disk and parse it into a FigureItem.

     Returns the populated FigureItem.  If the file is missing, prints
     'file not exists' and returns None.
     """
     source_path = '../BigVs/' + str(uid)
     if os.path.exists(source_path):
         with open(source_path, 'r') as stream:
             rawdoc = stream.read()

         dom = PyQuery(rawdoc.decode('utf-8'))
         result = FigureItem()

         # follow / fans / weibo are read from the <strong> elements whose
         # node-type attribute carries the same name.
         for kind in ('follow', 'fans', 'weibo'):
             matching = dom('strong').filter(
                 lambda i, this: PyQuery(this).attr('node-type') == kind)
             setattr(result, kind, matching.text())

         result.name = dom('span').filter('.name').text()
         result.verify = dom('.pf_verified_info').contents()[0]
         result.intro = dom('.pf_intro').text()

         tag_links = dom('.layer_menulist_tags').items('a')
         for link in tag_links:
             result.tags.append(link.text())

         return result
     else:
         print('file not exists')
Пример #7
0
 def __init__(self, queue):
     """Create the fetcher with its local store, a blank figure and ``queue``.

     ``queue`` is kept as ``self.q``.
     """
     super(FigureFetcher, self).__init__()
     self.q = queue
     # Evaluated left to right: the database reader first, then the item.
     self.localReader, self.figure = FigureDatabase(), FigureItem()
Пример #8
0
class FigureFetcher(Fetcher):
    """Fetch Weibo profile headers for queued uids and record them.

    For every queue entry the first element is converted to an int uid,
    the profile page is downloaded through ``self.remoteReader``, parsed
    into ``self.figure`` and passed to ``self.localReader.record``.

    ``remoteReader``, ``followmask``, ``fansmask``, ``weibomask`` and ``no``
    are read but not assigned in this class -- presumably ``Fetcher``
    provides them; confirm against the base class.
    """

    def __init__(self, queue):
        """Create the local store, a blank FigureItem and keep ``queue``."""
        super(FigureFetcher, self).__init__()
        self.localReader = FigureDatabase()
        self.figure = FigureItem()

        self.q = queue

    def run(self):
        """Process uids until the queue reports empty."""
        while not self.q.empty():
            uid = int(self.q.get()[0])
            # _parseHeadinfo reads self.uid and fills self.figure, so both
            # must describe the uid currently being fetched.  A fresh item
            # per uid also keeps values appended to figure.tags for one
            # profile from carrying over to the next.
            self.uid = uid
            self.figure = FigureItem()
            doc = self.remoteReader.getDoc(
                self.remoteReader.makeUrl_hostweibo(uid))
            self._parseHeadinfo(doc)
            self.remoteReader.finishFetching()
            self.localReader.record(self.figure)

    def _parseHeadinfo(self, doc):
        """Fill ``self.figure`` from a profile page and the host info page.

        ``doc`` is searched for ``<script>FM.view(...)</script>`` payloads,
        each of which is decoded as JSON.  The payload whose ``ns`` equals
        ``pl.header.head.index`` supplies the header HTML.

        Raises:
            Exception: when no payload with that ``ns`` is present.
            AttributeError: when one of the follow/fans/weibo masks does
                not match the header HTML (``re.search`` returns None).
        """
        scripts = re.findall(r'<script>FM\.view\((.*)\);?</script>', doc)
        if not scripts:
            print('_fetch_manload: raw doc parse error')
        payloads = [json.loads(script) for script in scripts]

        strimdata = ''
        for payload in payloads:
            if 'ns' in payload and payload['ns'] == 'pl.header.head.index':
                strimdata = payload['html']
                d = PyQuery(strimdata)
                break
        else:
            # Also reached when no script payload was found at all.
            raise Exception('_parseHeadinfo error')

        info = self.remoteReader.getDoc(
            self.remoteReader.makeUrl_hostinfo(self.uid))
        # NOTE(review): the ``+`` is greedy, so the date captured is the last
        # one that follows the marker in ``info`` -- confirm this is intended.
        m = re.search(r'注册时间[.\s\S]+(\d{4})-(\d{2})-(\d{2})', info)
        if m:
            stamp = '%s-%s-%s' % (m.group(1), m.group(2), m.group(3))
            t = time.mktime(time.strptime(stamp, '%Y-%m-%d'))
        else:
            # No registration date found (original note: 2012-07-06).
            t = 0

        figure = self.figure
        figure.uid = self.remoteReader.uid
        figure.domainid = self.remoteReader.domain
        figure.establish = t
        figure.follow = re.search(self.followmask, strimdata).group(1)
        figure.fans = re.search(self.fansmask, strimdata).group(1)
        figure.weibo = re.search(self.weibomask, strimdata).group(1)

        # Prefer the span.name text; fall back to the strong.W_f20.W_Yahei
        # text when it is empty.
        text1 = d('span').filter('.name').text()
        text2 = d('strong').filter('.W_f20.W_Yahei').text()
        figure.name = text1 if text1 else text2

        try:
            figure.verify = d('.pf_verified_info').contents()[0]
        except Exception:
            # Best effort: leave the verification text empty when it cannot
            # be read, without trapping KeyboardInterrupt/SystemExit.
            figure.verify = ''

        figure.intro = d('.pf_intro').text()

        for link in d('.layer_menulist_tags').items('a'):
            figure.tags.append(link.text())

        if not figure.isValid():
            print('    - Thread {0} weibo figure info not enough'.format(self.no))
                
                
                
                
                
Пример #9
0
 def _parseHeadinfo(self, doc):
     """Parse a profile page into a new FigureItem.

     ``doc`` is searched for ``<script>FM.view(...)</script>`` payloads,
     each of which is decoded as JSON.  The payload whose ``ns`` equals
     ``pl.header.head.index`` supplies the header HTML; the registration
     date is looked up on the host info page fetched for ``self.uid``.

     Returns:
         The filled FigureItem, or None (after printing a message) when
         ``isValid()`` reports the item as incomplete.

     Raises:
         Exception: when no payload with that ``ns`` is present.
         AttributeError: when one of the follow/fans/weibo masks does not
             match the header HTML (``re.search`` returns None).
     """
     fg = FigureItem()

     scripts = re.findall(r'<script>FM\.view\((.*)\);?</script>', doc)
     if not scripts:
         print('_fetch_manload: raw doc parse error')
     payloads = [json.loads(script) for script in scripts]

     strimdata = ''
     for payload in payloads:
         if 'ns' in payload and payload['ns'] == 'pl.header.head.index':
             strimdata = payload['html']
             d = PyQuery(strimdata)
             break
     else:
         # Also reached when no script payload was found at all.
         raise Exception('_parseHeadinfo error')

     info = self.remoteReader.getDoc(
         self.remoteReader.makeUrl_hostinfo(self.uid))
     # NOTE(review): the ``+`` is greedy, so the date captured is the last
     # one that follows the marker in ``info`` -- confirm this is intended.
     m = re.search(r'注册时间[.\s\S]+(\d{4})-(\d{2})-(\d{2})', info)
     if m:
         stamp = '%s-%s-%s' % (m.group(1), m.group(2), m.group(3))
         t = time.mktime(time.strptime(stamp, '%Y-%m-%d'))
     else:
         # No registration date found (original note: 2012-07-06).
         t = 0

     fg.uid = self.uid
     fg.domainid = self.remoteReader.domain
     fg.establish = t
     fg.follow = re.search(self.followmask, strimdata).group(1)
     fg.fans = re.search(self.fansmask, strimdata).group(1)
     fg.weibo = re.search(self.weibomask, strimdata).group(1)

     # Prefer the span.name text; fall back to the strong.W_f20.W_Yahei
     # text when it is empty.
     text1 = d('span').filter('.name').text()
     text2 = d('strong').filter('.W_f20.W_Yahei').text()
     fg.name = text1 if text1 else text2

     try:
         fg.verify = d('.pf_verified_info').contents()[0]
     except Exception:
         # Best effort: leave the verification text empty when it cannot be
         # read, without trapping KeyboardInterrupt/SystemExit.
         fg.verify = ''

     fg.intro = d('.pf_intro').text()

     for link in d('.layer_menulist_tags').items('a'):
         fg.tags.append(link.text())

     if not fg.isValid():
         print('    - Thread {0} weibo figure info not enough'.format(self.no))
         return None

     return fg