Пример #1
0
 def _parseHeadinfo(self, doc):
     """Build a FigureItem from the profile-header section of ``doc``.

     ``doc`` is searched for ``<script>FM.view(...)</script>`` payloads; each
     payload is decoded as JSON and the first one whose ``'ns'`` equals
     ``'pl.header.head.index'`` supplies the header HTML (its ``'html'``
     value).  A second document is fetched through ``self.remoteReader`` to
     look up the registration date.

     Returns:
         The populated FigureItem when ``fg.isValid()`` is true; otherwise a
         message is printed and ``None`` is returned.

     Raises:
         Exception: no payload with the expected ``'ns'`` was found.
         AttributeError: one of ``self.followmask`` / ``self.fansmask`` /
             ``self.weibomask`` does not match the header HTML
             (``re.search`` returns ``None``).
         ValueError: a payload is not valid JSON (from ``json.loads``).
     """
     fg = FigureItem()

     # Raw string: same pattern as before, without relying on unknown
     # backslash escapes being passed through.
     scripts = re.findall(r'<script>FM\.view\((.*)\);?</script>', doc)
     if not scripts:
         print('_fetch_manload: raw doc parse error')
     jdiclst = [json.loads(payload) for payload in scripts]

     # for/else: the else branch runs only when no payload matched (no break).
     for jdic in jdiclst:
         if 'ns' in jdic and jdic['ns'] == 'pl.header.head.index':
             strimdata = jdic['html']
             d = PyQuery(strimdata)
             break
     else:
         raise Exception('_parseHeadinfo error')

     # The host-info page carries the registration-time label followed by a
     # YYYY-MM-DD date; the greedy class means the last such date is taken.
     info = self.remoteReader.getDoc(self.remoteReader.makeUrl_hostinfo(self.uid))
     m = re.search(r'注册时间[.\s\S]+(\d{4})-(\d{2})-(\d{2})', info)
     if m:
         datestr = '%s-%s-%s' % (m.group(1), m.group(2), m.group(3))
         # mktime interprets the parsed struct_time as local time.
         t = time.mktime(time.strptime(datestr, '%Y-%m-%d'))
     else:
         t = 0  # no date found (original note here read: 2012-07-06)

     fg.uid = self.uid
     fg.domainid = self.remoteReader.domain
     fg.establish = t
     # Each mask is expected to capture its counter in group 1; a miss
     # surfaces as AttributeError (see Raises).
     fg.follow = re.search(self.followmask, strimdata).group(1)
     fg.fans = re.search(self.fansmask, strimdata).group(1)
     fg.weibo = re.search(self.weibomask, strimdata).group(1)

     # Prefer span.name; fall back to the strong.W_f20.W_Yahei text when the
     # former is empty.
     text1 = d('span').filter('.name').text()
     text2 = d('strong').filter('.W_f20.W_Yahei').text()
     fg.name = text1 if text1 else text2

     # Best effort: a page without verified-info content yields ''.
     # ``except Exception`` keeps that fallback while letting
     # KeyboardInterrupt / SystemExit propagate.
     try:
         fg.verify = d('.pf_verified_info').contents()[0]
     except Exception:
         fg.verify = ''

     fg.intro = d('.pf_intro').text()

     for i in d('.layer_menulist_tags').items('a'):
         fg.tags.append(i.text())

     if not fg.isValid():
         print('    - Thread {0} weibo figure info not enough'.format(self.no))
         return None
     return fg
Пример #2
0
    def test_parseWeiboLst(self, uid):
        """Parse the saved page ``../BigVs/<uid>`` into a FigureItem.

        The file content is decoded as UTF-8 and queried with PyQuery for
        the follow / fans / weibo counters (``strong`` elements selected by
        their ``node-type`` attribute), the name, the verified-info, the
        intro text and the tag links.

        Returns:
            The populated FigureItem, or ``None`` (after printing
            ``'file not exists'``) when the file is absent.
        """
        fd = '../BigVs/' + str(uid)
        if not os.path.exists(fd):
            print('file not exists')
            return None

        # Reuse the path computed above instead of rebuilding it.
        with open(fd, 'r') as f:
            rawdoc = f.read()

        d = PyQuery(rawdoc.decode('utf-8'))
        fg = FigureItem()

        def counter_text(node_type):
            # Text of the <strong> element(s) whose node-type equals node_type.
            return d('strong').filter(
                lambda i, this: PyQuery(this).attr('node-type') == node_type
            ).text()

        fg.follow = counter_text('follow')
        fg.fans = counter_text('fans')
        fg.weibo = counter_text('weibo')

        fg.name = d('span').filter('.name').text()
        # A page without verified-info content has nothing at index 0;
        # fall back to '' instead of failing with IndexError.
        try:
            fg.verify = d('.pf_verified_info').contents()[0]
        except IndexError:
            fg.verify = ''
        fg.intro = d('.pf_intro').text()

        for i in d('.layer_menulist_tags').items('a'):
            fg.tags.append(i.text())

        return fg
Пример #3
0
 def itemCast(self, row):
     """Convert a positional ``row`` into a FigureItem.

     Positions 0 through 6 of ``row`` are copied, in order, onto the
     attributes uid, domainid, name, follow, fans, weibo and establish.

     Returns:
         The newly created FigureItem.
     """
     # Attribute names listed in the same order as their positions in row.
     columns = ('uid', 'domainid', 'name', 'follow', 'fans', 'weibo',
                'establish')
     figure = FigureItem()
     for position, attribute in enumerate(columns):
         setattr(figure, attribute, row[position])
     return figure
Пример #4
0
 def itemCast(self, row):
     """Create a FigureItem whose fields are read from ``row`` by index.

     Index-to-attribute mapping: 0 uid, 1 domainid, 2 name, 3 follow,
     4 fans, 5 weibo, 6 establish.

     Returns:
         The FigureItem carrying those seven values.
     """
     layout = (
         (0, 'uid'),
         (1, 'domainid'),
         (2, 'name'),
         (3, 'follow'),
         (4, 'fans'),
         (5, 'weibo'),
         (6, 'establish'),
     )
     item = FigureItem()
     for index, field in layout:
         setattr(item, field, row[index])
     return item
Пример #5
0
 def test_parseWeiboLst(self, uid):
     """Load the saved page ``../BigVs/<uid>`` and parse it into a FigureItem.

     The file is read, decoded as UTF-8 and handed to PyQuery. The
     follow / fans / weibo counters come from ``strong`` elements whose
     ``node-type`` attribute matches; name, verified-info, intro and tags
     come from their respective selectors.

     Returns:
         The populated FigureItem, or ``None`` (after printing
         ``'file not exists'``) when the file is absent.
     """
     fd = '../BigVs/' + str(uid)
     if not os.path.exists(fd):
         print('file not exists')
         return None

     # Open the path already computed above rather than concatenating again.
     with open(fd, 'r') as f:
         rawdoc = f.read()

     d = PyQuery(rawdoc.decode('utf-8'))
     fg = FigureItem()

     strong = d('strong')
     fg.follow = strong.filter(lambda i, this: PyQuery(this).attr('node-type') == 'follow').text()
     fg.fans = strong.filter(lambda i, this: PyQuery(this).attr('node-type') == 'fans').text()
     fg.weibo = strong.filter(lambda i, this: PyQuery(this).attr('node-type') == 'weibo').text()

     fg.name = d('span').filter('.name').text()
     # With no verified-info content there is no element 0; use '' rather
     # than letting IndexError abort the parse.
     try:
         fg.verify = d('.pf_verified_info').contents()[0]
     except IndexError:
         fg.verify = ''
     fg.intro = d('.pf_intro').text()

     for i in d('.layer_menulist_tags').items('a'):
         fg.tags.append(i.text())

     return fg