def _parseHeadinfo(self, doc):
    """Build a FigureItem from a profile page and the host-info page.

    The JSON payloads passed to ``FM.view(...)`` inside ``<script>`` tags of
    *doc* are decoded; the one whose ``ns`` is ``pl.header.head.index``
    supplies the header HTML, which is parsed with PyQuery.  The registration
    date comes from a second document fetched through ``self.remoteReader``.

    Args:
        doc: Raw page text containing the ``FM.view`` script blocks.

    Returns:
        The populated FigureItem, or None when ``fg.isValid()`` is false.

    Raises:
        Exception: If no payload with the header namespace is found.
        AttributeError: If one of the follow/fans/weibo masks does not match
            the header HTML (``re.search`` returns None).
    """
    fg = FigureItem()

    # Each FM.view(...) argument is a JSON document; decode all of them.
    payloads = [
        json.loads(raw)
        for raw in re.findall(r'<script>FM\.view\((.*)\);?</script>', doc)
    ]
    if not payloads:
        # NOTE(review): the message names _fetch_manload although it is
        # emitted from _parseHeadinfo; kept unchanged in case logs are grepped.
        print('_fetch_manload: raw doc parse error')

    # Locate the header payload; the for/else raises when none matched.
    for payload in payloads:
        if 'ns' in payload and payload['ns'] == 'pl.header.head.index':
            strimdata = payload['html']
            d = PyQuery(strimdata)
            break
    else:
        raise Exception('_parseHeadinfo error')

    info = self.remoteReader.getDoc(
        self.remoteReader.makeUrl_hostinfo(self.uid))
    # Non-greedy skip so the date captured is the first one following the
    # label (e.g. 2012-07-06), not the last date anywhere in the document.
    m = re.search(r'注册时间[.\s\S]+?(\d{4})-(\d{2})-(\d{2})', info)
    if m:
        # mktime interprets the parsed date as local time.
        established = time.mktime(
            time.strptime('-'.join(m.groups()), '%Y-%m-%d'))
    else:
        # 0 when the registration label/date is not found.
        established = 0

    fg.uid = self.uid
    fg.domainid = self.remoteReader.domain
    fg.establish = established
    fg.follow = re.search(self.followmask, strimdata).group(1)
    fg.fans = re.search(self.fansmask, strimdata).group(1)
    fg.weibo = re.search(self.weibomask, strimdata).group(1)

    # Prefer the span.name text; fall back to the strong.W_f20.W_Yahei text.
    primary_name = d('span').filter('.name').text()
    fallback_name = d('strong').filter('.W_f20.W_Yahei').text()
    fg.name = primary_name or fallback_name

    try:
        fg.verify = d('.pf_verified_info').contents()[0]
    except Exception:
        # Best effort: a missing verified-info node leaves verify empty.
        fg.verify = ''

    fg.intro = d('.pf_intro').text()
    for anchor in d('.layer_menulist_tags').items('a'):
        fg.tags.append(anchor.text())

    if not fg.isValid():
        print(' - Thread {0} weibo figure info not enough'.format(self.no))
        return None
    return fg
def itemCast(self, row):
    """Convert a positional row into a FigureItem.

    Positions 0 through 6 of *row* are copied, in order, to the attributes
    ``uid``, ``domainid``, ``name``, ``follow``, ``fans``, ``weibo`` and
    ``establish`` (presumably a database result row; verify against caller).

    Args:
        row: Indexable sequence with at least seven elements.

    Returns:
        A new FigureItem carrying the seven values.
    """
    # Attribute names listed in the order of their column index in the row.
    column_attrs = (
        'uid',
        'domainid',
        'name',
        'follow',
        'fans',
        'weibo',
        'establish',
    )
    figure = FigureItem()
    for position, attr in enumerate(column_attrs):
        setattr(figure, attr, row[position])
    return figure