def _parseHeadinfo(self, doc):
    """Parse the header payload embedded in *doc* into a FigureItem.

    The page source is expected to contain
    ``<script>FM.view({...})</script>`` blocks, each wrapping one JSON
    payload.  The payload whose ``ns`` equals ``pl.header.head.index``
    supplies the HTML from which the counters, name, verification text,
    intro and tags are read.  A second document, fetched through
    ``self.remoteReader`` for ``self.uid``, is scanned for the
    registration date.

    Args:
        doc: Page source to scan for ``FM.view`` script blocks.

    Returns:
        The populated FigureItem when its ``isValid()`` accepts it,
        otherwise None (after printing a diagnostic).

    Raises:
        Exception: No payload with ``ns == 'pl.header.head.index'`` exists
            (this includes the case where no script block was found).
        AttributeError: ``self.followmask``, ``self.fansmask`` or
            ``self.weibomask`` does not match the header HTML, so
            ``re.search`` returns None.
        ValueError: A script payload is not valid JSON, or the matched
            registration date is not a real calendar date.
    """
    fg = FigureItem()

    # Raw string: the pattern contains regex escapes (\. and \() that are
    # not valid Python string escapes.
    scripts = re.findall(r'<script>FM\.view\((.*)\);?</script>', doc)
    if not scripts:
        print('_fetch_manload: raw doc parse error')
    payloads = [json.loads(script) for script in scripts]

    # Locate the header payload; the for/else raises when none is present.
    for payload in payloads:
        if 'ns' in payload and payload['ns'] == 'pl.header.head.index':
            strimdata = payload['html']
            d = PyQuery(strimdata)
            break
    else:
        raise Exception('_parseHeadinfo error')

    info = self.remoteReader.getDoc(
        self.remoteReader.makeUrl_hostinfo(self.uid))
    # Non-greedy on purpose: take the first date that follows the
    # registration-time label.  A greedy match runs to the end of the
    # document and backtracks to the *last* date on the page.
    m = re.search(r'注册时间[\s\S]+?(\d{4})-(\d{2})-(\d{2})', info)
    if m:
        # Date format on the page looks like 2012-07-06; mktime interprets
        # the parsed struct_time as local time.
        established = time.mktime(
            time.strptime('%s-%s-%s' % m.groups(), '%Y-%m-%d'))
    else:
        established = 0

    fg.uid = self.uid
    fg.domainid = self.remoteReader.domain
    fg.establish = established
    fg.follow = re.search(self.followmask, strimdata).group(1)
    fg.fans = re.search(self.fansmask, strimdata).group(1)
    fg.weibo = re.search(self.weibomask, strimdata).group(1)

    # Two selectors are tried for the display name; the first non-empty
    # result wins.
    fg.name = (d('span').filter('.name').text()
               or d('strong').filter('.W_f20.W_Yahei').text())

    # Best effort: a missing verification node leaves the field empty.
    # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
    try:
        fg.verify = d('.pf_verified_info').contents()[0]
    except Exception:
        fg.verify = ''

    fg.intro = d('.pf_intro').text()
    for anchor in d('.layer_menulist_tags').items('a'):
        fg.tags.append(anchor.text())

    if not fg.isValid():
        print(' - Thread {0} weibo figure info not enough'.format(self.no))
        return None
    return fg
class FigureFetcher(Fetcher):
    """Fetcher that turns queued uids into recorded FigureItem objects.

    Each queue entry is indexable and its first element is convertible to
    ``int``; that value is the uid whose page is downloaded through
    ``self.remoteReader``, parsed by ``_parseHeadinfo`` and stored through
    ``self.localReader``.
    """

    def __init__(self, queue):
        """Set up the database writer, the current figure and the work queue.

        Args:
            queue: Object providing ``empty()`` and ``get()``; every item
                is indexable with the uid at position 0.
        """
        super(FigureFetcher, self).__init__()
        self.localReader = FigureDatabase()
        # Most recently parsed figure; replaced on every _parseHeadinfo call.
        self.figure = FigureItem()
        self.q = queue

    def run(self):
        """Drain the queue, recording one figure per uid.

        The figure that was actually parsed is recorded.  Figures rejected
        by ``isValid()`` (``_parseHeadinfo`` returns None) are skipped.
        """
        # NOTE(review): empty()-then-get() can block if several consumers
        # share the queue; confirm how many threads drain it.
        while not self.q.empty():
            uid = int(self.q.get()[0])
            doc = self.remoteReader.getDoc(
                self.remoteReader.makeUrl_hostweibo(uid))
            figure = self._parseHeadinfo(doc)
            self.remoteReader.finishFetching()
            if figure is not None:
                self.localReader.record(figure)

    def _parseHeadinfo(self, doc):
        """Parse the header payload embedded in *doc* into ``self.figure``.

        The page source is expected to contain
        ``<script>FM.view({...})</script>`` blocks, each wrapping one JSON
        payload.  The payload whose ``ns`` equals ``pl.header.head.index``
        supplies the HTML from which the counters, name, verification
        text, intro and tags are read.  A second document, fetched through
        ``self.remoteReader`` for ``self.uid``, is scanned for the
        registration date.

        A fresh FigureItem is created on every call so that tags and other
        fields from a previously parsed uid never leak into this one.

        Args:
            doc: Page source to scan for ``FM.view`` script blocks.

        Returns:
            The populated FigureItem (also stored in ``self.figure``) when
            its ``isValid()`` accepts it, otherwise None (after printing a
            diagnostic).

        Raises:
            Exception: No payload with ``ns == 'pl.header.head.index'``
                exists (this includes the case where no script block was
                found).
            AttributeError: ``self.followmask``, ``self.fansmask`` or
                ``self.weibomask`` does not match the header HTML, so
                ``re.search`` returns None.
            ValueError: A script payload is not valid JSON, or the matched
                registration date is not a real calendar date.
        """
        fg = self.figure = FigureItem()

        # Raw string: the pattern contains regex escapes (\. and \() that
        # are not valid Python string escapes.
        scripts = re.findall(r'<script>FM\.view\((.*)\);?</script>', doc)
        if not scripts:
            print('_fetch_manload: raw doc parse error')
        payloads = [json.loads(script) for script in scripts]

        # Locate the header payload; the for/else raises when none exists.
        for payload in payloads:
            if 'ns' in payload and payload['ns'] == 'pl.header.head.index':
                strimdata = payload['html']
                d = PyQuery(strimdata)
                break
        else:
            raise Exception('_parseHeadinfo error')

        # NOTE(review): self.uid and self.remoteReader.uid are not assigned
        # anywhere in this class; confirm that Fetcher / the reader keep
        # them in sync with the uid that run() takes from the queue.
        info = self.remoteReader.getDoc(
            self.remoteReader.makeUrl_hostinfo(self.uid))
        # Non-greedy on purpose: take the first date that follows the
        # registration-time label.  A greedy match runs to the end of the
        # document and backtracks to the *last* date on the page.
        m = re.search(r'注册时间[\s\S]+?(\d{4})-(\d{2})-(\d{2})', info)
        if m:
            # Date format on the page looks like 2012-07-06; mktime
            # interprets the parsed struct_time as local time.
            established = time.mktime(
                time.strptime('%s-%s-%s' % m.groups(), '%Y-%m-%d'))
        else:
            established = 0

        fg.uid = self.remoteReader.uid
        fg.domainid = self.remoteReader.domain
        fg.establish = established
        fg.follow = re.search(self.followmask, strimdata).group(1)
        fg.fans = re.search(self.fansmask, strimdata).group(1)
        fg.weibo = re.search(self.weibomask, strimdata).group(1)

        # Two selectors are tried for the display name; the first
        # non-empty result wins.
        fg.name = (d('span').filter('.name').text()
                   or d('strong').filter('.W_f20.W_Yahei').text())

        # Best effort: a missing verification node leaves the field empty.
        # Exception (not a bare except) so Ctrl-C / SystemExit propagate.
        try:
            fg.verify = d('.pf_verified_info').contents()[0]
        except Exception:
            fg.verify = ''

        fg.intro = d('.pf_intro').text()
        for anchor in d('.layer_menulist_tags').items('a'):
            fg.tags.append(anchor.text())

        if not fg.isValid():
            print(' - Thread {0} weibo figure info not enough'.format(self.no))
            return None
        return fg