def get_topic(self, topic_id):
    """
    Fetch a single group-topic page and extract its details.

    :param topic_id: the topic's numeric ID
    :return: dict with topic, author and group fields
    """
    page = self.api.xml(API_GROUP_GET_TOPIC % topic_id)

    node_from = page.xpath('//div[@class="topic-doc"]//span[@class="from"]/a')[0]
    author_url = node_from.get('href')

    node_group = page.xpath('//div[@id="g-side-info-member"]')[0]
    node_group_link = node_group.xpath('.//div[@class="title"]/a')[0]
    # strip any query string so the trailing path segment is the group alias
    group_url = node_group_link.get('href').split('?', 1)[0]

    return {
        'id': topic_id,
        'title': page.xpath('//div[@id="content"]/h1/text()')[0].strip(),
        'content': page.xpath('//div[@class="topic-content"]')[0].xpath(
            'string()').strip(),
        'created_at': page.xpath(
            '//div[@class="topic-doc"]//span[@class="color-green"]/text()')[0],
        'author_avatar': page.xpath('//img[@class="pil"]/@src')[0],
        'author_nickname': node_from.text,
        'author_alias': slash_right(author_url),
        'author_url': author_url,
        'group_icon': node_group.xpath('.//img/@src')[0],
        'group_name': node_group.xpath('.//div[@class="title"]/a/text()')[0],
        'group_alias': slash_right(group_url),
        'group_url': group_url,
    }
def list_rev_contacts(self, user_alias=None, start=0):
    """
    List followers ("reverse contacts") — the session user's when
    *user_alias* is None, otherwise the given user's public list.

    :param user_alias: target user alias, or None for the current session user
    :param start: paging offset
    :return: paged list result
    """
    results = []
    if user_alias is None:
        xml = self.api.xml(API_PEOPLE_LIST_REV_CONTACTS,
                           params={'start': start})
        for entry in xml.xpath('//ul[@class="user-list"]/li'):
            try:
                link = entry.xpath('.//a/@href')[0]
                parsed = {
                    'avatar': entry.xpath('.//img/@src')[0],
                    'nickname': entry.xpath('.//img/@alt')[0],
                    'url': link,
                    'alias': slash_right(link),
                    'city': None,
                    'contact_count': 0,
                    'rev_contact_count': 0,
                }
                # The info paragraph packs city + follower/following counts
                # into one free-text blob; pull them out with a regex.
                info = entry.xpath('.//div[@class="info"]/p')[0].xpath(
                    'string(.)').strip()
                found = re.search(r'(.+)被[^\d]*(\d+).*关注[^\d]*(\d+)',
                                  info, re.S)
                if found:
                    city, contacts, rev_contacts = found.groups()
                    parsed['city'] = city.strip()
                    parsed['contact_count'] = int(contacts.strip())
                    parsed['rev_contact_count'] = int(rev_contacts.strip())
                results.append(parsed)
            except Exception as e:
                self.api.logger.exception(
                    'parse rev contact list error: %s' % e)
    else:
        xml = self.api.xml(API_PEOPLE_LIST_USER_REV_CONTACTS % user_alias,
                           params={'start': start})
        # Other users' follower pages use a compact card layout with
        # fewer fields available.
        for entry in xml.xpath('//dl[@class="obu"]'):
            try:
                link = entry.xpath('.//a/@href')[0]
                results.append({
                    'avatar': entry.xpath('.//img/@src')[0],
                    'nickname': entry.xpath('.//img/@alt')[0],
                    'url': link,
                    'alias': slash_right(link),
                })
            except Exception as e:
                self.api.logger.exception(
                    'parse rev contact list error: %s' % e)
    return build_list_result(results, xml)
def list_comments(self, topic_id, start=0):
    """
    List replies under a topic (one page).

    :param topic_id: topic ID
    :param start: paging offset
    :return: list result with next-page info
    """
    xml = self.api.xml(API_GROUP_GET_TOPIC % topic_id,
                       params={'start': start})
    results = []
    for node in xml.xpath('//ul[@id="comments"]/li'):
        try:
            url = node.xpath('.//div[@class="user-face"]/a/@href')[0]
            # Serialize the reply paragraph so embedded markup survives.
            raw = etree.tostring(
                node.xpath('.//div[@class="reply-doc content"]/p')[0]
            ).decode('utf8').strip()
            results.append({
                'id': node.get('id'),
                'author_avatar': node.xpath('.//img/@src')[0],
                'author_url': url,
                'author_alias': slash_right(url),
                'author_signature': node.xpath('.//h4/text()')[1].strip(),
                'author_nickname': node.xpath('.//h4/a/text()')[0].strip(),
                'created_at': node.xpath('.//h4/span/text()')[0].strip(),
                'content': unescape(raw),
            })
        except Exception as e:
            self.api.logger.exception('parse comment exception: %s' % e)
    return build_list_result(results, xml)
def list_contacts(self, user_alias=None, start=0):
    """
    List followings ("contacts") — the session user's when *user_alias*
    is None, otherwise the given user's public list.

    :param user_alias: target user alias, or None for the current session user
    :param start: paging offset
    :return: paged list result
    """
    results = []
    if user_alias is None:
        xml = self.api.xml(API_PEOPLE_LIST_CONTACTS,
                           params={'start': start})
        for entry in xml.xpath('//ul[@class="user-list"]/li'):
            try:
                link = entry.xpath('.//a/@href')[0]
                results.append({
                    'avatar': entry.xpath('.//img/@src')[0],
                    'nickname': entry.xpath('.//img/@alt')[0],
                    'url': link,
                    'alias': slash_right(link),
                    # [3:] drops the label prefix preceding the value
                    'city': entry.xpath('.//span[@class="loc"]/text()')[0][3:],
                    'signature': entry.xpath(
                        './/span[@class="signature"]/text()')[0][3:],
                })
            except Exception as e:
                self.api.logger.exception('parse contact error: %s' % e)
    else:
        xml = self.api.xml(API_PEOPLE_LIST_USER_CONTACTS % user_alias,
                           params={'start': start})
        # Other users' contact pages use a compact card layout with
        # fewer fields available.
        for entry in xml.xpath('//dl[@class="obu"]'):
            try:
                link = entry.xpath('.//a/@href')[0]
                results.append({
                    'avatar': entry.xpath('.//img/@src')[0],
                    'nickname': entry.xpath('.//img/@alt')[0],
                    'url': link,
                    'alias': slash_right(link),
                })
            except Exception as e:
                self.api.logger.exception('parse contact error: %s' % e)
    return build_list_result(results, xml)
def flush(self):
    """
    Refresh session state — mainly the ck cookie and user_alias —
    from the account home page.
    """
    # No login cookie at all: nothing to refresh.
    if 'dbcl2' not in self.cookies:
        return
    resp = self.req(API_ACCOUNT_HOME)
    if RE_SESSION_EXPIRE.search(resp.url):
        return self.expire()
    self.cookies.update(dict(resp.cookies))
    # The home page redirects to the user's own URL; its last path
    # segment is the alias.
    self.user_alias = slash_right(resp.url)
    self.logger.debug('flush with user_alias <%s>' % self.user_alias)
def list_comments(self, topic_id, start=0):
    """
    Crawl ALL comment pages of a topic, dumping each 100-comment page as
    markdown under ./md/<topic_id>/ and as JSON under ./json/<topic_id>/
    in the current working directory.

    :param topic_id: topic ID
    :param start: paging offset to begin crawling from
    :return: dict with 'results' (every parsed comment) and 'count'
        (start offset plus comments seen — seeded with the offset for
        backward compatibility with existing callers)
    """
    import json
    import os
    import random
    import time

    all_results = []
    start = int(start)
    comments_count = start  # NOTE: intentionally seeded with the offset

    output_folder = os.getcwd()
    output_md = os.path.join(output_folder, 'md', str(topic_id))
    if not os.path.exists(output_md):
        os.makedirs(output_md)
    output_json = os.path.join(output_folder, 'json', str(topic_id))
    if not os.path.exists(output_json):
        os.makedirs(output_json)

    while True:
        xml = self.api.xml(API_GROUP_GET_TOPIC % topic_id,
                           params={'start': start})
        results = []
        for item in xml.xpath('//ul[@id="comments"]/li'):
            try:
                author_avatar = item.xpath('.//img/@src')[0]
                author_url = item.xpath(
                    './/div[@class="user-face"]/a/@href')[0]
                author_alias = slash_right(author_url)
                author_signature = item.xpath('.//h4/text()')[1].strip()
                author_nickname = item.xpath('.//h4/a/text()')[0].strip()
                # Optional fields: xpath returns an empty list when the
                # node is absent, so fall back to "".
                topic_author = item.xpath(
                    './/h4/span[@class="topic-author-icon"]/text()')
                topic_author = topic_author[0].strip() if topic_author else ""
                created_at = item.xpath(
                    './/h4/span[@class="pubtime"]/text()')[0].strip()
                reply_to_url = item.xpath(
                    './/div[@class="reply-quote-content"]/span[@class="pubdate"]/a/@href')
                reply_to_url = reply_to_url[0].strip() if reply_to_url else ""
                reply_to = item.xpath(
                    './/div[@class="reply-quote-content"]/span[@class="pubdate"]/a/text()')
                reply_to = reply_to[0].strip() if reply_to else ""
                reply_quote_content = item.xpath(
                    './/div[@class="reply-quote-content"]/span/text()')
                reply_quote_content = (reply_quote_content[0].strip()
                                       if reply_quote_content else "")
                content = item.xpath(
                    './/div[@class="reply-doc content"]/p/text()')
                content = content[0].strip() if content else ""
                img = item.xpath('.//img/@data-orig')
                img = img[0].strip() if img else ""
                cid = item.get('id')
                results.append({
                    'id': cid,
                    'author_avatar': author_avatar,
                    'author_url': author_url,
                    'author_alias': author_alias,
                    'author_signature': author_signature,
                    'author_nickname': author_nickname,
                    'topic_author': topic_author,
                    'created_at': created_at,
                    'reply_to_url': reply_to_url,
                    'reply_to': reply_to,
                    'reply_quote_content': reply_quote_content,
                    'content': unescape(content),
                    'img': img,
                })
                # Render this comment as a markdown list item.
                out_content = ''
                out_content += '* [![{}]({})]({})'.format(
                    author_nickname, author_avatar, author_url)
                out_content += ' ' + '[{}]({})'.format(
                    author_nickname, author_url) + (
                        ' ' + topic_author if topic_author else ''
                    ) + ' ' + created_at + ' \n'
                if reply_to:
                    out_content += ' >' + reply_quote_content.replace(
                        '\n', ' \n >') + ' \n'
                    out_content += ' >\n'
                    out_content += ' >-- ' + '[{}]({})'.format(
                        reply_to, reply_to_url) + ' \n'
                    out_content += ' \n'
                if img:
                    out_content += ' ![{}]({})'.format('', img) + ' \n'
                out_content += ' ' + content.replace('\n', ' \n ') + ' \n'
                out_content += '---\n'
                md_path = os.path.join(
                    output_md,
                    'comments{}-{}.md'.format(start + 1, start + 100))
                # FIX: use a context manager so the handle is closed even
                # when a later comment raises.
                with open(md_path, 'ab') as md_file:
                    md_file.write(out_content.encode())
            except Exception as e:
                self.api.logger.exception('parse comment exception: %s' % e)
        all_results.extend(results)
        list_results = build_list_result(results, xml)
        comments_count += int(list_results['count'])
        json_path = os.path.join(
            output_json,
            'comments{}-{}.json'.format(start + 1, start + 100))
        with open(json_path, 'w+') as json_file:
            # Reuse list_results instead of rebuilding it a second time.
            json_file.write(json.dumps(list_results, indent=2))
        # FIX: the original only stopped when 0 < count < 100 and no next
        # link existed; a final page with 0 (or exactly 100) comments and
        # no next link re-fetched the same offset forever. Stop whenever
        # there is no next page.
        if not list_results['next_start']:
            break
        start = int(list_results['next_start'])
        # Polite crawl delay between page fetches.
        time.sleep(10 * random.uniform(1.5, 2.5))
    return {'results': all_results, 'count': comments_count}
def _parse_topic_table(self, xml, tds='title,created,comment,group',
                       selector='//table[@class="olt"]//tr'):
    """
    Parse a topic listing table.

    :internal
    :param xml: page XML
    :param tds: comma-separated meaning of each column; any of
        title, created, comment, group, updated, author, time, rec
    :param selector: XPath locating the table rows in the page
    :return: list of dicts, one per successfully parsed row
    """
    results = []
    columns = tds.split(',')
    for row in xml.xpath(selector):
        try:
            result = {}
            index = 0  # 1-based td position, advanced per declared column
            for td in columns:
                index += 1
                if td == 'title':
                    xml_title = row.xpath('.//td[position()=%s]/a' % index)[0]
                    url = xml_title.get('href')
                    tid = int(slash_right(url))
                    title = xml_title.text
                    result.update({'id': tid, 'url': url, 'title': title})
                elif td == 'created':
                    # Some layouts wrap the timestamp in a link.
                    xml_created = row.xpath('.//td[position()=%s]/a' % index) \
                        or row.xpath('.//td[position()=%s]' % index)
                    result['created_at'] = xml_created[0].get('title')
                elif td == 'comment':
                    xml_comment = row.xpath('.//td[position()=%s]/span' % index) \
                        or row.xpath('.//td[position()=%s]' % index)
                    # Cell text is like "12回应"; take the leading digits.
                    result['comment_count'] = int(
                        re.match(r'\d+', xml_comment[0].text).group())
                elif td == 'group':
                    xml_group = row.xpath('.//td[position()=%s]/a' % index)[0]
                    group_url = xml_group.get('href')
                    result.update({
                        'group_alias': slash_right(group_url),
                        'group_url': group_url,
                        'group_name': xml_group.text,
                    })
                elif td == 'author':
                    xml_author = row.xpath('.//td[position()=%s]/a' % index)[0]
                    author_url = xml_author.get('href')
                    result.update({
                        'author_url': author_url,
                        'author_alias': slash_right(author_url),
                        'author_nickname': xml_author.text,
                    })
                elif td == 'updated':
                    result['updated_at'] = row.xpath(
                        './/td[position()=%s]/text()' % index)[0]
                elif td == 'time':
                    result['time'] = row.xpath(
                        './/td[position()=%s]/text()' % index)[0]
                elif td == 'rec':
                    # The remove-recommendation link sits in the previous
                    # cell, hence (index - 1).
                    xml_rec = row.xpath(
                        './/td[position()=%s]//a[@class="lnk-remove"]/@href'
                        % (index - 1))[0]
                    result['rec_id'] = re.search(r'rec_id=(\d+)',
                                                 xml_rec).groups()[0]
            results.append(result)
        except Exception as e:
            # FIX: was `self.api.api.logger`, inconsistent with sibling
            # methods that log via `self.api.logger`; the doubled `.api`
            # would raise AttributeError when a row fails to parse.
            # TODO(review): confirm this class holds the client at
            # `self.api` like its siblings.
            self.api.logger.exception(
                'parse topic table exception: %s' % e)
    return results