Example #1
File: Group.py  Project: tengfens/dbapi
 def get_topic(self, topic_id):
     xml = self.api.xml(API_GROUP_GET_TOPIC % topic_id)
     txt_title = xml.xpath('//div[@id="content"]/h1/text()')[0].strip()
     txt_content = xml.xpath('//div[@class="topic-content"]')[0].xpath(
         'string()').strip()
     txt_author_avatar = xml.xpath('//img[@class="pil"]/@src')[0]
     xml_from = xml.xpath(
         '//div[@class="topic-doc"]//span[@class="from"]/a')[0]
     txt_created_at = xml.xpath(
         '//div[@class="topic-doc"]//span[@class="color-green"]/text()')[0]
     xml_group = xml.xpath('//div[@id="g-side-info-member"]')[0]
     txt_group_avatar = xml_group.xpath('.//img/@src')[0]
     xml_group_title = xml_group.xpath('.//div[@class="title"]/a')[0]
     txt_author_url = xml_from.get('href')
     txt_group_url = xml_group_title.get('href').split('?', 1)[0]
     return {
         'id': topic_id,
         'title': txt_title,
         'content': txt_content,
         'created_at': txt_created_at,
         'author_avatar': txt_author_avatar,
         'author_nickname': xml_from.text,
         'author_alias': slash_right(txt_author_url),
         'author_url': txt_author_url,
         'group_icon': txt_group_avatar,
         'group_name': xml_group.xpath('.//div[@class="title"]/a/text()')[0],
         'group_alias': slash_right(txt_group_url),
         'group_url': txt_group_url,
     }
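
The XPath calls above follow the usual lxml pattern: select nodes, then read text nodes or attributes, and use xpath('string()') when the flattened text of a whole element is wanted. A minimal self-contained sketch of that technique; the markup below is made up for illustration and is not Douban's actual page structure:

from lxml import etree

# Made-up markup that mimics the structure the queries above expect.
html = '''
<div id="content"><h1> Sample title </h1></div>
<div class="topic-content"><p>First paragraph.</p><p>Second paragraph.</p></div>
<img class="pil" src="https://example.com/avatar.jpg"/>
'''
xml = etree.HTML(html)

# text() yields individual text nodes; string() flattens all descendant text.
title = xml.xpath('//div[@id="content"]/h1/text()')[0].strip()
content = xml.xpath('//div[@class="topic-content"]')[0].xpath('string()').strip()
avatar = xml.xpath('//img[@class="pil"]/@src')[0]
print(title, content, avatar)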
Example #2
 def list_rev_contacts(self, user_alias=None, start=0):
     results = []
     if user_alias is None:
         xml = self.api.xml(API_PEOPLE_LIST_REV_CONTACTS,
                            params={'start': start})
         for item in xml.xpath('//ul[@class="user-list"]/li'):
             try:
                 avatar = item.xpath('.//img/@src')[0]
                 nickname = item.xpath('.//img/@alt')[0]
                 url = item.xpath('.//a/@href')[0]
                 alias = slash_right(url)
                 xml_info = item.xpath('.//div[@class="info"]/p')[0].xpath(
                     'string(.)').strip()
                 match = re.search(r'(.+)被[^\d]*(\d+).*关注[^\d]*(\d+)',
                                   xml_info, re.S)
                 city = None
                 contact_count = 0
                 rev_contact_count = 0
                 if match:
                     groups = match.groups()
                     city = groups[0].strip()
                     contact_count = int(groups[1].strip())
                     rev_contact_count = int(groups[2].strip())
                 results.append({
                     'avatar': avatar,
                     'nickname': nickname,
                     'url': url,
                     'alias': alias,
                     'city': city,
                     'contact_count': contact_count,
                     'rev_contact_count': rev_contact_count,
                 })
             except Exception as e:
                 self.api.logger.exception(
                     'parse rev contact list error: %s' % e)
     else:
         xml = self.api.xml(API_PEOPLE_LIST_USER_REV_CONTACTS % user_alias,
                            params={'start': start})
         for item in xml.xpath('//dl[@class="obu"]'):
             try:
                 avatar = item.xpath('.//img/@src')[0]
                 nickname = item.xpath('.//img/@alt')[0]
                 url = item.xpath('.//a/@href')[0]
                 alias = slash_right(url)
                 results.append({
                     'avatar': avatar,
                     'nickname': nickname,
                     'url': url,
                     'alias': alias,
                 })
             except Exception as e:
                 self.api.logger.exception(
                     'parse rev contact list error: %s' % e)
     return build_list_result(results, xml)
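
The regular expression above pulls the city and the two counts out of the flattened profile text. A quick self-contained illustration; the sample string is hypothetical and only mimics the shape of that text:

import re

info = '北京 被120人关注, 关注了45人'  # made-up text in the expected shape
match = re.search(r'(.+)被[^\d]*(\d+).*关注[^\d]*(\d+)', info, re.S)
if match:
    city, contact_count, rev_contact_count = match.groups()
    print(city.strip(), int(contact_count), int(rev_contact_count))  # 北京 120 45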
Example #3
 def list_comments(self, topic_id, start=0):
     """
     回复列表
     
     :param topic_id: 话题ID
     :param start: 翻页
     :return: 带下一页的列表
     """
     xml = self.api.xml(API_GROUP_GET_TOPIC % topic_id, params={'start': start})
     xml_results = xml.xpath('//ul[@id="comments"]/li')
     results = []
     for item in xml_results:
         try:
             author_avatar = item.xpath('.//img/@src')[0]
             author_url = item.xpath('.//div[@class="user-face"]/a/@href')[0]
             author_alias = slash_right(author_url)
             author_signature = item.xpath('.//h4/text()')[1].strip()
             author_nickname = item.xpath('.//h4/a/text()')[0].strip()
             created_at = item.xpath('.//h4/span/text()')[0].strip()
             content = etree.tostring(
                 item.xpath('.//div[@class="reply-doc content"]/p')[0]
             ).decode('utf8').strip()
             cid = item.get('id')
             results.append({
                 'id': cid,
                 'author_avatar': author_avatar,
                 'author_url': author_url,
                 'author_alias': author_alias,
                 'author_signature': author_signature,
                 'author_nickname': author_nickname,
                 'created_at': created_at,
                 'content': unescape(content),
             })
         except Exception as e:
             self.api.logger.exception('parse comment exception: %s' % e)
     return build_list_result(results, xml)
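
The content field keeps the reply's inner HTML: etree.tostring serializes the p element, and unescape (taken here to be html.unescape) turns entities such as &amp; back into plain characters. A self-contained illustration with example markup:

from html import unescape
from lxml import etree

p = etree.HTML('<p>Fish &amp; chips <a href="#">link</a></p>').xpath('//p')[0]
raw = etree.tostring(p).decode('utf8').strip()  # inner HTML, entities still escaped
print(unescape(raw))                            # entities decoded back to plain text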
Example #4
 def list_contacts(self, user_alias=None, start=0):
     results = []
     if user_alias is None:
         xml = self.api.xml(API_PEOPLE_LIST_CONTACTS,
                            params={'start': start})
         for item in xml.xpath('//ul[@class="user-list"]/li'):
             try:
                 avatar = item.xpath('.//img/@src')[0]
                 nickname = item.xpath('.//img/@alt')[0]
                 url = item.xpath('.//a/@href')[0]
                 alias = slash_right(url)
                 city = item.xpath('.//span[@class="loc"]/text()')[0][3:]
                 signature = item.xpath(
                     './/span[@class="signature"]/text()')[0][3:]
                 results.append({
                     'avatar': avatar,
                     'nickname': nickname,
                     'url': url,
                     'alias': alias,
                     'city': city,
                     'signature': signature,
                 })
             except Exception as e:
                 self.api.logger.exception('parse contact error: %s' % e)
     else:
         xml = self.api.xml(API_PEOPLE_LIST_USER_CONTACTS % user_alias,
                            params={'start': start})
         for item in xml.xpath('//dl[@class="obu"]'):
             try:
                 avatar = item.xpath('.//img/@src')[0]
                 nickname = item.xpath('.//img/@alt')[0]
                 url = item.xpath('.//a/@href')[0]
                 alias = slash_right(url)
                 results.append({
                     'avatar': avatar,
                     'nickname': nickname,
                     'url': url,
                     'alias': alias,
                 })
             except Exception as e:
                 self.api.logger.exception('parse contact error: %s' % e)
     return build_list_result(results, xml)
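
slash_right itself is not part of this listing; judging from how it is used, it returns the last path segment of a URL, which doubles as the user or group alias. A plausible stand-in, purely for illustration:

def slash_right(url):
    # Hypothetical implementation: last non-empty path segment of the URL,
    # e.g. 'https://www.douban.com/people/some-alias/' -> 'some-alias'.
    return url.rstrip('/').rsplit('/', 1)[-1]

print(slash_right('https://www.douban.com/people/some-alias/'))  # some-alias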
Example #5
 def flush(self):
     """
     更新会话信息,主要是ck, user_alias
     """
     if 'dbcl2' not in self.cookies:
         return
     r = self.req(API_ACCOUNT_HOME)
     if RE_SESSION_EXPIRE.search(r.url):
         return self.expire()
     self.cookies.update(dict(r.cookies))
     self.user_alias = slash_right(r.url)
     self.logger.debug('flush with user_alias <%s>' % self.user_alias)
     return
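
The flush logic follows a common requests pattern: fetch the account home page, treat a redirect to the login page as an expired session, and otherwise merge the cookies set on the response. A generic sketch of that idea; the URL and the RE_SESSION_EXPIRE pattern below are stand-ins, not the project's actual values:

import re
import requests

RE_SESSION_EXPIRE = re.compile(r'accounts/login|passport')  # assumed pattern
session = requests.Session()
r = session.get('https://www.douban.com/mine/')  # stand-in for API_ACCOUNT_HOME
if RE_SESSION_EXPIRE.search(r.url):
    print('session expired, re-login required')
else:
    session.cookies.update(dict(r.cookies))  # refresh any updated cookies
    print('logged-in home resolved to', r.url)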
Example #6
File: Group.py  Project: tengfens/dbapi
 def list_comments(self, topic_id, start=0):
     """
     回复列表
     
     :param topic_id: 话题ID
     :param start: 翻页
     :return: 带下一页的列表
     """
     all_results = []
     start = int(start)
     comments_count = start
     import json
     import os
     import random
     import time
     output_folder = os.getcwd()
     output_md = os.path.join(output_folder, 'md', str(topic_id))
     if not os.path.exists(output_md):
         os.makedirs(output_md)
     output_json = os.path.join(output_folder, 'json', str(topic_id))
     if not os.path.exists(output_json):
         os.makedirs(output_json)
     while True:
         xml = self.api.xml(API_GROUP_GET_TOPIC % topic_id,
                            params={'start': start})
         xml_results = xml.xpath('//ul[@id="comments"]/li')
         results = []
         for item in xml_results:
             try:
                 author_avatar = item.xpath('.//img/@src')[0]
                 author_url = item.xpath(
                     './/div[@class="user-face"]/a/@href')[0]
                 author_alias = slash_right(author_url)
                 author_signature = item.xpath('.//h4/text()')[1].strip()
                 author_nickname = item.xpath('.//h4/a/text()')[0].strip()
                 topic_author = item.xpath(
                     './/h4/span[@class="topic-author-icon"]/text()')
                 topic_author = "" if len(
                     topic_author) == 0 else topic_author[0].strip()
                 created_at = item.xpath(
                     './/h4/span[@class="pubtime"]/text()')[0].strip()
                 reply_to_url = item.xpath(
                     './/div[@class="reply-quote-content"]/span[@class="pubdate"]/a/@href'
                 )
                 reply_to_url = "" if len(
                     reply_to_url) == 0 else reply_to_url[0].strip()
                 reply_to = item.xpath(
                     './/div[@class="reply-quote-content"]/span[@class="pubdate"]/a/text()'
                 )
                 reply_to = "" if len(
                     reply_to) == 0 else reply_to[0].strip()
                 reply_quote_content = item.xpath(
                     './/div[@class="reply-quote-content"]/span/text()')
                 reply_quote_content = "" if len(
                     reply_quote_content
                 ) == 0 else reply_quote_content[0].strip()
                 content = item.xpath(
                     './/div[@class="reply-doc content"]/p/text()')
                 content = "" if len(content) == 0 else content[0].strip()
                 img = item.xpath('.//img/@data-orig')
                 img = "" if len(img) == 0 else img[0].strip()
                 cid = item.get('id')
                 results.append({
                     'id': cid,
                     'author_avatar': author_avatar,
                     'author_url': author_url,
                     'author_alias': author_alias,
                     'author_signature': author_signature,
                     'author_nickname': author_nickname,
                     'topic_author': topic_author,
                     'created_at': created_at,
                     'reply_to_url': reply_to_url,
                     'reply_to': reply_to,
                     'reply_quote_content': reply_quote_content,
                     'content': unescape(content),
                     'img': img,
                 })
                 out_content = ''
                 out_content += '* [![{}]({})]({})'.format(
                     author_nickname, author_avatar, author_url)
                 out_content += '    ' + '[{}]({})'.format(
                     author_nickname,
                     author_url) + ('    ' + topic_author if topic_author
                                    else '') + '    ' + created_at + '  \n'
                 if reply_to:
                     out_content += '  >' + reply_quote_content.replace(
                         '\n', '  \n  >') + '  \n'
                     out_content += '  >\n'
                     out_content += '  >-- ' + '[{}]({})'.format(
                         reply_to, reply_to_url) + '  \n'
                     out_content += '  \n'
                 if img:
                     out_content += '  ![{}]({})'.format('', img) + '  \n'
                 out_content += '  ' + content.replace('\n',
                                                       '  \n  ') + '  \n'
                 out_content += '---\n'
                 # Append this comment to the Markdown file for the current page.
                 md_path = os.path.join(
                     output_md,
                     'comments{}-{}.md'.format(start + 1, start + 100))
                 with open(md_path, 'ab') as md_file:
                     md_file.write(out_content.encode())
             except Exception as e:
                 self.api.logger.exception('parse comment exception: %s' %
                                           e)
         all_results.extend(results)
         list_results = build_list_result(results, xml)
         comments_count += int(list_results['count'])
         # Dump this page's parsed results (including pagination info) as JSON.
         json_path = os.path.join(
             output_json,
             'comments{}-{}.json'.format(start + 1, start + 100))
         with open(json_path, 'w+') as json_file:
             json_file.write(json.dumps(list_results, indent=2))
         if list_results['next_start']:
             start = int(list_results['next_start'])
         elif list_results['count'] < 100 and list_results['count'] > 0:
             break
         time.sleep(10 * random.uniform(1.5, 2.5))
     return {'results': all_results, 'count': comments_count}
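
To make the Markdown assembly easier to follow, this is roughly what out_content becomes for one made-up comment with no quoted reply and no image: a list item with a linked avatar, the nickname and timestamp, the indented body (the trailing double spaces force Markdown line breaks), and a closing '---' rule. Sample values only:

author_nickname = 'alice'
author_avatar = 'https://example.com/a.jpg'
author_url = 'https://example.com/people/alice/'
created_at = '2020-01-01 12:00:00'
content = 'first line\nsecond line'

out_content = '* [![{}]({})]({})'.format(author_nickname, author_avatar, author_url)
out_content += '    ' + '[{}]({})'.format(author_nickname, author_url)
out_content += '    ' + created_at + '  \n'
out_content += '  ' + content.replace('\n', '  \n  ') + '  \n'
out_content += '---\n'
print(out_content)  # one Markdown list item followed by a horizontal rule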
Example #7
File: Group.py  Project: tengfens/dbapi
 def _parse_topic_table(self,
                        xml,
                        tds='title,created,comment,group',
                        selector='//table[@class="olt"]//tr'):
     """
     解析话题列表
     
     :internal
     :param xml: 页面XML 
     :param tds: 每列的含义,可以是title, created, comment, group, updated, author, time, rec
     :param selector: 表在页面中的位置
     :return: 
     """
     xml_results = xml.xpath(selector)
     results = []
     tds = tds.split(',')
     for item in xml_results:
         try:
             result = {}
             index = 0
             for td in tds:
                 index += 1
                 if td == 'title':
                     xml_title = item.xpath('.//td[position()=%s]/a' %
                                            index)[0]
                     url = xml_title.get('href')
                     tid = int(slash_right(url))
                     title = xml_title.text
                     result.update({'id': tid, 'url': url, 'title': title})
                 elif td == 'created':
                     xml_created = item.xpath('.//td[position()=%s]/a' % index) \
                                   or item.xpath('.//td[position()=%s]' % index)
                     created_at = xml_created[0].get('title')
                     result['created_at'] = created_at
                 elif td == 'comment':
                     xml_comment = item.xpath('.//td[position()=%s]/span' % index) \
                                   or item.xpath('.//td[position()=%s]' % index)
                     comment_count = int(
                         re.match(r'\d+', xml_comment[0].text).group())
                     result['comment_count'] = comment_count
                 elif td == 'group':
                     xml_group = item.xpath('.//td[position()=%s]/a' %
                                            index)[0]
                     group_url = xml_group.get('href')
                     group_alias = slash_right(group_url)
                     group_name = xml_group.text
                     result.update({
                         'group_alias': group_alias,
                         'group_url': group_url,
                         'group_name': group_name
                     })
                 elif td == 'author':
                     xml_author = item.xpath('.//td[position()=%s]/a' %
                                             index)[0]
                     author_url = xml_author.get('href')
                     author_alias = slash_right(author_url)
                     author_nickname = xml_author.text
                     result.update({
                         'author_url': author_url,
                         'author_alias': author_alias,
                         'author_nickname': author_nickname,
                     })
                 elif td == 'updated':
                     result['updated_at'] = item.xpath(
                         './/td[position()=%s]/text()' % index)[0]
                 elif td == 'time':
                     result['time'] = item.xpath(
                         './/td[position()=%s]/text()' % index)[0]
                 elif td == 'rec':
                     xml_rec = item.xpath(
                         './/td[position()=%s]//a[@class="lnk-remove"]/@href'
                         % (index - 1))[0]
                     result['rec_id'] = re.search(r'rec_id=(\d+)',
                                                  xml_rec).groups()[0]
             results.append(result)
         except Exception as e:
             self.api.api.logger.exception(
                 'parse topic table exception: %s' % e)
     return results
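
The helper walks the requested column names in order and reads each td cell by its position in the row. A self-contained sketch of the same technique against a made-up table; the markup and values are illustrative only:

import re
from lxml import etree

html = '''
<table class="olt">
  <tr>
    <td><a href="https://example.com/topic/123/">Hello world</a></td>
    <td title="2020-01-01 10:00:00">01-01</td>
    <td><span>42 回应</span></td>
  </tr>
</table>
'''
xml = etree.HTML(html)
for row in xml.xpath('//table[@class="olt"]//tr'):
    for index, td in enumerate(['title', 'created', 'comment'], start=1):
        if td == 'title':
            link = row.xpath('.//td[position()=%s]/a' % index)[0]
            print('url/title:', link.get('href'), link.text)
        elif td == 'created':
            cell = row.xpath('.//td[position()=%s]' % index)[0]
            print('created_at:', cell.get('title'))
        elif td == 'comment':
            span = row.xpath('.//td[position()=%s]/span' % index)[0]
            print('comment_count:', int(re.match(r'\d+', span.text).group()))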