def fetch_articles_list(boardName, boardId, page):
    """Fetch one page of a board's article index.

    Args:
        boardName: board's English name, used in the index URL.
        boardId: numeric board id, used to build composite article ids.
        page: page number of the board index to fetch.

    Returns:
        A list of article dicts (possibly empty). Returns [] on request
        failure so callers can iterate the result unconditionally
        (the original returned None here, crashing the caller's loop).
    """
    url = config.base_url + '/bbsdoc.php?board=' + boardName + '&ftype=0&page=' + str(page)
    html = newsm_common.request_get(url, 'GB18030', 20, 10)
    if html is None:
        logger.error('URL request failed: ' + url)
        return []
    # Each index entry is a JS call like:
    # c.o(1,1,'loury','m ',985656622,'[公告]同意开设"Python/Python语言"看版 (转载) ',0,0,0);
    # groups: id, parent id, author, flags, timestamp, title, size, ?, ?
    entry_re = re.compile(
        r'c\.o\((\d+),(\d+),\'([^\']+)\',\'([^\']+)\',(\d+),\'([^\']+)\',(\d+),(\d+),(\d+)\)')
    articles = []
    for line in entry_re.findall(html):
        articles.append({
            '_id': str(boardId) + '.' + line[0],
            'title': line[5].strip(),
            'parent_id': str(boardId) + '.' + line[1],
            'author': line[2].strip(),
            'size': int(line[6]),
            'flag': line[3].strip(),
            'board_name': boardName.strip(),
            'board_id': boardId,
            'created_at': int(line[4]),
        })
    return articles
def browseBoard(name, id, sectionId):
    """Recursively list the boards inside a board directory (folder).

    Args:
        name: directory's English name, used in the URL.
        id: directory's numeric id, recorded as each child's parent_id.
        sectionId: id of the top-level section, propagated to all children.

    Returns:
        A flat list of board dicts, including boards found in nested
        sub-directories. Returns [] when the HTTP request fails.
    """
    url = config.base_url + '/bbsdoc.php?board=' + name
    content = newsm_common.request_get(url, 'GB18030', 20, 10)
    if content is None:
        # Original dereferenced None here; fail soft instead.
        logger.error('URL request failed: ' + url)
        return []
    content = content.replace(u'\u3000', u'')
    boards = []
    # Board entries: o.o(is_folder, group, id, ?, '[section]', 'NAME',
    #                    'chinese name', 'moderators', posts, ?, online)
    board_re = re.compile(
        r'o\.o\((true|false),(\d+),(\d+),(\d+),\'\[([^\]]*)\]\','
        r'\s*\'([^\']+)\',\s*\'([^\']+)\',\s*\'([^\']*)\',(\d+),(\d+),(\d+)\)')
    for line in re.compile(r'o\.o\([^\)]*\)').findall(content):
        match = board_re.match(line)
        if match is None:
            # Unexpected o.o(...) argument shape — skip instead of crashing.
            logger.debug('Unmatched board line: ' + line)
            continue
        board = {
            '_id': int(match.group(3)),
            'name': match.group(6),
            'name2': match.group(7),
            'moderators': match.group(8),
            'section_name': match.group(5),
            'unkown1': int(match.group(4)),
            'unkown2': int(match.group(10)),
            'online': int(match.group(11)),
            'is_folder': match.group(1),
            'post_count': int(match.group(9)),
            'section_id': sectionId,
            'parent_id': id,
        }
        boards.append(board)
        # Directories contain further boards; recurse into them.
        if board['is_folder'] == 'true':
            boards.extend(browseBoard(board['name'], board['_id'], sectionId))
    return boards
def browseSection(id):
    """Recursively list all boards reachable from section *id*.

    Walks the section page: o.f(...) entries are sub-sections (recursed
    into), o.o(...) entries are boards or board directories (directories
    are expanded via browseBoard).

    Args:
        id: numeric section id (0 is the root).

    Returns:
        A flat list of board dicts. Returns [] when the HTTP request fails.
    """
    url = config.base_url + '/bbsfav.php?select=' + str(id) + '&x'
    content = newsm_common.request_get(url, 'GB18030', 20, 10)
    if content is None:
        # Original dereferenced None here; fail soft instead.
        logger.error('URL request failed: ' + url)
        return []
    content = content.replace(u'\u3000', u'')
    boards = []
    # Sub-section entries: o.f(id,'NAME2 DESC',rank,'NAME')
    section_re = re.compile(r'o\.f\((\d+),\'([^\s]*)\s+(.*)\',(\d+),\'(.*)\'\)')
    for line in re.compile(r'o\.f\([^\)]*\)').findall(content):
        match = section_re.match(line)
        if match is None:
            logger.debug('Unmatched section line: ' + line)
            continue
        section = {
            'parent_id': id,
            '_id': int(match.group(1)),
            'name': match.group(5),
            'name2': match.group(2),
            'desc': match.group(3),
            'rank': int(match.group(4)),
        }
        boards.extend(browseSection(section['_id']))
    # Board entries, e.g.:
    # o.o(false,1,1161,23473,'[清华]','CECM.THU','清华土木建管','ghostzb',21767,0,1);
    # fields: is_folder, group, group2, ?, section name, board name,
    #         board Chinese name, moderators (may be empty), posts, ?, online
    board_re = re.compile(
        r'o\.o\((true|false),(\d+),(\d+),(\d+),\'\[([^\]]*)\]\','
        r'\s*\'([^\']+)\',\s*\'([^\']+)\',\s*\'([^\']*)\',(\d+),(\d+),(\d+)\)')
    for line in re.compile(r'o\.o\([^\)]*\)').findall(content):
        match = board_re.match(line)
        if match is None:
            # Unexpected o.o(...) argument shape — skip instead of crashing.
            logger.debug('Unmatched board line: ' + line)
            continue
        board = {
            '_id': int(match.group(3)),
            'name': match.group(6),
            'name2': match.group(7),
            'moderators': match.group(8),
            'section_name': match.group(5),
            'unkown1': int(match.group(4)),
            'unkown2': int(match.group(10)),
            'online': int(match.group(11)),
            'is_folder': match.group(1),
            'post_count': int(match.group(9)),
            'section_id': id,
            'parent_id': 0,
        }
        boards.append(board)
        # Directories contain further boards; expand them.
        if board['is_folder'] == 'true':
            boards.extend(browseBoard(board['name'], board['_id'], id))
    return boards
def fetch_new_articles(board, start_page=0):
    """Crawl a board's index pages (newest first) and store unseen articles.

    Args:
        board: dict with at least 'name' (the board's English name).
        start_page: when within (0, pages), crawl starting at this page
            instead of the board's last page.

    Side effects:
        Saves new/incomplete articles into the mongo collection
        'article_<boardId>' and updates each author's user profile.
    """
    url = config.base_url + '/bbsdoc.php?board=' + board['name']
    html = newsm_common.request_get(url, 'GB18030', 20, 10)
    if html is None:
        logger.error(' URL request failed: ' + url)
        return
    # Board metadata embedded in the page, e.g.:
    # docWriter('Python',284,96499,0,0,3218,96528,'/groups/comp.faq/Python',1,1);
    # re.escape: board names may contain regex metacharacters (e.g. 'CECM.THU').
    result = re.search(
        r'docWriter\(\'' + re.escape(board['name']) +
        r'\',(\d+),(\d+),(\d+),(\d+),(\d+),(\d+),\'([^\']+)\',(\d+),(\d+)\)',
        html)
    if result is None:
        logger.error('Not matched')
        return
    pages = int(result.group(5))
    boardId = int(result.group(1))
    tb_article = config.mongo_db['article_' + str(boardId)]
    skipped_count = 0
    new_articles = 0
    logger.info('=== {}, {}'.format(boardId, board['name']))
    if start_page > 0 and start_page < pages:
        pages = start_page
    # Walk pages from the newest backwards. NOTE(review): range stops at 2,
    # so page 1 is never fetched — kept from the original logic.
    for page in range(pages, 1, -1):
        logger.info(' {}, {}, P{}'.format(boardId, board['name'], page))
        articles = fetch_articles_list(board['name'], boardId, page)
        if not articles:
            # Request failure or empty page — skip (original crashed on None).
            continue
        for article in articles:
            dummy = tb_article.find_one({'_id': article['_id']})
            if dummy is None or dummy.get('content', '') == '':
                # Unseen or previously-empty article: fetch the full content.
                fetch_article(article)
                if article.get('content', '') != '':
                    tb_article.save(article)
                    new_articles += 1
                    # Add or update the author's user profile.
                    if article.get('author', '') != '':
                        newsm_user.update_user(article['author'])
                else:
                    skipped_count += 1
            else:
                skipped_count += 1
            # Too many already-seen articles: stop scanning this page.
            # NOTE(review): skipped_count accumulates across pages, so once
            # exceeded every remaining page is cut short (original behavior).
            if skipped_count > 30:
                break
    logger.info(' New articles: {}'.format(new_articles))
def fetch_article(article):
    """Fetch an article's full content and attachment list, in place.

    Expects *article* to carry '_id' (formatted '<board_id>.<real_id>')
    and 'board_id'. Always sets 'content' ('' on failure), 'ip',
    'updated_at' and 'attachments' so callers can rely on those keys
    (the original left the dict incomplete when the request failed).
    """
    article['title'] = newsm_common.remove_emoji(article['title'].strip())
    # '_id' is '<board_id>.<real_id>': strip the board prefix plus the dot.
    realId = str(article['_id'])[len(str(article['board_id'])) + 1:]
    url = config.base_url + '/bbscon.php?bid=' + str(article['board_id']) + '&id=' + realId
    html = newsm_common.request_get(url, 'GB18030', 20, 10)
    if html is None:
        logger.error(' URL request failed: ' + url)
        article['content'] = ''
        article['updated_at'] = int(time.time())
        article['attachments'] = []
        return
    # The content blob looks like one of:
    # prints('发信人: ... [FROM: 60.191.227.*]\r[m\n');o.h(0);o.t();
    # prints('发信人: ...');attach('test.zip', 4227, 2059);o.h(0);o.t();
    result = re.search(
        r'(prints\(\'(.*)\'\);(attach\(\'([^\']+)\',\s*(\d+),\s*(\d+)\);){0,}o\.h\(0\);o\.t\(\);)',
        html)
    if result is None:
        logger.debug(' Not matched: {}'.format(html))
        article['content'] = ''
        article['updated_at'] = int(time.time())
        article['attachments'] = []
        return
    article['content'] = result.group(2)
    # Simplify the content: unescape newlines, drop ANSI color codes,
    # unescape quoted characters, strip emoji.
    article['content'] = re.sub(r'\\n', '\n', article['content'])
    article['content'] = re.sub(r'\\r\[[;\d]{0,8}m', '', article['content'])
    article['content'] = re.sub(r'\\(/|"|\')', r'\1', article['content'])
    article['content'] = newsm_common.remove_emoji(article['content'])
    # Poster IP, when present in the content (None otherwise).
    article['ip'] = extract_ip_from_article(article['content'])
    article['updated_at'] = int(time.time())
    article['attachments'] = []
    if result.group(3) is not None:
        # Group 3 only keeps the last repetition, so re-scan group 1
        # to collect every attach(...) occurrence.
        for name, size, attach_id in re.findall(
                r'attach\(\'([^\']+)\',\s*(\d+),\s*(\d+)\);', result.group(1)):
            article['attachments'].append({
                'name': name.strip(),
                'size': int(size),
                'id': int(attach_id),
            })
def fetch_user(name):
    """Fetch a user's profile page and parse it into a user dict.

    Args:
        name: user id to query (whitespace is stripped).

    Returns:
        A user dict keyed by the lowercased name. When the site reports
        the user does not exist, a placeholder dict is returned with a
        long next_update (240h) so the crawler backs off. Returns None
        on request failure or when the profile page cannot be parsed.
    """
    name = name.strip()
    url = config.base_url + '/bbsqry.php?userid=' + name
    html = newsm_common.request_get(url, 'GB18030', 20, 10)
    if html is None:
        logger.error('URL request failed: ' + url)
        return None
    # Non-existent user marker: <tr><td>该用户不存在</td></tr>
    if re.search('<tr><td>该用户不存在</td></tr>', html) is not None:
        return {
            '_id': name.lower(),
            'name': name,
            'nick': '用户不存在',
            'logins': 0,
            'posts': 0,
            'last_login': '',
            'ip': '',
            'last_active': '',
            'life': 0,
            'title': '',
            'updated_at': int(time.time()),
            # Retry non-existent users only every 240 hours.
            'next_update': int(time.time()) + 3600 * 240,
        }
    result = re.search(r'<pre>\s*([\s\S]*)\s*</pre>', html)
    if result is None:
        logger.error(' Not matched: {}'.format(html))
        return None
    # Profile line inside <pre>: name(nick) login/post counts, last login
    # time and IP, offline time, mailbox, life, title.
    result2 = re.search(
        r'([^(]+)\(([\s\S]*)\) 共上站 (\d+) 次,发表过 (\d+) 篇文章\s+上次在\s+\[(.*)\] 从 \[(.*)\] 到本站一游。(?:积分: \[\d+\])?\s+离线时间\s*\[(.*)\] 信箱: \[(.*)\] 生命力: \[(-?\d+)\] 身份: \[(.*)\]。',
        result.group(1))
    if result2 is None:
        logger.error('Not matched(2)')
        logger.debug(result.group())
        return None
    user = {
        '_id': result2.group(1).strip().lower(),
        'name': result2.group(1).strip(),
        'nick': result2.group(2).strip(),
        'logins': int(result2.group(3)),
        'posts': int(result2.group(4)),
        'last_login': result2.group(5),
        'ip': result2.group(6),
        'last_active': result2.group(7),
        # group(8) is the mailbox field — intentionally not stored.
        'life': int(result2.group(9)),
        'title': result2.group(10),
        'updated_at': int(time.time()),
        'next_update': int(time.time()) + 3600 * 72,
    }
    # Optional signature, embedded as prints('...') in a script block.
    result3 = re.search(r'\'dp1\'\);\s+prints\(\'(.*)\'\);\/\/-->', html)
    if result3 is not None:
        # Same cleanup as article content: unescape newlines, drop ANSI
        # color codes, unescape quoted characters.
        signature = result3.group(1).strip()
        signature = re.sub(r'\\n', '\n', signature)
        signature = re.sub(r'\\r\[[;\d]{0,12}m', '', signature)
        signature = re.sub(r'\\(/|"|\')', r'\1', signature)
        user['signature'] = signature.strip()
    else:
        user['signature'] = None
    return user