import re
from lxml import etree
import requests
import time
from threading import Thread

from crawler import PostsCrawler
from mysql_manager import MysqlManager

max_threads = 10
interval = 20
mysql_mgr = MysqlManager(max_threads)


def post_crawl_task(topic):
    # Fetch the first page of this topic
    post_crawler = PostsCrawler()
    post_crawler.get_content(topic['url'], 1)
    posts = post_crawler.get_posts()

    # Number of pages in this topic
    page_count = post_crawler.get_max_page()
    print(topic['url'])
    print('page count', page_count)

    # Fetch the remaining pages of this topic
    if page_count > 1:
        for i in range(2, page_count + 1):
            post_crawler.get_content(topic['url'], i)
            posts += post_crawler.get_posts()
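# The file above imports Thread but the visible portion never spawns a thread.
# A minimal sketch of how post_crawl_task might be fanned out over worker
# threads; `topics`, the batching scheme, and using `interval` as a delay
# between batches are assumptions, not part of the original file.
def crawl_topics(topics):
    for batch_start in range(0, len(topics), max_threads):
        batch = topics[batch_start:batch_start + max_threads]
        threads = [Thread(target=post_crawl_task, args=(t,)) for t in batch]
        for t in threads:
            t.start()
        for t in threads:
            # Wait for the whole batch before starting the next one,
            # keeping at most max_threads requests in flight.
            t.join()
        time.sleep(interval)  # assumed: polite pause between batches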
import json

from mysql_manager import MysqlManager

mysql = MysqlManager(4)

with open('videos.json', 'r') as f:
    i = 1
    while True:
        print("Parse json: ", i)
        i += 1
        line = f.readline()
        if not line:
            break
        if len(line) < 10:  # skip blank or junk lines
            continue
        # urls = re.findall('http://v3-dy.ixigua.com[^\"]+', json_str)
        obj = json.loads(line)
        # Path: aweme_list -> [n] -> video -> play_addr -> url_list
        i_url = 0
        for v in obj['aweme_list']:
            # print("-----", i_url)
            try:
                url = v['video']['play_addr']['url_list'][0]
            except Exception as err:
                print("parse error ", i, " index: ", i_url)
            i_url += 1
            # print(url)
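# The loop above extracts each play URL but never stores or fetches it.
# A minimal sketch of downloading collected URLs with requests; the chunked
# streaming approach and the output file naming are assumptions, not from
# the original script.
import requests

def download_videos(urls):
    for n, url in enumerate(urls):
        resp = requests.get(url, stream=True, timeout=30)
        resp.raise_for_status()
        with open('video_{}.mp4'.format(n), 'wb') as out:
            # Stream in chunks so large videos are not held in memory at once
            for chunk in resp.iter_content(chunk_size=8192):
                out.write(chunk)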
import re
from lxml import etree
import requests
import time

import global_var
from mysql_manager import MysqlManager

mysql_mgr = MysqlManager(4)


class BoardsCrawler:
    # No trailing slash here: base_url supplies the separator itself
    # (the original 'http://www.newsmth.net/' produced a double slash)
    domain = 'http://www.newsmth.net'
    base_url = domain + '/nForum/section/{}?ajax'

    def __init__(self, interval=1):
        self.interval = interval

    def get_board_of_section(self, section_idx):
        url = self.base_url.format(section_idx)
        response = requests.get(url, headers=global_var.newsmth_headers)
        time.sleep(self.interval)  # throttle between requests
        self.content = response.text
        self.tree = etree.HTML(self.content)

    def get_board_list(self, etr_obj=None):
        if etr_obj is None:
            etr_obj = self.tree
        elements = etr_obj.xpath(
            '//table[@class="board-list corner"]/tbody/tr')
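# A minimal usage sketch for BoardsCrawler. get_board_list is truncated above,
# so treating its result as the matched <tr> row elements is an assumption.
crawler = BoardsCrawler(interval=2)
crawler.get_board_of_section(1)  # fetch section 1 of the forum, one request
rows = crawler.get_board_list()  # assumed to yield the board-list table rows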
def __init__(self, limit=500):
    self.reply_limit = limit
    self.mm = MysqlManager(4)
import re
from lxml import etree
import requests
from threading import Thread
import time
import html

from mysql_manager import MysqlManager
from crawler import PostsCrawler

max_threads = 10
wait_duration = 20
mysql_mgr = MysqlManager(10)


def post_crawl_task(topic):
    # Fetch the first page of this topic
    post_crawler = PostsCrawler()
    post_crawler.get_content(topic['url'], 1)
    posts = post_crawler.get_posts()

    # Number of pages in this topic
    page_count = post_crawler.get_max_page()

    # Fetch the remaining pages of this topic
    if page_count > 1:
        for i in range(2, page_count + 1):
            post_crawler.get_content(topic['url'], i)
            posts += post_crawler.get_posts()
            break  # stops after the first extra page, so only page 2 is fetched

    # Insert posts of a topic
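# The function above ends at "# Insert posts of a topic" without the insertion
# itself. A hedged sketch of what that step might look like, assuming
# MysqlManager exposes an execute-style method; `execute`, the table name, and
# the column layout below are all hypothetical, not from the original project.
def insert_posts(topic, posts):
    for post in posts:
        mysql_mgr.execute(  # hypothetical MysqlManager API
            'INSERT INTO posts (topic_url, content) VALUES (%s, %s)',
            (topic['url'], html.unescape(post)))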
def __init__(self, limit=200):
    self.reply_limit = limit
    self.mm = MysqlManager(4)
    self.post = {}