def set_table(self, tablename):
    anchor_table = '{tablename}_{date}'.format(
        tablename=tablename, date=datetime.now().strftime('%m%d'))
    sql5 = 'DROP TABLE IF EXISTS {table};'.format(table=anchor_table)
    sql = '''CREATE TABLE IF NOT EXISTS {table} LIKE {tablename};'''.format(
        table=anchor_table, tablename=tablename)  # create the dated archive table
    sql1 = 'INSERT INTO {table} SELECT * FROM {tablename};'.format(
        table=anchor_table, tablename=tablename)  # copy the current data into the archive table
    sql2 = 'DROP TABLE IF EXISTS {tablename};'.format(
        tablename=tablename)  # drop the old table
    sql3 = 'CREATE TABLE IF NOT EXISTS {tablename} LIKE {table};'.format(
        tablename=tablename, table=anchor_table)  # recreate an empty table under the original name
    mysql = Mysql()
    mysql.cursor.execute(sql5)
    mysql.create_table(sql)
    mysql.insert_one(sql1)
    mysql.cursor.execute(sql2)
    print('old table dropped')
    mysql.create_table(sql3)
    mysql.close_db()
class XhlPipeline(object):
    def __init__(self):
        self.mysql = Mysql()

    def close_spider(self, spider):
        self.mysql.close_db()

    def process_item(self, item, spider):
        if isinstance(item, XhlItem):
            sql_tb_wdetail = 'replace into ' + self.mysql.get_sql_sentence('anchor_test', item)
            # print(sql_tb_wdetail)
            self.mysql.insert_one(sql_tb_wdetail)
        elif isinstance(item, otherItem):
            sql_tb_wdetail = 'replace into ' + self.mysql.get_sql_sentence('anchor_detail', item)
            # print(sql_tb_wdetail)
            self.mysql.insert_one(sql_tb_wdetail)
        return item
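# Every class in this listing depends on a project-local `Mysql` helper that is not part of
# the listing itself. The class below is only a sketch of the interface implied by the calls
# made above and below (get_two, get_sql_sentence, insert_one, insert_sql_many, create_table,
# close_db, and raw `cursor` access); the real helper, its connection settings and its value
# quoting may well differ, and the pymysql connection values here are placeholders.
import pymysql


class Mysql(object):
    """Assumed shape of the project's MySQL helper (sketch, not the original)."""

    def __init__(self):
        self.db = pymysql.connect(host='localhost', user='root', password='',
                                  database='xhl', charset='utf8mb4')
        self.cursor = self.db.cursor()

    def get_sql_sentence(self, table, message):
        # Build the "<table>(cols) values(...)" tail appended to 'replace into ' by the callers.
        keys = ','.join(message.keys())
        values = ','.join("'{}'".format(v) for v in message.values())
        return '{table}({keys}) values({values})'.format(table=table, keys=keys, values=values)

    def get_two(self, sql):
        self.cursor.execute(sql)
        return self.cursor.fetchall()

    def insert_one(self, sql):
        self.cursor.execute(sql)
        self.db.commit()

    def insert_sql_many(self, sql, rows):
        self.cursor.executemany(sql, rows)
        self.db.commit()

    def create_table(self, sql):
        self.cursor.execute(sql)

    def close_db(self):
        self.cursor.close()
        self.db.close()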
            roomid = re.search('roomid=(.*?)\"', str(item)).group(1)
            try:
                anchor = item.select('dd div')[0].string.replace('\'', '\\\'')
            except Exception:
                anchor = item.select('dd div')[0].string
            messages2 = {
                'roomid': roomid,
                'plat': plat,
                'ranktype': rankname,
                'ranknum': item.select('dd')[0].string,
                'anchor': anchor,
                'platform': platformname,
                # 'img': item.select('dt img')[0]['src'],
                'rankdetail': item.select('dd')[2].string,
                'type': item.select('dd div')[1].string,
                'ranktime': datetime.now().strftime('%m-%d'),
                'Crawltime': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            }
            sql = 'replace into ' + mysql.get_sql_sentence('rank_history', messages2)
            print(sql)
            mysql.insert_one(sql)
    else:
        parse(url, platformname, plat, mysql)


if __name__ == '__main__':
    mysql = Mysql()
    # text(mysql)
    get_url(mysql)
    # mysql.close_db()
import json
import re
from datetime import datetime

import requests
from bs4 import BeautifulSoup
# Mysql is the project's own DB helper; its import path is not shown in this listing.


class Anchordetail(object):
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
        }
        self.times = ['', 'w', 'm']
        self.mysql = Mysql()

    def rich_gift(self, roomid, plat):  # gift-giving big spenders
        for time in self.times:
            url = 'https://www.xiaohulu.com/test_anchor2/ajax_anchor_price_tyrants?plat_id={plat}&room_id={roomid}&t={time}'.format(
                plat=plat, roomid=roomid, time=time)
            response = requests.get(url, headers=self.headers)
            if time == '':
                time = 't'
            message = {
                'roomid': roomid,
                'plat': plat,
                'ranktype': '送礼土豪',
                'ranktime': time,
                'detail': response.text.replace('\\u', '\\\\u').replace('\'', '\\\''),
                'Crawltime': datetime.now().strftime('%Y-%m-%d'),
            }
            sql = 'replace into ' + self.mysql.get_sql_sentence('anchor_test', message)
            print(sql)
            self.mysql.insert_one(sql)

    def preferences(self, roomid, plat):  # big spenders' preferences
        for time in self.times:
            url = 'https://www.xiaohulu.com/test_anchor2/ajax_anchor_source_gname_range?plat_id={plat}&room_id={roomid}&t={time}'.format(
                plat=plat, roomid=roomid, time=time)
            response = requests.get(url, headers=self.headers)
            if time == '':
                time = 't'
            message = {
                'roomid': roomid,
                'plat': plat,
                'ranktype': '土豪偏好',
                'ranktime': time,
                'detail': response.text.replace('\\u', '\\\\u').replace('\'', '\\\''),
                'Crawltime': datetime.now().strftime('%Y-%m-%d'),
            }
            sql = 'replace into ' + self.mysql.get_sql_sentence('anchor_test', message)
            print(sql)
            self.mysql.insert_one(sql)

    def rich_id(self, roomid, plat, name):
        message1 = {
            'roomid': roomid,
            'plat': plat,
            'name': name,
        }
        sql = 'replace into ' + self.mysql.get_sql_sentence('rich_id', message1)
        # print(sql)
        self.mysql.insert_one(sql)

    def price_list(self, roomid, plat):  # gift activity leaderboard
        for time in self.times:
            url = 'https://www.xiaohulu.com/test_anchor2/ajax_anchor_price_list?plat_id={plat}&room_id={roomid}&t={time}'.format(
                plat=plat, roomid=roomid, time=time)
            response = requests.get(url, headers=self.headers)
            text = response.text[:-1] + ',' + response.text[-1:]
            result = re.findall("({.*?}),", text)
            if len(result) != 0:
                for item in result:
                    item = json.loads(item)
                    self.rich_id(item['from_id'], item['platform_id'], item['from_name'])
            if time == '':
                time = 't'
            message = {
                'roomid': roomid,
                'plat': plat,
                'ranktype': '礼物活跃排行榜',
                'ranktime': time,
                'detail': response.text.replace('\\u', '\\\\u').replace('\'', '\\\''),
                'Crawltime': datetime.now().strftime('%Y-%m-%d'),
            }
            sql = 'replace into ' + self.mysql.get_sql_sentence('anchor_test', message)
            print(sql)
            self.mysql.insert_one(sql)

    def timeline(self, roomid, plat):  # big spenders' active hours
        times = ['w', 'm']
        for time in times:
            url = 'https://www.xiaohulu.com/test_anchor2/ajax_anchor_timeline_range?plat_id={plat}&room_id={roomid}&t={time}'.format(
                plat=plat, roomid=roomid, time=time)
            response = requests.get(url, headers=self.headers)
            if time == '':
                time = 't'
            message = {
                'roomid': roomid,
                'plat': plat,
                'ranktype': '土豪活跃时段',
                'ranktime': time,
                'detail': response.text.replace('\\u', '\\\\u').replace('\'', '\\\''),
                'Crawltime': datetime.now().strftime('%Y-%m-%d'),
            }
            sql = 'replace into ' + self.mysql.get_sql_sentence('anchor_test', message)
            print(sql)
            self.mysql.insert_one(sql)

    def get_table(self, index, response, timestr, data, dict):
        key = 'case{index}'.format(index=index)
        text = 'case {index}:(.*?)obj.str '.format(index=index)
        table1 = re.search(text, response.text, re.S).group(1).strip()
        dict[key] = timestr.search(table1).group(1) + ',' + data.search(table1).group(1)

    def others(self, roomid, plat):
        url = 'http://www.xiaohulu.com/anchor2/details/?plat={plat}&roomid={roomid}'.format(
            plat=plat, roomid=roomid)
        response = requests.get(url, headers=self.headers)
        soup = BeautifulSoup(response.text, 'lxml')
        achievements = soup.select('.container .a_card_r2')
        list = []
        for achieve in achievements:  # highest single-show achievements
            for i in range(0, len(achieve.select('dl dd p'))):
                list.append(achieve.select('dl dd p')[i].string.strip())
        # print(list)
        rankindexs = soup.select('.a_card_r1.a_card_r3 .rank_j li i')
        indexs = []
        for rankindex in rankindexs:  # anchor index rankings
            indexs.append(rankindex.get_text().strip())
        # print(indexs)
        r_gift = soup.select('.cb_left5_r dl dd')  # latest received gifts
        gift_list = []
        for item in r_gift:
            if item.get_text().strip() != '':
                gift_list.append(item.get_text().strip().replace('\u200e', '').replace('\u202d', ''))
            if item.select('img'):
                gift_list.append(item.select('img')[0]['src'])
        # print(gift_list)
        dict = {}
        timestr = re.compile('obj.timestr =(.*?);')
        data = re.compile('obj.data =(.*?);')
        self.get_table(0, response, timestr, data, dict)
        for i in range(2, 17):
            self.get_table(i, response, timestr, data, dict)
        # print(dict)
        anchor = soup.select('.a_card_left ul li')
        anchor_list = []  # the anchor's profile information
        for item in anchor[:-1]:
            if item.get_text().strip() != '':
                anchor_list.append(item.get_text().strip())
            if item.select('img'):
                anchor_list.append(item.select('img')[0]['src'])
        # print(anchor_list)
        message = {
            'roomid': roomid,
            'plat': plat,
            'anchor': str(anchor_list).replace('\'', '\\\''),
            'achieve': str(list).replace('\'', '\\\''),
            'rankindex': str(indexs).replace('\'', '\\\''),
            'crawltime': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'gift': str(gift_list).replace('\'', '\\\''),
            'table': str(dict).replace('\'', '\\\''),
        }
        sql = 'replace into ' + self.mysql.get_sql_sentence('anchor_detail', message)
        print(sql)
        self.mysql.insert_one(sql)

    def main(self, roomid, plat):
        self.rich_gift(roomid, plat)
        self.preferences(roomid, plat)
        self.price_list(roomid, plat)
        self.timeline(roomid, plat)
        self.others(roomid, plat)

    def get(self):
        with open('detail.txt', 'r') as f:
            items = f.readlines()
        with open('detail.txt', 'w') as f_w:
            for item in items:
                plat = re.search('\?plat=(\d+)&', item).group(1)
                roomid = re.search('&roomid=(\d+)', item).group(1)
                try:
                    self.main(roomid, plat)
                except Exception:
                    f_w.write(item + '\n')

    def run(self):
        self.get()
        self.mysql.close_db()
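# A minimal driver for the requests-based Anchordetail class above. It assumes detail.txt
# already holds one detail-page URL per line (Anchordetail.get() both reads and rewrites it);
# the hard-coded roomid/plat pair in the commented call is purely illustrative.
if __name__ == '__main__':
    crawler = Anchordetail()
    # crawler.main('123456', '1')   # crawl a single anchor by roomid / platform id
    crawler.run()                   # retry every URL recorded in detail.txt, then close the DB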
import json
import re
from datetime import datetime

import scrapy
from bs4 import BeautifulSoup
from scrapy import Request
# Mysql, XhlItem and otherItem are the project's own helpers (import paths not shown in this listing).


class XiohuluSpider(scrapy.Spider):
    name = 'xhl_detail'
    allowed_domains = ['www.xiaohulu.com']
    start_urls = ['http://www.xiaohulu.com/']
    url_anchor_price = 'https://www.xiaohulu.com/test_anchor2/ajax_anchor_price_tyrants?plat_id={plat}&room_id={roomid}&t={time}'
    url_source_gname = 'https://www.xiaohulu.com/test_anchor2/ajax_anchor_source_gname_range?plat_id={plat}&room_id={roomid}&t={time}'
    url_price_list = 'https://www.xiaohulu.com/test_anchor2/ajax_anchor_price_list?plat_id={plat}&room_id={roomid}&t={time}'
    url_anchor_timeline = 'https://www.xiaohulu.com/test_anchor2/ajax_anchor_timeline_range?plat_id={plat}&room_id={roomid}&t={time}'
    url_anchor_other = 'http://www.xiaohulu.com/anchor2/details/?plat={plat}&roomid={roomid}'
    custom_settings = {
        'CONCURRENT_REQUESTS': 150,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 50,
        # 'LOG_LEVEL': 'INFO',
        # 'DOWNLOAD_TIMEOUT': 30,
        # 'DOWNLOADER_MIDDLEWARES': {'xhl.middlewares.MyproxisSpiderMidleware': 125, },
        'ITEM_PIPELINES': {
            'xhl.pipelines.XhlPipeline': 300,
        },
    }
    mysql = Mysql()

    def start_requests(self):
        mysql = Mysql()
        sql = 'SELECT DISTINCT plat,roomid FROM rank_history WHERE plat and roomid is not NULL AND ranktype != \'水军榜\' AND ranktype != \'土豪榜\''
        for item in mysql.get_two(sql):
            plat = item[0]
            roomid = item[1]
            yield Request(self.url_anchor_other.format(plat=plat, roomid=roomid),
                          meta={'plat': plat, 'roomid': roomid, 'num': 2},
                          callback=self.others)
            time = ''
            yield Request(url=self.url_anchor_price.format(plat=plat, roomid=roomid, time=time),
                          meta={'time': time, 'plat': plat, 'roomid': roomid},
                          callback=self.rich_gift)
            yield Request(self.url_source_gname.format(plat=plat, roomid=roomid, time=time),
                          meta={'time': time, 'plat': plat, 'roomid': roomid},
                          callback=self.preferences)
            yield Request(self.url_price_list.format(plat=plat, roomid=roomid, time=time),
                          meta={'time': time, 'plat': plat, 'roomid': roomid, 'num': 1},
                          callback=self.price_list)
            # for time in ['w', 'm']:
            #     yield Request(self.url_anchor_timeline.format(plat=plat, roomid=roomid, time=time),
            #                   meta={'time': time, 'plat': plat, 'roomid': roomid}, callback=self.timeline)
        mysql.close_db()

    def rich_id(self, list):
        sql = "replace into rich_id(roomid,plat,name)values(%s,%s,%s)"
        # print(sql)
        self.mysql.insert_sql_many(sql, list)

    def price_list(self, response):  # gift activity leaderboard
        if response.meta['num'] == 1:
            if response.text in ('', '[]'):
                # the endpoint returned no data: queue the same URL once more;
                # dont_filter bypasses the duplicate filter so the retry is actually scheduled,
                # and num=2 routes the retried response to the branch below (no second retry)
                yield Request(self.url_price_list.format(plat=response.meta['plat'],
                                                         roomid=response.meta['roomid'],
                                                         time=response.meta['time']),
                              meta={'time': response.meta['time'],
                                    'plat': response.meta['plat'],
                                    'roomid': response.meta['roomid'],
                                    'num': 2},
                              callback=self.price_list,
                              dont_filter=True)
            else:
                item = XhlItem()
                list_price = []
                text = response.text[:-1] + ',' + response.text[-1:]
                result = re.findall("({.*?}),", text)
                if len(result) != 0:
                    for ns in result:
                        n = json.loads(ns)
                        rich_set = (n['from_id'], n['platform_id'], n['from_name'])
                        list_price.append(rich_set)
                    self.rich_id(list_price)
                if response.meta['time'] == '':
                    time = 't'
                else:
                    time = response.meta['time']
                item['roomid'] = response.meta['roomid']
                item['plat'] = response.meta['plat']
                item['ranktype'] = '礼物活跃排行榜'
                item['ranktime'] = time
                item['detail'] = response.text.replace('\\u', '\\\\u').replace('\'', '\\\'')
                item['Crawltime'] = datetime.now().strftime('%Y-%m-%d')
                yield item
                # sql = 'insert into ' + self.mysql.get_sql_sentence(anchor_table, message)
                # print(sql)
                # self.mysql.insert_one(sql)
        else:
            item = XhlItem()
            list_price = []
            text = response.text[:-1] + ',' + response.text[-1:]
            result = re.findall("({.*?}),", text)
            if len(result) != 0:
                for ns in result:
                    n = json.loads(ns)
                    rich_set = (n['from_id'], n['platform_id'], n['from_name'])
                    list_price.append(rich_set)
                self.rich_id(list_price)
            if response.meta['time'] == '':
                time = 't'
            else:
                time = response.meta['time']
            item['roomid'] = response.meta['roomid']
            item['plat'] = response.meta['plat']
            item['ranktype'] = '礼物活跃排行榜'
            item['ranktime'] = time
            item['detail'] = response.text.replace('\\u', '\\\\u').replace('\'', '\\\'')
            item['Crawltime'] = datetime.now().strftime('%Y-%m-%d')
            yield item
            # sql = 'insert into ' + self.mysql.get_sql_sentence(anchor_table, message)
            # print(sql)
            # self.mysql.insert_one(sql)

    def rich_gift(self, response):  # gift-giving big spenders
        item = XhlItem()
        if response.meta['time'] == '':
            time = 't'
        else:
            time = response.meta['time']
        item['roomid'] = response.meta['roomid']
        item['plat'] = response.meta['plat']
        item['ranktype'] = '送礼土豪'
        item['ranktime'] = time
        item['detail'] = response.text.replace('\\u', '\\\\u').replace('\'', '\\\'')
        item['Crawltime'] = datetime.now().strftime('%Y-%m-%d')
        yield item

    def preferences(self, response):  # big spenders' preferences
        item = XhlItem()
        if response.meta['time'] == '':
            time = 't'
        else:
            time = response.meta['time']
        item['roomid'] = response.meta['roomid']
        item['plat'] = response.meta['plat']
        item['ranktype'] = '土豪偏好'
        item['ranktime'] = time
        item['detail'] = response.text.replace('\\u', '\\\\u').replace('\'', '\\\'')
        item['Crawltime'] = datetime.now().strftime('%Y-%m-%d')
        yield item

    def timeline(self, response):  # big spenders' active hours
        item = XhlItem()
        if response.meta['time'] == '':
            time = 't'
        else:
            time = response.meta['time']
        item['roomid'] = response.meta['roomid']
        item['plat'] = response.meta['plat']
        item['ranktype'] = '土豪活跃时段'
        item['ranktime'] = time
        item['detail'] = response.text.replace('\\u', '\\\\u').replace('\'', '\\\'')
        item['Crawltime'] = datetime.now().strftime('%Y-%m-%d')
        yield item

    def get_table(self, index, response, timestr, data, dict):
        key = 'case{index}'.format(index=index)
        text = 'case {index}:(.*?)obj.str '.format(index=index)
        try:
            table1 = re.search(text, response.text, re.S).group(1).strip()
            dict[key] = (timestr.search(table1).group(1).strip().replace('\"', '') + ',' +
                         data.search(table1).group(1).strip().replace('\"', ''))
        except Exception:
            with open('detail.txt', 'a+') as f:
                f.write(response.url + '\n')
                f.write(str(index))
            dict[key] = ''

    def others(self, response):
        soup = BeautifulSoup(response.text, 'lxml')
        achievements = soup.select('.container .a_card_r2')
        list = []
        for achieve in achievements:  # highest single-show achievements
            for i in range(0, len(achieve.select('dl dd p'))):
                list.append(achieve.select('dl dd p')[i].string.strip())
        # print(list)
        rankindexs = soup.select('.a_card_r1.a_card_r3 .rank_j li i')
        indexs = []
        for rankindex in rankindexs:  # anchor index rankings
            indexs.append(rankindex.get_text().strip())
        # print(indexs)
        r_gift = soup.select('.cb_left5_r dl dd')  # latest received gifts
        gift_list = []
        for item in r_gift:
            if item.get_text().strip() != '':
                gift_list.append(item.get_text().strip().replace('\u200e', '').replace('\u202d', ''))
            if item.select('img'):
                gift_list.append(item.select('img')[0]['src'])
        # print(gift_list)
        dict = {}
        timestr = re.compile('obj.timestr =(.*?);')
        data = re.compile('obj.data =(.*?);')
        for i in range(3, 17):
            self.get_table(i, response, timestr, data, dict)
        # print(json.dumps(dict))
        anchor = soup.select('.a_card_left ul li')
        anchor_list = []  # the anchor's profile information
        for item in anchor[:-1]:
            if item.get_text().strip() != '':
                anchor_list.append(item.get_text().strip())
            if item.select('img'):
                anchor_list.append(item.select('img')[0]['src'])
        # print(anchor_list)
        item = otherItem()
        item['roomid'] = response.meta['roomid']
        item['plat'] = response.meta['plat']
        item['anchor'] = str(anchor_list).replace('\'', '\\\'')
        item['achieve'] = str(list).replace('\'', '\\\'')
        item['rankindex'] = str(indexs).replace('\'', '\\\'')
        item['Crawltime'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        item['gift'] = str(gift_list).replace('\'', '\\\'')
        item['table'] = json.dumps(dict)
        if not anchor_list:
            # nothing could be scraped from the profile card: log the URL for a later retry
            with open('detail.txt', 'a+') as f:
                f.write(response.url + '\n')
        else:
            yield item
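# XhlItem and otherItem are imported from the project's items module, which is not shown in
# this listing. The definitions below are a sketch that only declares the fields actually
# assigned by the spiders and read by XhlPipeline; the real module may define more.
import scrapy


class XhlItem(scrapy.Item):
    roomid = scrapy.Field()
    plat = scrapy.Field()
    ranktype = scrapy.Field()
    ranktime = scrapy.Field()
    detail = scrapy.Field()
    Crawltime = scrapy.Field()


class otherItem(scrapy.Item):
    roomid = scrapy.Field()
    plat = scrapy.Field()
    anchor = scrapy.Field()
    achieve = scrapy.Field()
    rankindex = scrapy.Field()
    Crawltime = scrapy.Field()
    gift = scrapy.Field()
    table = scrapy.Field()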
import re
from datetime import datetime

import requests
import scrapy
from bs4 import BeautifulSoup
# Mysql is the project's own DB helper; its import path is not shown in this listing.


class RankSpider(scrapy.Spider):
    name = 'xhl_rank'
    allowed_domains = ['www.xiaohulu.com']
    start_urls = ['http://www.xiaohulu.com/']
    mysql = Mysql()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
    }

    def stringToDict(self):
        '''
        Convert a cookie string copied from the browser into a dict that Scrapy can use.
        :return:
        '''
        cookie = 'PHPSESSID=k9av555ng9ntqcrdguveq3nm22; Hm_lvt_2772005b8bc0b193d080228322981977=1528770876; Hm_lvt_1c358b33dfa30c89dd3a1927a5921793=1528770876; xhl_cok=d8e0Bl3Mda5qkQtjGYQexSBBP3o5Ewa3OZ%2BKzz5FQG%2FEqWjqYBgWRcoxnPoRt%2B23PhD6nyF6%2BVqkErfZSQ; 6N3e_f2ec___XHLTXZ__www=0f594v2V4vbeCABmpyV%2F1ZUmdYs5EtNt6b35BRSZSYMeO9xIzNViHBl%2FHVuiMI3MWguA8zYu%2FiCKTs%2F4e4W4IfJK2%2BPpU0uLGy378xDb1Q; Hm_lpvt_1c358b33dfa30c89dd3a1927a5921793=1528875351; Hm_lpvt_2772005b8bc0b193d080228322981977=1528875351'
        itemDict = {}
        items = cookie.split(';')
        for item in items:
            key = item.split('=')[0].replace(' ', '')
            value = item.split('=')[1]
            itemDict[key] = value
        return itemDict

    def start_requests(self):
        url = 'http://www.xiaohulu.com/Anchor/index.html?plat=1&class=all&day=1440m'
        url_set = 'http://www.xiaohulu.com/Anchor/index.html?plat={id}&class=all&day='
        url_rich = 'http://www.xiaohulu.com/Spectator/index.html?plat={id}&day='
        response = requests.get(url, headers=self.headers)
        soup = BeautifulSoup(response.text, 'lxml')
        platforms = soup.select('.container ul.panktip select')[1]
        for platform in platforms.select('option')[1:]:
            platformname = platform.string.strip()
            plat = re.search('plat=(\d+)&', platform['value']).group(1)
            url_plat = url_set.format(id=plat)
            url_richplat = url_rich.format(id=re.search('plat=(\d+)&', platform['value']).group(1))
            response1 = requests.get(url_plat, headers=self.headers)
            soup = BeautifulSoup(response1.text, 'lxml')
            dates = soup.select('.container ul.panktip select')[0]
            datelist = [dates.select('option')[1],
                        dates.select('option')[9],
                        dates.select('option')[15]]
            for date in datelist:
                # for date in dates.select('option'):
                url_time = url_plat + date['value']
                url_rich1 = url_richplat + date['value']
                yield scrapy.Request(url=url_time,
                                     meta={'date': date.string, 'plat': plat},
                                     callback=self.parse)
                yield scrapy.Request(url=url_rich1,
                                     meta={'date': date.string, 'plat': plat, 'name': platformname},
                                     callback=self.rich, headers=self.headers,
                                     cookies=self.stringToDict())
                # main(url_time, data.string, mysql, plat)
                # rich(url_rich1, data.string, platformname, mysql, plat)

    def parse(self, response):
        try:
            list_rank = []
            soup = BeautifulSoup(response.text, 'lxml')
            titalranks = soup.find_all(attrs={'class': 'w560'})
            for item in titalranks:
                print(item.find(attrs={'class': 'h_list'}).string)
                # print(item.select('.stitle ul li'))
                for i in range(0, len(item.select('.stitle ul li')))[1:]:
                    rankname = item.select('.stitle ul li')[i].string
                    ranks = item.select('.mt20.lmmain div.svtable')[i]
                    ids = ranks.select('a')
                    contents = ranks.select('tr')
                    for n in range(0, len(ids)):
                        roomid = re.search('roomid=(.*?)&', str(ids[n])).group(1)
                        content = contents[n]
                        messages = {
                            'roomid': roomid,
                            'plat': response.meta['plat'],
                            'ranktype': rankname,
                            'ranknum': content.select('span.num')[0].string,
                            'anchor': re.search('</i>(.*)</h4>', str(content.select('dd h4')[0]), re.S).group(1),
                            'platform': content.select('dd h4 i')[0].string,
                            'type': re.match('(.*?)\s', content.select('dd p')[0].get_text()).group(1),
                            'img': content.select('dt img')[0]['src'],
                            'rankdetail': content.select('td.hsnum')[0].string.strip(),
                            'ranktime': response.meta['date'],
                            'Crawltime': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                        }
                        list_set = (roomid, messages['plat'], messages['ranktype'], messages['ranknum'],
                                    messages['anchor'], messages['platform'], messages['type'],
                                    messages['img'], messages['rankdetail'], messages['ranktime'],
                                    messages['Crawltime'])
                        list_rank.append(list_set)
            sql = "replace into rank_history(roomid,plat,ranktype,ranknum,anchor,platform,type,img,rankdetail,ranktime,Crawltime)values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
            print(sql)
            self.mysql.insert_sql_many(sql, list_rank)
        except Exception as e:
            with open('error.txt', 'a+') as f:
                try:
                    f.write(response.url + '\n')
                    f.write('error: %s \n' % e)
                except Exception:
                    f.write(response.url + '\n')

    def rich_id(self, list):
        sql = "replace into rich_id(roomid,plat,name)values(%s,%s,%s)"
        # print(sql)
        self.mysql.insert_sql_many(sql, list)

    def rich(self, response):
        try:
            list_rank = []
            list_price = []
            soup = BeautifulSoup(response.text, 'lxml')
            titalranks = soup.find_all(attrs={'class': 'w560'})
            for item in titalranks:
                rankname = item.select('.stitle h3')[0].string
                ranks = item.select('.mt20 div.svtable')[0]
                contents = ranks.select('tr')
                ids = ranks.select('a')
                for n in range(0, len(ids)):
                    roomid = re.search('fromid=(.*?)\"', str(ids[n])).group(1)
                    content = contents[n]
                    try:
                        anchor = content.select('dd h4')[0].string.replace('\'', '\\\'')
                    except Exception:
                        anchor = content.select('dd h4')[0].string
                    messages = {
                        'roomid': roomid,
                        'plat': response.meta['plat'],
                        'ranktype': rankname,
                        'ranknum': content.select('span.num')[0].string,
                        'anchor': anchor,
                        'platform': response.meta['name'],
                        'img': content.select('dt img')[0]['src'],
                        'rankdetail': content.select('td .hsnum')[0].string.strip(),
                        'ranktime': response.meta['date'],
                        'Crawltime': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    }
                    if rankname == '土豪榜':
                        rich_set = (messages['roomid'], messages['plat'], messages['anchor'])
                        list_price.append(rich_set)
                    list_set = (roomid, messages['plat'], messages['ranktype'], messages['ranknum'],
                                messages['anchor'], messages['platform'], messages['img'],
                                messages['rankdetail'], messages['ranktime'], messages['Crawltime'])
                    list_rank.append(list_set)
            sql = "replace into rank_history(roomid,plat,ranktype,ranknum,anchor,platform,img,rankdetail,ranktime,Crawltime)values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
            print(sql)
            self.rich_id(list_price)
            self.mysql.insert_sql_many(sql, list_rank)
        except Exception as e:
            with open('error.txt', 'a+') as f:
                try:
                    f.write(response.url + '\n')
                    f.write('error: %s \n' % e)
                except Exception:
                    f.write(response.url + '\n')
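# The `replace into rank_history(...)` statements above only de-duplicate rows if the table
# has a primary or unique key. The DDL below is an assumed reconstruction from the columns
# the spider writes, with a plausible key on roomid/plat/ranktype/ranktime; the real schema
# and column types are not part of this listing and may differ.
RANK_HISTORY_DDL = '''
CREATE TABLE IF NOT EXISTS rank_history (
    roomid     VARCHAR(64)  NOT NULL,
    plat       VARCHAR(16)  NOT NULL,
    ranktype   VARCHAR(32)  NOT NULL,
    ranknum    VARCHAR(16),
    anchor     VARCHAR(255),
    platform   VARCHAR(64),
    `type`     VARCHAR(64),
    img        VARCHAR(255),
    rankdetail VARCHAR(255),
    ranktime   VARCHAR(32)  NOT NULL,
    Crawltime  DATETIME,
    PRIMARY KEY (roomid, plat, ranktype, ranktime)
) DEFAULT CHARSET=utf8mb4;
'''
# One-off setup if the table is missing:  Mysql().create_table(RANK_HISTORY_DDL)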
def __init__(self):
    self.mysql = Mysql()
import json
import re
from datetime import datetime

import requests
import scrapy
from scrapy import Request
# Mysql is the project's own DB helper; its import path is not shown in this listing.


class UserSpider(scrapy.Spider):
    name = 'xhl_user'
    allowed_domains = ['www.xiaohulu.com']
    start_urls = ['http://www.xiaohulu.com/']
    url_gift = 'http://www.xiaohulu.com/test_spectator2/ajax_spec_price_range?t={time}&plat_id={plat}&from_id={roomid}'
    url_msg = 'https://www.xiaohulu.com/spectator2/ajax_spec_msg_range?t={time}&plat_id={plat}&from_id={roomid}'
    url_timeline = 'https://www.xiaohulu.com/spectator2/ajax_spec_price_period_range?t={time}&plat_id={plat}&from_id={roomid}'
    url_prefer = 'https://www.xiaohulu.com/spectator2/ajax_spec_source_gname_range?plat_id={plat}&from_id={roomid}'
    url_pug = 'https://www.xiaohulu.com/spectator2/ajax_spec_history?plat_id={plat}&from_id={roomid}'
    mysql = Mysql()
    custom_settings = {
        'CONCURRENT_REQUESTS': 60,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 30,
        # 'DOWNLOADER_MIDDLEWARES': {'xhl.middlewares.MyproxisSpiderMidleware': 125, },
        # 'LOG_LEVEL': 'INFO',
        # 'ITEM_PIPELINES': {'xhl.pipelines.XhlPipeline': 300, },
    }

    def start_requests(self):
        mysql = Mysql()
        sql = 'SELECT * FROM `rich_id`'
        for item in mysql.get_two(sql):
            plat = item[1]
            roomid = item[0]
            yield Request(self.url_prefer.format(plat=plat, roomid=roomid),
                          meta={'plat': plat, 'roomid': roomid},
                          callback=self.preferences)
            yield Request(self.url_pug.format(plat=plat, roomid=roomid),
                          meta={'plat': plat, 'roomid': roomid},
                          callback=self.pug)
            for time in ['', 'm']:
                yield Request(url=self.url_gift.format(plat=plat, roomid=roomid, time=time),
                              meta={'time': time, 'plat': plat, 'roomid': roomid},
                              callback=self.gift)
                yield Request(self.url_msg.format(plat=plat, roomid=roomid, time=time),
                              meta={'time': time, 'plat': plat, 'roomid': roomid},
                              callback=self.msg)
                # yield Request(self.url_timeline.format(plat=plat, roomid=roomid, time=time),
                #               meta={'time': time, 'plat': plat, 'roomid': roomid}, callback=self.timeline)
        mysql.close_db()

    def parse(self, response):
        print(response.text)

    def gift(self, response):  # gift value trend
        try:
            if response.meta['time'] == '':
                time = 't'
            else:
                time = response.meta['time']
            message = {
                'fromid': response.meta['roomid'],
                'plat': response.meta['plat'],
                'ranktype': '礼物价值趋势',
                'ranktime': time,
                'detail': response.text.replace('\\u', '\\\\u').replace('\'', '\\\''),
                'Crawltime': datetime.now().strftime('%Y-%m-%d'),
            }
            sql = 'replace into ' + self.mysql.get_sql_sentence('user_detail', message)
            print(sql)
            self.mysql.insert_one(sql)
        except Exception as e:
            with open('user.txt', 'a+') as f:
                f.write(response.url + '\n')
                f.write('error: %s \n' % e)

    def msg(self, response):  # danmaku (chat message) volume trend
        if response.meta['time'] == '':
            time = 't'
        else:
            time = response.meta['time']
        message = {
            'fromid': response.meta['roomid'],
            'plat': response.meta['plat'],
            'ranktype': '弹幕数量趋势',
            'ranktime': time,
            'detail': response.text.replace('\\u', '\\\\u').replace('\'', '\\\''),
            'Crawltime': datetime.now().strftime('%Y-%m-%d'),
        }
        sql = 'replace into ' + self.mysql.get_sql_sentence('user_detail', message)
        print(sql)
        self.mysql.insert_one(sql)

    def timeline(self, response):  # gift-giving hours distribution
        if response.meta['time'] == '':
            time = 't'
        else:
            time = response.meta['time']
        message = {
            'fromid': response.meta['roomid'],
            'plat': response.meta['plat'],
            'ranktype': '送礼时段分布',
            'ranktime': time,
            'detail': response.text.replace('\\u', '\\\\u').replace('\'', '\\\''),
            'Crawltime': datetime.now().strftime('%Y-%m-%d'),
        }
        sql = 'replace into ' + self.mysql.get_sql_sentence('user_detail', message)
        print(sql)
        self.mysql.insert_one(sql)

    def preferences(self, response):  # big spenders' preferences
        message = {
            'fromid': response.meta['roomid'],
            'plat': response.meta['plat'],
            'ranktype': '土豪偏好',
            'ranktime': 'm',
            'detail': response.text.replace('\\u', '\\\\u').replace('\'', '\\\''),
            'Crawltime': datetime.now().strftime('%Y-%m-%d'),
        }
        sql = 'replace into ' + self.mysql.get_sql_sentence('user_detail', message)
        print(sql)
        self.mysql.insert_one(sql)

    def pug_detail(mysql, response, plat, fromid):
        # Appears to be carried over from the requests-based version: it takes `mysql` instead of
        # `self`, relies on a module-level `headers` dict, and is not wired to any spider callback.
        url1 = 'http://www.xiaohulu.com/test_spectator2/ajax_spec_history_list?plat_id={id}&from_id={fromid}&date={date}&p=1'
        result = re.findall("({.*?})", response.text)
        for item in result:
            info = json.loads(item)
            response = requests.get(url1.format(id=plat, fromid=fromid, date=info['date']),
                                    headers=headers)
            message = {
                'fromid': fromid,
                'plat': plat,
                'ranktime': info['date'],
                'detail': response.text.replace('\\u', '\\\\u').replace('\'', '\\\''),
                'Crawltime': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            }
            sql = 'replace into ' + mysql.get_sql_sentence('user_pug', message)
            print(sql)
            mysql.insert_one(sql)
            # print(message)

    def pug(self, response):  # big spenders' footprints
        message = {
            'fromid': response.meta['roomid'],
            'plat': response.meta['plat'],
            'ranktype': '土豪足迹',
            'ranktime': 'm',
            'detail': response.text.replace('\'', '\\\''),
            'Crawltime': datetime.now().strftime('%Y-%m-%d'),
        }
        print(message)
        sql = 'replace into ' + self.mysql.get_sql_sentence('user_detail', message)
        print(sql)
        self.mysql.insert_one(sql)
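# The three spiders feed each other through MySQL: xhl_rank fills rank_history and rich_id,
# xhl_detail reads rank_history, and xhl_user reads rich_id, so they are run one after the
# other, normally via the Scrapy CLI from the project root:
#   scrapy crawl xhl_rank
#   scrapy crawl xhl_detail
#   scrapy crawl xhl_user
# The helper below is an optional, assumed alternative using Scrapy's CrawlerProcess; it is
# not part of the original project and expects the spider classes to be importable.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


def run_spider(spider_cls):
    """Run a single spider with the project settings (one spider per process)."""
    process = CrawlerProcess(get_project_settings())
    process.crawl(spider_cls)
    process.start()  # blocks until the spider finishes

# Example: run_spider(RankSpider), then in separate runs run_spider(XiohuluSpider)
# and run_spider(UserSpider).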