Пример #1
0
 def __init__(self):
     """Prepare the request headers, the period codes and the Mysql helper."""
     # Desktop Chrome user agent sent with the HTTP requests.
     user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
     self.headers = {'User-Agent': user_agent}
     # NOTE(review): presumably period codes used as a query parameter by
     # the other methods of this class -- confirm against the callers.
     self.times = ['', 'w', 'm']
     self.mysql = Mysql()
Пример #2
0
 def start_requests(self):
     """Yield the initial requests for every (plat, roomid) in rank_history.

     For each row returned by the query four requests are produced: the
     detail page (callback ``others``) and, with an empty ``time`` value,
     the price-tyrants (``rich_gift``), gift-name (``preferences``) and
     price-list (``price_list``) endpoints.  ``plat``/``roomid`` (and
     ``time`` where used) are forwarded to the callbacks through ``meta``.

     The Mysql helper opened here is closed in a ``finally`` block, so it is
     released even when the generator is closed early or a request cannot
     be built, not only when the loop runs to completion.
     """
     mysql = Mysql()
     # NOTE(review): `plat and roomid is not NULL` is parsed by MySQL as
     # `plat AND (roomid IS NOT NULL)`; kept unchanged -- confirm intent.
     sql = 'SELECT DISTINCT plat,roomid FROM rank_history WHERE plat and roomid is not NULL AND ranktype != \'水军榜\' AND ranktype != \'土豪榜\''
     try:
         for item in mysql.get_two(sql):
             plat = item[0]
             roomid = item[1]
             yield Request(self.url_anchor_other.format(plat=plat,
                                                        roomid=roomid),
                           meta={
                               'plat': plat,
                               'roomid': roomid,
                               'num': 2
                           },
                           callback=self.others)
             # Only the empty period is requested here.
             time = ''
             yield Request(url=self.url_anchor_price.format(plat=plat,
                                                            roomid=roomid,
                                                            time=time),
                           meta={
                               'time': time,
                               'plat': plat,
                               'roomid': roomid
                           },
                           callback=self.rich_gift)
             yield Request(self.url_source_gname.format(plat=plat,
                                                        roomid=roomid,
                                                        time=time),
                           meta={
                               'time': time,
                               'plat': plat,
                               'roomid': roomid
                           },
                           callback=self.preferences)
             yield Request(self.url_price_list.format(plat=plat,
                                                      roomid=roomid,
                                                      time=time),
                           meta={
                               'time': time,
                               'plat': plat,
                               'roomid': roomid,
                               'num': 1
                           },
                           callback=self.price_list)
             # The timeline requests are currently disabled:
             # for time in ['w', 'm']:
             #     yield Request(self.url_anchor_timeline.format(plat=plat, roomid=roomid, time=time),meta={'time': time,'plat':plat,'roomid':roomid},callback=self.timeline)
     finally:
         mysql.close_db()
Пример #3
0
 def set_table(self, tablename):
     """Archive ``tablename`` into a date-suffixed copy and recreate it.

     The statements run in this order:

     1. drop ``<tablename>_<MMDD>`` if it already exists,
     2. create ``<tablename>_<MMDD>`` with ``LIKE <tablename>``,
     3. copy every row of ``tablename`` into it,
     4. drop ``tablename``,
     5. create ``tablename`` again with ``LIKE <tablename>_<MMDD>``.

     ``tablename`` is interpolated directly into the SQL text (identifiers
     cannot be bound as parameters), so it must come from trusted code.
     The Mysql helper is closed in ``finally`` so a failing statement does
     not leak it.
     """
     anchor_table = '{tablename}_{date}'.format(
         tablename=tablename, date=datetime.now().strftime('%m%d'))
     drop_archive = 'DROP TABLE IF EXISTS {table};'.format(table=anchor_table)
     # Create the archive table.
     create_archive = '''CREATE TABLE IF NOT EXISTS {table} LIKE {tablename};'''.format(
         table=anchor_table, tablename=tablename)
     # Copy the data into the archive table.
     copy_rows = 'INSERT INTO {table} SELECT * FROM {tablename};'.format(
         table=anchor_table, tablename=tablename)
     # Drop the old table.
     drop_source = 'DROP TABLE IF EXISTS {tablename};'.format(
         tablename=tablename)
     # Recreate the source table from the archive's definition.
     recreate_source = 'CREATE TABLE IF NOT EXISTS {tablename} LIKE {table};'.format(
         tablename=tablename, table=anchor_table)
     mysql = Mysql()
     try:
         mysql.cursor.execute(drop_archive)
         mysql.create_table(create_archive)
         mysql.insert_one(copy_rows)
         mysql.cursor.execute(drop_source)
         print('删除成功')
         mysql.create_table(recreate_source)
     finally:
         mysql.close_db()
Пример #4
0
 def start_requests(self):
     """Yield the initial requests for every row of the ``rich_id`` table.

     Per row: one request to ``url_prefer`` (callback ``preferences``), one
     to ``url_pug`` (callback ``pug``) and, for each period in ``''`` and
     ``'m'``, one to ``url_gift`` (callback ``gift``) and one to ``url_msg``
     (callback ``msg``).  ``plat``/``roomid`` (and ``time`` where used) are
     forwarded to the callbacks through ``meta``.

     The Mysql helper opened here is closed in a ``finally`` block, so it is
     released even when the generator is closed early or raises.
     """
     mysql = Mysql()
     sql = 'SELECT * FROM `rich_id`'
     try:
         for item in mysql.get_two(sql):
             # `SELECT *` is used, so this relies on the table's column
             # order: index 0 is taken as roomid and index 1 as plat.
             plat = item[1]
             roomid = item[0]
             yield Request(self.url_prefer.format(plat=plat, roomid=roomid),
                           meta={
                               'plat': plat,
                               'roomid': roomid
                           },
                           callback=self.preferences)
             yield Request(self.url_pug.format(plat=plat, roomid=roomid),
                           meta={
                               'plat': plat,
                               'roomid': roomid
                           },
                           callback=self.pug)
             for time in ['', 'm']:
                 yield Request(url=self.url_gift.format(plat=plat,
                                                        roomid=roomid,
                                                        time=time),
                               meta={
                                   'time': time,
                                   'plat': plat,
                                   'roomid': roomid
                               },
                               callback=self.gift)
                 yield Request(self.url_msg.format(plat=plat,
                                                   roomid=roomid,
                                                   time=time),
                               meta={
                                   'time': time,
                                   'plat': plat,
                                   'roomid': roomid
                               },
                               callback=self.msg)
                 # The timeline request is currently disabled:
                 # yield Request(self.url_timeline.format(plat=plat, roomid=roomid, time=time),
                 #               meta={'time': time, 'plat': plat, 'roomid': roomid}, callback=self.timeline)
     finally:
         mysql.close_db()
Пример #5
0
class XhlPipeline(object):
    """Item pipeline that writes scraped items to MySQL via ``Mysql``.

    ``XhlItem`` instances go to ``anchor_test`` and ``otherItem`` instances
    to ``anchor_detail``; any other item is passed through untouched.
    """

    def __init__(self):
        self.mysql = Mysql()

    def close_spider(self, spider):
        """Close the Mysql helper when the spider finishes."""
        self.mysql.close_db()

    def process_item(self, item, spider):
        """Store ``item`` with a ``replace into`` statement and return it."""
        # Pick the destination table from the item type.
        if isinstance(item, XhlItem):
            table = 'anchor_test'
        elif isinstance(item, otherItem):
            table = 'anchor_detail'
        else:
            return item

        statement = 'replace into ' + self.mysql.get_sql_sentence(table, item)
        self.mysql.insert_one(statement)
        return item
Пример #6
0
            roomid = re.search('roomid=(.*?)\"', str(item)).group(1)
            try:
                anchor = item.select('dd div')[0].string.replace('\'', '\\\'')
            except Exception:
                anchor = item.select('dd div')[0].string
            messages2 = {
                'roomid': roomid,
                'plat': plat,
                'ranktype': rankname,
                'ranknum': item.select('dd')[0].string,
                'anchor': anchor,
                'platform': platformname,
                # 'img': item.select('dt img')[0]['src'],
                'rankdetail': item.select('dd')[2].string,
                'type': item.select('dd div')[1].string,
                'ranktime': datetime.now().strftime('%m-%d'),
                'Crawltime': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            }
            sql = 'replace into ' + mysql.get_sql_sentence(
                'rank_history', messages2)
            print(sql)
            mysql.insert_one(sql)
    else:
        parse(url, platformname, plat, mysql)


if __name__ == '__main__':
    # Script entry point: create one Mysql helper and hand it to get_url().
    mysql = Mysql()
    # text(mysql)
    get_url(mysql)
    # NOTE(review): the close call below is commented out, so this block
    # never closes the helper itself -- confirm whether get_url() does.
    # mysql.close_db()
Пример #7
0
class Anchordetail(object):
    """Download detail data for anchor rooms and store it through ``Mysql``.

    ``rich_gift``, ``preferences``, ``price_list`` and ``timeline`` each call
    one ``test_anchor2`` ajax endpoint and save the raw response text in the
    ``anchor_test`` table; ``others`` scrapes the HTML detail page into
    ``anchor_detail``.  ``run`` replays the URLs listed in ``detail.txt``.
    """

    def __init__(self):
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
        }
        # Values sent as the ``t`` query parameter.
        self.times = ['', 'w', 'm']
        self.mysql = Mysql()

    def _fetch(self, url_template, roomid, plat, time):
        """GET ``url_template`` filled in with plat/roomid/time."""
        url = url_template.format(plat=plat, roomid=roomid, time=time)
        return requests.get(url, headers=self.headers)

    def _save_rank(self, roomid, plat, ranktype, time, body):
        """Store one raw ajax response body in ``anchor_test``."""
        message = {
            'roomid': roomid,
            'plat': plat,
            'ranktype': ranktype,
            # The empty period code is stored as 't'.
            'ranktime': 't' if time == '' else time,
            # Escape backslash-u sequences and single quotes for the SQL
            # text built by get_sql_sentence().
            'detail': body.replace('\\u', '\\\\u').replace('\'', '\\\''),
            'Crawltime': datetime.now().strftime('%Y-%m-%d'),
        }
        sql = 'replace into ' + self.mysql.get_sql_sentence(
            'anchor_test', message)
        print(sql)
        self.mysql.insert_one(sql)

    def rich_gift(self, roomid, plat):
        """Fetch and store the top gift givers for every period."""
        for time in self.times:
            response = self._fetch(
                'https://www.xiaohulu.com/test_anchor2/ajax_anchor_price_tyrants?plat_id={plat}&room_id={roomid}&t={time}',
                roomid, plat, time)
            self._save_rank(roomid, plat, '送礼土豪', time, response.text)

    def preferences(self, roomid, plat):
        """Fetch and store the big-spender preferences for every period."""
        for time in self.times:
            response = self._fetch(
                'https://www.xiaohulu.com/test_anchor2/ajax_anchor_source_gname_range?plat_id={plat}&room_id={roomid}&t={time}',
                roomid, plat, time)
            self._save_rank(roomid, plat, '土豪偏好', time, response.text)

    def rich_id(self, roomid, plat, name):
        """Store one (roomid, plat, name) row in ``rich_id``."""
        message1 = {
            'roomid': roomid,
            'plat': plat,
            'name': name,
        }
        sql = 'replace into ' + self.mysql.get_sql_sentence(
            'rich_id', message1)
        self.mysql.insert_one(sql)

    def price_list(self, roomid, plat):
        """Fetch and store the gift activity ranking for every period.

        Every ``{...}`` object found in the response is also recorded in
        ``rich_id`` (from_id, platform_id, from_name) before the raw
        response itself is saved.
        """
        for time in self.times:
            response = self._fetch(
                'https://www.xiaohulu.com/test_anchor2/ajax_anchor_price_list?plat_id={plat}&room_id={roomid}&t={time}',
                roomid, plat, time)
            # Insert a comma before the last character so that every object
            # in the body, including the last one, is followed by ','.
            text = response.text[:-1] + ',' + response.text[-1:]
            for raw in re.findall("({.*?}),", text):
                entry = json.loads(raw)
                self.rich_id(entry['from_id'], entry['platform_id'],
                             entry['from_name'])
            self._save_rank(roomid, plat, '礼物活跃排行榜', time,
                            response.text)

    def timeline(self, roomid, plat):
        """Fetch and store the big-spender active periods for 'w' and 'm'."""
        for time in ['w', 'm']:
            response = self._fetch(
                'https://www.xiaohulu.com/test_anchor2/ajax_anchor_timeline_range?plat_id={plat}&room_id={roomid}&t={time}',
                roomid, plat, time)
            self._save_rank(roomid, plat, '土豪活跃时段', time, response.text)

    def get_table(self, index, response, timestr, data, dict):
        """Extract one ``case <index>:`` section of the page into ``dict``.

        The text between ``case <index>:`` and the next ``obj.str `` is
        searched with the compiled patterns ``timestr`` and ``data``; their
        first groups are joined with ',' and stored under ``case<index>``.
        Raises AttributeError when the section or either pattern is missing.
        """
        key = 'case{index}'.format(index=index)
        text = 'case {index}:(.*?)obj.str '.format(index=index)
        table1 = re.search(text, response.text, re.S).group(1).strip()
        dict[key] = timestr.search(table1).group(1) + ',' + data.search(
            table1).group(1)

    def others(self, roomid, plat):
        """Scrape the HTML detail page and store it in ``anchor_detail``."""
        url = 'http://www.xiaohulu.com/anchor2/details/?plat={plat}&roomid={roomid}'.format(
            plat=plat, roomid=roomid)
        response = requests.get(url, headers=self.headers)
        soup = BeautifulSoup(response.text, 'lxml')

        # Best single-session achievements.
        achieve_list = []
        for achieve in soup.select('.container .a_card_r2'):
            for paragraph in achieve.select('dl dd p'):
                achieve_list.append(paragraph.string.strip())

        # Anchor index ranking.
        indexs = []
        for rankindex in soup.select('.a_card_r1.a_card_r3 .rank_j li i'):
            indexs.append(rankindex.get_text().strip())

        # Latest received gifts: text (direction marks removed) and image URL.
        gift_list = []
        for item in soup.select('.cb_left5_r dl dd'):
            if item.get_text().strip() != '':
                gift_list.append(item.get_text().strip().replace(
                    '\u200e', '').replace('\u202d', ''))
            if item.select('img'):
                gift_list.append(item.select('img')[0]['src'])

        # Chart data embedded in the page script: case 0 and cases 2..16.
        tables = {}
        timestr = re.compile('obj.timestr =(.*?);')
        data = re.compile('obj.data =(.*?);')
        self.get_table(0, response, timestr, data, tables)
        for i in range(2, 17):
            self.get_table(i, response, timestr, data, tables)

        # Anchor personal information; the last <li> is skipped.
        anchor_list = []
        for item in soup.select('.a_card_left ul li')[:-1]:
            if item.get_text().strip() != '':
                anchor_list.append(item.get_text().strip())
            if item.select('img'):
                anchor_list.append(item.select('img')[0]['src'])

        message = {
            'roomid': roomid,
            'plat': plat,
            'anchor': str(anchor_list).replace('\'', '\\\''),
            'achieve': str(achieve_list).replace('\'', '\\\''),
            'rankindex': str(indexs).replace('\'', '\\\''),
            'crawltime': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'gift': str(gift_list).replace('\'', '\\\''),
            'table': str(tables).replace('\'', '\\\''),
        }
        sql = 'replace into ' + self.mysql.get_sql_sentence(
            'anchor_detail', message)
        print(sql)
        self.mysql.insert_one(sql)

    def main(self, roomid, plat):
        """Run every scraper for one room."""
        self.rich_gift(roomid, plat)
        self.preferences(roomid, plat)
        self.price_list(roomid, plat)
        self.timeline(roomid, plat)
        self.others(roomid, plat)

    def get(self):
        """Replay every URL listed in ``detail.txt``.

        The file is read completely, then rewritten so that it contains only
        the lines that failed again (one per line).  Blank lines are
        dropped.  URL parsing happens inside the ``try`` so a line that
        cannot be parsed is kept in the file instead of aborting the run
        after the file has already been truncated.
        """
        with open('detail.txt', 'r') as f:
            items = f.readlines()
        with open('detail.txt', 'w') as f_w:
            for raw in items:
                # readlines() keeps the trailing newline; strip it so the
                # rewrite below does not add a blank line after each entry.
                item = raw.strip()
                if not item:
                    continue
                try:
                    plat = re.search(r'\?plat=(\d+)&', item).group(1)
                    roomid = re.search(r'&roomid=(\d+)', item).group(1)
                    self.main(roomid, plat)
                except Exception:
                    # Best effort: keep the line for the next run.
                    f_w.write(item + '\n')

    def run(self):
        """Replay ``detail.txt`` and always close the Mysql helper."""
        try:
            self.get()
        finally:
            self.mysql.close_db()
Пример #8
0
class XiohuluSpider(scrapy.Spider):
    """Spider that collects per-anchor detail data from www.xiaohulu.com.

    ``start_requests`` reads (plat, roomid) pairs from ``rank_history`` and
    requests the detail page plus three ajax endpoints for each pair.  The
    callbacks yield ``XhlItem`` / ``otherItem`` objects, which are handled by
    the ``xhl.pipelines.XhlPipeline`` configured in ``custom_settings``.
    """

    name = 'xhl_detail'
    # Must match the host used by the URL templates below; the previous
    # spelling ('xioahulu') did not match any requested URL.
    allowed_domains = ['www.xiaohulu.com']
    start_urls = ['http://www.xiaohulu.com/']

    url_anchor_price = 'https://www.xiaohulu.com/test_anchor2/ajax_anchor_price_tyrants?plat_id={plat}&room_id={roomid}&t={time}'
    url_source_gname = 'https://www.xiaohulu.com/test_anchor2/ajax_anchor_source_gname_range?plat_id={plat}&room_id={roomid}&t={time}'
    url_price_list = 'https://www.xiaohulu.com/test_anchor2/ajax_anchor_price_list?plat_id={plat}&room_id={roomid}&t={time}'
    url_anchor_timeline = 'https://www.xiaohulu.com/test_anchor2/ajax_anchor_timeline_range?plat_id={plat}&room_id={roomid}&t={time}'
    url_anchor_other = 'http://www.xiaohulu.com/anchor2/details/?plat={plat}&roomid={roomid}'

    custom_settings = {
        'CONCURRENT_REQUESTS': 150,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 50,
        # 'LOG_LEVEL' : 'INFO'
        # 'DOWNLOAD_TIMEOUT': 30,
        # 'DOWNLOADER_MIDDLEWARES': {'xhl.middlewares.MyproxisSpiderMidleware': 125, },
        'ITEM_PIPELINES': {
            'xhl.pipelines.XhlPipeline': 300,
        },
    }

    # Shared helper used by rich_id(); created when the class is defined.
    mysql = Mysql()

    def start_requests(self):
        """Yield the initial requests for every (plat, roomid) pair.

        Uses its own Mysql helper, closed in ``finally`` so it is released
        even when the generator is closed early or raises.
        """
        mysql = Mysql()
        # NOTE(review): `plat and roomid is not NULL` is parsed by MySQL as
        # `plat AND (roomid IS NOT NULL)`; kept unchanged -- confirm intent.
        sql = 'SELECT DISTINCT plat,roomid FROM rank_history WHERE plat and roomid is not NULL AND ranktype != \'水军榜\' AND ranktype != \'土豪榜\''
        try:
            for item in mysql.get_two(sql):
                plat = item[0]
                roomid = item[1]
                yield Request(self.url_anchor_other.format(plat=plat,
                                                           roomid=roomid),
                              meta={
                                  'plat': plat,
                                  'roomid': roomid,
                                  'num': 2
                              },
                              callback=self.others)
                # Only the empty period is requested here.
                time = ''
                yield Request(url=self.url_anchor_price.format(plat=plat,
                                                               roomid=roomid,
                                                               time=time),
                              meta={
                                  'time': time,
                                  'plat': plat,
                                  'roomid': roomid
                              },
                              callback=self.rich_gift)
                yield Request(self.url_source_gname.format(plat=plat,
                                                           roomid=roomid,
                                                           time=time),
                              meta={
                                  'time': time,
                                  'plat': plat,
                                  'roomid': roomid
                              },
                              callback=self.preferences)
                yield Request(self.url_price_list.format(plat=plat,
                                                         roomid=roomid,
                                                         time=time),
                              meta={
                                  'time': time,
                                  'plat': plat,
                                  'roomid': roomid,
                                  'num': 1
                              },
                              callback=self.price_list)
                # The timeline requests are currently disabled:
                # for time in ['w', 'm']:
                #     yield Request(self.url_anchor_timeline.format(plat=plat, roomid=roomid, time=time),meta={'time': time,'plat':plat,'roomid':roomid},callback=self.timeline)
        finally:
            mysql.close_db()

    def rich_id(self, list):
        """Bulk-store (roomid, plat, name) tuples in ``rich_id``."""
        sql = "replace into rich_id(roomid,plat,name)values(%s,%s,%s)"
        self.mysql.insert_sql_many(sql, list)

    def _rank_item(self, response, ranktype):
        """Build the XhlItem holding the raw body of one ajax response."""
        period = response.meta['time']
        item = XhlItem()
        item['roomid'] = response.meta['roomid']
        item['plat'] = response.meta['plat']
        item['ranktype'] = ranktype
        # The empty period code is stored as 't'.
        item['ranktime'] = 't' if period == '' else period
        item['detail'] = response.text.replace('\\u', '\\\\u').replace(
            '\'', '\\\'')
        item['Crawltime'] = datetime.now().strftime('%Y-%m-%d')
        return item

    def price_list(self, response):
        """Handle the gift activity ranking response.

        Every ``{...}`` object in the body is recorded in ``rich_id``
        (from_id, platform_id, from_name), then the raw body is yielded as
        an item.

        NOTE(review): the earlier version had a retry branch guarded by
        ``response == []``.  A response object never equals a list, so that
        branch never ran and both remaining branches were identical; they
        are merged here.  If a retry on an empty body is wanted it has to be
        keyed on the body text -- confirm what an empty answer looks like.
        """
        # Insert a comma before the last character so that every object in
        # the body, including the last one, is followed by ','.
        text = response.text[:-1] + ',' + response.text[-1:]
        list_price = []
        for raw in re.findall("({.*?}),", text):
            entry = json.loads(raw)
            list_price.append(
                (entry['from_id'], entry['platform_id'], entry['from_name']))
        if list_price:
            self.rich_id(list_price)
        yield self._rank_item(response, '礼物活跃排行榜')

    def rich_gift(self, response):
        """Yield the top gift givers response as an item."""
        yield self._rank_item(response, '送礼土豪')

    def preferences(self, response):
        """Yield the big-spender preferences response as an item."""
        yield self._rank_item(response, '土豪偏好')

    def timeline(self, response):
        """Yield the big-spender active periods response as an item."""
        yield self._rank_item(response, '土豪活跃时段')

    def get_table(self, index, response, timestr, data, dict):
        """Extract one ``case <index>:`` section of the page into ``dict``.

        The first groups of ``timestr`` and ``data`` (double quotes removed)
        are joined with ',' and stored under ``case<index>``.  When the
        section or a pattern is missing, the URL and index are appended to
        ``detail.txt`` and an empty string is stored instead.
        """
        key = 'case{index}'.format(index=index)
        text = 'case {index}:(.*?)obj.str '.format(index=index)
        try:
            table1 = re.search(text, response.text, re.S).group(1).strip()
            dict[key] = timestr.search(table1).group(1).strip().replace(
                '\"', '') + ',' + data.search(table1).group(1).strip().replace(
                    '\"', '')
        except AttributeError:
            # A failed search returns None, so .group() raises here.
            with open('detail.txt', 'a+') as f:
                f.write(response.url + '\n')
                # NOTE(review): no newline follows the index, so it ends up
                # prefixed to the next line written to this file.
                f.write(str(index))
            dict[key] = ''

    def others(self, response):
        """Scrape the HTML detail page into an ``otherItem``.

        When no anchor information is found on the page the URL is appended
        to ``detail.txt`` and nothing is yielded.
        """
        soup = BeautifulSoup(response.text, 'lxml')

        # Best single-session achievements.
        achieve_list = []
        for achieve in soup.select('.container .a_card_r2'):
            for paragraph in achieve.select('dl dd p'):
                achieve_list.append(paragraph.string.strip())

        # Anchor index ranking.
        indexs = []
        for rankindex in soup.select('.a_card_r1.a_card_r3 .rank_j li i'):
            indexs.append(rankindex.get_text().strip())

        # Latest received gifts: text (direction marks removed) and image URL.
        gift_list = []
        for entry in soup.select('.cb_left5_r dl dd'):
            if entry.get_text().strip() != '':
                gift_list.append(entry.get_text().strip().replace(
                    '\u200e', '').replace('\u202d', ''))
            if entry.select('img'):
                gift_list.append(entry.select('img')[0]['src'])

        # Chart data embedded in the page script, cases 3..16.
        tables = {}
        timestr = re.compile('obj.timestr =(.*?);')
        data = re.compile('obj.data =(.*?);')
        for i in range(3, 17):
            self.get_table(i, response, timestr, data, tables)

        # Anchor personal information; the last <li> is skipped.
        anchor_list = []
        for entry in soup.select('.a_card_left ul li')[:-1]:
            if entry.get_text().strip() != '':
                anchor_list.append(entry.get_text().strip())
            if entry.select('img'):
                anchor_list.append(entry.select('img')[0]['src'])

        # Check the list itself: the stored field is its string form, which
        # never compares equal to [] and so hid the empty case.
        if not anchor_list:
            with open('detail.txt', 'a+') as f:
                f.write(response.url + '\n')
            return

        item = otherItem()
        item['roomid'] = response.meta['roomid']
        item['plat'] = response.meta['plat']
        item['anchor'] = str(anchor_list).replace('\'', '\\\'')
        item['achieve'] = str(achieve_list).replace('\'', '\\\'')
        item['rankindex'] = str(indexs).replace('\'', '\\\'')
        item['Crawltime'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        item['gift'] = str(gift_list).replace('\'', '\\\'')
        item['table'] = json.dumps(tables)
        yield item
Пример #9
0
class RankSpider(scrapy.Spider):
    """Spider that stores the anchor and spectator rankings of xiaohulu.com.

    ``start_requests`` discovers the platforms and three date options, then
    requests the anchor ranking (callback ``parse``) and the spectator
    ranking (callback ``rich``) for each combination.  Rows are written to
    ``rank_history``; spenders of the '土豪榜' ranking also go to ``rich_id``.
    """

    name = 'xhl_rank'
    allowed_domains = ['www.xiaohulu.com']
    start_urls = ['http://www.xiaohulu.com/']
    # Shared helper used by the callbacks; created when the class is defined.
    mysql = Mysql()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
    }

    def stringToDict(self):
        """Convert the cookie string copied from the browser into a dict.

        Each ``name=value`` pair is split on the first '=' only, so values
        that themselves contain '=' are kept whole; empty segments (for
        example after a trailing ';') are skipped.

        :return: dict mapping cookie names to values, usable by Scrapy
        """
        # NOTE(review): a session cookie is hard-coded here; it should be
        # moved to configuration and will stop working once it expires.
        cookie = 'PHPSESSID=k9av555ng9ntqcrdguveq3nm22; Hm_lvt_2772005b8bc0b193d080228322981977=1528770876; Hm_lvt_1c358b33dfa30c89dd3a1927a5921793=1528770876; xhl_cok=d8e0Bl3Mda5qkQtjGYQexSBBP3o5Ewa3OZ%2BKzz5FQG%2FEqWjqYBgWRcoxnPoRt%2B23PhD6nyF6%2BVqkErfZSQ; 6N3e_f2ec___XHLTXZ__www=0f594v2V4vbeCABmpyV%2F1ZUmdYs5EtNt6b35BRSZSYMeO9xIzNViHBl%2FHVuiMI3MWguA8zYu%2FiCKTs%2F4e4W4IfJK2%2BPpU0uLGy378xDb1Q; Hm_lpvt_1c358b33dfa30c89dd3a1927a5921793=1528875351; Hm_lpvt_2772005b8bc0b193d080228322981977=1528875351'

        itemDict = {}
        for pair in cookie.split(';'):
            key, _, value = pair.partition('=')
            key = key.replace(' ', '')
            if not key:
                continue
            itemDict[key] = value
        return itemDict

    def start_requests(self):
        """Yield the ranking requests for every platform and chosen date.

        The platform list and each platform's date list are fetched
        synchronously with ``requests``.  Options 1, 9 and 15 of the date
        selector are used.
        """
        url = 'http://www.xiaohulu.com/Anchor/index.html?plat=1&class=all&day=1440m'
        url_set = 'http://www.xiaohulu.com/Anchor/index.html?plat={id}&class=all&day='
        url_rich = 'http://www.xiaohulu.com/Spectator/index.html?plat={id}&day='
        response = requests.get(url, headers=self.headers)
        soup = BeautifulSoup(response.text, 'lxml')
        # The second <select> holds the platforms; its first option is skipped.
        platforms = soup.select('.container ul.panktip select')[1]
        for platform in platforms.select('option')[1:]:
            platformname = platform.string.strip()
            plat = re.search(r'plat=(\d+)&', platform['value']).group(1)
            url_plat = url_set.format(id=plat)
            url_richplat = url_rich.format(id=plat)
            response1 = requests.get(url_plat, headers=self.headers)
            plat_soup = BeautifulSoup(response1.text, 'lxml')
            # The first <select> holds the date options.
            options = plat_soup.select(
                '.container ul.panktip select')[0].select('option')
            datelist = [options[1], options[9], options[15]]
            for date in datelist:
                url_time = url_plat + date['value']
                url_rich1 = url_richplat + date['value']
                yield scrapy.Request(url=url_time, meta={'date': date.string, 'plat': plat}, callback=self.parse)
                yield scrapy.Request(url=url_rich1, meta={'date': date.string, 'plat': plat, 'name': platformname},
                                     callback=self.rich, headers=self.headers, cookies=self.stringToDict())

    def _log_error(self, response, error):
        """Append the failing URL and the error text to ``error.txt``."""
        with open('error.txt', 'a+') as f:
            try:
                f.write(response.url + '\n')
                f.write('error: %s \n' % error)
            except Exception:
                # Fall back to the URL alone when the message can't be written.
                f.write(response.url + '\n')

    def parse(self, response):
        """Store every anchor ranking table of the page in ``rank_history``.

        Each ranking table is inserted as its own batch.  Any error while
        parsing is logged to ``error.txt`` and ends processing of the page.
        """
        sql = "replace into rank_history(roomid,plat,ranktype,ranknum,anchor,platform,type,img,rankdetail,ranktime,Crawltime)values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        try:
            soup = BeautifulSoup(response.text, 'lxml')
            for item in soup.find_all(attrs={'class': 'w560'}):
                print(item.find(attrs={'class': 'h_list'}).string)
                titles = item.select('.stitle ul li')
                tables = item.select('.mt20.lmmain div.svtable')
                # Tab 0 is skipped; tab i is paired with table i.
                for i in range(1, len(titles)):
                    rankname = titles[i].string
                    ranks = tables[i]
                    ids = ranks.select('a')
                    contents = ranks.select('tr')
                    # Rows of this table only, so earlier tables are not
                    # sent to the database again with every batch.
                    batch = []
                    for n, link in enumerate(ids):
                        roomid = re.search('roomid=(.*?)&', str(link)).group(1)
                        content = contents[n]
                        batch.append((
                            roomid,
                            response.meta['plat'],
                            rankname,
                            content.select('span.num')[0].string,
                            re.search('</i>(.*)</h4>', str(content.select('dd h4')[0]), re.S).group(1),
                            content.select('dd h4 i')[0].string,
                            re.match(r'(.*?)\s', content.select('dd p')[0].get_text()).group(1),
                            content.select('dt img')[0]['src'],
                            content.select('td.hsnum')[0].string.strip(),
                            response.meta['date'],
                            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                        ))
                    print(sql)
                    if batch:
                        self.mysql.insert_sql_many(sql, batch)
        except Exception as e:
            self._log_error(response, e)

    def rich_id(self, list):
        """Bulk-store (roomid, plat, name) tuples in ``rich_id``."""
        sql = "replace into rich_id(roomid,plat,name)values(%s,%s,%s)"
        self.mysql.insert_sql_many(sql, list)

    def rich(self, response):
        """Store every spectator ranking table of the page.

        All rows go to ``rank_history``; rows of the '土豪榜' ranking are
        also recorded in ``rich_id``.  Each table is inserted as its own
        batch.  Any error is logged to ``error.txt`` and ends processing of
        the page.
        """
        sql = "replace into rank_history(roomid,plat,ranktype,ranknum,anchor,platform,img,rankdetail,ranktime,Crawltime)values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        try:
            soup = BeautifulSoup(response.text, 'lxml')
            for item in soup.find_all(attrs={'class': 'w560'}):
                rankname = item.select('.stitle h3')[0].string
                ranks = item.select('.mt20 div.svtable')[0]
                contents = ranks.select('tr')
                ids = ranks.select('a')
                # Rows of this table only, so earlier tables are not sent
                # to the database again with every batch.
                list_rank = []
                list_price = []
                for n, link in enumerate(ids):
                    roomid = re.search('fromid=(.*?)\"', str(link)).group(1)
                    content = contents[n]
                    anchor = content.select('dd h4')[0].string
                    if anchor is not None:
                        # NOTE(review): quotes are escaped by hand although
                        # the insert below is parameterised, so the
                        # backslash is stored as data; kept for consistency
                        # with rows already stored -- confirm.
                        anchor = anchor.replace('\'', '\\\'')
                    plat = response.meta['plat']
                    if rankname == '土豪榜':
                        list_price.append((roomid, plat, anchor))
                    list_rank.append((
                        roomid,
                        plat,
                        rankname,
                        content.select('span.num')[0].string,
                        anchor,
                        response.meta['name'],
                        content.select('dt img')[0]['src'],
                        content.select('td .hsnum')[0].string.strip(),
                        response.meta['date'],
                        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    ))
                print(sql)
                if list_price:
                    self.rich_id(list_price)
                if list_rank:
                    self.mysql.insert_sql_many(sql, list_rank)
        except Exception as e:
            self._log_error(response, e)
Пример #10
0
 def __init__(self):
     """Create a Mysql helper and keep it on ``self.mysql``."""
     self.mysql = Mysql()
Пример #11
0
class UserSpider(scrapy.Spider):
    """Crawl per-user statistics from xiaohulu.com and store them in MySQL.

    ``start_requests`` reads every row of the ``rich_id`` table and schedules
    requests against the AJAX endpoints declared below. Each callback stores
    the raw response body as one row of the ``user_detail`` table (or
    ``user_pug`` for ``pug_detail``) through the project's ``Mysql`` helper.

    NOTE(review): every statement is assembled by ``Mysql.get_sql_sentence``
    from response text that is only escaped by hand (see
    ``_build_detail_row``). If the helper offers a parameterized insert, that
    would be the safer choice -- confirm against the ``Mysql`` class.
    """

    name = 'xhl_user'
    allowed_domains = ['www.xiaohulu.com']
    start_urls = ['http://www.xiaohulu.com/']

    # AJAX endpoint templates. ``{plat}`` / ``{roomid}`` come from ``rich_id``
    # rows; ``{time}`` is the period code ('' or 'm') chosen in
    # ``start_requests``.
    url_gift = 'http://www.xiaohulu.com/test_spectator2/ajax_spec_price_range?t={time}&plat_id={plat}&from_id={roomid}'
    url_msg = 'https://www.xiaohulu.com/spectator2/ajax_spec_msg_range?t={time}&plat_id={plat}&from_id={roomid}'
    url_timeline = 'https://www.xiaohulu.com/spectator2/ajax_spec_price_period_range?t={time}&plat_id={plat}&from_id={roomid}'
    url_prefer = 'https://www.xiaohulu.com/spectator2/ajax_spec_source_gname_range?plat_id={plat}&from_id={roomid}'
    url_pug = 'https://www.xiaohulu.com/spectator2/ajax_spec_history?plat_id={plat}&from_id={roomid}'

    # Helper shared by all callbacks; it is created when the class body runs.
    mysql = Mysql()
    custom_settings = {
        'CONCURRENT_REQUESTS': 60,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 30,
        # 'DOWNLOADER_MIDDLEWARES': {'xhl.middlewares.MyproxisSpiderMidleware': 125, }
        # 'LOG_LEVEL' : 'INFO'
        # 'ITEM_PIPELINES': {'xhl.pipelines.XhlPipeline': 300, },
    }

    def start_requests(self):
        """Yield the initial requests for every row of the ``rich_id`` table.

        For each row (``item[0]`` is used as the room id, ``item[1]`` as the
        platform id) the generator yields, in this order: one preference
        request, one history request, and then a gift request followed by a
        message request for each period code in ``('', 'm')``.

        The query runs on a dedicated ``Mysql`` instance which is closed in a
        ``finally`` clause, so it is released even when the generator is
        closed, or fails, before being exhausted.
        """
        # (URL template, callback) pairs requested once per row.
        per_user = (
            (self.url_prefer, self.preferences),
            (self.url_pug, self.pug),
        )
        # (URL template, callback) pairs requested once per row and period.
        # ``url_timeline`` / ``self.timeline`` is intentionally not listed:
        # that request was already disabled in this spider.
        per_period = (
            (self.url_gift, self.gift),
            (self.url_msg, self.msg),
        )

        mysql = Mysql()
        sql = 'SELECT * FROM `rich_id`'
        try:
            for item in mysql.get_two(sql):
                plat = item[1]
                roomid = item[0]
                for template, callback in per_user:
                    yield Request(template.format(plat=plat, roomid=roomid),
                                  meta={
                                      'plat': plat,
                                      'roomid': roomid
                                  },
                                  callback=callback)
                for period in ('', 'm'):
                    for template, callback in per_period:
                        yield Request(url=template.format(plat=plat,
                                                          roomid=roomid,
                                                          time=period),
                                      meta={
                                          'time': period,
                                          'plat': plat,
                                          'roomid': roomid
                                      },
                                      callback=callback)
        finally:
            mysql.close_db()

    def parse(self, response):
        """Default callback: print the response body."""
        print(response.text)

    @staticmethod
    def _period_label(response):
        """Return the period code to store for ``response``.

        The empty request period is stored as ``'t'``; any other value of
        ``response.meta['time']`` is stored unchanged.
        """
        period = response.meta['time']
        return 't' if period == '' else period

    @staticmethod
    def _build_detail_row(response, ranktype, ranktime, escape_unicode=True):
        """Build the column dict for one ``user_detail`` row.

        Args:
            response: response whose ``meta`` holds ``roomid`` and ``plat``
                and whose ``text`` becomes the ``detail`` column.
            ranktype: value of the ``ranktype`` column.
            ranktime: value of the ``ranktime`` column.
            escape_unicode: when true, the backslash of every backslash-u
                sequence in the body is doubled before storing.

        Returns:
            dict with the keys ``fromid``, ``plat``, ``ranktype``,
            ``ranktime``, ``detail`` and ``Crawltime`` (today's date), in
            that order.
        """
        fromid = response.meta['roomid']
        plat = response.meta['plat']
        detail = response.text
        if escape_unicode:
            detail = detail.replace('\\u', '\\\\u')
        # Single quotes are backslash-escaped; presumably so the text survives
        # being embedded in the statement built by get_sql_sentence -- confirm.
        detail = detail.replace('\'', '\\\'')
        return {
            'fromid': fromid,
            'plat': plat,
            'ranktype': ranktype,
            'ranktime': ranktime,
            'detail': detail,
            'Crawltime': datetime.now().strftime('%Y-%m-%d'),
        }

    def _replace_detail(self, message):
        """Print and execute a ``replace into user_detail`` for ``message``."""
        sql = 'replace into ' + self.mysql.get_sql_sentence(
            'user_detail', message)
        print(sql)
        self.mysql.insert_one(sql)

    def gift(self, response):  # gift value trend
        """Store the gift-value response body in ``user_detail``.

        Any exception is appended to ``user.txt`` (request URL plus error
        text) instead of being propagated. The log is written as UTF-8 so
        that non-ASCII error text cannot itself fail to encode.
        """
        try:
            message = self._build_detail_row(response, '礼物价值趋势',
                                             self._period_label(response))
            self._replace_detail(message)
        except Exception as e:
            with open('user.txt', 'a+', encoding='utf-8') as f:
                f.write(response.url + '\n')
                f.write('error: %s \n' % e)

    def msg(self, response):  # bullet-comment count trend
        """Store the message-count response body in ``user_detail``."""
        message = self._build_detail_row(response, '弹幕数量趋势',
                                         self._period_label(response))
        self._replace_detail(message)

    def timeline(self, response):  # gift time-of-day distribution
        """Store the gift-period response body in ``user_detail``."""
        message = self._build_detail_row(response, '送礼时段分布',
                                         self._period_label(response))
        self._replace_detail(message)

    def preferences(self, response):  # big-spender preferences
        """Store the preference response body in ``user_detail``.

        The row is always stored with period code ``'m'``.
        """
        message = self._build_detail_row(response, '土豪偏好', 'm')
        self._replace_detail(message)

    def pug_detail(mysql, response, plat, fromid):
        """Fetch and store the per-date history list for one user.

        Every ``{...}`` fragment found in ``response.text`` is parsed as JSON.
        For each fragment the history-list endpoint is fetched for its
        ``date`` value with a blocking ``requests.get`` call, and the body is
        written to the ``user_pug`` table through ``mysql``.

        NOTE(review): the first parameter is ``mysql`` rather than ``self``
        and there is no ``@staticmethod`` decorator, so an instance call would
        bind the spider itself to ``mysql`` -- confirm the intended call
        style. ``headers`` is not defined in this class; presumably it is a
        module-level name -- verify. The request has no timeout.
        """
        url1 = 'http://www.xiaohulu.com/test_spectator2/ajax_spec_history_list?plat_id={id}&from_id={fromid}&date={date}&p=1'
        for fragment in re.findall("({.*?})", response.text):
            info = json.loads(fragment)
            detail_url = url1.format(id=plat, fromid=fromid, date=info['date'])
            detail_response = requests.get(detail_url, headers=headers)
            escaped = detail_response.text.replace('\\u', '\\\\u')
            escaped = escaped.replace('\'', '\\\'')
            message = {
                'fromid': fromid,
                'plat': plat,
                'ranktime': info['date'],
                'detail': escaped,
                'Crawltime': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            }
            sql = 'replace into ' + mysql.get_sql_sentence('user_pug', message)
            print(sql)
            mysql.insert_one(sql)

    def pug(self, response):  # big-spender footprints
        """Store the history response body in ``user_detail``.

        Unlike the other callbacks, only single quotes are escaped here
        (backslash-u sequences are left as they are), and the row dict is
        printed before the statement is built. The period code is always
        ``'m'``.
        """
        message = self._build_detail_row(response, '土豪足迹', 'm',
                                         escape_unicode=False)
        print(message)
        self._replace_detail(message)