def parse_line_info(self, response):
    """Parse a league schedule page and yield one item per listed match.

    Three page layouts are distinguished by the elements they contain:

    * ``ul#match_group`` present: for every round listed on the page the
      matches are downloaded from the site's ``getmatch`` endpoint (JSON)
      and converted into ``SeasonItem`` objects here.
    * ``div#season_match_list`` present: every row of
      ``tbody#match_list_tbody`` is handed to ``buildSeasonItem``.
    * otherwise: every table row inside each ``div.lmb3`` is handed to
      ``buildSeasonItem``.

    Args:
        response: The downloaded page; ``xpath`` and ``url`` are used.

    Yields:
        The populated season items, one per match.
    """
    if response.xpath('//ul[@id="match_group"]'):
        # Iterate over the schedule (JSON format).
        year = response.xpath(
            '//span[@class="ldrop_tit_txt"]/text()').extract_first()[:-2]
        # When the label contains "/", only the part before it is kept.
        if '/' in year:
            year = year.split('/')[0]
        year += '-01-01'
        for round_node in response.xpath(
                '//ul[@class="lsaiguo_round_list clearfix"]/li'):
            url = 'http://liansai.500.com/index.php?c=score&a=getmatch&stid=%s&round=%s' % (
                str(response.url).split("-")[2][:-1],
                round_node.xpath('a/@data-group').extract_first())
            # NOTE(review): this is a synchronous download inside the
            # callback. The ``with`` block guarantees the HTTP response is
            # closed once the body has been read.
            with urllib.request.urlopen(
                    urllib.request.Request(url)) as reply:
                payload = reply.read()
            # The raw body is turned into text through BeautifulSoup before
            # being parsed as JSON, exactly as the original code did.
            matches = json.loads(str(BeautifulSoup(payload, "html.parser")))
            for t in matches:
                season = SeasonItem()
                season['league_name'] = response.xpath(
                    '//ul[@class="lpage_race_nav clearfix"]/li[1]/a/text()'
                ).extract_first()[:-2]
                season['type_name'] = response.xpath(
                    '//div[@id="match_stage"]/a[@class="ltab_btn on"]/text()'
                ).extract_first()
                season['sub_type_name'] = ''
                season['game_week'] = t['round']
                season['start_time'] = t['stime']
                season['team_a'] = t['hname']
                season['team_b'] = t['gname']
                # Scores are only copied for status '5'; otherwise 0:0.
                if t['status'] == '5':
                    season['score_a'] = t['hscore']
                    season['score_b'] = t['gscore']
                else:
                    season['score_a'] = 0
                    season['score_b'] = 0
                # Translate the site's status code into the stored id.
                status_id = 1
                if t['status'] in ['-1', '2', '4', '6', '7', '11']:
                    status_id = 6
                if t['status'] == '5':
                    status_id = 4
                season['status'] = status_id
                season['fid'] = t['fid']
                season['year'] = year
                yield season
    elif response.xpath('//div[@id="season_match_list"]'):
        for row in response.xpath('//tbody[@id="match_list_tbody"]/tr'):
            # buildSeasonItem is a generator: forward its items rather than
            # yielding the generator object itself.
            yield from self.buildSeasonItem(response, row)
    else:
        for section in response.xpath('//div[@class="lmb3"]'):
            for row in section.xpath('table/tbody/tr'):
                yield from self.buildSeasonItem(response, row)
def parse_line_info2(self, response):
    """Yield season items for a schedule page rendered as an HTML table.

    When the page contains ``ul#match_group`` the rows of
    ``tbody#match_list_tbody`` are converted into ``SeasonItem`` objects
    directly. Otherwise the rows (from ``tbody#match_list_tbody`` when
    ``div#season_match_list`` exists, else from the tables inside each
    ``div.lmb3``) are passed to ``buildSeasonItem``.

    Args:
        response: The downloaded page; only ``xpath`` is used.

    Yields:
        The populated season items, one per table row.
    """
    if response.xpath('//ul[@id="match_group"]'):
        label = response.xpath(
            '//span[@class="ldrop_tit_txt"]/text()').extract_first()[:-2]
        # Only the part before "/" is kept when the label contains one.
        year = (label.split('/')[0] if '/' in label else label) + '-01-01'

        # Site status codes that map to a stored id other than the default 1.
        status_ids = dict.fromkeys(['-1', '2', '4', '6', '7', '11'], 6)
        status_ids['5'] = 4

        # Iterate over the schedule (non-JSON format).
        for row in response.xpath('//tbody[@id="match_list_tbody"]/tr'):
            season = SeasonItem()
            season['league_name'] = response.xpath(
                '//ul[@class="lpage_race_nav clearfix"]/li[1]/a/text()'
            ).extract_first()[:-2]
            season['type_name'] = response.xpath(
                '//div[@id="match_stage"]/a[@class="ltab_btn on"]/text()'
            ).extract_first()
            season['sub_type_name'] = ''
            season['game_week'] = ''
            season['start_time'] = t_time = row.xpath(
                'td[@class="td_time"]/text()').extract_first()
            del t_time
            season['team_a'] = row.xpath(
                'td[@class="td_lteam"]/a/@title').extract_first()
            season['team_b'] = row.xpath(
                'td[@class="td_rteam"]/a/@title').extract_first()
            status = row.xpath('@data-status').extract_first()
            # Scores are only read from the row for status '5'.
            has_score = status == '5'
            season['score_a'] = row.xpath(
                'td[3]/span[1]/text()').extract_first() if has_score else 0
            season['score_b'] = row.xpath(
                'td[3]/span[2]/text()').extract_first() if has_score else 0
            season['status'] = status_ids.get(status, 1)
            season['fid'] = row.xpath('@data-fid').extract_first()
            season['year'] = year
            yield season
        return

    if response.xpath('//div[@id="season_match_list"]'):
        rows = response.xpath('//tbody[@id="match_list_tbody"]/tr')
        for row in rows:
            yield from self.buildSeasonItem(response, row)
        return

    for section in response.xpath('//div[@class="lmb3"]'):
        for row in section.xpath('table/tbody/tr'):
            yield from self.buildSeasonItem(response, row)
def buildSeasonItem(self, response, s):
    """Build the season item for one match row and yield it.

    Page-level values (year label, league name, stage name) are read from
    ``response``; match-level values are read from the row selector ``s``.

    Args:
        response: The page the row belongs to; only ``xpath`` is used.
        s: Selector of the match row.

    Yields:
        Exactly one populated ``SeasonItem``.
    """
    label = response.xpath(
        '//span[@class="ldrop_tit_txt"]/text()').extract_first()[:-2]
    # Only the part before "/" is kept when the label contains one.
    if '/' in label:
        label = label.split('/')[0]
    year = label + '-01-01'

    season = SeasonItem()
    league = response.xpath(
        '//ul[@class="lpage_race_nav clearfix"]/li[1]/a/text()'
    ).extract_first()
    season['league_name'] = league[:-2]
    season['type_name'] = response.xpath(
        '//div[@id="match_stage"]/a[@class="ltab_btn on"]/text()'
    ).extract_first()
    # Fall back to an empty string when the row has no <h4> text.
    season['sub_type_name'] = s.xpath('h4/text()').extract_first() or ''
    season['game_week'] = ''
    season['start_time'] = s.xpath(
        'td[@class="td_time"]/text()').extract_first()
    season['team_a'] = s.xpath(
        'td[@class="td_lteam"]/a/@title').extract_first()
    season['team_b'] = s.xpath(
        'td[@class="td_rteam"]/a/@title').extract_first()

    status = s.xpath('@data-status').extract_first()
    if status != '5':
        season['score_a'] = 0
        season['score_b'] = 0
    else:
        # Scores are only read from the row for status '5'.
        season['score_a'] = s.xpath('td[3]/span[1]/text()').extract_first()
        season['score_b'] = s.xpath('td[3]/span[2]/text()').extract_first()

    # Translate the site's status code into the stored id (default 1).
    if status == '5':
        season['status'] = 4
    elif status in ('-1', '2', '4', '6', '7', '11'):
        season['status'] = 6
    else:
        season['status'] = 1
    season['fid'] = s.xpath('@data-fid').extract_first()
    season['year'] = year
    yield season
def parse_line_info2(self, response):
    """Yield one season item per row of ``tbody#match_list_tbody``.

    Args:
        response: The downloaded page; only ``xpath`` is used.

    Yields:
        The populated ``SeasonItem`` objects (no ``year`` field is set).
    """
    # Site status codes that map to a stored id other than the default 1.
    status_ids = {'1': 5, '5': 4}
    for code in ('-1', '2', '4', '6', '7', '11'):
        status_ids[code] = 6

    # Iterate over the schedule (non-JSON format).
    for row in response.xpath('//tbody[@id="match_list_tbody"]/tr'):
        season = SeasonItem()
        league = response.xpath(
            '//ul[@class="lpage_race_nav clearfix"]/li[1]/a/text()'
        ).extract_first()
        season['league_name'] = league[:-2]
        season['type_name'] = response.xpath(
            '//div[@id="match_stage"]/a[@class="ltab_btn on"]/text()'
        ).extract_first()
        season['sub_type_name'] = ''
        season['game_week'] = ''
        season['start_time'] = row.xpath(
            'td[@class="td_time"]/text()').extract_first()
        season['team_a'] = row.xpath(
            'td[@class="td_lteam"]/a/@title').extract_first()
        season['team_b'] = row.xpath(
            'td[@class="td_rteam"]/a/@title').extract_first()

        status = row.xpath('@data-status').extract_first()
        if status != '5':
            season['score_a'] = 0
            season['score_b'] = 0
        else:
            # Scores are only read from the row for status '5'.
            season['score_a'] = row.xpath(
                'td[3]/span[1]/text()').extract_first()
            season['score_b'] = row.xpath(
                'td[3]/span[2]/text()').extract_first()

        season['status'] = status_ids.get(status, 1)
        season['fid'] = row.xpath('@data-fid').extract_first()
        yield season
def buildSeasonItem(self, response, s):
    """Build the season item for one match row and yield it.

    Page-level values (league name, stage name) are read from ``response``;
    match-level values are read from the row selector ``s``.

    The ``data-status`` attribute is extracted as a string (or ``None``),
    so the status mapping compares against string codes. Comparing against
    integers, as this function previously did, never matched and left
    every item with status 1.

    Args:
        response: The page the row belongs to; only ``xpath`` is used.
        s: Selector of the match row.

    Yields:
        Exactly one populated ``SeasonItem`` (no ``year`` field is set).
    """
    season = SeasonItem()
    season['league_name'] = response.xpath(
        '//ul[@class="lpage_race_nav clearfix"]/li[1]/a/text()'
    ).extract_first()[:-2]
    season['type_name'] = response.xpath(
        '//div[@id="match_stage"]/a[@class="ltab_btn on"]/text()'
    ).extract_first()
    # Fall back to an empty string when the row has no <h4> text.
    season['sub_type_name'] = s.xpath('h4/text()').extract_first() or ''
    season['game_week'] = ''
    season['start_time'] = s.xpath(
        'td[@class="td_time"]/text()').extract_first()
    season['team_a'] = s.xpath(
        'td[@class="td_lteam"]/a/@title').extract_first()
    season['team_b'] = s.xpath(
        'td[@class="td_rteam"]/a/@title').extract_first()

    status = s.xpath('@data-status').extract_first()
    # Scores are only read from the row for status '5'; otherwise 0:0.
    if status == '5':
        season['score_a'] = s.xpath('td[3]/span[1]/text()').extract_first()
        season['score_b'] = s.xpath('td[3]/span[2]/text()').extract_first()
    else:
        season['score_a'] = 0
        season['score_b'] = 0

    # Translate the site's status code (a string) into the stored id.
    status_id = 1
    if status in ('-1', '2', '4', '6', '7', '11'):
        status_id = 6
    elif status == '1':
        status_id = 5
    elif status == '5':
        status_id = 4
    season['status'] = status_id
    season['fid'] = s.xpath('@data-fid').extract_first()
    yield season
def parse_detail_info(self, response):
    """Dispatch a league detail page to the schedule parsers.

    The page is processed in four steps:

    1. Every link with a ``data-id`` inside ``div.ltab_hd.lmb3.clearfix``
       is requested and handled by ``parse_line_info2``.
    2. Every link with a ``data-id`` inside ``div.ltab_hd.lmb2.clearfix``
       is requested and handled by ``parse_line_info``.
    3. If ``div.ltab_hd`` has at least one link, the matches of every
       group in ``div#div_group_list`` except the first are downloaded
       from the site's ``getmatch`` endpoint (JSON) and yielded as
       ``SeasonItem`` objects.
    4. Every ``div.ltab_hd`` link except the first and the last is
       requested and handled by ``parse_line_info``.

    Args:
        response: The downloaded page; ``xpath``, ``urljoin`` and ``url``
            are used.

    Yields:
        ``Request`` objects for follow-up pages and ``SeasonItem`` objects
        for matches obtained from the JSON endpoint.
    """
    for link in response.xpath('//div[@class="ltab_hd lmb3 clearfix"]/a'):
        if link.xpath('@data-id'):
            yield Request(
                response.urljoin(link.xpath('@href').extract_first()),
                self.parse_line_info2,
                dont_filter=True)

    for link in response.xpath('//div[@class="ltab_hd lmb2 clearfix"]/a'):
        if link.xpath('@data-id'):
            yield Request(
                response.urljoin(link.xpath('@href').extract_first()),
                self.parse_line_info,
                dont_filter=True)

    for first_tab in response.xpath('//div[@class="ltab_hd"]/a')[:1]:
        # The "//" path is absolute, so this searches the whole document
        # even though it is evaluated on the tab link.
        for group in first_tab.xpath('//div[@id="div_group_list"]/a')[1:]:
            year = response.xpath(
                '//span[@class="ldrop_tit_txt"]/text()'
            ).extract_first()[:-2]
            if '/' in year:
                year = year.split('/')[0] + '-01-01'
            else:
                # NOTE(review): unlike the other parsers in this file, a
                # single-year label is shifted back by one year here;
                # kept as-is, confirm that this is intended.
                year = str(int(year) - 1) + '-01-01'
            url = 'http://liansai.500.com/index.php?c=score&a=getmatch&stid=%s&round=%s' % (
                str(response.url).split("-")[2][:-1],
                group.xpath('@data-group').extract_first())
            # NOTE(review): this is a synchronous download inside the
            # callback. The ``with`` block guarantees the HTTP response is
            # closed once the body has been read.
            with urllib.request.urlopen(
                    urllib.request.Request(url)) as reply:
                payload = reply.read()
            # The raw body is turned into text through BeautifulSoup before
            # being parsed as JSON, exactly as the original code did.
            matches = json.loads(str(BeautifulSoup(payload, "html.parser")))
            for t in matches:
                season = SeasonItem()
                season['league_name'] = response.xpath(
                    '//ul[@class="lpage_race_nav clearfix"]/li[1]/a/text()'
                ).extract_first()[:-2]
                season['type_name'] = group.xpath('text()').extract_first()
                season['sub_type_name'] = ''
                season['game_week'] = t['round']
                season['start_time'] = t['stime']
                season['team_a'] = t['hname']
                season['team_b'] = t['gname']
                # Scores are only copied for status '5'; otherwise 0:0.
                if t['status'] == '5':
                    season['score_a'] = t['hscore']
                    season['score_b'] = t['gscore']
                else:
                    season['score_a'] = 0
                    season['score_b'] = 0
                # Translate the site's status code into the stored id.
                status_id = 1
                if t['status'] in ['-1', '2', '4', '6', '7', '11']:
                    status_id = 6
                if t['status'] == '5':
                    status_id = 4
                season['status'] = status_id
                season['fid'] = t['fid']
                season['year'] = year
                yield season

    for link in response.xpath('//div[@class="ltab_hd"]/a')[1:-1]:
        yield Request(
            response.urljoin(link.xpath('@href').extract_first()),
            self.parse_line_info,
            dont_filter=True)