def parse_items(response):
    """Build the list of EPG items found on a schedule-listings page.

    One item is produced for every ``<li class="listing">`` under
    ``<ul class="schedule-listings">``.  Listings whose title or start time
    cannot be parsed are reported on stdout and skipped; a missing meta line
    or synopsis only degrades the item (name without meta, empty desc).

    Args:
        response: object exposing ``xpath()`` whose results expose
            ``xpath()`` / ``extract()`` (presumably a Scrapy response).

    Returns:
        list: ``EpgItem`` objects with ``name``, ``starttime``
        (``YYYY.mm.dd HH:MM:SS``), ``endtime`` (always ``''``) and ``desc``.
    """
    # Compiled once instead of once per listing.  Captures the text inside
    # the <h2>, with or without a wrapping <a href=...> link.
    title_re = re.compile(
        r'\<h2.*?\>\s*(?:\<a href.*?\>)*(.*?)(?:\</a\>)*\s*\</h2\>', re.S)

    items = []
    listings = response.xpath(
        "//ul[@class='schedule-listings']/li[@class='listing']")
    for li in listings:
        title_html = li.xpath(
            "./div[@class='content']/h2[@class='title']").extract()
        if not title_html:
            # Without a title node there is nothing to name the item with.
            # Falling back to `title` here would read an unbound variable on
            # the first listing, or the previous listing's title afterwards.
            print("Xpath parse name error!")
            continue

        match = title_re.search(title_html[0])
        if match is None:
            print("re parse name error!")
            continue
        title = match.group(1)

        meta = li.xpath(
            "./div[@class='content']/div[@class='meta']/text()").extract()
        if meta:
            name = title + " " + meta[0]
        else:
            # Meta line is optional: keep the bare title as the name.
            print("Xpath parse name error!")
            name = title

        synopsis = li.xpath(
            "./div[@class='content']/div[@class='synopsis']/text()").extract()
        if synopsis:
            desc = synopsis[0]
        else:
            print("description is none!")
            desc = ""

        try:
            time_str = li.xpath(
                "./div[@class='date-time']/time/text()").extract()[0]
            date_str = li.xpath(
                "./div[@class='date-time']/span/text()").extract()[0]
            # Convert 12-hour values such as "6.00am" to 24-hour form.
            time_str = time12to24(time_str)
            # The page only carries weekday, day and month, so the current
            # local year is prepended.
            # NOTE(review): listings that cross a year boundary would get the
            # wrong year - confirm whether that can occur.
            stamp = "%s,%s,%s" % (
                time.strftime('%Y'), date_str.strip(), time_str)
            parsed = time.strptime(stamp, "%Y,%A, %d %b,%H:%M")
            ftime = time.strftime("%Y.%m.%d %H:%M:%S", parsed)
        except IndexError:
            print("Xpath parse time error!")
            continue
        except ValueError:
            print("time values error!")
            continue

        item = EpgItem()
        item['name'] = name
        item['starttime'] = ftime
        item['endtime'] = ''
        item['desc'] = desc.strip()
        items.append(item)
    return items
def parse(self, response):
    """Yield one EPG item per programme listed on the schedule page.

    The page is organised as one ``date-program-wrapper`` block per day; the
    last 10 characters of each block's header are read as a ``dd-mm-YYYY``
    date, and every child ``<div>`` of the day's inner wrapper is treated as
    a programme.  Day blocks or programmes missing an expected node, or
    whose time cannot be converted, are reported on stdout and skipped so
    that one malformed entry does not abort the rest of the page.

    Args:
        response: object exposing ``xpath()`` (presumably a Scrapy response).

    Yields:
        EpgItem: with ``name``, ``starttime`` (``YYYY.mm.dd HH:MM:SS``,
        shifted forward by 8 hours) and empty ``endtime`` / ``desc``.
    """
    day_blocks = response.xpath(
        "//div[@class='box-container-wrapper']"
        "/div[contains(@class,'date-program-wrapper')]")
    for day in day_blocks:
        try:
            # Only the trailing 10 characters (dd-mm-YYYY) of the header
            # text are used as the date.
            raw_date = day.xpath(
                "./div[@class='box-inner-container-header']/h2/text()"
            ).extract()[0][-10:]
        except IndexError:
            print("Xpath parse date error!")
            continue
        date = trans_format(raw_date, "%d-%m-%Y", "%Y.%m.%d")

        programs = day.xpath("./div[@class='box-inner-container-wrapper']/div")
        for program in programs:
            try:
                name = program.xpath(
                    "./div[@class='title']/h2/text()").extract()[0]
                raw_time = program.xpath(
                    "./div[@class='timing']/time/text()").extract()[0]
            except IndexError:
                print("Xpath parse program error!")
                continue

            try:
                # Keep the part before any "/", limited to 7 characters, and
                # swap ":" for "." before handing it to time12to24.
                clock = raw_time.split("/")[0][0:7].strip().replace(":", ".")
                clock = time12to24(clock)
                starttime = trans_format(
                    "%s %s" % (date, clock), "%Y.%m.%d %H:%M")
                ftime = datetime.datetime.strptime(
                    starttime, "%Y.%m.%d %H:%M:%S")
            except ValueError:
                print("time values error!")
                continue

            # Fixed 8-hour shift.
            # NOTE(review): presumably a timezone adjustment - confirm.
            ftime = ftime + datetime.timedelta(hours=8)

            item = EpgItem()
            item['name'] = name
            item['starttime'] = ftime.strftime("%Y.%m.%d %H:%M:%S")
            item['endtime'] = ''
            item['desc'] = ''
            yield item
def parse_epg(self, response):
    """Yield one EPG item per schedule grid cell on a single-day page.

    The date is taken from the last 8 characters of ``response.url`` and is
    interpreted with ``self.formats``; each cell supplies the time, title and
    an optional subtitle.  Cells missing the time or title node are reported
    on stdout and skipped rather than aborting the remaining cells.

    Args:
        response: object exposing ``url`` and ``xpath()`` (presumably a
            Scrapy response).

    Yields:
        EpgItem: with ``name`` (title followed directly by subtitle),
        ``starttime`` and empty ``endtime`` / ``desc``.
    """
    date_str = response.url[-8:]
    # The class value really ends with a space; the match is exact.
    grid_cells = response.xpath("//div[@class='schedule_grid ']")
    # Runs of two or more whitespace characters are removed entirely.
    multi_space = re.compile(r"\s\s+")
    for cell in grid_cells:
        details = cell.xpath("./div[@class='schedule_details']")
        try:
            program_time = details.xpath(
                "./p[@class='info']/text()").extract()[0]
            title = details.xpath(
                "./p[@class='title']/a/text()").extract()[0]
        except IndexError:
            print("Xpath parse program error!")
            continue

        # The subtitle <span> is optional.
        try:
            subtitle = details.xpath(
                "./p[@class='title']/a/span/text()").extract()[0]
        except (IndexError, ValueError):
            subtitle = ""

        # Strip all whitespace from the time text before 12h -> 24h conversion.
        program_time = time12to24(
            multi_space.sub("", program_time).replace(" ", ""))
        starttime = trans_format(
            "%s %s" % (date_str, program_time), self.formats + " %H:%M")

        item = EpgItem()
        # NOTE(review): title and subtitle are joined without a separator -
        # confirm this is the intended name format.
        item["name"] = multi_space.sub("", title.strip() + subtitle.strip())
        item["starttime"] = starttime
        item["endtime"] = ""
        item["desc"] = ""
        yield item