def getImages(chapter):
    """Collect every page image of a chapter.

    Fetches the chapter page, then follows each entry of the '#pageMenu'
    drop-down to its page and pulls the '#img' element's src.

    Returns a list of utils.Content(page_label, image_src), in menu order.
    """
    pages = []
    chapter_doc = lh.fromstring(utils.getUrlContent(chapter))
    for entry in chapter_doc.cssselect('#pageMenu option'):
        page_url = MAIN + entry.attrib['value']
        page_doc = lh.fromstring(utils.getUrlContent(page_url))
        # First (and only expected) '#img' element holds the page scan.
        image = page_doc.cssselect('#img')[0]
        pages.append(utils.Content(entry.text_content(), image.attrib['src']))
    return pages
def getChapters(link):
    """List a manga's chapters from its index page.

    Returns utils.Content(title, absolute_url, ordinal) entries, with
    ordinals starting at 1 in page order.
    """
    index_doc = lh.fromstring(utils.getUrlContent(link))
    anchors = index_doc.cssselect('#listing tr a')
    return [
        utils.Content(a.text_content(), MAIN + a.attrib['href'], number)
        for number, a in enumerate(anchors, 1)
    ]
def search(query):
    """Run a site search and return utils.Content(title, url) per hit.

    The search endpoint answers with one pipe-delimited record per line;
    field 0 is the title, field 4 the relative link.
    """
    raw = utils.getUrlContent(utils.getUrl(SEARCH, query.replace(" ", "+")))
    found = []
    for record in raw.split('\n'):
        if not record:
            continue  # skip blank lines between records
        fields = record.split('|')
        found.append(utils.Content(fields[0].strip(), MAIN + fields[4].strip()))
    return found
def parse_page_data(self, raw_url,page_info,runtime_status,post_datas={}):
    # NOTE(review): mutable default argument `post_datas={}` — shared across
    # calls if the callee ever mutates it; should become `post_datas=None`
    # with an in-body default. Left as-is in this documentation pass.
    """Fetch raw_url, normalize its encoding, then extract structured rows.

    The page is first split into result blocks via parse_block_match();
    each regular-expression rule in page_info.regular_matchs then either
    widens every existing row with captured fields (non-scroll mode) or
    folds all captures into one '||'-joined string stored under the rule's
    result key (scroll mode).  Returns the final list of row dicts.
    """
    # Fetch the response page for raw_url and sort out its encoding.
    page_encoding = "UTF-8"
    if page_info.encoding.strip():
        page_encoding = page_info.encoding
    # raw_url = raw_url.decode("UTF-8","ignore").encode(page_encoding,"ignore")
    page_src = utils.getUrlContent(raw_url,post_datas)
    #
    if page_encoding == "unicode":
        # NOTE(review): eval on fetched page content is a code-injection
        # risk if the source is untrusted — consider decoding explicitly
        # (e.g. unicode-escape) instead of eval.
        page_src = eval("u'"+page_src+"'").encode('utf-8',"ignore")
    else:
        # Re-encode everything to UTF-8 so downstream regexes see one encoding.
        page_src = page_src.decode(page_encoding,"ignore").encode('utf-8',"ignore")
    # Start parsing the fetched page_src.
    # Using the block locators, split the response page into the result
    # blocks that need detailed parsing; there may be several blocks.
    block_data_map_list = self.parse_block_match(page_src, page_info.block_match)
    # Match the whole page with each regex rule; every match result is merged
    # into each record of the block-match results in block_data_map_list.
    for regular_match in page_info.regular_matchs:
        datalist = [] # data captured by the regular expressions
        # Run every regex of this rule over the page and collect all hits.
        for regular in regular_match.regulars:
            tmp_src = page_src # page_src is reused across the match loop
            tmp_src = self.remove_tags(tmp_src, regular_match.omit_tags)
            pagedata_ret = re.compile(regular)
            tmp_datalist = pagedata_ret.findall(tmp_src)
            datalist.extend(tmp_datalist)
        # When the regexes matched nothing but earlier parsing produced
        # values for this result key, pad datalist with "n/a" placeholders
        # (one per key already present in runtime_status).
        if len(datalist)==0:
            tmp_list = []
            tmp_n = 0
            while (regular_match.result+str(tmp_n)) in runtime_status:
                tmp_list.append("n/a")
                tmp_n = tmp_n + 1
            datalist.extend(tmp_list)
        #
        tmp_addon_list = [] # newly produced row records
        scroll_str = "" # regex results folded into one string (scroll mode)
        for data_i in range(0,len(datalist)):
            # If is_unique == "1", keep only the first match; "0" keeps all.
            if regular_match.is_unique == "1" and data_i>0:
                continue
            data = datalist[data_i]
            grub_status = {}
            # With no group / one capturing group the regex returns a list of
            # strings: ['qqq', 'hyx', ...]
            # With multiple capturing groups it returns a list of tuples:
            # [('qqq', 'hyx'), ('12', 'hellooo'), ...]
            if type(data) == type("a"):
                grub_status[regular_match.result+"1"]=data
                scroll_str = scroll_str + data + "||"
            else:
                for i in range(0,len(data)):
                    grub_status[regular_match.result+str(i+1)]=data[i]
                    scroll_str = scroll_str + data[i] + "||"
            if regular_match.is_scroll!="1":
                # Row count of block_data_map_list stays the same: each
                # record is widened with grub_status and appended to
                # tmp_addon_list (existing keys in the record win).
                for items in block_data_map_list:
                    tmp_map = {}
                    tmp_map.update(grub_status)
                    tmp_map.update(items)
                    tmp_addon_list.append(tmp_map)
            else:
                pass
        # Merge the new row records (tmp_addon_list) back into
        # block_data_map_list; in non-scroll mode the row count can grow
        # (one widened row per match x per original record).
        if regular_match.is_scroll!="1":
            if len(tmp_addon_list) == 0:
                tmp_addon_list = [{}]
            else:
                # Replace the result set with the widened, non-empty rows.
                block_data_map_list = []
                for items in tmp_addon_list:
                    if len(items)>0:
                        block_data_map_list.append(items)
        else:
            # Scroll mode: keep the existing non-empty rows and stamp each
            # one with the folded '||'-joined capture string under the
            # rule's result key.
            tmp_addon_list = []
            for items in block_data_map_list:
                if len(items)>0:
                    tmp_addon_list.append(items)
            block_data_map_list = []
            if len(tmp_addon_list)==0:
                tmp_addon_list = [{}]
            for items in tmp_addon_list:
                items[regular_match.result] = scroll_str
                block_data_map_list.append(items)
    return block_data_map_list