def parse_title(text, proxy=None):
    """Extract the book subject from *text* and return it UTF-8 encoded."""
    match = r_subject.search(text)
    # Fall back to the site-wide sentinel when the pattern is absent.
    subject = match.group(1) if match else 'unknow'
    return tools.to_utf8(subject)
def parse_page(title, text, proxy=None):
    """Parse a chapter page: refine *title* from the page markup and fetch
    the chapter body from the content URL embedded in *text*.

    Returns the title followed by two CRLF pairs and the formatted body;
    the body is empty when no content URL is found.
    """
    r = r_title.search(text)
    if r:
        # Page title looks like "book/chapter"; keep the chapter part.
        title = r.group(1).strip().split('/')[1]
        title = tools.to_utf8(title)
    r = r_content.search(text)
    if r:
        url = r.group(1)
        text = tools.get_url(url, proxy).strip()
        # The chapter body is served as a JS snippet:
        #     document.write('...');
        # strip that wrapper before formatting the HTML.
        b = "document.write('"
        e = "');"
        if text.startswith(b) and text.endswith(e):
            text = text[len(b):-len(e)]
        text = tools.format_html_text(text)
    else:
        text = ''
    return title + '\r\n' * 2 + text
def parse_title(text, proxy=None):
    """Return the book subject found in *text*, re-encoded to UTF-8
    using the page's declared encoding."""
    encoding = tools.get_encoding(r_meta, text)
    found = r_subject.search(text)
    if found is None:
        subject = 'unknow'
    else:
        subject = found.group(1)
    return tools.to_utf8(subject, encoding)
def parse_title(text, proxy=None):
    """Return the UTF-8 book subject with the site's fixed suffix removed."""
    m = r_subject.search(text)
    if m:
        # The site appends 27 characters of boilerplate to the subject.
        subject = m.group(1)[:-27]
    else:
        subject = 'unknow'
    return tools.to_utf8(subject)
def parse_title(text, proxy=None):
    """Extract the book subject from *text* and return it UTF-8 encoded."""
    hit = r_subject.search(text)
    subject = hit.group(1) if hit else 'unknow'
    return tools.to_utf8(subject)
def parse_index(text, proxy=None):
    """Yield (url, utf8_title) pairs for every chapter link in the index page."""
    encoding = tools.get_encoding(r_meta, text)
    # NOTE(review): the book id is no longer used since the URL-prefixing line
    # was commented out, but the lookup is kept because it deliberately raises
    # (IndexError) when the page carries no book id at all — TODO confirm.
    bookid = r_bookid.findall(text)[0]
    for (url, title) in r_index.findall(text):
        title = title.replace(' ', ' ').strip()
        yield url, tools.to_utf8(title, encoding)
def parse_index(text, proxy=None):
    """Yield (url, utf8_title) pairs for every chapter link in the index page."""
    for (url, title) in r_index.findall(text):
        title = title.replace(' ', ' ')
        yield url, tools.to_utf8(title)
def parse_page(title, text, proxy=None):
    """Parse a chapter page and return the title plus its formatted body,
    both re-encoded to UTF-8 using the page's declared encoding."""
    encoding = tools.get_encoding(r_meta, text)
    found = r_title.search(text)
    if found:
        title = tools.to_utf8(found.group(1).strip(), encoding)
    found = r_content.search(text)
    if found:
        body = tools.format_html_text(found.group(1), encoding)
    else:
        body = ''
    return title + '\r\n' * 2 + body
def parse_page(title, text, proxy=None):
    """Parse a chapter page; return the title, a blank line, and the body
    (UTF-8, decoded per the page's declared encoding)."""
    encoding = tools.get_encoding(r_meta, text)
    m = r_title.search(text)
    if m:
        title = tools.to_utf8(m.group(1).strip(), encoding)
    m = r_content.search(text)
    body = tools.format_html_text(m.group(1), encoding) if m else ''
    return title + '\r\n' * 2 + body
def parse_index(text, proxy=None):
    """Yield (url, utf8_title) chapter links built from an index page.

    Raises Exception when the book id cannot be located in *text*.
    """
    r = r_bookid.search(text)
    if not r:
        # Call-form raise: valid in both Python 2 and 3
        # (was the Python-2-only `raise Exception, "..."` syntax).
        raise Exception("can't find the book id")
    # Renamed from `id`, which shadowed the builtin.
    book_id = r.group(1)
    for (c_id, title) in r_index.findall(text):
        url = "readchapter.asp?bu_id=" + c_id + "&bl_id=" + book_id
        title = title.replace(' ', ' ')
        yield url, tools.to_utf8(title)
def parse_page(title, text, proxy=None):
    """Parse a chapter page and return the title plus the formatted body,
    separated by two CRLF pairs; body is empty when no content is found."""
    r = r_title.search(text)
    if r:
        title = r.group(1).strip()
        title = tools.to_utf8(title)
    r = r_content.search(text)
    if r:
        text = tools.format_html_text(r.group(1))
    else:
        text = ''
    return title + '\r\n' * 2 + text
def parse_page(title, text, proxy=None):
    """Parse a chapter page and return the title plus the formatted body,
    separated by two CRLF pairs; body is empty when no content is found."""
    r = r_title.search(text)
    if r:
        # strip() added for consistency with the sibling parse_page variants,
        # which all trim the extracted title.
        title = r.group(1).strip()
        title = tools.to_utf8(title)
    r = r_content.search(text)
    if r:
        text = tools.format_html_text(r.group(1))
    else:
        text = ''
    return title + '\r\n' * 2 + text