def carwing_content(start, cartoon_id):
    """Download the pages of each chapter of a cartoon, beginning at ``start``.

    Chapters whose ``chapter_sort`` is at least ``start`` are selected from
    the ``chapter`` table (joined with ``cartoon_title`` for the cartoon
    name) in ``chapter_sort`` order.  For every chapter the chapter page is
    loaded, an image token is extracted from its HTML, and one image URL per
    entry of ``picture_ids`` is handed to ``paqu`` together with the matching
    ``content`` INSERT statement.  A chapter ends at the first page for which
    ``paqu`` returns a falsy value.

    Args:
        start: Lowest ``chapter_sort`` to process.  It is also used as the
            chapter number in progress messages and is incremented once per
            processed chapter.
        cartoon_id: Value compared against ``chapter.cartoon_id``.

    Returns:
        None.  Results are produced through ``paqu`` and progress is printed.

    Note:
        A chapter page that does not contain the image token is reported and
        skipped; previously this failed with an ``AttributeError`` raised on
        the ``None`` returned by ``re.search``.
    """
    # NOTE(review): `start` and `cartoon_id` are concatenated into the
    # statement because selectsql is only seen taking a finished SQL string.
    # Switch to bound parameters if the helper supports them.
    sql = (
        'SELECT c.chapter_id, chapter_name, chapter_url, ct.cartoon_name '
        'from chapter c LEFT JOIN cartoon_title ct on ct.cartoon_id = c.cartoon_id '
        'where c.chapter_sort >= ' + str(start)
        + ' and c.cartoon_id = ' + str(cartoon_id)
        + ' ORDER BY chapter_sort'
    )
    rows = selectsql(sql)

    # Token embedded in the chapter page: the literal prefix plus the next
    # 20 characters.  Presumably a base64-encoded path -- TODO confirm.
    token_re = re.compile(r'L2ZpbGV.{20}')

    for chapter_id, raw_name, chapter_url, cartoon_name in rows:
        # '?' is dropped from the name before it is used as a directory name
        # (presumably because it is not a legal path character on Windows).
        chapter_name = raw_name.replace('?', '')
        html = load_url(baseurl + chapter_url)

        match = token_re.search(html)
        if match is None:
            # Without the token no image URL can be built for this chapter.
            print('chapter %d (%s): image token not found, skipped'
                  % (start, chapter_name))
            start += 1
            continue
        link = match.group()

        save_dir = 'E:/漫画/' + cartoon_name + "/" + chapter_name
        tag = True
        for i, picture_id in enumerate(picture_ids):
            # Pages with index 0-8 use imglist[0]; later pages use imglist[1].
            suffix = imglist[1] if i > 8 else imglist[0]
            param = ('deimgtxtimg.js?txtimg=' + link + picture_id + suffix
                     + '&lid=' + str(i))
            insert_sql = (
                "insert into content (content_url,content_sort,chapter_id) value ('"
                + param + "', " + str(i + 1) + ', ' + str(chapter_id) + ')'
            )
            # The INSERT is not executed here; it is passed to paqu, which
            # presumably runs it once the page has been fetched -- confirm.
            tag = paqu(picUrl + param, save_dir, str(i), tag, insert_sql)
            if not tag:
                # First failing page marks the end of the chapter; `i` is the
                # number of pages fetched so far.
                print('------------------第%d集完成:共%d页--------------' % (start, i))
                break
            print('第%d集:第%d页完成' % (start, i + 1))
        start += 1
def paquindex(url, key):
    """Scrape the index page at ``url`` and record every listed entry.

    The page is scanned for encrypted name spans and for
    ``listde.php?act=list&aid=...`` links.  For each link, the name found at
    the same position is decrypted with ``aesDecrypt(key, ...)``; the
    decrypted name is appended to ``nameArray`` and the link to ``urlArray``
    (both defined outside this function).

    Args:
        url: Address of the index page, passed to ``load_url``.
        key: Key handed to ``aesDecrypt`` for every name.

    Returns:
        None.

    Raises:
        IndexError: If the page yields more links than name spans.
    """
    page = load_url(url)
    encrypted_names = re.findall(r'[te]" id="detxt">.{10,100}</span>', page)
    links = re.findall(r'listde.php\?act=list&aid=[0-9]{2,3}', page)

    for position, link in enumerate(links):
        clean_link = link.replace('"', '')
        # The first 14 characters are the matched `t" id="detxt">` (or
        # `e" ...`) prefix; the closing tag is removed as well, leaving only
        # the encrypted payload.
        payload = encrypted_names[position][14:].replace('</span>', '')
        plain_name = aesDecrypt(key, payload)
        nameArray.append(plain_name)
        urlArray.append(clean_link)
def crawling_chapter_one(base_url, cartoon_id):
    """Store the chapters listed on page ``base_url + '0'`` in ``chapter``.

    The page is loaded and passed to ``analysis_html`` together with the URL
    and title patterns; the returned URLs and titles are written with a
    single multi-row INSERT.  ``chapter_sort`` is the 1-based position of the
    chapter in the returned list and ``project_id`` is always ``1``.

    Args:
        base_url: URL prefix; ``'0'`` is appended to form the page address.
        cartoon_id: Value stored in the ``cartoon_id`` column of every row.

    Returns:
        True after the INSERT has been issued, False when the page yielded no
        chapters (no statement is sent in that case; previously a malformed
        ``insert ... values`` statement without any rows was sent).

    Raises:
        IndexError: If fewer titles than URLs were extracted.
    """
    html = load_url(base_url + '0')
    url_pattern = r'style.php\?act=style&aid=.{3}&cid=[0-9]{4,6}'
    title_pattern = r'detxt\">.{20,100}</span>'
    url_list, title_list = analysis_html(html, url_pattern, title_pattern)

    if not url_list:
        return False

    def quoted(text):
        # Double embedded single quotes so a title such as "Don't" cannot
        # terminate the SQL string literal early.
        # NOTE(review): insertsql is only seen taking a finished statement;
        # prefer bound parameters if the helper supports them.
        return "'" + text.replace("'", "''") + "'"

    value_rows = []
    for sort_index, chapter_url in enumerate(url_list, start=1):
        title = title_list[sort_index - 1]
        value_rows.append(
            ' (' + quoted(title) + ',' + quoted(chapter_url) + ','
            + str(sort_index) + ',1,' + str(cartoon_id) + ')'
        )

    sql = ('insert into chapter (chapter_name,chapter_url,chapter_sort,project_id,cartoon_id) values'
           + ','.join(value_rows))
    insertsql(sql)
    return True