def download_an_image(url, file_path, exclude_url=PARSER_EXCLUDE_URL):
    """
    Download a single image and report the result.

    :param url: image's url
    :param file_path: path to save the image
    :param exclude_url: list of url patterns that must not be downloaded
    :return: one of "200", "Zero", "Error", "Small", "link-anh-ko-hop-le"

    If url matches a pattern in 'exclude_url': stop downloading and return
    "link-anh-ko-hop-le". Otherwise fetch the content, write it to file_path,
    then verify the downloaded file's size against the Content-Length header
    and check the image resolution.
    """
    if re.search("|".join(exclude_url), url) is not None:
        print "{0} : invalid image link".format(url)
        return "link-anh-ko-hop-le"
    if SHOW_DOWNLOAD_STATUS:
        print "[{0}] {1} : Downloading".format(utils.get_current_time(), url)
    html_content = utils.urllib2_get(url=url)
    utils.write_string_to_file(file_path, html_content.read())
    if utils.get_file_size(file_path) == int(html_content.info()['Content-Length']):
        if utils.get_image_resolution(file_path) < 300:
            return "Small"
        return "200"
    elif utils.get_file_size(file_path) == 0:
        return "Zero"
    return "Error"

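# Hedged usage sketch (not part of the original module): how a caller might
# drive download_an_image over a chapter's pages with retries. 'page_urls',
# 'save_dir' and 'max_retries' are hypothetical illustrations.
def _example_download_chapter(page_urls, save_dir, max_retries=3):
    results = list()
    for index, page_url in enumerate(page_urls):
        file_path = "{0}/{1:03d}.jpg".format(save_dir, index)
        status = "Error"
        for _ in range(max_retries):
            status = download_an_image(page_url, file_path)
            if status in ("200", "Small", "link-anh-ko-hop-le"):
                break  # verified download, small image, or excluded url: stop retrying
        results.append(status)
    return results
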
def get_url_status(url):
    """
    Get the HTTP status code of a url.

    :param url: url to check
    :return: status code as a string, e.g. "200"
    """
    status = utils.urllib2_get(url=url).getcode()
    if SHOW_DOWNLOAD_STATUS:
        print "[{0}] {1} : {2}".format(utils.get_current_time(), url, status)
    return str(status)

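# The helpers above rely on utils.urllib2_get, which is defined elsewhere in
# the project. A minimal sketch of the assumed behavior: a plain urllib2
# wrapper that returns the raw response object so callers can use .read(),
# .getcode() and .info(). The User-Agent value here is illustrative.
def _example_urllib2_get(url):
    import urllib2
    request = urllib2.Request(url, headers={"User-Agent": "Mozilla/5.0"})
    return urllib2.urlopen(request)
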
def get_page_list(self, url):
    xpath = _page_list_xpath['tttuan']
    page_list = list()
    removed_data = list()
    try:
        html_content = utils.urllib2_get(url).read()
    except Exception as e:
        print e
        return list()
    # Rule : blogspot_urls take priority; tttuan_urls must be sorted
    tttuan_urls = re.findall("var slides_page_path.*", html_content)
    blogspot_urls = re.findall("var slides_page_url_path.*", html_content)
    if blogspot_urls and tttuan_urls:
        blogspot_urls = re.findall("\[(.*)\]", blogspot_urls[0])
        tttuan_urls = re.findall("\[(.*)\]", tttuan_urls[0])
    else:
        return list()
    if "http" in tttuan_urls[0]:
        if "http" in blogspot_urls[0]:
            pages = blogspot_urls[0].split(",")
        else:
            pages = tttuan_urls[0].split(",")
            pages = sorted(pages, key=utils.sort_order_by_numeric)
    elif "http" in blogspot_urls[0]:
        pages = blogspot_urls[0].split(",")
    else:
        pages = list()
    for page in pages:
        # Remove advert urls
        if xpath['advert'] and re.search("|".join(xpath['advert']), str(page), flags=re.IGNORECASE) is not None:
            removed_data.append(page)
            continue
        if "googleusercontent.com/gadgets/proxy?container" in page:  # In case the site uses a proxy
            page = re.sub(".*url ?= ?(https?://.*)", r"\1", page)
        page = re.sub("\"|\'", "", page)       # Remove " and ' characters
        page = re.sub("^ +| +$|\r", "", page)  # Strip whitespace at the beginning/end of the url
        page = re.sub(" ", "%20", page)        # Replace space characters with %20 (url encoding)
        page_list.append(page)
    if PARSER_ENABLE_DEBUG:
        print "Page list removed data :", removed_data
    return page_list

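# Hedged sketch of what the extraction above expects in the page source. The
# html fragment below is a made-up illustration of the tttuan/blogspot script
# variables, not a captured page.
def _example_extract_blogspot_urls():
    html_content = ('var slides_page_path = ["http://example.invalid/a/001.jpg"];\n'
                    'var slides_page_url_path = ["http://1.bp.blogspot.com/b/001.jpg",'
                    '"http://1.bp.blogspot.com/b/002.jpg"];')
    blogspot_urls = re.findall("var slides_page_url_path.*", html_content)
    blogspot_urls = re.findall("\[(.*)\]", blogspot_urls[0])
    # blogspot_urls[0] is the comma-separated, quoted url list; splitting on
    # "," and stripping quotes yields the page urls.
    return [re.sub("\"|\'", "", p).strip() for p in blogspot_urls[0].split(",")]
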
def _get_chapter_list_general(url, xpath, chapter_name_parser=_chapter_name_parser_general):
    """
    Get chapter list from url.

    :param url: chapter list's url
    :param xpath: xpath 'dictionary' passed from the calling class
    :param chapter_name_parser: chapter_name_parser 'function' passed from the
        calling class, default: _chapter_name_parser_general
    :return: chapter list, format : {'1': 'url-1', ..., 'n': 'url-n', 'summary': ['1', '2', ..., 'n']}
    :rtype: dict

    Create html_code_parser to convert HTML entities to UTF-8 text.
    Get the raw html_content; find all tags matching xpath (name + attrs),
    then find every url tag inside that result (for vechai the result is reversed).
    For each 'chapter' (a BeautifulSoup Tag):
        - Build 'chapter_url' from xpath['site_prefix'] + the href attribute.
        - Build 'chapter_name': for truyentranhtuan it is parsed from
          'chapter_url'; otherwise it is the tag's unescaped text.
        - Skip duplicated urls and urls not containing xpath['include_url'].
        - Skip urls matching xpath['exclude_url'].
        - Names without a number, or matching xpath['bonus'], go to 'chapter_bonus'.
        - Parse 'chapter_name' with 'chapter_name_parser'; names the parser
          rejects also go to 'chapter_bonus'.
        - On a duplicated 'chapter_name', shorten 15-5 => 15 (to avoid 15-5-1)
          and collect the urls in 'duplicated_list'.
        - Otherwise insert 'chapter_name' + 'chapter_url' into 'chapter_list'
          and 'chapter_list['summary']'.
    Merge duplicated chapters and bonus chapters back into 'chapter_list',
    then sort chapter_list['summary'] for the sites that need it.
    """
    duplicated_list = dict()
    chapter_list = dict()
    chapter_bonus = list()
    chapter_list['summary'] = list()
    removed_data = {
        'include': list(),
        'exclude': list()
    }
    html_code_parser = HTMLParser()
    try:
        html_content = utils.urllib2_get(url).read()
    except Exception as e:
        print e
        return chapter_list
    soup = BeautifulSoup(html_content).findAll(xpath['name'], xpath['attrs'])
    try:
        chapters = soup[0].findAll('a')
        if "vechai.info" in url:
            chapters = soup[1].findAll('a')
            chapters.reverse()
    except Exception as e:
        print e
        chapters = list()
    for chapter in chapters:
        chapter_url = str(xpath['site_prefix']) + chapter['href']
        if 'truyentranhtuan.com/' in url:
            chapter_name = chapter_url
        else:
            chapter_name = html_code_parser.unescape(chapter.string)
        if xpath['include_url'] not in chapter_url or chapter_url in chapter_list.values():
            removed_data['include'].append(chapter)
            continue
        if xpath['exclude_url'] and re.search("|".join(xpath['exclude_url']), chapter_url, flags=re.IGNORECASE) is not None:
            removed_data['exclude'].append(chapter)
            continue
        if not re.search("[0-9]+", chapter_name) or \
                (xpath['bonus'] and re.search("|".join(xpath['bonus']), chapter_name, flags=re.IGNORECASE)):
            chapter_bonus.append(chapter_url)
            continue
        chapter_name = chapter_name_parser(chapter_name=chapter_name, xpath=xpath)
        if not chapter_name:
            chapter_bonus.append(chapter_url)
            continue
        if (re.sub("(.*?)-.*", r"\1", chapter_name) or chapter_name) in chapter_list.keys():
            chapter_name = re.sub("(.*?)-.*", r"\1", chapter_name)
            if chapter_name not in duplicated_list.keys():
                duplicated_list[chapter_name] = list()
                duplicated_list[chapter_name].append(chapter_list[chapter_name])
            duplicated_list[chapter_name].append(chapter_url)
            continue
        chapter_list[chapter_name] = chapter_url
        chapter_list['summary'].append(chapter_name)
    chapter_list = _duplicate_chapters_parser(chapter_list=chapter_list,
                                              duplicated_list=duplicated_list)
    chapter_list = _chapter_bonus_parser(chapter_list=chapter_list, chapter_bonus=chapter_bonus)
    if PARSER_ENABLE_DEBUG:
        print "Chapter list removed data :", removed_data
    if re.search('|'.join(sort_chapter_list_summary), url, flags=re.IGNORECASE):
        chapter_list['summary'] = sorted(chapter_list['summary'],
                                         key=utils.sort_order_by_numeric, reverse=True)
    return chapter_list

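# Chapter sorting relies on utils.sort_order_by_numeric, defined elsewhere in
# the project. A hedged sketch of the assumed key function: extract the first
# numeric run so '10' sorts after '2' rather than lexicographically; the
# exact behavior of the real helper may differ.
def _example_sort_order_by_numeric(value):
    match = re.search("[0-9]+(\.[0-9]+)?", str(value))
    return float(match.group(0)) if match else 0.0
# e.g. sorted(['1', '10', '2'], key=_example_sort_order_by_numeric)
#      => ['1', '2', '10']
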
def _get_page_list_general(url, xpath):
    """
    Abstract method for get_page_list.

    :param url: page's url
    :param xpath: xpath 'dictionary', including advert urls that must be removed
    :return: list of page_url

    Create html_code_parser to convert HTML entities to UTF-8 text.
    Get raw html content from url => if an error occurs, return a blank list().
    Find all tags matching xpath (name + attrs), then collect urls into 'pages':
        If "textarea" (comicvn): find every src="..." attribute and extract the urls.
        If "script" (izmanga): find the script tag containing 'function load',
            take the line containing "data =" and split it into urls.
        Else (blogtr, hamtr, vechai): collect the src of every image tag.
    Loop through 'pages':
        Convert HTML entities to UTF-8 text.
        Remove advert urls.
        Remove the proxy wrapper.
        Strip whitespace at the beginning/end of the url.
        Replace space characters with %20 (url encoding).
    """
    page_list = list()
    pages = list()
    removed_data = list()
    html_code_parser = HTMLParser()
    try:
        html_content = utils.urllib2_get(url).read()
    except Exception as e:
        print e
        return list()
    soup = BeautifulSoup(html_content).findAll(name=xpath['name'], attrs=xpath['attrs'])
    if xpath['name'] == "textarea":  # comicvn
        p = re.compile("src=\".*?\"")
        for page in p.findall(str(soup)):
            page = re.sub("\"|src=", "", page)
            if page.strip():
                pages.append(page)
    elif xpath['name'] == "script":  # izmanga
        for i in soup:
            if "function load" in str(i):
                for x in str(i).splitlines():
                    if "data =" in x:
                        x = re.sub(".*[\'\"](.*)[\'\"].*", r"\1", x)
                        pages = x.split("|")
    else:  # blogtr, hamtr, vechai
        for div in soup:
            pages.extend(x['src'] for x in div.findAll('img'))
    for page in pages:
        page = html_code_parser.unescape(page)  # Convert HTML entities to UTF-8 text
        if xpath['advert'] and re.search("|".join(xpath['advert']), str(page.encode("utf-8")), flags=re.IGNORECASE) is not None:
            removed_data.append(page)  # Remove advert url
            continue
        if "googleusercontent.com/gadgets/proxy?container" in page:  # Remove proxy wrapper
            page = re.sub(".*url ?= ?(https?://.*)", r"\1", page)
        page = re.sub("^ +| +$|\r", "", page)  # Strip whitespace at the beginning/end of the url
        page = re.sub(" ", "%20", page)        # Replace space characters with %20 (url encoding)
        page_list.append(page)
    if PARSER_ENABLE_DEBUG:
        print "Page list removed data :", removed_data
    return page_list

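# Hedged check of the proxy-unwrapping regex shared with get_page_list; the
# googleusercontent url below is made up for illustration.
def _example_unwrap_proxy():
    page = ("http://images1-focus-opensocial.googleusercontent.com/gadgets/proxy"
            "?container=focus&url=http://example.com/pages/001.jpg")
    if "googleusercontent.com/gadgets/proxy?container" in page:
        page = re.sub(".*url ?= ?(https?://.*)", r"\1", page)
    return page  # => 'http://example.com/pages/001.jpg'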