def download_an_image(url, file_path, exclude_url=PARSER_EXCLUDE_URL):
    """
    Download a single image and report the result.

    :param url: image's url
    :param file_path: path to save the image
    :param exclude_url: list of url patterns that must not be downloaded
    :return: one of "200", "Zero", "Error", "Small", "link-anh-ko-hop-le"

    If url matches a pattern in 'exclude_url': stop downloading and return
    "link-anh-ko-hop-le". Otherwise fetch the content, write it to file_path,
    then verify the downloaded file's size against the Content-Length header
    and check the image resolution.
    """
    if re.search("|".join(exclude_url), url) is not None:
        print "{0} : invalid image link".format(url)
        return "link-anh-ko-hop-le"
    if SHOW_DOWNLOAD_STATUS:
        print "[{0}] {1} : Downloading".format(utils.get_current_time(), url)
    html_content = utils.urllib2_get(url=url)
    utils.write_string_to_file(file_path, html_content.read())
    if utils.get_file_size(file_path) == int(html_content.info()['Content-Length']):
        if utils.get_image_resolution(file_path) < 300:
            return "Small"
        return "200"
    elif utils.get_file_size(file_path) == 0:
        return "Zero"
    return "Error"

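# Hedged usage sketch (not part of the original module): how a caller might
# drive download_an_image over a chapter's pages with retries. 'page_urls',
# 'save_dir' and 'max_retries' are hypothetical illustrations.
def _example_download_chapter(page_urls, save_dir, max_retries=3):
    results = list()
    for index, page_url in enumerate(page_urls):
        file_path = "{0}/{1:03d}.jpg".format(save_dir, index)
        status = "Error"
        for _ in range(max_retries):
            status = download_an_image(page_url, file_path)
            if status in ("200", "Small", "link-anh-ko-hop-le"):
                break  # verified download, small image, or excluded url: stop retrying
        results.append(status)
    return results
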
def get_url_status(url):
    """
    Get the HTTP status code of a url.

    :param url: url to check
    :return: status code as a string, e.g. "200"
    """
    status = utils.urllib2_get(url=url).getcode()
    if SHOW_DOWNLOAD_STATUS:
        print "[{0}] {1} : {2}".format(utils.get_current_time(), url, status)
    return str(status)

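# The helpers above rely on utils.urllib2_get, which is defined elsewhere in
# the project. A minimal sketch of the assumed behavior: a plain urllib2
# wrapper that returns the raw response object so callers can use .read(),
# .getcode() and .info(). The User-Agent value here is illustrative.
def _example_urllib2_get(url):
    import urllib2
    request = urllib2.Request(url, headers={"User-Agent": "Mozilla/5.0"})
    return urllib2.urlopen(request)
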
def get_page_list(self, url):
    xpath = _page_list_xpath['tttuan']
    page_list = list()
    removed_data = list()
    try:
        html_content = utils.urllib2_get(url).read()
    except Exception as e:
        print e
        return list()
    # Rule : blogspot_urls take priority; tttuan_urls must be sorted
    tttuan_urls = re.findall("var slides_page_path.*", html_content)
    blogspot_urls = re.findall("var slides_page_url_path.*", html_content)
    if blogspot_urls and tttuan_urls:
        blogspot_urls = re.findall("\[(.*)\]", blogspot_urls[0])
        tttuan_urls = re.findall("\[(.*)\]", tttuan_urls[0])
    else:
        return list()
    if "http" in tttuan_urls[0]:
        if "http" in blogspot_urls[0]:
            pages = blogspot_urls[0].split(",")
        else:
            pages = tttuan_urls[0].split(",")
            pages = sorted(pages, key=utils.sort_order_by_numeric)
    elif "http" in blogspot_urls[0]:
        pages = blogspot_urls[0].split(",")
    else:
        pages = list()
    for page in pages:
        # Remove advert urls
        if xpath['advert'] and re.search("|".join(xpath['advert']), str(page), flags=re.IGNORECASE) is not None:
            removed_data.append(page)
            continue
        if "googleusercontent.com/gadgets/proxy?container" in page:  # In case the site uses a proxy
            page = re.sub(".*url ?= ?(https?://.*)", r"\1", page)
        page = re.sub("\"|\'", "", page)       # Remove " and ' characters
        page = re.sub("^ +| +$|\r", "", page)  # Strip whitespace at the beginning/end of the url
        page = re.sub(" ", "%20", page)        # Replace space characters with %20 (url encoding)
        page_list.append(page)
    if PARSER_ENABLE_DEBUG:
        print "Page list removed data :", removed_data
    return page_list

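# Hedged sketch of what the extraction above expects in the page source. The
# html fragment below is a made-up illustration of the tttuan/blogspot script
# variables, not a captured page.
def _example_extract_blogspot_urls():
    html_content = ('var slides_page_path = ["http://example.invalid/a/001.jpg"];\n'
                    'var slides_page_url_path = ["http://1.bp.blogspot.com/b/001.jpg",'
                    '"http://1.bp.blogspot.com/b/002.jpg"];')
    blogspot_urls = re.findall("var slides_page_url_path.*", html_content)
    blogspot_urls = re.findall("\[(.*)\]", blogspot_urls[0])
    # blogspot_urls[0] is the comma-separated, quoted url list; splitting on
    # "," and stripping quotes yields the page urls.
    return [re.sub("\"|\'", "", p).strip() for p in blogspot_urls[0].split(",")]
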
def _get_chapter_list_general(url, xpath, chapter_name_parser=_chapter_name_parser_general):
    """
    Get chapter list from url.

    :param url: chapter list's url
    :param xpath: xpath 'dictionary' passed from the calling class
    :param chapter_name_parser: chapter_name_parser 'function' passed from the
        calling class, default: _chapter_name_parser_general
    :return: chapter list, format : {'1': 'url-1', ..., 'n': 'url-n', 'summary': ['1', '2', ..., 'n']}
    :rtype: dict

    Create html_code_parser to convert HTML entities to UTF-8 text.
    Get the raw html_content; find all tags matching xpath (name + attrs),
    then find every url tag inside that result (for vechai the result is reversed).
    For each 'chapter' (a BeautifulSoup Tag):
        - Build 'chapter_url' from xpath['site_prefix'] + the href attribute.
        - Build 'chapter_name': for truyentranhtuan it is parsed from
          'chapter_url'; otherwise it is the tag's unescaped text.
        - Skip duplicated urls and urls not containing xpath['include_url'].
        - Skip urls matching xpath['exclude_url'].
        - Names without a number, or matching xpath['bonus'], go to 'chapter_bonus'.
        - Parse 'chapter_name' with 'chapter_name_parser'; names the parser
          rejects also go to 'chapter_bonus'.
        - On a duplicated 'chapter_name', shorten 15-5 => 15 (to avoid 15-5-1)
          and collect the urls in 'duplicated_list'.
        - Otherwise insert 'chapter_name' + 'chapter_url' into 'chapter_list'
          and 'chapter_list['summary']'.
    Merge duplicated chapters and bonus chapters back into 'chapter_list',
    then sort chapter_list['summary'] for the sites that need it.
    """
    duplicated_list = dict()
    chapter_list = dict()
    chapter_bonus = list()
    chapter_list['summary'] = list()
    removed_data = {
        'include': list(),
        'exclude': list()
    }
    html_code_parser = HTMLParser()
    try:
        html_content = utils.urllib2_get(url).read()
    except Exception as e:
        print e
        return chapter_list
    soup = BeautifulSoup(html_content).findAll(xpath['name'], xpath['attrs'])
    try:
        chapters = soup[0].findAll('a')
        if "vechai.info" in url:
            chapters = soup[1].findAll('a')
            chapters.reverse()
    except Exception as e:
        print e
        chapters = list()
    for chapter in chapters:
        chapter_url = str(xpath['site_prefix']) + chapter['href']
        if 'truyentranhtuan.com/' in url:
            chapter_name = chapter_url
        else:
            chapter_name = html_code_parser.unescape(chapter.string)
        if xpath['include_url'] not in chapter_url or chapter_url in chapter_list.values():
            removed_data['include'].append(chapter)
            continue
        if xpath['exclude_url'] and re.search("|".join(xpath['exclude_url']), chapter_url, flags=re.IGNORECASE) is not None:
            removed_data['exclude'].append(chapter)
            continue
        if not re.search("[0-9]+", chapter_name) or \
                (xpath['bonus'] and re.search("|".join(xpath['bonus']), chapter_name, flags=re.IGNORECASE)):
            chapter_bonus.append(chapter_url)
            continue
        chapter_name = chapter_name_parser(chapter_name=chapter_name, xpath=xpath)
        if not chapter_name:
            chapter_bonus.append(chapter_url)
            continue
        if (re.sub("(.*?)-.*", r"\1", chapter_name) or chapter_name) in chapter_list.keys():
            chapter_name = re.sub("(.*?)-.*", r"\1", chapter_name)
            if chapter_name not in duplicated_list.keys():
                duplicated_list[chapter_name] = list()
                duplicated_list[chapter_name].append(chapter_list[chapter_name])
            duplicated_list[chapter_name].append(chapter_url)
            continue
        chapter_list[chapter_name] = chapter_url
        chapter_list['summary'].append(chapter_name)
    chapter_list = _duplicate_chapters_parser(chapter_list=chapter_list,
                                              duplicated_list=duplicated_list)
    chapter_list = _chapter_bonus_parser(chapter_list=chapter_list, chapter_bonus=chapter_bonus)
    if PARSER_ENABLE_DEBUG:
        print "Chapter list removed data :", removed_data
    if re.search('|'.join(sort_chapter_list_summary), url, flags=re.IGNORECASE):
        chapter_list['summary'] = sorted(chapter_list['summary'],
                                         key=utils.sort_order_by_numeric, reverse=True)
    return chapter_list

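# Chapter sorting relies on utils.sort_order_by_numeric, defined elsewhere in
# the project. A hedged sketch of the assumed key function: extract the first
# numeric run so '10' sorts after '2' rather than lexicographically; the
# exact behavior of the real helper may differ.
def _example_sort_order_by_numeric(value):
    match = re.search("[0-9]+(\.[0-9]+)?", str(value))
    return float(match.group(0)) if match else 0.0
# e.g. sorted(['1', '10', '2'], key=_example_sort_order_by_numeric)
#      => ['1', '2', '10']
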
def _get_page_list_general(url, xpath):
    """
    Abstract method for get_page_list.

    :param url: page's url
    :param xpath: xpath 'dictionary', including advert urls that must be removed
    :return: list of page_url

    Create html_code_parser to convert HTML entities to UTF-8 text.
    Get raw html content from url => if an error occurs, return a blank list().
    Find all tags matching xpath (name + attrs), then collect urls into 'pages':
        If "textarea" (comicvn): find every src="..." attribute and extract the urls.
        If "script" (izmanga): find the script tag containing 'function load',
            take the line containing "data =" and split it into urls.
        Else (blogtr, hamtr, vechai): collect the src of every image tag.
    Loop through 'pages':
        Convert HTML entities to UTF-8 text.
        Remove advert urls.
        Remove the proxy wrapper.
        Strip whitespace at the beginning/end of the url.
        Replace space characters with %20 (url encoding).
    """
    page_list = list()
    pages = list()
    removed_data = list()
    html_code_parser = HTMLParser()
    try:
        html_content = utils.urllib2_get(url).read()
    except Exception as e:
        print e
        return list()
    soup = BeautifulSoup(html_content).findAll(name=xpath['name'], attrs=xpath['attrs'])
    if xpath['name'] == "textarea":  # comicvn
        p = re.compile("src=\".*?\"")
        for page in p.findall(str(soup)):
            page = re.sub("\"|src=", "", page)
            if page.strip():
                pages.append(page)
    elif xpath['name'] == "script":  # izmanga
        for i in soup:
            if "function load" in str(i):
                for x in str(i).splitlines():
                    if "data =" in x:
                        x = re.sub(".*[\'\"](.*)[\'\"].*", r"\1", x)
                        pages = x.split("|")
    else:  # blogtr, hamtr, vechai
        for div in soup:
            pages.extend(x['src'] for x in div.findAll('img'))
    for page in pages:
        page = html_code_parser.unescape(page)  # Convert HTML entities to UTF-8 text
        if xpath['advert'] and re.search("|".join(xpath['advert']), str(page.encode("utf-8")), flags=re.IGNORECASE) is not None:
            removed_data.append(page)  # Remove advert url
            continue
        if "googleusercontent.com/gadgets/proxy?container" in page:  # Remove proxy wrapper
            page = re.sub(".*url ?= ?(https?://.*)", r"\1", page)
        page = re.sub("^ +| +$|\r", "", page)  # Strip whitespace at the beginning/end of the url
        page = re.sub(" ", "%20", page)        # Replace space characters with %20 (url encoding)
        page_list.append(page)
    if PARSER_ENABLE_DEBUG:
        print "Page list removed data :", removed_data
    return page_list

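# Hedged check of the proxy-unwrapping regex shared with get_page_list; the
# googleusercontent url below is made up for illustration.
def _example_unwrap_proxy():
    page = ("http://images1-focus-opensocial.googleusercontent.com/gadgets/proxy"
            "?container=focus&url=http://example.com/pages/001.jpg")
    if "googleusercontent.com/gadgets/proxy?container" in page:
        page = re.sub(".*url ?= ?(https?://.*)", r"\1", page)
    return page  # => 'http://example.com/pages/001.jpg'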