def __init__(self, page_url):
    """Fetch page_url, retrying up to three times before giving up."""
    response = None
    self.url = None
    try:
        response = self.get_response_object(page_url)
    except Exception:
        try:
            # time.sleep(1)
            response = self.get_response_object(page_url)
        except Exception:
            try:
                # time.sleep(5)
                response = self.get_response_object(page_url)
            except Exception:
                print "\nTried 3 times. Cannot access url: %s. \nHence, cannot make HTML_page_Obj\n" % page_url
    if response is not None:
        self.url = page_url
        try:
            self.short_url = response.headers.getparam('Link')
        except Exception:
            self.short_url = ""
        self.charset = response.headers.getparam('charset')
        self.headers = {'charset': self.charset}
        for i in response.headers:
            self.headers[i] = response.headers[i].split(';')[0]
        self.html = response.read()  # the actual HTML
        self.html = text_manip.ensure_UTF8(self.html)
        self.html_soup = None
        self.link_dict = None
        self.all_style = None
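# Usage sketch (assumption: this __init__ belongs to HTML_page_Obj, as the
# error message above suggests; the URL below is only illustrative):
#
#   page = HTML_page_Obj("https://en.wikipedia.org/wiki/Main_Page")
#   if page.url is not None:  # url stays None when all three fetch attempts fail
#       print page.headers['charset']
#       print len(page.html)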
def __init__(self, wikipedia_image_url=None, direct_url=None):
    self.direct_image_link = None
    self.saved_path = None
    self.saved_filename = None
    if direct_url is not None:
        self.direct_image_link = direct_url
    elif wikipedia_image_url is not None:
        img_page = HTML_page_Obj(wikipedia_image_url)
        img_soup = img_page.make_soup()
        # print img_soup.find('div', class_="mw-filepage-resolutioninfo")
        self.main_img_link = img_soup.find(
            'div', class_="fullMedia").find(
            'a', class_="internal").get('href')
        if self.main_img_link.startswith("//upload."):
            self.main_img_link = "https:" + self.main_img_link
        img_links_soup = img_soup.find(
            "div", class_="mw-filepage-resolutioninfo")
        # Maps the product of width and height to tuples of each
        # variant's link and its pixel dimensions:
        self.img_px_to_links_map = {}
        for img_link in img_links_soup.find_all('a'):
            actual_link = img_link.get('href')
            if actual_link.startswith("//upload."):
                actual_link = "https:" + actual_link
            img_link_text = text_manip.ensure_UTF8(
                re.sub(",", "", img_link.get_text()))
            # print text_manip.ensure_ASCII("\n\nimg_link_text: %s" % img_link_text)
            img_dimensions = re.findall(r'\d+', img_link_text)
            if len(img_dimensions) == 2:
                # Only index into img_dimensions after confirming that both
                # width and height were found, to avoid an IndexError:
                width = int(img_dimensions[0])
                height = int(img_dimensions[1])
                self.img_px_to_links_map[width * height] = (
                    actual_link, width, height)
            else:
                print "Cannot use image %s, which is at %s with dimensions %s" % (
                    wikipedia_image_url, img_link.get('href'), img_dimensions)
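# Usage sketch (assumption: this __init__ belongs to the image class built on
# HTML_page_Obj; the class name and File: URL below are only illustrative).
# Since img_px_to_links_map is keyed by width*height, the highest-resolution
# variant is the entry with the largest key:
#
#   img = Wikipedia_Image_Obj(
#       wikipedia_image_url="https://en.wikipedia.org/wiki/File:Example.jpg")
#   best_link, width, height = img.img_px_to_links_map[
#       max(img.img_px_to_links_map)]
#   print "largest variant: %dx%d at %s" % (width, height, best_link)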
def html_soup_and_prettify(self):
    # Lazily build the soup once and cache it on the instance:
    if self.html_soup is None:
        self.html_soup = self.make_soup()
    self.html = text_manip.ensure_UTF8(self.html_soup.prettify())
    return self.html_soup
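# Usage sketch: repeated calls reuse the cached soup instead of re-parsing,
# so this is cheap to call whenever the prettified HTML is needed:
#
#   soup = page.html_soup_and_prettify()       # parses once, caches in html_soup
#   same_soup = page.html_soup_and_prettify()  # returns the cached soup
#   assert soup is same_soup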
def __init__(self, wikipedia_url, just_heading=False):
    # Inherit items from the HTML_page_Obj:
    self._page = HTML_page_Obj(wikipedia_url)
    self.url = self._page.url
    self.wiki_domain = text_manip.extract_website(
        self.url)  # gets en.wikipedia.org, etc., in different languages

    # Make the soup of the HTML and get the HTML, prettified:
    soup = self._page.html_soup_and_prettify()
    self.html = self._page.html  # prettified, in UTF-8 format

    # Get the heading of the Wikipedia page:
    self.heading = text_manip.ensure_UTF8(
        BeautifulSoup('''%s''' % soup.find(
            "h1", id="firstHeading",
            class_="firstHeading")).get_text().strip())

    if not just_heading:  # skip the heavy article processing when only the heading is needed
        # Make a soup of the article HTML, set the article HTML:
        article_soup = soup.find(
            "div", id="mw-content-text")  # might come in handy later
        self.article_html = text_manip.ensure_UTF8(article_soup.prettify())

        # Strip the reference list and the trailing "References", "See also",
        # "Further reading" and "External links" sections, so that only the
        # article body is parsed for links:
        temp_art_html = self.article_html
        temp_art_html = text_manip.remove_HTML_perfect(
            html=temp_art_html, tag="div", class_list=["reflist"])
        temp_art_html = text_manip.remove_after(
            regex='<span.*?id="References"', text=temp_art_html)
        temp_art_html = text_manip.remove_after(
            regex='<span.*?id="See_also"', text=temp_art_html)
        temp_art_html = text_manip.remove_after(
            regex='<span.*?id="Further_reading"', text=temp_art_html)
        temp_art_html = text_manip.remove_after(
            regex='<span.*?id="External_links"', text=temp_art_html)
        article_soup = BeautifulSoup(temp_art_html)

        # Get the links from the article HTML:
        self.article_links = []  # real links, from the article
        self.img_links = []      # real direct links, from the article
        self.direct_img_to_img_map = {}
        self.media_links = []    # real links, from the article
        not_allowed_in_url_from_article = [
            "/wiki/Special:", "/wiki/Template:", "/wiki/Portal:",
            "/wiki/Wikipedia:", "/wiki/Help:", "redlink=1"
        ]
        for link_tag in article_soup.find_all('a'):
            link = link_tag.get('href')
            if link is not None and link.startswith("/wiki/"):  # only keep internal Wikipedia links
                allowed_flag = True
                for allowed_test in not_allowed_in_url_from_article:
                    if allowed_test in link:
                        allowed_flag = False
                        break
                if allowed_flag:
                    link = text_manip.ensure_UTF8(link)
                    mapped_link = "https://" + self.wiki_domain + link
                    if not link.startswith("/wiki/File:"):
                        self.article_links.append(mapped_link)
                    elif link.endswith('.ogg'):  # video or audio file
                        self.media_links.append(mapped_link)
                    else:  # image file
                        img_tag = link_tag.img
                        # print "\n\n"
                        # print "link_tag :", link_tag
                        # print "\timg_tag :", img_tag
                        if img_tag is not None:
                            direct_img_link = img_tag.get('src')
                            # print "\t\tdirect_img_link :", direct_img_link
                            self.img_links.append(direct_img_link)
                            self.direct_img_to_img_map[
                                direct_img_link] = mapped_link

        # De-duplicate the collected links:
        self.article_links = list(set(self.article_links))
        self.img_links = list(set(self.img_links))
        self.media_links = list(set(self.media_links))

        # Replace the /wiki/ links with absolute links. Preserve the old
        # HTML, and use replaced_html from now on:
        self.replaced_html = self.html
        for link_tag in soup.find_all('a'):
            link = link_tag.get('href')
            if link is not None and link.startswith("/wiki/"):
                mapped_link = "https://" + self.wiki_domain + link
                self.replaced_html = self.replaced_html.replace(
                    'href="%s' % link, 'href="%s' % mapped_link)
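# Usage sketch (assumption: this __init__ belongs to the Wikipedia article
# class built on HTML_page_Obj; the class name and URL are only illustrative):
#
#   wiki = Wikipedia_page_Obj("https://en.wikipedia.org/wiki/Python_(programming_language)")
#   print wiki.heading                  # page title, UTF-8
#   print len(wiki.article_links)       # absolute article links, de-duplicated
#   for direct, file_page in wiki.direct_img_to_img_map.items():
#       print direct, "->", file_page   # <img src> mapped to its /wiki/File: page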