Example #1
def main():
    for row in read_file():
        browser = Browser(debug=False, use_debug_proxy=False)
        create_post_url = "https://www.123saunas.com/customer/account/createpost/"
        # the first request is only needed so the server sets its session cookies
        browser.get(create_post_url)
        res = browser.get(create_post_url)
        html_text = res.text
        soap_page = BeautifulSoup(html_text)

        form_key = soap_page.find('input', {'name': 'form_key'}).get('value')
        res2 = browser.post(
            'https://www.123saunas.com/customer/account/createpost/', {
                "success_url": "",
                "error_url": "",
                "form_key": form_key,
                "firstname": row['FirstName'],
                "middlename": "",
                "lastname": row['Lastname'],
                "email": row['Email'],
                "password": row['Password'],
                "confirmation": row['Password_confirm'],
                "persistent_remember_me": "on",
            },
            headers={'Content-Type': 'application/x-www-form-urlencoded'})

        if res2.url == "https://www.123saunas.com/customer/account/index/":
            save_to_file("valid_reg.txt", row)
        else:
            save_to_file("checkagain_reg.txt", row)

        time.sleep(10)
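A minimal sketch of the same registration flow using requests.Session in place of the project-specific Browser helper; the URL and form field names come from the example above, while the account values are invented placeholders:

import requests
from bs4 import BeautifulSoup

session = requests.Session()
create_post_url = "https://www.123saunas.com/customer/account/createpost/"
session.get(create_post_url)  # the first request only sets session cookies
page = BeautifulSoup(session.get(create_post_url).text, 'html.parser')
form_key = page.find('input', {'name': 'form_key'}).get('value')
response = session.post(create_post_url, data={
    'form_key': form_key,
    'firstname': 'Jane',
    'lastname': 'Doe',
    'email': 'jane@example.com',
    'password': 'secret123',
    'confirmation': 'secret123',
})
print(response.url)  # the account index URL indicates a successful registration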
Example #2
    def _extract_asset_tags(self, text):
        """
        Extract asset tags from text into a convenient form.

        @param text: Text to extract asset tags from. This text contains HTML
            code that is parsed by BeautifulSoup.
        @type text: str

        @return: Asset map.
        @rtype: {
            '<id>': {
                'name': '<name>',
                'extension': '<extension>'
            },
            ...
        }
        """
        soup = BeautifulSoup(text)
        asset_tags_map = {}

        for asset in soup.find_all('asset'):
            asset_tags_map[asset['id']] = {
                'name': asset['name'],
                'extension': asset['extension']
            }

        return asset_tags_map
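A standalone sketch of the same extraction, with invented sample markup (bs4 assumed):

from bs4 import BeautifulSoup

def extract_asset_tags(text):
    # illustration-only version of the method above
    soup = BeautifulSoup(text, 'html.parser')
    return {asset['id']: {'name': asset['name'], 'extension': asset['extension']}
            for asset in soup.find_all('asset')}

sample = '<asset id="42" name="slides" extension="pdf"></asset>'
print(extract_asset_tags(sample))
# {'42': {'name': 'slides', 'extension': 'pdf'}}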
Example #3
    def _extract_asset_tags(self, text):
        """
        Extract asset tags from text into a convenient form.

        @param text: Text to extract asset tags from. This text contains HTML
            code that is parsed by BeautifulSoup.
        @type text: str

        @return: Asset map.
        @rtype: {
            '<id>': {
                'name': '<name>',
                'extension': '<extension>'
            },
            ...
        }
        """
        soup = BeautifulSoup(text)
        asset_tags_map = {}

        for asset in soup.find_all('asset'):
            asset_tags_map[asset['id']] = {'name': asset['name'],
                                           'extension': asset['extension']}

        return asset_tags_map
Example #4
    def parse_search_page(self, url):
        url_list = []
        for _ in range(self.page_count):
            res = self.browser.get(fix_url(url))
            html_text = res.text
            soap_page = BeautifulSoup(html_text)

            items = soap_page.find("ul", {"id": re.compile(r"list-items")})
            for prod_element in items.find_all("li"):
                link = prod_element.find(
                    "a", {"href": re.compile("aliexpress.com/item")})
                url_list.append(fix_url(link["href"]))

            try:
                url = soap_page.find("div", {
                    "class": "ui-pagination-navi"
                }).find("a", {
                    "class": "page-next"
                }).attrs["href"]
            except Exception as e:
                logger.debug(e)
                break
        return url_list
Example #5
def get_old_style_video(session, url):
    """
    Parse an old style Coursera video page.
    """

    page = get_page(session, url)
    soup = BeautifulSoup(page)
    return soup.find(attrs={'type': re.compile('^video/mp4')})['src']
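The regex attribute match used above can be tried on a small, invented snippet:

import re
from bs4 import BeautifulSoup

html = '<video><source type="video/mp4; codecs=avc1" src="https://example.org/lecture.mp4"></video>'
soup = BeautifulSoup(html, 'html.parser')
print(soup.find(attrs={'type': re.compile('^video/mp4')})['src'])
# https://example.org/lecture.mp4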
Example #6
    def parse_details(self):
        details_url = fix_url(
            re.search(r'window.runParams.descUrl="(.*?)";',
                      self.main_page_soap.text).group(1))
        response = self.browser.get(details_url)
        soup = BS(response.text)
        only_text = soup.getText().replace("window.productDescription=",
                                           "").strip(" ")
        self.save_param('details', only_text)
Example #7
    def parse_feedbacks(self):
        feedback_url = fix_url(
            self.main_page_soap.find(id="feedback").iframe['thesrc'])
        comments = []
        last_page_count = None
        for page_count in range(1, 10000):
            feedback_r = self.browser.post(feedback_url, {"page": page_count})
            feddback_soap = BS(feedback_r.text)
            if not last_page_count:
                try:
                    a_tags = feddback_soap.find(
                        "div", {
                            "class": "ui-pagination-navi util-left"
                        }).find_all("a")
                    last_page_count = int(a_tags[-2].text)
                except Exception:
                    # pagination block not found; leave last_page_count unset
                    pass
            elif last_page_count < page_count:
                break

            for comment_div in feddback_soap.find_all(
                    'div', {'class': 'feedback-item'}):
                try:
                    comment = {}
                    user_data = comment_div.find('div',
                                                 {'class': 'fb-user-info'})
                    try:
                        user_name = user_data.span.a.text
                    except AttributeError:
                        user_name = user_data.span.text

                    comment['user_name'] = user_name
                    comment['country'] = user_data.b.text
                    comment['comment'] = comment_div.find(
                        'dt', {
                            'class': 'buyer-feedback'
                        }).span.text
                    comment['posted_time'] = comment_div.find(
                        'dd', {
                            "class": "r-time"
                        }).text

                    start_css = comment_div.find('span', {
                        "class": "star-view"
                    }).span["style"]
                    comment["rating"] = start_css[start_css.find(":"):].strip(
                        "%")
                    comments.append(comment)
                except Exception as e:
                    logger.debug(e)

            if self.max_comments < len(comments):
                logger.info("Stopped comments fetching by max_transactions")
                break
        self.save_param('comments', comments)
Example #8
    def _extract_links_from_a_tags_in_text(self, text):
        """
        Extract supplement links from the html text that contains <a> tags
        with href attribute.

        @param text: HTML text.
        @type text: str

        @return: Dictionary with supplement links grouped by extension.
        @rtype: {
            '<extension1>': [
                ('<link1>', '<title1>'),
                ('<link2>', '<title2>')
            ],
            '<extension2>': [
                ('<link3>', '<title3>'),
                ('<link4>', '<title4>')
            ]
        }
        """
        soup = BeautifulSoup(text)
        links = [
            item['href'].strip() for item in soup.find_all('a')
            if 'href' in item.attrs
        ]
        links = sorted(list(set(links)))
        supplement_links = {}

        for link in links:
            filename, extension = os.path.splitext(clean_url(link))
            # Some courses put links to sites in supplement section, e.g.:
            # http://pandas.pydata.org/
            if extension == '':
                continue

            # Make lowercase and cut the leading/trailing dot
            extension = clean_filename(extension.lower().strip('.').strip(),
                                       self._unrestricted_filenames)
            basename = clean_filename(os.path.basename(filename),
                                      self._unrestricted_filenames)
            if extension not in supplement_links:
                supplement_links[extension] = []
            # Putting basename into the second slot of the tuple is important
            # because that will allow to download many supplements within a
            # single lecture, e.g.:
            # 01_slides-presented-in-this-module.pdf
            # 01_slides-presented-in-this-module_Dalal-cvpr05.pdf
            # 01_slides-presented-in-this-module_LM-3dtexton.pdf
            supplement_links[extension].append((link, basename))

        return supplement_links
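A simplified standalone sketch of the same grouping; clean_url and clean_filename are project helpers and are left out here, and the sample links are invented:

import os
from bs4 import BeautifulSoup

html = ('<a href="https://example.org/notes.pdf">notes</a>'
        '<a href="http://pandas.pydata.org/">site</a>')
soup = BeautifulSoup(html, 'html.parser')
links = sorted({a['href'].strip() for a in soup.find_all('a') if 'href' in a.attrs})
grouped = {}
for link in links:
    filename, extension = os.path.splitext(link)
    if extension == '':
        continue  # plain site links such as http://pandas.pydata.org/ carry no extension
    grouped.setdefault(extension.lstrip('.').lower(), []).append(
        (link, os.path.basename(filename)))
print(grouped)
# {'pdf': [('https://example.org/notes.pdf', 'notes')]}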
Example #9
    def _extract_links_from_a_tags_in_text(self, text):
        """
        Extract supplement links from the html text that contains <a> tags
        with href attribute.

        @param text: HTML text.
        @type text: str

        @return: Dictionary with supplement links grouped by extension.
        @rtype: {
            '<extension1>': [
                ('<link1>', '<title1>'),
                ('<link2>', '<title2>')
            ],
            '<extension2>': [
                ('<link3>', '<title3>'),
                ('<link4>', '<title4>')
            ]
        }
        """
        soup = BeautifulSoup(text)
        links = [item['href'].strip()
                 for item in soup.find_all('a') if 'href' in item.attrs]
        links = sorted(list(set(links)))
        supplement_links = {}

        for link in links:
            filename, extension = os.path.splitext(clean_url(link))
            # Some courses put links to sites in supplement section, e.g.:
            # http://pandas.pydata.org/
            if extension == '':
                continue

            # Make lowercase and cut the leading/trailing dot
            extension = clean_filename(
                extension.lower().strip('.').strip(),
                self._unrestricted_filenames)
            basename = clean_filename(
                os.path.basename(filename),
                self._unrestricted_filenames)
            if extension not in supplement_links:
                supplement_links[extension] = []
            # Putting basename into the second slot of the tuple is important
            # because that will allow to download many supplements within a
            # single lecture, e.g.:
            # 01_slides-presented-in-this-module.pdf
            # 01_slides-presented-in-this-module_Dalal-cvpr05.pdf
            # 01_slides-presented-in-this-module_LM-3dtexton.pdf
            supplement_links[extension].append((link, basename))

        return supplement_links
Example #10
    def _prettify_instructions(self, text):
        """
        Prettify instructions text to make it more suitable for offline reading.

        @param text: HTML (kinda) text to prettify.
        @type text: str

        @return: Prettified HTML with several markup tags replaced with HTML
            equivalents.
        @rtype: str
        """
        soup = BeautifulSoup(text)
        self._convert_instructions_basic(soup)
        self._convert_instructions_images(soup)
        return soup.prettify()
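prettify() re-indents the parsed tree one space per nesting level, roughly:

from bs4 import BeautifulSoup

print(BeautifulSoup('<div><p>hi</p></div>', 'html.parser').prettify())
# <div>
#  <p>
#   hi
#  </p>
# </div>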
Example #11
    def _convert_markup_basic(self, soup):
        """
        Perform basic conversion of instructions markup. This includes
        replacement of several textual markup tags with their HTML equivalents.

        @param soup: BeautifulSoup instance.
        @type soup: BeautifulSoup
        """
        # Inject meta charset tag
        meta = soup.new_tag('meta', charset='UTF-8')
        soup.insert(0, meta)

        # 1. Inject basic CSS style
        css_soup = BeautifulSoup(INSTRUCTIONS_HTML_INJECTION)
        soup.append(css_soup)

        # 2. Replace <text> with <p>
        while soup.find('text'):
            soup.find('text').name = 'p'

        # 3. Replace <heading level="1"> with <h1>
        while soup.find('heading'):
            heading = soup.find('heading')
            heading.name = 'h%s' % heading.attrs.get('level', '1')

        # 4. Replace <code> with <pre>
        while soup.find('code'):
            soup.find('code').name = 'pre'

        # 5. Replace <list> with <ol> or <ul>
        while soup.find('list'):
            list_ = soup.find('list')
            type_ = list_.attrs.get('bullettype', 'numbers')
            list_.name = 'ol' if type_ == 'numbers' else 'ul'
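Reassigning Tag.name is what drives the replacements above; a tiny check with invented markup (note that existing attributes such as level stay on the renamed tag):

from bs4 import BeautifulSoup

soup = BeautifulSoup('<text>Intro</text><heading level="2">Setup</heading>', 'html.parser')
while soup.find('text'):
    soup.find('text').name = 'p'
heading = soup.find('heading')
heading.name = 'h%s' % heading.attrs.get('level', '1')
print(soup)
# <p>Intro</p><h2 level="2">Setup</h2>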
Example #12
    def __call__(self, markup):
        """
        Convert instructions markup to make it more suitable for
        offline reading.

        @param markup: HTML (kinda) markup to prettify.
        @type markup: str

        @return: Prettified HTML with several markup tags replaced with HTML
            equivalents.
        @rtype: str
        """
        soup = BeautifulSoup(markup)
        self._convert_markup_basic(soup)
        self._convert_markup_images(soup)
        self._convert_markup_audios(soup)
        return soup.prettify()
Example #13
def grab_hidden_video_url(session, href):
    """
    Follow some extra redirects to grab hidden video URLs.

    The first of these "hidden" videos were seen in courses from the
    University of Washington, but others appeared after that (like in the
    course Social Psychology).
    """
    try:
        page = get_page(session, href)
    except requests.exceptions.HTTPError:
        return None

    soup = BeautifulSoup(page)
    source_tag = soup.find('source', attrs={'type': 'video/mp4'})
    if source_tag is not None:
        return source_tag['src']
    else:
        return None
Example #14
    def __init__(self,
                 browser,
                 detail_url,
                 max_comments=100,
                 max_transactions=100):
        self.browser = browser
        self.max_comments = max_comments
        self.max_transactions = max_transactions
        self.product_id = get_product_id_from_url(detail_url)
        self.detail_url = detail_url
        res = browser.get(fix_url(detail_url))
        self.main_page_soap = BS(res.text)
        self.item = {}
Example #15
    def parse_sale_page(self, url):
        res = self.browser.get(fix_url(url))
        html_text = res.text
        soap_page = BeautifulSoup(html_text)
        var = soap_page(text=re.compile(r'data_widgety5zzyn'))
        json_data = json.loads(var[0][var[0].index('{'):])
        products_url = json_data["source"]["url"]
        res = self.browser.get(fix_url(products_url))
        # the product list is returned as a JSONP payload: onJSONPCallback({...});
        json_data = json.loads(
            res.text.lstrip("onJSONPCallback(").rstrip(");"))
        nodeList = json_data['content']['nodeList'][0]
        name = nodeList['name']
        return [item['detailUrl'] for item in nodeList['nodeData']['dataList']]
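Because str.lstrip/rstrip strip character sets rather than literal prefixes, slicing between the outer parentheses is a more precise way to unwrap the JSONP payload (callback name and URL invented):

import json

payload = 'onJSONPCallback({"source": {"url": "https://example.org/products"}});'
body = payload[payload.index('(') + 1:payload.rindex(')')]
print(json.loads(body)["source"]["url"])
# https://example.org/products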
Example #16
class AliexpressPageParser:
    def __init__(self,
                 browser,
                 detail_url,
                 max_comments=100,
                 max_transactions=100):
        self.browser = browser
        self.max_comments = max_comments
        self.max_transactions = max_transactions
        self.product_id = get_product_id_from_url(detail_url)
        self.detail_url = detail_url
        res = browser.get(fix_url(detail_url))
        self.main_page_soap = BS(res.text)
        self.item = {}

    def run(self):
        """
          Call methods which has parse_ prefix
          Position is important !!!
        """
        methods = [
            getattr(self, m) for m in dir(self) if m.startswith("parse_")
        ]
        for method in methods:
            try:
                method()
            except Exception as e:
                logger.exception(e)

        return self.item

    def save_param(self, key, value):
        self.item[key] = value

    def get_data(self, tag, attrs, val_type="str"):
        """
        Get values from main page
        :param tag: String
            Example: "h2"
        :param attrs: Dict
            {"class": "class_name"}
        :return String
        """
        text = ""
        try:
            text = self.main_page_soap.find(tag, attrs).text
            if val_type != "str":
                m = re.search(r"[-+]?\d*\.\d+|\d+", text)
                if m:
                    text = m.group()
        except Exception as e:
            logger.debug("PASS: tag: %s, attrs: %s", tag, attrs)
        return text

    def parse_commond_data(self):
        self.save_param('detailUrl', self.detail_url)
        self.save_param('ali_id', self.product_id)
        self.save_param('title', self.get_data("h1",
                                               {"class": "product-name"}))
        self.save_param('avgStar',
                        self.get_data("span", {"class": "percent-num"}))
        self.save_param('discount',
                        self.get_data("span", {"class": "p-discount-rate"}))
        self.save_param('minPrice', self.get_data("span",
                                                  {"id": "j-sku-price"}))
        self.save_param('minMobPromPrice',
                        self.get_data("span", {"id": "j-sku-discount-price"}))
        self.save_param('promLeft',
                        self.get_data("span", {"class": "p-eventtime-left"}))
        self.save_param('orderNum',
                        self.get_data("span", {"id": "j-order-num"}, "int"))
        self.save_param('rantingsNum',
                        self.get_data("span", {"id": "rantings-num"}, "int"))

    def parse_description(self):
        descriptions = []
        for li in self.main_page_soap.find('ul', {
                'class': 'product-property-list'
        }).find_all('li'):
            description = {}
            try:
                key, val = li.find_all("span")
                description[key.text.strip(":")] = val.text
                descriptions.append(description)
            except Exception as e:
                logger.debug(e)

        self.save_param('description', descriptions)

    def parse_details(self):
        details_url = fix_url(
            re.search(r'window.runParams.descUrl="(.*?)";',
                      self.main_page_soap.text).group(1))
        response = self.browser.get(details_url)
        soup = BS(response.text)
        only_text = soup.getText().replace("window.productDescription=",
                                           "").strip(" ")
        self.save_param('details', only_text)

    def parse_images(self):
        images = []
        for image in self.main_page_soap.find_all('span',
                                                  {'class': 'img-thumb-item'}):
            origin_image_path = origin_image(image.img['src'])
            images.append(origin_image_path)

        self.save_param('images', images)

    def parse_feedbacks(self):
        feedback_url = fix_url(
            self.main_page_soap.find(id="feedback").iframe['thesrc'])
        comments = []
        last_page_count = None
        for page_count in range(1, 10000):
            feedback_r = self.browser.post(feedback_url, {"page": page_count})
            feddback_soap = BS(feedback_r.text)
            if not last_page_count:
                try:
                    a_tags = feddback_soap.find(
                        "div", {
                            "class": "ui-pagination-navi util-left"
                        }).find_all("a")
                    last_page_count = int(a_tags[-2].text)
                except Exception:
                    # pagination block not found; leave last_page_count unset
                    pass
            elif last_page_count < page_count:
                break

            for comment_div in feddback_soap.find_all(
                    'div', {'class': 'feedback-item'}):
                try:
                    comment = {}
                    user_data = comment_div.find('div',
                                                 {'class': 'fb-user-info'})
                    try:
                        user_name = user_data.span.a.text
                    except AttributeError:
                        user_name = user_data.span.text

                    comment['user_name'] = user_name
                    comment['country'] = user_data.b.text
                    comment['comment'] = comment_div.find(
                        'dt', {
                            'class': 'buyer-feedback'
                        }).span.text
                    comment['posted_time'] = comment_div.find(
                        'dd', {
                            "class": "r-time"
                        }).text

                    start_css = comment_div.find('span', {
                        "class": "star-view"
                    }).span["style"]
                    comment["rating"] = start_css[start_css.find(":"):].strip(
                        "%")
                    comments.append(comment)
                except Exception as e:
                    logger.debug(e)

            if self.max_comments < len(comments):
                logger.info("Stopped comments fetching by max_transactions")
                break
        self.save_param('comments', comments)

    def parse_history_transactions(self):
        history_transaction = "https://feedback.aliexpress.com/display/evaluationProductDetailAjaxService.htm?" \
                              "productId=%s&type=default" % self.product_id
        transactions = []
        last_page = None
        for page_count in range(1, 100000):
            transaction_r = self.browser.get(history_transaction,
                                             {'page': page_count})
            transaction_json = transaction_r.json()
            if not last_page:
                last_page = int(transaction_json['page']['total'])
            elif last_page < page_count:
                break

            for records in transaction_json['records']:
                transactions.append(records)

            if self.max_transactions < len(transactions):
                logger.info(
                    "Stopped fetching transactions: max_transactions limit reached")
                break

        self.save_param('transaction', transactions)
Example #17
def parse_old_style_syllabus(session, page, reverse=False, unrestricted_filenames=False,
                             subtitle_language='en'):
    """
    Parse an old style Coursera course listing/syllabus page.

    Each section is a week of classes.
    """

    sections = []
    soup = BeautifulSoup(page)

    # traverse sections
    stags = soup.findAll(attrs={'class': re.compile('^course-item-list-header')})
    for stag in stags:
        assert stag.contents[0] is not None, "couldn't find section"
        untouched_fname = stag.contents[0].contents[1]
        section_name = clean_filename(untouched_fname, unrestricted_filenames)
        logging.info(section_name)
        lectures = []  # resources for 1 lecture

        # traverse resources (e.g., video, ppt, ..)
        for vtag in stag.nextSibling.findAll('li'):
            assert vtag.a.contents[0], "couldn't get lecture name"
            untouched_fname = vtag.a.contents[0]
            vname = clean_filename(untouched_fname, unrestricted_filenames)
            logging.info('  %s', vname)
            lecture = {}
            lecture_page = None

            for a in vtag.findAll('a'):
                href = fix_url(a['href'])
                untouched_fname = a.get('title', '')
                title = clean_filename(untouched_fname, unrestricted_filenames)
                fmt = get_anchor_format(href)
                if fmt in ('srt', 'txt') and subtitle_language != 'en':
                    title = title.replace('_en&format', '_' + subtitle_language + '&format')
                    href = href.replace('_en&format', '_' + subtitle_language + '&format')

                logging.debug('    %s %s', fmt, href)
                if fmt:
                    lecture[fmt] = lecture.get(fmt, [])
                    lecture[fmt].append((href, title))
                    continue

                # Special case: find preview URLs
                lecture_page = transform_preview_url(href)
                if lecture_page:
                    try:
                        href = get_old_style_video(session, lecture_page)
                        lecture['mp4'] = lecture.get('mp4', [])
                        lecture['mp4'].append((fix_url(href), ''))
                    except TypeError:
                        logging.warning(
                            'Could not get resource: %s', lecture_page)

            # Special case: we possibly have hidden video links---thanks to
            # the University of Washington for that.
            if 'mp4' not in lecture:
                for a in vtag.findAll('a'):
                    if a.get('data-modal-iframe'):
                        href = grab_hidden_video_url(
                            session, a['data-modal-iframe'])
                        href = fix_url(href)
                        fmt = 'mp4'
                        logging.debug('    %s %s', fmt, href)
                        if href is not None:
                            lecture[fmt] = lecture.get(fmt, [])
                            lecture[fmt].append((href, ''))

            for fmt in lecture:
                count = len(lecture[fmt])
                for i, r in enumerate(lecture[fmt]):
                    if count == i + 1:
                        # for backward compatibility, we do not add the title
                        # to the filename (format_combine_number_resource and
                        # format_resource)
                        lecture[fmt][i] = (r[0], '')
                    else:
                        # make sure the title is unique
                        lecture[fmt][i] = (r[0], '{0:d}_{1}'.format(i, r[1]))

            lectures.append((vname, lecture))

        sections.append((section_name, lectures))

    logging.info('Found %d sections and %d lectures on this page',
                 len(sections), sum(len(s[1]) for s in sections))

    if sections and reverse:
        sections.reverse()

    if not sections:
        logging.error('The cookies file may be invalid, '
                      'please re-run with the `--clear-cache` option.')

    return sections
Example #18
    def _replace_tag(self, text, initial_tag, target_tag):
        # rename every <initial_tag> element in the parsed markup to <target_tag>
        soup = BeautifulSoup(text)
        while soup.find(initial_tag):
            soup.find(initial_tag).name = target_tag
        return soup.prettify()