Example #1
    def parse_search_page(self, url):
        url_list = []
        for _ in range(self.page_count):
            res = self.browser.get(fix_url(url))
            html_text = res.text
            soap_page = BeautifulSoup(html_text)

            for prod_element in soap_page.find("ul", {
                    "id": re.compile(r"list-items")
            }).find_all("li"):
                url_list.append(
                    fix_url(
                        prod_element.find(
                            "a", {"href": re.compile("aliexpress.com/item")
                                  })["href"]))

            try:
                url = soap_page.find("div", {
                    "class": "ui-pagination-navi"
                }).find("a", {
                    "class": "page-next"
                }).attrs["href"]
            except Exception as e:
                logger.debug(e)
                break
        return url_list
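
All of the examples on this page rely on a fix_url() helper that is not shown in the snippets. A minimal sketch of what such a helper typically does, assuming it only strips stray whitespace and completes protocol-relative "//" links, is given below; the real implementations in these projects may differ.

def fix_url(url):
    """Sketch of the assumed fix_url helper (not the projects' actual code)."""
    if url is None:
        return None
    # drop stray whitespace around scraped hrefs
    url = url.strip()
    # complete protocol-relative links such as "//example.com/item"
    if url.startswith('//'):
        url = 'http:' + url
    return url
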
Example #2
 def parse_sale_page(self, url):
     res = self.browser.get(fix_url(url))
     html_text = res.text
     soap_page = BeautifulSoup(html_text)
     var = soap_page(text=re.compile(r'data_widgety5zzyn'))
     json_data = json.loads(var[0][var[0].index('{'):])
     products_url = json_data["source"]["url"]
     res = self.browser.get(fix_url(products_url))
     # strip the JSONP wrapper ("onJSONPCallback(...);") before parsing
     json_data = json.loads(
         res.text.lstrip("onJSONPCallback(").rstrip(");"))
     nodeList = json_data['content']['nodeList'][0]
     name = nodeList['name']
     return [item['detailUrl'] for item in nodeList['nodeData']['dataList']]
Example #3
 def parse_details(self):
     details_url = fix_url(
         re.search(r'window.runParams.descUrl="(.*?)";',
                   self.main_page_soap.text).group(1))
     response = self.browser.get(details_url)
     soup = BS(response.text)
     only_text = soup.getText().replace("window.productDescription=",
                                        "").strip(" ")
     self.save_param('details', only_text)
Example #4
    def parse_feedbacks(self):
        feedback_url = fix_url(
            self.main_page_soap.find(id="feedback").iframe['thesrc'])
        comments = []
        last_page_count = None
        for page_count in range(1, 10000):
            feedback_r = self.browser.post(feedback_url, {"page": page_count})
            feddback_soap = BS(feedback_r.text)
            if not last_page_count:
                try:
                    a_tags = feddback_soap.find(
                        "div", {
                            "class": "ui-pagination-navi util-left"
                        }).find_all("a")
                    last_page_count = int(a_tags[len(a_tags) - 2].text)
                except Exception as e:
                    pass
            elif last_page_count < page_count:
                break

            for comment_div in feddback_soap.find_all(
                    'div', {'class': 'feedback-item'}):
                try:
                    comment = {}
                    user_data = comment_div.find('div',
                                                 {'class': 'fb-user-info'})
                    try:
                        user_name = user_data.span.a.text
                    except AttributeError:
                        user_name = user_data.span.text

                    comment['user_name'] = user_name
                    comment['country'] = user_data.b.text
                    comment['comment'] = comment_div.find(
                        'dt', {
                            'class': 'buyer-feedback'
                        }).span.text
                    comment['posted_time'] = comment_div.find(
                        'dd', {
                            "class": "r-time"
                        }).text

                    star_css = comment_div.find('span', {
                        "class": "star-view"
                    }).span["style"]
                    # the style is like "width:80%"; keep only the number
                    comment["rating"] = star_css[star_css.find(":") + 1:].strip("%")
                    comments.append(comment)
                except Exception as e:
                    logger.debug(e)

            if self.max_comments < len(comments):
                logger.info("Stopped fetching comments at max_comments")
                break
        self.save_param('comments', comments)
Example #5
 def __init__(self,
              browser,
              detail_url,
              max_comments=100,
              max_transactions=100):
     self.browser = browser
     self.max_comments = max_comments
     self.max_transactions = max_transactions
     self.product_id = get_product_id_from_url(detail_url)
     self.detail_url = detail_url
     res = browser.get(fix_url(detail_url))
     self.main_page_soap = BS(res.text)
     self.item = {}
Example #6
def parse_syllabus(session, page, reverse=False):
    """
    Parses a Coursera course listing/syllabus page.  Each section is a week
    of classes.
    """

    sections = []
    soup = BeautifulSoup(page)

    # traverse sections
    for stag in soup.findAll(attrs={'class':
                                    re.compile('^course-item-list-header')}):
        assert stag.contents[0] is not None, "couldn't find section"
        section_name = clean_filename(stag.contents[0].contents[1])
        logging.info(section_name)
        lectures = []  # resources for 1 lecture

        # traverse resources (e.g., video, ppt, ..)
        for vtag in stag.nextSibling.findAll('li'):
            assert vtag.a.contents[0], "couldn't get lecture name"
            vname = clean_filename(vtag.a.contents[0])
            logging.info('  %s', vname)
            lecture = {}
            lecture_page = None

            for a in vtag.findAll('a'):
                href = fix_url(a['href'])
                title = clean_filename(a.get('title', ''))
                fmt = get_anchor_format(href)
                logging.debug('    %s %s', fmt, href)
                if fmt:
                    lecture[fmt] = lecture.get(fmt, [])
                    lecture[fmt].append((href, title))
                    continue

                # Special case: find preview URLs
                lecture_page = transform_preview_url(href)
                if lecture_page:
                    try:
                        href = get_video(session, lecture_page)
                        lecture['mp4'] = lecture.get('mp4', [])
                        lecture['mp4'].append((fix_url(href), ''))
                    except TypeError:
                        logging.warn(
                            'Could not get resource: %s', lecture_page)

            # Special case: we possibly have hidden video links---thanks to
            # the University of Washington for that.
            if 'mp4' not in lecture:
                for a in vtag.findAll('a'):
                    if a.get('data-modal-iframe'):
                        href = grab_hidden_video_url(
                            session, a['data-modal-iframe'])
                        href = fix_url(href)
                        fmt = 'mp4'
                        logging.debug('    %s %s', fmt, href)
                        if href is not None:
                            lecture[fmt] = lecture.get(fmt, [])
                            lecture[fmt].append((href, ''))


            for fmt in lecture:
                count = len(lecture[fmt])
                for i, r in enumerate(lecture[fmt]):
                    if count == i + 1:
                        # for backward compatibility, we do not add the title
                        # to the filename (format_combine_number_resource and
                        # format_resource)
                        lecture[fmt][i] = (r[0], '')
                    else:
                        # make sure the title is unique
                        lecture[fmt][i] = (r[0], '{0:d}_{1}'.format(i, r[1]))

            lectures.append((vname, lecture))

        sections.append((section_name, lectures))

    logging.info('Found %d sections and %d lectures on this page',
                 len(sections), sum(len(s[1]) for s in sections))

    if sections and reverse:
        sections.reverse()

    if not len(sections):
        logging.error('Probably bad cookies file (or wrong class name)')

    return sections
Example #7
def parse_syllabus(session, page, reverse=False, intact_fnames=False):
    """
    Parses a Coursera course listing/syllabus page.  Each section is a week
    of classes.
    """

    sections = []
    soup = BeautifulSoup(page)

    # traverse sections
    for stag in soup.findAll(
            attrs={'class': re.compile('^course-item-list-header')}):
        assert stag.contents[0] is not None, "couldn't find section"
        untouched_fname = stag.contents[0].contents[1]
        section_name = clean_filename(untouched_fname, intact_fnames)
        logging.info(section_name)
        lectures = []  # resources for 1 lecture

        # traverse resources (e.g., video, ppt, ..)
        for vtag in stag.nextSibling.findAll('li'):
            assert vtag.a.contents[0], "couldn't get lecture name"
            untouched_fname = vtag.a.contents[0]
            vname = clean_filename(untouched_fname, intact_fnames)
            logging.info('  %s', vname)
            lecture = {}
            lecture_page = None

            for a in vtag.findAll('a'):
                href = fix_url(a['href'])
                untouched_fname = a.get('title', '')
                title = clean_filename(untouched_fname, intact_fnames)
                fmt = get_anchor_format(href)
                logging.debug('    %s %s', fmt, href)
                if fmt:
                    lecture[fmt] = lecture.get(fmt, [])
                    lecture[fmt].append((href, title))
                    continue

                # Special case: find preview URLs
                lecture_page = transform_preview_url(href)
                if lecture_page:
                    try:
                        href = get_video(session, lecture_page)
                        lecture['mp4'] = lecture.get('mp4', [])
                        lecture['mp4'].append((fix_url(href), ''))
                    except TypeError:
                        logging.warn('Could not get resource: %s',
                                     lecture_page)

            # Special case: we possibly have hidden video links---thanks to
            # the University of Washington for that.
            if 'mp4' not in lecture:
                for a in vtag.findAll('a'):
                    if a.get('data-modal-iframe'):
                        href = grab_hidden_video_url(session,
                                                     a['data-modal-iframe'])
                        href = fix_url(href)
                        fmt = 'mp4'
                        logging.debug('    %s %s', fmt, href)
                        if href is not None:
                            lecture[fmt] = lecture.get(fmt, [])
                            lecture[fmt].append((href, ''))

            for fmt in lecture:
                count = len(lecture[fmt])
                for i, r in enumerate(lecture[fmt]):
                    if count == i + 1:
                        # for backward compatibility, we do not add the title
                        # to the filename (format_combine_number_resource and
                        # format_resource)
                        lecture[fmt][i] = (r[0], '')
                    else:
                        # make sure the title is unique
                        lecture[fmt][i] = (r[0], '{0:d}_{1}'.format(i, r[1]))

            lectures.append((vname, lecture))

        sections.append((section_name, lectures))

    logging.info('Found %d sections and %d lectures on this page',
                 len(sections), sum(len(s[1]) for s in sections))

    if sections and reverse:
        sections.reverse()

    if not len(sections):
        logging.error('The cookies file may be invalid, '
                      'please re-run with the `--clear-cache` option.')

    return sections
Example #8
 def get_error_rate(self, response):
     self.out_domains.add(get_domain(response.request.url))
     self.crawler.stats.inc_value("no_requests")
     if not self.domain.check_request_url(response.request.url):
         self.crawler.stats.inc_value('no_new_posts')
     self.sum_download_time += response.meta['request_time']
     urls = [response.urljoin(url.strip())
             for url in response.xpath("//a/@href").getall()
             if fix_url(url)]
     for url in urls:
         yield Request(url=url, callback=self.get_error_rate,
                       errback=self.check_error_back_rate)
Example #9
def parse_old_style_syllabus(session, page, reverse=False, intact_fnames=False, subtitle_language="en"):
    """
    Parse an old style Coursera course listing/syllabus page.

    Each section is a week of classes.
    """

    sections = []
    soup = BeautifulSoup(page)

    # traverse sections
    stags = soup.findAll(attrs={"class": re.compile("^course-item-list-header")})
    for stag in stags:
        assert stag.contents[0] is not None, "couldn't find section"
        untouched_fname = stag.contents[0].contents[1]
        section_name = clean_filename(untouched_fname, intact_fnames)
        logging.info(section_name)
        lectures = []  # resources for 1 lecture

        # traverse resources (e.g., video, ppt, ..)
        for vtag in stag.nextSibling.findAll("li"):
            assert vtag.a.contents[0], "couldn't get lecture name"
            untouched_fname = vtag.a.contents[0]
            vname = clean_filename(untouched_fname, intact_fnames)
            logging.info("  %s", vname)
            lecture = {}
            lecture_page = None

            for a in vtag.findAll("a"):
                href = fix_url(a["href"])
                untouched_fname = a.get("title", "")
                title = clean_filename(untouched_fname, intact_fnames)
                fmt = get_anchor_format(href)
                if fmt in ("srt", "txt") and subtitle_language != "en":
                    title = title.replace("_en&format", "_" + subtitle_language + "&format")
                    href = href.replace("_en&format", "_" + subtitle_language + "&format")

                logging.debug("    %s %s", fmt, href)
                if fmt:
                    lecture[fmt] = lecture.get(fmt, [])
                    lecture[fmt].append((href, title))
                    continue

                # Special case: find preview URLs
                lecture_page = transform_preview_url(href)
                if lecture_page:
                    try:
                        href = get_old_style_video(session, lecture_page)
                        lecture["mp4"] = lecture.get("mp4", [])
                        lecture["mp4"].append((fix_url(href), ""))
                    except TypeError:
                        logging.warn("Could not get resource: %s", lecture_page)

            # Special case: we possibly have hidden video links---thanks to
            # the University of Washington for that.
            if "mp4" not in lecture:
                for a in vtag.findAll("a"):
                    if a.get("data-modal-iframe"):
                        href = grab_hidden_video_url(session, a["data-modal-iframe"])
                        href = fix_url(href)
                        fmt = "mp4"
                        logging.debug("    %s %s", fmt, href)
                        if href is not None:
                            lecture[fmt] = lecture.get(fmt, [])
                            lecture[fmt].append((href, ""))

            for fmt in lecture:
                count = len(lecture[fmt])
                for i, r in enumerate(lecture[fmt]):
                    if count == i + 1:
                        # for backward compatibility, we do not add the title
                        # to the filename (format_combine_number_resource and
                        # format_resource)
                        lecture[fmt][i] = (r[0], "")
                    else:
                        # make sure the title is unique
                        lecture[fmt][i] = (r[0], "{0:d}_{1}".format(i, r[1]))

            lectures.append((vname, lecture))

        sections.append((section_name, lectures))

    logging.info("Found %d sections and %d lectures on this page", len(sections), sum(len(s[1]) for s in sections))

    if sections and reverse:
        sections.reverse()

    if not len(sections):
        logging.error("The cookies file may be invalid, " "please re-run with the `--clear-cache` option.")

    return sections
Example #10
def parse_old_style_syllabus(session, page, reverse=False, unrestricted_filenames=False,
                             subtitle_language='en'):
    """
    Parse an old style Coursera course listing/syllabus page.

    Each section is a week of classes.
    """

    sections = []
    soup = BeautifulSoup(page)

    # traverse sections
    stags = soup.findAll(attrs={'class': re.compile('^course-item-list-header')})
    for stag in stags:
        assert stag.contents[0] is not None, "couldn't find section"
        untouched_fname = stag.contents[0].contents[1]
        section_name = clean_filename(untouched_fname, unrestricted_filenames)
        logging.info(section_name)
        lectures = []  # resources for 1 lecture

        # traverse resources (e.g., video, ppt, ..)
        for vtag in stag.nextSibling.findAll('li'):
            assert vtag.a.contents[0], "couldn't get lecture name"
            untouched_fname = vtag.a.contents[0]
            vname = clean_filename(untouched_fname, unrestricted_filenames)
            logging.info('  %s', vname)
            lecture = {}
            lecture_page = None

            for a in vtag.findAll('a'):
                href = fix_url(a['href'])
                untouched_fname = a.get('title', '')
                title = clean_filename(untouched_fname, unrestricted_filenames)
                fmt = get_anchor_format(href)
                if fmt in ('srt', 'txt') and subtitle_language != 'en':
                    title = title.replace('_en&format', '_' + subtitle_language + '&format')
                    href = href.replace('_en&format', '_' + subtitle_language + '&format')

                logging.debug('    %s %s', fmt, href)
                if fmt:
                    lecture[fmt] = lecture.get(fmt, [])
                    lecture[fmt].append((href, title))
                    continue

                # Special case: find preview URLs
                lecture_page = transform_preview_url(href)
                if lecture_page:
                    try:
                        href = get_old_style_video(session, lecture_page)
                        lecture['mp4'] = lecture.get('mp4', [])
                        lecture['mp4'].append((fix_url(href), ''))
                    except TypeError:
                        logging.warning(
                            'Could not get resource: %s', lecture_page)

            # Special case: we possibly have hidden video links---thanks to
            # the University of Washington for that.
            if 'mp4' not in lecture:
                for a in vtag.findAll('a'):
                    if a.get('data-modal-iframe'):
                        href = grab_hidden_video_url(
                            session, a['data-modal-iframe'])
                        href = fix_url(href)
                        fmt = 'mp4'
                        logging.debug('    %s %s', fmt, href)
                        if href is not None:
                            lecture[fmt] = lecture.get(fmt, [])
                            lecture[fmt].append((href, ''))

            for fmt in lecture:
                count = len(lecture[fmt])
                for i, r in enumerate(lecture[fmt]):
                    if count == i + 1:
                        # for backward compatibility, we do not add the title
                        # to the filename (format_combine_number_resource and
                        # format_resource)
                        lecture[fmt][i] = (r[0], '')
                    else:
                        # make sure the title is unique
                        lecture[fmt][i] = (r[0], '{0:d}_{1}'.format(i, r[1]))

            lectures.append((vname, lecture))

        sections.append((section_name, lectures))

    logging.info('Found %d sections and %d lectures on this page',
                 len(sections), sum(len(s[1]) for s in sections))

    if sections and reverse:
        sections.reverse()

    if not len(sections):
        logging.error('The cookies file may be invalid, '
                      'please re-run with the `--clear-cache` option.')

    return sections