Exemplo n.º 1
0
 def test_basics(self):
     """Check `force_protocol` with its default and with explicit protocols."""
     # Table-driven cases: each TESTS entry pairs an input url with the
     # result expected when no protocol argument is given.
     for source, expected in TESTS:
         assert force_protocol(source) == expected

     # A bare protocol name replaces the url's existing scheme.
     as_ftp = force_protocol('http://lemonde.fr?utm_hp_ref=test', 'ftp')
     assert as_ftp == 'ftp://lemonde.fr?utm_hp_ref=test'

     # A protocol given with its trailing '://' yields the same kind of result.
     as_http = force_protocol('ftp://lemonde.fr?utm_hp_ref=test', 'http://')
     assert as_http == 'http://lemonde.fr?utm_hp_ref=test'
Exemplo n.º 2
0
    def __call__(self, url, detailed=False, per_call=False, format='raw'):
        """
        Lazily yield the members scraped from the page at `url`.

        The url is forced to https and converted to its mobile form, the
        members link is extracted from that page, and then every members
        page is fetched in turn by following the "next" link returned by
        `scrape_members` until it is None.

        `detailed` and `per_call` are accepted but not used by this body;
        `format` is only validated against FACEBOOK_OUTPUT_FORMATS.

        Raises:
            TypeError: if `format` is not in FACEBOOK_OUTPUT_FORMATS. Since
                this is a generator, the error surfaces on first iteration.
        """
        if format not in FACEBOOK_OUTPUT_FORMATS:
            raise TypeError('minet.facebook.scrape_comments: unkown `format`.')

        # Reformatting url to hit mobile website
        mobile_url = convert_facebook_url_to_mobile(force_protocol(url, 'https'))

        # The landing page only gives us the link to the first members page
        pending_link = scrape_members_link(self.request_page(mobile_url))

        while True:
            if pending_link is None:
                return

            page = self.request_page(pending_link)

            # Each members page returns its members and the next page link
            pending_link, batch = scrape_members(page)

            yield from batch
Exemplo n.º 3
0
def facebook_comments_action(namespace):
    """
    Scrape the comments found at `namespace.url` and write them as CSV rows.

    The url is forced to https and converted to its mobile form. Pages are
    then fetched breadth-first from a queue: every page may enqueue reply
    urls and a "next" url. Each scraped comment is written through a
    `csv.writer` on the file returned by `open_output_file(namespace.output)`
    while a tqdm loading bar reports progress.

    Args:
        namespace: parsed CLI arguments; this function reads `url` and
            `output` and hands the whole object to `grab_facebook_cookie`.

    Raises:
        Whatever error `request` reports for a page; the loading bar is
        closed before the error propagates.
    """

    # Reformatting url to hit mobile website
    url = force_protocol(namespace.url, 'https')
    url = convert_facebook_url_to_mobile(url)

    # Grabbing cookie
    cookie = grab_facebook_cookie(namespace)

    # Handling output
    # NOTE(review): the output file is not closed here. Whether
    # `open_output_file` can return a shared stream (e.g. stdout) is not
    # visible from this block -- confirm before adding a close().
    output_file = open_output_file(namespace.output)

    writer = csv.writer(output_file)
    writer.writerow(CSV_HEADERS)

    http = create_pool()

    def request_page(target):
        # Fetch `target` with the session cookie and return the decoded body
        error, result = request(http, target, cookie=cookie)

        if error is not None:
            raise error

        return result.data.decode('utf-8')

    # Loading bar
    loading_bar = tqdm(desc='Scraping comments',
                       dynamic_ncols=True,
                       unit=' comments')

    # Queue items are (url, in_reply_to); the root url replies to nothing
    url_queue = deque([(url, None)])

    url_count = 0
    replies_count = 0

    # The bar must be closed even when a request or the scraping fails,
    # otherwise it is left dangling on the terminal.
    try:
        while url_queue:
            current_url, in_reply_to = url_queue.popleft()

            html = request_page(current_url)
            data = scrape_comments(html, in_reply_to)

            url_count += 1

            for reply_url, commented_id in data['replies']:
                url_queue.append((reply_url, commented_id))

            # Pagination stays within the same thread, hence same in_reply_to
            if data['next'] is not None:
                url_queue.append((data['next'], in_reply_to))

            for comment in data['comments']:
                loading_bar.update()
                writer.writerow(format_csv_row(comment))

                if in_reply_to is not None:
                    replies_count += 1

            loading_bar.set_postfix(urls=url_count,
                                    replies=replies_count,
                                    q=len(url_queue))

            # Don't be too greedy
            time.sleep(FACEBOOK_MOBILE_DEFAULT_THROTTLE)
    finally:
        loading_bar.close()
Exemplo n.º 4
0
def convert_url_to_mobile(url):
    """Force `url` to https, then convert it to its Facebook mobile form."""
    return convert_facebook_url_to_mobile(force_protocol(url, 'https'))
Exemplo n.º 5
0
    def __call__(self, url, detailed=False, per_call=False, format='raw'):
        """
        Lazily scrape the comments reachable from `url`.

        The url is forced to https and converted to its mobile form. Pages
        are then fetched breadth-first from a queue: every page may enqueue
        reply urls and a "next" url to continue pagination.

        Args:
            url: url of the page whose comments should be scraped.
            detailed: in `per_call` mode, yield `(details, comments)` tuples
                instead of bare lists, where `details` is a dict with the
                keys 'calls', 'replies' and 'queue_size'.
            per_call: if true, yield one list of comments per fetched page;
                otherwise yield comments one by one.
            format: must belong to FACEBOOK_OUTPUT_FORMATS; with 'csv_row'
                each comment is passed through `format_comment`.

        Yields:
            Single comments when `per_call` is false; per-page lists (or
            `(details, list)` tuples when `detailed`) when it is true.

        Raises:
            TypeError: if `format` is not in FACEBOOK_OUTPUT_FORMATS. Since
                this is a generator, the error surfaces on first iteration.
        """

        # NOTE(review): "unkown" is a typo, kept as-is in case callers match
        # on the message.
        if format not in FACEBOOK_OUTPUT_FORMATS:
            raise TypeError('minet.facebook.scrape_comments: unkown `format`.')

        # Reformatting url to hit mobile website
        url = force_protocol(url, 'https')
        url = convert_facebook_url_to_mobile(url)

        # Queue items are (url, direction, in_reply_to)
        url_queue = deque([(url, None, None)])

        calls = 0
        replies = 0

        while url_queue:
            current_url, direction, in_reply_to = url_queue.popleft()

            html = self.request_page(current_url)

            try:
                data = scrape_comments(html, direction, in_reply_to)
            except TypeError:
                # A page that cannot be processed aborts the whole program
                print('Could not process comment in %s' % current_url,
                      file=sys.stderr)
                sys.exit(1)

            calls += 1

            # Reply threads start without a pagination direction
            for reply_url, commented_id in data['replies']:
                url_queue.append((reply_url, None, commented_id))

            if data['next'] is not None:
                url_queue.append(
                    (data['next'], data['direction'], in_reply_to))

            comments = []

            for comment in data['comments']:
                if in_reply_to is not None:
                    replies += 1

                if format == 'csv_row':
                    comment = format_comment(comment)

                if per_call:
                    comments.append(comment)
                else:
                    yield comment

            # Per-page batches only make sense in `per_call` mode: otherwise
            # the comments were already yielded one by one above, and
            # emitting the (empty) batch would pollute the stream.
            if not per_call:
                continue

            if detailed:
                details = {
                    'calls': calls,
                    'replies': replies,
                    'queue_size': len(url_queue)
                }

                yield details, comments
            else:
                yield comments