Example #1
File: crawl.py Project: rangsutu88/minet
def crawl_action(namespace):

    # Loading crawler definition
    queue_path = join(namespace.output_dir, 'queue')
    definition = load_definition(namespace.crawler)

    if namespace.resume:
        print_err('Resuming crawl...')
    else:
        rmtree(queue_path, ignore_errors=True)

    # Scaffolding output directory
    os.makedirs(namespace.output_dir, exist_ok=True)

    jobs_output_path = join(namespace.output_dir, 'jobs.csv')
    jobs_output, jobs_writer = open_report(jobs_output_path,
                                           JOBS_HEADERS,
                                           resume=namespace.resume)

    # Creating crawler
    crawler = Crawler(definition,
                      throttle=namespace.throttle,
                      queue_path=queue_path)

    reporter_pool = ScraperReporterPool(crawler,
                                        namespace.output_dir,
                                        resume=namespace.resume)

    # Loading bar
    loading_bar = tqdm(desc='Crawling', unit=' pages', dynamic_ncols=True)

    def update_loading_bar(result):
        state = crawler.state

        loading_bar.set_postfix(queue=state.jobs_queued,
                                spider=result.job.spider)
        loading_bar.update()

    # Starting crawler
    crawler.start()

    # Running crawler
    for result in crawler:
        update_loading_bar(result)
        jobs_writer.writerow(format_job_for_csv(result))

        if result.error is not None:
            continue

        reporter_pool.write(result.job.spider, result.scraped)

    loading_bar.close()
    jobs_output.close()
    reporter_pool.close()
Example #2
def crawl_action(cli_args, defer):

    # Loading crawler definition
    queue_path = join(cli_args.output_dir, 'queue')

    if cli_args.resume:
        print_err('Resuming crawl...')
    else:
        rmtree(queue_path, ignore_errors=True)

    # Scaffolding output directory
    os.makedirs(cli_args.output_dir, exist_ok=True)

    jobs_output_path = join(cli_args.output_dir, 'jobs.csv')
    jobs_output, jobs_writer = open_report(jobs_output_path,
                                           JOBS_HEADERS,
                                           resume=cli_args.resume)
    defer(jobs_output.close)

    # Creating crawler
    crawler = Crawler(cli_args.crawler,
                      throttle=cli_args.throttle,
                      queue_path=queue_path)

    reporter_pool = ScraperReporterPool(crawler,
                                        cli_args.output_dir,
                                        resume=cli_args.resume)
    defer(reporter_pool.close)

    # Loading bar
    loading_bar = LoadingBar(desc='Crawling', unit='page')

    def update_loading_bar(result):
        state = crawler.state

        loading_bar.update_stats(queued=state.jobs_queued,
                                 doing=state.jobs_doing + 1,
                                 spider=result.job.spider)
        loading_bar.update()

    # Starting crawler
    crawler.start()

    # Running crawler
    for result in crawler:
        update_loading_bar(result)
        jobs_writer.writerow(format_job_for_csv(result))

        if result.error is not None:
            continue

        reporter_pool.write(result.job.spider, result.scraped)
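Compared with example #1, this variant does not close its resources itself: it hands each cleanup callable to a `defer` callback supplied by the CLI runner. A minimal sketch of what such a mechanism could look like, assuming it is backed by something like `contextlib.ExitStack` (minet's actual implementation may differ, and `cli_args` stands for the parsed CLI namespace):

from contextlib import ExitStack

# Hypothetical driver: cleanups registered through `defer` run in reverse
# order when the action returns or raises.
with ExitStack() as stack:
    crawl_action(cli_args, defer=stack.callback)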
Example #3
    def call(self, route, args={}, tryouts=MAX_TRYOUTS):

        if route not in self.auth:
            self.auth[route] = "user"
        auth = self.auth[route]

        try:
            return self.api[auth].__getattr__("/".join(route.split('.')))(**args)

        except TwitterHTTPError as e:
            # HTTP 429: rate limit reached for the current auth context
            if e.e.code == 429:
                now = time()
                reset = int(e.e.headers["x-rate-limit-reset"])

                if route not in self.waits:
                    self.waits[route] = {"user": now, "app": now}

                self.waits[route][auth] = reset
                print_err("REACHED API LIMITS on %s %s until %s for auth %s" % (route, args, reset, auth))

                # Switch to whichever auth context ("user" or "app") resets soonest
                minwait = sorted(self.waits[route].items(), key=lambda x: x[1])[0]

                if minwait[1] > now:
                    sleeptime = 5 + max(0, int(minwait[1] - now))
                    print_err("  will wait for %s for the next %ss (%s)" % (minwait[0], sleeptime, datetime.fromtimestamp(now + sleeptime).isoformat()[11:19]))
                    sleep(sleeptime)

                self.auth[route] = minwait[0]

                return self.call(route, args, tryouts)

            # Other HTTP errors: retry until the allowed tryouts are exhausted
            elif tryouts:
                return self.call(route, args, tryouts - 1)

            else:
                print_err("ERROR after %s tryouts for %s %s %s" % (self.MAX_TRYOUTS, route, auth, args))
                print_err("%s: %s" % (type(e), e))
Example #4
def collect_top_reactions(data):
    edges = getpath(data, ['top_reactions', 'edges'])

    if edges is None:
        return

    index = {}

    for edge in edges:
        emotion = FACEBOOK_REACTION_KEYS.get(edge['node']['key'])

        if emotion is None:
            print_err('Found unknown emotion %s' % edge)
            continue

        index[emotion] = edge['reaction_count'] or 0

    return index
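A quick usage sketch for the helper above. The payload mirrors the shape the function reads (`top_reactions.edges`, `node.key`, `reaction_count`), but the numeric keys, the counts and the resulting emotion names are illustrative and depend on the project's `FACEBOOK_REACTION_KEYS` mapping:

# Illustrative payload; reaction keys and counts are made up.
sample = {
    'top_reactions': {
        'edges': [
            {'node': {'key': 1}, 'reaction_count': 42},
            {'node': {'key': 2}, 'reaction_count': 7}
        ]
    }
}

index = collect_top_reactions(sample)
# e.g. {'like': 42, 'love': 7}, assuming keys 1 and 2 map to those emotions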
Example #5
def captions_action(namespace, output_file):

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=REPORT_HEADERS
    )

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit=' videos',
    )

    http = create_pool()

    for line, video_id in enricher.cells(namespace.column, with_rows=True):
        url_caption = ''
        url_inf = INFO_URL_TEMPLATE % {'id': video_id}
        err1, info_vid = request(http, url_inf)

        if err1 is not None:
            print_err(err1)
            continue

        # First attempt: extract the captionTracks payload from the video info page
        info_vid_dec = unquote(str(info_vid.data))
        captionsTracks = re.findall(get_info, info_vid_dec)
        if captionsTracks:
            dict_captions = json.loads(captionsTracks[0][0] + '}')['captionTracks']
            for i in range(len(dict_captions)):
                if namespace.lang and namespace.lang == dict_captions[i]['languageCode']:
                    url_caption = dict_captions[i]['baseUrl']
                    break
            if not url_caption and dict_captions:
                url_caption = dict_captions[0]['baseUrl']

        else:
            # Fallback: look for timedtext caption urls in the watch page itself
            url_vid = VIDEO_CALL_TEMPLATE % {'id': video_id}
            urls = []
            time.sleep(0.01)
            err, result = request(http, url_vid)
            timedtext = re.findall(timed, str(result.data))
            for x in timedtext:
                proper_timed = x.replace("\\\\u0026", "&")
                if proper_timed[-2:] == namespace.lang:
                    url_caption = API_BASE_URL % {'temp': proper_timed}
                    break
            if not url_caption and timedtext and not namespace.lang:
                url_caption = API_BASE_URL % {'temp': timedtext[1].replace("\\\\u0026", "&")}
        if not url_caption:
            print_err('no subtitles for {}'.format(video_id))
            continue

        time.sleep(0.01)
        err, result_caption = request(http, url_caption)

        if err is not None:
            print_err(err)
        elif result_caption.status >= 400:
            print_err(f'error, status : {result_caption.status} for id : {video_id}')
            enricher.writerow(line)
        else:
            soup = BeautifulSoup(result_caption.data, 'lxml')

            full_text = []

            caption_text = " ".join(item.get_text() for item in soup.find_all('text'))

            enricher.writerow(line, [caption_text])

        loading_bar.update()
Example #6
    def fetch_facebook_page_stats(url):
        err, response = request(url, cookie='locale=en_US')

        if err:
            return 'http-error', None

        if response.status == 404:
            return 'not-found', None

        if response.status >= 400:
            return 'http-error', None

        html = response.data

        if CAPTCHA in html:
            die(['Rate limit reached!', 'Last url: %s' % url])

        if (CURRENT_AVAILABILITY_DISCLAIMER in html
                or AVAILABILITY_DISCLAIMER in html):
            return 'unavailable', None

        if LOGIN_DISCLAIMER in html:
            return 'private-or-unavailable', None

        # TODO: integrate into ural
        bpost_id = url.rsplit('/', 1)[-1].encode()

        # Extracting metadata
        meta_extractor = re.compile(META_EXTRACTOR_TEMPLATE % bpost_id)

        match = meta_extractor.search(html)

        if match is None:
            return 'extraction-failed', None

        data = json5.loads(match.group(1).decode())
        data = getpath(data, [
            'jsmods', 'pre_display_requires', 0, 3, 1, '__bbox', 'result',
            'data', 'feedback'
        ])

        if data is None:
            return 'extraction-failed', None

        # TODO: remove, this is here as a test
        # TODO: try to find a post where comments are disabled
        if get_count(data['seen_by_count']):
            print_err('Found seen_by_count: %i for %s' %
                      (get_count(data['seen_by_count']), url))

        if 'political_figure_data' in data and data['political_figure_data']:
            print_err('Found political_figure_data:')
            print_err(data['political_figure_data'])

        if get_count(data['reaction_count']) != get_count(data['reactors']):
            print_err('Found different reactions/reactors for %s' % url)

        # Extracting data from hidden html
        hidden_html_extractor = re.compile(HTML_EXTRACTOR_TEMPLATE % bpost_id)
        match = hidden_html_extractor.search(html)

        if match is not None:
            hidden_html = match.group(1).decode()
            soup = BeautifulSoup(hidden_html, 'lxml')

            # Sometimes fetching a post behaves weirdly
            if soup.select_one('h5 a') is None:
                return 'extraction-failed', None

            data['scraped'] = {}

            timestamp_elem = soup.select_one('[data-utime]')
            timestamp = int(timestamp_elem.get('data-utime'))

            data['scraped']['account_name'] = soup.select_one(
                'h5 a').get_text().strip()
            data['scraped']['timestamp'] = timestamp
            data['scraped']['time'] = datetime.fromtimestamp(
                timestamp).isoformat()

            # TODO: use a context manager
            try:
                data['scraped']['aria_label'] = timestamp_elem.parent.get(
                    'aria-label')
            except:
                pass

            try:
                data['scraped']['text'] = soup.select_one(
                    '[data-testid="post_message"]').get_text()
            except:
                pass

            # try:
            #     data['scraped']['link'] = soup.select_one('[data-lynx-uri]').get('href')
            # except:
            #     pass

        return None, data
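Like several helpers in these examples, the function follows an `(error, data)` return convention: the first member is a short error label or `None`, the second the extracted feedback payload. A hedged driver sketch; the url below is a placeholder, not a real post:

# Placeholder url for illustration only.
url = 'https://www.facebook.com/somepage/posts/1234567890'

error, data = fetch_facebook_page_stats(url)

if error is not None:
    print_err('%s for %s' % (error, url))
else:
    # `data` holds the feedback counts, plus a `scraped` dict when the
    # hidden html could be parsed.
    print_err('reactions: %i' % get_count(data['reaction_count']))
    print_err('account: %s' % data.get('scraped', {}).get('account_name'))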
Example #7
    def action(namespace, output_file):

        # Do we need to resume?
        need_to_resume = False

        if getattr(namespace, "resume", False):
            need_to_resume = True

            if namespace.output is None:
                die(
                    "Cannot --resume without knowing the output (use -o/--output rather stdout).",
                )

            if namespace.sort_by != "date":
                die("Cannot --resume if --sort_by is not `date`.")

            if namespace.format != "csv":
                die("Cannot --resume jsonl format yet.")

            with open(namespace.output, "r") as f:
                resume_reader = casanova.reader(f)

                last_cell = None
                resume_loader = tqdm(desc="Resuming", unit=" lines")

                for cell in resume_reader.cells("datetime"):
                    resume_loader.update()
                    last_cell = cell

                resume_loader.close()

                if last_cell is not None:
                    last_date = last_cell.replace(" ", "T")
                    namespace.end_date = last_date

                    print_err("Resuming from: %s" % last_date)

        # Loading bar
        loading_bar = tqdm(
            desc="Fetching %s" % item_name,
            dynamic_ncols=True,
            unit=" %s" % item_name,
            total=namespace.limit,
        )

        if namespace.format == "csv":
            writer = csv.writer(output_file)

            if not need_to_resume:
                writer.writerow(
                    csv_headers(namespace) if callable(csv_headers) else csv_headers
                )
        else:
            writer = ndjson.writer(output_file)

        client = CrowdTangleClient(namespace.token,
                                   rate_limit=namespace.rate_limit)

        args = []

        if callable(get_args):
            args = get_args(namespace)

        create_iterator = getattr(client, method_name)
        iterator = create_iterator(
            *args,
            partition_strategy=getattr(namespace, "partition_strategy", None),
            limit=namespace.limit,
            format="csv_row" if namespace.format == "csv" else "raw",
            per_call=True,
            detailed=True,
            namespace=namespace)

        try:
            for details, items in iterator:
                if details is not None:
                    loading_bar.set_postfix(**details)

                for item in items:
                    writer.writerow(item)

                loading_bar.update(len(items))

        except CrowdTangleInvalidTokenError:
            loading_bar.close()
            die([
                "Your API token is invalid.",
                "Check that you indicated a valid one using the `--token` argument.",
            ])

        loading_bar.close()
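Examples #7 to #11 are not standalone functions: `item_name`, `csv_headers`, `get_args`, `announce` and `method_name` are captured from an enclosing factory that builds one CLI action per CrowdTangle endpoint. A rough sketch of that shape; the factory name, its parameters and the `POSTS_CSV_HEADERS` constant are assumptions made for illustration, not minet's actual API:

POSTS_CSV_HEADERS = ['ct_id', 'date', 'message']  # placeholder headers


def make_crowdtangle_action(method_name, item_name, csv_headers,
                            get_args=None, announce=None):
    # Hypothetical factory: the closure below captures the arguments above.

    def action(namespace, output_file):
        ...  # body as in the examples above and below

    return action


# One action per endpoint, e.g.:
posts_action = make_crowdtangle_action('posts', 'posts', POSTS_CSV_HEADERS)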
Example #8
File: utils.py Project: lebelgique/minet
    def action(namespace, output_file):

        # Do we need to resume?
        need_to_resume = False

        if getattr(namespace, 'resume', False):
            need_to_resume = True

            if namespace.output is None:
                die(
                    'Cannot --resume without knowing the output (use -o/--output rather than stdout).',
                )

            if namespace.sort_by != 'date':
                die('Cannot --resume if --sort_by is not `date`.')

            if namespace.format != 'csv':
                die('Cannot --resume jsonl format yet.')

            with open(namespace.output, 'r', encoding='utf-8') as f:
                resume_reader = casanova.reader(f)

                last_cell = None
                resume_loader = tqdm(desc='Resuming', unit=' lines')

                for cell in resume_reader.cells('datetime'):
                    resume_loader.update()
                    last_cell = cell

                resume_loader.close()

                if last_cell is not None:
                    last_date = last_cell.replace(' ', 'T')
                    namespace.end_date = last_date

                    print_err('Resuming from: %s' % last_date)

        if callable(announce):
            print_err(announce(namespace))

        # Loading bar
        loading_bar = tqdm(desc='Fetching %s' % item_name,
                           dynamic_ncols=True,
                           unit=' %s' % item_name,
                           total=namespace.limit)

        if namespace.format == 'csv':
            writer = csv.writer(output_file)

            if not need_to_resume:
                writer.writerow(
                    csv_headers(namespace) if callable(csv_headers) else csv_headers
                )
        else:
            writer = ndjson.writer(output_file)

        client = CrowdTangleAPIClient(namespace.token,
                                      rate_limit=namespace.rate_limit)

        args = []

        if callable(get_args):
            args = get_args(namespace)

        def before_sleep(retry_state):
            exc = retry_state.outcome.exception()

            if isinstance(exc, CrowdTangleRateLimitExceeded):
                reason = 'Call failed because of rate limit!'

            elif isinstance(exc, CrowdTangleInvalidJSONError):
                reason = 'Call failed because of invalid JSON payload!'

            else:
                reason = 'Call failed because of server timeout!'

            tqdm.write(
                '%s\nWill wait for %s before attempting again.' %
                (reason,
                 prettyprint_seconds(retry_state.idle_for, granularity=2)),
                file=sys.stderr)

        create_iterator = getattr(client, method_name)
        iterator = create_iterator(
            *args,
            partition_strategy=getattr(namespace, 'partition_strategy', None),
            limit=namespace.limit,
            format='csv_row' if namespace.format == 'csv' else 'raw',
            per_call=True,
            detailed=True,
            namespace=namespace,
            before_sleep=before_sleep)

        try:
            for details, items in iterator:
                if details is not None:
                    loading_bar.set_postfix(**details)

                for item in items:
                    writer.writerow(item)

                loading_bar.update(len(items))

        except CrowdTangleInvalidTokenError:
            loading_bar.close()
            die([
                'Your API token is invalid.',
                'Check that you indicated a valid one using the `--token` argument.'
            ])

        loading_bar.close()
Example #9
    def action(namespace, output_file):

        # Do we need to resume?
        need_to_resume = False

        if getattr(namespace, 'resume', False):
            need_to_resume = True

            if namespace.output is None:
                die(
                    'Cannot --resume without knowing the output (use -o/--output rather than stdout).',
                )

            if namespace.sort_by != 'date':
                die('Cannot --resume if --sort_by is not `date`.')

            if namespace.format != 'csv':
                die('Cannot --resume jsonl format yet.')

            with open(namespace.output, 'r') as f:
                resume_reader = casanova.reader(f)

                last_cell = None
                resume_loader = tqdm(desc='Resuming', unit=' lines')

                for cell in resume_reader.cells('datetime'):
                    resume_loader.update()
                    last_cell = cell

                resume_loader.close()

                if last_cell is not None:
                    last_date = last_cell.replace(' ', 'T')
                    namespace.end_date = last_date

                    print_err('Resuming from: %s' % last_date)

        # Loading bar
        loading_bar = tqdm(desc='Fetching %s' % item_name,
                           dynamic_ncols=True,
                           unit=' %s' % item_name,
                           total=namespace.limit)

        if namespace.format == 'csv':
            writer = csv.writer(output_file)

            if not need_to_resume:
                writer.writerow(
                    csv_headers(namespace) if callable(csv_headers) else csv_headers
                )
        else:
            writer = ndjson.writer(output_file)

        client = CrowdTangleClient(namespace.token,
                                   rate_limit=namespace.rate_limit)

        args = []

        if callable(get_args):
            args = get_args(namespace)

        create_iterator = getattr(client, method_name)
        iterator = create_iterator(
            *args,
            partition_strategy=getattr(namespace, 'partition_strategy', None),
            limit=namespace.limit,
            format='csv_row' if namespace.format == 'csv' else 'raw',
            per_call=True,
            detailed=True,
            namespace=namespace)

        try:
            for details, items in iterator:
                if details is not None:
                    loading_bar.set_postfix(**details)

                for item in items:
                    writer.writerow(item)

                loading_bar.update(len(items))

        except CrowdTangleInvalidTokenError:
            loading_bar.close()
            die([
                'Your API token is invalid.',
                'Check that you indicated a valid one using the `--token` argument.'
            ])

        loading_bar.close()
Example #10
File: utils.py Project: medialab/minet
    def action(cli_args):

        resume = getattr(cli_args, 'resume', False)

        # Validation
        if resume:
            if cli_args.sort_by != 'date':
                die('Cannot --resume if --sort_by is not `date`.')

            if cli_args.format != 'csv':
                die('Cannot --resume jsonl format yet.')

        if cli_args.format == 'csv':
            fieldnames = csv_headers(cli_args) if callable(csv_headers) else csv_headers
            writer = casanova.writer(cli_args.output, fieldnames)
        else:
            writer = ndjson.writer(cli_args.output)

        # Acquiring state from resumer
        if getattr(cli_args, 'resume', False):
            last_date = cli_args.output.pop_state()

            if last_date is not None:
                cli_args.end_date = last_date.replace(' ', 'T')
                print_err('Resuming from: %s' % cli_args.end_date)

        if callable(announce):
            print_err(announce(cli_args))

        # Loading bar
        loading_bar = LoadingBar(desc='Fetching %s' % item_name,
                                 unit=item_name[:-1],
                                 total=cli_args.limit)

        args = []

        if callable(get_args):
            args = get_args(cli_args)

        client = CrowdTangleAPIClient(cli_args.token,
                                      rate_limit=cli_args.rate_limit)

        create_iterator = getattr(client, method_name)
        iterator = create_iterator(*args,
                                   limit=cli_args.limit,
                                   raw=cli_args.format != 'csv',
                                   per_call=True,
                                   detailed=True,
                                   namespace=cli_args)

        try:
            for details, items in iterator:
                loading_bar.update(len(items))

                if details is not None:
                    loading_bar.update_stats(**details)

                for item in items:
                    if cli_args.format == 'csv':
                        item = item.as_csv_row()

                    writer.writerow(item)

        except CrowdTangleInvalidTokenError:
            loading_bar.die([
                'Your API token is invalid.',
                'Check that you indicated a valid one using the `--token` argument.'
            ])
Example #11
    def action(cli_args):

        resume = getattr(cli_args, 'resume', False)

        # Validation
        if resume:
            if cli_args.sort_by != 'date':
                die('Cannot --resume if --sort_by is not `date`.')

            if cli_args.format != 'csv':
                die('Cannot --resume jsonl format yet.')

        if cli_args.format == 'csv':
            fieldnames = csv_headers(cli_args) if callable(csv_headers) else csv_headers
            writer = casanova.writer(cli_args.output, fieldnames)
        else:
            writer = ndjson.writer(cli_args.output)

        # Acquiring state from resumer
        if getattr(cli_args, 'resume', False):
            last_date = cli_args.output.pop_state()

            if last_date is not None:
                cli_args.end_date = last_date.replace(' ', 'T')
                print_err('Resuming from: %s' % cli_args.end_date)

        if callable(announce):
            print_err(announce(cli_args))

        # Loading bar
        loading_bar = LoadingBar(desc='Fetching %s' % item_name,
                                 unit=item_name[:-1],
                                 total=cli_args.limit)

        client = CrowdTangleAPIClient(cli_args.token,
                                      rate_limit=cli_args.rate_limit)

        args = []

        if callable(get_args):
            args = get_args(cli_args)

        def before_sleep(retry_state):
            exc = retry_state.outcome.exception()

            if isinstance(exc, CrowdTangleRateLimitExceeded):
                reason = 'Call failed because of rate limit!'

            elif isinstance(exc, CrowdTangleInvalidJSONError):
                reason = 'Call failed because of invalid JSON payload!'

            else:
                reason = 'Call failed because of server timeout!'

            loading_bar.print(
                '%s\nWill wait for %s before attempting again.' %
                (reason,
                 prettyprint_seconds(retry_state.idle_for, granularity=2)))

        create_iterator = getattr(client, method_name)
        iterator = create_iterator(*args,
                                   limit=cli_args.limit,
                                   raw=cli_args.format != 'csv',
                                   per_call=True,
                                   detailed=True,
                                   namespace=cli_args,
                                   before_sleep=before_sleep)

        try:
            for details, items in iterator:
                loading_bar.update(len(items))

                if details is not None:
                    loading_bar.update_stats(**details)

                for item in items:
                    if cli_args.format == 'csv':
                        item = item.as_csv_row()

                    writer.writerow(item)

        except CrowdTangleInvalidTokenError:
            loading_bar.die([
                'Your API token is invalid.',
                'Check that you indicated a valid one using the `--token` argument.'
            ])