示例#1
0
def query_tweets(query,
                 limit=None,
                 begindate=dt.date(2006, 3, 21),
                 enddate=None,
                 poolsize=20,
                 lang='',
                 use_proxies=False):
    """Query Twitter for tweets matching ``query`` between two dates.

    The date range is split into ``poolsize`` sub-ranges, each scraped in
    parallel by a worker process via ``query_tweets_once``.

    :param query: search query string.
    :param limit: approximate overall maximum number of tweets, or None
        for no limit.
    :param begindate: first date of the range (inclusive).
    :param enddate: last date of the range; None means "today", resolved
        at call time.  (The original default ``dt.date.today()`` was
        evaluated once at import time and went stale in long-running
        processes.)
    :param poolsize: number of worker processes.
    :param lang: language filter forwarded to ``query_tweets_once``.
    :param use_proxies: whether workers should route requests through
        proxies.
    :return: list of all tweets gathered so far (also on interrupt).
    """
    if enddate is None:
        enddate = dt.date.today()

    no_days = (enddate - begindate).days

    if no_days < 0:
        sys.exit('Begin date must occur before end date.')

    # Since we are assigning each pool a range of dates to query,
    # the number of pools should not exceed the number of dates.
    # Never drop below 1 though: Pool(0) raises ValueError.
    poolsize = max(1, min(poolsize, no_days))

    dateranges = [
        begindate + dt.timedelta(days=elem)
        for elem in linspace(0, no_days, poolsize + 1)
    ]

    # poolsize is guaranteed >= 1 here, so only `limit` needs checking.
    if limit:
        limit_per_pool = (limit // poolsize) + 1
    else:
        limit_per_pool = None

    # With a single worker and no proxies, pause between requests to
    # avoid an IP ban by Twitter.
    throttled = poolsize == 1 and not use_proxies
    queries = [
        '{} since:{} until:{}'.format(query, since, until)
        for since, until in zip(dateranges[:-1], dateranges[1:])
    ]

    all_tweets = []
    pool = None  # keeps the finally block safe if Pool() itself raises
    try:
        pool = Pool(poolsize)
        logger.info('queries: {}'.format(queries))
        try:
            for new_tweets in pool.imap_unordered(
                    partial(query_tweets_once,
                            throttled=throttled,
                            limit=limit_per_pool,
                            lang=lang,
                            use_proxies=use_proxies), queries):
                all_tweets.extend(new_tweets)
                logger.info('Got {} tweets ({} new).'.format(
                    len(all_tweets), len(new_tweets)))
        except KeyboardInterrupt:
            logger.info('Program interrupted by user. Returning all tweets '
                        'gathered so far.')
    finally:
        if pool is not None:
            pool.close()
            pool.join()

    return all_tweets
示例#2
0
class MultiprocessingDistributor(DistributorBaseClass):
    """
    Distributor that runs jobs in parallel on the local machine, backed by a
    multiprocessing Pool.
    """
    def __init__(self,
                 n_workers,
                 disable_progressbar=False,
                 progressbar_title="Feature Extraction",
                 show_warnings=True):
        """
        Creates a new MultiprocessingDistributor instance

        :param n_workers: How many workers should the multiprocessing pool have?
        :type n_workers: int
        :param disable_progressbar: whether to show a progressbar or not.
        :type disable_progressbar: bool
        :param progressbar_title: the title of the progressbar
        :type progressbar_title: basestring
        :param show_warnings: whether to show warnings or not.
        :type show_warnings: bool
        """
        self.n_workers = n_workers
        self.disable_progressbar = disable_progressbar
        self.progressbar_title = progressbar_title
        # Each worker is initialized once with the warning configuration.
        self.pool = Pool(processes=n_workers,
                         initializer=initialize_warnings_in_workers,
                         initargs=(show_warnings, ))

    def distribute(self, func, partitioned_chunks, kwargs):
        """
        Calculates the features in a parallel fashion by distributing the map command to a thread pool

        :param func: the function to send to each worker.
        :type func: callable
        :param partitioned_chunks: The list of data chunks - each element is again
            a list of chunks - and should be processed by one worker.
        :type partitioned_chunks: iterable
        :param kwargs: parameters for the map function
        :type kwargs: dict of string to parameter

        :return: The result of the calculation as a list - each item should be the result of the application of func
            to a single element.
        """
        # Bind the keyword arguments once; results arrive in completion order.
        worker_func = partial(func, **kwargs)
        return self.pool.imap_unordered(worker_func, partitioned_chunks)

    def close(self):
        """
        Collects the result from the workers and closes the thread pool.
        """
        self.pool.close()
        self.pool.terminate()
        self.pool.join()
示例#3
0
def query_tweets(query,
                 limit=None,
                 begindate=dt.date(2006, 3, 21),
                 enddate=None,
                 poolsize=20,
                 lang=''):
    """Query Twitter for tweets matching ``query`` between two moments.

    The time range is split into ``poolsize`` slices expressed as unix
    timestamps (``since_time``/``until_time``), each scraped in parallel.

    :param query: search query string.
    :param limit: approximate overall maximum number of tweets, or None.
    :param begindate: start of the range; a ``date`` is treated as
        midnight of that day.
    :param enddate: end of the range; None means "now", resolved at call
        time.  (The original default ``dt.datetime.now()`` was evaluated
        once at import time and went stale; it also mismatched the
        ``date``-typed begindate default, making ``enddate - begindate``
        raise TypeError.)
    :param poolsize: number of worker processes.
    :param lang: language filter forwarded to ``query_tweets_once``.
    :return: list of all tweets gathered so far (also on interrupt).
    """
    if enddate is None:
        enddate = dt.datetime.now()

    # Normalize plain dates to datetimes so the subtraction below is
    # well-defined and second-resolution arithmetic works.
    if isinstance(begindate, dt.date) and not isinstance(begindate, dt.datetime):
        begindate = dt.datetime.combine(begindate, dt.time.min)
    if isinstance(enddate, dt.date) and not isinstance(enddate, dt.datetime):
        enddate = dt.datetime.combine(enddate, dt.time.min)

    # BUG FIX: timedelta.seconds is only the 0-86399 within-day component;
    # total_seconds() covers the full span including whole days.
    no_secs = int((enddate - begindate).total_seconds())

    if no_secs < 0:
        sys.exit('Begin date must occur before end date.')

    # Since we are assigning each pool a range of dates to query,
    # the number of pools should not exceed the number of dates.
    # Never drop below 1 though: Pool(0) raises ValueError.
    poolsize = max(1, min(poolsize, no_secs))

    dateranges = [
        begindate + dt.timedelta(seconds=elem)
        for elem in linspace(0, no_secs, poolsize + 1)
    ]

    # poolsize is guaranteed >= 1 here, so only `limit` needs checking.
    if limit:
        limit_per_pool = (limit // poolsize) + 1
    else:
        limit_per_pool = None

    queries = [
        '{} since_time:{} until_time:{}'.format(
            query, int(time.mktime(since.timetuple())),
            int(time.mktime(until.timetuple())))
        for since, until in zip(dateranges[:-1], dateranges[1:])
    ]

    all_tweets = []
    pool = None  # keeps the finally block safe if Pool() itself raises
    try:
        pool = Pool(poolsize)
        logger.info('queries: {}'.format(queries))
        try:
            for new_tweets in pool.imap_unordered(
                    partial(query_tweets_once, limit=limit_per_pool,
                            lang=lang), queries):
                all_tweets.extend(new_tweets)
                logger.info('Got {} tweets ({} new).'.format(
                    len(all_tweets), len(new_tweets)))
        except KeyboardInterrupt:
            logger.info('Program interrupted by user. Returning all tweets '
                        'gathered so far.')
    finally:
        if pool is not None:
            pool.close()
            pool.join()

    return all_tweets
示例#4
0
def query_tweets_parallel(query,
                          limit=None,
                          begindate=dt.date(2006, 3, 21),
                          enddate=None,
                          poolsize=20,
                          lang='',
                          use_proxy=False):
    """Query Twitter for tweets matching ``query`` between two dates.

    The date range is split into ``poolsize`` sub-ranges, each scraped in
    parallel by a worker process via ``query_tweets_once``.

    :param query: search query string.
    :param limit: approximate overall maximum number of tweets, or None.
    :param begindate: first date of the range (inclusive).
    :param enddate: last date of the range; None means "today", resolved
        at call time (the original default was evaluated at import time).
    :param poolsize: number of worker processes.
    :param lang: language filter forwarded to ``query_tweets_once``.
    :param use_proxy: whether workers should route requests through a
        proxy.  BUG FIX: the original body referenced an undefined name
        ``use_proxy`` and raised NameError on every call; it is now a
        proper (backward-compatible) keyword parameter.
    :return: list of all tweets gathered so far (also on interrupt).
    """
    if enddate is None:
        enddate = dt.date.today()

    number_days = (enddate - begindate).days

    # Consistency with the other query_tweets variants: reject inverted
    # ranges explicitly instead of producing an empty/odd schedule.
    if number_days < 0:
        sys.exit('Begin date must occur before end date.')

    # Since we are assigning each pool a range of dates to query,
    # the number of pools should not exceed the number of dates.
    # Never drop below 1 though: Pool(0) raises ValueError.
    poolsize = max(1, min(poolsize, number_days))

    dateranges = [
        begindate + dt.timedelta(days=elem)
        for elem in linspace(0, number_days, poolsize + 1)
    ]

    # poolsize is guaranteed >= 1 here, so only `limit` needs checking.
    if limit:
        limit_per_pool = (limit // poolsize) + 1
    else:
        limit_per_pool = None

    queries = [
        '{} since:{} until:{}'.format(query, since, until)
        for since, until in zip(dateranges[:-1], dateranges[1:])
    ]

    all_tweets = []
    pool = None  # keeps the finally block safe if Pool() itself raises
    try:
        pool = Pool(poolsize)
        logger.info('queries: {}'.format(queries))
        try:
            for new_tweets in pool.imap_unordered(
                    partial(query_tweets_once,
                            limit=limit_per_pool,
                            lang=lang,
                            use_proxy=use_proxy), queries):
                all_tweets.extend(new_tweets)
                logger.info('Got {} tweets ({} new).'.format(
                    len(all_tweets), len(new_tweets)))
        except KeyboardInterrupt:
            logger.info('Program interrupted by user. Returning all tweets '
                        'gathered so far.')
    finally:
        if pool is not None:
            pool.close()
            pool.join()

    return all_tweets
示例#5
0
def query_tweets(query,
                 limit=None,
                 begindate=dt.date(2006, 3, 21),
                 enddate=None,
                 poolsize=20,
                 lang=''):
    """Query Twitter for tweets matching ``query`` between two dates.

    The date range is split into ``poolsize`` sub-ranges, each scraped in
    parallel by a worker process via ``query_tweets_once``.

    :param query: search query string.
    :param limit: approximate overall maximum number of tweets, or None.
    :param begindate: first date of the range (inclusive).
    :param enddate: last date of the range; None means "today", resolved
        at call time.  (The original default ``dt.date.today()`` was
        evaluated once at import time and went stale in long-running
        processes.)
    :param poolsize: number of worker processes.
    :param lang: language filter forwarded to ``query_tweets_once``.
    :return: list of all tweets gathered.
    """
    if enddate is None:
        enddate = dt.date.today()

    no_days = (enddate - begindate).days

    if no_days < 0:
        sys.exit('Begin date must occur before end date.')

    # One worker per day at most, but never fewer than one:
    # Pool(0) raises ValueError.
    poolsize = max(1, min(poolsize, no_days))

    dateranges = [
        begindate + dt.timedelta(days=elem)
        for elem in linspace(0, no_days, poolsize + 1)
    ]

    # poolsize is guaranteed >= 1 here, so only `limit` needs checking.
    if limit:
        limit_per_pool = (limit // poolsize) + 1
    else:
        limit_per_pool = None

    queries = [
        '{} since:{} until:{}'.format(query, since, until)
        for since, until in zip(dateranges[:-1], dateranges[1:])
    ]

    all_tweets = []
    pool = None  # keeps the finally block safe if Pool() itself raises
    try:
        pool = Pool(poolsize)
        for new_tweets in pool.imap_unordered(
                partial(query_tweets_once, limit=limit_per_pool, lang=lang),
                queries):
            all_tweets.extend(new_tweets)
    finally:
        if pool is not None:
            pool.close()
            pool.join()

    return all_tweets
    def scrape(self, keywords):
        """Scrape tweets matching ``keywords`` over the last 14 days.

        The two-week window is split into up to 20 day-sized slices that
        are fetched in parallel through ``self.get_tweets``.

        NOTE(review): this method overwrites ``self.limit`` with the
        per-slice limit on every call, so repeated calls shrink the
        effective limit -- presumably ``get_tweets`` reads ``self.limit``;
        verify against that method before changing this side effect.

        :param keywords: iterable of words joined into one search query.
        :return: list of all tweets gathered (also on interrupt).
        """
        gathered = []
        pool_size = 20

        start_date = dt.date.today() - dt.timedelta(14)
        query = " ".join(keywords)

        # One worker per day at most.
        no_of_days = (dt.date.today() - start_date).days
        if no_of_days < pool_size:
            pool_size = no_of_days

        date_ranges = [
            start_date + dt.timedelta(days=offset)
            for offset in np.linspace(0, no_of_days, pool_size + 1)
        ]

        # Spread the overall limit across the slices (instance side effect
        # preserved from the original implementation).
        if self.limit and pool_size:
            self.limit = (self.limit // pool_size) + 1

        queries = [
            "{} since:{} until:{}".format(query, lower, upper)
            for lower, upper in zip(date_ranges[:-1], date_ranges[1:])
        ]

        pool = Pool(pool_size)
        logging.info("queries: {}".format(queries))

        try:
            for batch in pool.imap_unordered(self.get_tweets, queries):
                gathered.extend(batch)
        except KeyboardInterrupt:
            logging.info(
                "Program interrupted by user. Returning all tweets gathered so far."
            )
        finally:
            pool.close()
            pool.join()

        return gathered