Example #1
def predict(self, document_frame):
    """Predict target values as the mean over the `knn` most similar training docs."""
    logger.info('Predicting {} values'.format(len(document_frame)))
    values = self.transform(document_frame)
    results = np.zeros((len(values), self.trainY.shape[1]))
    logger.info('Finding {} nearest neighbors'.format(self.knn))
    for idx in range(len(values)):
        vector = values[idx, :]
        # the `knn` most similar training documents by doctag
        returns = self.model.docvecs.most_similar(positive=[vector],
                                                  topn=self.knn)
        indices = [doctag for doctag, sim in returns]
        # the prediction is the column-wise mean of the neighbors' targets
        mean_vals = self.trainY.loc[indices, :].mean()
        results[idx, :] = mean_vals
        progressbar(idx, len(values), logger=logger)
    return results
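
A minimal, self-contained sketch of the idea behind predict(): average the targets of the `topn` most similar training documents. The corpus, doctags, and targets below are made up, and the gensim 3.x API (`model.docvecs`) is assumed to match the example; gensim 4.x renames it to `model.dv`.

import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Tiny made-up corpus tagged like the example's 'author/permalink' doctags.
docs = [TaggedDocument(['good', 'post'], tags=['a/0']),
        TaggedDocument(['bad', 'post'], tags=['a/1']),
        TaggedDocument(['great', 'article'], tags=['a/2'])]
model = Doc2Vec(docs, vector_size=8, min_count=1, epochs=20)
train_y = {'a/0': 1.0, 'a/1': 0.0, 'a/2': 1.0}  # toy targets per doctag

# kNN regression: infer a vector, look up the 2 nearest training documents,
# and predict the mean of their targets.
vector = model.infer_vector(['nice', 'post'])
neighbors = model.docvecs.most_similar(positive=[vector], topn=2)
print(np.mean([train_y[tag] for tag, _sim in neighbors]))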
Example #2
def transform(self, document_frame):
    """Return an (n_docs, vector_size) matrix of document vectors.

    Vectors of documents seen during training are looked up directly;
    vectors of unseen documents are inferred from their tokens.
    """
    dim = self.model.vector_size
    inputs = np.zeros((len(document_frame), dim))
    logger.info('Transforming documents into matrix of '
                'shape {}'.format(inputs.shape))
    tagged_docs = self.create_tagged_documents(document_frame)
    for kdx, (author, permalink) in enumerate(
            zip(document_frame.author, document_frame.permalink)):
        try:
            inputs[kdx, :] = self.model.docvecs[author + '/' + permalink]
        except KeyError:
            # unseen test document: infer its vector from the tokens
            inputs[kdx, :] = self.model.infer_vector(
                tagged_docs[kdx].words, steps=self.infer_steps)
        progressbar(kdx, len(inputs), logger=logger)
    return inputs
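
The lookup-or-infer fallback in transform(), isolated as a sketch reusing the toy `model` from the sketch under Example #1. Note that gensim 4.x renames `docvecs` to `dv` and `infer_vector`'s `steps` argument to `epochs`.

tag = 'a/0'  # a doctag seen during training
try:
    vec = model.docvecs[tag]  # known document: direct lookup
except KeyError:
    # unseen document: infer a vector from its tokens instead
    vec = model.infer_vector(['some', 'tokens'], steps=10)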
Example #3
def get_all_posts_between_parallel(start_datetime, end_datetime, steem,
                                   stop_after=None, ncores=8,
                                   chunksize=20, timeout=1200):
    """As above but in parallel with `ncores` jobs of `chunksize`.

    Waits for posts until `timeout`.
    """
    start_num, block_start_datetime = find_nearest_block_num(start_datetime, steem)
    end_num, block_end_datetime = find_nearest_block_num(end_datetime, steem)

    logger.info('Querying IN PARALLEL with {} cores all posts between '
                '{} (block {}) and {} (block {})'.format(ncores,
                                                         block_start_datetime,
                                                         start_num,
                                                         block_end_datetime,
                                                         end_num))
    block_nums = list(range(start_num, end_num + 1))
    chunks = [block_nums[irun: irun + chunksize]
              for irun in range(0, len(block_nums), chunksize)]

    ctx = mp.get_context('spawn')
    pool = ctx.Pool(ncores, initializer=config_mp_logging)

    async_results = []
    for idx, chunk in enumerate(chunks):
        result = pool.apply_async(_get_all_posts_for_blocks_parallel,
                                  args=(chunk, steem,
                                        stop_after))
        async_results.append(result)
        if stop_after is not None and idx >= stop_after:
            break

    pool.close()

    posts = []
    terminate = False
    for kdx, async_result in enumerate(async_results):
        try:
            new_posts = async_result.get(timeout=timeout)
            posts.extend(new_posts)
            if progressbar(kdx, len(chunks), percentage_step=5, logger=logger):
                logger.info('Finished chunk {} '
                            'out of {} found so far {} '
                            'posts...'.format(kdx + 1, len(chunks), len(posts)))
        except Exception:
            logger.exception('Timeout or error while waiting for worker results!')
            terminate = True

    if terminate:
        logger.error('Terminating pool due to timeout or errors')
        pool.terminate()
    pool.join()
    return posts
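
The pool pattern above, reduced to a runnable, Steem-free sketch: submit chunks with apply_async, close the pool, then collect results in submission order with get(timeout=...), so a hung worker only stalls its own chunk. The worker function and data are made up.

import multiprocessing as mp

def square_chunk(chunk):
    return [x * x for x in chunk]

if __name__ == '__main__':
    ctx = mp.get_context('spawn')
    pool = ctx.Pool(2)
    chunks = [[1, 2], [3, 4], [5, 6]]
    async_results = [pool.apply_async(square_chunk, args=(c,))
                     for c in chunks]
    pool.close()  # no further tasks; workers exit when done
    results = []
    for async_result in async_results:
        results.extend(async_result.get(timeout=60))
    pool.join()
    print(results)  # [1, 4, 9, 16, 25, 36]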
Example #4
def check_all_ops_between(start_datetime,
                          end_datetime,
                          steem,
                          account,
                          stop_after=None):
    """ Queries all posts found in blocks between start and end

    Parameters
    ----------
    start_datetime: datetime
    end_datetime: datetime
    steem: Steem
    account: str
    stop_after: int or None
        For debugging

    Returns
    -------
    List of dicts of posts

    """
    start_num, block_start_datetime = tpbg.find_nearest_block_num(
        start_datetime, steem)
    end_num, block_end_datetime = tpbg.find_nearest_block_num(
        end_datetime, steem)

    total = end_num - start_num
    comment_authors_and_permalinks = []
    logger.info('Checking all operations for account {} between '
                '{} (block {}) and {} (block {})'.format(
                    account, block_start_datetime, start_num,
                    block_end_datetime, end_num))

    for idx, block_num in enumerate(range(start_num, end_num + 1)):
        authors_and_permalinks = check_all_ops_in_block(
            block_num, steem, account)
        comment_authors_and_permalinks.extend(authors_and_permalinks)
        if progressbar(idx, total, percentage_step=1, logger=logger):
            logger.info('Finished block {} '
                        '(last is {}) found so far {} '
                        'comments mentioning me...'.format(
                            block_num, end_num,
                            len(comment_authors_and_permalinks)))
        if stop_after is not None and idx >= stop_after:
            break

    logger.info('Scraped {} comments mentioning me'.format(
        len(comment_authors_and_permalinks)))
    return comment_authors_and_permalinks
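
check_all_ops_in_block is not shown here, but a hedged sketch of what such a per-block mention check could look like follows, assuming the standard Steem block layout where each transaction carries [op_name, op_body] pairs and comment operations have author/permlink/body fields. The helper name is hypothetical.

def find_mentions_in_block(block, account):
    """Hypothetical helper: collect (author, permlink) of comment
    operations whose body mentions @account."""
    found = []
    for tx in block.get('transactions', []):
        for op_name, op in tx.get('operations', []):
            if op_name == 'comment' and '@' + account in op.get('body', ''):
                found.append((op['author'], op['permlink']))
    return found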
Example #5
def get_all_posts_between(start_datetime, end_datetime, steem,
                          stop_after=None):
    """ Queries all posts found in blocks between start and end

    Parameters
    ----------
    start_datetime: datetime
    end_datetime: datetime
    steem: Steem
    stop_after: int or None
        For debugging and shorter tests, stop after only a few iterations

    Returns
    -------
    List of dicts of posts

    """
    start_num, block_start_datetime = find_nearest_block_num(start_datetime, steem)
    end_num, block_end_datetime = find_nearest_block_num(end_datetime, steem)

    total = end_num - start_num
    posts = []
    logger.info('Querying all posts between '
                '{} (block {}) and {} (block {})'.format(block_start_datetime,
                                                         start_num,
                                                         block_end_datetime,
                                                         end_num))
    exclude_authors_and_permalinks = set()
    for idx, block_num in enumerate(range(start_num, end_num + 1)):
        posts_in_block, authors_and_permalinks = get_all_posts_from_block(
            block_num, steem, exclude_authors_and_permalinks)
        exclude_authors_and_permalinks |= authors_and_permalinks
        posts.extend(posts_in_block)
        if progressbar(idx, total, percentage_step=1, logger=logger):
            logger.info('Finished block {} '
                        '(last is {}) found so far {} '
                        'posts...'.format(block_num, end_num, len(posts)))
        if stop_after is not None and len(posts) >= stop_after:
            break

    logger.info('Scraped {} posts'.format(len(posts)))
    return posts
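
The growing exclude set is what keeps the same post from being collected twice when it reappears in later blocks (e.g. after an edit); each block's (author, permalink) keys are merged in with |=. A toy version with made-up keys:

exclude = set()
block_keys = [{('alice', 'post-1'), ('bob', 'post-2')},
              {('alice', 'post-1'), ('carol', 'post-3')}]
fresh = []
for keys in block_keys:
    fresh.extend(keys - exclude)  # only keys not seen in earlier blocks
    exclude |= keys               # same set union the example uses
print(sorted(fresh))  # each post appears exactly once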
Example #6
def get_upvote_payments_for_accounts(accounts,
                                     steem,
                                     min_datetime,
                                     max_datetime,
                                     chunksize=10,
                                     ncores=20,
                                     timeout=3600):
    logger.info('Querying upvote purchases between {} and '
                '{} for {} accounts'.format(min_datetime, max_datetime,
                                            len(accounts)))

    # do queries by day!
    start_datetimes = pd.date_range(min_datetime, max_datetime).tolist()
    end_datetimes = start_datetimes[1:] + [max_datetime]

    if ncores > 1:
        chunks = [
            accounts[irun:irun + chunksize]
            for irun in range(0, len(accounts), chunksize)
        ]

        ctx = mp.get_context('spawn')
        pool = ctx.Pool(ncores, initializer=tpbg.config_mp_logging)

        async_results = []
        for start_datetime, end_datetime in zip(start_datetimes,
                                                end_datetimes):
            for chunk in chunks:
                result = pool.apply_async(_get_upvote_payments_parrallel,
                                          args=(chunk, steem, start_datetime,
                                                end_datetime))
                async_results.append(result)

        pool.close()

        upvote_payments = {}
        terminate = False
        for kdx, async_result in enumerate(async_results):
            try:
                payments = async_result.get(timeout=timeout)
                upvote_payments = extend_upvotes_and_payments(
                    upvote_payments, payments)
                if progressbar(kdx,
                               len(async_results),
                               percentage_step=5,
                               logger=logger):
                    logger.info('Finished chunk {} '
                                'out of {} found so far {} '
                                'upvote buyers...'.format(
                                    kdx + 1, len(async_results),
                                    len(upvote_payments)))
            except Exception:
                logger.exception('Timeout or error while waiting for worker results!')
                terminate = True

        if terminate:
            logger.error('Terminating pool due to timeout or errors')
            pool.terminate()
        pool.join()
    else:
        return _get_upvote_payments_parrallel(accounts, steem, min_datetime,
                                              max_datetime)

    logger.info('Found {} upvote-bought articles'.format(len(upvote_payments)))
    return upvote_payments
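
The per-day query windows built at the top of the example come straight from pd.date_range; a small demonstration with made-up dates:

import pandas as pd

min_dt, max_dt = pd.Timestamp('2018-01-01'), pd.Timestamp('2018-01-04')
starts = pd.date_range(min_dt, max_dt).tolist()  # one start per day
ends = starts[1:] + [max_dt]  # each window ends where the next begins
for start, end in zip(starts, ends):
    print(start, '->', end)  # the last window is zero-length by construction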
Example #7
def test_progressbar():
    result = []
    for irun in range(100):
        result.append(progressbar(irun, 100, percentage_step=1))
    assert all(result)
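
The test only pins down progressbar's return contract: with percentage_step=1 over 100 iterations, every call must report progress. A minimal implementation consistent with that contract (a sketch, not the project's actual helper):

def progressbar(current, total, percentage_step=10, logger=None):
    # Report whenever `current` lands on a percentage_step boundary.
    step = max(1, total * percentage_step // 100)
    hit = current % step == 0
    if hit and logger is not None:
        logger.info('Progress: {:.0f}%'.format(100.0 * current / total))
    return hit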