示例#1
0
def rescrape(processes=2, months=6, limit=50, overwrite=False):
    """Launch a batch rescrape of stale, unverified records.

    A record is selected when its ``date_last_scraped`` is older than
    ``months`` months ago (or is ``None``) and it has no first element
    under ``verified``.

    :param int processes: Forwarded to ``retag.batch_rescrape``
    :param int months: Minimum age, in months, of the last scrape
    :param int limit: Forwarded to ``retag.batch_rescrape``
    :param bool overwrite: Forwarded to ``retag.batch_rescrape``
    """
    from dateutil.relativedelta import relativedelta
    from modularodm import Q
    from scripts import retag
    cutoff_date = datetime.datetime.utcnow() - relativedelta(months=months)
    query = (
        (
            # Compare against the cutoff, not "now": otherwise ``months``
            # is ignored and every previously scraped record matches.
            Q('date_last_scraped', 'lt', cutoff_date) |
            Q('date_last_scraped', 'eq', None)
        ) &
        # NOTE(review): presumably "verified" is a list and this matches
        # records where it is empty or absent -- confirm against the schema.
        Q('verified.0', 'exists', False)
    )
    retag.batch_rescrape(
        processes=processes,
        query=query,
        limit=limit,
        overwrite=overwrite,
    )
示例#2
0
def rescrape(processes=1, months=6, limit=50, missing='any', overwrite=False):
    """Launch a batch rescrape of records that have not been scraped recently.

    :param int processes: Number of processes to launch
    :param int months: Minimum time since last scraped
    :param int limit: Max number of articles to scrape
    :param str missing: Missing document type (html, pdf, pmc, any)
    :param bool overwrite: Overwrite existing articles
    """
    from dateutil.relativedelta import relativedelta
    from modularodm import Q
    from scripts import retag

    threshold = datetime.datetime.utcnow() - relativedelta(months=months)

    # Records scraped before the threshold, or never scraped at all.
    scraped_before_threshold = Q('date_last_scraped', 'lt', threshold)
    never_scraped = Q('date_last_scraped', 'eq', None)
    stale = scraped_before_threshold | never_scraped

    # 'any' selects records with no first "verified" entry; any other value
    # selects records whose "verified" field is not equal to that value.
    verified_filter = (
        Q('verified.0', 'exists', False)
        if missing == 'any'
        else Q('verified', 'ne', missing)
    )

    retag.batch_rescrape(
        processes=processes,
        query=stale & verified_filter,
        limit=limit,
        overwrite=overwrite,
    )