Example #1
def alias_add(ctx, alias, groups):
    """
    Associate an alias with one or more group names.

    This makes further reference to the group that much easier since you don't
    have to type the entire thing out again.

    For example: one might create an alias of alt.binaries.test to a.b.test

    There are no restrictions as to what the alias has to be, so if you
    wanted to you could even associate a group with a single word (the
    a.b.test alias above could just as easily have been 'test' or even 't').

    """
    session = ctx['NNTPSettings'].session()
    if not session:
        logger.error("The database is not correctly configured.")
        exit(1)

    if not alias:
        logger.error("You must specify an alias.")
        exit(1)

    if len(groups) == 0:
        logger.error("You must specify at least one group after the alias.")
        exit(1)

    # Simplify Alias
    _alias = alias.lower().strip()
    if not _alias:
        logger.error("An invalid alias identifier was specified.")
        exit(1)

    groups = get_groups(session, groups)
    if not groups:
        logger.error(
            "There were no alias/groups found matching your criteria.",
        )
        exit(1)

    # Track our database updates
    pending_commits = 0

    for name, _id in groups.iteritems():
        if session.merge(GroupAlias(group_id=_id, name=_alias)):
            logger.debug(
                "Adding alias '%s' to group '%s'." % (_alias, name),
            )
            pending_commits += 1

    if pending_commits > 0:
        # commit our results
        session.commit()
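Note: get_groups() is referenced by every example on this page but its body is not shown. From the way its result is consumed (iterating name/_id pairs), it appears to return a dictionary mapping group names to their database ids. The following hypothetical stand-in (not the project's actual helper) is only useful for tracing the loops above:

def fake_get_groups(session, lookup, watched=False):
    # Hypothetical stand-in for get_groups(); the real helper resolves
    # aliases and queries the database.  Only the return shape matters
    # here: a mapping of group name -> primary key id.
    return {
        'alt.binaries.test': 1,
        'alt.binaries.example': 2,
    }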
Example #2
def group_unwatch(ctx, groups):
    """
    Remove specified group(s) from a watchlist.
    """
    session = ctx['NNTPSettings'].session()
    if not session:
        logger.error("The database is not correctly configured.")
        exit(1)

    # Track our database updates
    pending_commits = 0

    groups = get_groups(session, groups)
    if not groups:
        logger.error(
            "There were no alias/groups found matching your criteria.",
        )
        exit(1)

    # PEP8 E712 does not allow a comparison to a boolean value using ==
    # (it expects the keyword 'is' or a plain truth test instead).  However,
    # SQLAlchemy needs the == so it can build the underlying SQL expression.
    # To get around the PEP8 error, we define a variable equal to True and
    # compare against that.
    pep8_e712 = True

    for name, _id in groups.items():
        # Remove the group from the watchlist if we can; otherwise move on
        if session.query(Group).filter(Group.id == _id)\
                .filter(Group.watch == pep8_e712)\
                .update({Group.watch: False}):
            logger.info(
                "Removed the group '%s' from the watchlist." % name,
            )
            pending_commits += 1

    if pending_commits > 0:
        # commit our results
        session.commit()

    return
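As an aside, recent versions of SQLAlchemy let you sidestep the E712 workaround entirely with the column operator .is_(), which keeps both the linter and the generated SQL happy. A small sketch, assuming the same session and Group model as above:

# Equivalent filter without the pep8_e712 helper variable; .is_() produces
# the same boolean comparison in SQL while keeping PEP8 (E712) happy.
watched_groups = session.query(Group).filter(Group.watch.is_(True)).all()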
Example #3
def group_watch(ctx, groups):
    """
    Adds a group to a watch list.

    """
    session = ctx['NNTPSettings'].session()
    if not session:
        logger.error("The database is not correctly configured.")
        exit(1)

    # Track our database updates
    pending_commits = 0

    groups = get_groups(session, groups)
    if not groups:
        logger.error(
            "There were no alias/groups found matching your criteria.",
        )
        exit(1)

    # PEP8 E712 does not allow a comparison to a boolean value using ==
    # (it expects the keyword 'is' or a plain truth test instead).  However,
    # SQLAlchemy needs the == so it can build the underlying SQL expression.
    # To get around the PEP8 error, we define a variable holding the boolean
    # we want to compare against.
    pep8_e712 = False

    for name, _id in groups.items():
        # Flag the group as watched if it isn't already; otherwise move on
        if session.query(Group).filter(Group.id == _id)\
                .filter(Group.watch == pep8_e712)\
                .update({Group.watch: True}):
            logger.info("Added the group '%s' to the watchlist." % name)
            pending_commits += 1

    if pending_commits > 0:
        # commit our results
        session.commit()

    return
Example #4
def search(ctx, group, keywords, minscore, maxscore, case_insensitive):
    """
    Searches cached groups for articles.

    Specified keywords stack on one another.  Each keyword specified must
    match somewhere in the subject line or else the result is filtered out.

    Keywords can also be prefixed with special characters to help identify
    what is being scanned.

        1. Example 1: A search that should ignore any text with 'Test' in it
                    but include text with 'Jack' in it. Unless you include
                    the case-insensitive switch (inspired by grep), the
                    search will be case sensitive:

                    -Test +Jack

        The + (plus) is always implied. Its primary use is to eliminate
        ambiguity (and allow for the minus to exist).  It is also necessary
        if you intend to search for something with a plus in it; hence the
        following would search for the string '+++AWESOME+++':

                    +++++AWESOME+++

        The extra plus symbol is stripped off and the search works as intended.

        2.  Example 2: Search by Poster.  Since all keywords imply that you're
                 searching for a subject keyword, the token that changes
                 this is '%p', whereas the subject is always implied and
                 identified as '%s'.  Hence the following would look for me:

                    %pChris %pl2g

            This can also be written like this:

                    %p+Chris %p+l2g

            Don't be confused here: the tokens at the front will be
            stripped off and the search will run as normal. These tokens are
            very important because they allow you to mix and match searches
            against both the subject and the poster:

                    %p+Chris %p+l2g AWESOME

            The above implies that AWESOME will have a +%s in front of it.
            Make sense?

        The final thing worth noting is doing a search for text that contains
        dash/minus (-) signs.  Click (the awesome CLI wrapper this script
        uses) can pick the - up as an actual switch, thinking you're trying
        to pass it into this function. You can easily disable this by
        adding a double dash/minus sign (--) like so:

            nr search -- -keyword +keyword2

    """

    session = ctx['NNTPSettings'].session()
    if not session:
        logger.error("The database is not correctly configured.")
        exit(1)

    if not group:
        logger.error("You must specify a group/alias.")
        exit(1)

    # Simplify Alias
    groups = get_groups(session, group)
    if not groups:
        logger.error("You must specify a group/alias.")
        exit(1)

    for name, _id in groups.iteritems():
        db_path = join(ctx['NNTPSettings'].cfg_path, 'cache', 'search')
        db_file = '%s%s' % (
            join(db_path, name),
            SQLITE_DATABASE_EXTENSION,
        )
        if not isfile(db_file):
            logger.warning(
                "There is no cached content for '%s'." % db_file
            )
            continue

        engine = 'sqlite:///%s' % db_file
        db = NNTPGroupDatabase(engine=engine, reset=None)
        group_session = db.session()

        gt = group_session.query(Article)

        # Parse our keywords
        parsed_keywords = parse_search_keyword(keywords)
        for _op, _cat, keyword in parsed_keywords:

            if _cat == SearchCategory.SUBJECT:
                if _op == SearchOperation.INCLUDE:
                    if case_insensitive:
                        logger.debug(
                            'Scanning -and- (case-insensitive) subject: '
                            '"%s"' % (keyword))
                        gt = gt.filter(
                            Article.subject.ilike('%%%s%%' % keyword))
                    else:
                        logger.debug(
                            'Scanning -and- (case-sensitive) subject: '
                            '"%s"' % (keyword))
                        gt = gt.filter(
                            Article.subject.like('%%%s%%' % keyword))
                else:
                    # _op == SearchOperation.EXCLUDE
                    if case_insensitive:
                        logger.debug(
                            'Scanning -and not- (case-insensitive) subject: '
                            '"%s"' % (keyword))
                        gt = gt.filter(
                            not_(Article.subject.ilike('%%%s%%' % keyword)))
                    else:
                        logger.debug(
                            'Scanning -and not- (case-sensitive) subject: '
                            '"%s"' % (keyword))
                        gt = gt.filter(
                            not_(Article.subject.like('%%%s%%' % keyword)))

            elif _cat == SearchCategory.POSTER:
                if _op == SearchOperation.INCLUDE:
                    if case_insensitive:
                        logger.debug(
                            'Scanning -and- (case-insensitive) poster: '
                            '"%s"' % (keyword))
                        gt = gt.filter(
                            Article.poster.ilike('%%%s%%' % keyword))
                    else:
                        logger.debug(
                            'Scanning -and- (case-sensitive) poster: '
                            '"%s"' % (keyword))
                        gt = gt.filter(
                            Article.poster.like('%%%s%%' % keyword))

                else:
                    # _op == SearchOperation.EXCLUDE
                    if case_insensitive:
                        logger.debug(
                            'Scanning -and not- (case-insensitive) poster: '
                            '"%s"' % (keyword))
                        gt = gt.filter(
                            not_(Article.poster.ilike('%%%s%%' % keyword)))
                    else:
                        logger.debug(
                            'Scanning -and not- (case-sensitive) poster: '
                            '"%s"' % (keyword))
                        gt = gt.filter(
                            not_(Article.poster.like('%%%s%%' % keyword)))

        # Handle Scores
        if maxscore == minscore:
            logger.debug('Scanning -score == %d-' % (maxscore))
            gt = gt.filter(Article.score == maxscore)

        else:
            logger.debug(
                'Scanning -score >= %d and score <= %d-' % (
                    minscore, maxscore))

            gt = gt.filter(Article.score <= maxscore)\
                   .filter(Article.score >= minscore)

        gt = gt.order_by(Article.score.desc())

        # Iterate through our list
        print("%s:" % (name))
        for entry in gt:
            print("  [%.5d] %.4d %s" % (
                entry.article_no, entry.score, entry.subject))

        group_session.close()
        db.close()

    return
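The search relies on parse_search_keyword() to turn the raw CLI tokens into (operation, category, keyword) triples; its implementation isn't included in these examples. The following is a rough, hypothetical sketch (not the project's actual parser, and with stand-in constants) of how the token rules described in the docstring above could be interpreted:

# Hypothetical illustration only: simplified stand-ins for the project's
# SearchOperation/SearchCategory constants and its keyword parser.
class SketchOperation(object):
    INCLUDE = '+'
    EXCLUDE = '-'


class SketchCategory(object):
    SUBJECT = '%s'
    POSTER = '%p'


def sketch_parse_search_keywords(tokens):
    """Return (operation, category, keyword) triples from raw CLI tokens."""
    results = []
    for token in tokens:
        category = SketchCategory.SUBJECT
        if token.startswith('%p'):
            # '%p' switches the category from subject to poster
            category = SketchCategory.POSTER
            token = token[2:]
        operation = SketchOperation.INCLUDE
        if token.startswith('-'):
            operation = SketchOperation.EXCLUDE
            token = token[1:]
        elif token.startswith('+'):
            # the leading '+' is implied; strip exactly one of them
            token = token[1:]
        results.append((operation, category, token))
    return results

# sketch_parse_search_keywords(['-Test', '+Jack', '%p+Chris'])
#   -> [('-', '%s', 'Test'), ('+', '%s', 'Jack'), ('+', '%p', 'Chris')]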
Example #5
def update_search(ctx, groups, date_from, date_to, watched):
    """
    Cache specified articles.

    Articles are cached into their own database due to the sheer size of
    the content within each group.

    """
    # TODO: Support loading by date ranges (from and to)
    # TODO: Support loading by X articles from front or X articles from back
    # TODO: Support loading data by X days back
    # TODO: Support specifying how many entries to process. Hence if someone
    #       only does a --count=1 (or -c), then only 1 batch is processed.
    #       Support specifying the batch sizes too; otherwise we use the
    #       config file entry that is already in place --batch (or -b).
    # TODO: GroupTrack needs to be smarter and not block until the fetch
    #       is repeated on a failure. Each batch loaded should update the
    #       main database and track its successful fetch. If its fetch
    #       runs into another, then the 2 tables can be combined into a
    #       larger one.
    #
    #       GroupIndex example:
    #           The below is what the table might look like; you can see we
    #           successfully loaded 100 to 199, and 300 to 399
    #           a.b.test.group:
    #               <id>  <low>   <high>
    #                 1    100      199
    #                 1    300      399
    #
    #           If we fill the void (200 to 299), the table should restructure
    #           itself to look like this:
    #               <id>  <low>   <high>
    #                 1    100      399
    #
    #           Basically the more entries in this table, the more holes/gaps
    #           we have, but we can use this to adjust our batches when we
    #           collide with content that is already fetched.  If a reset
    #           switch is specified (or a reset is detected because the
    #           database is missing), then this table should be included in
    #           the reset too!
    #
    # TODO: Support a --force (-f) switch which forces a re-fetch of the
    #       specified ranges defined that override the GroupIndex table.
    #
    # Use our Database first if it exists
    session = ctx['NNTPSettings'].session()
    if not session:
        logger.error('Could not acquire a database connection.')
        exit(1)

    if not len(ctx['NNTPSettings'].nntp_servers) > 0:
        logger.error("There are no servers defined.")
        exit(1)

    if date_from:
        try:
            date_from = parse(date_from, fuzzy=True)

        except TypeError:
            logger.error(
                "An invalid from date/time was specified: %s" % str(date_from),
            )
            exit(1)

    if date_to:
        try:
            date_to = parse(date_to, fuzzy=True)

        except TypeError:
            logger.error(
                "An invalid to date/time was specified: %s" % str(date_to),
            )
            exit(1)

    if date_to and date_from and date_from > date_to:
        logger.error(
            "The from date can not be larger then the to date.",
        )
        exit(1)

    # Store Primary server
    s = ctx['NNTPSettings'].nntp_servers[0]
    try:
        _server = session.query(Server)\
            .filter(Server.host == s['host'])\
            .filter(Server.port == s['port']).first()

    except (InvalidRequestError, OperationalError):
        # Database isn't set up
        logger.error("The database is not correctly configured.")
        exit(1)

    if not _server:
        logger.error("Server entry is not in the database.")
        exit(1)

    groups = get_groups(session=session, lookup=groups, watched=watched)
    if not groups:
        logger.error("There were not groups identified for indexing.")
        exit(1)

    # Get our RamDisk if we got one
    ramdisk = ctx['NNTPSettings'].nntp_processing.get('ramdisk')
    if ramdisk:
        if not (isdir(ramdisk) and access(ramdisk, W_OK)):
            logger.warning('Ramdisk "%s" is not accessible.' % (ramdisk))
            # Turn it off so we don't reference it
            ramdisk = None
        else:
            logger.info('Using ramdisk: %s' % (ramdisk))

    for name, _id in groups.iteritems():

        db_path = join(ctx['NNTPSettings'].cfg_path, 'cache', 'search')
        db_file = '%s%s' % (
            join(db_path, name),
            SQLITE_DATABASE_EXTENSION,
        )
        if not isdir(db_path):
            if not mkdir(db_path):
                logger.error("Failed to create directory %s" % db_path)
                exit(1)
            logger.info("Created directory %s" % db_path)

        if not access(db_path, W_OK):
            logger.error('The directory "%s" is not accessible.' % db_path)
            exit(1)

        reset = not exists(db_file)

        ram_db_file = None
        if ramdisk:
            # Create a ramdisk db
            ram_db_file = '%s%s' % (
                join(ramdisk, name),
                SQLITE_DATABASE_EXTENSION,
            )

            # Remove the existing file if it's there
            try:
                unlink(ram_db_file)

            except OSError:
                # No problem; the file just doesn't already exist
                pass

            engine = 'sqlite:///%s' % ram_db_file

            if not reset:
                # Database exists, and ramdisk exists, and we're not
                # resetting anything... copy the existing database onto the
                # ramdisk for processing
                logger.debug('Transferring %s database to ramdisk.' % name)
                copy(db_file, ram_db_file)
                logger.info('Transferred %s database to ramdisk.' % name)
        else:
            engine = 'sqlite:///%s' % db_file

        db = NNTPGroupDatabase(engine=engine, reset=reset)
        group_session = db.session()
        if not group_session:
            logger.warning("The database %s not be accessed." % db_file)
            continue

        # TODO:
        # Get the current index associated with our primary group so we can
        # begin fetching from that point.  The index "MUST" be the one
        # associated with our server hostname. If one doesn't exist, create
        # it initialized at 0.
        logger.debug('Retrieving information on group %s' % (name))
        gt = session.query(GroupTrack)\
                    .filter(GroupTrack.group_id == _id)\
                    .filter(GroupTrack.server_id == _server.id).first()

        if not gt or reset:
            # Get a connection to work with
            con = ctx['NNTPManager'].get_connection()

            _, low, high, _ = con.group(name)
            if low is None:
                # Could not set group
                logger.warning("Could not access group '%s' on '%s'." % (
                    name,
                    _server.host,
                ))
                continue

            # Create a GroupTrack object using the group info
            gt = GroupTrack(
                group_id=_id,
                server_id=_server.id,
                low=low,
                high=high,
                scan_pointer=low,
                index_pointer=low,
            )
            session.add(gt)
            session.commit()

        # Initialize our high/low variables
        low = gt.low
        high = gt.high

        # starting pointer
        cur = gt.scan_pointer + 1

        requests = []
        if date_to:
            requests.append(
                ctx['NNTPManager'].seek_by_date(
                    date_to + timedelta(seconds=1), group=name, block=False))

            # Mark our item
            requests[-1]._watermark = 'high'

        if date_from:
            requests.append(
                ctx['NNTPManager'].seek_by_date(
                    date_from, group=name, block=False))

            # Mark our item
            requests[-1]._watermark = 'low'

        while len(requests):
            # Wait for the request to complete
            requests[-1].wait()

            # we have a request at this point
            request = requests.pop()
            if not request:
                continue

            # Store our watermark so we update the correct entry
            watermark = request._watermark

            # Retrieve our response
            response = request.response.pop()
            if response is None:
                # We got an error in our response; take an early
                # exit for now
                logger.error(
                    'An unhandled server response was received: %s.' % (
                        response))

            # Store our watermark (high/low)
            if watermark == 'low':
                low = response
                # Store our current pointer at the starting point we found
                cur = low + 1

            elif watermark == 'high':
                high = response

        if high <= cur:
            # Skip
            continue

        # Drop all indexes; this makes inserts that much faster
        # TODO: make the header_batch_size an entry in NNTPSettings since it's
        # so powerful and allows pulling down multiple things at once
        # Retrieve a list of articles from the database in concurrent blocks
        # Scan them and place them into the NNTPGroupDatabase()
        batch_size = ctx['NNTPSettings'].nntp_processing\
                                        .get('header_batch_size', 5000)

        logger.info('Fetching from %d to %d [%d article(s)]' % (
                    cur, high, (high - cur + 1)))

        # Initialize our batch
        batch = list()

        # Parse the Database URL
        db_url = db.parse_url()

        if db_url['schema'].lower() == 'sqlite':
            # db_url['path'] contains the full path to the database file
            logger.info('Optimizing update for an SQLite database.')
            # SQLite Speed changes
            db._engine.execute('PRAGMA journal_mode = MEMORY')
            db._engine.execute("PRAGMA temp_store = MEMORY")
            db._engine.execute('PRAGMA synchronous = OFF')
            # 2 GB of RAM used for Caching for speed
            db._engine.execute('PRAGMA cache_size = 2000000')

        # we'll re-add them later
        for index in Article.__table__.indexes:
            try:
                index.drop(bind=db._engine)
                logger.info('Dropping Article Index "%s"' % index.name)

            except OperationalError:
                # The index is probably already dropped
                pass

        while high > cur:
            # Figure out our batch size
            inc = min(batch_size - 1, high - cur)
            logger.debug('Pushing XOVER batch %d-%d (inc=%d)' % (
                cur, cur + inc, inc + 1,
            ))

            # Prepare our batch list
            batch.append((cur, cur + inc, ctx['NNTPManager'].xover(
                group=name, start=cur, end=cur + inc,
                sort=XoverGrouping.BY_ARTICLE_NO,
                block=False,
            )))

            # Increment our pointer
            cur += inc + 1

        # Reverse the list since we know the first items pushed will be
        # the first ones completed.  We want to pop items from the batch in
        # the same order we pushed them on:
        #       batch = list(reversed(batch))

        # The slice below is faster than the reversed() call above and does
        # just that: it reverses the results
        batch = batch[::-1]

        logger.info('%d Article batches prepared (batch size=%d).' % (
            len(batch),
            batch_size,
        ))
        # Now we process the entries
        while len(batch):

            # Block until the oldest item added to the queue
            # (usually the first one to return) is done:
            batch[-1][-1].wait()

            # If we reach here, we've got a request object to work
            # with
            low, high, request = batch.pop()
            if not request:
                continue

            response = request.response.pop()
            if response is None:
                # We got an error in our response; take an early
                # exit for now
                logger.error(
                    'An unhandled server response was received: %s.' % (
                        response))

                # Reverse our list again
                batch = batch[::-1]
                while len(batch) > 0:
                    _, _, request = batch.pop()
                    request.abort()
                break

            logger.debug(
                'Retrieved (XOVER) batch %d-%d (%d articles).' % (
                    low, high, len(response),
                ))
            # Get the current time for our timer
            cur_time = datetime.now()

            # For output logging
            load_speed = 'fast'

            try:
                # Try the fast way; this will always succeed unless
                # we're dealing with a messed up table
                db._engine.execute(
                    Article.__table__.insert(), [{
                        "message_id": article['id'],
                        "article_no": article['article_no'],
                        "subject": article['subject'],
                        "poster": article['poster'],
                        "size": article['size'],
                        "lines": article['lines'],
                        "date": article['date'],
                        "score": article['score'],
                    } for article in response.itervalues()]
                )

            except (OperationalError, IntegrityError):
                logger.debug('Preparing for a slow load of %d items' %
                             len(response))
                for article in response.itervalues():
                    # Store our batch into the database and update
                    # our pointer
                    try:
                        group_session.merge(Article(
                            message_id=article['id'],
                            article_no=article['article_no'],
                            subject=article['subject'],
                            poster=article['poster'],
                            size=article['size'],
                            lines=article['lines'],
                            posted_date=article['date'],
                            score=article['score'],
                        ))

                    except OperationalError, e:
                        logger.error(
                            'A database operational error occurred.'
                        )
                        logger.debug('Exception: %s' % str(e))
                        exit(1)

                    except TypeError, e:
                        logger.error(
                            'Failed to save article: %s.' %
                            str(article),
                        )
                        logger.debug('Exception: %s' % str(e))
                        exit(1)

                group_session.commit()
                load_speed = 'slow'
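The snippet above is cut off before the indexes dropped earlier are rebuilt (the "we'll re-add them later" comment). A minimal sketch of what that follow-up step could look like, assuming the same db._engine handle and Article model used above:

# Hypothetical follow-up (mirrors the index.drop() loop above): re-create
# the Article indexes once the bulk load has finished.
for index in Article.__table__.indexes:
    try:
        index.create(bind=db._engine)
        logger.info('Re-created Article Index "%s"' % index.name)

    except OperationalError:
        # The index probably already exists
        pass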
Example #6
def update_index(ctx, workdir, groups, watched):
    """
    Updating Indexes (download NZBFiles).

    If a group (or groups) is specified on the command line, then just those
    are indexed.

    """
    # Use our Database first if it exists
    session = ctx['NNTPSettings'].session()
    if not session:
        logger.error('Could not acquire a database connection.')
        exit(1)

    if not len(ctx['NNTPSettings'].nntp_servers) > 0:
        logger.error("There are no servers defined.")
        exit(1)

    # Store Primary server
    s = ctx['NNTPSettings'].nntp_servers[0]
    try:
        _server = session.query(Server)\
            .filter(Server.host == s['host']).first()

    except (InvalidRequestError, OperationalError):
        # Database isn't set up
        logger.error("The database is not correctly configured.")
        exit(1)

    if not _server:
        logger.error("Server entry is not in the database.")
        exit(1)

    # Get a temporary directory for the download
    if not workdir:
        workdir = join(ctx['NNTPSettings'].work_dir, 'tmp')

    # initialize our return code to zero (0) which means okay
    # but we'll toggle it if we have any sort of failure
    return_code = 0

    # PEP8 E712 does not allow a comparison to a boolean value using ==
    # (it expects the keyword 'is' or a plain truth test instead).  However,
    # SQLAlchemy needs the == so it can build the underlying SQL expression.
    # To get around the PEP8 error, we define a variable equal to True and
    # compare against that.
    pep8_e712 = True

    if watched:
        _groups = session.query(Group.name)\
                         .filter(Group.watch == pep8_e712).all()
        if not _groups:
            logger.error("There are no current groups being watched.")
            exit(1)

        groups = set(groups) | set([g[0] for g in _groups])

    if not groups:
        logger.error("There were not groups identified for indexing.")
        exit(1)

    groups = get_groups(session=session, lookup=groups, watched=watched)
    if not groups:
        logger.error("There were not groups identified for searching.")
        exit(1)

    # Maintain a list of completed groups; This allows us to not
    # parse a group twice
    completed = list()

    for name, _id in groups.iteritems():
        _group = name.lower().strip()
        if not _group:
            continue

        if _id in completed:
            # We've indexed this group
            continue

        # Index the group based on its placeholders in GroupTrack

        # get the path to the group's cache db
        db_path = join(ctx['NNTPSettings'].work_dir, 'cache', 'search')
        db_file = '%s%s' % (
            join(db_path, name),
            SQLITE_DATABASE_EXTENSION,
        )
        if not isfile(db_file):
            logger.warning("There is no cached content for '%s'." % db_file)
            continue

        engine = 'sqlite:///%s' % db_file
        reset = not exists(db_file)
        db = NNTPGroupDatabase(engine=engine, reset=reset)
        group_session = db.session()
        if not group_session:
            logger.error("The database %s is not accessible.." % db_file)
            continue

        # TODO:
        # Get the current index associated with our primary group so we can
        # begin fetching from that point.  The index "MUST" be the one
        # associated with our server hostname. If one doesn't exist, create
        # it initialized at 0.

        logger.info('Retrieving information on group %s' % (name))

        gt = session.query(GroupTrack)\
                    .filter(GroupTrack.group_id == _id)\
                    .filter(GroupTrack.server_id == _server.id).first()

        if not gt:
            logger.warning('No GroupTrack found for %s' % (name))
            continue

        # Initialize our high/low variables
        low = gt.low
        high = gt.high

        # starting pointer
        cur = gt.index_pointer + 1

        logger.info('Indexing from %d to %d [%d article(s)]' %
                    (cur, high, (high - cur + 1)))

        # search cache for NZB since the last index
        articles = group_session.query(Article)\
                          .filter(Article.subject.ilike('%%%s%%' % 'nzb'))\
                          .filter(Article.article_no.between(cur, high))\
                          .order_by(Article.article_no.asc())

        # assuming we have something to index
        if articles.count():
            # Initialize our GetFactory
            mgr = ctx['NNTPManager']
            gf = NNTPGetFactory(connection=mgr, decode=True, groups=name)

            index_high = articles.count()
            index_cur = 0

            # Iterate through our list of matched articles
            for entry in articles:
                # Get the current time for our timer
                cur_time = datetime.now()

                # download NZB
                if not gf.load(entry.message_id, work_dir=workdir):
                    return_code = 1
                    continue

                if not gf.download():
                    # our download failed
                    return_code = 1
                    continue

                # clean up after download
                if not gf.clean():
                    return_code = 1
                    continue

                # Update our marker
                session.query(GroupTrack)\
                    .filter(GroupTrack.group_id == _id)\
                    .filter(GroupTrack.server_id == _server.id)\
                    .update({
                        GroupTrack.index_pointer: entry.article_no,
                        GroupTrack.last_index: datetime.now(),
                    })

                # Save this now as it allows for Ctrl-C or aborts
                # to take place and we'll resume from where we left off
                session.commit()

                index_cur = index_cur + 1

                # Calculate Processing Time
                delta_time = datetime.now() - cur_time
                delta_time = (delta_time.days * 86400) + delta_time.seconds + (
                    delta_time.microseconds / 1e6)
                logger.info(
                    'indexed article (%d) in %s sec(s) [remaining=%d]' %
                    (entry.article_no, delta_time, (index_high - index_cur)))

        # Append to completed list (this prevents us from processing entries
        # twice)
        completed.append(_id)

    session.close()
    group_session.close()
    db.close()
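A small aside on the elapsed-time arithmetic above: the manual conversion of delta_time to seconds is equivalent to the standard-library helper timedelta.total_seconds() (available since Python 2.7). A tiny self-contained illustration:

from datetime import datetime

start = datetime.now()
# ... do some work ...
delta_time = datetime.now() - start
# Equivalent to (delta_time.days * 86400) + delta_time.seconds
# + (delta_time.microseconds / 1e6)
elapsed = delta_time.total_seconds()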