def alias_add(ctx, alias, groups):
    """
    Associate an alias with one or more group names.

    This makes further references to the group that much easier since you
    don't have to type the entire thing out again.  For example, one might
    create an alias of alt.binaries.test to a.b.test.

    There are no restrictions as to what the alias has to be, so if you
    wanted to, you could associate a group with a single word (the above
    a.b.test could have also just been associated with the word 'test',
    or even 't').
    """
    session = ctx['NNTPSettings'].session()
    if not session:
        logger.error("The database is not correctly configured.")
        exit(1)

    if not alias:
        logger.error("You must specify an alias.")
        exit(1)

    if len(groups) == 0:
        logger.error("You must specify at least one group after the alias.")
        exit(1)

    # Simplify Alias
    _alias = alias.lower().strip()
    if not _alias:
        logger.error("An invalid alias identifier was specified.")
        exit(1)

    groups = get_groups(session, groups)
    if not groups:
        logger.error(
            "There were no alias/groups found matching your criteria.",
        )
        exit(1)

    # Track our database updates
    pending_commits = 0

    for name, _id in groups.iteritems():
        if session.merge(GroupAlias(group_id=_id, name=_alias)):
            logger.debug(
                "Adding alias '%s' to group '%s'." % (_alias, name),
            )
            pending_commits += 1

    if pending_commits > 0:
        # commit our results
        session.commit()
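
# Illustrative sketch (not part of the original command set): once aliases
# exist, a reverse lookup from alias to group ids is just a query against the
# same GroupAlias table used above.  The helper name is hypothetical and it
# assumes the GroupAlias model exposes the group_id and name columns shown in
# alias_add().
def _example_alias_lookup(session, alias):
    """Return the group ids currently associated with an alias."""
    _alias = alias.lower().strip()
    return [
        entry.group_id for entry in session.query(GroupAlias)
        .filter(GroupAlias.name == _alias).all()
    ]
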
def group_unwatch(ctx, groups):
    """
    Remove the specified group(s) from the watch list.
    """
    session = ctx['NNTPSettings'].session()
    if not session:
        logger.error("The database is not correctly configured.")
        exit(1)

    # Track our database updates
    pending_commits = 0

    groups = get_groups(session, groups)
    if not groups:
        logger.error(
            "There were no alias/groups found matching your criteria.",
        )
        exit(1)

    # PEP8 E712 does not allow us to compare against a boolean value using
    # == instead of the keyword 'is'.  However, SQLAlchemy requires the ==
    # comparison because that is how its expression language works.  To get
    # around the PEP8 error, we define a variable equal to True and compare
    # against that instead.
    pep8_e712 = True

    for name, _id in groups.items():
        # Remove the entry if we can; otherwise we just gracefully move on
        if session.query(Group).filter(Group.id == _id)\
                .filter(Group.watch == pep8_e712)\
                .update({Group.watch: False}):

            logger.info(
                "Removed the group '%s' from the watchlist." % name,
            )
            pending_commits += 1

    if pending_commits > 0:
        # commit our results
        session.commit()

    return
def group_watch(ctx, groups):
    """
    Add the specified group(s) to the watch list.
    """
    session = ctx['NNTPSettings'].session()
    if not session:
        logger.error("The database is not correctly configured.")
        exit(1)

    # Track our database updates
    pending_commits = 0

    groups = get_groups(session, groups)
    if not groups:
        logger.error(
            "There were no alias/groups found matching your criteria.",
        )
        exit(1)

    # PEP8 E712 does not allow us to compare against a boolean value using
    # == instead of the keyword 'is'.  However, SQLAlchemy requires the ==
    # comparison because that is how its expression language works.  To get
    # around the PEP8 error, we define a variable equal to False and compare
    # against that instead.
    pep8_e712 = False

    for name, _id in groups.items():
        # Add the entry if we can; otherwise we just gracefully move on
        if session.query(Group).filter(Group.id == _id)\
                .filter(Group.watch == pep8_e712)\
                .update({Group.watch: True}):

            logger.info("Added the group '%s' to the watchlist." % name)
            pending_commits += 1

    if pending_commits > 0:
        # commit our results
        session.commit()

    return
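
# Illustrative alternative (not used by the commands above): SQLAlchemy's
# is_() operator produces an equivalent boolean comparison without tripping
# PEP8 E712, so the pep8_e712 workaround can be avoided entirely.  A minimal
# sketch, assuming the same Group model and session object; the helper name
# is hypothetical.
def _example_set_watch(session, group_id, watch=True):
    """Flip the watch flag on a group using is_() instead of ==."""
    return session.query(Group)\
        .filter(Group.id == group_id)\
        .filter(Group.watch.is_(not watch))\
        .update({Group.watch: watch})
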
def search(ctx, group, keywords, minscore, maxscore, case_insensitive):
    """
    Searches cached groups for articles.

    Specified keywords stack on one another.  Each keyword specified must
    match somewhere in the subject line or else the result is filtered out.
    Keywords can also be prefixed with special characters to help identify
    what is being scanned.

    1. Example 1: A search that should ignore any text with 'Test' in it
       but include text with 'Jack' in it.  Unless you include the
       case-insensitive switch (inspired by grep), the search is case
       sensitive:

            -Test +Jack

       The + (plus) is always implied.  Its primary use is to eliminate
       ambiguity (and allow for the minus to exist).  It is also necessary
       if you intend to search for something containing a plus sign; hence
       the following would search for the string '+++AWESOME+++':

            ++++AWESOME+++

       The extra (leading) plus symbol is stripped off and the search works
       as intended.

    2. Example 2: Search by poster.  Since all keywords imply that you're
       searching for a subject keyword, the token that changes this is '%p',
       whereas the subject is always implicitly identified as '%s'.  Hence
       the following would look for me:

            %pChris %pl2g

       This can also be written like this:

            %p+Chris %p+l2g

       Don't be confused here; the tokens at the front are stripped off and
       the search runs as normal.  These tokens are important because they
       allow you to mix and match searches against both the subject and the
       poster:

            %p+Chris %p+l2g AWESOME

       The above implies that AWESOME will have a +%s in front of it.
       Make sense?

    The final thing worth noting is doing a search for text that contains
    dash/minus (-) signs.  Click (the awesome CLI wrapper this script uses)
    can pick the - up as an actual switch, thinking you're trying to pass it
    into this function.  You can easily work around this by adding a double
    dash/minus sign (--) like so:

        nr search -- -keyword +keyword2
    """
    session = ctx['NNTPSettings'].session()
    if not session:
        logger.error("The database is not correctly configured.")
        exit(1)

    if not group:
        logger.error("You must specify a group/alias.")
        exit(1)

    # Simplify Alias
    groups = get_groups(session, group)
    if not groups:
        logger.error("You must specify a group/alias.")
        exit(1)

    for name, _id in groups.iteritems():
        db_path = join(ctx['NNTPSettings'].cfg_path, 'cache', 'search')
        db_file = '%s%s' % (
            join(db_path, name),
            SQLITE_DATABASE_EXTENSION,
        )

        if not isfile(db_file):
            logger.warning("There is no cached content for '%s'." % db_file)
            continue

        engine = 'sqlite:///%s' % db_file
        db = NNTPGroupDatabase(engine=engine, reset=None)
        group_session = db.session()

        gt = group_session.query(Article)

        # Parse our keywords
        parsed_keywords = parse_search_keyword(keywords)
        for _op, _cat, keyword in parsed_keywords:
            if _cat == SearchCategory.SUBJECT:
                if _op == SearchOperation.INCLUDE:
                    if case_insensitive:
                        logger.debug(
                            'Scanning -and- (case-insensitive) subject: '
                            '"%s"' % (keyword))
                        gt = gt.filter(
                            Article.subject.ilike('%%%s%%' % keyword))
                    else:
                        logger.debug(
                            'Scanning -and- (case-sensitive) subject: '
                            '"%s"' % (keyword))
                        gt = gt.filter(
                            Article.subject.like('%%%s%%' % keyword))

                else:  # _op == SearchOperation.EXCLUDE
                    if case_insensitive:
                        logger.debug(
                            'Scanning -and not- (case-insensitive) subject: '
                            '"%s"' % (keyword))
                        gt = gt.filter(
                            not_(Article.subject.ilike('%%%s%%' % keyword)))
                    else:
                        logger.debug(
                            'Scanning -and not- (case-sensitive) subject: '
                            '"%s"' % (keyword))
                        gt = gt.filter(
                            not_(Article.subject.like('%%%s%%' % keyword)))

            elif _cat == SearchCategory.POSTER:
                if _op == SearchOperation.INCLUDE:
                    if case_insensitive:
                        logger.debug(
                            'Scanning -and- (case-insensitive) poster: '
                            '"%s"' % (keyword))
                        gt = gt.filter(
                            Article.poster.ilike('%%%s%%' % keyword))
                    else:
                        logger.debug(
                            'Scanning -and- (case-sensitive) poster: '
                            '"%s"' % (keyword))
                        gt = gt.filter(
                            Article.poster.like('%%%s%%' % keyword))

                else:  # _op == SearchOperation.EXCLUDE
                    if case_insensitive:
                        logger.debug(
                            'Scanning -and not- (case-insensitive) poster: '
                            '"%s"' % (keyword))
                        gt = gt.filter(
                            not_(Article.poster.ilike('%%%s%%' % keyword)))
                    else:
                        logger.debug(
                            'Scanning -and not- (case-sensitive) poster: '
                            '"%s"' % (keyword))
                        gt = gt.filter(
                            not_(Article.poster.like('%%%s%%' % keyword)))

        # Handle Scores
        if maxscore == minscore:
            logger.debug('Scanning -score == %d-' % (maxscore))
            gt = gt.filter(Article.score == maxscore)

        else:
            logger.debug(
                'Scanning -score >= %d and score <= %d-' % (
                    minscore, maxscore))
            gt = gt.filter(Article.score <= maxscore)\
                   .filter(Article.score >= minscore)

        gt = gt.order_by(Article.score.desc())

        # Iterate through our list
        print("%s:" % (name))
        for entry in gt:
            print("  [%.5d] %.4d %s" % (
                entry.article_no, entry.score, entry.subject))

        group_session.close()
        db.close()

    return
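
# Illustrative sketch (this is not the project's parse_search_keyword()):
# the token grammar described in the search() docstring above can be reduced
# to (operation, category, keyword) tuples roughly like this.  The function
# name and the exact handling of escaped '+' characters are assumptions; the
# real implementation lives elsewhere in the code base and may differ.
def _example_parse_search_keyword(keywords):
    parsed = []
    for token in keywords:
        # '%s' (subject) is implied; '%p' switches the category to poster
        category = SearchCategory.SUBJECT
        if token.startswith('%p'):
            category, token = SearchCategory.POSTER, token[2:]
        elif token.startswith('%s'):
            token = token[2:]

        # '+' (include) is implied; '-' excludes.  A single leading operator
        # is stripped, so '++foo' still searches for the literal '+foo'.
        operation = SearchOperation.INCLUDE
        if token.startswith('-'):
            operation, token = SearchOperation.EXCLUDE, token[1:]
        elif token.startswith('+'):
            token = token[1:]

        if token:
            parsed.append((operation, category, token))

    return parsed
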
def update_search(ctx, groups, date_from, date_to, watched):
    """
    Cache the specified articles.

    Articles are cached into their own database due to the sheer size of
    the content within each group.
    """
    # TODO: Support loading by date ranges (from and to)
    # TODO: Support loading by X articles from the front or X articles from
    #       the back
    # TODO: Support loading data by X days back
    # TODO: Support specifying how many entries to process.  Hence if someone
    #       only does a --count=1 (or -c), then only 1 batch is processed.
    #       Support specifying the batch size; otherwise we use the config
    #       file entry which is already in place --batch (or -b).
    # TODO: GroupTrack needs to be smarter and not block until the fetch
    #       is repeated on a failure.  Each batch loaded should update the
    #       main database and track its successful fetch.  If its fetch
    #       runs into another, then the 2 tables can be combined into a
    #       larger one.
    #
    #       GroupIndex example (see the illustrative range-merge sketch that
    #       follows this function):
    #          The below is what the table might look like; you can see we
    #          successfully loaded 100 to 199, and 300 to 399
    #            a.b.test.group:
    #              <id>   <low>   <high>
    #                1     100     199
    #                1     300     399
    #
    #          If we fill the void (200 to 299), the table should restructure
    #          itself to look like this:
    #              <id>   <low>   <high>
    #                1     100     399
    #
    #          Basically, the more entries in this table, the more holes/gaps
    #          we have, but we can use this to adjust our batches when we
    #          collide with content that has already been fetched.  If a
    #          reset switch is specified (or a reset is detected because the
    #          database is missing), then this table should be included in
    #          the reset too!
    #
    # TODO: Support a --force (-f) switch which forces a re-fetch of the
    #       specified ranges, overriding the GroupIndex table.
    #
    # Use our Database first if it exists
    session = ctx['NNTPSettings'].session()
    if not session:
        logger.error('Could not acquire a database connection.')
        exit(1)

    if not len(ctx['NNTPSettings'].nntp_servers) > 0:
        logger.error("There are no servers defined.")
        exit(1)

    if date_from:
        try:
            date_from = parse(date_from, fuzzy=True)

        except TypeError:
            logger.error(
                "An invalid from date/time was specified: %s" % str(
                    date_from),
            )
            exit(1)

    if date_to:
        try:
            date_to = parse(date_to, fuzzy=True)

        except TypeError:
            logger.error(
                "An invalid to date/time was specified: %s" % str(date_to),
            )
            exit(1)

    if date_to and date_from and date_from > date_to:
        logger.error(
            "The from date can not be larger than the to date.",
        )
        exit(1)

    # Store Primary server
    s = ctx['NNTPSettings'].nntp_servers[0]

    try:
        _server = session.query(Server)\
            .filter(Server.host == s['host'])\
            .filter(Server.port == s['port']).first()

    except (InvalidRequestError, OperationalError):
        # Database isn't set up
        logger.error("The database is not correctly configured.")
        exit(1)

    if not _server:
        logger.error("Server entry is not in the database.")
        exit(1)

    groups = get_groups(session=session, lookup=groups, watched=watched)
    if not groups:
        logger.error("There were no groups identified for indexing.")
        exit(1)

    # Get our RamDisk if we've got one
    ramdisk = ctx['NNTPSettings'].nntp_processing.get('ramdisk')
    if ramdisk:
        if not (isdir(ramdisk) and access(ramdisk, W_OK)):
            logger.warning('Ramdisk "%s" is not accessible.' % (ramdisk))
            # Turn it off so we don't reference it
            ramdisk = None

        else:
            logger.info('Using ramdisk: %s' % (ramdisk))

    for name, _id in groups.iteritems():
        db_path = join(ctx['NNTPSettings'].cfg_path, 'cache', 'search')
        db_file = '%s%s' % (
            join(db_path, name),
            SQLITE_DATABASE_EXTENSION,
        )

        if not isdir(db_path):
            if not mkdir(db_path):
                logger.error("Failed to create directory %s" % db_path)
                exit(1)
            logger.info("Created directory %s" % db_path)

        if not access(db_path, W_OK):
            logger.error('The directory "%s" is not accessible.' % db_path)
            exit(1)

        reset = not exists(db_file)

        ram_db_file = None
        if ramdisk:
            # Create a ramdisk db
            ram_db_file = '%s%s' % (
                join(ramdisk, name),
                SQLITE_DATABASE_EXTENSION,
            )

            # Remove the existing file if it's there
            try:
                unlink(ram_db_file)

            except OSError:
                # No problem; the file just doesn't already exist
                pass

            engine = 'sqlite:///%s' % ram_db_file

            if not reset:
                # Database exists, and ramdisk exists, and we're not
                # resetting anything... copy the existing database onto
                # the ramdisk for processing
                logger.debug('Transferring %s database to ramdisk.' % name)
                copy(db_file, ram_db_file)
                logger.info('Transferred %s database to ramdisk.' % name)

        else:
            engine = 'sqlite:///%s' % db_file

        db = NNTPGroupDatabase(engine=engine, reset=reset)
        group_session = db.session()
        if not group_session:
            logger.warning(
                "The database %s could not be accessed." % db_file)
            continue

        # TODO:
        # Get the current index associated with our primary group so we can
        # begin fetching from that point.  The index MUST be the one
        # associated with our server hostname.  If one doesn't exist, create
        # it initialized at 0.
        logger.debug('Retrieving information on group %s' % (name))
        gt = session.query(GroupTrack)\
            .filter(GroupTrack.group_id == _id)\
            .filter(GroupTrack.server_id == _server.id).first()

        if not gt or reset:
            # Get a connection to work with
            con = ctx['NNTPManager'].get_connection()
            _, low, high, _ = con.group(name)
            if low is None:
                # Could not set group
                logger.warning("Could not access group '%s' on '%s'." % (
                    name,
                    _server.host,
                ))
                continue

            # Create a GroupTrack object using the group info
            gt = GroupTrack(
                group_id=_id,
                server_id=_server.id,
                low=low,
                high=high,
                scan_pointer=low,
                index_pointer=low,
            )
            group_session.commit()
            session.add(gt)

        # Initialize our high/low variables
        low = gt.low
        high = gt.high

        # starting pointer
        cur = gt.scan_pointer + 1

        requests = []
        if date_to:
            requests.append(
                ctx['NNTPManager'].seek_by_date(
                    date_to + timedelta(seconds=1),
                    group=name,
                    block=False))
            # Mark our item
            requests[-1]._watermark = 'high'

        if date_from:
            requests.append(
                ctx['NNTPManager'].seek_by_date(
                    date_from,
                    group=name,
                    block=False))
            # Mark our item
            requests[-1]._watermark = 'low'

        while len(requests):
            # Wait for the request to complete
            requests[-1].wait()

            # we have a request at this point
            request = requests.pop()
            if not request:
                continue

            # Store our watermark so we update the correct entry
            watermark = request._watermark

            # Retrieve our response
            response = request.response.pop()
            if response is None:
                # We got an error in our response; take an early
                # exit for now
                logger.error(
                    'An unhandled server response was received: %s.' % (
                        response))
                continue

            # Store our watermark (high/low)
            if watermark == 'low':
                low = response
                # Store our current pointer at the starting point we found
                cur = low + 1

            elif watermark == 'high':
                high = response

        if high <= cur:
            # Skip
            continue

        # Drop all indexes; this makes inserts that much faster
        # TODO: make header_batch_size an entry in NNTPSettings since it's
        #       so powerful and allows pulling down multiple things at once

        # Retrieve a list of articles in concurrent blocks, scan them and
        # place them into the NNTPGroupDatabase()
        batch_size = ctx['NNTPSettings'].nntp_processing\
            .get('header_batch_size', 5000)

        logger.info('Fetching from %d to %d [%d article(s)]' % (
            cur, high, (high - cur + 1)))

        # Initialize our batch
        batch = list()

        # Parse the Database URL
        db_url = db.parse_url()
        if db_url['schema'].lower() == 'sqlite':
            # db_url['path'] contains the full path to the database file
            logger.info('Optimizing update for an SQLite database.')

            # SQLite speed changes
            db._engine.execute('PRAGMA journal_mode = MEMORY')
            db._engine.execute('PRAGMA temp_store = MEMORY')
            db._engine.execute('PRAGMA synchronous = OFF')
            # 2 GB of RAM used for caching for speed
            db._engine.execute('PRAGMA cache_size = 2000000')

        # we'll re-add them later
        for index in Article.__table__.indexes:
            try:
                index.drop(bind=db._engine)
                logger.info('Dropping Article Index "%s"' % index.name)

            except OperationalError:
                # The index has probably already been dropped
                pass

        while high > cur:
            # Figure out our batch size
            inc = min(batch_size - 1, high - cur)
            logger.debug('Pushing XOVER batch %d-%d (inc=%d)' % (
                cur, cur + inc, inc + 1,
            ))

            # Prepare our batch list
            batch.append((cur, cur + inc, ctx['NNTPManager'].xover(
                group=name,
                start=cur,
                end=cur + inc,
                sort=XoverGrouping.BY_ARTICLE_NO,
                block=False,
            )))

            # Increment our pointer
            cur += inc + 1

        # Reverse the list since we know the first items pushed will be the
        # first ones completed; we want to pop items from the batch in the
        # same order we pushed them on:
        #     batch = list(reversed(batch))
        # The below is faster than calling the reversed() function (and does
        # just that: reverses the results)
        batch = batch[::-1]

        logger.info('%d Article batches prepared (batch size=%d).' % (
            len(batch), batch_size,
        ))

        # Now we process the entries
        while len(batch):
            # Block until the oldest item added to the queue (usually the
            # first one to return) is done:
            batch[-1][-1].wait()

            # If we reach here, we've got a request object to work with
            low, high, request = batch.pop()
            if not request:
                continue

            response = request.response.pop()
            if response is None:
                # We got an error in our response; take an early
                # exit for now
                logger.error(
                    'An unhandled server response was received: %s.' % (
                        response))

                # Reverse our list again
                batch = batch[::-1]
                while len(batch) > 0:
                    _, _, request = batch.pop()
                    request.abort()
                break

            logger.debug(
                'Retrieved (XOVER) batch %d-%d (%d articles).' % (
                    low, high, len(response),
                ))

            # Get the current time for our timer
            cur_time = datetime.now()

            # For output logging
            load_speed = 'fast'

            try:
                # Try the fast way; this will always succeed unless
                # we're dealing with a messed up table
                db._engine.execute(
                    Article.__table__.insert(),
                    [{
                        "message_id": article['id'],
                        "article_no": article['article_no'],
                        "subject": article['subject'],
                        "poster": article['poster'],
                        "size": article['size'],
                        "lines": article['lines'],
                        "date": article['date'],
                        "score": article['score'],
                    } for article in response.itervalues()]
                )

            except (OperationalError, IntegrityError):
                logger.debug(
                    'Preparing for a slow load of %d items' % len(response))

                for article in response.itervalues():
                    # Store our batch into the database and update
                    # our pointer
                    try:
                        group_session.merge(Article(
                            message_id=article['id'],
                            article_no=article['article_no'],
                            subject=article['subject'],
                            poster=article['poster'],
                            size=article['size'],
                            lines=article['lines'],
                            posted_date=article['date'],
                            score=article['score'],
                        ))

                    except OperationalError as e:
                        logger.error(
                            'A database operational error occurred.')
                        logger.debug('Exception: %s' % str(e))
                        exit(1)

                    except TypeError as e:
                        logger.error(
                            'Failed to save article: %s.' % str(article),
                        )
                        logger.debug('Exception: %s' % str(e))
                        exit(1)

                group_session.commit()
                load_speed = 'slow'
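
# Illustrative sketch only: the GroupIndex TODO above describes collapsing
# fetched article ranges once the gaps between them are filled.  The helper
# below shows one way such (low, high) pairs could be merged; the name and
# the in-memory representation are assumptions, not part of the project.
def _example_merge_ranges(ranges):
    """Merge overlapping or adjacent (low, high) article ranges."""
    merged = []
    for low, high in sorted(ranges):
        if merged and low <= merged[-1][1] + 1:
            # This range touches (or overlaps) the previous one; extend it
            merged[-1] = (merged[-1][0], max(merged[-1][1], high))
        else:
            merged.append((low, high))
    return merged

# For example, filling the 200-299 void collapses the table rows:
#   _example_merge_ranges([(100, 199), (300, 399), (200, 299)])
#   -> [(100, 399)]
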
def update_index(ctx, workdir, groups, watched):
    """
    Update indexes (download NZB-Files).

    If group(s) are specified on the command line, then those are indexed
    as well.
    """
    # Use our Database first if it exists
    session = ctx['NNTPSettings'].session()
    if not session:
        logger.error('Could not acquire a database connection.')
        exit(1)

    if not len(ctx['NNTPSettings'].nntp_servers) > 0:
        logger.error("There are no servers defined.")
        exit(1)

    # Store Primary server
    s = ctx['NNTPSettings'].nntp_servers[0]

    try:
        _server = session.query(Server)\
            .filter(Server.host == s['host']).first()

    except (InvalidRequestError, OperationalError):
        # Database isn't set up
        logger.error("The database is not correctly configured.")
        exit(1)

    if not _server:
        logger.error("Server entry is not in the database.")
        exit(1)

    # Get a temporary directory for the download
    if not workdir:
        workdir = join(ctx['NNTPSettings'].work_dir, 'tmp')

    # initialize our return code to zero (0) which means okay
    # but we'll toggle it if we have any sort of failure
    return_code = 0

    # PEP8 E712 does not allow us to compare against a boolean value using
    # == instead of the keyword 'is'.  However, SQLAlchemy requires the ==
    # comparison because that is how its expression language works.  To get
    # around the PEP8 error, we define a variable equal to True and compare
    # against that instead.
    pep8_e712 = True

    if watched:
        _groups = session.query(Group.name)\
            .filter(Group.watch == pep8_e712).all()

        if not _groups:
            logger.error("There are no current groups being watched.")
            exit(1)

        groups = set(groups) | set([g[0] for g in _groups])

    if not groups:
        logger.error("There were no groups identified for indexing.")
        exit(1)

    groups = get_groups(session=session, lookup=groups, watched=watched)
    if not groups:
        logger.error("There were no groups identified for searching.")
        exit(1)

    # Maintain a list of completed groups; this allows us to avoid parsing
    # a group twice
    completed = list()

    for name, _id in groups.iteritems():
        _group = name.lower().strip()
        if not _group:
            continue

        if _id in completed:
            # We've already indexed this group
            continue

        # Index the group based on its placeholders in GroupTrack

        # get the path to the group's cache db
        db_path = join(ctx['NNTPSettings'].work_dir, 'cache', 'search')
        db_file = '%s%s' % (
            join(db_path, name),
            SQLITE_DATABASE_EXTENSION,
        )

        if not isfile(db_file):
            logger.warning("There is no cached content for '%s'." % db_file)
            continue

        engine = 'sqlite:///%s' % db_file
        reset = not exists(db_file)

        db = NNTPGroupDatabase(engine=engine, reset=reset)
        group_session = db.session()
        if not group_session:
            logger.error("The database %s is not accessible." % db_file)
            continue

        # TODO:
        # Get the current index associated with our primary group so we can
        # begin fetching from that point.  The index MUST be the one
        # associated with our server hostname.  If one doesn't exist, create
        # it initialized at 0.
        logger.info('Retrieving information on group %s' % (name))
        gt = session.query(GroupTrack)\
            .filter(GroupTrack.group_id == _id)\
            .filter(GroupTrack.server_id == _server.id).first()

        if not gt:
            logger.warning('No GroupTrack found for %s' % (name))
            continue

        # Initialize our high/low variables
        low = gt.low
        high = gt.high

        # starting pointer
        cur = gt.index_pointer + 1

        logger.info('Indexing from %d to %d [%d article(s)]' % (
            cur, high, (high - cur + 1)))

        # search the cache for NZB-Files posted since the last index
        articles = group_session.query(Article)\
            .filter(Article.subject.ilike('%%%s%%' % 'nzb'))\
            .filter(Article.article_no.between(cur, high))\
            .order_by(Article.article_no.asc())

        # assuming we have something to index
        if articles.count():
            # Initialize our GetFactory
            mgr = ctx['NNTPManager']
            gf = NNTPGetFactory(connection=mgr, decode=True, groups=name)

            index_high = articles.count()
            index_cur = 0

            # Iterate through our list of matched articles
            for entry in articles:
                # Get the current time for our timer
                cur_time = datetime.now()

                # download the NZB-File
                if not gf.load(entry.message_id, work_dir=workdir):
                    return_code = 1
                    continue

                if not gf.download():
                    # our download failed
                    return_code = 1
                    continue

                # clean up after the download
                if not gf.clean():
                    return_code = 1
                    continue

                # Update our marker
                session.query(GroupTrack)\
                    .filter(GroupTrack.group_id == _id)\
                    .filter(GroupTrack.server_id == _server.id)\
                    .update({
                        GroupTrack.index_pointer: entry.article_no,
                        GroupTrack.last_index: datetime.now(),
                    })

                # Save this now; it allows a Ctrl-C or abort to take place
                # and we'll resume from where we left off
                session.commit()

                index_cur = index_cur + 1

                # Calculate Processing Time
                delta_time = datetime.now() - cur_time
                delta_time = (delta_time.days * 86400) \
                    + delta_time.seconds \
                    + (delta_time.microseconds / 1e6)

                logger.info(
                    'indexed article (%d) in %s sec(s) [remaining=%d]' % (
                        entry.article_no, delta_time,
                        (index_high - index_cur)))

        # Append to the completed list (this prevents us from processing
        # entries twice)
        completed.append(_id)

        group_session.close()
        db.close()

    session.close()
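
# Side note (illustrative, not part of the original function): on Python 2.7
# and later, the manual days/seconds/microseconds arithmetic used for the
# per-article timer above can be expressed with timedelta.total_seconds(),
# which yields the same floating point number of seconds:
#
#   delta_time = (datetime.now() - cur_time).total_seconds()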