def get(self, group_name, messages=None):
    """Get a set of messages from the server for the specified group."""
    if not messages:
        log.error('{}: No messages were specified.'.format(group_name))
        return None

    # log after the guard: len() would raise a TypeError if messages was None
    log.info('{}: Getting {:d} messages...'.format(group_name, len(messages)))

    data = ''
    try:
        _, total, first, last, _ = self.connection.group(group_name)
        log.debug('{}: Total articles in group: {:d}'.format(group_name, total))
        for message in messages:
            article = '<{}>'.format(message)
            log.debug('{}: Getting article: {}'.format(group_name, article))
            response, (number, message_id, lines) = self.connection.body(article)
            res = pynab.yenc.yenc_decode(lines)
            if res:
                data += res
            else:
                return None
    except nntplib.NNTPError as nntpe:
        log.error('{}: Problem retrieving messages from server: {}.'.format(group_name, nntpe))
        return None

    return data
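# Hedged usage sketch for get() above: this mirrors how the SFV/NFO post-processing
# code later in this section fetches and yEnc-decodes small article bodies. The
# import path, group name and message-ids are assumptions for illustration only.
from pynab.server import Server  # assumed module path

def fetch_article_text(group_name, message_ids):
    """Fetch and decode a small file (e.g. an .sfv) from a list of segment ids."""
    with Server() as server:
        try:
            article = server.get(group_name, message_ids)
        except Exception:
            article = None
    return article

# example (placeholder ids):
# fetch_article_text('alt.binaries.teevee', ['part1of1.abc123@example.com'])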
def determine_category(name, group_name=''):
    """Categorise release based on release name and group name."""
    category = ''

    if is_hashed(name):
        category = CAT_MISC_OTHER
    else:
        if group_name:
            category = check_group_category(name, group_name)

        if not category:
            for parent_category in parent_category_regex.keys():
                category = check_parent_category(name, parent_category)
                if category:
                    break

    if not category:
        category = CAT_MISC_OTHER

    log.info('category: ({}) [{}]: {} ({})'.format(
        group_name,
        name,
        get_category_name(category),
        category
    ))
    return category
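# Hedged usage example for determine_category(): pass a release name and (optionally)
# the group it was posted to, get back a numeric category id. The release and group
# names are invented; the exact id returned depends on the category/group regex tables.
cat_id = determine_category('Some.Show.S01E02.720p.HDTV.x264-GROUP', 'alt.binaries.teevee')
# names that is_hashed() flags skip the regex checks and go straight to CAT_MISC_OTHER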
def rename_bad_releases(category): for release in db.releases.find( {"category._id": int(category), "$or": [{"nfo": {"$nin": [None, False]}}, {"files.count": {"$exists": True}}]} ): log.debug("Finding name for {}...".format(release["search_name"])) name, category_id = pynab.releases.discover_name(release) if name and not category_id: # don't change anything, it was fine pass elif name and category_id: # we found a new name! log.info( "Renaming {} ({:d}) to {} ({:d})...".format( release["search_name"], release["category"]["_id"], name, category_id ) ) category = db.categories.find_one({"_id": category_id}) category["parent"] = db.categories.find_one({"_id": category["parent_id"]}) db.releases.update( {"_id": release["_id"]}, {"$set": {"search_name": pynab.releases.clean_release_name(name), "category": category}}, ) else: # bad release! log.debug("Noting unwanted release {} ({:d})...".format(release["search_name"], release["category"]["_id"])) db.releases.update({"_id": release["_id"]}, {"$set": {"unwanted": True}})
def search(name, year):
    """Search OMDB for a movie and return the IMDB ID."""
    log.info('Searching for movie: {}'.format(name))

    # if we managed to parse the year from the name,
    # include it, since it'll narrow results
    if year:
        year_query = '&y={}'.format(year.replace('(', '').replace(')', ''))
    else:
        year_query = ''

    r = requests.get(OMDB_SEARCH_URL + name + year_query)
    try:
        data = r.json()
    except ValueError:
        # the response body wasn't valid JSON
        log.debug('There was a problem accessing the OMDB API page.')
        return None

    if 'Search' in data:
        for movie in data['Search']:
            # doublecheck, but the api should've searched properly
            ratio = difflib.SequenceMatcher(None, clean_name(name), clean_name(movie['Title'])).ratio()
            if ratio > 0.8 and year == movie['Year'] and movie['Type'] == 'movie':
                log.info('OMDB movie match found: {}'.format(movie['Title']))
                return movie
def process(limit=20, category=0): """Processes release rarfiles to check for passwords and filecounts.""" with Server() as server: query = {"passworded": None} if category: query["category._id"] = int(category) for release in db.releases.find(query).limit(limit).sort("posted", pymongo.DESCENDING).batch_size(50): nzb = pynab.nzbs.get_nzb_dict(release["nzb"]) if nzb and "rars" in nzb: info = check_release_files(server, release["group"]["name"], nzb) if info: log.info("[{}] - [{}] - file info: added".format(release["_id"], release["search_name"])) db.releases.update( {"_id": release["_id"]}, { "$set": { "files.count": info["files.count"], "files.size": info["files.size"], "files.names": info["files.names"], "passworded": info["passworded"], } }, ) continue log.warning( "rar: [{}] - [{}] - file info: no rars in release".format(release["_id"], release["search_name"]) ) db.releases.update( {"_id": release["_id"]}, {"$set": {"files.count": 0, "files.size": 0, "files.names": [], "passworded": "unknown"}}, )
def update_blacklist():
    """Check for blacklist updates and load them into Mongo."""
    if 'blacklist_url' in config.site:
        log.info('Starting blacklist update...')
        response = requests.get(config.site['blacklist_url'])
        lines = response.text.splitlines()

        for line in lines:
            elements = line.split('\t\t')
            if len(elements) == 4:
                log.debug('Updating blacklist {}...'.format(elements[1]))
                db.blacklists.update(
                    {'regex': elements[1]},
                    {
                        '$setOnInsert': {
                            'status': 0
                        },
                        '$set': {
                            'group_name': elements[0],
                            'regex': elements[1],
                            'description': elements[3],
                        }
                    },
                    upsert=True
                )
        return True
    else:
        log.error('No blacklist update url in config.')
        return False
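# Hedged illustration of the input format update_blacklist() parses: four fields
# separated by double tabs, of which the third is not used by the code above.
# The field values are invented for illustration.
example_line = 'alt.binaries.example\t\t/sample\\.regex/i\t\tmisc\t\tExample description'
group_name, blacklist_regex, _unused, description = example_line.split('\t\t')
# only lines that split into exactly four fields are upserted, keyed on the regex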
def rename_pre_releases():
    count = 0

    with db_session() as db:
        query = db.query(Release).filter(Release.pre_id != None)
        query = query.outerjoin(Pre, Pre.id == Release.pre_id).filter(
            (Release.name != Pre.name) | (Release.search_name != Pre.searchname))

        for release in query.all():
            # keep the old name for logging before it gets overwritten
            old_search_name = release.search_name

            release.name = release.pre.name
            release.search_name = release.pre.searchname
            release.category_id = pynab.categories.determine_category(
                release.search_name, release.group.name)

            db.merge(release)
            count += 1

            log.info('rename: [{}] -> [{}]'.format(old_search_name, release.pre.searchname))

        db.commit()

    log.info('rename: successfully renamed {} releases'.format(count))
def process():
    # process binaries
    log.info('scan: processing binaries...')
    pynab.binaries.process()

    # process releases
    log.info('scan: processing releases...')
    pynab.releases.process()
def start(self):
    log.info("nabbot: xmpp bot started")
    if self.xmpp.connect():
        self.xmpp.process(block=False)  # pynab.xmpp is started in its own thread
        # self.create_nodes()  # autocreate is set, no need to pre-populate
        self.handle_queue()
    else:
        log.error("nabbot: client didn't connect.")
def publish(self, guid, name, catid):
    categories = self.get_categories()
    data = "<name>{}</name><guid>{}</guid>".format(escape(name), guid)
    log.info("nabbot: publishing {} to {}[{}] at {}".format(data, categories[catid], catid, datetime.now()))

    try:
        self.xmpp.publish(str(catid), data)
    except Exception as e:
        # publishing is best-effort; don't let xmpp errors kill the caller
        log.debug("nabbot: publish failed: {}".format(e))
def main():
    channel = config.prebot.get("channel")
    nick = config.prebot.get("nick")
    server = config.prebot.get("server")
    port = config.prebot.get("port")

    log.info("Pre: Bot Nick - {}".format(nick))

    bot = TestBot(channel, nick, server, port)
    bot.start()
def get_details(id):
    log.info("Retrieving movie details for {}...".format(id))
    r = requests.get(OMDB_DETAIL_URL + id)
    data = r.json()

    # OMDB always includes a "Response" key; it's the string "False" (plus an "Error"
    # message) when the lookup failed, so check its value rather than its presence,
    # otherwise the imdbID access below raises a KeyError on failed lookups.
    if data.get("Response") == "True":
        imdb = {
            "_id": data["imdbID"],
            "title": data["Title"],
            "year": data["Year"],
            "genre": data["Genre"].split(",")
        }
        return imdb
    else:
        return None
def process(limit=None, category=0): """Process releases for SFV parts and download them.""" with Server() as server: with db_session() as db: # noinspection PyComparisonWithNone,PyComparisonWithNone query = db.query(Release).join(Group).join(NZB).filter(Release.sfv == None).filter( Release.sfv_metablack_id == None) if category: query = query.filter(Release.category_id == int(category)) if limit: releases = query.order_by(Release.posted.desc()).limit(limit) else: releases = query.order_by(Release.posted.desc()).all() for release in releases: found = False nzb = pynab.nzbs.get_nzb_details(release.nzb) if nzb: sfvs = [] for sfv in nzb['sfvs']: for part in sfv['segments']: if int(part['size']) > SFV_MAX_FILESIZE: continue sfvs.append(part) for sfv in sfvs: try: article = server.get(release.group.name, [sfv['message_id'], ]) except: article = None if article: data = gzip.compress(article.encode('utf-8')) sfv = SFV(data=data) db.add(sfv) release.sfv = sfv release.sfv_metablack_id = None db.add(release) log.info('sfv: [{}] - sfv added'.format( release.search_name )) found = True break if not found: log.debug('sfv: [{}] - no sfvs in release'.format( release.search_name )) mb = MetaBlack(sfv=release, status='IMPOSSIBLE') db.add(mb) db.commit()
def process(limit=None, category=0): """Processes release rarfiles to check for passwords and filecounts.""" with Server() as server: with db_session() as db: # noinspection PyComparisonWithNone query = db.query(Release).join(Group).join(NZB).filter(~Release.files.any()). \ filter(Release.passworded == 'UNKNOWN').filter(Release.rar_metablack_id == None) if category: query = query.filter(Release.category_id == int(category)) if limit: releases = query.order_by(Release.posted.desc()).limit(limit) else: releases = query.order_by(Release.posted.desc()).all() for release in releases: log.debug('rar: processing {}'.format(release.search_name)) nzb = pynab.nzbs.get_nzb_details(release.nzb) if nzb and nzb['rars']: try: passworded, info = check_release_files(server, release.group.name, nzb) except Exception as e: # if usenet isn't accessible, we don't want to blacklist it log.error('rar: file info failed: {}'.format(e)) continue if info: log.info('rar: file info add [{}]'.format( release.search_name )) release.passworded = passworded size = 0 for file in info: f = File(name=file['name'][:512], size=file['size']) f.release = release size += file['size'] db.add(f) if size != 0: release.size = size release.rar_metablack_id = None db.add(release) db.commit() continue log.debug('rar: [{}] - file info: no readable rars in release'.format( release.search_name )) mb = MetaBlack(rar=release, status='IMPOSSIBLE') db.add(mb) db.commit()
def rename_bad_releases(category): count = 0 s_count = 0 for_deletion = [] with db_session() as db: # noinspection PyComparisonWithNone,PyComparisonWithNone,PyComparisonWithNone,PyComparisonWithNone query = db.query(Release).filter(Release.category_id==int(category)).filter( (Release.files.any())|(Release.nfo_id!=None)|(Release.sfv_id!=None)|(Release.pre_id!=None) ).filter((Release.status!=1)|(Release.status==None)).filter(Release.unwanted==False) for release in windowed_query(query, Release.id, config.scan.get('binary_process_chunk_size', 1000)): count += 1 name, category_id = pynab.releases.discover_name(release) if not name and category_id: # don't change the name, but the category might need changing release.category_id = category_id # we're done with this release release.status = 1 db.merge(release) elif name and category_id: # only add it if it doesn't exist already existing = db.query(Release).filter(Release.name==name, Release.group_id==release.group_id, Release.posted==release.posted).first() if existing: # if it does, delete this one for_deletion.append(release.id) db.expunge(release) else: # we found a new name! s_count += 1 release.name = name release.search_name = pynab.releases.clean_release_name(name) release.category_id = category_id # we're done with this release release.status = 1 db.merge(release) else: # nein release.status = 0 release.unwanted = True db.commit() if for_deletion: deleted = db.query(Release).filter(Release.id.in_(for_deletion)).delete(synchronize_session=False) else: deleted = 0 log.info('rename: successfully renamed {} of {} releases and deleted {} duplicates'.format(s_count, count, deleted))
def process(limit=5, category=0): """Process releases for NFO parts and download them.""" log.info('Checking for NFO segments...') with Server() as server: query = {'nfo': None} if category: query['category._id'] = int(category) for release in db.releases.find(query).limit(limit).sort('posted', pymongo.DESCENDING).batch_size(50): log.debug('Checking for NFO in {}...'.format(release['search_name'])) nzb = pynab.nzbs.get_nzb_dict(release['nzb']) if nzb: nfos = [] if nzb['nfos']: for nfo in nzb['nfos']: if not isinstance(nfo['segments']['segment'], list): nfo['segments']['segment'] = [nfo['segments']['segment'], ] for part in nfo['segments']['segment']: if int(part['@bytes']) > NFO_MAX_FILESIZE: continue nfos.append(part) if nfos: for nfo in nfos: try: article = server.get(release['group']['name'], [nfo['#text'], ]) except: article = None if article: data = gzip.compress(article.encode('utf-8')) nfo_file = fs.put(data, filename='.'.join([release['name'], 'nfo', 'gz'])) if nfo_file: db.releases.update({'_id': release['_id']}, { '$set': { 'nfo': nfo_file } }) log.info('Grabbed and saved NFO for: {}'.format(release['name'])) break else: log.debug('Error retrieving NFO.') continue else: log.debug('No NFOs found in this release.') db.releases.update({'_id': release['_id']}, { '$set': { 'nfo': False } })
def process_release(release, online=True): name, year = parse_movie(release['search_name']) if name and year: method = 'local' imdb = db.imdb.find_one({'name': clean_name(name), 'year': year}) if not imdb and online: method = 'online' movie = search(clean_name(name), year) if movie and movie['Type'] == 'movie': db.imdb.update({ '_id': movie['imdbID'] }, {'$set': { 'name': movie['Title'], 'year': movie['Year'] }}, upsert=True) imdb = db.imdb.find_one({'_id': movie['imdbID']}) if imdb: log.info('[{}] - [{}] - imdb added: {}'.format( release['_id'], release['search_name'], method)) db.releases.update({ '_id': release['_id'] }, {'$set': { 'imdb': imdb }}) elif not imdb and online: log.warning('[{}] - [{}] - imdb not found: online'.format( release['_id'], release['search_name'])) db.releases.update({ '_id': release['_id'] }, { '$set': { 'imdb': { 'attempted': datetime.datetime.now(pytz.utc) } } }) else: log.warning('[{}] - [{}] - imdb not found: local'.format( release['_id'], release['search_name'])) else: log.error( '[{}] - [{}] - imdb not found: no suitable regex for movie name'. format(release['_id'], release['search_name'])) db.releases.update({ '_id': release['_id'] }, {'$set': { 'imdb': { 'possible': False } }})
def scan_missing_segments(group_name): """Scan for previously missed segments.""" log.info('missing: checking for missed segments') with db_session() as db: # recheck for anything to delete expired = db.query(Miss).filter( Miss.attempts >= config.scan.get('miss_retry_limit')).filter( Miss.group_name == group_name).delete() db.commit() if expired: log.info('missing: deleted {} expired misses'.format(expired)) # get missing articles for this group missing_messages = [ r for r, in db.query(Miss.message).filter( Miss.group_name == group_name).all() ] if missing_messages: # mash it into ranges missing_ranges = intspan(missing_messages).ranges() server = Server() server.connect() status, parts, messages, missed = server.scan( group_name, message_ranges=missing_ranges) # if we got some missing parts, save them if parts: pynab.parts.save_all(parts) # even if they got blacklisted, delete the ones we got from the misses if messages: db.query(Miss).filter(Miss.message.in_(messages)).filter( Miss.group_name == group_name).delete(False) db.commit() if missed: # clear up those we didn't get save_missing_segments(group_name, missed) if server.connection: try: server.connection.quit() except: pass
def save_and_clear(binaries=None, parts=None):
    """Helper function to save a set of binaries and delete associated parts from the DB.

    This is a lot faster than Newznab's part deletion, which routinely took 10+ hours
    on my server. Turns out MySQL kinda sucks at deleting lots of shit.
    If we need more speed, move the parts away and drop the temporary table instead.
    """
    log.info('Saving discovered binaries...')
    for binary in binaries.values():
        save(binary)

    if parts:
        log.info('Removing parts that were either packaged or terrible...')
        db.parts.remove({'_id': {'$in': parts}})
def create(email):
    """Creates a user by email with a random API key."""
    log.info('Creating user {}...'.format(email))

    api_key = hashlib.md5(uuid.uuid4().bytes).hexdigest()

    user = {
        'email': email,
        'api_key': api_key,
        'grabs': 0
    }

    db.users.update({'email': email}, user, upsert=True)

    return api_key
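# Hedged usage example: because of the upsert on email, calling create() again for
# the same (placeholder) address simply issues that user a fresh API key.
api_key = create('someone@example.com')
print('api key for someone@example.com: {}'.format(api_key))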
def nzedbirc(unformattedPre):
    formattedPre = parseNzedbirc(unformattedPre)

    with db_session() as db:
        p = db.query(Pre).filter(Pre.name == formattedPre['name']).first()

        if not p:
            p = Pre(**formattedPre)
        else:
            for k, v in formattedPre.items():
                setattr(p, k, v)

        try:
            db.add(p)
            log.info("pre: Inserted/Updated - {}".format(formattedPre["name"]))
        except Exception as e:
            log.debug("pre: Error - {}".format(e))
def save_missing_segments(group_name, missing_segments): """Handles any missing segments by mashing them into ranges and saving them to the db for later checking.""" with db_session() as db: # we don't want to get the whole db's worth of segments # just get the ones in the range we need first, last = min(missing_segments), max(missing_segments) # get previously-missed parts previous_misses = [r for r, in db.query(Miss.message).filter(Miss.message >= first).filter(Miss.message <= last).filter( Miss.group_name == group_name).all()] # find any messages we're trying to get again repeats = list(set(previous_misses) & set(missing_segments)) # update the repeats to include the new attempt if repeats: stmt = Miss.__table__.update().where( Miss.__table__.c.message == bindparam('m') ).values( attempts=Miss.__table__.c.attempts + 1 ) db.execute(stmt, [{'m': m} for m in repeats if m]) # subtract the repeats from our new list new_misses = list(set(missing_segments) - set(repeats)) # batch-insert the missing messages if new_misses: db.execute(Miss.__table__.insert(), [ { 'message': m, 'group_name': group_name, 'attempts': 1 } for m in new_misses ]) # delete anything that's been attempted enough expired = db.query(Miss).filter(Miss.attempts >= config.scan.get('miss_retry_limit')).filter( Miss.group_name == group_name).delete() db.commit() log.info('missing: saved {} misses and deleted {} expired misses'.format(len(new_misses), expired))
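# A small worked example (made-up article numbers) of the set arithmetic
# save_missing_segments() relies on: anything already recorded as missed gets its
# attempt counter bumped, everything else is inserted as a fresh miss.
previous_misses = {101, 105, 110}
missing_segments = {105, 110, 120, 121}

repeats = list(set(previous_misses) & set(missing_segments))   # {105, 110} -> attempts + 1
new_misses = list(set(missing_segments) - set(repeats))        # {120, 121} -> inserted with attempts=1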
def check_single_category(name, category):
    """Check release against a single category."""
    log.info('checking {}'.format(category))

    for regex in category_regex[category]:
        if isinstance(regex, collections.Mapping):
            # a mapping of {compiled_regex: expected_bool} - every expectation must hold
            if all(bool(expr.search(name)) == expected for expr, expected in regex.items()):
                return True
        elif isinstance(regex, tuple):
            # a (compiled_regex, category_id) pair - a match returns that category id
            (r, ret) = regex
            if r.search(name) is not None:
                return ret
        else:
            # a plain compiled regex - a match just means "yes, this category"
            if regex.search(name) is not None:
                return True
    return False
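# Hedged illustration of the three entry shapes check_single_category() handles in
# category_regex. The patterns and the 6000 id are invented placeholders, not the
# real pynab category tables.
import regex

example_category_entries = [
    regex.compile(r'\b(720|1080)p\b', regex.I),        # plain regex: a match means True
    (regex.compile(r'\bxxx\b', regex.I), 6000),        # (regex, category_id): a match returns that id
    {                                                  # mapping of regex -> expected result:
        regex.compile(r'\bHDTV\b', regex.I): True,     # every expectation must hold for a match
        regex.compile(r'\bsample\b', regex.I): False,
    },
]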
def save_all(parts):
    """Save a set of parts to the DB, in a batch if possible."""
    log.info('Saving collected segments and parts...')

    # if possible, do a quick batch insert
    # rarely possible!
    # TODO: filter this more - batch import if first set in group?
    try:
        if db.parts.count() == 0:
            db.parts.insert([value for key, value in parts.items()])
            return True
        else:
            # otherwise, it's going to be slow
            for key, part in parts.items():
                save(part)
            return True
    except pymongo.errors.PyMongoError as e:
        log.error('Could not write parts to db: {0}'.format(e))
        return False
def orlydb(name, search_name):
    # BeautifulSoup is required
    try:
        from bs4 import BeautifulSoup
    except ImportError:
        log.error("BeautifulSoup is required to use orlydb scraping: pip install beautifulsoup4")
        return False

    try:
        preHTML = requests.get('http://orlydb.com/?q={}'.format(search_name))
    except:
        log.debug("Error connecting to orlydb")
        return False

    # requests responses expose the body as .text, not .read()
    soup = BeautifulSoup(preHTML.text)
    releases = soup.find(id="releases").findAll("div")

    rlsDict = {}
    rlsname = None
    for rls in releases:
        # Try/except used to filter out None types
        # pretime left as may be used later
        try:
            rlsname = rls.find("span", {"class": "release"}).get_text()
            # pretime = rls.find("span", {"class": "timestamp"}).get_text()
            category = rls.find("span", {"class": "section"}).find("a").get_text()

            # If the release matches what is passed, return the category in a dict.
            # This could be a problem if 2 pre's have the same name but different
            # categories, but chances are slim.
            if rlsname == name:
                rlsDict["category"] = category
        except Exception as e:
            log.debug("Error parsing orlydb response: {}".format(e))
            return False

    if rlsDict:
        log.info("Orlydb pre found: {}".format(rlsname))
        return rlsDict
    else:
        return False
def strip_req(release):
    """Strips REQ IDs out of releases and cleans them up so they can be properly matched
    in post-processing."""
    regexes = [
        regex.compile(r'^a\.b\.mmEFNet - REQ (?P<reqid>.+) - (?P<name>.*)', regex.I)
    ]

    for r in regexes:
        result = r.search(release['search_name'])
        if result:
            result_dict = result.groupdict()
            if 'name' in result_dict and 'reqid' in result_dict:
                log.info('Found request {}, storing req_id and renaming...'.format(result_dict['name']))
                db.releases.update({'_id': release['_id']}, {
                    '$set': {
                        'search_name': result_dict['name'],
                        'req_id': result_dict['reqid']
                    }
                })
                return
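# Hedged example of the kind of search_name strip_req() rewrites; the request id and
# release title below are placeholders chosen to match the regex above.
import regex

demo = regex.search(r'^a\.b\.mmEFNet - REQ (?P<reqid>.+) - (?P<name>.*)',
                    'a.b.mmEFNet - REQ 12345 - Some.Release.Name-GROUP', regex.I)
# demo.groupdict() == {'reqid': '12345', 'name': 'Some.Release.Name-GROUP'}
# so the release would be renamed to 'Some.Release.Name-GROUP' with req_id '12345'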
def rename_bad_releases(category): count = 0 s_count = 0 for release in db.releases.find({'category._id': int(category), 'unwanted': {'$ne': True}, '$or': [{'nfo': {'$nin': [None, False]}}, {'files.count': {'$exists': True}}]}): count += 1 name, category_id = pynab.releases.discover_name(release) if name and not category_id: # don't change anything, it was fine pass elif name and category_id: # we found a new name! s_count += 1 category = db.categories.find_one({'_id': category_id}) category['parent'] = db.categories.find_one({'_id': category['parent_id']}) db.releases.update({'_id': release['_id']}, { '$set': { 'search_name': pynab.releases.clean_release_name(name), 'category': category, } } ) else: # bad release! log.info('Noting unwanted release {} ({:d})...'.format( release['search_name'], release['category']['_id'], )) db.releases.update({'_id': release['_id']}, { '$set': { 'unwanted': True } } ) log.info('rename: successfully renamed {} of {} releases'.format(s_count, count))
def vacuum(mode='scan', full=False): conn = engine.connect() if 'postgre' in config.db.get('engine'): conn.connection.connection.set_isolation_level( psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT) if mode == 'scan': if full: conn.execute('VACUUM FULL ANALYZE binaries') conn.execute('VACUUM FULL ANALYZE parts') conn.execute('VACUUM FULL ANALYZE segments') else: conn.execute('VACUUM ANALYZE binaries') conn.execute('VACUUM ANALYZE parts') conn.execute('VACUUM ANALYZE segments') else: if full: conn.execute('VACUUM FULL ANALYZE releases') conn.execute('VACUUM FULL ANALYZE metablack') conn.execute('VACUUM FULL ANALYZE episodes') conn.execute('VACUUM FULL ANALYZE tvshows') conn.execute('VACUUM FULL ANALYZE movies') conn.execute('VACUUM FULL ANALYZE nfos') conn.execute('VACUUM FULL ANALYZE sfvs') conn.execute('VACUUM FULL ANALYZE files') else: conn.execute('VACUUM ANALYZE releases') conn.execute('VACUUM ANALYZE metablack') conn.execute('VACUUM ANALYZE episodes') conn.execute('VACUUM ANALYZE tvshows') conn.execute('VACUUM ANALYZE movies') conn.execute('VACUUM ANALYZE nfos') conn.execute('VACUUM ANALYZE sfvs') conn.execute('VACUUM ANALYZE files') elif 'mysql' in config.db.get('engine'): log.info( 'db: not optimising or analysing innodb tables, do it yourself.') pass conn.close()
def connect(self):
    """Creates a connection to a news server."""
    log.info('Attempting to connect to news server...')

    # copy the config so popping 'ssl' doesn't mutate the shared config.news dict
    # (popping it directly meant a reconnect silently fell back to plain NNTP)
    news_config = dict(config.news)
    ssl = news_config.pop('ssl', False)

    # TODO: work out how to enable compression (no library support?)
    try:
        if ssl:
            self.connection = nntplib.NNTP_SSL(**news_config)
        else:
            self.connection = nntplib.NNTP(**news_config)
    except Exception as e:
        # nntplib sometimes throws EOFError instead of NNTPError, so catch everything
        log.error('Could not connect to news server: ' + str(e))
        return False

    log.info('Connected!')
    return True
def process(limit=100, online=True): """Process movies without imdb data and append said data.""" log.info('Processing movies to add IMDB data...') expiry = datetime.datetime.now(pytz.utc) - datetime.timedelta(config.site['fetch_blacklist_duration']) query = { 'imdb._id': {'$exists': False}, 'category.parent_id': 2000, } if online: query.update({ 'imdb.possible': {'$exists': False}, '$or': [ {'imdb.attempted': {'$exists': False}}, {'imdb.attempted': {'$lte': expiry}} ] }) for release in db.releases.find(query).limit(limit): process_release(release, online)
def discover_name(release): """Attempts to fix a release name by nfo or filelist.""" potential_names = [release['search_name'],] if 'files' in release: potential_names += names_from_files(release) if release['nfo']: potential_names += names_from_nfos(release) if len(potential_names) > 1: old_category = release['category']['_id'] calculated_old_category = pynab.categories.determine_category(release['search_name']) for name in potential_names: new_category = pynab.categories.determine_category(name) # the release may already be categorised by the group it came from # so if we check the name and it doesn't fit a category, it's probably # a shitty name if (math.floor(calculated_old_category / 1000) * 1000) == pynab.categories.CAT_PARENT_MISC: # sometimes the group categorisation is better than name-based # so check if they're in the same parent and that parent isn't misc if (math.floor(new_category / 1000) * 1000) == pynab.categories.CAT_PARENT_MISC: # ignore this name, since it's apparently gibberish continue else: if (math.floor(new_category / 1000) * 1000) == (math.floor(old_category / 1000) * 1000)\ or (math.floor(old_category / 1000) * 1000) == pynab.categories.CAT_PARENT_MISC: # if they're the same parent, use the new category # or, if the old category was misc>other, fix it search_name = name category_id = new_category log.info('release: [{}] - [{}] - rename: {} ({} -> {} -> {})'.format( release['_id'], release['search_name'], search_name, old_category, calculated_old_category, category_id )) return search_name, category_id else: # if they're not the same parent and they're not misc, ignore continue else: # the old name was apparently fine log.info('release: [{}] - [{}] - old name was fine'.format( release['_id'], release['search_name'] )) return True, False log.info('release: [{}] - [{}] - no good name candidates'.format( release['_id'], release['search_name'] )) return None, None
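# A small worked example of the parent-category arithmetic used above: flooring to the
# nearest thousand maps a child category onto its parent, so two candidate names can be
# compared at the parent level. The ids are placeholders in newznab-style ranges.
import math

child_category = 5040                                        # e.g. a TV child category
parent = int(math.floor(child_category / 1000) * 1000)       # -> 5000, the parent
same_parent = int(math.floor(5030 / 1000) * 1000) == parent  # True: both sit under 5000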
def process(limit=None): """Process releases for requests""" with db_session() as db: requests = {} for group, reg in GROUP_REQUEST_REGEXES.items(): # noinspection PyComparisonWithNone query = db.query(Release).join(Group).filter(Group.name==group).filter(Release.pre_id == None).\ filter(Release.category_id == '8010').filter("releases.name ~ '{}'".format(reg)) for release in windowed_query( query, Release.id, config.scan.get('binary_process_chunk_size')): # check if it's aliased if release.group.name in GROUP_ALIASES: group_name = GROUP_ALIASES[release.group.name] else: group_name = release.group.name if group_name not in requests: requests[group_name] = {} result = regex.search(reg, release.name) if result: requests[group_name][result.group(0)] = release else: log.info("requests: no release requests to process") # per-group for group_name, group_requests in requests.items(): # query for the requestids if requests: pres = db.query(Pre).filter( Pre.requestgroup == group_name).filter( Pre.requestid.in_(group_requests.keys())).all() else: log.info("requests: no pre requests found") pres = [] # loop through and associate pres with their requests for pre in pres: # no longer need to check group updated_release = group_requests.get(str(pre.requestid)) updated_release.pre_id = pre.id db.merge(updated_release) log.info( "requests: found pre request id {} ({}) for {}".format( pre.requestid, group_name, updated_release.name)) db.commit()
parser = argparse.ArgumentParser( description= 'Recursively import NZBs into Pynab. NOTE: DESTRUCTIVE. Will delete NZB upon successful import. Don\'t run it on a directory you may need to use again.' ) parser.add_argument('directory') if __name__ == '__main__': args = parser.parse_args() print( 'NOTE: DESTRUCTIVE. Will delete NZB upon successful import. Don\'t run it on a directory you may need to use again.' ) input('To continue, press enter. To exit, press ctrl-c.') for root, dirs, files in os.walk(args.directory): for name in files: print('Importing {0}...'.format(os.path.join(root, name))) try: if pynab.nzbs.import_nzb_file(os.path.join(root, name)): os.remove(os.path.join(root, name)) except Exception as e: log.error(str(e)) continue log.info( 'Import completed. Running scripts/recategorise_everything.py to fix release categories...' ) scripts.recategorise_everything.recategorise() log.info('Completed.')
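# Hedged usage note for the import script above: it takes a single directory argument
# on the command line (the script name here is assumed for illustration):
#
#   python scripts/import.py /path/to/nzb/dump
#
# It walks the tree, imports each NZB, deletes files that imported successfully,
# then recategorises all releases.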
def process(type, interfaces=None, limit=None, online=True): """ Process ID fetching for releases. :param type: tv/movie :param interfaces: interfaces to use or None will use all :param limit: optional limit :param online: whether to check online apis :return: """ expiry = datetime.datetime.now(pytz.utc) - datetime.timedelta( config.postprocess.get('fetch_blacklist_duration', 7)) with db_session() as db: # noinspection PyComparisonWithNone,PyComparisonWithNone db.query(MetaBlack).filter((MetaBlack.movie != None) | (MetaBlack.tvshow != None)).filter( MetaBlack.time <= expiry).delete( synchronize_session='fetch') if type == 'movie': # noinspection PyComparisonWithNone query = db.query(Release).filter( Release.movie == None).join(Category).filter( Category.parent_id == 2000) if online: # noinspection PyComparisonWithNone query = query.filter(Release.movie_metablack_id == None) elif type == 'tv': # noinspection PyComparisonWithNone query = db.query(Release).filter( Release.tvshow == None).join(Category).filter( Category.parent_id == 5000) if online: # noinspection PyComparisonWithNone query = query.filter(Release.tvshow_metablack_id == None) else: raise Exception('wrong release type') query = query.order_by(Release.posted.desc()) if limit: releases = query.limit(limit) else: releases = windowed_query( query, Release.id, config.scan.get('binary_process_chunk_size')) if type == 'movie': parse_func = parse_movie iface_list = MOVIE_INTERFACES obj_class = Movie attr = 'movie' def extract_func(data): return { 'name': data.get('name'), 'genre': data.get('genre', None), 'year': data.get('year', None) } elif type == 'tv': parse_func = parse_tv iface_list = TV_INTERFACES obj_class = TvShow attr = 'tvshow' def extract_func(data): return { 'name': data.get('name'), 'country': data.get('country', None) } else: raise Exception('wrong release type') for release in releases: method = 'local' data = parse_func(release.search_name) if data: if type == 'movie': q = db.query(Movie).filter( Movie.name.ilike('%'.join( clean_name(data['name']).split(' ')))).filter( Movie.year == data['year']) elif type == 'tv': q = db.query(TvShow).filter( TvShow.name.ilike('%'.join( clean_name(data['name']).split(' ')))) else: q = None entity = q.first() if not entity and online: method = 'online' ids = {} for iface in iface_list: if interfaces and iface.NAME not in interfaces: continue exists = q.join(DBID).filter( DBID.db == iface.NAME).first() if not exists: id = iface.search(data) if id: ids[iface.NAME] = id if ids: entity = obj_class(**extract_func(data)) db.add(entity) for interface_name, id in ids.items(): i = DBID() i.db = interface_name i.db_id = id setattr(i, attr, entity) db.add(i) if entity: log.info('{}: [{}] - [{}] - data added: {}'.format( attr, release.id, release.search_name, method)) if type == 'tv': # episode processing ep = db.query(Episode).filter( Episode.tvshow_id == entity.id).filter( Episode.series_full == data['series_full']).first() if not ep: ep = Episode(season=data.get('season'), episode=data.get('episode'), series_full=data.get('series_full'), air_date=data.get('air_date'), year=data.get('year'), tvshow=entity) release.episode = ep setattr(release, attr, entity) db.add(release) else: log.info('{}: [{}] - data not found: {}'.format( attr, release.search_name, method)) if online: mb = MetaBlack(status='ATTEMPTED') setattr(mb, attr, release) db.add(mb) else: log.info( '{}: [{}] - {} data not found: no suitable regex for {} name' .format(attr, release.id, release.search_name, attr)) mb = 
MetaBlack(status='IMPOSSIBLE') setattr(mb, attr, release) db.add(mb) db.add( DataLog(description='parse_{} regex'.format(attr), data=release.search_name)) db.commit() if method != 'local': time.sleep(1)
def scan(group_name, direction='forward', date=None, target=None, limit=None): log.info('group: {}: scanning group'.format(group_name)) with Server() as server: _, count, first, last, _ = server.group(group_name) if count: with db_session() as db: group = db.query(Group).filter( Group.name == group_name).first() if group: # sort out missing first/lasts if not group.first and not group.last: group.first = last group.last = last direction = 'backward' elif not group.first: group.first = group.last elif not group.last: group.last = group.first # check that our firsts and lasts are valid if group.first < first: log.error( 'group: {}: first article was older than first on server' .format(group_name)) return True elif group.last > last: log.error( 'group: {}: last article was newer than last on server' .format(group_name)) return True db.merge(group) # sort out a target start = 0 mult = 0 if direction == 'forward': start = group.last target = last mult = 1 elif direction == 'backward': start = group.first if not target: target = server.day_to_post( group_name, server.days_old(date) if date else config.scan.get('backfill_days', 10)) mult = -1 if not target: log.info( 'group: {}: unable to continue'.format(group_name)) return True if group.first <= target <= group.last: log.info( 'group: {}: nothing to do, already have target'. format(group_name)) return True if first > target or last < target: log.error( 'group: {}: server doesn\'t carry target article'. format(group_name)) return True iterations = 0 num = config.scan.get('message_scan_limit') * mult for i in range(start, target, num): # set the beginning and ends of the scan to their respective values begin = i + mult end = i + (mult * config.scan.get('message_scan_limit')) # check if the target is before our end if abs(begin) <= abs(target) <= abs(end): # we don't want to overscan end = target # at this point, we care about order # flip them if one is bigger begin, end = (begin, end) if begin < end else (end, begin) status, parts, messages, missed = server.scan( group_name, first=begin, last=end) try: if direction == 'forward': group.last = max(messages) elif direction == 'backward': group.first = min(messages) except: log.error( 'group: {}: problem updating group ({}-{})'. format(group_name, start, end)) return False # don't save misses if we're backfilling, there are too many if status and missed and config.scan.get( 'retry_missed') and direction == 'forward': save_missing_segments(group_name, missed) if status and parts: if pynab.parts.save_all(parts): db.merge(group) db.commit() else: log.error( 'group: {}: problem saving parts to db, restarting scan' .format(group_name)) return False to_go = abs(target - end) log.info( 'group: {}: {:.0f} iterations ({} messages) to go'. format( group_name, to_go / config.scan.get('message_scan_limit'), to_go)) parts.clear() del messages[:] del missed[:] iterations += 1 if limit and iterations >= 3: #* config.scan.get('message_scan_limit') >= limit: log.info( 'group: {}: scan limit reached, ending early (will continue later)' .format(group_name)) return False log.info('group: {}: scan completed'.format(group_name)) return True
def build_header():
    """Generate a header string."""
    return '{:^21}|{:^21}|{:^21}|{:^21}'.format('Parts', 'Binaries', 'Releases', 'Other-Misc Releases')


if __name__ == '__main__':
    log_init('stats', '%(message)s')

    colorama.init()

    config_time = os.stat(config.__file__).st_mtime

    logging_dir = config.log.get('logging_dir')
    csv_path = os.path.join(logging_dir, 'stats.csv')

    log.info(build_header())

    i = 1
    first = True
    last_parts = 0
    last_binaries = 0
    last_releases = 0
    last_others = 0

    while True:
        parts, binaries, releases, others = get_stats()

        if not first:
            p_diff = parts - last_parts
            b_diff = binaries - last_binaries
def update_regex(): """Check for NN+ regex update and load them into db.""" with db_session() as db: regex_type = config.postprocess.get('regex_type') regex_url = config.postprocess.get('regex_url') if regex_url: regexes = {} response = requests.get(regex_url) lines = response.text.splitlines() # get the revision or headers by itself first_line = lines.pop(0) if regex_type == 'nzedb': for line in lines: try: id, group, reg, status, desc, ordinal = tuple( line.split('\t')) except ValueError: # broken line continue regexes[int(id)] = { 'id': int(id), 'group_name': group.replace('^', '').replace('\\', '').replace('$', ''), 'regex': reg.replace('\\\\', '\\'), 'ordinal': ordinal, 'status': bool(status), 'description': desc[:255] } else: revision = regex.search('\$Rev: (\d+) \$', first_line) if revision: revision = int(revision.group(1)) log.info('Regex at revision: {:d}'.format(revision)) # and parse the rest of the lines, since they're an sql dump for line in lines: reg = regex.search( '\((\d+), \'(.*)\', \'(.*)\', (\d+), (\d+), (.*), (.*)\);$', line) if reg: try: if reg.group(6) == 'NULL': description = '' else: description = reg.group(6).replace('\'', '') regexes[int(reg.group(1))] = { 'id': int(reg.group(1)), 'group_name': reg.group(2), 'regex': reg.group(3).replace('\\\\', '\\'), 'ordinal': int(reg.group(4)), 'status': bool(reg.group(5)), 'description': description } except: log.error('Problem importing regex dump.') return False # if the parsing actually worked if len(regexes) > 0: db.query(Regex).filter(Regex.id < 100000).delete() log.info('Retrieved {:d} regexes.'.format(len(regexes))) ids = [] regexes = modify_regex(regexes, regex_type) for reg in regexes.values(): r = Regex(**reg) ids.append(r.id) db.merge(r) log.info('Added/modified {:d} regexes.'.format(len(regexes))) # add pynab regex for reg in regex_data.additions: r = Regex(**reg) db.merge(r) log.info('Added/modified {:d} Pynab regexes.'.format( len(regex_data.additions))) db.commit() return True else: log.error( 'No config item set for regex_url - do you own newznab plus?') return False
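# Hedged illustration of a tab-separated nzedb regex row as update_regex() parses it
# (values invented): id, group, regex, status, description, ordinal.
example_row = '\t'.join([
    '1001',                        # id
    '^alt\\.binaries\\.teevee$',   # group (the ^ \ $ characters get stripped)
    '/^(?P<name>.+?)\\.nzb/i',     # regex (double backslashes collapsed to single)
    '1',                           # status
    'example tv regex',            # description
    '90',                          # ordinal
])
id, group, reg, status, desc, ordinal = tuple(example_row.split('\t'))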
def stop(self):
    self.xmpp.disconnect()
    log.info("nabbot: client disconnected.")
def day_to_post(self, group_name, days): """Converts a datetime to approximate article number for the specified group.""" self.connect() log.info('server: {}: finding post {} days old...'.format(group_name, days)) try: with nntp_handler(self, group_name): _, count, first, last, _ = self.connection.group(group_name) except: return None # calculate tolerance if days <= 50: tolerance = 1 elif days <= 100: tolerance = 5 elif days <= 1000: tolerance = 10 else: tolerance = 20 # get first, last and target dates candidate_post = None target_date = datetime.datetime.now(pytz.utc) - datetime.timedelta(days) bottom_date = self.post_date(group_name, first) if not bottom_date: log.error('server: {}: can\'t get first date on group, fatal group error. try again later?'.format( group_name )) return None # check bottom_date if target_date < bottom_date: log.info('server: {}: post was before first available, starting from the beginning'.format( group_name )) return first top_date = self.post_date(group_name, last) if not top_date: log.warning('server: {}: can\'t get first date on group, fatal group error. try again later?'.format( group_name )) return None if target_date > top_date: log.info('server: {}: requested post was newer than most recent, ending'.format(group_name)) return None bottom = first top = last # Keep track of previously seen candidate posts so that we # can adjust and avoid getting into a loop. seen_post = {} # iterative, obviously while True: # do something like a binary search # find the percentage-point of target date between first and last dates # ie. start |-------T---| end = ~70% # so we'd find the post number ~70% through the message count try: target = target_date - bottom_date total = top_date - bottom_date except: log.error('server: {}: nntp server problem while getting first/last article dates'.format( group_name)) return None perc = target.total_seconds() / total.total_seconds() while True: candidate_post = int(abs(bottom + ((top - bottom) * perc))) candidate_date = self.post_date(group_name, candidate_post) if candidate_date: break else: addition = (random.choice([-1, 1]) / 100) * perc if perc + addition > 1.0: perc -= addition elif perc - addition < 0.0: perc += addition else: perc += addition # If we begin to see posts multiple times then we may need to # slide our tolerance out a bit to compensate for holes in posts. if candidate_post in seen_post: tolerance_adjustment = tolerance / 2 log.debug('server: {}: Seen post more than once, increasing tolerance by {} to compensate.'.format(group_name, tolerance_adjustment)) tolerance += tolerance_adjustment else: seen_post[candidate_post] = 1 # tolerance sliding scale, about 0.1% rounded to the nearest day # we don't need a lot of leeway, since this is a lot faster than previously if abs(target_date - candidate_date) < datetime.timedelta(days=tolerance): break if candidate_date > target_date: top = candidate_post top_date = candidate_date else: bottom = candidate_post bottom_date = candidate_date log.debug('server: {}: post {} was {} days old'.format(group_name, candidate_post, Server.days_old(candidate_date))) return candidate_post
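# A small worked example (made-up dates and article numbers) of the interpolation step
# day_to_post() uses: the target date's position between the oldest and newest post
# dates becomes a percentage, and that percentage picks the candidate article number.
import datetime

bottom_date = datetime.datetime(2014, 1, 1)
top_date = datetime.datetime(2014, 1, 31)
target_date = datetime.datetime(2014, 1, 22)

perc = (target_date - bottom_date).total_seconds() / (top_date - bottom_date).total_seconds()  # 0.7
bottom, top = 1000000, 4000000
candidate_post = int(abs(bottom + ((top - bottom) * perc)))   # 3100000, then refined by post dates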
def discover_name(release): """Attempts to fix a release name by nfo, filelist or sfv.""" potential_names = [ release.search_name, ] # base64-decode the name in case it's that try: n = release.name missing_padding = 4 - len(release.name) % 4 if missing_padding: n += '=' * missing_padding n = base64.b64decode(n.encode('utf-8')) potential_names.append(n.decode('utf-8')) except: pass # add a reversed name, too potential_names.append(release.name[::-1]) if release.files: potential_names += names_from_files(release) if release.nfo: potential_names += names_from_nfos(release) if release.sfv: potential_names += names_from_sfvs(release) if release.pre: potential_names.append(release.pre.name) if len(potential_names) > 1: old_category = release.category_id calculated_old_category = pynab.categories.determine_category( release.search_name) for name in potential_names: new_category = pynab.categories.determine_category(name) # the release may already be categorised by the group it came from # so if we check the name and it doesn't fit a category, it's probably # a shitty name if (math.floor(calculated_old_category / 1000) * 1000) == pynab.categories.CAT_PARENT_MISC: # sometimes the group categorisation is better than name-based # so check if they're in the same parent and that parent isn't misc if (math.floor(new_category / 1000) * 1000) == pynab.categories.CAT_PARENT_MISC: # ignore this name, since it's apparently gibberish continue else: if (math.floor(new_category / 1000) * 1000) == (math.floor(old_category / 1000) * 1000) \ or (math.floor(old_category / 1000) * 1000) == pynab.categories.CAT_PARENT_MISC: # if they're the same parent, use the new category # or, if the old category was misc>other, fix it search_name = name category_id = new_category log.info('release: [{}] - rename: {} ({} -> {} -> {})'. format(release.search_name, search_name, old_category, calculated_old_category, category_id)) return search_name, category_id else: # if they're not the same parent and they're not misc, ignore continue else: # the old name was apparently fine log.debug('release: [{}] - old name was fine'.format( release.search_name)) return False, calculated_old_category log.debug('release: no good name candidates [{}]'.format( release.search_name)) return None, None
def process():
    """Helper function to begin processing binaries. Checks for 100% completion
    and will create NZBs/releases for each complete release. Will also categorise
    releases, and delete old binaries."""

    # TODO: optimise query usage in this, it's using like 10-15 per release
    binary_count = 0
    added_count = 0

    if config.scan.get('publish', False):
        request_session = FuturesSession()
    else:
        request_session = None

    start = time.time()

    with db_session() as db:
        binary_query = """
            SELECT
                binaries.id, binaries.name, binaries.posted, binaries.total_parts
            FROM binaries
            INNER JOIN (
                SELECT
                    parts.id, parts.binary_id, parts.total_segments, count(*) as available_segments
                FROM parts
                    INNER JOIN segments ON parts.id = segments.part_id
                GROUP BY parts.id
                ) as parts
                ON binaries.id = parts.binary_id
            GROUP BY binaries.id
            HAVING count(*) >= binaries.total_parts
                AND (sum(parts.available_segments) / sum(parts.total_segments)) * 100 >= {}
            ORDER BY binaries.posted DESC
        """.format(config.postprocess.get('min_completion', 100))

        # pre-cache blacklists and group them
        blacklists = db.query(Blacklist).filter(Blacklist.status == True).all()
        for blacklist in blacklists:
            db.expunge(blacklist)

        # cache categories
        parent_categories = {}
        for category in db.query(Category).all():
            parent_categories[category.id] = category.parent.name if category.parent else category.name

        # for interest's sake, memory usage:
        # 38,000 releases uses 8.9mb of memory here
        # no real need to batch it, since this will mostly be run with
        # < 1000 releases per run
        for completed_binary in engine.execute(binary_query).fetchall():
            # some optimisations here. we used to take the binary id and load it,
            # then compare binary.name and .posted to any releases.
            # in doing so, we loaded the binary into the session.
            # this meant that when we deleted it, it didn't cascade:
            # we had to submit many, many delete queries - one per segment/part.
            # by including name/posted in the big query, we don't load that much data,
            # but it lets us check for a release without another query, and means
            # that we cascade delete when we clear the binary.

            # first we check if the release already exists
            r = db.query(Release).filter(Release.name == completed_binary[1]).filter(
                Release.posted == completed_binary[2]).first()

            if r:
                # if it does, we have a duplicate - delete the binary
                db.query(Binary).filter(Binary.id == completed_binary[0]).delete()
            else:
                # get an approx size for the binary without loading everything
                # if it's a really big file, we want to deal with it differently
                binary = db.query(Binary).filter(Binary.id == completed_binary[0]).first()

                # get the group early for use in uniqhash
                group = db.query(Group).filter(Group.name == binary.group_name).one()

                # check if the uniqhash already exists too
                dupe_release = db.query(Release).filter(
                    Release.uniqhash == _create_hash(binary.name, group.id, binary.posted)).first()
                if dupe_release:
                    db.query(Binary).filter(Binary.id == completed_binary[0]).delete()
                    continue

                # this is an estimate, so it doesn't matter too much
                # 1 part nfo, 1 part sfv or something similar, so ignore two parts
                # take an estimate from the middle parts, since the first/last
                # have a good chance of being something tiny
                # we only care if it's a really big file
                # abs in case it's a 1 part release (abs(1 - 2) = 1)
                # int(/2) works fine (int(1/2) = 0, array is 0-indexed)
                try:
                    est_size = (abs(binary.total_parts - 2) *
                                binary.parts[int(binary.total_parts / 2)].total_segments *
                                binary.parts[int(binary.total_parts / 2)].segments[0].size)
                except IndexError:
                    log.error('release: binary [{}] - couldn\'t estimate size - bad regex: {}?'.format(
                        binary.id, binary.regex_id))
                    continue

                oversized = est_size > config.postprocess.get('max_process_size', 10 * 1024 * 1024 * 1024)

                if oversized and not config.postprocess.get('max_process_anyway', True):
                    log.debug('release: [{}] - removed (oversized)'.format(binary.name))
                    db.query(Binary).filter(Binary.id == completed_binary[0]).delete()
                    db.commit()
                    continue

                if oversized:
                    # for giant binaries, we do it differently:
                    # lazyload the segments in parts and expunge when done.
                    # this way we only have to store binary+parts
                    # and one section of segments at one time
                    binary = db.query(Binary).options(
                        subqueryload('parts'),
                        lazyload('parts.segments'),
                    ).filter(Binary.id == completed_binary[0]).first()
                else:
                    # otherwise, start loading all the binary details
                    binary = db.query(Binary).options(
                        subqueryload('parts'),
                        subqueryload('parts.segments'),
                        Load(Part).load_only(Part.id, Part.subject, Part.segments),
                    ).filter(Binary.id == completed_binary[0]).first()

                blacklisted = False
                for blacklist in blacklists:
                    if regex.search(blacklist.group_name, binary.group_name):
                        # we're operating on binaries, not releases
                        field = 'name' if blacklist.field == 'subject' else blacklist.field
                        if regex.search(blacklist.regex, getattr(binary, field)):
                            log.debug('release: [{}] - removed (blacklisted: {})'.format(
                                binary.name, blacklist.id))
                            db.query(Binary).filter(Binary.id == binary.id).delete()
                            db.commit()
                            blacklisted = True
                            break

                if blacklisted:
                    continue

                binary_count += 1

                release = Release()
                release.name = binary.name
                release.original_name = binary.name
                release.posted = binary.posted
                release.posted_by = binary.posted_by
                release.regex_id = binary.regex_id
                release.grabs = 0

                # binary.size() counts segment sizes, so we can't use it for large releases
                # use the estimate for min_size and firm it up later during postproc
                if oversized:
                    release.size = est_size
                else:
                    release.size = binary.size()

                # check against minimum size for this group
                undersized = False
                for size, groups in config.postprocess.get('min_size', {}).items():
                    if binary.group_name in groups:
                        if release.size < size:
                            undersized = True
                            break

                if undersized:
                    log.debug('release: [{}] - removed (smaller than minimum size for group)'.format(
                        binary.name))
                    db.query(Binary).filter(Binary.id == binary.id).delete()
                    db.commit()
                    continue

                # check to make sure we have over the configured minimum files
                # this one's okay for big releases, since we're only looking at part-level
                rars = []
                rar_count = 0
                zip_count = 0
                nzb_count = 0

                for part in binary.parts:
                    if pynab.nzbs.rar_part_regex.search(part.subject):
                        rar_count += 1
                    if pynab.nzbs.rar_regex.search(part.subject) and \
                            not pynab.nzbs.metadata_regex.search(part.subject):
                        rars.append(part)
                    if pynab.nzbs.zip_regex.search(part.subject) and \
                            not pynab.nzbs.metadata_regex.search(part.subject):
                        zip_count += 1
                    if pynab.nzbs.nzb_regex.search(part.subject):
                        nzb_count += 1

                # handle min_archives: status is one of keep, nzb, under
                status = 'keep'
                archive_rules = config.postprocess.get('min_archives', 1)
                if isinstance(archive_rules, dict):
                    # it's a dict of per-group rules
                    # use a separate variable so we don't clobber the Group object
                    # assigned to the release further down
                    if binary.group_name in archive_rules:
                        group_key = binary.group_name
                    else:
                        group_key = '*'

                    # make sure the catchall exists
                    if group_key not in archive_rules:
                        archive_rules[group_key] = 1

                    # found a special rule
                    if rar_count + zip_count < archive_rules[group_key]:
                        if nzb_count > 0:
                            status = 'nzb'
                        else:
                            status = 'under'
                else:
                    # it's an integer, globalise that shit yo
                    if rar_count + zip_count < archive_rules:
                        if nzb_count > 0:
                            status = 'nzb'
                        else:
                            status = 'under'

                # if it's an nzb or we're under, kill it
                if status in ['nzb', 'under']:
                    if status == 'nzb':
                        log.debug('release: [{}] - removed (nzb only)'.format(binary.name))
                    elif status == 'under':
                        log.debug('release: [{}] - removed (less than minimum archives)'.format(
                            binary.name))
                    db.query(Binary).filter(Binary.id == binary.id).delete()
                    db.commit()
                    continue

                # clean the name for searches
                release.search_name = clean_release_name(binary.name)

                # assign the release group (the Group object fetched earlier)
                release.group = group

                # give the release a category
                release.category_id = pynab.categories.determine_category(binary.name, binary.group_name)

                # create the nzb, store it and link it here
                # no need to do anything special for big releases here:
                # if it's set to lazyload, it'll kill rows as they're used
                # if it's a small release, it'll go straight from memory
                nzb = pynab.nzbs.create(release.search_name, parent_categories[release.category_id], binary)

                if nzb:
                    added_count += 1

                    log.info('release: [{}]: added release ({} rars, {} rarparts)'.format(
                        release.search_name, len(rars), rar_count))

                    release.nzb = nzb

                    # save the release
                    db.add(release)

                    try:
                        db.flush()
                    except Exception:
                        # this sometimes raises if we get a duplicate.
                        # it requires a post of the same name at exactly the same time (down to the second),
                        # which is pretty unlikely, but there we go
                        log.debug('release: [{}]: duplicate release, discarded'.format(release.search_name))
                        db.rollback()

                    # delete processed binaries
                    db.query(Binary).filter(Binary.id == binary.id).delete()

                    # publish processed releases?
                    if config.scan.get('publish', False):
                        # fire-and-forget: we don't wait on these futures
                        futures = [request_session.post(host, data=to_json(release))
                                   for host in config.scan.get('publish_hosts')]

            db.commit()

    end = time.time()
    log.info('release: added {} out of {} binaries in {:.2f}s'.format(
        added_count, binary_count, end - start))
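# The min_size check above iterates a mapping of {byte_threshold: [group names]}.
# Below is a minimal standalone sketch of that check, under the assumption that the
# config is shaped the way the loop reads it; this is illustrative only, not part of
# pynab's API, and the helper name is hypothetical.

def is_undersized(group_name, release_size, min_size_rules):
    """Return True if release_size is below the threshold configured for
    group_name in min_size_rules, e.g. {52428800: ['alt.binaries.example']}."""
    for size, groups in min_size_rules.items():
        if group_name in groups and release_size < size:
            return True
    return False

# example usage with a hypothetical 50MB rule for a hypothetical group:
# is_undersized('alt.binaries.example', 10 * 1024 * 1024,
#               {50 * 1024 * 1024: ['alt.binaries.example']})  # -> True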
def process():
    """Helper function to process parts into binaries based on regex in the DB.
    Copies parts/segments across to the binary document. Keeps a list of parts
    that were processed, for deletion."""

    start = time.time()

    binaries = {}
    dead_parts = []
    total_processed = 0
    total_binaries = 0
    count = 0

    # new optimisation: if we only have parts from a couple of groups,
    # we don't want to process the regex for every single one.
    # this removes support for "alt.binaries.games.*", but those weren't
    # used anyway, aside from just * (which it does work with)
    with db_session() as db:
        db.expire_on_commit = False
        relevant_groups = [x[0] for x in db.query(Part.group_name).group_by(Part.group_name).all()]
        if relevant_groups:
            # grab all relevant regex
            all_regex = db.query(Regex).filter(Regex.status == True).filter(
                Regex.group_name.in_(relevant_groups + ['.*'])).order_by(Regex.ordinal).all()

            # cache compiled regex
            compiled_regex = {}
            for reg in all_regex:
                r = reg.regex
                flags = r[r.rfind('/') + 1:]
                r = r[r.find('/') + 1:r.rfind('/')]
                regex_flags = regex.I if 'i' in flags else 0
                try:
                    compiled_regex[reg.id] = regex.compile(r, regex_flags)
                except Exception:
                    log.error('binary: broken regex detected. id: {:d}, removing...'.format(reg.id))
                    db.query(Regex).filter(Regex.id == reg.id).delete()
                    db.commit()

            # noinspection PyComparisonWithNone
            query = db.query(Part).filter(Part.group_name.in_(relevant_groups)).filter(
                Part.binary_id == None)
            total_parts = query.count()

            for part in windowed_query(query, Part.id,
                                       config.scan.get('binary_process_chunk_size', 1000)):
                found = False
                total_processed += 1
                count += 1

                for reg in all_regex:
                    if reg.group_name != part.group_name and reg.group_name != '.*':
                        continue

                    # convert php-style regex to python
                    # ie. /(\w+)/i -> (\w+), regex.I
                    # no need to handle s, as it doesn't exist in python
                    # why not store it as python to begin with? some regex
                    # shouldn't be case-insensitive, and this notation allows for that
                    try:
                        result = compiled_regex[reg.id].search(part.subject)
                    except Exception:
                        log.error('binary: broken regex detected. id: {:d}, removing...'.format(reg.id))
                        all_regex.remove(reg)
                        db.query(Regex).filter(Regex.id == reg.id).delete()
                        db.commit()
                        continue

                    match = result.groupdict() if result else None
                    if match:
                        # remove whitespace in dict values
                        try:
                            match = {k: v.strip() for k, v in match.items()}
                        except AttributeError:
                            pass

                        # fill name if reqid is available
                        if match.get('reqid') and not match.get('name'):
                            match['name'] = '{}'.format(match['reqid'])

                        # make sure the regex returns at least some name
                        if not match.get('name'):
                            match['name'] = ' '.join([v for v in match.values() if v])

                        # if regex are shitty, look for parts manually.
                        # segment numbers have been stripped by this point, so don't worry
                        # about accidentally hitting those instead
                        if not match.get('parts'):
                            result = PART_REGEX.search(part.subject)
                            if result:
                                match['parts'] = result.group(1)

                        if match.get('name') and match.get('parts'):
                            if match['parts'].find('/') == -1:
                                match['parts'] = match['parts'].replace('-', '/') \
                                    .replace('~', '/').replace(' of ', '/')

                            match['parts'] = match['parts'].replace('[', '').replace(']', '') \
                                .replace('(', '').replace(')', '')

                            if '/' not in match['parts']:
                                continue

                            current, total = match['parts'].split('/')

                            # calculate binary hash for matching
                            hash = generate_hash(match['name'], part.group_name, part.posted_by, total)

                            # if the binary is already in our chunk,
                            # just append to it to reduce query numbers
                            if hash in binaries:
                                if current in binaries[hash]['parts']:
                                    # but if we already have this part, pick the one closest to the binary
                                    if binaries[hash]['posted'] - part.posted < binaries[hash]['posted'] - \
                                            binaries[hash]['parts'][current].posted:
                                        binaries[hash]['parts'][current] = part
                                    else:
                                        dead_parts.append(part.id)
                                        break
                                else:
                                    binaries[hash]['parts'][current] = part
                            else:
                                log.debug('binaries: new binary found: {}'.format(match['name']))

                                b = {
                                    'hash': hash,
                                    'name': match['name'],
                                    'posted': part.posted,
                                    'posted_by': part.posted_by,
                                    'group_name': part.group_name,
                                    'xref': part.xref,
                                    'regex_id': reg.id,
                                    'total_parts': int(total),
                                    'parts': {current: part}
                                }

                                binaries[hash] = b

                            found = True
                            break

                # the part matched no regex, so delete it
                if not found:
                    dead_parts.append(part.id)

                if count >= config.scan.get('binary_process_chunk_size', 1000) or \
                        (total_parts - count) == 0:
                    total_parts -= count
                    total_binaries += len(binaries)

                    save(db, binaries)

                    if dead_parts:
                        deleted = db.query(Part).filter(Part.id.in_(dead_parts)).delete(
                            synchronize_session='fetch')
                    else:
                        deleted = 0

                    db.commit()
                    log.info('binary: saved {} binaries and deleted {} dead parts ({} parts left)...'.format(
                        len(binaries), deleted, total_parts))

                    binaries = {}
                    dead_parts = []
                    count = 0

        db.expire_on_commit = True
        db.close()

    end = time.time()
    log.info('binary: processed {} parts and formed {} binaries in {:.2f}s'.format(
        total_processed, total_binaries, end - start))
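# The regex cache above strips PHP-style delimiters ("/pattern/i") before compiling.
# Here is a small standalone sketch of that conversion, assuming only the "i" flag
# matters (as the comment in process() notes, "s" has no Python equivalent worth
# handling). The helper name is hypothetical; pynab does this inline.

import regex


def compile_php_style(pattern):
    """Compile a "/.../flags"-style pattern string into a regex object."""
    flags = pattern[pattern.rfind('/') + 1:]
    body = pattern[pattern.find('/') + 1:pattern.rfind('/')]
    return regex.compile(body, regex.I if 'i' in flags else 0)

# example: a case-insensitive match on a typical binary subject
# r = compile_php_style(r'/^(?P<name>.+?)\s+\[(?P<parts>\d+\/\d+)\]/i')
# m = r.search('Some.Release.Name [01/20] - "file.rar" yEnc')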
def main(mode='update', group=None, date=None):
    log_init(mode)

    log.info('scan: starting {}...'.format(mode))

    groups = []
    active_groups = {}

    if mode == 'backfill':
        log.info('scan: finding targets for backfill...')
        with pynab.server.Server() as server:
            with db_session() as db:
                if not group:
                    groups = [group.name for group in db.query(Group).filter(Group.active == True).all()]
                else:
                    if db.query(Group).filter(Group.name == group).first():
                        groups = [group]

                for group in groups:
                    target = server.day_to_post(
                        group,
                        server.days_old(pytz.utc.localize(dateutil.parser.parse(date)))
                        if date else config.scan.get('backfill_days', 10)
                    )
                    if target:
                        active_groups[group] = target

    iterations = 0
    while True:
        iterations += 1
        data = []

        # refresh the db session each iteration, just in case
        with db_session() as db:
            if db.query(Segment).count() > config.scan.get('early_process_threshold', 50000000):
                if mode == 'update':
                    log.info('scan: backlog of segments detected, processing first')
                    process()
                else:
                    log.info('scan: backlog of segments detected during backfill, '
                             'waiting until update has cleared them')
                    time.sleep(config.scan.get('update_wait', 600))
                    continue

            # for scanning, we want to re-check active groups each iteration
            # we don't want to do that for backfilling, though
            if mode == 'update':
                if not group:
                    active_groups = {group.name: None for group in
                                     db.query(Group).filter(Group.active == True).all()}
                else:
                    if db.query(Group).filter(Group.name == group).first():
                        active_groups = {group: None}
                    else:
                        log.error('scan: no such group exists')
                        return

            if active_groups:
                with concurrent.futures.ThreadPoolExecutor(config.scan.get('update_threads', None)) as executor:
                    # if maxtasksperchild is more than 1, everything breaks
                    # they're long processes usually, so no problem having one task per child
                    if mode == 'backfill':
                        result = [executor.submit(backfill, active_group, date, target)
                                  for active_group, target in active_groups.items()]
                    else:
                        result = [executor.submit(update, active_group)
                                  for active_group in active_groups.keys()]

                    for r in concurrent.futures.as_completed(result):
                        data.append(r.result())

                    if mode == 'backfill':
                        if all(data):
                            return

                    # don't retry misses during backfill, it ain't gonna happen
                    if config.scan.get('retry_missed') and not mode == 'backfill':
                        miss_groups = [group_name for group_name, in
                                       db.query(Miss.group_name).group_by(Miss.group_name).all()]
                        miss_result = [executor.submit(scan_missing, miss_group)
                                       for miss_group in miss_groups]

                        # no timeout for these, because it could take a while
                        for r in concurrent.futures.as_completed(miss_result):
                            data = r.result()

                db.commit()

                if mode == 'update':
                    process()

                    # clean up dead binaries and parts
                    if config.scan.get('dead_binary_age', 1) != 0:
                        dead_time = pytz.utc.localize(datetime.datetime.now()).replace(tzinfo=None) - \
                            datetime.timedelta(days=config.scan.get('dead_binary_age', 3))

                        dead_binaries = db.query(Binary).filter(Binary.posted <= dead_time).delete()
                        db.commit()

                        log.info('scan: deleted {} dead binaries'.format(dead_binaries))
            else:
                log.info('scan: no groups active, cancelling pynab.py...')
                break

            if mode == 'update':
                # vacuum the segments, parts and binaries tables
                log.info('scan: vacuuming relevant tables...')

                if iterations >= config.scan.get('full_vacuum_iterations', 288):
                    # this may look weird, but we want to reset iterations even if full_vacuums are off
                    # so it doesn't count to infinity
                    if config.scan.get('full_vacuum', True):
                        vacuum(mode='scan', full=True)
                    iterations = 0
            else:
                iterations = 0
            db.close()

        # don't bother waiting if we're backfilling, just keep going
        if mode == 'update':
            # wait for the configured amount of time between cycles
            update_wait = config.scan.get('update_wait', 300)
            log.info('scan: sleeping for {:d} seconds...'.format(update_wait))
            time.sleep(update_wait)
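# vacuum() is defined elsewhere in pynab; the sketch below is one plausible shape for
# it, assuming a PostgreSQL backend and an SQLAlchemy engine. VACUUM can't run inside
# a transaction, hence the AUTOCOMMIT connection. The function name, table list and
# DSN are assumptions for illustration only, not pynab's actual implementation.

from sqlalchemy import create_engine, text


def vacuum_sketch(engine, full=False):
    """Vacuum (optionally VACUUM FULL) the tables touched most by scanning."""
    tables = ['segments', 'parts', 'binaries']
    with engine.connect().execution_options(isolation_level='AUTOCOMMIT') as conn:
        for table in tables:
            conn.execute(text('VACUUM {} ANALYZE {}'.format('FULL' if full else '', table)))

# engine = create_engine('postgresql://user:pass@localhost/pynab')  # hypothetical DSN
# vacuum_sketch(engine, full=False)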
def main():
    log_init('postprocess')

    log.info('postprocess: starting post-processing...')

    # start with a quick post-process
    # log.info('postprocess: starting with a quick post-process to clear out the cruft that\'s available locally...')
    # scripts.quick_postprocess.local_postprocess()

    iterations = 0
    while True:
        with db_session() as db:
            # delete passworded releases first so we don't bother processing them
            if config.postprocess.get('delete_passworded', True):
                query = db.query(Release)
                if config.postprocess.get('delete_potentially_passworded', True):
                    query = query.filter((Release.passworded == 'MAYBE') | (Release.passworded == 'YES'))
                else:
                    query = query.filter(Release.passworded == 'YES')
                deleted = query.delete()
                db.commit()
                log.info('postprocess: deleted {} passworded releases'.format(deleted))

            with concurrent.futures.ThreadPoolExecutor(4) as executor:
                threads = []

                if config.postprocess.get('process_tvshows', True):
                    threads.append(executor.submit(process_tvshows))

                if config.postprocess.get('process_movies', True):
                    threads.append(executor.submit(process_movies))

                # grab and append nfo data to all releases
                if config.postprocess.get('process_nfos', True):
                    threads.append(executor.submit(process_nfos))

                # grab and append sfv data to all releases
                if config.postprocess.get('process_sfvs', False):
                    threads.append(executor.submit(process_sfvs))

                # check for passwords, file count and size
                if config.postprocess.get('process_rars', True):
                    threads.append(executor.submit(process_rars))

                # check for requests in local pre table
                if config.postprocess.get('process_requests', True):
                    threads.append(executor.submit(process_requests))

                # for t in concurrent.futures.as_completed(threads):
                #     data = t.result()

            # every 25 iterations (roughly), reset the unwanted status on releases
            """
            if iterations % 25 == 0:
                log.info('postprocess: resetting unwanted status')
                db.query(Release).filter(Release.unwanted==True).update({Release.unwanted: False})
                db.commit()
            """

            # rename misc->other and all ebooks
            scripts.rename_bad_releases.rename_bad_releases(8010)
            scripts.rename_bad_releases.rename_bad_releases(7020)

            # do a postproc deletion of any enabled blacklists
            # assuming it's enabled, of course
            if config.postprocess.get('delete_blacklisted_releases'):
                deleted = 0
                for blacklist in db.query(Blacklist).filter(Blacklist.status == True).all():
                    # remap subject to name, since normal blacklists operate on binaries;
                    # this is on releases, and the attribute changes
                    field = 'search_name' if blacklist.field == 'subject' else blacklist.field

                    # filter by:
                    # group_name should match the blacklist's
                    # <field> should match the blacklist's regex
                    # <field> is determined by blacklist's field (usually subject/name)
                    # date (optimisation)
                    query = db.query(Release).filter(Release.group_id.in_(
                        db.query(Group.id).filter(Group.name.op('~*')(blacklist.group_name)).subquery()
                    )).filter(getattr(Release, field).op('~*')(blacklist.regex))

                    if config.postprocess.get('delete_blacklisted_days'):
                        query = query.filter(Release.posted >= (
                            datetime.datetime.now(pytz.utc) -
                            datetime.timedelta(days=config.postprocess.get('delete_blacklisted_days'))))

                    deleted += query.delete(False)

                log.info('postprocess: deleted {} blacklisted releases'.format(deleted))
                db.commit()

            if config.postprocess.get('delete_bad_releases', False):
                # kill unwanteds
                pass
                """
                deletes = db.query(Release).filter(Release.unwanted==True).delete()
                deletes = 0

                # and also kill other-miscs that we can't retrieve a rar for
                sub = db.query(Release.id).join(MetaBlack, Release.rar_metablack).\
                    filter(Release.category_id==8010).\
                    filter(MetaBlack.status=='IMPOSSIBLE').\
                    subquery()

                deletes += db.query(Release).filter(Release.id.in_(sub)).delete(synchronize_session='fetch')

                log.info('postprocess: deleted {} bad releases'.format(deletes))
                db.commit()
                """

            if config.postprocess.get('release_expiry_days', 0) > 0:
                expire_days = config.postprocess.get('release_expiry_days', 0)
                log.info('postprocess: expiring releases posted more than {} days ago.'.format(expire_days))
                deleted_releases = db.query(Release).filter(Release.posted < (
                    datetime.datetime.now(pytz.utc) - datetime.timedelta(days=expire_days))
                ).delete(synchronize_session='fetch')
                log.info('postprocess: expired {} releases'.format(deleted_releases))

            # delete any orphan metablacks
            log.info('postprocess: deleting orphan metablacks...')
            # noinspection PyComparisonWithNone
            deleted_metablacks = db.query(MetaBlack).filter(
                (MetaBlack.movie == None) &
                (MetaBlack.tvshow == None) &
                (MetaBlack.rar == None) &
                (MetaBlack.nfo == None) &
                (MetaBlack.sfv == None)
            ).delete(synchronize_session='fetch')
            log.info('postprocess: deleted {} orphaned metablacks.'.format(deleted_metablacks))

            # delete any orphan nzbs
            log.info('postprocess: deleting orphan nzbs...')
            # noinspection PyComparisonWithNone
            deleted_nzbs = db.query(NZB).filter(NZB.release == None).delete(synchronize_session='fetch')
            log.info('postprocess: deleted {} orphaned nzbs.'.format(deleted_nzbs))

            # delete any orphan nfos
            log.info('postprocess: deleting orphan nfos...')
            # noinspection PyComparisonWithNone
            deleted_nfos = db.query(NFO).filter(NFO.release == None).delete(synchronize_session='fetch')
            log.info('postprocess: deleted {} orphaned nfos.'.format(deleted_nfos))

            # delete any orphan sfvs
            log.info('postprocess: deleting orphan sfvs...')
            # noinspection PyComparisonWithNone
            deleted_sfvs = db.query(SFV).filter(SFV.release == None).delete(synchronize_session='fetch')
            log.info('postprocess: deleted {} orphaned sfvs.'.format(deleted_sfvs))

            db.commit()

            # vacuum the segments, parts and binaries tables
            log.info('postprocess: vacuuming relevant tables...')

            if iterations >= config.scan.get('full_vacuum_iterations', 288):
                # this may look weird, but we want to reset iterations even if full_vacuums are off
                # so it doesn't count to infinity
                if config.scan.get('full_vacuum', True):
                    vacuum(mode='postprocess', full=True)
                else:
                    vacuum(mode='postprocess', full=False)
                iterations = 0
            iterations += 1

        # wait for the configured amount of time between cycles
        postprocess_wait = config.postprocess.get('postprocess_wait', 300)
        log.info('sleeping for {:d} seconds...'.format(postprocess_wait))
        time.sleep(postprocess_wait)
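# The orphan cleanup above repeats the same NULL-relationship delete for NZBs, NFOs
# and SFVs. A compact sketch of the same idea as a loop is shown below, assuming these
# ORM models (already imported at the top of this file) each expose a one-to-one
# 'release' relationship as used above. This is a refactoring suggestion, not pynab's code.

def delete_orphans(db):
    """Delete NZB/NFO/SFV rows whose release no longer exists; return per-model counts."""
    deleted = {}
    for model in (NZB, NFO, SFV):
        # noinspection PyComparisonWithNone
        deleted[model.__name__] = db.query(model).filter(model.release == None).delete(
            synchronize_session='fetch')
    return deleted

# with db_session() as db:
#     log.info('postprocess: deleted orphans: {}'.format(delete_orphans(db)))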
def scan(self, group_name, first=None, last=None, message_ranges=None):
    """Scan a group for segments and return a list."""
    self.connect()

    messages_missed = []
    overviews = []

    start = time.time()
    i = 0

    # grab the headers we're after
    check = 0
    while True:
        try:
            check += 1
            if check == 3:
                return False, None, None, None
            with nntp_handler(self):
                self.connection.group(group_name)
                break
        except:
            continue

    if message_ranges:
        for first, last in message_ranges:
            range_overviews = None
            while True:
                i += 1
                log.debug('server: {}: getting range {}-{}'.format(group_name, first, last))
                try:
                    with nntp_handler(self, group_name):
                        status, range_overviews = self.connection.over((first, last))
                except:
                    # 3 attempts
                    if i == 3:
                        log.warning('server: {}: timed out a bunch, we\'ll try again later'.format(group_name))
                        break
                    continue

                if range_overviews:
                    overviews += range_overviews
                else:
                    # we missed them
                    messages_missed += range(first, last + 1)
                break
    else:
        while True:
            i += 1
            log.debug('server: {}: getting range {}-{}'.format(group_name, first, last))
            try:
                with nntp_handler(self, group_name):
                    status, overviews = self.connection.over((first, last))
                    break
            except:
                # 3 attempts
                if i == 3:
                    log.warning('server: {}: timed out a bunch, we\'ll try again later'.format(group_name))
                    break
                continue

    parts = {}
    messages = []
    ignored = 0

    if overviews:
        with db_session() as db:
            blacklists = db.query(Blacklist).filter(Blacklist.status == True).all()
            for blacklist in blacklists:
                db.expunge(blacklist)

        for (id, overview) in overviews:
            # keep track of which messages we received so we can
            # optionally check for ones we missed later
            messages.append(id)

            # some messages don't have subjects? who knew
            if 'subject' not in overview:
                continue

            # get the current segment number
            results = SEGMENT_REGEX.findall(overview['subject'])

            # it might match twice, so just get the last one
            # the first is generally the part number
            if results:
                (segment_number, total_segments) = results[-1]
            else:
                # if there's no match at all, it's probably not a binary
                ignored += 1
                continue

            # make sure the header contains everything we need
            try:
                size = int(overview[':bytes'])
            except:
                # TODO: cull this later
                log.debug('server: bad message: {}'.format(overview))
                continue

            # assuming everything didn't f**k up, continue
            if int(segment_number) > 0 and int(total_segments) > 0:
                # strip the segment number off the subject so
                # we can match binary parts together
                subject = nntplib.decode_header(overview['subject'].replace(
                    '(' + str(segment_number) + '/' + str(total_segments) + ')', ''
                ).strip()).encode('utf-8', 'replace').decode('latin-1')

                posted_by = nntplib.decode_header(overview['from']).encode(
                    'utf-8', 'replace').decode('latin-1')

                # generate a hash to perform matching
                hash = pynab.parts.generate_hash(subject, posted_by, group_name, int(total_segments))

                # this is spammy as shit, for obvious reasons
                # pynab.log.debug('Binary part found: ' + subject)

                # build the segment, make sure segment number and size are ints
                segment = {
                    'message_id': overview['message-id'][1:-1],
                    'segment': int(segment_number),
                    'size': size
                }

                # if we've already got a binary by this name, add this segment
                if hash in parts:
                    parts[hash]['segments'][segment_number] = segment
                    parts[hash]['available_segments'] += 1
                else:
                    # dateutil will parse the date as whatever and convert to UTC.
                    # some subjects/posters have odd encoding, which will break pymongo,
                    # so we make sure it doesn't
                    try:
                        message = {
                            'hash': hash,
                            'subject': subject,
                            'posted': dateutil.parser.parse(overview['date']),
                            'posted_by': posted_by,
                            'group_name': group_name,
                            'xref': pynab.util.smart_truncate(overview['xref'], length=1024),
                            'total_segments': int(total_segments),
                            'available_segments': 1,
                            'segments': {segment_number: segment, },
                        }

                        parts[hash] = message
                    except Exception as e:
                        log.error('server: bad message parse: {}'.format(e))
                        continue
            else:
                # :getout:
                ignored += 1

        # instead of checking every single individual segment, package them first
        # so we typically only end up checking the blacklist for ~150 parts instead of thousands
        blacklist = [k for k, v in parts.items() if pynab.parts.is_blacklisted(v, group_name, blacklists)]
        blacklisted_parts = len(blacklist)
        total_parts = len(parts)
        for k in blacklist:
            del parts[k]
    else:
        total_parts = 0
        blacklisted_parts = 0

    # check for missing messages if desired
    # don't do this if we're grabbing ranges, because it won't work
    if not message_ranges:
        messages_missed = list(set(range(first, last)) - set(messages))

    end = time.time()

    log.info('server: {}: retrieved {} - {} in {:.2f}s [{} recv, {} pts, {} ign, {} blk]'.format(
        group_name, first, last, end - start,
        len(messages), total_parts, ignored, blacklisted_parts
    ))

    # check to see if we at least got some messages - they might've been ignored
    if len(messages) > 0:
        status = True
    else:
        status = False

    return status, parts, messages, messages_missed
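# SEGMENT_REGEX is defined elsewhere in pynab; for illustration, the sketch below uses
# a hypothetical pattern that captures the "(current/total)" counter the loop above
# relies on, taking the last match because the first bracketed pair in a subject is
# often the file counter rather than the segment counter. Pattern and helper name are
# assumptions, not pynab's exact definitions.

import regex

SEGMENT_REGEX_SKETCH = regex.compile(r'\((\d+)/(\d+)\)')


def parse_segment_counter(subject):
    """Return (segment_number, total_segments) from a yEnc-style subject, or None."""
    results = SEGMENT_REGEX_SKETCH.findall(subject)
    if not results:
        return None
    segment_number, total_segments = results[-1]
    return int(segment_number), int(total_segments)

# parse_segment_counter('Release [01/20] - "file.part01.rar" yEnc (03/54)')  # -> (3, 54)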
def main():
    channel = "#nZEDbPRE"
    nickname = ''.join([random.choice(string.ascii_letters) for _ in range(8)])

    log.info("Pre: Bot Nick - {}".format(nickname))

    bot = TestBot(channel, nickname, "irc.synirc.net", 6667)
    bot.start()
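# TestBot is defined elsewhere in this codebase; the sketch below shows one plausible
# shape for such a bot using the 'irc' package's SingleServerIRCBot, purely to show how
# the constructor arguments above could be consumed. The class body is an assumption
# for illustration, not pynab's actual TestBot.

import irc.bot


class PreBotSketch(irc.bot.SingleServerIRCBot):
    def __init__(self, channel, nickname, server, port=6667):
        super().__init__([(server, port)], nickname, nickname)
        self.channel = channel

    def on_welcome(self, connection, event):
        # join the pre channel once the server accepts us
        connection.join(self.channel)

    def on_pubmsg(self, connection, event):
        # each public message is a potential pre announcement
        log.debug('pre: {}'.format(event.arguments[0]))

# bot = PreBotSketch("#nZEDbPRE", "someNick", "irc.synirc.net", 6667)
# bot.start()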