def post_date(self, group_name, article):
    """Retrieve the posting date of a single article.

    Selects ``group_name`` on the server, requests the headers of
    ``article`` and returns the first ``NNTP-Posting-Date`` or ``Date``
    header (in the order the server sent them) parsed into a datetime.

    Args:
        group_name: name of the newsgroup holding the article.
        article: article number; it is formatted with ``{0:d}``, so it
            must be an integer.

    Returns:
        The parsed datetime, localised to UTC when the parsed value had
        no timezone attached, or None when the headers could not be
        fetched, held no usable date header, or the date failed to parse.
    """
    self.connect()

    try:
        self.connection.group(group_name)
        art_num, overview = self.connection.head('{0:d}'.format(article))
    except nntplib.NNTPError as e:
        log.debug('server: unable to get date of message {}: {}'.format(article, e))
        # leave this alone - we don't expect any data back
        return None

    if not (art_num and overview):
        return None

    # overview[0] = article number
    # overview[1] = message-id
    # overview[2] = headers
    for header in overview[2]:
        head = nntplib.decode_header(header.decode('utf-8', errors='surrogateescape'))

        # Match on the exact header field name rather than on a substring:
        # a substring test also fires for headers such as "Injection-Date:"
        # or for any header whose value merely mentions "Date:", which then
        # yields an unparseable string. Field names are case-insensitive.
        name, separator, value = head.partition(':')
        if not separator or name.lower() not in ('nntp-posting-date', 'date'):
            continue

        date_header = value.strip()
        if not date_header:
            continue

        try:
            date = dateutil.parser.parse(date_header)
        except Exception as e:
            log.error('server: date parse failed while dating message: {}'.format(e))
            return None

        # only naive datetimes need localising; aware ones are returned as-is
        if date.tzinfo is None:
            date = pytz.utc.localize(date)

        return date

    return None
def post_date(self, group_name, article):
    """Retrieve the posting date of a single article.

    Selects ``group_name`` on the server inside ``nntp_handler``, requests
    the headers of ``article`` and returns the first ``NNTP-Posting-Date``
    or ``Date`` header (in the order the server sent them) parsed into a
    datetime.

    Args:
        group_name: name of the newsgroup holding the article.
        article: article number; it is formatted with ``{0:d}``, so it
            must be an integer.

    Returns:
        The parsed datetime, localised to UTC when the parsed value had
        no timezone attached, or None when the headers could not be
        fetched, held no usable date header, or the date failed to parse.
    """
    self.connect()

    art_num = 0
    overview = None

    try:
        with nntp_handler(self, group_name):
            self.connection.group(group_name)
            art_num, overview = self.connection.head('{0:d}'.format(article))
    except Exception:
        # best effort: any failure to fetch the headers means "no date",
        # but interpreter-level exits (KeyboardInterrupt, SystemExit) are
        # deliberately not swallowed
        return None

    if not (art_num and overview):
        return None

    # overview[0] = article number
    # overview[1] = message-id
    # overview[2] = headers
    for header in overview[2]:
        head = nntplib.decode_header(header.decode('utf-8', errors='surrogateescape'))

        # Match on the exact header field name rather than on a substring:
        # a substring test also fires for headers such as "Injection-Date:"
        # or for any header whose value merely mentions "Date:", which then
        # yields an unparseable string. Field names are case-insensitive.
        name, separator, value = head.partition(':')
        if not separator or name.lower() not in ('nntp-posting-date', 'date'):
            continue

        date_header = value.strip()
        if not date_header:
            continue

        try:
            date = dateutil.parser.parse(date_header)
        except Exception as e:
            log.error('server: date parse failed while dating message: {}'.format(e))
            return None

        # only naive datetimes need localising; aware ones are returned as-is
        if date.tzinfo is None:
            date = pytz.utc.localize(date)

        return date

    return None
def scan(self, group_name, first=None, last=None, message_ranges=None):
    """Scan a group for binary segments and assemble them into parts.

    Overviews are fetched either for the single inclusive range
    ``first``-``last`` or, when ``message_ranges`` is given, for every
    ``(first, last)`` pair in it. Each overview whose subject carries a
    segment marker matched by ``SEGMENT_REGEX`` is attached to a part
    keyed by ``pynab.parts.generate_hash``; parts matching an active
    blacklist are dropped afterwards.

    Args:
        group_name: name of the newsgroup to scan.
        first: first article number of the range (used when
            ``message_ranges`` is not given).
        last: last article number of the range, inclusive (used when
            ``message_ranges`` is not given).
        message_ranges: optional iterable of ``(first, last)`` pairs to
            fetch instead of a single range.

    Returns:
        A tuple ``(status, parts, messages, messages_missed)``:
        ``status`` is True when at least one overview was received,
        ``parts`` maps part hash to part dict, ``messages`` lists the
        article ids received and ``messages_missed`` lists the ids that
        were requested but not received. ``(False, None, None, None)`` is
        returned when the group could not be selected.
    """
    max_attempts = 3

    self.connect()

    messages_missed = []
    overviews = []

    start = time.time()

    # select the group, giving up after two failed tries
    for _ in range(2):
        try:
            with nntp_handler(self):
                self.connection.group(group_name)
                break
        except Exception:
            continue
    else:
        return False, None, None, None

    # grab the headers we're after
    if message_ranges:
        for first, last in message_ranges:
            range_overviews = None
            # The attempt budget is per range. A single counter shared by
            # all ranges and compared with "== 3" stops bounding retries
            # once more than three fetches have been made in total.
            for attempt in range(1, max_attempts + 1):
                log.debug('server: {}: getting range {}-{}'.format(group_name, first, last))
                try:
                    with nntp_handler(self, group_name):
                        _, range_overviews = self.connection.over((first, last))
                except Exception:
                    if attempt == max_attempts:
                        log.warning("server: {}: timed out a bunch, we'll try again later".format(group_name))
                    continue

                if range_overviews:
                    overviews += range_overviews
                else:
                    # we missed them
                    messages_missed += range(first, last + 1)
                break
    else:
        for attempt in range(1, max_attempts + 1):
            log.debug('server: {}: getting range {}-{}'.format(group_name, first, last))
            try:
                with nntp_handler(self, group_name):
                    _, overviews = self.connection.over((first, last))
                    break
            except Exception:
                if attempt == max_attempts:
                    log.warning("server: {}: timed out a bunch, we'll try again later".format(group_name))

    parts = {}
    messages = []
    ignored = 0

    if overviews:
        with db_session() as db:
            # "== True" is intentional: it builds the SQL filter expression
            blacklists = db.query(Blacklist).filter(Blacklist.status == True).all()  # noqa: E712
            for blacklist in blacklists:
                db.expunge(blacklist)

        for article_id, overview in overviews:
            # keep track of which messages we received so we can
            # optionally check for ones we missed later
            messages.append(article_id)

            # some messages don't have subjects
            if 'subject' not in overview:
                continue

            # the marker might match twice; the first is generally the
            # part number, so the last match is the segment number
            results = SEGMENT_REGEX.findall(overview['subject'])
            if not results:
                # if there's no match at all, it's probably not a binary
                ignored += 1
                continue
            segment_number, total_segments = results[-1]

            # make sure the header contains everything we need
            try:
                size = int(overview[':bytes'])
            except (KeyError, TypeError, ValueError):
                # TODO: cull this later
                log.debug('server: bad message: {}'.format(overview))
                continue

            if not (int(segment_number) > 0 and int(total_segments) > 0):
                ignored += 1
                continue

            # strip the segment marker off the subject so that all
            # segments of one binary share the same subject
            marker = '(' + str(segment_number) + '/' + str(total_segments) + ')'
            subject = nntplib.decode_header(
                overview['subject'].replace(marker, '').strip()
            ).encode('utf-8', 'replace').decode('latin-1')
            posted_by = nntplib.decode_header(overview['from']).encode('utf-8', 'replace').decode('latin-1')

            # generate a hash to perform matching
            part_hash = pynab.parts.generate_hash(subject, posted_by, group_name, int(total_segments))

            # build the segment, make sure segment number and size are ints
            segment = {
                'message_id': overview['message-id'][1:-1],
                'segment': int(segment_number),
                'size': size
            }

            if part_hash in parts:
                # we've already got a part by this hash, add this segment
                parts[part_hash]['segments'][segment_number] = segment
                parts[part_hash]['available_segments'] += 1
                continue

            # dateutil parses the date header; a failure here (or in the
            # xref handling) only skips this message
            try:
                parts[part_hash] = {
                    'hash': part_hash,
                    'subject': subject,
                    'posted': dateutil.parser.parse(overview['date']),
                    'posted_by': posted_by,
                    'group_name': group_name,
                    'xref': pynab.util.smart_truncate(overview['xref'], length=1024),
                    'total_segments': int(total_segments),
                    'available_segments': 1,
                    'segments': {segment_number: segment, },
                }
            except Exception as e:
                log.error('server: bad message parse: {}'.format(e))
                continue

        # instead of checking every single individual segment, package them
        # first so the blacklist is only checked once per part
        blacklisted = [k for k, v in parts.items() if pynab.parts.is_blacklisted(v, group_name, blacklists)]
        blacklisted_parts = len(blacklisted)
        total_parts = len(parts)
        for k in blacklisted:
            del parts[k]
    else:
        total_parts = 0
        blacklisted_parts = 0

    # check for missing messages if desired
    # don't do this if we're grabbing ranges, because it won't work
    if not message_ranges:
        # the requested range is inclusive of ``last``
        messages_missed = list(set(range(first, last + 1)) - set(messages))

    end = time.time()

    log.info('server: {}: retrieved {} - {} in {:.2f}s [{} recv, {} pts, {} ign, {} blk]'.format(
        group_name,
        first, last,
        end - start,
        len(messages),
        total_parts,
        ignored,
        blacklisted_parts
    ))

    # check to see if we at least got some messages - they might've been ignored
    status = len(messages) > 0

    return status, parts, messages, messages_missed
def scan(self, group_name, first, last):
    """Scan a group for binary segments and group them by subject.

    Fetches the overviews for articles ``first``-``last`` of
    ``group_name`` and collects every article whose subject carries a
    ``(segment/total)`` marker into a message keyed by the subject with
    that marker stripped. Messages whose key is blacklisted according to
    ``pynab.parts.is_blacklisted`` are removed before returning.

    Args:
        group_name: name of the newsgroup to scan.
        first: first article number to request (integer).
        last: last article number to request (integer).

    Returns:
        A dict mapping stripped subject to message dict, or an empty dict
        when the server raised an ``nntplib.NNTPError``.
    """
    log.info('{}: Collecting parts {:d} to {:d}...'.format(group_name, first, last))

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed time
    start = time.perf_counter()

    try:
        # grab the headers we're after
        self.connection.group(group_name)
        status, overviews = self.connection.over((first, last))
    except nntplib.NNTPError as nntpe:
        log.debug('NNTP Error: ' + str(nntpe))
        return {}

    segment_pattern = r'\((\d+)[\/](\d+)\)'

    messages = {}
    ignored = 0
    received = []

    for article_id, overview in overviews:
        # keep track of which messages we received so we can
        # optionally check for ones we missed later
        received.append(article_id)

        # an overview without a subject cannot be matched to anything
        if 'subject' not in overview:
            continue

        # the marker might match twice; the first is generally the part
        # number, so the last match is the segment number
        results = regex.findall(segment_pattern, overview['subject'])
        if not results:
            # if there's no match at all, it's probably not a binary
            ignored += 1
            continue
        segment_number, total_segments = results[-1]

        # make sure the header contains everything we need; a missing or
        # non-numeric size skips the article instead of aborting the scan
        try:
            size = int(overview[':bytes'])
        except (KeyError, TypeError, ValueError):
            continue

        if not (int(segment_number) > 0 and int(total_segments) > 0):
            ignored += 1
            continue

        # strip the segment marker off the subject so that all segments
        # of one binary share the same key
        marker = '(' + str(segment_number) + '/' + str(total_segments) + ')'
        subject = overview['subject'].replace(marker, '').strip()

        # build the segment, make sure segment number and size are ints
        segment = {
            'message_id': overview['message-id'][1:-1],
            'segment': int(segment_number),
            'size': size,
        }

        if subject in messages:
            # we've already got a binary by this name, add this segment
            messages[subject]['segments'][segment_number] = segment
            messages[subject]['available_segments'] += 1
            continue

        # subjects/posters can have odd encodings, so both are decoded
        # and re-encoded before being stored
        messages[subject] = {
            'subject': nntplib.decode_header(subject).encode('utf-8', 'surrogateescape').decode('latin-1'),
            'posted': dateutil.parser.parse(overview['date']),
            'posted_by': nntplib.decode_header(overview['from']).encode('utf-8', 'surrogateescape').decode(
                'latin-1'),
            'group_name': group_name,
            'xref': overview['xref'],
            'total_segments': int(total_segments),
            'available_segments': 1,
            'segments': {segment_number: segment, },
        }

    # instead of checking every single individual segment, package them
    # first so the blacklist is only checked once per message
    blacklist = [k for k in messages if pynab.parts.is_blacklisted(k, group_name)]
    blacklisted_parts = len(blacklist)
    total_parts = len(messages)
    for k in blacklist:
        del messages[k]

    log.info(
        '{}: Received {:d} articles of {:d}, forming {:d} parts with {:d} ignored and {:d} blacklisted.'
        .format(group_name, len(received), last - first + 1, total_parts, ignored, blacklisted_parts)
    )

    # TODO: implement re-checking of missed messages, or maybe not
    # most parts that get ko'd these days aren't coming back anyway
    # (``received`` holds the ids needed to work out what was missed)

    end = time.perf_counter()
    log.info('Time elapsed: {:.2f}s'.format(end - start))

    return messages