Exemplo n.º 1
0
    def post_date(self, group_name, article):
        """Retrieve the posting date of the specified article.

        Selects ``group_name``, issues HEAD for ``article`` and parses the
        first ``NNTP-Posting-Date`` or ``Date`` header found, in header order.

        Args:
            group_name: name of the newsgroup that holds the article.
            article: article number; must be an int (formatted with ``d``).

        Returns:
            A timezone-aware ``datetime`` (a naive parse result is localised
            to UTC), or ``None`` if the server raised an ``NNTPError``, no
            usable date header was present, or the date could not be parsed.
        """
        self.connect()

        try:
            self.connection.group(group_name)
            art_num, overview = self.connection.head('{0:d}'.format(article))
        except nntplib.NNTPError as e:
            log.debug('server: unable to get date of message {}: {}'.format(article, e))
            # leave this alone - we don't expect any data back
            return None

        if not (art_num and overview):
            return None

        # overview[0] = article number
        # overview[1] = message-id
        # overview[2] = headers
        for header in overview[2]:
            head = nntplib.decode_header(header.decode('utf-8', errors='surrogateescape'))

            # Compare the exact field name rather than searching for 'Date:'
            # anywhere in the line: a substring test also matches headers such
            # as 'Injection-Date:' or 'X-Server-Date:', whose mangled value
            # would fail to parse and abort the lookup early.
            name, separator, value = head.partition(':')
            if not separator:
                continue
            if name.strip().lower() not in ('nntp-posting-date', 'date'):
                continue

            date_header = value.strip()
            if not date_header:
                continue

            try:
                date = dateutil.parser.parse(date_header)
            except Exception as e:
                # header content is untrusted; any parse failure means no date
                log.error('server: date parse failed while dating message: {}'.format(e))
                return None

            # pytz refuses to localise an already-aware datetime, so only
            # attach UTC when the parsed value carries no timezone.
            if date.tzinfo is None:
                date = pytz.utc.localize(date)

            return date

        return None
Exemplo n.º 2
0
    def post_date(self, group_name, article):
        """Retrieve the posting date of the specified article.

        Selects ``group_name`` and issues HEAD for ``article`` inside an
        ``nntp_handler`` context, then parses the first ``NNTP-Posting-Date``
        or ``Date`` header found, in header order.

        Args:
            group_name: name of the newsgroup that holds the article.
            article: article number; must be an int (formatted with ``d``).

        Returns:
            A timezone-aware ``datetime`` (a naive parse result is localised
            to UTC), or ``None`` if the NNTP calls failed, no usable date
            header was present, or the date could not be parsed.
        """
        self.connect()

        # Defaults are kept in case the handler context exits without the
        # assignment having happened (NOTE(review): depends on whether
        # nntp_handler suppresses exceptions - confirm).
        art_num = 0
        overview = None

        try:
            with nntp_handler(self, group_name):
                self.connection.group(group_name)
                art_num, overview = self.connection.head('{0:d}'.format(article))
        except Exception:
            # best effort: any failure means no date, but do not swallow
            # KeyboardInterrupt/SystemExit the way a bare except would
            return None

        if not (art_num and overview):
            return None

        # overview[0] = article number
        # overview[1] = message-id
        # overview[2] = headers
        for header in overview[2]:
            head = nntplib.decode_header(header.decode('utf-8', errors='surrogateescape'))

            # Compare the exact field name rather than searching for 'Date:'
            # anywhere in the line: a substring test also matches headers such
            # as 'Injection-Date:' or 'X-Server-Date:', whose mangled value
            # would fail to parse and abort the lookup early.
            name, separator, value = head.partition(':')
            if not separator:
                continue
            if name.strip().lower() not in ('nntp-posting-date', 'date'):
                continue

            date_header = value.strip()
            if not date_header:
                continue

            try:
                date = dateutil.parser.parse(date_header)
            except Exception as e:
                # header content is untrusted; any parse failure means no date
                log.error('server: date parse failed while dating message: {}'.format(e))
                return None

            # pytz refuses to localise an already-aware datetime, so only
            # attach UTC when the parsed value carries no timezone.
            if date.tzinfo is None:
                date = pytz.utc.localize(date)

            return date

        return None
Exemplo n.º 3
0
    def scan(self, group_name, first=None, last=None, message_ranges=None):
        """Scan a group for binary segments and package them into parts.

        Overviews are fetched either for the single inclusive range
        ``first``-``last`` or, when ``message_ranges`` is given, for every
        ``(first, last)`` pair in it. Either ``message_ranges`` or both
        ``first`` and ``last`` must be supplied.

        Args:
            group_name: newsgroup to select and scan.
            first: first article number of the range (ignored when
                ``message_ranges`` is given).
            last: last article number of the range, inclusive (ignored when
                ``message_ranges`` is given).
            message_ranges: optional iterable of ``(first, last)`` pairs.

        Returns:
            A tuple ``(status, parts, messages, messages_missed)``:
            ``status`` is True when at least one overview was received,
            ``parts`` maps a part hash to its message dict (blacklisted parts
            removed), ``messages`` lists the article numbers received and
            ``messages_missed`` lists article numbers that were requested but
            not returned. ``(False, None, None, None)`` is returned when the
            group could not be selected.
        """
        self.connect()

        max_attempts = 3

        messages_missed = []
        overviews = []

        start = time.time()

        # select the group; as before, give up after two failed attempts
        for _ in range(2):
            try:
                with nntp_handler(self):
                    self.connection.group(group_name)
                    break
            except Exception:
                continue
        else:
            return False, None, None, None

        # grab the headers we're after
        if message_ranges:
            for first, last in message_ranges:
                range_overviews = None
                # The attempt budget is per range. A counter shared across
                # ranges and compared with '== 3' could step past 3 and then
                # retry a failing range forever.
                for attempt in range(1, max_attempts + 1):
                    log.debug('server: {}: getting range {}-{}'.format(group_name, first, last))
                    try:
                        with nntp_handler(self, group_name):
                            _, range_overviews = self.connection.over((first, last))
                    except Exception:
                        if attempt == max_attempts:
                            log.warning('server: {}: timed out a bunch, we\'ll try again later'.format(group_name))
                        continue

                    if range_overviews:
                        overviews += range_overviews
                    else:
                        # we missed them
                        messages_missed += range(first, last + 1)
                    break
        else:
            for attempt in range(1, max_attempts + 1):
                log.debug('server: {}: getting range {}-{}'.format(group_name, first, last))
                try:
                    with nntp_handler(self, group_name):
                        _, overviews = self.connection.over((first, last))
                        break
                except Exception:
                    if attempt == max_attempts:
                        log.warning('server: {}: timed out a bunch, we\'ll try again later'.format(group_name))
                    continue

        parts = {}
        messages = []
        ignored = 0

        if overviews:
            with db_session() as db:
                # '== True' is intentional: this builds a query expression
                blacklists = db.query(Blacklist).filter(Blacklist.status == True).all()
                for blacklist in blacklists:
                    db.expunge(blacklist)

            for (message_number, overview) in overviews:
                # keep track of which messages we received so we can
                # optionally check for ones we missed later
                messages.append(message_number)

                # some messages don't have subjects? who knew
                if 'subject' not in overview:
                    continue

                # get the current segment number
                results = SEGMENT_REGEX.findall(overview['subject'])

                # it might match twice, so just get the last one
                # the first is generally the part number
                if not results:
                    # if there's no match at all, it's probably not a binary
                    ignored += 1
                    continue
                (segment_number, total_segments) = results[-1]

                # make sure the header contains everything we need
                try:
                    size = int(overview[':bytes'])
                except (KeyError, TypeError, ValueError):
                    # TODO: cull this later
                    log.debug('server: bad message: {}'.format(overview))
                    continue

                if not (int(segment_number) > 0 and int(total_segments) > 0):
                    # a zero segment or total is not a usable binary segment
                    ignored += 1
                    continue

                # strip the segment number off the subject so
                # we can match binary parts together
                subject = nntplib.decode_header(overview['subject'].replace(
                    '(' + str(segment_number) + '/' + str(total_segments) + ')', ''
                ).strip()).encode('utf-8', 'replace').decode('latin-1')

                posted_by = nntplib.decode_header(overview['from']).encode('utf-8', 'replace').decode('latin-1')

                # generate a hash to perform matching
                part_hash = pynab.parts.generate_hash(subject, posted_by, group_name, int(total_segments))

                # build the segment, make sure segment number and size are ints
                segment = {
                    'message_id': overview['message-id'][1:-1],
                    'segment': int(segment_number),
                    'size': size
                }

                # if we've already got a binary by this name, add this segment
                if part_hash in parts:
                    parts[part_hash]['segments'][segment_number] = segment
                    parts[part_hash]['available_segments'] += 1
                    continue

                # a missing or unparseable date/xref makes the whole part
                # unusable, so log it and move on
                try:
                    message = {
                        'hash': part_hash,
                        'subject': subject,
                        'posted': dateutil.parser.parse(overview['date']),
                        'posted_by': posted_by,
                        'group_name': group_name,
                        'xref': pynab.util.smart_truncate(overview['xref'], length=1024),
                        'total_segments': int(total_segments),
                        'available_segments': 1,
                        'segments': {segment_number: segment, },
                    }

                    parts[part_hash] = message
                except Exception as e:
                    log.error('server: bad message parse: {}'.format(e))
                    continue

            # instead of checking every single individual segment, package them first
            # so we typically only end up checking the blacklist for ~150 parts instead of thousands
            blacklisted_hashes = [
                k for k, v in parts.items()
                if pynab.parts.is_blacklisted(v, group_name, blacklists)
            ]
            blacklisted_parts = len(blacklisted_hashes)
            total_parts = len(parts)
            for k in blacklisted_hashes:
                del parts[k]
        else:
            total_parts = 0
            blacklisted_parts = 0

        # check for missing messages if desired
        # don't do this if we're grabbing ranges, because it won't work
        if not message_ranges:
            # the requested range is inclusive of 'last', so include it here too
            messages_missed = list(set(range(first, last + 1)) - set(messages))

        end = time.time()

        log.info('server: {}: retrieved {} - {} in {:.2f}s [{} recv, {} pts, {} ign, {} blk]'.format(
            group_name,
            first, last,
            end - start,
            len(messages),
            total_parts,
            ignored,
            blacklisted_parts
        ))

        # check to see if we at least got some messages - they might've been ignored
        status = len(messages) > 0

        return status, parts, messages, messages_missed
Exemplo n.º 4
0
Arquivo: server.py Projeto: shpd/pynab
    def scan(self, group_name, first, last):
        """Scan a group for binary segments and return them grouped by subject.

        Selects ``group_name``, fetches overviews for the range
        ``first``-``last`` and packages every article whose subject carries a
        ``(segment/total)`` marker into a message dict keyed by the subject
        with that marker stripped. Blacklisted subjects are removed.

        Args:
            group_name: newsgroup to select and scan.
            first: first article number of the range.
            last: last article number of the range.

        Returns:
            A dict mapping stripped subject to a message dict holding
            ``subject``, ``posted``, ``posted_by``, ``group_name``, ``xref``,
            ``total_segments``, ``available_segments`` and ``segments``.
            An empty dict is returned when the server raises an ``NNTPError``.
        """
        log.info('{}: Collecting parts {:d} to {:d}...'.format(group_name, first, last))

        # time.clock() was removed in Python 3.8; perf_counter() is its
        # documented replacement for measuring elapsed time
        start = time.perf_counter()

        try:
            # grab the headers we're after
            self.connection.group(group_name)
            status, overviews = self.connection.over((first, last))
        except nntplib.NNTPError as nntpe:
            log.debug('NNTP Error: ' + str(nntpe))
            return {}

        messages = {}
        ignored = 0
        received = []
        for (article_number, overview) in overviews:
            # keep track of which messages we received so we can
            # optionally check for ones we missed later
            received.append(article_number)

            # some messages have no subject at all; nothing to match on
            if 'subject' not in overview:
                continue

            # get the current segment number
            results = regex.findall(r'\((\d+)[\/](\d+)\)', overview['subject'])

            # it might match twice, so just get the last one
            # the first is generally the part number
            if not results:
                # if there's no match at all, it's probably not a binary
                ignored += 1
                continue
            (segment_number, total_segments) = results[-1]

            # make sure the header contains everything we need
            if ':bytes' not in overview:
                continue

            if not (int(segment_number) > 0 and int(total_segments) > 0):
                # a zero segment or total is not a usable binary segment
                ignored += 1
                continue

            # strip the segment number off the subject so
            # we can match binary parts together
            subject = overview['subject'].replace(
                '(' + str(segment_number) + '/' + str(total_segments) + ')', ''
            ).strip()

            # build the segment, make sure segment number and size are ints
            segment = {
                'message_id': overview['message-id'][1:-1],
                'segment': int(segment_number),
                'size': int(overview[':bytes']),
            }

            # if we've already got a binary by this name, add this segment
            if subject in messages:
                messages[subject]['segments'][segment_number] = segment
                messages[subject]['available_segments'] += 1
                continue

            # some subjects/posters have odd encoding, which will break pymongo
            # so we make sure it doesn't
            messages[subject] = {
                'subject': nntplib.decode_header(subject).encode('utf-8', 'surrogateescape').decode('latin-1'),
                'posted': dateutil.parser.parse(overview['date']),
                'posted_by': nntplib.decode_header(overview['from']).encode('utf-8', 'surrogateescape').decode(
                    'latin-1'),
                'group_name': group_name,
                'xref': overview['xref'],
                'total_segments': int(total_segments),
                'available_segments': 1,
                'segments': {segment_number: segment, },
            }

        # instead of checking every single individual segment, package them first
        # so we typically only end up checking the blacklist for ~150 parts instead of thousands
        blacklist = [k for k in messages if pynab.parts.is_blacklisted(k, group_name)]
        blacklisted_parts = len(blacklist)
        total_parts = len(messages)
        for k in blacklist:
            del messages[k]

        log.info(
            '{}: Received {:d} articles of {:d}, forming {:d} parts with {:d} ignored and {:d} blacklisted.'
            .format(group_name, len(received), last - first + 1, total_parts, ignored, blacklisted_parts)
        )

        # TODO: implement re-checking of missed messages, or maybe not
        # most parts that get ko'd these days aren't coming back anyway

        end = time.perf_counter()
        log.info('Time elapsed: {:.2f}s'.format(end - start))

        return messages