Example #1
0
class NNTPConnector(BaseConnector):
    """Connector that crawls messages from an NNTP (usenet) news group."""

    @logit(log,'fetch')
    def fetch(self):
        """
        Fetch all new messages for the configured news-group URI.

        Returns True when the crawl completed, False on any connection or
        protocol failure (failures are logged, never raised to the caller).
        """
        try:
            # e.g. self.currenturi = nntp://msnews.microsoft.com/microsoft.public.exchange.setup
            #   -> nntp_server = 'msnews.microsoft.com'
            #   -> nntp_group  = 'microsoft.public.exchange.setup'
            self.genre = 'review'
            try:
                # Parse once and reuse; the original parsed the URI twice,
                # with the second parse outside this guard.
                parsed_uri = urlparse(self.currenturi)
                nntp_server = parsed_uri[1]
            except Exception:
                log.exception(self.log_msg("Exception occured while connecting to NNTP server %s"%self.currenturi))
                return False
            nntp_group = parsed_uri[2][1:]  # path component minus the leading '/'
            self.server = NNTP(nntp_server)
            try:
                self.__updateParentSessionInfo()
                # group() -> (response, count, first, last, name); only the
                # highest article number on the server is actually needed.
                resp, count, first, last, name = self.server.group(nntp_group)
                last_id = int(last)
                # Resume one past the highest article id already crawled.
                first_id = self.__getMaxCrawledId(last_id)+1
                log.debug("first_id is %d:"%first_id)
                log.debug("last_id is %d:"%last_id)
                if last_id >= first_id:
                    resp, items = self.server.xover(str(first_id), str(last_id))
                    log.debug(self.log_msg("length of items:%s"%str(len(items))))
                    for self.id, self.subject, self.author, self.date, self.message_id,\
                            self.references, size, lines in items:
                        self.__getMessages(self.task.instance_data['uri'])
                self.server.quit()
                return True
            except Exception:
                # Narrowed from a bare except so SystemExit/KeyboardInterrupt
                # are not swallowed; still best-effort: log and report failure.
                log.exception(self.log_msg("Exception occured in fetch()"))
                self.server.quit()
                return False
        except Exception:
            # Was 'except Exception,e' (Python-2-only syntax, 'e' unused).
            log.exception(self.log_msg("Exception occured in fetch()"))
            return False
Example #2
0
File: dump.py  Project: rrealmuto/usenet
sql = sqlite3.connect('usenet.db')

print "About to start dumping articles from " + group
serv = NNTP('news.astraweb.com', 119, 'arealmuto', 'stock1114')

resp = serv.group(group)
count = int(resp[1])
first = int(resp[2])
last = int(resp[3])

print "There are " + str(count) + " articles to get"
print "First: " + str(first)
print "Last: " + str(last)
print "Using chunks size of: " + str(chunk_size)
print "It should take " + str(count/chunk_size) + " requests to finish"

id = int(first)

i = 0
master_list = []
while id < last:
	print str(i) +": Getting id's " + str(id) + " - " + str(id + chunk_size)
	resp, list = serv.xover(str(id), str(id+ chunk_size))
	print "Done fetching"
	print "Adding to master list"
	for line in list:
		article = (line[0], line[1], line[2], line[3], line[4])
		master_list.append(article)			
	id += chunk_size + 1
	i += 1
Example #3
0
    resp, count, first, last, name = s.group(group)
    print "Group [" + group + "] has " + count + " articles (" + first + ", " + last + ")"
    
    # Skip empty newsgroups
    if count > 0:

        # Read items info from group
        print "- Reading items"
        
        # DEBUG - ******** THIS NEEDS TO BE REMOVED, IT JUST LOOKS AT LAST 50 MESSAGES TO SAVE TIME FOR NOW *********
        #if int(last)-int(first) > 200:
        #    first = str(int(last)-200)
        #    print "-- DEBUG: Truncating to (" + first + "," + last + ")"
        # DEBUG
        
        resp, items = s.xover(first, last)
		
        # Find unique subjects and authors
        print "- Sorting items"
        items_unique, subject_number, author_numbers = find_unique_subjects_and_authors(items)
        print "-- There are " + str(subject_number) + " unique subjects in this forum"
        
        # Write group, number of subjects, average and maximum number of authors
        g = open('stats_message.csv', 'ab')
        group_writer = csv.writer(g)
        group_writer.writerow([group, subject_number, np.mean(author_numbers), np.max(author_numbers)])
        g.close()
        
        # Combine conversations
        print "- Combining conversations"
        conversations = combine_conversations(items_unique, subject_number)
Example #4
0
def newsgroup(G='', F='', C='', A=None, P=None, RESPONSE=None):
    """The article list for group G in framestyle F.

    G        -- newsgroup name (required)
    F        -- frame style override (0 = no frames, 1/3 = full frameset,
                2 = article-list pane only)
    C        -- max number of article headers to list (0 = all)
    A, P     -- optional NNTP AUTHINFO user name / password
    RESPONSE -- response object the HTML page is written to

    Indentation normalised to 4 spaces (the original mixed tabs and spaces).
    """
    if G == '':
        return "Missing newsgroup name."
    group = G
    showframes = 1
    show_articles = default_show_articles
    # Default the frame style from the browser's advertised capabilities.
    if os.environ.has_key('HTTP_USER_AGENT'):
        browser = os.environ['HTTP_USER_AGENT']
    else:
        browser = "unknown"
    if string.find(browser, "Mozilla/") == 0:
        browser_version = string.atof(browser[8:string.index(browser, ' ')])
        if browser_version >= 2.00:
            showframes = 3
    if F != '':
        try:
            showframes = string.atoi(F)
        except ValueError:
            # BUG FIX: string.atoi() raises ValueError on bad input; the
            # original caught AttributeError, which never fires here, so a
            # malformed F crashed the request instead of falling back to 0.
            showframes = 0
    if C != '':
        try:
            show_articles = string.atoi(C)
        except ValueError:
            # BUG FIX: same ValueError/AttributeError mix-up as above.
            show_articles = default_show_articles

    user = A
    acc_string = ''
    if A: acc_string = '&A=' + A
    password = P
    pass_string = ''
    if P: pass_string = '&P=' + P

    lines = []
    # Let caches expire the page a minute from now.
    RESPONSE.headers['expires'] = time.asctime(time.gmtime(time.time() + 60))
    RESPONSE.write( """<HTML><HEAD><TITLE>Tokyo PC Users Group: %s</TITLE></HEAD>""" % group)
    try:
        # NOTE(review): bare excepts kept below — this vintage code may rely
        # on non-Exception (string/old-style) raises from nntplib.
        try:
            news = NNTP(NEWS_SERVER)
        except:
            # BUG FIX: RESPONSE.write() takes a single string; the original
            # passed two arguments here, raising TypeError instead of
            # rendering the error page.
            RESPONSE.write( "<BODY><B>Can not connect to server:</B> " + NEWS_SERVER)
            raise NewsError

        try:
            resp = news.shortcmd('MODE READER')
        except:
            # BUG FIX: same two-argument write() as above.
            RESPONSE.write( "<BODY><B>Can not communicate with server:</B> " + NEWS_SERVER)
            raise NewsError

        # Optional NNTP authentication (AUTHINFO USER/PASS).
        if user:
            resp = news.shortcmd('authinfo user '+user)
            if resp[:3] == '381':   # 381 = password required
                if not password:
                    RESPONSE.write( "<BODY><B>Can not fetch newsgroup</B>")
                    raise NewsError
                else:
                    resp = news.shortcmd('authinfo pass '+password)
                    if resp[:3] != '281':   # 281 = authentication accepted
                        RESPONSE.write( "<BODY><B>Can not fetch newsgroup</B>")
                        raise NewsError

        try:
            resp, count, first, last, name = news.group(group)
        except:
            RESPONSE.write( "<BODY><B>No such newsgroup:</B> " + group )
            raise NewsError

        # Best-effort: fetch the group's one-line description (XGTITLE).
        description = ""
        try:
            resp, lines = news.xgtitle(group)
        except:
            pass
        else:
            for line in lines:
                name, description = line

        if showframes == 0:
            RESPONSE.write( '<BODY BGCOLOR="#FFFFFF"><H1>%s</H1>' % group)
            RESPONSE.write( "<EM>%s</EM><P>" % cgi.escape(description))
        elif showframes == 1 or showframes == 3:
            if description: description = "&D="+quote(description)
            RESPONSE.write( '<FRAMESET ROWS="33%,*">')
            RESPONSE.write( '  <FRAMESET COLS="220,*">')
            RESPONSE.write( '    <FRAME SRC="/cgi-bin/webnews/logo?G=%s%s" scrolling="auto">' % (group, description))
            RESPONSE.write( '    <FRAME SRC="/cgi-bin/webnews/newsgroup?G=%s&F=2%s%s#last" scrolling="yes"> ' % (group, acc_string, pass_string))
            RESPONSE.write( '  </FRAMESET>')
            if string.find(G, "ttalk") >= 0:
                RESPONSE.write( '  <FRAME SRC="http://ttalk.soholutions.com/welcome.html" scrolling="auto" name="d">')
            else:
                RESPONSE.write( '  <FRAME SRC="/webnews/welcome.html" scrolling="auto" name="d">')
            RESPONSE.write( '</FRAMESET><BODY BGCOLOR="#FFFFFF">')
        else:
            RESPONSE.write( '<BODY BGCOLOR="#FFFFFF">')

        # Frame style 3 emits only the frameset; the article list itself is
        # requested again by the F=2 frame, so bail out to the footer here.
        if showframes == 3:
            raise NewsError

        # Honour the header-count limit by moving 'first' forward and
        # offering a link to the untruncated (C=0) listing.
        if show_articles > 0:
            ilast = string.atoi(last)
            ifirst = string.atoi(first)
            if (ilast - ifirst + 1) > show_articles:
                first = "%d" % (ilast - show_articles + 1)
                RESPONSE.write( '<A HREF="/cgi-bin/webnews/newsgroup?G=%s&F=%d&C=0%s%s"><I>Retrieve earlier article headers</I></A> ' % (group, showframes, acc_string, pass_string))

        try:
            resp, lines = news.xover(first, last)
        except:
            RESPONSE.write( "<B>Unable to get article list for:</B> " + group)
            raise NewsError

        RESPONSE.write( '<UL TYPE="none">')

        # pass 1: build a dictionary of message IDs
        ids = {}
        index = 0
        for line in lines:
            art_nr, subject, poster, date, id, references, size, line_cnt = line
            ids[id] = index
            index = index + 1

        # pass 2: discover child articles (threading)
        childof = []
        subs = {}
        # re.I restores the '[Rr]e:' case-insensitivity of the regex this
        # pattern replaced; without it 'Re:' subjects were never stripped.
        subject_re_less = re.compile(r"(re:)?\s*(?P<real_subject>.*)", re.I)
        index = 0
        for line in lines:
            art_nr, subject, poster, date, id, references, size, line_cnt = line
            childof.append(-1)
            srl = subject_re_less.match(subject)
            if srl: subject = srl.group('real_subject')
            # if there are references, use them (most recent first)
            if len(references) > 0:
                references.reverse()
                for ref in references:
                    if ids.has_key(ref):
                        childof[index] = ids[ref]
                        break
            # if no references (or referee not found), use subject
            if childof[index] == -1:
                if subs.has_key(subject):
                    childof[index] = subs[subject]
                else:
                    subs[subject] = index
            index = index + 1

        # pass 3: emit each thread root together with its descendants
        index = 0
        for seq in childof:
            if seq == -1:
                show_article_and_kids(index, 0, lines, childof,
                                      acc_string, pass_string, RESPONSE)
            index = index + 1

        RESPONSE.write('<A NAME="last">&nbsp</A></UL>')

    finally:
        # Always close the page, even when NewsError aborted it early.
        if showframes != 2:
            if string.find(G, "ttalk") >= 0:
                RESPONSE.write( """<P><HR><P>A service of the
                <A HREF="http://www.soholutions.com/">SoHolutions</A>.""")
            else:
                RESPONSE.write( """<P><HR><P>A service of the
                <A HREF="http://www.tpc.ml.org/">Tokyo PC Users Group</A>.""")
        RESPONSE.write( """</BODY></HTML>""")
class Archive(object):
    """A read-only wrapper around one NNTP news group (a mailing-list archive)."""

    @staticmethod
    def is_diff(body):
        """Return True if any line of *body* starts a diff ("diff ...")."""
        return bool([line for line in body if line.startswith("diff ")])

    def __init__(self, group, server):
        self.conn = NNTP(server)
        resp, count, first, last, name = self.conn.group(group)

        self.group = group
        self.server = server
        # Lowest/highest article numbers currently available in the group.
        self.first = int(first)
        self.last = int(last)

    def get_number_from_user(self, msg_id):
        """
            Convert something the user might input into a message id.

            These are:
            # An NNTP message number
            # A gmane link that includes the NNTP message number
            # The original Message-Id header of the message.

            NOTE: gmane's doesn't include the message number in STAT requests
            that involve only the Message-Id (hence the convolution of getting
            all the headers).
        """
        msg_id = re.sub(r".*gmane.org/gmane.comp.version-control.git/([0-9]+).*", r"\1", str(msg_id))
        _, n, id, result = self.conn.head(msg_id)

        # The Xref header carries "<group>:<number>"; use it to recover the
        # article number even when the user supplied a Message-Id.
        for header in result:
            m = re.match(r"Xref: .*:([0-9]+)\s*$", header, re.I)
            if m:
                return int(m.group(1))
        else:
            raise FatalError("No (or bad) Xref header for message '%s'" % msg_id)

    def get_patch_series(self, user_input, search_limit=100):
        """
            Given an NNTP message number or a Message-Id header return
            an mbox containing the patches introduced by the author of that message.

            This handles the case where the threading is right *and* the patches
            are numbered in a simple scheme:

            [PATCH] this patch has no replies and stands on its own

            [PATCH 0/2] this is an introduction to the series
              |- [PATCH 1/2] the first commit
              |- [PATCH 2/2] the second commit

            [PATCH 1/3] this is the first commit
              |- [PATCH 2/3] and this is the second
                   |- [PATCH 3/3] and this is the third

            TODO: it would be nice to make the search more efficient, we can
            use the numbers in [PATCH <foo>/<bar>] to stop early.
        """

        start_id = self.get_number_from_user(user_input)

        messages = limit(self.messages_starting_from(start_id), search_limit)
        try:
            # BUG FIX: use the next() builtin — 'messages.next()' is
            # Python-2-only; next() works on 2.6+ and 3.
            thread = Thread(next(messages))
        except StopIteration:
            # BUG FIX: the id was never interpolated into this message.
            raise FatalError("No message at id '%s' using XOVER" % start_id)

        # Stop once five consecutive messages no longer belong to the thread.
        n_since_last = 0
        for message in messages:
            if n_since_last > 5:
                break

            elif thread.should_include(message):
                n_since_last = 0
                thread.append(message)

            else:
                n_since_last += 1

        else:
            # BUG FIX: interpolate search_limit (it was passed as a stray
            # second argument), matching the other FatalError call sites.
            raise FatalError('did not find end of series within %s messages' % search_limit)

        # Earlier parts of the series (e.g. the 0/N introduction) may precede
        # the message the user pointed at; scan a few earlier articles too.
        for message in self.xover(start_id - 5, start_id - 1):
            if thread.should_include(message):
                thread.append(message)

        return self.mboxify(thread)

    def mboxify(self, thread):
        """
            Convert a thread into an mbox for application via git-am.
        """
        lines = []

        for message in thread.in_order():
            _, number, msg_id, body = self.conn.body(str(message.number))

            # git-am doesn't like empty patches very much, and the 0/X'th patch is
            # often not a patch, we skip it here. (TODO, warn the user about this)
            if re.search(r" 0+/[0-9]+", message.subject) and not self.is_diff(body):
                continue

            # mbox "From " separator line, then the headers git-am needs.
            poster = parseaddr(message.poster)[0]
            date = ctime(mktime(parsedate(message.date)))
            lines.append("From %s %s" % (poster, date))

            lines.append("From: %s" % message.poster)
            lines.append("Subject: %s" % message.subject)
            lines.append("Date: %s" % message.date)
            lines.append("Message-Id: %s" % message.msg_id)
            lines.append("Xref: %s %s:%s" % (self.server, self.group, message.number))
            lines.append("References: %s" % "\n\t".join(message.references))
            lines.append("")
            lines += body
            lines.append("")

        return "\n".join(lines)

    def messages_starting_from(self, start_id):
        """
            Generate all message headers starting from the given id and working upwards.
        """
        # Fetch in batches of ~20 to bound the size of each XOVER reply.
        while start_id < self.last:
            next_id = min(start_id + 20, self.last)
            for message in self.xover(start_id, next_id):
                yield message

            start_id = next_id + 1

    def xover(self, begin, end):
        """
            Get the headers for the messages with numbers between begin and end.
        """
        # NOTE(review): an equal begin/end is treated as an empty range; a
        # single-message XOVER would return that one message — confirm this
        # short-circuit is intended by the callers.
        if begin == end:
            return []

        _, result = self.conn.xover(str(min(begin, end)), str(max(begin, end)))

        result = [Message(int(number), subject, poster, date, msg_id, references) for
                  (number, subject, poster, date, msg_id, references, size, lines) in result]

        return sorted(result, key=lambda x: x.number)