Example #1
0
def process_chunk(l0, l1, group, group_id,chunk,chunks):
	s=''
	try:
		s = NNTP_SSL('secure.news.astraweb.com', 443, 'nemik1', 'this1moois')
	except:
		return ("fail", 0)
	resp, count, first, last, name = s.group(group)
	
	print "\n------NOW PROCESSING CHUNK %d of %d (%d/%d) FROM %s-----\n\n" % (int(chunk),int(chunks),int(l0),int(l1),group)
	syslog.syslog("NOW PROCESSING CHUNK %d of %d (%d/%d) FROM %s" % (int(chunk),int(chunks),int(l0),int(l1),group))
	
	docs_xml = ""
	resp, subs = s.xover(str(l0), str(l1))
	
	#dictionary of files
	#key is file name which is subject_file
	#value is tuple of poster->String, date->Int(timestamp), subject_exact->String, total parts, list [] segments -> (), segment is tuple of (id, segment, size)
	files = {}
	id = 0
	for sub in subs: 
		size = sub[6]
		subject_exact = decode(sub[1])#.encode("utf-8").decode("utf-8")
		#maybe do this by looking for the pattern everywhere and taking the last one from the group, not just end of line? may files appearing as articles....
		subject_file = re.sub("\((\d+)\/(\d+)\)$", "", subject_exact)
		p = re.search("\((\d+)\/(\d+)\)$",subject_exact)
		#data PartInfo = Part (Int, Int)
		
		id = sub[0]
		
		poster = decode(sub[2])
		date_str = sub[3]
	
		#subject_file_id = query for it
		#if None subject_file_id insert into files

		#d = "Fri, 03 Jul 2009 19:31:58 +0200"
		date = int(time.mktime(rfc822.parsedate(date_str)))

		fid = 0
		
		#part is tuple w/ ints (part #, total)
		part=()
		try:
			part = (int(p.group(1)), int(p.group(2)))
		except:
			c.execute("SELECT id FROM articles WHERE id = %s", (sub[4],))
			aid = c.fetchone()
			if aid != None:
				continue
			else:
				try:
					print "ARTICLE: : %s" % subject_exact
				except:
					print "ARTICLE: f**k you print-ascii"
				#c.execute("INSERT INTO articles (id, group_id, subject, poster, date) VALUES (%s, %s, %s, %s, %s)", (sub[4], group_id, subject_exact, poster, date,))
				#c.execute("INSERT INTO articles (id, group_id, subject, poster, date, time) VALUES (%s, %s, %s, %s, %s, current_timestamp)", (sub[4], group_id, subject_exact, poster, date,))
				c.execute("INSERT INTO articles (id, group_id, subject, poster, date, time) VALUES (%s, %s, %s, %s, %s, (TIMESTAMP 'epoch' + %s * INTERVAL '1 second'))", (sub[4], group_id, subject_exact, poster, date,date,))
				connection.commit()
				continue
		
		#guaranteed the file will be in the DB
		c.execute("SELECT id FROM files WHERE subject = %s", (subject_file))
		subject_file_id = c.fetchone()
		if subject_file_id != None:
			subject_file_id = int(subject_file_id[0])
			fid = subject_file_id
		else:
			try:
				print "processing %s" % subject_file
			except:
				print "processing SOMETHING CAN'T BE PRINTED, f**k you ascii"
			#c.execute("INSERT INTO files (subject, name, poster, parts_total, date) VALUES (%s, %s, %s, %s, %s)", (subject_file, subject_file, poster, part[1], date))
			#c.execute("INSERT INTO files (subject, name, poster, parts_total, date, time) VALUES (%s, %s, %s, %s, %s, current_timestamp)", (subject_file, subject_file, poster, part[1], date))
			c.execute("INSERT INTO files (subject, name, poster, parts_total, date, time) VALUES (%s, %s, %s, %s, %s, (TIMESTAMP 'epoch' + %s * INTERVAL '1 second'))", (subject_file, subject_file, poster, part[1], date,date))
			connection.commit()

			headers = s.head(id)
			groups_posted = ""
			for header in headers[3]:
				if "Newsgroups: " in header:
					groups_posted = header.replace('Newsgroups: ','').split(',')
	
			if fid == 0:
				c.execute("SELECT id FROM files WHERE name = %s", (subject_file,))
				fid = int(c.fetchone()[0])
			#guarantee that group exists otherwise create it
			gid = 0
			for group in groups_posted:
				c.execute("SELECT id FROM groups WHERE name = %s", (group,))
				gid = c.fetchone()
				#if i dont have that group, add it
				if gid != None:
					gid = int(gid[0])
				else:
					c.execute("INSERT INTO groups (name, last_article) VALUES(%s, %s)", (group, 0,))
					connection.commit()
				if gid == 0:
					c.execute("SELECT id FROM groups WHERE name = %s", (group,))
					gid = int(c.fetchone())
				c.execute("SELECT id FROM file_group WHERE file_id = %s AND group_id = %s", (fid, gid,))
				exists = c.fetchone()
				if exists != None:
					continue
				else:
					c.execute("INSERT INTO file_group (file_id, group_id) VALUES (%s, %s)", (fid, gid,))
					connection.commit()

		#print "sub is %s, groups are %s" % (sub, groups_posted)
		#add part to parts table and associate with file
		try:
			c.execute("INSERT INTO parts (id, file_id, bytes, number) VALUES (%s, %s, %s, %s)", (sub[4], fid, size, part[0]))
			connection.commit()
		except:
			continue
	
	firprint = str("first article:"+subs[1][0]+"last article:"+subs[-1][0]+"id is "+id)
	print firprint
	syslog.syslog(firprint)
	c.execute("UPDATE groups SET last_article = %s WHERE name = %s", (subs[-1][0], group,))
	connection.commit()
	
	print "\n\n ---- STATUS OF %s UDPATED! ---- \n\n" % (group)
	syslog.syslog("STATUS OF %s UDPATED!" % (group))
	
	s.quit()
	return (docs_xml,subs[-1][0])
Example #2
0
File: ss.py  Project: nemik/usenet-indexer
def process_chunk(l0, l1, group, group_id, chunk, chunks):
    s = ""
    try:
        s = NNTP_SSL("secure.news.astraweb.com", 443, "nemik1", "this1moois")
    except:
        return ("fail", 0)
    resp, count, first, last, name = s.group(group)

    print "\n------NOW PROCESSING CHUNK %d of %d (%d/%d) FROM %s-----\n\n" % (
        int(chunk),
        int(chunks),
        int(l0),
        int(l1),
        group,
    )
    syslog.syslog("NOW PROCESSING CHUNK %d of %d (%d/%d) FROM %s" % (int(chunk), int(chunks), int(l0), int(l1), group))

    docs_xml = ""
    resp, subs = s.xover(str(l0), str(l1))

    # dictionary of files
    # key is file name which is subject_file
    # value is tuple of poster->String, date->Int(timestamp), subject_exact->String, total parts, list [] segments -> (), segment is tuple of (id, segment, size)
    files = {}
    id = 0
    for sub in subs:
        size = sub[6]
        subject_exact = decode(sub[1])  # .encode("utf-8").decode("utf-8")
        # maybe do this by looking for the pattern everywhere and taking the last one from the group, not just end of line? may files appearing as articles....
        subject_file = re.sub("\((\d+)\/(\d+)\)$", "", subject_exact)
        p = re.search("\((\d+)\/(\d+)\)$", subject_exact)
        # data PartInfo = Part (Int, Int)

        id = sub[0]

        poster = decode(sub[2])
        date_str = sub[3]

        # subject_file_id = query for it
        # if None subject_file_id insert into files

        # d = "Fri, 03 Jul 2009 19:31:58 +0200"
        date = int(time.mktime(rfc822.parsedate(date_str)))

        fid = 0

        # part is tuple w/ ints (part #, total)
        part = ()
        try:
            part = (int(p.group(1)), int(p.group(2)))
        except:
            c.execute("SELECT id FROM articles WHERE id = %s", (sub[4],))
            aid = c.fetchone()
            if aid != None:
                continue
            else:
                try:
                    print "ARTICLE: : %s" % subject_exact
                except:
                    print "ARTICLE: f**k you print-ascii"
                    # c.execute("INSERT INTO articles (id, group_id, subject, poster, date) VALUES (%s, %s, %s, %s, %s)", (sub[4], group_id, subject_exact, poster, date,))
                    # c.execute("INSERT INTO articles (id, group_id, subject, poster, date, time) VALUES (%s, %s, %s, %s, %s, current_timestamp)", (sub[4], group_id, subject_exact, poster, date,))
                c.execute(
                    "INSERT INTO articles (id, group_id, subject, poster, date, time) VALUES (%s, %s, %s, %s, %s, (TIMESTAMP 'epoch' + %s * INTERVAL '1 second'))",
                    (sub[4], group_id, subject_exact, poster, date, date),
                )
                connection.commit()
                continue

                # guaranteed the file will be in the DB
        c.execute("SELECT id FROM files WHERE subject = %s", (subject_file))
        subject_file_id = c.fetchone()
        if subject_file_id != None:
            subject_file_id = int(subject_file_id[0])
            fid = subject_file_id
        else:
            try:
                print "processing %s" % subject_file
            except:
                print "processing SOMETHING CAN'T BE PRINTED, f**k you ascii"
                # c.execute("INSERT INTO files (subject, name, poster, parts_total, date) VALUES (%s, %s, %s, %s, %s)", (subject_file, subject_file, poster, part[1], date))
                # c.execute("INSERT INTO files (subject, name, poster, parts_total, date, time) VALUES (%s, %s, %s, %s, %s, current_timestamp)", (subject_file, subject_file, poster, part[1], date))
            c.execute(
                "INSERT INTO files (subject, name, poster, parts_total, date, time) VALUES (%s, %s, %s, %s, %s, (TIMESTAMP 'epoch' + %s * INTERVAL '1 second'))",
                (subject_file, subject_file, poster, part[1], date, date),
            )
            connection.commit()

            headers = s.head(id)
            groups_posted = ""
            for header in headers[3]:
                if "Newsgroups: " in header:
                    groups_posted = header.replace("Newsgroups: ", "").split(",")

            if fid == 0:
                c.execute("SELECT id FROM files WHERE name = %s", (subject_file,))
                fid = int(c.fetchone()[0])
                # guarantee that group exists otherwise create it
            gid = 0
            for group in groups_posted:
                c.execute("SELECT id FROM groups WHERE name = %s", (group,))
                gid = c.fetchone()
                # if i dont have that group, add it
                if gid != None:
                    gid = int(gid[0])
                else:
                    c.execute("INSERT INTO groups (name, last_article) VALUES(%s, %s)", (group, 0))
                    connection.commit()
                if gid == 0:
                    c.execute("SELECT id FROM groups WHERE name = %s", (group,))
                    gid = int(c.fetchone())
                c.execute("SELECT id FROM file_group WHERE file_id = %s AND group_id = %s", (fid, gid))
                exists = c.fetchone()
                if exists != None:
                    continue
                else:
                    c.execute("INSERT INTO file_group (file_id, group_id) VALUES (%s, %s)", (fid, gid))
                    connection.commit()

                    # print "sub is %s, groups are %s" % (sub, groups_posted)
                    # add part to parts table and associate with file
        try:
            c.execute(
                "INSERT INTO parts (id, file_id, bytes, number) VALUES (%s, %s, %s, %s)", (sub[4], fid, size, part[0])
            )
            connection.commit()
        except:
            continue

    firprint = str("first article:" + subs[1][0] + "last article:" + subs[-1][0] + "id is " + id)
    print firprint
    syslog.syslog(firprint)
    c.execute("UPDATE groups SET last_article = %s WHERE name = %s", (subs[-1][0], group))
    connection.commit()

    print "\n\n ---- STATUS OF %s UDPATED! ---- \n\n" % (group)
    syslog.syslog("STATUS OF %s UDPATED!" % (group))

    s.quit()
    return (docs_xml, subs[-1][0])
Example #3
0
def process_group(group,c,connection):
	
	docs_xml = ""
	
	last_article = 0

	not_in_db = False
	group_id = 0

	#guarantees group is in the DB
	"""
	read last good article for the group from DB or file
	"""
	#last_article = int(file('last_article','r').readlines()[0])
	c.execute("SELECT id, last_article FROM groups WHERE name = %s", (group,))
	rows=c.fetchall()
	if len(rows) > 0:
		last_article = int(rows[0][1])
		group_id = int(rows[0][0])
	else:
		print "ADDING %s to DB!" % (group)
		not_in_db = True
		c.execute("INSERT INTO groups (name, last_article) VALUES (%s, %s)", (group,0))
		connection.commit()
		c.execute("SELECT id FROM groups WHERE name = %s", (group,))
		group_id = int(c.fetchone()[0])

	print "last article is %s" % last_article
	
	#connect to usenet, with SSL! WOO!
	s=''
	try:
		s = NNTP_SSL('secure.news.astraweb.com', 443, 'nemik1', 'this1moois')
	except:
		return ("fail", 0)
	
	#group_table = group_to_table(group)
	#print group_table
	
	resp, count, first, last, name = s.group(group)
	
	print 'Group', name, 'has', count, 'articles, range', first, 'to', last
	
	tolerance = 10000
	#tolerance = 300
	
	#oldest = last
	if last_article == 0:# and not_in_db:
		latest = str(int(last) - tolerance)
		print "NOT IN DB! USING TOLERANCE OF %d!!!" % (tolerance)
	else:
		latest = str(last_article)
	to_do = int(last) - int(latest)
	print "\n------NOW PROCESSING %d ARTICLES FROM %s-----\n\n" % (to_do, group)
	syslog.syslog("NOW PROCESSING %d ARTICLES FROM %s" % (to_do, group))
	if to_do ==0:
		s.quit()
		return ("",0)

	s.quit()
	
	#split work of todo into 5,000 article chunks
	chunks = (to_do / 5000) + 1
	if chunks == 1:
		print "only 1 chunk"
		return process_chunk(latest, last, group, group_id, 1, chunks)

	end = int(latest)
	for chunk in range(1,chunks):
		if chunk == len(range(0,chunks)):
			print "last chunk"
			return process_chunk(end, last, group, group_id,chunk,chunks)
		process_chunk(end,end+5000, group, group_id,chunk,chunks)
		end = end+5000
Example #4
0
File: ss.py  Project: nemik/usenet-indexer
def process_group(group, c, connection):

    docs_xml = ""

    last_article = 0

    not_in_db = False
    group_id = 0

    # guarantees group is in the DB
    """
	read last good article for the group from DB or file
	"""
    # last_article = int(file('last_article','r').readlines()[0])
    c.execute("SELECT id, last_article FROM groups WHERE name = %s", (group,))
    rows = c.fetchall()
    if len(rows) > 0:
        last_article = int(rows[0][1])
        group_id = int(rows[0][0])
    else:
        print "ADDING %s to DB!" % (group)
        not_in_db = True
        c.execute("INSERT INTO groups (name, last_article) VALUES (%s, %s)", (group, 0))
        connection.commit()
        c.execute("SELECT id FROM groups WHERE name = %s", (group,))
        group_id = int(c.fetchone()[0])

    print "last article is %s" % last_article

    # connect to usenet, with SSL! WOO!
    s = ""
    try:
        s = NNTP_SSL("secure.news.astraweb.com", 443, "nemik1", "this1moois")
    except:
        return ("fail", 0)

        # group_table = group_to_table(group)
        # print group_table

    resp, count, first, last, name = s.group(group)

    print "Group", name, "has", count, "articles, range", first, "to", last

    tolerance = 10000
    # tolerance = 300

    # oldest = last
    if last_article == 0:  # and not_in_db:
        latest = str(int(last) - tolerance)
        print "NOT IN DB! USING TOLERANCE OF %d!!!" % (tolerance)
    else:
        latest = str(last_article)
    to_do = int(last) - int(latest)
    print "\n------NOW PROCESSING %d ARTICLES FROM %s-----\n\n" % (to_do, group)
    syslog.syslog("NOW PROCESSING %d ARTICLES FROM %s" % (to_do, group))
    if to_do == 0:
        s.quit()
        return ("", 0)

    s.quit()

    # split work of todo into 5,000 article chunks
    chunks = (to_do / 5000) + 1
    if chunks == 1:
        print "only 1 chunk"
        return process_chunk(latest, last, group, group_id, 1, chunks)

    end = int(latest)
    for chunk in range(1, chunks):
        if chunk == len(range(0, chunks)):
            print "last chunk"
            return process_chunk(end, last, group, group_id, chunk, chunks)
        process_chunk(end, end + 5000, group, group_id, chunk, chunks)
        end = end + 5000