Пример #1
0
	def splitBunchedCollections(self):
		"""Split collections flagged with ``filecheck = 10`` into separate collections.

		Every binary that belongs to a collection with ``filecheck = 10`` is
		re-hashed from its own name (cleaned with
		``namecleaning.collectionsCleaner(name, 'split')``), the poster, the
		group ID and the file total.  The binary is then attached to the
		collection carrying that hash; the collection is created with
		``filecheck = 11`` when it does not exist yet, otherwise only its
		``dateadded`` is refreshed.  Afterwards the original collections are
		deleted and every ``filecheck = 11`` collection is reset to 0.

		Returns:
			None.
		"""
		mdb = db.DB()
		res = mdb.queryDirect("SELECT b.ID as bID, b.name as bname, c.* FROM binaries b LEFT JOIN collections c ON b.collectionID = c.ID where c.filecheck = 10")
		# nothing was flagged (or the query returned no result): nothing to split
		if not res:
			return

		print('Extracting bunched up collections.')
		bunchedcnt = 0
		# a set, because one old collection shows up once per binary it holds
		staleCollectionIDs = set()
		for row in res:
			staleCollectionIDs.add(row['ID'])
			# str() keeps the concatenation working when the driver hands back
			# numeric column values; scan() builds its collection hash the same way
			hashSource = (
				namecleaning.collectionsCleaner(row['bname'], 'split')
				+ row['fromname']
				+ str(row['groupID'])
				+ str(row['totalFiles'])
			)
			newMD5 = hashlib.md5(hashSource).hexdigest()
			cres = mdb.queryOneRow("SELECT ID FROM collections WHERE collectionhash = %s", (newMD5,))
			if not cres:
				bunchedcnt += 1
				csql = "INSERT INTO collections (name, subject, fromname, date, xref, groupID, totalFiles, collectionhash, filecheck, dateadded) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, 11, now())"
				collectionID = mdb.queryInsert(csql, (namecleaning.releaseCleaner(row['bname']), row['bname'], row['fromname'], row['date'], row['xref'], row['groupID'], row['totalFiles'], newMD5))
			else:
				collectionID = cres['ID']
				# update the collection table with the last seen date for the collection
				mdb.queryDirect("UPDATE collections set dateadded = now() where ID = %s", (collectionID,))
			# move the binary over to its new collection
			mdb.query("UPDATE binaries SET collectionID = %s where ID = %s", (collectionID, row['bID'],))
			# NOTE(review): this writes binaryID back with the value it already
			# has, so it changes no rows; kept to preserve the existing sequence
			# of statements -- confirm whether it can be dropped.
			mdb.query("UPDATE parts SET binaryID = %s where binaryID = %s", (row['bID'], row['bID'],))

		# remove the old collections
		for cID in staleCollectionIDs:
			mdb.query("DELETE FROM collections WHERE ID = %s", (cID,))

		# update the collections to say we are done
		mdb.query("UPDATE collections SET filecheck = 0 WHERE filecheck = 11")
		print('Extracted %d bunched collections.' % bunchedcnt)
Пример #2
0
	def scan(self, nntp, groupArr, first, last, stype='update'):
		"""Fetch overview headers for a range of articles and store the binaries found.

		An overview whose subject ends in ``(part/total)`` is treated as one part
		of a binary.  Parts are grouped per subject in ``self.message`` and then
		written to the ``collections``, ``binaries`` and ``parts`` tables with
		autocommit switched off and a single commit at the end.
		``self.message`` is reset to an empty dict before returning.

		Args:
			nntp: object whose ``over((first, last))`` returns ``(response, overviews)``.
			groupArr: mapping describing the group; ``'name'`` is handed to
				``isBlackListed`` and ``'ID'`` is used for hashing and as ``groupID``.
			first: first article number of the requested range.
			last: last article number of the requested range (inclusive).
			stype: ``'update'`` (default), ``'backfill'`` or ``'partrepair'``.
				``'backfill'`` does not pass articles the server failed to return
				to ``addMissingParts``; ``'partrepair'`` suppresses progress output.

		Returns:
			``None`` when ``overviews`` is not a list.  Otherwise, when binaries
			were collected, the largest article number among their parts (never
			less than ``first``); when none were collected, ``last``.
		"""
		mdb = DB()
		self.startHeaders = time.time()
		resp, overviews = nntp.over((first, last))
		timeHeaders = int(time.time() - self.startHeaders)

		# both ends of the range are requested, so ``last`` must be included
		rangerequested = range(first, last + 1)
		msgsreceived = list()
		msgsblacklisted = list()
		msgsignored = list()
		msgsnotinserted = list()

		self.startCleaning = time.time()
		if not isinstance(overviews, list):
			return None

		# "(part/total)" at the very end of the subject: the part counter
		pRegex = re.compile(r'\((\d+)\/(\d+)\)$', re.IGNORECASE)
		# "[file/total]", "(file of total)", "file-total" ...: the file counter
		cRegex = re.compile(r'(\[|\(|\s)(\d{1,4})(\/|(\s|_)of(\s|_)|\-)(\d{1,4})(\]|\)|\s)(?!"?$)')

		# loop articles, figure out files/parts
		for msg in overviews:
			try:
				number = msg[0]
			except IndexError:
				continue

			msgsreceived.append(number)

			# not a binary post most likely.. count it as ignored and continue
			try:
				rawSubject = msg[1]['subject']
			except KeyError:
				msgsignored.append(number)
				continue
			matches = pRegex.search(rawSubject)
			if matches is None:
				msgsignored.append(number)
				continue

			# filter subject based on black/white list
			if self.isBlackListed(msg, groupArr['name']):
				msgsblacklisted.append(number)
				continue

			# attempt to get file count; fall back to zeros when there is none
			filecnt = cRegex.search(rawSubject)
			if filecnt is None:
				filecnt = ['0'] * 6
			else:
				filecnt = list(filecnt.groups())

			partNumber, totalParts = [str(x).strip() for x in matches.groups()]
			if not (partNumber.isdigit() and totalParts.isdigit()):
				continue

			subject = pRegex.sub('', rawSubject).strip().encode('utf-8', 'ignore')
			cleansubject = namecleaning.collectionsCleaner(rawSubject)

			# only the first article seen for a subject creates the entry;
			# later articles must add their part instead of replacing it
			if subject not in self.message:
				entry = msg[1]
				self.message[subject] = entry
				entry['MaxParts'] = int(totalParts)
				# NOTE(review): '%s' is not a documented strftime directive; it
				# depends on the platform C library to yield epoch seconds
				entry['Date'] = parse(entry['date']).strftime('%s')
				entry['CollectionHash'] = hashlib.md5(cleansubject+msg[1]['from']+str(groupArr['ID'])+str(filecnt[5])).hexdigest()
				entry['MaxFiles'] = int(filecnt[5])
				entry['File'] = int(filecnt[1])

			if int(partNumber) > 0:
				parts = self.message[subject].setdefault('Parts', dict())
				parts[int(partNumber)] = {'Message-ID' : msg[1]['message-id'][1:-1], 'number' : number, 'part' : int(partNumber), 'size' : msg[1][':bytes']}

		timeCleaning = int(time.time() - self.startCleaning)
		maxnum = last
		rangenotreceived = list(set(rangerequested) - set(msgsreceived))

		if stype != 'partrepair':
			# same text the comma-separated print statement produced
			print('Received  %d articles of %d requested, %d blacklisted, %d not binary.' % (len(msgsreceived), last-first+1, len(msgsblacklisted), len(msgsignored)))

		if rangenotreceived:
			# dont add missing articles while backfilling
			if stype != 'backfill' and self.DoPartRepair:
				self.addMissingParts(rangenotreceived, groupArr['ID'])

			if stype != 'partrepair':
				print('Server did not return %d articles.' % (len(rangenotreceived)))

		self.startUpdate = time.time()
		# with nothing collected, maxnum stays at ``last`` and the database is left alone
		if self.message:
			maxnum = first
			# insert binaries and parts into database. When binaries already exists; only insert new parts
			insPartsStmt = "INSERT IGNORE INTO parts (binaryID, number, messageID, partnumber, size) VALUES (%s, %s, %s, %s, %s)"

			# remember the previous lookup so consecutive entries with the same hash skip the SELECT
			lastCollectionHash = ''
			lastCollectionID = -1
			lastBinaryHash = ''
			lastBinaryID = -1

			mdb.setAutoCommit(False)

			for subject, data in self.message.iteritems():
				collectionHash = data['CollectionHash']
				subject = namecleaning.unfuckString(subject)

				if lastCollectionHash == collectionHash:
					collectionID = lastCollectionID
				else:
					lastCollectionHash = collectionHash
					lastBinaryHash = ''
					lastBinaryID = -1

					cres = mdb.queryOneRow("SELECT ID FROM collections WHERE collectionhash = %s", (collectionHash,))
					if cres is None:
						cleanerName = namecleaning.releaseCleaner(subject)
						csql = "INSERT INTO collections (name, subject, fromname, date, xref, groupID, totalFiles, collectionhash, dateadded) VALUES (%s, %s, %s, FROM_UNIXTIME(%s), %s, %s, %s, %s, now())"
						collectionID = mdb.queryInsert(csql, (cleanerName, subject, data['from'], data['Date'], data['xref'], groupArr['ID'], data['MaxFiles'], collectionHash))
					else:
						collectionID = int(cres['ID'])
						cusql = 'UPDATE collections SET dateadded = now() where ID = %s'
						mdb.queryDirect(cusql, (collectionID,))

					lastCollectionID = collectionID
				binaryHash = hashlib.md5(subject+data['from']+str(groupArr['ID'])).hexdigest()

				if lastBinaryHash == binaryHash:
					binaryID = lastBinaryID
				else:
					lastBinaryHash = binaryHash

					bres = mdb.queryOneRow('SELECT ID FROM binaries WHERE binaryhash = %s', (binaryHash,))
					if bres is None:
						bsql = "INSERT INTO binaries (binaryhash, name, collectionID, totalParts, filenumber) VALUES (%s, %s, %s, %s, %s)"
						binaryID = mdb.queryInsert(bsql, (binaryHash, subject, collectionID, data['MaxParts'], round(data['File'])))
					else:
						binaryID = bres['ID']
					lastBinaryID = binaryID

				# a subject that only ever showed part 0 has no 'Parts' entry
				for partdata in data.get('Parts', {}).values():
					maxnum = max(maxnum, partdata['number'])
					params = (binaryID, partdata['number'], partdata['Message-ID'], round(partdata['part']), partdata['size'])

					try:
						mdb.query(insPartsStmt, params)
					except MySQLdb.Error:
						# remember the article so it can be queued for repair below
						msgsnotinserted.append(partdata['number'])

			if msgsnotinserted:
				print('WARNING: %d parts failed to insert.' % len(msgsnotinserted))
				if self.DoPartRepair:
					self.addMissingParts(msgsnotinserted, groupArr['ID'])
			mdb.commit()
			mdb.setAutoCommit(True)
		timeUpdate = int(time.time() - self.startUpdate)
		# self.startLoop is not assigned in this method; it has to be set by the caller
		timeLoop = int(time.time() - self.startLoop)

		if stype != 'partrepair':
			print('%ds to download articles, %ds to clean articles, %d to insert articles, %ds total.\n\n' % (timeHeaders, timeCleaning, timeUpdate, timeLoop))
		# start the next scan with an empty buffer
		self.message = {}
		return maxnum
Пример #3
0
    def scan(self, nntp, groupArr, first, last, stype='update'):
        """Fetch overview headers for a range of articles and store the binaries found.

        An overview whose subject ends in ``(part/total)`` is treated as one part
        of a binary.  Parts are grouped per subject in ``self.message`` and then
        written to the ``collections``, ``binaries`` and ``parts`` tables with
        autocommit switched off and a single commit at the end.
        ``self.message`` is reset to an empty dict before returning.

        Args:
            nntp: object whose ``over((first, last))`` returns ``(response, overviews)``.
            groupArr: mapping describing the group; ``'name'`` is handed to
                ``isBlackListed`` and ``'ID'`` is used for hashing and as ``groupID``.
            first: first article number of the requested range.
            last: last article number of the requested range (inclusive).
            stype: ``'update'`` (default), ``'backfill'`` or ``'partrepair'``.
                ``'backfill'`` does not pass articles the server failed to return
                to ``addMissingParts``; ``'partrepair'`` suppresses progress output.

        Returns:
            ``None`` when ``overviews`` is not a list.  Otherwise, when binaries
            were collected, the largest article number among their parts (never
            less than ``first``); when none were collected, ``last``.
        """
        mdb = DB()
        self.startHeaders = time.time()
        resp, overviews = nntp.over((first, last))
        timeHeaders = int(time.time() - self.startHeaders)

        # both ends of the range are requested, so ``last`` must be included
        rangerequested = range(first, last + 1)
        msgsreceived = list()
        msgsblacklisted = list()
        msgsignored = list()
        msgsnotinserted = list()

        self.startCleaning = time.time()
        if not isinstance(overviews, list):
            return None

        # "(part/total)" at the very end of the subject: the part counter
        pRegex = re.compile(r'\((\d+)\/(\d+)\)$', re.IGNORECASE)
        # "[file/total]", "(file of total)", "file-total" ...: the file counter
        cRegex = re.compile(
            r'(\[|\(|\s)(\d{1,4})(\/|(\s|_)of(\s|_)|\-)(\d{1,4})(\]|\)|\s)(?!"?$)'
        )

        # loop articles, figure out files/parts
        for msg in overviews:
            try:
                number = msg[0]
            except IndexError:
                continue

            msgsreceived.append(number)

            # not a binary post most likely.. count it as ignored and continue
            try:
                rawSubject = msg[1]['subject']
            except KeyError:
                msgsignored.append(number)
                continue
            matches = pRegex.search(rawSubject)
            if matches is None:
                msgsignored.append(number)
                continue

            # filter subject based on black/white list
            if self.isBlackListed(msg, groupArr['name']):
                msgsblacklisted.append(number)
                continue

            # attempt to get file count; fall back to zeros when there is none
            filecnt = cRegex.search(rawSubject)
            if filecnt is None:
                filecnt = ['0'] * 6
            else:
                filecnt = list(filecnt.groups())

            partNumber, totalParts = [str(x).strip() for x in matches.groups()]
            if not (partNumber.isdigit() and totalParts.isdigit()):
                continue

            subject = pRegex.sub('', rawSubject).strip().encode(
                'utf-8', 'ignore')
            cleansubject = namecleaning.collectionsCleaner(rawSubject)

            # only the first article seen for a subject creates the entry;
            # later articles just add their part to it
            if subject not in self.message:
                entry = msg[1]
                self.message[subject] = entry
                entry['MaxParts'] = int(totalParts)
                # NOTE(review): '%s' is not a documented strftime directive; it
                # depends on the platform C library to yield epoch seconds
                entry['Date'] = parse(entry['date']).strftime('%s')
                entry['CollectionHash'] = hashlib.md5(
                    cleansubject + msg[1]['from'] +
                    str(groupArr['ID']) + str(filecnt[5])).hexdigest()
                entry['MaxFiles'] = int(filecnt[5])
                entry['File'] = int(filecnt[1])

            if int(partNumber) > 0:
                parts = self.message[subject].setdefault('Parts', dict())
                parts[int(partNumber)] = {
                    'Message-ID': msg[1]['message-id'][1:-1],
                    'number': number,
                    'part': int(partNumber),
                    'size': msg[1][':bytes']
                }

        timeCleaning = int(time.time() - self.startCleaning)
        maxnum = last
        rangenotreceived = list(set(rangerequested) - set(msgsreceived))

        if stype != 'partrepair':
            # same text the comma-separated print statement produced
            print(
                'Received  %d articles of %d requested, %d blacklisted, %d not binary.'
                % (len(msgsreceived), last - first + 1, len(msgsblacklisted),
                   len(msgsignored)))

        if rangenotreceived:
            # dont add missing articles while backfilling
            if stype != 'backfill' and self.DoPartRepair:
                self.addMissingParts(rangenotreceived, groupArr['ID'])

            if stype != 'partrepair':
                print('Server did not return %d articles.' % (
                    len(rangenotreceived)))

        self.startUpdate = time.time()
        # with nothing collected, maxnum stays at ``last`` and the database is left alone
        if self.message:
            maxnum = first
            # insert binaries and parts into database. When binaries already exists; only insert new parts
            insPartsStmt = "INSERT IGNORE INTO parts (binaryID, number, messageID, partnumber, size) VALUES (%s, %s, %s, %s, %s)"

            # remember the previous lookup so consecutive entries with the same hash skip the SELECT
            lastCollectionHash = ''
            lastCollectionID = -1
            lastBinaryHash = ''
            lastBinaryID = -1

            mdb.setAutoCommit(False)

            for subject, data in self.message.iteritems():
                collectionHash = data['CollectionHash']
                subject = namecleaning.unfuckString(subject)

                if lastCollectionHash == collectionHash:
                    collectionID = lastCollectionID
                else:
                    lastCollectionHash = collectionHash
                    lastBinaryHash = ''
                    lastBinaryID = -1

                    cres = mdb.queryOneRow(
                        "SELECT ID FROM collections WHERE collectionhash = %s",
                        (collectionHash, ))
                    if cres is None:
                        cleanerName = namecleaning.releaseCleaner(subject)
                        csql = "INSERT INTO collections (name, subject, fromname, date, xref, groupID, totalFiles, collectionhash, dateadded) VALUES (%s, %s, %s, FROM_UNIXTIME(%s), %s, %s, %s, %s, now())"
                        collectionID = int(
                            mdb.queryInsert(
                                csql, (cleanerName, subject, data['from'],
                                       data['Date'], data['xref'],
                                       groupArr['ID'], data['MaxFiles'],
                                       collectionHash)))
                    else:
                        collectionID = int(cres['ID'])
                        cusql = 'UPDATE collections SET dateadded = now() where ID = %s'
                        mdb.queryDirect(cusql, (collectionID, ))

                    lastCollectionID = collectionID
                binaryHash = hashlib.md5(subject + data['from'] +
                                         str(groupArr['ID'])).hexdigest()

                if lastBinaryHash == binaryHash:
                    binaryID = lastBinaryID
                else:
                    lastBinaryHash = binaryHash

                    bres = mdb.queryOneRow(
                        'SELECT ID FROM binaries WHERE binaryhash = %s',
                        (binaryHash, ))
                    if bres is None:
                        bsql = "INSERT INTO binaries (binaryhash, name, collectionID, totalParts, filenumber) VALUES (%s, %s, %s, %s, %s)"
                        binaryID = mdb.queryInsert(
                            bsql, (binaryHash, subject, collectionID,
                                   data['MaxParts'], round(data['File'])))
                    else:
                        binaryID = bres['ID']
                    lastBinaryID = binaryID

                # a subject that only ever showed part 0 has no 'Parts' entry
                for partdata in data.get('Parts', {}).values():
                    maxnum = max(maxnum, partdata['number'])
                    params = (binaryID, partdata['number'],
                              partdata['Message-ID'],
                              round(partdata['part']), partdata['size'])

                    try:
                        mdb.query(insPartsStmt, params)
                    except MySQLdb.Error:
                        # remember the article so it can be queued for repair below
                        msgsnotinserted.append(partdata['number'])

            if msgsnotinserted:
                print('WARNING: %d parts failed to insert.' % len(
                    msgsnotinserted))
                if self.DoPartRepair:
                    self.addMissingParts(msgsnotinserted, groupArr['ID'])
            mdb.commit()
            mdb.setAutoCommit(True)
        timeUpdate = int(time.time() - self.startUpdate)
        # self.startLoop is not assigned in this method; it has to be set by the caller
        timeLoop = int(time.time() - self.startLoop)

        if stype != 'partrepair':
            print('%ds to download articles, %ds to clean articles, %d to insert articles, %ds total.\n\n' % (
                timeHeaders, timeCleaning, timeUpdate, timeLoop))
        # start the next scan with an empty buffer
        self.message = {}
        return maxnum