def splitBunchedCollections(self):
    """Move binaries out of collections flagged ``filecheck = 10``.

    For every binary attached to a flagged collection, a collection hash is
    recomputed from the binary's own name (cleaned in ``'split'`` mode), the
    poster, the group ID and the file total.  The binary is re-attached to
    the collection carrying that hash, creating it (with ``filecheck = 11``)
    when it does not exist yet.  Afterwards the flagged collections are
    deleted and the freshly created ones are reset to ``filecheck = 0``.

    Returns nothing; progress is reported on stdout.
    """
    mdb = db.DB()
    res = mdb.queryDirect("SELECT b.ID as bID, b.name as bname, c.* FROM binaries b LEFT JOIN collections c ON b.collectionID = c.ID where c.filecheck = 10")
    if not res:
        return

    print('Extracting bunched up collections.')
    bunchedcnt = 0
    # A set, because every binary of a flagged collection repeats its ID.
    oldCollectionIDs = set()

    for row in res:
        oldCollectionIDs.add(row['ID'])

        # groupID / totalFiles are converted with str() so the concatenation
        # cannot fail on numeric column values; this is also how scan()
        # assembles the text it hashes for a collection.
        hashSource = (namecleaning.collectionsCleaner(row['bname'], 'split')
                      + row['fromname']
                      + str(row['groupID'])
                      + str(row['totalFiles']))
        newMD5 = hashlib.md5(hashSource).hexdigest()

        cres = mdb.queryOneRow("SELECT ID FROM collections WHERE collectionhash = %s", (newMD5,))
        if not cres:
            bunchedcnt += 1
            csql = "INSERT INTO collections (name, subject, fromname, date, xref, groupID, totalFiles, collectionhash, filecheck, dateadded) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, 11, now())"
            collectionID = mdb.queryInsert(csql, (namecleaning.releaseCleaner(row['bname']), row['bname'], row['fromname'], row['date'], row['xref'], row['groupID'], row['totalFiles'], newMD5))
        else:
            collectionID = cres['ID']
            # update the collection table with the last seen date for the collection
            mdb.queryDirect("UPDATE collections set dateadded = now() where ID = %s", (collectionID,))

        # update the parts/binaries with new info
        mdb.query("UPDATE binaries SET collectionID = %s where ID = %s", (collectionID, row['bID']))
        # NOTE(review): this statement sets binaryID to the value it already
        # has, so it changes no rows; kept as-is pending confirmation that
        # it can be dropped.
        mdb.query("UPDATE parts SET binaryID = %s where binaryID = %s", (row['bID'], row['bID']))

    # remove the old collections
    for cID in oldCollectionIDs:
        mdb.query("DELETE FROM collections WHERE ID = %s", (cID,))

    # update the collections to say we are done
    mdb.query("UPDATE collections SET filecheck = 0 WHERE filecheck = 11")
    print('Extracted %d bunched collections.' % bunchedcnt)
def scan(self, nntp, groupArr, first, last, stype='update'):
    """Fetch overview headers for articles ``first``..``last`` and store them.

    The overview entries returned by ``nntp.over((first, last))`` are grouped
    by subject (with the trailing ``(part/total)`` marker removed) in
    ``self.message``; every group is then written to the ``collections``,
    ``binaries`` and ``parts`` tables.

    Parameters:
        nntp:     object exposing ``over((first, last))`` which returns a
                  ``(response, overviews)`` pair.  Each overview is indexed
                  as ``[0]`` (article number) and ``[1]`` (header dict read
                  for 'subject', 'from', 'date', 'xref', 'message-id' and
                  ':bytes').
        groupArr: mapping providing the group's 'name' and 'ID'.
        first:    first article number requested.
        last:     last article number requested (inclusive).
        stype:    'update', 'backfill' or 'partrepair'.  'backfill' never
                  records missing articles; 'partrepair' silences the
                  progress output.

    Returns:
        The highest article number among the parts handed to the database
        (never lower than ``first``), or ``last`` when the range contained
        nothing to store.

    NOTE(review): ``self.startLoop`` is read but not set here; it is
    presumably set by the caller.
    NOTE(review): this file contains two definitions of ``scan``; if both
    sit in the same class the later one replaces the earlier.
    """
    mdb = DB()
    if getattr(self, 'message', None) is None:
        self.message = {}

    self.startHeaders = time.time()
    resp, overviews = nntp.over((first, last))
    # ``last`` is part of the request (see the "requested" count printed
    # below), so it has to be part of the expected range as well.
    rangerequested = range(first, last + 1)
    msgsreceived = []
    msgsblacklisted = []
    msgsignored = []
    msgsnotinserted = []
    timeHeaders = int(time.time() - self.startHeaders)

    self.startCleaning = time.time()
    # "(part/total)" at the very end of the subject
    partRegex = re.compile(r'\((\d+)\/(\d+)\)$', re.IGNORECASE)
    # "[file/total]", "(file of total)", "file-total" ... anywhere earlier
    fileRegex = re.compile(r'(\[|\(|\s)(\d{1,4})(\/|(\s|_)of(\s|_)|\-)(\d{1,4})(\]|\)|\s)(?!"?$)')

    if isinstance(overviews, list):
        # loop articles, figure out files/parts
        for msg in overviews:
            try:
                number = msg[0]
            except IndexError:
                continue
            msgsreceived.append(number)

            # no part counter (or no subject at all): most likely not a binary post
            try:
                matches = partRegex.search(msg[1]['subject'])
            except KeyError:
                matches = None
            if matches is None:
                msgsignored.append(number)
                continue

            # filter subject based on black/white list
            if self.isBlackListed(msg, groupArr['name']):
                msgsblacklisted.append(number)
                continue

            header = msg[1]

            # attempt to get file count; fall back to zeros when absent
            filecnt = fileRegex.search(header['subject'])
            if filecnt is None:
                filecnt = ['0'] * 6
            else:
                filecnt = list(filecnt.groups())

            part, total = [str(x).strip() for x in matches.groups()]
            if not (part.isdigit() and total.isdigit()):
                continue

            subject = partRegex.sub('', header['subject']).strip().encode('utf-8', 'ignore')

            # Only the first article seen for a subject creates the entry;
            # later articles must add their part without resetting it.
            if subject not in self.message:
                cleansubject = namecleaning.collectionsCleaner(header['subject'])
                # The header dict itself becomes the entry and is extended in place.
                entry = self.message[subject] = header
                entry['MaxParts'] = int(total)
                # NOTE(review): '%s' is a platform-specific strftime
                # directive; confirm it yields the intended epoch value.
                entry['Date'] = parse(entry['date']).strftime('%s')
                entry['CollectionHash'] = hashlib.md5(cleansubject + header['from'] + str(groupArr['ID']) + str(filecnt[5])).hexdigest()
                entry['MaxFiles'] = int(filecnt[5])
                entry['File'] = int(filecnt[1])

            if int(part) > 0:
                parts = self.message[subject].setdefault('Parts', {})
                parts[int(part)] = {
                    'Message-ID': header['message-id'][1:-1],
                    'number': number,
                    'part': int(part),
                    'size': header[':bytes'],
                }

    timeCleaning = int(time.time() - self.startCleaning)
    maxnum = last
    rangenotreceived = list(set(rangerequested) - set(msgsreceived))

    if stype != 'partrepair':
        print('Received  %d articles of %d requested, %d blacklisted, %d not binary.' % (
            len(msgsreceived), last - first + 1, len(msgsblacklisted), len(msgsignored)))

    if rangenotreceived:
        # backfill: dont add missing articles
        if stype != 'backfill' and self.DoPartRepair:
            self.addMissingParts(rangenotreceived, groupArr['ID'])
        if stype != 'partrepair':
            print('Server did not return %d articles.' % len(rangenotreceived))

    self.startUpdate = time.time()
    if self.message:
        maxnum = first

        # insert binaries and parts into database. When binaries already exists; only insert new parts
        insPartsStmt = "INSERT IGNORE INTO parts (binaryID, number, messageID, partnumber, size) VALUES (%s, %s, %s, %s, %s)"

        # Remember the previous collection/binary so consecutive entries
        # with the same hash skip the lookup queries.
        lastCollectionHash = ''
        lastCollectionID = -1
        lastBinaryHash = ''
        lastBinaryID = -1

        mdb.setAutoCommit(False)

        for subject, data in self.message.iteritems():
            collectionHash = data['CollectionHash']
            subject = namecleaning.unfuckString(subject)

            if collectionHash == lastCollectionHash:
                collectionID = lastCollectionID
            else:
                lastCollectionHash = collectionHash
                lastBinaryHash = ''
                lastBinaryID = -1

                cres = mdb.queryOneRow("SELECT ID FROM collections WHERE collectionhash = %s", (collectionHash,))
                if cres is None:
                    cleanerName = namecleaning.releaseCleaner(subject)
                    csql = "INSERT INTO collections (name, subject, fromname, date, xref, groupID, totalFiles, collectionhash, dateadded) VALUES (%s, %s, %s, FROM_UNIXTIME(%s), %s, %s, %s, %s, now())"
                    collectionID = int(mdb.queryInsert(csql, (cleanerName, subject, data['from'], data['Date'], data['xref'], groupArr['ID'], data['MaxFiles'], collectionHash)))
                else:
                    collectionID = int(cres['ID'])
                    cusql = 'UPDATE collections SET dateadded = now() where ID = %s'
                    mdb.queryDirect(cusql, (collectionID,))
                lastCollectionID = collectionID

            binaryHash = hashlib.md5(subject + data['from'] + str(groupArr['ID'])).hexdigest()
            if binaryHash == lastBinaryHash:
                binaryID = lastBinaryID
            else:
                lastBinaryHash = binaryHash
                bres = mdb.queryOneRow('SELECT ID FROM binaries WHERE binaryhash = %s', (binaryHash,))
                if bres is None:
                    bsql = "INSERT INTO binaries (binaryhash, name, collectionID, totalParts, filenumber) VALUES (%s, %s, %s, %s, %s)"
                    binaryID = mdb.queryInsert(bsql, (binaryHash, subject, collectionID, data['MaxParts'], round(data['File'])))
                else:
                    binaryID = bres['ID']
                lastBinaryID = binaryID

            # An entry whose only article was part 0 has no 'Parts' key.
            for partdata in data.get('Parts', {}).values():
                maxnum = max(maxnum, partdata['number'])
                params = (binaryID, partdata['number'], partdata['Message-ID'], round(partdata['part']), partdata['size'])
                try:
                    mdb.query(insPartsStmt, params)
                except MySQLdb.Error:
                    msgsnotinserted.append(partdata['number'])

        if msgsnotinserted:
            print('WARNING: %d parts failed to insert.' % len(msgsnotinserted))
            if self.DoPartRepair:
                self.addMissingParts(msgsnotinserted, groupArr['ID'])

        mdb.commit()
        mdb.setAutoCommit(True)

    timeUpdate = int(time.time() - self.startUpdate)
    timeLoop = int(time.time() - self.startLoop)

    if stype != 'partrepair':
        print('%ds to download articles, %ds to clean articles, %d to insert articles, %ds total.\n\n' % (timeHeaders, timeCleaning, timeUpdate, timeLoop))

    # start the next scan with an empty message table
    self.message = {}
    return maxnum
def scan(self, nntp, groupArr, first, last, stype='update'):
    """Fetch overview headers for articles ``first``..``last`` and store them.

    The overview entries returned by ``nntp.over((first, last))`` are grouped
    by subject (with the trailing ``(part/total)`` marker removed) in
    ``self.message``; every group is then written to the ``collections``,
    ``binaries`` and ``parts`` tables.

    Parameters:
        nntp:     object exposing ``over((first, last))`` which returns a
                  ``(response, overviews)`` pair.  Each overview is indexed
                  as ``[0]`` (article number) and ``[1]`` (header dict read
                  for 'subject', 'from', 'date', 'xref', 'message-id' and
                  ':bytes').
        groupArr: mapping providing the group's 'name' and 'ID'.
        first:    first article number requested.
        last:     last article number requested (inclusive).
        stype:    'update', 'backfill' or 'partrepair'.  'backfill' never
                  records missing articles; 'partrepair' silences the
                  progress output.

    Returns:
        The highest article number among the parts handed to the database
        (never lower than ``first``), or ``last`` when the range contained
        nothing to store.

    NOTE(review): ``self.startLoop`` is read but not set here; it is
    presumably set by the caller.
    NOTE(review): this file contains two definitions of ``scan``; if both
    sit in the same class the later one replaces the earlier.
    """
    mdb = DB()
    if getattr(self, 'message', None) is None:
        self.message = {}

    self.startHeaders = time.time()
    resp, overviews = nntp.over((first, last))
    # ``last`` is part of the request (see the "requested" count printed
    # below), so it has to be part of the expected range as well.
    rangerequested = range(first, last + 1)
    msgsreceived = []
    msgsblacklisted = []
    msgsignored = []
    msgsnotinserted = []
    timeHeaders = int(time.time() - self.startHeaders)

    self.startCleaning = time.time()
    # "(part/total)" at the very end of the subject
    partRegex = re.compile(r'\((\d+)\/(\d+)\)$', re.IGNORECASE)
    # "[file/total]", "(file of total)", "file-total" ... anywhere earlier
    fileRegex = re.compile(r'(\[|\(|\s)(\d{1,4})(\/|(\s|_)of(\s|_)|\-)(\d{1,4})(\]|\)|\s)(?!"?$)')

    if isinstance(overviews, list):
        # loop articles, figure out files/parts
        for msg in overviews:
            try:
                number = msg[0]
            except IndexError:
                continue
            msgsreceived.append(number)

            # no part counter (or no subject at all): most likely not a binary post
            try:
                matches = partRegex.search(msg[1]['subject'])
            except KeyError:
                matches = None
            if matches is None:
                msgsignored.append(number)
                continue

            # filter subject based on black/white list
            if self.isBlackListed(msg, groupArr['name']):
                msgsblacklisted.append(number)
                continue

            header = msg[1]

            # attempt to get file count; fall back to zeros when absent
            filecnt = fileRegex.search(header['subject'])
            if filecnt is None:
                filecnt = ['0'] * 6
            else:
                filecnt = list(filecnt.groups())

            part, total = [str(x).strip() for x in matches.groups()]
            if not (part.isdigit() and total.isdigit()):
                continue

            subject = partRegex.sub('', header['subject']).strip().encode('utf-8', 'ignore')

            # Only the first article seen for a subject creates the entry;
            # later articles must add their part without resetting it.
            if subject not in self.message:
                cleansubject = namecleaning.collectionsCleaner(header['subject'])
                # The header dict itself becomes the entry and is extended in place.
                entry = self.message[subject] = header
                entry['MaxParts'] = int(total)
                # NOTE(review): '%s' is a platform-specific strftime
                # directive; confirm it yields the intended epoch value.
                entry['Date'] = parse(entry['date']).strftime('%s')
                entry['CollectionHash'] = hashlib.md5(cleansubject + header['from'] + str(groupArr['ID']) + str(filecnt[5])).hexdigest()
                entry['MaxFiles'] = int(filecnt[5])
                entry['File'] = int(filecnt[1])

            if int(part) > 0:
                parts = self.message[subject].setdefault('Parts', {})
                parts[int(part)] = {
                    'Message-ID': header['message-id'][1:-1],
                    'number': number,
                    'part': int(part),
                    'size': header[':bytes'],
                }

    timeCleaning = int(time.time() - self.startCleaning)
    maxnum = last
    rangenotreceived = list(set(rangerequested) - set(msgsreceived))

    if stype != 'partrepair':
        print('Received  %d articles of %d requested, %d blacklisted, %d not binary.' % (
            len(msgsreceived), last - first + 1, len(msgsblacklisted), len(msgsignored)))

    if rangenotreceived:
        # backfill: dont add missing articles
        if stype != 'backfill' and self.DoPartRepair:
            self.addMissingParts(rangenotreceived, groupArr['ID'])
        if stype != 'partrepair':
            print('Server did not return %d articles.' % len(rangenotreceived))

    self.startUpdate = time.time()
    if self.message:
        maxnum = first

        # insert binaries and parts into database. When binaries already exists; only insert new parts
        insPartsStmt = "INSERT IGNORE INTO parts (binaryID, number, messageID, partnumber, size) VALUES (%s, %s, %s, %s, %s)"

        # Remember the previous collection/binary so consecutive entries
        # with the same hash skip the lookup queries.
        lastCollectionHash = ''
        lastCollectionID = -1
        lastBinaryHash = ''
        lastBinaryID = -1

        mdb.setAutoCommit(False)

        for subject, data in self.message.iteritems():
            collectionHash = data['CollectionHash']
            subject = namecleaning.unfuckString(subject)

            if collectionHash == lastCollectionHash:
                collectionID = lastCollectionID
            else:
                lastCollectionHash = collectionHash
                lastBinaryHash = ''
                lastBinaryID = -1

                cres = mdb.queryOneRow("SELECT ID FROM collections WHERE collectionhash = %s", (collectionHash,))
                if cres is None:
                    cleanerName = namecleaning.releaseCleaner(subject)
                    csql = "INSERT INTO collections (name, subject, fromname, date, xref, groupID, totalFiles, collectionhash, dateadded) VALUES (%s, %s, %s, FROM_UNIXTIME(%s), %s, %s, %s, %s, now())"
                    collectionID = int(mdb.queryInsert(csql, (cleanerName, subject, data['from'], data['Date'], data['xref'], groupArr['ID'], data['MaxFiles'], collectionHash)))
                else:
                    collectionID = int(cres['ID'])
                    cusql = 'UPDATE collections SET dateadded = now() where ID = %s'
                    mdb.queryDirect(cusql, (collectionID,))
                lastCollectionID = collectionID

            binaryHash = hashlib.md5(subject + data['from'] + str(groupArr['ID'])).hexdigest()
            if binaryHash == lastBinaryHash:
                binaryID = lastBinaryID
            else:
                lastBinaryHash = binaryHash
                bres = mdb.queryOneRow('SELECT ID FROM binaries WHERE binaryhash = %s', (binaryHash,))
                if bres is None:
                    bsql = "INSERT INTO binaries (binaryhash, name, collectionID, totalParts, filenumber) VALUES (%s, %s, %s, %s, %s)"
                    binaryID = mdb.queryInsert(bsql, (binaryHash, subject, collectionID, data['MaxParts'], round(data['File'])))
                else:
                    binaryID = bres['ID']
                lastBinaryID = binaryID

            # An entry whose only article was part 0 has no 'Parts' key.
            for partdata in data.get('Parts', {}).values():
                maxnum = max(maxnum, partdata['number'])
                params = (binaryID, partdata['number'], partdata['Message-ID'], round(partdata['part']), partdata['size'])
                try:
                    mdb.query(insPartsStmt, params)
                except MySQLdb.Error:
                    msgsnotinserted.append(partdata['number'])

        if msgsnotinserted:
            print('WARNING: %d parts failed to insert.' % len(msgsnotinserted))
            if self.DoPartRepair:
                self.addMissingParts(msgsnotinserted, groupArr['ID'])

        mdb.commit()
        mdb.setAutoCommit(True)

    timeUpdate = int(time.time() - self.startUpdate)
    timeLoop = int(time.time() - self.startLoop)

    if stype != 'partrepair':
        print('%ds to download articles, %ds to clean articles, %d to insert articles, %ds total.\n\n' % (timeHeaders, timeCleaning, timeUpdate, timeLoop))

    # start the next scan with an empty message table
    self.message = {}
    return maxnum