def endElement(self, name):
    """SAX end-of-element callback.

    On closing a 'Result' element, materializes the accumulated field values
    in self.info into a CGHubRecordInfo, optionally de-duplicates by BAM
    filename (keeping the record with the latest upload date), tracks
    min/max 'live' BAM file sizes, and resets per-result parse state.

    Raises:
        LimitReachedException: when self.limit is set and the processed-record
            count reaches self.limit_count (after flushing via endDocument()).
    """
    if 'Result' == name:
        self.count += 1
        # Periodic progress log every 256 records.
        if 0 == self.count % 256:
            self.log('processed %s file references' % (self.count))
        self.totalFileSize += int(self.info[CGHubRecordInfo.bamFilesize_index])
        record = CGHubRecordInfo(self.info)
        filename = record.files['bam'].filename
        if self.removedups and filename in self.filename2cghubRecords:
            self.dupcount += 1
            # check the dates and keep the latest
            currentdate = self.createDateTime(self.filename2cghubRecords[filename].upload_date)
            newdate = self.createDateTime(record.upload_date)
            if currentdate < newdate:
                self.filename2cghubRecords[filename] = record
        else:
            self.filename2cghubRecords[filename] = record
        # Reset per-Result parse state for the next record.
        self.inResult = False
        self.info = ['' for _ in range(CGHubRecordInfo.infoCount)]
        self.fileIndices = 0
        # NOTE(review): lowercase 'live' here vs 'Live' in main() — confirm
        # which casing the feed actually uses; one of the two may never match.
        if 'live' == record.state:
            # Track smallest non-zero and largest BAM file sizes seen so far.
            if self.minmaxsize['min'].filesize > record.files['bam'].filesize and record.files['bam'].filesize:
                self.minmaxsize['min'] = record.files['bam']
            if self.minmaxsize['max'].filesize < record.files['bam'].filesize:
                self.minmaxsize['max'] = record.files['bam']
            if not record.files['bam'].filesize:
                self.log('no file size: %s--%s' % (record.write(), record.files['bam'].write()))
        # Optional hard stop: flush the document, then abort parsing via exception.
        if self.limit and self.count >= self.limit_count:
            self.endDocument()
            raise LimitReachedException()
    elif name in element2index and -1 == element2index[name] and self.hasChars:
        # Element is known but deliberately ignored (index -1); text content is dropped.
        # self.log('WARNING: found value for %s:%s = %s' % (name, self.count, self.unexpectedChars))
        pass
    # Clear element-level state regardless of which element just closed.
    self.inElement = -1
    self.hasChars = False
def main(platform, type_uri='detail', log=None, removedups=False, limit=-1, verbose=False, print_response=False):
    """Fetch and parse the CGHub latest_manifest, returning records for `platform`.

    Parameters:
        platform: substring matched against each record's study field; rows
            whose study does not contain it are skipped.
        type_uri, limit, print_response: unused in this body; kept so the
            call interface stays backward-compatible.
        log: logger passed through to the util.log_* helpers (may be None).
        removedups: when True, keep only the latest-uploaded 'Live' record
            per BAM filename.
        verbose: forwarded to statistics().

    Returns:
        (records, minmaxsize, archives): surviving CGHubRecordInfo values,
        the dict tracking smallest/largest 'Live' BAM files, and the raw
        manifest text.

    Raises:
        Re-raises any exception encountered while fetching the manifest.
    """
    util.log_info(log, 'begin reading cghub archive')
    filename2cghubRecords = {}
    # Sentinels: 'min' starts at 500 GB so any real file is smaller; 'max' at 1 byte.
    minmaxsize = {'min': CGHubFileInfo('', 500000000000, ''),
                  'max': CGHubFileInfo('', 1, '')}
    try:
        # archives = util.getURLData(manifest_uri, 'latest_manifest', log)
        response = urllib.urlopen(manifest_uri)
        archives = response.read()
        lines = archives.split('\n')
        util.log_info(log, '\tarchive size is %s with %s lines' % (len(archives), len(lines)))
        util.log_info(log, '\n\t' + '\n\t'.join(lines[:10]))
    except Exception as e:
        # BUG FIX: message previously ended in a dangling '%s' with no argument;
        # interpolate the exception so the log line is meaningful.
        util.log_exception(log, 'problem fetching latest_manifest: %s' % (e))
        raise e
    # Map manifest column positions to the record indices we care about.
    headers = lines[0].split('\t')
    column_index2header = {}
    for index, header in enumerate(headers):
        if header in header2record_index.keys():
            column_index2header[index] = header
    count = 0
    dupcount = 0
    for line in lines[1:]:
        if not line:
            continue
        # Periodic progress log every 4096 records.
        if 0 == count % 4096:
            util.log_info(log, 'processed %s records' % (count))
        count += 1
        fields = line.split('\t')
        header2record = {}
        try:
            for index in column_index2header.keys():
                header2record[header2record_index[column_index2header[index]]] = fields[index]
        except Exception as e:
            util.log_info(log, 'problem with parsing line(%s): %s' % (count, line))
            # BUG FIX: skip malformed rows; previously execution fell through with a
            # partially filled header2record and raised KeyError on the lookup below.
            continue
        if platform not in header2record[CGHubRecordInfo.study_index]:
            continue
        header2record.update(index2none)
        record = CGHubRecordInfo(header2record)
        filename = header2record[CGHubRecordInfo.bamFilename_index]
        if removedups and filename in filename2cghubRecords:
            # Duplicate filename: keep whichever 'Live' record was uploaded last.
            # (A duplicate that is not 'Live' is silently dropped.)
            if 'Live' == header2record[CGHubRecordInfo.state_index]:
                dupcount += 1
                # check the dates and keep the latest
                currentdate = createDateTime(filename2cghubRecords[filename].upload_date)
                newdate = createDateTime(record.upload_date)
                if currentdate < newdate:
                    filename2cghubRecords[filename] = record
        else:
            filename2cghubRecords[filename] = record
        if 'Live' == record.state:
            # Track the smallest non-zero and the largest BAM file seen.
            if minmaxsize['min'].filesize > record.files['bam'].filesize and record.files['bam'].filesize:
                minmaxsize['min'] = record.files['bam']
            if minmaxsize['max'].filesize < record.files['bam'].filesize:
                minmaxsize['max'] = record.files['bam']
            if not record.files['bam'].filesize:
                util.log_info(log, 'no file size: %s--%s' % (record.write(), record.files['bam'].write()))
    statistics(log, filename2cghubRecords, minmaxsize, verbose)
    util.log_info(log, 'finished reading cghub archive. %s total records, %s duplicates' % (count, dupcount))
    return filename2cghubRecords.values(), minmaxsize, archives
def main(platform, type_uri = 'detail', log = None, removedups = False, limit = -1, verbose = False, print_response = False):
    """Fetch and parse the CGHub latest_manifest, returning records for `platform`.

    NOTE(review): this is a near-duplicate of an earlier `main` in this file and
    shadows it at import time; the earlier variant also returns the raw manifest
    text as a third element. Confirm which definition callers rely on and delete
    the other.

    Parameters:
        platform: substring matched against each record's study field; rows
            whose study does not contain it are skipped.
        type_uri, limit, print_response: unused in this body; kept so the
            call interface stays backward-compatible.
        log: logger passed through to the util.log_* helpers (may be None).
        removedups: when True, keep only the latest-uploaded 'Live' record
            per BAM filename.
        verbose: forwarded to statistics().

    Returns:
        (records, minmaxsize): surviving CGHubRecordInfo values and the dict
        tracking smallest/largest 'Live' BAM files.

    Raises:
        Re-raises any exception encountered while fetching the manifest.
    """
    util.log_info(log, 'begin reading cghub archive')
    filename2cghubRecords = {}
    # Sentinels: 'min' starts at 500 GB so any real file is smaller; 'max' at 1 byte.
    minmaxsize = {'min': CGHubFileInfo('', 500000000000, ''),
                  'max': CGHubFileInfo('', 1, '')}
    try:
        # archives = util.getURLData(manifest_uri, 'latest_manifest', log)
        response = urllib.urlopen(manifest_uri)
        archives = response.read()
        lines = archives.split('\n')
        util.log_info(log, '\tarchive size is %s with %s lines' % (len(archives), len(lines)))
        util.log_info(log, '\n\t' + '\n\t'.join(lines[:10]))
    except Exception as e:
        # BUG FIX: message previously ended in a dangling '%s' with no argument;
        # interpolate the exception so the log line is meaningful.
        util.log_exception(log, 'problem fetching latest_manifest: %s' % (e))
        raise e
    # Map manifest column positions to the record indices we care about.
    headers = lines[0].split('\t')
    column_index2header = {}
    for index, header in enumerate(headers):
        if header in header2record_index.keys():
            column_index2header[index] = header
    count = 0
    dupcount = 0
    for line in lines[1:]:
        if not line:
            continue
        # Periodic progress log every 4096 records.
        if 0 == count % 4096:
            util.log_info(log, 'processed %s records' % (count))
        count += 1
        fields = line.split('\t')
        header2record = {}
        try:
            for index in column_index2header.keys():
                header2record[header2record_index[column_index2header[index]]] = fields[index]
        except Exception as e:
            util.log_info(log, 'problem with parsing line(%s): %s' % (count, line))
            # BUG FIX: skip malformed rows; previously execution fell through with a
            # partially filled header2record and raised KeyError on the lookup below.
            continue
        if platform not in header2record[CGHubRecordInfo.study_index]:
            continue
        header2record.update(index2none)
        record = CGHubRecordInfo(header2record)
        filename = header2record[CGHubRecordInfo.bamFilename_index]
        if removedups and filename in filename2cghubRecords:
            # Duplicate filename: keep whichever 'Live' record was uploaded last.
            # (A duplicate that is not 'Live' is silently dropped.)
            if 'Live' == header2record[CGHubRecordInfo.state_index]:
                dupcount += 1
                # check the dates and keep the latest
                currentdate = createDateTime(filename2cghubRecords[filename].upload_date)
                newdate = createDateTime(record.upload_date)
                if currentdate < newdate:
                    filename2cghubRecords[filename] = record
        else:
            filename2cghubRecords[filename] = record
        if 'Live' == record.state:
            # Track the smallest non-zero and the largest BAM file seen.
            if minmaxsize['min'].filesize > record.files['bam'].filesize and record.files['bam'].filesize:
                minmaxsize['min'] = record.files['bam']
            if minmaxsize['max'].filesize < record.files['bam'].filesize:
                minmaxsize['max'] = record.files['bam']
            if not record.files['bam'].filesize:
                util.log_info(log, 'no file size: %s--%s' % (record.write(), record.files['bam'].write()))
    statistics(log, filename2cghubRecords, minmaxsize, verbose)
    util.log_info(log, 'finished reading cghub archive. %s total records, %s duplicates' % (count, dupcount))
    return filename2cghubRecords.values(), minmaxsize