def append(self, text, filename):
    """Append ``text`` followed by a newline to ``filename``.

    When ``self.verbose`` is true the text is echoed to stdout first. The
    parent directory of ``filename`` is created through ``hUtil.mkdir_p``
    when it does not exist yet.

    Args:
        text: String to append; a trailing newline is added.
        filename: Path of the file to append to.
    """
    if self.verbose:
        print(text)  # echo text to stdout
    parentDir = path.dirname(filename)
    # A bare filename has an empty dirname: it lives in the current
    # directory, so there is nothing to create.
    if parentDir and not path.isdir(parentDir):
        hUtil.mkdir_p(parentDir)
    # The context manager closes the handle even when write() raises.
    with open(filename, 'a') as fh:
        fh.write(text + '\n')  # append text to file
def count(gzFastqFile, outDir):
    """Tally index sequences found in the header lines of a gzipped fastq.

    Every line matching ``@(ILLUMINA|NS500422)...:<index>`` adds one to the
    count of ``<index>``. Two tab-separated tables with an Index/Count
    header are written into ``outDir``:

    * ``Undetermined_index_counts.txt`` -- every index, highest count first.
    * ``Undetermined_index_counts_Top100.txt`` -- the first 100 of those rows.

    Args:
        gzFastqFile: Path to the gzip-compressed fastq file to scan.
        outDir: Directory for the two tables; created through
            ``hUtil.mkdir_p`` when it does not exist.
    """
    # Bytes pattern: lines read from gzip in 'rb' mode are byte strings
    # (which is plain ``str`` on Python 2). Compiled once, outside the loop.
    headerRe = re.compile(b'@(ILLUMINA|NS500422).+:([ACGTN+]+)$')

    counts = dict()
    with gzip.open(gzFastqFile, 'rb') as fastq:
        for line in fastq:
            matchObj = headerRe.match(line)  # matches from beginning of line
            if matchObj:
                index = matchObj.group(2)
                counts[index] = counts.get(index, 0) + 1

    # sort (index, count) pairs by count, largest first
    rankedCounts = sorted(counts.items(), key=lambda t: t[1], reverse=True)

    # write counts to file
    if not path.isdir(outDir):
        hUtil.mkdir_p(outDir)
    allPath = path.join(outDir, "Undetermined_index_counts.txt")
    topPath = path.join(outDir, "Undetermined_index_counts_Top100.txt")
    with open(allPath, 'w') as f, open(topPath, 'w') as fTop:
        f.write("Index\tCount\n")
        fTop.write("Index\tCount\n")
        for rank, (index, total) in enumerate(rankedCounts, 1):
            if not isinstance(index, str):
                # Only reached where bytes and str differ (Python 3).
                index = index.decode('ascii')
            line = "%s\t%d\n" % (index, total)
            f.write(line)
            if rank <= 100:
                fTop.write(line)
def safeCopy(self, src, dst):
    """Copy a file or directory from ``src`` to ``dst``.

    The destination is first passed to ``self.checkDest`` and then to
    ``hUtil.deleteItem`` (presumably validating and clearing it -- both are
    defined outside this block). The parent directory of ``dst`` is created
    when missing before ``hUtil.copy`` performs the copy.

    Args:
        src: Path of the item to copy.
        dst: Path the copy is written to.
    """
    self.checkDest(dst)
    hUtil.deleteItem(dst)

    # Make sure there is a directory to copy into.
    dstParent = path.dirname(dst)
    if not path.isdir(dstParent):
        hUtil.mkdir_p(dstParent)

    hUtil.copy(src, dst)
def writeSamplesheet(self, outDir = None):
    """Write this analysis's samplesheet and record its path in ``self.ssFile``.

    The file is named ``SampleSheet.<self.name>.csv`` and holds the header
    portion of the run samplesheet (every line up to and including the
    column-names line) followed by the lines listed in
    ``self.ssLineIndices``, one line per row.

    Args:
        outDir: Directory to write into; defaults to ``self.processingDir``.
            Created through ``mkdir_p`` when it does not exist.
    """
    if not outDir:
        outDir = self.processingDir
    if not path.isdir(outDir):
        mkdir_p(outDir)
    self.ssFile = path.join(outDir, 'SampleSheet.' + self.name + '.csv')

    sheet = self.Run.SampleSheet
    # header portion of samplesheet, through the column-names line
    headerLines = list(sheet.ss[:sheet.colNamesLineIndex + 1])
    # lines corresponding to this analysis
    analysisLines = [sheet.ss[x] for x in self.ssLineIndices]

    with open(self.ssFile, 'w') as fh:
        # Join both parts in one pass so a newline separates the last header
        # line from the first analysis line; writing them as two separately
        # joined chunks glued those two lines together.
        fh.write('\n'.join(headerLines + analysisLines))
def writeSamplesheet(self, outDir = None):
    """Write this analysis's samplesheet and record its path in ``self.ssFile``.

    Output is ``SampleSheet.<self.name>.csv``: the first line of the run
    samplesheet, then the lines selected by ``self.ssLineIndices``.

    Args:
        outDir: Target directory. Falls back to ``self.Run.processingDir``
            because bcl2fastq for HiSeq 2000 requires that the analysis
            processing dir (``self.processingDir``) not yet exist.
    """
    targetDir = outDir if outDir else self.Run.processingDir
    if not path.isdir(targetDir):
        mkdir_p(targetDir)

    self.ssFile = path.join(targetDir, 'SampleSheet.' + self.name + '.csv')
    with open(self.ssFile, 'w') as fh:
        sheetLines = self.Run.SampleSheet.ss
        # header line
        fh.write(sheetLines[0] + '\n')
        # lines corresponding to this analysis
        fh.write('\n'.join(sheetLines[idx] for idx in self.ssLineIndices))
def processRun(self):
    """Run the processing steps for this run, reporting any failure.

    Steps, in order: clear the processing directory, parse the samplesheet
    (writing the validated and per-analysis samplesheets), run bcl2fastq,
    then post-process. If any step raises, the traceback is sent through
    ``self.notify`` and the method returns normally instead of propagating
    the error.

    ``KeyboardInterrupt`` and ``SystemExit`` are deliberately not caught, so
    an operator interrupt or interpreter shutdown is not reported as a
    processing error and silently absorbed.
    """
    try:
        self.clearDir(self.processingDir)
        self.parseSamplesheet(write_validated=True, write_analysis_samplesheets=True)
        self.bcl2fastq()
        self.postProcess()
    except Exception:
        # Ensure the log file's directory exists before notifying
        # (presumably notify writes to the log -- it is defined elsewhere).
        logParent = path.dirname(self.logFile)
        if logParent and not path.isdir(logParent):
            hUtil.mkdir_p(logParent)
        self.notify('Seqprep Exception', 'Error in ' + self.runOutName + ':\n' + traceback.format_exc())
def fastQC(self):
    """Run FastQC on the fastq.gz files of this analysis.

    Scans ``<finishingDir>/Fastq`` and, for every entry whose name matches
    ``<non-whitespace>.fastq.gz`` and for which ``hUtil.gzNotEmpty`` is
    true, runs ``fastqc`` through ``self.Run.shell`` with
    ``<finishingDir>/QC`` as the output directory and the run's log file.
    """
    self.Run.log('Running FastQC...')
    outDir = path.join(self.finishingDir, 'QC')
    hUtil.mkdir_p(outDir)
    fastqDir = path.join(self.finishingDir, 'Fastq')

    # Raw string with escaped dots: the extension must be the literal
    # ".fastq.gz", not "<any char>fastq<any char>gz".
    fastqRe = re.compile(r'\S+\.fastq\.gz')

    for filename in os.listdir(fastqDir):
        fastqPath = path.join(fastqDir, filename)
        if fastqRe.match(filename) and hUtil.gzNotEmpty(fastqPath):
            command = 'module load centos6/fastqc-0.10.1; fastqc -t 4 --noextract --nogroup -o ' + outDir + ' ' + fastqPath
            self.Run.shell(command, self.Run.logFile)
def write_validatedSamplesheet(self):
    """Back up the samplesheet file, then rewrite it from ``self.ss``.

    The current ``self.file`` is copied into an ``ss`` directory next to it
    as ``SampleSheet.csv.orig``; when that name is taken, the first free
    name among ``SampleSheet.csv.orig2``, ``...orig3``, ... is used. The
    file is then deleted and rewritten with the lines of ``self.ss`` joined
    by newlines. ``hUtil.setPermissions`` is applied to both the backup and
    the rewritten file.
    """
    backupDir = path.join(path.dirname(self.file), 'ss')
    hUtil.mkdir_p(backupDir)

    # Pick the first backup name that is not already in use.
    origBase = path.join(backupDir, 'SampleSheet.csv.orig')
    backupFile = origBase
    nextSuffix = 2
    while path.isfile(backupFile):
        backupFile = origBase + str(nextSuffix)
        nextSuffix += 1

    hUtil.copy(self.file, backupFile)
    hUtil.setPermissions(backupFile)

    # Cannot set permissions if someone else is owner, therefore delete
    # before opening to rewrite.
    hUtil.deleteItem(self.file)
    validated = '\n'.join(self.ss)
    with open(self.file, 'w') as fh:
        fh.write(validated)
    hUtil.setPermissions(self.file)
def initLogFile(self):
    """Prepare the log directory and move any existing log file aside.

    ``self.logDir`` is created through ``hUtil.mkdir_p``. When
    ``self.logFile`` already exists it is copied to the first unused name
    of the form ``<logFile><k>`` (k = 1, 2, ...), the copy is passed to
    ``hUtil.setPermissions``, and the original is removed with
    ``self.safeDeleteItem``.
    """
    hUtil.mkdir_p(self.logDir)
    if not path.isfile(self.logFile):
        return  # no previous log to preserve

    # Find the first numbered backup name that is still free.
    suffix = 1
    while path.isfile(self.logFile + str(suffix)):
        suffix += 1
    logBkup = self.logFile + str(suffix)

    self.safeCopy(self.logFile, logBkup)
    hUtil.setPermissions(logBkup)
    self.safeDeleteItem(self.logFile)
def gather_analysis_fastq(self):
    """Concatenate per-sample fastq.gz chunk files into one file per read.

    Sample directories (``Sample_*``) are looked up under
    ``<processingDir>/Project_Fastq_Files`` and
    ``<processingDir>/Undetermined_indices``. Within each, every file named
    ``<sampLabel>_L<lane>_R<read>_001.fastq.gz`` (read 1 or 2) identifies a
    chunk set; all of its ``_<number>.fastq.gz`` chunks are appended, in
    sorted name order, to ``<finishingDir>/Fastq/<sampLabel>.R<read>.fastq.gz``.

    Example file set to concatenate:
        SampleA_ACAGTG_L001_R1_001.fastq.gz
        SampleA_ACAGTG_L001_R1_002.fastq.gz
        SampleA_ACAGTG_L001_R1_003.fastq.gz
    """
    import shutil  # local import: only this method needs it

    self.Run.log('Concatenating fastq files...')

    projectDir = path.join(self.processingDir, 'Project_Fastq_Files')
    sampDirs = glob.glob(path.join(projectDir, 'Sample_*'))
    undetDir = path.join(self.processingDir, 'Undetermined_indices')
    undetSampDirs = glob.glob(path.join(undetDir, 'Sample_*'))

    outDir = path.join(self.finishingDir, 'Fastq')
    hUtil.mkdir_p(outDir)

    # One compiled pattern per read number; dots are escaped so only the
    # literal ".fastq.gz" extension matches.
    firstChunkRes = [
        (readNumStr,
         re.compile(r'(?P<fileLabel>(?P<sampLabel>[\S]+)_L[0-9]+_R' + readNumStr + r')_001\.fastq\.gz'))
        for readNumStr in ['1', '2']
    ]

    for sampDir in sampDirs + undetSampDirs:
        filenames = os.listdir(sampDir)
        for filename in filenames:
            for readNumStr, firstChunkRe in firstChunkRes:
                labelMatch = firstChunkRe.match(filename)
                if not labelMatch:
                    continue
                sampLabel = labelMatch.group('sampLabel')
                fileLabel = labelMatch.group('fileLabel')
                mergeFile = path.join(outDir, sampLabel + '.R' + readNumStr + '.fastq.gz')

                # fileLabel comes from a file name, so escape it before
                # embedding it in a pattern.
                componentRe = re.compile(re.escape(fileLabel) + r'_[0-9]+\.fastq\.gz')
                # sampDir is already the full path returned by glob; joining
                # it onto projectDir again breaks relative paths and is the
                # wrong base for the Undetermined_indices directories.
                componentFiles = sorted(
                    path.join(sampDir, f) for f in filenames if componentRe.match(f)
                )

                self.Run.log('Concatenating ' + sampLabel + ' R' + readNumStr + ' fastq component files:')
                self.Run.log('\n'.join(['  %s' % path.basename(x) for x in componentFiles]))

                # Context managers close both handles even if a copy fails.
                with open(mergeFile, 'wb') as fout:
                    for componentFile in componentFiles:
                        with open(componentFile, 'rb') as fin:
                            shutil.copyfileobj(fin, fout, 65536)
def clearDir(self, item):
    """Replace ``item`` with a freshly created directory.

    When ``item`` currently exists as a directory or a regular file it is
    removed via ``self.safeDeleteItem``; ``hUtil.mkdir_p`` is then called
    on the same path.

    Args:
        item: Path to clear and recreate.
    """
    alreadyPresent = path.isdir(item) or path.isfile(item)
    if alreadyPresent:
        self.safeDeleteItem(item)
    hUtil.mkdir_p(item)