def lsFiles(files, add='', group=50):
    """
    list a set of files in parallel (when the set is huge)

    Runs `gsutil -m ls` on successive batches of paths and concatenates the
    parsed listings of every batch.

    Args:
    ----
      files: list of gs paths to list
      add: additional params to add to `gsutil -m ls` (e.g. '-l'). When it
        contains an upper case 'L' the output is cut on every 'gs://' object
        header instead of on every line.
      group: maximum number of paths given to a single gsutil call

    Returns:
    -------
      list of str: the listing entries. They are taken from the `str()` form
      of the raw gsutil output bytes, so special characters show up escaped
      (a line break inside an 'L' entry is the two characters backslash + n).
      An empty list is returned when nothing was listed.

    Raises:
    ------
      ValueError: if gsutil fails for another reason than
        "One or more URLs matched no objects"
    """
    print('listing files in gs')
    if len(files) == 0:
        # nothing to list; also avoids asking the helper for batches of size 0
        return []
    by = len(files) if len(files) < group else group
    detailed = 'L' in add
    res = []
    for sfiles in h.grouped(files, by):
        a = ''.join(val + ' ' for val in sfiles)
        data = subprocess.run("gsutil -m ls " + add + " " + a,
                              capture_output=True, shell=True)
        if data.returncode != 0:
            # a batch that matches nothing is tolerated, anything else is fatal
            if "One or more URLs matched no objects" not in str(data.stderr):
                raise ValueError('issue with the command: ' + str(data.stderr))
        out = str(data.stdout)
        if len(out) < 4:
            # str(b'') is "b''" (3 chars): this batch listed nothing. Skip it
            # and keep what the previous batches already returned.
            continue
        if detailed:
            # NOTE(review): the split runs on the full repr, so the first
            # entry starts with "gs://b'" and the last one keeps the closing
            # quote and the TOTAL line; kept as-is for backward compatibility.
            res += ['gs://' + i for i in out.split('\\ngs://')]
        else:
            # [2:-1] strips the b'...' wrapper, [:-1] the empty tail left by
            # the final line break
            res += out[2:-1].split('\\n')[:-1]
            # drop the "TOTAL: ..." summary line gsutil appends to long listings
            if res and "TOTAL:" in res[-1]:
                res = res[:-1]
    return res
def cpFiles(files, location, group=50):
    """
    copy a set of files in parallel (when the set is huge)

    The paths are handed to `gsutil -m cp` batch by batch; the first batch
    whose command exits with a non-zero status stops the whole copy.

    Args:
    ----
      files: gs paths to copy
      location: destination given as last argument of each gsutil cp call
      group: maximum number of files given to a single gsutil call
    """
    n_files = len(files)
    batch_size = group if n_files >= group else n_files
    for batch in h.grouped(files, batch_size):
        # every source path is followed by a space, the destination comes last
        sources = ''.join(path + ' ' for path in batch)
        status = os.system("gsutil -m cp " + sources + location)
        if status == 0:
            continue
        print('pressed ctrl+c or command failed')
        break
def mvFiles(files, location, group=50, listen_to_errors=False):
    """
    move a set of files in parallel (when the set is huge)

    The paths are handed to `gsutil -m mv` batch by batch.

    Args:
    ----
      files: gs paths to move
      location: destination given as last argument of each gsutil mv call
      group: maximum number of files given to a single gsutil call
      listen_to_errors: when true, stop at the first batch whose command exits
        with a non-zero status; otherwise failures are ignored and the
        remaining batches are still processed
    """
    n_files = len(files)
    batch_size = group if n_files >= group else n_files
    for batch in h.grouped(files, batch_size):
        # every source path is followed by a space, the destination comes last
        sources = ''.join(path + ' ' for path in batch)
        status = os.system("gsutil -m mv " + sources + location)
        if listen_to_errors and status != 0:
            print('pressed ctrl+c or command failed')
            break
def catFiles(files, group=50, split=False, cut=False):
    """
    read the content of a set of files in parallel (when the set is huge)

    Runs `gsutil -m cat` on successive batches of paths and collects the
    content returned by every batch.

    Args:
    ----
      files: gs paths to read
      group: maximum number of files given to a single gsutil call
      cut: split the content of each batch into chunks of size cut. Only full
        chunks are kept: a remainder shorter than cut is dropped.
      split: split the content of each batch by split e.g. \\n
        (ignored when cut is set)

    Returns:
    -------
      list of str: the chunks / pieces / whole batch contents, in batch order.
      The text is taken from the `str()` form of the raw gsutil output bytes,
      so special characters show up escaped (a line break is the two
      characters backslash + n). If gsutil fails for another reason than
      "One or more URLs matched no objects", the error is printed and what
      was read so far is returned.
    """
    res = []
    if len(files) == 0:
        # nothing to read; also avoids asking the helper for batches of size 0
        return res
    by = len(files) if len(files) < group else group
    for i, sfiles in enumerate(h.grouped(files, by)):
        # rough progress indicator: fraction of batches already started
        print(i / (len(files) / by))
        a = ''.join(val + ' ' for val in sfiles)
        data = subprocess.run("gsutil -m cat " + a, capture_output=True, shell=True)
        if data.returncode != 0:
            if "One or more URLs matched no objects" not in str(data.stderr):
                # best effort: report the failure and hand back partial results
                print(ValueError('issue with the command: ' + str(data.stderr)))
                return res
        out = str(data.stdout)
        if len(out) < 4:
            # str(b'') is "b''" (3 chars): this batch produced nothing. Skip it
            # and keep what the previous batches already returned.
            continue
        # strip the b'...' wrapper of the bytes repr
        resa = out[2:-1]
        if cut:
            res += [resa[j * cut:(j + 1) * cut] for j in range(len(resa) // cut)]
        elif split:
            res += resa.split(split)
        else:
            res += [resa]
    return res
def rmFiles(files, group=50, add='', dryrun=True):
    """
    remove a set of files in parallel (when the set is huge)

    The paths are handed to `gsutil -m rm` batch by batch; the first batch
    whose command exits with a non-zero status stops the removal.

    Args:
    ----
      files: gs paths to remove
      group: maximum number of files given to a single gsutil call
      add: additional gsutil rm params
      dryrun: when true (the default) the commands are only printed, nothing
        is executed
    """
    if len(files) == 0:
        # nothing to remove; also avoids asking the helper for batches of size 0
        return
    by = len(files) if len(files) < group else group
    # computed once so that every batch gets the same single-space separator
    # (prefixing inside the loop would add one more space per batch)
    extra = (' ' + add) if add else ''
    for sfiles in h.grouped(files, by):
        a = ''.join(' ' + val for val in sfiles)
        cmd = "gsutil -m rm" + extra + a
        if dryrun:
            print(cmd)
            continue
        code = os.system(cmd)
        if code != 0:
            print('pressed ctrl+c or command failed')
            break
async def getSpikeInControlScales(refgenome, fastq=None, fastQfolder='', mapper='bwa', pairedEnd=False, cores=1,
                                  pathtosam='samtools', pathtotrim_galore='trim_galore', pathtobwa='bwa',
                                  totrim=True, tomap=True, tofilter=True, results='res/', toremove=False):
    """
    Will extract the spikeInControls from fastq files (useful for, let's say, ChIPseq data with spike ins)

    Count based sequencing data is not absolute and will be normalized as each sample will be
    sequenced at a specific depth. To figure out what was the actual sample concentration, we use
    Spike In control.

    The function optionally trims the reads (trim_galore), maps them on the reference (bwa mem
    piped into samtools sort), indexes the bams and writes flagstat / idxstats files, then counts
    the reads of each bam with `samtools view -c` and derives one scaling factor per sample.

    NOTE(review): the function is declared `async` although its body contains no `await`.
    NOTE(review): commands are only built in the `pairedEnd` branches of the trimming and
    mapping steps — single-end input looks unsupported by these two steps; confirm.

    Args:
    -----
      refgenome: str the file path to the indexed reference genome (given to `bwa mem`)
      fastq: str or list[str] path(s) to the input file(s), only read when fastQfolder is empty.
        The input folder is derived from the (first) path. A list is grouped by pairs unless
        both totrim and tomap are false.
      fastQfolder: str the folder path where the fastq files are stored. When not empty, every
        file of the folder ending with '.fq.gz' or '.fastq.gz' is used, in sorted order.
      mapper: str flag to 'bwa', ... (not read by the body)
      pairedEnd: Bool set to true for paired end sequences. If true, files of a folder are
        grouped by pairs (when tomap or totrim is set) and should be named name_*1, name_*2
      cores: int given to h.parrun and to `samtools view -@`
      pathtosam: str samtools executable
      pathtotrim_galore: str trim_galore executable
      pathtobwa: str bwa executable
      totrim: Bool whether to trim; trimming only runs when tomap is also true
      tomap: Bool whether to map the reads
      tofilter: Bool whether to index the bams and write the flagstat / idxstats files. If false,
        only the files ending with '.sorted.bam' are counted.
      results: str prefix of every produced / read file; it is concatenated as-is with the file
        names, so it needs its trailing '/'
      toremove: Bool if true, `rm` commands for the inputs of the trimming / mapping steps are
        given to h.parrun through its `add` argument

    Returns:
    --------
      tuple (norm, mapped):
        norm: dict(name, float) the scaling factor dict: smallest read count over all samples
          divided by the read count of the sample
        mapped: dict(name, int) the read count of each sample
      where name is the bam file name up to its first '.'

    Raises:
    -------
      ValueError: if neither fastQfolder nor fastq is provided
    """
    # --- collect the input file names and the folder they live in ---
    if len(fastQfolder) > 0:
        print('using all files from folder')
        fastqs = os.listdir(fastQfolder)
        fastqs = [i for i in fastqs if '.fq.gz' == i[-6:] or '.fastq.gz' == i[-9:]]
        # sorting puts the two mates of a pair next to each other before grouping
        fastqs.sort()
        if pairedEnd and (tomap or totrim):
            print("need to be name_*1, name_*2")
            fastqs = [i for i in h.grouped(fastqs, 2)]
    elif fastq is None:
        raise ValueError('you need input files')
    else:
        if type(fastq) is list:
            print('your files need to be all in the same folder')
            # the folder of the first file is used for all of them
            fastQfolder = '/'.join(fastq[0].split('/')[:-1]) + '/'
            if not totrim and not tomap:
                fastqs = [f.split('/')[-1] for f in fastq]
            else:
                print("need to be name_*1, name_*2")
                fastqs = [[f[0].split('/')[-1], f[1].split('/')[-1]]
                          for f in h.grouped(fastq, 2)]
        else:
            fastQfolder = '/'.join(fastq.split('/')[:-1]) + '/'
            fastqs = [fastq.split('/')[-1]]
    print(fastqs)
    if not totrim:
        print("you need to have your files in the " + results + " folder")
    # --- trimming: one trim_galore command per pair, written to `results` ---
    if totrim and tomap:
        print("\n\ntrimming\n\n")
        if pairedEnd:
            cmds = []
            rm = []
            for file in fastqs:
                cmd = pathtotrim_galore + ' --paired --fastqc --gzip ' + fastQfolder + \
                    file[0] + ' ' + fastQfolder + file[1] + " -o " + results
                if toremove:
                    rm.append('rm ' + fastQfolder + file[0] + ' ' + fastQfolder + file[1])
                cmds.append(cmd)
            print(cmds)
            h.parrun(cmds, cores, add=rm)
            # continue with the trimmed files (name up to the first '.' + _val_1/_val_2 suffix)
            fastqs = [[file[0].split('.')[
                0] + '_val_1.fq.gz', file[1].split('.')[0] + '_val_2.fq.gz'] for file in fastqs]
    # --- mapping: bwa mem piped into samtools sort, one sorted bam per pair ---
    if tomap:
        print("\n\nmapping\n\n")
        if pairedEnd:
            cmds = []
            rm = []
            for file in fastqs:
                cmd = pathtobwa + ' mem ' + refgenome + ' ' + results + file[0] + ' ' + results +\
                    file[1] + ' | ' + pathtosam + ' sort - -o ' + \
                    results + file[0].split('.')[0] + '.sorted.bam'
                if toremove:
                    rm.append('rm ' + results + file[0] + ' ' + results + file[1])
                cmds.append(cmd)
        # NOTE(review): these two statements sit at the `if tomap` level; with pairedEnd false,
        # `cmds` / `rm` are not (re)built by this step — confirm the intended nesting
        h.parrun(cmds, cores, add=rm)
        fastqs = [file[0].split('.')[0] + '.sorted.bam' for file in fastqs]
    # --- indexing and statistics of the sorted bams ---
    if tofilter:
        print("\n\nfiltering\n\n")
        cmds = []
        rm = []
        h.parrun([pathtosam + ' index ' + results + file.split('.')
                  [0] + '.sorted.bam' for file in fastqs], cores)
        h.parrun([pathtosam + ' flagstat ' + results + file.split('.')[0] + '.sorted.bam > ' +
                  results + file.split('.')[0] + '.sorted.bam.flagstat' for file in fastqs], cores)
        h.parrun([pathtosam + ' idxstats ' + results + file.split('.')[0] + '.sorted.bam > ' +
                  results + file.split('.')[0] + '.sorted.bam.idxstat' for file in fastqs], cores)
        fastqs = [file.split('.')[0] + '.sorted.bam' for file in fastqs]
    else:
        print("files need to be named: NAME.sorted.bam")
        fastqs = [file for file in fastqs if '.sorted.bam' == file[-11:]]
    mapped = {}
    norm = {}
    unique_mapped = {}
    # --- counting: first line printed by `samtools view -c` for each bam ---
    print("\n\ncounting\n\n")
    for file in fastqs:
        # SAM flags: 0x4 read unmapped, 0x8 mate unmapped, 0x1 paired, 0x400 duplicate;
        # -F excludes, -f requires, -q 1 sets the minimum mapping quality
        mapped[file.split('.')[0]] = int(os.popen(pathtosam + ' view -c -F 0x004 -F 0x0008 -f 0x001 -F 0x0400 -q 1 ' + results +
                                                  file + ' -@ ' + str(cores)).read().split('\n')[0])
        # disabled alternative count based on bamtools:
        # unique_mapped[file.split('.')[0]] = int(re.findall("Mapped reads: (\d+)", os.popen('bamtools stats -in '+results +
        #                                             file + '.sorted.bam').read())[0])
    nbmapped = np.array([i for i in mapped.values()])
    # scale of a sample = smallest count / its own count (the smallest sample gets 1.0)
    nbmapped = np.sort(nbmapped)[0] / nbmapped.astype(float)
    for i, val in enumerate(mapped.keys()):
        norm[val] = nbmapped[i]
    return norm, mapped,  # unique_mapped