#!/usr/bin/python
# Collect the expression records belonging to each sample and gene, then
# start the output table, which carries a meanTPM column per sample and gene.
import sys
sys.path.append('src')
import myUtils as mu
import numpy as np

inFile1 = 'input/geneExpression.txt'
outFile1 = 'intermediate/08.txt'

(records1, header1, keys1) = mu.readRecords(inFile1,
                                            ['sampleReplicate', 'gene'])

# 'sample!gene' -> list of record keys sharing that sample and gene,
# kept in the order they appear in keys1
sampleGenes = dict()
for key in keys1:
    record = records1[key]
    sampleGene = record['sample'] + '!' + record['gene']
    sampleGenes.setdefault(sampleGene, []).append(key)

# output rows follow the sorted 'sample!gene' labels
sampleGenesOrdered = sorted(sampleGenes)

with open(outFile1, 'w') as out1:
    # header row
    header = ['sample', 'gene', 'meanTPM']
    out1.write('\t'.join(header) + '\n')
import re

# input catalogs
inFile1 = 'intermediate/fastq-catalog.txt'
inFile2 = 'intermediate/sample-replicate-catalog.txt'

# run settings
pipeline = 'pipeline1'
end = 'paired'
stranded = True
numThreads = '8'  # kept as a string
sh = pipeline + '/prefix.sh'

# annotation and aligner index locations
gtf = '/nfs/turbo/bankheadTurbo/annotation/refgene/20200130/refGene-sorted-canonical.gtf'
alignerIndexDir1 = '/nfs/turbo/bankheadTurbo/annotation/aligner-indexes/star/GRCh38'  # star
alignerIndexDir2 = '/nfs/turbo/bankheadTurbo/annotation/aligner-indexes/salmon/0.11.3/GRCh38/20200212/transcripts.idx'

(records1, header1, keys1) = mu.readRecords(inFile1, ['uniqueName', 'read'])
(records2, header2, keys2) = mu.readRecords(inFile2, ['sampleReplicate'])
# keys2 = keys2[:2] # testing purposes

fastqc = True
#fastqc = False # testing

# settings bundle; 'alignerIndexDir' is the first (star) index directory and
# 'records' is the fastq catalog read above
pars = {
    'pipeline': pipeline,
    'threads': numThreads,
    'alignerIndexDir': alignerIndexDir1,
    'end': end,
    'records': records1,
    'stranded': stranded,
    'gtf': gtf,
    'fastqc': fastqc,
}
#!/usr/bin/python
# Extend each differential-expression row with mean TPM columns for its
# treatment sample and the matching control sample.
import sys
sys.path.append('src')
import myUtils as mu

inFile1 = 'intermediate/07.txt'  # de results
inFile2 = 'intermediate/08.txt'  # mean expression
inFile3 = 'input/combos.txt'
outFile1 = 'intermediate/09.txt'

(records1, header1, keys1) = mu.readRecords(inFile1, ['sample', 'gene'])
(records2, header2, keys2) = mu.readRecords(inFile2, ['sample', 'gene'])
(records3, header3, keys3) = mu.readRecords(inFile3, ['tx'])

with open(outFile1, 'w') as out1:
    # header: the de columns followed by the four expression columns
    header = header1 + [
        'meanTPM', 'minMeanTPM', 'meanTPMTreatment', 'meanTPMControl'
    ]
    out1.write('\t'.join(header) + '\n')

    for key in keys1:
        record = records1[key]
        treatment = record['sample']
        cellline = record['cellline']
        gene = record['gene']

        # the combos table gives the control paired with this treatment
        control = records3[treatment]['ctrl']

        # 'sample!gene' labels for the treatment and control samples
        trKey = '!'.join([treatment, gene])
        coKey = '!'.join([control, gene])
# Set up the 'pipeline2' job directory and begin writing its shell prefix.
import os
import re

inFile1 = 'intermediate/sample-sheet.txt'
pipeline = 'pipeline2'
account = 'cdsc_project1'
jobName = pipeline
# combos 62 X 8
# combos 124 X 4
nodes = 32  # liberal
#nodes=20 # conservative
numThreads = str(8)
# the header template is chosen by thread count
shHeader = 'input/slurm-header-' + numThreads + '-array.sh'
(records1, header, keys1) = mu.readRecords(inFile1, ['fileID'])
# keep only the records whose disease is BLCA
keys1a = [key for key in keys1 if records1[key]['disease'] in ['BLCA']]
#keys1a = keys1a[:30] # testing
#keys1a = keys1a[:5] # testing
# NOTE(review): this testing truncation is active, so only the first BLCA
# record is kept -- confirm it is intended before a full run
keys1a = keys1a[:1]  # testing
# create the pipeline directory with its logs/ and scripts/ subdirectories
subprocess.check_call('mkdir -p ' + pipeline, shell=True)
subprocess.check_call('mkdir -p ' + '/'.join([pipeline, 'logs']), shell=True)
subprocess.check_call('mkdir -p ' + '/'.join([pipeline, 'scripts']), shell=True)
# create sh prefix and execution script
sh = '/'.join([pipeline, 'prefix.sh'])
execute = pipeline + '/execute.sh'
# write sh prefix (reads the header template; the body is outside this view)
with open(sh, 'w') as out1, open(shHeader, 'r') as in1:
sys.path.append('src')
import myUtils as mu
import os, re

# Combine the per-sample transcript tables found in inDir into one table.
inDir = '11-txts/'
outFile1 = 'intermediate/13.txt'

# only the transcript-level files, in sorted order
files = os.listdir(inDir)
files = sorted([file for file in files if 'transcript' in file])

# write the combined file
with open(outFile1, 'w') as out1:
    # take the column names from the first file and prepend the sample
    # description columns
    (records1, header1, keys1) = mu.readRecords(inDir + files[0],
                                                ['transcript'])
    header = ['sample', 'sampleReplicate', 'cellline', 'tx', 'replicate'
              ] + header1
    out1.write('\t'.join(header) + '\n')

    for file in files:
        inFile = inDir + file
        (records1, header1, keys1) = mu.readRecords(inFile, ['transcript'])

        # the file name without its suffix is the sample replicate label
        sampleReplicate = file.replace('-transcript.txt', '')
        # parenthesised form prints the same text under Python 2 and also
        # runs under Python 3
        print(sampleReplicate)

        # the label must have exactly three '_'-separated parts, otherwise
        # this unpacking raises ValueError
        cellline, tx, rep = sampleReplicate.split('_')
        sample = '_'.join([cellline, tx])
# Write an annotation table with one row per transcript listed in inFile2.
import sys
sys.path.append('src')
import myUtils as mu
import os
inFile1 = 'input/refseq2entrez.txt'
inFile2 = 'intermediate/06.txt' # cufflinks transcript list
inDir1 = '02-txts/' # cufflinks example file
outFile1 = 'intermediate/07.txt'
# NOTE(review): presumably transcripts that need special handling because
# they are absent from refgene -- the code using this list is outside this view
nonRefgeneTranscripts = ['ENST00000460036.1','TCONS_00009125','TCONS_00014146']
(records1,header1,keys1) = mu.readRecords(inFile1,['refgeneRefseq'])
(records2,header2,keys2) = mu.readRecords(inFile2,['transcript'])
# check that we have sufficient annotation for transcripts
# de-duplicate the transcript keys, then sort them so the output order is stable
isoformTranscripts1 = set(keys2)
isoformTranscripts = isoformTranscripts1
isoformTranscripts = sorted(list(isoformTranscripts))
with open(outFile1,'w') as out1:
    # assemble and write the header
    header = ['gene','transcript','hgnc','entrez','refgeneGene','isProteinCoding?']
    out1.write('\t'.join(header) + '\n')
    # look up annotation for each transcript (loop body is outside this view)
    for transcript in isoformTranscripts:
# Set up the 'pipeline1' job directory and begin writing its shell prefix.
import subprocess
import os
import re

inFile1 = 'intermediate/sample-replicate-catalog.txt'
pipeline = 'pipeline1'
account = 'neamati1'
jobName = pipeline
# combos 62 X 8
# combos 124 X 4
nodes = 25
numThreads = str(8)
# the header template is chosen by thread count
shHeader = 'input/slurm-header-' + numThreads + '-array.sh'
# only the keys are used; the records and header are discarded
(na, na, keys1) = mu.readRecords(inFile1, ['sampleReplicate'])
# keys1 = keys1[:2] # testing purposes
# create the pipeline directory with its logs/ and scripts/ subdirectories
subprocess.check_call('mkdir -p ' + pipeline, shell=True)
subprocess.check_call('mkdir -p ' + '/'.join([pipeline, 'logs']), shell=True)
subprocess.check_call('mkdir -p ' + '/'.join([pipeline, 'scripts']), shell=True)
# create sh prefix and execution script
sh = '/'.join([pipeline, 'prefix.sh'])
execute = pipeline + '/execute.sh'
# write sh prefix
with open(sh, 'w') as out1, open(shHeader, 'r') as in1:
    for line in in1: # copy prefix header except for array spec
#!/usr/bin/python
# Update the de file: load the de results and the gene annotation, then build
# the per-gene lookups used for the update.
import sys
sys.path.append('src')
import myUtils as mu

inFile1 = 'intermediate/06.txt'
inFile2 = 'input/geneAnnotation.txt'
outFile1 = 'intermediate/07.txt'

(records1, header1, keys1) = mu.readRecords(inFile1,
                                            ['cellline', 'tx', 'id'])
(records2, header2, keys2) = mu.readRecords(inFile2, ['gene', 'transcript'])

# gene -> entrez, built from the distinct (gene, entrez) pairs
gene2entrez = dict(
    set((records2[key]['gene'], records2[key]['entrez']) for key in keys2))

# gene -> every annotation key recorded for that gene, in keys2 order
gene2keys = dict()
for key in keys2:
    gene = records2[key]['gene']
    gene2keys.setdefault(gene, []).append(key)

genes = gene2keys.keys()
# Combine the per-sample gene tables found in inDir into one table.
import myUtils as mu
import os, re
inDir = '11-txts/'
outFile1 = 'intermediate/12.txt'
# keep only the files whose name contains 'gene', in sorted order
files = os.listdir(inDir)
files = [file for file in files if 'gene' in file]
files = sorted(files)
# write the combined file
with open(outFile1, 'w') as out1:
    # get example header and build on it
    (records1, header1, keys1) = mu.readRecords(inDir + files[0], ['gene'])
    header = ['sample', 'sampleReplicate', 'cellline', 'tx', 'replicate'
              ] + header1
    out1.write('\t'.join(header) + '\n')
    for file in files:
        inFile = inDir + file
        (records1, header1, keys1) = mu.readRecords(inFile, ['gene'])
        # the file name without its suffix is the sample replicate label
        sampleReplicate = file.replace('-gene.txt', '')
        # NOTE(review): print statement -- this script only runs under Python 2
        print sampleReplicate
        # the label must have exactly three '_'-separated parts, otherwise
        # this unpacking raises ValueError
        cellline, tx, rep = sampleReplicate.split('_')
        sample = '_'.join([cellline, tx])
        # iterate through records (loop body is outside this view)
        for myKey in keys1:
#!/usr/bin/python
# Combine counts with gene expression quantification: copy the expression
# table and extend it with a 'count' column.
import sys
sys.path.append('src')
import myUtils as mu

inFile1 = 'intermediate/03.txt'
inFile2 = 'intermediate/02.txt'
inFile3 = 'input/geneExpression.txt'
outFile1 = 'intermediate/04.txt'

# only the gene keys of the first table are needed
(na, na, genes) = mu.readRecords(inFile1, ['gene'])
(records2, header2, keys2) = mu.readRecords(inFile2,
                                            ['sampleReplicate', 'gene'])

# identity mapping over the genes of the first table
universe = dict(zip(genes, genes))

# write the combined file
with open(inFile3, 'r') as in3, open(outFile1, 'w') as out1:
    # header: the expression columns plus the new count column
    header = in3.readline()
    header = header.strip().split('\t')
    header += ['count']
    out1.write('\t'.join(header) + '\n')

    for line in in3:
        parse1 = line.strip().split('\t')
        # every data row must have one field per original header column;
        # raised explicitly so the check is not stripped by `python -O`
        if len(parse1) != len(header) - 1:
            raise ValueError('HEADER DOES NOT MATCH')
#!/usr/bin/python
# Reformat the fastq catalog into a table with the columns
# file, uniqueName, sampleReplicate and read.
import sys
sys.path.append('src')
import myUtils as mu
import re

inFile1 = 'input/fastqCatalog.txt'
outFile1 = 'intermediate/fastq-catalog.txt'

(records1, header1, keys1) = mu.readRecords(
    inFile1, ['run', 'sampleReplicate', 'read'])

# write the reformatted catalog
with open(outFile1, 'w') as out1:
    # header row
    header = ['file', 'uniqueName', 'sampleReplicate', 'read']
    out1.write('\t'.join(header) + '\n')

    # for each barcode write 2 lines
    for myKey in keys1:
        record = records1[myKey]

        # the unique name is the record key without its last two characters
        # (presumably the '!' separator and the read number -- confirm
        # against mu.readRecords)
        uniqueName = myKey[:-2]

        # extract the sample replicate from the key, which must have exactly
        # three '!'-separated parts; raised explicitly so the check is not
        # stripped by `python -O`
        parse1 = myKey.split('!')
        if len(parse1) != 3:
            raise ValueError('CANT PARSE UNIQUENAME!')
#!/usr/bin/python
# Clean up the fold change table:
#   1) rows not mapped to an entrez id ('NA') are dropped
#   2) when several genes map to the same entrez id, one gene is selected
# (filtering on protein-coding status is no longer applied)
import sys
sys.path.append('src')
import myUtils as mu

inFile1 = 'intermediate/09.txt'
outFile1 = 'intermediate/10.txt'

(records1, header1, keys1) = mu.readRecords(inFile1,
                                            ['sample', 'gene', 'entrez'])

# distinct record keys that carry a real entrez id
keys1a = list(set(key for key in keys1 if records1[key]['entrez'] != 'NA'))

# entrez id -> selected gene symbol; among the genes sharing an entrez id the
# one that compares smallest is kept, so the choice does not depend on the
# order of keys1a
entrez2gene = dict()
for key in keys1a:
    record = records1[key]
    entrez = record['entrez']
    gene = record['gene']
    if entrez not in entrez2gene or gene < entrez2gene[entrez]:
        entrez2gene[entrez] = gene
#!/usr/bin/python # read in run files from the core and map unique names to samples and replicates import sys sys.path.append('src') import myUtils as mu import re inFile1 = 'intermediate/02.txt' inFile2 = 'input/experimentalDesign.txt' outFile1 = 'intermediate/03.txt' (records1, header1, keys1) = mu.readRecords(inFile1, ['file']) (records2, header2, keys2) = mu.readRecords(inFile2, ['filePrefix']) # write yo file with open(outFile1, 'w') as out1: # write yo header header = header1 + ['sampleReplicate', 'sample', 'tx', 'rep'] out1.write('\t'.join(header) + '\n') for myKey in keys1: record = records1[myKey] # connect with sampleInfo file using sampleID and read keys2a = [key for key in keys2 if key in myKey] assert len(keys2a) == 1 record2 = records2[keys2a[0]] sampleReplicate, sample, tx, rep = record2['sampleReplicate'], record2[
#!/usr/bin/python
# Copy the rows whose criteria is 'FC2_q' into a new table, keeping the
# input columns and row order.
import sys
sys.path.append('src')
import myUtils as mu

inFile1 = 'intermediate/12.txt'
outFile1 = 'intermediate/13.txt'

(records1, header1, keys1) = mu.readRecords(inFile1,
                                            ['sample', 'gene', 'criteria'])

# keys of the records to keep
keys1a = [key for key in keys1 if records1[key]['criteria'] == 'FC2_q']

with open(outFile1, 'w') as out1:
    # header row is the input header unchanged
    out1.write('\t'.join(header1) + '\n')

    for key in keys1a:
        # one output row: the record's fields in header order
        lineOut = [records1[key][field] for field in header1]
        out1.write('\t'.join(lineOut) + '\n')
# wait for gsea to finish before running another
import sys
sys.path.append('src')
import myUtils as mu
import subprocess

inFile1 = 'input/catalog.txt'
numThreads = '4'  # kept as a string

perms = 10000  # should be 10000
# perms = 1000 # should be 10000
#perms = 100 # should be 10000

(records1, header1, keys1) = mu.readRecords(inFile1, ['label', 'pipeline'])

# pipeline name -> gene set (.gmt) file
pipeline2gmt = {
    'pipeline1a': 'input/c2.cp.kegg.v7.0.symbols.gmt',
    'pipeline1b': 'input/c5.bp.v7.0.symbols.gmt',
    'pipeline1c': 'input/h.all.v7.0.symbols.gmt',
    'pipeline1d': 'input/c3.tft.v7.0.symbols.gmt',
}

# pipeline name -> short label for the gene set collection
pipeline2genesets = {
    'pipeline1a': 'kegg',
    'pipeline1b': 'bp',
    'pipeline1c': 'hallmark',
    'pipeline1d': 'tft',
}

#keys1 = [keys1[0]]
# map refgene universe to hgnc gene universe
# get rid of NA
# take the mean of redundancies
import sys
sys.path.append('src')
import myUtils as mu
import os, re
import numpy as np

inDir = '02-txts'
inFile1 = 'intermediate/07.txt'  # annotation
outDir = '10-txts'

(records1, header1, keys1) = mu.readRecords(inFile1, ['transcript'])

# Each lookup below is built from the distinct pairs of two annotation columns.

# refgene gene name -> gene
refgene2gene = dict(
    set((records1[key]['refgeneGene'], records1[key]['gene'])
        for key in keys1))

# gene -> entrez id
gene2entrez = dict(
    set((records1[key]['gene'], records1[key]['entrez']) for key in keys1))

# transcript -> gene
transcript2Gene = dict(
    set((records1[key]['transcript'], records1[key]['gene'])
        for key in keys1))
#!/usr/bin/python # convert entrez 2 gene import sys sys.path.append('src') import myUtils as mu inFile1 = 'intermediate/02.txt' inFile2 = 'input/hgnc.txt' outFile1 = 'intermediate/03.txt' (records1, header1, keys1) = mu.readRecords(inFile1, ['label', 'geneset']) (records2, header2, keys2) = mu.readRecords(inFile2, ['hgnc_id']) # hgnc # map uniquely and deterministicaly to hgnc entrez2hgnc = dict() for myKey in keys2: entrez, hgnc = records2[myKey]['entrez_id'], records2[myKey]['hgnc_id'] if entrez not in entrez2hgnc: entrez2hgnc[entrez] = hgnc elif len(hgnc) <= len(entrez2hgnc[entrez]) and hgnc < entrez2hgnc[entrez]: entrez2hgnc[entrez] = hgnc entrez2gene = dict() # then use hgnc symbols for entrez in entrez2hgnc.keys(): entrez2gene[entrez] = records2[entrez2hgnc[entrez]]['symbol'] # write yo file with open(outFile1, 'w') as out1: