Python readRecords示例，myUtils.readRecords Python示例

示例#1

0

显示文件

#!/usr/bin/python
# calculate mean normalized log2 value per sample

import sys
sys.path.append('src')
import myUtils as mu

import numpy as np

# Input expression table and the output table written by this step.
inFile1 = 'input/geneExpression.txt'
outFile1 = 'intermediate/08.txt'

# Load the input; the second argument names the key columns
# (presumably combined into one record key -- verify in myUtils).
records1, header1, keys1 = mu.readRecords(inFile1, ['sampleReplicate', 'gene'])

# Group record keys by sample/gene pair.  The pair is encoded as a single
# 'sample!gene' string, so all replicates of one sample share an entry.
sampleGenes = {}
for key in keys1:
    sample = records1[key]['sample']
    gene = records1[key]['gene']
    sampleGene = '!'.join((sample, gene))
    sampleGenes.setdefault(sampleGene, []).append(key)

# Sorted pair names fix the order in which rows will be produced.
sampleGenesOrdered = sorted(sampleGenes)
with open(outFile1, 'w') as out1:

    # Tab-separated header row of the output table.
    header = ['sample', 'gene', 'meanTPM']
    out1.write('%s\n' % '\t'.join(header))

示例#2

0

显示文件

文件： pipeline-execute.py 项目： bankhead3/rnaseq-pipelines-v2

import re

# Catalog files read by this script.
inFile1 = 'intermediate/fastq-catalog.txt'
inFile2 = 'intermediate/sample-replicate-catalog.txt'

# Run configuration.
pipeline = 'pipeline1'
end = 'paired'
stranded = True

# Thread count is kept as a string.
numThreads = '8'

# Shell prefix script inside the pipeline directory.
sh = pipeline + '/prefix.sh'

# Annotation and aligner index locations.
gtf = '/nfs/turbo/bankheadTurbo/annotation/refgene/20200130/refGene-sorted-canonical.gtf'
alignerIndexDir1 = '/nfs/turbo/bankheadTurbo/annotation/aligner-indexes/star/GRCh38'  # star
alignerIndexDir2 = '/nfs/turbo/bankheadTurbo/annotation/aligner-indexes/salmon/0.11.3/GRCh38/20200212/transcripts.idx'

records1, header1, keys1 = mu.readRecords(inFile1, ['uniqueName', 'read'])
records2, header2, keys2 = mu.readRecords(inFile2, ['sampleReplicate'])
# For a quick test run, restrict to the first two entries:
# keys2 = keys2[:2]

# Set to False to leave FastQC out of a test run.
fastqc = True

# Parameter bundle; note that only the STAR index (alignerIndexDir1) is
# included here, alignerIndexDir2 is not.
pars = dict(
    pipeline=pipeline,
    threads=numThreads,
    alignerIndexDir=alignerIndexDir1,
    end=end,
    records=records1,
    stranded=stranded,
    gtf=gtf,
    fastqc=fastqc,
)

示例#3

0

显示文件

文件： 09-de-mean.py 项目： bankhead3/rnaseq-pipelines-v2

#!/usr/bin/python
# include mean treatment and control fpkm across replicates

import sys
sys.path.append('src')
import myUtils as mu

inFile1 = 'intermediate/07.txt'   # de results
inFile2 = 'intermediate/08.txt'   # mean expression
inFile3 = 'input/combos.txt'
outFile1 = 'intermediate/09.txt'

records1, header1, keys1 = mu.readRecords(inFile1, ['sample', 'gene'])
records2, header2, keys2 = mu.readRecords(inFile2, ['sample', 'gene'])
records3, header3, keys3 = mu.readRecords(inFile3, ['tx'])

# Output table: every input column plus four mean-expression columns.
with open(outFile1, 'w') as out1:

    meanColumns = ['meanTPM', 'minMeanTPM', 'meanTPMTreatment', 'meanTPMControl']
    header = header1 + meanColumns
    out1.write('%s\n' % '\t'.join(header))

    for key in keys1:
        record = records1[key]
        treatment = record['sample']
        cellline = record['cellline']
        gene = record['gene']

        # The combos table gives the control sample for each treatment.
        control = records3[treatment]['ctrl']

        # 'sample!gene' strings for the treatment and control rows
        # (presumably used to look up records2 -- the lookup is not in view).
        trKey = '!'.join([treatment, gene])
        coKey = '!'.join([control, gene])

示例#4

0

显示文件

文件： pipeline-setup-gdc.py 项目： bankhead3/rnaseq-pipelines-v2

import os
import re

# Sample sheet to read and the settings shared by the generated job scripts.
inFile1 = 'intermediate/sample-sheet.txt'
pipeline = 'pipeline2'
account = 'cdsc_project1'
jobName = pipeline

# combos 62 X 8
# combos 124 X 4
nodes = 32  # liberal
#nodes=20   # conservative
numThreads = str(8)
# The slurm header template is selected by thread count.
shHeader = 'input/slurm-header-' + numThreads + '-array.sh'

(records1, header, keys1) = mu.readRecords(inFile1, ['fileID'])
# Keep only records whose disease field is BLCA.
keys1a = [key for key in keys1 if records1[key]['disease'] in ['BLCA']]
#keys1a = keys1a[:30]  # testing
#keys1a = keys1a[:5]  # testing
# NOTE(review): this testing slice is active, so only the first matching
# record is processed -- confirm it is intended before a full run.
keys1a = keys1a[:1]  # testing

# Create the pipeline directory tree.  os.makedirs replaces the previous
# shell-string 'mkdir -p' calls: no shell is spawned and nothing is
# interpolated into a command line.  The isdir guard reproduces the
# "no error if it already exists" behaviour on Python 2 and 3 alike.
for outDir in (pipeline,
               '/'.join([pipeline, 'logs']),
               '/'.join([pipeline, 'scripts'])):
    if not os.path.isdir(outDir):
        os.makedirs(outDir)

# create sh prefix and execution script
sh = '/'.join([pipeline, 'prefix.sh'])
execute = pipeline + '/execute.sh'

# write sh prefix
with open(sh, 'w') as out1, open(shHeader, 'r') as in1:

示例#5

0

显示文件

文件： 13-compile-transcript.py 项目： bankhead3/rnaseq-pipelines-v2

sys.path.append('src')
import myUtils as mu

import os, re

# Directory of per-replicate text files and the compiled output table.
inDir = '11-txts/'
outFile1 = 'intermediate/13.txt'

# Only the transcript-level files, in sorted (reproducible) order.
files = os.listdir(inDir)
files = sorted([file for file in files if 'transcript' in file])

# write the compiled table
with open(outFile1, 'w') as out1:

    # Use the first file's header as the template and prepend the sample
    # annotation columns (assumes every file shares it -- TODO confirm).
    (records1, header1, keys1) = mu.readRecords(inDir + files[0],
                                                ['transcript'])
    header = ['sample', 'sampleReplicate', 'cellline', 'tx', 'replicate'
              ] + header1
    out1.write('\t'.join(header) + '\n')

    for file in files:
        inFile = inDir + file
        (records1, header1, keys1) = mu.readRecords(inFile, ['transcript'])

        # The file name minus its suffix is the sample-replicate label.
        sampleReplicate = file.replace('-transcript.txt', '')

        # Progress output.  The parenthesised form is valid on Python 3 and
        # prints the same text on Python 2 for a single argument.
        print(sampleReplicate)

        # The label must split into exactly three '_'-separated parts,
        # otherwise this unpacking raises ValueError.
        cellline, tx, rep = sampleReplicate.split('_')
        sample = '_'.join([cellline, tx])

示例#6

0

显示文件


import sys
sys.path.append('src') 
import myUtils as mu

import os

inFile1 = 'input/refseq2entrez.txt'
inFile2 = 'intermediate/06.txt'  # cufflinks transcript list
inDir1 = '02-txts/'  # cufflinks example file
outFile1 = 'intermediate/07.txt'

# Transcript ids named as not belonging to refgene (not used in the lines
# visible here).
nonRefgeneTranscripts = [
    'ENST00000460036.1',
    'TCONS_00009125',
    'TCONS_00014146',
]

records1, header1, keys1 = mu.readRecords(inFile1, ['refgeneRefseq'])
records2, header2, keys2 = mu.readRecords(inFile2, ['transcript'])

# Unique transcript ids from the transcript list, in sorted order.
isoformTranscripts1 = set(keys2)
isoformTranscripts = sorted(isoformTranscripts1)

with open(outFile1, 'w') as out1:

    # Header row of the annotation table.
    header = [
        'gene',
        'transcript',
        'hgnc',
        'entrez',
        'refgeneGene',
        'isProteinCoding?',
    ]
    out1.write('%s\n' % '\t'.join(header))

    # look up annotation for each transcript
    for transcript in isoformTranscripts:

示例#7

0

显示文件

文件： pipeline-setup.py 项目： bankhead3/rnaseq-pipelines-v2

import subprocess
import os
import re

# Catalog to read and the settings shared by the generated job scripts.
inFile1 = 'intermediate/sample-replicate-catalog.txt'
pipeline = 'pipeline1'
account = 'neamati1'
jobName = pipeline

# combos 62 X 8
# combos 124 X 4
nodes = 25
numThreads = str(8)
# The slurm header template is selected by thread count.
shHeader = 'input/slurm-header-' + numThreads + '-array.sh'

# Only the key list is needed; the first two return values are discarded.
(na, na, keys1) = mu.readRecords(inFile1, ['sampleReplicate'])
# keys1 = keys1[:2]  # testing purposes

# Create the pipeline directory tree.  os.makedirs replaces the previous
# shell-string 'mkdir -p' calls: no shell is spawned and nothing is
# interpolated into a command line.  The isdir guard reproduces the
# "no error if it already exists" behaviour on Python 2 and 3 alike.
for outDir in (pipeline,
               '/'.join([pipeline, 'logs']),
               '/'.join([pipeline, 'scripts'])):
    if not os.path.isdir(outDir):
        os.makedirs(outDir)

# create sh prefix and execution script
sh = '/'.join([pipeline, 'prefix.sh'])
execute = pipeline + '/execute.sh'

# write sh prefix
with open(sh, 'w') as out1, open(shHeader, 'r') as in1:
    for line in in1:
        # copy prefix header except for array spec

示例#8

0

显示文件

#!/usr/bin/python
# update de file

import sys
sys.path.append('src')
import myUtils as mu

inFile1 = 'intermediate/06.txt'
inFile2 = 'input/geneAnnotation.txt'
outFile1 = 'intermediate/07.txt'

records1, header1, keys1 = mu.readRecords(inFile1, ['cellline', 'tx', 'id'])
records2, header2, keys2 = mu.readRecords(inFile2, ['gene', 'transcript'])

# gene -> entrez, built from the distinct (gene, entrez) pairs.
# NOTE(review): if one gene appears with several entrez values, the one
# that survives depends on set iteration order.
genePairs = set(
    (records2[key]['gene'], records2[key]['entrez']) for key in keys2)
gene2entrez = dict(genePairs)

# gene -> list of annotation record keys for that gene.
gene2keys = {}
for key in keys2:
    gene = records2[key]['gene']
    gene2keys.setdefault(gene, []).append(key)

genes = gene2keys.keys()

示例#9

0

显示文件

import myUtils as mu

import os, re

# Directory of per-replicate text files and the compiled output table.
inDir = '11-txts/'
outFile1 = 'intermediate/12.txt'

# Only the gene-level files, in sorted (reproducible) order.
files = os.listdir(inDir)
files = [file for file in files if 'gene' in file]
files = sorted(files)

# write the compiled table
with open(outFile1, 'w') as out1:

    # Use the first file's header as the template and prepend the sample
    # annotation columns (assumes every file shares it -- TODO confirm).
    (records1, header1, keys1) = mu.readRecords(inDir + files[0], ['gene'])
    header = ['sample', 'sampleReplicate', 'cellline', 'tx', 'replicate'
              ] + header1
    out1.write('\t'.join(header) + '\n')

    for file in files:
        inFile = inDir + file
        (records1, header1, keys1) = mu.readRecords(inFile, ['gene'])

        # The file name minus its suffix is the sample-replicate label.
        sampleReplicate = file.replace('-gene.txt', '')
        # Progress output.  The parenthesised form is valid on Python 3 and
        # prints the same text on Python 2 for a single argument.
        print(sampleReplicate)
        # The label must split into exactly three '_'-separated parts,
        # otherwise this unpacking raises ValueError.
        cellline, tx, rep = sampleReplicate.split('_')
        sample = '_'.join([cellline, tx])

        # iterate through records
        for myKey in keys1:

示例#10

0

显示文件

文件： 04-reduce-2-universe.py 项目： bankhead3/rnaseq-pipelines-v2

#!/usr/bin/python
# combine counts with gene expression quantification

import sys
sys.path.append('src')
import myUtils as mu

inFile1 = 'intermediate/03.txt'
inFile2 = 'intermediate/02.txt'
inFile3 = 'input/geneExpression.txt'
outFile1 = 'intermediate/04.txt'

# Only the key list (the genes) is taken from the first file.
na, na, genes = mu.readRecords(inFile1, ['gene'])
records2, header2, keys2 = mu.readRecords(inFile2, ['sampleReplicate', 'gene'])

# Identity mapping over the gene universe, for constant-time membership.
universe = dict((gene, gene) for gene in genes)

# Stream the expression table through, adding a 'count' column.
with open(inFile3, 'r') as in3, open(outFile1, 'w') as out1:

    # The output header is the input header plus 'count'.
    header = in3.readline().strip().split('\t') + ['count']
    out1.write('%s\n' % '\t'.join(header))

    for line in in3:
        parse1 = line.strip().split('\t')
        # Each data row must have one field fewer than the extended header.
        assert len(header) - 1 == len(parse1), 'HEADER DOES NOT MATCH'

示例#11

0

显示文件

文件： 02-build-fastq-catalog.py 项目： bankhead3/rnaseq-pipelines-v2

#!/usr/bin/python
# reformats fastq catalog

import sys
sys.path.append('src')
import myUtils as mu

import re

inFile1 = 'input/fastqCatalog.txt'
outFile1 = 'intermediate/fastq-catalog.txt'

records1, header1, keys1 = mu.readRecords(
    inFile1, ['run', 'sampleReplicate', 'read'])

# Write the reformatted catalog.
with open(outFile1, 'w') as out1:

    # Tab-separated header row.
    header = ['file', 'uniqueName', 'sampleReplicate', 'read']
    out1.write('%s\n' % '\t'.join(header))

    # One pass over every catalog record.
    for myKey in keys1:
        record = records1[myKey]

        # The unique name is the record key minus its last two characters.
        uniqueName = myKey[:-2]

        # The key must break into exactly three '!'-separated parts
        # (one per key column) before the sample replicate is extracted.
        parse1 = myKey.split('!')
        assert len(parse1) == 3, 'CANT PARSE UNIQUENAME!'

示例#12

0

显示文件

文件： 10-polish-gene-universe.py 项目： bankhead3/rnaseq-pipelines-v2

#!/usr/bin/python
# this script cleans up fold change table by getting rid of the following:
# 1) non-protein coding  <--- no longer
# 2) genes not mapped to entrez
# 3) multiple genes mapping to the same entrez

import sys
sys.path.append('src')
import myUtils as mu

inFile1 = 'intermediate/09.txt'
outFile1 = 'intermediate/10.txt'

records1, header1, keys1 = mu.readRecords(inFile1, ['sample', 'gene', 'entrez'])

# Keep the records that are mapped to an entrez id.  (An earlier version
# also required isProteinCoding == 'Y'; that filter is no longer applied.)
keys1a = list(set(key for key in keys1 if records1[key]['entrez'] != 'NA'))

# Choose one gene symbol per entrez id: the smallest symbol under string
# comparison wins, whatever order the keys arrive in.
entrez2gene = {}
for key in keys1a:
    record = records1[key]
    entrez = record['entrez']
    gene = record['gene']

    if entrez not in entrez2gene or gene < entrez2gene[entrez]:
        entrez2gene[entrez] = gene

示例#13

0

显示文件

#!/usr/bin/python
# read in run files from the core and map unique names to samples and replicates

import sys
sys.path.append('src')
import myUtils as mu

import re

inFile1 = 'intermediate/02.txt'
inFile2 = 'input/experimentalDesign.txt'
outFile1 = 'intermediate/03.txt'

records1, header1, keys1 = mu.readRecords(inFile1, ['file'])
records2, header2, keys2 = mu.readRecords(inFile2, ['filePrefix'])

# Write the catalog with the experimental-design columns appended.
with open(outFile1, 'w') as out1:

    designColumns = ['sampleReplicate', 'sample', 'tx', 'rep']
    header = header1 + designColumns
    out1.write('%s\n' % '\t'.join(header))

    for myKey in keys1:
        record = records1[myKey]

        # Match by substring: exactly one design key (filePrefix) must
        # occur inside this record's key, otherwise the assert fires.
        keys2a = [key for key in keys2 if key in myKey]
        assert len(keys2a) == 1
        record2 = records2[keys2a[0]]
        sampleReplicate, sample, tx, rep = record2['sampleReplicate'], record2[

示例#14

0

显示文件

文件： 13-filter.py 项目： bankhead3/rnaseq-pipelines-v2

#!/usr/bin/python
# add is de using 3 different criteria

import sys
sys.path.append('src')
import myUtils as mu

inFile1 = 'intermediate/12.txt'
outFile1 = 'intermediate/13.txt'

records1, header1, keys1 = mu.readRecords(
    inFile1, ['sample', 'gene', 'criteria'])

# Only the rows evaluated under the 'FC2_q' criteria are kept.
keys1a = [key for key in keys1 if records1[key]['criteria'] == 'FC2_q']

# Copy the surviving rows out with the input's column layout.
with open(outFile1, 'w') as out1:

    # The header is reused unchanged from the input.
    out1.write('%s\n' % '\t'.join(header1))

    for key in keys1a:
        # Field values in header order, tab-joined into one line.
        lineOut = [records1[key][field] for field in header1]
        out1.write('%s\n' % '\t'.join(lineOut))

示例#15

0

显示文件

# wait for gsea to finish before running another

import sys
sys.path.append('src')
import myUtils as mu

import subprocess

inFile1 = 'input/catalog.txt'

# Thread count is kept as a string.
numThreads = '4'
# Permutation count; 10000 is the intended setting (1000 and 100 were
# used as smaller alternatives).
perms = 10000

records1, header1, keys1 = mu.readRecords(inFile1, ['label', 'pipeline'])

# One row per pipeline: (pipeline name, gene-set label, gmt file).
# Both lookup dictionaries below are derived from this single table.
pipelineTable = [
    ('pipeline1a', 'kegg', 'input/c2.cp.kegg.v7.0.symbols.gmt'),
    ('pipeline1b', 'bp', 'input/c5.bp.v7.0.symbols.gmt'),
    ('pipeline1c', 'hallmark', 'input/h.all.v7.0.symbols.gmt'),
    ('pipeline1d', 'tft', 'input/c3.tft.v7.0.symbols.gmt'),
]
pipeline2gmt = dict((name, gmt) for name, label, gmt in pipelineTable)
pipeline2genesets = dict((name, label) for name, label, gmt in pipelineTable)

#keys1 = [keys1[0]]

示例#16

0

显示文件

# map refgene universe to hgnc gene universe
# get rid of NA
# take the mean of redundancies

import sys
sys.path.append('src')
import myUtils as mu

import os, re
import numpy as np

inDir = '02-txts'
inFile1 = 'intermediate/07.txt'  # annotation
outDir = '10-txts'

records1, header1, keys1 = mu.readRecords(inFile1, ['transcript'])


def _fieldMap(fromField, toField):
    """Return a dict from fromField values to toField values.

    Built from the distinct (fromField, toField) pairs across all
    annotation records; when one source value appears with several
    targets, the surviving one depends on set iteration order.
    """
    pairs = set(
        (records1[key][fromField], records1[key][toField]) for key in keys1)
    return dict(pairs)


# refgene gene name -> gene
refgene2gene = _fieldMap('refgeneGene', 'gene')
# gene -> entrez id
gene2entrez = _fieldMap('gene', 'entrez')
# transcript -> gene
transcript2Gene = _fieldMap('transcript', 'gene')

示例#17

0

显示文件

#!/usr/bin/python
# convert entrez 2 gene

import sys
sys.path.append('src')
import myUtils as mu

inFile1 = 'intermediate/02.txt'
inFile2 = 'input/hgnc.txt'
outFile1 = 'intermediate/03.txt'

records1, header1, keys1 = mu.readRecords(inFile1, ['label', 'geneset'])
records2, header2, keys2 = mu.readRecords(inFile2, ['hgnc_id'])  # hgnc

# Pick a single hgnc id per entrez id.  A later candidate replaces the
# stored one only when it is no longer than it AND sorts before it as a
# string.
# NOTE(review): that rule can depend on input order (a shorter id that
# sorts later never replaces a longer one) -- confirm this is intended.
entrez2hgnc = {}
for myKey in keys2:
    entrez = records2[myKey]['entrez_id']
    hgnc = records2[myKey]['hgnc_id']
    if entrez in entrez2hgnc:
        current = entrez2hgnc[entrez]
        if not (len(hgnc) <= len(current) and hgnc < current):
            continue
    entrez2hgnc[entrez] = hgnc

# Translate each chosen hgnc id into its symbol; records2 is indexed by
# hgnc id.
entrez2gene = {}
for entrez, hgncId in entrez2hgnc.items():
    entrez2gene[entrez] = records2[hgncId]['symbol']

# write yo file
with open(outFile1, 'w') as out1: