Python Pipeline.run примеры, CGAT.Pipeline.run Python примеры использования

Пример #1

0

Показать файл

Файл: pipeline_hvc.py Проект: BioinformaticsArchive/cgat

def computeOverlapCoding( infile, outfile ):
    '''compute overlap between coding markers and windows.

    This is done by setting the gene_id and transcript_id of markers to the ENSEMBL gene id
    and transcript_id that it overlaps with. Markers not overlapping an ENSEMBL gene id
    are removed.
    '''
    
    to_cluster = True
    tmpfilename = P.getTempFilename( dir = "." )
    
    statement = '''python %(scriptsdir)s/gtf2gtf.py
    --rename=gene \
    --apply=ensembl.diff.genes_ovl \
    < %(infile)s > %(tmpfilename)s
    '''
    
    P.run( **dict( locals().items() + PARAMS.items() ) )

    statement = '''python %(scriptsdir)s/gff2table.py 
    --filename-windows=<(python %(scriptsdir)s/bed2gff.py < windows.bed) 
    --decorator=counts
    --filename-data=%(tmpfilename)s \
    --skip-empty \
    --is-gtf \
    --log=%(outfile)s.log \
    < %(genome)s.fasta > %(outfile)s'''

    P.run( **dict( locals().items() + PARAMS.items() ) )

    os.unlink( tmpfilename )

Пример #2

0

Показать файл

Файл: pipeline_rnaseq_geneset.py Проект: jmadzo/cgat

def buildDownstreamFlankBed(infile, outfile):
    """ build interval downstream of gene start for each entry in bed file"""
    window = PARAMS["geneset_flank"]
    faidx = PARAMS["faidx"]
    statement = """flankBed -i %(infile)s -g %(faidx)s -l 0 -r %(window)s -s 
                   | python %(scriptsdir)s/bed2bed.py --method=filter-genome --genome-file=%(genome_dir)s/%(genome)s --log %(outfile)s.log > %(outfile)s"""
    P.run()

Пример #3

0

Показать файл

Файл: PipelineWindows.py Проект: lesheng/cgat

def countReadsWithinWindows(bedfile,
                            windowfile,
                            outfile,
                            counting_method="midpoint"):
    '''count reads given in *tagfile* within intervals in 
    *windowfile*.

    Both files need to be :term:`bed` formatted.

    Counting is done using bedtools. The counting method
    can be 'midpoint' or 'nucleotide'.
    '''
    job_options = "-l mem_free=4G"

    if counting_method == "midpoint":
        f = '''| awk '{a = $2+($3-$2)/2; printf("%s\\t%i\\t%i\\n", $1, a, a+1)}' '''
    elif counting_method == "nucleotide":
        f = ""
    else:
        raise ValueError("unknown counting method: %s" % counting_method)

    statement = '''
    zcat %(bedfile)s
    %(f)s
    | coverageBed -a stdin -b %(windowfile)s -split
    | sort -k1,1 -k2,2n
    | gzip
    > %(outfile)s
    '''

    P.run()

Пример #4

0

Показать файл

Файл: pipeline_rnaseq_geneset.py Проект: jmadzo/cgat

def ExtendRegion(infile, outfile):
    """convert bed to gtf"""
    statement = """gunzip < %(infile)s 
                   | slopBed -i stdin -g %(faidx)s -b 1000  
                   | gzip
                   > %(outfile)s """
    P.run()

Пример #5

0

Показать файл

Файл: pipeline_rnaseq_geneset.py Проект: jmadzo/cgat

def getNoncodingGeneset(infile, outfile):
    """Assume that all transcripts the do not overlap with ensembl coding geneset are noncoding """
    ensembl_transcripts = PARAMS["ensembl_transcripts"]
    statement = """cat %(infile)s | intersectBed -a stdin -b %(ensembl_transcripts)s -v -s > %(outfile)s;
                   echo "transcripts without ensembl coding overlap: " > %(outfile)s.count; 
                   cat %(outfile)s | wc -l >> %(outfile)s.count;"""
    P.run()

Пример #6

0

Показать файл

Файл: pipeline_rnaseq_geneset.py Проект: jmadzo/cgat

def addMissingNoncodingTranscripts(infile, outfile):
    """ Add ensembl gene id to GTF file"""
    ensembl_noncoding = PARAMS["ensembl_noncoding_gtf"]
    statement = """intersectBed -a %(ensembl_noncoding)s -b %(infile)s  -v -s -f 1 -r > transcripts/missing_ensembl_noncoding_transcripts.gtf;
                   cat %(infile)s transcripts/missing_ensembl_noncoding_transcripts.gtf | sort -k1,1 -k4,4n
                   > %(outfile)s;"""
    P.run()

Пример #7

0

Показать файл

Файл: mapping_titration.py Проект: BioinformaticsArchive/cgat

def buildBAMStats( infile, outfile ):
    '''Count number of reads mapped, duplicates, etc. '''
    to_cluster = USECLUSTER
    scriptsdir = PARAMS["general_scriptsdir"]
    statement = '''python %(scriptsdir)s/bam2stats.py --force 
                   --output-filename-pattern=%(outfile)s.%%s < %(infile)s > %(outfile)s'''
    P.run()

Пример #8

0

Показать файл

Файл: pipeline_variant_annotation.py Проект: jmadzo/cgat

def loadEffects(infile, outfile):
    '''load transcript effects into tables.'''

    root = infile[:-len(".effects.gz")]

    statement = '''
   python %(scriptsdir)s/csv2db.py %(csv2db_options)s \
              --from-zipped \
              --index=transcript_id \
              --table=%(root)s_effects \
    < %(infile)s > %(outfile)s
    '''
    P.run()

    for suffix in ("cds", "intron", "splicing", "translation"):

        statement = '''
        gunzip < %(infile)s.%(suffix)s.gz
        | python %(scriptsdir)s/csv2db.py %(csv2db_options)s 
        --allow-empty
        --index=transcript_id 
        --table=%(root)s_effects_%(suffix)s 
        --ignore-column=seq_na
        --ignore-column=seq_aa
        >> %(outfile)s
        '''
        P.run()

Пример #9

0

Показать файл

Файл: pipeline_fastqToBigWig.py Проект: Charlie-George/cgat

def loadPicardDuplicateStats(infiles, outfile):
    '''Merge Picard duplicate stats into single table and load into SQLite.'''
    # Join data for all tracks into single file
    outf = open('dupstats.txt', 'w')
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".dedup.bam")
        statfile = P.snip(f, ".bam") + ".dupstats"
        if not os.path.exists(statfile):
            E.warn("File %s missing" % statfile)
            continue
        lines = [x for x in open(
            statfile, "r").readlines() if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        outf.write("%s\t%s" % (track, lines[1]))

    outf.close()
    tmpfilename = outf.name

    # Load into database
    tablename = P.toTable(outfile)
    statement = '''cat %(tmpfilename)s
                | python %(scriptsdir)s/csv2db.py
                      --index=track
                      --table=%(tablename)s 
                > %(outfile)s '''
    P.run()

Пример #10

0

Показать файл

Файл: pipeline_fastqToBigWig.py Проект: Charlie-George/cgat

def loadPicardAlignStats(infiles, outfile):
    '''Merge Picard alignment stats into single table and load into SQLite.'''
    # Join data for all tracks into single file
    outf = P.getTempFile()
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".alignstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [
            x for x in open(f, "r").readlines() if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))
    outf.close()
    tmpfilename = outf.name

    # Load into database
    tablename = P.toTable(outfile)
    statement = '''cat %(tmpfilename)s
                | python %(scriptsdir)s/csv2db.py
                      --index=track
                      --table=%(tablename)s 
                > %(outfile)s'''
    P.run()
    os.unlink(tmpfilename)

Пример #11

0

Показать файл

Файл: pipeline_rnaseqlncrna.py Проект: Charlie-George/cgat

def createMAFAlignment(infiles, outfile):
    """
    Takes all .axt files in the input directory, filters them to remove
    files based on supplied regular expressions, converts to a single maf file
    using axtToMaf, filters maf alignments under a specified length.
    """
    outfile = P.snip(outfile, ".gz")
    axt_dir = PARAMS["phyloCSF_location_axt"]
    to_ignore = re.compile(PARAMS["phyloCSF_ignore"])

    axt_files = []
    for axt_file in os.listdir(axt_dir):
        if axt_file.endswith("net.axt.gz") and not to_ignore.search(axt_file):
            axt_files.append(os.path.join(axt_dir, axt_file))
    axt_files = (" ").join(sorted(axt_files))

    E.info("axt files from which MAF alignment will be created: %s" %
           axt_files)

    target_genome = PARAMS["phyloCSF_target_genome"]
    target_contigs = os.path.join(PARAMS["annotations_annotations_dir"],
                                  PARAMS_ANNOTATIONS["interface_contigs"])
    query_genome = PARAMS["phyloCSF_query_genome"]
    query_contigs = os.path.join(PARAMS["phyloCSF_query_assembly"],
                                 PARAMS_ANNOTATIONS["interface_contigs"])

    tmpf1 = P.getTempFilename("./phyloCSF")
    tmpf2 = P.getTempFilename("./phyloCSF")
    to_cluster = False
    # concatenate axt files, then remove headers
    statement = ("zcat %(axt_files)s"
                 " > %(tmpf1)s;"
                 " axtToMaf "
                 "  -tPrefix=%(target_genome)s."
                 "  -qPrefix=%(query_genome)s."
                 "  %(tmpf1)s"
                 "  %(target_contigs)s"
                 "  %(query_contigs)s"
                 "  %(tmpf2)s")
    P.run()

    E.info("Temporary axt file created %s" % os.path.abspath(tmpf1))
    E.info("Temporary maf file created %s" % os.path.abspath(tmpf2))

    removed = P.snip(outfile, ".maf") + "_removed.maf"
    to_cluster = False
    filtered = PipelineLncRNA.filterMAF(tmpf2,
                                        outfile,
                                        removed,
                                        PARAMS["phyloCSF_filter_alignments"])
    E.info("%s blocks were ignored in MAF alignment"
           " because length of target alignment was too short" % filtered[0])
    E.info("%s blocks were output to filtered MAF alignment" % filtered[1])

    os.unlink(tmpf1)
    os.unlink(tmpf2)
    to_cluster = False
    statement = ("gzip %(outfile)s;"
                 " gzip %(removed)s")
    P.run()

Пример #12

0

Показать файл

Файл: PipelineMotifs.py Проект: lesheng/cgat

def runGLAM2SCAN(infiles, outfile):
    '''run glam2scan on all intervals and motifs.
    '''

    to_cluster = True
    # only use new nodes, as /bin/csh is not installed
    # on the old ones.
    # job_options = "-l mem_free=8000M"

    controlfile, dbfile, motiffiles = infiles
    controlfile = dbfile[:-len(".fasta")] + ".controlfasta"
    if not os.path.exists(controlfile):
        raise P.PipelineError(
            "control file %s for %s does not exist" % (controlfile, dbfile))

    if os.path.exists(outfile):
        os.remove(outfile)

    for motiffile in motiffiles:
        of = IOTools.openFile(outfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s ::\n" % motif)
        of.close()

        statement = '''
        cat %(dbfile)s %(controlfile)s | %(execglam2scan)s -2 -n %(glam2scan_results)i n %(motiffile)s - >> %(outfile)s
        '''
        P.run()

Пример #13

0

Показать файл

Файл: pipeline_rnaseqlncrna.py Проект: Charlie-George/cgat

def extractLncRNAFastaAlignments(infiles, outfile):
    """
    Recieves a MAF file containing pairwise alignments and a gtf12 file
    containing intervals. Outputs a single fasta file containing aligned
    sequence for each interval.
    """
    bed_file, maf_file = infiles
    maf_tmp = P.getTempFilename("./phyloCSF")
    to_cluster = False
    statement = ("gunzip -c %(maf_file)s > %(maf_tmp)s")
    P.run()

    target_genome = PARAMS["genome"]
    query_genome = PARAMS["phyloCSF_query_genome"]

    genome_file = os.path.join(PARAMS["genomedir"], PARAMS["genome"])

    gene_models = PipelineLncRNA.extractMAFGeneBlocks(bed_file,
                                                      maf_tmp,
                                                      genome_file,
                                                      outfile,
                                                      target_genome,
                                                      query_genome,
                                                      keep_gaps=False)
    E.info("%i gene_models extracted" % gene_models)
    os.unlink(maf_tmp)

Пример #14

0

Показать файл

Файл: pipeline_rnaseqlncrna.py Проект: Charlie-George/cgat

    def buildFilteredLncRNAGeneSet(infile, outfile):
        '''
        Depending on on filtering_remove_single_exon will:
        i) remove all single exon transcripts from all lncrna models 
        (transcripts)
        ii) remove lncrna loci that only contain single exon transcripts 
        (loci)
        iii) leave all single-exon and multi-exon loci in outfile 
        (None)
        '''

        if not PARAMS["filtering_remove_single_exon"]:
            E.info("Both multi-exon and single-exon lncRNA are retained!")
            statement = ("cp %(infile)s %(outfile)s")
        elif PARAMS["filtering_remove_single_exon"] == "loci":
            E.info("Warning: removing all single-exon"
                   " transcripts from lncRNA set")
            statement = ("zcat %(infile)s |"
                         " grep 'exon_status_locus \"s\"'"
                         " gzip > %(outfile)s")
        elif PARAMS["filtering_remove_single_exon"] == "transcripts":
            E.info("Warning: removing loci with only single-exon transcripts")
            statement = ("zcat %(infile)s |"
                         " grep 'exon_status \"s\"'"
                         " gzip > %(outfile)s")
        else:
            raise ValueError("Unregocnised parameter %s"
                             % PARAMS["filtering_remove_single_exon"])
        P.run()

Пример #15

0

Показать файл

Файл: pipeline_motifs.py Проект: BioinformaticsArchive/cgat

def exportMotifLocations( infiles, outfile ):
    '''export motif locations. There will be a bed-file per motif.

    Overlapping motif matches in different tracks will be merged.
    '''

    dbh = connect()
    cc = dbh.cursor()

    motifs = [ x[0] for x in cc.execute( "SELECT motif FROM motif_info" ).fetchall()]

    
    for motif in motifs:

        tmpf = P.getTempFile(".")
        
        for infile in infiles:
            table = P.toTable(infile) 
            track = P.snip( table, "_mast" )
            for x in cc.execute( """SELECT contig, start, end, '%(track)s', evalue
                                   FROM %(table)s WHERE motif = '%(motif)s' AND start IS NOT NULL""" % locals() ):
                tmpf.write( "\t".join( map(str, x) ) + "\n" )
        tmpf.close()

        outfile = os.path.join( PARAMS["exportdir"], "motifs", "%s.bed.gz" % motif )
        tmpfname = tmpf.name 

        statement = '''mergeBed -i %(tmpfname)s -nms | gzip > %(outfile)s'''
        P.run()

        os.unlink( tmpf.name )

Пример #16

0

Показать файл

Файл: PipelineMotifs.py Проект: lesheng/cgat

def collectMEMEResults(tmpdir, target_path, outfile):
    '''collect output from a MEME run in tmpdir
    and copy all over to target_path

    convert images output by MEME (.eps files) to 
    .png files.'''

    # copy over results
    try:
        os.makedirs(os.path.dirname(target_path))
    except OSError:
        # ignore "file exists" exception
        pass

    if os.path.exists(target_path):
        shutil.rmtree(target_path)
    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "meme.txt"), outfile)

    # convert images to png
    epsfiles = glob.glob(os.path.join(target_path, "*.eps"))

    for epsfile in epsfiles:
        b, ext = os.path.splitext(epsfile)
        pngfile = b + ".png"
        statement = '''convert %(epsfile)s %(pngfile)s '''
        P.run()

Пример #17

0

Показать файл

Файл: PipelineMotifs.py Проект: lesheng/cgat

def runTomTom(infile, outfile):
    '''compare ab-initio motifs against tomtom.'''

    tmpdir = P.getTempDir(".")

    to_cluster = True
    databases = " ".join(P.asList(PARAMS["tomtom_databases"]))

    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "tomtom", outfile)

    if IOTools.isEmpty(infile):
        E.warn("input is empty - no computation performed")
        P.touch(outfile)
        return

    statement = '''
           tomtom %(tomtom_options)s -oc %(tmpdir)s %(infile)s %(databases)s > %(outfile)s.log
    '''

    P.run()

    # copy over results
    try:
        os.makedirs(os.path.dirname(target_path))
    except OSError:
        # ignore "file exists" exception
        pass

    if os.path.exists(target_path):
        shutil.rmtree(target_path)
    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "tomtom.txt"), outfile)

Пример #18

0

Показать файл

Файл: pipeline_transcriptome.py Проект: BioinformaticsArchive/cgat

def makeSegments( infile, outfile ):
    '''compute intron overrun.'''

    to_cluster = True

    statement = '''gunzip < %(infile)s 
    | %(scriptsdir)s/gff_sort pos 
    | python %(scriptsdir)s/gff2histogram.py 
		--method=values 
		--output-filename-pattern="%(outfile)s.%%s"
		--force 
		--log=%(outfile)s.log 
    > %(outfile)s 
    '''
    P.run()

    statement = '''gunzip 
    < %(infile)s 
    | python %(scriptsdir)s/gtf2gtf.py --sort=position+gene
    | python %(scriptsdir)s/gtf2gtf.py --merge-transcripts 
    | python %(scriptsdir)s/gtf2gtf.py --sort=gene
    | python %(scriptsdir)s/gff2histogram.py 
		--method=values 
		--force 
		--output-filename-pattern="%(outfile)s_genes.%%s" 
		--log=%(outfile)s.log
    >> %(outfile)s'''
    P.run()

Пример #19

0

Показать файл

Файл: pipeline_transcriptome.py Проект: BioinformaticsArchive/cgat

def loadRepeatInformation( infiles, outfile ):
    '''load genome information.'''
    
    to_cluster = True

    table = outfile[:-len(".load")]

    repeatsfile, indexfile = infiles

    tmpfilename = P.getTempFilename( "." )

    statement = '''awk '{printf("%%s\\t0\\t%%i\\n", $1, $4)}' < %(indexfile)s > %(tmpfilename)s'''
    P.run()

    statement = '''
        gunzip < %(repeatsfile)s 
        | python %(scriptsdir)s/gff2bed.py -v 0 
        | coverageBed -a stdin -b %(tmpfilename)s
        | awk 'BEGIN { printf("contig\\tstart\\tend\\tnover_entries\\tnover_bases\\tlength\\tpover\\n" );} {print;}'
        |python %(scriptsdir)s/csv2db.py %(csv2db_options)s 
              --table=%(table)s 
        > %(outfile)s
    '''
    P.run()

    os.unlink( tmpfilename )

Пример #20

0

Показать файл

Файл: pipeline_rnaseqdiffexpression.py Проект: jmadzo/cgat

def buildTranscriptLevelReadCounts(infiles, outfile):
    '''count reads falling into transcripts of protein coding gene models.

    .. note::
       In paired-end data sets each mate will be counted. Thus
       the actual read counts are approximately twice the fragment
       counts.

    '''
    bamfile, geneset = infiles

    if BamTools.isPaired(bamfile):
        counter = 'readpair-counts'
    else:
        counter = 'read-counts'

    statement = '''
    zcat %(geneset)s
    | python %(scriptsdir)s/gtf2table.py
          --reporter=transcripts
          --bam-file=%(bamfile)s
          --counter=length
          --prefix="exons_"
          --counter=%(counter)s
          --prefix=""
          --counter=read-coverage
          --prefix=coverage_
          --min-mapping-quality=%(counting_min_mapping_quality)i
          --multi-mapping=ignore
          --log=%(outfile)s.log
    | gzip
    > %(outfile)s
    '''

    P.run()

Пример #21

0

Показать файл

Файл: pipeline_mapping_benchmark.py Проект: lesheng/cgat

def buildPicardAlignStats(infile, outfile):
    '''Gather BAM file alignment statistics using Picard '''
    to_cluster = USECLUSTER
    track = P.snip(os.path.basename(infile), ".bam")
    statement = '''CollectAlignmentSummaryMetrics INPUT=%(infile)s REFERENCE_SEQUENCE=%%(samtools_genome)s ASSUME_SORTED=true OUTPUT=%(outfile)s VALIDATION_STRINGENCY=SILENT ''' % locals(
    )
    P.run()

Пример #22

0

Показать файл

Файл: PipelineMotifs.py Проект: lesheng/cgat

def runBioProspector(infiles, outfile, dbhandle):
    '''run bioprospector for motif discovery.

    Bioprospector is run on only the top 10% of peaks.
    '''

    # bioprospector currently not working on the nodes
    to_cluster = False

    # only use new nodes, as /bin/csh is not installed
    # on the old ones.
    # job_options = "-l mem_free=8000M"

    tmpfasta = P.getTempFilename(".")
    track = outfile[:-len(".bioprospector")]
    nseq = writeSequencesForIntervals(track,
                                      tmpfasta,
                                      dbhandle,
                                      full=True,
                                      masker="dust",
                                      proportion=PARAMS["bioprospector_proportion"])

    if nseq == 0:
        E.warn("%s: no sequences - bioprospector skipped" % track)
        P.touch(outfile)
    else:
        statement = '''
    BioProspector -i %(tmpfasta)s %(bioprospector_options)s -o %(outfile)s > %(outfile)s.log
    '''
        P.run()

    os.unlink(tmpfasta)

Пример #23

0

Показать файл

Файл: pipeline_rnaseqdiffexpression.py Проект: jmadzo/cgat

def buildGeneLevelReadCounts(infiles, outfile):
    '''compute read counts and coverage of exons with reads.
    '''

    bamfile, exons = infiles

    if BamTools.isPaired(bamfile):
        counter = 'readpair-counts'
    else:
        counter = 'read-counts'

    # ignore multi-mapping reads
    statement = '''
    zcat %(exons)s
    | python %(scriptsdir)s/gtf2table.py
          --reporter=genes
          --bam-file=%(bamfile)s
          --counter=length
          --prefix="exons_"
          --counter=%(counter)s
          --prefix=""
          --counter=read-coverage
          --prefix=coverage_
          --min-mapping-quality=%(counting_min_mapping_quality)i
          --multi-mapping=ignore
          --log=%(outfile)s.log
    | gzip
    > %(outfile)s
    '''

    P.run()

Пример #24

0

Показать файл

Файл: pipeline_rnaseqlncrna.py Проект: Charlie-George/cgat

def renameTranscriptsInPreviousSets(infile, outfile):
    '''
    transcripts need to be renamed because they may use the same
    cufflinks identifiers as we use in the analysis - don't do if they
    have an ensembl id - sort by transcript
    '''
    inf = IOTools.openFile(infile)
    for gtf in GTF.iterator(inf):
        if gtf.gene_id.find("ENSG") != -1:
            statement = '''zcat %(infile)s | grep -v "#"
                        | python %(scriptsdir)s/gtf2gtf.py 
                        --sort=gene
                        --log=%(outfile)s.log
                        | gzip > %(outfile)s'''
        else:
            gene_pattern = "GEN" + P.snip(outfile, ".gtf.gz")
            transcript_pattern = gene_pattern.replace("GEN", "TRAN")
            statement = '''zcat %(infile)s | python %(scriptsdir)s/gtf2gtf.py 
                           --renumber-genes=%(gene_pattern)s%%i 
                           | python %(scriptsdir)s/gtf2gtf.py
                           --renumber-transcripts=%(transcript_pattern)s%%i 
                           | python %(scriptsdir)s/gtf2gtf.py
                           --sort=gene 
                           --log=%(outfile)s.log
                          | gzip > %(outfile)s'''

    P.run()

Пример #25

0

Показать файл

Файл: pipeline_metagenomeassembly.py Проект: Charlie-George/cgat

def runSpades(infile, outfile):
    '''
    run spades on each track
    '''
    job_options = " -l mem_free=30G"
    statement = PipelineMetagenomeAssembly.Spades().build(infile)
    P.run()

Пример #26

0

Показать файл

Файл: pipeline_rnaseqlncrna.py Проект: Charlie-George/cgat

def loadLncRNAClass(infile, outfile):
    '''
    load the lncRNA classifications
    '''
    tablename = os.path.basename(
        filenameToTablename(P.snip(infile, ".gtf.gz")))

    to_cluster = False
    # just load each transcript with its classification
    temp = P.getTempFile()
    inf = IOTools.openFile(infile)
    for transcript in GTF.transcript_iterator(GTF.iterator(inf)):
        temp.write("%s\t%s\t%s\n" % (
            transcript[0].transcript_id, 
            transcript[0].gene_id, 
            transcript[0].source))
    temp.close()

    inf_1 = temp.name
    statement = ("python %(scriptsdir)s/csv2db.py"
                 "  -t %(tablename)s"
                 "  --log=%(outfile)s.log"
                 "  --header=transcript_id,gene_id,class"
                 " < %(inf_1)s > %(outfile)s")
    P.run()

Пример #27

0

Показать файл

Файл: pipeline_metagenomeassembly.py Проект: Charlie-George/cgat

def runSoapdenovo(infile, outfile):
    '''
    run soapdenovo
    '''
    job_options = "-l mem_free=30G"
    statement = PipelineMetagenomeAssembly.SoapDenovo2().build(infile)
    P.run()

Пример #28

0

Показать файл

Файл: pipeline_metagenomeassembly.py Проект: Charlie-George/cgat

def runIdba(infile, outfile):
    '''
    run idba on each track
    '''
    job_options = " -l mem_free=30G"
    statement = PipelineMetagenomeAssembly.Idba().build(infile)
    P.run()

Пример #29

0

Показать файл

Файл: pipeline_transcriptome.py Проект: BioinformaticsArchive/cgat

def buildAnnotations( infiles, outfile ):
    '''annotate transcripts by location (intergenic, intronic, ...)'''
    
    infile, annotation = infiles

    statement = '''gunzip 
    < %(infile)s 
    | python %(scriptsdir)s/gtf2gtf.py --sort=gene
    | %(cmd-farm)s --split-at-column=1 --output-header --log=%(outfile)s.log --max-files=60 
	"python %(scriptsdir)s/gtf2table.py 
		--counter=position 
		--counter=classifier 
		--section=exons 
		--section=introns 
		--counter=length 
		--counter=splice 
		--counter=composition-na 
		--counter=splice-comparison 
		--log=%(outfile)s.log 
                --filename-format=gff
		--filename-gff=%(annotation)s 
		--genome-file=%(genome_dir)s/%(genome)s"
    | gzip
    > %(outfile)s
    '''
    P.run()

Пример #30

0

Показать файл

Файл: pipeline_polyphen.py Проект: jmadzo/cgat

def buildBenchmarkInput(infile, outfile):

    tmpfile = P.getTempFile()

    dbhandle = sqlite3.connect(PARAMS["database"])
    cc = dbhandle.cursor()
    statement = '''
    SELECT DISTINCT transcript_id, protein_id FROM peptide_info
    '''
    cc.execute(statement)
    tmpfile.write("transcript_id\tprotein_id\n")
    tmpfile.write("\n".join(["\t".join(x) for x in cc]))
    tmpfile.write("\n")
    tmpfilename = tmpfile.name

    statement = '''
    perl %(scriptsdir)s/extract_fasta.pl %(infile)s
    < cds.fasta 
    python %(scripstdir)s/fasta2variants.py --is-cds  
    | python %(scriptsdir)s/substitute_tokens.py 
             --apply=%(tmpfilename)s
    > %(outfile)s
    '''
    P.run()

    os.unlink(tmpfilename)

Пример #31

0

Показать файл

Файл: pipeline_rnaseqlncrna.py Проект: santayana/cgat

def buildFullGeneSet(infiles, outfile):
    '''
    produces a final gene set that can be used for 
    differential expression analysis and comparisons
    between protein coding and lncRNA transcripts
    '''
    # change the source to be in keeping with classification
    # of transcripts - f coming from cufflinks assembly
    infs = " ".join(infiles)
    statement = ("zcat %(infs)s |"
                 " sed 's/Cufflinks/protein_coding/g' |"
                 " python %(scriptsdir)s/gtf2gtf.py"
                 "  --sort=gene"
                 "  --log=%(outfile)s.log |"
                 " gzip  > %(outfile)s")
    P.run()

Пример #32

0

Показать файл

def lowerStringencyDeNovos(infiles, outfile):
    '''Filter lower stringency de novo variants based on provided jexl expression'''
    to_cluster = USECLUSTER
    infile, pedfile = infiles
    pedigree = csv.DictReader(
        open(pedfile),
        delimiter='\t',
        fieldnames=['family', 'sample', 'father', 'mother', 'sex', 'status'])
    for row in pedigree:
        if row['status'] == '2':
            father = row['father']
            mother = row['mother']
            child = row['sample']
    statement = '''GenomeAnalysisTK -T SelectVariants -R %%(bwa_index_dir)s/%%(genome)s.fa --variant %(infile)s -select 'vc.getGenotype("%(child)s").getPL().1==0&&vc.getGenotype("%(father)s").getPL().0==0&&vc.getGenotype("%(mother)s").getPL().0==0&&(SNPEFF_IMPACT=="HIGH"||SNPEFF_IMPACT=="MODERATE")' > %(outfile)s''' % locals(
    )
    P.run()

Пример #33

0

Показать файл

Файл: pipeline_metagenomebenchmark.py Проект: lesheng/cgat

def alignContigsToReference(infile, outfile, param):
    '''
    align the contigs to the reference genomes
    using nucmer
    '''
    print infile, param

    to_cluster = True

    reffile, contigfile = infile, param
    pattern = P.snip(os.path.basename(outfile), ".delta")
    statement = '''nucmer -p %(pattern)s %(reffile)s %(contigfile)s'''
    P.run()
    outf = os.path.basename(outfile)
    statement = '''mv %(outf)s alignment.dir'''
    P.run()

Пример #34

0

Показать файл

Файл: pipeline_exome.py Проект: yangjl/cgat

def filterVariants(infiles, outfile):
    '''Filter variants based on provided jexl expression'''
    to_cluster = USECLUSTER
    infile, pedfile = infiles
    pedigree = csv.DictReader(
        open("%(pedfile)s"),
        delimiter='\t',
        fieldnames=['family', 'sample', 'father', 'mother', 'sex', 'status'])
    for row in pedigree:
        if row['status'] == '2':
            father = row['father']
            mother = row['mother']
            child = row['sample']
    statement = '''GenomeAnalysisTK -T SelectVariants -R %%(bwa_index_dir)s/%%(genome)s.fa --variant %(infile)s -select 'vc.getGenotype("%(father)s").getDP()>=10&&vc.getGenotype("%(mother)s").getDP()>=10&&vc.getGenotype("%(father)s").getAB()<0.05&&vc.getGenotype("%(mother)s").getAB()<0.05&&vc.getGenotype("%(child)s").getAB()>=0.25&&vc.getGenotype("%(child)s").getPL().0>20&&vc.getGenotype("%(child)s").getPL().1==0&&vc.getGenotype("%(child)s").getPL().2>0&&vc.getGenotype("%(father)s").getPL().0==0&&vc.getGenotype("%(father)s").getPL().1>20&&vc.getGenotype("%(father)s").getPL().2>20&&vc.getGenotype("%(mother)s").getPL().0==0&&vc.getGenotype("%(mother)s").getPL().1>20&&vc.getGenotype("%(mother)s").getPL().2>20&&vc.getGenotype("%(child)s").getAD().1>=3' > %(outfile)s''' % locals(
    )
    P.run()

Пример #35

0

Показать файл

Файл: pipeline_transcriptome.py Проект: lesheng/cgat

def loadOverlap(infile, outfile):
    '''load results of overlap computation.'''

    tablename = outfile[:-len("_table.load")]
    statement = '''
	grep -v "\\bna\\b" 
        < %(infile)s 
        |python %(scriptsdir)s/csv2db.py %(csv2db_options)s
             --map set1:str 
             --map set2:str 
             --index=set1 
             --index=set2 
             --table=%(tablename)s
        > %(outfile)s
    '''
    P.run()

Пример #36

0

Показать файл

Файл: pipeline_transcriptome.py Проект: lesheng/cgat

def runFrameFinder(infile, outfile):
    '''run FrameFinder

    search on both strands (-r TRUE). Note that CPC default is: only forward strand.

    '''
    cpc_dir = "/ifs/apps/bio/cpc-0.9-r2"
    statement = '''
    cat %(infile)s |
    %(cpc_dir)s/libs/estate/bin/framefinder
    -r TRUE -w %(cpc_dir)s/data/framefinder.model /dev/stdin
    | gzip
     > %(outfile)s
    '''

    P.run()

Пример #37

0

Показать файл

def buildCodingExons( infile, outfile ):
    '''build a collection of transcripts from the protein-coding portion of the ENSEMBL gene set.

    All exons are kept
    '''

    to_cluster = True

    statement = '''
    gunzip < %(infile)s 
    | awk '$2 == "protein_coding"' 
    | awk '$3 == "exon"' 
    | python %(scriptsdir)s/gtf2gtf.py --remove-duplicates=gene --log=%(outfile)s.log 
    | gzip > %(outfile)s
    '''
    P.run()

Пример #38

0

Показать файл

Файл: pipeline_transcriptome.py Проект: lesheng/cgat

def makeDistances(infiles, outfile):
    '''compute intron overrun.'''

    infile, annotation = infiles

    statement = '''gunzip
    < %(infile)s 
    | python %(scriptsdir)s/gtf2gtf.py --sort=gene
    | %(cmd-farm)s --split-at-column=1 --output-header --log=%(outfile)s.log --max-files=60 
	"python %(scriptsdir)s/gtf2table.py 
		--counter=distance-genes 
		--log=%(outfile)s.log 
		--filename-gff=<( gunzip < %(annotation)s ) " 
    > %(outfile)s 
    '''
    P.run()

Пример #39

0

Показать файл

Файл: pipeline_benchmark_rnaseqmappers.py Проект: lesheng/cgat

def buildCodingGeneSet(infile, outfile):
    '''build a gene set with only protein coding 
    transcripts.

    Genes are selected via their gene biotype in the GTF file.
    Note that this set will contain all transcripts of protein
    coding genes, including processed transcripts.

    This set includes UTR and CDS.
    '''

    to_cluster = True
    statement = '''
    zcat %(infile)s | awk '$2 == "protein_coding"' | gzip > %(outfile)s
    '''
    P.run()

Пример #40

0

Показать файл

def convertBed2Psl(infile, outfile):
    """convert a bed to a psl file."""

    track = outfile[:-len(".bed.gz")]
    genomefile = os.path.join(PARAMS["genome_dir"],
                              PARAMS["%s_genome" % track])
    if not os.path.exists(genomefile + ".fasta"):
        raise IOError("genome %s does not exist" % genomefile)

    statement = """gunzip < %(infile)s 
    | python %(scriptsdir)s/bed2psl.py 
         --genome=%(genomefile)s
         --log=%(outfile)s.log 
    | gzip > %(outfile)s
    """
    P.run()

Пример #41

0

Показать файл

Файл: pipeline_vitaminD_annotator.py Проект: logust79/cgat-apps

def buildAnnotatorSegments(tmpdir, infile, outfile):
    '''convert segments in bed format to annotator format
    from infile to outfile.
    '''

    tmpsegments = os.path.join(tmpdir, "segments")
    to_cluster = True

    statement = '''
        python %(scriptsdir)s/bed2gff.py < %(infile)s |\
	python %(scriptsdir)s/gff2annotator.py --log=%(outfile)s.log --section=segments > %(tmpsegments)s \
    '''

    P.run(**dict(locals().items() + PARAMS.items()))

    return tmpsegments

Пример #42

0

Показать файл

Файл: pipeline_medip.py Проект: lesheng/cgat

def mergeDMRWindows(infile, outfile):
    '''merge overlapping windows.'''

    to_cluster = True

    statement = '''
    zcat %(infile)s
    | python %(scriptsdir)s/medip_merge_intervals.py
          --log=%(outfile)s.log
          --invert
          --output-filename-pattern=%(outfile)s.%%s.bed.gz
    | gzip
    > %(outfile)s
    '''

    P.run()

Пример #43

0

Показать файл

def reportTotalRNAFunctions(infiles, outfiles):
    '''report total RNA functions.'''

    to_cluster = USECLUSTER

    rpkm_filename, annotations_filename = infiles
    expression_filename, diff_filename = outfiles
    statement = '''
    python %(rmaadir)s/report_totalRNA_annotations.py 
           %(rpkm_filename)s 
           %(annotations_filename)s 
           %(expression_filename)s 
           %(diff_filename)s
    '''

    P.run()

Пример #44

0

Показать файл

def copyEnsemblDb(infile, outfile):
    '''copy tables from ensembl database to rnaseq database'''
    table_list = P.asList(PARAMS["ensembl_tables"])
    dbhandle = sqlite3.connect(PARAMS["database"])
    cc = dbhandle.cursor()
    query = """ATTACH "%s" as ensembl;""" % PARAMS["ensembl_db"]
    cc.execute(query)
    for table in table_list:
        cc = dbhandle.cursor()
        query = """CREATE TABLE %s AS SELECT * FROM ensembl.%s;""" % (table,
                                                                      table)
        print query
        cc.execute(query)
    cc.close()
    statement = """touch %(outfile)s;"""
    P.run()

Пример #45

0

Показать файл

def buildGeneTables(infile, outfile):
    '''
    build gene tables
    '''
    if infile.endswith(".gff.gz"):
        outf = gzip.open(outfile, "w")
        outf.write(
            "chr\tsource\tfeature\tstart\tend\tscore\tstrand\tframe\tattributes\n")
        for line in gzip.open(infile).readlines():
            outf.write(line)
        outf.close()
    else:
        statement = '''zcat %(infile)s | python %(scriptsdir)s/fasta2table.py
        -s sequence
        --log=%(outfile)s.log | gzip > %(outfile)s'''
        P.run()

Пример #46

0

Показать файл

Файл: pipeline_vitaminDMHC.py Проект: logust79/cgat-apps

def runMACS( infile, outfile ):

    to_cluster = False

    track = infile[:-len("normbam")]
    try:
        control = pipeline_vitaminD.getControl( track ) + ".bam"
    except AssertionError:
        return

    statement = '''
    macs -t %(infile)s -c %(control)s \
          --name=%(outfile)s \
          --format=bam --tsize=35 --bw=110 --mfold=8 --gsize=6000000 >& %(outfile)s''' 

    P.run( **dict( locals().items() + PARAMS.items() ) )

Пример #47

0

Показать файл

def assignEssentialGenesToContigs(infile, outfile):
    '''
    assign essential genes to contigs
    '''
    dirname = os.path.dirname(infile)
    essential = PARAMS["hmmer_hmm"]
    tempdir = P.getTempDir(".")

    statement = '''zcat %(infile)s > %(tempdir)s/orfs.fa;
    hmmsearch --tblout %(tempdir)s/hmm.out --cut_tc
    --notextw  %(essential)s %(tempdir)s/orfs.fa;
    tail -n+4 %(tempdir)s/hmm.out | sed 's/ * / /g' | cut -f 1,4 -d " "
    | gzip > %(outfile)s'''
    P.run()
    statement = '''rm -rf %(tempdir)s'''
    P.run()

Пример #48

0

Показать файл

Файл: pipeline_ancestral_repeats.py Проект: santayana/cgat

    def buildGenomeAlignment(infile, outfile):
        '''remove non-unique alignments in genomic infile.'''

        statement = '''gunzip < %(infile)s 
        | sort -k10,10 -k12,12n
        | python %(scriptsdir)s/psl2psl.py
        --method=remove-overlapping-query
        --log=%(outfile)s.log
        | sort -k14,14 -k16,16n
        | python %(scriptsdir)s/psl2psl.py
        --method=remove-overlapping-target
        --log=%(outfile)s.log
        | gzip
        >> %(outfile)s
        '''
        P.run()

Пример #49

0

Показать файл

def loadPolyphenMap(infile, outfile):
    '''load polyphen input data.'''

    table = P.toTable(outfile)
    statement = '''
   python %(scriptsdir)s/csv2db.py %(csv2db_options)s
              --index=snp_id 
              --index=track,transcript_id
              --index=contig,pos
              --index=protein_id
              --index=transcript_id
              --table=%(table)s 
    < %(infile)s.map
    > %(outfile)s
    '''
    P.run()

Пример #50

0

Показать файл

Файл: pipeline_cancer_variant_calling.py Проект: TomSmithCGAT/Project38

def createRealignIntervals(infiles, outfile):

    infile, reference = infiles

# need to unload java before runnning GATK as it now runs on java version 7

    statement = '''module unload apps/java/jre1.6.0_26;
    java -Xmx4g -jar
    /ifs/apps/bio/GATK-2.7-2/GenomeAnalysisTK.jar
    -T RealignerTargetCreator
    -R %(reference)s
    -I %(infile)s
    -o %(outfile)s
    ''' % locals()

    P.run()

Пример #51

0

Показать файл

Файл: pipeline_benchmark_rnaseqmappers.py Проект: santayana/cgat

def mapReadsWithBowtieAgainstTranscriptome(infiles, outfile):
    '''map reads from short read archive sequence using bowtie against
    transcriptome data.
    '''

    # Mapping will permit up to one mismatches. This is sufficient
    # as the downstream filter in bams2bam requires the
    # number of mismatches less than the genomic number of mismatches.
    # Change this, if the number of permitted mismatches for the genome
    # increases.

    # Output all valid matches in the best stratum. This will
    # inflate the file sizes due to matches to alternative transcripts
    # but otherwise matches to paralogs will be missed (and such
    # reads would be filtered out).
    job_options = "-l mem_free=16G"
    job_threads = PARAMS["bowtie_threads"]

    tmpfile = P.getTempFilename()

    infile, reffile, contigs = infiles
    track = P.snip(outfile, ".bam")
    prefix = P.snip(reffile, ".fa")

    statement = '''
    gunzip < %(infile)s > %(tmpfile)s;
    checkpoint;
    bowtie -q
           --sam 
           -C
           --un /dev/null
           --threads %(bowtie_threads)s
           %(transcriptome_options)s 
           --best --strata -a
           %(prefix)s_cs
           %(tmpfile)s
    | python %(scriptsdir)s/bam2bam.py --sam --set-nh --log=%(outfile)s.log
    | perl -p -e "if (/^\\@HD/) { s/\\bSO:\S+/\\bSO:coordinate/}"  
    | samtools import %(contigs)s - -
    | samtools sort - %(track)s;
    checkpoint;
    samtools index %(outfile)s
    checkpoint;
    rm -f %(tmpfile)s
    '''

    P.run()

Пример #52

0

Показать файл

def buildCDNAFasta( infile, outfile ):
    '''load ENSEMBL cdna FASTA file
    
    *infile* is an ENSEMBL cdna file.
    '''
    dbname = outfile[:-len(".fasta")]

    statement = '''gunzip 
    < %(infile)s
    | perl -p -e 'if ("^>") { s/ .*//};'
    | python %(scriptsdir)s/index_fasta.py
       --force
    %(dbname)s - 
    > %(dbname)s.log
    '''

    P.run()

Пример #53

0

Показать файл

Файл: pipeline_medip.py Проект: lesheng/cgat

def buildTileStats(infile, outfile):
    '''compute tiling window size statistics from bed file.'''

    use_cluster = True

    statement = '''
    zcat %(infile)s
    | python %(scriptsdir)s/gff2histogram.py 
                   --force
                   --format=bed 
                   --data=size
                   --method=hist
                   --method=stats
                   --output-filename-pattern=%(outfile)s.%%s.tsv
    > %(outfile)s
    '''
    P.run()

Пример #54

0

Показать файл

Файл: pipeline_transcriptome.py Проект: lesheng/cgat

def exportSequences(infile, outfile):
    '''collect sequences from a gtf file.'''

    prefix = outfile[:-len(".fasta")]

    to_cluster = True
    statement = '''gunzip 
        < %(infile)s
        | python %(scriptsdir)s/gtf2gtf.py --sort=gene
	| python %(scriptsdir)s/gff2fasta.py 
		--is-gtf 
		--genome-file=%(genome_dir)s/%(genome)s
		--log=%(outfile)s.log 
	| python %(toolsdir)s/index_fasta.py --force %(prefix)s - 
        > %(outfile)s.log'''

    P.run()

Пример #55

0

Показать файл

def buildPeptideFasta( infile, outfile ):
    '''create ENSEMBL peptide file

    *infile* is an ENSEMBL .pep.all.fa.gz file.
    '''
    dbname = outfile[:-len(".fasta")]

    statement = '''gunzip 
    < %(infile)s
    | perl -p -e 'if ("^>") { s/ .*//};'
    | python %(scriptsdir)s/index_fasta.py
       --force
    %(dbname)s - 
    > %(dbname)s.log
    '''

    P.run()

Пример #56

0

Показать файл

def annotateVariantsSNPsift(infile, outfile):
    '''Add annotations using SNPsift'''
    to_cluster = USECLUSTER
    job_options = "-pe dedicated 4 -R y -l mem_free=6G"
    track = P.snip(os.path.basename(infile), ".vqsr.vcf")
    dbNSFP = PARAMS["annotation_snpsift_dbnsfp"]
    # The following statement is not fully implemented yet
    #    statement = '''SnpSift.sh geneSets -v /ifs/projects/proj016/data/1000Genomes/msigdb.v4.0.symbols.gmt %(infile)s > variants/%(track)s_temp1.vcf; checkpoint;''' % locals()

    statement = '''SnpSift.sh dbnsfp -v %(dbNSFP)s %(infile)s
    > variants/%(track)s_temp1.vcf; checkpoint;''' % locals()

    statement += '''SnpSift.sh annotate /ifs/projects/proj016/data/1000Genomes/00-All.vcf
    variants/%(track)s_temp1.vcf > %(outfile)s ;''' % locals()
    #    statement += '''rm -f variants/*temp*vcf;'''

    P.run()

Пример #57

0

Показать файл

def loadPolyphen(infile, outfile):
    '''load polyphen results.'''

    table = P.toTable(outfile)

    statement = '''
    gunzip 
    < %(infile)s
    | perl -p -e "s/o_acc/protein_id/; s/ +//g; s/^#//;"
    |python %(scriptsdir)s/csv2db.py %(csv2db_options)s
              --index=snp_id 
              --index=protein_id
              --table=%(table)s 
              --map=effect:str
    > %(outfile)s
    '''
    P.run()

Пример #58

0

Показать файл

def mergeGeneLists(infiles, outfile):
    '''Merge gene lists into single table and load into SQLite.'''

    tablename = P.toTable(outfile)
    species_list = P.asList(PARAMS["species"])
    anno_list = P.asList(PARAMS["annotations_db"])
    species_lookup = dict(zip(species_list, anno_list))

    # Connect to database and attach annotation databases
    dbhandle = sqlite3.connect(PARAMS["database"])
    for species in species_lookup.iterkeys():
        species_db = species_lookup[species]
        #species_db = anno_base + species_genome + "/" + db_name
        cc = dbhandle.cursor()
        statement = '''ATTACH DATABASE '%(species_db)s' as %(species)s''' % locals(
        )
        print statement
        cc.execute(statement)
        cc.close()

    # Build union statement
    pre = "CREATE TABLE %s AS " % tablename
    statement = ""
    for f in infiles:
        track = P.snip(os.path.basename(f),
                       ".genelist.load").replace("-", "_").replace(".", "_")
        species = track[:2]
        genelist_id = PARAMS["genelist_id"]
        statement += pre + '''SELECT distinct t.gene_id, t.gene_name, "%(species)s" AS species
                       FROM %(track)s_genelist g, %(species)s.transcript_info t
                       WHERE g.gene_id=t.%(genelist_id)s and t.gene_biotype='protein_coding' ''' % locals(
        )
        pre = " UNION "

    print statement
    cc = dbhandle.cursor()
    cc.execute("DROP TABLE IF EXISTS %(tablename)s" % locals())
    cc.execute(statement)
    cc.execute('''CREATE INDEX "glm_idx1" ON "%s" ("gene_id" ASC) ''' %
               tablename)
    cc.execute('''CREATE INDEX "glm_idx2" ON "%s" ("species" ASC) ''' %
               tablename)
    cc.close()

    statement = "touch %s" % outfile
    P.run()

Пример #59

0

Показать файл

def loadRepeatsRates(infile, outfile):
    '''load repeat overlap'''

    table = outfile[:-len(".load")]

    statement = '''gunzip 
    < %(infile)s 
    | awk '$4 > 0'
    | python %(toolsdir)s/csv_cut.py --remove exons_lengths exons_values
    |python %(scriptsdir)s/csv2db.py %(csv2db_options)s 
              --index=gene_id 
              --map=gene_id:str 
              --table=%(table)s 
              --allow-empty
    > %(outfile)s'''

    P.run()

Пример #60

0

Показать файл

def loadSegments(infile, outfile):
    '''load segments'''

    table = outfile[:-len(".load")]

    for x in (".distances", ".sizes", ".overlaps", "_genes.distances",
              "_genes.sizes", "_genes.overlaps"):
        y = re.sub("\.", "_", x)
        statement = '''
        python %(scriptsdir)s/csv2db.py %(csv2db_options)s 
        --index=gene_id 
        --map=gene_id:str 
        --table=%(table)s%(y)s 
        < %(infile)s%(x)s
        >> %(outfile)s'''

        P.run()

Python Pipeline.run примеры использования