Example No. 1
def main():
    if len(sys.argv) < 4:
        print >> sys.stderr, \
                'Usage: assemgraph2.py <query file> <target file>' + \
                ' <blast9 file> [min score=200]'
        raise SystemExit
    try:
        # optional fourth argument, per the usage string above
        MIN_SCORE = float(sys.argv[4])
    except IndexError:
        MIN_SCORE = 200.0

    print >> sys.stderr, 'Reading sequence databases...'
    queries = seqdb.SequenceFileDB(sys.argv[1])
    targets = seqdb.SequenceFileDB(sys.argv[2])
    print >> sys.stderr, len(queries), len(targets)
    try:
        align_file = open(sys.argv[3])
    except IOError as e:
        print >> sys.stderr, 'Error: check alignment file.'
        raise e

    print >> sys.stderr, 'Constructing alignment graphs...'
    graph = nx.Graph()
    for c, (query, target) in enumerate(parse_alignments(align_file)):
        graph.add_edge(query, target)

        if c % 100 == 0:
            print >> sys.stderr, '...', c

    # nx.draw(graph)
    # plt.show()
    # print graph.nodes()
    logfile = open('assemgraph.log', 'w')
    visited_nodes = set()
    cluster_no = 0
    for node in graph.nodes():
        if node not in visited_nodes:
            filename1 = 'cluster_%d_targets' % cluster_no
            filename2 = 'cluster_%d_queries' % cluster_no
            ofile1 = open(filename1, 'w')
            ofile2 = open(filename2, 'w')
            print >> sys.stderr, \
                    'Writing cluster %d to a file...' % cluster_no
            vnodes, max_length = (write_sequence(node, graph, targets, queries,
                                                 ofile1, ofile2))
            visited_nodes.update(vnodes)
            for n in vnodes:
                size = len(targets[n]) if n in targets else len(queries[n])
                print >> logfile, 'cluster_%d\t%s\t%d' % (cluster_no, n, size)
            ofile1.close()
            ofile2.close()
            print >> sys.stderr, 'total nodes = %d' % len(vnodes)

            cluster_no += 1

    print >> logfile, '***finished***'
    logfile.close()
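parse_alignments is used above but not defined in this snippet. A minimal sketch of what it could look like, assuming the blast9 file is standard 12-column BLAST tabular output (-m 9) with '#' comment lines and that the optional score threshold filters on the bit score (the real helper may differ):

def parse_alignments(align_file, min_score=200.0):
    '''Yield (query id, target id) pairs from BLAST -m 9 tabular output.'''
    for line in align_file:
        if line.startswith('#'):
            continue  # skip the comment blocks that -m 9 emits
        cols = line.rstrip('\n').split('\t')
        # standard tabular columns: query, subject, %id, length, mismatches,
        # gap opens, q.start, q.end, s.start, s.end, e-value, bit score
        if float(cols[11]) >= min_score:
            yield cols[0], cols[1]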
Example No. 2
def main():
    try:
        input_file = sys.argv[1]
        fasta_file = sys.argv[2]
    except IndexError:
        print >> sys.stderr, \
            'Usage: unmapped_seq2.py <text file> <fasta file> [min length=50]'
        raise SystemExit
    try:
        min_length = int(sys.argv[3])
    except IndexError:
        min_length = 50

    db = seqdb.SequenceFileDB(fasta_file)

    input_sequences = set()
    print >> sys.stderr, 'Reading sequences...',
    for line in open(input_file):
        input_sequences.add(line.strip())
    print >> sys.stderr, 'total %d' % len(input_sequences)

    print >> sys.stderr, 'Writing unmapped sequences...'
    for seq in db:
        sequence = db[seq]
        if (seq not in input_sequences and len(sequence) >= min_length):
            print >> sys.stderr, 'Writing %s, %d bp' % (seq, len(sequence))
            sequtil.write_fasta(sys.stdout, str(sequence), id=seq)
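A typical invocation of this script (file names are hypothetical) writes every sequence in the FASTA file that is at least 100 bp long and whose id does not appear in the text file:

python unmapped_seq2.py mapped_ids.txt reads.fa 100 > unmapped.fa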
Example No. 3
def pygr_slice(fname):
    fasta = seqdb.SequenceFileDB(fname)
    for rec in fasta:
        seq = fasta[rec]
        # repeatedly take a short slice of the first record only
        # (NSLICE is assumed to be a module-level constant)
        for i in range(NSLICE):
            sub = str(seq[:100])
        break
Example No. 4
def parse_motif():
	peak_dict = count_peak()
	genome = seqdb.SequenceFileDB('/u/home/f/frankwoe/nobackup/hg19/hg19.noRand.fa')
	
	# motifs include: GGACT, RRACT, DRACH
	motif_list = ["(GGACT)", "([GA][GA]ACT)", "([AGT][AG]AC[ACT])"]
	
	with gzip.GzipFile('peak_sum.tsv.gz', 'wb') as f:
		# header line
		f.write("\t".join( [ 'peak', 'exp_num', 'exp', 'occurence_num', 'occurence', 'fc', 'GGACT,RRACT,DRACH', 'seq' ] ) + '\n')
		for peak in peak_dict:
			chrom, start, end, strand = peak.split(':')
			peak_seq = _fetch_seq(genome, chrom, start, end, strand)
			occurence = peak_dict[peak].keys()
			occured_exp = list(set([x.split('.')[1] for x in occurence ]))
			fc = [ str(peak_dict[peak][x]) for x in occurence ]
			motif_count = _match_motif(peak_seq, motif_list)
			
			f.write('\t'.join( [
				peak, 
				str(len(occured_exp)),
				','.join(occured_exp), 
				str(len(occurence)),
				','.join(occurence), 
				','.join(fc), 
				','.join(motif_count), 
				peak_seq ]) + '\n' )
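_fetch_seq and _match_motif are not shown in this example. Minimal sketches consistent with how they are called above (assumptions: _fetch_seq returns the peak sequence as an upper-case string, taking the reverse complement on the minus strand; _match_motif returns one count per motif pattern, as strings):

import re

def _fetch_seq(genome, chrom, start, end, strand):
    # slice the pygr sequence object; negation gives the reverse complement
    seq = genome[chrom][int(start):int(end)]
    if strand == '-':
        seq = -seq
    return str(seq).upper()

def _match_motif(seq, motif_list):
    # count non-overlapping matches of each motif regex in the peak sequence
    return [str(len(re.findall(pattern, seq))) for pattern in motif_list]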
Example No. 5
def pygr_reverse_comp(fname):
    fasta = seqdb.SequenceFileDB(fname)
    keys = fasta.keys()
    keys.sort()
    for rec in keys:
        # force full reverse complement
        seq = str(-fasta[rec])
        sub = seq[:10]
Example No. 6
    def setUp(self):

        genomeFile = '/Users/Likit/projects/mdv/data/chick.fa'
        self.genome = seqdb.SequenceFileDB(genomeFile, verbose=False)

        exons = [('chr1', 51035309, 51035430), ('chr1', 51062489, 51062516)]

        self.isoform = genebuilder.Isoform('chr1', '1', '0', exons, self.genome)
Example No. 7
def main():
    '''Main function'''

    genome = seqdb.SequenceFileDB(sys.argv[1])  # genome sequence
    try:
        infile = open(sys.argv[2])  # splice sites file from gimme/compare_junction.py
    except (IndexError, IOError):
        # no input file given or cannot open the file,
        # read data from stdin instead
        infile = sys.stdin
Example No. 8
def bed2pygr(dbprefix, referencefile, bedfile, indir):

    collision_counter = defaultdict(int)
    chrdb = seqdb.SequenceFileDB(referencefile)
    annodb = annotation.AnnotationDB({}, chrdb)

    al = cnestedlist.NLMSA(dbprefix, 'w', pairwiseMode=True)

    load_bed(al, annodb, bedfile, collision_counter)

    al.build(saveSeqDict=True)

    genomeprefix = os.path.basename(referencefile).rsplit('.', 1)[0]
    print >> open(dbprefix + '.genome', 'w'), genomeprefix
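load_bed is defined elsewhere; a rough sketch of how BED intervals might be turned into annotations and added to the NLMSA, assuming the annotation database accepts (id, start, stop, orientation) slice tuples (as in Example No. 9) and that collision_counter is used to disambiguate duplicate names:

def load_bed(al, annodb, bedfile, collision_counter):
    '''Hypothetical sketch: load BED intervals as annotations into the NLMSA.'''
    for line in open(bedfile):
        if line.startswith(('track', 'browser', '#')):
            continue  # skip header lines
        cols = line.rstrip('\n').split('\t')
        chrom, start, end = cols[0], int(cols[1]), int(cols[2])
        name = cols[3] if len(cols) > 3 else '%s:%d-%d' % (chrom, start, end)
        orientation = -1 if len(cols) > 5 and cols[5] == '-' else 1
        collision_counter[name] += 1
        if collision_counter[name] > 1:
            # make duplicated BED names unique
            name = '%s.%d' % (name, collision_counter[name])
        annot = annodb.new_annotation(name, (chrom, start, end, orientation))
        al.addAnnotation(annot)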
Example No. 9
def read_genbank_annots(gbfile, fastafile=None, featureType='CDS',
                        geneQualifier='gene'):
    '''construct annotation DB for gene CDS intervals.
    NB: this assumes each gene consists of ONE interval.
    This cannot be used for multi-exon genes!'''
    try:
        gbparse = SeqIO.parse(gbfile, 'genbank')
    except TypeError: # SeqIO changed its interface?
        ifile = open(gbfile)
        try:
            gbparse = SeqIO.parse(ifile, 'genbank')
            gbseqs = list(gbparse)
        finally:
            ifile.close()
    else:
        gbseqs = list(gbparse)
    if fastafile is None:
        fastafile = gbfile.split('.')[0] + '.fna'
    genome = seqdb.SequenceFileDB(fastafile)
    genomeIndex = blast.BlastIDIndex(genome) # handle NCBI ID blobs properly
    annodb = annotation.AnnotationDB({}, genome,
                                     sliceAttrDict=dict(id=0, start=1, stop=2,
                                                        orientation=3))
    i = 0
    for s in gbseqs:
        seqID = genomeIndex[s.id].id # find the right seq and get its actual ID
        for f in s.features:
            if f.type == featureType:
                try:
                    name = f.qualifiers[geneQualifier][0]
                except KeyError: # keep the annotation even if label missing
                    warnings.warn('Missing gene qualifier "%s" on %s annotation'
                                  % (geneQualifier, featureType))
                    name = 'unlabeled_%s_%d' % (featureType, i)
                    i += 1
                annodb.new_annotation(name,
                        (seqID, f.location.start.position,
                         f.location.end.position, f.strand))
    al = cnestedlist.NLMSA('tmp', 'memory', pairwiseMode=True)
    for a in annodb.itervalues():
        al.addAnnotation(a)
    al.build()
    return annodb, al, genome
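A hypothetical usage sketch (the file name, sequence id, and coordinates below are made up), querying the returned NLMSA for annotations overlapping a genomic region with the same edges() idiom used in Example No. 11:

annodb, al, genome = read_genbank_annots('NC_000913.gbk')
region = genome['NC_000913'][10000:20000]
for src, dest, edge in al[region].edges():
    # src is the genomic interval, dest the overlapping annotation interval
    print dest.id, repr(src)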
Example No. 10
'''

import sys
from pygr import seqdb, sequtil

if len(sys.argv) < 4:
    print >> sys.stderr, \
        'Usage: split_sequence.py fasta_file chunk_size overlap_size'
    raise SystemExit

input_file = sys.argv[1]
chunk_size = int(sys.argv[2])
overlap_size = int(sys.argv[3])

db = seqdb.SequenceFileDB(input_file)

for seq in db:
    window = 0
    print >> sys.stderr, 'Splitting %s...' % (seq)
    if len(db[seq]) <= chunk_size:
        sequtil.write_fasta(sys.stdout, str(db[seq]), id=seq)
    else:
        seq = db[seq]
        _id = 1
        while window < len(seq):
            chunk = seq[window:window + chunk_size]
            # recompute the id for every chunk, e.g. "contig1_1", "contig1_2", ...
            chunk_id = "%s_%d" % (seq.id, _id)
            sequtil.write_fasta(sys.stdout, str(chunk), id=chunk_id)
            _id += 1
            window += (chunk_size - overlap_size)
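Following the usage string above, a hypothetical invocation that splits long sequences into 10 kb chunks overlapping by 500 bp is:

python split_sequence.py contigs.fa 10000 500 > chunks.fa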
Example No. 11
def main():
    if len(sys.argv) < 4: raise SystemExit
    try:
        MIN_SCORE = float(sys.argv[5])
    except IndexError:
        MIN_SCORE = 200.0  # default minimum alignment score

    print >> sys.stderr, 'Reading sequence databases...'
    queries = seqdb.SequenceFileDB(sys.argv[1])
    targets = seqdb.SequenceFileDB(sys.argv[2])
    print >> sys.stderr, len(queries), len(targets)
    try:
        align_file = open(sys.argv[3])
    except IOError as e:
        print >> sys.stderr, 'Error: check alignment file.'
        raise e

    aligndb = cnestedlist.NLMSA('alignment', mode='memory', pairwiseMode=True)

    print >> sys.stderr, 'Adding sequences to an alignment database...'
    # for n, target in enumerate(targets):
    #     aligndb += targets[target]
    #     if n % 1000 == 0: print >> sys.stderr, '...', n

    target_list = set()

    for c, al in enumerate(parse_alignments(align_file)):
        aligndb += targets[al.target]
        target_list.add(al.target)
        add_alignment(aligndb, al, targets, queries)
        if c % 100 == 0: print >> sys.stderr, '...', c

    print >> sys.stderr, 'Building the alignment database...'
    aligndb.build()

    print >> sys.stderr, 'Constructing alignment graphs...'
    graph = nx.Graph()
    for c, target in enumerate(target_list):
        try:
            sub_ival = targets[target]
            for src, dest, edge in aligndb[sub_ival].edges():
                source = repr(src).split('[')[0].lstrip('-')
                destination = repr(dest).split('[')[0].lstrip('-')
                graph.add_edge(source, destination)
        except KeyError:
            pass
        if c % 100 == 0: print >> sys.stderr, '...', c

    # nx.draw(graph)
    # plt.show()
    # print graph.nodes()
    logfile = open('assemgraph.log', 'w')
    visited_nodes = set()
    cluster_no = 0
    for node in graph.nodes():
        if node not in visited_nodes:
            filename1 = 'cluster_%d_targets' % cluster_no
            filename2 = 'cluster_%d_queries' % cluster_no
            ofile1 = open(filename1, 'w')
            ofile2 = open(filename2, 'w')
            print >> sys.stderr, \
                    'Writing cluster %d to a file...' % cluster_no,
            vnodes, max_length = (write_sequence(node, graph, targets, queries,
                                                 ofile1, ofile2))
            visited_nodes.update(vnodes)
            for n in vnodes:
                size = len(targets[n]) if n in targets else len(queries[n])
                print >> logfile, 'cluster_%d\t%s\t%d' % (cluster_no, n, size)
            ofile1.close()
            ofile2.close()
            print >> sys.stderr, '\ttotal nodes = %d' % len(vnodes)

            cluster_no += 1

    print >> logfile, '***finished***'
    logfile.close()
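add_alignment is not defined in this snippet. A minimal sketch of how one blast9 hit could be stored in the NLMSA, assuming the alignment record al exposes query/target ids and 0-based interval coordinates (these attribute names, apart from al.target used above, are assumptions):

def add_alignment(aligndb, al, targets, queries):
    # align the hit's target interval to its query interval;
    # "nlmsa[ival1] += ival2" is the usual pygr idiom for pairwise alignments
    target_ival = targets[al.target][al.target_start:al.target_end]
    query_ival = queries[al.query][al.query_start:al.query_end]
    aligndb[target_ival] += query_ival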
Example No. 12
def read_genome(fn):
    '''the input of this function should be a FASTA file'''
    genome = seqdb.SequenceFileDB(fn)
    return genome
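The returned SequenceFileDB behaves like a dictionary of lazily loaded sequences, so the slicing and reverse-complement idioms used in the other examples apply directly (the file and sequence names below are hypothetical):

genome = read_genome('genome.fa')
chrom = genome['chr1']                # a sequence object; data stays on disk
fragment = str(chrom[1000:1100])      # slice, then materialize as a string
revcomp = str(-chrom[1000:1100])      # negation gives the reverse complement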
Example No. 13
'''Get a part of a target sequence that aligned to a given query sequence.'''

import sys
import csv
from pygr import seqdb, sequtil

psl_file = sys.argv[1]
genome_file = sys.argv[2]
genome = seqdb.SequenceFileDB(genome_file)

reader = csv.reader(open(psl_file), dialect='excel-tab')
for cols in reader:
    # PSL columns: 9 = query name, 13 = target name, 15 = target start, 16 = target end
    target = cols[13]
    start = int(cols[15])
    end = int(cols[16])
    seq = genome[target][start:end]
    seqid = target + '_' + cols[9]
    sequtil.write_fasta(sys.stdout, str(seq), id=seqid)
Example No. 14
	
	(o, args) = opts.parse_args()
	
	GC_width = int(o.gc_width)

	if o.pygr_seq is not None:
		if o.pygr_seq.upper() == "HG18":
			seqs = pygr.Data.Bio.Seq.Genome.Human.hg18()
		elif o.pygr_seq.upper() == "HG19":
			seqs = pygr.Data.Bio.Seq.Genome.Human.hg19()
		elif o.pygr_seq.upper() == "CHIMPY":
			seqs = pygr.Data.Bio.Seq.Genome.chimp.chrY()
		elif o.pygr_seq.upper() in ("BACS", "CONTROL_BACS"):
			seqs = pygr.Data.Bio.Seq.Genome.Human.control_bacs()
	elif o.fn_fastq_seq is not None:
		seqs = seqdb.SequenceFileDB(o.fn_fastq_seq)
	else:
		print("no sequence file input... exiting")
		sys.exit(1)

	GC_content = {}

	grp = "GC_content"
	GC_DT = create_gc_DenseTrackSet(o.fnoutTable,o.fnContigLengths,grp,o.overwrite)


	for contig in seqs:
		if contig in GC_DT[grp]:
			print("loading %s..." % contig)
			#if(chr=="chr21"):
			#GC_content["chr21"] = get_chr_correction(hg18,"chr21",GC_width)
Example No. 15
def pygr_parse_fasta(fname):
    fasta = seqdb.SequenceFileDB(fname)
Example No. 16
def main():
    infile = sys.argv[1]
    genome = seqdb.SequenceFileDB(sys.argv[2])

    get_sequence(infile, genome)
Example No. 17
def pygr_iter(fname):
    fasta = seqdb.SequenceFileDB(fname)
    for rec in fasta:
        seq = fasta[rec]
Example No. 18
def main(options, args):
    exons = {}
    clusters = {}
    newClusterID = 0
    clusterConnections = {}
    linkedExons = {}
    exonPositions = {}
    endExons = {}
    singleton = 0

    print >> sys.stderr, 'Minimum UTR length = ', options.minimumUTRLength
    print >> sys.stderr, 'Parsing and clustering exons..'
    for n, alnObj in enumerate(psl_parser.read(open(options.infile), 'track')):
        tStarts = alnObj.attrib['tStarts']
        blockSizes = alnObj.attrib['blockSizes']

        if len(blockSizes) == 1:
            singleton += 1

        tName = alnObj.attrib['tName']
        newClusterID = construct(tName, tStarts, blockSizes, exons, clusters,
                                 newClusterID, clusterConnections, linkedExons,
                                 exonPositions, endExons)
        if n % 1000 == 0:
            print >> sys.stderr, '...', n

    print >> sys.stderr, 'Total singletons = ', singleton

    sumExons = {}
    for ref, end in exons:
        try:
            sumExons[ref] += 1
        except KeyError:
            sumExons[ref] = 1
    for ref in sorted(sumExons):
        print >> sys.stderr, '\t%s has %d exon(s).' % (ref, sumExons[ref])

    print >> sys.stderr, '\nTotal %d cluster(s) found.' % len(clusters)

    print >> sys.stderr, '\nMerging clusters..'
    mergedClusters = mergeClusters(clusters, clusterConnections)
    print >> sys.stderr, '\nCleaning up..'
    ignored = set([])
    for cl in mergedClusters:
        allExons = mergedClusters[cl]
        cleanUpLinkedExons(allExons, linkedExons, exonPositions, ignored,
                           options.minimumUTRLength)

    print >> sys.stderr, 'Modifying the right end of each transcript..'
    for cl in mergedClusters:
        findLongestEnd(mergedClusters[cl], linkedExons, endExons,
                       exonPositions, ignored)

    print >> sys.stderr, '\nConstructing transcripts..'
    allPaths = {}
    visited = set([])
    for n, cl in enumerate(mergedClusters):
        txExons = sorted(mergedClusters[cl])
        paths = buildPaths(linkedExons, txExons, allPaths, ignored, visited)
        allPaths[cl] = paths
        if n % 1000 == 0:
            if n > 0:
                print >> sys.stderr, '... %d built..' % n

    genome = seqdb.SequenceFileDB(options.genome, verbose=False)
    '''Create isoform objects from allPaths and
    search for ORF.

    '''
    print >> sys.stderr, '\nBuilding gene models..'
    allGenes = {}
    n = 0
    for chrom, geneID in allPaths:
        n += 1
        isoformID = 0
        for isoExons in allPaths[(chrom, geneID)]:
            isoform = Isoform(chrom, geneID, isoformID, isoExons, genome)
            if chrom not in allGenes:
                allGenes[chrom] = {}
                allGenes[chrom][geneID] = [isoform]
            else:
                try:
                    allGenes[chrom][geneID].append(isoform)
                except KeyError:
                    allGenes[chrom][geneID] = [isoform]
            isoformID += 1

            if n % 1000 == 0:
                print >> sys.stderr, '...', n

    print >> sys.stderr, '\nRemoving redundant sequences..'
    findRedundantSequence(allGenes)
    '''Creating sequence records for each DNA, RNA and protein sequences.'''
    isoformDNASeqs = []
    isoformProteinSeqs = []
    isoformRNASeqs = []
    totalGenes = 0
    for chrom in allGenes:
        for geneID in allGenes[chrom]:
            totalGenes += 1
            isoformID = 0
            for isoform in allGenes[chrom][geneID]:
                if not isoform.redundant:
                    isoform.isoformID = isoformID
                    isoformName = '%s:%d.%d' % (chrom, geneID,
                                                isoform.isoformID)
                    DNARecord = SeqRecord(isoform.dnaSeq, id=isoformName)
                    isoformDNASeqs.append(DNARecord)
                    '''Search for ORF for non-redundant sequences'''

                    print >> sys.stderr, 'searching ORF: %s:%d.%d' \
                                            % (chrom, geneID,isoformID)
                    findORF(isoform)

                    if isoform.frame:
                        proteinRecord = SeqRecord(isoform.proteinSeq,
                                                  id=isoformName)
                        RNARecord = SeqRecord(isoform.mrnaSeq, id=isoformName)
                        isoformProteinSeqs.append(proteinRecord)
                        isoformRNASeqs.append(RNARecord)
                    isoformID += 1

                if n > 0 and n % 1000 == 0:
                    print >> sys.stderr, '...', n, 'transcripts done.'

    print >> sys.stderr, 'Total genes = %d\n\n' % totalGenes
    print >> sys.stderr, 'Writing gene models to file...'
    writeBEDFile(allGenes, options.basename)
    print >> sys.stderr, 'Writing DNA sequences to file...'
    SeqIO.write(isoformDNASeqs, options.basename + '.dnas.fa', 'fasta')
    print >> sys.stderr, 'Writing RNA sequences to file...'
    SeqIO.write(isoformRNASeqs, options.basename + '.mrnas.fa', 'fasta')
    print >> sys.stderr, 'Writing protein sequences to file...'
    SeqIO.write(isoformProteinSeqs, options.basename + '.proteins.fa', 'fasta')
Example No. 19
    def setUp(self):
        hbb1_mouse = testutil.datafile('hbb1_mouse.fa')
        self.dna = seqdb.SequenceFileDB(hbb1_mouse)
        self.tdb = translationDB.get_translation_db(self.dna)