Пример #1
0
def aa_seq(options):
    """Return the ancestral sequence record from a Fasta file.

    ``options`` must provide ``ancestralfasta`` (the path handed to
    ``Fasta``) and ``single_chromosome``.  When ``single_chromosome`` is
    falsy it must also provide ``header`` (a regular expression in which
    every ``?`` is replaced by ``options.chromosome``) and ``chromosome``.

    Returns the record stored under the selected header: the first header
    of the file in single-chromosome mode, otherwise the last header that
    matches the expanded regular expression.

    Raises ``Exception`` when no header matches the regular expression.
    """
    f = Fasta(options.ancestralfasta)
    keyz = list(f.keys())
    if options.single_chromosome:
        # Single chromosome fasta should only have one sequence,
        # and that sequence should be the sequence of interest.
        key = keyz[0]
    else:
        header_regex = options.header.replace('?', options.chromosome)
        key = None
        for candidate in keyz:
            if re.match(header_regex, candidate) is not None:
                # Remember the matching header; the loop variable itself
                # must not be used after the loop, it is merely the last
                # header of the file.
                key = candidate
        if key is None:
            raise Exception("No match possible is something wrong with the"
                            " regex specified to the program as"
                            "--header-regex")
    return f[key]
Пример #2
0
def calc_nuc_counts(fasta_filename, region_size_min,
                    region_size_max, verbose):
    ''' Count every sub-sequence of each requested length.

        For every record in the fasta file and every start offset, the
        windows of length region_size_min..region_size_max (inclusive) are
        tallied; windows truncated by the end of the record are ignored.
        ``verbose`` is accepted but not used here.

        Returns: mapping of region size -> Counter of sub-sequence -> count.
    '''
    window_sizes = range(region_size_min, region_size_max + 1)
    tallies = defaultdict(Counter)

    for _name, record in Fasta(fasta_filename).items():
        for offset, _base in enumerate(record):
            for width in window_sizes:
                window = record[offset:offset + width]
                # Only full-width windows are counted.
                if len(window) >= width:
                    tallies[width][window] += 1

    return tallies
Пример #3
0
    def _no_empty(self, lista, listb):
        ''' removes empty entries '''
        
        # check for empty fasta.
        tmpa = list()
        tmpb = list()
        for i in range(len(listb)):
            
            # open it.
            try:
                z = Fasta(listb[i], record_class=MemoryRecord)
            
                # check for empty.
                if len(z.keys()) == 0:
                    continue

                # add to temp.
                tmpa.append(lista[i])
                tmpb.append(listb[i])

            except:
                logging.warning("bad fasta file")
            
        # sort back.
        return tmpa, tmpb
def parse_sequences(sites, size, fasta_file):
    """Attach the extended binding-site sequence to every region in ``sites``.

    Each element of ``sites`` is a dict providing "chr", "strand",
    "ext_start" and "ext_end"; the fetched, upper-cased sequence is stored
    under "ext_seq" (reverse-complemented for '-' strand regions).
    ``size`` is accepted for interface compatibility but is not used here.

    Returns the same ``sites`` object, updated in place.
    """
    from pyfasta import Fasta  # Fasta package is needed to fetch sequences from genome fasta file

    print("INFO: Begin to fetch sequences....")

    # Index records by the first whitespace-separated token of the header.
    f = Fasta(fasta_file, key_fn=lambda key: key.split()[0])

    for reg in sites:

        start = reg["ext_start"]
        end = reg["ext_end"]

        # if motif on negative strand, shift region by +1 to account for zero based half-open intervals
        if reg["strand"] == '-':
            start += 1
            end += 1

        seq = f.sequence({"chr": reg["chr"], "start": start, "stop": end}, one_based=False)

        # Note, the 'strand':reg["strand"] argument for f.sequence does not work, there seems to be a bug in the pyfasta/fasta.py code.
        seq = seq.upper()

        # if motif on negative strand, convert seq to reverse complement
        if reg["strand"] == '-':
            seq = reverse_complement(seq)

        # add sequence to region dict
        reg["ext_seq"] = seq

    print("INFO: Finished sequences.")
    # The original returned the undefined name `regions`.
    return sites
Пример #5
0
def read_fa(fa='/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/data/mm10/transcriptome/mm10_transcriptome.fa'):
	"""Open the fasta file ``fa`` and return the resulting ``Fasta`` object.

	Records are keyed by the part of the header before the first tab.
	The first three keys are printed.
	"""
	gj.printFuncRun('read_fa')
	gj.printFuncArgs()
	header_to_key = lambda header: header.split("\t")[0]
	records = Fasta(fa, key_fn=header_to_key)
	print(records.keys()[0:3])
	gj.printFuncRun('read_fa')
	return records
Пример #6
0
def removehost(fasta, bed):
    """Copy ``fasta`` to ``'removehost_' + fasta`` without the listed records.

    Every line of ``bed`` (trailing whitespace stripped) is taken as the
    name of a record to drop.  All other records are written as a header
    line followed by the whole sequence on one line.  Returns None.
    """
    with open(bed) as bedin:
        removeregion = {line.rstrip() for line in bedin}

    fa = Fasta(fasta)

    # NOTE(review): the prefix is glued onto the path as given, so a
    # `fasta` containing a directory component yields 'removehost_dir/...'
    # - confirm callers only pass bare file names.
    outfile = 'removehost_' + fasta

    # The context manager closes the output even when a write fails.
    with open(outfile, 'w') as outio:
        for seqname in fa.keys():
            if seqname in removeregion:
                continue
            outio.write('>' + seqname + '\n' + str(fa[seqname]) + '\n')
def read_score(score_tab, ref):
    """Build a per-position reactivity list for every scored sequence.

    ``ref`` is a fasta file whose headers are keyed by their first
    tab-separated field.  ``score_tab`` is a tab-separated table; blank
    lines, lines starting with '@' and rows whose second column is '-' are
    skipped.  Column 1 is the sequence id, column 3 the position and
    column 8 the score.

    Returns ``{seq_id: {'reactivity_ls': [...]}}`` with one entry per
    position 1..len(sequence); missing positions and scores equal to '-1'
    are reported as 'NULL'.
    """
    seq_by_id = {}
    for header, record in Fasta(ref).items():
        seq_by_id[header.split('\t')[0]] = record

    scores = nested_dict()
    with open(score_tab, 'r') as handle:
        for raw in handle:
            raw = raw.strip()
            if (not raw) or raw.startswith('@'):
                continue
            fields = raw.split('\t')
            if fields[1] == '-':
                continue
            scores[fields[0]][int(fields[2])] = fields[7]
    scores = scores.to_dict()

    reactivity = nested_dict(2, list)
    for seq_id in scores:
        by_position = scores[seq_id]
        for position in xrange(1, len(seq_by_id[seq_id]) + 1):
            if position in by_position and by_position[position] != '-1':
                value = by_position[position]
            else:
                value = 'NULL'
            reactivity[seq_id]['reactivity_ls'].append(value)

    return reactivity.to_dict()
Пример #8
0
 def create_pyfasta_iterator(self, **kwargs):
     """Index the fasta file at kwargs['input'] and prepare an iterator.

     Sets ``self.fasta``, ``self.readcount`` (number of records),
     ``self.db_values`` (running index paired with each sorted key) and
     ``self.read`` (an iterator over ``self.db_values``).
     """
     from pyfasta import Fasta
     print("Generating PyFasta sequence index.  This may take a moment....")
     fasta = Fasta(kwargs['input'])
     self.fasta = fasta
     record_total = len(fasta)
     self.readcount = record_total
     ordered_keys = sorted(fasta.keys())
     self.db_values = zip(range(record_total), ordered_keys)
     self.read = iter(self.db_values)
Пример #9
0
def read_fa(
    fa='/Share/home/zhangqf7/gongjing/zebrafish/data/reference/transcriptome/danRer10.refSeq.transcriptome.fa'
):
    """Load the fasta file ``fa`` into a plain dict.

    Keys are the first whitespace-separated token of each header; values
    are the full slice ``record[0:]``.  The first three keys are printed.
    """
    records = Fasta(fa, key_fn=lambda header: header.split("\t")[0])
    fa_dict = {}
    for header, record in records.items():
        fa_dict[header.split()[0]] = record[0:]
    print(fa_dict.keys()[0:3])
    return fa_dict
Пример #10
0
def aa_seq(options):
    """Return the ancestral sequence record from a Fasta file.

    ``options`` must provide ``ancestralfasta`` (the path handed to
    ``Fasta``) and ``single_chromosome``.  When ``single_chromosome`` is
    falsy it must also provide ``header`` (a regular expression in which
    every ``?`` is replaced by ``options.chromosome``) and ``chromosome``.

    Returns the record stored under the selected header: the first header
    of the file in single-chromosome mode, otherwise the last header that
    matches the expanded regular expression.

    Raises ``Exception`` when no header matches the regular expression.
    """
    f = Fasta(options.ancestralfasta)
    keyz = list(f.keys())
    if options.single_chromosome:
        # Single chromosome fasta should only have one sequence,
        # and that sequence should be the sequence of interest.
        key = keyz[0]
    else:
        header_regex = options.header.replace('?', options.chromosome)
        key = None
        for candidate in keyz:
            if re.match(header_regex, candidate) is not None:
                # Remember the matching header; the loop variable itself
                # must not be used after the loop, it is merely the last
                # header of the file.
                key = candidate
        if key is None:
            raise Exception("No match possible is something wrong with the"
                            " regex specified to the program as"
                            "--header-regex")
    return f[key]
Пример #11
0
    def run(self, filename):
        """Write one region row per fasta record of ``filename``.

        Records are skipped when ``getVGeneRegions`` or ``findFR4`` returns
        None.  Progress is printed every 1000 records and a summary line
        at the end.
        """
        self.openOutFiles(filename)
        fasta = Fasta(filename)

        total = len(fasta)
        self.not_found_in_kabat = 0
        self.fr4_not_found = 0
        current = 0

        for name in fasta.keys():
            current += 1
            if current % 1000 == 0:
                print("All %d. Current: %d" % (total, current))

            # format: vName_jName{frameNumber} or vName_dName{frameNumber}_jName{frameNumber}
            v_name = name.split("_")[0]
            v_regions = self.getVGeneRegions(v_name)
            if v_regions is None:
                continue

            tail_start = v_regions[self.kabat.regions_count * 2 - 1]
            tail = fasta[name][tail_start:]
            fr4 = self.findFR4(name, tail)
            if fr4 is None:
                continue

            fr4_start = fr4.start()
            shifted = [v_regions[9] + delta
                       for delta in [1, fr4_start, fr4_start + 1, len(tail)]]

            self.result_kabat_file.write(name)
            self.result_kabat_file.write(("\t%d" * 10) % tuple(v_regions))
            self.result_kabat_file.write(("\t%d" * 4 + "\n") % tuple(shifted))

        self.closeOutFiles()
        print("all: {}; not in kabat: {}; without fr4: {}".format(current, self.not_found_in_kabat, self.fr4_not_found))
Пример #12
0
def genome_contenct_stats(fasta_path):
    """Report on stderr how many G-boxes (CACGTG, any case) the fasta holds.

    Matches are counted per record with ``re.findall`` and summed.
    """
    fasta = Fasta(fasta_path)
    g_box = re.compile('CACGTG', flags=re.IGNORECASE)
    total = sum(len(g_box.findall(fasta[seqid][:])) for seqid in fasta.keys())
    sys.stderr.write("total gboxes:{0}".format(total) + "\n")
Пример #13
0
def read_fa(fa='/Share/home/zhangqf7/gongjing/zebrafish/data/reference/transcriptome/danRer10.refSeq.transcriptome.fa'):
	"""Load the fasta file ``fa`` into a plain dict.

	Keys are the first whitespace-separated token of each header; values
	are the full slice ``record[0:]``.  The first three keys are printed.
	"""
	gj.printFuncRun('read_fa')
	gj.printFuncArgs()
	records = Fasta(fa, key_fn=lambda header: header.split("\t")[0])
	fa_dict = {}
	for header, record in records.items():
		fa_dict[header.split()[0]] = record[0:]
	print(fa_dict.keys()[0:3])
	gj.printFuncRun('read_fa')
	return fa_dict
Пример #14
0
def create_fasta_flat_file(file):
    """Open a fasta file for fast sequence retrieval.

    Records are keyed by the first whitespace-separated token of their
    header.  Returns ``(fasta, headers)`` where ``headers`` is the set of
    those keys.
    """
    def first_token(header):
        return header.split()[0]

    records = Fasta(file, key_fn=first_token)
    return records, set(records.keys())
Пример #15
0
def genome_contenct_stats(fasta_path):
    """Report on stderr how many G-boxes (CACGTG, any case) the fasta holds.

    Matches are counted per record with ``re.findall`` and summed.
    """
    fasta = Fasta(fasta_path)
    g_box = re.compile("CACGTG", flags=re.IGNORECASE)
    total = sum(len(g_box.findall(fasta[seqid][:])) for seqid in fasta.keys())
    sys.stderr.write("total gboxes:{0}".format(total) + "\n")
def getSequence(genome):
    """Fetch the sequence of every peak and save the table as RAD_seq.csv.

    Reads '../data/input_data/peak.csv' (columns 'chrom', 'start', 'end'),
    looks each interval up in the fasta file ``genome``, post-processes the
    'seq' column row-wise with ``fuc`` and writes
    '../data/input_data/RAD_seq.csv'.
    """
    genome = Fasta(genome)
    RAD_seq = pd.read_csv('../data/input_data/peak.csv')

    def fetch(row):
        # Each result is wrapped in a one-element list, as `fuc` receives it.
        interval = {'chr': RAD_seq['chrom'][row],
                    'start': RAD_seq['start'][row],
                    'stop': RAD_seq['end'][row]}
        return [genome.sequence(interval)]

    RAD_seq['seq'] = map(fetch, range(len(RAD_seq)))
    RAD_seq['seq'] = RAD_seq.apply(fuc, axis=1)
    RAD_seq.to_csv('../data/input_data/RAD_seq.csv', index=False)
    print('getSequence is over,RAD_seq.csv is bulit!')
Пример #17
0
def check_keyfn2(path, klass, inplace):
    """Check that a key_fn joining header tokens with '-' is applied.

    Asserts the resulting keys and that the 'a-extra' record is truthy,
    then calls ``fix(path)``.
    """
    def dash_joined(header):
        return "-".join(header.split())

    f = Fasta(path, record_class=klass, flatten_inplace=inplace,
              key_fn=dash_joined)

    expected = ['a-extra', 'b-extra', 'c-extra']
    assert sorted(f.keys()) == expected, f.keys()

    assert f['a-extra']
    fix(path)
Пример #18
0
class Alg:
    """Per-column nucleotide statistics and colour codes for a fasta alignment.

    On construction the fasta file is opened, per-position frequencies are
    accumulated into ``self.pos`` and written to ``freqfn``.
    """

    def __init__(self, fastafn, freqfn, colorfn):
        """Open ``fastafn``, count nucleotides and write them to ``freqfn``.

        ``colorfn`` is the path later used by ``do_plot`` for the colour
        matrix.
        """
        self.pos = []
        self.init = False
        self.size = 0
        self.fasta = Fasta(fastafn)
        self.colorfn = colorfn
        # Numeric code written for each (lower-cased) character.
        self.conta = {'n':0, '-':0, 'a':1, 'c':2, 'g':3, 't':4, '\n':'\n'}

        self.read_fasta(fastafn)
        self.write_freqs(freqfn)

    def do_plot(self, plot, names = False):
        """Write the colour matrix to ``self.colorfn``.

        Returns the matrix when ``plot`` is truthy, otherwise None.
        """
        msa = self.seqtocol(self.colorfn, names= names)
        if plot:
            return(msa)

    def read_fasta(self, fastafn):
        """Accumulate per-position character counts over all records.

        ``fastafn`` is unused; the records come from ``self.fasta``.
        """
        for entry in self.fasta.keys():
            seq = self.fasta[entry][:]
            if not self.init:
                # this assumes that all the entries in the fasta record are the same size.
                # this is the default setting for clustalo
                # TODO add an assertion to verify so
                self.size = len(seq)
                self.pos.extend(Pos(i) for i in range(self.size))
                self.init = True

            for nt in range(self.size):
                self.pos[nt].freq[seq[nt].lower()] += 1

    def seqtocol(self, outfn, names=False):
        """Write one CSV line of colour codes per record and return them.

        Each line of ``outfn`` is the record name followed by the code of
        every character.  The return value is a numpy array of the codes;
        with ``names`` the record name precedes the codes of each record.
        """
        colors = []
        n_entries = 0
        last_len = 0
        with open(outfn, 'w') as outf:
            for entry in self.fasta.keys():
                # A dedicated loop variable is used for the characters: the
                # original reused `i`, which clobbers the entry index under
                # Python 2 (list comprehension variables leak there).
                codes = [self.conta[base.lower()] for base in self.fasta[entry][:]]
                outf.write(entry + ',' + ','.join(str(code) for code in codes) + '\n')
                if names:
                    colors.append(entry)
                colors.extend(codes)
                n_entries += 1
                last_len = len(codes)

        # TODO this is very weird, check why one option returns the transpose
        # NOTE(review): with names the data is reshaped to (1+len, n_entries)
        # without transposing, which interleaves rows - kept as in the
        # original, confirm what callers expect.
        if names:
            colors = np.array(colors).reshape(1 + last_len, n_entries)
        else:
            colors = np.array(colors).reshape(n_entries, last_len)
        return(colors)

    def write_freqs(self, outfn):
        """Write a tab-separated a/c/t/g count table, one row per position."""
        with open(outfn, 'w') as outf:
            outf.write('\t'.join(['a','c','t','g'])+'\n')
            for j in self.pos:
                outf.write('\t'.join([str(j.freq['a']),str(j.freq['c']),str(j.freq['t']),str(j.freq['g'])])+'\n')
Пример #19
0
def check_keyfn2(path, klass, inplace):
    """Check that a key_fn joining header tokens with '-' is applied.

    Asserts the resulting keys and that the 'a-extra' record is truthy,
    then calls ``fix(path)``.
    """
    def dash_joined(header):
        return "-".join(header.split())

    f = Fasta(path, record_class=klass, flatten_inplace=inplace,
              key_fn=dash_joined)

    expected = ['a-extra', 'b-extra', 'c-extra']
    assert sorted(f.keys()) == expected, f.keys()

    assert f['a-extra']
    fix(path)
Пример #20
0
    def __init__(self, fastafn, freqfn, colorfn):
        """Open ``fastafn``, then run ``read_fasta`` and ``write_freqs``.

        ``colorfn`` is only stored on the instance here.
        """
        # Numeric code associated with each (lower-case) character.
        self.conta = {
            'n': 0, '-': 0,
            'a': 1, 'c': 2, 'g': 3, 't': 4,
            '\n': '\n',
        }
        self.colorfn = colorfn
        self.size = 0
        self.init = False
        self.pos = []
        self.fasta = Fasta(fastafn)

        self.read_fasta(fastafn)
        self.write_freqs(freqfn)
Пример #21
0
def split(args):
    """Command-line entry point: split one fasta file into several files.

    ``args`` is the argument list parsed with optparse.  Help is printed
    and the process exits unless a fasta file and one of ``-n`` /
    ``--header`` are given.  With ``--header`` one file per record is
    written through ``with_header_names``; otherwise the file names come
    from ``newnames`` and the work is done by ``without_kmers`` (default
    ``-k -1``) or ``with_kmers``.
    """
    usage = """\
   split a fasta file into separated files.
        pyfasta split -n 6 [-k 5000 ] some.fasta
    the output will be some.0.fasta, some.1.fasta ... some.6.fasta
    the sizes will be as even as reasonable.
   """
    header_help = """this overrides all other options. if specified, it will
               split the file into a separate file for each header. it
               will be a template specifying the file name for each new file.
               e.g.:    "%(fasta)s.%(seqid)s.fasta"
               where 'fasta' is the basename of the input fasta file and seqid
               is the header of each entry in the fasta file."""
    kmers_help = """\
    split big files into pieces of this size in basepairs. default
    default of -1 means do not split the sequence up into k-mers, just
    split based on the headers. a reasonable value would be 10Kbp"""

    parser = optparse.OptionParser(usage)
    parser.add_option("--header", dest="header", metavar="FILENAME_FMT",
                      help=header_help, default=None)
    parser.add_option("-n", "--n", type="int", dest="nsplits",
                      help="number of new files to create")
    parser.add_option("-o", "--overlap", type="int", dest="overlap",
                      help="overlap in basepairs", default=0)
    parser.add_option("-k", "--kmers", type="int", dest="kmers", default=-1,
                      help=kmers_help)

    options, fasta = parser.parse_args(args)
    if not fasta or not (options.nsplits or options.header):
        sys.exit(parser.print_help())

    # Exactly one positional argument (the fasta file) is expected.
    if isinstance(fasta, (tuple, list)):
        assert len(fasta) == 1, fasta
        fasta = fasta[0]

    kmer = None if options.kmers == -1 else options.kmers
    overlap = None if options.overlap == 0 else options.overlap
    f = Fasta(fasta)

    if options.header:
        # One output file name per record, built from the user template.
        names = dict(
            (seqid, options.header % dict(fasta=f.fasta_name, seqid=seqid))
            for seqid in f.iterkeys())
        return with_header_names(f, names)

    names = newnames(fasta, options.nsplits, kmers=kmer, overlap=overlap,
                     header=options.header)
    if options.kmers == -1:
        return without_kmers(f, names)
    return with_kmers(f, names, options.kmers, options.overlap)
Пример #22
0
def mask_to_bed(fasta_file, mask_bed_name):
    """Create a bed file of the starts and stops of masked sequence runs.

    Every run of 'X' in every record of ``fasta_file`` becomes one
    tab-separated line in ``mask_bed_name`` holding the record id, the run
    start and end (as reported by the regex match), a "mask_id N" label and
    the run length.  ``mask_id`` starts at 1 and is incremented before each
    line is written, so the first label is "mask_id 2".
    """
    # The context manager guarantees the bed file is flushed and closed;
    # the original never closed it.
    with open(mask_bed_name, "wb") as mask_bed:
        f = Fasta(fasta_file)
        mask_id = 1
        for seqid in f.keys():
            seq = f[seqid][:]
            for m in re.finditer("X+", seq):
                mask_id = mask_id + 1
                run_length = m.end() - m.start()
                w = '{0}\t{1}\t{2}\t{3}\t{4}\t+\t.\t.\t.\t1\t{5}\t0\n'.format(
                    seqid, m.start(), m.end(), "mask_id {0}".format(mask_id),
                    run_length, run_length + 1)
                mask_bed.write(w)
Пример #23
0
def write_c2t(fasta_name, unconverted, colorspace=False):
    """
    given a fasta file, write a new file:
        `some.fr.c2t.fasta` which contains:
          + the same headers prefixed with 'f' with all C's converted to T
          + headers prefixed with 'r' reverse complemented with
                                 all C's converted to T.

    if unconverted is true, then also save a file (`some.fr.fasta`) with the
    forward and reverse sequences without conversion.

    The files are written to a "bowtie_index" directory next to the fasta
    ("bowtie_index_colorspace" when `colorspace` is true), which is created
    when missing. Nothing is rewritten when the requested files already
    exist.

    Returns the tuple (c2t file name, unconverted file name); the second
    name is returned even when that file was not requested.
    """
    d = op.join(op.dirname(fasta_name), "bowtie_index")
    if colorspace:
        d += "_colorspace"
    if not op.exists(d):
        os.mkdir(d)

    p, ext = op.splitext(op.basename(fasta_name))  # some.fasta -> some, .fasta
    fname = "%s/%s.fr.c2t%s" % (d, p, ext)
    # no conversion, just copy the file into the index dir.
    unconverted_fname = "%s/%s.fr%s" % (d, p, ext)
    if op.exists(fname) and (not unconverted or op.exists(unconverted_fname)):
        return fname, unconverted_fname

    fasta = Fasta(fasta_name)

    c2t_fh = None
    unc_fh = None
    try:
        c2t_fh = open(fname, 'w')
        if unconverted:
            unc_fh = open(unconverted_fname, 'w')

        sys.stderr.write("writing forward and reverse c2t to: %s\n" % (fname, ))

        for header in fasta.iterkeys():
            seq = str(fasta[header]).upper()
            assert not ">" in seq
            rseq = revcomp(seq)
            # c2t, prefix header with f and write
            c2t_fh.write(">f%s\n" % header)
            c2t_fh.write(seq.replace('C', 'T') + "\n")
            # then r-c, c2t, prefix header with r and write
            c2t_fh.write(">r%s\n" % header)
            c2t_fh.write(rseq.replace('C', 'T') + "\n")
            if unc_fh is not None:
                unc_fh.write(">f%s\n%s\n" % (header, seq))
                unc_fh.write(">r%s\n%s\n" % (header, rseq))
    except BaseException:
        # Remove only the partial files this call created; unlinking a file
        # that was never opened would raise and hide the real error.
        for partial_name, handle in ((fname, c2t_fh),
                                     (unconverted_fname, unc_fh)):
            if handle is not None:
                handle.close()
                os.unlink(partial_name)
        raise
    else:
        c2t_fh.close()
        if unc_fh is not None:
            unc_fh.close()

    return fname, unconverted_fname
Пример #24
0
def write_c2t(fasta_name, unconverted, colorspace=False):
    """
    given a fasta file, write a new file:
        `some.fr.c2t.fasta` which contains:
          + the same headers prefixed with 'f' with all C's converted to T
          + headers prefixed with 'r' reverse complemented with
                                 all C's converted to T.

    if unconverted is true, then also save a file (`some.fr.fasta`) with the
    forward and reverse sequences without conversion.

    The files are written to a "bowtie_index" directory next to the fasta
    ("bowtie_index_colorspace" when `colorspace` is true), which is created
    when missing. Nothing is rewritten when the requested files already
    exist.

    Returns the tuple (c2t file name, unconverted file name); the second
    name is returned even when that file was not requested.
    """
    d = op.join(op.dirname(fasta_name), "bowtie_index")
    if colorspace:
        d += "_colorspace"
    if not op.exists(d):
        os.mkdir(d)

    p, ext = op.splitext(op.basename(fasta_name))  # some.fasta -> some, .fasta
    fname = "%s/%s.fr.c2t%s" % (d, p, ext)
    # no conversion, just copy the file into the index dir.
    unconverted_fname = "%s/%s.fr%s" % (d, p, ext)
    if op.exists(fname) and (not unconverted or op.exists(unconverted_fname)):
        return fname, unconverted_fname

    fasta = Fasta(fasta_name)

    c2t_fh = None
    unc_fh = None
    try:
        c2t_fh = open(fname, 'w')
        if unconverted:
            unc_fh = open(unconverted_fname, 'w')

        sys.stderr.write("writing forward and reverse c2t to: %s\n" % (fname,))

        for header in fasta.iterkeys():
            seq = str(fasta[header]).upper()
            assert not ">" in seq
            rseq = revcomp(seq)
            # c2t, prefix header with f and write
            c2t_fh.write(">f%s\n" % header)
            c2t_fh.write(seq.replace('C', 'T') + "\n")
            # then r-c, c2t, prefix header with r and write
            c2t_fh.write(">r%s\n" % header)
            c2t_fh.write(rseq.replace('C', 'T') + "\n")
            if unc_fh is not None:
                unc_fh.write(">f%s\n%s\n" % (header, seq))
                unc_fh.write(">r%s\n%s\n" % (header, rseq))
    except BaseException:
        # Remove only the partial files this call created; unlinking a file
        # that was never opened would raise and hide the real error.
        for partial_name, handle in ((fname, c2t_fh),
                                     (unconverted_fname, unc_fh)):
            if handle is not None:
                handle.close()
                os.unlink(partial_name)
        raise
    else:
        c2t_fh.close()
        if unc_fh is not None:
            unc_fh.close()

    return fname, unconverted_fname
def cut_up_genome(input_files_list, output_folder, region_length):
    """Cut every record of every fasta file into fixed-length regions.

    For each record (visited in sorted key order) the consecutive slices
    of ``region_length`` are handed to ``write_to_json`` together with the
    path ``<output_folder>/chr=<record name>``; the last slice may be
    shorter.  A completion message is printed per record.
    """
    for fasta_path in input_files_list:
        fasta = Fasta(fasta_path)
        for chromosome in sorted(fasta.keys()):
            sequence = fasta[chromosome]
            regions = []
            for start in range(0, len(sequence), region_length):
                regions.append(sequence[start:start + region_length])
            path = os.path.join(output_folder, f'chr={chromosome}')
            write_to_json(path, regions, region_length)
            print(f'{chromosome} is complete!')
Пример #26
0
def mask_to_bed(fasta_file, mask_bed_name):
    """Create a bed file of the starts and stops of masked sequence runs.

    Every run of 'X' in every record of ``fasta_file`` becomes one
    tab-separated line in ``mask_bed_name`` holding the record id, the run
    start and end (as reported by the regex match), a "mask_id N" label and
    the run length.  ``mask_id`` starts at 1 and is incremented before each
    line is written, so the first label is "mask_id 2".
    """
    # The context manager guarantees the bed file is flushed and closed;
    # the original never closed it.
    with open(mask_bed_name, "wb") as mask_bed:
        f = Fasta(fasta_file)
        mask_id = 1
        for seqid in f.keys():
            seq = f[seqid][:]
            for m in re.finditer("X+", seq):
                mask_id = mask_id + 1
                run_length = m.end() - m.start()
                w = '{0}\t{1}\t{2}\t{3}\t{4}\t+\t.\t.\t.\t1\t{5}\t0\n'.format(
                    seqid, m.start(), m.end(), "mask_id {0}".format(mask_id),
                    run_length, run_length + 1)
                mask_bed.write(w)
Пример #27
0
def mask(fasta_file, org, cutoff, mask_value='X'):
    """Write a masked copy of ``fasta_file``.

    For every record, positions whose hit count (read from the node
    returned by ``get_node(org, 'r')`` under the name ``'c' + seqid``) is
    greater than ``cutoff`` are replaced by ``mask_value``.  When
    ``mask_value`` is 'soft' (any case) the lower-cased base is used as the
    replacement and the remaining bases are upper-cased.  Records without a
    hit-count entry are written unchanged.

    The output goes to ``<name>.masked.<cutoff><ext>`` and the output of
    ``svnversion`` for this script's directory is stored in
    ``<outfile>.version``.
    """
    h5, node = get_node(org, 'r')

    # Insert ".masked.<cutoff>" in front of the last "." of the file name.
    outfile = fasta_file[:fasta_file.rfind(".")] + (".masked.%i" % cutoff) \
                         + fasta_file[fasta_file.rfind("."):]

    print "> masking sequence to file:", outfile
    out = open(outfile ,'w')

    fasta = Fasta(fasta_file)

    # Decided once, before mask_value is rebound inside the loop.
    soft_mask = mask_value.lower() == 'soft'
    for seqid in sorted(fasta.iterkeys()): 
        # NOTE(review): `masked` is assigned but never read.
        masked = 0
        if soft_mask:
            seq = str(fasta[seqid])
            # mask is the lowercase sequence.
            mask_value = np.array(seq.lower(), dtype='c')
            seq = np.array(seq.upper(), dtype='c')
        else:
            fasta[seqid].tostring = False
            # presumably an array rather than a str because tostring was
            # switched off above - TODO confirm against pyfasta
            seq = fasta[seqid][:] # a


        if not 'c' + seqid in node:
            print >>sys.stderr, seqid,\
                '! not found in masked, writing unchanged\n' \
                '  this means that no section of this sequence appeared\n' \
                '  more than %i times' % cutoff
            out.write('>' + seqid + '\n')
            out.write(seq.tostring() + '\n')
            continue
        
        # numexpr resolves `hit_counts` by name from this local variable.
        hit_counts = getattr(node, 'c' + seqid)[:]
        masked_seq = np.where(numexpr.evaluate("hit_counts > %i" % cutoff)
                              , mask_value, seq).tostring() 

        l = len(masked_seq)
        # NOTE(review): in soft mode mask_value is an array at this point,
        # so masked_seq.count(mask_value) is not given a string - verify.
        print >>sys.stderr, "! seq:%s len:%i %%masked:%.3f" % (seqid, l, 
                                   100.0 * masked_seq.count(mask_value) / l)
        assert len(seq) == l
        out.write('>' + seqid + '\n')
        out.write(masked_seq + '\n')

    out.close()
    # write out a file .fasta.version containing
    # the svnversion (if available of this script
    # that was used to create the file.
    path = os.path.dirname(__file__)
    # NOTE(review): the command line is built by string interpolation;
    # paths containing shell metacharacters would be interpreted by the shell.
    os.system('svnversion %s > %s.version' % (path, outfile))
    h5.close()
Пример #28
0
def main(gff_file, outdir):
    """Extract the features of parents without a CDS and their sequences.

    Reads ``gff_file``, writes the retained features to
    ``<outdir>/at_non_cds.gff`` and then one fasta entry per feature
    returned by ``read_gff`` for that file to ``<outdir>/at_rnas.fasta``.
    """
    # Captures the text following "parent=" up to the first "." or ";".
    name = re.compile("parent=([^.;]+)", re.I)

    # feats: parent name -> True once a CDS line was seen, otherwise None.
    feats = {}
    # Every line that names a parent, grouped by parent (CDS lines included).
    non_cds_feats = collections.defaultdict(list)
    for line in open(gff_file):
        line = line.split("\t")
        match = re.search(name, line[-1])
        if not match:
            continue
        fname = match.groups(0)[0]
        non_cds_feats[fname].append(line)
        if line[2].upper() == "CDS":
            feats[fname] = True
            continue
        # Never downgrade a parent already recorded (possibly as CDS).
        if fname in feats:
            continue
        feats[fname] = None
    # NOTE(review): `i` is assigned but never read.
    i = 0
    # Drop every parent for which a CDS line was seen.
    for k, v in sorted(feats.items()):
        if not v is None:
            del non_cds_feats[k]

    # (seqid, start, end) triples already written, to skip duplicates.
    seen = {}
    RNA = open(outdir + "/at_non_cds.gff", "w")
    for k, feat_list in sorted(non_cds_feats.items()):
        for feat in feat_list:
            if feat[0] in ("ChrC", "ChrM"):
                continue
            if feat[2] == "exon":
                continue
            # The key uses the seqid as read, before it is rewritten below.
            key = (feat[0], feat[3], feat[4])
            if key in seen:
                continue
            feat[0] = feat[0].upper().replace("CHR", "")
            seen[key] = True
            # The last column (attributes, including the trailing newline)
            # is replaced by the parent name.
            feat[-1] = k
            print >> RNA, "\t".join(feat)
    RNA.close()

    gff = read_gff(outdir + "/at_non_cds.gff")
    # NOTE(review): hard-coded, machine-specific reference genome path.
    fasta = Fasta("/home/gturco/src/find_cns_gturco/pipeline/data/arabidopsis.fasta")
    # NOTE(review): `ftypes` is assigned but never read.
    ftypes = {}
    FA = open(outdir + "/at_rnas.fasta", "w")
    for chr, feature_list in gff.iteritems():
        for fname, feature in feature_list.iteritems():
            seq = fasta.sequence(feature)
            # The print statement separates its items with a space, so the
            # header is written as "> name".
            print >> FA, ">", feature["name"]
            print >> FA, seq
    FA.close()
Пример #29
0
def mask(fasta_file, org, cutoff, mask_value='X'):
    """Write a masked copy of ``fasta_file``.

    For every record, positions whose hit count (read from the node
    returned by ``get_node(org, 'r')`` under the name ``'c' + seqid``) is
    greater than ``cutoff`` are replaced by ``mask_value``.  When
    ``mask_value`` is 'soft' (any case) the lower-cased base is used as the
    replacement and the remaining bases are upper-cased.  Records without a
    hit-count entry are written unchanged.

    The output goes to ``<name>.masked.<cutoff><ext>`` and the output of
    ``svnversion`` for this script's directory is stored in
    ``<outfile>.version``.
    """
    h5, node = get_node(org, 'r')

    # Insert ".masked.<cutoff>" in front of the last "." of the file name.
    outfile = fasta_file[:fasta_file.rfind(".")] + (".masked.%i" % cutoff) \
                         + fasta_file[fasta_file.rfind("."):]

    print "> masking sequence to file:", outfile
    out = open(outfile, 'w')

    fasta = Fasta(fasta_file)

    # Decided once, before mask_value is rebound inside the loop.
    soft_mask = mask_value.lower() == 'soft'
    for seqid in sorted(fasta.iterkeys()):
        # NOTE(review): `masked` is assigned but never read.
        masked = 0
        if soft_mask:
            seq = str(fasta[seqid])
            # mask is the lowercase sequence.
            mask_value = np.array(seq.lower(), dtype='c')
            seq = np.array(seq.upper(), dtype='c')
        else:
            fasta[seqid].tostring = False
            # presumably an array rather than a str because tostring was
            # switched off above - TODO confirm against pyfasta
            seq = fasta[seqid][:]  # a

        if not 'c' + seqid in node:
            print >>sys.stderr, seqid,\
                '! not found in masked, writing unchanged\n' \
                '  this means that no section of this sequence appeared\n' \
                '  more than %i times' % cutoff
            out.write('>' + seqid + '\n')
            out.write(seq.tostring() + '\n')
            continue

        # numexpr resolves `hit_counts` by name from this local variable.
        hit_counts = getattr(node, 'c' + seqid)[:]
        masked_seq = np.where(numexpr.evaluate("hit_counts > %i" % cutoff),
                              mask_value, seq).tostring()

        l = len(masked_seq)
        # NOTE(review): in soft mode mask_value is an array at this point,
        # so masked_seq.count(mask_value) is not given a string - verify.
        print >> sys.stderr, "! seq:%s len:%i %%masked:%.3f" % (
            seqid, l, 100.0 * masked_seq.count(mask_value) / l)
        assert len(seq) == l
        out.write('>' + seqid + '\n')
        out.write(masked_seq + '\n')

    out.close()
    # write out a file .fasta.version containing
    # the svnversion (if available of this script
    # that was used to create the file.
    path = os.path.dirname(__file__)
    # NOTE(review): the command line is built by string interpolation;
    # paths containing shell metacharacters would be interpreted by the shell.
    os.system('svnversion %s > %s.version' % (path, outfile))
    h5.close()
Пример #30
0
def check_keyfn(path, klass, inplace):
    """Check the keys of a fasta opened with and without a key_fn.

    With a key_fn keeping only the first header token the keys must be
    a/b/c; without one they must be the full headers.  ``fix(path)`` is
    called after each check.
    """
    common = dict(record_class=klass, flatten_inplace=inplace)

    shortened = Fasta(path, key_fn=lambda header: header.split()[0], **common)
    assert sorted(shortened.keys()) == ['a', 'b', 'c'], shortened.keys()
    fix(path)

    plain = Fasta(path, **common)
    expected = ['a extra', 'b extra', 'c extra']
    assert sorted(plain.keys()) == expected, (plain.keys(), klass)
    fix(path)
Пример #31
0
def check_kmer_overlap(f):
    """Check ``Fasta.as_kmers`` offsets and overlaps on record 'chr2'.

    With k=10 and overlap=2 every k-mer but the last must be 10 long and
    start at a multiple of 8.  With overlap=4, consecutive k-mers of at
    least 4 characters must share their 4 boundary characters.
    """
    chr2 = f['chr2']

    stride = 10 - 2
    all_but_last = list(Fasta.as_kmers(chr2, 10, overlap=2))[:-1]
    for index, item in enumerate(all_but_last):
        assert (len(item[1]) == 10)
        assert (item[0] == (index * stride))

    seqs = [item[1] for item in Fasta.as_kmers(chr2, 10, overlap=4)]
    for left, right in zip(seqs[0:-1], seqs[1:]):
        if len(left) >= 4 and len(right) >= 4:
            assert (left[-4:] == right[:4])
Пример #32
0
def spgenome(fafile, outdir, maxsize=1000000000):
    """Split a fasta file into numbered part files inside ``outdir``.

    Records are visited in the order of ``Fasta.keys()``.  The running
    total of record lengths (including the current record) divided by
    ``maxsize`` selects the part file ``tmpfile<N>.fa`` that receives the
    record, written as a header line followed by the whole sequence.

    Returns the list of part file paths in creation order; the list is
    empty (and a message is printed) when ``fafile`` does not exist.
    """
    spfiles = list()
    if not path.exists(fafile):
        print("Can't find ", fafile)
        return spfiles

    infa = Fasta(fafile)
    subfiles = dict()
    nowlen = 0

    try:
        for chrom in infa.keys():
            record = infa[chrom]
            nowlen = nowlen + len(record)
            # Integer division keeps the bucket index exact for any size.
            nowsub = nowlen // maxsize

            if nowsub not in subfiles:
                subfile = path.join(outdir, 'tmpfile' + str(nowsub) + '.fa')
                spfiles.append(subfile)
                subfiles[nowsub] = open(subfile, 'w')

            print('>', chrom, sep='', file=subfiles[nowsub])
            print(record, file=subfiles[nowsub])
    finally:
        # Close every part file even when reading or writing failed.
        for handle in subfiles.values():
            handle.close()

    return spfiles
Пример #33
0
 def search(self, ref_base, pos, alt_base="X"):
     """Genotype the variant ``<ref_base><pos><alt_base>``.

     The probe set created for the variant is written to a temporary
     fasta file and loaded with ``Fasta``; records whose name contains
     "ref" are treated as reference alleles, all others as alternates.

     Returns ``{"query": variant name, "results": genotype_alleles(...)}``.
     """
     var_name = "".join((ref_base, str(pos), alt_base))
     probe_fasta = self.create_variant_probe_set(var_name=var_name)
     # NOTE(review): the temporary file is gone once the `with` block
     # ends, yet the records are read afterwards - confirm Fasta no longer
     # needs the original file at that point.
     with tempfile.NamedTemporaryFile() as fp:
         fp.write(probe_fasta)
         fp.seek(0)
         fasta = Fasta(fp.name)
     refs, alts = [], []
     for name, record in fasta.items():
         bucket = refs if "ref" in name else alts
         bucket.append(str(record))
     results = self.genotype_alleles(refs, alts)
     return {"query": var_name, "results": results}
Пример #34
0
def get_sequence_dict(file_path, upper=True):
    """
    Open a fasta file that has already been flattened and indexed.

    Asserts that the fasta itself and its ".gdx" and ".flat" companions
    exist. When upper is exactly True the records use UpperNpyFastaRecord,
    otherwise the default record class is used.
    """
    assert os.path.exists(file_path), ('Error: FASTA file {} does not exist'.format(file_path))
    companions = (
        (".gdx", "Error: gdx does not exist for this fasta. We need the fasta files to be "
                 "flattened in place prior to running the pipeline because of concurrency issues."),
        ('.flat', "Error: flat file does not exist for this fasta. We need the fasta files to be "
                  "flattened in place prior to running the pipeline because of concurrency issues."),
    )
    for suffix, message in companions:
        assert os.path.exists(file_path + suffix), message
    if upper is not True:
        return Fasta(file_path)
    return Fasta(file_path, record_class=UpperNpyFastaRecord)
def align_consensus(tmp_dir, gp, target_genome_fasta, ref_tx_fasta):
    """
    Align one reference transcript against the target genome with blat.

    The transcript named gp.name is written to a temporary FASTA, aligned with
    blat, and the resulting PSL is passed through simpleChain. The chained
    records are scored by evaluate_blat_results.

    Returns map(str, [gp.id, gp.name, best_cov, best_ident]).
    """
    transcripts = Fasta(ref_tx_fasta)
    genome = Fasta(target_genome_fasta)
    tmp_tgt, tmp_ref, tmp_psl = prepare_tmp_files(tmp_dir, gp, genome)
    fastaWrite(tmp_ref, gp.name, str(transcripts[gp.name]))
    blat_cmd = "blat {} {} -out=psl -noHead {}".format(tmp_tgt, tmp_ref, tmp_psl)
    system(blat_cmd)
    chain_cmd = "simpleChain -outPsl {} /dev/stdout".format(tmp_psl)
    # Drop the last element produced by splitting on newlines.
    psl_records = popenCatch(chain_cmd).split("\n")[:-1]
    best_cov, best_ident = evaluate_blat_results(psl_records)
    return map(str, [gp.id, gp.name, best_cov, best_ident])
    def test_find_closest_splice_acceptor_plus(self):
        """ Plus-strand junction whose closest annotated splice acceptor
            lies 17 bp upstream (dist == -17)."""

        # Reference junction annotation restricted to chr1
        donors, acceptors, sjDict = TC.processSpliceAnnotation(
            "input_files/test_junctions.txt", "scratch/test/TC_tmp/",
            {"chr1"})

        # Junction under test: intron 23071360-23072140 on the plus strand
        genome = Fasta("input_files/hg38_chr1.fa")
        junction = sj.SpliceJunction("test_read", 0, "chr1", 23071360,
                                     23072140, "+", genome, sjDict)

        closest_acceptor = TC.find_closest_bound(
            junction.get_splice_acceptor(), acceptors)
        assert closest_acceptor.start == 23072122
        assert closest_acceptor.end == 23072123
        assert closest_acceptor.dist == -17
Пример #37
0
    def test_primary_monoexon_read(self):
        """ A primary alignment must yield a transcript object, and the
            logInfo struct must record the mapping as primary with every
            correction counter left at "NA"."""

        with open("input_files/sams/perfectReferenceMatch_noIntrons.sam", 'r') as sam:
            sam_line = sam.readline().strip()

        genome = Fasta("input_files/hg38_chr1.fa")
        transcript, logInfo = TC.transcript_init(sam_line, genome, set())

        # Fields parsed from the SAM line
        expected_fields = (
            ("QNAME", "c21031/f2p3/3400"),
            ("FLAG", 0),
            ("CHROM", "chr1"),
            ("POS", 192575775),
            ("CIGAR", "155M"),
            ("MD", "MD:Z:155"),
        )
        for field, value in expected_fields:
            assert getattr(transcript, field) == value

        assert logInfo.Mapping == "primary"

        # None of the correction counters may have been filled in
        untouched_counters = (
            "corrected_deletions", "uncorrected_deletions", "variant_deletions",
            "corrected_insertions", "uncorrected_insertions", "variant_insertions",
            "corrected_mismatches", "uncorrected_mismatches",
            "corrected_NC_SJs", "uncorrected_NC_SJs",
        )
        for counter in untouched_counters:
            assert getattr(logInfo, counter) == "NA"
    def test_find_closest_splice_acceptor_minus(self):
        """ Minus-strand junction whose closest annotated splice acceptor
            is 1 bp away (dist == -1). As the original note says, dist is
            expressed relative to the genome, not to the direction of the
            transcript."""

        # Reference junction annotation restricted to chr1
        donors, acceptors, sjDict = TC.processSpliceAnnotation(
            "input_files/test_junctions.txt", "scratch/test/TC_tmp/",
            {"chr1"})

        # Junction under test: intron 22071331-22073331 on the minus strand
        genome = Fasta("input_files/hg38_chr1.fa")
        junction = sj.SpliceJunction("test_read", 0, "chr1", 22071331,
                                     22073331, "-", genome, sjDict)

        closest_acceptor = TC.find_closest_bound(
            junction.get_splice_acceptor(), acceptors)
        assert closest_acceptor.start == 22071329
        assert closest_acceptor.end == 22071330
        assert closest_acceptor.dist == -1
Пример #39
0
    def test_get_depth_info(self):
        """Check get_depth_info on chr0 with and without duplicate reads.

        The function is exercised four times: all reads and de-duplicated
        reads, each without targets and with a single (5, 15) target region.
        """
        ref_fasta = Fasta(fasta_dir + 'test/chr0.fa')
        chr0 = ref_fasta['chr0']
        confident_regions = Regions([(0,10000000)])

        reads = list(self.bam_in)
        r = get_depth_info(reads, "chr0", 0, len(chr0), None, confident_regions)
        (depth_df, summary_depth_info, confident_depth_info, target_info, target_cov) = r

        # Build a real list: reads_dd is passed to get_depth_info twice, and on
        # Python 3 a bare filter() object would already be exhausted the
        # second time around.
        reads_dd = [read for read in reads if not read.is_duplicate]
        r_dd = get_depth_info(reads_dd, "chr0", 0, len(chr0), None, confident_regions)
        (dd_depth_df, summary_depth_info_deduped, confident_depth_info, target_info, target_cov) = r_dd

        self.assertEqual(summary_depth_info, {0: 10, 1: 10, 2: 10, 3: 10})
        self.assertEqual(summary_depth_info_deduped, {0: 10, 1: 20, 2: 10})
        self.assertEqual(target_info, {})

        # Same reads, now restricted to one target region.
        r = get_depth_info(reads, "chr0", 0, len(chr0), Regions([(5, 15)]), confident_regions)
        (target_depth_df, summary_depth_info, confident_depth_info, target_info, target_cov) = r

        self.assertEqual(summary_depth_info, {2: 5, 3: 5})

        self.assertEqual(len(target_depth_df), 10)
        self.assertEqual(len(target_cov), 1)
        self.assertEqual(target_cov['mean'][0], 2.5)
        self.assertEqual(sum(target_depth_df.coverage), target_info['on_target_bases'])

        # De-duplicated reads, restricted to the same target region.
        r_dd = get_depth_info(reads_dd, "chr0", 0, len(chr0), Regions([(5, 15)]), confident_regions)
        (target_depth_df, summary_depth_info_deduped, confident_depth_info, target_info, target_cov) = r_dd

        self.assertEqual(summary_depth_info_deduped, {1: 5, 2: 5})
Пример #40
0
 def segments(self):
     '''
     Generator for Segments

     Walks the chromosomes of self.fasta in the order given by the first
     element of their index entry, optionally skipping ahead to
     self.start_chromosome, and yields one Segment per k-mer of
     self.segment_size. Positions start at self.start_location and advance
     by self.segment_size for every segment yielded.
     '''
     startchr = self.start_chromosome
     start = self.start_location
     chrs = [x[0] for x in sorted(self.fasta.index.items(), key=lambda a: a[1][0])]
     # "chrom" rather than "chr" so the builtin chr() is not shadowed.
     for chrom in chrs:
         segcount = 0
         if self.verbose:
             # Single-argument print(...) behaves the same on Python 2 and 3.
             print("Reading chr %s" % chrom)
         # Skip forward if a starting chr was defined
         if startchr is not None and startchr != chrom:
             continue
         else:
             # Start chromosome reached (or none requested): stop skipping.
             startchr = None

         # as_kmers yields pairs; element [1] is the k-mer sequence.
         for kmer in Fasta.as_kmers(self.fasta[chrom], self.segment_size):
             end = start + self.segment_size
             seg = Segment(start, end, kmer[1], chrom)
             segcount += 1
             if self.verbose and segcount % 1000 == 0:
                 print("Read %d segments" % segcount)
             yield seg
             start = end
Пример #41
0
    def segments(self):
        '''
        Generator for Segments

        Walks the chromosomes of self.fasta in the order given by the first
        element of their index entry, optionally skipping ahead to
        self.start_chromosome, and yields one Segment per k-mer of
        self.segment_size. Positions start at self.start_location and
        advance by self.segment_size for every segment yielded.
        '''
        startchr = self.start_chromosome
        start = self.start_location
        chrs = [
            x[0]
            for x in sorted(self.fasta.index.items(), key=lambda a: a[1][0])
        ]
        # "chrom" rather than "chr" so the builtin chr() is not shadowed.
        for chrom in chrs:
            segcount = 0
            if self.verbose:
                # Single-argument print(...) behaves the same on Python 2 and 3.
                print("Reading chr %s" % chrom)
            # Skip forward if a starting chr was defined
            if startchr is not None and startchr != chrom:
                continue
            else:
                # Start chromosome reached (or none requested): stop skipping.
                startchr = None

            # as_kmers yields pairs; element [1] is the k-mer sequence.
            for kmer in Fasta.as_kmers(self.fasta[chrom], self.segment_size):
                end = start + self.segment_size
                seg = Segment(start, end, kmer[1], chrom)
                segcount += 1
                if self.verbose and segcount % 1000 == 0:
                    print("Read %d segments" % segcount)
                yield seg
                start = end
    def test_find_closest_splice_donor_minus(self):
        """ Toy case with several donors and acceptors close together:
            TC must pick the reference donor closest to the supplied
            intron bound.

            An exact match exists for the donor, at 23071360 in 1-based
            coordinates (23071359 in 0-based), so dist must be 0."""

        # Reference junction annotation restricted to chr1
        donors, acceptors, sjDict = TC.processSpliceAnnotation(
            "input_files/test_junctions.txt", "scratch/test/TC_tmp/",
            {"chr1"})

        # Junction under test: intron 23070360-23071360 on the minus strand
        genome = Fasta("input_files/hg38_chr1.fa")
        junction = sj.SpliceJunction("test_read", 0, "chr1", 23070360,
                                     23071360, "-", genome, sjDict)

        closest_donor = TC.find_closest_bound(
            junction.get_splice_donor(), donors)
        assert closest_donor.start == 23071359
        assert closest_donor.end == 23071360
        assert closest_donor.dist == 0
Пример #43
0
    def test_fix_donor_case3(self):
        """ Toy transcript AAGGT|GAA whose splice motif is noncanonical
            but sits 2 bp away from a canonical splice donor.
            chr1: 23,071,357 - 23,072,126

            This is the so-called case #3.
        """

        # Reference annotation and genome
        donors, acceptors, sjDict = TC.processSpliceAnnotation(
            "input_files/test_junctions.txt", "scratch/test/TC_tmp/",
            {"chr1"})
        genome = Fasta("input_files/hg38_chr1.fa")

        # Transcript built directly from SAM fields
        transcript = t2.Transcript(
            ["test_read", "0", "chr1", "23071357", "255", "5M762N3M", "*",
             "0", "0", "AAGGTGAA", "*", "NM:i:0", "MD:Z:8"],
            genome, sjDict)
        jnNumber = 0
        maxDist = 5
        junction = transcript.spliceJunctions[jnNumber]
        donor = junction.bounds[0]

        # Shift the donor (left) side of the junction by -2
        corrected = TC.fix_one_side_of_junction(
            transcript.CHROM, transcript.POS, jnNumber, donor, -2, genome,
            transcript.SEQ, transcript.CIGAR)
        new_seq, new_cigar = corrected

        assert new_seq == "AAGGAA"
        assert new_cigar == "3M764N3M"
Пример #44
0
def test_classes():
    """Yield (check, args...) tuples for every record class.

    Each record class is exercised twice, with flatten_inplace True and then
    False. After the per-Fasta checks the Fasta object is deleted, the
    keyfn / reload / duplicate checks are yielded, and _cleanup() is called.
    """
    # Checks that take only the Fasta object, in the order they are yielded.
    fasta_only_checks = (
        check_contains, check_shape, check_bounds, check_tostring,
        check_kmers, check_kmer_overlap, check_slice_size, check_slice,
        check_full_slice, check_array_copy, check_array, check_one_based,
    )

    for inplace in (True, False):
        for klass in record_classes:
            f = Fasta('tests/data/three_chrs.fasta', record_class=klass, flatten_inplace=inplace)
            yield check_keys, f
            yield check_misc, f, klass
            for check in fasta_only_checks:
                yield check, f

            # Remember the path before the Fasta object is dropped.
            fasta_name = f.fasta_name
            del f

            yield check_keyfn, 'tests/data/key.fasta', klass, inplace
            yield check_reload, klass, fasta_name
            yield check_duplicates, klass, inplace

            _cleanup()
Пример #45
0
def count_freq(blast_file, fasta, org, count_subject=True):
    """Count per-base BLAST coverage from one large tab-delimited blast file.

    For every hit, the query interval (columns 7-8) is incremented in the
    per-chromosome array held in ``cache``; when ``count_subject`` is true the
    subject interval (columns 9-10) is incremented as well. At the end each
    cached array is written to the ``'c' + chromosome`` attribute of the node
    returned by ``get_node(org, 'w')``.

    Returns None. If ``get_node`` returns ``(None, None)`` nothing is counted.
    """
    h5, node = get_node(org, 'w')

    # use existing counts.
    if (h5, node) == (None, None): return
    try:
        f = Fasta(fasta)

        print("counting...")
        cache = {}
        # The context manager closes the blast file even if parsing fails.
        with open(blast_file) as blast_lines:
            for sline in blast_lines:
                line = sline.split("\t")
                qchr, schr = line[:2]

                qstart, qstop, sstart, sstop = map(int, line[6:10])

                if qchr not in cache:
                    update_cache(qchr, node, len(f[qchr]), h5, cache)
                    cache_clear(cache, node, qchr, schr)
                # convert to 0-based indexes:
                # 1 8 => 0 8, but range doesn't include the upper bound.
                cache[qchr][qstart - 1:qstop] += 1

                if count_subject:
                    # Subject coordinates may be reversed; normalise them.
                    if sstart > sstop: sstart, sstop = sstop, sstart
                    if schr not in cache:
                        update_cache(schr, node, len(f[schr]), h5, cache)
                        cache_clear(cache, node, qchr, schr)
                    # Count every subject hit, mirroring the query branch.
                    # Previously this ran only when schr had just been added
                    # to the cache, so later hits on it were dropped.
                    cache[schr][sstart - 1:sstop] += 1

        for achr in cache:
            getattr(node, 'c' + achr)[:] = cache[achr]
    finally:
        # Always release the h5 handle, including on errors.
        h5.close()
Пример #46
0
def check_kmers(f):
    """Check Fasta.as_kmers on chr2 (k=10) and chr3 (k=1).

    The k-mers must come back as (offset, sequence) pairs whose sequences
    concatenate to the full chromosome string.
    """
    # chr2, split into 10-mers
    chr2_seq = str(f['chr2'])
    chr2_kmers = list(Fasta.as_kmers(f['chr2'], 10))
    assert (len(chr2_kmers) == len(chr2_seq) / 10)
    assert (chr2_kmers[0] == (0, chr2_seq[:10]))
    assert ("".join(pair[1] for pair in chr2_kmers) == chr2_seq)
    # The last base of the last k-mer
    assert (chr2_kmers[-1][1][-1] == 'T')

    # chr3, split into single bases: the offset equals the list index
    chr3_seq = str(f['chr3'])
    chr3_kmers = list(Fasta.as_kmers(f['chr3'], 1))
    assert (chr3_kmers[2][0] == 2)
    assert ("".join(pair[1] for pair in chr3_kmers) == chr3_seq)
Пример #47
0
class Reference(object):
    """Genome FASTA wrapper keyed by the first word of each header."""

    def __init__(self, genome_fasta):
        # @see: https://pypi.python.org/pypi/pyfasta
        def first_token(key):
            # Use first value before whitespace as keys
            return key.split()[0]

        self.fasta = Fasta(genome_fasta, key_fn=first_token)

    def get_sequence_from_iv(self, iv):
        """Return the sequence for interval iv (chrom, start, end, strand).

        The interval is passed to Fasta.sequence with one_based=False.
        """
        feature_hash = {
            'chr': iv.chrom,
            'start': iv.start,
            'stop': iv.end,
            'strand': iv.strand,
        }
        return self.fasta.sequence(feature_hash, one_based=False)
Пример #48
0
def read_fasta(ref_files, fasta_header):
    """Return the first FASTA file that contains a given header.

    Args:
        ref_files (iterable of str): Paths to candidate fasta files, opened
            one after another with ``Fasta``.
        fasta_header (str): Header that must be among the file's keys.

    Returns:
        The ``Fasta`` object of the first file whose keys include
        ``fasta_header``, or None when no file matches.
    """
    for fasta_path in ref_files:
        candidate = Fasta(fasta_path)
        if fasta_header not in candidate.keys():
            continue
        return candidate
    return None
Пример #49
0
    def split_seqs(self, num_jobs, max_ref=5, max_qry=20):
        ''' splits reference and query fasta files into pieces for num_jobs jobs

        The number of reference pieces (at most max_ref) and query pieces
        (at most max_qry) is reduced so their product does not exceed
        num_jobs. The names and file paths of the non-empty pieces are
        stored in self.ref_names / self.ref_files and
        self.qry_names / self.qry_files.
        '''

        # load data into memory.
        r = Fasta(self.ref_fasta, record_class=MemoryRecord)
        q = Fasta(self.qry_fasta, record_class=MemoryRecord)

        # never ask for more pieces than there are sequences.
        max_ref = min(max_ref, len(r))
        if max_ref > num_jobs:
            max_ref = 1

        # query pieces are capped by both sequence count and job count.
        max_qry = min(max_qry, len(q), num_jobs)
        if (max_ref * max_qry) > num_jobs:
            max_qry = int(float(num_jobs) / float(max_ref))

        # number of reference sequences (currently unused).
        sc = len(r.keys())

        def piece_names(prefix, count):
            # names look like "<prefix>_<index>".
            return ["%s_%i" % (prefix, idx) for idx in range(count)]

        def piece_files(names):
            # one fasta file per piece inside the output directory.
            return ["%s/%s.fasta" % (self.out_dir, name) for name in names]

        ## reference ##
        self.ref_names = piece_names("ref", max_ref)
        self.ref_files = piece_files(self.ref_names)
        pyfasta.split_fasta.without_kmers(r, self.ref_files)
        self.ref_names, self.ref_files = self._no_empty(self.ref_names, self.ref_files)

        ## query ##
        self.qry_names = piece_names("qry", max_qry)
        self.qry_files = piece_files(self.qry_names)
        pyfasta.split_fasta.without_kmers(q, self.qry_files)
        self.qry_names, self.qry_files = self._no_empty(self.qry_names, self.qry_files)
Пример #50
0
def generate_corpusfile(fasta_fname, n, corpus_fname):
    '''
    Args:
        fasta_fname: path of the input FASTA file
        n: the number of chunks to split. In other words, "n" for "n-gram"
        corpus_fname: path of the output corpus file (overwritten)
    Description:
        Protvec uses word2vec inside, and it requires to load corpus file
        to generate corpus.

        Every record of the FASTA file is split with split_ngrams(seq, n);
        each resulting pattern is written as one space-joined line.
    '''
    # The context manager closes the corpus file even when reading the
    # FASTA or splitting a sequence raises.
    with open(corpus_fname, "w") as corpus:
        fasta = Fasta(fasta_fname)
        for record_id in tqdm(fasta.keys(), desc='corpus generation progress'):
            seq = str(fasta[record_id])
            for ngram_pattern in split_ngrams(seq, n):
                corpus.write(" ".join(ngram_pattern) + "\n")
Пример #51
0
def process_query():
    """Align the "Rattus" query against the library and print the top hits.

    Up to library_process_limit library records are submitted to a thread
    pool (one worker per CPU) running align(); the scores are collected in
    submission order. The 30 best scores are printed together with their
    percentage of the query's self-match score, followed by the performance
    timer values divided by the CPU count.
    """
    print('Reading sequence library and query sequence')
    library = Fasta(library_path)
    queries = Fasta(query_path)
    query_sequence = str(queries["Rattus"])

    print('Processing')
    progress = progressbar.ProgressBar(max_value=len(library.keys()))
    cpu_count = multiprocessing.cpu_count()

    results = []
    # The context manager shuts the pool down once all results are in,
    # instead of leaving the worker threads alive.
    with ThreadPoolExecutor(max_workers=cpu_count) as executor:
        tasks = []
        for record in list(library.keys())[:library_process_limit]:
            library_sequence = str(library[record])
            future = executor.submit(align, library_sequence, query_sequence)
            tasks.append(AlignmentTask(record, future))

        for i, task in enumerate(tasks):
            # Only the third element of align()'s result is used.
            _, _, score = task.future.result()
            results.append(AlignmentResult(title=task.record, score=score))
            progress.update(i)

    # Score of the query aligned against itself: the reference for "Match".
    etalone_score = sum(smatrix[(x, x)] for x in query_sequence)

    print("Done")
    print("Etalone score is %d" % etalone_score)
    print("Got %d results, here are top-30 among them:" % len(results))
    print("Score  | Match   | Record")

    for sequence in sorted(results, key=lambda x: x.score, reverse=True)[:30]:
        match = (sequence.score / etalone_score) * 100.0
        print("%6d | %5.3f%% | %s" % (sequence.score, match, sequence.title))

    timer = get_performance_timer()
    for elapsed in [timer.dotplot, timer.regions, timer.align]:
        print(elapsed / cpu_count)
Пример #52
0
def main(gff_file, fasta_file, parents, children):
    """Print one FASTA record per parent feature, built from its children.

    Args:
        gff_file: GFF annotation; a "<gff_file>.db" database is created
            with GFFutils if it does not exist yet.
        fasta_file: FASTA file the child sequences are extracted from.
        parents: comma-separated feature types to report.
        children: comma-separated child feature types to concatenate.

    For each parent, the sequences of its matching children are joined in
    order of increasing start (reversed for '-' strand parents) and printed
    under a ">id" header. Parents without matching children produce a
    warning on stderr.
    """
    db_file = gff_file + ".db"

    if not op.exists(db_file):
        GFFutils.create_gffdb(gff_file, db_file)

    f = Fasta(fasta_file)
    g = GFFutils.GFFDB(db_file)

    parent_types = set(parents.split(','))
    parents_iter = [g.features_of_type(x) for x in parent_types]
    parents_list = itertools.chain(*parents_iter)
    children_list = set(children.split(','))

    for feat in parents_list:

        child_seqs = []
        for c in g.children(feat.id, 1):

            if c.featuretype not in children_list: continue
            child = f.sequence(dict(chr=c.chrom, start=c.start, stop=c.stop,
                strand=c.strand))
            child_seqs.append((child, c))

        if not child_seqs:
            # sys.stderr.write works on both Python 2 and 3, unlike "print >>".
            sys.stderr.write("[warning] %s has no children with type %s\n"
                             % (feat.id, ','.join(children_list)))
            continue
        # sort children in incremental position
        child_seqs.sort(key=lambda x: x[1].start)
        # reverse children if negative strand
        if feat.strand=='-': child_seqs.reverse()
        feat_seq = ''.join(x[0] for x in child_seqs)

        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(">%s" % feat.id)
        print(feat_seq)
class Sequence():
    """Uniform access to sequence reads from several back ends.

    Depending on ``engine`` (and, for file-based engines, on
    ``kwargs['data_type']``) the constructor prepares:

    * ``self.readcount`` - number of reads reported by the back end
    * ``self.read``      - iterator over the reads
    * ``self.db_values`` - list of (index, name) pairs (file engines only)
    * ``self.fasta``     - the opened sequence container (file engines only)

    Engines: 'mysql' (needs ``cursor``), 'biopython', 'pyfasta' and 'twobit'
    (need ``input`` and ``data_type``). A non-matching combination leaves the
    object without those attributes; a missing ``data_type`` raises KeyError.
    """
    def __init__(self, engine='mysql', function = 'iterator', **kwargs):
        self.engine = engine
        if self.engine == 'mysql' and function == 'iterator':
            self.create_mysql_iterator(**kwargs)
        elif self.engine == 'biopython' and kwargs['data_type'] == 'fasta':
            self.create_biopython_iterator(**kwargs)
        elif self.engine == 'pyfasta' and kwargs['data_type'] == 'fasta':
            self.create_pyfasta_iterator(**kwargs)
        elif self.engine == 'twobit' and kwargs['data_type'] == 'twobit':
            self.create_twobit_iterator(**kwargs)

    def _index_reads(self, count):
        """Store ``count`` and pair each sorted name in self.fasta with an index."""
        self.readcount = count
        # list(...) keeps db_values reusable on Python 3, where zip() alone
        # is a one-shot iterator that self.read would exhaust.
        self.db_values = list(zip(range(count), sorted(self.fasta.keys())))
        self.read = iter(self.db_values)

    def create_mysql_iterator(self, **kwargs):
        """Run the read-selection query on kwargs['cursor'] and iterate its rows."""
        cur = kwargs['cursor']
        query = '''SELECT id, record FROM sequence WHERE n_count <= 2 AND 
                    trimmed_len > 40'''
        cur.execute(query)
        self.readcount = cur.rowcount
        self.read = iter(cur.fetchall())

    def create_biopython_iterator(self, **kwargs):
        """Index kwargs['input'] with Bio.SeqIO.index and iterate its names."""
        from Bio import SeqIO
        print("Generating BioPython sequence index.  This may take a moment....")
        self.fasta = SeqIO.index(kwargs['input'], kwargs['data_type'])
        self._index_reads(len(self.fasta))

    def create_twobit_iterator(self, **kwargs):
        """Open kwargs['input'] as a bx TwoBitFile and iterate its names."""
        import bx.seq.twobit
        # open(..., 'rb') replaces the Python 2-only file() builtin.
        # NOTE(review): the handle stays open; presumably TwoBitFile reads
        # from it later - confirm before closing it here.
        self.fasta = bx.seq.twobit.TwoBitFile(open(kwargs['input'], 'rb'))
        self._index_reads(self.fasta.seq_count)

    def create_pyfasta_iterator(self, **kwargs):
        """Open kwargs['input'] with pyfasta and iterate its names."""
        from pyfasta import Fasta
        print("Generating PyFasta sequence index.  This may take a moment....")
        self.fasta = Fasta(kwargs['input'])
        self._index_reads(len(self.fasta))

    def get_pyfasta_reads(self, **kwargs):
        """Open kwargs['input'] with pyfasta and record the read count only."""
        from pyfasta import Fasta
        self.fasta = Fasta(kwargs['input'])
        self.readcount = len(self.fasta)
Пример #54
0
def with_kmers(f, names, k, overlap):
    """
    split the sequences in Fasta object `f` into pieces of length `k`
    with the given `overlap`. The pieces are written as FASTA records,
    round-robin, to the files listed in `names`; each record header is
    produced by format_kmer(seqid, start0).
    """
    # Text mode: the records are written as str, which also works on Python 3.
    fhs = [open(name, 'w') for name in names]
    try:
        i = 0
        for seqid in f.keys():
            seq = f[seqid]
            for (start0, subseq) in Fasta.as_kmers(seq, k, overlap=overlap):
                # Rotate through the output files, one k-mer at a time.
                fh = fhs[i % len(fhs)]
                fh.write(">%s\n" % format_kmer(seqid, start0))
                fh.write(str(subseq) + "\n")
                i += 1
    finally:
        # Close every output file, including when a write fails.
        for fh in fhs:
            fh.close()
Пример #55
0
def main():
    """Write the sequences of a FASTA file as a blocked .phylip text file.

    Arguments come from make_parser(): ``fasta_file``, ``inplace`` (flatten
    the FASTA in place) and ``output_file``. When no output file is given,
    the input path with its extension replaced by ".phylip" is used.

    The output starts with " <sequence count> <length of first sequence>".
    The first block holds, per sequence, its name padded or cut to
    CHUNK_LENGTH and the first LINE_LENGTH columns in groups of
    CHUNK_LENGTH; following blocks hold the next LINE_LENGTH columns
    prefixed with PAD_STRING. Blocks are separated by an empty line.
    """
    # Local import: only needed to derive the default output name.
    import os

    args = make_parser()
    if args.inplace:
        f = Fasta(args.fasta_file, flatten_inplace=True)
    else:
        f = Fasta(args.fasta_file)

    if args.output_file is not None:
        output_path = args.output_file
    else:
        # splitext strips only the final extension, so "./dir.v2/x.fa" maps
        # to "./dir.v2/x.phylip"; split('.')[0] cut at the first dot of the
        # whole path.
        output_path = '{0}.phylip'.format(os.path.splitext(args.fasta_file)[0])

    # The context manager closes the output even if writing fails.
    with open(output_path, 'w') as output:
        sequence_count = len(f.keys())
        sequence_length = len(f[next(iter(f.keys()))])
        output.write(' {0} {1}\n'.format(sequence_count, sequence_length))

        # First block: sequence name followed by the first LINE_LENGTH columns.
        for key in f.keys():
            subseq = []
            for chunk in grouper(f[key][:LINE_LENGTH], CHUNK_LENGTH):
                subseq.append(''.join(item[0] for item in chunk))
            subseq = ' '.join(subseq)
            # Names are padded or truncated to exactly CHUNK_LENGTH characters.
            if len(key) < CHUNK_LENGTH:
                label = key.ljust(CHUNK_LENGTH)
            else:
                label = key[:CHUNK_LENGTH]
            output.write('{0} {1}\n'.format(label, subseq))

        sequence_length -= LINE_LENGTH
        start = LINE_LENGTH
        stop = LINE_LENGTH * 2
        output.write('\n')

        # Remaining blocks: PAD_STRING in place of the name.
        while sequence_length > 0:
            for key in f.keys():
                subseq = []
                for chunk in grouper(f[key][start:stop], CHUNK_LENGTH, ' '):
                    subseq.append(''.join(item[0] for item in chunk))
                subseq = ' '.join(subseq)
                output.write('{0} {1}\n'.format(PAD_STRING, subseq))
            sequence_length -= LINE_LENGTH
            start += LINE_LENGTH
            stop += LINE_LENGTH
            output.write('\n')
Пример #56
0
def align():
    """Align the first 10000 bases of each hg19 chromosome against YRI.

    Reads 'hg19.fa' and 'YRIref.fasta', globally aligns the leading 10000
    bases of every hg19 chromosome with the same-named YRI sequence using
    nw.global_align, and writes one comma-separated line per differing
    alignment column to 'hg19_YRI_diff.bed'. Progress and a few debugging
    values are printed to stdout.
    """
    hg19 = Fasta('hg19.fa')
    print(hg19.keys())

    hg19Chr = sorted(hg19.keys(), reverse=True)

    YRI = Fasta('YRIref.fasta')
    print(YRI.keys())
    YRIChr = sorted(YRI.keys())
    # The same two previews are printed twice, as in the original output.
    for _ in range(2):
        print(hg19[hg19Chr[0]][:20])
        print(YRI[YRIChr[0]][:20])

    # The context manager closes the output even if an alignment fails.
    with open('hg19_YRI_diff.bed', 'w') as fhout:
        header = 'chrom, chromStart, chromEnd, hg19, YRI \n'
        fhout.write(header)
        for each in hg19Chr:
            seq1 = hg19[each][:10000]
            seq2 = YRI[each][:10000]
            print('reached 1')
            # Two spaces: the Python 2 print statement inserted a separator
            # after the trailing space of the first item.
            print('doing alignment for  ' + str(each))
            alignment = nw.global_align(seq1, seq2, gap=-2, matrix=None, match=1, mismatch=-1)
            print('reached 2')

            # alignment[0] is the hg19 row, alignment[1] the YRI row. zip()
            # walks the columns both rows share, so a length mismatch cannot
            # raise IndexError as indexing up to the longer row could.
            for i, (hg19_base, yri_base) in enumerate(zip(alignment[0], alignment[1])):
                if hg19_base != yri_base:
                    outline = each + ',' + str(i) + ',' + str(i+1) + ',' + hg19_base + ',' + yri_base + '\n'
                    fhout.write(outline)
'''
Check whether every pair of sequences in an aligned FASTA file shares an overlapping region.
'''

__version__ = "1.0"

from pyfasta import Fasta
import argparse

# Command-line option handling
parser = argparse.ArgumentParser()
parser.add_argument("-i", "-in", "--input", metavar="filename", dest="input", type=str , help="fasta file to check")
parser.add_argument("-v", "--version", action='version', help="The version of this program.", version = "Version: " + __version__)
args = parser.parse_args()

f = Fasta(args.input)
loci = sorted(f.keys())
for locus1 in loci:
    for locus2 in loci:
        flag = 0
        sequence1 = f[locus1]
        sequence2 = f[locus2]
        i = 0
        while i < len(sequence1) and i < len(sequence2):
            base1 = sequence1[i]
            base2 = sequence2[i]
            if base1 != "-" and base2 != "-":
                flag = 1
                break
            i += 1
        if flag == 0: