Exemplos de Fasta em Python, exemplos de pyfasta.Fasta em Python

Exemplo n.º 1

0

Exibir arquivo

Arquivo: aa_annotate.py Projeto: MMesbahU/selectionTools

def aa_seq(options):
    """Return the ancestral sequence record from a Fasta file.

    ``options`` must provide ``ancestralfasta`` (path handed to ``Fasta``)
    and ``single_chromosome``.  When ``single_chromosome`` is false it must
    also provide ``header`` (a regex template) and ``chromosome``: every
    ``?`` in the template is replaced by the chromosome name and the result
    is matched against each FASTA header with ``re.match``.

    Returns the record stored under the selected header.

    Raises ``Exception`` when no header matches the expanded regex.
    """
    f = Fasta(options.ancestralfasta)
    keyz = list(f.keys())
    if options.single_chromosome:
        # Single chromosome fasta should only have one sequence;
        # that sequence should be the sequence of interest.
        key = keyz[0]
    else:
        header_regex = options.header.replace('?', options.chromosome)
        key = None
        for candidate in keyz:
            if re.match(header_regex, candidate) is not None:
                # The last matching header wins, as in the original loop.
                key = candidate
        if key is None:
            raise Exception("No match possible is something wrong with the"
                            " regex specified to the program as"
                            " --header-regex")
    # Look up the *matched* header; the previous code indexed with the loop
    # variable, i.e. whatever header happened to be iterated last.
    return f[key]

Exemplo n.º 2

0

Exibir arquivo

Arquivo: genome_nuc_freqs.py Projeto: speach/modmap

def calc_nuc_counts(fasta_filename, region_size_min,
                    region_size_max, verbose):
    '''Count every subsequence of each requested length in a FASTA file.

    For each sequence in ``fasta_filename`` and each start index, the slices
    of length ``region_size_min`` .. ``region_size_max`` (inclusive) are
    tallied.  Slices cut short by the end of the sequence are ignored.
    ``verbose`` is accepted but not used here.

    Returns: ``defaultdict`` mapping region size -> ``Counter`` keyed by the
    sliced subsequence.
    '''
    counts_by_size = defaultdict(Counter)
    region_sizes = range(region_size_min, region_size_max + 1)

    for _chrom, seq in Fasta(fasta_filename).items():
        for start, _base in enumerate(seq):
            for size in region_sizes:
                window = seq[start:start + size]
                # Only full-length windows are counted.
                if len(window) >= size:
                    counts_by_size[size][window] += 1

    return counts_by_size

Exemplo n.º 3

0

Exibir arquivo

Arquivo: nucmer.py Projeto: jim-bo/parabio

    def _no_empty(self, lista, listb):
        '''Drop list entries whose FASTA file is empty or cannot be opened.

        ``lista`` and ``listb`` are parallel lists; each path in ``listb`` is
        opened with ``Fasta`` and a pair is kept only when the file opens and
        has at least one key.  Failures are logged as warnings, not raised.

        Returns the filtered ``(lista, listb)`` pair, in the original order.
        '''
        tmpa = list()
        tmpb = list()
        for i, fasta_path in enumerate(listb):
            try:
                z = Fasta(fasta_path, record_class=MemoryRecord)

                # Skip files that parse but contain no records.
                if len(z.keys()) == 0:
                    continue

                tmpa.append(lista[i])
                tmpb.append(fasta_path)

            except Exception:
                # Best effort: an unreadable file is skipped.  ``Exception``
                # (not a bare except) lets KeyboardInterrupt/SystemExit
                # propagate.
                logging.warning("bad fasta file: %s", fasta_path)

        return tmpa, tmpb

Exemplo n.º 4

0

Exibir arquivo

Arquivo: 5primeCounter.py Projeto: ComputationalSystemsBiology/ExoProfiler

def parse_sequences(sites, size, fasta_file):
    """Attach the extended binding-site sequence to every region in ``sites``.

    Each element of ``sites`` is a dict with the keys ``chr``, ``strand``,
    ``ext_start`` and ``ext_end``.  The sequence for that interval is fetched
    from ``fasta_file``, upper-cased, reverse-complemented for ``'-'`` strand
    regions, and stored under the key ``ext_seq``.  ``size`` is accepted but
    not used by this function.

    Returns ``sites`` (the same iterable, its dicts updated in place).
    """
    from pyfasta import Fasta  # needed to fetch sequences from the genome fasta file

    print("INFO: Begin to fetch sequences....")

    f = Fasta(fasta_file, key_fn=lambda key: key.split()[0])

    for reg in sites:
        start = reg["ext_start"]
        end = reg["ext_end"]
        on_minus_strand = reg["strand"] == '-'

        # if motif on negative strand, shift region by +1 to account for
        # zero based half-open intervals
        if on_minus_strand:
            start += 1
            end += 1

        seq = f.sequence({"chr": reg["chr"], "start": start, "stop": end}, one_based=False)

        # Note (from the original author): passing 'strand' to f.sequence did
        # not work for them, so the reverse complement is done here instead.
        seq = seq.upper()
        if on_minus_strand:
            seq = reverse_complement(seq)

        # add sequence to region dict
        reg["ext_seq"] = seq

    print("INFO: Finished sequences.")
    # The original returned the undefined name ``regions`` (NameError).
    return sites

Exemplo n.º 5

0

Exibir arquivo

def read_fa(fa='/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/data/mm10/transcriptome/mm10_transcriptome.fa'):
	"""Open ``fa`` with ``Fasta``, keying records by the header text before the first tab.

	Prints the first three keys as a quick sanity check and returns the
	``Fasta`` object.  The ``gj`` calls bracket the work with the project's
	run/argument logging helpers.
	"""
	gj.printFuncRun('read_fa')
	gj.printFuncArgs()
	first_field = lambda key: key.split("\t")[0]
	fa_dict = Fasta(fa, key_fn=first_field)
	print(fa_dict.keys()[0:3])
	gj.printFuncRun('read_fa')
	return fa_dict

Exemplo n.º 6

0

Exibir arquivo

def removehost(fasta, bed):
    """Write a copy of ``fasta`` without the sequences named in ``bed``.

    Every line of ``bed`` (right-stripped) is treated as a sequence name to
    drop; names are compared against the FASTA keys as whole strings.  The
    surviving records are written to ``'removehost_' + fasta`` as a header
    line followed by the sequence on a single line.

    Returns ``None``.
    """
    with open(bed) as bedin:
        removeregion = set(line.rstrip() for line in bedin)

    fa = Fasta(fasta)

    outfile = 'removehost_' + fasta

    # ``with`` guarantees the output is flushed and closed even if reading a
    # record raises part-way through.
    with open(outfile, 'w') as outio:
        for seqname in fa.keys():
            if seqname in removeregion:
                continue
            outio.write('>' + seqname + '\n' + str(fa[seqname]) + '\n')

Exemplo n.º 7

0

Exibir arquivo

Arquivo: known_structure_set_comparison.py Projeto: Tsinghua-gongjing/Keth-seq

def read_score(score_tab, ref):
    """Build a per-position reactivity list for every sequence in a score table.

    ``ref`` is a FASTA file; its records are re-keyed by the header text
    before the first tab.  ``score_tab`` is a tab-separated file: blank lines
    and lines starting with ``@`` are skipped, as are rows whose second
    column is ``-``.  Column 1 is the sequence id, column 3 the position and
    column 8 the score.

    Returns a dict ``{seq_id: {'reactivity_ls': [...]}}`` with one entry per
    position ``1..len(sequence)``; positions without a score, or with the
    score ``'-1'``, are reported as ``'NULL'``.
    """
    fa_dict = {header.split('\t')[0]: record
               for header, record in Fasta(ref).items()}

    raw_scores = nested_dict()
    with open(score_tab, 'r') as handle:
        for raw_line in handle:
            row = raw_line.strip()
            if row == '' or row.startswith('@'):
                continue
            fields = row.split('\t')
            if fields[1] == '-':
                continue
            raw_scores[fields[0]][int(fields[2])] = fields[7]
    scores = raw_scores.to_dict()

    reactivity = nested_dict(2, list)
    for seq_id, by_position in scores.items():
        for p in xrange(1, len(fa_dict[seq_id]) + 1):
            has_score = p in by_position and by_position[p] != '-1'
            value = by_position[p] if has_score else 'NULL'
            reactivity[seq_id]['reactivity_ls'].append(value)

    return reactivity.to_dict()

Exemplo n.º 8

0

Exibir arquivo

Arquivo: design_primers.py Projeto: bopopescu/msatcommander-gs

 def create_pyfasta_iterator(self, **kwargs):
     """Index ``kwargs['input']`` with pyfasta and prepare an iterator over its keys.

     Sets ``self.fasta``, ``self.readcount`` (number of records),
     ``self.db_values`` (index paired with each key, keys in sorted order)
     and ``self.read`` (an iterator over ``self.db_values``).
     """
     from pyfasta import Fasta
     print("Generating PyFasta sequence index.  This may take a moment....")
     self.fasta = Fasta(kwargs['input'])
     self.readcount = len(self.fasta)
     positions = range(len(self.fasta))
     ordered_keys = sorted(self.fasta.keys())
     self.db_values = zip(positions, ordered_keys)
     self.read = iter(self.db_values)

Exemplo n.º 9

0

Exibir arquivo

def read_fa(
    fa='/Share/home/zhangqf7/gongjing/zebrafish/data/reference/transcriptome/danRer10.refSeq.transcriptome.fa'
):
    """Load ``fa`` into a plain dict keyed by the first whitespace-delimited header token.

    The file is opened with ``Fasta`` (keys cut at the first tab), then each
    record is sliced with ``[0:]`` and stored under the first token of its
    key.  The first three keys are printed before the dict is returned.
    """
    indexed = Fasta(fa, key_fn=lambda key: key.split("\t")[0])
    fa_dict = {}
    for header, record in indexed.items():
        fa_dict[header.split()[0]] = record[0:]
    print(fa_dict.keys()[0:3])
    return fa_dict

Exemplo n.º 10

0

Exibir arquivo

def aa_seq(options):
    """Return the ancestral sequence record from a Fasta file.

    ``options`` must provide ``ancestralfasta`` (path handed to ``Fasta``)
    and ``single_chromosome``.  When ``single_chromosome`` is false it must
    also provide ``header`` (a regex template) and ``chromosome``: every
    ``?`` in the template is replaced by the chromosome name and the result
    is matched against each FASTA header with ``re.match``.

    Returns the record stored under the selected header.

    Raises ``Exception`` when no header matches the expanded regex.
    """
    f = Fasta(options.ancestralfasta)
    keyz = list(f.keys())
    if options.single_chromosome:
        # Single chromosome fasta should only have one sequence;
        # that sequence should be the sequence of interest.
        key = keyz[0]
    else:
        header_regex = options.header.replace('?', options.chromosome)
        key = None
        for candidate in keyz:
            if re.match(header_regex, candidate) is not None:
                # The last matching header wins, as in the original loop.
                key = candidate
        if key is None:
            raise Exception("No match possible is something wrong with the"
                            " regex specified to the program as"
                            " --header-regex")
    # Look up the *matched* header; the previous code indexed with the loop
    # variable, i.e. whatever header happened to be iterated last.
    return f[key]

Exemplo n.º 11

0

Exibir arquivo

Arquivo: markup.py Projeto: biocad/au-summer-2013

    def run(self, filename):
        """Write region boundaries for every record of ``filename`` that can be marked up.

        For each FASTA key the V-gene name is taken as the text before the
        first underscore.  Records for which ``getVGeneRegions`` or
        ``findFR4`` returns ``None`` are skipped.  For the rest, one
        tab-separated line (name, ten V-gene region values, four further
        offsets) is written to ``self.result_kabat_file``.  Progress is
        printed every 1000 records and a summary at the end.
        """
        self.openOutFiles(filename)
        f = Fasta(filename)

        count = len(f)
        self.not_found_in_kabat = 0
        self.fr4_not_found = 0
        current = 0

        for current, name in enumerate(f.keys(), 1):
            if current % 1000 == 0:
                print("All %d. Current: %d" % (count, current))

            # name format: vName_jName{frameNumber} or
            # vName_dName{frameNumber}_jName{frameNumber}
            vGeneName = name.split("_")[0]

            vGeneRegions = self.getVGeneRegions(vGeneName)
            if vGeneRegions is None:
                continue

            last_region_end = vGeneRegions[self.kabat.regions_count * 2 - 1]
            withoutMarkup = f[name][last_region_end:]
            group = self.findFR4(name, withoutMarkup)
            if group is None:
                continue

            offsets = [1, group.start(), group.start() + 1, len(withoutMarkup)]
            shifted = tuple([vGeneRegions[9] + off for off in offsets])

            self.result_kabat_file.write(name)
            self.result_kabat_file.write(("\t%d" * 10) % tuple(vGeneRegions))
            self.result_kabat_file.write(("\t%d" * 4 + "\n") % shifted)

        self.closeOutFiles()
        summary = "all: {}; not in kabat: {}; without fr4: {}".format(
            current, self.not_found_in_kabat, self.fr4_not_found)
        print(summary)

Exemplo n.º 12

0

Exibir arquivo

def genome_contenct_stats(fasta_path):
    """Report the total number of ``CACGTG`` matches in a FASTA file.

    Every sequence in ``fasta_path`` is searched case-insensitively
    (non-overlapping matches, as ``re.findall`` reports them) and the grand
    total is written to stderr.  Returns ``None``.
    """
    f = Fasta(fasta_path)
    total = 0
    for seqid in f.keys():
        hits = re.findall('CACGTG', f[seqid][:], flags=re.IGNORECASE)
        total += len(hits)
    sys.stderr.write("total gboxes:{0}".format(total) + "\n")

Exemplo n.º 13

0

Exibir arquivo

Arquivo: atcg_stats.py Projeto: Tsinghua-gongjing/test

def read_fa(fa='/Share/home/zhangqf7/gongjing/zebrafish/data/reference/transcriptome/danRer10.refSeq.transcriptome.fa'):
	"""Load ``fa`` into a plain dict keyed by the first whitespace-delimited header token.

	The file is opened with ``Fasta`` (keys cut at the first tab), then each
	record is sliced with ``[0:]`` and stored under the first token of its
	key.  The first three keys are printed before the dict is returned; the
	``gj`` calls bracket the work with the project's logging helpers.
	"""
	gj.printFuncRun('read_fa')
	gj.printFuncArgs()
	fa_dict1 = Fasta(fa, key_fn=lambda key: key.split("\t")[0])
	fa_dict = {}
	for header, record in fa_dict1.items():
		fa_dict[header.split()[0]] = record[0:]
	print(fa_dict.keys()[0:3])
	gj.printFuncRun('read_fa')
	return fa_dict

Exemplo n.º 14

0

Exibir arquivo

Arquivo: splicer.py Projeto: henrikstranneheim/VariantUtilities

def create_fasta_flat_file(file):
    """Open a fasta file for fast sequence retrieval.

    Records are keyed by the first whitespace-delimited token of their
    header.  Returns ``(fasta, headers)`` where ``fasta`` is the ``Fasta``
    object and ``headers`` is the set of its keys.
    """
    indexed = Fasta(file, key_fn=lambda key: key.split()[0])
    header_set = set(indexed.keys())
    return indexed, header_set

Exemplo n.º 15

0

Exibir arquivo

Arquivo: genome_stats.py Projeto: gturco/random_bio_tools

def genome_contenct_stats(fasta_path):
    """Report the total number of ``CACGTG`` matches in a FASTA file.

    Every sequence in ``fasta_path`` is searched case-insensitively
    (non-overlapping matches, as ``re.findall`` reports them) and the grand
    total is written to stderr.  Returns ``None``.
    """
    f = Fasta(fasta_path)
    per_sequence = [
        len(re.findall("CACGTG", f[seqid][:], flags=re.IGNORECASE))
        for seqid in f.keys()
    ]
    message = "total gboxes:{0}".format(sum(per_sequence))
    sys.stderr.write(message + "\n")

Exemplo n.º 16

0

Exibir arquivo

Arquivo: getSeqFromFasta.py Projeto: king21guns/Chiapet-chipseq-project

def getSequence(genome):
    """Fetch the sequence of every peak in ``peak.csv`` and save the result.

    Reads ``../data/input_data/peak.csv`` (columns ``chrom``, ``start``,
    ``end``), looks each interval up in the FASTA file ``genome``, stores the
    one-element lists in a ``seq`` column, post-processes that column row by
    row with ``fuc`` and writes ``../data/input_data/RAD_seq.csv``.
    """
    genome = Fasta(genome)
    RAD_seq = pd.read_csv('../data/input_data/peak.csv')
    result = []
    for i in range(len(RAD_seq)):
        region = {'chr': RAD_seq['chrom'][i],
                  'start': RAD_seq['start'][i],
                  'stop': RAD_seq['end'][i]}
        # Each entry is wrapped in a single-element list, as before.
        result.append([genome.sequence(region)])
    RAD_seq['seq'] = result
    RAD_seq['seq'] = RAD_seq.apply(fuc, axis=1)
    RAD_seq.to_csv('../data/input_data/RAD_seq.csv', index=False)
    print('getSequence is over,RAD_seq.csv is bulit!')

Exemplo n.º 17

0

Exibir arquivo

Arquivo: test_all.py Projeto: brentp/pyfasta

def check_keyfn2(path, klass, inplace):
    """Check that a ``key_fn`` joining header words with ``-`` is applied to every key.

    Opens ``path`` with the given record class and ``flatten_inplace``
    setting, asserts the resulting keys and that the ``a-extra`` record is
    truthy, then calls ``fix(path)``.
    """
    dash_join = lambda key: "-".join(key.split())
    f = Fasta(path, record_class=klass, flatten_inplace=inplace,
              key_fn=dash_join)

    expected_keys = ['a-extra', 'b-extra', 'c-extra']
    assert sorted(f.keys()) == expected_keys, f.keys()

    assert f['a-extra']
    fix(path)

Exemplo n.º 18

0

Exibir arquivo

Arquivo: aln_to_mat.py Projeto: tzeitim/ogtk

class Alg:
    """Per-column base counts and a numeric encoding of a FASTA alignment.

    On construction the alignment is opened with ``Fasta``, base frequencies
    are accumulated per column into ``self.pos`` (a list of ``Pos`` objects)
    and immediately written to ``freqfn``.
    """

    def __init__(self, fastafn, freqfn, colorfn):
        """Load ``fastafn``, tally column frequencies and write them to ``freqfn``.

        ``colorfn`` is remembered as the output path used by ``do_plot``.
        """
        self.pos = []
        self.init = False
        self.size = 0
        self.fasta = Fasta(fastafn)
        self.colorfn = colorfn
        # Symbol -> integer code used by ``seqtocol``.
        self.conta = {'n':0, '-':0, 'a':1, 'c':2, 'g':3, 't':4, '\n':'\n'}

        self.read_fasta(fastafn)
        self.write_freqs(freqfn)

    def do_plot(self, plot, names = False):
        """Write the encoded alignment to ``self.colorfn``.

        Returns the encoded matrix when ``plot`` is truthy, otherwise
        ``None`` (the file is written either way).
        """
        msa = self.seqtocol(self.colorfn, names= names)
        if plot:
            return(msa)
        return None

    def read_fasta(self, fastafn):
        """Accumulate lower-cased base counts per alignment column.

        ``fastafn`` is unused; records come from ``self.fasta``.  The column
        count is taken from the first record seen.
        """
        for entry in self.fasta.keys():
            seq = self.fasta[entry][:]
            if not self.init:
                # this assumes that all the entries in the fasta record are the same size.
                # this is the default setting for clustalo
                # TODO add an assertion to verify so
                self.size = len(seq)
                for column in range(0, self.size):
                    self.pos.append(Pos(column))
                self.init = True

            for column in range(0, self.size):
                self.pos[column].freq[seq[column].lower()] += 1

    def seqtocol(self, outfn, names=False):
        """Encode every sequence with ``self.conta`` and write it to ``outfn``.

        One line per record is written: the key followed by the
        comma-separated codes.  Returns a numpy array of the codes; when
        ``names`` is true each record's key is included before its codes and
        the array is reshaped to ``(1 + seq_len, n_records)``, otherwise to
        ``(n_records, seq_len)``.  The length of the last record is used for
        ``seq_len``.
        """
        colors = []
        n_records = 0
        seq_len = 0
        # ``with`` closes the file even if an unknown symbol raises KeyError.
        with open(outfn, 'w') as outf:
            for entry in self.fasta.keys():
                # Distinct loop names: the old comprehension reused ``i`` and
                # clobbered the record index under Python 2.
                codes = [self.conta[base.lower()] for base in self.fasta[entry][:]]
                outf.write(entry + ',' + ','.join(str(code) for code in codes) + '\n')
                if names:
                    colors.append(entry)
                colors.extend(codes)
                n_records += 1
                seq_len = len(codes)

        # TODO this is very weird, check why one option returns the transpose
        if names:
            colors = np.array(colors).reshape(1 + seq_len, n_records)
        else:
            colors = np.array(colors).reshape(n_records, seq_len)
        return(colors)

    def write_freqs(self, outfn):
        """Write a tab-separated table of a/c/t/g counts, one row per column."""
        bases = ['a', 'c', 't', 'g']
        with open(outfn, 'w') as outf:
            outf.write('\t'.join(bases) + '\n')
            for j in self.pos:
                outf.write('\t'.join(str(j.freq[base]) for base in bases) + '\n')

Exemplo n.º 19

0

Exibir arquivo

Arquivo: test_all.py Projeto: Z19900505/Python.learn

def check_keyfn2(path, klass, inplace):
    """Check that a ``key_fn`` joining header words with ``-`` is applied to every key.

    Opens ``path`` with the given record class and ``flatten_inplace``
    setting, asserts the resulting keys and that the ``a-extra`` record is
    truthy, then calls ``fix(path)``.
    """
    options = dict(record_class=klass, flatten_inplace=inplace,
                   key_fn=lambda key: "-".join(key.split()))
    f = Fasta(path, **options)

    wanted = ['a-extra', 'b-extra', 'c-extra']
    assert sorted(f.keys()) == wanted, f.keys()

    assert f['a-extra']
    fix(path)

Exemplo n.º 20

0

Exibir arquivo

Arquivo: aln_to_mat.py Projeto: tzeitim/ogtk

    def __init__(self, fastafn, freqfn, colorfn):
        """Open ``fastafn``, tally its columns and write the frequencies to ``freqfn``.

        ``colorfn`` is stored for later use.  ``self.pos``, ``self.init`` and
        ``self.size`` start empty/false/zero before ``read_fasta`` runs.
        """
        self.size = 0
        self.init = False
        self.pos = []
        self.fasta = Fasta(fastafn)
        # Symbol -> code lookup table.
        self.conta = {
            'n': 0, '-': 0,
            'a': 1, 'c': 2, 'g': 3, 't': 4,
            '\n': '\n',
        }
        self.colorfn = colorfn

        self.read_fasta(fastafn)
        self.write_freqs(freqfn)

Exemplo n.º 21

0

Exibir arquivo

Arquivo: split_fasta.py Projeto: brentp/pyfasta

def split(args):
    """Entry point for ``pyfasta split``: parse ``args`` and dispatch.

    Prints the help text and exits when no fasta file is given or when
    neither ``-n`` nor ``--header`` is supplied.  With ``--header`` one output
    name per sequence id is built from the template and handed to
    ``with_header_names``.  Otherwise output names come from ``newnames`` and
    the work is done by ``without_kmers`` (``-k`` left at -1) or
    ``with_kmers``.
    """
    parser = optparse.OptionParser("""\
   split a fasta file into separated files.
        pyfasta split -n 6 [-k 5000 ] some.fasta
    the output will be some.0.fasta, some.1.fasta ... some.6.fasta
    the sizes will be as even as reasonable.
   """)
    parser.add_option(
        "--header", dest="header", metavar="FILENAME_FMT", default=None,
        help="""this overrides all other options. if specified, it will
               split the file into a separate file for each header. it
               will be a template specifying the file name for each new file.
               e.g.:    "%(fasta)s.%(seqid)s.fasta"
               where 'fasta' is the basename of the input fasta file and seqid
               is the header of each entry in the fasta file.""")
    parser.add_option(
        "-n", "--n", type="int", dest="nsplits",
        help="number of new files to create")
    parser.add_option(
        "-o", "--overlap", type="int", dest="overlap", default=0,
        help="overlap in basepairs")
    parser.add_option(
        "-k", "--kmers", type="int", dest="kmers", default=-1,
        help="""\
    split big files into pieces of this size in basepairs. default
    default of -1 means do not split the sequence up into k-mers, just
    split based on the headers. a reasonable value would be 10Kbp""")

    options, fasta = parser.parse_args(args)
    if not fasta or not (options.nsplits or options.header):
        sys.exit(parser.print_help())

    if isinstance(fasta, (tuple, list)):
        assert len(fasta) == 1, fasta
        fasta = fasta[0]

    kmer = None if options.kmers == -1 else options.kmers
    overlap = None if options.overlap == 0 else options.overlap
    f = Fasta(fasta)

    if options.header:
        # One output file name per sequence id, built from the template.
        names = {
            seqid: options.header % dict(fasta=f.fasta_name, seqid=seqid)
            for seqid in f.iterkeys()
        }
        return with_header_names(f, names)

    names = newnames(fasta, options.nsplits, kmers=kmer, overlap=overlap,
                     header=options.header)

    if options.kmers == -1:
        return without_kmers(f, names)
    return with_kmers(f, names, options.kmers, options.overlap)

Exemplo n.º 22

0

Exibir arquivo

Arquivo: cns_to_bed.py Projeto: gturco/random_bio_tools

def mask_to_bed(fasta_file, mask_bed_name):
    """Create a bed file of the starts and stops of masked sequence runs.

    Every run of one or more ``X`` characters in each sequence of
    ``fasta_file`` becomes one tab-separated line in ``mask_bed_name``:
    sequence id, run start, run end, a ``mask_id N`` label, run length, fixed
    filler columns, and run length + 1.  The counter starts at 1 and is
    incremented before use, so the first label written is ``mask_id 2``.

    Returns ``None``.
    """
    f = Fasta(fasta_file)
    mask_id = 1
    # ``with`` closes (and flushes) the output; the handle was previously
    # never closed.  The file is opened in binary mode as before.
    # NOTE(review): under Python 3, writing ``str`` to a binary handle raises
    # TypeError - confirm the intended interpreter.
    with open(mask_bed_name, "wb") as mask_bed:
        for seqid in f.keys():
            seq = f[seqid][:]
            for m in re.finditer("X+", seq):
                mask_id = mask_id + 1
                start, end = m.start(), m.end()
                w = '{0}\t{1}\t{2}\t{3}\t{4}\t+\t.\t.\t.\t1\t{5}\t0\n'.format(
                    seqid, start, end, "mask_id {0}".format(mask_id),
                    (end - start), (end - start + 1))
                mask_bed.write(w)

Exemplo n.º 23

0

Exibir arquivo

def write_c2t(fasta_name, unconverted, colorspace=False):
    """
    given a fasta file, write a new file:
        `some.fr.c2t.fasta` which contains:
          + the same headers prefixed with 'f' with all C's converted to T
          + headers prefixed with 'r' reverse complemented with
                                 all C's converted to T.

    if unconverted is true, then also save a file (`some.fr.fasta`) with the
    forward and reverse sequences without conversion.

    Both files live in a `bowtie_index` directory (`bowtie_index_colorspace`
    when `colorspace` is true) next to `fasta_name`; it is created if missing.
    If the requested output(s) already exist nothing is rewritten.

    Returns `(c2t_path, unconverted_path)`; the second path is returned even
    when that file was not requested.  On any failure while writing, the
    files opened by this call are removed and the exception is re-raised.
    """
    d = op.join(op.dirname(fasta_name), "bowtie_index")
    if colorspace:
        d += "_colorspace"
    if not op.exists(d):
        os.mkdir(d)

    p, ext = op.splitext(op.basename(fasta_name))  # some.fasta -> some, .fasta
    fname = "%s/%s.fr.c2t%s" % (d, p, ext)
    # no conversion, just copy the file into the index dir.
    unconverted_fname = "%s/%s.fr%s" % (d, p, ext)
    if op.exists(fname) and (not unconverted or op.exists(unconverted_fname)):
        return fname, unconverted_fname

    fasta = Fasta(fasta_name)

    c2t_fh = open(fname, 'w')
    unc_fh = open(unconverted_fname, 'w') if unconverted else None

    sys.stderr.write("writing forward and reverse c2t to: %s\n" % (fname, ))

    try:
        try:
            for header in fasta.iterkeys():
                seq = str(fasta[header]).upper()
                assert not ">" in seq
                # c2t, prefix header with f and write
                c2t_fh.write(">f%s\n" % header)
                c2t_fh.write(seq.replace('C', 'T') + "\n")
                # then r-c, c2t, prefix header with r and write
                c2t_fh.write(">r%s\n" % header)
                rseq = revcomp(seq)
                c2t_fh.write(rseq.replace('C', 'T') + "\n")
                if unc_fh is not None:
                    unc_fh.write(">f%s\n%s\n" % (header, seq))
                    unc_fh.write(">r%s\n%s\n" % (header, rseq))
        finally:
            # Close both handles on success and on failure; the unconverted
            # handle used to be left open.
            c2t_fh.close()
            if unc_fh is not None:
                unc_fh.close()
    except BaseException:
        # Remove partial output, then re-raise.  Only unlink the unconverted
        # file if this call created it; unlinking a missing file would mask
        # the original error.
        os.unlink(fname)
        if unc_fh is not None:
            os.unlink(unconverted_fname)
        raise

    return fname, unconverted_fname

Exemplo n.º 24

0

Exibir arquivo

Arquivo: __init__.py Projeto: BioinformaticsArchive/methylcode

def write_c2t(fasta_name, unconverted, colorspace=False):
    """
    given a fasta file, write a new file:
        `some.fr.c2t.fasta` which contains:
          + the same headers prefixed with 'f' with all C's converted to T
          + headers prefixed with 'r' reverse complemented with
                                 all C's converted to T.

    if unconverted is true, then also save a file (`some.fr.fasta`) with the
    forward and reverse sequences without conversion.

    Both files live in a `bowtie_index` directory (`bowtie_index_colorspace`
    when `colorspace` is true) next to `fasta_name`; it is created if missing.
    If the requested output(s) already exist nothing is rewritten.

    Returns `(c2t_path, unconverted_path)`; the second path is returned even
    when that file was not requested.  On any failure while writing, the
    files opened by this call are removed and the exception is re-raised.
    """
    d = op.join(op.dirname(fasta_name), "bowtie_index")
    if colorspace:
        d += "_colorspace"
    if not op.exists(d):
        os.mkdir(d)

    p, ext = op.splitext(op.basename(fasta_name))  # some.fasta -> some, .fasta
    fname = "%s/%s.fr.c2t%s" % (d, p, ext)
    # no conversion, just copy the file into the index dir.
    unconverted_fname = "%s/%s.fr%s" % (d, p, ext)
    if op.exists(fname) and (not unconverted or op.exists(unconverted_fname)):
        return fname, unconverted_fname

    fasta = Fasta(fasta_name)

    c2t_fh = open(fname, 'w')
    unc_fh = open(unconverted_fname, 'w') if unconverted else None

    sys.stderr.write("writing forward and reverse c2t to: %s\n" % (fname,))

    try:
        try:
            for header in fasta.iterkeys():
                seq = str(fasta[header]).upper()
                assert not ">" in seq
                # c2t, prefix header with f and write
                c2t_fh.write(">f%s\n" % header)
                c2t_fh.write(seq.replace('C', 'T') + "\n")
                # then r-c, c2t, prefix header with r and write
                c2t_fh.write(">r%s\n" % header)
                rseq = revcomp(seq)
                c2t_fh.write(rseq.replace('C', 'T') + "\n")
                if unc_fh is not None:
                    unc_fh.write(">f%s\n%s\n" % (header, seq))
                    unc_fh.write(">r%s\n%s\n" % (header, rseq))
        finally:
            # Close both handles on success and on failure; the unconverted
            # handle used to be left open.
            c2t_fh.close()
            if unc_fh is not None:
                unc_fh.close()
    except BaseException:
        # Remove partial output, then re-raise.  Only unlink the unconverted
        # file if this call created it; unlinking a missing file would mask
        # the original error.
        os.unlink(fname)
        if unc_fh is not None:
            os.unlink(unconverted_fname)
        raise

    return fname, unconverted_fname

Exemplo n.º 25

0

Exibir arquivo

Arquivo: hg38_hunam_genome.py Projeto: earlham-sherlock/earlham-sherlock.github.io

def cut_up_genome(input_files_list, output_folder, region_length):
    """Cut every chromosome of each FASTA file into fixed-length regions.

    For each file in ``input_files_list`` the keys are visited in sorted
    order; the sequence is sliced into consecutive pieces of
    ``region_length`` (the last piece may be shorter) and passed to
    ``write_to_json`` together with the path ``<output_folder>/chr=<key>``.
    A completion message is printed per chromosome.
    """
    for fasta_path in input_files_list:
        genome = Fasta(fasta_path)
        for chromosome in sorted(genome.keys()):
            sequence = genome[chromosome]
            regions = []
            for offset in range(0, len(sequence), region_length):
                regions.append(sequence[offset:offset + region_length])
            path = os.path.join(output_folder, f'chr={chromosome}')
            write_to_json(path, regions, region_length)
            print(f'{chromosome} is complete!')

Exemplo n.º 26

0

Exibir arquivo

def mask_to_bed(fasta_file, mask_bed_name):
    """Create a bed file of the starts and stops of masked sequence runs.

    Every run of one or more ``X`` characters in each sequence of
    ``fasta_file`` becomes one tab-separated line in ``mask_bed_name``:
    sequence id, run start, run end, a ``mask_id N`` label, run length, fixed
    filler columns, and run length + 1.  The counter starts at 1 and is
    incremented before use, so the first label written is ``mask_id 2``.

    Returns ``None``.
    """
    f = Fasta(fasta_file)
    mask_id = 1
    # ``with`` closes (and flushes) the output; the handle was previously
    # never closed.  The file is opened in binary mode as before.
    # NOTE(review): under Python 3, writing ``str`` to a binary handle raises
    # TypeError - confirm the intended interpreter.
    with open(mask_bed_name, "wb") as mask_bed:
        for seqid in f.keys():
            seq = f[seqid][:]
            for m in re.finditer("X+", seq):
                mask_id = mask_id + 1
                start, end = m.start(), m.end()
                w = '{0}\t{1}\t{2}\t{3}\t{4}\t+\t.\t.\t.\t1\t{5}\t0\n'.format(
                    seqid, start, end, "mask_id {0}".format(mask_id),
                    (end - start), (end - start + 1))
                mask_bed.write(w)

Exemplo n.º 27

0

Exibir arquivo

Arquivo: mask_genome.py Projeto: gturco/find_cns

def mask(fasta_file, org, cutoff, mask_value='X'):
    """Write a copy of ``fasta_file`` with over-represented positions masked.

    ``get_node(org, 'r')`` supplies a handle and a node holding, per sequence
    id, an entry named ``'c' + seqid``.  Wherever that entry exceeds
    ``cutoff`` the base is replaced by ``mask_value``; with
    ``mask_value='soft'`` (any case) the lower-cased base is used instead and
    the unmasked bases are upper-cased.  Sequences with no such entry are
    written without masking.  Output goes to ``<stem>.masked.<cutoff><ext>``
    and the output of ``svnversion`` is redirected to ``<outfile>.version``.
    """
    h5, node = get_node(org, 'r')

    # Insert ".masked.<cutoff>" in front of the last "." of the file name.
    outfile = fasta_file[:fasta_file.rfind(".")] + (".masked.%i" % cutoff) \
                         + fasta_file[fasta_file.rfind("."):]

    print "> masking sequence to file:", outfile
    out = open(outfile ,'w')

    fasta = Fasta(fasta_file)

    # Computed once, before the loop: ``mask_value`` is rebound inside the
    # loop in soft mode, so it must not be re-tested there.
    soft_mask = mask_value.lower() == 'soft'
    for seqid in sorted(fasta.iterkeys()):
        masked = 0  # not read anywhere in this function
        if soft_mask:
            seq = str(fasta[seqid])
            # mask is the lowercase sequence.
            mask_value = np.array(seq.lower(), dtype='c')
            seq = np.array(seq.upper(), dtype='c')
        else:
            # presumably makes slicing return an array instead of a str, so
            # that ``seq.tostring()`` below works - TODO confirm against pyfasta
            fasta[seqid].tostring = False
            seq = fasta[seqid][:] # the whole sequence


        if not 'c' + seqid in node:
            print >>sys.stderr, seqid,\
                '! not found in masked, writing unchanged\n' \
                '  this means that no section of this sequence appeared\n' \
                '  more than %i times' % cutoff
            out.write('>' + seqid + '\n')
            out.write(seq.tostring() + '\n')
            continue

        # numexpr resolves ``hit_counts`` from this frame by name; renaming
        # the local would break the expression string below.
        hit_counts = getattr(node, 'c' + seqid)[:]
        masked_seq = np.where(numexpr.evaluate("hit_counts > %i" % cutoff)
                              , mask_value, seq).tostring()

        l = len(masked_seq)
        # NOTE(review): in soft mode ``mask_value`` is an array here, so this
        # count is not a count of a single mask character - verify the
        # reported percentage.
        print >>sys.stderr, "! seq:%s len:%i %%masked:%.3f" % (seqid, l,
                                   100.0 * masked_seq.count(mask_value) / l)
        assert len(seq) == l
        out.write('>' + seqid + '\n')
        out.write(masked_seq + '\n')

    out.close()
    # write out a file <outfile>.version containing the svnversion
    # (if available) of this script's directory, i.e. the version
    # that was used to create the file.
    # The command line is built by plain string interpolation and run
    # through the shell.
    path = os.path.dirname(__file__)
    os.system('svnversion %s > %s.version' % (path, outfile))
    h5.close()

Exemplo n.º 28

0

Exibir arquivo

Arquivo: arabidopsis_rna.py Projeto: gturco/find_cns

def main(gff_file, outdir,
         fasta_path="/home/gturco/src/find_cns_gturco/pipeline/data/arabidopsis.fasta"):
    """Extract non-CDS features from a GFF file and write their sequences.

    Lines of ``gff_file`` are grouped by the ``parent=`` value found in their
    last column (case-insensitive, up to the first ``.`` or ``;``).  Groups
    containing any ``CDS`` line are discarded.  From the remaining groups,
    features on ``ChrC``/``ChrM``, ``exon`` features and repeated
    (seqid, start, end) triples are dropped; the rest are written to
    ``<outdir>/at_non_cds.gff`` with the ``CHR`` prefix removed from the
    upper-cased first column and the last column replaced by the group name.

    That file is re-read with ``read_gff`` and each feature's sequence is
    fetched from ``fasta_path`` and written to ``<outdir>/at_rnas.fasta``.

    ``fasta_path`` defaults to the location that used to be hard-coded, so
    existing two-argument calls behave as before.
    """
    name = re.compile("parent=([^.;]+)", re.I)

    feats = {}
    non_cds_feats = collections.defaultdict(list)
    # ``with`` closes the input, which was previously left open.
    with open(gff_file) as gff_handle:
        for line in gff_handle:
            line = line.split("\t")
            match = name.search(line[-1])
            if not match:
                continue
            fname = match.group(1)
            non_cds_feats[fname].append(line)
            if line[2].upper() == "CDS":
                feats[fname] = True
                continue
            if fname in feats:
                continue
            feats[fname] = None

    # Any group that saw a CDS line is not "non-CDS": drop it entirely.
    for k, v in sorted(feats.items()):
        if v is not None:
            del non_cds_feats[k]

    seen = set()
    with open(outdir + "/at_non_cds.gff", "w") as RNA:
        for k, feat_list in sorted(non_cds_feats.items()):
            for feat in feat_list:
                if feat[0] in ("ChrC", "ChrM"):
                    continue
                if feat[2] == "exon":
                    continue
                # De-duplicate on the original (seqid, start, end) triple.
                key = (feat[0], feat[3], feat[4])
                if key in seen:
                    continue
                feat[0] = feat[0].upper().replace("CHR", "")
                seen.add(key)
                # The last column (which carried the line's newline) becomes
                # the group name.
                feat[-1] = k
                RNA.write("\t".join(feat) + "\n")

    gff = read_gff(outdir + "/at_non_cds.gff")
    fasta = Fasta(fasta_path)
    with open(outdir + "/at_rnas.fasta", "w") as FA:
        for _chrom, feature_list in gff.iteritems():
            for _fname, feature in feature_list.iteritems():
                seq = fasta.sequence(feature)
                # Header keeps the original "> name" form (with a space).
                FA.write("> %s\n" % (feature["name"],))
                FA.write("%s\n" % (seq,))

Exemplo n.º 29

0

Exibir arquivo

Arquivo: mask_genome.py Projeto: yuzhenpeng/find_cns

def mask(fasta_file, org, cutoff, mask_value='X'):
    """Write a copy of ``fasta_file`` with over-represented positions masked.

    ``get_node(org, 'r')`` supplies a handle and a node holding, per sequence
    id, an entry named ``'c' + seqid``.  Wherever that entry exceeds
    ``cutoff`` the base is replaced by ``mask_value``; with
    ``mask_value='soft'`` (any case) the lower-cased base is used instead and
    the unmasked bases are upper-cased.  Sequences with no such entry are
    written without masking.  Output goes to ``<stem>.masked.<cutoff><ext>``
    and the output of ``svnversion`` is redirected to ``<outfile>.version``.
    """
    h5, node = get_node(org, 'r')

    # Insert ".masked.<cutoff>" in front of the last "." of the file name.
    outfile = fasta_file[:fasta_file.rfind(".")] + (".masked.%i" % cutoff) \
                         + fasta_file[fasta_file.rfind("."):]

    print "> masking sequence to file:", outfile
    out = open(outfile, 'w')

    fasta = Fasta(fasta_file)

    # Computed once, before the loop: ``mask_value`` is rebound inside the
    # loop in soft mode, so it must not be re-tested there.
    soft_mask = mask_value.lower() == 'soft'
    for seqid in sorted(fasta.iterkeys()):
        masked = 0  # not read anywhere in this function
        if soft_mask:
            seq = str(fasta[seqid])
            # mask is the lowercase sequence.
            mask_value = np.array(seq.lower(), dtype='c')
            seq = np.array(seq.upper(), dtype='c')
        else:
            # presumably makes slicing return an array instead of a str, so
            # that ``seq.tostring()`` below works - TODO confirm against pyfasta
            fasta[seqid].tostring = False
            seq = fasta[seqid][:]  # the whole sequence

        if not 'c' + seqid in node:
            print >>sys.stderr, seqid,\
                '! not found in masked, writing unchanged\n' \
                '  this means that no section of this sequence appeared\n' \
                '  more than %i times' % cutoff
            out.write('>' + seqid + '\n')
            out.write(seq.tostring() + '\n')
            continue

        # numexpr resolves ``hit_counts`` from this frame by name; renaming
        # the local would break the expression string below.
        hit_counts = getattr(node, 'c' + seqid)[:]
        masked_seq = np.where(numexpr.evaluate("hit_counts > %i" % cutoff),
                              mask_value, seq).tostring()

        l = len(masked_seq)
        # NOTE(review): in soft mode ``mask_value`` is an array here, so this
        # count is not a count of a single mask character - verify the
        # reported percentage.
        print >> sys.stderr, "! seq:%s len:%i %%masked:%.3f" % (
            seqid, l, 100.0 * masked_seq.count(mask_value) / l)
        assert len(seq) == l
        out.write('>' + seqid + '\n')
        out.write(masked_seq + '\n')

    out.close()
    # write out a file <outfile>.version containing the svnversion
    # (if available) of this script's directory, i.e. the version
    # that was used to create the file.
    # The command line is built by plain string interpolation and run
    # through the shell.
    path = os.path.dirname(__file__)
    os.system('svnversion %s > %s.version' % (path, outfile))
    h5.close()

Exemplo n.º 30

0

Exibir arquivo

Arquivo: test_all.py Projeto: jliptrap/pyfasta

def check_keyfn(path, klass, inplace):
    """Check the keys produced with and without a first-word ``key_fn``.

    With the ``key_fn`` the keys must be ``a``, ``b``, ``c``; without it the
    full headers ``a extra``, ``b extra``, ``c extra``.  ``fix(path)`` is
    called after each check.
    """
    first_word = lambda key: key.split()[0]
    f = Fasta(path, record_class=klass, flatten_inplace=inplace,
              key_fn=first_word)
    short_keys = ['a', 'b', 'c']
    assert sorted(f.keys()) == short_keys, f.keys()
    fix(path)

    ff = Fasta(path, record_class=klass, flatten_inplace=inplace)
    full_keys = ['a extra', 'b extra', 'c extra']
    assert sorted(ff.keys()) == full_keys, (ff.keys(), klass)
    fix(path)

Exemplo n.º 31

0

Exibir arquivo

Arquivo: test_all.py Projeto: jliptrap/pyfasta

def check_kmer_overlap(f):
    """Check offsets and overlaps of the k-mers produced by ``Fasta.as_kmers``.

    With k=10 and overlap=2, every k-mer except the last must have length 10
    and start at a multiple of 8.  With overlap=4, the last four characters
    of each k-mer must equal the first four of the next one (pairs where
    either side is shorter than four are skipped).
    """
    chr2 = f['chr2']

    produced = list(Fasta.as_kmers(chr2, 10, overlap=2))
    for i, item in enumerate(produced[:-1]):
        assert (len(item[1]) == 10)
        assert (item[0] == (i * (10 - 2)))

    seqs = [item[1] for item in Fasta.as_kmers(chr2, 10, overlap=4)]
    for a, b in zip(seqs[0:-1], seqs[1:]):
        if len(a) >= 4 and len(b) >= 4:
            assert (a[-4:] == b[:4])

Exemplo n.º 32

0

Exibir arquivo

def spgenome(fafile, outdir, maxsize=1000000000):
    """Split a FASTA file into several files of roughly ``maxsize`` bases.

    Sequences are visited in the order of ``Fasta.keys()``.  A running total
    of sequence lengths (including the current one) divided by ``maxsize``
    selects the output file ``<outdir>/tmpfile<N>.fa``; each sequence is
    written whole, as a ``>name`` line followed by the sequence.

    Returns the list of output paths in creation order, or an empty list
    (after printing a message) when ``fafile`` does not exist.
    """
    spfiles = list()
    if not path.exists(fafile):
        print("Can't find ", fafile)
        return spfiles

    infa = Fasta(fafile)
    subfiles = dict()
    nowlen = 0

    try:
        for chrom in infa.keys():
            nowlen = nowlen + len(infa[chrom])
            # Floor division stays exact for large totals, unlike float
            # division.
            nowsub = int(nowlen // maxsize)

            if nowsub not in subfiles:
                subfile = path.join(outdir, 'tmpfile' + str(nowsub) + '.fa')
                spfiles.append(subfile)
                subfiles[nowsub] = open(subfile, 'w')

            print('>', chrom, sep='', file=subfiles[nowsub])
            print(infa[chrom], file=subfiles[nowsub])
    finally:
        # Close every output handle even if a read or write fails.
        for handle in subfiles.values():
            handle.close()

    return spfiles

Exemplo n.º 33

0

Exibir arquivo

 def search(self, ref_base, pos, alt_base="X"):
     """Genotype a single variant from its generated probe set.

     The variant name is ``ref_base + str(pos) + alt_base``.  The probe set
     from ``create_variant_probe_set`` is written to a temporary FASTA file;
     records whose key contains ``"ref"`` are collected as reference probes
     and all others as alternate probes.

     Returns ``{"query": <variant name>, "results": <genotype_alleles output>}``.
     """
     var_name = "".join([ref_base, str(pos), alt_base])
     fasta_string = self.create_variant_probe_set(var_name=var_name)
     refs = []
     alts = []
     with tempfile.NamedTemporaryFile() as fp:
         fp.write(fasta_string)
         # Make sure the data is on disk before Fasta opens the file by name.
         fp.flush()
         fasta = Fasta(fp.name)
         # Read the records while the temporary file still exists; it is
         # deleted when the ``with`` block exits.
         for k, v in fasta.items():
             if "ref" in k:
                 refs.append(str(v))
             else:
                 alts.append(str(v))
     return {"query": var_name, "results": self.genotype_alleles(refs, alts)}

Exemplo n.º 34

0

Exibir arquivo

def get_sequence_dict(file_path, upper=True):
    """
    Open a pre-flattened FASTA file and return the resulting Fasta object.

    The FASTA itself and its ``.gdx`` and ``.flat`` companions must already
    exist; an AssertionError is raised for the first one that is missing.
    When ``upper`` is exactly ``True`` the Fasta is built with
    ``UpperNpyFastaRecord`` as record class, otherwise with the default.
    """
    required = (
        (file_path, 'Error: FASTA file {} does not exist'.format(file_path)),
        (file_path + ".gdx",
         "Error: gdx does not exist for this fasta. We need the fasta files to be "
         "flattened in place prior to running the pipeline because of concurrency issues."),
        (file_path + '.flat',
         "Error: flat file does not exist for this fasta. We need the fasta files to be "
         "flattened in place prior to running the pipeline because of concurrency issues."),
    )
    for needed_path, complaint in required:
        assert os.path.exists(needed_path), complaint

    extra = {'record_class': UpperNpyFastaRecord} if upper is True else {}
    return Fasta(file_path, **extra)

Exemplo n.º 35

0

Exibir arquivo

Arquivo: align_cgp_cds.py Projeto: yuzhenpeng/comparativeAnnotator

def align_consensus(tmp_dir, gp, target_genome_fasta, ref_tx_fasta):
    """
    Align the reference transcript named ``gp.name`` against the target with
    blat, chain the hits with simpleChain and report the best result.

    Returns ``map(str, [gp.id, gp.name, best_cov, best_ident])`` where the
    last two values come from ``evaluate_blat_results``.
    """
    tx_db = Fasta(ref_tx_fasta)
    genome_db = Fasta(target_genome_fasta)
    tmp_tgt, tmp_ref, tmp_psl = prepare_tmp_files(tmp_dir, gp, genome_db)

    # Write the transcript sequence that blat will use as query.
    fastaWrite(tmp_ref, gp.name, str(tx_db[gp.name]))

    blat_cmd = "blat {} {} -out=psl -noHead {}".format(tmp_tgt, tmp_ref, tmp_psl)
    system(blat_cmd)

    chain_cmd = "simpleChain -outPsl {} /dev/stdout".format(tmp_psl)
    # Drop the element after the final newline of the command output.
    psl_lines = popenCatch(chain_cmd).split("\n")[:-1]

    best_cov, best_ident = evaluate_blat_results(psl_lines)
    return map(str, [gp.id, gp.name, best_cov, best_ident])

Exemplo n.º 36

0

Exibir arquivo

Arquivo: test_find_closest_annotated_bound.py Projeto: gongjingtang/TranscriptClean

    def test_find_closest_splice_acceptor_plus(self):
        """ Plus-strand junction: the closest annotated splice acceptor
            is expected 17 bp upstream (dist == -17)."""

        # Load the reference junctions restricted to chr1
        _donors, acceptors, sjDict = TC.processSpliceAnnotation(
            "input_files/test_junctions.txt",
            "scratch/test/TC_tmp/",
            set(["chr1"]))

        # Build the junction under test
        genome = Fasta("input_files/hg38_chr1.fa")
        junction = sj.SpliceJunction("test_read", 0, "chr1", 23071360,
                                     23072140, "+", genome, sjDict)

        nearest = TC.find_closest_bound(junction.get_splice_acceptor(),
                                        acceptors)
        observed = (nearest.start, nearest.end, nearest.dist)
        assert observed == (23072122, 23072123, -17)

Exemplo n.º 37

0

Exibir arquivo

    def test_primary_monoexon_read(self):
        """ A primary alignment must yield a transcript object, and the
            logInfo struct must record the primary status with every
            correction counter left at "NA"."""

        with open("input_files/sams/perfectReferenceMatch_noIntrons.sam",
                  'r') as handle:
            sam_line = handle.readline().strip()

        transcript, logInfo = TC.transcript_init(
            sam_line, Fasta("input_files/hg38_chr1.fa"), set())

        expected_fields = (
            ("QNAME", "c21031/f2p3/3400"),
            ("FLAG", 0),
            ("CHROM", "chr1"),
            ("POS", 192575775),
            ("CIGAR", "155M"),
            ("MD", "MD:Z:155"),
        )
        for field, value in expected_fields:
            assert getattr(transcript, field) == value

        assert logInfo.Mapping == "primary"

        untouched_counters = (
            "corrected_deletions", "uncorrected_deletions",
            "variant_deletions", "corrected_insertions",
            "uncorrected_insertions", "variant_insertions",
            "corrected_mismatches", "uncorrected_mismatches",
            "corrected_NC_SJs", "uncorrected_NC_SJs",
        )
        assert all(getattr(logInfo, name) == "NA"
                   for name in untouched_counters)

Exemplo n.º 38

0

Exibir arquivo

Arquivo: test_find_closest_annotated_bound.py Projeto: gongjingtang/TranscriptClean

    def test_find_closest_splice_acceptor_minus(self):
        """ Minus-strand junction: the closest splice acceptor is 1 bp
            away. The expected dist is -1 because dist is expressed
            relative to the genome, not to the transcript direction."""

        # Load the reference junctions restricted to chr1
        _donors, acceptors, sjDict = TC.processSpliceAnnotation(
            "input_files/test_junctions.txt",
            "scratch/test/TC_tmp/",
            set(["chr1"]))

        # Build the junction under test
        genome = Fasta("input_files/hg38_chr1.fa")
        junction = sj.SpliceJunction("test_read", 0, "chr1", 22071331,
                                     22073331, "-", genome, sjDict)

        nearest = TC.find_closest_bound(junction.get_splice_acceptor(),
                                        acceptors)
        observed = (nearest.start, nearest.end, nearest.dist)
        assert observed == (22071329, 22071330, -1)

Exemplo n.º 39

0

Exibir arquivo

    def test_get_depth_info(self):
        """Check the get_depth_info summaries for all reads and for
        de-duplicated reads, first without and then with a target region."""
        ref_fasta = Fasta(fasta_dir + 'test/chr0.fa')
        chr0 = ref_fasta['chr0']
        confident_regions = Regions([(0,10000000)])

        reads = list(self.bam_in)
        r = get_depth_info(reads, "chr0", 0, len(chr0), None, confident_regions)
        (depth_df, summary_depth_info, confident_depth_info, target_info, target_cov) = r

        # Materialise the de-duplicated reads: they are handed to
        # get_depth_info twice below, and a lazy filter() object (Python 3)
        # would already be exhausted for the second call.
        reads_dd = [read for read in reads if not read.is_duplicate]
        r_dd = get_depth_info(reads_dd, "chr0", 0, len(chr0), None, confident_regions)
        (dd_depth_df, summary_depth_info_deduped, confident_depth_info, target_info, target_cov) = r_dd

        self.assertEqual(summary_depth_info, {0: 10, 1: 10, 2: 10, 3: 10})
        self.assertEqual(summary_depth_info_deduped, {0: 10, 1: 20, 2: 10})
        self.assertEqual(target_info, {})

        # Same checks restricted to the target region (5, 15).
        r = get_depth_info(reads, "chr0", 0, len(chr0), Regions([(5, 15)]), confident_regions)
        (target_depth_df, summary_depth_info, confident_depth_info, target_info, target_cov) = r

        self.assertEqual(summary_depth_info, {2: 5, 3: 5})

        self.assertEqual(len(target_depth_df), 10)
        self.assertEqual(len(target_cov), 1)
        self.assertEqual(target_cov['mean'][0], 2.5)
        self.assertEqual(sum(target_depth_df.coverage), target_info['on_target_bases'])

        r_dd = get_depth_info(reads_dd, "chr0", 0, len(chr0), Regions([(5, 15)]), confident_regions)
        (target_depth_df, summary_depth_info_deduped, confident_depth_info, target_info, target_cov) = r_dd

        self.assertEqual(summary_depth_info_deduped, {1: 5, 2: 5})

Exemplo n.º 40

0

Exibir arquivo

Arquivo: PyfastaReader.py Projeto: harvardinformatics/gx

 def segments(self):
     '''
     Yield Segment objects of ``self.segment_size`` bases, chromosome by
     chromosome.

     Chromosomes are visited in ascending order of the first element of
     their ``self.fasta.index`` entry.  When ``self.start_chromosome`` is
     set, every chromosome before it is skipped.
     '''
     # NOTE(review): the running position is carried over from one
     # chromosome to the next and always advances by segment_size, even
     # for a shorter final kmer - confirm this is intended.
     pending_chrom = self.start_chromosome
     position = self.start_location
     ordered_index = sorted(self.fasta.index.items(),
                            key=lambda entry: entry[1][0])
     for chrom in [entry[0] for entry in ordered_index]:
         emitted = 0
         if self.verbose:
             print("Reading chr %s" % chrom)
         # Skip forward if a starting chr was defined
         if pending_chrom is not None:
             if pending_chrom != chrom:
                 continue
             pending_chrom = None

         for kmer in Fasta.as_kmers(self.fasta[chrom], self.segment_size):
             stop = position + self.segment_size
             segment = Segment(position, stop, kmer[1], chrom)
             emitted += 1
             if self.verbose and emitted % 1000 == 0:
                 print("Read %d segments" % emitted)
             yield segment
             position = stop

Exemplo n.º 41

0

Exibir arquivo

    def segments(self):
        '''
        Generate Segment objects covering the fasta in fixed-size steps.

        The chromosome order follows the first element of each entry in
        ``self.fasta.index``; chromosomes ahead of
        ``self.start_chromosome`` (when it is set) are skipped.
        '''
        # NOTE(review): ``cursor`` is not reset between chromosomes and
        # grows by segment_size for every kmer regardless of its actual
        # length - verify against the callers.
        wanted_first = self.start_chromosome
        cursor = self.start_location
        step = None
        by_index_start = sorted(
            self.fasta.index.items(), key=lambda item: item[1][0])
        names = [item[0] for item in by_index_start]
        for name in names:
            n_segments = 0
            if self.verbose:
                print("Reading chr %s" % name)
            # Skip forward if a starting chr was defined
            skipping = wanted_first is not None and wanted_first != name
            if skipping:
                continue
            wanted_first = None

            kmers = Fasta.as_kmers(self.fasta[name], self.segment_size)
            for kmer in kmers:
                step = self.segment_size
                upper = cursor + step
                piece = Segment(cursor, upper, kmer[1], name)
                n_segments += 1
                if self.verbose and n_segments % 1000 == 0:
                    print("Read %d segments" % n_segments)
                yield piece
                cursor = upper

Exemplo n.º 42

0

Exibir arquivo

Arquivo: test_find_closest_annotated_bound.py Projeto: gongjingtang/TranscriptClean

    def test_find_closest_splice_donor_minus(self):
        """ Toy case with several donors and acceptors close together:
            TC must pick the reference donor closest to the supplied
            intron bound.

            An exact match for the donor exists at 23071360 (1-based),
            i.e. 23071359 in 0-based coordinates, so dist must be 0."""

        # Load the reference junctions restricted to chr1
        donors, _acceptors, sjDict = TC.processSpliceAnnotation(
            "input_files/test_junctions.txt",
            "scratch/test/TC_tmp/",
            set(["chr1"]))

        # Build the junction under test
        genome = Fasta("input_files/hg38_chr1.fa")
        junction = sj.SpliceJunction("test_read", 0, "chr1", 23070360,
                                     23071360, "-", genome, sjDict)

        nearest = TC.find_closest_bound(junction.get_splice_donor(), donors)
        observed = (nearest.start, nearest.end, nearest.dist)
        assert observed == (23071359, 23071360, 0)

Exemplo n.º 43

0

Exibir arquivo

    def test_fix_donor_case3(self):
        """ Case #3: toy transcript AAGGT|GAA whose splice motif is
            noncanonical but sits 2 bp away from a canonical splice donor.
            chr1: 23,071,357 - 23,072,126
        """

        # Process references
        donors, acceptors, sjDict = TC.processSpliceAnnotation(
            "input_files/test_junctions.txt",
            "scratch/test/TC_tmp/",
            set(["chr1"]))
        genome = Fasta("input_files/hg38_chr1.fa")

        # Init transcript object
        sam_fields = [
            "test_read", "0", "chr1", "23071357", "255", "5M762N3M", "*",
            "0", "0", "AAGGTGAA", "*", "NM:i:0", "MD:Z:8",
        ]
        transcript = t2.Transcript(sam_fields, genome, sjDict)
        jnNumber = 0
        maxDist = 5
        junction = transcript.spliceJunctions[jnNumber]
        donor = junction.bounds[0]

        # Attempt to correct the splice donor side of the junction (left)
        new_seq, new_cigar = TC.fix_one_side_of_junction(
            transcript.CHROM, transcript.POS, jnNumber, donor, -2, genome,
            transcript.SEQ, transcript.CIGAR)

        assert (new_seq, new_cigar) == ("AAGGAA", "3M764N3M")

Exemplo n.º 44

0

Exibir arquivo

Arquivo: test_all.py Projeto: jliptrap/pyfasta

def test_classes():
    """Generator test: run every check for each record class, once with
    ``flatten_inplace=True`` and once with ``flatten_inplace=False``."""

    for inplace in (True, False):
        for klass in record_classes:
            f = Fasta('tests/data/three_chrs.fasta', record_class=klass,
                      flatten_inplace=inplace)

            yield check_keys, f
            yield check_misc, f, klass
            # The remaining checks only need the Fasta object itself.
            for check in (check_contains, check_shape, check_bounds,
                          check_tostring, check_kmers, check_kmer_overlap,
                          check_slice_size, check_slice, check_full_slice,
                          check_array_copy, check_array, check_one_based):
                yield check, f

            fasta_name = f.fasta_name

            # Drop the reference before the checks that build new objects.
            del f

            yield check_keyfn, 'tests/data/key.fasta', klass, inplace
            yield check_reload, klass, fasta_name
            yield check_duplicates, klass, inplace

            _cleanup()

Exemplo n.º 45

0

Exibir arquivo

Arquivo: mask_genome.py Projeto: yuzhenpeng/find_cns

def count_freq(blast_file, fasta, org, count_subject=True):
    """Count, per base, how many blast hits cover each position.

    Reads one large tab-separated blast file, increments a per-chromosome
    counter for the query interval of every hit (and the subject interval
    when ``count_subject`` is true) and finally stores the counters on
    ``node`` under the attribute ``'c' + chromosome``.

    Args:
        blast_file: path of the tabular blast output; columns 0-1 hold the
            query/subject names and columns 6-9 the 1-based start/stop
            coordinates.
        fasta: path of the FASTA file used to look up chromosome lengths.
        org: passed to ``get_node`` to obtain the h5 file and node.
        count_subject: also count the subject side of each hit.

    Returns:
        None.  Returns early when ``get_node`` yields ``(None, None)``.
    """
    h5, node = get_node(org, 'w')

    # use existing counts.
    if (h5, node) == (None, None):
        return
    f = Fasta(fasta)

    print("counting...")
    cache = {}
    with open(blast_file) as blast_handle:
        for sline in blast_handle:
            line = sline.split("\t")
            qchr, schr = line[:2]

            qstart, qstop, sstart, sstop = map(int, line[6:10])

            if qchr not in cache:
                update_cache(qchr, node, len(f[qchr]), h5, cache)
                cache_clear(cache, node, qchr, schr)
            # convert to 0-based indexes:
            # 1 8 => 0 8, but range doesn't include the upper bound.
            cache[qchr][qstart - 1:qstop] += 1

            if count_subject:
                # Subject coordinates may be reversed; normalise them.
                if sstart > sstop:
                    sstart, sstop = sstop, sstart
                if schr not in cache:
                    update_cache(schr, node, len(f[schr]), h5, cache)
                    cache_clear(cache, node, qchr, schr)
                # Count every subject hit, not only the first one seen for
                # a chromosome (mirrors the query handling above).
                cache[schr][sstart - 1:sstop] += 1

    for achr in cache:
        getattr(node, 'c' + achr)[:] = cache[achr]

    h5.close()

Exemplo n.º 46

0

Exibir arquivo

Arquivo: test_all.py Projeto: jamescasbon/pyfasta

def check_kmers(f):
    """Check that ``Fasta.as_kmers`` tiles chr2 (k=10) and chr3 (k=1)
    completely: offsets start at 0 and the joined pieces give back the
    whole sequence."""
    chr2_text = str(f['chr2'])

    tenmers = list(Fasta.as_kmers(f['chr2'], 10))
    assert (len(tenmers) == len(chr2_text) / 10)
    assert (tenmers[0] == (0, chr2_text[:10]))

    pieces = [pair[1] for pair in tenmers]
    assert ("".join(pieces) == chr2_text)
    final_piece = pieces[-1]
    assert (final_piece[-1] == 'T')

    chr3_text = str(f['chr3'])
    onemers = list(Fasta.as_kmers(f['chr3'], 1))
    third_offset = onemers[2][0]
    assert (third_offset == 2)
    pieces = [pair[1] for pair in onemers]
    assert ("".join(pieces) == chr3_text)

Exemplo n.º 47

0

Exibir arquivo

Arquivo: reference.py Projeto: henmt/2015

class Reference(object):
    """Genome sequence lookup backed by a pyfasta ``Fasta`` object."""

    def __init__(self, genome_fasta):
        """Open ``genome_fasta``, keying records by the first
        whitespace-separated token of each header."""
        # @see: https://pypi.python.org/pypi/pyfasta
        def first_token(header):
            # Use first value before whitespace as keys
            return header.split()[0]

        self.fasta = Fasta(genome_fasta, key_fn=first_token)

    def get_sequence_from_iv(self, iv):
        """Return the sequence for interval ``iv`` (attributes ``chrom``,
        ``start``, ``end``, ``strand``), queried with ``one_based=False``."""
        feature = dict(chr=iv.chrom, start=iv.start, stop=iv.end,
                       strand=iv.strand)
        return self.fasta.sequence(feature, one_based=False)

Exemplo n.º 48

0

Exibir arquivo

Arquivo: simulate.py Projeto: emilhaegglund/master-thesis

def read_fasta(ref_files, fasta_header):
    """Find the fasta file that contains a given header.

    The files are opened one after another, in order, and the search stops
    at the first match.

    Args:
        ref_files (iterable of str): Paths to the candidate fasta files.
        fasta_header (str): Header (record key) to look for.

    Returns:
        The ``Fasta`` object of the first file whose keys include
        ``fasta_header``, or ``None`` when no file contains it.
    """
    opened = (Fasta(fasta_path) for fasta_path in ref_files)
    return next(
        (fasta for fasta in opened if fasta_header in fasta.keys()),
        None)

Exemplo n.º 49

0

Exibir arquivo

Arquivo: nucmer.py Projeto: jim-bo/parabio

    def split_seqs(self, num_jobs, max_ref=5, max_qry=20):
        ''' Split the reference and query fasta files into pieces.

        The number of reference pieces (at most ``max_ref``) and query
        pieces (at most ``max_qry``) is reduced so that neither exceeds the
        number of sequences available and their product fits ``num_jobs``.
        The resulting names and file paths are stored in ``self.ref_names``
        / ``self.ref_files`` and ``self.qry_names`` / ``self.qry_files``
        after filtering through ``self._no_empty``.
        '''

        # load data into memory.
        ref_records = Fasta(self.ref_fasta, record_class=MemoryRecord)
        qry_records = Fasta(self.qry_fasta, record_class=MemoryRecord)

        # never more reference pieces than reference sequences; fall back
        # to a single piece when that still exceeds the job count.
        max_ref = min(max_ref, len(ref_records))
        if max_ref > num_jobs:
            max_ref = 1

        # never more query pieces than query sequences or jobs.
        max_qry = min(max_qry, len(qry_records), num_jobs)

        # keep the total number of ref x qry combinations within num_jobs.
        if (max_ref * max_qry) > num_jobs:
            max_qry = int(float(num_jobs) / float(max_ref))

        ## reference ##
        ref_names = ["ref_%i" % idx for idx in range(max_ref)]
        self.ref_names = ref_names
        self.ref_files = ["%s/%s.fasta" % (self.out_dir, name)
                          for name in ref_names]
        pyfasta.split_fasta.without_kmers(ref_records, self.ref_files)
        self.ref_names, self.ref_files = self._no_empty(self.ref_names,
                                                        self.ref_files)

        ## query ##
        qry_names = ["qry_%i" % idx for idx in range(max_qry)]
        self.qry_names = qry_names
        self.qry_files = ["%s/%s.fasta" % (self.out_dir, name)
                          for name in qry_names]
        pyfasta.split_fasta.without_kmers(qry_records, self.qry_files)
        self.qry_names, self.qry_files = self._no_empty(self.qry_names,
                                                        self.qry_files)

Exemplo n.º 50

0

Exibir arquivo

Arquivo: models.py Projeto: kyu999/biovec

def generate_corpusfile(fasta_fname, n, corpus_fname):
    '''
    Args:
        fasta_fname: input fasta file name
        n: the number of chunks to split. In other words, "n" for "n-gram"
        corpus_fname: output corpus file path
    Description:
        Protvec uses word2vec inside, and it requires to load corpus file
        to generate corpus.

        For every record of the fasta file, each n-gram pattern returned
        by split_ngrams is written as one space-separated line.
    '''
    # The context manager closes the corpus file even when reading the
    # fasta or splitting a sequence fails part-way.
    with open(corpus_fname, "w") as corpus:
        fasta = Fasta(fasta_fname)
        for record_id in tqdm(fasta.keys(), desc='corpus generation progress'):
            seq = str(fasta[record_id])
            for ngram_pattern in split_ngrams(seq, n):
                corpus.write(" ".join(ngram_pattern) + "\n")

Exemplo n.º 51

0

Exibir arquivo

Arquivo: main.py Projeto: maxmalysh/fasta

def process_query():
    """Align the "Rattus" query against the sequence library and report.

    Every library record (up to ``library_process_limit``) is aligned to
    the query sequence on a thread pool.  The 30 best scores are printed
    together with their percentage of the self-match ("etalone") score,
    followed by the per-CPU values of the performance timers.
    """
    print('Reading sequence library and query sequence')
    library = Fasta(library_path)
    queries = Fasta(query_path)
    query_sequence = str(queries["Rattus"])

    print('Processing')
    # NOTE(review): the bar is sized to the whole library although only
    # the first library_process_limit records are submitted - confirm.
    progress = progressbar.ProgressBar(max_value=len(library.keys()))
    cpu_count = multiprocessing.cpu_count()

    results = []
    # The context manager shuts the pool down once all results are in,
    # also when an alignment raises.
    with ThreadPoolExecutor(max_workers=cpu_count) as executor:
        tasks = []
        for record in list(library.keys())[:library_process_limit]:
            library_sequence = str(library[record])
            future = executor.submit(align, library_sequence, query_sequence)
            tasks.append(AlignmentTask(record, future))

        for i, task in enumerate(tasks):
            _, _, score = task.future.result()
            results.append(AlignmentResult(title=task.record, score=score))
            progress.update(i)

    # Score of the query aligned against itself.
    etalone_score = sum([ smatrix[(x, x)] for x in query_sequence ])

    print("Done")
    print("Etalone score is %d" % etalone_score)
    print("Got %d results, here are top-30 among them:" % len(results))
    print("Score  | Match   | Record")

    for sequence in sorted(results, key=lambda x: x.score, reverse=True)[:30]:
        match = (sequence.score / etalone_score) * 100.0
        print("%6d | %5.3f%% | %s" % (sequence.score, match, sequence.title))

    timer = get_performance_timer()
    for elapsed in [timer.dotplot, timer.regions, timer.align]:
        print(elapsed / cpu_count)

Exemplo n.º 52

0

Exibir arquivo

Arquivo: gff_loader.py Projeto: Nicholas-NVS/bio-pipeline

def main(gff_file, fasta_file, parents, children):
    """Print one FASTA record per parent feature, built by joining the
    sequences of its selected child features.

    ``parents`` and ``children`` are comma-separated feature types.  A GFF
    database ``<gff_file>.db`` is created when it does not exist yet.
    Parents without any matching child get a warning on stderr.
    """
    db_file = gff_file + ".db"
    if not op.exists(db_file):
        GFFutils.create_gffdb(gff_file, db_file)

    f = Fasta(fasta_file)
    g = GFFutils.GFFDB(db_file)

    parent_types = set(parents.split(','))
    per_type = [g.features_of_type(kind) for kind in parent_types]
    parent_features = itertools.chain(*per_type)
    children_list = set(children.split(','))

    for feat in parent_features:
        # (sequence, child feature) pairs for the wanted child types
        pieces = []
        for c in g.children(feat.id, 1):
            if c.featuretype in children_list:
                location = dict(chr=c.chrom, start=c.start, stop=c.stop,
                                strand=c.strand)
                pieces.append((f.sequence(location), c))

        if not pieces:
            warning = "[warning] %s has no children with type %s" \
                                    % (feat.id, ','.join(children_list))
            sys.stderr.write(warning + "\n")
            continue

        # sort children in incremental position
        pieces.sort(key=lambda pair: pair[1].start)
        # reverse children if negative strand
        if feat.strand == '-':
            pieces.reverse()
        feat_seq = ''.join(pair[0] for pair in pieces)

        print(">%s" % feat.id)
        print(feat_seq)

Exemplo n.º 53

0

Exibir arquivo

Arquivo: design_primers.py Projeto: faircloth-lab/msatcommander-gs

class Sequence():
    """Sequence source backed by MySQL, BioPython, pyfasta or a 2bit file.

    The constructor selects the backend from ``engine`` (plus ``function``
    for MySQL or ``kwargs['data_type']`` for the file based engines) and
    prepares ``self.read``, an iterator over the available records, and
    ``self.readcount``.
    """
    def __init__(self, engine='mysql', function='iterator', **kwargs):
        self.engine = engine
        if self.engine == 'mysql':
            if function == 'iterator':
                self.create_mysql_iterator(**kwargs)
            return
        # (engine name, required data_type, builder) for file based engines
        file_engines = (
            ('biopython', 'fasta', self.create_biopython_iterator),
            ('pyfasta', 'fasta', self.create_pyfasta_iterator),
            ('twobit', 'twobit', self.create_twobit_iterator),
        )
        for name, data_type, build in file_engines:
            if self.engine == name:
                if kwargs['data_type'] == data_type:
                    build(**kwargs)
                break

    def _index_keys(self, count):
        """Store ``count`` and pair 0..count-1 with the sorted record keys."""
        self.readcount = count
        self.db_values = zip(range(count), sorted(self.fasta.keys()))
        self.read = iter(self.db_values)

    def create_mysql_iterator(self, **kwargs):
        """Run the selection query on ``kwargs['cursor']`` and iterate its rows."""
        cursor = kwargs['cursor']
        query = '''SELECT id, record FROM sequence WHERE n_count <= 2 AND 
                    trimmed_len > 40'''
        cursor.execute(query)
        self.readcount = cursor.rowcount
        self.read = iter(cursor.fetchall())

    def create_biopython_iterator(self, **kwargs):
        """Index ``kwargs['input']`` with ``Bio.SeqIO.index``."""
        from Bio import SeqIO
        print("Generating BioPython sequence index.  This may take a moment....")
        self.fasta = SeqIO.index(kwargs['input'], kwargs['data_type'])
        self._index_keys(len(self.fasta))

    def create_twobit_iterator(self, **kwargs):
        """Open ``kwargs['input']`` as a ``bx`` TwoBitFile."""
        import bx.seq.twobit
        self.fasta = bx.seq.twobit.TwoBitFile(file(kwargs['input']))
        self._index_keys(self.fasta.seq_count)

    def create_pyfasta_iterator(self, **kwargs):
        """Index ``kwargs['input']`` with pyfasta."""
        from pyfasta import Fasta
        print("Generating PyFasta sequence index.  This may take a moment....")
        self.fasta = Fasta(kwargs['input'])
        self._index_keys(len(self.fasta))

    def get_pyfasta_reads(self, **kwargs):
        """Open ``kwargs['input']`` with pyfasta and record the read count."""
        from pyfasta import Fasta
        self.fasta = Fasta(kwargs['input'])
        self.readcount = len(self.fasta)

Exemplo n.º 54

0

Exibir arquivo

Arquivo: split_fasta.py Projeto: brettjurgens/466-project

def with_kmers(f, names, k, overlap):
    """
    split the sequences in Fasta object `f` into pieces of length `k`
    with the given `overlap`. The pieces are written, round-robin, as
    FASTA records to the files listed in `names`; every file is closed
    before returning, also when an error occurs.
    """
    # NOTE(review): the files are opened in binary mode as before, which
    # accepts str writes on Python 2 only.
    fhs = []
    try:
        for name in names:
            fhs.append(open(name, 'wb'))
        i = 0
        for seqid in f.keys():
            seq = f[seqid]
            for (start0, subseq) in Fasta.as_kmers(seq, k, overlap=overlap):
                # Distribute consecutive kmers over the output files.
                fh = fhs[i % len(fhs)]
                fh.write(">%s\n" % format_kmer(seqid, start0))
                fh.write("%s\n" % (subseq,))
                i += 1
    finally:
        # Closing flushes the buffered records and releases the handles.
        for fh in fhs:
            fh.close()

Exemplo n.º 55

0

Exibir arquivo

Arquivo: __init__.py Projeto: davidmnoriega/fast2phy

def main():
    """Convert the aligned fasta given on the command line to a
    ``.phylip`` file written in blocks of LINE_LENGTH columns.

    The first block carries the (padded or truncated) sequence names, the
    following blocks are prefixed with PAD_STRING.  The output goes to
    ``args.output_file`` or, when that is not given, to a name derived
    from the fasta file name.
    """
    args = make_parser()
    if args.inplace:
        f = Fasta(args.fasta_file, flatten_inplace=True)
    else:
        f = Fasta(args.fasta_file)

    if args.output_file is not None:
        output_path = args.output_file
    else:
        # NOTE(review): everything after the FIRST dot is dropped, so a
        # path such as "./aln.fasta" yields ".phylip" - confirm intent.
        output_file_name = args.fasta_file.split('.')[0]
        output_path = '{0}.phylip'.format(output_file_name)

    # The context manager closes the output file even when a record
    # cannot be read or written.
    with open(output_path, 'w') as output:
        sequence_count = len(f.keys())
        sequence_length = len(f[next(iter(f.keys()))])
        output.write(' {0} {1}\n'.format(sequence_count, sequence_length))

        # First block: sequence name followed by the first LINE_LENGTH
        # columns.  The chunks are padded with ' ' exactly like the
        # following blocks, so a short final chunk cannot contain the
        # grouper's default fill value.
        for key in f.keys():
            subseq = []
            for chunk in grouper(f[key][:LINE_LENGTH], CHUNK_LENGTH, ' '):
                subseq.append(''.join(item[0] for item in chunk))
            subseq = ' '.join(subseq)
            if len(key) < CHUNK_LENGTH:
                key = key.ljust(CHUNK_LENGTH)
            else:
                key = key[:CHUNK_LENGTH]
            output.write('{0} {1}\n'.format(key, subseq))

        sequence_length -= LINE_LENGTH
        start = LINE_LENGTH
        stop = LINE_LENGTH * 2
        output.write('\n')

        # Remaining blocks, each separated by an empty line.
        while sequence_length > 0:
            for key in f.keys():
                subseq = []
                for chunk in grouper(f[key][start:stop], CHUNK_LENGTH, ' '):
                    subseq.append(''.join(item[0] for item in chunk))
                subseq = ' '.join(subseq)
                output.write('{0} {1}\n'.format(PAD_STRING, subseq))
            sequence_length -= LINE_LENGTH
            start += LINE_LENGTH
            stop += LINE_LENGTH
            output.write('\n')

Exemplo n.º 56

0

Exibir arquivo

Arquivo: nwAlignment.py Projeto: B-Rich/gsinghal_python_src

def align():
    """Globally align the first 10000 bases of every hg19 chromosome with
    the same-named YRI sequence and write the differing alignment columns
    to ``hg19_YRI_diff.bed`` (comma-separated, with a header line).
    """
    hg19 = Fasta('hg19.fa')
    print(hg19.keys())

    hg19Chr = sorted(hg19.keys(), reverse=True)

    YRI = Fasta('YRIref.fasta')
    print(YRI.keys())
    YRIChr = sorted(YRI.keys())
    print(hg19[hg19Chr[0]][:20])
    print(YRI[YRIChr[0]][:20])

    print(hg19[hg19Chr[0]][:20])
    print(YRI[YRIChr[0]][:20])

    # The context manager closes the output file even when an alignment
    # or a sequence lookup fails.
    with open('hg19_YRI_diff.bed', 'w') as fhout:
        header = 'chrom, chromStart, chromEnd, hg19, YRI \n'
        fhout.write(header)
        for each in hg19Chr:
            seq1 = hg19[each][:10000]
            seq2 = YRI[each][:10000]
            print('reached 1')
            print('doing alignment for  %s' % (each,))
            alignment = nw.global_align(seq1, seq2, gap=-2, matrix=None, match=1, mismatch=-1)
            print('reached 2')
            aligned_hg19 = alignment[0]
            aligned_yri = alignment[1]

            # Walk over every alignment column and report mismatches.
            for i in range(max(len(aligned_hg19), len(aligned_yri))):
                if aligned_hg19[i] != aligned_yri[i]:
                    outline = each + ',' + str(i) + ',' + str(i+1) + ',' + aligned_hg19[i] + ',' + aligned_yri[i] + '\n'
                    fhout.write(outline)

Exemplo n.º 57

0

Exibir arquivo

Arquivo: check.aln.fasta.overlap.py Projeto: guochangjiang/Python.learn

'''
Check whether every pair of sequences in an aligned fasta file shares an
overlapping region.
'''

__version__ = "1.0"

from pyfasta import Fasta
import argparse

# Command-line option handling
parser = argparse.ArgumentParser()
parser.add_argument("-i", "-in", "--input", metavar="filename", dest="input", type=str , help="fasta file to check")
parser.add_argument("-v", "--version", action='version', help="The version of this program.", version = "Version: " + __version__)
args = parser.parse_args()

f = Fasta(args.input)
loci = sorted(f.keys())
for locus1 in loci:
    for locus2 in loci:
        flag = 0
        sequence1 = f[locus1]
        sequence2 = f[locus2]
        i = 0
        while i < len(sequence1) and i < len(sequence2):
            base1 = sequence1[i]
            base2 = sequence2[i]
            if base1 != "-" and base2 != "-":
                flag = 1
                break
            i += 1
        if flag == 0: