Example No. 1
def check_keyfn(path, klass, inplace):
    f = Fasta(path, record_class=klass, flatten_inplace=inplace, key_fn=lambda key: key.split()[0])
    assert sorted(f.keys()) == ['a', 'b', 'c'], f.keys()
    fix(path)
    ff = Fasta(path, record_class=klass, flatten_inplace=inplace)
    assert sorted(ff.keys()) == ['a extra', 'b extra', 'c extra'], (ff.keys(), klass)
    fix(path)
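The key_fn hook controls how FASTA headers become dictionary keys. A minimal sketch of the same idea, assuming a hypothetical test.fasta whose headers carry extra tokens such as ">a extra":

from pyfasta import Fasta

# keep only the first whitespace-separated token of each header,
# so ">a extra" is looked up as plain "a"
f = Fasta('test.fasta', key_fn=lambda key: key.split()[0])
print(sorted(f.keys()))  # e.g. ['a', 'b', 'c']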
Example No. 2
def read_fa(fa='/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/data/mm10/transcriptome/mm10_transcriptome.fa'):
	gj.printFuncRun('read_fa')
	gj.printFuncArgs()
	fa_dict = Fasta(fa, key_fn=lambda key:key.split("\t")[0])
	print fa_dict.keys()[0:3]
	gj.printFuncRun('read_fa')
	return fa_dict
Example No. 3
def parse_align(train_fa, validation_fa, blastn_output, savefn):
    train_fa_dict = Fasta(train_fa)
    validation_fa_dict = Fasta(validation_fa)
    seq_similarity_dict = nested_dict(2, list)
    for i in list(validation_fa_dict.keys()):
        for j in list(train_fa_dict.keys()):
            seq_similarity_dict[i][j] = np.nan
    with open(blastn_output, 'r') as OUT:
        for line in OUT:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            arr = line.split('\t')
            seq_similarity_dict[arr[0]][arr[1]] = -np.log10(float(arr[10]))
    seq_similarity_df = pd.DataFrame.from_dict(seq_similarity_dict,
                                               orient='index')

    fig, ax = plt.subplots(figsize=(12, 30))
    sns.heatmap(seq_similarity_df.T.head(1000),
                xticklabels=False,
                yticklabels=False,
                cmap="YlGnBu")
    plt.tight_layout()
    plt.savefig(savefn)
    plt.close()
    return seq_similarity_df
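The -np.log10(float(arr[10])) transform works because column 11 of tabular BLAST output (-outfmt 6 or 7) is the e-value. A small sketch of the assumed column layout, as a sanity check rather than part of the original script:

# default tabular BLAST columns (assumed -outfmt 6/7 defaults)
BLAST_FIELDS = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen',
                'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore']

def parse_hit(line):
    # returns (query id, subject id, e-value) for one alignment row
    rec = dict(zip(BLAST_FIELDS, line.rstrip('\n').split('\t')))
    return rec['qseqid'], rec['sseqid'], float(rec['evalue'])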
Example No. 4
def check_keyfn2(path, klass, inplace):
    f = Fasta(path, record_class=klass, flatten_inplace=inplace,
              key_fn=lambda key: "-".join(key.split()))

    assert sorted(f.keys()) == ['a-extra', 'b-extra', 'c-extra'], f.keys()

    assert f['a-extra']
    fix(path)
Example No. 5
class Alg:
    def __init__(self, fastafn, freqfn, colorfn):
        self.pos = []
        self.init = False
        self.size = 0
        self.fasta = Fasta(fastafn)
        self.colorfn = colorfn
        self.conta = {'n':0, '-':0, 'a':1, 'c':2, 'g':3, 't':4, '\n':'\n'}

        self.read_fasta(fastafn)
        self.write_freqs(freqfn)

    def do_plot(self, plot, names = False):
        msa = self.seqtocol(self.colorfn, names= names)
        if plot:
            return(msa)
        

    def read_fasta(self, fastafn):
        for entry in self.fasta.keys():
            seq = self.fasta[entry][:]
            if not self.init:
                # this assumes that all entries in the fasta file are the same length,
                # which is the default for clustalo output
                # TODO: add an assertion to verify this
                self.size = len(seq) 
                for i in range(0, self.size):
                    self.pos.append(Pos(i))
                self.init = True

            for nt in range(0, self.size):
                self.pos[nt].freq[seq[nt].lower()]+=1 
    
    def seqtocol(self, outfn, names=False):
        outf = open(outfn, 'w')
        colors = []
        for i, entry in enumerate(self.fasta.keys()):
            # the comprehension variable is renamed so it cannot shadow the
            # enumerate index used for the reshape below
            outf.write(entry + ',' + ','.join(str(self.conta[c.lower()]) for c in self.fasta[entry][:]) + '\n')
            if names:
                colors.append(entry)
            colors.extend(self.conta[c.lower()] for c in self.fasta[entry][:])
        outf.close()

        # TODO: this is very weird; check why one option returns the transpose
        if names:
            #colors = np.array(colors).reshape( 1+i, 1+len(self.fasta[entry][:])) 
            colors = np.array(colors).reshape( 1+len(self.fasta[entry][:]), 1+i) 
        else:
            colors = np.array(colors).reshape(1+i, len(self.fasta[entry][:])) 
        return(colors)

    def write_freqs(self, outfn):
        outf = open(outfn, 'w')
        outf.write('\t'.join(['a','c','t','g'])+'\n')
        for j in self.pos:
            outf.write('\t'.join([str(j.freq['a']),str(j.freq['c']),str(j.freq['t']),str(j.freq['g'])])+'\n')
        outf.close()
Example No. 6
def check_keyfn2(path, klass, inplace):
    f = Fasta(path,
              record_class=klass,
              flatten_inplace=inplace,
              key_fn=lambda key: "-".join(key.split()))

    assert sorted(f.keys()) == ['a-extra', 'b-extra', 'c-extra'], f.keys()

    assert f['a-extra']
    fix(path)
Example No. 7
def extract_only_ref_variant_fasta():
    f = Fasta(args.reference)
    if len(f.keys()) == 1:
        ref_id = str(f.keys()[0])
    ffp = open("%s/Only_ref_variant_positions_for_closely" %
               args.filter2_only_snp_vcf_dir).readlines()
    core_vcf_file = args.filter2_only_snp_vcf_filename.replace(
        '_filter2_final.vcf_no_proximate_snp.vcf',
        '_filter2_final.vcf_core.vcf.gz')
    fasta_string = ""
    count = 0
    for lines in ffp:
        lines = lines.strip()
        grep_position = "zcat %s | grep -v \'#\' | awk -F\'\\t\' \'{ if ($2 == %s) print $0 }\' | awk -F\'\\t\' \'{print $5}\'" % (
            core_vcf_file, lines)
        proc = subprocess.Popen([grep_position],
                                stdout=subprocess.PIPE,
                                shell=True)
        (out, err) = proc.communicate()
        out = out.strip()
        if out:
            if "," in out:

                split = out.split(',')
                fasta_string = fasta_string + split[0]
                print "HET SNP found: Position:%s; Taking the First SNP:%s" % (
                    lines, split[0])
                count += 1
            else:
                fasta_string = fasta_string + out
                count += 1
        else:
            fasta_string = fasta_string + str(
                f.sequence({
                    'chr': str(f.keys()[0]),
                    'start': int(lines),
                    'stop': int(lines)
                }))
            count += 1
    pattern = re.compile(r'\s+')
    fasta_string = re.sub(pattern, '', fasta_string)
    final_fasta_string = ">%s\n" % os.path.basename(
        core_vcf_file.replace('_filter2_final.vcf_core.vcf.gz',
                              '')) + fasta_string
    fp = open(
        "%s/%s_variants.fa" %
        (args.filter2_only_snp_vcf_dir,
         os.path.basename(
             core_vcf_file.replace('_filter2_final.vcf_core.vcf.gz', ''))),
        'w+')
    fp.write(final_fasta_string + '\n')
    fp.close()
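Each position above spawns a zcat | grep | awk pipeline, which gets slow for long position lists. A single-pass alternative with the gzip module, sketched under the assumption that it should mirror the awk logic ($2 == pos, print $5); the function name is made up:

import gzip

def load_alt_alleles(core_vcf_file):
    # one pass over the VCF instead of one subprocess per position;
    # maps POS (column 2) to the ALT column (column 5)
    alt_by_pos = {}
    with gzip.open(core_vcf_file, 'rt') as vcf:
        for line in vcf:
            if line.startswith('#'):
                continue
            cols = line.split('\t')
            alt_by_pos[cols[1]] = cols[4]
    return alt_by_pos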
Example No. 8
def main():
    args = make_parser()
    if args.inplace:
        f = Fasta(args.fasta_file, flatten_inplace=True)
    else:
        f = Fasta(args.fasta_file)

    if args.output_file is not None:
        output = open(args.output_file, 'w')
    else:
        output_file_name = args.fasta_file.split('.')[0]
        output_file = '{0}.phylip'.format(output_file_name)
        output = open(output_file, 'w')

    sequence_count = len(f.keys())
    sequence_length = len(f[next(iter(f.keys()))])
    # print('', sequence_count, sequence_length, sep=' ')
    output.write(' {0} {1}\n'.format(sequence_count, sequence_length))

    for key in f.keys():
        subseq = []
        for chunk in grouper(f[key][:LINE_LENGTH], CHUNK_LENGTH):
            subseq.append(''.join(item[0] for item in chunk))
        subseq = ' '.join(subseq)
        if len(key) < CHUNK_LENGTH:
            key = key.ljust(CHUNK_LENGTH)
        else:
            key = key[:CHUNK_LENGTH]
        # print(key, ' ', subseq)
        output.write('{0} {1}\n'.format(key, subseq))

    sequence_length -= LINE_LENGTH
    start = LINE_LENGTH
    stop = LINE_LENGTH * 2
    # print()
    output.write('\n')

    while sequence_length > 0:
        for key in f.keys():
            subseq = []
            for chunk in grouper(f[key][start:stop], CHUNK_LENGTH, ' '):
                subseq.append(''.join(item[0] for item in chunk))
            subseq = ' '.join(subseq)
            # print(PAD_STRING, ' ', subseq)
            output.write('{0} {1}\n'.format(PAD_STRING, subseq))
        sequence_length -= LINE_LENGTH
        start += LINE_LENGTH
        stop += LINE_LENGTH
        # print()
        output.write('\n')

    output.close()
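The converter relies on a grouper helper (plus the LINE_LENGTH, CHUNK_LENGTH and PAD_STRING constants) defined elsewhere in the script; grouper is presumably the standard itertools recipe, sketched here:

from itertools import zip_longest

def grouper(iterable, n, fillvalue=None):
    # collect data into fixed-length chunks: grouper('ABCDEFG', 3, 'x') -> ABC DEF Gxx
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)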
Example No. 9
def removehost(fasta, bed):
    removeregion = dict()

    with open(bed) as bedin:
        for i in bedin:

            removeregion[i.rstrip()] = 1

    fa = Fasta(fasta)

    outfile = 'removehost_' + fasta

    outio = open(outfile, 'w')

    for seqname in fa.keys():

        if seqname in removeregion:

            continue

        else:

            outst = '>' + seqname + '\n' + str(fa[seqname]) + '\n'

            outio.write(outst)

    outio.close()
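Despite the bed parameter name, the file is read line by line and matched against whole sequence names, so it is effectively a list of records to drop. A hypothetical invocation (file names are made up):

# host_ids.txt holds one sequence name per line, exactly as it appears in the FASTA
removehost('assembly.fasta', 'host_ids.txt')
# writes removehost_assembly.fasta containing only the unlisted sequences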
Example No. 10
    def run(self, filename):
        self.openOutFiles(filename)
        f = Fasta(filename)

        count = len(f)
        self.not_found_in_kabat, self.fr4_not_found, current = (0, 0, 0)

        for name in f.keys():
            current += 1
            if current % 1000 == 0:
                print "All %d. Current: %d" % (count, current)
                # format: vName_jName{frameNumber} or vName_dName{frameNumber}_jName{frameNumber}

            vGeneName = name.split("_")[0]

            vGeneRegions = self.getVGeneRegions(vGeneName)
            if vGeneRegions is None:
                continue

            withoutMarkup = f[name][vGeneRegions[self.kabat.regions_count * 2 - 1]:]
            group = self.findFR4(name, withoutMarkup)
            if group is None:
                continue

            self.result_kabat_file.write(name)
            self.result_kabat_file.write(("\t%d" * 10) % tuple(vGeneRegions))
            self.result_kabat_file.write(("\t%d" * 4 + "\n") % tuple(
                [vGeneRegions[9] + i for i in [1, group.start(), group.start() + 1, len(withoutMarkup)]]))

        self.closeOutFiles()
        print "all: {}; not in kabat: {}; without fr4: {}".format(current, self.not_found_in_kabat, self.fr4_not_found)
Example No. 11
def aa_seq(options):
    """ Gets the ancestral sequence from a Fasta file

    """
    f = Fasta(options.ancestralfasta)
    keyz = (f.keys())
    match = ''
    if (options.single_chromosome):
        # Single chromosome fasta should only have one sequence.
        # that sequence should be the sequence of interest.
        keyz = list(keyz)
        key = keyz[0]
    else:
        get_chromosome_from_header = options.header
        get_chromosome_from_header = \
            get_chromosome_from_header.replace('?', options.chromosome)
        for key in keyz:
            if re.match(get_chromosome_from_header, key) is not None:
                match = key
        if not match:
            raise Exception("No match possible; something may be wrong with the"
                            " regex specified to the program as"
                            " --header-regex")
        key = match
    aaSeq = f[key]
    return(aaSeq)
Example No. 12
    def _no_empty(self, lista, listb):
        ''' removes empty entries '''
        
        # check for empty fasta.
        tmpa = list()
        tmpb = list()
        for i in range(len(listb)):
            
            # open it.
            try:
                z = Fasta(listb[i], record_class=MemoryRecord)
            
                # check for empty.
                if len(z.keys()) == 0:
                    continue

                # add to temp.
                tmpa.append(lista[i])
                tmpb.append(listb[i])

            except Exception:
                logging.warning("bad fasta file: %s" % listb[i])
            
        # sort back.
        return tmpa, tmpb
Example No. 13
def aa_seq(options):
    """ Gets the ancestral sequence from a Fasta file

    """
    f = Fasta(options.ancestralfasta)
    keyz = (f.keys())
    match = ''
    if (options.single_chromosome):
        # Single chromosome fasta should only have one sequence.
        # that sequence should be the sequence of interest.
        keyz = list(keyz)
        key = keyz[0]
    else:
        get_chromosome_from_header = options.header
        get_chromosome_from_header = \
            get_chromosome_from_header.replace('?', options.chromosome)
        for key in keyz:
            if re.match(get_chromosome_from_header, key) is not None:
                match = key
        if not match:
            raise Exception("No match possible; something may be wrong with the"
                            " regex specified to the program as"
                            " --header-regex")
        key = match
    aaSeq = f[key]
    return (aaSeq)
Example No. 14
def genome_contenct_stats(fasta_path):
    f = Fasta(fasta_path)
    g_box_total = []
    for seqid in f.keys():
        seq = f[seqid][:]
        g_boxs = len(re.findall('CACGTG',seq,flags=re.IGNORECASE))
        g_box_total.append(g_boxs)
    print >>sys.stderr, "total gboxes:{0}".format(sum(g_box_total))
Example No. 15
def create_fasta_flat_file(file):
    """Reads a fasta file for fast sequence retrival"""

    fasta_file = Fasta(file, key_fn=lambda key: key.split()[0])

    fasta_headers = set(fasta_file.keys());

    return fasta_file, fasta_headers
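A hypothetical usage, assuming headers like ">chr1 assembled 2024"; because of the key_fn, records are retrieved by their first token only:

fasta_file, fasta_headers = create_fasta_flat_file('genome.fa')
if 'chr1' in fasta_headers:
    first_100 = fasta_file['chr1'][:100]  # slicing is lazy; only this region is read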
Example No. 16
def genome_contenct_stats(fasta_path):
    f = Fasta(fasta_path)
    g_box_total = []
    for seqid in f.keys():
        seq = f[seqid][:]
        g_boxs = len(re.findall("CACGTG", seq, flags=re.IGNORECASE))
        g_box_total.append(g_boxs)
    print >> sys.stderr, "total gboxes:{0}".format(sum(g_box_total))
Example No. 17
def extract_reference_allele():
    print "Extracting Reference Allele from Reference Fasta file - %s to REF\n" % args.reference
    # Get reference genome ID from reference fasta file
    get_reference = Fasta(args.reference)
    if len(get_reference.keys()) == 1:
        ref_id = get_reference.keys()[0]
    print "The reference genome ID from the reference genome - %s" % ref_id

    fileObj = open("REF", 'w+')
    fileObj.write('Ref' + '\n')
    for item in pos:
        ref_allele = str(
            get_reference.sequence({
                'chr': str(get_reference.keys()[0]),
                'start': int(item),
                'stop': int(item)
            }))
        fileObj.write(ref_allele + '\n')
    fileObj.close()
Example No. 18
def split(args):
    parser = optparse.OptionParser("""\
    split a fasta file into separate files.
        pyfasta split -n 6 [-k 5000 ] some.fasta
    the output will be some.1.fasta, some.2.fasta ... some.6.fasta
    the sizes will be as even as reasonable.
    """)
    parser.add_option("--header", dest="header", metavar="FILENAME_FMT",
       help="""this overrides all other options. if specified, it will
               split the file into a separate file for each header. it
               will be a template specifying the file name for each new file.
               e.g.:    "%(fasta)s.%(seqid)s.fasta"
               where 'fasta' is the basename of the input fasta file and seqid
               is the header of each entry in the fasta file.""" ,default=None)

    parser.add_option("-n", "--n", type="int", dest="nsplits", 
                            help="number of new files to create")
    parser.add_option("-o", "--overlap", type="int", dest="overlap", 
                            help="overlap in basepairs", default=0)
    parser.add_option("-k", "--kmers", type="int", dest="kmers", default=-1,
                     help="""\
    split big files into pieces of this size in basepairs. default
    default of -1 means do not split the sequence up into k-mers, just
    split based on the headers. a reasonable value would be 10Kbp""")
    options, fasta = parser.parse_args(args)
    if not (fasta and (options.nsplits or options.header)):
        sys.exit(parser.print_help())

    if isinstance(fasta, (tuple, list)):
        assert len(fasta) == 1, fasta
        fasta = fasta[0]

    kmer = options.kmers if options.kmers != -1 else None
    overlap = options.overlap if options.overlap != 0 else None
    f = Fasta(fasta)
    if options.header:
        names = dict([(seqid, options.header % \
                      dict(fasta=f.fasta_name, seqid=seqid)) \
                                       for seqid in f.keys()])
        """
        if len(names) > 0:
            assert names[0][1] != names[1][1], ("problem with header format", options.header)
        fhs = dict([(seqid, open(fn, 'wb')) for seqid, fn in names[:200]])
        fhs.extend([(seqid, StringIO(), fn) for seqid, fn in names[200:]])
        """
        return with_header_names(f, names)
    else:
        names = newnames(fasta, options.nsplits, kmers=kmer, overlap=overlap, 
                     header=options.header)

        #fhs = [open(n, 'wb') for n in names]
    if options.kmers == -1:
        return without_kmers(f, names)
    else: 
        return with_kmers(f, names, options.kmers, options.overlap)
Example No. 19
def mask_to_bed(fasta_file, mask_bed_name):
    "creates a bed file of the start and stops of masked seqs"
    mask_bed = open(mask_bed_name,"wb")
    f= Fasta(fasta_file)
    mask_id = 1
    for seqid in f.keys():
        seq = f[seqid][:]
        for m in re.finditer("X+",seq):
            mask_id = mask_id + 1
            w = '{0}\t{1}\t{2}\t{3}\t{4}\t+\t.\t.\t.\t1\t{5}\t0\n'.format(seqid,m.start(),m.end(),"mask_id {0}".format(mask_id),(m.end()-m.start()),(m.end()-m.start()+1))
            mask_bed.write(w)
Example No. 20
def mask_to_bed(fasta_file, mask_bed_name):
    "creates a bed file of the start and stops of masked seqs"
    mask_bed = open(mask_bed_name, "wb")
    f = Fasta(fasta_file)
    mask_id = 1
    for seqid in f.keys():
        seq = f[seqid][:]
        for m in re.finditer("X+", seq):
            mask_id = mask_id + 1
            w = '{0}\t{1}\t{2}\t{3}\t{4}\t+\t.\t.\t.\t1\t{5}\t0\n'.format(
                seqid, m.start(), m.end(), "mask_id {0}".format(mask_id),
                (m.end() - m.start()), (m.end() - m.start() + 1))
            mask_bed.write(w)
Example No. 21
def cut_up_genome(input_files_list, output_folder, region_length):
    for file in input_files_list:
        f = Fasta(file)
        chr = sorted(f.keys())
        for chromosome in chr:
            sequence = f[chromosome]
            regions = [
                sequence[i:i + region_length]
                for i in range(0, len(sequence), region_length)
            ]
            path = os.path.join(output_folder, f'chr={chromosome}')
            write_to_json(path, regions, region_length)
            print(f'{chromosome} is complete!')
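write_to_json is not shown in the snippet; a stand-in consistent with how it is called (path, list of regions, region length), where the names and file layout are assumptions:

import json
import os

def write_to_json(path, regions, region_length):
    # hypothetical helper: one JSON file per chromosome directory
    os.makedirs(path, exist_ok=True)
    with open(os.path.join(path, 'regions_%d.json' % region_length), 'w') as fh:
        json.dump([str(region) for region in regions], fh)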
Example No. 22
def process_query():
    print('Reading sequence library and query sequence')
    library = Fasta(library_path)
    queries = Fasta(query_path)
    query_sequence = str(queries["Rattus"])

    print('Processing')
    progress = progressbar.ProgressBar(max_value=len(library.keys()))
    cpu_count = multiprocessing.cpu_count()
    executor = ThreadPoolExecutor(max_workers=cpu_count)

    tasks = []
    for record in list(library.keys())[:library_process_limit]:
        library_sequence = str(library[record])
        future = executor.submit(align, library_sequence, query_sequence)
        tasks.append(AlignmentTask(record, future))

    results = []
    for i in range(len(tasks)):
        _, _, score = tasks[i].future.result()
        results.append(AlignmentResult(title=tasks[i].record, score=score))
        progress.update(i)

    etalone_score = sum([ smatrix[(x, x)] for x in query_sequence ])

    print("Done")
    print("Etalone score is %d" % etalone_score)
    print("Got %d results, here are top-30 among them:" % len(results))
    print("Score  | Match   | Record")

    for sequence in sorted(results, key=lambda x: x.score, reverse=True)[:30]:
        match = (sequence.score / etalone_score) * 100.0
        print("%6d | %5.3f%% | %s" % (sequence.score, match, sequence.title))

    timer = get_performance_timer()
    for time in [timer.dotplot, timer.regions, timer.align]:
        print(time / cpu_count)
Example No. 23
def spgenome(fafile, outdir, maxsize=1000000000):


    spfiles = list()
    if path.exists(fafile):

        outfiles = dict()

        subfiles = dict()

        infa = Fasta(fafile)

        # nowsub = 0

        nowlen = 0

        for chrom in infa.keys():

            chrlen = len(infa[chrom])

            nowlen = nowlen+chrlen

            nowsub = int(nowlen/maxsize)

            if nowsub not in subfiles:

                subfilename = 'tmpfile' + str(nowsub) + '.fa'

                subfile = path.join(outdir,subfilename)

                spfiles.append(subfile)

                subfiles[nowsub] = open(subfile,'w')

            # outfiles[chrom] = nowsub

            print('>', chrom, sep='', file=subfiles[nowsub])

            print(infa[chrom], file=subfiles[nowsub])

        for nowsub in subfiles:

            subfiles[nowsub].close()


    else:
        print("Can't find ", fafile)

    return spfiles
Example No. 24
class Sequence():
    """docstring for Sequence"""
    def __init__(self, engine='mysql', function = 'iterator', **kwargs):
        self.engine = engine
        if self.engine == 'mysql' and function == 'iterator':
            self.create_mysql_iterator(**kwargs)
        elif self.engine == 'biopython' and kwargs['data_type'] == 'fasta':
            self.create_biopython_iterator(**kwargs)
        elif self.engine == 'pyfasta' and kwargs['data_type'] == 'fasta':
            self.create_pyfasta_iterator(**kwargs)
        elif self.engine == 'twobit' and kwargs['data_type'] == 'twobit':
            self.create_twobit_iterator(**kwargs)

    def create_mysql_iterator(self, **kwargs):
        cur = kwargs['cursor']
        query = '''SELECT id, record FROM sequence WHERE n_count <= 2 AND 
                    trimmed_len > 40'''
        cur.execute(query)
        self.readcount = cur.rowcount
        self.read = iter(cur.fetchall())

    def create_biopython_iterator(self, **kwargs):
        from Bio import SeqIO
        print "Generating BioPython sequence index.  This may take a moment...."
        self.fasta = SeqIO.index(kwargs['input'], kwargs['data_type'])
        self.readcount = len(self.fasta)
        self.db_values = zip(range(len(self.fasta)), sorted(self.fasta.keys()))
        self.read = iter(self.db_values)

    def create_twobit_iterator(self, **kwargs):
        import bx.seq.twobit
        self.fasta = bx.seq.twobit.TwoBitFile(file(kwargs['input']))
        self.readcount = self.fasta.seq_count
        self.db_values = zip(range(self.fasta.seq_count), sorted(self.fasta.keys()))
        self.read = iter(self.db_values)

    def create_pyfasta_iterator(self, **kwargs):
        from pyfasta import Fasta
        print "Generating PyFasta sequence index.  This may take a moment...."
        self.fasta = Fasta(kwargs['input'])
        self.readcount = len(self.fasta)
        self.db_values = zip(range(len(self.fasta)), sorted(self.fasta.keys()))
        self.read = iter(self.db_values)

    def get_pyfasta_reads(self, **kwargs):
        from pyfasta import Fasta
        self.fasta = Fasta(kwargs['input'])
        self.readcount = len(self.fasta)
Example No. 25
def main():
    """
    select specific contigs from FASTA file
    """
    if len(sys.argv) == 2:
        prefix = sys.argv[1]
    else:
        print "Usage: python select.py <prefix>; assume that <prefix>_BspQI_key.txt <prefix>.fasta and <prefix>_list.txt exist; output will be <prefix>_selected.fasta"
        return 0
     
    ren = ReadTable(prefix+'_BspQI_key.txt', 4, '\t') # 4 lines of header 
    print 'renaming table',ren
    select = ReadTable(prefix+'_list.txt', 0) # no header, text file of contigs numbers, one per line
    print 'select list',select

    # create a dictionary between contig id x[0] and (FASTA id x[1])
    renaming = {}
    for x in ren:
        renaming[int(x[0])]=x[1] # contigs names are converted into integers, as well as length
    print 'renaming dictionary', renaming
  
    # collect the names of the contigs to be cut 
    selected_list = []
    for x in select:
        index = int(x[0]) # name of the contig to select, convert contig name into integer so we can match it
        #print 'index',index
        if index in renaming:
           selected_list.append(renaming[index]) # add the name of the contig
        else: 
           print 'Error: contig',index,'does not exist'
           sys.exit(-1)
    print 'selected_list', selected_list
 
    # open the fasta file for reading
    fas = Fasta(prefix+'.fasta')
    # open the new fasta file for writing
    ofa = open(prefix+'_new.fasta','w')
    print 'writing new fasta'
    for x in sorted(fas.keys()): # process all the contigs one by one
        if x in selected_list: # if it needs to be split
            print 'Selecting',x
            ofa.write('>'+x+'\n')
            ofa.write(fas[x][:]+'\n') # entire contig
        else: 
            print 'Not selecting',x
    ofa.close()
Example No. 26
def generate_corpusfile(fasta_fname, n, corpus_fname):
    '''
    Args:
        fasta_fname: input fasta file name
        n: the n-gram size (the sequence is split into n-grams)
        corpus_fname: output corpus file path
    Description:
        ProtVec uses word2vec internally, and word2vec requires a corpus
        file to be generated first.
    '''
    f = open(corpus_fname, "w")
    fasta = Fasta(fasta_fname)
    for record_id in tqdm(fasta.keys(), desc='corpus generation progress'):
        r = fasta[record_id]
        seq = str(r)
        ngram_patterns = split_ngrams(seq, n)
        for ngram_pattern in ngram_patterns:
            f.write(" ".join(ngram_pattern) + "\n")
    f.close()
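split_ngrams is defined elsewhere in ProtVec-style code; it presumably produces the n shifted reading frames of non-overlapping n-grams, along these lines:

def split_ngrams(seq, n):
    # 'AGAMQS', n=3 -> [['AGA', 'MQS'], ['GAM'], ['AMQ']]
    return [[seq[i:i + n] for i in range(offset, len(seq) - n + 1, n)]
            for offset in range(n)]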
Example No. 27
def get_sketch(fasta, n_kmers=100, k=15):
    # use a sample of kmers from a fastq
    hash_count = Counter()
    f = Fasta(fasta)
    for chrom in f.keys():
        seq = f[chrom]
        for i in range(len(seq) - k):
            kmer = seq[i:i + k]
            hash_count[kmer] += 1

    hashes_used = 0
    hashed_sketch = []
    for kmer in sorted(hash_count.keys()):
        if hashes_used <= n_kmers:
            #print(hash_count[i])
            hashed_sketch.append(kmer)
            hashes_used += 1

    return hashed_sketch
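Sorting raw k-mers keeps the lexicographically smallest ones, so this "sketch" is biased toward A-rich k-mers. A MinHash-style bottom-k sketch would sort by a hash instead; a minimal variant (the function name is made up):

import hashlib

def bottom_k_sketch(seq, k=15, n_kmers=100):
    # hash each k-mer and keep the n_kmers smallest digests
    digests = {hashlib.md5(seq[i:i + k].encode()).hexdigest()
               for i in range(len(seq) - k + 1)}
    return sorted(digests)[:n_kmers]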
Example No. 28
def read_fasta(ref_files, fasta_header):
    """Read fasta file

    New line character can only exist between header and sequence,
    not inside sequence

    Args:
        file_path (str): Path to fasta file.

    Returns:
        fasta_dict (dict): Dictionary with fasta headers as keys and the
            sequences as values.
    """
    # Open fasta file and store headers and sequences
    for fasta_path in ref_files:
        # print(fasta_path)
        fasta = Fasta(fasta_path)
        if fasta_header in fasta.keys():
            return fasta
Example No. 29
    def split_seqs(self, num_jobs, max_ref=5, max_qry=20):
        ''' splits reference and query into appropriate number of splits '''
        
        # load data into memory.
        r = Fasta(self.ref_fasta, record_class=MemoryRecord)
        q = Fasta(self.qry_fasta, record_class=MemoryRecord)
        
        ## reference ##
        # split according to criteria.
        if len(r) < max_ref:
            max_ref = len(r)
            
        if max_ref > num_jobs:
            max_ref = 1
        
        if len(q) < max_qry:
            max_qry = len(q)

        if num_jobs < max_qry:
            max_qry = num_jobs

        if (max_ref * max_qry) > num_jobs:
            max_qry = int(float(num_jobs) / float(max_ref))
        
        # count number of seqs.
        sc = len(r.keys())
        
        # create split info.
        self.ref_names = ["ref_%i" % x for x in range(max_ref)]
        self.ref_files = ["%s/%s.fasta" % (self.out_dir, x) for x in self.ref_names]
        
        # split according to rules.
        pyfasta.split_fasta.without_kmers(r, self.ref_files)
        self.ref_names, self.ref_files = self._no_empty(self.ref_names, self.ref_files)
        
        ## query ##
        # create split info.
        self.qry_names = ["qry_%i" % x for x in range(max_qry)]
        self.qry_files = ["%s/%s.fasta" % (self.out_dir, x) for x in self.qry_names]
        
        # split according to rules.
        pyfasta.split_fasta.without_kmers(q, self.qry_files)
        self.qry_names, self.qry_files = self._no_empty(self.qry_names, self.qry_files)
Example No. 30
def main():

    args = check_options(get_options())

    fain = Fasta(args.input)

    faout = open(args.output, 'w')

    minlen = int(1e6)

    print(minlen)

    shortseq = 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN'

    breacker = 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN'

    shortlist = list()

    for chrome in fain.keys():

        if len(fain[chrome]) < minlen:
            # print(chrome, len(fain[chrome]))
            # shortseq = shortseq + str(fain[chrome]) + breacker
            shortlist.append(chrome)

        else:
            print(chrome, len(fain[chrome]))
            print('>%s' % chrome, file=faout)
            print(fain[chrome], file=faout)

    print('>shortsequences', file=faout)

    for chrome in shortlist:

        print(str(fain[chrome]),shortseq,sep='',end='', file=faout)

#    print(shortseq, file=faout)


    faout.close()
Example No. 31
    def run(self, filename):
        self.openOutFiles(filename)
        f = Fasta(filename)

        count = len(f)
        self.not_found_in_kabat, self.fr4_not_found, current = (0, 0, 0)

        for name in f.keys():
            current += 1
            if current % 1000 == 0:
                print "All %d. Current: %d" % (count, current)
                # format: vName_jName{frameNumber} or vName_dName{frameNumber}_jName{frameNumber}

            vGeneName = name.split("_")[0]

            vGeneRegions = self.getVGeneRegions(vGeneName)
            if vGeneRegions is None:
                continue

            withoutMarkup = f[name][vGeneRegions[self.kabat.regions_count * 2 -
                                                 1]:]
            group = self.findFR4(name, withoutMarkup)
            if group is None:
                continue

            self.result_kabat_file.write(name)
            self.result_kabat_file.write(("\t%d" * 10) % tuple(vGeneRegions))
            self.result_kabat_file.write(("\t%d" * 4 + "\n") % tuple([
                vGeneRegions[9] + i for i in
                [1, group.start(),
                 group.start() +
                 1, len(withoutMarkup)]
            ]))

        self.closeOutFiles()
        print "all: {}; not in kabat: {}; without fr4: {}".format(
            current, self.not_found_in_kabat, self.fr4_not_found)
Example No. 32
def main():
    """
    select contigs from FASTA file that do not have "reads=1" on their header
    """
    if len(sys.argv) == 2:
        prefix = sys.argv[1]
    else:
        print "Usage: python get_nonsingleton_unitigs.py <canu_unassembled.fasta>; select contigs from FASTA file that do not have reads=1 on their header; creates <canu_unassembled_unitigs.fasta> file and runs n50 script"
        return 0

    count = 0
    fas = Fasta(prefix)
    ofa = open(prefix[:-6] + '_unitigs.fasta', 'w')
    for x in sorted(fas.keys()):  # process all the contigs one by one
        if "reads=1" in x:
            continue
        #print 'Selecting',x
        ofa.write('>' + x + '\n')
        ofa.write(fas[x][:] + '\n')  # entire contig
        count += 1
    print 'Selected', count, 'contigs with at least 2 reads'
    ofa.close()
    os.system("/home/stelo/bin/n50 -f " + prefix[:-6] + "_unitigs.fasta")
    os.system("rm -f *.flat *.gdx")
Example No. 33
def align():
    hg19 = Fasta('hg19.fa')
    print hg19.keys()

    hg19Chr = sorted(hg19.keys(), reverse=True)

    YRI = Fasta('YRIref.fasta')
    print YRI.keys()
    YRIChr = sorted(YRI.keys())
    print hg19[hg19Chr[0]][:20]
    print YRI[YRIChr[0]][:20]

    print hg19[hg19Chr[0]][:20]
    print YRI[YRIChr[0]][:20]

    fhout = open('hg19_YRI_diff.bed', 'w')

    header = 'chrom, chromStart, chromEnd, hg19, YRI \n'
    fhout.write(header)
    for each in hg19Chr:
        seq1 = hg19[each][:10000]
        seq2 = YRI[each][:10000]
        print 'reached 1'
        print 'doing alignment for ', each
        alignment = nw.global_align(seq1, seq2, gap=-2, matrix=None, match=1, mismatch=-1)
        print 'reached 2'
        len1 = len(alignment[0]) #hg19
        len2 = len(alignment[1]) #YRI

        if len2>len1:
            x = len2
        else:
            x = len1

        for i in range(x):
            if alignment[0][i] != alignment[1][i]:
                #write to fhout
                outline = each + ',' + str(i) + ',' + str(i+1) + ',' + alignment[0][i] + ',' + alignment[1][i] + '\n'
                fhout.write(outline)


    fhout.close()
Example No. 34
# Usage: python GC_from_fasta <fasta_file> [window_size]
from collections import Counter

from pyfasta import Fasta
import sys


f = Fasta(sys.argv[1], key_fn=lambda key: key.split()[0])
window_size = 301 if len(sys.argv) < 3 else int(sys.argv[2])
if not (window_size % 2):
    window_size += 1
out = open(sys.argv[1]+'.GC', 'w')
for chrom in f.keys():
    print chrom
    length = len(f[chrom])
    start = 0
    while start < length:
        c = Counter(f[chrom][start:start + window_size])
        try:
            gc = float(c['G'] + c['C'] + c['g'] + c['c'])
            at = float(c['T'] + c['t'] + c['A'] + c['a'])
            mid = start + (window_size - 1) / 2
            out.write('\t'.join(map(str, [chrom, mid, gc / (gc + at)])) + '\n')
        except ZeroDivisionError:
            pass
        start += window_size

Example No. 35
# version 1.1: this version is implemented with pyfasta.
import sys, os
from pyfasta import Fasta

if len(sys.argv) != 3:
    print 'Usage: *.py inputFile outputFile'
    sys.exit(0)
inputFile = sys.argv[1]
outputFile = sys.argv[2]


def writeFile(text, files):
    with open(files, 'a') as f:
        f.write(text)


if os.path.isfile(inputFile):
    f = Fasta(inputFile)
    for key in f.keys():
        writeFile(">" + key + os.linesep, outputFile)
        content = f.sequence(
            {
                'chr': key,
                'start': 0,
                'stop': len(f[key]) - 1,
                'strand': '-'
            },
            one_based=False)
        writeFile(content + os.linesep, outputFile)
else:
    print 'The input is not a file'
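The 'strand': '-' entry is the point of this script: pyfasta's sequence() returns the reverse complement for minus-strand requests, so the output holds a reverse-complemented copy of every record. A minimal one-record sketch (file and record names are hypothetical):

f = Fasta('example.fa')
rc = f.sequence({'chr': 'seq1', 'start': 0, 'stop': len(f['seq1']) - 1,
                 'strand': '-'}, one_based=False)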
Example No. 36
#pfam(key) - uniprot(value list) dictionary
with open("Metaproteome_pfam_forDMI.tab", "r") as legionella_domains_table:
    legionella_domains_table.readline()
    pfam_uniprot = {}
    for line in legionella_domains_table:
        line = line.strip().split("\t")
        if "," in line[1]:
            line[1] = line[1].split(",")
            for pfam in line[1]:
                if pfam not in pfam_uniprot:
                    pfam_uniprot[pfam] = []
                pfam_uniprot[pfam].append(line[0])

#uniprot(key) - motif(value list) dictionary
uniprot_motif = {}
for key in human.keys():
    for motif in elm_regex:
        match = re.search(str(elm_regex[motif]), str(human[key]))
        if match:
            if key not in uniprot_motif:
                uniprot_motif[key] = []
            #print("%s;%s;%s"%(motif,match.start(),match.end()))
            uniprot_motif[key].append(
                (motif, str(match.start()), str(match.end())))

with open("MPDMIresult.tsv", "w") as output:
    for pfam, uniprot_list in pfam_uniprot.items():
        for uniprot in uniprot_list:
            for motif in motif_domain:
                if pfam in motif_domain[motif]:
                    for uni, motif_list in uniprot_motif.items():
Example No. 37
def main():

    args = check_options(get_options())

    genomesize = int(os.path.getsize(args.genome)/1e6)

    kmer = int(log(genomesize, 4)+1)

    if kmer < 17:

        kmer = 17

    #jellyfish par
    lowercount = 2

    #jellyfish par
    jfsize = '100M'

    # split sequences longer than 10 Mb
    spsize = 10000000

    step = args.step

    maxkmerscore = int(((args.length * args.homology / 100) - kmer) * args.ploidy/2 + 0.5 )

    jfpool = Pool(args.threads)

    # decide whether to (re)build the kmer index
    jfkmerfile = os.path.join(args.saved,(os.path.basename(args.genome)+'_'+str(kmer)+'mer.jf'))

    kmerbuild = True

    if os.path.isfile(jfkmerfile):

        if not args.docker:

            print("find:", jfkmerfile)

            kmmess = "Found kmerfile "+jfkmerfile+". Do you want rebuild it?  Press Y or N to continue:"

            print(kmmess)

            while True:

                char = getch()

                if char.lower() in ("y", "n"):

                    print(char)

                    if char.lower() == 'y':

                        kmerbuild = True

                    elif char.lower() == 'n':

                        kmerbuild = False

                    break


    # decide whether to (re)build the bwa index
    bwaindexfile = os.path.basename(args.genome)

    bwatestindex = os.path.join(args.saved, bwaindexfile+'.sa')

    bwaindex = os.path.join(args.saved, bwaindexfile)

    bwabuild = True

    if os.path.isfile(bwatestindex):

        if not args.docker:

            print('found:', bwatestindex)

            bwamess = "Found bwa index file " + bwatestindex + ". Do you want to rebuild it? Press Y or N to continue:"

            print(bwamess)

            while True:

                char = getch()

                if char.lower() in ("y", "n"):

                    print(char)

                    if char.lower() == 'y':

                        bwabuild = True

                    elif char.lower() == 'n':

                        bwabuild = False

                    break

    print("genomesize:",genomesize, "kmer:",kmer, "jfkmerfile:",
          jfkmerfile, "kmerbuild:", kmerbuild, "bwabuild:", bwabuild, "threads:", args.threads)

    # Build Jellyfish index
    if kmerbuild:

        jfcount = jellyfish.jfcount(jfpath=args.jellyfish, mer=kmer, infile=args.genome, output=jfkmerfile,
                                    threads=args.threads, lowercount=lowercount, size=jfsize)

        if jfcount:

            print("JellyFish Count finished ...")

        else:

            print("JellyFish Count Error!!!")

            sys.exit(1)

    else:

        print("Use ", jfkmerfile)
    # End build Jellyfish index

    if bwabuild:

        bwa.bwaindex(args.bwa, args.genome, args.saved)

        print("bwa index build finished ...")

    else:

        print("Use", bwatestindex)


    jffilteredprobe = list()

#####

    if genomesize < 1000:

        fastain = Fasta(args.input)

        jffpbrunerlist = list()

        for seqname in fastain.keys():

            chrlen = len(fastain[seqname])

            if chrlen < spsize:

                start = 0

                end = chrlen - 1

                jffpbruner = jellyfish.JFfpbruner(jfpath=args.jellyfish, jfkmerfile=jfkmerfile, mer=kmer,
                                                  pyfasta=fastain, seqname=seqname, pblength=args.length,
                                                  maxkmerscore=maxkmerscore, start=start,
                                                  end=end, step=step)

                jffpbrunerlist.append(jffpbruner)

            else:

                chrblock = int(chrlen/spsize) + 1

                for i in range(chrblock):

                    start = i * spsize

                    end = start + spsize - 1

                    if end >= chrlen:

                        end = chrlen - 1

                    jffpbruner = jellyfish.JFfpbruner(jfpath=args.jellyfish, jfkmerfile=jfkmerfile, mer=kmer,
                                                  pyfasta=fastain, seqname=seqname, pblength=args.length,
                                                  maxkmerscore=maxkmerscore, start=start,
                                                  end=end, step=step)

                    jffpbrunerlist.append(jffpbruner)

        jffinished = 0

        print(len(jffpbrunerlist))

        for curpblist in jfpool.imap_unordered(jellyfish.kmerfilterprobe, jffpbrunerlist):

            jffilteredprobe.extend(curpblist)

            jffinished += 1

            print("Jellyfish filter: ",jffinished,'/',len(jffpbrunerlist), sep='')

        jfpool.close()

        print('Jellyfish filter finished!!')

    else:

        ### split the fasta file when the genome size is greater than 1 Gb

        print("genome size > 1G")

        subFas = spgenome.spgenome(args.input, args.saved)



        for subFafile in subFas:
            print(subFafile)
            fastain = Fasta(subFafile)

            jffpbrunerlist = list()

            for seqname in fastain.keys():

                chrlen = len(fastain[seqname])

                if chrlen < spsize:

                    start = 0

                    end = chrlen - 1

                    jffpbruner = jellyfish.JFfpbruner(jfpath=args.jellyfish, jfkmerfile=jfkmerfile, mer=kmer,
                                                      pyfasta=fastain, seqname=seqname, pblength=args.length,
                                                      maxkmerscore=maxkmerscore, start=start,
                                                      end=end, step=step)

                    jffpbrunerlist.append(jffpbruner)

                else:

                    chrblock = int(chrlen / spsize) + 1

                    for i in range(chrblock):

                        start = i * spsize

                        end = start + spsize - 1

                        if end >= chrlen:
                            end = chrlen - 1

                        jffpbruner = jellyfish.JFfpbruner(jfpath=args.jellyfish, jfkmerfile=jfkmerfile, mer=kmer,
                                                          pyfasta=fastain, seqname=seqname, pblength=args.length,
                                                          maxkmerscore=maxkmerscore, start=start,
                                                          end=end, step=step)

                        jffpbrunerlist.append(jffpbruner)

            jffinished = 0

            print(len(jffpbrunerlist))

            for curpblist in jfpool.imap_unordered(jellyfish.kmerfilterprobe, jffpbrunerlist):
                jffilteredprobe.extend(curpblist)

                jffinished += 1

                print(subFafile + " Jellyfish filter: ", jffinished, '/', len(jffpbrunerlist), sep='')


        jfpool.close()

        print('Jellyfish filter finished!!')


    tmppbfa = os.path.join(args.saved, os.path.basename(args.input)+'_tmp_probe.fa')

    tmppbfaio = open(tmppbfa, 'w')

    seqnum = 0

    for tmppb in jffilteredprobe:

        print('>','seq',seqnum, sep='',file=tmppbfaio)

        print(tmppb,file=tmppbfaio)

        seqnum += 1

    tmppbfaio.close()

    del jffilteredprobe

    bwafiltedpb = bwa.bwafilter(bwabin=args.bwa, reffile=bwaindex, inputfile=tmppbfa, minas=args.length,
                                maxxs=int(args.length*args.homology/100), threadnumber=args.threads)

    # print(bwafiltedpb)

    tmpbwaftlist = os.path.join(args.saved, os.path.basename(args.input)+'.bed')

    alltmpbwaftlist = os.path.join(args.saved, os.path.basename(args.input)+'_all.bed')

    tmpbwaftlistio = open(tmpbwaftlist,'w')

    allbwaftlistio = open(alltmpbwaftlist,'w')

    seqlenfile = os.path.join(args.saved, os.path.basename(args.input)+'.len')

    seqlenio = open(seqlenfile,'w')

    seqlength = bwa.bwareflength(bwabin=args.bwa, reffile=bwaindex)

    for seqname in seqlength:

            print(seqname, seqlength[seqname], sep='\t', file=seqlenio)

    seqlenio.close()


    oligobefortmf = list()

    for pbtmp in bwafiltedpb:

        # print(pbtmp, file=tmpbwaftlistio)
        nowpbcounter = dict()

        nowpbcounter['seq'] = pbtmp

        nowpbcounter['dTm'] = args.dtm

        nowpbcounter['rprimer'] = args.primer

        oligobefortmf.append(nowpbcounter)

    keepedprobe = list()

    ctedpb = 0

    oligobefortmflen = len(oligobefortmf)

    print("oligobefortmflen:",oligobefortmflen)

    pbftpool = Pool()

    for (pb, keep) in pbftpool.imap_unordered(probefilter, oligobefortmf):

        if keep:

            keepedprobe.append(pb)
                # print(pb, file=tmpbwaftlistio)
        ctedpb += 1

        if ctedpb % 10000 == 0:

            print(ctedpb,'/',oligobefortmflen)

    pbdictbychr = dict()

    pbftpool.close()

    for pb in keepedprobe:

        seq, chro, start = pb.split('\t')

        start = int(start)

        if chro in pbdictbychr:

            pbdictbychr[chro][start] = seq

        else:

            pbdictbychr[chro] = dict()

            pbdictbychr[chro][start] = seq

    lenrprimer = len(args.primer)

    if lenrprimer == 0:

            lenrprimer = 5

    slidwindow = lenrprimer+args.length

    for chro in pbdictbychr:

        startn = 0

        for startnow in sorted(pbdictbychr[chro]):

            endnow = startnow + args.length - 1

            print(chro, startnow, endnow, pbdictbychr[chro][startnow],file=allbwaftlistio,sep='\t')

            if startnow > startn+slidwindow:
                    #startn = startnow+slidwindow
                startn = startnow

                print(chro, startnow, endnow, pbdictbychr[chro][startnow], file=tmpbwaftlistio, sep='\t')


    tmpbwaftlistio.close()

    allbwaftlistio.close()

    print("Job finshed!!")
Example No. 38
def get_aln_size(consensus_ref):
    f = Fasta(consensus_ref)
    assert len(f) == 1
    return len(f[f.keys()[0]])
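Indexing f.keys()[0] assumes keys() returns a list, which holds for pyfasta under Python 2. A defensive variant that works whether keys() is a list or a view, as a sketch:

def get_aln_size_compat(consensus_ref):
    f = Fasta(consensus_ref)
    assert len(f) == 1
    return len(f[next(iter(f.keys()))])  # next(iter(...)) avoids indexing keys()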
Example No. 39

# Get the fasta name file from command line

inputfilename = sys.argv[1]
print 'Input fasta file: ', inputfilename


# Reading the input file

print 'Loading fasta file...'
f = Fasta(inputfilename)


# Getting all keys
KEYS = sorted( f.keys() )


# Now we will discard everything that is larger than a certain
# threshold defined by the following variable
Size_threshold = 9000

Maiores = {}
Menores = {}


# We now split them into two dictionaries depending on their size.
for j in KEYS:
    
    if len( f[j] ) > Size_threshold:
Example No. 40
import sys

import numpy as np

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

from pyfasta import Fasta

try:
    inpFasta = sys.argv[1]
except IndexError:
    print "Arguments: fasta_file"
    sys.exit(1)

# 100 bp window.
window = 100
fa = Fasta(inpFasta)

for seqid in fa.keys():
    # get sequence as a numpy array with dtype='c' -- char
    seq = np.array(fa[seqid], dtype='c')

    gcs = (seq == 'C') | (seq == 'G')

    # cast the booleans to ints.
    gcs = gcs.astype(np.uint8)

    kern = np.ones(window) / window

    # mode='same' has boundary effects but the output array is the same length as seq
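    # the snippet is cut off here; presumably it finishes by convolving
    # the 0/1 GC indicators with the averaging kernel (a hedged completion)
    gc_avg = np.convolve(gcs, kern, mode='same')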
Example No. 41
# dumb map
m = {"Notch2NL-C_Notch2NL-D": ("Notch2", "Notch2NL-A", "Notch2NL-B"),
    "Notch2NL-D": ("Notch2", "Notch2NL-A", "Notch2NL-B", "Notch2NL-C"),
    "all": ("Notch2", "Notch2NL-A", "Notch2NL-B", "Notch2NL-C", "Notch2NL-D")}

regions = {frozenset(["Notch2NL-D"]): [[0, 15866], [74917, 81068], [162369, 165396]],
           frozenset(["Notch2NL-D", "Notch2NL-C"]): [[15867, 74916]],
           frozenset(): [[81069, 162368], [165397, 2000000]]}



f = Fasta("stitched_alignment.fa")
results = {}
for exclude in [frozenset(), frozenset(["Notch2NL-D"]), frozenset(["Notch2NL-D", "Notch2NL-C"])]:
    t = open("tmp.fasta", "w")
    for para in sorted(set(f.keys()) - exclude):
        t.write(">{}\n{}\n".format(para, f[para]))
    t.close()
    n = '_'.join(sorted(exclude)) if len(exclude) > 0 else 'all'
    cmd = ['java', '-jar', '/cluster/home/ifiddes/jvarkit/dist-1.133/biostar94573.jar', '-R', n,
           'tmp.fasta']
    r = callProcLines(cmd)
    recs = [x.split() for x in r if not x.startswith("#")]
    results[exclude] = recs


raw_recs = []
for exclude, region in regions.iteritems():
    for start, stop in region:
        raw_recs.extend([x for x in results[exclude] if start < int(x[1]) <= stop])
Example No. 42
from d2 import d2
from phylum_data import PHYLUM_DATA
from pyfasta import Fasta

K = 25

seq_data = {}
scores = {}
metadata = {}

i = 0

for filename in glob(getenv("DATA_DIR", "data") + "/*.fna"):
    fasta = Fasta(filename)
    key = sorted(fasta.keys())[0]

    genbank_id = key.split(" ")[0]
    short_name = " ".join(key.split(" ")[1:3])
    org_phylum_data = PHYLUM_DATA.get(short_name, {})
    name = " ".join(key.split(" ")[1:-2])[:-1]

    metadata[genbank_id] = {
        "name": name,
        "phylum": org_phylum_data.get("phylum", ""),
        "domain": org_phylum_data.get("domain", ""),
        "ncbiLevel3": org_phylum_data.get("ncbiLevel3", "")
    }

    seq_data[genbank_id] = fasta[key][:]
Example No. 43
def main():
    try:
        ### steps ###
        ## load genome
        ## load mod blat
        ## iterate over blat hits and extract genomic sequences regarding the given fragment sizes (transcript and LTR) at the reference position
        ### 2 cases: strand + or -
        ## export bed with sequence coordinates to extract
        ## export fasta output using pybedtools
        ### seq_id: Qname ; Tname ; LTR size; transcript size ; total size; RC if strand "-"
        ### seq

        ## load genome
        logger.info("Loading fasta genome ...")
        if stat(args.genome).st_size == 0:
            logger.error("genome file is empty: " + args.genome )
            sys.exit(1)
        else:
            fasta = Fasta(args.genome)
            logger.info("genome file: " + args.genome)
            logger.info("number of reference sequences: " + str(len(sorted(fasta.keys()))))
        
        ## load mod blat
        logger.info("Loading modblat ...")
        if stat(args.modblat).st_size == 0:
            logger.error("modblat file is empty: " + args.modblat )
            sys.exit(1)
        else:
            logger.info("mod blat file: " + args.modblat)
            mb = ModBlat(args.modblat)

        logger.info("number of blat hits: " + str(len(mb.hits)))
        for hit in mb.hits:
            logger.log(0, "qname/tname pair: " + str(hit.qname) + "/" + str(hit.tname))

        ## compute genomic coordinates
        logger.info("Compute genomic bed items coordinates ...")
        bedItems= []
        for hit in mb.hits:
            bi = hit.computeGenomicSequenceBedItem(args.upstream_frag_sz, args.downstream_frag_sz)
            bedItems.append(bi.totuple())
        logger.info("number of bed items: " + str(len(bedItems)))

        ## export bed items to bed file
        logger.info("Export to bed file ...")
        bed = pybedtools.BedTool(bedItems)
        outfile = path.basename(path.splitext(args.modblat)[0]) + '_seqFlankBlatHit.bed'
        bed.saveas(outfile, trackline="track name='genomic sequence extraction flanking blat hit' color=128,0,0")
        num_lines = sum(1 for line in open(outfile))
        logger.info("number of lines in bed file: " + str(num_lines))

        ## get fasta sequence from bed
        logger.info("Get fasta sequences from bed ...")
        fasta_out = path.basename(path.splitext(args.modblat)[0]) + '_seqFlankBlatHit.fasta'
        bed = bed.sequence(fi=args.genome, s=True, name=True)
        bedout = bed.save_seqs(fasta_out)
        assert open(bedout.seqfn).read() == open(bed.seqfn).read()
        fout = Fasta(fasta_out)
        logger.info("flanking blat hits sequences file: " + fasta_out)
        logger.info("number of flanking sequences: " + str(len(sorted(fout.keys()))))

    except KeyboardInterrupt:
        print "Shutdown requested...exiting"
    except Exception:
        traceback.print_exc(file=sys.stdout)
Example No. 44
    def run(self):

        if self.kmerbuild:

            jfcounter = jellyfish.jfcount(jfpath=self.jellyfishpath, mer=self.kmer,
                                          infile=self.genomefile, output=self.jfkmerfile, threads=self.threadsnumber,
                                          lowercount=self.lowercount, size=self.size)

            """
                check jelly fish count run correctly
            """
            if jfcounter:

                self.progressnumber = self.progressnumber + 5

                self.notifyProgress.emit(self.progressnumber)

                self.notifyMessage.emit("JellyFish Count finished...")

            else:

                self.notifyMessage.emit("JellyFish Count Error!!!")

        else:
            jfcountmess = "Use " + self.jfkmerfile
            self.progressnumber = self.progressnumber + 5
            self.notifyProgress.emit(self.progressnumber)
            self.notifyMessage.emit(jfcountmess)

        if self.indexbuild:

            if self.aligner == 'BWA':

                bwa.bwaindex(self.alnpath, self.genomefile, self.samplefolder)

                self.notifyMessage.emit("BWA Index build finished...")

                self.progressnumber = self.progressnumber + 5
                self.notifyProgress.emit(self.progressnumber)

            elif self.aligner == 'BLAT':

                """
                    add code for BLAT
                """

                pass
        else:

            self.progressnumber = self.progressnumber + 5
            self.notifyProgress.emit(self.progressnumber)

        """
            load and splite input file
        """

        # splite sequence longer than 10M
        spsize = 10000000

        maxkmerscore = int(self.pblength * self.homology / 100) - self.kmer

        jffilteredprobe = list()

        fastain = Fasta(self.inputfile)

        jffpbrunerlist = list()


        for seqname in fastain.keys():

            chrlen = len(fastain[seqname])

            if chrlen < spsize:

                start = 0

                end = chrlen - 1

                jffpbruner = jellyfish.JFfpbruner(jfpath=self.jellyfishpath, jfkmerfile=self.jfkmerfile, mer=self.kmer,
                                                  pyfasta=fastain, seqname=seqname, pblength=self.pblength,
                                                  maxkmerscore=maxkmerscore, start=start,
                                                  end=end, step=self.step)
                jffpbrunerlist.append(jffpbruner)

            else:

                chrblock = int(chrlen / spsize) + 1

                for i in range(chrblock):

                    start = i * spsize

                    end = start + spsize - 1

                    if end >= chrlen:

                        end = chrlen - 1

                    jffpbruner = jellyfish.JFfpbruner(jfpath=self.jellyfishpath, jfkmerfile=self.jfkmerfile, mer=self.kmer,
                                                  pyfasta=fastain, seqname=seqname, pblength=self.pblength,
                                                  maxkmerscore=maxkmerscore, start=start,
                                                  end=end, step=self.step)

                    jffpbrunerlist.append(jffpbruner)



        jffinished = 0

        for curpblist in self.pool.imap_unordered(jellyfish.kmerfilterprobe, jffpbrunerlist):

            jffilteredprobe.extend(curpblist)

            tmpprogress = float(format(self.progressnumber + (jffinished/len(jffpbrunerlist) * 40),".2f"))

            self.notifyProgress.emit(tmpprogress)

            if self.isRunning():

                print("running")

            else:

                print("not running")

            jffinished += 1


        self.notifyMessage.emit('Jellyfish finished!!')

        self.progressnumber = 50.0

        self.notifyProgress.emit(self.progressnumber)

        tmppbfa = os.path.join(self.samplefolder, os.path.basename(self.inputfile)+'_tmp_probes.fa')

        tmppbfaio = open(tmppbfa, 'w')

        seqnum = 0

        for tmppb in jffilteredprobe:

            print('>','seq',seqnum, sep='',file=tmppbfaio)


            print(tmppb,file=tmppbfaio)


            seqnum += 1

        tmppbfaio.close()

        #delete jffilteredprobe and release memory
        del jffilteredprobe

        bwaindexfile = os.path.join(self.samplefolder, os.path.basename(self.genomefile))

        bwafiltedpb = bwa.bwafilter(bwabin=self.alnpath, reffile=bwaindexfile, inputfile=tmppbfa, minas=self.pblength,
                                    maxxs=int(self.pblength * self.homology / 100), threadnumber=self.threadsnumber)


        tmpbwaftlist = os.path.join(self.samplefolder, os.path.basename(self.inputfile)+'.bed')

        alltmpbwaftlist = os.path.join(self.samplefolder, os.path.basename(self.inputfile)+'_all.bed')

        tmpbwaftlistio = open(tmpbwaftlist,'w')

        allbwaftlistio = open(alltmpbwaftlist,'w')

        seqlenfile = os.path.join(self.samplefolder, os.path.basename(self.inputfile))+'.len'

        seqlenio = open(seqlenfile, 'w')

        seqlength = bwa.bwareflength(bwabin=self.alnpath, reffile=bwaindexfile)

        for seqname in seqlength:

            print(seqname, seqlength[seqname], sep='\t', file=seqlenio)

        seqlenio.close()


        oligobefortmf = list()

        for pbtmp in bwafiltedpb:

            # print(pbtmp, file=tmpbwaftlistio)
            nowpbcounter = dict()

            nowpbcounter['seq'] = pbtmp

            nowpbcounter['dTm'] = self.dTm

            nowpbcounter['rprimer'] = self.rprimer


            oligobefortmf.append(nowpbcounter)

        keepedprobe = list()

        self.progressnumber = 55

        self.notifyProgress.emit(self.progressnumber)

        ctedpb = 0



        oligobefortmflen = len(oligobefortmf)

        for (pb, keep) in self.pool.imap_unordered(probefilter, oligobefortmf):

            if keep:

                keepedprobe.append(pb)
                # print(pb, file=tmpbwaftlistio)

            ctedpb += 1

            if ctedpb % 10000 == 0:

                tmpprogress = float(format(self.progressnumber + (ctedpb/oligobefortmflen * 30),".2f"))

                self.notifyProgress.emit(tmpprogress)

        self.notifyProgress.emit(90)

        pbdictbychr = dict()

        #load pb to dict
        for pb in keepedprobe:

            # print(pb, file=tmpbwaftlistio)
            seq, chro, start = pb.split('\t')

            start = int(start)

            if chro in pbdictbychr:

                pbdictbychr[chro][start] = seq

            else:

                pbdictbychr[chro] = dict()



                pbdictbychr[chro][start] = seq


        #get length of primer
        lenrprimer = len(self.rprimer)

        if lenrprimer == 0:

            lenrprimer = 5

        slidwindow = lenrprimer+self.pblength
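
        # greedy tiling over the loop below: every probe is written to the
        # _all.bed file, but a probe is kept in the filtered .bed only when it
        # starts more than slidwindow bases past the previously kept probe (startn)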


        for chro in pbdictbychr:

            startn = 0

            for startnow in sorted(pbdictbychr[chro]):

                endnow = startnow + self.pblength - 1

                print(chro, startnow, endnow, pbdictbychr[chro][startnow],file=allbwaftlistio,sep='\t')

                if startnow > startn+slidwindow:

                    #startn = startnow+slidwindow
                    startn = startnow



                    print(chro, startnow, endnow, pbdictbychr[chro][startnow], file=tmpbwaftlistio, sep='\t')


        tmpbwaftlistio.close()

        allbwaftlistio.close()

        #remove temp fasta file
        # os.remove(tmppbfa)

        self.notifyProgress.emit(100)

        self.notifyMessage.emit('all finished!!')
Exemplo n.º 45
0
def dmi(bacterial_input, bacterial_id_col, bacterial_pf_col,
        human_receptors_DMI, output_file_path):
    bacterial_id_col = bacterial_id_col - 1
    bacterial_pf_col = bacterial_pf_col - 1

    # def rename(fasta_key):
    #     fasta_key = fasta_key.split("|")
    #     fasta_key = fasta_key[0]
    #     return fasta_key

    # fasta processing -> human.keys() lists the keys, human[key_name] returns the sequence
    human = Fasta(human_receptors_DMI)  #'human_receptors.fasta')

    #elm identifier(key) - regex(value) dictionary
    with open("elm_motif.tsv", "r") as motif_table:
        motif_table.readline()
        elm_regex = {}
        for line in motif_table:
            line = line.strip().split("\t")
            elm_regex[line[1]] = line[4]

    #motif(key) - domain(value list) dictionary
    with open("elm_interaction_domains.tsv", "r") as motif_domain_table:
        motif_domain_table.readline()
        motif_domain = {}
        for line in motif_domain_table:
            line = line.strip("\n").split("\t")
            if line[0] not in motif_domain:
                motif_domain[line[0]] = []
            motif_domain[line[0]].append(line[1])

    #pfam(key) - uniprot(value list) dictionary
    with open(bacterial_input, "r") as bacterial_proteins:
        bacterial_proteins.readline()
        bacterial_proteins = [
            a.strip().split("\t") for a in bacterial_proteins
        ]
        pfam_uniprot = dict([(a[bacterial_pf_col], [])
                             for a in bacterial_proteins])
        for line in bacterial_proteins:
            pfam_uniprot[line[bacterial_pf_col]].append(line[bacterial_id_col])

    #uniprot(key) - motif(value list) dictionary
    uniprot_motif = {}
    for key in human.keys():
        for motif in elm_regex:
            match = re.search(str(elm_regex[motif]), str(human[key]))
            if match:
                if key not in uniprot_motif:
                    uniprot_motif[key] = []
                #print("%s;%s;%s"%(motif,match.start(),match.end()))
                uniprot_motif[key].append(
                    (motif, str(match.start()), str(match.end())))

    with open(output_file_path, "w") as output:
        predictions = 0
        for pfam, uniprot_list in pfam_uniprot.items():
            for uniprot in uniprot_list:
                for motif in motif_domain:
                    if pfam in motif_domain[motif]:
                        for uni, motif_list in uniprot_motif.items():
                            for motif_2 in motif_list:
                                if motif_2[0] == motif:
                                    predictions += 1
                                    output.write(uni + ";" +
                                                 ";".join(motif_2) + ";" +
                                                 ";" + pfam + ";" + uniprot +
                                                 "\n")
    return predictions
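
A hypothetical invocation (file names are assumptions, not from the original source); note the column arguments are passed 1-based and shifted to 0-based inside the function:

n = dmi(bacterial_input="bacterial_proteins.tsv",
        bacterial_id_col=1, bacterial_pf_col=2,
        human_receptors_DMI="human_receptors.fasta",
        output_file_path="dmi_predictions.txt")
print(n, "predicted domain-motif interactions written")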
Exemplo n.º 46
0
                sys.exit(0)
        #process arguments

        genome1 = arg[0]
        genome2 = arg[1]
        genomeOut = arg[2]

        #open genomeOut file to write new genome
        gOut = open(genomeOut,'w')
        
        #open both genomes as pyFasta arrays
        Fgenome1 = Fasta(genome1)
        Fgenome2 = Fasta(genome2)
        
        #get chromosome names
        chroms = Fgenome1.keys()
        
        #for each chromosome
        
        for chrom in chroms:
        
                #convert pyFasta arrays to numpy arrays
                
                np_genome1 = np.array(Fgenome1[chrom])
                np_genome2 = np.array(Fgenome2[chrom])
                
                #get Boolean array from elementwise comparison of chromosomes
                chrom_matches = np.core.defchararray.equal(np_genome1,np_genome2)
                
                #make new array of size of chrom, fill with N's
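                #a sketch of one plausible continuation (an assumption; the
                #original example is truncated here): keep bases where the
                #genomes agree and mask differences with N
                new_chrom = np.full(np_genome1.shape, 'N', dtype=np_genome1.dtype)
                new_chrom[chrom_matches] = np_genome1[chrom_matches]

                #write the masked chromosome to the output genome
                gOut.write('>' + chrom + '\n')
                gOut.write(''.join(new_chrom.astype('U1')) + '\n')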
                
Exemplo n.º 47
0
# Import MAC fasta file
logComment('Importing MAC fasta file...')
mac_fasta = None
try:
	mac_fasta = Fasta(MACfile)
except Exception as e:
	print("Error while importing fasta file\n" + str(e))
	logComment("Can't import fasta file\n" + str(e))
	exit()

#if DEBUGGING:
#	print(list(mac_fasta.keys()))

# Record number of imported MAC contigs
macCount = len(mac_fasta.keys())
logComment(str(macCount) + ' sequences imported')

# Rough Blast parameters
dust = "yes" if Options['RoughBlastDust'] else "no"
ungapped = " -ungapped " if Options['RoughBlastUngapped'] else ""
maskLowercase = " -lcase_masking " if Options['BlastMaskLowercase'] else ""
	
logComment("BLAST rough pass parameters:\nblastn -task " + Options['RoughBlastTask'] + " -word_size " + str(Options['RoughBlastWordSize']) + " -max_hsps 0 " +
"-max_target_seqs 10000 -dust " + dust + ungapped + maskLowercase + "-num_threads " + str(Options['ThreadCount']) + 
" -outfmt \"10 qseqid sseqid pident length mismatch qstart qend sstart send evalue bitscore qcovs\"\n")

#Fine Blast parameters
dust = "yes" if Options['FineBlastDust'] else "no"
ungapped = " -ungapped " if Options['FineBlastUngapped'] else ""
	
Exemplo n.º 48
0
def main():
	
	# Functions for jtools:
	# Get sequence
	# Detect tandem acceptors NAGNAG
	# Annotate with genes
	# Jiggle
	# Bed to juncid
	# Guess frame
	# Find stops in intron + in frame
	# SVM recomputes
	# Splice site strength? ppt? 

	#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	# Load a fasta file
	#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	global f
	# Opening fasta filehandle
	print >> sys.stderr, "[%s] Opening fasta file" % (spanki_utils.timestamp())
	f = Fasta(fastafile)
	fastachr = set(sorted(f.keys()))
	#print fastachr

	########################################################
	### Parsing a juncbed file
	########################################################
	if (juncbedfile):

		print "juncid\toriginal_id\tdastring"
		print >> sys.stderr, "Loading", juncbedfile
		lines = csv.reader(open(juncbedfile, 'rb'), delimiter='\t')
		z = []
		for line in lines:
			pattern = re.compile('track')
			track = pattern.search(line[0])
			if not track:
				values = line
				blocksizes = values[10].split(",")
				blockstarts = values[11].split(",")
				chr = values[0]
				rangestart = int(values[1]) - 1
				rangeend = int(values[2])
				strand = values[5]
				id = values[3]
				intronstart = rangestart + int(blocksizes[0]) + 2
				intronend = rangeend - int(blocksizes[1]) 
				# Or..
				#intronend = rangestart + int(blocksizes[0]) + int(blockstarts[1])
			
				#chrXHet	800	1767	JUNC00000001	2	+	800	1767	255,0,0	2	20,63	0,904
			
				intronsize = intronend - intronstart
			
				juncid = chr + ":" + str(intronstart) + "_" + str(intronend) + ":" + strand
				dastring = intron_sequence_single(juncid,f)
				z.append(str(dastring))
				print juncid, values[3], dastring
		
		print >> sys.stderr, "Distribution of detected motifs:\n",Counter(z)
		quit("Done")
	########################################################
	### Parsing a intronbed file
	########################################################
	#scaffold_12916	13833982	13834044	10
	#scaffold_12916	13838614	13838676	67
	#scaffold_12916	13839119	13839204	75

	if (intronbedfile):
		print "juncid\tid\tdastring"
		lines = csv.reader(open(intronbedfile, 'rb'), delimiter='\t')
		for line in lines:
			pattern = re.compile('track')
			track = pattern.search(line[0])
			values = line
			if not track:
				chr = values[0]
				intronstart = int(values[1]) + 1
				intronend = int(values[2]) - 1
				strand = "+"
				id = values[0]
			
				intronsize = intronend - intronstart
			
				juncid = chr + ":" + str(intronstart) + "_" + str(intronend) + ":" + strand
				dastring = intron_sequence_single(juncid,f)
				
				print juncid, values[3], dastring
		
		
		quit("Done")
	########################################################

	########################################################
	### Converting from another format
	########################################################

	if gfffile:
	
		#reflist = tab_to_dict(gff)
		results = collections.defaultdict(lambda : collections.defaultdict(dict))
		gffdict = gff_to_dict(gfffile)
		for x in gffdict:
			#print x
			#print gffdict[x]
			if (gffdict[x]['feature_type'] == "exon_junction"):
				juncid = gffdict[x]['chr'] + ":" + str(int(gffdict[x]['start']) + 1) + "_" + str(int(gffdict[x]['end']) - 1) + ":" + gffdict[x]['strand']
			elif (gffdict[x]['feature_type'] == "intron"):
				juncid = gffdict[x]['chr'] + ":" + gffdict[x]['start'] + "_" + gffdict[x]['end'] + ":" + gffdict[x]['strand']
			dastring = intron_sequence_single(juncid,f)
			#print dastring
			results[x]['juncid'] = juncid
			results[x]['dastring'] = dastring
	
		print "ID\tjuncid\tdastring"
		for x in sorted(results.iterkeys()):
			print x, "\t", results[x]['juncid'], "\t", results[x]['dastring']
	
		quit()

	########################################################
	### Converting from another format
	########################################################

	if gtffile:
	
		#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		# Intializing the reference
		#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		# You need the gtf file, and the fasta file
		lookup = spanki_utils.prep_ref(gtffile,fastafile,output_dir)
		## Note that you now have a reference called ref.bam, and a lookup dict
		#tmp_dir = output_dir + "/tmp/"
		#reffile = tmp_dir + "/ref.bam"
		reffile = "tmp/ref.bam"
		#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		# Load an annotation, flattened as bam
		#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		print >> sys.stderr, "[%s] Trying to load annotation as bam" % (spanki_utils.timestamp())
		reffh = pysam.Samfile( reffile, "rb" )
		edgedict, refjuncs = spanki_parse_utils.parseRefAsBam(reffh)
		reffh.close()
		print >> sys.stderr, "[%s] Done loading annotation as bam" % (spanki_utils.timestamp())
	
		for junc in refjuncs:
			print junc

	
		quit()
	### Below are functions that operate on a junction list
	########################################################


	if jlist:
	
		#~~~~~~~~~~~~~~~~~~~
		# Load reference junction list
		#~~~~~~~~~~~~~~~~~~~
		reflist = tab_to_dict(jlist)
	
		# Find the junctions in jlist that are not in jtab
	
		myjuncs = reflist.keys()
	
	
	
		
		print >> sys.stderr, len(myjuncs), "in junction list"
	
		updonor = 20
		downdonor = 2
		upacceptor = 2
		downacceptor = 20
		
	
		for x in myjuncs:
			print x
			j1 = Junctionid(x)
			j1.display()
			if j1.strand == "+":
				#print Seq(f[j1.chr][j1.donor-updonor:j1.donor], IUPAC.unambiguous_dna)
				tempseq = Seq(f[j1.chr][j1.donor-updonor:j1.donor], IUPAC.unambiguous_dna)
				#print "***", tempseq.translate()
				
				#print Seq(f[j1.chr][j1.donor:j1.donor + downdonor], IUPAC.unambiguous_dna)
				#print Seq(f[j1.chr][j1.acceptor-upacceptor:j1.acceptor], IUPAC.unambiguous_dna)
				#print Seq(f[j1.chr][j1.acceptor:j1.acceptor + downacceptor], IUPAC.unambiguous_dna)
				nagstring = find_nag(Seq(f[j1.chr][j1.acceptor:j1.acceptor + downacceptor], IUPAC.unambiguous_dna))
				print nagstring
			elif j1.strand == "-":
				pass
				#print Seq(f[j1.chr][j1.donor:j1.donor + updonor], IUPAC.unambiguous_dna).reverse_complement()
				#print Seq(f[j1.chr][j1.donor - downdonor:j1.donor], IUPAC.unambiguous_dna).reverse_complement()
				#print Seq(f[j1.chr][j1.acceptor:j1.acceptor + upacceptor], IUPAC.unambiguous_dna).reverse_complement()
				#print Seq(f[j1.chr][j1.acceptor-downacceptor:j1.acceptor], IUPAC.unambiguous_dna).reverse_complement()
			else:
				quit("Don't recognize strand")
	
				#fiveprimeflank = fiveprimeflank.reverse_complement()
		quit("Done")
	
	
	quit()
	#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	# Older code that's not used yet
	#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	
	
	#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	# IRT
	#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	bamfh = pysam.Samfile( bamfile, "rb" )
	#for alignedread in samfile:
	# Need some kind of iterator to getread length from first alignment in sam
	print >> sys.stderr, "[%s] Getting intron read-though (IRT), may take awhile" % (spanki_utils.timestamp())
	IRT = intron_readthrough(myjuncs,bamfh)
	bamfh.close()
	print >> sys.stderr, "[%s] Done getting IRT" % (spanki_utils.timestamp())



	#for edgeid in covbyedge.keys():
	#	print edgeid, covbyedge[edgeid]
		

	# These are the fields you end up with after merging:
	#juncid	geneassign	cov	lirt	rirt	irt	dncov	ancov	numsamps
	#chr2L:22427471_22427525:- 	none 	2 	57 	28 	85 	0 	0 	1
	#chr2R:5702257_5702656:+ 	FBgn0040092 	13 	0 	0 	0 	0 	0 	2
	#chr2L:11436293_11436415:- 	FBgn0261648 	23 	0 	0 	0 	0 	0 	2
	#chr2R:9334834_9336812:- 	FBgn0013765 	6 	0 	0 	0 	0 	0 	2

	#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	# Now compile the results
	#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	# First show how you can get in the myjuncs list
	print >> sys.stderr, "Printing results table"
	print >> juncs_out, "juncid\tgeneassign\tannostatus\tintron_size\tgmcode\tregcode\tcov\tlirt\trirt\tirt\tdncov\tancov"
	
	for juncid in sorted(keys2):
		try:
			results = [juncid, jdict[juncid]['geneassign'], jdict[juncid]['annostatus'], jdict[juncid]['intron_size'], jdict[juncid]['gmcode'], jdict[juncid]['regcode'], jdict[juncid]['cov'], jdict[juncid]['lirt'], jdict[juncid]['rirt'], jdict[juncid]['irt'], jdict[juncid]['dncov'], jdict[juncid]['ancov']]
			print >> juncs_out, ('\t'.join(map(str,results)))
		except KeyError:
			#myjuncs.append(juncid)	
			j1 = Junctionid(juncid)
			donid = j1.donid
			accid = j1.accid
			if covbyedge[donid]: dncov = covbyedge[donid]
			else: dncov = 0
			if covbyedge[accid]: ancov = covbyedge[accid]
			else: ancov = 0
			results = [juncid, reflist[juncid]['geneassign'],  reflist[juncid]['annostatus'],  reflist[juncid]['intron_size'],  reflist[juncid]['gmcode'],  reflist[juncid]['regcode'], 0, IRT[juncid]['lirt'], IRT[juncid]['rirt'], IRT[juncid]['irt'], dncov, ancov]
			#print(results, sep='\t')
			print >> juncs_out, ('\t'.join(map(str,results)))

	quit("done")

	#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	# Parse the read alignments
	#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	# Parse the bam file
	## Get a table of junctions, table of donors etc.
	bamfh = pysam.Samfile( bamfile, "rb" )
	#JTAB,UNFILT_JTAB,STAB,NEWDTAB,MMES = parse_aligns_detailed(bamfh)
	JTAB,UNFILT_JTAB = quickcov(bamfh,anchorsize)
	bamfh.close()
	myjuncs = JTAB.keys()
	myjuncs.sort()
	
	#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	# Print junction list to the output directory
	#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	print "juncid\tunfilt_cov\tcov"
	for juncid in myjuncs:
		print juncid, UNFILT_JTAB[juncid], JTAB[juncid]
Exemplo n.º 49
0
def main():

    args = check_options(get_options())

    genomesize = int(os.path.getsize(args.genome)/1e6)

    kmer = int(log(genomesize, 4)+1)

    if kmer < 17:

        kmer = 17
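
    # e.g. a 3 Gb genome gives genomesize = 3000 and int(log(3000, 4) + 1) = 6,
    # so the floor of 17 applies; only a genome of at least 4**17 Mb would push
    # the formula past 17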

    #jellyfish par
    lowercount = 2

    #jellyfish par
    jfsize = '100M'

    # split sequences longer than 10M
    spsize = 10000000

    step = args.step

    maxkmerscore = int(args.length * args.homology / 100) - kmer

    jfpool = Pool(args.threads)

    # decide whether to (re)build the kmer index
    jfkmerfile = os.path.join(args.saved,(os.path.basename(args.genome)+'_'+str(kmer)+'mer.jf'))

    kmerbuild = True

    if os.path.isfile(jfkmerfile):

        if not args.docker:

            print("find:", jfkmerfile)

            kmmess = "Found kmerfile "+jfkmerfile+". Do you want to rebuild it? Press Y or N to continue:"

            print(kmmess)

            while True:

                char = getch()

                if char.lower() in ("y", "n"):

                    print(char)

                    if char.lower() == 'y':

                        kmerbuild = True

                    elif char.lower() == 'n':

                        kmerbuild = False

                    break


    # decide whether to (re)build the bwa index
    bwaindexfile = os.path.basename(args.genome)

    bwatestindex = os.path.join(args.saved, bwaindexfile+'.sa')

    bwaindex = os.path.join(args.saved, bwaindexfile)

    bwabuild = True

    if os.path.isfile(bwatestindex):

        if not args.docker:

            print('find:', bwatestindex)

            bwamess = "Found bwa index file " + bwatestindex + ". Do you want to rebuild it? Press Y or N to continue:"

            print(bwamess)

            while True:

                char = getch()

                if char.lower() in ("y", "n"):

                    print(char)

                    if char.lower() == 'y':

                        bwabuild = True

                    elif char.lower() == 'n':

                        bwabuild = False

                    break

    print("genomesize:",genomesize, "kmer:",kmer, "jfkmerfile:",
          jfkmerfile, "kmerbuild:", kmerbuild, "bwabuild:", bwabuild, "threads:", args.threads)

    # Build Jellyfish index
    if kmerbuild:

        jfcount = jellyfish.jfcount(jfpath=args.jellyfish, mer=kmer, infile=args.genome, output=jfkmerfile,
                                    threads=args.threads, lowercount=lowercount, size=jfsize)

        if jfcount:

            print("JellyFish Count finished ...")

        else:

            print("JellyFish Count Error!!!")

            sys.exit(1)

    else:

        print("Use ", jfkmerfile)
    # End build Jellyfish index

    if bwabuild:

        bwa.bwaindex(args.bwa, args.genome, args.saved)

        print("bwa index build finished ...")

    else:

        print("Use", bwatestindex)


    jffilteredprobe = list()

    fastain = Fasta(args.input)

    jffpbrunerlist = list()

    for seqname in fastain.keys():

        chrlen = len(fastain[seqname])

        if chrlen < spsize:

            start = 0

            end = chrlen - 1

            jffpbruner = jellyfish.JFfpbruner(jfpath=args.jellyfish, jfkmerfile=jfkmerfile, mer=kmer,
                                              pyfasta=fastain, seqname=seqname, pblength=args.length,
                                              maxkmerscore=maxkmerscore, start=start,
                                              end=end, step=step)

            jffpbrunerlist.append(jffpbruner)

        else:

            chrblock = int(chrlen/spsize) + 1

            for i in range(chrblock):

                start = i * spsize

                end = start + spsize - 1

                if end >= chrlen:

                    end = chrlen - 1

                jffpbruner = jellyfish.JFfpbruner(jfpath=args.jellyfish, jfkmerfile=jfkmerfile, mer=kmer,
                                              pyfasta=fastain, seqname=seqname, pblength=args.length,
                                              maxkmerscore=maxkmerscore, start=start,
                                              end=end, step=step)

                jffpbrunerlist.append(jffpbruner)

    jffinished = 0

    for curpblist in jfpool.imap_unordered(jellyfish.kmerfilterprobe, jffpbrunerlist):

        jffilteredprobe.extend(curpblist)

        jffinished += 1

        print("Jellyfish filter: ",jffinished,'/',len(jffpbrunerlist), sep='')

    jfpool.close()

    print('Jellyfish filter finished!!')

    tmppbfa = os.path.join(args.saved, os.path.basename(args.input)+'_tmp_probe.fa')

    tmppbfaio = open(tmppbfa, 'w')

    seqnum = 0

    for tmppb in jffilteredprobe:

        print('>','seq',seqnum, sep='',file=tmppbfaio)

        print(tmppb,file=tmppbfaio)

        seqnum += 1

    tmppbfaio.close()

    del jffilteredprobe

    bwafiltedpb = bwa.bwafilter(bwabin=args.bwa, reffile=bwaindex, inputfile=tmppbfa, minas=args.length,
                                maxxs=int(args.length*args.homology/100), threadnumber=args.threads)

    # print(bwafiltedpb)

    tmpbwaftlist = os.path.join(args.saved, os.path.basename(args.input)+'.bed')

    alltmpbwaftlist = os.path.join(args.saved, os.path.basename(args.input)+'_all.bed')

    tmpbwaftlistio = open(tmpbwaftlist,'w')

    allbwaftlistio = open(alltmpbwaftlist,'w')

    seqlenfile = os.path.join(args.saved, os.path.basename(args.input)+'.len')

    seqlenio = open(seqlenfile,'w')

    seqlength = bwa.bwareflength(bwabin=args.bwa, reffile=bwaindex)

    for seqname in seqlength:

        print(seqname, seqlength[seqname], sep='\t', file=seqlenio)

    seqlenio.close()


    oligobefortmf = list()

    for pbtmp in bwafiltedpb:

        # print(pbtmp, file=tmpbwaftlistio)
        nowpbcounter = dict()

        nowpbcounter['seq'] = pbtmp

        nowpbcounter['dTm'] = args.dtm

        nowpbcounter['rprimer'] = args.primer

        oligobefortmf.append(nowpbcounter)

    keepedprobe = list()

    ctedpb = 0

    oligobefortmflen = len(oligobefortmf)

    print("oligobefortmflen:",oligobefortmflen)

    pbftpool = Pool()

    for (pb, keep) in pbftpool.imap_unordered(probefilter, oligobefortmf):

        if keep:

            keepedprobe.append(pb)
                # print(pb, file=tmpbwaftlistio)
        ctedpb += 1

        if ctedpb % 10000 == 0:

            print(ctedpb,'/',oligobefortmflen)

    pbdictbychr = dict()

    pbftpool.close()

    for pb in keepedprobe:

        seq, chro, start = pb.split('\t')

        start = int(start)

        if chro in pbdictbychr:

            pbdictbychr[chro][start] = seq

        else:

            pbdictbychr[chro] = dict()

            pbdictbychr[chro][start] = seq

    lenrprimer = len(args.primer)

    if lenrprimer == 0:

        lenrprimer = 5

    slidwindow = lenrprimer+args.length

    for chro in pbdictbychr:

        startn = 0

        for startnow in sorted(pbdictbychr[chro]):

            endnow = startnow + args.length - 1

            print(chro, startnow, endnow, pbdictbychr[chro][startnow],file=allbwaftlistio,sep='\t')

            if startnow > startn+slidwindow:
                #startn = startnow+slidwindow
                startn = startnow

                print(chro, startnow, endnow, pbdictbychr[chro][startnow], file=tmpbwaftlistio, sep='\t')


    tmpbwaftlistio.close()

    allbwaftlistio.close()

    print("Job finshed!!")
Exemplo n.º 50
0
def acquire_chr(ref_genome):
    file = Fasta(ref_genome)
    return sorted(file.keys())
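
A hypothetical call (the reference path is an assumption); note sorted() returns the names in lexicographic, not numeric, order:

chroms = acquire_chr("reference_genome.fa")
print(chroms)  # e.g. ['chr1', 'chr10', 'chr2', ...]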
Exemplo n.º 51
0
def random_sequence(file):
    fasta = Fasta(file)
    key = choice(list(fasta.keys()))  # list() so choice() also works on Python 3 dict views
    return (key, fasta[key])
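
A hypothetical call (the file name is an assumption); the function returns the record name and the pyfasta record object rather than a plain string:

name, seq = random_sequence("transcripts.fa")
print(name, len(seq), seq[:50])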
Exemplo n.º 52
0
    def run(self):

        if self.kmerbuild:

            jfcounter = jellyfish.jfcount(jfpath=self.jellyfishpath,
                                          mer=self.kmer,
                                          infile=self.genomefile,
                                          output=self.jfkmerfile,
                                          threads=self.threadsnumber,
                                          lowercount=self.lowercount,
                                          size=self.size)
            """
                check that the jellyfish count ran correctly
            """
            if jfcounter:

                self.progressnumber = self.progressnumber + 5

                self.notifyProgress.emit(self.progressnumber)

                self.notifyMessage.emit("JellyFish Count finished...")

            else:

                self.notifyMessage.emit("JellyFish Count Error!!!")

        else:
            jfcountmess = "Use " + self.jfkmerfile
            self.progressnumber = self.progressnumber + 5
            self.notifyProgress.emit(self.progressnumber)
            self.notifyMessage.emit(jfcountmess)

        if self.indexbuild:

            if self.aligner == 'BWA':

                bwa.bwaindex(self.alnpath, self.genomefile, self.samplefolder)

                self.notifyMessage.emit("BWA Index build finished...")

                self.progressnumber = self.progressnumber + 5
                self.notifyProgress.emit(self.progressnumber)

            elif self.aligner == 'BLAT':
                """
                    add code for BLAT
                """

                pass
        else:

            self.progressnumber = self.progressnumber + 5
            self.notifyProgress.emit(self.progressnumber)
        """
            load and split the input file
        """

        # split sequences longer than 10M
        spsize = 10000000

        maxkmerscore = int(self.pblength * self.homology / 100) - self.kmer

        jffilteredprobe = list()

        fastain = Fasta(self.inputfile)

        jffpbrunerlist = list()

        for seqname in fastain.keys():

            chrlen = len(fastain[seqname])

            if chrlen < spsize:

                start = 0

                end = chrlen - 1

                jffpbruner = jellyfish.JFfpbruner(jfpath=self.jellyfishpath,
                                                  jfkmerfile=self.jfkmerfile,
                                                  mer=self.kmer,
                                                  pyfasta=fastain,
                                                  seqname=seqname,
                                                  pblength=self.pblength,
                                                  maxkmerscore=maxkmerscore,
                                                  start=start,
                                                  end=end,
                                                  step=self.step)
                jffpbrunerlist.append(jffpbruner)

            else:

                chrblock = int(chrlen / spsize) + 1

                for i in range(chrblock):

                    start = i * spsize

                    end = start + spsize - 1

                    if end >= chrlen:

                        end = chrlen - 1

                    jffpbruner = jellyfish.JFfpbruner(
                        jfpath=self.jellyfishpath,
                        jfkmerfile=self.jfkmerfile,
                        mer=self.kmer,
                        pyfasta=fastain,
                        seqname=seqname,
                        pblength=self.pblength,
                        maxkmerscore=maxkmerscore,
                        start=start,
                        end=end,
                        step=self.step)

                    jffpbrunerlist.append(jffpbruner)

        jffinished = 0

        for curpblist in self.pool.imap_unordered(jellyfish.kmerfilterprobe,
                                                  jffpbrunerlist):

            jffilteredprobe.extend(curpblist)

            tmpprogress = float(
                format(
                    self.progressnumber +
                    (jffinished / len(jffpbrunerlist) * 40), ".2f"))

            self.notifyProgress.emit(tmpprogress)

            if self.isRunning():

                print("running")

            else:

                print("not running")

            jffinished += 1

        self.notifyMessage.emit('kmer filter finished!!')

        self.progressnumber = 50.0

        self.notifyProgress.emit(self.progressnumber)

        tmppbfa = os.path.join(
            self.samplefolder,
            os.path.basename(self.inputfile) + '_tmp_probes.fa')

        tmppbfaio = open(tmppbfa, 'w')

        seqnum = 0

        for tmppb in jffilteredprobe:

            print('>', 'seq', seqnum, sep='', file=tmppbfaio)

            print(tmppb, file=tmppbfaio)

            seqnum += 1

        tmppbfaio.close()

        #delete jffilteredprobe and release memory
        del jffilteredprobe

        bwaindexfile = os.path.join(self.samplefolder,
                                    os.path.basename(self.genomefile))

        bwafiltedpb = bwa.bwafilter(bwabin=self.alnpath,
                                    reffile=bwaindexfile,
                                    inputfile=tmppbfa,
                                    minas=self.pblength,
                                    maxxs=int(self.pblength * self.homology /
                                              100),
                                    threadnumber=self.threadsnumber)

        tmpbwaftlist = os.path.join(self.samplefolder,
                                    os.path.basename(self.inputfile) + '.bed')

        alltmpbwaftlist = os.path.join(
            self.samplefolder,
            os.path.basename(self.inputfile) + '_all.bed')

        tmpbwaftlistio = open(tmpbwaftlist, 'w')

        allbwaftlistio = open(alltmpbwaftlist, 'w')

        seqlenfile = os.path.join(self.samplefolder,
                                  os.path.basename(self.inputfile)) + '.len'

        seqlenio = open(seqlenfile, 'w')

        seqlength = bwa.bwareflength(bwabin=self.alnpath, reffile=bwaindexfile)

        for seqname in seqlength:

            print(seqname, seqlength[seqname], sep='\t', file=seqlenio)

        seqlenio.close()

        oligobefortmf = list()

        for pbtmp in bwafiltedpb:

            # print(pbtmp, file=tmpbwaftlistio)
            nowpbcounter = dict()

            nowpbcounter['seq'] = pbtmp

            nowpbcounter['dTm'] = self.dTm

            nowpbcounter['rprimer'] = self.rprimer

            oligobefortmf.append(nowpbcounter)

        keepedprobe = list()

        self.progressnumber = 55

        self.notifyProgress.emit(self.progressnumber)

        ctedpb = 0

        oligobefortmflen = len(oligobefortmf)

        for (pb, keep) in self.pool.imap_unordered(probefilter, oligobefortmf):

            if keep:

                keepedprobe.append(pb)
                # print(pb, file=tmpbwaftlistio)

            ctedpb += 1

            if ctedpb % 10000 == 0:

                tmpprogress = float(
                    format(
                        self.progressnumber + (ctedpb / oligobefortmflen * 30),
                        ".2f"))

                self.notifyProgress.emit(tmpprogress)

        self.notifyProgress.emit(90)

        pbdictbychr = dict()

        #load pb to dict
        for pb in keepedprobe:

            # print(pb, file=tmpbwaftlistio)
            seq, chro, start = pb.split('\t')

            start = int(start)

            if chro in pbdictbychr:

                pbdictbychr[chro][start] = seq

            else:

                pbdictbychr[chro] = dict()

                pbdictbychr[chro][start] = seq

        #get length of primer
        lenrprimer = len(self.rprimer)

        if lenrprimer == 0:

            lenrprimer = 5

        slidwindow = lenrprimer + self.pblength

        for chro in pbdictbychr:

            startn = 0

            for startnow in sorted(pbdictbychr[chro]):

                endnow = startnow + self.pblength - 1

                print(chro,
                      startnow,
                      endnow,
                      pbdictbychr[chro][startnow],
                      file=allbwaftlistio,
                      sep='\t')

                if startnow > startn + slidwindow:

                    #startn = startnow+slidwindow
                    startn = startnow

                    print(chro,
                          startnow,
                          endnow,
                          pbdictbychr[chro][startnow],
                          file=tmpbwaftlistio,
                          sep='\t')

        tmpbwaftlistio.close()

        allbwaftlistio.close()

        #remove temp fasta file
        # os.remove(tmppbfa)

        self.notifyProgress.emit(100)

        self.notifyMessage.emit('all finished!!')
Exemplo n.º 53
0
def vcf_to_fasta(input_vcf, output_fasta, ref_seq,
                 species, use_indels, min_depth, free_bayes,
                 ploidy, to_fasta, main_sequence, coverage_files,
                 min_probs=0.8, impute=False, unique_only=False):
    # First part is to get the fasta sequence then take each position
    # and then alter the reference as necessary for each sample.
    # Because everyone will have different SNPs.
    f = Fasta(ref_seq)
    # For now this is only going to work with mtDNA sequences,
    # but plan to extend this in the future to full genome
    # gets the full genomes sequences and currently assumes
    # that the fasta only contains one sequence.
    min_depth = int(min_depth)
    ploidy = int(ploidy)
    if(impute):
        is_beagle = True
    index = [n for n, l in enumerate(f.keys()) if l.startswith(main_sequence)]
    index = index[0]
    full_sequence = list(str(f[f.keys()[index]]))
    min_max_coord = []
    first_coordinate = True 
    sample_fasta = {}
    unique_snps = {}
    if free_bayes or ploidy == 1:
        free_bayes = True
        ploidy = 1
    if(impute):
        is_beagle = True
        free_bayes = False
    sample_lines = {}
    vcf_reader = vcf.Reader(open(input_vcf, 'r'), strict_whitespace=True)
    samples = vcf_reader.samples
    sample_offset = {}
    sample_offset_end= {}
    for sample in samples:
        sample_lines[sample] = []
        sample_fasta[sample] = full_sequence[:]
        sample_offset[sample] = 0
        sample_offset_end[sample] = {}
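        # sample_offset counts how many bases insertions have shifted this
        # sample's downstream coordinates; sample_offset_end maps each
        # insertion's position to its length (consulted again in the impute
        # branch when masking low-coverage sites)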
    for record in vcf_reader:
        position = record.POS
        if first_coordinate:
            min_max_coord.append(str(position))
            first_coordinate = False
        for sample in record.samples:
            genotype = sample['GT']
            is_beagle = False
            temp_position = position - 1  + sample_offset[sample.sample]
            try:
                pl = sample['PL']
                pheno_l = [int(o) for o in pl]
                dp = sample['DP']
                pl = pheno_l.index(min(pheno_l))
                if genotype == None or float(dp) <= min_depth:
                    sample_fasta[sample.sample][temp_position] = 'N'
                    # Just to ensure, the bad thing doesn't occur
                    # Overwriting the N call.
                    continue
            except AttributeError:
                if not free_bayes:
                    is_beagle = True
                    gp = sample['GP']
                    g_l = [float(o) for o in gp]
                    if max(g_l) < min_probs:
                        #print sample
                        sample_fasta[sample.sample][temp_position] = 'N'
                        continue
                    pl = g_l.index(max(g_l))
                else:
                    if genotype == '.' or genotype == None:
                        sample_fasta[sample.sample][temp_position] = 'N'
                        continue
            except TypeError:
                sample_fasta[sample.sample][temp_position] = 'N'
                continue
            sample = sample.sample
            if free_bayes or ploidy == 1:
                genotype = genotype[0]
                if genotype == '0':
                    continue
            elif not is_beagle:
                genotype = genotype.split('/')
            else:
                genotype = genotype.split("|")
            # If pl is greater than zero
            ref = record.REF
            alt = record.ALT
            # Gl is substituted
            if free_bayes or int(pl) > 0:
                if is_ga_or_ct(ref, alt):
                    if not free_bayes:
                        if is_beagle:
                            if g_l[0] > g_l[2]:
                                continue
                        elif pheno_l[0] < pheno_l[2]:
                            continue
                no_alleles = 1 + len(alt)
                if not free_bayes:
                    genotype = genotype[0]
                real_gt = str(alt[int(genotype)-1])
                if real_gt == "*":
                    sample_fasta[sample][temp_position] = "N"
                    continue
                if to_fasta:
                    if species == 'human':
                        if position == 8270 and ref == "CACCCCCTCT":
                            sample_fasta[sample][8280:8289] = '-'*9
                            continue
                    for i in range(0, max(len(real_gt), len(ref))):
                        if  i == (len(real_gt) - 1) and i == (len(ref)- 1):
                            gt = real_gt[i]
                            if free_bayes and len(str(alt)) > 1:
                                real_gt = str(alt[0])
                            #print(temp_position)
                            sample_fasta[sample][temp_position] = gt
                        elif len(real_gt) > len(ref) and i != 0:
                            if use_indels:
                                if temp_position == 2677:
                                    print real_gt
                                    print ref 
                                    print real_gt[i]
                                gt = list(real_gt[i])
                                sample_offset_end[sample][temp_position] = len(gt)

                                temp_position = temp_position + 1
                                sample_fasta[sample] = \
                                    sample_fasta[sample][:temp_position] + \
                                    gt + sample_fasta[sample][temp_position:]
                                sample_offset[sample] += 1
                        elif len(real_gt) < len(ref) and i != 0:
                            sample_fasta[sample][temp_position + i] = '-'
                else:
                    if species == 'human':
                        if position == 955 and  "ACCCC" in str(alt[0]):
                            sample_lines[sample].extend(["960.1CCCCC"])
                            try:
                                unique_snps["960.1CCCCC"] += 1
                            except KeyError:
                                unique_snps["960.1CCCCC"] = 1
                            continue
                        if position == 8270 and ref == "CACCCCCTCT":
                            sample_lines[sample].extend([str(i)+"d" for i in range(8281, 8290)])
                            for item in [str(i) +"d" for i in range(8281, 8290)]:
                                try:
                                    unique_snps[item] += 1
                                except KeyError:
                                    unique_snps[item] = 1
                            continue
                        if position == 285 and ref == "CAA":
                            sample_lines[sample].extend([str(i) + "d" for i in range(290, 293)])
                            for item in [str(i) +"d" for i in range(290, 293)]:
                                try:
                                    unique_snps[item] += 1
                                except KeyError:
                                    unique_snps[item] = 1
                            continue
                        if position == 247 and ref == "GA":
                            sample_lines[sample].extend([str(249) + "d"])
                            item = str(249) + "d"
                            try:
                                unique_snps[item] += 1
                            except KeyError:
                                unique_snps[item] = 1
                            continue
                    for i in range(0, max(len(real_gt), len(ref))):
                        if i == (len(real_gt) - 1) and i == (len(ref)- 1):
                            gt = real_gt[i]
                            if free_bayes and len(str(alt)) > 1:
                                real_gt = str(alt[0])
                            sample_lines[sample].append(str(position+i) + gt)
                            if unique_only:
                                try:
                                    unique_snps[str(position+i) + gt] += 1
                                except KeyError:
                                    unique_snps[str(position+i) + gt] = 1
                        elif len(real_gt) > len(ref) and i != 0:
                            gt = real_gt[i]
                            sample_lines[sample].append(str(temp_position+i) + "."  + str(i) + gt)
                            if unique_only:
                                try:
                                    unique_snps[str(temp_position+i) + "."  + str(i) + gt] += 1
                                except KeyError:
                                    unique_snps[str(temp_position+i) + "."  + str(i) + gt] = 1
                            temp_position = temp_position - 1

                        elif len(real_gt) < len(ref) and i != 0:
                            sample_lines[sample].append(str(position+i) + "d")
                            if unique_only:
                                try:
                                    unique_snps[str(position+i) + "d"] += 1
                                except KeyError:
                                    unique_snps[str(position+i) + "d"] = 1
    if to_fasta:
        sample_fasta_count_changes = {}
        for sample in samples:
            if not impute:
                for cov in coverage_files:
                    if sample in cov:
                        with open(cov) as coverage_f:
                            start = 0
                            for line in coverage_f:
                                s_line = line.split('\t')
                                start_temp = int(s_line[1]) -1
                                while start_temp != start:
                                    sample_fasta[sample][start] = 'N'
                                    start += 1
                                coverage = int(s_line[3])
                                if coverage <= min_depth:
                                    sample_fasta[sample][start] = 'N'
                                start += 1
            else:
                for cov in coverage_files:
                    if sample in cov:
                        with open(cov) as coverage_f:
                            start = 0
                            for line in coverage_f:
                                s_line = line.split('\t')
                                start_temp = int(s_line[1]) -1
                                while start_temp != start:
                                    try:
                                        sample_fasta_count_changes[start] += 1
                                    except KeyError:
                                        sample_fasta_count_changes[start] = 1
                                    #sample_fasta[sample][start] = 'N'
                                    start += 1
                                coverage = int(s_line[3])
                                if coverage <= min_depth:
                                    try:
                                        sample_fasta_count_changes[start] += 1
                                    except KeyError:
                                        sample_fasta_count_changes[start] = 1
                                    #sample_fasta[sample][start] = 'N'
                                start += 1
            # TODO make sure that this cannot get called when using the indels option
        if impute:
            for sample in samples:
                offset = 0
                for i in range(0,len(sample_fasta[sample])):
                    if i in sample_offset_end[sample]:
                        offset += sample_offset_end[sample][i]    
                    try: 
                        temp_number = sample_fasta_count_changes[i]
                        if temp_number == len(coverage_files):
                            sample_fasta[sample][i+offset] = 'N'
                    except KeyError:
                        pass
        with open(output_fasta, 'w') as out:
            for sample in samples:
                out.write('>'+ sample + '\n')
                out.write("".join(sample_fasta[sample]) + '\n')
    else:
        if unique_only:
            unique_truth = {}
            for snp, count in unique_snps.items():
                if count == len(sample_lines):
                    unique_truth[snp] = False
                else:
                    unique_truth[snp] = True
        min_max_coord.append(str(position))
        with open(output_fasta, 'w') as hgrep_o:
            hgrep_o.write('SampleId\tRange\tHaploGroup\tPolymorphisms (delimited by tab)\n')
            for sample, substitions in sample_lines.items():
                output_line = []
                output_line.append(sample)
                output_line.append('-'.join(min_max_coord))
                output_line.append("?")
                for sub in substitions:
                    if unique_only:
                        if unique_truth[sub] == True:
                            output_line.append(sub)
                    else:
                        output_line.append(sub)
                output_line = "\t".join(output_line) + "\n"
                if len(output_line.split('\t')) == 3:
                    continue
                hgrep_o.write(output_line)
Exemplo n.º 54
0
def vcf_to_fasta(input_vcf,
                 output_fasta,
                 ref_seq,
                 species,
                 use_indels,
                 min_depth,
                 free_bayes,
                 ploidy,
                 to_fasta,
                 main_sequence,
                 coverage_files,
                 min_probs=0.8,
                 impute=False,
                 unique_only=False):
    # First part is to get the fasta sequence then take each position
    # and then alter the reference as necessary for each sample.
    # Because everyone will have different SNPs.
    f = Fasta(ref_seq)
    # For now this is only going to work with mtDNA sequences,
    # but plan to extend this in the future to full genome
    # gets the full genomes sequences and currently assumes
    # that the fasta only contains one sequence.
    min_depth = int(min_depth)
    ploidy = int(ploidy)
    if (impute):
        is_beagle = True
    index = [n for n, l in enumerate(f.keys()) if l.startswith(main_sequence)]
    index = index[0]
    full_sequence = list(str(f[f.keys()[index]]))
    min_max_coord = []
    first_coordinate = True
    sample_fasta = {}
    unique_snps = {}
    if free_bayes or ploidy == 1:
        free_bayes = True
        ploidy = 1
    if (impute):
        is_beagle = True
        free_bayes = False
    sample_lines = {}
    vcf_reader = vcf.Reader(open(input_vcf, 'r'), strict_whitespace=True)
    samples = vcf_reader.samples
    sample_offset = {}
    sample_offset_end = {}
    for sample in samples:
        sample_lines[sample] = []
        sample_fasta[sample] = full_sequence[:]
        sample_offset[sample] = 0
        sample_offset_end[sample] = {}
    for record in vcf_reader:
        position = record.POS
        if first_coordinate:
            min_max_coord.append(str(position))
            first_coordinate = False
        for sample in record.samples:
            genotype = sample['GT']
            is_beagle = False
            temp_position = position - 1 + sample_offset[sample.sample]
            try:
                pl = sample['PL']
                pheno_l = [int(o) for o in pl]
                dp = sample['DP']
                pl = pheno_l.index(min(pheno_l))
                if genotype == None or float(dp) <= min_depth:
                    sample_fasta[sample.sample][temp_position] = 'N'
                    # Just to ensure, the bad thing doesn't occur
                    # Overwriting the N call.
                    continue
            except AttributeError:
                if not free_bayes:
                    is_beagle = True
                    gp = sample['GP']
                    g_l = [float(o) for o in gp]
                    if max(g_l) < min_probs:
                        print(sample)
                        sample_fasta[sample.sample][temp_position] = 'N'
                        continue
                    pl = g_l.index(max(g_l))
                else:
                    if genotype == '.' or genotype == None:
                        sample_fasta[sample.sample][temp_position] = 'N'
                        continue
            except TypeError:
                sample_fasta[sample.sample][temp_position] = 'N'
                continue
            sample = sample.sample
            if free_bayes or ploidy == 1:
                genotype = genotype[0]
                if genotype == '0':
                    continue
            elif not is_beagle:
                genotype = genotype.split('/')
            else:
                genotype = genotype.split("|")
            # If pl is greater than zero
            ref = record.REF
            alt = record.ALT
            # Gl is substituted
            if free_bayes or int(pl) > 0:
                if is_ga_or_ct(ref, alt):
                    if not free_bayes:
                        if is_beagle:
                            if g_l[0] > g_l[2]:
                                continue
                        elif pheno_l[0] < pheno_l[2]:
                            continue
                no_alleles = 1 + len(alt)
                if not free_bayes:
                    genotype = genotype[0]
                real_gt = str(alt[int(genotype) - 1])
                if to_fasta:
                    if species == 'human':
                        if position == 8270 and ref == "CACCCCCTCT":
                            sample_fasta[sample][8280:8289] = '-' * 9
                            continue
                    for i in range(0, max(len(real_gt), len(ref))):
                        if i == (len(real_gt) - 1) and i == (len(ref) - 1):
                            gt = real_gt[i]
                            if free_bayes and len(str(alt)) > 1:
                                real_gt = str(alt[0])
                            print(temp_position)
                            sample_fasta[sample][temp_position] = gt
                        elif len(real_gt) > len(ref) and i != 0:
                            if use_indels:
                                gt = list(real_gt[i])
                                sample_offset_end[sample][temp_position] = len(
                                    gt)

                                temp_position = temp_position + 1
                                sample_fasta[sample] = \
                                    sample_fasta[sample][:temp_position] + \
                                    gt + sample_fasta[sample][temp_position:]
                                sample_offset[sample] += 1
                            else:
                                gt = real_gt[i]
                                sample_fasta[sample][temp_position] = gt[0]
                        elif len(real_gt) < len(ref) and i != 0:
                            sample_fasta[sample][temp_position + i] = '-'
                else:
                    if species == 'human':
                        if position == 955 and "ACCCC" in str(alt[0]):
                            sample_lines[sample].extend(["960.1CCCCC"])
                            try:
                                unique_snps["960.1CCCCC"] += 1
                            except KeyError:
                                unique_snps["960.1CCCCC"] = 1
                            continue
                        if position == 8270 and ref == "CACCCCCTCT":
                            sample_lines[sample].extend(
                                [str(i) + "d" for i in range(8281, 8290)])
                            for item in [
                                    str(i) + "d" for i in range(8281, 8290)
                            ]:
                                try:
                                    unique_snps[item] += 1
                                except KeyError:
                                    unique_snps[item] = 1
                            continue
                        if position == 285 and ref == "CAA":
                            sample_lines[sample].extend(
                                [str(i) + "d" for i in range(290, 293)])
                            for item in [
                                    str(i) + "d" for i in range(290, 293)
                            ]:
                                try:
                                    unique_snps[item] += 1
                                except KeyError:
                                    unique_snps[item] = 1
                            continue
                        if position == 247 and ref == "GA":
                            sample_lines[sample].extend([str(249) + "d"])
                            item = str(249) + "d"
                            try:
                                unique_snps[item] += 1
                            except KeyError:
                                unique_snps[item] = 1
                            continue
                    for i in range(0, max(len(real_gt), len(ref))):
                        if i == (len(real_gt) - 1) and i == (len(ref) - 1):
                            gt = real_gt[i]
                            if free_bayes and len(str(alt)) > 1:
                                real_gt = str(alt[0])
                            sample_lines[sample].append(str(position + i) + gt)
                            if unique_only:
                                try:
                                    unique_snps[str(position + i) + gt] += 1
                                except KeyError:
                                    unique_snps[str(position + i) + gt] = 1
                        elif len(real_gt) > len(ref) and i != 0:
                            gt = real_gt[i]
                            sample_lines[sample].append(
                                str(temp_position + i) + "." + str(i) + gt)
                            if unique_only:
                                try:
                                    unique_snps[str(temp_position + i) + "." +
                                                str(i) + gt] += 1
                                except KeyError:
                                    unique_snps[str(temp_position + i) + "." +
                                                str(i) + gt] = 1
                            temp_position = temp_position - 1

                        elif len(real_gt) < len(ref) and i != 0:
                            sample_lines[sample].append(
                                str(position + i) + "d")
                            if unique_only:
                                try:
                                    unique_snps[str(position + i) + "d"] += 1
                                except KeyError:
                                    unique_snps[str(position + i) + "d"] = 1
    if to_fasta:
        sample_fasta_count_changes = {}
        for sample in samples:
            if not impute:
                for cov in coverage_files:
                    if sample in cov:
                        with open(cov) as coverage_f:
                            start = 0
                            for line in coverage_f:
                                s_line = line.split('\t')
                                start_temp = int(s_line[1]) - 1
                                while start_temp != start:
                                    sample_fasta[sample][start] = 'N'
                                    start += 1
                                coverage = int(s_line[3])
                                if coverage <= min_depth:
                                    sample_fasta[sample][start] = 'N'
                                start += 1
            else:
                for cov in coverage_files:
                    if sample in cov:
                        with open(cov) as coverage_f:
                            start = 0
                            for line in coverage_f:
                                s_line = line.split('\t')
                                start_temp = int(s_line[1]) - 1
                                while start_temp != start:
                                    try:
                                        sample_fasta_count_changes[start] += 1
                                    except KeyError:
                                        sample_fasta_count_changes[start] = 1
                                    #sample_fasta[sample][start] = 'N'
                                    start += 1
                                coverage = int(s_line[3])
                                if coverage <= min_depth:
                                    try:
                                        sample_fasta_count_changes[start] += 1
                                    except KeyError:
                                        sample_fasta_count_changes[start] = 1
                                    #sample_fasta[sample][start] = 'N'
                                start += 1
            # TODO make sure that this cannot get called when using the indels option
        if impute:
            for sample in samples:
                offset = 0
                for i in range(0, len(sample_fasta[sample])):
                    if i in sample_offset_end[sample]:
                        offset += sample_offset_end[sample][i]
                    try:
                        temp_number = sample_fasta_count_changes[i]
                        if temp_number == len(coverage_files):
                            sample_fasta[sample][i + offset] = 'N'
                    except KeyError:
                        pass
        with open(output_fasta, 'w') as out:
            for sample in samples:
                out.write('>' + sample + '\n')
                out.write("".join(sample_fasta[sample]) + '\n')
    else:
        if unique_only:
            # a polymorphism present in every sample is uninformative, so mark it False
            unique_truth = {}
            for snp, count in unique_snps.items():
                unique_truth[snp] = count != len(sample_lines)
        min_max_coord.append(str(position))
        with open(output_fasta, 'w') as hgrep_o:
            hgrep_o.write(
                'SampleId\tRange\tHaploGroup\tPolymorphisms (delimited by tab)\n'
            )
            for sample, substitutions in sample_lines.items():
                output_line = []
                output_line.append(sample)
                output_line.append('-'.join(min_max_coord))
                output_line.append("?")
                for sub in substitutions:
                    if unique_only:
                        if unique_truth[sub]:
                            output_line.append(sub)
                    else:
                        output_line.append(sub)
                output_line = "\t".join(output_line) + "\n"
                if len(output_line.split('\t')) == 3:
                    continue
                hgrep_o.write(output_line)
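The try/except KeyError blocks above all implement the same tally. A minimal, behavior-equivalent sketch using collections.defaultdict (the helper name record_variant is illustrative, not part of the original script):

from collections import defaultdict

# Tally how many samples carry each polymorphism label; defaultdict(int)
# replaces every try/except KeyError increment in the example above.
unique_snps = defaultdict(int)

def record_variant(label):
    # e.g. "8281d" for a deletion or "263G" for a substitution
    unique_snps[label] += 1

record_variant("8281d")
record_variant("8281d")
assert unique_snps["8281d"] == 2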
Exemplo n.º 55
0
def main():
    """
    cuts fasta file at specific location
    """
    if len(sys.argv) == 2:
        prefix = sys.argv[1]
    else:
        print "Usage: python split.py <prefix>; assume that <prefix>_BspQI_key.txt <prefix>.fasta and <prefix>_cut_list.csv exist; output will be <prefix>_new.fasta; cut_list is <contigID>,<loc1>,[<loc2>] -- first line is scaling constant"
        return 0
     
    ren = ReadTable(prefix+'_BspQI_key.txt', 4, '\t') # 4 lines of header 
    #print ren
    cut = ReadTable(prefix+'_cut_list.csv', 0, ',') # no header (saved as MS-DOS csv via Excel)
    #print cut 

    # create a dictionary between contig id and FASTA id x[1] and FASTA length x[2]
    renaming = {}
    for x in ren:
        renaming[int(x[0])] = (x[1], int(x[2])) # contig ids and lengths are stored as integers
    #print renaming
  
    # collect the names of the contigs to be cut 
    location = {}
    scaling = float(cut[0][0])
    print 'scaling constant',scaling
    for x in cut[1:]:
        index = int(x[0]) # name of the contig to cut, converted to an integer so we can match it
        if index in renaming:
            if len(x) == 2:
                l = int(round(float(x[1])/scaling)) # position to cut
                if l > renaming[index][1]: # check the length
                    print 'Error: cannot split contig',index,'at position',l,'because it is only',renaming[index][1],'bp long'
                    sys.exit(-1)
                else:
                    location[renaming[index][0]] = [l] # location[contig_name] -> position
            elif len(x) == 3:
                l1 = int(round(float(x[1])/scaling)) # first position to cut
                l2 = int(round(float(x[2])/scaling)) # second position to cut
                if (l1 > renaming[index][1]) or (l2 > renaming[index][1]): # check the length
                    print 'Error: cannot split contig',index,'at positions',l1,l2,'because it is only',renaming[index][1],'bp long'
                    sys.exit(-1)
                else:
                    location[renaming[index][0]] = [l1, l2] # location[contig_name] -> positions
        else:
            print 'Error: contig',index,'does not exist'
            sys.exit(-1)
    print location
 
    # open the fasta file for reading
    fas = Fasta(prefix+'.fasta')
    # open the new fasta file for writing
    ofa = open(prefix+'_new.fasta','w')
    for x in sorted(fas.keys()): # process all the contigs one by one
        if x in location: # if it needs to be split
            if len(location[x]) == 1:
                l = location[x][0]
                print 'Splitting',x,'at location',l
                ofa.write('>'+x+'|chimeric1\n')
                ofa.write(fas[x][:l]+'\n') # prefix
                ofa.write('>'+x+'|chimeric2\n')
                ofa.write(fas[x][l:]+'\n') # suffix
            elif len(location[x]) == 2:
                l1, l2 = location[x]
                if l1 > l2:
                    l1, l2 = l2, l1 # ensure the cut positions are in ascending order
                print 'Splitting',x,'at locations',l1,'and',l2
                ofa.write('>'+x+'|chimeric1\n')
                ofa.write(fas[x][:l1]+'\n') # prefix
                ofa.write('>'+x+'|chimeric2\n')
                ofa.write(fas[x][l1:l2]+'\n') # middle
                ofa.write('>'+x+'|chimeric3\n')
                ofa.write(fas[x][l2:]+'\n') # suffix
        else:
            #print 'Not splitting',x
            ofa.write('>'+x+'\n')
            ofa.write(fas[x][:]+'\n')
    ofa.close()
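Example 55's splitting hinges on pyfasta's record slicing: fas[x][:l] and fas[x][l:] return plain strings covering just the requested ranges. A minimal sketch of a single split (the file names and cut position are hypothetical):

from pyfasta import Fasta

# Hypothetical input file and cut position, for illustration only.
fas = Fasta('contigs.fasta')
name = sorted(fas.keys())[0]
cut = 1000
with open('contigs_new.fasta', 'w') as ofa:
    ofa.write('>' + name + '|chimeric1\n')
    ofa.write(fas[name][:cut] + '\n')  # prefix up to the cut
    ofa.write('>' + name + '|chimeric2\n')
    ofa.write(fas[name][cut:] + '\n')  # suffix from the cut onward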
Exemplo n.º 56
0
'''
Check whether every pair of sequences in an aligned fasta file shares an overlapping region
'''

__version__ = "1.0"

from pyfasta import Fasta
import argparse

# command-line option handling
parser = argparse.ArgumentParser()
parser.add_argument("-i", "-in", "--input", metavar="filename", dest="input", type=str , help="fasta file to check")
parser.add_argument("-v", "--version", action='version', help="The version of this program.", version = "Version: " + __version__)
args = parser.parse_args()

f = Fasta(args.input)
loci = sorted(f.keys())
for locus1 in loci:
    for locus2 in loci:
        flag = 0
        sequence1 = f[locus1]
        sequence2 = f[locus2]
        i = 0
        while i < len(sequence1) and i < len(sequence2):
            base1 = sequence1[i]
            base2 = sequence2[i]
            if base1 != "-" and base2 != "-":
                flag = 1
                break
            i += 1
        if flag == 0:
            print(locus1, "and", locus2, "do not share an overlapping region!")
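The double loop above visits every ordered pair, so each comparison runs twice and every locus is also compared with itself. A minimal sketch of the same check that visits each unordered pair exactly once via itertools.combinations (assuming, like the original, an aligned fasta with equal-length sequences):

from itertools import combinations
from pyfasta import Fasta

def report_non_overlapping(fasta_path):
    f = Fasta(fasta_path)
    for locus1, locus2 in combinations(sorted(f.keys()), 2):
        seq1, seq2 = f[locus1][:], f[locus2][:]
        # two aligned sequences overlap if some column is non-gap in both
        if not any(b1 != "-" and b2 != "-" for b1, b2 in zip(seq1, seq2)):
            print(locus1, "and", locus2, "do not share an overlapping region!")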
Exemplo n.º 57
0
# JSON encoding for error responses
import json

# FASTA parser
from pyfasta import Fasta

# Serializers
from .serializers import RefQuerySerializer, RefGenomeSerializer

# From JSON to Python Data Format
from django.utils.six import BytesIO
from rest_framework.parsers import JSONParser

# From Serializer to JSON
from rest_framework.renderers import JSONRenderer

f = Fasta('geneticapi/templates/data/genbank.GRCh37.fa')
FASTA_INDEX = sorted(f.keys(), reverse=True)
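# Note: the reference is indexed once at module import and records are
# sliced lazily afterwards, so whole chromosomes are never read into
# memory up front. A hypothetical usage (record name and coordinates are
# illustrative only):
#   region = f[FASTA_INDEX[0]][100000:100100]  # returns a plain string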

class Error(Exception):
	"""Base class for exceptions in this module."""
	pass

class ChromeParseException(Error):
	"""Exception raised for errors in the input.

	Attributes:
		msg  -- explanation of the error
	"""

	def __init__(self, msg, status):
		self.ERROR_RESPONSE = json.dumps({
										'message': msg,