Пример #1
0
def fetch(args):
    """Write the requested regions of a FASTA file to standard output.

    Attributes read from ``args``:
        fasta: passed to ``Fasta`` to open the file.
        regions: iterable of region strings (may be empty or ``None``).
        list: optional open file with one region per line.
        complement, reverse: when truthy, the corresponding attribute of
            the fetched sequence is used instead of the sequence itself.
        name: when truthy, a ``>name`` header precedes each sequence.

    A region is ``rname`` or ``rname:start-end``; only the first
    whitespace-delimited token of each entry is used.  ``start`` is
    decremented by one before slicing.  An entry that does not split into
    exactly ``rname:interval``, or whose interval does not parse as
    ``start-end``, falls back to fetching the whole record.
    """
    fasta = Fasta(args.fasta)
    try:
        # Work on a copy so the caller's list is not extended in place.
        regions = list(args.regions or [])
        if args.list:
            with args.list as listfile:
                regions.extend(entry.rstrip() for entry in listfile)
        for region in regions:
            tokens = region.split()
            if not tokens:
                # Blank entries (e.g. empty lines in the list file) name
                # no region; skip them instead of raising IndexError.
                continue
            region = tokens[0]
            try:
                rname, interval = region.split(':')
            except ValueError:
                rname, interval = region, None
            try:
                start, end = interval.split('-')
                sequence = fasta[rname][int(start) - 1:int(end)]
            except (AttributeError, ValueError):
                # No interval (None has no .split) or an unparsable one:
                # fetch the full record.
                sequence = fasta[rname][:]
            if args.complement:
                sequence = sequence.complement
            if args.reverse:
                sequence = sequence.reverse
            # Wrap the output at the 'lenc' value stored in the index entry.
            line_len = fasta[rname]._fa.faidx.index[rname]['lenc']
            if args.name:
                sys.stdout.write('>' + sequence.name + '\n')
            sys.stdout.writelines(wrap_sequence(line_len, sequence.seq))
    finally:
        # Release the FASTA handle even when a region lookup fails.
        fasta.close()
Пример #2
0
def fetch_sequence(args, fasta, name, start=None, end=None):
    """Yield the FASTA-formatted lines for one record or sub-range.

    Args:
        args: options object; reads ``auto_strand``, ``complement``,
            ``reverse``, ``no_output``, ``no_names`` and ``no_coords``.
        fasta: indexed FASTA object supporting ``fasta[name][start:end]``
            and ``fasta.faidx.index[name].lenc``.
        name: record name to fetch.
        start, end: optional slice bounds; ``None`` leaves a bound open.

    Yields:
        An optional ``>header`` line followed by the sequence wrapped at
        the record's ``lenc`` value.  Nothing is yielded when ``name`` is
        missing (a warning goes to stderr) or ``args.no_output`` is set.
    """
    try:
        line_len = fasta.faidx.index[name].lenc
        # Check for None *before* comparing: ``None > None`` raises
        # TypeError on Python 3, which broke auto_strand for whole records.
        flipped = (args.auto_strand and start is not None
                   and end is not None and start > end)
        if flipped:
            # flip (0, 1] coordinates
            sequence = fasta[name][end - 1:start + 1]
            sequence = sequence.reverse.complement
        else:
            sequence = fasta[name][start:end]
    except KeyError:
        sys.stderr.write(
            "warning: {name} not found in file\n".format(name=name))
        return
    if args.complement:
        sequence = sequence.complement
    if args.reverse:
        sequence = sequence.reverse
    if args.no_output:
        return
    if not args.no_names:
        # Coordinates appear in the header only for sub-range requests.
        if (start or end) and not args.no_coords:
            yield ''.join(['>', sequence.fancy_name, '\n'])
        else:
            yield ''.join(['>', sequence.name, '\n'])
    for line in wrap_sequence(line_len, sequence.seq):
        yield line
Пример #3
0
 def splitGenome(self):
     """Write every record of ``self.fastaFileName`` to its own file.

     Each output is named ``STANDARD_GENOME_PATH + <record name> + '.fa'``
     and holds a ``>name`` header followed by the sequence wrapped at 70
     characters per line.
     """
     genome = Fasta(self.fastaFileName)
     for record in genome:
         target = '{}{}.fa'.format(STANDARD_GENOME_PATH, record.name)
         with open(target, 'w') as handle:
             handle.write('>{}\n'.format(record.name))
             handle.writelines(wrap_sequence(70, str(record)))
     print("<<<<<<<Splitted>>>>>>")
Пример #4
0
def make_long_fasta(filename, nrecs=250, seqlen=SEQLEN):
    """Write a synthetic FASTA file and return the list of record names.

    ``nrecs`` records named ``header0`` .. ``header<nrecs-1>`` are written
    to ``filename``.  Each sequence is the 10-base unit ``ACTGACTGAC``
    repeated ``seqlen // 10`` times, wrapped at 80 characters per line.
    """
    unit = "ACTGACTGAC"
    names = ["header%i" % idx for idx in range(nrecs)]
    with open(filename, 'w') as handle:
        for name in names:
            handle.write('>' + name + '\n')
            handle.writelines(
                pyfaidx.wrap_sequence(80, unit * (seqlen // 10)))
    return names
def Split_chromosome():
    """Prompt for a FASTA base name and write one file per record.

    ``.fa`` is appended to the name read from standard input.  Every
    record of that file is written to ``<record name>.fa`` in the current
    directory, with the sequence wrapped at 70 characters per line.
    """
    print("Enter the FASTA file :")
    path = '{}.{}'.format(input(), 'fa')
    genome = Fasta(path)
    print("Splitting chromosomes..........")

    for record in genome:
        with open('{}.fa'.format(record.name), 'w') as handle:
            handle.write('>{}\n'.format(record.name))
            handle.writelines(wrap_sequence(70, str(record)))
    print("<<<<<<<Splitted>>>>>>")
Пример #6
0
def fetch_sequence(args, fasta, name, start=None, end=None):
    """Yield the FASTA-formatted lines for one record or sub-range.

    ``args`` supplies the ``complement``, ``reverse`` and ``no_names``
    flags.  Unless ``no_names`` is set, a header line is yielded first,
    using ``longname`` when ``start`` or ``end`` is truthy and ``name``
    otherwise.  The sequence lines follow, wrapped at the record's
    ``lenc`` value from the index.
    """
    width = fasta.faidx.index[name].lenc
    record = fasta[name][start:end]
    if args.complement:
        record = record.complement
    if args.reverse:
        record = record.reverse
    if not args.no_names:
        label = record.longname if (start or end) else record.name
        yield ''.join(('>', label, '\n'))
    for chunk in wrap_sequence(width, record.seq):
        yield chunk
Пример #7
0
def bed_to_introns(bed_in, fasta_in, fasta_out):
    """Extract the intervals listed in a BED-like file into a FASTA file.

    Args:
        bed_in: path to a text file whose non-blank lines hold exactly four
            whitespace-separated fields: reference name, start, stop and a
            gene name (the gene name is not used).
        fasta_in: path to the reference FASTA; opened with
            ``strict_bounds=True`` and keyed on the first word of each
            header.
        fasta_out: path of the FASTA file to write.

    Duplicate intervals, intervals with ``start > stop`` and zero-length
    intervals are logged as warnings and skipped.  Each kept interval is
    written with the header ``>name:start-end`` (``start`` is the fetched
    sequence's ``start - 1``) and wrapped at 60 characters per line.
    """
    logging.info("Opening FASTA: {0}".format(fasta_in))
    logging.info("Note: will take a while the first time it is opened.")
    fasta = Fasta(fasta_in,
                  key_function=lambda key: key.split()[0],
                  strict_bounds=True)

    seen_keys = set()
    output_seq = []
    count = 0
    # The context manager closes the BED handle even if a line is malformed.
    with open(bed_in, 'r') as bed_h:
        for line in bed_h:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline).
                continue
            ref, start, stop, _genename = line.split()
            key = ref + ':' + start + '-' + stop
            if key in seen_keys:
                logging.warning("ERROR: {0} appears once already".format(key))
                continue
            seen_keys.add(key)
            begin, finish = int(start), int(stop)
            # Validate before fetching, so an inverted interval is reported
            # and skipped here rather than handed to the strict-bounds fetch.
            if begin > finish:
                logging.warning(
                    "Intron coords greater than reference: {0}:{1}-{2}".format(
                        ref, start, stop))
                logging.warning(
                    "Reference length: {0}".format(len(fasta[ref])))
                continue
            seq = fasta[ref][begin:finish]
            if len(seq) == 0:
                logging.warning("Intron length is 0? {0}:{1}-{2}".format(
                    ref, start, stop))
                continue
            output_seq.append(seq)
            count += 1
            # Report once per 10000 accepted introns; checking right after
            # the increment avoids repeating the message on skipped lines.
            if count % 10000 == 0:
                logging.info("On intron: {0}".format(count))

    with open(fasta_out, 'w') as outf:
        logging.info("Writing intron sequences out to {0}".format(fasta_out))
        for rec in output_seq:
            outf.write('>{rname}:{start}-{end}\n'.format(rname=rec.name,
                                                         start=rec.start - 1,
                                                         end=rec.end))
            outf.writelines(wrap_sequence(60, rec.seq))
Пример #8
0
def fetch_sequence(args, fasta, name, start=None, end=None):
    """Yield the FASTA-formatted lines for one record or sub-range.

    ``args`` supplies the ``complement``, ``reverse``, ``no_names`` and
    ``full_names`` flags.  When ``name`` is not found a warning is written
    to stderr and nothing is yielded.  Otherwise an optional header line
    comes first (``long_name`` of the whole record with ``full_names``;
    else ``longname`` when ``start`` or ``end`` is truthy; else ``name``),
    followed by the sequence wrapped at the record's ``lenc`` value.
    """
    try:
        width = fasta.faidx.index[name].lenc
        record = fasta[name][start:end]
    except KeyError:
        sys.stderr.write(
            "warning: {name} not found in file\n".format(name=name))
        return
    if args.complement:
        record = record.complement
    if args.reverse:
        record = record.reverse
    if not args.no_names:
        if args.full_names:
            label = fasta[name].long_name
        elif start or end:
            label = record.longname
        else:
            label = record.name
        yield ''.join(('>', label, '\n'))
    for chunk in wrap_sequence(width, record.seq):
        yield chunk
Пример #9
0
def fetch_sequence(args, fasta, name, start=None, end=None):
    """Generate the FASTA text for ``name``, optionally limited to a slice.

    Flags read from ``args``: ``complement``, ``reverse``, ``no_names``,
    ``full_names``.  A missing ``name`` produces a warning on stderr and an
    empty generator.  Unless ``no_names`` is set, the first item yielded is
    a header line; the remaining items are the sequence wrapped at the
    ``lenc`` value stored in the index for this record.
    """
    try:
        line_len = fasta.faidx.index[name].lenc
        sequence = fasta[name][start:end]
    except KeyError:
        warning = "warning: {name} not found in file\n".format(name=name)
        sys.stderr.write(warning)
        return

    if args.complement:
        sequence = sequence.complement
    if args.reverse:
        sequence = sequence.reverse

    wants_header = not args.no_names
    if wants_header and args.full_names:
        # Full header text is taken from the whole record, not the slice.
        yield ''.join(['>', fasta[name].long_name, '\n'])
    elif wants_header:
        has_coords = bool(start or end)
        title = sequence.longname if has_coords else sequence.name
        yield ''.join(['>', title, '\n'])

    for wrapped in wrap_sequence(line_len, sequence.seq):
        yield wrapped
Пример #10
0
# Extract one sequence per line of the position file `pos`, write them to the
# FASTA file `outfa`, and collect the bases seen at each offset. `outdisstat`
# receives a tab-separated header for a per-position base table.
# NOTE(review): this snippet appears truncated -- `percent` is computed at the
# end but nothing beyond the header row is written to DIS within these lines.
with open(pos) as P, open(outfa, 'w') as OUT, open(outdisstat, 'w') as DIS:
    tmp = {}  # not used anywhere in the visible code
    basedis = {}  # offset within the sequence -> list of bases seen there
    DIS.write('\t'.join(['location', 'A', 'C', 'G', 'T']) + '\n')
    for i in P.readlines():
        # Lines starting with 'PA' are skipped (presumably a header row --
        # TODO confirm against the input format).
        if i.startswith('PA'): continue
        # The coordinates are used directly as a Python slice below, so
        # makecord is presumably expected to have subtracted one from the
        # start already -- TODO confirm.
        id, st, ed, strand = makecord(i.strip())
        seqname = id + ':' + str(st) + '-' + str(ed) + ':' + strand
        sequence = fasta[id][st:ed]
        # Wrap the output at the 'lenc' value stored in the index entry.
        line_len = fasta.faidx.index[id].lenc
        if strand == '-':
            # Minus strand: complement, then reverse, before writing.
            sequence = sequence.complement
            sequence = sequence.reverse
            OUT.write('>' + seqname + '\n')
            for line in wrap_sequence(line_len, sequence.seq):
                OUT.write(line)
        else:
            OUT.write('>' + seqname + '\n')
            for line in wrap_sequence(line_len, sequence.seq):
                OUT.write(line)
        # Record each base under its offset, using the strand-adjusted
        # sequence written above.
        for index, base in enumerate(list(sequence.seq)):
            if index not in basedis:
                basedis[index] = [base]
            else:
                basedis[index].append(base)

    # For every offset, turn the collected bases into per-base fractions.
    for index, base in basedis.items():
        c = Counter(base)
        total = sum(c.values())
        percent = {key: value / total for key, value in c.items()}
Пример #11
0
def _parse_args(progname=None):
    """Build the command-line parser and return the parsed arguments."""
    parse = argparse.ArgumentParser(
        description=
        'Build normal genome by integrating germline SNPs from a VCF file.',
        prog=progname if progname else sys.argv[0])
    parse.add_argument('-v',
                       '--vcf',
                       type=check_vcf,
                       required=True,
                       metavar='FILE',
                       help='a VCF file containing germline SNPs')
    parse.add_argument('-r',
                       '--reference',
                       type=str,
                       required=True,
                       metavar='FILE',
                       help='a fasta file of reference genome')
    default = 'normal_fa'
    parse.add_argument('-o',
                       '--output',
                       type=check_output_folder,
                       default=default,
                       metavar='DIR',
                       help='output directory [{}]'.format(default))
    parse.add_argument(
        '-a',
        '--autosomes',
        type=check_autosomes,
        required=True,
        metavar='STR',
        help='autosomes of the genome (e.g. 1,2,3,4,5 or 1..4,5)')
    default = None
    parse.add_argument(
        '-s',
        '--sex_chr',
        type=check_sex,
        default=default,
        metavar='STR',
        help='sex chromosomes of the genome (separated by comma) [{}]'.format(
            default))
    return parse.parse_args()


def _haplotype_segments(reference, chroms, chrom_profile, hap_index):
    """Return the pieces of ``chroms`` with one haplotype's SNPs applied.

    Reference stretches between SNPs are interleaved with each SNP's
    replacement text (``snp[1]``); ``snp[0]`` is the SNP position.
    """
    start = 0
    segments = []
    for snp in chrom_profile['hap_vars'][hap_index]:
        try:
            segments.append(reference[chroms][start:(snp[0] - 1)].seq)
        except ValueError:
            # Adjacent SNPs (e.g. chr1 45 followed by chr1 46) make the
            # slice [45:45]; fetching it raises instead of returning '',
            # so there is simply no reference stretch to add. Any other
            # ValueError is a real problem and is re-raised.
            if snp[0] - start != 1:
                raise
        segments.append(snp[1])
        start = snp[0]
    if start < chrom_profile['length']:
        segments.append(reference[chroms][start:].seq)
    return segments


def main(progname=None):
    """Write two parental FASTA files with germline SNPs integrated.

    Parses the command line, builds the genome profile from the reference
    index (``<reference>.fai``) and the VCF, creates the output directory
    and writes ``normal.parental_0.fa`` and ``normal.parental_1.fa`` into
    it.  A chromosome is written to parental file ``i`` only when its
    ``hap_vars`` list has an entry at index ``i``.

    Args:
        progname: program name shown in the usage text; defaults to
            ``sys.argv[0]``.
    """
    args = _parse_args(progname)
    if args.sex_chr is None:
        args.sex_chr = []
    else:
        args.sex_chr = args.sex_chr.split(',')
    autosomes = parse_autosomes(args.autosomes)

    # Build the data structure: genome_profile.
    reference = pyfaidx.Fasta(args.reference)
    genome_profile = fai_info(fai=args.reference + '.fai',
                              autosomes=autosomes,
                              sex_chr=args.sex_chr)
    # Fill in the hap_vars lists in genome_profile.
    add_vcf_vars(profile=genome_profile, vcf=args.vcf)

    os.mkdir(args.output, mode=0o755)

    for i in range(2):
        with open('{}/normal.parental_{}.fa'.format(args.output, i),
                  'w') as output:
            for chroms in genome_profile['order']:
                chrom_profile = genome_profile[chroms]
                if i >= len(chrom_profile['hap_vars']):
                    continue
                segments = _haplotype_segments(reference, chroms,
                                               chrom_profile, i)
                output.write('>{}\n'.format(chroms))
                output.writelines(
                    pyfaidx.wrap_sequence(chrom_profile['linebases'],
                                          ''.join(segments)))
Пример #12
0
def _write_chain_record(handle, seq_name, seq, width):
    """Write one FASTA record, wrapping the joined segments at ``width``."""
    handle.write('>{}\n'.format(seq_name))
    handle.writelines(pyfaidx.wrap_sequence(width, ''.join(seq)))


def build_fasta(output=None, chain=None, normal_fa=None, width=None):
    """Rebuild parental FASTA files from a chain file.

    Args:
        output: directory that receives ``<node>.parental_0.fa`` and
            ``<node>.parental_1.fa``, where ``<node>`` is the chain file's
            base name up to its first dot.
        chain: path to the chain file.  Header lines look like
            ``>NAME parental:N`` with ``N`` 0 or 1; every other non-blank
            line is ``CHROM START END TYPE [FORM]`` with ``TYPE`` one of
            ``REF``, ``SNV`` or ``DEL``.
        normal_fa: comma-separated FASTA paths; entry ``N`` is the
            reference used for ``parental:N`` records.
        width: line width used when wrapping the output sequences.

    Raises:
        FastaMissingError: a header names a parental with no matching
            entry in ``normal_fa``.
        ChainFileError: a header or record line is malformed, has an
            unknown type, or a record appears before any header.
    """
    refs = []
    outputf = []
    try:
        for fa in normal_fa.split(','):
            refs.append(pyfaidx.Fasta(fa))
        parentalre = re.compile('^parental:[01]$')
        node = os.path.basename(chain).split('.')[0]
        for parental in (0, 1):
            outputf.append(
                open('{}/{}.parental_{}.fa'.format(output, node, parental),
                     'w'))
        reference = None
        seq_name = None
        parental = None
        seq = []
        with open(chain) as inputf:
            for line in inputf:
                line = line.rstrip()
                if not line:
                    # Blank lines carry no record; skip instead of failing
                    # with an IndexError on the empty column list.
                    continue
                if line.startswith('>'):
                    # A new header closes the record collected so far.
                    if seq:
                        _write_chain_record(outputf[parental], seq_name, seq,
                                            width)
                    seq_name, parental = line[1:].split()
                    if not parentalre.match(parental):
                        raise ChainFileError(
                            'The format of this line below from the chain '
                            'file ' +
                            '({}) is not correct:\n{}\n'.format(chain, line))
                    parental = int(parental.split(':')[1])
                    try:
                        reference = refs[parental]
                    except IndexError:
                        raise FastaMissingError(
                            'There is no parental {} avalible,\n'.format(
                                parental) +
                            'which is required in the record ({}):\n{}\n'.
                            format(chain, line))
                    seq = []
                    continue
                if reference is None:
                    # Without a preceding header there is neither a target
                    # file nor a reference genome for this record.
                    raise ChainFileError(
                        'The record below from the chain file ({}) appears '
                        'before any header line:\n{}\n'.format(chain, line))
                column = line.split()
                chroms = column[0]
                start = int(column[1])
                end = int(column[2])
                seq_type = column[3]
                segment = ''
                if seq_type == 'REF':
                    segment = reference[chroms][start:end].seq
                elif seq_type == 'SNV':
                    ref = reference[chroms][start:end].seq
                    m = Mutation(ref=ref, form=column[4])
                    segment = m.alternative
                elif seq_type == 'DEL':
                    # Deleted stretch: contributes an empty segment.
                    pass
                else:
                    raise ChainFileError(
                        'Can not recognize the sequence type ({}) '.format(
                            seq_type) +
                        'of the record below from the chain file ({}):\n{}\n'.
                        format(chain, line))
                seq.append(segment)
        if seq:
            _write_chain_record(outputf[parental], seq_name, seq, width)
    finally:
        # Close everything that was opened, including on the error paths.
        for handle in outputf:
            handle.close()
        for ref_fasta in refs:
            ref_fasta.close()
Пример #13
0
# Import packages
import os
import sys
# Import packages
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import FeatureLocation, CompoundLocation
from Bio import pairwise2
from Bio.Seq import Seq
from pyfaidx import Fasta, wrap_sequence

# Copy the records of the FASTA file named on the command line to
# "<file>.nosynthetic", dropping records whose description marks them as
# synthetic, artificial, fragmentary, low quality or partial.
file = sys.argv[1]
output_file = file

# A record is dropped when its description contains any of these terms.
excluded_terms = ('synthetic', 'artificial', 'fragment', 'low quality',
                  'partial')

# The input is opened first and inside the ``with`` so that its handle is
# closed and no output file is created when the input cannot be read.
# The output is opened in append mode: re-running adds to an existing file.
with open(file) as handle, open(output_file + ".nosynthetic", 'a') as out:
    fasta_sequences = SeqIO.parse(handle, 'fasta')
    for fasta in fasta_sequences:
        name = fasta.name
        desc = fasta.description
        # Keep only the text before the first '>'; ``split`` returns the
        # whole description when there is none.
        desc_s = desc.split('>')[0]
        if not any(term in desc_s for term in excluded_terms):
            out.write('>{}\n'.format(desc_s))
            out.writelines(wrap_sequence(70, str(fasta.seq)))
from os import listdir
from pyfaidx import Fasta, wrap_sequence



# For every '.fasta' file in the current directory, write a new file
# '<prefix>S.fasta' holding only the chromosomes named in
# 'q.PAC4GC.hybridum.S.chr.list', each wrapped at 60 characters per line.

# Filtering on the '.fasta' suffix already excludes the '.fa' genome and the
# '.list' file, so no explicit (and crash-prone) list.remove() is needed.
fileList = [fasta for fasta in listdir('.') if fasta.endswith('.fasta')]
with open('q.PAC4GC.hybridum.S.chr.list','r') as f:
    chrList = f.read().split('\n')

print(chrList)

# Drop empty entries (e.g. from a trailing newline); unlike
# chrList.remove('') this does not fail when there are none.
chrList = [chromName for chromName in chrList if chromName]

for file in fileList:
    # The output name is the input name up to its first '.', plus 'S.fasta'.
    newfilename = file[:file.find('.')]+'S.fasta'
    print(newfilename)
    fasta = Fasta(file)
    # Opening with 'w' truncates the file; no separate open/close is needed.
    with open(newfilename,'w') as fOut:
        print(chrList)
        print('%s %s' % (file, newfilename))
        for chromName in chrList:
            print('>'+chromName)
            fOut.write('>' + chromName +'\n%s\n'%(''.join(wrap_sequence(60, str(fasta[chromName])))))