示例#1
0
def writeClassifiedFastas(classType,Dirr,resultsDir, df):
    """Distribute FASTA records into one output file per class.

    Every FASTA file returned by ``Get_Dirr_All_Fasta(classType, Dirr)`` is
    scanned. A record whose id is found in ``df.index`` is written to
    ``<resultsDir>/<class>.fasta``, where ``<class>`` is ``df[record.id]``
    reduced to letters, digits and spaces (trailing spaces removed).

    :param classType: forwarded to ``Get_Dirr_All_Fasta``
    :param Dirr: forwarded to ``Get_Dirr_All_Fasta``
    :param resultsDir: directory that receives the per-class FASTA files
    :param df: lookup from record id to class name; presumably a pandas
        Series indexed by record id -- TODO confirm against the caller
    """
    import os

    fasta_files_dict = Get_Dirr_All_Fasta(classType, Dirr)
    classDict = {}   # sanitized class name -> open output handle
    writerDict = {}  # sanitized class name -> FastaWriter on that handle
    try:
        # Only the file names (keys) are needed; the class of each record
        # comes from ``df``.
        for filename in fasta_files_dict:
            with open(filename) as fasta:
                for record in FastaIterator(fasta):
                    if record.id not in df.index:
                        continue
                    # Sanitize BEFORE the lookup: the dictionaries are keyed
                    # by the sanitized name, so checking the raw name would
                    # reopen (and truncate) the file for every record whose
                    # raw class name contains a stripped character.
                    classname = "".join(
                        c for c in df[record.id]
                        if c.isalpha() or c.isdigit() or c == ' ').rstrip()
                    if classname not in writerDict:
                        path = os.path.join(resultsDir, classname + '.fasta')
                        classDict[classname] = open(path, "w")
                        myWriter = FastaWriter(classDict[classname])
                        myWriter.write_header()
                        writerDict[classname] = myWriter
                    writerDict[classname].write_record(record)
        for myWriter in writerDict.values():
            myWriter.write_footer()
    finally:
        # Close every output file even if reading or writing failed.
        for classHandle in classDict.values():
            classHandle.close()
def create_proteins_for_each_peptide(input_path,
                                     fasta_input,
                                     output_path,
                                     final_peptides,
                                     allow_change_in_cleavage_sites=False):
    """
    For each input sequence write the native protein, followed by one
    protein version per edited peptide of that sequence.

    Output goes to
    ``output_path + 'proteins_per_peptide_from_' + fasta_input``.

    :param input_path: prefix prepended to ``fasta_input`` to locate the input
    :param fasta_input: name of the FASTA file holding the coding sequences
    :param output_path: prefix prepended to the output file name
    :param final_peptides: DataFrame of peptides; the columns read here are
        'edited', 'seq_id', 'N_terminus', 'C_terminus',
        'cancelled_cs_in_pep', 'permutation_coor_base0',
        'biological_extended_peptide' and
        'editing_combinations_relative_to_coding_seq_base0'
    :param allow_change_in_cleavage_sites: when False, peptides whose
        'N_terminus' or 'C_terminus' is not 'no_change', or whose
        'cancelled_cs_in_pep' is truthy, are skipped
    """

    final_edited_peptides = final_peptides[final_peptides['edited']]

    # create a seq-id:sequence dictionary from input fasta file
    with open(input_path + fasta_input, "r") as fasta_handle:
        sequences_dict = {
            record.id: record.seq
            for record in SeqIO.parse(fasta_handle, "fasta")
        }

    out_name = output_path + 'proteins_per_peptide_from_' + fasta_input
    with open(out_name, 'w') as out_handle:
        writer = FastaWriter(out_handle, wrap=None)
        writer.write_header()

        for key, mrna_sequence in sequences_dict.items():

            # first print the native protein
            comb_id = key + '|original'
            protein = mrna_sequence.translate()
            writer.write_record(
                SeqRecord(protein, id=comb_id, description=''))

            edited_peptides = final_edited_peptides[
                final_edited_peptides['seq_id'] == key]

            n = 1
            for index, row in edited_peptides.iterrows():

                # The flags are read from the row being iterated; the
                # previous code looked them up in ``final_peps_df``, a name
                # that does not exist in this function.
                if not allow_change_in_cleavage_sites and (
                        row['N_terminus'] != 'no_change'
                        or row['C_terminus'] != 'no_change'
                        or row['cancelled_cs_in_pep']):
                    continue

                permutation_coor = tuple(
                    int(x) for x in row['permutation_coor_base0'].split('_')
                    if x != '')
                # NOTE(review): the part after the peptide is appended as
                # nucleotides (no ``.translate()``) -- confirm this is
                # intended.
                protein = mrna_sequence[:permutation_coor[0]].translate(
                ) + row['biological_extended_peptide'] + mrna_sequence[
                    permutation_coor[1] + 1:]
                comb_id = key + '|edited_' + str(n) + '\t' + str(
                    row['editing_combinations_relative_to_coding_seq_base0'])
                writer.write_record(
                    SeqRecord(protein, id=comb_id, description=''))
                n += 1

        writer.write_footer()
示例#3
0
文件: muscle.py 项目: bsmithers/hpf
    def __init__(
        self, input, output=None, diags=None, maxhours=None, maxmb=None, clwstrict=False, quiet=True, clw=False
    ):
        """Store the run options and resolve the input/output file names.

        :param input: path of an existing FASTA file, or a list of records
            that is written to a temporary FASTA file
        :param output: output path; when None a temporary file is created
            and its name is used
        :param diags: accepted but not stored by this constructor
        :param maxhours: stored as ``self.maxhours``
        :param maxmb: stored as ``self.maxmb``
        :param clwstrict: stored as ``self.clwstrict``
        :param quiet: stored as ``self.quiet``
        :param clw: stored as ``self.clw``
        :raises Exception: if ``input`` is neither a str nor a list
        """
        # Imported before branching: the temporary-output case below needs
        # NamedTemporaryFile even when ``input`` is a path, which used to
        # raise NameError.
        from tempfile import NamedTemporaryFile

        self._itemp = None
        self._otemp = None
        self.maxhours = maxhours
        self.clwstrict = clwstrict
        self.clw = clw
        self.quiet = quiet
        self.maxmb = maxmb
        if isinstance(input, str):
            assert os.path.exists(input), "Can't find file %s" % input
            self.input = input
        elif isinstance(input, list):
            from Bio.SeqIO.FastaIO import FastaWriter

            # Keep a reference to the temporary file on the instance and
            # flush it so the records are on disk under ``self.input``.
            self._itemp = NamedTemporaryFile()
            self.input = self._itemp.name
            writer = FastaWriter(self._itemp, wrap=0)
            writer.write_records(input)
            self._itemp.flush()
        else:
            raise Exception("Unknown input type", input)

        if isinstance(output, str):
            self.output = output
        elif output is None:
            self._otemp = NamedTemporaryFile()
            self.output = self._otemp.name
def create_fully_edited_proteins_fasta(input_path, fasta_input, output_path):
    """
    For each sequence write the native protein and, when the record has any
    editing site, a fully edited version of it.

    Editing sites are read from the record description: for every mismatch
    type ``mm`` in ``all_mm`` the text following ``"<mm>_base0: "`` up to the
    first ``]`` is evaluated and sorted.

    Output goes to
    ``output_path + 'fully_edited_and_native_proteins_from_' + fasta_input``.

    :param input_path: prefix prepended to ``fasta_input`` to locate the input
    :param fasta_input: name of the input FASTA file
    :param output_path: prefix prepended to the output file name
    """

    # One compiled pattern per mismatch type.
    mm_headers = {
        mm: re.compile(r'(?<=' + mm + r'_base0:\s).*?]')
        for mm in all_mm
    }

    out_name = (output_path + 'fully_edited_and_native_proteins_from_' +
                fasta_input)
    with open(out_name, 'w') as out_handle, \
            open(input_path + fasta_input, "r") as in_handle:
        writer = FastaWriter(out_handle, wrap=None)
        writer.write_header()

        for record in SeqIO.parse(in_handle, "fasta"):

            # NOTE(review): eval() executes whatever text sits in the FASTA
            # header. If headers only ever hold list literals,
            # ast.literal_eval would be the safe replacement -- confirm
            # before switching.
            sites_dict = {
                mm: sorted(
                    eval(
                        find_by_regex_in_header(record.description,
                                                mm_headers[mm])))
                for mm in all_mm
            }
            sites_number = sum(len(sites_dict[mm]) for mm in all_mm)
            length = len(record.seq)
            comb = tuple([sites_dict[mm] for mm in all_mm])

            protein_basic_description = ''
            # translate native protein
            seq_id = record.id + '_original'
            protein = record.seq.translate()
            writer.write_record(
                SeqRecord(protein,
                          id=seq_id,
                          description=protein_basic_description))

            if sites_number:
                seq_id = record.id + '_fully_edited'
                protein_description = protein_basic_description + '| editing_combinations_base0_wrt_to_coding_sequence: ' + str(
                    comb)
                edited_seq = Seq(
                    edit_rna_as_peptide(str(record.seq), (0, length - 1),
                                        comb), generic_dna)
                protein = edited_seq.translate()
                writer.write_record(
                    SeqRecord(protein,
                              id=seq_id,
                              description=protein_description))
                # Report edited sequences whose length is not a whole
                # number of codons.
                if len(edited_seq) % 3:
                    print(record.id)
                    print(len(record.seq))
                    print(len(edited_seq))

        writer.write_footer()
示例#5
0
文件: __init__.py 项目: bsmithers/hpf
 def write(f):
     # Give every record that still carries the placeholder id its position
     # in ``sequences`` as id, then write all records to ``f``.
     for position, seqrecord in enumerate(sequences):
         if seqrecord.id == "<unknown id>":
             seqrecord.id = str(position)
     FastaWriter(f).write_file(sequences)
     # The data must be on disk before anything else reads the file.
     f.flush() #IMPORTANT
示例#6
0
def write_fasta(sequence, file_handle, wrap=60):
    """
    Log the destination and write the record(s) to it in FASTA format.

    :param sequence: passed unchanged to ``FastaWriter.write_file``
    :param file_handle: open output handle; its ``name`` is logged
    :param wrap: passed to ``FastaWriter`` as ``wrap``
    """
    message = "Writing output to " + file_handle.name + "..."
    _LOGGER.info(message)
    FastaWriter(file_handle, wrap=wrap).write_file(sequence)
 def handle_noargs(self, **options):
     """Write every ``Protein`` record to the FASTA file ``options['outfile']``.

     Records are converted with ``self._records_to_seqs``; each FASTA title
     is the record id only.
     """
     outfilename = options['outfile']
     # ``with`` closes the file even if fetching or writing fails.
     with open(outfilename, 'w') as outfileh:
         # Single-argument print(...) behaves the same on Python 2 and 3.
         print("Fetching records.")
         records = Protein.objects.all()
         seqs = self._records_to_seqs(records)
         print("Writing records to %s" % outfilename)
         writer = FastaWriter(outfileh, record2title=lambda x: x.id)
         writer.write_file(seqs)
     print("Done.")
示例#8
0
 def _write(self, file, value):
     """
     Write records to a FASTA file without line wrapping.

     :param file: path of the output file (overwritten if it exists)
     :param value: records passed to ``FastaWriter.write_file``
     :return: None
     """
     # ``with`` closes the file even if writing fails.
     with open(file, "w") as handle:
         FastaWriter(handle, wrap=None).write_file(value)
示例#9
0
def split_files(fasta_file):
    """Rewrite ``fasta_file`` without line wraps, then split it.

    The unwrapped copy is written to ``nowrap.fasta`` in the current
    directory so that ``split`` can cut it every 200000 lines without
    interrupting a gene.
    """
    from Bio.SeqIO.FastaIO import FastaWriter
    # Records are streamed from the parser to the writer, so the whole file
    # is never held in memory; both handles are closed on any exit path.
    with open(fasta_file) as input_handle, \
            open("nowrap.fasta", "w") as output_handle:
        writer = FastaWriter(output_handle, wrap=0)
        writer.write_file(SeqIO.parse(input_handle, "fasta"))
    # The number of lines per chunk could be made a parameter.
    subprocess.check_call(["split", "-l", "200000", "nowrap.fasta"])
示例#10
0
def split_files(fasta_file):
    """Rewrite ``fasta_file`` without line wraps, then split it.

    The unwrapped copy is written to ``nowrap.fasta`` in the current
    directory so that ``split`` can cut it every 200000 lines without
    interrupting a gene.
    """
    from Bio.SeqIO.FastaIO import FastaWriter
    # Records are streamed from the parser to the writer, so the whole file
    # is never held in memory; both handles are closed on any exit path.
    with open(fasta_file) as input_handle, \
            open("nowrap.fasta", "w") as output_handle:
        writer = FastaWriter(output_handle, wrap=0)
        writer.write_file(SeqIO.parse(input_handle, "fasta"))
    # The number of lines per chunk could be made a parameter.
    subprocess.check_call(["split", "-l", "200000", "nowrap.fasta"])
示例#11
0
def cut_fasta_by_len(fa_file, len_cutoff, outdir, prefix, suffix):
    """Write the records of ``fa_file`` that are at least ``len_cutoff`` long.

    The output path is ``outdir/<prefix>.ge<len_cutoff><suffix>``. If that
    file already exists and is not empty it is returned as is.

    :param fa_file: input FASTA; opened with gzip when the name ends in ".gz"
    :param len_cutoff: minimum record length to keep
    :param outdir: output directory, created if missing
    :param prefix: first part of the output file name
    :param suffix: last part of the output file name
    :return: path of the filtered FASTA file
    """
    # https://stackoverflow.com/questions/273192/how-can-i-create-a-directory-if-it-does-not-exist
    # makedirs + EEXIST check instead of exists()/mkdir(): no race when
    # another thread creates the path at the same time.
    try:
        os.makedirs(outdir)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    cut_fa_file = os.path.join(outdir,
                               prefix + ".ge" + str(len_cutoff) + suffix)
    # NOTE(review): a non-empty file left behind by an interrupted run is
    # also accepted here.
    if os.path.exists(cut_fa_file) and (os.path.getsize(cut_fa_file) > 0):
        return cut_fa_file

    if fa_file.endswith(".gz"):
        in_h = gzip.open(fa_file, 'rt')
    else:
        in_h = open(fa_file, 'r')
    # try/finally so the input handle is released even if writing fails.
    try:
        with open(cut_fa_file, 'w') as out_h:
            # SeqIO.parse()/SeqIO.write() would do the same job; the
            # lower-level iterator/writer pair is used here on purpose.
            writer = FastaWriter(out_h)
            writer.write_header()
            for rec in FastaIterator(in_h):
                if len(rec) >= len_cutoff:
                    writer.write_record(rec)
            writer.write_footer()
    finally:
        in_h.close()
    return cut_fa_file
def create_peptides_fasta(input_path, fasta_input, peps_df, extention=15):
    """Write each peptide, extended on both sides by its protein context.

    For every record of the input FASTA the sequence is translated, and for
    every row of ``peps_df`` with that ``seq_id`` the peptide is flanked by
    up to ``extention`` residues of the translated protein on each side.

    Output goes to ``input_path + 'peptides_extanded_by' + str(extention) +
    '_from' + fasta_input``.

    :param input_path: prefix for both the input and the output file
    :param fasta_input: name of the input FASTA file
    :param peps_df: DataFrame; the columns read here are 'seq_id',
        'in_frame_coordinates_base0', 'biological_peptide', 'edited' and
        'permutation_coor_base0'
    :param extention: number of flanking residues to add on each side
    """

    out_name = (input_path + 'peptides_extanded_by' + str(extention) +
                '_from' + fasta_input)
    with open(out_name, 'w') as out_handle, \
            open(input_path + fasta_input, "r") as in_handle:
        writer = FastaWriter(out_handle, wrap=None)
        writer.write_header()

        for record in SeqIO.parse(in_handle, "fasta"):

            prot = record.seq.translate()
            for i, row in peps_df[peps_df['seq_id'] ==
                                  record.id].iterrows():
                rna_pep_coor = row['in_frame_coordinates_base0'].split('_')
                # Floor division: nucleotide -> residue coordinates must
                # stay integers to be usable as slice bounds ("/" yields
                # floats on Python 3).
                pep_start = int(rna_pep_coor[1]) // 3
                pep_end = int(rna_pep_coor[2]) // 3
                seq_start = max(0, pep_start - extention)
                seq_end = min(pep_end + extention, len(prot))
                extented_pep = prot[seq_start:pep_start] + row[
                    'biological_peptide'] + prot[min(pep_end +
                                                     1, len(prot)):seq_end]
                if not row['edited']:
                    seq_id = record.id + '_original_' + str(
                        seq_start * 3) + '_' + str(
                            seq_end * 3) + '_pep_id_' + str(i)
                else:
                    seq_id = record.id + '_' + str(seq_start * 3) + '_' + str(
                        seq_end * 3) + '_editing_range' + row[
                            'permutation_coor_base0'] + '_pep_id_' + str(i)
                writer.write_record(
                    SeqRecord(extented_pep, id=seq_id, description=''))

        writer.write_footer()
示例#13
0
 def write_by_og(self, output_folder):
     '''
     Write, for each entry of ``self.og_records``, its records to
     ``<output_folder>/mapped_<key>.fa``.

     :param output_folder: folder where files should be stored; created if
         it does not exist
     '''
     if not os.path.exists(output_folder):
         os.makedirs(output_folder)
     for key, value in tqdm(self.og_records.items(),
                            desc="Writing DNA seq sorted by OG",
                            unit=" OG"):
         out_path = os.path.join(output_folder, 'mapped_' + key + '.fa')
         # ``with`` closes each file even if writing it fails.
         with open(out_path, "w") as handle:
             FastaWriter(handle, wrap=None).write_file(value)
示例#14
0
def trim(barcode, length, in_handle, out_handle):
    """
    Trim input sequences, write to out_handle

    Reads FASTA records from in_handle. A record that starts with the
    barcode (after the barcode's leading G's are dropped) is sliced from the
    end of the barcode up to position ``length``; any other record is
    reported on stderr and sliced as ``[:length]``. Results are written to
    out_handle unwrapped.
    """
    def inner(sequences, pattern):
        "Does the trimming"
        for sequence in sequences:
            s = str(sequence.seq)
            m = pattern.match(s)
            if m:
                yield sequence[m.end():length]
            else:
                # Same text as the old ``print >> sys.stderr`` statement,
                # but valid on both Python 2 and Python 3.
                sys.stderr.write("No match: %s\n" % sequence.id)
                yield sequence[:length]

    # Records are read TCAG, we trim the first 4 *flows*, which won't
    # detect duplicate G's.
    barcode = barcode.lstrip('G')

    # NOTE(review): the barcode is inserted into the pattern unescaped.
    pattern = re.compile('^{0}'.format(barcode))

    sequences = SeqIO.parse(in_handle, 'fasta')

    trimmed = inner(sequences, pattern)
    FastaWriter(out_handle, wrap=None).write_file(trimmed)
示例#15
0
 def write_select_og_dna(self):
     '''
     Write the DNA sequences of every entry of ``self.ogs`` to
     ``<self.args.output_path>/reference_ogs_dna/<key>.fa``.

     Files are only written when that folder does not exist yet. If it
     exists and already holds as many ``*.fa`` files as
     ``self.ogs_dna_by_species`` has entries, a message is printed instead.
     '''
     output_folder = os.path.join(self.args.output_path, "reference_ogs_dna")
     if not os.path.exists(output_folder):
         os.makedirs(output_folder)
         for key, value in tqdm(self.ogs.items(), desc="Writing OGs sorted by species",
                                unit=" species"):
             # ``with`` closes each file even if writing it fails.
             with open(os.path.join(output_folder, key + '.fa'), "w") as handle:
                 FastaWriter(handle, wrap=None).write_file(value.dna)
     elif len(self.ogs_dna_by_species) == len(glob.glob(os.path.join(output_folder, '*.fa'))):
         print('Folder with files already exists and will not be overwritten.')
     # NOTE(review): when the folder exists but the file count differs,
     # nothing is written and nothing is reported -- confirm this is intended.
示例#16
0
def writeFasta(fb,seqList):
    """Write every record of ``seqList`` to the open handle ``fb`` as FASTA.

    :raises ValueError: if ``seqList`` is empty
    """
    if len(seqList) < 1:
        raise ValueError("No data to Persist.")
    fasta_out = FastaWriter(fb)
    fasta_out.write_header()
    for entry in seqList:
        fasta_out.write_record(entry)
    fasta_out.write_footer()
示例#17
0
 def convert_ill_fasta(self, event):
     """Ask for an Illumina FASTQ file and convert it to FASTA.

     The output is written to ``<name of the chosen file>.fasta`` (the part
     of the path after the last '/'), wrapped at 80 columns, and the number
     of converted records is printed. Nothing happens if the dialog is
     cancelled.
     """
     filters = 'Text files (*.txt)|*.txt'
     dialog = wx.FileDialog(None, style=wx.OPEN, wildcard=filters)
     if dialog.ShowModal() != wx.ID_OK:
         # No file was chosen: do not fall through and convert a missing or
         # stale ``self.illumina``.
         return
     self.illumina = dialog.GetPath()
     self.filename = self.illumina.split('/')[-1] + '.fasta'
     # Both files are closed even if parsing or writing fails.
     with open(self.illumina) as source, open(self.filename, "w") as handle:
         records = SeqIO.parse(source, "fastq-illumina")
         count = FastaWriter(handle, wrap=80).write_file(records)
     print("Converted %i records" % count)
示例#18
0
def reheader_fasta(fa_in, fa_out, header_function, in_gz, gz):
    """Copy a FASTA file, rewriting each header with ``header_function``.

    :param fa_in: input FASTA path
    :param fa_out: output FASTA path
    :param header_function: passed to ``FastaIterator`` as ``title2ids``
    :param in_gz: truthy if the input is gzip-compressed
    :param gz: truthy to write the output through ``bgzf.BgzfWriter``
    """
    in_h = gzip.open(fa_in, 'rt') if in_gz else open(fa_in, 'r')
    # Nested try/finally: each handle is closed even if a later step fails.
    try:
        out_h = bgzf.BgzfWriter(fa_out, 'wb') if gz else open(fa_out, 'w')
        try:
            writer = FastaWriter(out_h)
            writer.write_header()
            for rec in FastaIterator(in_h, title2ids=header_function):
                writer.write_record(rec)
            writer.write_footer()
        finally:
            out_h.close()
    finally:
        in_h.close()
示例#19
0
文件: cdhit.py 项目: bsmithers/hpf
 def __init__(self, input, output=None, identity=0.8, length=0.8):
     """Store the thresholds and resolve the input/output file names.

     :param input: path of an existing file, or a list of records that is
         written to a temporary FASTA file
     :param output: output path; when None a temporary file is created and
         its name is used
     :param identity: stored as ``self.identity``
     :param length: stored as ``self.length``
     :raises Exception: if ``input`` is neither a str nor a list
     """
     # Imported before branching: the temporary-output case below needs
     # NamedTemporaryFile even when ``input`` is a path, which used to raise
     # NameError.
     from tempfile import NamedTemporaryFile

     self._itemp=None
     self._otemp=None
     self.identity=identity
     self.length=length
     if isinstance(input, str):
         assert(os.path.exists(input))
         self.input = input
     elif isinstance(input, list):
         from Bio.SeqIO.FastaIO import FastaWriter
         # Keep the temporary file referenced on the instance and flush it
         # so the records are on disk under ``self.input``.
         self._itemp = NamedTemporaryFile()
         self.input = self._itemp.name
         writer = FastaWriter(self._itemp,wrap=0)
         writer.write_records(input)
         self._itemp.flush()
     else:
         raise Exception("Unknown input type",input)

     if isinstance(output, str):
         self.output = output
     elif output is None:
         self._otemp = NamedTemporaryFile()
         # The output is a file name, as in the str branch above.
         self.output = self._otemp.name
示例#20
0
def split(fasta_file, parts):
    """Split ``fasta_file`` into ``parts`` files named ``<fasta_file>.<n>``.

    The number of records is taken from ``grep -c '>'``; each part receives
    ``ceil(count / parts)`` consecutive records.

    :param fasta_file: path of the FASTA file to split
    :param parts: number of parts to produce
    """
    # Argument list instead of a shell string: the file name is never
    # interpreted by a shell.
    out,err = subprocess.Popen(["grep", "-c", ">", fasta_file],
                               stdout=subprocess.PIPE).communicate()
    count = int(out.strip())
    part_size = int(math.ceil(float(count)/float(parts)))
    runtime().debug("Part size",part_size)
    writer = None
    handle = None
    try:
        with open(fasta_file) as fasta:
            for i,record in enumerate(SeqIO.parse(fasta,"fasta")):
                if i%part_size==0:
                    part = i//part_size
                    newfile=fasta_file+".%i"%part
                    runtime().debug(i,i%part_size,newfile)
                    # Finish the previous part: footer first, then close.
                    if writer:
                        writer.write_footer()
                    if handle:
                        handle.close()
                    handle = open(newfile,"w")
                    writer =  FastaWriter(handle)
                    writer.write_header()
                writer.write_record(record)
        # The last part also needs its footer.
        if writer:
            writer.write_footer()
    finally:
        # Close the part that is currently open, also on errors.
        if handle:
            handle.close()
示例#21
0
def main(args):
	"""Write every record of ``args.fasta`` to its own ``<record id>.fasta``.

	Output files are created in the current directory and wrapped at 70
	columns.
	"""
	for fasta in SeqIO.parse(args.fasta, "fasta"):
		# ``with`` closes each output file even if writing it fails.
		with open(fasta.id+".fasta", "w") as out:
			fasta_out = FastaWriter(out, wrap=70)
			fasta_out.write_header()
			fasta_out.write_record(fasta)
			fasta_out.write_footer()
示例#22
0
def trierFastaByDomain(tgtDomain,fastaDict,step2List,writeFileName,formatFunc):
    """Write the FASTA records whose domain equals ``tgtDomain``.

    Each entry of ``step2List`` is unpacked with ``formatFunc``; when its
    domain matches, the record stored in ``fastaDict`` under its gID is
    written to ``writeFileName``. Ids missing from ``fastaDict`` are skipped
    silently.

    :param tgtDomain: domain to select
    :param fastaDict: mapping from gID to record
    :param step2List: entries understood by ``formatFunc``
    :param writeFileName: path of the FASTA file to write
    :param formatFunc: returns the 10-tuple (score, gName, domain, gID, ARC,
        RF, reverse, begin, end, desc) for one entry
    """
    # ``with`` closes the file even if formatting or writing fails.
    with open(writeFileName,'w') as fb:
        writer = FastaWriter(fb)
        writer.write_header()
        for record in step2List:
            score,gName,domain,gID,ARC,RF,reverse,begin,end,desc = formatFunc(record)
            if domain != tgtDomain:
                continue
            # One lookup, tested by identity: the old ``<>`` operator is
            # Python 2 only and compared the record itself against None.
            found = fastaDict.get(gID)
            if found is not None:
                writer.write_record(found)
        writer.write_footer()
示例#23
0
 def make_qiime_output(self):
     """Write the barcoded reads to ``self.qiime_fasta.path`` as FASTA.

     Each read is renamed to ``<sample>_<n> <original id>``, where ``n``
     counts the reads seen so far for that sample. Its description records
     the first ``self.pool.bar_len`` bases as both the original and the new
     barcode. The read is then trimmed by ``self.trim_fwd`` bases at the
     start and ``self.trim_rev`` bases at the end.
     """
     # A reverse trim of 0 must keep the tail of the read: ``-0`` as a slice
     # end would select an empty read.
     end = -self.trim_rev if self.trim_rev else None
     # Counter #
     counter = defaultdict(int)
     # ``with`` closes the file even if parsing or writing fails.
     with open(self.qiime_fasta.path, 'w') as handle:
         writer = FastaWriter(handle, wrap=0)
         writer.write_header()
         # Do it #
         for r in self.only_used.parse_barcodes():
             sample_name = r.first.sample.short_name
             counter[sample_name] += 1
             r.read.id = '%s_%i %s' % (sample_name, counter[sample_name],
                                       r.read.id)
             bar_seq = r.read.seq[0:self.pool.bar_len]
             r.read.description = "orig_bc=%s new_bc=%s bc_diffs=0" % (bar_seq,
                                                                       bar_seq)
             writer.write_record(r.read[self.trim_fwd:end])
         writer.write_footer()
示例#24
0
 def make_qiime_output(self):
     """Write the barcoded reads to ``self.qiime_fasta.path`` as FASTA.

     Each read is renamed to ``<sample>_<n> <original id>``, where ``n``
     counts the reads seen so far for that sample. Its description records
     the first ``self.pool.bar_len`` bases as both the original and the new
     barcode. The read is then trimmed by ``self.trim_fwd`` bases at the
     start and ``self.trim_rev`` bases at the end.
     """
     # A reverse trim of 0 must keep the tail of the read: ``-0`` as a slice
     # end would select an empty read.
     end = -self.trim_rev if self.trim_rev else None
     # Counter #
     counter = defaultdict(int)
     # ``with`` closes the file even if parsing or writing fails.
     with open(self.qiime_fasta.path, 'w') as handle:
         writer = FastaWriter(handle, wrap=0)
         writer.write_header()
         # Do it #
         for r in self.only_used.parse_barcodes():
             sample_name = r.first.sample.short_name
             counter[sample_name] += 1
             r.read.id = '%s_%i %s' % (sample_name, counter[sample_name], r.read.id)
             bar_seq = r.read.seq[0:self.pool.bar_len]
             r.read.description = "orig_bc=%s new_bc=%s bc_diffs=0" % (bar_seq, bar_seq)
             writer.write_record(r.read[self.trim_fwd:end])
         writer.write_footer()
示例#25
0
def cleanUpFasta(fname,fastaDict,step2List,Step2FormatSepFunc,seuil=1e-3):
    """Write the sub-sequence of every hit whose score is at most ``seuil``.

    Each line of ``step2List`` is unpacked with ``Step2FormatSepFunc``. For a
    kept hit the sequence stored in ``fastaDict`` under its gName is turned
    into a string, reversed when ``reverse`` is truthy, sliced as
    ``[begin:end]`` and written to ``fname``. Hits whose gName is missing
    from ``fastaDict`` are reported and skipped.

    :param fname: path of the FASTA file to write
    :param fastaDict: mapping from gName to record
    :param step2List: lines understood by ``Step2FormatSepFunc``
    :param Step2FormatSepFunc: returns the 10-tuple (score, gName, domain,
        gId, ARC, RF, reverse, begin, end, desc) for one line
    :param seuil: score threshold; hits with a larger score are dropped
    """
    with open(fname,'w') as fb:
        writer = FastaWriter(fb)
        writer.write_header()
        for line in step2List:
            score,gName,domain,gId,ARC,RF,reverse,begin,end,desc = Step2FormatSepFunc(line)
            if score > seuil:
                continue
            # Only the dictionary lookup is guarded, so the message can rely
            # on ``gName`` and unrelated KeyErrors are no longer hidden.
            try:
                source = fastaDict[gName]
            except KeyError:
                print("[%s] not exists in fasta dictionary.\n"%gName)
                continue
            # str() replaces the deprecated Seq.tostring().
            code = str(source.seq)
            if reverse:
                # NOTE(review): this reverses the string; it does not
                # complement it -- confirm that is what is wanted.
                code = code[::-1]
            record = SeqRecord(Seq(code[begin:end],generic_dna),name=gName,id=gId,description=desc)
            writer.write_record(record)
        writer.write_footer()
示例#26
0
 def write_dna(self, species, output_folder):
     """Write ``self.dna`` to ``<output_folder>/<species>_OGs.fa``, unwrapped."""
     # ``with`` closes the file even if writing fails.
     with open(os.path.join(output_folder, species + '_OGs.fa'), "w") as handle:
         FastaWriter(handle, wrap=None).write_file(self.dna)
def select_from_small_file(args):
    """Write a random selection of ``num`` records of a FASTA file.

    :param args: 5-tuple ``(inp_file, db_inp_file, db_out_file, out_file,
        num)``; ``db_inp_file`` and ``db_out_file`` are not used here
    """
    inp_file, db_inp_file, db_out_file, out_file, num = args
    # Both files are closed explicitly so the output is complete on disk
    # when the function returns.
    with open(inp_file) as in_handle:
        inp = list(SeqIO.parse(in_handle, 'fasta'))
    shuffle(inp)
    with open(out_file, 'w') as out_handle:
        FastaWriter(out_handle, wrap=0).write_file(inp[:num])
示例#28
0
# Python Script to Trim Based on designated start and end
###############################################################################
# Written by Mario Muscarella
# Last Update 10 May 2013

# Directions:

from Bio import SeqIO
import sys
import glob
from Bio.SeqIO.FastaIO import FastaWriter

# Window of the alignment to keep: every record is sliced as
# record[start:end] (0-based, end exclusive). Change these numbers to suit
# your alignment.
start = 1130 
end = 42988

def trim_positions(records, start, end):
	"""Lazily yield ``record[start:end]`` for every record in ``records``."""
	window = slice(start, end)
	for entry in records:
		yield entry[window]

#files = glob.glob("*.align")
file = "HMWF.align"

# Trim every record of the alignment to the [start:end] window and write the
# result, unwrapped, next to the input as "<file>.trim.fasta".
original_seqs = SeqIO.parse(file, "fasta")
trimmed_seqs = trim_positions(original_seqs, start, end)
# ``with`` closes the output even if parsing or writing fails.
with open(file+".trim.fasta", "w") as output_handle:
    count = FastaWriter(output_handle, wrap=0).write_file(trimmed_seqs)
# Single-argument print(...) behaves the same on Python 2 and 3.
print("Trimmed %i reads" % count)

def create_edited_proteins_all_represented_combinations(
        input_path,
        fasta_input,
        output_path,
        final_peps_df,
        max_edits_per_pep=None,
        allow_change_in_cleavage_sites=False):
    """
    For each sequence write one protein per distinct editing combination
    listed for it in ``final_peps_df``; the all-empty combination yields the
    native protein.

    Output goes to
    ``output_path + 'proteins_per_combination_from_' + fasta_input``.

    :param input_path: prefix prepended to ``fasta_input`` to locate the input
    :param fasta_input: name of the FASTA file holding the coding sequences
    :param output_path: prefix prepended to the output file name
    :param final_peps_df: DataFrame; the columns read here are 'seq_id',
        'editing_combinations_relative_to_coding_seq_base0', 'N_terminus',
        'C_terminus' and 'cancelled_cs_in_pep'
    :param max_edits_per_pep: when not None, combinations with more sites
        than this are skipped
    :param allow_change_in_cleavage_sites: when False, a seq_id whose first
        row has 'N_terminus' or 'C_terminus' different from 'no_change', or
        a truthy 'cancelled_cs_in_pep', gets no proteins at all
    """

    # create a seq-id:sequence dictionary from input fasta file
    with open(input_path + fasta_input, "r") as fasta_handle:
        sequences_dict = {
            record.id: record.seq
            for record in SeqIO.parse(fasta_handle, "fasta")
        }

    # all editing combinations per seq_id, as a list of the per-row values
    comps_editing_combs = final_peps_df.groupby('seq_id')[
        'editing_combinations_relative_to_coding_seq_base0'].aggregate(
            lambda x: list(x))
    # only data at seq_id level is needed from here on: keep the first row
    # of every seq_id
    final_peps_df = final_peps_df.drop_duplicates(subset='seq_id',
                                                  keep='first')
    final_peps_df.set_index('seq_id', inplace=True)

    # the combination that stands for the unedited sequence
    no_edits = tuple([] for _ in range(12))

    out_name = output_path + 'proteins_per_combination_from_' + fasta_input
    with open(out_name, 'w') as out_handle:
        writer = FastaWriter(out_handle, wrap=None)
        writer.write_header()

        # Series.items() replaces iteritems(), which pandas 2 removed.
        for index, combs_nested_list in comps_editing_combs.items():

            length = len(sequences_dict[index])

            # The cleavage-site flags depend on the seq_id only, so they are
            # checked once instead of once per combination.
            # NOTE(review): they come from the first row of the seq_id, not
            # from the peptide that contributed the combination -- confirm.
            if not allow_change_in_cleavage_sites and (
                    final_peps_df.loc[index, 'N_terminus'] != 'no_change'
                    or final_peps_df.loc[index, 'C_terminus'] != 'no_change'
                    or final_peps_df.loc[index, 'cancelled_cs_in_pep']):
                continue

            written_combs = []  # combinations hold lists, so no set here
            n = 1
            protein_basic_description = ''
            flattened_comb_list = [c for l in combs_nested_list for c in l]

            for comb in flattened_comb_list:

                if max_edits_per_pep is not None:
                    n_sites = sum(len(edit_type) for edit_type in comb)
                    if n_sites > max_edits_per_pep:
                        continue
                # each distinct combination is written once
                if comb in written_combs:
                    continue

                if comb == no_edits:  # the original sequence
                    comb_id = index + '_original'
                    protein = sequences_dict[index].translate()
                    protein_description = protein_basic_description
                else:
                    comb_id = index + '_edited_' + str(n)
                    protein_description = protein_basic_description + '| editing_combinations_base0_wrt_to_coding_sequence: ' + str(
                        comb)
                    protein = Seq(
                        edit_rna_as_peptide(str(sequences_dict[index]),
                                            (0, length - 1), comb),
                        generic_dna).translate()
                    n += 1
                written_combs.append(comb)
                writer.write_record(
                    SeqRecord(protein,
                              id=comb_id,
                              description=protein_description))

        writer.write_footer()
示例#30
0
def create_in_frame_rna_file_from_anovar_results_and_coding_mrna_seqs_final_sites_dfs(fasta_file,output_name,out_path,mm_df_dict,stop_as_bad_records,met_as_good_records,last_is_stop,variants_to_use = []):
    """
    Write coding sequences with their editing sites in the header.

    input - coding sequences as fasta file

            sites (wrt to coding sequence) dataframe - result of
            read_editing_sites_wrt_coding_seqs after ucsc_id column is set to
            index; one dataframe (or None) per mm type in ``mm_df_dict``

    output - ``out_path + output_name + '.fasta'`` with the accepted records
             (id = first two ';'-separated fields of the input id,
             description = "| <mm>_base0: [sites]" for every mm type) and
             ``out_path + 'bad_seqs_' + output_name + '.fasta'`` with the
             rejected records, unchanged. The sequences are not trimmed to
             an ORF; only a trailing stop codon (``last_is_stop``) and an
             incomplete last codon are removed.

    :param stop_as_bad_records: reject records whose translation contains '*'
    :param met_as_good_records: reject records whose first codon is not 'M'
    :param last_is_stop: drop the last codon from the output and reject
        records whose last codon is not a stop
    :param variants_to_use: when not empty, only these record ids are
        processed (the default list is never modified)
    """

    def _upper_acgt(seq):
        # Upper-case a/c/g/t only; every other character is left as it is.
        return str(seq).replace('a','A').replace('g','G').replace('t','T').replace('c','C')

    n_bad = 0
    n_good = 0
    sites_good = 0
    sites_bad = 0

    # Set membership is O(1) per record.
    allowed_variants = set(variants_to_use)

    # All three files are closed on any exit path.
    with open(out_path + output_name + '.fasta' , 'w') as good_handle, \
            open(out_path + 'bad_seqs_' + output_name + '.fasta' , 'w') as bad_handle, \
            open(fasta_file, "r") as fasta_handle:
        writer = FastaWriter(good_handle, wrap=None)
        writer_bad = FastaWriter(bad_handle, wrap=None)
        writer.write_header()
        writer_bad.write_header()

        for record in SeqIO.parse(fasta_handle, "fasta"):

            split_header = record.id.split(';')
            rec_id = split_header[0] + ';' + split_header[1]

            # a non-empty variants_to_use restricts the output to those ids
            if allowed_variants and rec_id not in allowed_variants:
                continue

            # 0-based site positions per mismatch type
            mm_loc_dict = {}
            for mm in all_mm:
                sites = mm_df_dict[mm]
                if sites is None:
                    mm_list = []
                else:
                    try:
                        mm_list = [int(k)-1 for k in sites.loc[[rec_id]]['position_base1']]
                    except KeyError:
                        # no site of this type for this record
                        mm_list = []
                mm_loc_dict[mm] = mm_list

            description_str = ''.join(
                '| '+mm+'_base0: '+ str(mm_loc_dict[mm]) for mm in mm_loc_dict)

            if last_is_stop:
                final_sequence = _upper_acgt(record.seq[0:-3])
            else:
                final_sequence = _upper_acgt(record.seq)

            good_record = True
            if stop_as_bad_records:
                if '*' in Seq(str(final_sequence), generic_dna).translate():
                    good_record = False
            if met_as_good_records:
                if Seq(str(record.seq[0:3]), generic_dna).translate() != 'M':
                    good_record = False
            if last_is_stop:
                if Seq(str(record.seq[-3:len(record.seq)]), generic_dna).translate() != '*':
                    good_record = False

            n_sites = sum(len(mm_loc_dict[mm]) for mm in all_mm)
            if not good_record:
                writer_bad.write_record(record)
                n_bad+=1
                sites_bad+=n_sites
            else:
                # Drop an incomplete trailing codon. The old slice end
                # ``-len(s)%3`` parsed as ``(-len(s))%3`` and cut the
                # sequence down to its first one or two bases.
                remainder = len(final_sequence) % 3
                if remainder:
                    final_sequence = final_sequence[:len(final_sequence) - remainder]
                current_record = SeqRecord(Seq(final_sequence,generic_dna), id = rec_id, description = description_str)
                writer.write_record(current_record)
                n_good+=1
                sites_good+=n_sites

        writer.write_footer()
        if n_bad:
            writer_bad.write_footer()

    print(str(n_good) + ' good sequence with ' + str(sites_good) + 'sites')
    print(str(n_bad) + ' bad sequence with ' + str(sites_bad) + 'sites')
"""
Remove unpaired reads from a fasta file.

This script can be used for the case that unpaired reads (e.g. as
reads were removed during quality trimming) in a pair of fasta files
from paired-end sequencing need to be removed.

"""

import argparse
from Bio import SeqIO
from Bio.SeqIO.FastaIO import FastaWriter

parser = argparse.ArgumentParser()
for positional in ("fasta_file_to_filter", "reference_fasta_file"):
    parser.add_argument(positional)
parser.add_argument("--output_fasta", default="output.fa")
args = parser.parse_args()

# Collect the read names of the reference file (first whitespace-delimited
# token of every record id).
reference_headers = {
    ref_record.id.split()[0]: 1
    for ref_record in SeqIO.parse(args.reference_fasta_file, "fasta")
}

# Stream the records whose name also occurs in the reference to the output,
# unwrapped.
with open(args.output_fasta, 'w') as output_fh:
    paired_records = (
        candidate
        for candidate in SeqIO.parse(args.fasta_file_to_filter, "fasta")
        if candidate.id.split()[0] in reference_headers
    )
    FastaWriter(output_fh, wrap=0).write_file(paired_records)
# Keep only the records of a FASTA file whose name appears in the header row
# of a tab-separated OTU table.
# usage: <script> original.fasta otu_table.tsv project.fasta
import csv
import itertools
import sys

from Bio import SeqIO
from Bio.SeqIO.FastaIO import FastaWriter

original_file=sys.argv[1]
otu_table=sys.argv[2]
project_file=sys.argv[3]

## read in the csv file and get header names
# Text mode, next() and print() work on both Python 2 and Python 3.
with open(otu_table, 'r') as table_normalized_otus:
    reader = csv.reader(table_normalized_otus, delimiter="\t")
    headers = next(reader)
print(headers)

# Set membership is O(1) per record.
wanted_names = set(headers)

# The project file is overwritten; both files are closed on any exit path,
# so the output is complete on disk when the script ends.
with open(original_file, "r") as total_fasta_handle, \
        open(project_file, 'w') as project_fasta:
    total_fasta = SeqIO.parse(total_fasta_handle, "fasta")
    writer = FastaWriter(project_fasta, wrap=None)
    writer.write_header()

    for records in total_fasta:
        if records.name in wanted_names:
            writer.write_record(records)

    writer.write_footer()
def write_fasta_output(fasta_output_file, filtered_seqs):
    """Write ``filtered_seqs`` to the FASTA file ``fasta_output_file``.

    :param fasta_output_file: path of the output file (overwritten)
    :param filtered_seqs: records passed to ``FastaWriter.write_file``
    """
    # ``with`` closes the file even if writing fails.
    with open(fasta_output_file, "w") as handle:
        FastaWriter(handle).write_file(filtered_seqs)
示例#34
0
# Map full descriptions to new names: for every code, the first record whose
# description contains the code is renamed to df2[code].
for i in codes:
    z=[x.description for x in fa if i in x.description]
    if z:
        master_dict[z[0]] = df2[i]

# Apply the new names and drop the descriptions of the renamed records.
for i in fa:
    if i.description in master_dict:
        i.id=master_dict[i.description]
        i.description=""


## Write temporary file
with open('temp.fa', "w") as handle:
    writer = FastaWriter(handle, wrap=0)
    writer.write_file(fa)

## Read in temporary file and print properly formatted fasta
# The file is closed before it is removed (removing an open file fails on
# Windows).
with open("temp.fa", "r") as x:
    z = x.read()
# endswith() also copes with an empty file, where z[-1] raised IndexError.
if z.endswith('\n'):
    z=z[:-1]
print (z)
os.remove("temp.fa")