Пример #1
0
def trim_additional_merged_contigs(original_contig, merged_contig):
    """Trim the start of ``original_contig`` using the header of ``merged_contig``.

    The header of the first record in ``merged_contig`` is split on ``_``.
    The second-to-last field, with any ``o``/``s`` characters stripped from
    both ends, is parsed as an integer offset. The last field is scanned for
    ``<count><op>`` pairs (op in ``MXDI``) and the counts of every ``M`` and
    ``X`` operation are added to that offset.

    When the resulting trim is greater than 50, the first record of
    ``original_contig`` is read, ``trim_<n>`` is appended to its header, the
    first ``trim`` characters of its sequence are dropped and the file is
    rewritten in place. The record is only written back if more than 100
    characters remain; otherwise the file is left empty.

    :param original_contig: path of the fasta file to trim in place.
    :param merged_contig: path of the fasta file whose header encodes the trim.
    """
    with open(merged_contig) as open_merged:
        merged_header, _ = FastaReader(open_merged).next()

    sp_header = merged_header.split('_')
    trim = int(sp_header[-2].strip('os'))
    cigar = sp_header[-1]
    for count, operation in re.findall(r'(\d+)([MXDI])', cigar):
        # Only M and X operations contribute to the trim; D and I are ignored.
        if operation in ('M', 'X'):
            trim += int(count)

    if trim <= 50:
        return

    logging.info("trim %s of %s" % (trim, original_contig))
    with open(original_contig) as open_contig:
        # Read the record from the original contig; the merged record read
        # above is only used for its header.
        header, sequence = FastaReader(open_contig).next()
    header = header + "trim_%s" % trim
    sequence = sequence[trim:]

    # Opening in 'w' mode truncates the file even when nothing is written.
    with open(original_contig, 'w') as open_contig:
        if len(sequence) > 100:
            open_contig.write(">%s\n%s\n" % (header, sequence))
Пример #2
0
def trim_additional_merged_contigs(original_contig, merged_contig):
    """Trim the beginning of ``original_contig`` based on ``merged_contig``'s header.

    The first record header of ``merged_contig`` is split on ``_``. Its
    second-to-last field (after stripping ``o``/``s`` characters from both
    ends) gives an integer offset, and its last field is parsed as a list of
    ``<count><op>`` pairs with op in ``MXDI``; counts of ``M`` and ``X``
    operations are added to the offset to obtain the trim length.

    If the trim length exceeds 50, the first record of ``original_contig`` is
    loaded, its header gets ``trim_<n>`` appended, its sequence loses its
    first ``trim`` characters and the file is overwritten. The trimmed record
    is written only when more than 100 characters remain; otherwise the file
    ends up empty.

    :param original_contig: path of the fasta file that is trimmed in place.
    :param merged_contig: path of the fasta file whose header encodes the trim.
    """
    with open(merged_contig) as open_merged:
        merged_header, _ = FastaReader(open_merged).next()

    sp_header = merged_header.split('_')
    trim = int(sp_header[-2].strip('os'))
    all_cigar = re.findall(r'(\d+)([MXDI])', sp_header[-1])
    # D and I operations are matched by the pattern but do not add to the trim.
    trim += sum(int(count) for count, operation in all_cigar
                if operation in ('M', 'X'))

    if trim <= 50:
        return

    logging.info("trim %s of %s" % (trim, original_contig))
    with open(original_contig) as open_contig:
        # The record to trim comes from the original contig, not from the
        # merged contig read above.
        header, sequence = FastaReader(open_contig).next()
    header = header + "trim_%s" % trim
    sequence = sequence[trim:]

    # 'w' mode truncates the file, so a too-short remainder leaves it empty.
    with open(original_contig, 'w') as open_contig:
        if len(sequence) > 100:
            open_contig.write(">%s\n%s\n" % (header, sequence))
Пример #3
0
def get_fasta_length(fasta_file):
    """Return the length of the sequence of the first record in ``fasta_file``.

    Only the first record produced by ``FastaReader`` is considered.
    """
    with open(fasta_file) as handle:
        first_record = FastaReader(handle).next()
    _name, residues = first_record
    return len(residues)
def get_fasta_length(fasta_file):
    """Open ``fasta_file`` and return the sequence length of its first record.

    Records after the first one are never read.
    """
    with open(fasta_file) as fasta_handle:
        record = FastaReader(fasta_handle).next()
    _header, bases = record
    return len(bases)
Пример #5
0
def force_merge_consensus(read1_consensus, read2_consensus, output_merge_file):
    """Concatenate consensus sequences into a single fasta record.

    The first record of ``read1_consensus`` is followed by every record of
    ``read2_consensus``, each one preceded by a spacer of 100 ``N``
    characters. The merged record is named
    ``<read1 name>_forced_merged`` followed by ``_<read2 name>`` for each
    record taken from ``read2_consensus``.

    Both inputs are read completely before the output file is opened, so a
    failure while reading does not leave a truncated output file behind, and
    every file handle is closed even if an exception is raised.

    :param read1_consensus: path of the fasta file providing the first sequence.
    :param read2_consensus: path of the fasta file providing the sequences to append.
    :param output_merge_file: path of the fasta file to write.
    """
    with open(read1_consensus) as open_read1:
        read1_name, read1_sequence = FastaReader(open_read1).next()

    name = "%s_forced_merged" % read1_name
    array = [read1_sequence]
    with open(read2_consensus) as open_read2:
        for read2_name, read2_sequence in FastaReader(open_read2):
            name = "%s_%s" % (name, read2_name)
            array.append("N" * 100)
            array.append(read2_sequence)

    with open(output_merge_file, 'w') as open_output:
        open_output.write(">%s\n%s\n" % (name, ''.join(array)))
Пример #6
0
def force_merge_consensus(read1_consensus, read2_consensus, output_merge_file):
    """Write one fasta record joining read1's consensus with all of read2's.

    The sequence of the first record in ``read1_consensus`` comes first; each
    record of ``read2_consensus`` is then appended after a run of 100 ``N``
    characters. The output header is ``<read1 name>_forced_merged`` with
    ``_<read2 name>`` added for every appended record.

    The output file is only opened once both inputs have been read, and all
    files are handled through ``with`` so they are closed on error as well.

    :param read1_consensus: path of the fasta file providing the first sequence.
    :param read2_consensus: path of the fasta file providing the sequences to append.
    :param output_merge_file: path of the fasta file to write.
    """
    with open(read1_consensus) as open_read1:
        read1_name, read1_sequence = FastaReader(open_read1).next()

    name = "%s_forced_merged" % read1_name
    array = [read1_sequence]
    spacer = "N" * 100
    with open(read2_consensus) as open_read2:
        for read2_name, read2_sequence in FastaReader(open_read2):
            name = "%s_%s" % (name, read2_name)
            array.extend((spacer, read2_sequence))

    with open(output_merge_file, 'w') as open_output:
        open_output.write(">%s\n%s\n" % (name, ''.join(array)))
Пример #7
0
class GenomeLoader:
    """Load chromosomes from a fasta file, by name or one after the other.

    Records are read sequentially from the file; chromosomes read while
    looking for another one can be cached in ``all_chr``.

    keep_in_memory=True keeps the chromosomes that were read but not asked
    for until they are requested.
    keep_until_done=True keeps every chromosome that was read until the
    loader is closed.
    prefix is added to chromosome names that do not already start with it.
    """
    def __init__(self,
                 genome_file,
                 keep_in_memory=True,
                 keep_until_done=False,
                 prefix=''):
        self.open_genome_file = open_input_file(genome_file)
        self.reader = FastaReader(self.open_genome_file)
        self.keep_in_memory = keep_in_memory
        self.keep_until_done = keep_until_done
        self.prefix = prefix
        # Maps chromosome name -> (header, sequence) for cached records.
        self.all_chr = {}

    def load_all(self):
        """Read the rest of the file and return the names of the cached chromosomes.

        Chromosomes are only cached when keep_in_memory or keep_until_done
        is set.
        """
        # Asking for a name that is not expected to exist makes get_chr read
        # every remaining record.
        self.get_chr('***********************************************')
        return self.all_chr.keys()

    def _take_cached(self, name):
        """Return the cached record for name, or None if it is not cached.

        The record is removed from the cache unless keep_until_done is set.
        """
        if self.keep_until_done:
            return self.all_chr.get(name)
        return self.all_chr.pop(name, None)

    def get_chr(self, chr):
        """Return the (header, sequence) record of the chromosome named chr.

        The cache is checked first; otherwise records are read from the file
        until the chromosome is found. The name of a record is the first
        whitespace-separated word of its header, with the prefix added when
        missing. When the reader returns a false value before the chromosome
        is found, that value is returned.
        """
        if not chr.startswith(self.prefix):
            chr = self.prefix + chr

        fasta_seq = self._take_cached(chr)
        if fasta_seq:
            header, _seq = fasta_seq
            logging.debug('return %s' % header)
            return fasta_seq

        curr_chr = ''
        while curr_chr != chr:
            # Assumes the reader returns a false value once exhausted.
            fasta_seq = self.reader.next()
            if not fasta_seq:
                break
            header, _seq = fasta_seq
            logging.debug('load %s' % header)
            curr_chr = header.split()[0]
            if not curr_chr.startswith(self.prefix):
                curr_chr = self.prefix + curr_chr
            # The requested chromosome is only cached with keep_until_done.
            if (self.keep_in_memory and curr_chr != chr) or self.keep_until_done:
                logging.debug('keep %s' % header)
                self.all_chr[curr_chr] = fasta_seq
        return fasta_seq

    def next(self):
        """Return the next (header, sequence) record, or None when there is none.

        A cached chromosome is returned first if there is one; otherwise the
        next record is read from the file.
        """
        # NOTE(review): with keep_until_done=True cached records are never
        # removed, so once the cache is non-empty the same record is returned
        # on every call - confirm the intended iteration behaviour.
        # NOTE(review): unlike get_chr, the prefix is not applied to the name
        # used as cache key here - confirm whether that is intended.
        fasta_seq = None
        if self.all_chr:
            first_name = next(iter(self.all_chr))
            fasta_seq = self._take_cached(first_name)
        if not fasta_seq:
            fasta_seq = self.reader.next()
            if fasta_seq:
                header, _seq = fasta_seq
                curr_chr = header.split()[0]
                logging.debug('load %s' % header)
                if self.keep_until_done:
                    logging.debug('keep %s' % header)
                    self.all_chr[curr_chr] = fasta_seq
        if fasta_seq:
            # Take the header from the record itself: it may come from the
            # cache, in which case no header was unpacked above.
            logging.debug('return %s' % fasta_seq[0])
            return fasta_seq
        return None

    def __iter__(self):
        """Iterate over records by calling next() until it returns None."""
        return iter(self.next, None)

    def __del__(self):
        self.close()

    def close(self):
        """Close the genome file and drop the cached chromosomes."""
        # The attribute is missing when __init__ failed before opening the file.
        open_file = getattr(self, 'open_genome_file', None)
        if open_file is not None:
            open_file.close()
        self.all_chr = None
Пример #8
0
class GenomeLoader:
    """Genome Loader takes a fasta file and tries to find a given chromosome in it.

    You can ask it to keep the chromosomes that were loaded but not requested
    in memory until they are required (keep_in_memory=True).
    You can also ask it to keep all the loaded chromosomes until the object
    is closed (keep_until_done=True).
    You can provide a prefix that is added to the chromosome names if not
    already there.
    """
    def __init__(self, genome_file, keep_in_memory=True, keep_until_done=False, prefix=''):
        self.open_genome_file = open_input_file(genome_file)
        self.reader = FastaReader(self.open_genome_file)
        self.keep_in_memory = keep_in_memory
        self.keep_until_done = keep_until_done
        self.prefix = prefix
        # Cache of loaded records: chromosome name -> (header, sequence).
        self.all_chr = {}

    def load_all(self):
        """Read all remaining records and return the names held in the cache.

        Nothing is cached unless keep_in_memory or keep_until_done is set.
        """
        # A name that should match no chromosome forces a read to the end.
        self.get_chr('***********************************************')
        return self.all_chr.keys()

    def _lookup_cache(self, name):
        """Fetch name from the cache (None if absent).

        The entry is popped unless keep_until_done is set.
        """
        if self.keep_until_done:  # return if loaded already
            return self.all_chr.get(name)
        # remove if loaded already
        return self.all_chr.pop(name, None)

    def get_chr(self, chr):
        """Return the (header, sequence) record for chromosome chr.

        Looks in the cache first, then reads records from the file until a
        record whose name (first word of the header, prefixed if needed)
        equals chr. If the reader returns a false value first, that value is
        returned instead.
        """
        if not chr.startswith(self.prefix):
            chr = self.prefix + chr

        fasta_seq = self._lookup_cache(chr)
        if fasta_seq:
            (header, seq) = fasta_seq
            logging.debug('return %s' % header)
            return fasta_seq

        curr_chr = ''
        while not curr_chr == chr:
            # Assumes a false value signals the end of the file.
            fasta_seq = self.reader.next()
            if not fasta_seq:
                break
            (header, seq) = fasta_seq
            logging.debug('load %s' % header)
            curr_chr = header.split()[0]
            if not curr_chr.startswith(self.prefix):
                curr_chr = self.prefix + curr_chr
            # Other chromosomes are cached with keep_in_memory; the requested
            # one is cached only with keep_until_done.
            if (self.keep_in_memory and not curr_chr == chr) or self.keep_until_done:
                logging.debug('keep %s' % header)
                self.all_chr[curr_chr] = fasta_seq
        return fasta_seq

    def next(self):
        """Return the next (header, sequence) record or None if none is left.

        Cached chromosomes are served before reading further in the file.
        """
        # NOTE(review): with keep_until_done=True the cache is never emptied,
        # so the same cached record comes back on every call once the cache
        # holds something - confirm the intended behaviour.
        # NOTE(review): the cache key is not prefixed here, unlike in
        # get_chr - confirm whether that is intended.
        fasta_seq = None
        if len(self.all_chr) > 0:
            cached_name = next(iter(self.all_chr))
            fasta_seq = self._lookup_cache(cached_name)
        if not fasta_seq:
            fasta_seq = self.reader.next()
            if fasta_seq:
                (header, seq) = fasta_seq
                curr_chr = header.split()[0]
                logging.debug('load %s' % header)
                if self.keep_until_done:
                    logging.debug('keep %s' % header)
                    self.all_chr[curr_chr] = fasta_seq
        if fasta_seq:
            # Unpack here as well: a record served from the cache has not
            # been unpacked yet.
            (header, seq) = fasta_seq
            logging.debug('return %s' % header)
            return fasta_seq
        return None

    def __iter__(self):
        """Iterate by calling next() until it returns None."""
        return iter(self.next, None)

    def __del__(self):
        self.close()

    def close(self):
        """Close the underlying file and release the cached chromosomes."""
        # open_genome_file does not exist if __init__ failed to open the file.
        genome_handle = getattr(self, 'open_genome_file', None)
        if genome_handle is not None:
            genome_handle.close()
        self.all_chr = None