Exemplo n.º 1
0
def abyssmap_rmdups(in_fa, out_fa, strand_specific=False, cleanup=False, threads=1):
    """Filter `in_fa` into `out_fa`, dropping the IDs reported by `abyss-map --dup`.

    The fasta is mapped against itself with `abyss-map --dup` and the
    command's standard output is redirected to `<in_fa>.dup_ids`. Every
    non-blank line of that file is treated as one sequence ID, and the
    resulting set is handed to `filter_fasta` as `remove_set`.

    Args:
        in_fa: path of the input fasta; passed twice to abyss-map.
        out_fa: output path passed on to `filter_fasta`.
        strand_specific: when true, `--SS` is added to the abyss-map call.
        cleanup: when true, the intermediate `.dup_ids` file is deleted.
        threads: thread count; `--threads=N` is only added when N > 1.
    """
    dup_ids_path = in_fa + '.dup_ids'

    # Optional flags are inserted before the positional arguments.
    # NOTE(review): paths are joined into the command string unquoted;
    # presumably run_shell_cmd goes through a shell, so paths containing
    # spaces or shell metacharacters would break -- confirm against callers.
    abyssmap_cmd = ['abyss-map', '--dup']
    if strand_specific:
        abyssmap_cmd += ['--SS']
    #endif
    if threads > 1:
        abyssmap_cmd += ['--threads=%d' % threads]
    #endif
    abyssmap_cmd += [in_fa, in_fa, '> %s' % dup_ids_path]

    run_shell_cmd(' '.join(abyssmap_cmd))

    # One ID per line; blank (or whitespace-only) lines are ignored.
    with open(dup_ids_path, 'r') as fh:
        dup_ids = {cid for cid in (raw.strip() for raw in fh) if cid}
    #endwith

    filter_fasta(in_fa, out_fa, remove_set=dup_ids)

    if cleanup:
        os.remove(dup_ids_path)
Exemplo n.º 2
0
def bowtie2_self_align(fasta, outputsam, threads=1, strand_specific=False, path_strip_sam_seq_qual=None, preset='--sensitive', k=2):
    """Index a fasta with Bowtie2 and align its sequences against that index.

    Two shell commands are issued through `run_shell_cmd`:

    1. `bowtie2-build --quiet <fasta> <fasta>` builds the index, using the
       fasta path itself as the index base name.
    2. `bowtie2 ... <fasta> <fasta>` performs the alignment; its output is
       optionally piped through `path_strip_sam_seq_qual` and is always
       piped through `gzip -c` into `outputsam`.

    Args:
        fasta: path of the fasta to index and align.
        outputsam: path receiving the gzip-compressed alignment output.
        threads: value for bowtie2's `-p` option.
        strand_specific: when true, `--norc` is passed to bowtie2.
        path_strip_sam_seq_qual: optional command inserted as an extra
            pipeline stage between bowtie2 and gzip.
        preset: bowtie2 preset option string, inserted verbatim.
        k: value for bowtie2's `-k` option.
    """
    # Step 1: build the index files next to the fasta.
    run_shell_cmd(' '.join(('bowtie2-build --quiet', fasta, fasta)))

    # Step 2: assemble the alignment pipeline piece by piece.
    # `set -euo pipefail` makes a failure in any pipeline stage fail the
    # whole command instead of only the final gzip stage being considered.
    pipeline = ['set -euo pipefail && bowtie2']
    if strand_specific:
        pipeline += ['--norc']
    #endif

    pipeline += [
        preset,
        '-k %d' % k,
        '--omit-sec-seq --end-to-end -f',
        '-p %d' % threads,
        fasta,
        fasta,
    ]

    if path_strip_sam_seq_qual:
        pipeline += ['|' + path_strip_sam_seq_qual]
    #endif
    pipeline += ['|gzip -c >' + outputsam]

    run_shell_cmd(' '.join(pipeline))
Exemplo n.º 3
0
def blat_merge_fastas(path_prefix_map, merged_fa, concat_fa=None, concat_fa_selfalign_psl=None, percent_identity=0.95, strand_specific=False, cleanup=False, minoverlap=0, threads=1, indel_size_tolerance=1, min_seq_len=32):
    """Merge several fasta files into `merged_fa`, dropping redundant sequences.

    Workflow:

    1. `concat_fastas` writes the inputs described by `path_prefix_map`
       into `concat_fa`.
    2. `blat_self_align` self-aligns `concat_fa` into
       `concat_fa_selfalign_psl`.
    3. `psl_cid_extractor.extract_cids` (with `report_redundant=False`)
       yields the IDs to keep.
    4. `filter_fasta` writes the kept sequences to a temporary "long" fasta
       and, when `minoverlap > 0`, also to a temporary "short" fasta.
    5. When `minoverlap > 0`, the long sequences go through `abyss-overlap`,
       `abyss-layout` and `MergeContigs`; the short fasta is appended to
       that result, which is then moved to `merged_fa`. Otherwise the long
       fasta is moved to `merged_fa` directly.

    Args:
        path_prefix_map: passed straight to `concat_fastas`.
        merged_fa: path of the final merged fasta; also the stem for all
            temporary file names.
        concat_fa: path for the concatenated fasta
            (default `<merged_fa>.tmp.concat.fa`).
        concat_fa_selfalign_psl: path for the self-alignment PSL
            (default `<merged_fa>.tmp.concat.psl`).
        percent_identity: identity threshold given to both the aligner and
            the ID extractor.
        strand_specific: forwarded as `samestrand` to the extractor and as
            `--SS` to abyss-overlap / abyss-layout.
        cleanup: when true, temporary files that still exist are deleted.
        minoverlap: minimum overlap; values > 0 enable the overlap-layout
            stage, which uses `minoverlap + 1` as its k-mer size.
        threads: thread count for the aligner and abyss-overlap.
        indel_size_tolerance: forwarded as `max_consecutive_edits`.
        min_seq_len: forwarded to `blat_self_align`.
    """
    # Derive temporary paths from the output name when none were supplied.
    concat_fa = merged_fa + '.tmp.concat.fa' if concat_fa is None else concat_fa
    concat_fa_selfalign_psl = merged_fa + '.tmp.concat.psl' if concat_fa_selfalign_psl is None else concat_fa_selfalign_psl

    # Concatenate the fastas together (prefixing is delegated to concat_fastas).
    concat_fastas(path_prefix_map, concat_fa)

    # Self-align the concatenated fasta via blat_self_align.
    blat_self_align(concat_fa, concat_fa_selfalign_psl, percent_id=percent_identity, max_consecutive_edits=indel_size_tolerance, min_seq_len=min_seq_len, threads=threads)

    # IDs of the NON-redundant contigs.
    keep_ids = psl_cid_extractor.extract_cids(psl=concat_fa_selfalign_psl, samestrand=strand_specific, min_percent_identity=percent_identity, max_consecutive_edits=indel_size_tolerance, report_redundant=False)

    # The short partition only exists when the overlap-layout stage runs.
    do_layout = minoverlap > 0
    long_fa = merged_fa + '.tmp.long.fa'
    short_fa = merged_fa + '.tmp.short.fa' if do_layout else None
    tmpfiles = [short_fa] if do_layout else []

    # Split the kept sequences on min_length = minoverlap + 1.
    # NOTE(review): presumably sequences below min_length are written to
    # fasta_out_st and the rest to long_fa -- confirm against filter_fasta.
    filter_fasta(concat_fa, long_fa, min_length=minoverlap+1, keep_set=keep_ids, fasta_out_st=short_fa)

    if not do_layout:
        # No overlap-layout requested: the long partition is the result.
        shutil.move(long_fa, merged_fa)
        tmpfiles += [concat_fa, concat_fa_selfalign_psl]
    else:
        ss_opts = ['--SS'] if strand_specific else []

        # Build the sequence overlap graph (captured from stdout).
        overlap_dot = merged_fa + '.tmp.long.dot'
        overlap_cmd = ['abyss-overlap', '--threads=%d' % threads, '--min=%d' % minoverlap] + ss_opts + [long_fa, '>' + overlap_dot]
        run_shell_cmd(' '.join(overlap_cmd))

        kmer_opt = '--kmer=%d' % (minoverlap+1)

        # Lay out the contigs using the overlap graph.
        layout_path = merged_fa + '.tmp.long.path'
        layout_cmd = ['abyss-layout', kmer_opt, '--out=%s' % layout_path] + ss_opts + [overlap_dot]
        run_shell_cmd(' '.join(layout_cmd))

        # Generate the overlap-layout fasta under a provisional name so that
        # merged_fa only appears once it is complete.
        overlap_fa = merged_fa + '.incomplete'
        merge_cmd = ['MergeContigs', kmer_opt, '--out=%s' % overlap_fa, long_fa, overlap_dot, layout_path]
        run_shell_cmd(' '.join(merge_cmd))

        # Append the short sequences to the same fasta.
        with open(overlap_fa, 'a') as fout, open(short_fa, 'r') as fin:
            fout.writelines(fin)
        #endwith

        shutil.move(overlap_fa, merged_fa)
        # short_fa ends up listed twice; harmless, since removal below is
        # guarded by an existence check.
        tmpfiles += [concat_fa, concat_fa_selfalign_psl, long_fa, short_fa, overlap_dot, layout_path]
    #endif

    if not cleanup:
        return
    #endif

    for tmp_path in tmpfiles:
        if tmp_path is None or not os.path.isfile(tmp_path):
            continue
        #endif
        os.remove(tmp_path)
    #endfor