Example #1
    def graph(self, current_kmer, next_kmer):
        options = {
            'k': next_kmer,
            'host_mem': self.available_memory,
            'mem_flag': 1,
            'output_prefix': self._graph_prefix(next_kmer),
            'num_cpu_threads': self.threads,
            'need_mercy': not self.no_mercy and current_kmer == self.kmin,
            'kmer_from': current_kmer,
            'useconv': False
        }

        if current_kmer == 0:  # Indicating it's the first graph
            if not self.one_pass:
                logger.log(2, f"Extracting solid (k+1)-mers for k={next_kmer}")
                count_opts = options.copy()
                count_opts['m'] = self.min_multi
                count_opts['read_lib_file'] = self.read_lib
                count_opts.pop('need_mercy')
                count_opts.pop('kmer_from')
                logger.log(0, f"Extract options : {count_opts}")
                shell_call(self.MEGAHIT_CORE, 'count', **count_opts)

        file_size = 0

        if path.exists(self._graph_prefix(next_kmer) + '.edges.0'):
            options['input_prefix'] = self._graph_prefix(next_kmer)
            file_size += path.getsize(
                self._graph_prefix(next_kmer) + '.edges.0')

        if path.exists(self._contig_prefix(current_kmer) + '.addi.fa'):
            options['addi_contig'] = \
                self._contig_prefix(current_kmer) + '.addi.fa'
            file_size += path.getsize(
                self._contig_prefix(current_kmer) + '.addi.fa')

        if path.exists(self._contig_prefix(current_kmer) + '.local.fa'):
            options['local_contig'] = \
                self._contig_prefix(current_kmer) + '.local.fa'
        file_size += path.getsize(
            self._contig_prefix(current_kmer) + '.local.fa')

        if path.exists(self._contig_prefix(current_kmer) + '.contigs.fa'):
            options['contig'] = \
                self._contig_prefix(current_kmer) + '.contigs.fa'
            options['bubble'] = \
                self._contig_prefix(current_kmer) + '.bubble_seq.fa'
            file_size += path.getsize(
                self._contig_prefix(current_kmer) + '.contigs.fa')

        if file_size == 0 and current_kmer != 0:
            raise EmptyGraph

        logger.log(2, f'Building graph for k={next_kmer}')
        logger.log(0, f'Build options : {options}')

        shell_call(self.MEGAHIT_CORE, 'seq2sdbg', **options)

        if file_size != 0 and current_kmer != 0 and not self.keep_temp:
            os.system(f"rm -r {path.join(self.temp_dir, f'k{current_kmer}')}")
Example #2
def findmitoscaf(args):

    if args.__calling == 'findmitoscaf':

        if not args.from_megahit:
            logger.log(2, 'Remapping reads to contigs since the contigs were not assembled by this pipeline.')
            fastfilter_bin = path.abspath(path.join(path.dirname(__file__), 'assemble', 'fastfilter'))
            filtered_fasta = path.join(args.findmitoscaf_dir, f'{args.workname}.filtered.fa')
            shell_call(fastfilter_bin, i=args.fastafile, o=filtered_fasta,
                       l=f"{configurations.assemble.min_length},{configurations.assemble.max_length}",
                       d=0)
            fq1, fq2 = args.fastq1, args.fastq2
            if not (fq1 or fq2):
                raise RuntimeError("At least one fastq file should be specified!")
            if not fq1:
                fq1, fq2 = fq2, fq1
            # Remap reads to calculate the average depth of each contig.
            from findmitoscaf.findmitoscaf import remap_sequence
            args.fastafile = remap_sequence(args.workname, args.findmitoscaf_dir,
                                            filtered_fasta, fq1, fq2, args.threads)
        else:
            logger.log(2, "Remapping skipped since from-megahit is specified, no tagging needed.")

    from findmitoscaf.findmitoscaf import findmitoscaf as _findmitoscaf
    picked_fa = _findmitoscaf(
        thread_number=args.threads, clade=args.clade, relaxing=args.taxa_tolerance, gene_code=args.genetic_code,
        multi=args.min_abundance, taxa=args.required_taxa if not args.disable_taxa else None,
        prefix=args.workname, basedir=args.findmitoscaf_dir, contigs_file=args.fastafile,
        merge_method=args.merge_method, merge_overlapping=args.merge_overlap, merge_search=args.merge_start)

    # Further processing for calling directly
    if args.__calling == 'findmitoscaf':
        os.rename(picked_fa, path.join(
            args.result_dir, path.basename(picked_fa)))
    return picked_fa
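
The fastq normalization above relies on a tuple swap so the code after it can assume fq1 is always set; a tiny illustration of the idiom with made-up values:

fq1, fq2 = None, 'reads_2.fq'    # hypothetical: only the second mate given
if not (fq1 or fq2):
    raise RuntimeError("At least one fastq file should be specified!")
if not fq1:
    fq1, fq2 = fq2, fq1          # now fq1 == 'reads_2.fq', fq2 is None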
Example #3
def filter_pe(fq1=None, fq2=None, o1=None, o2=None,
              dedup=False, start=None, end=None,
              n=10, q=55, l=0.2, trim=0, trunc=False):
    fsin1, fsin2 = path.getsize(fq1), path.getsize(fq2)
    logger.log(level=1, info='Start filtering paired-end raw data.')
    logger.log(
        level=0, info=f'Input file 1 has {fsin1} bytes, 2 has {fsin2} bytes.')
    if fsin1 != fsin2:
        logger.log(
            level=3, info='Input files 1 and 2 have different sizes! This could cause loss of raw data, or even crash the program.')
    logger.log(
        level=1, info=f'Using arguments: Ns={n}, quality={q}, start={start}, end={end}, limit={l}, trimming={trim}')
    try:
        shell_call(path.join(filter_dir, 'filter_v2'),
                   _1=f'"{fq1}"', _2=f'"{fq2}"', _3=f'"{o1}"', _4=f'"{o2}"', d=dedup, s=start,
                   e=end, n=n, q=q, l=l, t=trim, truncate_only=trunc)
    except Exception as identifier:
        logger.log(
            level=4, info=f'Error occurred when running filter, cause: {identifier}')
        logger.log(level=1, info=f'Input files: {fq1}, {fq2}')
        logger.log(level=1, info=f'Output files: {o1}, {o2}')
        sys.exit("Error occurred when running filter!")

    fsot1 = path.getsize(o1)
    logger.log(level=0, info=f'Output file 1 has {fsot1} bytes.')
    logger.log(level=1,
               info=f'Filtered {fsin1 - fsot1} bytes, kept {100 * fsot1 / fsin1:.2f}%.')
    return o1, o2
Example #4
def remap_sequence(prefix=None, basedir=None, fasta_file=None, fastq1=None, fastq2=None, threads=8):

    # Remap the fastq reads back onto the fasta file.
    # This can be a non-trivial task, so a portion of the threads is
    # given to samtools view and samtools sort.
    logger.log(2, "Mapping fastq reads back onto fasta file.")
    shell_call('bwa index', fasta_file)
    bam_file = path.join(basedir, f'{prefix}.bam')
    check_output(
        f'bwa mem -t {max(1, int(threads * 0.75))} {fasta_file} {fastq1} '
        f'{fastq2 if fastq2 is not None else ""} '
        f'| samtools view -bS -q 30 -h -@ {max(1, int(threads * 0.25))} -o {bam_file} -',
        shell=True)
    bam_sorted_file = path.join(basedir, f'{prefix}.sorted.bam')
    check_output(f'samtools sort -@ {threads} -o {bam_sorted_file} {bam_file}', shell=True)

    logger.log(2, "Calculating average depth for each sequence.")
    gene_depth_file = path.join(basedir, f'{prefix}.dep')
    avgdep_bin = path.join(path.abspath(path.dirname(__file__)), 'avgdep_bin')
    check_output(
        f'samtools depth -aa {bam_sorted_file} |{avgdep_bin} -o {gene_depth_file}', shell=True)

    # Each line of the depth file is "<sequence id> <average depth>".
    mapping = {k: v for k, v in map(str.split, open(gene_depth_file))}

    logger.log(2, "Retagging sequences for latter processing.")
    sequences = []
    for seq in SeqIO.parse(fasta_file, 'fasta'):
        seq.description = f"flag=1 multi={mapping[seq.id]}"
        sequences.append(seq)
    # Return the retagged copy written under basedir (in the pipeline's
    # own call this is the same path as fasta_file).
    retagged_fasta = path.join(basedir, path.basename(fasta_file))
    SeqIO.write(sequences, retagged_fasta, 'fasta')

    return retagged_fasta
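
A hedged usage example (all paths and the prefix are made up). With threads=8, bwa mem gets max(1, int(8 * 0.75)) = 6 threads and samtools view gets max(1, int(8 * 0.25)) = 2:

tagged_fa = remap_sequence(prefix='sample', basedir='/tmp/work',
                           fasta_file='contigs.filtered.fa',
                           fastq1='reads_1.fq', fastq2='reads_2.fq',
                           threads=8)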
Example #5
    def local(self, current_kmer, next_kmer):
        logger.log(2, f'Local assembly for k = {current_kmer}')
        shell_call(self.MEGAHIT_CORE,
                   'local',
                   c=self._contig_prefix(current_kmer) + '.contigs.fa',
                   l=self.read_lib,
                   t=self.threads,
                   o=self._contig_prefix(current_kmer) + '.local.fa',
                   kmax=next_kmer)
Example #6
    def assemble(self, kmer) -> Tuple[ContigInfo, ContigInfo]:
        min_standalone = max(
            min(self.kmax * 3 - 1, int(self.min_length * 1.5)),
            self.min_length)

        options = {
            's': self._graph_prefix(kmer),
            'o': self._contig_prefix(kmer),
            't': self.threads,
            'min_standalone': min_standalone,
            'prune_level': self.prune_level,
            'merge_len': 20,
            'merge_similar': 0.95,
            'cleaning_rounds': 5,
            'disconnect_ratio': 0.1,
            'low_local_ratio': 0.2,
            'min_depth': self.prune_depth,
            'bubble_level': 2,
            'max_tip_len': (max(1, self.min_length * 1.5 + 1 - kmer)
                            if kmer * 3 - 1 > self.min_length * 1.5 else -1),
            'careful_bubble': kmer < self.kmax,
            'is_final_round': kmer == self.kmax,
            'output_standalone': self.no_local,
            'useconv': False
        }

        logger.log(2, f'Assembling contigs from SdBG for k = {kmer}')
        logger.log(0, f'Assemble arguments : {options}')

        shell_call(self.MEGAHIT_CORE, 'assemble', **options)
        with open(self._contig_prefix(kmer) + '.contigs.fa.info', 'r') as c, \
                open(self._contig_prefix(kmer) + '.addi.fa.info', 'r') as a:
            return ContigInfo(c), ContigInfo(a)
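
The max_tip_len expression is easiest to check with numbers plugged in; a small worked example with made-up values for kmer and min_length:

kmer, min_length = 59, 100
# kmer * 3 - 1 = 176 > min_length * 1.5 = 150.0, so tips are bounded:
# max_tip_len = max(1, 100 * 1.5 + 1 - 59) = 92.0
# With kmer = 29: 29 * 3 - 1 = 86 <= 150.0, so max_tip_len = -1 (no limit).
max_tip_len = (max(1, min_length * 1.5 + 1 - kmer)
               if kmer * 3 - 1 > min_length * 1.5 else -1)
print(max_tip_len)  # 92.0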
Example #7
    def iterate(self, current_kmer, next_kmer):
        logger.log(
            2,
            f'Extracting iterative edges from k = {current_kmer} to {next_kmer}'
        )
        shell_call(self.MEGAHIT_CORE,
                   'iterate',
                   c=self._contig_prefix(current_kmer) + '.contigs.fa',
                   b=self._contig_prefix(current_kmer) + '.bubble_seq.fa',
                   t=self.threads,
                   s=next_kmer - current_kmer,
                   o=self._graph_prefix(next_kmer),
                   r=self.read_lib + '.bin',
                   k=current_kmer)
Example #8
    def finalize(self, kmer):
        self.final_contig = path.join(self.result_dir, f'k{kmer}.contig.fa')

        # Relies on shell_call passing '>' through to the shell for redirection.
        shell_call('cat', path.join(self.contig_dir, '*.final.contigs.fa'),
                   self._contig_prefix(kmer) + '.contigs.fa', '>',
                   self.final_contig)

        if not self.keep_temp:
            to_remove = self.temp_dir
            if path.isdir(str(a_conf.external_temp)):
                # The external temp dir nests under a uuid-named parent;
                # remove that parent as well.
                to_remove = path.join(to_remove, "..")
            to_remove = path.abspath(to_remove)

            os.system(f'rm -r {to_remove}')
Example #9
    def initialize(self):
        self.basedir = path.abspath(self.basedir)
        self.fq1 = path.abspath(self.fq1)
        if self.fq2:
            self.fq2 = path.abspath(self.fq2)

        # Check if POPCNT command is supported
        if self.use_popcnt:
            if shell_call('megahit_core checkpopcnt').rstrip() != '1':
                self.use_popcnt = False
                logger.log(3, "POPCNT is disabled since no features detected.")
            else:
                self.hwaccel = shell_call(
                    "megahit_core checkcpu").rstrip() == '1'

                logger.log(
                    2,
                    f"Using megahit with {'hardware acceleration' if self.hwaccel else 'POPCNT'} support."
                )
        else:
            logger.log(2, "POPCNT disabled by argument.")

        if self.one_pass:
            logger.log(3, "Using 1-pass mode.")

        self.result_dir = safe_makedirs(
            path.join(self.basedir, f'{self.prefix}.result'), False)

        if not path.isdir(str(a_conf.external_temp)):
            self.temp_dir = safe_makedirs(
                path.join(self.basedir, f'{self.prefix}.temp'), False)
        else:
            self.temp_dir = safe_makedirs(
                path.join(a_conf.external_temp, str(uuid.uuid4()),
                          f'{self.prefix}.temp'), False)

        self.read_lib = path.join(self.temp_dir, 'reads.lib')
        self.contig_dir = safe_makedirs(
            path.join(self.temp_dir, 'intermediate_contigs'), False)

        vm = psutil.virtual_memory()
        logger.log(
            1,
            f"System memory status : {', '.join([f'{k}={v/(1024**2):.2f}MB' for k,v in vm._asdict().items() if type(v) is int])}"
        )
        self.available_memory = int(vm.available * a_conf.max_mem_percent)
        logger.log(
            2, f'Scheduled {self.available_memory/(1024**2):.2f}MB to use.')
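
The scheduled memory is a fixed fraction of what psutil reports as currently available; a quick standalone illustration (0.8 is an assumed stand-in for a_conf.max_mem_percent):

import psutil

max_mem_percent = 0.8  # assumed value of a_conf.max_mem_percent
vm = psutil.virtual_memory()
available_memory = int(vm.available * max_mem_percent)
print(f'Scheduled {available_memory / (1024 ** 2):.2f}MB to use.')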
Example #10
    def build_lib(self):

        # Write reads info
        with open(self.read_lib, 'w') as l:
            fifos = []

            if self.fq1 and self.fq2:
                print(self.fq1, self.fq2, sep=',', file=l)
                fq1, fq2 = (self.fq1 if not self.fq1.endswith('gz') else
                            path.join(self.temp_dir, 'pipe.pe1'),
                            self.fq2 if not self.fq2.endswith('gz') else
                            path.join(self.temp_dir, 'pipe.pe2'))

                if self.fq1.endswith('gz'):
                    fifo1 = path.join(self.temp_dir, 'pipe.pe1')
                    os.mkfifo(fifo1)
                    fifos.append(
                        subprocess.Popen(f'gzip -dc {self.fq1} > {fifo1}',
                                         shell=True,
                                         preexec_fn=os.setsid))

                if self.fq2.endswith('gz'):
                    fifo2 = path.join(self.temp_dir, 'pipe.pe2')
                    os.mkfifo(fifo2)
                    fifos.append(
                        subprocess.Popen(f'gzip -dc {self.fq2} > {fifo2}',
                                         shell=True,
                                         preexec_fn=os.setsid))

                print('pe', fq1, fq2, file=l)
            else:
                print(self.fq1, file=l)
                fq1 = self.fq1 if not self.fq1.endswith('gz') else path.join(
                    self.temp_dir, 'pipe.se')
                print('se', fq1, file=l)

        logger.log(1, "Converting reads to binary library.")
        shell_call(self.MEGAHIT_CORE, 'buildlib', self.read_lib, self.read_lib)

        if any(x.wait() != 0 for x in fifos):
            raise RuntimeError("Error occurred in reading input fifos")

        with open(self.read_lib + '.lib_info') as ri:
            info = [x.split(' ') for x in ri.readlines()]
            return LibInfo(info)
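
The gzip branches above stream decompressed reads through named pipes rather than writing temporary files. A stripped-down sketch of the same pattern, with made-up paths:

import os
import subprocess
import tempfile

tmp = tempfile.mkdtemp()
fifo = os.path.join(tmp, 'pipe.pe1')  # hypothetical pipe path
os.mkfifo(fifo)
# Background writer: decompresses into the pipe; gzip blocks until a
# reader (downstream, the assembler) opens the other end.
writer = subprocess.Popen(f'gzip -dc reads_1.fq.gz > {fifo}',
                          shell=True, preexec_fn=os.setsid)
# Reader side: consumes plain-text fastq from the pipe.
with open(fifo) as fin:
    data = fin.read()
assert writer.wait() == 0  # make sure decompression finished cleanly
os.remove(fifo)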
Example #11
def blastn_multi(dbfile=None, infile=None, basedir=None, prefix=None, threads=8):
    infile = path.abspath(infile)
    dbfile = path.abspath(dbfile)

    truncated_call('makeblastdb', '-in', infile, dbtype='nucl')

    nucl_data_dir = path.join(basedir, "blastn_data")

    try:
        os.mkdir(nucl_data_dir)
    except FileExistsError:
        raise RuntimeError("Folder already exists; please make sure the working directory is clean.")

    logger.log(1, f'Making {threads} small datasets for calling blastn.')

    file_names = [path.join(nucl_data_dir, f'dataset_{x}.fasta') for x in range(threads)]

    tasks = [f'blastn -evalue 1e-5 -outfmt 6 -db {infile} -query {dataset_path}' for dataset_path in file_names]
    seqs = [[] for i in range(threads)]

    for i, seq in enumerate(SeqIO.parse(dbfile, 'fasta')):
        seqs[i % threads].append(seq)

    for i in range(threads):
        SeqIO.write(seqs[i], file_names[i], 'fasta')

    logger.log(1, 'Generating map for calling blastn.')
    pool = multiprocessing.Pool(processes=threads)

    out_blast = path.join(path.abspath(basedir), f'{prefix}.blast')
    with open(out_blast, 'w') as f:
        pool.map_async(direct_call, tasks, callback=lambda x: f.write(''.join(x)))
        pool.close()
        logger.log(1, "Waiting for all processes to finish.")
        pool.join()

    logger.log(1, 'Cleaning generated temp files.')

    shell_call('rm -r', nucl_data_dir)
    os.remove(f'{infile}.nhr')
    os.remove(f'{infile}.nin')
    os.remove(f'{infile}.nsq')

    return out_blast
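
Both blast wrappers split the query set across a process pool and join the per-chunk stdout in the map_async callback, which receives the whole ordered result list in a single call. A minimal self-contained sketch of that pattern (direct_call_sketch is a hypothetical stand-in for the project's direct_call):

import multiprocessing
import subprocess

def direct_call_sketch(cmd):
    # Hypothetical stand-in: run a command string, return its stdout.
    return subprocess.check_output(cmd, shell=True).decode()

if __name__ == '__main__':
    tasks = ['echo chunk1', 'echo chunk2', 'echo chunk3']
    pool = multiprocessing.Pool(processes=3)
    with open('combined.out', 'w') as f:
        # The callback fires once, with the ordered list of all results.
        pool.map_async(direct_call_sketch, tasks,
                       callback=lambda results: f.write(''.join(results)))
        pool.close()
        pool.join()  # workers and the callback are done after this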
Example #12
def tblastn_multi(dbfile=None, infile=None, genetic_code=9, basedir=None,
                  prefix=None, threads=8):

    infile = path.abspath(infile)
    dbfile = path.abspath(dbfile)

    truncated_call('makeblastdb', '-in', infile, dbtype='nucl')

    tasks = []

    protein_data_dir = path.join(basedir, 'tblastn_data')

    try:
        os.mkdir(protein_data_dir)
    except FileExistsError:
        raise RuntimeError(
            "Folder already exists; please make sure the working directory is clean.")

    logger.log(1, f'Making {threads} small datasets for calling tblastn.')
    tblastn_db = np.array_split(list(SeqIO.parse(dbfile, 'fasta')), threads)
    for idx, data in enumerate(tblastn_db):
        if data.any():
            logger.log(0, f'Dataset {idx} has {len(data)} queries.')
            dataset_path = path.join(protein_data_dir, f'dataset_{idx}.fasta')
            SeqIO.write(data, dataset_path, 'fasta')
            tasks.append(
                f'tblastn -evalue 1e-5 -outfmt 6 -seg no -db_gencode {genetic_code} -db {infile} -query {dataset_path}')
    logger.log(1, 'Generating map for calling tblastn.')
    pool = multiprocessing.Pool(processes=threads)

    out_blast = path.join(path.abspath(basedir), f'{prefix}.blast')
    with open(out_blast, 'w') as f:
        pool.map_async(direct_call, tasks, callback=lambda x: f.write(''.join(x)))
        logger.log(1, 'Waiting for all processes to finish.')
        pool.close()
        pool.join()

    logger.log(1, 'Cleaning generated temp files.')
    shell_call('rm -r', protein_data_dir)
    os.remove(f'{infile}.nhr')
    os.remove(f'{infile}.nin')
    os.remove(f'{infile}.nsq')
    return out_blast
Example #13
    def scaf(self) -> str:
        if self.lib_file is None:
            raise RuntimeError("Lib was not built before scaffolding!")

        kmer = int(self.read_length / 2)
        prefix = path.join(self.basedir, f'k{kmer}')

        # Prepare
        logger.log(2, "Constructing graph for SOAPdenovo-127.")
        shell_call(soap_fusion,
                   D=True,
                   s=self.lib_file,
                   p=self.threads,
                   K=kmer,
                   g=prefix,
                   c=self.contigs)

        # Map
        logger.log(2, "Mapping sequences.")
        shell_call(soap_127, 'map', s=self.lib_file, p=self.threads, g=prefix)

        # Scaff
        logger.log(2, "Scaffolding.")
        shell_call(soap_127, 'scaff', p=self.threads, g=prefix)

        # Convert
        logger.log(2, "Converting output scaffolds back.")
        scaf2mega(prefix + '.scafSeq',
                  path.join(path.dirname(self.contigs), 'scaf.fa'),
                  overlay=kmer)
        return path.join(path.dirname(self.contigs), 'scaf.fa')
Example #14
def nhmmer_search(fasta_file=None, thread_number=None, nhmmer_profile=None,
                  prefix=None, basedir=None):

    logger.log(1, 'Calling nhmmer.')

    # Call nhmmer
    hmm_out = os.path.join(basedir, f'{prefix}.nhmmer.out')
    hmm_tbl = os.path.join(basedir, f'{prefix}.nhmmer.tblout')
    logger.log(1, f'Out file : o={hmm_out}, tbl={hmm_tbl}')
    shell_call('nhmmer', o=hmm_out, tblout=hmm_tbl,
               cpu=thread_number, appending=[nhmmer_profile, fasta_file])

    # Process data to pandas readable table
    hmm_tbl_pd = f'{hmm_tbl}.readable'
    with open(hmm_tbl, 'r') as fin, open(hmm_tbl_pd, 'w') as fout:
        for line in fin:
            fields = line.strip().split()
            # Drop the free-text gene description nhmmer appends after
            # the 15 fixed columns.
            print(' '.join(fields[:15]), file=fout)

    # Read table with pandas
    hmm_frame = pandas.read_csv(hmm_tbl_pd, comment='#', delimiter=' ',
                                names=[
                                    'target', 'accession1', 'query',
                                    'accession2', 'hmmfrom', 'hmmto',
                                    'alifrom', 'alito', 'envfrom', 'envto',
                                    'sqlen', 'strand', 'e', 'score',
                                    'bias'
                                ])
    hmm_frame = hmm_frame.drop(columns=['accession1', 'accession2'])

    # Deduplicate multiple hits on the same gene of same sequence
    hmm_frame = hmm_frame.drop_duplicates(
        subset=['target', 'query'], keep='first')
    hmm_frame.to_csv(f'{hmm_tbl}.dedup.csv', index=False)

    logger.log(1, f'HMM query returned {len(hmm_frame.index)} results.')
    return hmm_frame
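
drop_duplicates with keep='first' retains the first row per (target, query) pair, which in nhmmer's ranked tblout output should be the strongest hit. A tiny pandas illustration with made-up data:

import pandas

hits = pandas.DataFrame({
    'target': ['seq1', 'seq1', 'seq2'],
    'query':  ['COX1', 'COX1', 'COX1'],
    'score':  [250.0, 80.0, 120.0],  # made-up scores, best first
})
# Keeps only the first (highest-ranked) row for each target/query pair.
print(hits.drop_duplicates(subset=['target', 'query'], keep='first'))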
Example #15
def filter_se(fqiabs=None, fqoabs=None, Ns=10, quality=55, limit=0.2, start=None, end=None, trim=0, trunc=False):
    fsin = path.getsize(fqiabs)
    logger.log(level=1, info='Start filtering single-end raw data.')
    logger.log(level=0, info=f'Input file has {fsin} bytes.')
    logger.log(level=1,
               info=f'Using arguments: Ns={Ns}, quality={quality}, limit={limit}, start={start}, end={end}, trimming={trim}, trunc={trunc}')
    try:
        shell_call(path.join(filter_dir, 'filter_v2'), cleanq1=f'"{fqoabs}"', fastq1=f'"{fqiabs}"',
                   n=Ns, q=quality, l=limit, s=start, e=end, t=trim, truncate_only=trunc)
    except Exception as identifier:
        logger.log(
            level=4, info=f'Error occurred when running filter, cause: {identifier}')
        logger.log(level=1, info=f'Input file: {fqiabs}')
        logger.log(level=1, info=f'Output file: {fqoabs}')

        sys.exit("Error occurred when running filter!")

    fsot = path.getsize(fqoabs)
    logger.log(level=0, info=f'Output file has {fsot} bytes.')
    logger.log(level=0,
               info=f'Filtered {fsin - fsot} bytes, kept {100 * fsot / fsin:.2f}%.')

    return fqoabs
Example #16
    def filter(self,
               kmer=None,
               min_depth=3,
               min_length=0,
               max_length=20000,
               force_filter=False,
               deny_number=a_conf.filter_keep) -> Tuple[int, int, int]:
        logger.log(2, f'Filtering output contig files of k = {kmer}')

        results = [0, 0, 0]
        if not a_conf.no_filter or force_filter:
            for idx, suffix in enumerate(
                ['.contigs.fa', '.addi.fa', '.bubble_seq.fa']):
                if path.exists(self._contig_prefix(kmer) + suffix):
                    results[idx] = int(
                        shell_call(self.FAST_FILTER,
                                   i=self._contig_prefix(kmer) + suffix,
                                   o=self._contig_prefix(kmer) + '.filtered' +
                                   suffix,
                                   l=f"{min_length},{max_length}",
                                   d=min_depth))

                    # If the depth cutoff left too few contigs, refilter
                    # with m=deny_number instead of the depth threshold.
                    if idx == 0 and results[idx] <= deny_number:
                        results[idx] = int(
                            shell_call(self.FAST_FILTER,
                                       i=self._contig_prefix(kmer) + suffix,
                                       o=self._contig_prefix(kmer) +
                                       '.filtered' + suffix,
                                       l=f"{min_length},{max_length}",
                                       m=deny_number))

                    shell_call(
                        'mv',
                        self._contig_prefix(kmer) + '.filtered' + suffix,
                        self._contig_prefix(kmer) + suffix)

        return tuple(results)
Example #17
def merge_sequences(fasta_file=None, overlapped_len=50, search_range=5, threads=8, index=0):
    # Merge sequences that may overlap each other.

    logger.log(1, "Trying to merge candidates that may overlap.")

    fasta_file = path.abspath(fasta_file)
    # Merging needs at least two candidate sequences.
    if len(list(SeqIO.parse(fasta_file, 'fasta'))) < 2:
        logger.log(1, "No sequences needed merging.")
        return 0

    while True:
        blast_results = tk.blastn_multi(fasta_file, fasta_file, path.dirname(fasta_file), 'merge', threads=threads)

        # Overlap conditions:
        # 1. Not aligning to itself
        # 2. One of the sequences can be attached to the other within a short range
        # 3. The aligned length is long enough
        # 4. After merging, the result is longer and not too much sequence is discarded.
        logger.log(1, "Washing blast results.")
        libfastmathcal.wash_merge_blast(blast_results, fasta_file, search_range, overlapped_len, a_conf.max_length)

        logger.log(1, "Sorting outputs.")
        shell_call('sort -n -k12,12 -k3,3', appending=[blast_results + ".filtered", ">", blast_results])

        logger.log(1, "Merging sequences.")
        new_index = libfastmathcal.merge_overlaps(blast_results, fasta_file, fasta_file + '.merged', index)
        os.rename(fasta_file + '.merged', fasta_file)

        logger.log(1, f"Merged {new_index - index} sequences")
        if index == new_index:
            break
        index = new_index
    os.remove(blast_results)
    os.remove(blast_results + ".filtered")

    return index
Example #18
def visualize(fasta_file=None,
              fastq1=None,
              fastq2=None,
              pos_json=None,
              prefix=None,
              basedir=None,
              threads=8,
              circular=False):
    logger.log(2, 'Entering visualize module.')
    # Validate the paths
    fasta_file = path.abspath(fasta_file)
    fastq1 = path.abspath(fastq1)
    if fastq2 is not None:
        fastq2 = path.abspath(fastq2)
    basedir = path.abspath(basedir)
    pos_json = path.abspath(pos_json)

    fa_copy = path.join(basedir, f'{prefix}.fasta')
    list_conv = []
    counter = 1

    # Rename sequences to an easier form (mt1, mt2, ...)
    index_list = {}
    for seq in SeqIO.parse(fasta_file, 'fasta'):
        index_list[seq.id] = f'mt{counter}'
        seq.id_old = seq.id  # keep the original id for the karyotype file
        seq.id = f'mt{counter}'
        seq.description = ''
        list_conv.append(seq)
        counter += 1
    SeqIO.write(list_conv, fa_copy, 'fasta')

    with open(pos_json, 'r') as f:
        poses = json.load(f)

    # Gene name files
    logger.log(1, 'Generating gene name and feature files.')
    gene_name_file = path.join(basedir, f'{prefix}.gene.txt')
    with open(gene_name_file, 'w') as gn_f:
        for key, value in poses.items():
            # value unpacks to (start, end, gene type, sequence id, strand).
            start, end, gene_type, seq_name, _ = value
            seq_conv = index_list[seq_name]
            print(seq_conv,
                  start,
                  end,
                  key.split('_')[0] if '_' in key else key,
                  sep='\t',
                  file=gn_f)

    # Gene feature files
    gene_feature_file = path.join(basedir, f'{prefix}.features.txt')
    with open(gene_feature_file, 'w') as gf_f:
        for key, value in poses.items():
            start, end, gene_type, seq_name, strand = value
            on_plus = strand == '+'
            r0 = 0.965 if on_plus else 1
            r1 = 1 if on_plus else 1.035
            seq_conv = index_list[seq_name]
            print(seq_conv,
                  start,
                  start,
                  f'fill_color=black,r0={r0}r,r1={r1}r',
                  file=gf_f,
                  sep='\t')
            print(
                seq_conv,
                start,
                end,
                f'fill_color={circos_config.fill_colors[int(gene_type)]},r0={r0}r,r1={r1}r',
                file=gf_f,
                sep='\t')
            print(seq_conv,
                  end,
                  end,
                  f'fill_color=black,r0={r0}r,r1={r1}r',
                  file=gf_f,
                  sep='\t')

    logger.log(1, 'Generating depth files.')
    # Using check_output directly here instead of the shell_call wrapper,
    # since its decoding of the output is not wanted.
    from subprocess import check_output

    shell_call('bwa index', fa_copy)
    bam_file = path.join(basedir, f'{prefix}.bam')

    mem_count = max(int(threads * 0.8), 1)
    view_count = max(threads - mem_count, 1)

    check_output(
        f'bwa mem -t {mem_count} {fa_copy} {fastq1} '
        f'{fastq2 if fastq2 is not None else ""} '
        f'| samtools view -bS -@ {view_count} -q 30 -h -o {bam_file} -',
        shell=True)
    bam_sorted_file = path.join(basedir, f'{prefix}.sorted.bam')
    check_output(f'samtools sort -@ {threads} -o {bam_sorted_file} {bam_file}',
                 shell=True)
    gene_depth_file = path.join(basedir, f'{prefix}.dep')
    check_output(f'samtools depth -aa {bam_sorted_file} > {gene_depth_file}',
                 shell=True)

    # Calculate the things
    circos_depth_file = path.join(basedir, f'{prefix}.depth.txt')
    max_gene_depth = 0
    with open(gene_depth_file, 'r') as gdf, open(circos_depth_file,
                                                 'w') as cdf:
        for line in gdf:
            content = str(line).rstrip().split()
            print(' '.join([content[0], content[1], content[1], content[2]]),
                  file=cdf)
            if int(content[2]) > max_gene_depth:
                max_gene_depth = int(content[2])

    # GC content
    # Reusing list_conv here, as it is still in scope
    gc_content_file = path.join(basedir, f'{prefix}.gc.txt')
    with open(gc_content_file, 'w') as gc_f:
        for seq in list_conv:
            # Walk the sequence in 50 bp windows
            for s in range(0, len(seq), 50):
                seq_slice = seq[s:s + 50]
                gc_num = sum(x == 'G' or x == 'C' for x in seq_slice)
                gc_per = gc_num / len(seq_slice)
                print(seq.id, s, s + len(seq_slice), gc_per, file=gc_f)

    # Karyotype
    logger.log(1, 'Generating chr files.')
    karyotype_file = path.join(basedir, f'{prefix}.karyotype.txt')
    with open(karyotype_file, 'w') as ky_f:
        for seq in list_conv:
            chr_name = seq.id.replace('mt', 'chr')
            print(f'{chr_name} - {seq.id}\t{seq.id_old}\t0\t{len(seq)}\tgrey',
                  file=ky_f)

    # Plus generation
    logger.log(1, 'Generating plus.')
    plus_file = path.join(basedir, f'{prefix}.plus.txt')
    with open(plus_file, 'w') as p_f:
        print('mt1\t0\t300\t+\tr0=1r-150p,r1=1r-100p', file=p_f)

    # Fill the generated values into the Circos config
    logger.log(1, 'Generating circos config file.')
    generated_config = circos_config.circos_conf
    generated_config.ideogram.spacing._break = "0.5r" if not circular else "0.01r"
    generated_config.image.dir = basedir
    generated_config.karyotype = karyotype_file
    generated_config.plots['plot', 0].file = gene_name_file
    generated_config.plots['plot', 1].file = plus_file
    generated_config.plots['plot', 2].file = gc_content_file
    with generated_config.plots['plot', 3] as depth_plot:
        depth_plot.file = circos_depth_file
        depth_plot.max = max_gene_depth
        depth_plot.rules[
            'rule', 0].condition = f'var(value) > {int(max_gene_depth*0.9)}'
        depth_plot.rules[
            'rule', 1].condition = f'var(value) < {int(max_gene_depth*0.1)}'

    generated_config.highlights['highlight', 0].file = gene_feature_file

    # Writing to final
    # An f-string formatted cfg might be cleaner, but this works fine.
    cfg_dict = circos.collapse(generated_config)
    cfg_file = path.join(basedir, 'circos.conf')
    with open(cfg_file, 'w') as cfg_f:
        cfg_f.write('<<include etc/colors_fonts_patterns.conf>>\n')
        cfg_f.write(circos.dict2circos(cfg_dict) + '\n')
        cfg_f.write('<<include etc/housekeeping.conf>>')

    logger.log(1, 'Running Circos.')
    try:
        check_output('circos', shell=True, cwd=basedir)
    except Exception:
        logger.log(4, "Running Circos failed; no graph was output!")

    return path.join(basedir, 'Circos.png'), path.join(basedir, 'Circos.svg')
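
The GC track loop computes one GC fraction per 50 bp window; a small standalone sketch of the same windowing over a made-up sequence:

seq = 'ATGCGC' * 40  # hypothetical 240 bp sequence

for s in range(0, len(seq), 50):
    window = seq[s:s + 50]
    gc_per = sum(base in 'GC' for base in window) / len(window)
    # Circos data line: chromosome, start, end, value
    print('mt1', s, s + len(window), gc_per)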