예제 #1
0
 def prepare_nfasta_for_indexing(input_file: str,
                                 output_dir: str,
                                 preserve_headers: bool = False,
                                 chop: bool = False,
                                 chunk_length: int = int(3.6 * 10**9)):
     """
     Write the FASTA file(s) and the annotation table used for indexing.

     The input file is parsed into a FASTAArray and its annotation is dumped
     to "<output_dir><input name>_annotation.tsv". The sequences are then
     written either to a single "<mask>.fasta" file or, when `chop` is set
     and the total length reaches `chunk_length`, to several
     "<mask>_<key>.fasta" files produced by `_chop_sequences()`.

     :param input_file: path of the source nFASTA file
     :param output_dir: directory for the generated files (created if missing)
     :param preserve_headers: when False, `_fix_headers()` is applied first
     :param chop: allow splitting the array into chunks
     :param chunk_length: total-length threshold, also passed to
         `_chop_sequences()`
     :return: dict mapping "sequence_<N>" (N counted from 1) to a dict with
         the keys "reference_nfasta" and "annotation"
     """
     fasta_array = FASTAArray.parse(Utilities.load_string(input_file))
     if not preserve_headers:
         fasta_array._fix_headers()

     output_dir = Utilities.ends_with_slash(output_dir)
     os.makedirs(output_dir, exist_ok=True)
     file_mask = output_dir + Utilities.filename_only(input_file)

     # A single annotation table is shared by every chunk written below.
     annotation_file = "{}_annotation.tsv".format(file_mask)
     fasta_array.dump_annotation(annotation_file)

     if chop and fasta_array.get_total_length() >= chunk_length:
         print("Too large reference nFASTA file: '{}'. Splitting sequences".
               format(input_file))
         chunks = fasta_array._chop_sequences(chunk_length)
         files_to_arrays = {
             "{a}_{i}.fasta".format(a=file_mask, i=key): chunks[key]
             for key in chunks
         }
     else:
         files_to_arrays = {"{}.fasta".format(file_mask): fasta_array}

     refdatas_dict = {}
     for number, (chunk_file, chunk_array) in enumerate(
             files_to_arrays.items(), start=1):
         chunk_array.dump_fastas(chunk_file)
         refdatas_dict["sequence_{}".format(number)] = {
             "reference_nfasta": chunk_file,
             "annotation": annotation_file
         }
     # Each iteration adds exactly one unique key, so the dict size equals
     # the number of FASTA files written.
     print("FASTA files created: {}".format(len(refdatas_dict)))
     return refdatas_dict
예제 #2
0
 def __init__(self, parsed_dictionary: dict):
     """
     Populate the instance from a parsed reference-data dictionary.

     :param parsed_dictionary: dict with the required keys
         "reference_nfasta", "ebwt_mask", "bt2_mask", "fai", "genome" and
         "annotation"; the optional "alias" key supplies the database name
     :raises KeyError: if a required key is missing
     """
     self._nfasta = parsed_dictionary["reference_nfasta"]
     # A missing or empty alias falls back to the nFASTA file name.
     self.db_name = (parsed_dictionary.get("alias")
                     or Utilities.filename_only(self._nfasta))
     nfasta_dir = os.path.dirname(os.path.realpath(self._nfasta))
     self._reference_mask = Utilities.ends_with_slash(nfasta_dir) + self.db_name
     # Locations of the index and annotation files, taken as given.
     self.bowtie_index_mask = parsed_dictionary["ebwt_mask"]
     self.bowtie2_index_mask = parsed_dictionary["bt2_mask"]
     self.samtools_index_file = parsed_dictionary["fai"]
     self.bedtools_genome_file = parsed_dictionary["genome"]
     self.annotation_file = parsed_dictionary["annotation"]
예제 #3
0
    def compile(input_file: str,
                output_dir: str,
                preserve_headers: bool = False,
                chop: bool = False,
                chunk_length: int = int(3.6 * 10**9)):
        """
        Prepare the FASTA chunks, index each one and write a JSON linker file.

        The chunks come from `FASTAArray.prepare_nfasta_for_indexing()`.
        Entries without a "reference_nfasta" value are skipped. For each
        remaining entry a RefDataLine is built from the merged dictionary
        and its `index()` method is called.

        :param input_file: path of the source nFASTA file
        :param output_dir: directory for all generated files (created if
            missing)
        :param preserve_headers: forwarded to `prepare_nfasta_for_indexing()`
        :param chop: forwarded to `prepare_nfasta_for_indexing()`
        :param chunk_length: forwarded to `prepare_nfasta_for_indexing()`
        :return: path of the written "<input name>_refdata.json" file
        """
        import json
        from modules.FASTAArray import FASTAArray
        from modules.RefDataLine import RefDataLine

        output_dir = Utilities.ends_with_slash(output_dir)
        os.makedirs(output_dir, exist_ok=True)
        prepared = FASTAArray.prepare_nfasta_for_indexing(
            input_file=input_file,
            output_dir=output_dir,
            preserve_headers=preserve_headers,
            chop=chop,
            chunk_length=chunk_length)

        linker = {}
        for sequence_id, chunk_info in prepared.items():
            nfasta_file = chunk_info.get("reference_nfasta")
            if not nfasta_file:
                continue
            # Merge order matters: values from the prepared chunk (e.g.
            # "annotation") override the defaults derived by fill_dict().
            entry = {"alias": Utilities.filename_only(nfasta_file)}
            entry.update(RefDataLine.fill_dict(nfasta_file))
            entry.update(chunk_info)
            print("Processing nFASTA: '{}'".format(nfasta_file))
            RefDataLine(entry).index()
            linker[sequence_id] = entry

        output_file = "{a}{b}_refdata.json".format(
            a=Utilities.ends_with_slash(output_dir),
            b=Utilities.filename_only(input_file))
        serialized = json.dumps(linker, sort_keys=False, indent=4) + "\n"
        Utilities.dump_string(string=serialized, file=output_file)
        print("Created reference data linker: '{}'".format(output_file))
        return output_file
예제 #4
0
 def fill_dict(nfasta_file: str):
     """
     Derive the index and annotation file names that accompany an nFASTA file.

     Every value is the common prefix (the resolved directory of
     `nfasta_file` plus its name as returned by `Utilities.filename_only`)
     followed by a fixed suffix.

     :param nfasta_file: path of the nFASTA file
     :return: dict with the keys "ebwt_mask", "bt2_mask", "fai", "genome"
         and "annotation"
     """
     parent_dir = os.path.dirname(os.path.realpath(nfasta_file))
     prefix = (Utilities.ends_with_slash(parent_dir)
               + Utilities.filename_only(nfasta_file))
     templates = (
         ("ebwt_mask", "{}_colorspace"),
         ("bt2_mask", "{}_bowtie2"),
         ("fai", "{}_samtools.fai"),
         ("genome", "{}_samtools.genome"),
         ("annotation", "{}_annotation.tsv"),
     )
     return {key: template.format(prefix) for key, template in templates}