def prepare_nfasta_for_indexing(input_file: str, output_dir: str, preserve_headers: bool = False,
                                chop: bool = False, chunk_length: int = int(3.6 * 10**9)):
    """
    Parse a (multi-)nFASTA file, dump its per-sequence annotation table, and
    write the sequences back out as one or more FASTA files ready for indexing.

    :param input_file: Path to the source nFASTA file.
    :param output_dir: Directory that receives the FASTA chunk(s) and the
                       annotation TSV (created if missing).
    :param preserve_headers: If False, headers are normalized via the array's
                             internal `_fix_headers` before dumping.
    :param chop: If True and the total sequence length reaches `chunk_length`,
                 the array is split into several FASTA files.
    :param chunk_length: Split threshold/size in bases; the default of
                         3.6e9 presumably matches an indexer's input limit —
                         TODO confirm.
    :return: Dict of {"sequence_<n>": {"reference_nfasta": <fasta path>,
             "annotation": <annotation TSV path>}}. All chunks share the one
             annotation file produced from the full input.
    """
    array = FASTAArray.parse(Utilities.load_string(input_file))
    if not preserve_headers:
        array._fix_headers()
    output_dir = Utilities.ends_with_slash(output_dir)
    os.makedirs(output_dir, exist_ok=True)
    output_file_mask = output_dir + Utilities.filename_only(input_file)
    annotation_file = "{}_annotation.tsv".format(output_file_mask)
    array.dump_annotation(annotation_file)
    # Default: the whole array goes into a single FASTA file
    arrays_dict = {"{}.fasta".format(output_file_mask): array}
    if chop and array.get_total_length() >= chunk_length:
        print("Too large reference nFASTA file: '{}'. Splitting sequences".format(input_file))
        # `_chop_sequences` yields {chunk_id: sub-array}; remap ids to file names
        arrays_dict = {
            "{a}_{i}.fasta".format(a=output_file_mask, i=chunk_id): chunk_array
            for chunk_id, chunk_array in array._chop_sequences(chunk_length).items()
        }
    refdatas_dict = {}
    for counter, (chunk_file, chunk_array) in enumerate(arrays_dict.items(), start=1):
        chunk_array.dump_fastas(chunk_file)
        refdatas_dict["sequence_{}".format(counter)] = {
            "reference_nfasta": chunk_file,
            "annotation": annotation_file,
        }
    print("FASTA files created: {}".format(len(arrays_dict)))
    return refdatas_dict
def __init__(self, parsed_dictionary: dict):
    """
    Populate reference-data attributes from a single refdata entry.

    :param parsed_dictionary: Entry with the keys "reference_nfasta",
        "ebwt_mask", "bt2_mask", "fai", "genome", "annotation", and an
        optional "alias". A missing/empty alias falls back to the nFASTA
        file's base name.
    """
    self._nfasta = parsed_dictionary["reference_nfasta"]
    # Use the explicit alias when provided, otherwise derive it from the file name
    alias = parsed_dictionary.get("alias")
    self.db_name = alias if alias else Utilities.filename_only(self._nfasta)
    nfasta_dir = os.path.dirname(os.path.realpath(self._nfasta))
    self._reference_mask = Utilities.ends_with_slash(nfasta_dir) + self.db_name
    self.bowtie_index_mask = parsed_dictionary["ebwt_mask"]
    self.bowtie2_index_mask = parsed_dictionary["bt2_mask"]
    self.samtools_index_file = parsed_dictionary["fai"]
    self.bedtools_genome_file = parsed_dictionary["genome"]
    self.annotation_file = parsed_dictionary["annotation"]
def compile(input_file: str, output_dir: str, preserve_headers: bool = False,
            chop: bool = False, chunk_length: int = int(3.6 * 10**9)):
    """
    Prepare an nFASTA file for alignment and build all per-chunk indexes.

    Splits/dumps the input via `FASTAArray.prepare_nfasta_for_indexing`, runs
    `RefDataLine(...).index()` on each resulting FASTA, and writes a JSON
    "refdata" linker describing every produced chunk.

    :param input_file: Path to the source nFASTA file.
    :param output_dir: Target directory (created if missing).
    :param preserve_headers: Forwarded to the preparation step.
    :param chop: Forwarded to the preparation step.
    :param chunk_length: Forwarded to the preparation step.
    :return: Path of the created "<input basename>_refdata.json" file.
    """
    import json
    from modules.FASTAArray import FASTAArray
    from modules.RefDataLine import RefDataLine
    output_dir = Utilities.ends_with_slash(output_dir)
    os.makedirs(output_dir, exist_ok=True)
    refdatas_dict = FASTAArray.prepare_nfasta_for_indexing(
        input_file=input_file, output_dir=output_dir, preserve_headers=preserve_headers,
        chop=chop, chunk_length=chunk_length)
    output_dict = {}
    for sequence_id, annotation_dict in refdatas_dict.items():
        nfasta_file = annotation_dict.get("reference_nfasta")
        if not nfasta_file:
            # Entry without a FASTA path cannot be indexed — skip it
            continue
        indexing_dict = {"alias": Utilities.filename_only(nfasta_file)}
        indexing_dict.update(RefDataLine.fill_dict(nfasta_file))
        # The prepare step's values (e.g. the shared annotation TSV) override
        # the per-chunk defaults from fill_dict
        indexing_dict.update(annotation_dict)
        print("Processing nFASTA: '{}'".format(nfasta_file))
        RefDataLine(indexing_dict).index()
        output_dict[sequence_id] = indexing_dict
    output_file = "{a}{b}_refdata.json".format(
        a=Utilities.ends_with_slash(output_dir),
        b=Utilities.filename_only(input_file))
    Utilities.dump_string(
        string=json.dumps(output_dict, sort_keys=False, indent=4) + "\n",
        file=output_file)
    print("Created reference data linker: '{}'".format(output_file))
    return output_file
def fill_dict(nfasta_file: str):
    """
    Build the default index/annotation path set for an nFASTA file.

    All paths are derived from the file's absolute directory plus its base
    name (the "mask"), with one fixed suffix per tool.

    :param nfasta_file: Path to the nFASTA file.
    :return: Dict with keys "ebwt_mask", "bt2_mask", "fai", "genome",
             "annotation" mapping to the corresponding file paths/masks.
    """
    nfasta_dir = os.path.dirname(os.path.realpath(nfasta_file))
    mask = Utilities.ends_with_slash(nfasta_dir) + Utilities.filename_only(nfasta_file)
    suffixes = {
        "ebwt_mask": "_colorspace",
        "bt2_mask": "_bowtie2",
        "fai": "_samtools.fai",
        "genome": "_samtools.genome",
        "annotation": "_annotation.tsv",
    }
    return {key: mask + suffix for key, suffix in suffixes.items()}