def __init__(self):
    """Collect CLI arguments and derive the working-directory layout.

    The input file is expected to sit inside the mapped-reads directory;
    its parent directory is used as the pipeline output root.
    """
    args = self._parse_args()
    self.input_file_name = args.input
    self.refdata_file_name = args.refdata
    self.chunk_id = args.chunk
    input_dir = os.path.dirname(os.path.abspath(self.input_file_name))
    self.mapped_reads_directory = Utilities.ends_with_slash(input_dir)
    # Parent of the input file's directory is the output root.
    parent_dir = "/".join(input_dir.split("/")[:-1])
    self._output_directory = Utilities.ends_with_slash(parent_dir)
    self.logs_directory = "{}Logs/".format(self._output_directory)
    self.statistics_directory = "{}Statistics/".format(self._output_directory)
    self._create_dirs()
def prepare_nfasta_for_indexing(input_file: str,
                                output_dir: str,
                                preserve_headers: bool = False,
                                chop: bool = False,
                                chunk_length: int = int(3.6 * 10**9)):
    """Dump annotation and FASTA chunk(s) for an nFASTA file.

    Parses the input, optionally normalizes headers, writes an annotation
    TSV, and — when `chop` is set and the total sequence length reaches
    `chunk_length` — splits the sequences into multiple FASTA files.
    Returns a dict mapping "sequence_<i>" to per-chunk file descriptors.
    """
    array = FASTAArray.parse(Utilities.load_string(input_file))
    if not preserve_headers:
        array._fix_headers()
    output_dir = Utilities.ends_with_slash(output_dir)
    os.makedirs(output_dir, exist_ok=True)
    output_file_mask = output_dir + Utilities.filename_only(input_file)
    annotation_file = "{}_annotation.tsv".format(output_file_mask)
    array.dump_annotation(annotation_file)
    if chop and array.get_total_length() >= chunk_length:
        print("Too large reference nFASTA file: '{}'. Splitting sequences".
              format(input_file))
        chopped = array._chop_sequences(chunk_length)
        arrays_dict = {
            "{a}_{i}.fasta".format(a=output_file_mask, i=key): chopped[key]
            for key in chopped
        }
    else:
        arrays_dict = {"{}.fasta".format(output_file_mask): array}
    refdatas_dict = {}
    counter = 0
    for counter, (chunk_file, chunk_array) in enumerate(
            arrays_dict.items(), start=1):
        chunk_array.dump_fastas(chunk_file)
        refdatas_dict["sequence_{}".format(counter)] = {
            "reference_nfasta": chunk_file,
            "annotation": annotation_file,
        }
    print("FASTA files created: {}".format(counter))
    return refdatas_dict
def fill_dict(nfasta_file: str):
    """Return the standard index/annotation file paths for an nFASTA file.

    All paths share one mask: <real dir of nfasta_file>/<bare file name>.
    """
    real_dir = os.path.dirname(os.path.realpath(nfasta_file))
    mask = Utilities.ends_with_slash(real_dir) + Utilities.filename_only(nfasta_file)
    return {
        "ebwt_mask": "{}_colorspace".format(mask),
        "bt2_mask": "{}_bowtie2".format(mask),
        "fai": "{}_samtools.fai".format(mask),
        "genome": "{}_samtools.genome".format(mask),
        "annotation": "{}_annotation.tsv".format(mask),
    }
def __init__(self):
    """Read CLI options and ensure the output directory exists."""
    ns = self.parse_args()
    self._namespace = ns
    self.input_nfasta = ns.input
    self.preserve_headers_bool = ns.preserve_headers
    self.not_large_index_bool = ns.not_large_index
    # CLI size is scaled by 10**9 — presumably given in Gbp; confirm with the parser help.
    self.chunk_length = int(ns.size * 10**9)
    self.output_dir = Utilities.ends_with_slash(ns.output)
    os.makedirs(self.output_dir, exist_ok=True)
def __init__(self):
    """Read CLI options; fall back to a timestamped default output path."""
    ns = self._parse_args()
    self._namespace = ns
    self.sampledata = ns.input
    # +1: line count of the genome file plus (presumably) a header — TODO confirm.
    self.target_length = CoveragesVerifier.get_wc_l(ns.genome) + 1
    self.prefix = ns.prefix
    self.suffix = ns.suffix
    self.debugging_bool = ns.debug
    self.output = ns.output
    if not self.output:
        # Default: <prefix dir>/sampledata/<timestamp>.sampledata
        self.output = "{}sampledata/{}.sampledata".format(
            Utilities.ends_with_slash(os.path.dirname(self.prefix)),
            Utilities.get_time())
def __init__(self, parsed_dictionary: dict):
    """Wrap one parsed refdata entry; derive the alias when it is absent.

    `parsed_dictionary` must carry the keys "reference_nfasta", "ebwt_mask",
    "bt2_mask", "fai", "genome" and "annotation"; "alias" is optional.
    """
    self._nfasta = parsed_dictionary["reference_nfasta"]
    alias = parsed_dictionary.get("alias")
    # Fall back to the bare nFASTA file name when no alias was provided.
    self.db_name = alias if alias else Utilities.filename_only(self._nfasta)
    self._reference_mask = (
        Utilities.ends_with_slash(os.path.dirname(os.path.realpath(self._nfasta)))
        + self.db_name)
    self.bowtie_index_mask = parsed_dictionary["ebwt_mask"]
    self.bowtie2_index_mask = parsed_dictionary["bt2_mask"]
    self.samtools_index_file = parsed_dictionary["fai"]
    self.bedtools_genome_file = parsed_dictionary["genome"]
    self.annotation_file = parsed_dictionary["annotation"]
def compile(input_file: str,
            output_dir: str,
            preserve_headers: bool = False,
            chop: bool = False,
            chunk_length: int = int(3.6 * 10**9)):
    """Prepare nFASTA chunks, index each one and dump a JSON refdata linker.

    Returns the path of the created "*_refdata.json" file.
    """
    import json
    from modules.FASTAArray import FASTAArray
    from modules.RefDataLine import RefDataLine
    output_dir = Utilities.ends_with_slash(output_dir)
    os.makedirs(output_dir, exist_ok=True)
    refdatas_dict = FASTAArray.prepare_nfasta_for_indexing(
        input_file=input_file,
        output_dir=output_dir,
        preserve_headers=preserve_headers,
        chop=chop,
        chunk_length=chunk_length)
    output_dict = {}
    for sequence_id, annotation_dict in refdatas_dict.items():
        nfasta_file = annotation_dict.get("reference_nfasta")
        if not nfasta_file:
            # Skip malformed entries without a reference FASTA path.
            continue
        indexing_dict = {"alias": Utilities.filename_only(nfasta_file)}
        indexing_dict.update(RefDataLine.fill_dict(nfasta_file))
        indexing_dict.update(annotation_dict)
        print("Processing nFASTA: '{}'".format(nfasta_file))
        RefDataLine(indexing_dict).index()
        output_dict[sequence_id] = indexing_dict
    output_file = "{a}{b}_refdata.json".format(
        a=Utilities.ends_with_slash(output_dir),
        b=Utilities.filename_only(input_file))
    Utilities.dump_string(
        string=json.dumps(output_dict, sort_keys=False, indent=4) + "\n",
        file=output_file)
    print("Created reference data linker: '{}'".format(output_file))
    return output_file
def __init__(self):
    """Parse CLI arguments and create the output and log directories."""
    namespace = self._parse_args()
    self.sampledata_file_name = namespace.input
    self.refdata_file_name = namespace.refdata
    self.input_mask = namespace.mask  # *_output_mask are attributes of RefDataLine class
    self.threads_number = self._parse_threads_number(namespace.threads)
    self.no_coverage_bool = namespace.no_coverage
    self.output_dir = Utilities.ends_with_slash(namespace.output)
    self.logs_directory = "{}Logs/".format(self.output_dir)
    # Fixed: side-effect-only list comprehension replaced with a plain loop
    # (the comprehension built and discarded a list of None).
    for directory in (self.output_dir, self.logs_directory):
        os.makedirs(directory, exist_ok=True)
def __init__(self, sampledata: SampleDataLine, refdata: RefDataLine,
             input_mask: str, output_dir: str):
    """Precompute every output path for one (sample, reference) pairing.

    Creates the Unmapped_reads/Mapped_reads/Statistics/Logs subdirectories
    under `output_dir` and derives all per-sample file names from the
    sample name, the input mask and the reference database name.
    """
    # Output directories
    root = Utilities.ends_with_slash(output_dir)
    unmapped_dir = root + "Unmapped_reads/"
    mapped_dir = root + "Mapped_reads/"
    stats_dir = root + "Statistics/"
    logs_dir = root + "Logs/"
    for directory in (unmapped_dir, mapped_dir, stats_dir, logs_dir):
        os.makedirs(directory, exist_ok=True)
    # Reference data
    self.bowtie_index_mask = refdata.bowtie_index_mask
    self.bowtie2_index_mask = refdata.bowtie2_index_mask
    self.samtools_index_file = refdata.samtools_index_file
    self.bedtools_genome_file = refdata.bedtools_genome_file
    self.annotation_file = refdata.annotation_file
    # Output masks; strip("_") drops the dangling underscore left by an
    # empty input_mask or db_name component.
    mapped_mask = "{}_{}".format(input_mask, refdata.db_name).strip("_")
    unmapped_mask = "{}_no_{}".format(input_mask, refdata.db_name).strip("_")
    # Sample data
    sample_name = sampledata.name
    self.raw_reads_files_list = sampledata.raw_reads_files_list
    self.raw_reads_file_extension = self.raw_reads_files_list[0].split(".")[-1]
    unmapped_base = "{}{}_{}".format(unmapped_dir, sample_name, unmapped_mask)
    self.unmapped_reads_file_name = "{}.{}".format(
        unmapped_base, self.raw_reads_file_extension)
    self.pairwise_unmapped_reads_files_list = [
        "{}.{}.{}".format(unmapped_base, mate, self.raw_reads_file_extension)
        for mate in (1, 2)
    ]
    mapped_base = "{}{}_{}".format(mapped_dir, sample_name, mapped_mask)
    self.mapped_reads_file_name = mapped_base + ".sam"
    self.samtools_converted_file_name = mapped_base + ".bam"
    self.samtools_sorted_file_name = mapped_base + "_sorted.bam"
    self.samtools_index_file_name = self.samtools_sorted_file_name + ".bai"
    stats_base = "{}{}_{}".format(stats_dir, sample_name, mapped_mask)
    self.samtools_idxstats_file_name = stats_base + "_idxstats.txt"
    self.samtools_stats_file_name = stats_base + "_sam_stats.txt"
    self.bedtools_histogram_file_name = stats_base + "_genomeCoverageBed.txt"
    self.stacked_coverage_file_name = stats_base + "_pos_bp.txt"
    self.final_coverage_file_name = stats_base + "_coverage.tsv"
    logs_base = "{}{}_{}".format(logs_dir, sample_name, mapped_mask)
    self.aligner_log_file_name = logs_base + "_aligner.log"
    self.samtools_converted_log_file_name = logs_base + "_samtools_sort.log"
    self.samtools_index_log_file_name = logs_base + "_samtools_index.log"
    self.samtools_idxstats_log_file_name = logs_base + "_samtools_idxstats.log"
    self.samtools_stats_log_file_name = logs_base + "_samtools_stats.log"
    self.genomeCoverageBed_log_file_name = logs_base + "_genomeCoverageBed.log"
def __init__(self, refdata: RefDataLine):
    """Precompute coverage-extraction paths for the input BAM against one reference.

    Paths are rooted at the directory of `mainInitializer.input_file_name`
    (module-level initializer).

    NOTE(review): the original body contained a fragment copy-pasted from the
    read-mapping handler that referenced undefined names ('sampledata',
    'unmapped_reads_directory', 'mapped_output_mask') and therefore always
    raised NameError; that dead fragment has been removed.
    """
    # Output directories
    output_dir = Utilities.ends_with_slash(
        os.path.dirname(os.path.realpath(mainInitializer.input_file_name)))
    mapped_reads_directory = "{}Mapped_reads/".format(output_dir)
    statistics_directory = "{}Statistics/".format(output_dir)
    logs_directory = "{}Logs/".format(output_dir)
    for path in [
            mapped_reads_directory, statistics_directory, logs_directory
    ]:
        os.makedirs(path, exist_ok=True)
    # Reference data
    self.samtools_index_file = refdata.samtools_index_file
    self.bedtools_genome_file = refdata.bedtools_genome_file
    self.annotation_file = refdata.annotation_file
    # Output files.
    # NOTE(review): sample_name keeps the full input path (not the basename);
    # verify whether a basename was intended.
    if mainInitializer.input_file_name.endswith("_sorted.bam"):
        # Escaped dot; the endswith() guard already ensures a literal match.
        sample_name = re.sub(r"_sorted\.bam$", "",
                             mainInitializer.input_file_name)
    else:
        sample_name = ".".join(
            mainInitializer.input_file_name.split(".")[:-1])
    mapped_reads_file_mask = "{a}{b}".format(a=mapped_reads_directory,
                                             b=sample_name)
    self.mapped_reads_file_name = "{}.sam".format(mapped_reads_file_mask)
    self.samtools_converted_file_name = "{}.bam".format(
        mapped_reads_file_mask)
    self.samtools_sorted_file_name = "{}_sorted.bam".format(
        mapped_reads_file_mask)
    self.samtools_index_file_name = "{}.bai".format(
        self.samtools_sorted_file_name)
    self._mapped_output_mask = refdata.db_name
    statistics_file_mask = "{a}{b}_{c}".format(a=statistics_directory,
                                               b=sample_name,
                                               c=self._mapped_output_mask)
    self.samtools_idxstats_file_name = "{}_idxstats.txt".format(
        statistics_file_mask)
    self.samtools_stats_file_name = "{}_sam_stats.txt".format(
        statistics_file_mask)
    self.bedtools_histogram_file_name = "{}_genomeCoverageBed.txt".format(
        statistics_file_mask)
    self.stacked_coverage_file_name = "{}_pos_bp.txt".format(
        statistics_file_mask)
    self.final_coverage_file_name = "{}_coverage.tsv".format(
        statistics_file_mask)
    logs_file_mask = "{a}{b}_{c}".format(a=logs_directory,
                                         b=sample_name,
                                         c=self._mapped_output_mask)
    self.aligner_log_file_name = "{}_aligner.log".format(logs_file_mask)
    self.samtools_converted_log_file_name = "{}_sam2bam.log".format(
        logs_file_mask)
    self.samtools_index_log_file_name = "{}_index_bam.log".format(
        logs_file_mask)