Example #1
 def prepare_nfasta_for_indexing(input_file: str,
                                 output_dir: str,
                                 preserve_headers: bool = False,
                                 chop: bool = False,
                                 chunk_length: int = int(3.6 * 10**9)):
     array = FASTAArray.parse(Utilities.load_string(input_file))
     if not preserve_headers:
         array._fix_headers()
     output_dir = Utilities.ends_with_slash(output_dir)
     os.makedirs(output_dir, exist_ok=True)
     output_file_mask = (output_dir + Utilities.filename_only(input_file))
     annotation_file = "{}_annotation.tsv".format(output_file_mask)
     array.dump_annotation(annotation_file)
     arrays_dict = {"{}.fasta".format(output_file_mask): array}
     if chop and array.get_total_length() >= chunk_length:
         print("Reference nFASTA file is too large: '{}'. Splitting sequences".
               format(input_file))
         arrays_dict = array._chop_sequences(chunk_length)
         arrays_dict = {
             "{a}_{i}.fasta".format(a=output_file_mask, i=i): arrays_dict[i]
             for i in arrays_dict
         }
     refdatas_dict = {}
     counter = 0
     for chunk_file in arrays_dict:
         counter += 1
         arrays_dict[chunk_file].dump_fastas(chunk_file)
         refdatas_dict["sequence_{}".format(counter)] = {
             "reference_nfasta": chunk_file,
             "annotation": annotation_file
         }
     print("FASTA files created: {}".format(counter))
     return refdatas_dict
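A minimal usage sketch: the static-method call form below matches the one in Example #26, while the input paths are hypothetical placeholders.

from modules.FASTAArray import FASTAArray

# Hypothetical paths; prepare_nfasta_for_indexing is the function shown above
refdatas_dict = FASTAArray.prepare_nfasta_for_indexing(
    input_file="/data/reference.fasta",
    output_dir="/data/index",
    chop=True)
# Expected shape: {"sequence_1": {"reference_nfasta": "...", "annotation": "..."}, ...}
for sequence_id, files in refdatas_dict.items():
    print(sequence_id, files["reference_nfasta"])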
Example #2
    def _bam2stats(self):
        def __get_base_alignment_stats(string: str):
            d = {}
            # SamTools stats file columns: ID, stat, value, comment
            for line_list in Utilities.string_to_2d_array(string):
                if len(line_list) < 3 or line_list[0] != "SN":
                    continue
                d[re.sub(":$", "", line_list[1])] = line_list[2]
            if len(d) == 0:
                logging.critical("Bad alignment: no SAMTools stats to extract!")
                return {}
            try:
                out = {"total_reads": d["raw total sequences"],
                       "mapped_reads": d["reads mapped"],
                       "total_bp": d["total length"],
                       "mapped_bp": d["bases mapped"]}
            except KeyError:
                return {}
            return {"sample_{}".format(k): int(out[k]) for k in out}

        Utilities.batch_remove(self._pk.samtools_stats_file_name, self._pk.samtools_stats_log_file_name)
        s = subprocess.getoutput("samtools stats {a} 2> {b}".format(a=self._pk.samtools_sorted_file_name, b=self._pk.samtools_stats_log_file_name))
        Utilities.dump_string(string=s, file=self._pk.samtools_stats_file_name)
        logging.info("Saved SAMTools total coverage statistics: '{}'".format(self._pk.samtools_stats_file_name))
        self._samtools_stats_dict = __get_base_alignment_stats(s)
        del s
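For reference, the SN rows consumed by __get_base_alignment_stats look like the invented sample below; this standalone sketch repeats the same extraction outside the class.

import re

# Invented `samtools stats` excerpt; real output contains many more SN rows
SAMPLE_STATS = ("SN\traw total sequences:\t1000000\n"
                "SN\treads mapped:\t950000\n"
                "SN\ttotal length:\t150000000\t# ignoring clipping\n"
                "SN\tbases mapped:\t142500000\t# ignoring clipping\n")

d = {}
for fields in (line.split("\t") for line in SAMPLE_STATS.splitlines()):
    if len(fields) < 3 or fields[0] != "SN":
        continue
    d[re.sub(":$", "", fields[1])] = fields[2]
print(d["raw total sequences"])  # '1000000'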
Example #3
 def __init__(self, path_keeper: PathsKeeper, threads_number: int):
     self._pk = path_keeper
     self._threads_number = threads_number
     Utilities.batch_remove(self._pk.mapped_reads_file_name,
                            self._pk.samtools_converted_file_name,
                            self._pk.samtools_sorted_file_name,
                            self._pk.unmapped_reads_file_name,
                            *self._pk.pairwise_unmapped_reads_files_list,
                            self._pk.aligner_log_file_name)
Example #4
 def run(self):
     subprocess.getoutput("rm -f {}*".format(self._pk.samtools_sorted_file_name))
     Utilities.batch_remove(self._pk.aligner_log_file_name)
     bwt_cmd_string = " ".join(self._get_cmd())
     pipeline = """{a} 2> {b} | \
     samtools view - -bu -@ {c} | \
     samtools sort - -@ {c} -o {d}""".format(a=bwt_cmd_string, b=self._pk.aligner_log_file_name, c=self._threads_number, d=self._pk.samtools_sorted_file_name)
     logging.debug("Started alignment pipeline with arguments: '{}'".format(pipeline))
     s = subprocess.getoutput(pipeline)
     logging.info("Completed alignment pipeline with arguments: '{a}' and output:\n{b}\n".format(a=pipeline, b=s))
Example #5
 def _bam2idxstats(self):
     Utilities.batch_remove(self._pk.samtools_idxstats_file_name, self._pk.samtools_idxstats_log_file_name)
     s = subprocess.getoutput("samtools idxstats {a} 2> {b}".format(a=self._pk.samtools_sorted_file_name, b=self._pk.samtools_idxstats_log_file_name))
     Utilities.dump_string(string=s, file=self._pk.samtools_idxstats_file_name)
     logging.info("Saved SAMTools mapped reads statistics: '{}'".format(self._pk.samtools_idxstats_file_name))
     self._samtools_idxstats_df = pd.DataFrame(Utilities.string_to_2d_array(s), columns=[self._index_column,
                                                                                         "id_bp",
                                                                                         "id_mapped_reads",
                                                                                         "id_unmapped_reads"])
     del s
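`samtools idxstats` prints one tab-separated row per reference (name, length in bp, mapped read segments, unmapped read segments) plus a trailing `*` row, which the column names above mirror; a sketch with invented values:

import pandas as pd

SAMPLE_IDXSTATS = ("seq_a\t1000\t120\t3\n"
                   "seq_b\t2000\t98\t1\n"
                   "*\t0\t0\t15")
rows = [line.split("\t") for line in SAMPLE_IDXSTATS.splitlines()]
df = pd.DataFrame(rows, columns=["reference_id", "id_bp",
                                 "id_mapped_reads", "id_unmapped_reads"])
print(df[df["reference_id"] != "*"])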
Example #6
 def __init__(self, parsed_dictionary: dict):
     self._nfasta = parsed_dictionary["reference_nfasta"]
     self.db_name = parsed_dictionary.get("alias")
     if not self.db_name:
         self.db_name = Utilities.filename_only(self._nfasta)
     self._reference_mask = Utilities.ends_with_slash(os.path.dirname(os.path.realpath(self._nfasta))) + self.db_name
     self.bowtie_index_mask = parsed_dictionary["ebwt_mask"]
     self.bowtie2_index_mask = parsed_dictionary["bt2_mask"]
     self.samtools_index_file = parsed_dictionary["fai"]
     self.bedtools_genome_file = parsed_dictionary["genome"]
     self.annotation_file = parsed_dictionary["annotation"]
Example #7
 def _sam2bam2sorted_bam(self):
     subprocess.getoutput("rm -f {}*".format(self._pk.samtools_sorted_file_name))
     Utilities.batch_remove(self._pk.samtools_converted_log_file_name)
      # SamTools details: http://www.htslib.org/doc/samtools.html
      # Piping `samtools view` into `samtools sort` avoids writing the intermediate
      # self._pk.samtools_converted_file_name
      s = subprocess.getoutput("samtools view -bu -@ 1 {a} | \
                                samtools sort - -@ 1 -o {b}".format(a=self._pk.mapped_reads_file_name,
                                                                    b=self._pk.samtools_sorted_file_name))
     Utilities.dump_string(string=s, file=self._pk.samtools_converted_log_file_name)
     logging.info("Sorted SAM file: '{}'".format(self._pk.samtools_sorted_file_name))
     del s
Example #8
 def _bam2histogram(self):
     Utilities.batch_remove(self._pk.bedtools_histogram_file_name, self._pk.genomeCoverageBed_log_file_name)
     s = subprocess.getoutput("genomeCoverageBed -ibam {a} 2> {b}".format(a=self._pk.samtools_sorted_file_name, b=self._pk.genomeCoverageBed_log_file_name))
     # GenomeCoverageBed details: https://bedtools.readthedocs.io/en/stable/content/tools/genomecov.html
     # Cannot be converted to DataFrame before stacking
     Utilities.dump_string(string=s, file=self._pk.bedtools_histogram_file_name)
     self._bedtools_histogram_2d_array = Utilities.string_to_2d_array(s)
     if len(self._bedtools_histogram_2d_array) == 0:
         logging.critical("Bad alignment: no BEDTools coverage histogram to save!")
     logging.info("Saved BEDTools coverage histogram data: '{}'".format(self._pk.bedtools_histogram_file_name))
     del s
Example #9
 def read(file: str):
     # A context manager closes the handle on the early returns as well
     with open(file=file, mode="r", encoding="utf-8") as wrapper:
         try:
             if file.endswith(".json"):
                 return RefDataArray._parse_json_refdata(wrapper)
             return RefDataArray._parse_table_refdata(wrapper)
         except ValueError:
             traceback.print_exc()
             Utilities.log_and_raise("Bad reference data file: {}".format(file))
Example #10
 def export(self):
     sampledatas_2d_array = [i for i in self._queue if i[0] in self._no_coverages_df["sample_name"].values.tolist()]
     if len(sampledatas_2d_array) == 0:
         print("All files have correct number of lines. No files to process")
     else:
         os.makedirs(os.path.dirname(mainInitializer.output), exist_ok=True)
         Utilities.dump_2d_array(sampledatas_2d_array, file=mainInitializer.output)
         print("Files to process: {}\nDumped sample data: '{}'".format(
             len(sampledatas_2d_array), mainInitializer.output))
     if mainInitializer.debugging_bool:
         debug_table = "{}_debug.tsv".format(mainInitializer.output)
         self._verified_df.to_csv(debug_table, sep='\t', header=True, index=False)
         print("Dumped debug table: '{}'".format(debug_table))
Example #11
    def ___fai2genome(self):
        """Process an existing FASTA index; depends on the 'samtools_faidx' function"""
        def ____parse_fai_line(split_line: list):
            if len(split_line) >= 2:
                return split_line[:2]
            print("Bad FAI file line: {}".format("\t".join(split_line)))

        fai_2d_array = Utilities.load_2d_array("{}_samtools.fai".format(self._reference_mask))
        genome_2d_array = []
        for line in fai_2d_array:
            genome_2d_array.append(____parse_fai_line(line))
        out = "{}_samtools.genome".format(self._reference_mask)
        Utilities.dump_2d_array(array=Utilities.remove_empty_values(genome_2d_array), file=out)
        print("Created BEDTools genome index: '{}'".format(out))
Example #12
 def __init__(self, single_sampledata_row):
     body_list = Utilities.remove_empty_values(
         [i.strip() for i in re.sub("[\r\n]", "", single_sampledata_row).split("\t")])
     if len(body_list) < 2:
         Utilities.log_and_raise(
             "Failed to parse sample data row (not enough columns): {}".format(single_sampledata_row))
     self.name = body_list[0]
     self.raw_reads_files_list = body_list[1:]
     if len(self.raw_reads_files_list) > 2:
         logging.warning("Only up to two input files are supported for alignment, using first two values. "
                         "Given sample data row: '{}'".format(single_sampledata_row))
         self.raw_reads_files_list = self.raw_reads_files_list[:2]
     for file in self.raw_reads_files_list:
         if not os.path.isfile(file):
             logging.warning("Raw reads file not found: '{}'".format(file))
Example #13
    def __init__(self):
        namespace = self._parse_args()
        self.input_file_name = namespace.input
        self.refdata_file_name = namespace.refdata
        self.chunk_id = namespace.chunk
        self.mapped_reads_directory = Utilities.ends_with_slash(
            os.path.dirname(os.path.abspath(self.input_file_name)))

        self._output_directory = Utilities.ends_with_slash(
            os.path.dirname(os.path.dirname(
                os.path.abspath(self.input_file_name))))
        self.logs_directory = "{}Logs/".format(self._output_directory)
        self.statistics_directory = "{}Statistics/".format(
            self._output_directory)
        self._create_dirs()
Example #14
    def set_root_dir(self, inputDirPath):
        """set the root directory"""

        if os.path.isdir(inputDirPath):
            self.rootDir = inputDirPath
            logging.debug("ScriptsBrowser::set_root_dir-> pathDirName: %s" %
                          self.rootDir)
            Utilities.save_json(
                self.rootDirSaveFile,
                self.rootDir)  # Saves root directory path to a json file
        else:
            logging.error(
                "ERROR << ScriptsBrowser::set_root_dir-> '%s' is not a valid path"
                % inputDirPath)
            raise NotADirectoryError(inputDirPath)
Example #15
 def parse(string: str):
     string = "\n" + re.sub("[\r\n]+", "\n", string).strip()
     q = [
         ">{}".format(j) for j in Utilities.remove_empty_values(
             [i.strip() for i in string.split("\n>")])
     ]
     return FASTAArray(sorted(set([FASTALine(i) for i in q]), reverse=True))
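The split on "\n>" keeps each record together with its own header; a tiny illustration of just that step, using an invented two-record FASTA:

import re

string = ">seq_a\nACGT\n>seq_b\nGGCC\n"  # invented input
string = "\n" + re.sub("[\r\n]+", "\n", string).strip()
records = [">{}".format(i.strip()) for i in string.split("\n>") if len(i.strip()) > 0]
print(records)  # ['>seq_a\nACGT', '>seq_b\nGGCC']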
Example #16
 def __init__(self, parsed_fastas_list: list):
     self._parsed_fastas_list = Utilities.remove_empty_values(
         parsed_fastas_list)
     self._parsed_fastas_list.sort(key=len, reverse=True)
     self._annotations_2d_array = [["reference_id", "id_bp"]]
     for fasta in self._parsed_fastas_list:
         self._annotations_2d_array.append([fasta.header, str(len(fasta))])
Example #17
    def __init__(self, ScriptsBrowserWidget, ScriptsBrowserInstance):
        logging.debug(
            "SettingsDialog::__init__-> initializing MainWindow class")
        super().__init__()
        self.setupUi(self)

        # Keep references to the parent widget and browser instance
        self.scriptsBrowserInstance = ScriptsBrowserInstance
        self.scriptsBrowserWidget = ScriptsBrowserWidget

        # Load json files
        # ----------------
        # load saved root directory
        try:
            rootDirSaveFile = self.scriptsBrowserInstance.rootDirSaveFile
            self.scriptsBrowserWidget.rootDirModel.directory = Utilities.load_json(
                rootDirSaveFile)
            self.rootDirectory_lineEdit.setText(
                self.scriptsBrowserWidget.rootDirModel.directory)
            self.entered_root_dir()
        except Exception:
            logging.error(
                "ERROR << SettingsDialog::__init__-> Utilities.load_json call failed"
            )

        # Get the path of the Nuke Indie executable from the ScriptsBrowser class
        self.NukeExe_lineEdit.setText(self.scriptsBrowserInstance.exePath)

        # Slot-Signal connections
        # -----------------------
        # self.NukeExe_lineEdit.selectionChanged.connect()
        self.ok_buttonBox.accepted.connect(self.enter_confirm_settings)
        self.ok_buttonBox.rejected.connect(self.close_settings_dialog_window)
Example #18
 def __init__(self, sampledata_file_name):
     if not os.path.isfile(sampledata_file_name):
         Utilities.log_and_raise(
             "Sample data linker file not found: {}".format(
                 sampledata_file_name))
     self._sampledatas_list = []
     with open(sampledata_file_name, "r", encoding="utf-8") as f:
         for r in f:
             r = r.strip()
             if len(r) > 0:
                 try:
                     self._sampledatas_list.append(SampleDataLine(r))
                 except ValueError:
                     continue
     self._sampledatas_list = Utilities.remove_empty_values(
         self._sampledatas_list)
Example #19
 def _reference2statistics(self):
     Utilities.batch_remove(self._pk.final_coverage_file_name)
     stats_dict = self._samtools_stats_dict
     if len(stats_dict) == 0:
         logging.critical("Bad alignment: empty SAMTools stats: '{}'".format(self._pk.samtools_stats_file_name))
         return
     if len(self._stacked_coverages_df) == 0:
         logging.critical("Bad alignment: empty stacked BEDTools coverage: '{}'".format(self._pk.stacked_coverage_file_name))
         return
     chunk_size = 10 ** 6
     reader = pd.read_table(self._pk.bedtools_genome_file, sep='\t', header="infer", names=[self._index_column, "id_bp"], chunksize=chunk_size)
     for chunk_number, reference_df in enumerate(reader):
         stacked_columns = [i for i in list(self._stacked_coverages_df)
                            if i not in list(reference_df)]
         genomes_coverages_df = reference_df.merge(
             self._stacked_coverages_df.loc[:, [self._index_column] + stacked_columns],
             on=self._index_column, how="left")
         genomes_coverages_df = genomes_coverages_df[~genomes_coverages_df[self._index_column].isin(["*", "genome"])]
         if self._non_zero_bool:
             genomes_coverages_df = genomes_coverages_df[genomes_coverages_df.id_coverage_breadth.notnull()]
         else:
             genomes_coverages_df = genomes_coverages_df.fillna(0)
         genomes_coverages_df["id_total_relative_abundance"] = (
             (10 ** 12) * genomes_coverages_df["id_mapped_bp"].astype(int)
             / (genomes_coverages_df["id_bp"].astype(int) * int(stats_dict["sample_total_bp"])))
         genomes_coverages_df["id_mapped_relative_abundance"] = (
             (10 ** 12) * genomes_coverages_df["id_mapped_bp"].astype(int)
             / (genomes_coverages_df["id_bp"].astype(int) * int(stats_dict["sample_mapped_bp"])))
         # MRA details: http://www.ibmc.msk.ru/content/thesisDocs/TyakhtAV_thesis.pdf (p.63)
         genomes_coverages_df["sample_total_reads"] = stats_dict["sample_total_reads"]
         genomes_coverages_df["sample_mapped_reads"] = stats_dict["sample_mapped_reads"]
         genomes_coverages_df["sample_total_bp"] = stats_dict["sample_total_bp"]
         genomes_coverages_df["sample_mapped_bp"] = stats_dict["sample_mapped_bp"]
         genomes_coverages_df["sample_average_total_reads_bp"] = float(stats_dict["sample_total_reads"]) / float(stats_dict["sample_total_bp"])
         genomes_coverages_df["sample_average_mapped_reads_bp"] = float(stats_dict["sample_mapped_reads"]) / float(stats_dict["sample_total_bp"])
         genomes_coverages_df["sample_mapped_reads_to_total_reads"] = float(stats_dict["sample_mapped_reads"]) / float(stats_dict["sample_total_reads"])
         idxstats_columns = [i for i in list(self._samtools_idxstats_df)
                             if i not in list(genomes_coverages_df)]
         genomes_coverages_df = genomes_coverages_df.merge(
             self._samtools_idxstats_df.loc[:, [self._index_column] + idxstats_columns],
             on=self._index_column, how="left")
         genomes_coverages_df["id_mapped_reads_per_million_sample_total_reads"] = (
             genomes_coverages_df["id_mapped_reads"].astype(int) * (10 ** 6)
             / int(stats_dict["sample_total_reads"]))
         genomes_coverages_df["id_mapped_reads_per_million_sample_mapped_reads"] = (
             genomes_coverages_df["id_mapped_reads"].astype(int) * (10 ** 6)
             / int(stats_dict["sample_mapped_reads"]))
         # RPM details: https://www.biostars.org/p/273537/
         genomes_coverages_df["id_mapped_reads_per_kbp_per_million_sample_total_reads"] = (
             genomes_coverages_df["id_mapped_reads"].astype(int) * (10 ** 9)
             / (int(stats_dict["sample_total_reads"]) * genomes_coverages_df["id_bp"]))
         genomes_coverages_df["id_mapped_reads_per_kbp_per_million_sample_mapped_reads"] = (
             genomes_coverages_df["id_mapped_reads"].astype(int) * (10 ** 9)
             / (int(stats_dict["sample_mapped_reads"]) * genomes_coverages_df["id_bp"]))
         # RPKM details: https://www.biostars.org/p/273537/
         for int_column in ["id_bp", "id_coverage_breadth", "id_mapped_bp", "id_maximal_coverage_depth",
                            "id_mapped_reads", "sample_total_reads", "sample_mapped_reads", "sample_total_bp",
                            "sample_mapped_bp"]:
             genomes_coverages_df[int_column] = genomes_coverages_df[int_column].astype(int)
         genomes_coverages_df = genomes_coverages_df.loc[:, [i for i in list(genomes_coverages_df) if len(i.strip()) > 0]]
         if chunk_number == 0:
             genomes_coverages_df.to_csv(self._pk.final_coverage_file_name, sep='\t', header=True, index=False)
         else:
             with open(file=self._pk.final_coverage_file_name, mode="a", encoding="utf-8") as f:
                 genomes_coverages_df.to_csv(f, sep='\t', header=False, index=False)
             logging.info("Processed chunk {} with size of {} lines".format(chunk_number, chunk_size))
     logging.info("Finished processing coverage table: '{}'".format(self._pk.final_coverage_file_name))
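A one-row sanity check of the RPKM-style columns above, with all numbers invented:

# reads * 10^9 / (total_reads * length_bp) == reads per kbp per million reads
id_mapped_reads = 500
id_bp = 2 * 10 ** 6
sample_total_reads = 10 ** 7
print(id_mapped_reads * (10 ** 9) / (sample_total_reads * id_bp))  # 0.025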
Example #20
 def __init__(self):
     self._namespace = self.parse_args()
     self.input_nfasta = self._namespace.input
     self.preserve_headers_bool = self._namespace.preserve_headers
     self.not_large_index_bool = self._namespace.not_large_index
     self.chunk_length = int(self._namespace.size * 10**9)
     self.output_dir = Utilities.ends_with_slash(self._namespace.output)
     os.makedirs(self.output_dir, exist_ok=True)
Example #21
 def fill_dict(nfasta_file: str):
     mask = Utilities.ends_with_slash(os.path.dirname(os.path.realpath(nfasta_file))) + Utilities.filename_only(nfasta_file)
     d = {"ebwt_mask": "{}_colorspace".format(mask),
          "bt2_mask": "{}_bowtie2".format(mask),
          "fai": "{}_samtools.fai".format(mask),
          "genome": "{}_samtools.genome".format(mask),
          "annotation": "{}_annotation.tsv".format(mask)}
     return d
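Assuming Utilities.filename_only strips the directory and extension, calling fill_dict on a hypothetical '/data/reference.fasta' would produce:

d = fill_dict("/data/reference.fasta")  # hypothetical path
# {'ebwt_mask': '/data/reference_colorspace',
#  'bt2_mask': '/data/reference_bowtie2',
#  'fai': '/data/reference_samtools.fai',
#  'genome': '/data/reference_samtools.genome',
#  'annotation': '/data/reference_annotation.tsv'}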
Example #22
 def _stack_coverage(self):
     Utilities.batch_remove(self._pk.stacked_coverage_file_name)
     # genomecov file columns: reference sequence name, depth of coverage, breadth of coverage with that depth, sequence length, coverage ratio
     stacked_coverages_2d_array = []
     row_processing_2d_array = []
     counting_id = ""
     for row_list in self._bedtools_histogram_2d_array:
         if len(row_list) != 5:
             logging.warning("Cannot parse coverage histogram row '{a}' from file '{b}'".format(a=row_list, b=self._pk.bedtools_histogram_file_name))
             continue
         reference_id, id_local_coverage_depth, id_local_coverage_breadth, id_bp, id_local_coverage_ratio = row_list
         if reference_id == 'genome' or '*' in reference_id:
             continue
         if reference_id == counting_id and int(id_local_coverage_depth) > 0:
             row_processing_2d_array.append(row_list)
         else:
             if len(row_processing_2d_array) > 0:
                 # output file columns: reference sequence name, maximal depth of coverage, total breadth of coverage, sequence length, coverage ratio, total mapped bases
                 id_maximal_coverage_depth = max([int(i[1]) for i in row_processing_2d_array])
                 id_coverage_breadth = sum([int(i[2]) for i in row_processing_2d_array])
                 id_bp = int(row_processing_2d_array[0][3])
                 id_coverage_breadth_to_id_bp = sum([float(i[4]) for i in row_processing_2d_array])
                 id_mapped_bp = sum([int(i[1]) * int(i[2]) for i in row_processing_2d_array])
                 stacked_coverages_2d_array.append([counting_id,
                                                    id_maximal_coverage_depth,
                                                    id_coverage_breadth,
                                                    id_bp,
                                                    id_coverage_breadth_to_id_bp,
                                                    id_mapped_bp])
             row_processing_2d_array = []
             counting_id = reference_id
     if len(row_processing_2d_array) > 0:
         # Flush the last accumulated reference; without this the final sequence
         # before the trailing 'genome' rows would be silently dropped
         id_maximal_coverage_depth = max([int(i[1]) for i in row_processing_2d_array])
         id_coverage_breadth = sum([int(i[2]) for i in row_processing_2d_array])
         id_bp = int(row_processing_2d_array[0][3])
         id_coverage_breadth_to_id_bp = sum([float(i[4]) for i in row_processing_2d_array])
         id_mapped_bp = sum([int(i[1]) * int(i[2]) for i in row_processing_2d_array])
         stacked_coverages_2d_array.append([counting_id, id_maximal_coverage_depth, id_coverage_breadth,
                                            id_bp, id_coverage_breadth_to_id_bp, id_mapped_bp])
     if len(stacked_coverages_2d_array) == 0:
         logging.critical("Bad alignment: no coverage to stack!")
         return
     self._stacked_coverages_df = pd.DataFrame(stacked_coverages_2d_array, columns=[self._index_column,
                                                                                    "id_maximal_coverage_depth",
                                                                                    "id_coverage_breadth",
                                                                                    "id_bp",
                                                                                    "id_coverage_breadth_to_id_bp",
                                                                                    "id_mapped_bp"])
     self._stacked_coverages_df.to_csv(self._pk.stacked_coverage_file_name, sep='\t', index=False)
     logging.info("Stacked BEDTools coverage: '{}'".format(self._pk.stacked_coverage_file_name))
     del self._bedtools_histogram_2d_array, stacked_coverages_2d_array
     gc.collect()
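The stacking above reduces one reference's histogram rows to maximal depth, summed breadth, and total mapped bp; a minimal sketch on invented genomeCoverageBed rows:

# Invented rows: (reference, depth, breadth at that depth, length, ratio)
rows = [["seq_a", "1", "300", "1000", "0.3"],
        ["seq_a", "2", "100", "1000", "0.1"]]
print(max(int(i[1]) for i in rows),              # id_maximal_coverage_depth: 2
      sum(int(i[2]) for i in rows),              # id_coverage_breadth: 400
      sum(int(i[1]) * int(i[2]) for i in rows))  # id_mapped_bp: 500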
Example #23
 def __init__(self):
     self._namespace = self._parse_args()
     self.sampledata = self._namespace.input
     self.target_length = CoveragesVerifier.get_wc_l(self._namespace.genome) + 1
     self.prefix = self._namespace.prefix
     self.suffix = self._namespace.suffix
     self.debugging_bool = self._namespace.debug
     self.output = self._namespace.output
     if len(self.output) == 0:
         self.output = "{}sampledata/{}.sampledata".format(Utilities.ends_with_slash(os.path.dirname(self.prefix)), Utilities.get_time())
Example #24
 def __init__(self):
     namespace = self._parse_args()
     self.sampledata_file_name = namespace.input
     self.refdata_file_name = namespace.refdata
     self.input_mask = namespace.mask
     # *_output_mask are attributes of RefDataLine class
     self.threads_number = self._parse_threads_number(namespace.threads)
     self.no_coverage_bool = namespace.no_coverage
     self.output_dir = Utilities.ends_with_slash(namespace.output)
     self.logs_directory = "{}Logs/".format(self.output_dir)
     for i in [self.output_dir, self.logs_directory]:
         os.makedirs(i, exist_ok=True)
Example #25
 def __get_base_alignment_stats(string: str):
     d = {}
     # SamTools stats file columns: ID, stat, value, comment
     for line_list in Utilities.string_to_2d_array(string):
         if len(line_list) < 3 or line_list[0] != "SN":
             continue
         d[re.sub(":$", "", line_list[1])] = line_list[2]
     if len(d) == 0:
         logging.critical("Bad alignment: no SAMTools stats to extract!")
         return {}
     try:
         out = {"total_reads": d["raw total sequences"],
                "mapped_reads": d["reads mapped"],
                "total_bp": d["total length"],
                "mapped_bp": d["bases mapped"]}
     except KeyError:
         return {}
     return {"sample_{}".format(k): int(out[k]) for k in out}
Example #26
    def compile(input_file: str,
                output_dir: str,
                preserve_headers: bool = False,
                chop: bool = False,
                chunk_length: int = int(3.6 * 10**9)):
        import json
        from modules.FASTAArray import FASTAArray
        from modules.RefDataLine import RefDataLine

        output_dir = Utilities.ends_with_slash(output_dir)
        os.makedirs(output_dir, exist_ok=True)
        refdatas_dict = FASTAArray.prepare_nfasta_for_indexing(
            input_file=input_file,
            output_dir=output_dir,
            preserve_headers=preserve_headers,
            chop=chop,
            chunk_length=chunk_length)
        output_dict = {}
        for sequence_id in refdatas_dict:
            annotation_dict = refdatas_dict[sequence_id]
            nfasta_file = annotation_dict.get("reference_nfasta")
            if not nfasta_file:
                continue
            indexing_dict = {"alias": Utilities.filename_only(nfasta_file)}
            indexing_dict.update(RefDataLine.fill_dict(nfasta_file))
            indexing_dict.update(annotation_dict)
            print("Processing nFASTA: '{}'".format(nfasta_file))
            refdata = RefDataLine(indexing_dict)
            refdata.index()
            output_dict[sequence_id] = indexing_dict
        output_file = "{a}{b}_refdata.json".format(
            a=Utilities.ends_with_slash(output_dir),
            b=Utilities.filename_only(input_file))
        Utilities.dump_string(
            string=json.dumps(output_dict, sort_keys=False, indent=4) + "\n",
            file=output_file)
        print("Created reference data linker: '{}'".format(output_file))
        return output_file
Example #27
 def run(self):
     Utilities.single_core_queue(func=self._run_pipeline,
                                 queue=self.chunks_list)
Example #28
 def run(self):
     Utilities.single_core_queue(func=self._run_aligner,
                                 queue=sampleFilesList)
     if not mainInitializer.no_coverage_bool:
         Utilities.single_core_queue(func=self._run_extractor,
                                     queue=sampleFilesList)
Example #29
    def __init__(self):
        self.chunks_list = RefDataArray.read(
            mainInitializer.refdata_file_name).get_parsed_list()

    @staticmethod
    def _run_pipeline(refdata: RefDataLine):
        handler = PipelineHandler(refdata)
        handler.run()

    def run(self):
        Utilities.single_core_queue(func=self._run_pipeline,
                                    queue=self.chunks_list)


if __name__ == '__main__':
    mainInitializer = Initializer()
    launchTime = Utilities.get_time()
    nodeName = subprocess.getoutput("hostname").strip()
    mainLogFile = "{a}nBee_{b}_{c}.log".format(
        a=mainInitializer.logs_directory, b=nodeName, c=launchTime)
    print("Started main workflow with log file: '{}'".format(mainLogFile))
    logging.basicConfig(
        format=u'%(levelname)-8s [%(asctime)s] %(message)s',
        level=logging.DEBUG,
        handlers=[logging.FileHandler(mainLogFile),
                  logging.StreamHandler()])
    sampleDataParser = SampleDataParser(mainInitializer.sampledata_file_name)
    sampleFilesList = sampleDataParser.get_parsed_list()
    if len(sampleFilesList) == 0:
        Utilities.log_and_raise(
            "No files to process in sample data: '{}'".format(
                mainInitializer.sampledata_file_name))
    chunksHandler = ChunksHandler()
Example #30
 def __samtools_faidx(self):
     s = subprocess.getoutput("samtools faidx {}".format(self._nfasta))
     Utilities.dump_string(string=s, file="{}_samtools_faidx.log".format(self._reference_mask))
     os.rename("{}.fai".format(self._nfasta), self.samtools_index_file)
     print("Created SAMTools FAI file: '{}'".format(self.samtools_index_file))
     self.___fai2genome()