Example #1
 def __merge_chains_output(self):
     """Call parse results."""
     # define where to save intermediate table
     eprint("Merging chain output...")
     merge_chains_output(self.ref_bed, self.isoforms,
                         self.chain_class_results, self.chain_results_df)
     self.temp_files.append(self.chain_results_df)
Example #2
    def __merge_cesar_output(self):
        """Merge CESAR output, save final fasta and bed."""
        eprint(
            "Merging CESAR output to make final fasta and pre-final bed files."
        )
        merge_c_stage_skipped = os.path.join(self.rejected_dir,
                                             "CESAR_MERGE.txt")
        self.temp_files.append(self.intermediate_bed)

        all_ok = merge_cesar_output(self.cesar_results, self.intermediate_bed,
                                    self.nucl_fasta, self.meta_data,
                                    merge_c_stage_skipped, self.prot_fasta,
                                    self.trash_exons)
        if all_ok:
            # there are no empty output files
            print("CESAR results merged")
            self.cesar_ok_merged = True
        else:
            # there are some empty output files
            # MAYBE everything is fine
            # but need to notify user anyway
            print("WARNING!\nSOME CESAR JOBS LIKELY CRASHED\n!")
            print("RESULTS ARE LIKELY INCOMPLETE")
            print("PLEASE SEE LOGS FOR DETAILS")
            self.cesar_ok_merged = False
Example #3
 def __gene_loss_summary(self):
     """Call gene loss summary."""
     eprint("Calling gene loss summary")
     gene_losses_summary(self.gene_loss_data,
                         self.ref_bed,
                         self.intermediate_bed,
                         self.query_annotation,
                         self.loss_summ,
                         iforms=self.isoforms,
                         paral=self.paralogs_log)
Example #4
 def __make_indexed_chain(self):
     """Make chain index file."""
     # make the chain index (.bst) file
     eprint("make_indexed in progress...")
     chain_bst_index(self.chain_file,
                     self.chain_index_file,
                     txt_index=self.chain_index_txt_file)
     self.temp_files.append(self.chain_index_file)
     self.temp_files.append(self.chain_file)
     self.temp_files.append(self.chain_index_txt_file)
     eprint("Indexed")
Example #5
def read_fasta(fasta_line, v=False):
    """Read fasta, return dict and type."""
    fasta_data = fasta_line.split(">")
    eprint(f"fasta_data[0] is:\n{fasta_data[0]}") if v else None
    eprint(f"fasta_data[1] is:\n{fasta_data[1]}") if v else None
    if fasta_data[0] != "":
        # this is a bug
        eprint("ERROR! Cesar output is corrupted")
        eprint(f"Issue detected in the following string:\n{fasta_line}")
        die("Abort")
    del fasta_data[0]  # remove the leading "" element, we don't need it
    sequences = {}  # accumulate data here
    order = []  # to have ordered list

    # there is no guarantee that dict will contain elements in the
    # same order as they were added
    for elem in fasta_data:
        raw_lines = elem.split("\n")
        # raw_lines looks like ['capHir1', 'ATGCCGCGCCAATTCCCCAAGCTGA...']; the first element is the header
        header = raw_lines[0]
        # separate nucleotide-containing lines
        lines = [x for x in raw_lines[1:] if x != "" and not x.startswith("!")]
        if len(lines) == 0:  # empty sequence -> skip it
            continue
        fasta_content = "".join(lines)
        sequences[header] = fasta_content
        order.append(header)
    return sequences, order
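
A minimal, self-contained sketch of the parsing logic in read_fasta() above, run on an invented two-record FASTA string; the eprint/die helpers and the CESAR-specific corruption check are omitted:

# toy input: headers and sequences are made up
fasta_line = ">capHir1\nATGCCG\nCGCCAA\n>mm10\nATGAAA\n"

sequences, order = {}, []
for elem in fasta_line.split(">")[1:]:  # drop the leading "" chunk
    raw_lines = elem.split("\n")
    header = raw_lines[0]
    lines = [x for x in raw_lines[1:] if x != "" and not x.startswith("!")]
    if len(lines) == 0:  # empty sequence -> skip
        continue
    sequences[header] = "".join(lines)
    order.append(header)

print(order)                 # ['capHir1', 'mm10']
print(sequences["capHir1"])  # 'ATGCCGCGCCAA'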
Example #6
def load_results(results_dir):
    """Load and sort the chain feature extractor results."""
    verbose("Loading the results...")
    results_files = os.listdir(results_dir)
    verbose(f"There are {len(results_files)} result files to combine")

    # to hold data from fields "genes":
    chain_genes_data = defaultdict(list)
    # to hold data from "chains" field:
    chain_raw_data = {}
    # read file-by-file, otherwise it takes too much space
    genes_counter, chain_counter = 0, 0  # count chain and genes lines

    for results_file in results_files:
        # there are N files: read them one-by-one
        path = os.path.join(results_dir, results_file)
        f = open(path, "r")
        for line in f:
            # read file line-by-line, all fields are tab-separated
            line_data = line.rstrip().split("\t")
            # define the class of this line
            # a line could be either gene or chain-related
            if line_data[0] == "genes":
                # process as a gene line
                chain, genes = process_gene_line(line_data)
                chain_genes_data[chain].extend(genes)
                genes_counter += 1
            elif line_data[0] == "chain":
                # chain related data
                the_chain_related = process_chain_line(line_data)
                # add this chain-related dict to the global one
                chain_raw_data.update(the_chain_related)
                chain_counter += 1
        # do not forget to close the file
        f.close()

    verbose(f"Got {len(chain_genes_data)} keys in chain_genes_data")
    verbose(f"Got {len(chain_raw_data)} keys in chain_raw_data")
    verbose(
        f"There were {genes_counter} genes lines and {chain_counter} chain lines"
    )
    # actually, these values must be equal
    # just a sanity check
    if not genes_counter == chain_counter:
        eprint(f"WARNING! genes_counter and chain_counter hold different "
               f"values:\n{genes_counter} and {chain_counter} respectively")
        die("Some features extracting jobs died!")
    return chain_genes_data, chain_raw_data
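
A hedged sketch of the per-line dispatch above on two invented result lines. The real field layout is handled by process_gene_line() and process_chain_line(), which are not shown in this snippet, so the toy parsing below is an assumption for illustration only:

from collections import defaultdict

lines = [
    "genes\t17\tENST0001\tENST0002",   # hypothetical "genes" line: chain id + gene ids
    "chain\t17\t0.91\t2048",           # hypothetical "chain" line: chain id + features
]

chain_genes_data = defaultdict(list)
chain_raw_data = {}
for line in lines:
    line_data = line.rstrip().split("\t")
    if line_data[0] == "genes":
        chain, genes = line_data[1], line_data[2:]
        chain_genes_data[chain].extend(genes)
    elif line_data[0] == "chain":
        chain_raw_data[line_data[1]] = line_data[2:]

print(dict(chain_genes_data))  # {'17': ['ENST0001', 'ENST0002']}
print(chain_raw_data)          # {'17': ['0.91', '2048']}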
Example #7
    def __check_dependencies(self):
        """Check all dependencies."""
        eprint("check if binaries are compiled and libs are installed...")
        c_not_compiled = any(
            os.path.isfile(f) is False for f in [
                self.CHAIN_SCORE_FILTER, self.CHAIN_COORDS_CONVERT_LIB,
                self.CHAIN_FILTER_BY_ID, self.EXTRACT_SUBCHAIN_LIB,
                self.CHAIN_INDEX_SLIB
            ])
        if c_not_compiled:
            eprint("Warning! C code is not compiled, trying to compile...")
        imports_not_found = False
        try:
            import twobitreader
            import networkx
            import pandas
            import xgboost
            import joblib
            import h5py
        except ImportError:
            eprint("Warning! Some of the required packages are not installed.")
            imports_not_found = True

        not_all_found = any([c_not_compiled, imports_not_found])
        self.__call_proc(self.CONFIGURE, "Could not call configure.sh!")\
            if not_all_found else eprint("All dependencies found")
Example #8
 def __orthology_type_map(self):
     """Call orthology_type_map.py"""
     # need to combine projections in genes
     query_isoforms_file = os.path.join(self.wd, "query_isoforms.tsv")
     query_gene_spans = os.path.join(self.wd, "query_gene_spans.bed")
     get_query_isoforms_data(self.query_annotation,
                             query_isoforms_file,
                             save_genes_track=query_gene_spans)
     eprint("Calling orthology_type_map...")
     skipped_ref_trans = os.path.join(self.wd, "ref_orphan_transcripts.txt")
     orthology_type_map(self.ref_bed,
                        self.query_annotation,
                        self.orthology_type,
                        ref_iso=self.isoforms,
                        que_iso=query_isoforms_file,
                        paralogs_arg=self.paralogs_log,
                        loss_data=self.loss_summ,
                        save_skipped=skipped_ref_trans)
Example #9
    def __check_isoforms_file(self, t_in_bed):
        """Sanity checks for isoforms file."""
        if not self.isoforms_arg:
            return  # not provided: nothing to check
        # isoforms file provided: need to check correctness and completeness
        # then check isoforms file itself
        f = open(self.isoforms_arg, "r")
        self.isoforms = os.path.join(self.wd, "isoforms.tsv")
        header = f.__next__()  # first line is header
        filt_isoforms_lines = [header]  # keep only isoforms that appear in the bed file
        # also we catch isoforms that are in the bed but not in the isoforms file
        t_in_i = []
        for num, line in enumerate(f, 2):
            line_data = line.rstrip().split("\t")
            if len(line_data) != ISOFORMS_FILE_COLS:
                err_msg = f"Error! Isoforms file {self.isoforms} line {num}: " \
                          f"Expected {ISOFORMS_FILE_COLS} fields, got {len(line_data)}"
                self.die(err_msg)
            transcript = line_data[1]
            if transcript in t_in_bed:
                # this isoform appears in the bed file: keep it
                filt_isoforms_lines.append(line)
                t_in_i.append(line_data[1])
            else:  # this isoform doesn't appear in the bed: we can skip it
                continue
        f.close()
        # this set contains isoforms found in the isoforms file
        t_in_i = set(t_in_i)
        # there are transcripts that appear in bed but not in the isoforms file
        # if this set is non-empty: raise an error
        u_in_b = t_in_bed.difference(t_in_i)

        if len(u_in_b) != 0:  # isoforms file is incomplete
            extra_t_list = "\n".join(
                list(u_in_b)[:100])  # show first 100 (or maybe show all?)
            err_msg = f"Error! There are {len(u_in_b)} transcripts in the bed file absent in the isoforms file! " \
                      f"There are the transcripts (first 100):\n{extra_t_list}"
            self.die(err_msg)
        # write isoforms file
        with open(self.isoforms, "w") as f:
            f.write("".join(filt_isoforms_lines))
        eprint("Isoforms file is OK")
Example #10
 def __check_u12_file(self, t_in_bed):
     """Sanity check for U12 file."""
     if not self.u12_arg:
         # just not provided: nothing to check
         return
     # U12 file provided
     self.u12 = os.path.join(self.wd, "u12_data.txt")
     filt_lines = []
     f = open(self.u12_arg, "r")
     for num, line in enumerate(f, 1):
         line_data = line.rstrip().split("\t")
         if len(line_data) != U12_FILE_COLS:
             err_msg = f"Error! U12 file {self.u12} line {num} is corrupted, 3 fields expected; "\
                       f"Got {len(line_data)}; please note that a tab-separated file expected"
             self.die(err_msg)
         trans_id = line_data[0]
         if trans_id not in t_in_bed:
             # transcript doesn't appear in the bed file: skip it
             continue
         exon_num = line_data[1]
         if not exon_num.isnumeric():
             err_msg = f"Error! U12 file {self.u12} line {num} is corrupted, field 2 value is {exon_num}; "\
                       f"This field must contain a numeric value (exon number)."
             self.die(err_msg)
         acc_don = line_data[2]
         if acc_don not in U12_AD_FIELD:
             err_msg = f"Error! U12 file {self.u12} line {num} is corrupted, field 3 value is {acc_don}; "\
                       f"This field could have either A or D value."
             self.die(err_msg)
         filt_lines.append(line)  # save this line
     f.close()
     # another check: what if there are no lines after filter?
     if len(filt_lines) == 0:
         err_msg = f"Error! No lines left in the {self.u12_arg} file after filter." \
                   f"Please check that transcript IDs in this file and bed {self.ref_bed} are consistent"
         self.die(err_msg)
     with open(self.u12, "w") as f:
         f.write("".join(filt_lines))
     eprint("U12 file is correct")
Example #11
 def __call_proc(self, cmd, extra_msg=None):
     """Call a subprocess and catch errors."""
     eprint(f"{cmd} in progress...")
     rc = subprocess.call(cmd, shell=True)
     if rc != 0:
         eprint(extra_msg) if extra_msg else None
         self.die(f"Error! Process {cmd} died! Abort.")
     eprint(f"{cmd} done with code 0")
Example #12
def merge_chains_output(bed_file,
                        isoforms,
                        results_dir,
                        output,
                        exon_cov_chains=False):
    """Chains output merger core function."""
    # read bed file, get gene features
    bed_data = read_bed_data(bed_file)
    # load isoforms data if provided
    isoforms = read_isoforms(isoforms) if isoforms else None
    # read result files from unit
    chain_genes_data, chain_raw_data = load_results(results_dir)
    # we need this dict inverted:
    # not chain -> genes, but gene -> chains
    genes_data = revert_dict(chain_genes_data)

    # combine all the data into one gene-oriented dictionary
    combined_data = combine(bed_data, chain_raw_data, genes_data,
                            exon_cov_chains, isoforms)
    # save this data
    save(combined_data, output)
    # finish the program
    eprint(f"Estimated_time: {format(dt.now() - t0)}")
Example #13
 def __classify_chains(self):
     """Run decision tree."""
     # define input and output
     eprint("Decision tree in progress...")
     self.orthologs = os.path.join(self.wd, "trans_to_chain_classes.tsv")
     self.pred_scores = os.path.join(self.wd, "orthology_scores.tsv")
     self.se_model = os.path.join(self.LOCATION, "models", "se_model.dat")
     self.me_model = os.path.join(self.LOCATION, "models", "me_model.dat")
     cl_rej_log = os.path.join(self.rejected_dir,
                               "classify_chains_rejected.txt")
     if not os.path.isfile(self.se_model) or not os.path.isfile(
             self.me_model):
         self.__call_proc(self.MODEL_TRAINER,
                          "Models not found, training...")
     classify_chains(self.chain_results_df,
                     self.orthologs,
                     self.se_model,
                     self.me_model,
                     rejected=cl_rej_log,
                     raw_out=self.pred_scores)
     if self.stop_at_chain_class:
         self.die("User requested to halt after chain features extraction",
                  rc=0)
Example #14
def check_args(chain_id, genes, chain_file, chain_dict, bed_file, verbose_level, work_data, result):
    # print(chain_index, chain_file)
    """Check if arguments are correct, extract initial data if so."""
    global VERBOSE  # set verbosity level
    VERBOSE = True if verbose_level else False
    verbose("# unit.py called for chain {} and genes {}".format(chain_id, genes))
    # other minor things
    verbose(f"Using {bed_file} and {chain_file}")
    work_data["chain_id"] = chain_id

    # check genes
    raw_genes = [x for x in genes.split(",") if x != ""]
    # bed_lines = bedExtractSqlite(raw_genes, bed_index, bed_file)
    bed_lines = bed_extract_id(bed_file, raw_genes)
    work_data["bed"] = bed_lines  # save it
    work_data["genes"] = [x.split("\t")[3] for x in bed_lines.split("\n")[:-1]]

    # check if numbers of genes are equal
    if len(raw_genes) != len(bed_lines.split("\n")[:-1]):
        eprint("Warning. Not all the genes you set were found!\n")
        need_ = len(raw_genes)
        extracted_ = len(bed_lines.split('\n')[:-1])
        eprint(f"You set {need_} genes, {extracted_}")
        missing_genes = ",".join([x for x in raw_genes if x not in work_data["genes"]])
        eprint(f"Missing genes:\n{missing_genes}")

    # extract chain body from the file 
    work_data["chain"] = extract_chain(chain_file, chain_dict, chain_id)

    # parse chain header
    chain_header = work_data["chain"].split("\n")[0].split()
    verbose("Chain header is:\n{0}".format(chain_header))
    q_start = int(chain_header[10])
    q_end = int(chain_header[11])
    q_len = abs(q_end - q_start)
    work_data["chain_QLen"] = q_len
    work_data["chain_Tstarts"] = int(chain_header[5])
    work_data["chain_Tends"] = int(chain_header[6])
    result["chain_global_score"] = int(chain_header[1])
    result["chain_len"] = work_data["chain_Tends"] - work_data["chain_Tstarts"]
Example #15
def merge_cesar_output(input_dir, output_bed, output_fasta, meta_data_arg,
                       skipped_arg, prot_arg, output_trash):
    """Merge multiple CESAR output files."""
    # check that input dir is correct
    die(f"Error! {input_dir} is not a dir!") \
        if not os.path.isdir(input_dir) else None
    # get list of bdb files (output of CESAR part)
    bdbs = [x for x in os.listdir(input_dir) if x.endswith(".bdb")]

    # initiate lists for different types of output:
    bed_summary = []
    fasta_summary = []
    trash_summary = []
    meta_summary = []
    prot_summary = []
    skipped = []
    all_ok = True

    task_size = len(bdbs)
    # extract data for all the files
    for num, bdb_file in enumerate(bdbs):
        # parse bdb files one by one
        bdb_path = os.path.join(input_dir, bdb_file)
        try:  # try to parse data
            parsed_data = parse_cesar_bdb(bdb_path)
        except AssertionError:
            # if this happened: some assertion was violated
            # probably CESAR output data is corrupted
            sys.exit(f"Error! Failed reading file {bdb_file}")

        # unpack parsed data tuple:
        bed_lines = parsed_data[0]
        trash_exons = parsed_data[1]
        fasta_lines = parsed_data[2]
        meta_data = parsed_data[3]
        prot_fasta = parsed_data[4]
        skip = parsed_data[5]

        if len(bed_lines) == 0:
            # actually should not happen, but can
            eprint(f"Warning! {bdb_file} is empty")
            all_ok = False
            continue  # it is empty

        # append data to lists
        bed_summary.append("\n".join(bed_lines) + "\n")
        fasta_summary.append(fasta_lines)
        trash_summary.append("".join(trash_exons))
        meta_summary.append(meta_data)
        skipped.append(skip)
        prot_summary.append(prot_fasta)
        eprint(f"Reading file {num + 1}/{task_size}", end="\r")

    # save output
    eprint("Saving the output")

    if len(bed_summary) == 0:
        # if so, no need to continue
        eprint("! merge_cesar_output.py:")
        die("No projections found! Abort.")

    # save bed, fasta and the rest
    with open(output_bed, "w") as f:
        f.write("".join(bed_summary))
    with open(output_fasta, "w") as f:
        f.write("".join(fasta_summary))
    with open(meta_data_arg, "w") as f:
        f.write("\n".join(meta_summary))
    with open(skipped_arg, "w") as f:
        f.write("\n".join(skipped))
    with open(prot_arg, "w") as f:
        f.write("\n".join(prot_summary))

    if output_trash:
        # if requested: provide trash annotation
        f = open(output_trash, "w")
        f.write("".join(trash_summary))
        f.close()
    return all_ok
Example #16
def parse_cesar_bdb(arg_input, v=False):
    """Parse CESAR bdb file core function."""
    in_ = open(arg_input, "r")  # read cesar bdb file
    # two \n\n divide each unit of information
    content = [x for x in in_.read().split("\n\n") if x]
    in_.close()
    # GLP-related data is already filtered out by cesar_runner

    # initiate collectors
    bed_lines = []  # save bed lines here
    skipped = []  # save skipped projections here
    pred_seq_chain = {}  # for nucleotide sequences to fasta
    t_exon_seqs = defaultdict(dict)  # reference exon sequences
    wrong_exons = []  # exons that are predicted but actually deleted/missing
    all_meta_data = [META_HEADER]  # to collect exons meta data
    prot_data = []  # protein sequences

    for elem in content:
        # one elem - one CESAR call (one ref transcript and >=1 chains)
        # now loop gene-by-gene
        gene = elem.split("\n")[0][1:]
        eprint(f"Reading gene {gene}") if v else None
        cesar_out = "\n".join(elem.split("\n")[1:])
        # basically this is a fasta file with headers
        # saturated with different information
        sequences, order = read_fasta(cesar_out, v=v)
        # initiate dicts to fill later
        ranges_chain, chain_dir = defaultdict(dict), {}
        pred_seq_chain[gene] = defaultdict(dict)

        # split fasta headers in different classes
        # query, ref and prot sequence headers are explicitly marked
        query_headers = [h for h in order if h.endswith("query_exon")]
        ref_headers = [h for h in order if h.endswith("reference_exon")]
        prot_ids = [h for h in order if "PROT" in h]

        # parse reference exons, quite simple
        for header in ref_headers:
            # one header for one exon
            # fields look like this:
            # FIELD_1 | FIELD_2 | FIELD_3\n
            header_fields = [s.replace(" ", "") for s in header.split("|")]
            exon_num = int(header_fields[1])  # 0-based!
            exon_seq = sequences[header].replace("-", "")  # header is also a key for seq dict
            t_exon_seqs[gene][exon_num] = exon_seq

        # save protein data
        for prot_id in prot_ids:
            prot_seq = sequences[prot_id]
            prot_line = f">{prot_id}\n{prot_seq}\n"
            prot_data.append(prot_line)

        # get gene: exons dict to trace deleted exons
        gene_chain_exon_status = defaultdict(dict)

        # parse query headers
        for header in query_headers:
            header_fields = [s.replace(" ", "") for s in header.split("|")]
            if len(header_fields) != Q_HEADER_FIELDS_NUM:
                continue  # ref exon?

            # extract metadata, parse query header
            trans = header_fields[0]
            exon_num = int(header_fields[1])
            chain_id = int(header_fields[2])
            exon_region = read_region(header_fields[3])
            pid = float(header_fields[4])  # nucleotide %ID
            blosum = float(header_fields[5])
            is_gap = header_fields[6]  # asm gap in the expected region
            exon_class = header_fields[7]  # how it aligns to chain
            exp_region_str = header_fields[8]  # expected region
            in_exp = header_fields[9]  # detected in the expected region or not
            in_exp_b = True if in_exp == "INC" else False

            # mark that it's paralogous projection:
            para_annot = True if header_fields[10] == "True" else False
            stat_key = (trans, chain_id)  # projection ID
            # classify exon, check whether it's deleted/missing
            exon_decision, q_mark = classify_exon(exon_class, in_exp_b, pid,
                                                  blosum)

            if exon_decision is False:
                # exon is deleted/missing
                wrong_exons.append(header)  # save this data
                gene_chain_exon_status[stat_key][exon_num] = False
            else:  # exon is not deleted
                # get/write necessary info
                gene_chain_exon_status[stat_key][exon_num] = True
                chain_dir[chain_id] = exon_region["end"] > exon_region["start"]
                ranges_chain[chain_id][exon_num] = exon_region
                pred_seq_chain[gene][chain_id][exon_num] = sequences[header]
            # collect exon meta-data -> write to file later
            meta_data = "\t".join([
                gene, header_fields[1], header_fields[2], header_fields[3],
                exp_region_str, in_exp, header_fields[4], header_fields[5],
                is_gap, exon_class,
                str(para_annot), q_mark
            ])
            all_meta_data.append(meta_data)

        # check if there are any exons
        for name, stat in gene_chain_exon_status.items():
            any_exons_left = any(stat.values())
            if any_exons_left:
                continue
            # projection has no exons: log it
            name_ = f"{name[0]}.{name[1]}"
            skipped.append(f"{name_}\tall exons are deleted.")

        # make bed tracks
        for chain_id in chain_dir.keys():
            # go projection-by-projection: fixed gene, loop over chains
            block_starts = []
            block_sizes = []
            ranges = ranges_chain[chain_id]
            name = f"{gene}.{chain_id}"  # projection name for bed file

            if len(ranges) == 0:  # this projection is completely missing
                skipped.append(f"{name}\tall exons are deleted.")
                continue
            direct = chain_dir[chain_id]
            exon_nums = sorted(ranges.keys()) if direct \
                else sorted(ranges.keys(), reverse=True)

            # get basic coordinates
            chrom = ranges[exon_nums[0]]["chrom"]
            chrom_start = ranges[exon_nums[0]]["start"] if direct \
                else ranges[exon_nums[0]]["end"]
            chrom_end = ranges[exon_nums[-1]]["end"] if direct \
                else ranges[exon_nums[-1]]["start"]

            # we do not predict UTRs: thickStart/End = chrom_start/End
            thickStart = chrom_start
            thick_end = chrom_end
            strand = "+" if direct else "-"
            block_count = len(exon_nums)

            # need to convert to "block starts" \ "block sizes" format
            for exon_num in exon_nums:
                ex_range = ranges[exon_num]
                block_sizes.append(abs(ex_range["end"] - ex_range["start"]))
                blockStart = ex_range["start"] - chrom_start if direct \
                    else ex_range["end"] - chrom_start
                block_starts.append(blockStart)

            # need this as strings to save it in a text file
            block_starts_str = ",".join(map(str, block_starts)) + ","
            block_sizes_str = ",".join(map(str, block_sizes)) + ","

            # join in a bed line
            bed_list = map(str, [
                chrom, chrom_start, chrom_end, name, DEFAULT_SCORE, strand,
                thickStart, thick_end, BLACK, block_count, block_sizes_str,
                block_starts_str
            ])
            bed_line = "\t".join(bed_list)
            bed_lines.append(bed_line)

    # arrange fasta content
    fasta_lines_lst = []
    for gene, chain_exon_seq in pred_seq_chain.items():
        # write target gene info
        t_gene_seq_dct = t_exon_seqs.get(gene)
        if t_gene_seq_dct is None:
            # no sequence data for this transcript?
            eprint(f"Warning! Missing data for {gene}")
            skipped.append(f"{gene}\tmissing data after cesar stage")
            continue
        # We have sequence fragments split between different exons
        t_exon_nums = sorted(t_gene_seq_dct.keys())
        t_header = ">ref_{0}\n".format(gene)
        t_seq = "".join([t_gene_seq_dct[num] for num in t_exon_nums]) + "\n"
        # append data to fasta strings
        fasta_lines_lst.append(t_header)
        fasta_lines_lst.append(t_seq)

        # and query info
        for chain_id, exon_seq in chain_exon_seq.items():
            track_header = ">{0}.{1}\n".format(gene, chain_id)
            exon_nums = sorted(exon_seq.keys())
            # also need to assemble different exon sequences
            seq = "".join([exon_seq[num] for num in exon_nums]) + "\n"
            fasta_lines_lst.append(track_header)
            fasta_lines_lst.append(seq)

    # save corrupted exons as bed-6 track
    # to make it possible to save them and visualize in the browser
    trash_exons = []
    for elem in wrong_exons:
        elem_fields = [s.replace(" ", "") for s in elem.split("|")]
        # need to fill the following:
        # chrom, start, end, name, score, strand
        gene_name = elem_fields[0]
        exon_num = elem_fields[1]
        chain_id = elem_fields[2]
        label = ".".join([gene_name, exon_num, chain_id])
        grange = elem_fields[3].split(":")
        chrom, (start, end) = grange[0], grange[1].split("-")
        strand = "+"
        score = str(int(float(elem_fields[4]) * 10))
        bed_6 = "\t".join([chrom, start, end, label, score, strand]) + "\n"
        trash_exons.append(bed_6)

    # join output strings
    meta_str = "\n".join(all_meta_data) + "\n"
    skipped_str = "\n".join(skipped) + "\n"
    prot_fasta = "".join(prot_data)
    fasta_lines = "".join(fasta_lines_lst)
    return bed_lines, trash_exons, fasta_lines, meta_str, prot_fasta, skipped_str
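
A compact sketch of the blockStarts/blockSizes conversion used for the bed tracks above, for a plus-strand projection with two exons (coordinates invented):

ranges = {
    0: {"chrom": "chr1", "start": 1000, "end": 1100},
    1: {"chrom": "chr1", "start": 1500, "end": 1650},
}
exon_nums = sorted(ranges.keys())
chrom_start = ranges[exon_nums[0]]["start"]

block_sizes = [abs(ranges[n]["end"] - ranges[n]["start"]) for n in exon_nums]
block_starts = [ranges[n]["start"] - chrom_start for n in exon_nums]

# bed12 expects comma-separated lists with a trailing comma
print(",".join(map(str, block_sizes)) + ",")   # 100,150,
print(",".join(map(str, block_starts)) + ",")  # 0,500,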
Example #17
File: filter_bed.py Project: heziqing/TOGA
def prepare_bed_file(bed_file,
                     output,
                     ouf=False,
                     save_rejected=None,
                     only_chrom=None):
    """Filter the bed file given and save the updated version."""
    new_lines = []  # keep updated lines
    rejected = []  # keep IDs of skipped transcripts + the reason why
    names = Counter()  # we need to make sure that all names are unique

    f = open(bed_file, "r")
    for num, line in enumerate(f, 1):
        # parse bed file according to specification
        line_data = line.rstrip().split("\t")

        if len(line_data) != 12:
            f.close()  # this is for sure an error
            # it is possible only if something except a bed12 was provided
            die("Error! Bed 12 file is required! Got a file with {len(line_data)} fields instead"
                )

        chrom = line_data[0]
        if only_chrom and chrom != only_chrom:
            # TOGA allows performing the analysis on a specific chromosome only
            # if so, we can skip all transcripts located on other chromosomes
            continue
        chromStart = int(line_data[1])
        chromEnd = int(line_data[2])
        name = line_data[3]  # gene_name usually
        # bed_score = int(line_data[4])  # never used
        # strand = line_data[5]  # otherwise:
        # strand = True if line_data[5] == '+' else False
        thickStart = int(line_data[6])
        thickEnd = int(line_data[7])
        # itemRgb = line_data[8]  # never used
        blockCount = int(line_data[9])
        blockSizes = [int(x) for x in line_data[10].split(',') if x != '']
        blockStarts = [int(x) for x in line_data[11].split(',') if x != '']
        blockEnds = [blockStarts[i] + blockSizes[i] for i in range(blockCount)]
        blockAbsStarts = [
            blockStarts[i] + chromStart for i in range(blockCount)
        ]
        blockAbsEnds = [blockEnds[i] + chromStart for i in range(blockCount)]
        blockNewStarts, blockNewEnds = [], []
        names[name] += 1

        if thickStart > thickEnd:
            # according to the bed12 specification this should never happen
            f.close()
            sys.stderr.write(f"Problem occurred at line {num}, gene {name}\n")
            die("Error! Bed file is corrupted, thickEnd MUST be >= thickStart")
        elif thickStart == thickEnd:
            # this means that this is a non-coding transcript
            # TOGA cannot process them: we can skip it
            rejected.append((name, "No CDS"))
            continue

        if thickStart < chromStart or thickEnd > chromEnd:
            # a very strange (but still possible) case
            f.close()  # for sure an error with input data
            sys.stderr.write(f"Problem occurred at line {num}, gene {name}\n")
            die("Error! Bed file is corrupted, thickRange is outside chromRange!"
                )

        # now select CDS only
        # we keep UTRs in the filtered file
        # however, we need CDS to check whether it's correct (% 3 == 0)
        for block_num in range(blockCount):
            blockStart = blockAbsStarts[block_num]
            blockEnd = blockAbsEnds[block_num]

            # skip the block if it is entirely UTR
            if blockEnd <= thickStart:
                continue
            elif blockStart >= thickEnd:
                continue

            # if we are here: this is not an entirely UTR exon
            # it might intersect the CDS border or lie in the CDS entirely
            # remove UTRs: block start must be >= CDS_start (thickStart)
            # block end must be <= CDS_end (thickEnd)
            blockNewStart = blockStart if blockStart >= thickStart else thickStart
            blockNewEnd = blockEnd if blockEnd <= thickEnd else thickEnd
            blockNewStarts.append(blockNewStart - thickStart)
            blockNewEnds.append(blockNewEnd - thickStart)

        if len(blockNewStarts) == 0:
            # even if thickStart != thickEnd this transcript can still be non-coding
            # but if there are no blocks in the CDS -> we can catch this
            rejected.append((name, "No CDS"))
            continue

        block_new_count = len(blockNewStarts)
        blockNewSizes = [
            blockNewEnds[i] - blockNewStarts[i] for i in range(block_new_count)
        ]

        if sum(blockNewSizes) % 3 != 0 and not ouf:
            # this is an out-of-frame (or incomplete) transcript
            # ideally the CDS length should be divisible by 3
            # unless the ouf flag is set, such transcripts are rejected
            rejected.append((name, "Out-of-frame gene"))
            continue

        # if there are non-unique transcript IDs: die
        # we die here, not earlier, to report all duplicates together
        if any(v > 1 for v in names.values()):
            eprint("Error! There are non-uniq transcript IDs:")
            for k, v in names.items():
                if v > 1:
                    eprint(k)
            die("Abort")
        # we keep this transcript: add it to the list
        new_line = "\t".join([str(x) for x in line_data])
        new_lines.append(new_line)
    f.close()

    if len(new_lines) == 0:
        # no transcripts pass the filter: probably an input data mistake
        sys.exit(
            f"Error! No reference annotation tracks left after filtering procedure! Abort"
        )

    # write transcripts that passed the filter to the output file
    f = open(output, "w") if output != "stdout" else sys.stdout
    f.write("\n".join(new_lines) + "\n")
    f.close() if output != "stdout" else None

    if save_rejected:
        # save transcripts that didn't pass the filter + reason why
        f = open(save_rejected, "w")
        for elem in rejected:
            f.write(f"{elem[0]}\t{elem[1]}\n")
        f.close()
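
A minimal sketch of the CDS-trimming loop and the frame check above, on one invented record: blocks entirely outside the thick (CDS) region are dropped, the rest are clipped to it, and the summed CDS length is tested for divisibility by 3:

thickStart, thickEnd = 150, 332
blockAbsStarts = [100, 200, 300]
blockAbsEnds = [160, 260, 360]

blockNewStarts, blockNewEnds = [], []
for s, e in zip(blockAbsStarts, blockAbsEnds):
    if e <= thickStart or s >= thickEnd:
        continue                          # entirely UTR block
    blockNewStarts.append(max(s, thickStart) - thickStart)
    blockNewEnds.append(min(e, thickEnd) - thickStart)

cds_len = sum(e - s for s, e in zip(blockNewStarts, blockNewEnds))
print(blockNewStarts, blockNewEnds)       # [0, 50, 150] [10, 110, 182]
print(cds_len, cds_len % 3 == 0)          # 102 True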
Example #18
def verbose(msg):
    """Eprint for verbose messages."""
    eprint(msg + "\n") if VERBOSE else None
Example #19
def main():
    """Entry point."""
    t0 = dt.now()
    args = parse_args()
    os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"  # otherwise it could crash

    # by default we create CESAR jobs for chains with "orth" or "trans" class
    # but the user could select another set of chain classes
    fields = "ORTH,TRANS" if args.fields is None else args.fields

    # read U12 introns: to create a list of U12-containing genes
    # need it to make subsequent commands
    u12_data = read_u12_data(args.u12)

    # get lists of orthologous chains per each gene
    # skipped_1 - no chains found -> log them
    batch, chain_gene_field, skipped_1 = read_orthologs(args.orthologs_file,
                                                        fields,
                                                        only_o2o=args.o2o_only)
    # split CESAR jobs into different buckets (if the user requested so)
    # e.g. put all jobs that require < 5 GB into bucket 1,
    # jobs requiring 5 to 15 GB into bucket 2, and so on
    # CESAR might be very memory-consuming -> so we care about this
    mem_limit, buckets = define_buckets(args.mem_limit, args.buckets)

    # load reference bed file data; coordinates and exon sizes
    bed_data = read_bed(args.bed_file)
    # check if cesar binary exists
    die(f"Error! Cannot find cesar executable at {args.cesar_binary}!") if \
        not os.path.isfile(args.cesar_binary) else None

    # pre-compute chain : gene : region data
    # collect the second list of skipped genes
    # skipped_2 -> too long corresponding regions in query
    regions, skipped_2 = precompute_regions(batch,
                                            bed_data,
                                            args.bdb_chain_file,
                                            chain_gene_field,
                                            args.chains_limit)
    
    # start making the jobs
    all_jobs = {}
    skipped_3 = []

    for gene in batch.keys():
        u12_this_gene = u12_data.get(gene)
        block_sizes = bed_data[gene][3]

        # proceed to memory estimation
        # the same procedure as inside CESAR2.0 code
        num_states, r_length = 0, 0

        # required memory depends on numerous params
        # first, we need reference transcript-related parameters
        # query-related parameters will be later
        for block_size in block_sizes:
            # num_states += 6 + 6 * reference->num_codons + 1 + 2 + 2 + 22 + 6;
            #  /* 22 and 6 for acc and donor states */
            num_codons = block_size // 3
            num_states += 6 + 6 * num_codons + 1 + 2 + 2 + 22 + 6
            # r_length += 11 + 6 * fasta.references[i]->length
            # + donors[i]->length + acceptors[i]->length;
            r_length += block_size

        gene_chains_data = regions.get(gene)
        # check that there is something for this gene
        if not gene_chains_data:
            continue
        elif len(gene_chains_data) == 0:
            continue

        chains = gene_chains_data.keys()
        chains_arg = ",".join(chains)  # chain ids -> one of the cmd args
        
        # now compute query sequence-related parameters
        query_lens = [v for v in gene_chains_data.values()]
        q_length_max = max(query_lens)
        # and now compute the amount of required memory
        memory = (num_states * 4 * 8) + \
                 (num_states * q_length_max * 4) + \
                 (num_states * 304) + \
                 (2 * q_length_max + r_length) * 8 + \
                 (q_length_max + r_length) * 2 * 1 + EXTRA_MEM

        # convert to gigs + 0.25 extra gig
        gig = math.ceil(memory / 1000000000) + 0.25 
        if gig > mem_limit:
            # it is going to consume TOO much memory
            # skip this gene -> save to log
            skipped_3.append((gene, ",".join(chains),
                             f"memory limit ({mem_limit} gig) exceeded (needs {gig})"))
            continue

        # # 0 gene; 1 chains; 2 bed_file; 3 bdb chain_file; 4 tDB; 5 qDB; 6 output; 7 cesar_bin
        job = WRAPPER_TEMPLATE.format(gene, chains_arg,
                                      os.path.abspath(args.bdb_bed_file),
                                      os.path.abspath(args.bdb_chain_file),
                                      os.path.abspath(args.tDB),
                                      os.path.abspath(args.qDB),
                                      gig,
                                      os.path.abspath(args.cesar_binary),
                                      args.uhq_flank)
        # add some flags if required
        job = job + " --mask_stops" if args.mask_stops else job
        job = job + " --check_loss" if args.check_loss else job
        job = job + " --no_fpi" if args.no_fpi else job

        # add U12 introns data if this gene has them:
        job = job + f" --u12 {os.path.abspath(args.u12)}" if u12_this_gene else job

        all_jobs[job] = gig

    eprint(f"\nThere are {len(all_jobs.keys())} jobs in total.")
    eprint("Splitting the jobs.")
    # split jobs in buckets | compute proportions
    filled_buckets = fill_buckets(buckets, all_jobs)
    prop_sum = sum([k * len(v) for k, v in filled_buckets.items()])
    # estimate proportion of a bucket in the runtime
    buckets_prop = {k: (k * len(v)) / prop_sum for k, v in filled_buckets.items()} \
        if 0 not in filled_buckets.keys() else {0: 1.0}
    eprint("Bucket proportions are:")
    eprint("\n".join([f"{k} -> {v}" for k, v in buckets_prop.items()]))
    # get number of jobs for each bucket
    bucket_jobs_num = {k: math.ceil(args.jobs_num * v) for k, v in buckets_prop.items()}
    # save jobs, get comb lines
    to_combine = save_jobs(filled_buckets, bucket_jobs_num, args.jobs_dir)
    # save combined jobs, combined is a file containing paths to separate jobs
    os.mkdir(args.results) if not os.path.isdir(args.results) else None
    os.mkdir(args.check_loss) if args.check_loss \
        and not os.path.isdir(args.check_loss) else None

    f = open(args.combined, "w")
    for num, comb in enumerate(to_combine, 1):
        basename = os.path.basename(comb).split(".")[0]
        results_path = os.path.abspath(os.path.join(args.results, basename + ".bdb"))
        combined_command = f"{CESAR_RUNNER} {comb} {results_path}"
        if args.check_loss:
            loss_data_path = os.path.join(args.check_loss,
                                          f"{basename}.inact_mut.txt")
            combined_command += f" --check_loss {loss_data_path}"
        if args.rejected_log:
            log_path = os.path.join(args.rejected_log, f"{num}.txt")
            combined_command += f" --rejected_log {log_path}"
        f.write(combined_command + "\n")
    f.close()

    # save skipped genes if required
    if args.skipped_genes:
        skipped = skipped_1 + skipped_2 + skipped_3
        f = open(args.skipped_genes, "w")
        # usually we have gene + reason why skipped
        # we split them with tab
        f.write("\n".join(["\t".join(x) for x in skipped]) + "\n")
        f.close()

    f = open(args.paralogs_log, "w")
    # save IDs of paralogous projections
    for k, v in chain_gene_field.items():
        if v != "PARALOG":
            continue
        gene_ = f"{k[1]}.{k[0]}\n"
        f.write(gene_)
    f.close()

    eprint(f"Estimated: {dt.now() - t0}")
    sys.exit(0)
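
A standalone sketch of the per-gene memory estimate computed in the loop above; the block sizes, the longest query length, and the EXTRA_MEM constant are all invented here, only the formula mirrors the code:

import math

EXTRA_MEM = 100_000_000               # placeholder value
block_sizes = [180, 240, 96]          # CDS exon sizes of a toy reference transcript
q_length_max = 50_000                 # longest corresponding query region

num_states, r_length = 0, 0
for block_size in block_sizes:
    num_codons = block_size // 3
    num_states += 6 + 6 * num_codons + 1 + 2 + 2 + 22 + 6
    r_length += block_size

memory = (num_states * 4 * 8) \
         + (num_states * q_length_max * 4) \
         + (num_states * 304) \
         + (2 * q_length_max + r_length) * 8 \
         + (q_length_max + r_length) * 2 * 1 + EXTRA_MEM
gig = math.ceil(memory / 1000000000) + 0.25
print(f"estimated RAM: {gig} GB")     # 1.25 GB for this toy input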
Example #20
def verbose(msg):
    """Eprint for verbose messages."""
    eprint(msg + "\n")
Example #21
 def __get_proc_pseudogenes_track(self):
     """Create annotation of processed genes in query."""
     eprint("Creating processed pseudogenes track.")
     proc_pgenes_track = os.path.join(self.wd, "proc_pseudogenes.bed")
     create_ppgene_track(self.orthologs, self.chain_file,
                         self.index_bed_file, proc_pgenes_track)
Example #22
def precompute_regions(batch, bed_data, bdb_chain_file, chain_gene_field, limit):
    """Precompute region for each chain: bed pair."""
    eprint("Precompute regions for each gene:chain pair...")
    chain_to_genes, skipped = defaultdict(list), []
    # revert the dict, from gene2chain to chain2genes
    for gene, chains in batch.items():
        if len(chains) == 0:
            skipped.append((gene, ",".join(chains), "no orthologous chains"))
            continue
        chains_ = sorted(chains, key=lambda x: int(x))
        chains_ = chains_[:limit]
        if len(chains) > limit:
            # skip genes that have > limit orthologous chains
            skipped.append((gene, ",".join(chains_[limit:]),
                            f"number of chains ({limit} chains) limit exceeded"))
        for chain in chains_:
            chain_to_genes[chain].append(gene)
    # read regions themselves
    gene_chain_grange = defaultdict(dict)
    chains_num, iter_num = len(chain_to_genes.keys()), 0

    for chain_id, genes in chain_to_genes.items():
        # extract chain itself
        chain_body = chain_extract_id(bdb_chain_file, chain_id).encode()
        all_gene_ranges = []
        for gene in genes:
            # get genomic coordinates for each gene
            gene_data = bed_data.get(gene)
            grange = f"{gene_data[0]}:{gene_data[1]}-{gene_data[2]}"
            all_gene_ranges.append(grange)
            
        # we need to get corresponding regions in the query
        # for now we have chain blocks coordinates and gene
        # regions in the reference genome
        # use chain_coords_converter shared library to
        # convert target -> query coordinates via chain
        # first need to convert to C-types
        c_chain = ctypes.c_char_p(chain_body)
        c_shift = ctypes.c_int(2)
        granges_bytes = [s.encode("utf-8") for s in all_gene_ranges]
        granges_num = len(all_gene_ranges)
        c_granges_num = ctypes.c_int(granges_num)
        granges_arr = (ctypes.c_char_p * (granges_num + 1))()
        granges_arr[:-1] = granges_bytes
        granges_arr[granges_num] = None

        # then call the function
        raw_ch_conv_out = ch_lib.chain_coords_converter(c_chain,
                                                        c_shift,
                                                        c_granges_num,
                                                        granges_arr)
        chain_coords_conv_out = []  # keep lines here
        # convert C output to python-readable type
        for i in range(granges_num + 1):
            chain_coords_conv_out.append(raw_ch_conv_out[i].decode("utf-8"))

        for line in chain_coords_conv_out[1:]:
            # then parse the output
            line_info = line.rstrip().split()
            # line info is: region num, region in reference, region in query
            # one line per one gene, in the same order
            num = int(line_info[0])
            # regions format is chrom:start-end
            q_grange = line_info[1].split(":")[1].split("-")
            q_start, q_end = int(q_grange[0]), int(q_grange[1])
            que_len = q_end - q_start
            t_grange = line_info[2].split(":")[1].split("-")
            t_start, t_end = int(t_grange[0]), int(t_grange[1])
            tar_len = t_end - t_start
            len_delta = abs(tar_len - que_len)
            delta_gene_times = len_delta / tar_len
            gene = genes[num]
            field = chain_gene_field.get((chain_id, gene))
            # check that corresponding region in the query is not too long
            # if so: skip this
            high_rel_len = delta_gene_times > REL_LENGTH_THR
            high_abs_len = len_delta > ABS_LENGTH_TRH
            long_loci_field = field in LONG_LOCI_FIELDS
            if (high_rel_len or high_abs_len) and long_loci_field:
                skipped.append((gene, chain_id, "too long query locus"))
                continue
            # for each chain-gene pair save query region length
            # need this for required memory estimation
            gene_chain_grange[gene][chain_id] = que_len

        del raw_ch_conv_out  # not sure if necessary but...
        iter_num += 1  # verbosity
        eprint(f"Chain {iter_num} / {chains_num}", end="\r")
    return gene_chain_grange, skipped
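
The ctypes preparation above builds a NULL-terminated array of C strings for the shared library call; a self-contained sketch of just that step (the library itself is not loaded here, and the regions are invented):

import ctypes

all_gene_ranges = ["chr1:100-200", "chr1:500-900"]

granges_bytes = [s.encode("utf-8") for s in all_gene_ranges]
granges_num = len(all_gene_ranges)
c_granges_num = ctypes.c_int(granges_num)

granges_arr = (ctypes.c_char_p * (granges_num + 1))()  # one extra slot for NULL
granges_arr[:-1] = granges_bytes
granges_arr[granges_num] = None                        # terminate the array

print([granges_arr[i] for i in range(granges_num + 1)])
# [b'chr1:100-200', b'chr1:500-900', None]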
Example #23
def verbose(msg, end="\n"):
    """Eprint for verbose messages."""
    eprint(msg + end)
Example #24
    def __init__(self, args):
        """Initiate toga class."""
        self.t0 = dt.now()
        # check if all files TOGA needs are here
        self.temp_files = []  # remove at the end, list of temp files
        self.__modules_addr()
        self.__check_dependencies()
        self.__check_completeness()
        self.nextflow_dir = self.__get_nf_dir(args.nextflow_dir)
        self.nextflow_config_dir = args.nextflow_config_dir
        self.__check_nf_config()
        # to avoid crash on filesystem without locks:
        os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"  # otherwise it could crash

        eprint("mkdir_and_move_chain in progress...")
        chain_basename = os.path.basename(args.chain_input)

        # create project dir
        self.project_name = chain_basename.split(".")[1] if not args.project_name \
            else args.project_name
        self.wd = args.project_folder if args.project_folder else  \
            os.path.join(os.getcwd(), self.project_name)
        # for safety; need this to make paths later
        self.project_name = self.project_name.replace("/", "")
        os.mkdir(self.wd) if not os.path.isdir(self.wd) else None

        # dir to collect log files with rejected reference genes:
        self.rejected_dir = os.path.join(self.wd, "rejected")
        os.mkdir(self.rejected_dir) if not os.path.isdir(
            self.rejected_dir) else None

        # filter chain in this folder
        g_ali_basename = "genome_alignment"
        self.chain_file = os.path.join(self.wd, f"{g_ali_basename}.chain")
        # there is an assumption that chain file has .chain extension
        # chain indexing was a bit problematic: (i) bsddb3 fits perfectly but is very
        # painful to install, (ii) sqlite is also fine but might be dysfunctional on some
        # cluster file systems, so we create chain_ID: (start_byte, offset) dictionary for
        # instant extraction of a particular chain from the chain file
        # we save these dictionaries into two files: a text file (tsv) and binary file with BST
        # depending on the case we will use both (for maximal performance)
        self.chain_index_file = os.path.join(self.wd, f"{g_ali_basename}.bst")
        self.chain_index_txt_file = os.path.join(
            self.wd, f"{g_ali_basename}.chain_ID_position")

        # make the command, prepare the chain file
        if not os.path.isfile(args.chain_input):
            chain_filter_cmd = None
            self.die(f"Error! File {args.chain_input} doesn't exist!")
        elif chain_basename.endswith(".gz"):  # version for gz
            chain_filter_cmd = f"gzip -dc {args.chain_input} | "\
                               f"{self.CHAIN_SCORE_FILTER} stdin "\
                               f"{args.min_score} > {self.chain_file}"
        elif args.no_chain_filter:  # it is .chain and score filter is not required
            chain_filter_cmd = f"rsync -a {args.chain_input} {self.chain_file}"
        else:  # it is .chain | score filter required
            chain_filter_cmd = f"{self.CHAIN_SCORE_FILTER} {args.chain_input} "\
                               f"{args.min_score} > {self.chain_file}"

        # filter chains with score < threshold
        self.__call_proc(chain_filter_cmd,
                         "Please check if you use a proper chain file.")

        # define bed file paths
        self.ref_bed = os.path.join(self.wd, "toga_filt_ref_annot.bed")
        self.index_bed_file = os.path.join(self.wd, "toga_filt_ref_annot.hdf5")

        # filter bed file
        bed_filt_rejected_file = "BED_FILTER_REJECTED.txt"
        bed_filt_rejected = os.path.join(self.rejected_dir,
                                         bed_filt_rejected_file)
        # keeping UTRs!
        prepare_bed_file(args.bed_input,
                         self.ref_bed,
                         save_rejected=bed_filt_rejected,
                         only_chrom=args.limit_to_ref_chrom)

        # misc things
        self.isoforms_arg = args.isoforms if args.isoforms else None
        self.isoforms = None  # will be assigned after completeness check
        self.chain_jobs = args.chain_jobs_num
        self.cesar_binary = self.DEFAULT_CESAR if not args.cesar_binary \
            else args.cesar_binary
        self.time_log = args.time_marks
        self.stop_at_chain_class = args.stop_at_chain_class

        self.keep_temp = True if args.keep_temp else False
        # define to call CESAR or not to call
        self.t_2bit = self.__find_two_bit(args.tDB)
        self.q_2bit = self.__find_two_bit(args.qDB)

        self.hq_orth_threshold = 0.95
        self.cesar_jobs_num = args.cesar_jobs_num
        self.cesar_buckets = args.cesar_buckets
        self.cesar_mem_limit = args.cesar_mem_limit
        self.cesar_chain_limit = args.cesar_chain_limit
        self.uhq_flank = args.uhq_flank
        self.cesar_fields = args.homology_types
        self.mask_stops = args.mask_stops
        self.no_fpi = args.no_fpi
        self.o2o_only = args.o2o_only
        self.keep_nf_logs = args.do_not_del_nf_logs
        self.cesar_ok_merged = None

        self.chain_results_df = os.path.join(self.wd, "chain_results_df.tsv")
        self.nucl_fasta = os.path.join(self.wd, "nucleotide.fasta")
        self.prot_fasta = os.path.join(self.wd, "prot.fasta")
        self.final_bed = os.path.join(self.wd, "query_annotation.bed")
        self.low_conf_bed = os.path.join(self.wd, "low_confidence.bed")
        self.meta_data = os.path.join(self.wd, "exons_meta_data.tsv")
        self.intermediate_bed = os.path.join(self.wd, "intermediate.bed")
        self.orthology_type = os.path.join(self.wd,
                                           "orthology_classification.tsv")
        self.classification_log = os.path.join(self.wd, "o_class.log")
        self.trash_exons = os.path.join(self.wd, "trash_exons.bed")
        self.gene_loss_data = os.path.join(self.wd, "inact_mut_data")
        self.query_annotation = os.path.join(self.wd, "query_annotation.bed")
        self.loss_summ = os.path.join(self.wd, "loss_summ_data.tsv")
        self.u12_arg = args.u12
        self.u12 = None  # assign after U12 file check
        self.__check_param_files()

        # dump input parameters, object state
        self.toga_params_file = os.path.join(self.wd, "toga_init_state.json")
        self.toga_args_file = os.path.join(self.wd, "project_args.json")
        with open(self.toga_params_file, "w") as f:
            # default=string is a workaround to serialize datetime object
            json.dump(self.__dict__, f, default=str)
        with open(self.toga_args_file, "w") as f:
            json.dump(vars(args), f, default=str)
        print("TOGA initiated successfully!")
Example #25
def check_args(args):
    """Check if args are correct, fill global dict."""
    # check the directories
    global VERBOSE  # set verbosity level
    VERBOSE = True if args.verbose else False
    WORK_DATA["vv"] = True if args.vv else False

    try:  # check the directories, create if it is necessary
        os.mkdir(args.jobs) if not os.path.isdir(args.jobs) else None
        os.mkdir(
            args.results_dir) if not os.path.isdir(args.results_dir) else None
        os.mkdir(args.errors_dir) \
            if args.errors_dir and not os.path.isdir(args.errors_dir) \
            else None
        WORK_DATA["jobs"] = args.jobs
        WORK_DATA["results_dir"] = args.results_dir
        WORK_DATA["errors_dir"] = args.errors_dir
        verbose(
            f"Directories in usage: {args.jobs} {args.results_dir} {args.errors_dir}"
        )

    except FileNotFoundError as grepexc:  # one of those tasks failed
        eprint(f"Arguments are corrupted!\n{str(grepexc)}")
        die("Cannot create one of the directories requested.")

    # define chain and bed files
    WORK_DATA["chain_file"] = args.chain_file if os.path.isfile(args.chain_file) \
        else die(f"Error! Chain file {args.chain_file} is wrong!")

    WORK_DATA["bed_file"] = args.bed_file if os.path.isfile(args.bed_file) \
        else die(f"Error! Bed file {args.bed_file} is wrong!")
    verbose(f"Use bed file {args.bed_file} and chain file {args.chain_file}")

    # look for .ID.bb file
    index_file = args.index_file if args.index_file else args.chain_file.replace(
        ".chain", ".chain_ID_position")

    if os.path.isfile(index_file):  # check if bb file is here
        WORK_DATA["index_file"] = index_file
        verbose(f"And {index_file} as an index file")
    elif args.make_index:  # create index if not exists
        eprint("make_indexed in progress...")
        idbb_cmd = f"/modules/chain_bdb_index.py {args.chain_file} {index_file}"
        call_proc(idbb_cmd)
        WORK_DATA["index_file"] = index_file
    else:  # die
        die(f"Error! Cannot find index file at {index_file}\n"
            "Please define it manually")

    # define the number of jobs
    if args.job_size:  # easy:
        WORK_DATA["job_size"] = args.job_size
        WORK_DATA["jobs_num"] = None
    else:  # we must compute how many jobs to put into one cluster job
        WORK_DATA["job_size"] = None
        WORK_DATA["jobs_num"] = args.jobs_num
    WORK_DATA["bed_index"] = args.bed_index

    # some defaults
    WORK_DATA["jobs_file"] = args.jobs_file
    WORK_DATA["ref"] = args.ref
    # check if we are on cluster
    WORK_DATA["on_cluster"] = True
    verbose("Program-wide dictionary looks like:\n")
    for k, v in WORK_DATA.items():
        verbose(f"{k}: {v}")
Example #26
    def __run_cesar_jobs(self):
        """Run CESAR jobs using nextflow.
        
        At first -> push joblists, there might be a few of them
        At second -> monitor joblists, wait until all are done.
        """
        # for each bucket I create a separate joblist and config file
        # different config files because different memory limits
        project_paths = []  # dirs with logs
        processes = []  # keep subprocess objects here
        timestamp = str(time.time()).split(".")[1]  # for project name
        # get a list of buckets
        if self.cesar_buckets == "0":
            buckets = [0]  # a single bucket
        else:  # several buckets, each int -> memory limit in gb
            buckets = [int(x) for x in self.cesar_buckets.split(",") if x != ""]
        print(f"Pushing {len(buckets)} joblists")
        # cmd to grep bucket-related commands
        grep_bucket_template = "cat {0} | grep _{1}.bdb"
        for b in buckets:
            # create config file
            # 0 means that the buckets were not split
            mem_lim = b if b != 0 else self.cesar_mem_limit
            if not self.local_executor:
                # running on cluster, need to create config file
                # for this bucket's memory requirement
                config_string = self.cesar_config_template.replace(
                    "${_MEMORY_}", f"{mem_lim}")
                config_file_path = os.path.join(self.wd,
                                                f"cesar_config_{b}_queue.nf")
                config_file_abspath = os.path.abspath(config_file_path)
                with open(config_file_path, "w") as f:
                    f.write(config_string)
                self.temp_files.append(config_file_path)
            else:  # no config dir given: use local executor
                # OK if there is a single bucket
                config_file_abspath = None
            # extract jobs related to this bucket (if it's not 0)
            if b != 0:
                grep_bucket_cmd = grep_bucket_template.format(
                    self.cesar_combined, b)
                try:
                    bucket_tasks = subprocess.check_output(
                        grep_bucket_cmd, shell=True).decode("utf-8")
                except subprocess.CalledProcessError:
                    eprint(f"There are no jobs in the {b} bucket")
                    continue
                joblist_name = f"cesar_joblist_queue_{b}.txt"
                joblist_path = os.path.join(self.wd, joblist_name)
                with open(joblist_path, "w") as f:
                    f.write(bucket_tasks)
                joblist_abspath = os.path.abspath(joblist_path)
                self.temp_files.append(joblist_path)
            else:  # nothing to extract, there is a single joblist
                joblist_abspath = os.path.abspath(self.cesar_combined)
            # create project directory for logs
            nf_project_name = f"{self.project_name}_cesar_at_{timestamp}_q_{b}"
            nf_project_path = os.path.join(self.nextflow_dir, nf_project_name)
            project_paths.append(nf_project_path)
            os.mkdir(nf_project_path) if not os.path.isdir(nf_project_path) else None
            # create subprocess object
            nf_cmd = f"nextflow {self.NF_EXECUTE} " \
                     f"--joblist {joblist_abspath}"
            if config_file_abspath:
                nf_cmd += f" -c {config_file_abspath}"
            p = subprocess.Popen(nf_cmd, shell=True, cwd=nf_project_path)
            sys.stderr.write(f"Pushed cluster jobs with {nf_cmd}")
            processes.append(p)
            time.sleep(CESAR_PUSH_INTERVAL)

        # monitor jobs
        iter_num = 0
        while True:
            # Run until all jobs are done (or crashed)
            all_done = True  # default val, re-define if something is not done
            for p in processes:
                # check if each process is still running
                running = p.poll() is None
                if running:
                    all_done = False
            if all_done:
                print("CESAR jobs done")
                break
            else:
                print(f"Iteration {iter_num}: waited {ITER_DURATION * iter_num} seconds, jobs not done yet")
                time.sleep(ITER_DURATION)
                iter_num += 1
        if not self.keep_nf_logs:
            # remove nextflow intermediate files
            for path in project_paths:
                shutil.rmtree(path)
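
A minimal sketch of the monitoring pattern above: Popen.poll() returns None while a process is still running, so the loop waits until every process has an exit code. The commands and the sleep interval are placeholders:

import subprocess
import time

ITER_DURATION = 2                     # placeholder interval in seconds
processes = [subprocess.Popen("sleep 3", shell=True),
             subprocess.Popen("sleep 5", shell=True)]

iter_num = 0
while True:
    all_done = all(p.poll() is not None for p in processes)
    if all_done:
        print("all jobs done")
        break
    print(f"iteration {iter_num}: still waiting")
    time.sleep(ITER_DURATION)
    iter_num += 1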
Example #27
 def __make_indexed_bed(self):
     """Create gene_ID: bed line bdb indexed file."""
     eprint("index_bed in progress...")
     bed_hdf5_index(self.ref_bed, self.index_bed_file)
     self.temp_files.append(self.index_bed_file)
     eprint("Bed file indexed")