Exemplo n.º 1
0
def jellyfish(parameters):
    """Run the jellyfish count/dump steps for every configured file prefix.

    For each prefix in ``parameters['prefixes']`` the ``<prefix>.fasta`` file
    is looked up in ``parameters['data_dir']``; the intermediate
    ``<prefix>.jf`` file and the ``<prefix>_dump.fasta`` result are placed in
    ``parameters['jellyfish_out_dir']``. A prefix whose dump file already
    exists is skipped.

    Returns:
        False as soon as counting or dumping fails for a prefix, True when
        every prefix was processed or skipped.
    """
    print_logo("K-mer counting using jellyfish")
    print_info('Start k-mer counting using jellyfish.')

    for prefix in parameters['prefixes']:
        fasta_path = os.path.join(parameters['data_dir'], f'{prefix}.fasta')

        jf_name = f'{prefix}.jf'
        jf_path = os.path.join(parameters['jellyfish_out_dir'], jf_name)

        dump_name = f'{prefix}_dump.fasta'
        dump_path = os.path.join(parameters['jellyfish_out_dir'], dump_name)

        if os.path.exists(dump_path):
            print_info(f'The output {dump_name} file already exists. Skipping ...')
            continue

        # Dumping is only attempted when counting succeeded (short-circuit).
        succeeded = (kmer_counting(fasta_path, jf_path, parameters)
                     and dump_jf_file(dump_path, jf_path, jf_name, dump_name))
        if not succeeded:
            return False

        remove_jf_file(jf_path, parameters)

    return True
Exemplo n.º 2
0
    def print_progress(self):
        """Advance the processed-kmer counter and report progress periodically.

        A message is emitted on every 100th call and when the counter reaches
        ``self.kmers_number``.
        """
        self.counter += 1
        report_every = 100
        percent = (self.counter / self.kmers_number) * 100

        at_checkpoint = self.counter % report_every == 0
        is_last = self.counter == self.kmers_number

        if at_checkpoint or is_last:
            message = f'Processed {self.counter} / {self.kmers_number} ({percent:.2f}%) kmers ...'
            print_info(message, self.worker_name)
Exemplo n.º 3
0
def remove_jf_file(jellyfish_file, parameters):
    """Delete an intermediate jellyfish file unless the config asks to keep it.

    Nothing happens unless ``parameters['keep_intermediate_jf_files']`` equals
    ``'no'``. A file that is already missing only triggers a warning.
    """
    if parameters['keep_intermediate_jf_files'] != 'no':
        return

    print_info(f'Deleting the {jellyfish_file} file ... ')

    try:
        os.remove(jellyfish_file)
        print_info(f"File '{jellyfish_file}' removed successfully")
    except FileNotFoundError:
        print_warning(f'The {jellyfish_file} file was not found.')
Exemplo n.º 4
0
    def run(self):
        """Convert the k-mers to MEME format and compare them with tomtom.

        The step is skipped (with an informational message) when the
        ``run_tomtom`` parameter is ``'no'``.

        Returns:
            Always True.
        """
        print_logo("K-mer comparing")

        if self.parameters['run_tomtom'] != 'no':
            self.kmers_to_meme()
            self.tomtom()
        else:
            print_info("Analysis canceled. To run tomtom set 'run_tomtom' parameter to 'yes'")

        return True
Exemplo n.º 5
0
    def check_run(self):
        """Decide whether the k-mer counting step has to be (re)run.

        Returns:
            True when any ``table_<prefix>`` file is missing from
            ``<output_dir>/tables`` or when ``keep_kmers_table`` is not
            ``'yes'``; False when all tables exist and they should be kept.
        """
        table_missing = any(
            not os.path.exists(
                os.path.join(self.parameters['output_dir'], 'tables',
                             f'table_{prefix}'))
            for prefix in self.parameters['prefixes'])

        if table_missing:
            return True

        if self.parameters['keep_kmers_table'] != 'yes':
            return True

        print_info("Keeping k-mer counting from the previous run")
        return False
Exemplo n.º 6
0
def bulk_fasta_to_oneline(parameters):
    """Convert every ``<prefix>.fasta`` file into ``<prefix>_oneLine.txt``.

    Both the input and the output live in ``parameters['data_dir']``. A prefix
    whose output file already exists is skipped.

    Returns:
        False as soon as a conversion raises, True otherwise.
    """
    print('')
    print_info('Converting FASTA files to text files.')

    for prefix in parameters['prefixes']:
        source_name = f'{prefix}.fasta'
        source_path = os.path.join(parameters['data_dir'], source_name)
        target_name = f'{prefix}_oneLine.txt'
        target_path = os.path.join(parameters['data_dir'], target_name)

        if os.path.exists(target_path):
            print_info(
                f'The output {target_name} file already exists. Skipping ...')
            continue

        print_info(f'Converting {source_name} into {target_name} ... ')

        try:
            fasta_to_oneline(source_path, target_path)
        except Exception as error:
            # Report the failure and abort the whole batch.
            print_warning(
                f'Something went wrong during saving to the {target_name} file.'
            )
            print_warning('Please, check the stderr output:\n')
            print(error)
            return False

    print_info("Conversion completed")
    return True
Exemplo n.º 7
0
def dump_jf_file(output_file_full_path, jellyfish_file_full_path, jellyfish_file, output_file_name):
    """Run ``jellyfish dump`` on a ``.jf`` file and write the counts to a file.

    ``jellyfish_file`` and ``output_file_name`` are only used in the progress
    message; the ``*_full_path`` arguments are what is passed to jellyfish.

    Returns:
        True when jellyfish exits with status 0, False otherwise (its stderr
        is printed in that case).
    """
    print_info(f'Outputting counts from the {jellyfish_file} file to the {output_file_name} file ... ')

    command = ['jellyfish', 'dump', jellyfish_file_full_path,
               '-o', output_file_full_path]
    completed = subprocess.run(command, capture_output=True, text=True)

    if completed.returncode == 0:
        return True

    print_warning('Something went wrong during outputting counts')
    print_warning('Please, check the stderr output:')
    print(completed.stderr)
    return False
Exemplo n.º 8
0
    def kmers_to_meme(self):
        """Convert the k-mers of the stats file into MEME format.

        The first tab-separated field of every line of ``self.stats_file_path``
        is passed to ``iupac2meme``; the tool's stdout is appended to
        ``self.output_meme_file_path``. Any existing output file is removed
        first.
        """
        meme_path = self.output_meme_file_path

        if os.path.exists(meme_path):
            os.remove(meme_path)

        print_info("Converting kmers into MEME format ... ")

        with open(meme_path, 'a+') as meme_out, \
                open(self.stats_file_path, 'r') as stats:
            for record in stats:
                kmer = record.rstrip().split("\t")[0]

                # Lines with an empty first column are skipped.
                if not kmer:
                    continue

                converted = subprocess.run(['iupac2meme', kmer],
                                           capture_output=True, text=True)
                meme_out.write(converted.stdout)
Exemplo n.º 9
0
def kmer_counting(fasta_file, jellyfish_file, parameters):
    """Run ``jellyfish count`` on a FASTA file.

    The k-mer length (``-m``), hash size (``-s``) and thread count (``-t``)
    are taken from ``parameters``; the result is written to ``jellyfish_file``.

    Returns:
        True when jellyfish exits with status 0, False otherwise (stderr and
        the parameters used are printed in that case).
    """
    print('')
    print_info(f'Counting k-mers in the {fasta_file} file ... ')

    kmer_length = parameters['kmer_length']
    hash_size = parameters['hash_size']
    threads = parameters['threads_number']

    # NOTE: the '-C' option is not passed; it was commented out in the
    # original command line.
    command = [
        'jellyfish', 'count',
        '-m', kmer_length,
        '-s', hash_size,
        '-t', threads,
        fasta_file,
        '-o', jellyfish_file,
    ]
    completed = subprocess.run(command, capture_output=True, text=True)

    if completed.returncode == 0:
        return True

    print_warning('Something went wrong during k-mer counting.')
    print_warning('Please, check the stderr output:')
    print(completed.stderr)
    print(kmer_length, hash_size, threads, jellyfish_file)
    return False
Exemplo n.º 10
0
    def run(self):
        """Compute (or reload) the k-mer statistics and write the filtered tables.

        The statistics are recomputed when ``<output_dir>/stats/stats.txt`` is
        missing or when the ``keep_stats_file`` parameter is ``'no'``;
        otherwise the saved file is loaded into ``self.data``. Afterwards the
        three filters are applied in sequence, each intermediate result is
        saved to its own file, and the coordinate files are merged and
        filtered.

        Returns:
            True on success; False when the merged table is missing or when
            any step raises (the exception message is printed).
        """
        print_logo("Statistic analysis")

        if not os.path.exists(self.merged_table_path):
            print_warning("the merged table does not exist")
            return False

        stats_dir = os.path.join(self.parameters['output_dir'], 'stats')
        stats_path = os.path.join(stats_dir, 'stats.txt')

        if not os.path.exists(stats_path) \
                or self.parameters['keep_stats_file'] == 'no':
            try:
                print_info("Applying Fisher test ...")
                self.chrom_len_calc()
                self.mite_total_len_calc()
                self.analyse()

                self.save_stats_to_file(stats_path)
            except Exception as e:
                # Report the failure the same way the filtering stage does,
                # instead of returning False without any explanation.
                print(f"Exception: {e}")
                return False
        else:
            print_info(
                "The output 'stats.txt' file exists. Loading saved data ... ")
            # NOTE(review): the file is read back without index_col; if it was
            # written with the DataFrame index, the reloaded frame may carry an
            # extra column compared with a fresh run - confirm.
            self.data = pd.read_csv(stats_path, sep='\t')

        try:
            print("")
            print_info("Filter statistics data:")
            self.filter_kmers_by_p_corrected_bon_thresh()
            self.save_stats_to_file(
                os.path.join(stats_dir,
                             'stats_filtered_1_corr_bonif_thresh.txt'))

            self.filter_kmers_by_freq_higher()
            self.save_stats_to_file(
                os.path.join(stats_dir, 'stats_filtered_2_by_freq_higher.txt'))

            self.filter_kmers_by_freq_lesser()
            self.save_stats_to_file(
                os.path.join(stats_dir, 'stats_filtered_3_by_freq_lesser.txt'))

            self.merge_coords_files()
            self.filter_coords_file()

            return True
        except Exception as e:
            print(f"Exception: {e}")
            return False
Exemplo n.º 11
0
    def tomtom(self):
        """Compares kmer motifs with database using tomtom.

        The command line is assembled from ``self.parameters``
        (``min_overlap``, ``internal``, ``threshold_type``,
        ``threshold_value``, ``output_dir``, ``motif_database``) and
        ``self.output_meme_file_path``. While tomtom runs, its stderr lines
        starting with ``'P'`` are echoed on a single, continuously overwritten
        console line.

        Returns:
            None. Success or failure is reported on the console only.
        """
        print_info("Comparing kmer motifs with database using tomtom ... ")

        tomtom_out_dir = os.path.join(self.parameters['output_dir'], 'tomtom',
                                      'tomtom_out')

        # str() keeps Popen from failing when a config value is numeric.
        command = ['tomtom', '-min-overlap', str(self.parameters['min_overlap'])]

        if self.parameters['internal'] == 'yes':
            command.append('-internal')

        if self.parameters['threshold_type'] == 'e-value':
            command.append('-evalue')

        command += [
            '-thresh', str(self.parameters['threshold_value']),
            '-oc', tomtom_out_dir,
            self.output_meme_file_path,
            self.parameters['motif_database'],
        ]

        # The context manager closes the stderr pipe and waits for the process;
        # iterating the pipe reads until EOF, so no output is dropped.
        with subprocess.Popen(command, stderr=subprocess.PIPE) as process:
            for raw_line in process.stderr:
                line = raw_line.decode('utf-8', errors='replace').rstrip()

                if line.startswith('P'):
                    # '\r' + erase-to-end-of-line keeps progress on one line.
                    print(f"\r\033[0K{line}", end='', flush=True)

        print("")

        if process.returncode == 0:
            report_path = os.path.join(tomtom_out_dir, 'tomtom.html')
            print_info("Processing completed")
            print_info(f"Report in HTML format is available at: {report_path}")
        else:
            print_warning("Something went wrong with tomtom run. Used command:")

        # The command is echoed exactly once, after either outcome.
        print(" ".join(map(str, command)))
Exemplo n.º 12
0
    def worker(self, data_input):
        """Build the k-mer occurrence table for one chromosome.

        Reads the k-mers from ``data_input['dump_file']``, locates each of
        them in the sequence read from ``data_input['chr_file']`` (via
        ``self.my_find``) and classifies every occurrence against the
        intervals listed in ``self.parameters['bed_file']`` (tab-separated:
        chromosome, start, end, name):

        * overlaps exactly one interval and passes the containment check ->
          counted under the interval name, and the interval coordinates are
          remembered for this k-mer;
        * overlaps one interval but fails the containment check, or overlaps
          two intervals -> counted as ``edge`` and ``<name>_edge``;
        * overlaps no interval -> counted as ``genome``.

        One tab-separated row per k-mer is written to
        ``data_input['output_file']`` and a timestamped log file is written to
        ``<output_dir>/tables``. Finally the collected coordinates are passed
        to ``self.write_kmer_coords_to_file``.

        Args:
            data_input: Mapping with the keys ``chr_name``, ``dump_file``,
                ``chr_file`` and ``output_file``.

        Returns:
            None. Returns early, after a warning, when the dump file holds no
            k-mers or when their length differs from the configured
            ``kmer_length``.

        Raises:
            SystemExit: When a single occurrence overlaps more than two
                intervals.
        """
        chr_name = data_input['chr_name']
        worker_name = f"{chr_name} worker"
        dump_basename = os.path.basename(data_input['dump_file'])

        print_info(f"Start reading {dump_basename} file", worker_name)

        # Lines starting with '>' are headers; any other non-empty line is a
        # k-mer that gets associated with the most recent header.
        data_kmer = {}
        name_tmp = ""
        with open(data_input["dump_file"], 'r') as file:
            for line in file:
                line = line.rstrip()
                if not line:
                    # Blank lines previously crashed on line[0].
                    continue
                if line.startswith(">"):
                    name_tmp = line[1:]
                else:
                    data_kmer[line] = name_tmp

        print_info(
            f"Reading {dump_basename} file completed. Read {len(data_kmer)} kmers.",
            worker_name)

        if not data_kmer:
            # An empty dump previously raised IndexError in the length check.
            print_info(
                f'{red("Warning")} - No kmers were read from the {dump_basename} file',
                worker_name)
            return

        kmer_length = int(self.parameters['kmer_length'])
        dumped_kmer_length = len(next(iter(data_kmer)))
        if dumped_kmer_length != kmer_length:
            print_info(
                f'{red("Warning")} - The kmer length in {dump_basename} ({dumped_kmer_length} bp) file is not equal to '
                f'kmer length in config file ({self.parameters["kmer_length"]} bp)',
                worker_name)
            return

        print_info("Loading '{}' file ...".format(data_input["chr_file"]),
                   worker_name)
        with open(data_input["chr_file"], 'r') as f:
            chromosome = f.read()

        bed_basename = os.path.basename(self.parameters['bed_file'])
        print_info(f"Loading '{bed_basename}' file ...", worker_name)
        data_mites = []
        with open(self.parameters['bed_file'], 'r') as f:
            for line in f:
                line = line.rstrip()
                if line:
                    data_mites.append(line.split("\t"))

        if os.path.exists(data_input["output_file"]):
            print_info(
                "The file '{}' exists. Removing ...".format(
                    data_input["output_file"]), worker_name)
            os.remove(data_input["output_file"])

        # Counter template: one column per interval name (plus its "_edge"
        # variant) taken from the whole BED file; only the intervals of this
        # chromosome are inserted into the tree.
        output_data_template = {"edge": 0, "genome": 0}
        mite_names = set()
        t = IntervalTree()
        for mite in data_mites:
            if mite[3] not in output_data_template:
                output_data_template[mite[3]] = 0
                output_data_template[mite[3] + "_edge"] = 0
                mite_names.add(mite[3])
                mite_names.add(mite[3] + "_edge")
            if mite[0] == chr_name:
                t[int(mite[1]):int(mite[2])] = mite[3]
        sorted_mite_names = sorted(mite_names)

        # Building the header from individual columns avoids an empty extra
        # column when there are no interval names at all.
        header = "\t".join([
            "k-mer", "total_occurences_in_{}".format(chr_name),
            *sorted_mite_names, "edge", "genome"
        ])

        log_file_path = os.path.join(
            self.parameters['output_dir'], 'tables',
            time.strftime('%y-%m-%d_%H-%M_') + chr_name + "_log.txt")

        kmer_coords = {}

        # Context managers guarantee both files are closed on every exit path,
        # including exceptions and the SystemExit below.
        with open(data_input["output_file"], 'a+') as output, \
                open(log_file_path, 'a+') as log:
            output.write(header)
            output.write("\n")

            timer = Timer(len(data_kmer), worker_name)
            timer.startt()

            print_info("Started analysis ...", worker_name)

            log.write("Analysis started at " + time.ctime() + "\n")
            log.flush()

            for kmer in data_kmer:
                timer.print_progress()

                kmer_coords[kmer] = []

                # The template is a flat dict of ints, a shallow copy suffices.
                output_data = dict(output_data_template)

                kmer_occurences = self.my_find(chromosome, kmer)

                for kmer_occurence in kmer_occurences:
                    kmer_occurence = int(kmer_occurence)
                    result = t[kmer_occurence:kmer_occurence + kmer_length + 1]

                    if not result:
                        output_data["genome"] += 1
                        continue

                    hits = list(result)

                    if len(hits) > 2:
                        print_info(
                            f"\nThe interval tree length is higher than 2: {len(hits)} {chr_name}",
                            worker_name)
                        # The overlapping intervals are logged here; the
                        # previous code referenced an unassigned variable.
                        log.write("\t".join([
                            "intTree>2", kmer,
                            str(kmer_occurence),
                            str(hits)
                        ]) + "\n")
                        log.flush()
                        raise SystemExit(1)

                    if len(hits) > 1:  # True if a k-mer overlaps two mites
                        print_info(
                            f"\nThe interval tree length is higher than 1: {chr_name}",
                            worker_name)
                        log.write("\t".join([
                            "1<intTree<2", kmer,
                            str(kmer_occurence),
                            str(hits)
                        ]) + "\n")
                        log.flush()

                        for interval in hits:
                            output_data["edge"] += 1
                            output_data[interval.data + "_edge"] += 1
                        continue

                    result_parsed = hits[0]
                    result_parsed_list = list(result_parsed)

                    if kmer_occurence >= (result_parsed.begin - 1) and \
                            (kmer_occurence + kmer_length) <= result_parsed.end:
                        output_data[result_parsed.data] += 1
                        kmer_coords[kmer].append("\t".join([
                            chr_name,
                            str(result_parsed_list[self.INTERVAL_FROM]),
                            str(result_parsed_list[self.INTERVAL_TO]),
                            f"{kmer};{result_parsed_list[self.INTERVAL_MITE_NAME]}"
                        ]))
                    else:
                        output_data["edge"] += 1
                        output_data[result_parsed.data + "_edge"] += 1

                row = [kmer, str(len(kmer_occurences))]
                row.extend(str(output_data[name]) for name in sorted_mite_names)
                row.append(str(output_data["edge"]))
                row.append(str(output_data["genome"]))
                output.write("\t".join(row) + "\n")

        timer.stopp()

        self.write_kmer_coords_to_file(chr_name, kmer_coords)
Exemplo n.º 13
0
 def stopp(self):
     """Record the stop time and report how long the k-mer processing took."""
     self.stop = time.time()
     elapsed = self.stop - self.start
     message = f"Processing kmers finished in {elapsed:.2f} sek"
     print_info(message, self.worker_name)
Exemplo n.º 14
0
 def save_stats_to_file(self, filename):
     """Write ``self.data`` to ``filename`` as a tab-separated file."""
     base_name = os.path.basename(filename)
     print_info(f"Saving data to file '{base_name}'")
     self.data.to_csv(filename, sep='\t')