Example #1
    params_group.add_argument('-M',
                              '--memory',
                              type=int,
                              dest="memory",
                              required=False,
                              default=8,
                              help='Max memory to use')

    #argcomplete.autocomplete(parser)
    arguments = parser.parse_args()

    return arguments


args = get_arguments()

sample_list_F = file_to_list(args.sample_list)
print("\n%d samples will be analysed: %s" %
      (len(sample_list_F), ",".join(sample_list_F)))
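# file_to_list is assumed to return one sample name per non-empty line of
# the supplied file; a minimal sketch of such a helper (hypothetical, for
# illustration only):
#   def file_to_list(file_path):
#       with open(file_path) as handle:
#           return [line.strip() for line in handle if line.strip()]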

######################################################################
#####################START PIPELINE###################################
######################################################################
output = os.path.abspath(args.output)
group_name = args.input.split("/")[-1].split(".")[0]
out_vcf_dir = os.path.join(args.output, "VCF")
check_create_dir(out_vcf_dir)
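# check_create_dir presumably creates the folder only when it is missing,
# i.e. roughly os.makedirs(path, exist_ok=True) (assumption).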

output_vcf_file = os.path.abspath(args.input)

base_input = os.path.basename(args.input)
Example #2
def main():
    """
    Create main function to capture code errors: https://stackoverflow.com/questions/6234405/logging-uncaught-exceptions-in-python
    """

    args = get_arguments()

    ######################################################################
    #####################START PIPELINE###################################
    ######################################################################
    output = os.path.abspath(args.output)
    group_name = output.split("/")[-1]
    reference = os.path.abspath(args.reference)
    #annotation = os.path.abspath(args.annotation)

    # LOGGING
    # Create log file with date and time
    right_now = str(datetime.datetime.now())
    right_now_full = "_".join(right_now.split(" "))
    log_filename = group_name + "_" + right_now_full + ".log"
    log_folder = os.path.join(output, 'Logs')
    check_create_dir(log_folder)
    log_full_path = os.path.join(log_folder, log_filename)

    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)

    formatter = logging.Formatter('%(asctime)s:%(message)s')

    file_handler = logging.FileHandler(log_full_path)
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(formatter)

    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.INFO)
    # stream_handler.setFormatter(formatter)

    logger.addHandler(stream_handler)
    logger.addHandler(file_handler)

    logger.info("\n\n" + BLUE + BOLD +
                "STARTING PIPELINE IN GROUP: " + group_name + END_FORMATTING)

    today = str(datetime.date.today())

    logger.info("ARGUMENTS:")
    logger.info(str(args))

    # Obtain all R1 and R2 from folder
    r1, r2 = extract_read_list(args.input_dir)

    # Check if there are samples to filter out
    sample_list_F = []
    if args.sample_list is None:
        logger.info("\n" + "No samples to filter")
        for r1_file, r2_file in zip(r1, r2):
            sample = extract_sample(r1_file, r2_file)
            sample_list_F.append(sample)
    else:
        logger.info("Samples will be filtered")
        sample_list_F = file_to_list(args.sample_list)
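    # extract_sample is assumed to derive the name shared by an R1/R2 pair,
    # e.g. by stripping the _R1/_R2 read suffix; a hypothetical sketch:
    #   def extract_sample(r1, r2):
    #       return re.split(r'_R[12]', os.path.basename(r1))[0]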

    new_samples = check_reanalysis(args.output, sample_list_F)

    logger.info("\n%d samples will be analysed: %s" %
                (len(sample_list_F), ",".join(sample_list_F)))
    logger.info("\n%d NEW samples will be analysed: %s" %
                (len(new_samples), ",".join(new_samples)))
    #DECLARE FOLDERS CREATED IN PIPELINE ################
    #AND KEY FILES ######################################
    #####################################################
    # Annotation related parameters
    #script_dir = os.path.dirname(os.path.realpath(__file__))

    # Output related
    out_qc_dir = os.path.join(output, "Quality")
    out_qc_pre_dir = os.path.join(out_qc_dir, "raw")  # subfolder
    out_variant_dir = os.path.join(output, "Variants")
    out_core_dir = os.path.join(output, "Core")

    out_stats_dir = os.path.join(output, "Stats")
    out_stats_bamstats_dir = os.path.join(
        out_stats_dir, "Bamstats")  # subfolder
    out_stats_coverage_dir = os.path.join(
        out_stats_dir, "Coverage")  # subfolder
    out_compare_dir = os.path.join(output, "Compare")

    out_annot_dir = os.path.join(output, "Annotation")
    out_annot_snpeff_dir = os.path.join(out_annot_dir, "snpeff")  # subfolder
    out_annot_user_dir = os.path.join(out_annot_dir, "user")  # subfolder
    out_annot_user_aa_dir = os.path.join(out_annot_dir, "user_aa")  # subfolder
    out_annot_blast_dir = os.path.join(out_annot_dir, "blast")  # subfolder

    out_species_dir = os.path.join(output, "Species")
    new_sample_number = 0
    for r1_file, r2_file in zip(r1, r2):
        # Extract sample name
        sample = extract_sample(r1_file, r2_file)
        args.sample = sample
        if sample in sample_list_F:
            # VARIANT SAMPLE DIR
            sample_variant_dir = os.path.join(out_variant_dir, sample)

            sample_number = str(sample_list_F.index(sample) + 1)
            sample_total = str(len(sample_list_F))
            if sample in new_samples:
                new_sample_number = str(int(new_sample_number) + 1)
                new_sample_total = str(len(new_samples))
                logger.info("\n" + WHITE_BG + "STARTING SAMPLE: " + sample +
                            " (" + sample_number + "/" + sample_total + ")" + " (" + new_sample_number + "/" + new_sample_total + ")" + END_FORMATTING)
            else:
                logger.info("\n" + WHITE_BG + "STARTING SAMPLE: " + sample +
                            " (" + sample_number + "/" + sample_total + ")" + END_FORMATTING)

            output_final_vcf = os.path.join(
                sample_variant_dir, 'snps.all.ivar.tsv')

            if not os.path.isfile(output_final_vcf):

                ##############START PIPELINE#####################
                #################################################

                # INPUT ARGUMENTS
                ################
                # check_file_exists(r1_file)
                # check_file_exists(r2_file)

                args.output = os.path.abspath(args.output)
                check_create_dir(args.output)

                # QUALITY CHECK in RAW with fastqc
                ######################################################
                check_create_dir(out_qc_dir)

                out_qc_raw_name_r1 = (".").join(r1_file.split(
                    '/')[-1].split('.')[0:-2]) + '_fastqc.html'
                out_qc_raw_name_r2 = (".").join(r2_file.split(
                    '/')[-1].split('.')[0:-2]) + '_fastqc.html'
                output_qc_raw_file_r1 = os.path.join(
                    out_qc_pre_dir, out_qc_raw_name_r1)
                output_qc_raw_file_r2 = os.path.join(
                    out_qc_pre_dir, out_qc_raw_name_r2)

                if os.path.isfile(output_qc_raw_file_r1) and os.path.isfile(output_qc_raw_file_r2):
                    logger.info(YELLOW + DIM + output_qc_raw_file_r1 +
                                " EXISTS\nOmitting QC for sample " + sample + END_FORMATTING)
                else:
                    logger.info(
                        GREEN + "Checking quality in sample " + sample + END_FORMATTING)
                    logger.info("R1: " + r1_file + "\nR2: " + r2_file)
                    fastqc_quality(r1_file, r2_file,
                                   out_qc_pre_dir, args.threads)

                """
                TODO: Human filter
                """

                # VARIANT CALLING WITH SNIPPY
                ###################################################

                output_vcf_sub = os.path.join(
                    sample_variant_dir, "snps.subs.vcf")
                output_vcf = os.path.join(sample_variant_dir, "snps.vcf")

                if os.path.isfile(output_vcf_sub) and os.path.isfile(output_vcf):
                    logger.info(YELLOW + DIM + output_vcf +
                                " EXISTS\nOmitting Variant calling in " + sample + END_FORMATTING)
                else:
                    logger.info(
                        GREEN + "Calling variants with snippy " + sample + END_FORMATTING)
                    run_snippy(r1_file, r2_file, reference, out_variant_dir, sample,
                               threads=args.threads, minqual=10, minfrac=0.1, mincov=1)
                    old_bam = os.path.join(sample_variant_dir, "snps.bam")
                    old_bai = os.path.join(sample_variant_dir, "snps.bam.bai")
                    new_bam = os.path.join(sample_variant_dir, sample + ".bam")
                    new_bai = os.path.join(
                        sample_variant_dir, sample + ".bam.bai")
                    os.rename(old_bam, new_bam)
                    os.rename(old_bai, new_bai)
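                    # run_snippy is assumed to wrap the snippy CLI, so the
                    # call above would correspond roughly to (hypothetical
                    # reconstruction):
                    #   snippy --cpus <threads> --outdir <out_variant_dir>/<sample> \
                    #     --ref <reference> --R1 <r1_file> --R2 <r2_file> \
                    #     --minqual 10 --minfrac 0.1 --mincov 1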

                #VARIANT FORMAT COMBINATION (REMOVE COMPLEX) ########
                #####################################################
                out_variant_indel_sample = os.path.join(
                    sample_variant_dir, "snps.indel.vcf")
                out_variant_all_sample = os.path.join(
                    sample_variant_dir, "snps.all.vcf")

                if os.path.isfile(out_variant_indel_sample):
                    logger.info(YELLOW + DIM + out_variant_indel_sample +
                                " EXISTS\nOmitting indel filtering in sample " + sample + END_FORMATTING)
                else:
                    logger.info(GREEN + "Filtering INDELS in " +
                                sample + END_FORMATTING)
                    extract_indels(output_vcf)

                if os.path.isfile(out_variant_all_sample):
                    logger.info(YELLOW + DIM + out_variant_all_sample +
                                " EXISTS\nOmitting vcf combination in sample " + sample + END_FORMATTING)
                else:
                    logger.info(GREEN + "Combining vcf in " +
                                sample + END_FORMATTING)
                    merge_vcf(output_vcf_sub, out_variant_indel_sample)

                #VARIANT FORMAT ADAPTATION TO IVAR ##################
                #####################################################
                out_variant_tsv_file = os.path.join(
                    sample_variant_dir, 'snps.all.ivar.tsv')

                if os.path.isfile(out_variant_tsv_file):
                    logger.info(YELLOW + DIM + out_variant_tsv_file +
                                " EXISTS\nOmitting format adaptation for sample " + sample + END_FORMATTING)
                else:
                    logger.info(
                        GREEN + "Adapting variants format in sample " + sample + END_FORMATTING)
                    prior = datetime.datetime.now()
                    vcf_to_ivar_tsv(out_variant_all_sample,
                                    out_variant_tsv_file)
                    after = datetime.datetime.now()
                    print(("Done with function in: %s" % (after - prior)))

            # SPECIES DETERMINATION
            ###################################################
            check_create_dir(out_species_dir)

            output_species = os.path.join(
                out_species_dir, sample + ".screen.tab")

            if os.path.isfile(output_species):
                logger.info(YELLOW + DIM + output_species +
                            " EXISTS\nOmitting Species determination in " + sample + END_FORMATTING)
            else:
                logger.info(
                    GREEN + "Determining species in " + sample + END_FORMATTING)
                mash_screen(r1_file, out_species_dir, r2_file=r2_file, winner=True, threads=args.threads,
                            mash_database=args.mash_database)
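                # mash_screen presumably shells out to mash, along the lines
                # of (hypothetical reconstruction; -w = winner-take-all):
                #   mash screen -w -p <threads> <mash_database> <r1> <r2> \
                #     > <sample>.screen.tab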

            ########################CREATE STATS AND QUALITY FILTERS########################################################################
            ################################################################################################################################
            #CREATE Bamstats#######################################
            #######################################################
            check_create_dir(out_stats_dir)
            check_create_dir(out_stats_bamstats_dir)
            out_bamstats_name = sample + ".bamstats"
            out_bamstats_file = os.path.join(
                out_stats_bamstats_dir, out_bamstats_name)
            bam_sample_file = os.path.join(sample_variant_dir, sample + ".bam")

            if os.path.isfile(out_bamstats_file):
                logger.info(YELLOW + DIM + out_bamstats_file +
                            " EXISTS\nOmitting Bamstats for sample " + sample + END_FORMATTING)
            else:
                logger.info(GREEN + "Creating bamstats in sample " +
                            sample + END_FORMATTING)
                create_bamstat(
                    bam_sample_file, out_stats_bamstats_dir, sample, threads=args.threads)
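                # create_bamstat is assumed to run samtools flagstat, e.g.
                # (assumption):
                #   samtools flagstat -@ <threads> <sample>.bam > <sample>.bamstats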

            #CREATE Coverage#######################################
            #######################################################
            check_create_dir(out_stats_coverage_dir)
            out_coverage_name = sample + ".cov"
            out_coverage_file = os.path.join(
                out_stats_coverage_dir, out_coverage_name)

            if os.path.isfile(out_coverage_file):
                logger.info(YELLOW + DIM + out_coverage_file +
                            " EXISTS\nOmitting coverage for sample " + sample + END_FORMATTING)
            else:
                logger.info(GREEN + "Creating coverage in sample " +
                            sample + END_FORMATTING)
                create_coverage(bam_sample_file,
                                out_stats_coverage_dir, sample)
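                # create_coverage is assumed to compute per-position depth,
                # e.g. samtools depth -aa <sample>.bam > <sample>.cov (assumption).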

    # coverage OUTPUT SUMMARY
    ######################################################
    prior_recal = datetime.datetime.now()
    logger.info(GREEN + "Creating summary report for coverage result in group " +
                group_name + END_FORMATTING)
    obtain_group_cov_stats(out_stats_dir, group_name)
    after_recal = datetime.datetime.now()
    logger.info("Done with report for coverage: %s" %
                (after_recal - prior_recal))

    # READS and VARIANTS OUTPUT SUMMARY
    ######################################################
    logger.info(GREEN + "Creating overal summary report in group " +
                group_name + END_FORMATTING)
    obtain_overal_stats(output, group_name)

    # REMOVE UNCOVERED
    ##############################################################################################################################
    logger.info(GREEN + "Removing low quality samples in group " +
                group_name + END_FORMATTING)
    uncovered_samples = remove_low_quality(
        output, min_coverage=args.coverage20, min_hq_snp=args.min_snp, type_remove='Uncovered')

    if len(uncovered_samples) > 0:
        logger.info(GREEN + "Uncovered samples: " +
                    (",").join(uncovered_samples) + END_FORMATTING)
    else:
        logger.info(GREEN + "NO uncovered samples found" + END_FORMATTING)

    # RUN SNIPPY CORE
    ##############################################################################################################################
    if args.core:
        check_create_dir(out_core_dir)
        logger.info(GREEN + "Running snippy-core " +
                    group_name + END_FORMATTING)
        run_snippy_core(out_variant_dir, out_core_dir, reference)
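        # run_snippy_core presumably wraps the snippy-core CLI, roughly
        # (hypothetical reconstruction, pointed at each sample folder):
        #   snippy-core --ref <reference> <out_variant_dir>/<sample1> ...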

        logger.info(GREEN + "Adapting core-snp to compare format " +
                    group_name + END_FORMATTING)
        core_vcf_file = os.path.join(out_core_dir, "core.vcf")
        core_vcf_file_adapted = os.path.join(
            out_core_dir, "core.vcf.adapted.tsv")
        core_vcf_file_removed = os.path.join(
            out_core_dir, "core.vcf.adapted.final.tsv")

        core_vcf_df_adapted = import_VCF4_core_to_compare(core_vcf_file)
        core_vcf_df_adapted.to_csv(
            core_vcf_file_adapted, sep="\t", index=False)

        logger.info(GREEN + "Obtaining clustered positions " +
                    group_name + END_FORMATTING)

        close_positions_list = extract_close_snps(
            core_vcf_df_adapted, snps_in_10=1)
        logger.info(GREEN + "Obtaining uncovered positions " +
                    group_name + END_FORMATTING)
        uncovered_list = identify_uncovered(
            out_stats_coverage_dir, min_coverage=10, nocall_fr=0.5)

        logger.debug('Clustered positions in core SNP:\n{}'.format(
            (",".join([str(x) for x in close_positions_list]))))
        logger.debug('Uncovered positions in all samples:\n{}'.format(
            (",".join([str(x) for x in uncovered_list]))))

        to_remove_list = close_positions_list + uncovered_list

        remove_df = remove_position_from_compare(
            core_vcf_df_adapted, to_remove_list)
        remove_df.to_csv(core_vcf_file_removed, sep="\t", index=False)

        ddtb_compare(core_vcf_file_removed, distance=10)

    #ANNOTATION WITH SNPEFF AND USER INPUT ##############
    #####################################################
    logger.info("\n\n" + BLUE + BOLD + "STARTING ANNOTATION IN GROUP: " +
                group_name + END_FORMATTING + "\n")
    check_create_dir(out_annot_dir)
    check_create_dir(out_annot_snpeff_dir)
    # SNPEFF
    if args.snpeff_database:
        for root, _, files in os.walk(out_variant_dir):
            for name in files:
                if name == 'snps.all.vcf':
                    sample = root.split('/')[-1]
                    filename = os.path.join(root, name)
                    chrom_filename = os.path.join(
                        root, 'snps.all.chromosome.vcf')
                    out_annot_file = os.path.join(
                        out_annot_snpeff_dir, sample + ".annot")
                    if os.path.isfile(out_annot_file):
                        logger.info(YELLOW + DIM + out_annot_file +
                                    " EXISTS\nOmitting snpEff Annotation for sample " + sample + END_FORMATTING)
                    else:
                        logger.info(
                            GREEN + "Annotating sample with snpEff: " + sample + END_FORMATTING)
                        rename_reference_snpeff(filename, chrom_filename)
                        annotate_snpeff(chrom_filename, out_annot_file,
                                        database=args.snpeff_database)
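                        # annotate_snpeff presumably wraps the snpEff CLI,
                        # roughly (hypothetical reconstruction):
                        #   snpEff -noStats <snpeff_database> <chrom_filename> \
                        #     > <out_annot_file>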
    else:
        logger.info(YELLOW + DIM + " No SnpEff database suplied, skipping annotation in group " +
                    group_name + END_FORMATTING)
    # USER DEFINED
    if not args.annot_bed and not args.annot_vcf:
        logger.info(
            YELLOW + BOLD + "Omitting User Annotation, no BED or VCF files supplied" + END_FORMATTING)
    else:
        check_create_dir(out_annot_user_dir)
        for root, _, files in os.walk(out_variant_dir):
            for name in files:
                if name == 'snps.all.ivar.tsv':
                    sample = root.split('/')[-1]
                    logger.info(
                        'User bed/vcf annotation in sample {}'.format(sample))
                    filename = os.path.join(root, name)
                    out_annot_file = os.path.join(
                        out_annot_user_dir, sample + ".tsv")
                    user_annotation(
                        filename, out_annot_file, vcf_files=args.annot_vcf, bed_files=args.annot_bed)

    # USER AA DEFINED
    if not args.annot_aa:
        logger.info(
            YELLOW + BOLD + "Omitting User aa Annotation, no AA files supplied" + END_FORMATTING)
    else:
        check_create_dir(out_annot_user_aa_dir)
        for root, _, files in os.walk(out_annot_snpeff_dir):
            if root == out_annot_snpeff_dir:
                for name in files:
                    if name.endswith('.annot'):
                        sample = name.split('.')[0]
                        logger.info(
                            'User aa annotation in sample {}'.format(sample))
                        filename = os.path.join(root, name)
                        out_annot_aa_file = os.path.join(
                            out_annot_user_aa_dir, sample + ".tsv")
                        if os.path.isfile(out_annot_aa_file):
                            user_annotation_aa(
                                out_annot_aa_file, out_annot_aa_file, aa_files=args.annot_aa)
                        else:
                            user_annotation_aa(
                                filename, out_annot_aa_file, aa_files=args.annot_aa)
    # USER FASTA ANNOTATION
    if not args.annot_fasta:
        logger.info(
            YELLOW + BOLD + "Omitting User FASTA Annotation, no FASTA files supplied" + END_FORMATTING)
    else:
        check_create_dir(out_annot_blast_dir)
        for root, _, files in os.walk(out_variant_dir):
            for name in files:
                if name.endswith('.consensus.subs.fa'):
                    filename = os.path.join(root, name)
                    sample = root.split('/')[-1]
                    logger.info(
                        'User FASTA annotation in sample {}'.format(sample))
                    # out_annot_aa_file = os.path.join(
                    #    out_annot_user_aa_dir, sample + ".tsv")
                    for db in args.annot_fasta:
                        make_blast(filename, db, sample, out_annot_blast_dir,
                                   db_type="nucl", query_type="nucl", evalue=0.0001, threads=8)

    # USER AA TO HTML
    if not args.annot_aa:
        logger.info(
            YELLOW + BOLD + "Omitting User aa Annotation to HTML, no AA files supplied" + END_FORMATTING)
    else:
        annotated_samples = []
        logger.info('Adapting annotation to html in {}'.format(group_name))
        for root, _, files in os.walk(out_annot_user_aa_dir):
            if root == out_annot_user_aa_dir:
                for name in files:
                    if name.endswith('.tsv'):
                        sample = name.split('.')[0]
                        annotated_samples.append(sample)
                        filename = os.path.join(root, name)
                        annotation_to_html(filename, sample)
        annotated_samples = [str(x) for x in annotated_samples]
        report_samples_html_all = report_samples_html.replace(
            'ALLSAMPLES', ('","').join(annotated_samples))  # NEW
        with open(os.path.join(out_annot_user_aa_dir, '00_all_samples.html'), 'w+') as f:
            f.write(report_samples_html_all)

    # SNP COMPARISON using tsv variant files
    ######################################################
    logger.info("\n\n" + BLUE + BOLD + "STARTING COMPARISON IN GROUP: " +
                group_name + END_FORMATTING + "\n")

    check_create_dir(out_compare_dir)
    folder_compare = today + "_" + group_name
    path_compare = os.path.join(out_compare_dir, folder_compare)
    check_create_dir(path_compare)
    full_path_compare = os.path.join(path_compare, group_name)

    compare_snp_matrix_recal = full_path_compare + ".revised.final.tsv"
    compare_snp_matrix_recal_intermediate = full_path_compare + ".revised_intermediate.tsv"
    compare_snp_matrix_recal_mpileup = full_path_compare + \
        ".revised_intermediate_vcf.tsv"
    compare_snp_matrix_INDEL_intermediate = full_path_compare + \
        ".revised_INDEL_intermediate.tsv"

    # Create intermediate

    recalibrated_snp_matrix_intermediate = ddbb_create_intermediate(
        out_variant_dir, out_stats_coverage_dir, min_freq_discard=0.1, min_alt_dp=10, only_snp=False)
    # recalibrated_snp_matrix_intermediate.to_csv(
    #     compare_snp_matrix_recal_intermediate, sep="\t", index=False)

    # Remove SNPs from BED file (PE/PPE)

    if args.remove_bed:
        recalibrated_snp_matrix_intermediate = remove_bed_positions(
            recalibrated_snp_matrix_intermediate, args.remove_bed)

    recalibrated_snp_matrix_intermediate.to_csv(
        compare_snp_matrix_recal_intermediate, sep="\t", index=False)

    # Recalibrate intermediate with VCF

    prior_recal = datetime.datetime.now()
    recalibrated_snp_matrix_mpileup = recalibrate_ddbb_vcf_intermediate(
        compare_snp_matrix_recal_intermediate, out_variant_dir, min_cov_low_freq=10)
    recalibrated_snp_matrix_mpileup.to_csv(
        compare_snp_matrix_recal_mpileup, sep="\t", index=False)

    after_recal = datetime.datetime.now()
    logger.debug("Done with recalibration vcf: %s" %
                 (after_recal - prior_recal))

    # Remove SNPs located within INDELs

    compare_snp_matrix_INDEL_intermediate_df = remove_position_range(
        recalibrated_snp_matrix_mpileup)
    compare_snp_matrix_INDEL_intermediate_df.to_csv(
        compare_snp_matrix_INDEL_intermediate, sep="\t", index=False)

    # Extract all positions marked as complex
    complex_variants = extract_complex_list(out_variant_dir)
    logger.debug('Complex positions in all samples:\n{}'.format(
        (",".join([str(x) for x in complex_variants]))))

    # Clean all faulty positions and samples => Final table

    recalibrated_revised_INDEL_df = revised_df(compare_snp_matrix_INDEL_intermediate_df,
                                               path_compare,
                                               complex_pos=complex_variants,
                                               min_freq_include=0.8,
                                               min_threshold_discard_uncov_sample=args.min_threshold_discard_uncov_sample,
                                               min_threshold_discard_uncov_pos=args.min_threshold_discard_uncov_pos,
                                               min_threshold_discard_htz_sample=args.min_threshold_discard_htz_sample,
                                               min_threshold_discard_htz_pos=args.min_threshold_discard_htz_pos,
                                               min_threshold_discard_all_pos=args.min_threshold_discard_all_pos,
                                               min_threshold_discard_all_sample=args.min_threshold_discard_all_sample,
                                               remove_faulty=True,
                                               drop_samples=True,
                                               drop_positions=True,
                                               windows_size_discard=args.window)
    recalibrated_revised_INDEL_df.to_csv(
        compare_snp_matrix_recal, sep="\t", index=False)

    # Matrix to pairwise distances and newick (nwk) tree

    ddtb_compare(compare_snp_matrix_recal, distance=5)

    logger.info("\n\n" + MAGENTA + BOLD + "COMPARING FINISHED IN GROUP: " +
                group_name + END_FORMATTING + "\n")

    logger.info("\n\n" + MAGENTA + BOLD +
                "#####END OF PIPELINE AUTOSNIPPY ANALYSIS#####" + END_FORMATTING + "\n")
Example #3
def main():
    """
    Create main function to capture code errors: https://stackoverflow.com/questions/6234405/logging-uncaught-exceptions-in-python
    """

    # ARGUMENTS

    def get_arguments():

        parser = argparse.ArgumentParser(
            prog='covidma.py',
            description='Pipeline to call variants (SNVs) in any non-model '
            'organism, specialised in SARS-CoV-2')

        input_group = parser.add_argument_group('Input', 'Input parameters')

        input_group.add_argument(
            '-i',
            '--input',
            dest="input_dir",
            metavar="input_directory",
            type=str,
            required=True,
            help='REQUIRED. Input directory containing all fast[aq] files')
        input_group.add_argument('-r',
                                 '--reference',
                                 metavar="reference",
                                 type=str,
                                 required=True,
                                 help='REQUIRED. File to map against')
        input_group.add_argument(
            '-a',
            '--annotation',
            metavar="annotation",
            type=str,
            required=True,
            help='REQUIRED. gff3 file to annotate variants')
        input_group.add_argument('-s',
                                 '--sample',
                                 metavar="sample",
                                 type=str,
                                 required=False,
                                 help='Sample to identify further files')
        input_group.add_argument(
            '-L',
            '--sample_list',
            type=str,
            required=False,
            help='File with sample names; restrict the analysis to these samples')
        input_group.add_argument(
            '-p',
            '--primers',
            type=str,
            default=
            '/home/laura/DATABASES/Anotacion/COVID/primers/nCoV-2019.bed',
            required=False,
            help='Bed file including primers to trim')

        quality_group = parser.add_argument_group(
            'Quality parameters', 'parameters for different trimming conditions')

        quality_group.add_argument(
            '-c',
            '--coverage20',
            type=int,
            default=90,
            required=False,
            help='Minimum percentage of coverage at 20x to classify as '
            'uncovered (default 90)')
        quality_group.add_argument('-n',
                                   '--min_snp',
                                   type=int,
                                   required=False,
                                   default=1,
                                   help='Minimum number of HQ SNPs to pass the quality threshold')

        output_group = parser.add_argument_group(
            'Output', 'Required parameter to output results')

        output_group.add_argument(
            '-o',
            '--output',
            type=str,
            required=True,
            help='REQUIRED. Output directory to extract all results')
        output_group.add_argument(
            '-C',
            '--noclean',
            required=False,
            action='store_false',
            help='Do not clean unwanted files after execution')

        params_group = parser.add_argument_group(
            'Parameters', 'parameters for different stringency conditions')

        params_group.add_argument('-T',
                                  '--threads',
                                  type=int,
                                  dest="threads",
                                  required=False,
                                  default=16,
                                  help='Threads to use')
        params_group.add_argument('-M',
                                  '--memory',
                                  type=int,
                                  dest="memory",
                                  required=False,
                                  default=32,
                                  help='Max memory to use')

        annot_group = parser.add_argument_group(
            'Annotation', 'parameters for variant annotation')

        annot_group.add_argument('-B',
                                 '--annot_bed',
                                 type=str,
                                 default=[],
                                 required=False,
                                 action='append',
                                 help='bed file to annotate')
        annot_group.add_argument('-V',
                                 '--annot_vcf',
                                 type=str,
                                 default=[],
                                 required=False,
                                 action='append',
                                 help='vcf file to annotate')
        annot_group.add_argument('-A',
                                 '--annot_aa',
                                 type=str,
                                 default=[],
                                 required=False,
                                 action='append',
                                 help='amino acid file to annotate')
        annot_group.add_argument('-R',
                                 '--remove_bed',
                                 type=str,
                                 default=False,
                                 required=False,
                                 help='BED file with positions to remove')
        annot_group.add_argument(
            '--mash_database',
            type=str,
            required=False,
            default=False,
            help='MASH NCBI species database used for species screening')
        annot_group.add_argument('--snpeff_database',
                                 type=str,
                                 required=False,
                                 default='NC_045512.2',
                                 help='snpEFF annotation database')

        compare_group = parser.add_argument_group(
            'Compare', 'parameters for compare_snp')

        compare_group.add_argument('-S',
                                   '--only_snp',
                                   required=False,
                                   action='store_true',
                                   help='Compare using only SNPs, ignoring INDELs')

        arguments = parser.parse_args()

        return arguments

    args = get_arguments()

    ######################################################################
    #####################START PIPELINE###################################
    ######################################################################
    output = os.path.abspath(args.output)
    group_name = output.split("/")[-1]
    reference = os.path.abspath(args.reference)
    annotation = os.path.abspath(args.annotation)

    # LOGGING
    # Create log file with date and time
    right_now = str(datetime.datetime.now())
    right_now_full = "_".join(right_now.split(" "))
    log_filename = group_name + "_" + right_now_full + ".log"
    log_folder = os.path.join(output, 'Logs')
    check_create_dir(log_folder)
    log_full_path = os.path.join(log_folder, log_filename)

    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)

    formatter = logging.Formatter('%(asctime)s:%(message)s')

    file_handler = logging.FileHandler(log_full_path)
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(formatter)

    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.INFO)
    # stream_handler.setFormatter(formatter)

    logger.addHandler(stream_handler)
    logger.addHandler(file_handler)

    logger.info("\n\n" + BLUE + BOLD + "STARTING PIPELINE IN GROUP: " +
                group_name + END_FORMATTING)

    today = str(datetime.date.today())

    logger.info("ARGUMENTS:")
    logger.info(str(args))

    # Obtain all R1 and R2 from folder
    r1, r2 = extract_read_list(args.input_dir)

    # Check if there are samples to filter out
    sample_list_F = []
    if args.sample_list is None:
        logger.info("\n" + "No samples to filter")
        for r1_file, r2_file in zip(r1, r2):
            sample = extract_sample(r1_file, r2_file)
            sample_list_F.append(sample)
    else:
        logger.info("Samples will be filtered")
        sample_list_F = file_to_list(args.sample_list)

    new_samples = check_reanalysis(args.output, sample_list_F)

    logger.info("\n%d samples will be analysed: %s" %
                (len(new_samples), ",".join(new_samples)))

    #PREPARE REFERENCE FOR MAPPING + FAI + DICT #########
    #####################################################

    # picard_dictionary(args)
    samtools_faidx(args)
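    # samtools_faidx is assumed to index the reference for mapping,
    # i.e. roughly `samtools faidx <reference>`, producing <reference>.fai.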

    #DECLARE FOLDERS CREATED IN PIPELINE ################
    #AND KEY FILES ######################################
    #####################################################
    # Annotation related parameters
    # script_dir = os.path.dirname(os.path.realpath(__file__))

    # Output related
    out_qc_dir = os.path.join(output, "Quality")
    out_qc_pre_dir = os.path.join(out_qc_dir, "raw")  # subfolder
    out_qc_post_dir = os.path.join(out_qc_dir, "processed")  # subfolder
    out_trim_dir = os.path.join(output, "Trimmed")
    out_map_dir = os.path.join(output, "Bam")
    out_variant_dir = os.path.join(output, "Variants")
    out_variant_ivar_dir = os.path.join(out_variant_dir,
                                        "ivar_raw")  # subfolder
    out_filtered_ivar_dir = os.path.join(out_variant_dir,
                                         "ivar_filtered")  # subfolder
    out_consensus_dir = os.path.join(output, "Consensus")
    out_consensus_ivar_dir = os.path.join(out_consensus_dir,
                                          "ivar")  # subfolder

    out_stats_dir = os.path.join(output, "Stats")
    out_stats_bamstats_dir = os.path.join(out_stats_dir,
                                          "Bamstats")  # subfolder
    out_stats_coverage_dir = os.path.join(out_stats_dir,
                                          "Coverage")  # subfolder
    out_compare_dir = os.path.join(output, "Compare")

    out_annot_dir = os.path.join(output, "Annotation")
    out_annot_snpeff_dir = os.path.join(out_annot_dir, "snpeff")  # subfolder
    out_annot_pangolin_dir = os.path.join(out_annot_dir,
                                          "pangolin")  # subfolder
    out_annot_user_dir = os.path.join(out_annot_dir, "user")  # subfolder
    out_annot_user_aa_dir = os.path.join(out_annot_dir, "user_aa")  # subfolder

    new_sample_number = 0

    for r1_file, r2_file in zip(r1, r2):
        # Extract sample name
        sample = extract_sample(r1_file, r2_file)
        args.sample = sample
        if sample in sample_list_F:

            sample_number = str(sample_list_F.index(sample) + 1)
            sample_total = str(len(sample_list_F))

            out_markdup_trimmed_name = sample + ".rg.markdup.trimmed.sorted.bam"
            output_markdup_trimmed_file = os.path.join(
                out_map_dir, out_markdup_trimmed_name)

            if sample in new_samples:
                new_sample_number = str(int(new_sample_number) + 1)
                new_sample_total = str(len(new_samples))
                logger.info("\n" + WHITE_BG + "STARTING SAMPLE: " + sample +
                            " (" + sample_number + "/" + sample_total + ")" +
                            " (" + new_sample_number + "/" + new_sample_total +
                            ")" + END_FORMATTING)
            else:
                logger.info("\n" + WHITE_BG + "STARTING SAMPLE: " + sample +
                            " (" + sample_number + "/" + sample_total + ")" +
                            END_FORMATTING)

            if not os.path.isfile(output_markdup_trimmed_file):

                args.r1_file = r1_file
                args.r2_file = r2_file

                ##############START PIPELINE#####################
                #################################################

                # INPUT ARGUMENTS
                ################
                check_file_exists(r1_file)
                check_file_exists(r2_file)

                args.output = os.path.abspath(args.output)
                check_create_dir(args.output)

                # QUALITY CHECK in RAW with fastqc
                ######################################################
                check_create_dir(out_qc_dir)

                out_qc_raw_name_r1 = (".").join(
                    r1_file.split('/')[-1].split('.')[0:-2]) + '_fastqc.html'
                out_qc_raw_name_r2 = (".").join(
                    r2_file.split('/')[-1].split('.')[0:-2]) + '_fastqc.html'
                output_qc_raw_file_r1 = os.path.join(out_qc_pre_dir,
                                                     out_qc_raw_name_r1)
                output_qc_raw_file_r2 = os.path.join(out_qc_pre_dir,
                                                     out_qc_raw_name_r2)

                if os.path.isfile(output_qc_raw_file_r1) and os.path.isfile(
                        output_qc_raw_file_r2):
                    logger.info(YELLOW + DIM + output_qc_raw_file_r1 +
                                " EXISTS\nOmitting QC for sample " + sample +
                                END_FORMATTING)
                else:
                    logger.info(GREEN + "Checking quality in sample " +
                                sample + END_FORMATTING)
                    logger.info("R1: " + r1_file + "\nR2: " + r2_file)
                    fastqc_quality(r1_file, r2_file, out_qc_pre_dir,
                                   args.threads)
                """
                TODO: Human filter
                """

                # QUALITY TRIMMING AND ADAPTER REMOVAL WITH fastp
                ###################################################
                out_trim_name_r1 = sample + ".trimmed_R1.fastq.gz"
                out_trim_name_r2 = sample + ".trimmed_R2.fastq.gz"
                output_trimming_file_r1 = os.path.join(out_trim_dir,
                                                       out_trim_name_r1)
                output_trimming_file_r2 = os.path.join(out_trim_dir,
                                                       out_trim_name_r2)

                if os.path.isfile(output_trimming_file_r1) and os.path.isfile(
                        output_trimming_file_r2):
                    logger.info(YELLOW + DIM + output_trimming_file_r1 +
                                " EXISTS\nOmitting Trimming for sample " +
                                sample + END_FORMATTING)
                else:
                    logger.info(GREEN + "Trimming sample " + sample +
                                END_FORMATTING)
                    fastp_trimming(r1_file,
                                   r2_file,
                                   sample,
                                   out_trim_dir,
                                   threads=args.threads,
                                   min_qual=20,
                                   window_size=10,
                                   min_len=35)
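                    # fastp_trimming presumably maps onto the fastp CLI,
                    # roughly (hypothetical reconstruction):
                    #   fastp --in1 <r1> --in2 <r2> --out1 <out_R1> --out2 <out_R2> \
                    #     --cut_tail --cut_window_size 10 --cut_mean_quality 20 \
                    #     --length_required 35 --thread <threads>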

                # QUALITY CHECK in TRIMMED with fastqc
                ######################################################
                check_create_dir(out_qc_dir)

                out_qc_pos_r1 = sample + ".trimmed_R1_fastqc.html"
                out_qc_pos_r2 = sample + ".trimmed_R2_fastqc.html"
                output_qc_processed_file_r1 = os.path.join(
                    out_qc_post_dir, out_qc_pos_r1)
                output_qc_processed_file_r2 = os.path.join(
                    out_qc_post_dir, out_qc_pos_r2)

                if os.path.isfile(
                        output_qc_processed_file_r1) and os.path.isfile(
                            output_qc_processed_file_r2):
                    logger.info(YELLOW + DIM + output_qc_processed_file_r1 +
                                " EXISTS\nOmitting QC for sample " + sample +
                                END_FORMATTING)
                else:
                    logger.info(GREEN +
                                "Checking quality in processed sample " +
                                sample + END_FORMATTING)
                    logger.info("R1: " + output_trimming_file_r1 + "\nR2: " +
                                output_trimming_file_r2)
                    fastqc_quality(output_trimming_file_r1,
                                   output_trimming_file_r2, out_qc_post_dir,
                                   args.threads)

                # MAPPING WITH BWA - SAM TO SORTED BAM - ADD HEADER RG
                #####################################################
                out_map_name = sample + ".rg.sorted.bam"
                output_map_file = os.path.join(out_map_dir, out_map_name)

                if os.path.isfile(output_map_file):
                    logger.info(YELLOW + DIM + output_map_file +
                                " EXISTS\nOmitting Mapping for sample " +
                                sample + END_FORMATTING)
                else:
                    logger.info(GREEN + "Mapping sample " + sample +
                                END_FORMATTING)
                    logger.info("R1: " + output_trimming_file_r1 + "\nR2: " +
                                output_trimming_file_r2 + "\nReference: " +
                                reference)
                    bwa_mapping(output_trimming_file_r1,
                                output_trimming_file_r2,
                                reference,
                                sample,
                                out_map_dir,
                                threads=args.threads)
                    sam_to_index_bam(sample,
                                     out_map_dir,
                                     output_trimming_file_r1,
                                     threads=args.threads)
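                    # bwa_mapping + sam_to_index_bam are assumed to boil down
                    # to the classic pair (hypothetical sketch):
                    #   bwa mem -t <threads> -R '@RG\tID:<sample>\tSM:<sample>' \
                    #     <reference> <R1> <R2> | samtools sort -o <sample>.rg.sorted.bam
                    #   samtools index <sample>.rg.sorted.bam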

                #MARK DUPLICATES WITH PICARDTOOLS ###################
                #####################################################
                out_markdup_name = sample + ".rg.markdup.sorted.bam"
                output_markdup_file = os.path.join(out_map_dir,
                                                   out_markdup_name)

                if os.path.isfile(output_markdup_file):
                    logger.info(YELLOW + DIM + output_markdup_file +
                                " EXISTS\nOmitting duplicate marking for sample " +
                                sample + END_FORMATTING)
                else:
                    logger.info(GREEN + "Marking Dupes in sample " + sample +
                                END_FORMATTING)
                    logger.info("Input Bam: " + output_map_file)
                    picard_markdup(output_map_file)
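                    # picard_markdup is assumed to call Picard MarkDuplicates,
                    # e.g. (assumption):
                    #   picard MarkDuplicates I=<in>.bam O=<in>.markdup.bam \
                    #     M=<in>.markdup.metrics.txt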

                #TRIM PRIMERS WITH ivar trim ########################
                #####################################################

                if os.path.isfile(output_markdup_trimmed_file):
                    logger.info(YELLOW + DIM + output_markdup_trimmed_file +
                                " EXISTS\nOmitting primer trimming for sample " +
                                sample + END_FORMATTING)
                else:
                    logger.info(GREEN + "Trimming primers in sample " +
                                sample + END_FORMATTING)
                    logger.info("Input Bam: " + output_markdup_file)
                    ivar_trim(output_markdup_file,
                              args.primers,
                              sample,
                              min_length=30,
                              min_quality=20,
                              sliding_window_width=4)
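                    # ivar_trim presumably wraps `ivar trim`, roughly
                    # (hypothetical reconstruction):
                    #   ivar trim -i <markdup>.bam -b <primers>.bed -p <prefix> \
                    #     -m 30 -q 20 -s 4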
            else:
                logger.info(
                    YELLOW + DIM + output_markdup_trimmed_file +
                    " EXISTS\nOmitting BAM mapping and BAM manipulation in sample "
                    + sample + END_FORMATTING)

            ########################END OF MAPPING AND BAM MANIPULATION#####################################################################
            ################################################################################################################################

            #VARIANT CALLING WITH ivar variants##################
            #####################################################
            check_create_dir(out_variant_dir)
            out_ivar_variant_name = sample + ".tsv"
            out_ivar_variant_file = os.path.join(out_variant_ivar_dir,
                                                 out_ivar_variant_name)

            if os.path.isfile(out_ivar_variant_file):
                logger.info(YELLOW + DIM + out_ivar_variant_file +
                            " EXISTS\nOmitting Variant call for sample " +
                            sample + END_FORMATTING)
            else:
                logger.info(GREEN + "Calling variants with ivar in sample " +
                            sample + END_FORMATTING)
                ivar_variants(reference,
                              output_markdup_trimmed_file,
                              out_variant_dir,
                              sample,
                              annotation,
                              min_quality=15,
                              min_frequency_threshold=0.01,
                              min_depth=1)
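                # ivar_variants is assumed to pipe an mpileup into ivar,
                # roughly (hypothetical reconstruction):
                #   samtools mpileup -aa -A -d 0 -B -Q 0 --reference <ref> <bam> | \
                #     ivar variants -p <sample> -q 15 -t 0.01 -m 1 -r <ref> -g <gff>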

            #VARIANT FILTERING ##################################
            #####################################################
            check_create_dir(out_filtered_ivar_dir)
            out_ivar_filtered_file = os.path.join(out_filtered_ivar_dir,
                                                  out_ivar_variant_name)

            if os.path.isfile(out_ivar_filtered_file):
                logger.info(YELLOW + DIM + out_ivar_filtered_file +
                            " EXISTS\nOmitting Variant filtering for sample " +
                            sample + END_FORMATTING)
            else:
                logger.info(GREEN + "Filtering variants in sample " + sample +
                            END_FORMATTING)
                filter_tsv_variants(out_ivar_variant_file,
                                    out_filtered_ivar_dir,
                                    min_frequency=0.7,
                                    min_total_depth=10,
                                    min_alt_dp=4,
                                    is_pass=True,
                                    only_snp=False)

            #CREATE CONSENSUS with ivar consensus##################
            #######################################################
            check_create_dir(out_consensus_dir)
            check_create_dir(out_consensus_ivar_dir)
            out_ivar_consensus_name = sample + ".fa"
            out_ivar_consensus_file = os.path.join(out_consensus_ivar_dir,
                                                   out_ivar_consensus_name)

            if os.path.isfile(out_ivar_consensus_file):
                logger.info(YELLOW + DIM + out_ivar_consensus_file +
                            " EXISTS\nOmitting Consensus for sample " +
                            sample + END_FORMATTING)
            else:
                logger.info(GREEN + "Creating consensus with ivar in sample " +
                            sample + END_FORMATTING)
                ivar_consensus(output_markdup_trimmed_file,
                               out_consensus_ivar_dir,
                               sample,
                               min_quality=20,
                               min_frequency_threshold=0.8,
                               min_depth=20,
                               uncovered_character='N')
                logger.info(GREEN + "Replacing consensus header in " + sample +
                            END_FORMATTING)
                replace_consensus_header(out_ivar_consensus_file)
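                # ivar_consensus is assumed to follow the standard recipe,
                # roughly (hypothetical reconstruction):
                #   samtools mpileup -aa -A -d 0 -B -Q 0 <bam> | \
                #     ivar consensus -p <sample> -q 20 -t 0.8 -m 20 -n N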

            ########################CREATE STATS AND QUALITY FILTERS########################################################################
            ################################################################################################################################
            #CREATE Bamstats#######################################
            #######################################################
            check_create_dir(out_stats_dir)
            check_create_dir(out_stats_bamstats_dir)
            out_bamstats_name = sample + ".bamstats"
            out_bamstats_file = os.path.join(out_stats_bamstats_dir,
                                             out_bamstats_name)

            if os.path.isfile(out_bamstats_file):
                logger.info(YELLOW + DIM + out_bamstats_file +
                            " EXISTS\nOmitting Bamstats for sample " + sample +
                            END_FORMATTING)
            else:
                logger.info(GREEN + "Creating bamstats in sample " + sample +
                            END_FORMATTING)
                create_bamstat(output_markdup_trimmed_file,
                               out_stats_bamstats_dir,
                               sample,
                               threads=args.threads)

            #CREATE Coverage#######################################
            #######################################################
            check_create_dir(out_stats_coverage_dir)
            out_coverage_name = sample + ".cov"
            out_coverage_file = os.path.join(out_stats_coverage_dir,
                                             out_coverage_name)

            if os.path.isfile(out_coverage_file):
                logger.info(YELLOW + DIM + out_coverage_file +
                            " EXISTS\nOmitting coverage for sample " + sample +
                            END_FORMATTING)
            else:
                logger.info(GREEN + "Creating coverage in sample " + sample +
                            END_FORMATTING)
                create_coverage(output_markdup_trimmed_file,
                                out_stats_coverage_dir, sample)

    # fastqc OUTPUT FORMAT FOR COMPARISON
    ######################################################
    logger.info(GREEN + "Creating summary report for quality result " +
                END_FORMATTING)
    # format_html_image(out_qc_dir)

    # coverage OUTPUT SUMMARY
    ######################################################
    logger.info(GREEN + "Creating summary report for coverage result " +
                END_FORMATTING)
    obtain_group_cov_stats(out_stats_coverage_dir, group_name)

    # READS and VARIANTS OUTPUT SUMMARY
    ######################################################
    logger.info(GREEN + "Creating overal summary report " + END_FORMATTING)
    obtain_overal_stats(output, group_name)

    # REMOVE UNCOVERED
    ##############################################################################################################################
    logger.info(GREEN + "Removing low quality samples" + END_FORMATTING)
    # remove_low_quality(output, min_percentage_20x=args.coverage20,
    #                   min_hq_snp=args.min_snp, type_remove='Uncovered')

    #ANNOTATION WITH SNPEFF, USER INPUT AND PANGOLIN ####
    #####################################################
    logger.info("\n\n" + BLUE + BOLD + "STARTING ANNOTATION IN GROUP: " +
                group_name + END_FORMATTING + "\n")
    check_create_dir(out_annot_dir)
    check_create_dir(out_annot_snpeff_dir)
    check_create_dir(out_annot_pangolin_dir)
    # SNPEFF
    if args.snpeff_database:
        # CHANGE FOR RAW/FILTERED ANNOTATION
        for root, _, files in os.walk(out_filtered_ivar_dir):
            if root == out_filtered_ivar_dir:  # CHANGE FOR RAW/FILTERED ANNOTATION
                for name in files:
                    if name.endswith('.tsv'):
                        sample = name.split('.')[0]
                        filename = os.path.join(root, name)
                        out_annot_file = os.path.join(out_annot_snpeff_dir,
                                                      sample + ".annot")
                        if os.path.isfile(out_annot_file):
                            logger.info(
                                YELLOW + DIM + out_annot_file +
                                " EXISTS\nOmitting snpEff Annotation for sample "
                                + sample + END_FORMATTING)
                        else:
                            logger.info(GREEN +
                                        "Annotating sample with snpEff: " +
                                        sample + END_FORMATTING)
                            output_vcf = os.path.join(out_annot_snpeff_dir,
                                                      sample + '.vcf')
                            annotate_snpeff(filename,
                                            output_vcf,
                                            out_annot_file,
                                            database=args.snpeff_database)
    # USER DEFINED
    if not args.annot_bed and not args.annot_vcf:
        logger.info(YELLOW + BOLD +
                    "Ommiting User Annotation, no BED or VCF files supplied" +
                    END_FORMATTING)
    else:
        check_create_dir(out_annot_user_dir)
        # CHANGE FOR RAW/FILTERED ANNOTATION
        for root, _, files in os.walk(out_variant_ivar_dir):
            if root == out_variant_ivar_dir:  # CHANGE FOR RAW/FILTERED ANNOTATION
                for name in files:
                    if name.endswith('.tsv'):
                        sample = name.split('.')[0]
                        logger.info(
                            'User bed/vcf annotation in sample {}'.format(
                                sample))
                        filename = os.path.join(root, name)
                        out_annot_file = os.path.join(out_annot_user_dir,
                                                      sample + ".tsv")
                        user_annotation(filename,
                                        out_annot_file,
                                        vcf_files=args.annot_vcf,
                                        bed_files=args.annot_bed)

    # USER AA DEFINED
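    # Layer user-supplied amino-acid annotations (args.annot_aa) on top of
    # each sample's snpEff .annot output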
    if not args.annot_aa:
        logger.info(YELLOW + BOLD +
                    "Omitting User AA Annotation, no AA files supplied" +
                    END_FORMATTING)
    else:
        check_create_dir(out_annot_user_aa_dir)
        for root, _, files in os.walk(out_annot_snpeff_dir):
            if root == out_annot_snpeff_dir:
                for name in files:
                    if name.endswith('.annot'):
                        sample = name.split('.')[0]
                        logger.info(
                            'User aa annotation in sample {}'.format(sample))
                        filename = os.path.join(root, name)
                        out_annot_aa_file = os.path.join(
                            out_annot_user_aa_dir, sample + ".tsv")
                        if os.path.isfile(out_annot_aa_file):
                            # Re-annotate the existing output in place,
                            # presumably so annotations added in previous
                            # runs are kept
                            user_annotation_aa(out_annot_aa_file,
                                               out_annot_aa_file,
                                               aa_files=args.annot_aa)
                        else:
                            user_annotation_aa(filename,
                                               out_annot_aa_file,
                                               aa_files=args.annot_aa)

    # PANGOLIN
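    # Assign a lineage to every ivar consensus (.fa) by submitting pangolin
    # jobs to a thread pool; samples with an existing lineage.csv are skipped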
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=args.threads) as executor:
        futures_pangolin = []

        for root, _, files in os.walk(out_consensus_ivar_dir):
            if root == out_consensus_ivar_dir:
                for name in files:
                    if name.endswith('.fa'):
                        sample = name.split('.')[0]
                        filename = os.path.join(root, name)
                        out_pangolin_filename = sample + ".lineage.csv"
                        out_pangolin_file = os.path.join(
                            out_annot_pangolin_dir, out_pangolin_filename)
                        if os.path.isfile(out_pangolin_file):
                            logger.info(
                                YELLOW + DIM + out_pangolin_file +
                                " EXISTS\nOmitting lineage for sample " +
                                sample + END_FORMATTING)
                        else:
                            logger.info(GREEN +
                                        "Obtaining Lineage in sample " +
                                        sample + END_FORMATTING)
                            future = executor.submit(annotate_pangolin,
                                                     filename,
                                                     out_annot_pangolin_dir,
                                                     out_pangolin_filename,
                                                     threads=args.threads,
                                                     max_ambig=0.6)
                            futures_pangolin.append(future)
        # Collect lineage results once every job has been submitted
        for future in concurrent.futures.as_completed(futures_pangolin):
            logger.info(future.result())
            # Sequential alternative, kept for reference:
            # annotate_pangolin(filename, out_annot_pangolin_dir,
            #                   out_pangolin_filename,
            #                   threads=args.threads, max_ambig=0.6)

    # USER AA TO HTML
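    # Render each per-sample AA annotation .tsv as HTML, then build an index
    # page (00_all_samples.html) by substituting the sample list into the
    # ALLSAMPLES placeholder of the report_samples_html template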
    annotated_samples = []
    logger.info('Adapting annotation to HTML in {}'.format(group_name))
    for root, _, files in os.walk(out_annot_user_aa_dir):
        if root == out_annot_user_aa_dir:
            for name in files:
                if name.endswith('.tsv'):
                    sample = name.split('.')[0]
                    annotated_samples.append(sample)
                    filename = os.path.join(root, name)
                    annotation_to_html(filename, sample)
    annotated_samples = [str(x) for x in annotated_samples]
    report_samples_html_all = report_samples_html.replace(
        'ALLSAMPLES', '","'.join(annotated_samples))  # NEW
    with open(os.path.join(out_annot_user_aa_dir, '00_all_samples.html'),
              'w+') as f:
        f.write(report_samples_html_all)

    # SNP COMPARISON using tsv variant files
    ######################################################
    logger.info("\n\n" + BLUE + BOLD + "STARTING COMPARISON IN GROUP: " +
                group_name + END_FORMATTING + "\n")

    check_create_dir(out_compare_dir)
    folder_compare = today + "_" + group_name
    path_compare = os.path.join(out_compare_dir, folder_compare)
    check_create_dir(path_compare)
    full_path_compare = os.path.join(path_compare, group_name)

    # ddtb_add(out_filtered_ivar_dir, full_path_compare)
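    # Comparison workflow: build an intermediate SNP matrix from the raw ivar
    # variants (the parameter names suggest alleles under 10% frequency or
    # with fewer than 4 alt reads are discarded), derive an INDEL-handling
    # copy via remove_position_range, then let revised_df drop faulty samples
    # and positions (thresholds 0.07 and 0.4) before computing pairwise
    # distances with ddtb_compare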
    compare_snp_matrix_recal = full_path_compare + ".revised.final.tsv"
    compare_snp_matrix_INDEL = full_path_compare + ".revised_INDEL.final.tsv"
    compare_snp_matrix_recal_intermediate = full_path_compare + ".revised_intermediate.tsv"
    compare_snp_matrix_INDEL_intermediate = full_path_compare + \
        ".revised_INDEL_intermediate.tsv"
    recalibrated_snp_matrix_intermediate = ddbb_create_intermediate(
        out_variant_ivar_dir,
        out_stats_coverage_dir,
        min_freq_discard=0.1,
        min_alt_dp=4,
        only_snp=args.only_snp)
    recalibrated_snp_matrix_intermediate.to_csv(
        compare_snp_matrix_recal_intermediate, sep="\t", index=False)
    compare_snp_matrix_INDEL_intermediate_df = remove_position_range(
        recalibrated_snp_matrix_intermediate)
    compare_snp_matrix_INDEL_intermediate_df.to_csv(
        compare_snp_matrix_INDEL_intermediate, sep="\t", index=False)
    recalibrated_revised_df = revised_df(recalibrated_snp_matrix_intermediate,
                                         path_compare,
                                         min_freq_include=0.7,
                                         min_threshold_discard_sample=0.07,
                                         min_threshold_discard_position=0.4,
                                         remove_faulty=True,
                                         drop_samples=True,
                                         drop_positions=True)
    recalibrated_revised_df.to_csv(compare_snp_matrix_recal,
                                   sep="\t",
                                   index=False)
    recalibrated_revised_INDEL_df = revised_df(
        compare_snp_matrix_INDEL_intermediate_df,
        path_compare,
        min_freq_include=0.7,
        min_threshold_discard_sample=0.07,
        min_threshold_discard_position=0.4,
        remove_faulty=True,
        drop_samples=True,
        drop_positions=True)
    recalibrated_revised_INDEL_df.to_csv(compare_snp_matrix_INDEL,
                                         sep="\t",
                                         index=False)

    ddtb_compare(compare_snp_matrix_recal, distance=0)
    ddtb_compare(compare_snp_matrix_INDEL, distance=0, indel=True)

    logger.info("\n\n" + MAGENTA + BOLD + "COMPARING FINISHED IN GROUP: " +
                group_name + END_FORMATTING + "\n")

    #####################CONSENSUS WITH REFINED CALL######
    ######################################################
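    # Rebuild per-sample consensus sequences from the reference using the
    # recalibrated SNP matrix and the coverage stats (behaviour inferred from
    # the arguments passed to create_consensus)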
    logger.info(GREEN + "Creating refined consensus" + END_FORMATTING)
    create_consensus(reference, compare_snp_matrix_recal,
                     out_stats_coverage_dir, out_consensus_dir)

    logger.info("\n\n" + MAGENTA + BOLD +
                "#####END OF PIPELINE COVID MULTI ANALYSIS#####" +
                END_FORMATTING + "\n")