Пример #1
0
def main():
    """Entry point for CRISPRessoMeta.

    Parses the command line, reads the JSON metadata file given by
    ``--metadata`` (its ``Experiment`` list supplies ``name``, ``sequence``,
    ``amplicon``, ``fastq_r1`` and ``fastq_r2`` for each guide), validates the
    resulting per-sample table, builds and runs one CRISPResso command per
    row, then concatenates the per-sample editing-frequency and
    mapping-statistics files into the meta output folder and optionally
    writes an HTML report.

    The function does not return: it calls ``sys.exit(0)`` on success and
    ``sys.exit(-1)`` after logging the error if any ``Exception`` is raised
    (a traceback is printed as well when ``--debug`` was given).
    """
    try:
        description = [
            '~~~CRISPRessoMeta~~~',
            '-Analysis of CRISPR/Cas9 outcomes from deep sequencing data using a metadata file-'
        ]
        meta_string = r'''
 ________________________________________
|   _________   ______ _______  ______   |
|  | | | | | \ | |       | |   | |  | |  |
|  | | | | | | | |----   | |   | |__| |  |
|  |_| |_| |_| |_|____   |_|   |_|  |_|  |
|________________________________________|
        '''
        print(CRISPRessoShared.get_crispresso_header(description, meta_string))

        parser = CRISPRessoShared.getCRISPRessoArgParser(
            parserTitle='CRISPRessoMeta Parameters')

        #batch specific params
        parser.add_argument(
            '--metadata',
            type=str,
            help='Metadata file according to NIST specification',
            required=True)
        parser.add_argument(
            '-mo',
            '--meta_output_folder',
            help='Directory where analysis output will be stored')
        parser.add_argument('--crispresso_command',
                            help='CRISPResso command to call',
                            default='CRISPResso')

        args = parser.parse_args()

        debug_flag = args.debug

        # Every CRISPResso option except the ones this tool sets itself is
        # propagated to the per-sample commands.
        crispresso_options = CRISPRessoShared.get_crispresso_options()
        options_to_ignore = {'name', 'output_folder'}
        crispresso_options_for_meta = list(crispresso_options -
                                           options_to_ignore)

        CRISPRessoShared.check_file(args.metadata)

        with open(args.metadata) as metadata_file:
            metadata = json.load(metadata_file)

        # Collect one record per guide and build the table in a single step:
        # DataFrame.append never modified the frame in place (and no longer
        # exists in pandas >= 2.0), so appending row by row left the table
        # empty.
        meta_rows = []
        for guide in metadata['Experiment']:
            print('Guide: ' + guide['name'])
            print('Sequence: ' + guide['sequence'])
            print('Amplicon: ' + guide['amplicon'])
            print('Fastq_R1: ' + guide['fastq_r1'])
            print('Fastq_R2: ' + guide['fastq_r2'])
            meta_rows.append({
                'name': guide['name'],
                'guide_seq': guide['sequence'],
                'amplicon_seq': guide['amplicon'],
                'fastq_r1': guide['fastq_r1'],
                'fastq_r2': guide['fastq_r2']
            })

        if meta_rows:
            meta_params = pd.DataFrame(meta_rows)
        else:
            # No guides: keep the bare table so the 'fastq_r1' check below
            # reports the problem.
            meta_params = pd.DataFrame(
                columns=['name', 'guide_seq', 'amplicon_seq'])

        print('table:')
        print(meta_params)
        #rename column "a" to "amplicon_seq", etc
        meta_params.rename(
            index=str,
            columns=CRISPRessoShared.get_crispresso_options_lookup(),
            inplace=True)
        meta_count = meta_params.shape[0]
        meta_params.index = range(meta_count)

        if 'fastq_r1' not in meta_params:
            raise CRISPRessoShared.BadParameterException(
                "fastq_r1 must be specified in the meta settings file. Current headings are: "
                + str(meta_params.columns.values))

        #add args from the command line to meta_params
        for arg in vars(args):
            arg_value = getattr(args, arg)
            if arg not in meta_params:
                meta_params[arg] = arg_value
            elif arg_value is not None:
                # Assign the result back: an in-place fillna on a column
                # selection is not guaranteed to modify the parent frame.
                meta_params[arg] = meta_params[arg].fillna(value=arg_value)

        #assert that all names are unique
        #and clean names

        for i in range(meta_count):
            if meta_params.loc[i, 'name'] == '':
                meta_params.at[i, 'name'] = i
            meta_params.at[i, 'name'] = CRISPRessoShared.clean_filename(
                meta_params.loc[i, 'name'])

        if meta_params.drop_duplicates(
                'name').shape[0] != meta_params.shape[0]:
            raise CRISPRessoShared.BadParameterException(
                'Sample input names must be unique. The given names are not unique: '
                + str(meta_params.loc[:, 'name']))

        #Check files
        # list('') == [], so each row gets its own fresh empty list.
        meta_params[
            "sgRNA_intervals"] = ''  #create empty array for sgRNA intervals
        meta_params["sgRNA_intervals"] = meta_params["sgRNA_intervals"].apply(
            list)
        meta_params[
            "cut_point_include_idx"] = ''  #create empty array for cut point intervals for each batch based on sgRNA
        meta_params["cut_point_include_idx"] = meta_params[
            "cut_point_include_idx"].apply(list)
        for idx, row in meta_params.iterrows():
            if row.fastq_r1 is None:
                raise CRISPRessoShared.BadParameterException(
                    "At least one fastq file must be given as a command line parameter or be specified in the meta settings file with the heading 'fastq_r1' (fastq_r1 on row %s '%s' is invalid)"
                    % (int(idx) + 1, row.fastq_r1))
            CRISPRessoShared.check_file(row.fastq_r1)

            if row.fastq_r2 != "":
                CRISPRessoShared.check_file(row.fastq_r2)

            # With --auto the amplicon/guide validation below is skipped.
            if args.auto:
                continue

            curr_amplicon_seq_str = row.amplicon_seq
            if curr_amplicon_seq_str is None:
                raise CRISPRessoShared.BadParameterException(
                    "Amplicon sequence must be given as a command line parameter or be specified in the meta settings file with the heading 'amplicon_seq' (Amplicon seq on row %s '%s' is invalid)"
                    % (int(idx) + 1, curr_amplicon_seq_str))

            guides_are_in_amplicon = {
            }  #dict of whether a guide is in at least one amplicon sequence
            #iterate through amplicons
            for curr_amplicon_seq in curr_amplicon_seq_str.split(','):
                this_include_idxs = [
                ]  #mask for bp to include for this amplicon seq, as specified by sgRNA cut points
                this_sgRNA_intervals = []
                wrong_nt = CRISPRessoShared.find_wrong_nt(curr_amplicon_seq)
                if wrong_nt:
                    raise CRISPRessoShared.NTException(
                        'The amplicon sequence in row %d (%s) contains incorrect characters:%s'
                        % (idx + 1, curr_amplicon_seq_str, ' '.join(wrong_nt)))

                #iterate through guides
                curr_guide_seq_string = row.guide_seq
                if curr_guide_seq_string is not None and curr_guide_seq_string != "":
                    guides = curr_guide_seq_string.strip().upper().split(',')
                    for curr_guide_seq in guides:
                        wrong_nt = CRISPRessoShared.find_wrong_nt(
                            curr_guide_seq)
                        if wrong_nt:
                            raise CRISPRessoShared.NTException(
                                'The sgRNA sequence in row %d (%s) contains incorrect characters:%s'
                                %
                                (idx + 1, curr_guide_seq, ' '.join(wrong_nt)))
                    guide_names = [''] * len(guides)
                    # One independent list per guide ([[]] * n would alias a
                    # single shared list).
                    guide_mismatches = [[] for _ in guides]
                    guide_qw_centers = CRISPRessoShared.set_guide_array(
                        row.quantification_window_center, guides,
                        'guide quantification center')
                    guide_qw_sizes = CRISPRessoShared.set_guide_array(
                        row.quantification_window_size, guides,
                        'guide quantification size')
                    guide_plot_cut_points = [1] * len(guides)
                    discard_guide_positions_overhanging_amplicon_edge = False
                    if 'discard_guide_positions_overhanging_amplicon_edge' in row:
                        discard_guide_positions_overhanging_amplicon_edge = row.discard_guide_positions_overhanging_amplicon_edge

                    (this_sgRNA_sequences, this_sgRNA_intervals,
                     this_sgRNA_cut_points, this_sgRNA_plot_cut_points,
                     this_sgRNA_plot_idxs, this_sgRNA_names, this_include_idxs,
                     this_exclude_idxs
                     ) = CRISPRessoShared.get_amplicon_info_for_guides(
                         curr_amplicon_seq, guides, guide_mismatches,
                         guide_names, guide_qw_centers, guide_qw_sizes,
                         row.quantification_window_coordinates,
                         row.exclude_bp_from_left, row.exclude_bp_from_right,
                         row.plot_window_size, guide_plot_cut_points,
                         discard_guide_positions_overhanging_amplicon_edge)
                    for guide_seq in this_sgRNA_sequences:
                        guides_are_in_amplicon[guide_seq] = 1

                # .at replaces the removed .ix indexer; it returns the list
                # stored in the cell, which is then extended in place.
                meta_params.at[idx, "cut_point_include_idx"].append(
                    this_include_idxs)
                meta_params.at[idx,
                               "sgRNA_intervals"].append(this_sgRNA_intervals)

            # NOTE(review): entries are only ever set to 1 above, so this
            # warning cannot currently fire; guides absent from every
            # amplicon are never recorded in the dict.
            for guide_seq in guides_are_in_amplicon:
                if guides_are_in_amplicon[guide_seq] != 1:
                    warn(
                        '\nThe guide sequence provided on row %d (%s) is not present in any amplicon sequence:%s! \nNOTE: The guide will be ignored for the analysis. Please check your input!'
                        % (idx + 1, row.guide_seq, curr_amplicon_seq))

        # The output folder is named after --name when given, otherwise after
        # the metadata file's base name.
        meta_folder_name = os.path.splitext(os.path.basename(args.metadata))[0]
        if args.name and args.name != "":
            meta_folder_name = args.name

        output_folder_name = 'CRISPRessoMeta_on_%s' % meta_folder_name
        OUTPUT_DIRECTORY = os.path.abspath(output_folder_name)

        if args.meta_output_folder:
            OUTPUT_DIRECTORY = os.path.join(
                os.path.abspath(args.meta_output_folder), output_folder_name)

        _jp = lambda filename: os.path.join(
            OUTPUT_DIRECTORY, filename
        )  #handy function to put a file in the output directory

        try:
            info('Creating Folder %s' % OUTPUT_DIRECTORY)
            os.makedirs(OUTPUT_DIRECTORY)
        except OSError:
            # Only filesystem errors are tolerated here (typically the folder
            # already exists); anything else reaches the outer handler.
            warn('Folder %s already exists.' % OUTPUT_DIRECTORY)

        log_filename = _jp('CRISPRessoMeta_RUNNING_LOG.txt')
        logger.addHandler(logging.FileHandler(log_filename))

        with open(log_filename, 'w+') as outfile:
            outfile.write('[Command used]:\n%s\n\n[Execution log]:\n' %
                          ' '.join(sys.argv))

        crispresso2Meta_info_file = os.path.join(OUTPUT_DIRECTORY,
                                                 'CRISPResso2Meta_info.json')
        crispresso2_info = {
            'running_info': {},
            'results': {
                'alignment_stats': {},
                'general_plots': {}
            }
        }  #keep track of all information for this run, saved at the end of the run
        crispresso2_info['running_info'][
            'version'] = CRISPRessoShared.__version__
        crispresso2_info['running_info']['args'] = deepcopy(args)

        crispresso2_info['running_info']['log_filename'] = os.path.basename(
            log_filename)

        # Build one CRISPResso command per row of the table.
        crispresso_cmds = []
        meta_names_arr = []
        meta_input_names = {}
        for idx, row in meta_params.iterrows():

            metaName = CRISPRessoShared.slugify(row["name"])
            meta_names_arr.append(metaName)
            meta_input_names[metaName] = row["name"]

            crispresso_cmd = args.crispresso_command + ' -o %s --name %s' % (
                OUTPUT_DIRECTORY, metaName)
            crispresso_cmd = propagate_options(crispresso_cmd,
                                               crispresso_options_for_meta,
                                               meta_params, idx)
            crispresso_cmds.append(crispresso_cmd)

        crispresso2_info['meta_names_arr'] = meta_names_arr
        crispresso2_info['meta_input_names'] = meta_input_names

        CRISPRessoMultiProcessing.run_crispresso_cmds(crispresso_cmds,
                                                      args.n_processes, 'meta',
                                                      args.skip_failed)

        # One entry per row, in row order; None marks a run with no info file.
        run_datas = []  #crispresso2 info from each row

        all_amplicons = set()
        amplicon_names = {}  # reference sequence -> display name
        amplicon_counts = {}
        completed_meta_arr = []
        for idx, row in meta_params.iterrows():
            metaName = CRISPRessoShared.slugify(row["name"])
            folder_name = os.path.join(OUTPUT_DIRECTORY,
                                       'CRISPResso_on_%s' % metaName)
            run_data_file = os.path.join(folder_name, 'CRISPResso2_info.json')
            if os.path.isfile(run_data_file) is False:
                info("Skipping folder '%s'. Cannot find run data at '%s'." %
                     (folder_name, run_data_file))
                run_datas.append(None)
                continue

            run_data = CRISPRessoShared.load_crispresso_info(folder_name)
            run_datas.append(run_data)
            for ref_name in run_data['results']['ref_names']:
                ref_seq = run_data['results']['refs'][ref_name]['sequence']
                all_amplicons.add(ref_seq)
                #if this amplicon is called something else in another sample, just call it the amplicon
                # amplicon_names is keyed by sequence, so membership must be
                # tested with ref_seq (testing ref_name could raise KeyError
                # on the lookup or miss a genuine name clash).
                if ref_seq in amplicon_names and amplicon_names[
                        ref_seq] != ref_name:
                    amplicon_names[ref_seq] = ref_seq
                else:
                    amplicon_names[ref_seq] = ref_name
                if ref_seq not in amplicon_counts:
                    amplicon_counts[ref_seq] = 0
                amplicon_counts[ref_seq] += 1

            completed_meta_arr.append(metaName)

        crispresso2_info['completed_meta_arr'] = completed_meta_arr

        #make sure amplicon names aren't super long
        for amplicon in all_amplicons:
            if len(amplicon_names[amplicon]) > 20:
                amplicon_names[amplicon] = amplicon_names[amplicon][0:20]

        #make sure no duplicate names (same name for the different amplicons)
        seen_names = {}
        for amplicon in all_amplicons:
            suffix_counter = 2
            while amplicon_names[amplicon] in seen_names:
                amplicon_names[amplicon] = amplicon_names[
                    amplicon] + "_" + str(suffix_counter)
                suffix_counter += 1
            seen_names[amplicon_names[amplicon]] = 1

        #summarize amplicon modifications
        with open(
                _jp('CRISPRessoBatch_quantification_of_editing_frequency.txt'),
                'w') as outfile:
            wrote_header = False
            for idx, row in meta_params.iterrows():
                metaName = CRISPRessoShared.slugify(row["name"])
                folder_name = os.path.join(OUTPUT_DIRECTORY,
                                           'CRISPResso_on_%s' % metaName)
                run_data = run_datas[idx]
                if run_data is None:
                    continue

                amplicon_modification_file = os.path.join(
                    folder_name,
                    run_data['running_info']['quant_of_editing_freq_filename'])
                with open(amplicon_modification_file, 'r') as infile:
                    # The header is written once; every data line is prefixed
                    # with the sample name.
                    file_head = infile.readline()
                    if not wrote_header:
                        outfile.write('Batch\t' + file_head)
                        wrote_header = True
                    for line in infile:
                        outfile.write(metaName + "\t" + line)

        #summarize alignment
        with open(_jp('CRISPRessoBatch_mapping_statistics.txt'),
                  'w') as outfile:
            wrote_header = False
            for idx, row in meta_params.iterrows():
                metaName = CRISPRessoShared.slugify(row["name"])
                folder_name = os.path.join(OUTPUT_DIRECTORY,
                                           'CRISPResso_on_%s' % metaName)

                run_data = run_datas[idx]
                if run_data is None:
                    continue
                amplicon_modification_file = os.path.join(
                    folder_name,
                    run_data['running_info']['mapping_stats_filename'])
                with open(amplicon_modification_file, 'r') as infile:
                    file_head = infile.readline()
                    if not wrote_header:
                        outfile.write('Batch\t' + file_head)
                        wrote_header = True
                    for line in infile:
                        outfile.write(metaName + "\t" + line)

        if not args.suppress_report:
            if (args.place_report_in_output_folder):
                report_name = _jp("CRISPResso2Meta_report.html")
            else:
                report_name = OUTPUT_DIRECTORY + '.html'
            CRISPRessoReport.make_meta_report_from_folder(
                report_name, crispresso2_info, OUTPUT_DIRECTORY, _ROOT)
            crispresso2_info['running_info']['report_location'] = report_name
            crispresso2_info['running_info'][
                'report_filename'] = os.path.basename(report_name)

        CRISPRessoShared.write_crispresso_info(
            crispresso2Meta_info_file,
            crispresso2_info,
        )
        info('Analysis Complete!')
        print(CRISPRessoShared.get_crispresso_footer())
        sys.exit(0)

    except Exception as e:
        # args may not exist yet if the failure happened before parsing.
        debug_flag = False
        if 'args' in vars() and 'debug' in args:
            debug_flag = args.debug

        if debug_flag:
            traceback.print_exc(file=sys.stdout)

        error('\n\nERROR: %s' % e)
        sys.exit(-1)
Пример #2
0
def main():
    def print_stacktrace_if_debug():
        debug_flag = False
        if 'args' in vars() and 'debug' in args:
            debug_flag = args.debug

        if debug_flag:
            traceback.print_exc(file=sys.stdout)
            error(traceback.format_exc())

    try:
        start_time = datetime.now()
        start_time_string = start_time.strftime('%Y-%m-%d %H:%M:%S')

        description = [
            '~~~CRISPRessoWGS~~~',
            '-Analysis of CRISPR/Cas9 outcomes from WGS data-'
        ]
        wgs_string = r'''
 ____________
|     __  __ |
||  |/ _ (_  |
||/\|\__)__) |
|____________|
        '''
        print(CRISPRessoShared.get_crispresso_header(description, wgs_string))

        parser = CRISPRessoShared.getCRISPRessoArgParser(
            parserTitle='CRISPRessoWGS Parameters', requiredParams={})

        #tool specific optional
        parser.add_argument('-b',
                            '--bam_file',
                            type=str,
                            help='WGS aligned bam file',
                            required=True,
                            default='bam filename')
        parser.add_argument(
            '-f',
            '--region_file',
            type=str,
            help=
            'Regions description file. A BED format  file containing the regions to analyze, one per line. The REQUIRED\
        columns are: chr_id(chromosome name), bpstart(start position), bpend(end position), the optional columns are:name (an unique indentifier for the region), guide_seq, expected_hdr_amplicon_seq,coding_seq, see CRISPResso help for more details on these last 3 parameters)',
            required=True)
        parser.add_argument(
            '-r',
            '--reference_file',
            type=str,
            help=
            'A FASTA format reference file (for example hg19.fa for the human genome)',
            default='',
            required=True)
        parser.add_argument(
            '--min_reads_to_use_region',
            type=float,
            help=
            'Minimum number of reads that align to a region to perform the CRISPResso analysis',
            default=10)
        parser.add_argument(
            '--skip_failed',
            help='Continue with pooled analysis even if one sample fails',
            action='store_true')
        parser.add_argument(
            '--gene_annotations',
            type=str,
            help=
            'Gene Annotation Table from UCSC Genome Browser Tables (http://genome.ucsc.edu/cgi-bin/hgTables?command=start), \
        please select as table "knownGene", as output format "all fields from selected table" and as file returned "gzip compressed"',
            default='')
        parser.add_argument('--crispresso_command',
                            help='CRISPResso command to call',
                            default='CRISPResso')

        args = parser.parse_args()

        crispresso_options = CRISPRessoShared.get_crispresso_options()
        options_to_ignore = {
            'fastq_r1', 'fastq_r2', 'amplicon_seq', 'amplicon_name',
            'output_folder', 'name'
        }
        crispresso_options_for_wgs = list(crispresso_options -
                                          options_to_ignore)

        info('Checking dependencies...')

        if check_samtools() and check_bowtie2():
            info('\n All the required dependencies are present!')
        else:
            sys.exit(1)

        #check files
        check_file(args.bam_file)

        check_file(args.reference_file)

        check_file(args.region_file)

        if args.gene_annotations:
            check_file(args.gene_annotations)

        # for computation performed in CRISPRessoWGS (e.g. bowtie alignment, etc) use n_processes_for_wgs
        n_processes_for_wgs = 1
        if args.n_processes == "max":
            n_processes_for_wgs = CRISPRessoMultiProcessing.get_max_processes()
        else:
            n_processes_for_wgs = int(args.n_processes)

        # here, we set args.n_processes as 1 because this value is propagated to sub-CRISPResso runs (not for usage in CRISPRessoWGS)
        args.n_processes = 1

        #INIT
        get_name_from_bam = lambda x: os.path.basename(x).replace('.bam', '')

        if not args.name:
            database_id = '%s' % get_name_from_bam(args.bam_file)
        else:
            clean_name = CRISPRessoShared.slugify(args.name)
            if args.name != clean_name:
                warn(
                    'The specified name {0} contained invalid characters and was changed to: {1}'
                    .format(
                        args.name,
                        clean_name,
                    ), )
            database_id = clean_name

        OUTPUT_DIRECTORY = 'CRISPRessoWGS_on_%s' % database_id

        if args.output_folder:
            OUTPUT_DIRECTORY = os.path.join(
                os.path.abspath(args.output_folder), OUTPUT_DIRECTORY)

        _jp = lambda filename: os.path.join(
            OUTPUT_DIRECTORY, filename
        )  #handy function to put a file in the output directory

        try:
            info('Creating Folder %s' % OUTPUT_DIRECTORY)
            os.makedirs(OUTPUT_DIRECTORY)
            info('Done!')
        except:
            warn('Folder %s already exists.' % OUTPUT_DIRECTORY)

        log_filename = _jp('CRISPRessoWGS_RUNNING_LOG.txt')
        logger.addHandler(logging.FileHandler(log_filename))

        crispresso2_info_file = os.path.join(OUTPUT_DIRECTORY,
                                             'CRISPResso2WGS_info.json')
        crispresso2_info = {
            'running_info': {},
            'results': {
                'alignment_stats': {},
                'general_plots': {}
            }
        }  #keep track of all information for this run to be pickled and saved at the end of the run
        crispresso2_info['running_info'][
            'version'] = CRISPRessoShared.__version__
        crispresso2_info['running_info']['args'] = deepcopy(args)

        crispresso2_info['running_info']['log_filename'] = os.path.basename(
            log_filename)
        crispresso2_info['running_info']['finished_steps'] = {}

        crispresso_cmd_to_write = ' '.join(sys.argv)
        if args.write_cleaned_report:
            cmd_copy = sys.argv[:]
            cmd_copy[0] = 'CRISPRessoWGS'
            for i in range(len(cmd_copy)):
                if os.sep in cmd_copy[i]:
                    cmd_copy[i] = os.path.basename(cmd_copy[i])

            crispresso_cmd_to_write = ' '.join(
                cmd_copy
            )  #clean command doesn't show the absolute path to the executable or other files
        crispresso2_info['running_info'][
            'command_used'] = crispresso_cmd_to_write

        with open(log_filename, 'w+') as outfile:
            outfile.write(
                'CRISPResso version %s\n[Command used]:\n%s\n\n[Execution log]:\n'
                % (CRISPRessoShared.__version__, crispresso_cmd_to_write))

        #keep track of args to see if it is possible to skip computation steps on rerun
        can_finish_incomplete_run = False
        if args.no_rerun:
            if os.path.exists(crispresso2_info_file):
                previous_run_data = CRISPRessoShared.load_crispresso_info(
                    OUTPUT_DIRECTORY)
                if previous_run_data['running_info'][
                        'version'] == CRISPRessoShared.__version__:
                    args_are_same = True
                    for arg in vars(args):
                        if arg == "no_rerun" or arg == "debug" or arg == "n_processes":
                            continue
                        if arg not in vars(
                                previous_run_data['running_info']['args']):
                            info(
                                'Comparing current run to previous run: old run had argument '
                                + str(arg) + ' \nRerunning.')
                            args_are_same = False
                        elif str(
                                getattr(
                                    previous_run_data['running_info']['args'],
                                    arg)) != str(getattr(args, arg)):
                            info(
                                'Comparing current run to previous run:\n\told argument '
                                + str(arg) + ' = ' + str(
                                    getattr(
                                        previous_run_data['running_info']
                                        ['args'], arg)) +
                                '\n\tnew argument: ' + str(arg) + ' = ' +
                                str(getattr(args, arg)) + '\nRerunning.')
                            args_are_same = False

                    if args_are_same:
                        if 'end_time_string' in previous_run_data:
                            info('Analysis already completed on %s!' %
                                 previous_run_data['running_info']
                                 ['end_time_string'])
                            sys.exit(0)
                        else:
                            can_finish_incomplete_run = True
                            if 'finished_steps' in previous_run_data[
                                    'running_info']:
                                for key in previous_run_data['running_info'][
                                        'finished_steps'].keys():
                                    crispresso2_info['running_info'][
                                        'finished_steps'][
                                            key] = previous_run_data[
                                                'running_info'][
                                                    'finished_steps'][key]
                                    if args.debug:
                                        info('finished: ' + key)
                else:
                    info(
                        'The no_rerun flag is set, but this analysis will be rerun because the existing run was performed using an old version of CRISPResso ('
                        + str(previous_run_data['running_info']['version']) +
                        ').')

        #write this file early on so we can check the params if we have to rerun
        CRISPRessoShared.write_crispresso_info(
            crispresso2_info_file,
            crispresso2_info,
        )

        def rreplace(s, old, new):
            li = s.rsplit(old)
            return new.join(li)

        #check if bam has the index already
        if os.path.exists(rreplace(args.bam_file, ".bam", ".bai")):
            info('Index file for input .bam file exists, skipping generation.')
        elif os.path.exists(args.bam_file + '.bai'):
            info('Index file for input .bam file exists, skipping generation.')
        else:
            info('Creating index file for input .bam file...')
            sb.call('samtools index %s ' % (args.bam_file), shell=True)

        #load gene annotation
        if args.gene_annotations:
            info('Loading gene coordinates from annotation file: %s...' %
                 args.gene_annotations)
            try:
                df_genes = pd.read_csv(args.gene_annotations,
                                       compression='gzip',
                                       sep="\t")
                df_genes.txEnd = df_genes.txEnd.astype(int)
                df_genes.txStart = df_genes.txStart.astype(int)
                df_genes.head()
            except:
                raise Exception('Failed to load the gene annotations file.')

        #Load and validate the REGION FILE
        df_regions = pd.read_csv(args.region_file,
                                 names=[
                                     'chr_id', 'bpstart', 'bpend', 'Name',
                                     'sgRNA', 'Expected_HDR', 'Coding_sequence'
                                 ],
                                 comment='#',
                                 sep='\t',
                                 dtype={
                                     'Name': str,
                                     'chr_id': str
                                 })

        #remove empty amplicons/lines
        df_regions.dropna(subset=['chr_id', 'bpstart', 'bpend'], inplace=True)

        df_regions.Expected_HDR = df_regions.Expected_HDR.apply(
            capitalize_sequence)
        df_regions.sgRNA = df_regions.sgRNA.apply(capitalize_sequence)
        df_regions.Coding_sequence = df_regions.Coding_sequence.apply(
            capitalize_sequence)

        #check or create names
        for idx, row in df_regions.iterrows():
            if pd.isnull(row.Name):
                df_regions.iloc[idx, ]['Name'] = '_'.join(
                    map(str, [row['chr_id'], row['bpstart'], row['bpend']]))

        if not len(df_regions.Name.unique()) == df_regions.shape[0]:
            raise Exception('The amplicon names should be all distinct!')

        df_regions.set_index('Name', inplace=True)
        #df_regions.index=df_regions.index.str.replace(' ','_')
        df_regions.index = df_regions.index.to_series().str.replace(' ', '_')

        #extract sequence for each region
        uncompressed_reference = args.reference_file

        if os.path.exists(uncompressed_reference + '.fai'):
            info(
                'The index for the reference fasta file is already present! Skipping generation.'
            )
        else:
            info('Indexing reference file... Please be patient!')
            sb.call('samtools faidx %s >>%s 2>&1' %
                    (uncompressed_reference, log_filename),
                    shell=True)

        info(
            'Retrieving reference sequences for amplicons and checking for sgRNAs'
        )
        df_regions['sequence'] = df_regions.apply(
            lambda row: get_region_from_fa(row.chr_id, row.bpstart, row.bpend,
                                           uncompressed_reference),
            axis=1)

        for idx, row in df_regions.iterrows():

            if not pd.isnull(row.sgRNA):

                cut_points = []
                guides = row.sgRNA.strip().upper().split(',')
                guide_qw_centers = CRISPRessoShared.set_guide_array(
                    args.quantification_window_center, guides,
                    'guide quantification center')
                for idx, current_guide_seq in enumerate(guides):

                    wrong_nt = find_wrong_nt(current_guide_seq)
                    if wrong_nt:
                        raise NTException(
                            'The sgRNA sequence %s contains wrong characters:%s'
                            % (current_guide_seq, ' '.join(wrong_nt)))

                    offset_fw = guide_qw_centers[idx] + len(
                        current_guide_seq) - 1
                    offset_rc = (-guide_qw_centers[idx]) - 1
                    cut_points+=[m.start() + offset_fw for \
                                m in re.finditer(current_guide_seq,  row.sequence)]+[m.start() + offset_rc for m in re.finditer(CRISPRessoShared.reverse_complement(current_guide_seq),  row.sequence)]

                if not cut_points:
                    df_regions.iloc[idx, :]['sgRNA'] = ''
                    info('Cannot find guide ' + str(row.sgRNA) +
                         ' in amplicon ' + str(idx) + ' (' + str(row) + ')')

        df_regions['bpstart'] = pd.to_numeric(df_regions['bpstart'])
        df_regions['bpend'] = pd.to_numeric(df_regions['bpend'])

        df_regions.bpstart = df_regions.bpstart.astype(int)
        df_regions.bpend = df_regions.bpend.astype(int)

        if args.gene_annotations:
            df_regions = df_regions.apply(
                lambda row: find_overlapping_genes(row, df_genes), axis=1)

        #extract reads with samtools in that region and create a bam
        #create a fasta file with all the trimmed reads
        info('\nProcessing each region...')

        ANALYZED_REGIONS = _jp('ANALYZED_REGIONS/')
        if not os.path.exists(ANALYZED_REGIONS):
            os.mkdir(ANALYZED_REGIONS)

        df_regions['region_number'] = np.arange(len(df_regions))

        def set_filenames(row):
            """Build the per-region output paths for one row of the regions table.

            The file names are derived from ``row.region_number`` and placed
            in the ANALYZED_REGIONS folder.

            Returns a tuple ``(bam_path, fastq_gz_path, fastq_exists)`` where
            ``fastq_exists`` is True when the fastq.gz file is already on disk.
            """
            region_stem = clean_filename('REGION_' + str(row.region_number))
            region_bam = os.path.join(ANALYZED_REGIONS, '%s.bam' % region_stem)
            region_fastq_gz = os.path.join(ANALYZED_REGIONS,
                                           '%s.fastq.gz' % region_stem)
            #the flag reflects the presence of the fastq.gz file (not the bam)
            fastq_already_present = os.path.isfile(region_fastq_gz)
            return region_bam, region_fastq_gz, fastq_already_present

        df_regions['bam_file_with_reads_in_region'], df_regions[
            'fastq_file_trimmed_reads_in_region'], df_regions[
                'row_fastq_exists'] = zip(
                    *df_regions.apply(set_filenames, axis=1))
        df_regions['n_reads'] = 0
        df_regions[
            'original_bam'] = args.bam_file  #stick this in the df so we can parallelize the analysis and not pass params

        report_reads_aligned_filename = _jp(
            'REPORT_READS_ALIGNED_TO_SELECTED_REGIONS_WGS.txt')
        num_rows_without_fastq = len(
            df_regions[df_regions.row_fastq_exists == False])

        if can_finish_incomplete_run and num_rows_without_fastq == 0 and os.path.isfile(
                report_reads_aligned_filename
        ) and 'generation_of_fastq_files_for_each_amplicon' in crispresso2_info[
                'running_info']['finished_steps']:
            info('Skipping generation of fastq files for each amplicon.')
            df_regions = pd.read_csv(report_reads_aligned_filename,
                                     comment='#',
                                     sep='\t',
                                     dtype={
                                         'Name': str,
                                         'chr_id': str
                                     })
            df_regions.set_index('Name', inplace=True)

        else:
            #run region extraction here
            df_regions = CRISPRessoMultiProcessing.run_pandas_apply_parallel(
                df_regions, extract_reads_chunk, n_processes_for_wgs)
            df_regions.sort_values('region_number', inplace=True)
            cols_to_print = [
                "chr_id", "bpstart", "bpend", "sgRNA", "Expected_HDR",
                "Coding_sequence", "sequence", "n_reads",
                "bam_file_with_reads_in_region",
                "fastq_file_trimmed_reads_in_region"
            ]
            if args.gene_annotations:
                cols_to_print.append('gene_overlapping')
            df_regions.fillna('NA').to_csv(report_reads_aligned_filename,
                                           sep='\t',
                                           columns=cols_to_print,
                                           index_label="Name")

            #save progress
            crispresso2_info['running_info']['finished_steps'][
                'generation_of_fastq_files_for_each_amplicon'] = True
            CRISPRessoShared.write_crispresso_info(
                crispresso2_info_file,
                crispresso2_info,
            )

        #Run Crispresso
        info('Running CRISPResso on each region...')
        crispresso_cmds = []
        for idx, row in df_regions.iterrows():
            if row['n_reads'] >= args.min_reads_to_use_region:
                info('\nThe region [%s] has enough reads (%d) mapped to it!' %
                     (idx, row['n_reads']))

                crispresso_cmd= args.crispresso_command + ' -r1 %s -a %s -o %s --name %s' %\
                (row['fastq_file_trimmed_reads_in_region'], row['sequence'], OUTPUT_DIRECTORY, idx)

                if row['sgRNA'] and not pd.isnull(row['sgRNA']):
                    crispresso_cmd += ' -g %s' % row['sgRNA']

                if row['Expected_HDR'] and not pd.isnull(row['Expected_HDR']):
                    crispresso_cmd += ' -e %s' % row['Expected_HDR']

                if row['Coding_sequence'] and not pd.isnull(
                        row['Coding_sequence']):
                    crispresso_cmd += ' -c %s' % row['Coding_sequence']

                crispresso_cmd = CRISPRessoShared.propagate_crispresso_options(
                    crispresso_cmd, crispresso_options_for_wgs, args)

                #logging like this causes the multiprocessing step to not block for some reason #mysteriesOfThPythonUniverse
                #log_name = _jp("CRISPResso_on_"+idx) +".log"
                #crispresso_cmd += " &> %s"%log_name

                crispresso_cmds.append(crispresso_cmd)


#                    info('Running CRISPResso:%s' % crispresso_cmd)
#                    sb.call(crispresso_cmd,shell=True)

            else:
                info(
                    '\nThe region [%s] has too few reads mapped to it (%d)! Not running CRISPResso!'
                    % (idx, row['n_reads']))

        CRISPRessoMultiProcessing.run_crispresso_cmds(crispresso_cmds,
                                                      n_processes_for_wgs,
                                                      'region',
                                                      args.skip_failed)

        quantification_summary = []
        all_region_names = []
        all_region_read_counts = {}
        good_region_names = []
        good_region_folders = {}
        header = 'Name\tUnmodified%\tModified%\tReads_total\tReads_aligned\tUnmodified\tModified\tDiscarded\tInsertions\tDeletions\tSubstitutions\tOnly Insertions\tOnly Deletions\tOnly Substitutions\tInsertions and Deletions\tInsertions and Substitutions\tDeletions and Substitutions\tInsertions Deletions and Substitutions'
        header_els = header.split("\t")
        header_el_count = len(header_els)
        empty_line_els = [np.nan] * (header_el_count - 1)
        n_reads_index = header_els.index('Reads_total') - 1
        for idx, row in df_regions.iterrows():
            folder_name = 'CRISPResso_on_%s' % idx
            run_name = idx

            all_region_names.append(run_name)
            all_region_read_counts[run_name] = row.n_reads

            run_file = os.path.join(_jp(folder_name), 'CRISPResso2_info.json')
            if not os.path.exists(run_file):
                warn(
                    'Skipping the folder %s: not enough reads, incomplete, or empty folder.'
                    % folder_name)
                this_els = empty_line_els[:]
                this_els[n_reads_index] = row.n_reads
                to_add = [run_name]
                to_add.extend(this_els)
                quantification_summary.append(to_add)
            else:
                run_data = CRISPRessoShared.load_crispresso_info(
                    _jp(folder_name), )
                ref_name = run_data['results']['ref_names'][
                    0]  #only expect one amplicon sequence
                n_tot = row.n_reads
                n_aligned = run_data['results']['alignment_stats'][
                    'counts_total'][ref_name]
                n_unmod = run_data['results']['alignment_stats'][
                    'counts_unmodified'][ref_name]
                n_mod = run_data['results']['alignment_stats'][
                    'counts_modified'][ref_name]
                n_discarded = run_data['results']['alignment_stats'][
                    'counts_discarded'][ref_name]

                n_insertion = run_data['results']['alignment_stats'][
                    'counts_insertion'][ref_name]
                n_deletion = run_data['results']['alignment_stats'][
                    'counts_deletion'][ref_name]
                n_substitution = run_data['results']['alignment_stats'][
                    'counts_substitution'][ref_name]
                n_only_insertion = run_data['results']['alignment_stats'][
                    'counts_only_insertion'][ref_name]
                n_only_deletion = run_data['results']['alignment_stats'][
                    'counts_only_deletion'][ref_name]
                n_only_substitution = run_data['results']['alignment_stats'][
                    'counts_only_substitution'][ref_name]
                n_insertion_and_deletion = run_data['results'][
                    'alignment_stats']['counts_insertion_and_deletion'][
                        ref_name]
                n_insertion_and_substitution = run_data['results'][
                    'alignment_stats']['counts_insertion_and_substitution'][
                        ref_name]
                n_deletion_and_substitution = run_data['results'][
                    'alignment_stats']['counts_deletion_and_substitution'][
                        ref_name]
                n_insertion_and_deletion_and_substitution = run_data[
                    'results']['alignment_stats'][
                        'counts_insertion_and_deletion_and_substitution'][
                            ref_name]

                unmod_pct = "NA"
                mod_pct = "NA"
                if n_aligned > 0:
                    unmod_pct = 100 * n_unmod / float(n_aligned)
                    mod_pct = 100 * n_mod / float(n_aligned)

                vals = [run_name]
                vals.extend([
                    round(unmod_pct, 8),
                    round(mod_pct, 8), n_aligned, n_tot, n_unmod, n_mod,
                    n_discarded, n_insertion, n_deletion, n_substitution,
                    n_only_insertion, n_only_deletion, n_only_substitution,
                    n_insertion_and_deletion, n_insertion_and_substitution,
                    n_deletion_and_substitution,
                    n_insertion_and_deletion_and_substitution
                ])
                quantification_summary.append(vals)

                good_region_names.append(idx)
                good_region_folders[idx] = folder_name
        samples_quantification_summary_filename = _jp(
            'SAMPLES_QUANTIFICATION_SUMMARY.txt')

        df_summary_quantification = pd.DataFrame(quantification_summary,
                                                 columns=header_els)
        if args.crispresso1_mode:
            crispresso1_columns = [
                'Name', 'Unmodified%', 'Modified%', 'Reads_aligned',
                'Reads_total'
            ]
            df_summary_quantification.fillna('NA').to_csv(
                samples_quantification_summary_filename,
                sep='\t',
                index=None,
                columns=crispresso1_columns)
        else:
            df_summary_quantification.fillna('NA').to_csv(
                samples_quantification_summary_filename, sep='\t', index=None)

        crispresso2_info['results']['alignment_stats'][
            'samples_quantification_summary_filename'] = os.path.basename(
                samples_quantification_summary_filename)
        crispresso2_info['results']['regions'] = df_regions
        crispresso2_info['results']['all_region_names'] = all_region_names
        crispresso2_info['results'][
            'all_region_read_counts'] = all_region_read_counts
        crispresso2_info['results']['good_region_names'] = good_region_names
        crispresso2_info['results'][
            'good_region_folders'] = good_region_folders

        crispresso2_info['results']['general_plots']['summary_plot_names'] = []
        crispresso2_info['results']['general_plots'][
            'summary_plot_titles'] = {}
        crispresso2_info['results']['general_plots'][
            'summary_plot_labels'] = {}
        crispresso2_info['results']['general_plots']['summary_plot_datas'] = {}

        df_summary_quantification.set_index('Name')

        save_png = True
        if args.suppress_report:
            save_png = False

        if not args.suppress_plots:
            plot_root = _jp("CRISPRessoWGS_reads_summary")
            CRISPRessoPlot.plot_reads_total(plot_root,
                                            df_summary_quantification,
                                            save_png,
                                            args.min_reads_to_use_region)
            plot_name = os.path.basename(plot_root)
            crispresso2_info['results']['general_plots'][
                'reads_summary_plot'] = plot_name
            crispresso2_info['results']['general_plots'][
                'summary_plot_names'].append(plot_name)
            crispresso2_info['results']['general_plots'][
                'summary_plot_titles'][
                    plot_name] = 'CRISPRessoWGS Read Allocation Summary'
            crispresso2_info['results']['general_plots']['summary_plot_labels'][
                plot_name] = 'Each bar shows the total number of reads allocated to each amplicon. The vertical line shows the cutoff for analysis, set using the --min_reads_to_use_region parameter.'
            crispresso2_info['results']['general_plots']['summary_plot_datas'][
                plot_name] = [
                    ('CRISPRessoWGS summary',
                     os.path.basename(samples_quantification_summary_filename))
                ]

            plot_root = _jp("CRISPRessoWGS_modification_summary")
            CRISPRessoPlot.plot_unmod_mod_pcts(plot_root,
                                               df_summary_quantification,
                                               save_png,
                                               args.min_reads_to_use_region)
            plot_name = os.path.basename(plot_root)
            crispresso2_info['results']['general_plots'][
                'modification_summary_plot'] = plot_name
            crispresso2_info['results']['general_plots'][
                'summary_plot_names'].append(plot_name)
            crispresso2_info['results']['general_plots'][
                'summary_plot_titles'][
                    plot_name] = 'CRISPRessoWGS Modification Summary'
            crispresso2_info['results']['general_plots']['summary_plot_labels'][
                plot_name] = 'Each bar shows the total number of reads aligned to each amplicon, divided into the reads that are modified and unmodified. The vertical line shows the cutoff for analysis, set using the --min_reads_to_use_region parameter.'
            crispresso2_info['results']['general_plots']['summary_plot_datas'][
                plot_name] = [
                    ('CRISPRessoWGS summary',
                     os.path.basename(samples_quantification_summary_filename))
                ]

        if not args.suppress_report and not args.suppress_plots:
            if (args.place_report_in_output_folder):
                report_name = _jp("CRISPResso2WGS_report.html")
            else:
                report_name = OUTPUT_DIRECTORY + '.html'
            CRISPRessoReport.make_wgs_report_from_folder(
                report_name, crispresso2_info, OUTPUT_DIRECTORY, _ROOT)
            crispresso2_info['running_info']['report_location'] = report_name
            crispresso2_info['running_info'][
                'report_filename'] = os.path.basename(report_name)

        end_time = datetime.now()
        end_time_string = end_time.strftime('%Y-%m-%d %H:%M:%S')
        running_time = end_time - start_time
        running_time_string = str(running_time)

        crispresso2_info['running_info']['end_time'] = end_time
        crispresso2_info['running_info']['end_time_string'] = end_time_string
        crispresso2_info['running_info']['running_time'] = running_time
        crispresso2_info['running_info'][
            'running_time_string'] = running_time_string

        CRISPRessoShared.write_crispresso_info(
            crispresso2_info_file,
            crispresso2_info,
        )

        info('Analysis Complete!')
        print(CRISPRessoShared.get_crispresso_footer())
        sys.exit(0)

    except Exception as e:
        print_stacktrace_if_debug()
        error('\n\nERROR: %s' % e)
        sys.exit(-1)
Пример #3
0
def main():
    """Entry point for CRISPRessoPooledWGSCompare.

    Steps performed:
      1. Parse the command line (two CRISPRessoPooled/CRISPRessoWGS output
         folders plus optional naming, parallelism and report options).
      2. Join the quantification summaries of both folders on the region
         ``Name`` and write the combined table, including the difference in
         ``Unmodified%`` between the two samples.
      3. Build and run one CRISPRessoCompare command for every region that
         has data in both analyses.
      4. Optionally write an aggregate HTML report, then save the run info.

    The function never returns normally: it calls ``sys.exit(0)`` on success
    and ``sys.exit(-1)`` after logging the error when an ``Exception`` is
    raised (the traceback is printed only when ``--debug`` was given).
    """
    try:
        description = [
            '~~~CRISPRessoPooledWGSCompare~~~',
            '-Comparison of two CRISPRessoPooled or CRISPRessoWGS analyses-',
        ]

        compare_header = r'''
 ____________________________________
| __  __  __     __ __        __  __ |
||__)/  \/  \|  |_ |  \ /|  |/ _ (_  |
||   \__/\__/|__|__|__// |/\|\__)__) |
|   __ __      __      __  __        |
|  /  /  \|\/||__) /\ |__)|_         |
|  \__\__/|  ||   /--\| \ |__        |
|____________________________________|
        '''
        compare_header = CRISPRessoShared.get_crispresso_header(
            description,
            compare_header,
        )
        print(compare_header)

        parser = argparse.ArgumentParser(
            description='CRISPRessoPooledWGSCompare Parameters',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        )
        parser.add_argument(
            'crispresso_pooled_wgs_output_folder_1',
            type=str,
            help=
            'First output folder with CRISPRessoPooled or CRISPRessoWGS analysis',
        )
        parser.add_argument(
            'crispresso_pooled_wgs_output_folder_2',
            type=str,
            help=
            'Second output folder with CRISPRessoPooled or CRISPRessoWGS analysis',
        )

        #OPTIONALS
        parser.add_argument('-n', '--name', help='Output name', default='')
        parser.add_argument(
            '-n1',
            '--sample_1_name',
            help='Sample 1 name',
            default='Sample_1',
        )
        parser.add_argument(
            '-n2',
            '--sample_2_name',
            help='Sample 2 name',
            default='Sample_2',
        )
        parser.add_argument('-o', '--output_folder', help='', default='')
        parser.add_argument(
            '-p',
            '--n_processes',
            type=str,
            help="""
Specify the number of processes to use for analysis.
Please use with caution since increasing this parameter will significantly
increase the memory required to run CRISPResso. Can be set to 'max'.
            """,
            default='1',
        )
        parser.add_argument(
            '--min_frequency_alleles_around_cut_to_plot',
            type=float,
            help=
            'Minimum %% reads required to report an allele in the alleles table plot.',
            default=0.2,
        )
        parser.add_argument(
            '--max_rows_alleles_around_cut_to_plot',
            type=int,
            help='Maximum number of rows to report in the alleles table plot. ',
            default=50,
        )
        parser.add_argument(
            '--place_report_in_output_folder',
            help=
            'If true, report will be written inside the CRISPResso output folder. By default, the report will be written one directory up from the report output.',
            action='store_true',
        )
        parser.add_argument(
            '--suppress_report',
            help='Suppress output report',
            action='store_true',
        )
        parser.add_argument(
            '--debug',
            help='Show debug messages',
            action='store_true',
        )

        args = parser.parse_args()

        # options forwarded unchanged to every CRISPRessoCompare invocation
        crispresso_compare_options = [
            'min_frequency_alleles_around_cut_to_plot',
            'max_rows_alleles_around_cut_to_plot',
            'place_report_in_output_folder',
            'suppress_report',
            'debug',
        ]

        # slugified names are used for column suffixes and output names
        sample_1_name = CRISPRessoShared.slugify(args.sample_1_name)
        sample_2_name = CRISPRessoShared.slugify(args.sample_2_name)

        if args.n_processes == 'max':
            n_processes = CRISPRessoMultiProcessing.get_max_processes()
        else:
            n_processes = int(args.n_processes)

        # check that the CRISPRessoPooled output is present
        quantification_summary_file_1 = check_PooledWGS_output_folder(
            args.crispresso_pooled_wgs_output_folder_1, )
        quantification_summary_file_2 = check_PooledWGS_output_folder(
            args.crispresso_pooled_wgs_output_folder_2, )

        # create outputfolder and initialize the log
        get_name_from_folder = lambda x: os.path.basename(os.path.abspath(
            x)).replace('CRISPRessoPooled_on_', '').replace(
                'CRISPRessoWGS_on_', '')

        if not args.name:
            database_id = '{0}_VS_{1}'.format(
                get_name_from_folder(
                    args.crispresso_pooled_wgs_output_folder_1, ),
                get_name_from_folder(
                    args.crispresso_pooled_wgs_output_folder_2, ),
            )
        else:
            database_id = CRISPRessoShared.slugify(args.name)

        OUTPUT_DIRECTORY = 'CRISPRessoPooledWGSCompare_on_{0}'.format(
            database_id)

        if args.output_folder:
            OUTPUT_DIRECTORY = os.path.join(
                os.path.abspath(args.output_folder),
                OUTPUT_DIRECTORY,
            )

        _jp = lambda filename: os.path.join(
            OUTPUT_DIRECTORY, filename
        )  #handy function to put a file in the output directory

        info('Creating Folder %s' % OUTPUT_DIRECTORY)
        try:
            os.makedirs(OUTPUT_DIRECTORY)
        except OSError:
            # Only a pre-existing directory is benign. Any other failure
            # (permissions, invalid path, ...) must surface instead of being
            # reported as "already exists".
            if not os.path.isdir(OUTPUT_DIRECTORY):
                raise
            warn('Folder %s already exists.' % OUTPUT_DIRECTORY)
        else:
            info('Done!')

        log_filename = _jp('CRISPRessoPooledWGSCompare_RUNNING_LOG.txt')
        logger.addHandler(logging.FileHandler(log_filename))

        with open(log_filename, 'w+') as outfile:
            outfile.write(
                '[Command used]:\nCRISPRessoPooledWGSCompare {0}\n\n[Execution log]:\n'
                .format(' '.join(sys.argv), ), )

        crispresso2Compare_info_file = os.path.join(
            OUTPUT_DIRECTORY, 'CRISPResso2PooledWGSCompare_info.pickle')
        crispresso2_info = {
            'running_info': {},
            'results': {
                'alignment_stats': {},
                'general_plots': {}
            }
        }  #keep track of all information for this run to be saved at the end of the run
        crispresso2_info['running_info'][
            'version'] = CRISPRessoShared.__version__
        crispresso2_info['running_info']['args'] = deepcopy(args)

        crispresso2_info['running_info']['log_filename'] = os.path.basename(
            log_filename)

        crispresso2_info['results']['general_plots']['summary_plot_names'] = []
        crispresso2_info['results']['general_plots'][
            'summary_plot_titles'] = {}
        crispresso2_info['results']['general_plots'][
            'summary_plot_labels'] = {}
        crispresso2_info['results']['general_plots']['summary_plot_datas'] = {}

        # load data and calculate the difference
        df_quant_1 = pd.read_csv(quantification_summary_file_1, sep='\t')
        df_quant_2 = pd.read_csv(quantification_summary_file_2, sep='\t')
        df_comp = df_quant_1.set_index('Name').join(
            df_quant_2.set_index('Name'),
            lsuffix='_{0}'.format(sample_1_name),
            rsuffix='_{0}'.format(sample_2_name),
        )

        # str.format does not treat '%' specially, so the column names carry
        # a single '%'. The suffixes must be the slugified sample names that
        # were handed to join() above.
        unmodified_diff_column = '({0}-{1})_Unmodified%'.format(
            sample_1_name, sample_2_name)
        df_comp[unmodified_diff_column] = (
            df_comp['Unmodified%_{0}'.format(sample_1_name)] -
            df_comp['Unmodified%_{0}'.format(sample_2_name)])

        df_comp.fillna('NA').to_csv(
            _jp('COMPARISON_SAMPLES_QUANTIFICATION_SUMMARIES.txt'), sep='\t')

        # now run CRISPRessoCompare for the pairs for which we have data in both folders
        crispresso_cmds = []
        processed_regions = []  # ordered; stored in the run info and report
        seen_regions = set()  # fast duplicate check for repeated index labels
        processed_region_folder_names = {}
        processed_region_html_files = {}
        for idx, row in df_comp.iterrows():
            if idx in seen_regions:
                continue
            if row.isnull().any():
                warn(
                    'Skipping sample {0} since it was not processed in one or both conditions'
                    .format(idx))
                continue

            seen_regions.add(idx)
            processed_regions.append(idx)
            crispresso_output_folder_1 = os.path.join(
                args.crispresso_pooled_wgs_output_folder_1,
                'CRISPResso_on_{0}'.format(idx),
            )
            crispresso_output_folder_2 = os.path.join(
                args.crispresso_pooled_wgs_output_folder_2,
                'CRISPResso_on_{0}'.format(idx),
            )
            compare_output_name = '{0}_{1}_VS_{2}'.format(
                idx,
                sample_1_name,
                sample_2_name,
            )
            # format() (rather than '+') keeps this working when the region
            # name was parsed as a non-string value (e.g. an integer).
            crispresso_compare_cmd = CRISPResso_compare_to_call + \
                ' "{0}" "{1}" -o "{2}" -n {3} -n1 "{4}" -n2 "{5}" '.format(
                  crispresso_output_folder_1,
                  crispresso_output_folder_2,
                  OUTPUT_DIRECTORY,
                  compare_output_name,
                  '{0}_{1}'.format(args.sample_1_name, idx),
                  '{0}_{1}'.format(args.sample_2_name, idx),
                )

            crispresso_compare_cmd = CRISPRessoShared.propagate_crispresso_options(
                crispresso_compare_cmd,
                crispresso_compare_options,
                args,
            )
            info('Running CRISPRessoCompare:%s' % crispresso_compare_cmd)
            crispresso_cmds.append(crispresso_compare_cmd)

            sub_folder = os.path.join(
                OUTPUT_DIRECTORY,
                'CRISPRessoCompare_on_' + compare_output_name,
            )
            # location of the per-region report, relative to OUTPUT_DIRECTORY
            this_sub_html_file = os.path.basename(sub_folder) + ".html"
            if args.place_report_in_output_folder:
                this_sub_html_file = os.path.join(
                    os.path.basename(sub_folder),
                    "CRISPResso2Compare_report.html",
                )
            processed_region_html_files[idx] = this_sub_html_file
            processed_region_folder_names[idx] = compare_output_name

        CRISPRessoMultiProcessing.run_crispresso_cmds(
            crispresso_cmds,
            n_processes,
            'Comparison',
        )
        crispresso2_info['results']['processed_regions'] = processed_regions
        crispresso2_info['results'][
            'processed_region_folder_names'] = processed_region_folder_names

        if not args.suppress_report:
            if args.place_report_in_output_folder:
                report_name = _jp("CRISPResso2PooledWGSCompare_report.html")
            else:
                report_name = OUTPUT_DIRECTORY + '.html'
            CRISPRessoReport.make_multi_report(
                processed_regions,
                processed_region_html_files,
                report_name,
                OUTPUT_DIRECTORY,
                _ROOT,
                'CRISPREssoPooledWGSCompare Report<br>{0} vs {1}'.format(
                    sample_1_name,
                    sample_2_name,
                ),
            )
            crispresso2_info['running_info']['report_location'] = report_name
            crispresso2_info['running_info'][
                'report_filename'] = os.path.basename(report_name)

        CRISPRessoShared.write_crispresso_info(
            crispresso2Compare_info_file,
            crispresso2_info,
        )

        info('All Done!')
        print(CRISPRessoShared.get_crispresso_footer())
        sys.exit(0)

    except Exception as e:
        # args may not exist yet if the failure happened before/during parsing
        debug_flag = False
        if 'args' in vars() and 'debug' in args:
            debug_flag = args.debug

        if debug_flag:
            traceback.print_exc(file=sys.stdout)
        error('\n\nERROR: %s' % e)
        sys.exit(-1)