def main(args=None):

    args = parse_arguments().parse_args(args)

    matrices_name = args.matrix
    threads = args.threads
    matrices_list = cell_name_list(matrices_name)
    bulk_matrix = None

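    # Fan-out: split the cell list into one chunk per thread and let each
    # worker process build a partial bulk matrix from its chunk.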
    all_data_collected = False
    thread_done = [False] * threads
    length_index = [None] * threads
    length_index[0] = 0
    matricesPerThread = len(matrices_list) // threads
    queue = [None] * threads
    process = [None] * threads
    for i in range(threads):

        if i < threads - 1:
            matrices_name_list = matrices_list[i * matricesPerThread:(i + 1) *
                                               matricesPerThread]
            length_index[i + 1] = length_index[i] + len(matrices_name_list)
        else:
            matrices_name_list = matrices_list[i * matricesPerThread:]

        queue[i] = Queue()
        process[i] = Process(target=create_bulk_matrix,
                             kwargs=dict(pMatrixName=matrices_name,
                                         pMatricesList=matrices_name_list,
                                         pQueue=queue[i]))

        process[i].start()

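    # Fan-in: poll each worker's queue and sum the partial matrices into
    # bulk_matrix until every thread has reported back.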
    while not all_data_collected:
        for i in range(threads):
            if queue[i] is not None and not queue[i].empty():
                csr_matrix_worker = queue[i].get()
                if bulk_matrix is None:
                    bulk_matrix = csr_matrix_worker
                else:
                    bulk_matrix.matrix += csr_matrix_worker.matrix

                queue[i] = None
                process[i].join()
                process[i].terminate()
                process[i] = None
                thread_done[i] = True
        all_data_collected = True
        for thread in thread_done:
            if not thread:
                all_data_collected = False
        time.sleep(1)

    bulk_matrix.save(args.outFileName)
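
# A minimal sketch, assuming hicmatrix's hiCMatrix loader, of the worker each
# process above runs; the real create_bulk_matrix lives in scHiCExplorer, and
# the name and body below are illustrative only: sum the cell matrices of one
# chunk and put the partial result on the queue.
def create_bulk_matrix_sketch(pMatrixName, pMatricesList, pQueue):
    from hicmatrix import HiCMatrix as hm
    bulk = None
    for cell in pMatricesList:
        # each cell is a cool group inside the scool file
        hic_matrix = hm.hiCMatrix(pMatrixFile=pMatrixName + '::' + cell)
        if bulk is None:
            bulk = hic_matrix
        else:
            bulk.matrix += hic_matrix.matrix
    pQueue.put(bulk)  # hand the partial sum back to the parent process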
Example #2
def main(args=None):

    args = parse_arguments().parse_args(args)

    matrices_list = cell_name_list(args.matrix)

    print('Filename: {}'.format(args.matrix))
    print('Contains {} single-cell matrices'.format(len(matrices_list)))
    print('The information stored via cooler.info of the first cell is: \n')
    cooler_file = cooler.Cooler(args.matrix + '::' + matrices_list[0])

    if cooler_file.info is not None:
        for key, value in cooler_file.info.items():
            print(key, value)
    print('Chromosomes: {}'.format(cooler_file.chromnames))

    if args.writeOutNames is not None:
        with open(args.writeOutNames, 'w') as file:
            for cell in matrices_list:
                file.write("{}\n".format(cell[7:]))
Example #3
def main(args=None):

    args = parse_arguments().parse_args(args)
    matrices_list = cell_name_list(args.matrix)
    # if args.labels and len(matrices_list) != len(args.labels):
    #     log.error("The number of labels does not match the number of matrices.")
    #     exit(0)

    if args.labels:
        label_list = [None] * len(matrices_list)
        with open(args.labels, 'r') as file:
            for line in file.readlines():
                try:
                    matrix_name, label_name = line.strip().split('\t')
                except Exception:
                    matrix_name, label_name = line.strip().split('    ')
                if matrix_name in matrices_list:
                    index = matrices_list.index(matrix_name)
                    label_list[index] = label_name
            args.labels = label_list
    else:
        label_list = [x.split('/')[2].split('.')[0].split('_')[2] for x in matrices_list]
        args.labels = label_list

    num_files = len(matrices_list)
    # initialize results matrix
    results = np.zeros((num_files, num_files), dtype='float')

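    # correlate every pair of cells exactly once by iterating over the upper
    # triangle; the lower triangle is mirrored in after the computation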
    rows, cols = np.triu_indices(num_files)
    correlation_opts = {'spearman': spearmanr,
                        'pearson': pearsonr}
    hic_mat_list = []
    max_value = None
    min_value = None
    all_mat = None
    all_nan = []

    # load csr matrices in parallel
    chromosome_indices = None
    cooler_obj = cooler.Cooler(args.matrix + '::' + matrices_list[0])
    binsDataFrame = cooler_obj.bins()[:]
    chromosome_indices = {}
    for chromosome in cooler_obj.chromnames:
        chromosome_indices[chromosome] = np.array(binsDataFrame.index[binsDataFrame['chrom'] == chromosome].tolist())

    threads = args.threads
    all_data_collected = False
    thread_done = [False] * threads
    length_index = [None] * threads
    matrix_list_threads = [None] * threads
    # all_mat_thread = [None] * threads
    length_index[0] = 0
    matricesPerThread = len(matrices_list) // threads
    queue = [None] * threads
    process = [None] * threads
    for i in range(threads):

        if i < threads - 1:
            matrices_name_list = matrices_list[i * matricesPerThread:(i + 1) * matricesPerThread]
            length_index[i + 1] = length_index[i] + len(matrices_name_list)
        else:
            matrices_name_list = matrices_list[i * matricesPerThread:]

        queue[i] = Queue()
        process[i] = Process(target=load_matrix_list, kwargs=dict(
            pMatrixName=args.matrix,
            pMatricesList=matrices_name_list,
            pArgs=args,
            pChromosomeIndices=chromosome_indices,
            pQueue=queue[i]
        )
        )
        process[i].start()

    fail_flag = False
    time_start = time.time()
    wait_threshold = 60 * 5
    while not all_data_collected:
        for i in range(threads):
            if queue[i] is not None and not queue[i].empty():
                csr_matrix_worker = queue[i].get()
                if isinstance(csr_matrix_worker, str):
                    log.error('{}'.format(csr_matrix_worker))
                    fail_flag = True
                else:
                    matrix_list_threads[i], all_mat_thread = csr_matrix_worker
                    if all_mat is None:
                        all_mat = all_mat_thread
                    else:
                        all_mat += all_mat_thread
                queue[i] = None
                process[i].join()
                process[i].terminate()
                process[i] = None
                thread_done[i] = True
                time_start = time.time()
        all_data_collected = True
        for thread in thread_done:
            if not thread:
                all_data_collected = False

        if time.time() - time_start > wait_threshold:
            log.error('The wait threshold time limit is reached. It seems parts of your data are too large to be passed back through Python\'s queues. Please use either a higher number of threads or use the `--saveMemory` option if available.')

            for i in range(threads):
                if process[i] is not None:
                    process[i].join()
                    process[i].terminate()
            exit(1)
        time.sleep(1)

    if fail_flag:
        # log.error(fail_message)
        exit(1)

    hic_mat_list = [item for sublist in matrix_list_threads for item in sublist]
    # remove nan bins

    rows_keep = cols_keep = np.delete(list(range(all_mat.shape[1])), all_nan)
    all_mat = all_mat[rows_keep, :][:, cols_keep]

    # make large matrix to correlate by
    # using sparse matrix tricks

    big_mat = None
    for mat in hic_mat_list:
        mat = mat[rows_keep, :][:, cols_keep]
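        # sparse alignment trick: adding all_mat forces mat onto the union
        # sparsity pattern, and subtracting all_mat's values leaves the
        # sample's entries in that shared .data ordering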
        sample_vector = (mat + all_mat).data - all_mat.data
        if big_mat is None:
            big_mat = sample_vector
        else:
            big_mat = np.vstack([big_mat, sample_vector])

    # take the transpose such that columns represent each of the samples
    big_mat = np.ma.masked_invalid(big_mat).T

    grids = gridspec.GridSpec(num_files, num_files)
    grids.update(wspace=0, hspace=0)
    plt.figure(figsize=(2 * num_files, 2 * num_files))
    plt.rcParams['font.size'] = 8.0

    min_value = int(big_mat.min())
    max_value = int(big_mat.max())
    if (min_value % 2 == 0 and max_value % 2 == 0) or \
            (min_value % 2 == 1 and max_value % 2 == 1):
        # make one value odd and the other even
        max_value += 1

    # if args.log1p:
    #     major_locator = FixedLocator(list(range(min_value, max_value, 2)))
    #     minor_locator = FixedLocator(list(range(min_value, max_value, 1)))

    # parallel correlation computation
    all_data_collected = False
    thread_done = [False] * threads
    matricesPerThread = len(rows) // threads

    queue = [None] * threads
    process = [None] * threads
    for i in range(threads):

        if i < threads - 1:
            start_index = i * matricesPerThread
            end_index = (i + 1) * matricesPerThread
        else:
            start_index = i * matricesPerThread
            end_index = len(rows)

        queue[i] = Queue()
        process[i] = Process(target=compute_correlation, kwargs=dict(
            pCorrelationFunction=correlation_opts[args.method],
            pRows=rows,
            pColumns=cols,
            pBigMatrix=big_mat,
            pIndexStart=start_index,
            pIndexEnd=end_index,
            pResults=results,
            pQueue=queue[i]
        )
        )
        process[i].start()

    fail_flag = False
    time_start = time.time()
    while not all_data_collected:
        for i in range(threads):
            if queue[i] is not None and not queue[i].empty():
                correlated_result = queue[i].get()
                if isinstance(correlated_result, str):
                    log.error('{}'.format(correlated_result))
                    fail_flag = True
                else:
                    results += correlated_result
                queue[i] = None
                process[i].join()
                process[i].terminate()
                process[i] = None
                thread_done[i] = True
                time_start = time.time()
        all_data_collected = True
        for thread in thread_done:
            if not thread:
                all_data_collected = False

        time.sleep(1)

    if fail_flag:
        exit(1)

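    # results holds only the upper triangle; mirror it to obtain the full
    # symmetric correlation matrix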
    results = results + np.triu(results, 1).T
    plot_correlation(results, args.labels,
                     args.outFileNameHeatmap,
                     args.zMax,
                     args.zMin,
                     args.colorMap,
                     image_format=args.plotFileFormat,
                     pFontSize=args.fontsize,
                     pFigureSize=args.figuresize)
Example #4
def main(args=None):

    args = parse_arguments().parse_args(args)

    matrices_name = args.matrix
    threads = args.threads
    matrices_list = cell_name_list(matrices_name)
    all_samples_number = len(matrices_list)

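    # QC pipeline: optionally drop cells with broken chromosomes, compute
    # per-cell read coverage and density, filter, and write a report.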
    if args.runChromosomeCheck:
        #####################################################
        # Detect broken chromosomes and remove these matrices
        #####################################################
        keep_matrices_thread = [None] * threads
        all_data_collected = False
        thread_done = [False] * threads
        length_index = [None] * threads
        length_index[0] = 0
        matricesPerThread = len(matrices_list) // threads
        queue = [None] * threads
        process = [None] * threads
        for i in range(threads):

            if i < threads - 1:
                matrices_name_list = matrices_list[i * matricesPerThread:(i + 1) * matricesPerThread]
                length_index[i + 1] = length_index[i] + len(matrices_name_list)
            else:
                matrices_name_list = matrices_list[i * matricesPerThread:]

            queue[i] = Queue()
            process[i] = Process(target=compute_contains_all_chromosomes, kwargs=dict(
                pMatrixName=matrices_name,
                pMatricesList=matrices_name_list,
                pChromosomes=args.chromosomes,
                pQueue=queue[i]
            )
            )

            process[i].start()

        while not all_data_collected:
            for i in range(threads):
                if queue[i] is not None and not queue[i].empty():
                    worker_result = queue[i].get()
                    keep_matrices_thread[i] = worker_result
                    queue[i] = None
                    process[i].join()
                    process[i].terminate()
                    process[i] = None
                    thread_done[i] = True
            all_data_collected = True
            for thread in thread_done:
                if not thread:
                    all_data_collected = False
            time.sleep(1)

        keep_matrices_chromosome_names = np.array([item for sublist in keep_matrices_thread for item in sublist], dtype=bool)

        matrices_name_chromosome_names = np.array(matrices_list)
        matrices_list = matrices_name_chromosome_names[keep_matrices_chromosome_names]

        matrices_remove = matrices_name_chromosome_names[~keep_matrices_chromosome_names]

    #######################################

    read_coverage_thread = [None] * threads
    sparsity_thread = [None] * threads

    all_data_collected = False
    thread_done = [False] * threads
    length_index = [None] * threads
    length_index[0] = 0
    matricesPerThread = len(matrices_list) // threads
    queue = [None] * threads
    process = [None] * threads
    for i in range(threads):

        if i < threads - 1:
            matrices_name_list = matrices_list[i * matricesPerThread:(i + 1) * matricesPerThread]
            length_index[i + 1] = length_index[i] + len(matrices_name_list)
        else:
            matrices_name_list = matrices_list[i * matricesPerThread:]

        queue[i] = Queue()
        process[i] = Process(target=compute_read_coverage_sparsity, kwargs=dict(
            pMatrixName=matrices_name,
            pMatricesList=matrices_name_list,
            pXDimension=len(matrices_list),
            pMaximumRegionToConsider=args.maximumRegionToConsider,
            pQueue=queue[i]
        )
        )

        process[i].start()

    while not all_data_collected:
        for i in range(threads):
            if queue[i] is not None and not queue[i].empty():
                worker_result = queue[i].get()
                read_coverage_thread[i] = worker_result[0]
                sparsity_thread[i] = worker_result[1]
                queue[i] = None
                process[i].join()
                process[i].terminate()
                process[i] = None
                thread_done[i] = True
        all_data_collected = True
        for thread in thread_done:
            if not thread:
                all_data_collected = False
        time.sleep(1)

    read_coverage = np.array([item for sublist in read_coverage_thread for item in sublist])
    sparsity = np.array([item for sublist in sparsity_thread for item in sublist])

    plt.close()
    plt.hist(read_coverage, bins=100)

    plt.suptitle('Read coverage of {}'.format(os.path.basename(args.matrix)), fontsize=12)
    plt.grid(True)
    if args.minimumReadCoverage > 0:
        plt.axvline(args.minimumReadCoverage, color='r', linestyle='dashed', linewidth=1)
        plt.title('Matrices with a read coverage < {} are removed.'.format(args.minimumReadCoverage), fontsize=10)

    plt.xlabel('Read coverage')
    plt.ylabel('Frequency')
    plt.savefig(args.outFileNameReadCoverage, dpi=args.dpi)
    plt.close()

    plt.hist(sparsity, bins=100)
    plt.suptitle('Density of {}'.format(os.path.basename(args.matrix)), fontsize=12)
    if args.minimumDensity > 0:
        plt.title('Matrices with a density < {} are removed.'.format(args.minimumDensity), fontsize=10)
    plt.grid(True)
    plt.xlabel('Density')
    plt.ylabel('Frequency')
    if args.minimumDensity > 0:
        plt.axvline(args.minimumDensity, color='r', linestyle='dashed', linewidth=1)

    plt.savefig(args.outFileNameDensity, dpi=args.dpi)
    plt.close()

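    # keep only cells that pass both the read-coverage and the density
    # threshold; the complements are counted for the QC report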
    mask_read_coverage = read_coverage >= args.minimumReadCoverage
    mask_sparsity = sparsity >= args.minimumDensity

    mask = np.logical_and(mask_read_coverage, mask_sparsity)
    matrices_list_filtered = np.array(matrices_list)[mask]
    sum_read_coverage = np.sum(~mask_read_coverage)
    sum_sparsity = np.sum(~mask_sparsity)

    if not args.plotOnly:
        np.savetxt('accepted_matrices.txt', matrices_list_filtered, fmt="%s")
        np.savetxt('rejected_matrices.txt', np.array(matrices_list)[~mask], fmt="%s")

        if os.path.exists(args.outputScool):
            os.remove(args.outputScool)

        cooler.fileops.cp(args.matrix + '::/bins', args.outputScool + '::/bins')
        cooler.fileops.cp(args.matrix + '::/chroms', args.outputScool + '::/chroms')

        with cooler.util.open_hdf5(args.matrix) as source:
            attributes_dict = {}
            for k, v in source.attrs.items():
                attributes_dict[k] = v

            attributes_dict['ncells'] = len(matrices_list_filtered)
            attributes_dict['creation-date'] = datetime.now().isoformat()
            with h5py.File(args.outputScool, "r+") as f:
                h5 = f['/']
                h5.attrs.update(attributes_dict)

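        # per-cell groups: pixels and indexes are copied, while the shared
        # chroms table and bin coordinates are hard-linked instead of copied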
        content_bins_ln = ['chrom', 'start', 'end']
        for matrix in matrices_list_filtered:

            cooler.fileops.cp(args.matrix + '::' + matrix + '/pixels', args.outputScool + '::' + matrix + '/pixels')
            cooler.fileops.cp(args.matrix + '::' + matrix + '/indexes', args.outputScool + '::' + matrix + '/indexes')
            cooler.fileops.ln(args.outputScool + '::' + '/chroms', args.outputScool + '::' + matrix + '/chroms')
            cooler.fileops.ln(args.outputScool + '::' + '/bins/chrom', args.outputScool + '::' + matrix + '/bins/chrom')
            cooler.fileops.ln(args.outputScool + '::' + '/bins/start', args.outputScool + '::' + matrix + '/bins/start')
            cooler.fileops.ln(args.outputScool + '::' + '/bins/end', args.outputScool + '::' + matrix + '/bins/end')

            group_dataset_list = cooler.fileops.ls(args.matrix + '::' + matrix + '/bins/')
            for datatype in group_dataset_list:
                last_element = datatype.split('/')[-1]
                if not (last_element) in content_bins_ln and last_element != '':
                    cooler.fileops.cp(args.matrix + '::' + matrix + '/bins/' + last_element, args.outputScool + '::' + matrix + '/bins/' + last_element)

            with cooler.util.open_hdf5(args.matrix) as source:  # , cooler.util.open_hdf5(args.outputScool + '::' + matrix) as destination:

                attributes_dict = {}
                for k, v in source[matrix].attrs.items():
                    attributes_dict[k] = v
                with h5py.File(args.outputScool, "r+") as f:
                    h5 = f[matrix]
                    h5.attrs.update(attributes_dict)

    ##################
    # Create QC report
    ##################

    header = '# QC report for single-cell Hi-C data generated by scHiCExplorer ' + __version__ + '\n'

    matrix_statistics = 'scHi-C sample contained {} cells:\n'.format(all_samples_number)
    if args.runChromosomeCheck:
        matrices_bad_chromosomes = 'Number of removed matrices containing bad chromosomes {}\n'.format(len(matrices_remove))

    matrices_low_read_coverage = 'Number of removed matrices due to low read coverage (< {}): {}\n'.format(args.minimumReadCoverage, sum_read_coverage)
    matrices_too_sparse = 'Number of removed matrices due to too many zero bins (< {} density, within {} relative genomic distance): {}\n'.format(args.minimumDensity, args.maximumRegionToConsider, sum_sparsity)

    matrix_qc = '{} samples passed the quality control. Note that matrices with a low read coverage are often also the ones with a low density, so the two rejected sets may overlap.'.format(len(matrices_list_filtered))

    with open(args.outFileNameQCReport, 'w') as file:
        file.write(header)
        file.write(matrix_statistics)
        if args.runChromosomeCheck:
            file.write(matrices_bad_chromosomes)
        file.write(matrices_low_read_coverage)
        file.write(matrices_too_sparse)
        file.write(matrix_qc)
Example #5
def main(args=None):

    args = parse_arguments().parse_args(args)
    if args.region is not None and args.chromosomes is not None:
        raise Exception('--chromosomes and --region are mutually exclusive.')
    matrices_list = cell_name_list(args.matrix)
    columns = 4
    if len(matrices_list) < columns:
        columns = len(matrices_list)
    rows = int(np.ceil(len(matrices_list) / columns))
    if rows < 1:
        rows = 1

    if len(matrices_list) > 12:
        figsize = (5, 5.5)
    elif len(matrices_list) > 8:
        figsize = (5, 4.5)
    elif len(matrices_list) > 4:
        figsize = (5, 4)
    else:
        figsize = (5, 3)

    f, axes = plt.subplots(rows, columns, figsize=figsize)

    title_string = 'Consensus matrices of {}'.format(os.path.basename(args.matrix.split('.scool')[0]))
    if args.chromosomes:
        title_string += ' on chromosome: {}'.format(' '.join(args.chromosomes))
    elif args.region:
        title_string += ' for {}'.format(args.region)
    else:
        title_string += ' on all chromosomes'

    if args.no_header:
        plt.suptitle(title_string, fontsize=args.fontsize)
    from mpl_toolkits.axes_grid1 import make_axes_locatable

    for i, matrix in enumerate(matrices_list):
        if args.chromosomes is not None and len(args.chromosomes) == 1:
            hic_ma = hm.hiCMatrix(pMatrixFile=args.matrix + '::' + matrix, pChrnameList=args.chromosomes)
        elif args.region is not None:
            hic_ma = hm.hiCMatrix(pMatrixFile=args.matrix + '::' + matrix, pChrnameList=[args.region])
        else:
            hic_ma = hm.hiCMatrix(pMatrixFile=args.matrix + '::' + matrix)
            if args.chromosomes:
                hic_ma.keepOnlyTheseChr(args.chromosomes)
        matrix_data = hic_ma.matrix
        matrix_data = matrix_data.toarray()
        mask = matrix_data == 0
        try:
            matrix_data[mask] = np.nanmin(matrix_data[~mask])
        except ValueError:
            log.info('Matrix contains only 0. Set all values to {}'.format(np.finfo(float).tiny))
            matrix_data[mask] = np.finfo(float).tiny
        if np.isnan(matrix_data).any() or np.isinf(matrix_data).any():
            mask_nan = np.isnan(matrix_data)
            mask_inf = np.isinf(matrix_data)
            matrix_data[mask_nan] = np.nanmin(matrix_data[~mask_nan])
            matrix_data[mask_inf] = np.nanmin(matrix_data[~mask_inf])
        matrix_data += 1

        if args.log1p:
            matrix_data += 1
            norm = LogNorm()
        else:
            norm = None

        if rows == 1:

            im = axes[i % columns].imshow(matrix_data, cmap=args.colorMap, norm=norm)
            axes[i % columns].get_xaxis().set_ticks([])
            axes[i % columns].get_yaxis().set_ticks([])

            axes[i % columns].yaxis.set_visible(False)
            axes[i % columns].set_xlabel(str(matrix.split('/')[-1].split('cluster_')[-1]))
        else:
            im = axes[i // columns, i % columns].imshow(matrix_data, cmap=args.colorMap, norm=norm)
            axes[i // columns, i % columns].get_xaxis().set_ticks([])
            axes[i // columns, i % columns].get_yaxis().set_ticks([])

            axes[i // columns, i % columns].yaxis.set_visible(False)
            axes[i // columns, i % columns].set_xlabel(str(matrix.split('/')[-1].split('cluster_')[-1].split(':')[0]))

    number_of_plots = len(matrices_list)
    i = -1
    while rows * columns > number_of_plots:

        axes[-1, i].axis('off')
        number_of_plots += 1
        i -= 1

    plt.tight_layout()

    f.subplots_adjust(right=0.8)
    cbar_ax = f.add_axes([0.85, 0.15, 0.05, 0.7])
    f.colorbar(im, cax=cbar_ax)

    plt.savefig(args.outFileName, dpi=args.dpi)
    plt.close()
Example #6
def main(args=None):
    args = parse_arguments().parse_args(args)
    log.debug(args)
    matrix_file_handler_object_list = []

    matrices_list = cell_name_list(args.matrix)
    if args.action in ['extractToCool', 'extractScool']:
        if args.cellList is not None:
            matrix_list_tmp = []
            with open(args.cellList, 'r') as file:
                for line in file:
                    values = line.strip()
                    log.debug('values {}'.format(values))
                    if not values.startswith('/cells'):
                        values = '/cells/' + values
                    if values in matrices_list:
                        matrix_list_tmp.append(values)

            matrices_list = matrix_list_tmp

    if len(matrices_list) == 0:
        raise OSError('No cells for processing. Terminating.')
    if len(matrices_list) < args.threads:
        args.threads = len(matrices_list)

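    # load the bin table once from the first cell; all cells of a scool file
    # share the same bins, so the cut intervals can be passed to every worker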
    matrixFileHandlerInput = MatrixFileHandler(pFileType='cool',
                                               pMatrixFile=args.matrix + "::" +
                                               matrices_list[0])

    _matrix, cut_intervals_all, nan_bins, \
        distance_counts, correction_factors = matrixFileHandlerInput.load()

    threads = args.threads

    matrixFileHandler_list = [None] * args.threads
    process = [None] * args.threads
    queue = [None] * args.threads

    thread_done = [False] * args.threads
    matricesPerThread = len(matrices_list) // threads

    for i in range(args.threads):
        if i < threads - 1:
            matrices_name_list = matrices_list[i * matricesPerThread:(i + 1) *
                                               matricesPerThread]
        else:
            matrices_name_list = matrices_list[i * matricesPerThread:]

        queue[i] = Queue()
        process[i] = Process(target=load_cool_files,
                             kwargs=dict(pMatrixName=args.matrix,
                                         pMatricesList=matrices_name_list,
                                         pCutIntervals=cut_intervals_all,
                                         pQueue=queue[i]))
        process[i].start()

    all_data_collected = False
    fail_flag = False
    fail_message = ''
    while not all_data_collected:
        for i in range(threads):
            if queue[i] is not None and not queue[i].empty():
                matrixFileHandler_list[i] = queue[i].get()
                if 'Fail:' in matrixFileHandler_list[i]:
                    fail_flag = True
                    fail_message = matrixFileHandler_list[i][6:]
                queue[i] = None
                process[i].join()
                process[i].terminate()
                process[i] = None
                thread_done[i] = True
        all_data_collected = True
        for thread in thread_done:
            if not thread:
                all_data_collected = False
        time.sleep(1)

    if fail_flag:
        log.error(fail_message)
        exit(1)
    matrix_file_handler_object_list = [
        item for sublist in matrixFileHandler_list for item in sublist
    ]

    if args.action in ['extractScool', 'update']:
        matrixFileHandler = MatrixFileHandler(pFileType='scool')
        matrixFileHandler.matrixFile.coolObjectsList = matrix_file_handler_object_list
        matrixFileHandler.save(args.outFileName,
                               pSymmetric=True,
                               pApplyCorrection=False)
    else:
        if not os.path.exists(args.outFileName):
            try:
                os.makedirs(args.outFileName)
            except OSError as exc:  # Guard against race condition
                if exc.errno != errno.EEXIST:
                    raise
        for matrixFileHandler in matrix_file_handler_object_list:
            matrixFileHandler.save(
                args.outFileName + '/' +
                matrixFileHandler.matrixFile.matrixFileName + '.cool',
                pApplyCorrection=True,
                pSymmetric=True)
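
# The per-thread slicing repeated in every example follows one pattern; a
# self-contained helper capturing it (illustrative, not part of scHiCExplorer):
def chunk_list(items, n_chunks):
    # split items into n_chunks slices; the last chunk absorbs the remainder
    per_chunk = len(items) // n_chunks
    return [items[i * per_chunk:(i + 1) * per_chunk] if i < n_chunks - 1
            else items[i * per_chunk:]
            for i in range(n_chunks)]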
Example #7
def main(args=None):

    args = parse_arguments().parse_args(args)

    threads = args.threads
    merged_matrices = [None] * threads
    matrices_list = cell_name_list(args.matrix)
    if len(matrices_list) < threads:
        threads = len(matrices_list)
    all_data_collected = False
    thread_done = [False] * threads
    length_index = [None] * threads
    length_index[0] = 0
    matricesPerThread = len(matrices_list) // threads
    queue = [None] * threads
    process = [None] * threads
    for i in range(threads):

        if i < threads - 1:
            matrices_name_list = matrices_list[i * matricesPerThread:(i + 1) *
                                               matricesPerThread]
            length_index[i + 1] = length_index[i] + len(matrices_name_list)
        else:
            matrices_name_list = matrices_list[i * matricesPerThread:]

        queue[i] = Queue()
        process[i] = Process(target=compute_merge,
                             kwargs=dict(pMatrixName=args.matrix,
                                         pMatrixList=matrices_name_list,
                                         pRunningWindow=args.runningWindow,
                                         pNumBins=args.numBins,
                                         pQueue=queue[i]))

        process[i].start()
    fail_flag = False
    fail_message = ''
    while not all_data_collected:
        for i in range(threads):
            if queue[i] is not None and not queue[i].empty():
                # log.debug('i {}'.format(i))
                # log.debug('len(queue) {}'.format(len(queue)))
                # log.debug('len(merged_matrices) {}'.format(len(merged_matrices)))

                merged_matrices[i] = queue[i].get()
                if isinstance(
                        merged_matrices[i][0],
                        str) and merged_matrices[i][0].startswith('Fail: '):
                    fail_flag = True
                    fail_message = merged_matrices[i][0]
                queue[i] = None
                process[i].join()
                process[i].terminate()
                process[i] = None
                thread_done[i] = True
        all_data_collected = True
        for thread in thread_done:
            if not thread:
                all_data_collected = False
        time.sleep(1)

    if fail_flag:
        log.error('{}'.format(fail_message))
        exit(1)
    matrixFileHandlerObjects_list = [
        item for sublist in merged_matrices for item in sublist
    ]

    matrixFileHandler = MatrixFileHandler(pFileType='scool')
    matrixFileHandler.matrixFile.coolObjectsList = matrixFileHandlerObjects_list
    matrixFileHandler.save(args.outFileName,
                           pSymmetric=True,
                           pApplyCorrection=False)
Example #8
def main(args=None):

    args = parse_arguments().parse_args(args)

    threads = args.threads
    matrixFileHandler_list = [None] * threads
    matrices_list = cell_name_list(args.matrix)
    if len(matrices_list) < threads:
        threads = len(matrices_list)

    matrixFileHandlerInput = MatrixFileHandler(pFileType='cool', pMatrixFile=args.matrix + "::" + matrices_list[0])

    _matrix, cut_intervals_all, nan_bins, \
        distance_counts, correction_factors = matrixFileHandlerInput.load()

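    # each worker corrects its share of cells against the shared cut
    # intervals loaded above and returns MatrixFileHandler objects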
    all_data_collected = False
    thread_done = [False] * threads
    length_index = [None] * threads
    length_index[0] = 0
    matricesPerThread = len(matrices_list) // threads
    queue = [None] * threads
    process = [None] * threads
    print('Threads: ' + str(threads))
    for i in range(threads):

        if i < threads - 1:
            matrices_name_list = matrices_list[i * matricesPerThread:(i + 1) * matricesPerThread]
            length_index[i + 1] = length_index[i] + len(matrices_name_list)
        else:
            matrices_name_list = matrices_list[i * matricesPerThread:]

        queue[i] = Queue()
        process[i] = Process(target=compute_correction, kwargs=dict(
            pMatrixName=args.matrix,
            pMatrixList=matrices_name_list,
            pCutIntervals=cut_intervals_all,
            pQueue=queue[i]
        )
        )

        process[i].start()

    fail_flag = False
    while not all_data_collected:
        for i in range(threads):
            if queue[i] is not None and not queue[i].empty():
                matrixFileHandler_list[i] = queue[i].get()
                # csr_matrix_worker = queue[i].get()
                if isinstance(matrixFileHandler_list[i], str):
                    log.error('{}'.format(matrixFileHandler_list[i]))
                    fail_flag = True
                queue[i] = None
                process[i].join()
                process[i].terminate()
                process[i] = None
                thread_done[i] = True
        all_data_collected = True
        for thread in thread_done:
            if not thread:
                all_data_collected = False
        time.sleep(1)

    if fail_flag:
        exit(1)
    matrix_file_handler_object_list = [item for sublist in matrixFileHandler_list for item in sublist]

    matrixFileHandler = MatrixFileHandler(pFileType='scool')
    matrixFileHandler.matrixFile.coolObjectsList = matrix_file_handler_object_list
    matrixFileHandler.save(args.outFileName, pSymmetric=True, pApplyCorrection=False)
Example #9
def main(args=None):

    args = parse_arguments().parse_args(args)

    matrices_name = args.matrix
    threads = args.threads
    matrices_list = cell_name_list(matrices_name)
    read_coverage = [None] * threads

    all_data_collected = False
    thread_done = [False] * threads
    matricesPerThread = len(matrices_list) // threads
    queue = [None] * threads
    process = [None] * threads
    for i in range(threads):

        if i < threads - 1:
            matrices_name_list = matrices_list[i * matricesPerThread:(i + 1) *
                                               matricesPerThread]
        else:
            matrices_name_list = matrices_list[i * matricesPerThread:]

        queue[i] = Queue()
        process[i] = Process(target=compute_read_distribution,
                             kwargs=dict(pMatrixName=matrices_name,
                                         pMatricesList=matrices_name_list,
                                         pMaximalDistance=args.maximalDistance,
                                         pChromosomes=args.chromosomes,
                                         pQueue=queue[i]))

        process[i].start()

    while not all_data_collected:
        for i in range(threads):
            if queue[i] is not None and not queue[i].empty():
                read_coverage[i], resolution = queue[i].get()

                queue[i] = None
                process[i].join()
                process[i].terminate()
                process[i] = None
                thread_done[i] = True
        all_data_collected = True
        for thread in thread_done:
            if not thread:
                all_data_collected = False
        time.sleep(1)

    read_distributions = []
    for thread_data in read_coverage:
        for matrix_data in thread_data:
            read_distributions.append(matrix_data)

    read_distributions = np.array(read_distributions)

    clusters = {}
    clusters_svl = {}
    short_range_distance = args.distanceShortRange // resolution
    long_range_distance = args.distanceLongRange // resolution
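    # SVL: per cell, the ratio of short-range to long-range contacts; used
    # below to sort the cells within each cluster when --orderBy svl is set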
    with open(args.clusters, 'r') as cluster_file:

        for i, line in enumerate(cluster_file.readlines()):
            line = line.strip()
            line_ = line.split(' ')[1]
            if int(line_) in clusters:
                clusters[int(line_)].append(read_distributions[i])
                clusters_svl[int(line_)].append(
                    np.sum(read_distributions[i][:short_range_distance]) /
                    np.sum(read_distributions[i]
                           [short_range_distance:long_range_distance]))
            else:
                clusters[int(line_)] = [read_distributions[i]]
                clusters_svl[int(line_)] = [
                    np.sum(read_distributions[i][:short_range_distance]) /
                    np.sum(read_distributions[i]
                           [short_range_distance:long_range_distance])
                ]
    if args.orderBy == 'svl':
        for i, cluster_key in enumerate(clusters.keys()):
            clusters[cluster_key] = np.array(clusters[cluster_key])
            clusters_svl[cluster_key] = np.array(clusters_svl[cluster_key])
            sorted_indices = np.argsort(clusters_svl[cluster_key])
            clusters[cluster_key] = clusters[cluster_key][sorted_indices]

    cluster_to_plot = []

    clusters_list = []
    cluster_size = []
    key_list_cluster = sorted(clusters.keys())
    for i, key in enumerate(key_list_cluster):
        cluster_to_plot = []

        for cluster_item in clusters[key]:
            cluster_to_plot.append(cluster_item)
        clusters_list.append(np.array(cluster_to_plot))
        cluster_size.append(len(cluster_to_plot))

    cluster_size = np.array(cluster_size)
    cluster_size = (1.0 - 0.1) * (cluster_size - min(cluster_size)) / (
        max(cluster_size) - min(cluster_size)) + (0.1)
    cluster_size = list(cluster_size)

    all_data = None
    index_clusters = []
    cluster_ticks = []
    cluster_ticks_top = []
    ticks_position = []
    for i, cluster in enumerate(clusters_list):
        if all_data is None:
            all_data = cluster
            index_clusters.append(len(cluster))
            ticks_position.append(0 + len(cluster) // 2)
        else:
            all_data = np.append(all_data, cluster, axis=0)
            index_clusters.append(index_clusters[i - 1] + len(cluster))
            ticks_position.append(index_clusters[i - 1] + len(cluster) // 2)

        cluster_ticks.append('Cluster {}: {} cells'.format((i), len(cluster)))
        cluster_ticks_top.append('Cluster {}'.format(i))

    if len(matrices_list) > 1000:
        fig = plt.figure(figsize=(8, 3))
    elif len(matrices_list) > 500:
        fig = plt.figure(figsize=(5, 3))
    elif len(matrices_list) > 250:
        fig = plt.figure(figsize=(4, 3))
    else:
        fig = plt.figure(figsize=(3, 3))

    plt.imshow(all_data.T, cmap=args.colorMap, norm=LogNorm(), aspect="auto")

    for index in index_clusters:
        plt.axvline(index - 1, color='black', linewidth=0.4)
    y_ticks = []
    y_labels = []

    unit = 'MB'

    factor = args.maximalDistance // 10

    if factor >= 1000000:
        unit = 'MB'
    elif factor >= 1000:
        unit = 'kb'
    else:
        unit = 'b'

    for i in range(0, (args.maximalDistance) + 1, resolution):
        if i % (factor) == 0:
            y_ticks.append(i // resolution)

            label = ''
            if factor >= 1000000:
                label = str(i // 1000000)
            elif factor >= 1000:
                label = str(i // 1000)
            else:
                label = str(i)

            y_labels.append(label + unit)

    plt.yticks(ticks=y_ticks, labels=y_labels, fontsize=args.fontsize)

    plt.gca().invert_yaxis()
    if args.ticks:
        plt.xticks(ticks=ticks_position,
                   labels=cluster_ticks_top,
                   rotation=args.rotationX,
                   fontsize=args.fontsize)

    elif args.legend:
        plt.tick_params(
            axis='x',  # changes apply to the x-axis
            which='both',  # both major and minor ticks are affected
            bottom=False,  # ticks along the bottom edge are off
            top=False,  # ticks along the top edge are off
            labelbottom=False)
        if len(cluster_ticks) < 5:
            ncols = 1
        else:
            ncols = 3
        leg = plt.legend(cluster_ticks,
                         loc='upper center',
                         bbox_to_anchor=(0.5, -0.01),
                         fancybox=True,
                         shadow=False,
                         ncol=ncols,
                         fontsize=args.fontsize)
        for item in leg.legendHandles:
            item.set_visible(False)
    else:
        plt.tick_params(
            axis='x',  # changes apply to the x-axis
            which='both',  # both major and minor ticks are affected
            bottom=False,  # ticks along the bottom edge are off
            top=False,  # ticks along the top edge are off
            labelbottom=False)
    fig.autofmt_xdate()

    cbar = plt.colorbar()
    cbar.ax.set_ylabel('% contacts', rotation=270, fontsize=args.fontsize)
    cbar.ax.yaxis.set_label_coords(args.fontsize, 0.5)
    cbar.ax.invert_yaxis()
    cbar.ax.tick_params(labelsize=args.fontsize)

    plt.tight_layout()
    plt.savefig(args.outFileName, dpi=args.dpi)

    plt.close()
Example #10
def main(args=None):
    args = parse_arguments().parse_args(args)
    matrices_name = args.matrix
    threads = args.threads
    matrices_list = cell_name_list(matrices_name)

    if not os.path.exists(args.outputFolder + '/cells'):
        try:
            os.makedirs(args.outputFolder + '/cells')
        except OSError as exc:  # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise

    if threads > len(matrices_list):
        threads = len(matrices_list)
    # load bin ids only once
    cooler_obj = cooler.Cooler(matrices_name + '::' + matrices_list[0])
    bins = cooler_obj.bins()[:]

    all_data_collected = False
    thread_done = [False] * threads
    cell_name_array_thread = [None] * threads

    matricesPerThread = len(matrices_list) // threads
    queue = [None] * threads
    process = [None] * threads
    for i in range(threads):

        if i < threads - 1:
            matrices_name_list = matrices_list[i * matricesPerThread:(i + 1) * matricesPerThread]
        else:
            matrices_name_list = matrices_list[i * matricesPerThread:]

        queue[i] = Queue()
        process[i] = Process(target=convert_files, kwargs=dict(
            pMatrixName=matrices_name,
            pMatricesList=matrices_name_list,
            pBinsDataFrame=bins,
            pOutputFolder=args.outputFolder,
            pFormat=args.format,
            pQueue=queue[i]
        )
        )

        process[i].start()
    while not all_data_collected:
        for i in range(threads):
            if queue[i] is not None and not queue[i].empty():
                cell_name_array_thread[i] = queue[i].get()

                queue[i] = None
                process[i].join()
                process[i].terminate()
                process[i] = None
                thread_done[i] = True
        all_data_collected = True
        for thread in thread_done:
            if not thread:
                all_data_collected = False
        time.sleep(1)

    for subset in cell_name_array_thread:
        if subset[0] is None:
            exit(1)
    cell_name_array = [item for sublist in cell_name_array_thread for item in sublist]

    # write cell names to file
    with open(args.outputCellNameFile, 'w') as file:
        for cell_name in cell_name_array:
            file.write('{}\n'.format(cell_name))
    # write chromsizes to file
    with open(args.outputChromosomeSize, 'w') as file:
        for chromosome_name, size in cooler_obj.chromsizes.items():
            file.write('{}\t{}\n'.format(chromosome_name, size))
Example #11
def main(args=None):

    args = parse_arguments().parse_args(args)
    # if args.threads <= 4:
    #     log.error('')
    #     exit(1)
    outputFolder = os.path.dirname(os.path.abspath(args.outFileName)) + '/'

    raw_file_name = os.path.splitext(os.path.basename(args.outFileName))[0]

    if args.numberOfNearestNeighbors is None:
        cooler_obj = cooler.Cooler(args.matrix)
        args.numberOfNearestNeighbors = int(cooler_obj.info['ncells'])
    if args.cell_coloring_type:
        cell_name_cell_type_dict = {}

        cell_type_color_dict = {}
        color_cell_type_dict = {}
        cell_type_counter = 0
        with open(args.cell_coloring_type, 'r') as file:
            for i, line in enumerate(file.readlines()):
                line = line.strip()
                try:
                    cell_name, cell_type = line.split('\t')
                except Exception:
                    cell_name, cell_type = line.split('    ')
                cell_name_cell_type_dict[cell_name] = cell_type
                if cell_type not in cell_type_color_dict:
                    cell_type_color_dict[cell_type] = cell_type_counter
                    color_cell_type_dict[cell_type_counter] = cell_type
                    cell_type_counter += 1

    if args.cell_coloring_batch:
        cell_name_cell_type_dict_batch = {}

        cell_type_color_dict_batch = {}
        color_cell_type_dict_batch = {}
        cell_type_counter_batch = 0
        with open(args.cell_coloring_batch, 'r') as file:
            for i, line in enumerate(file.readlines()):
                line = line.strip()
                try:
                    cell_name, cell_type = line.split('\t')
                except Exception:
                    cell_name, cell_type = line.split('    ')
                cell_name_cell_type_dict_batch[cell_name] = cell_type
                if cell_type not in cell_type_color_dict_batch:
                    cell_type_color_dict_batch[cell_type] = cell_type_counter_batch
                    color_cell_type_dict_batch[cell_type_counter_batch] = cell_type
                    cell_type_counter_batch += 1

    if args.clusterMethod == 'spectral':
        cluster_object = SpectralClustering(n_clusters=args.numberOfClusters, affinity='nearest_neighbors', n_jobs=args.threads, random_state=0)
    elif args.clusterMethod == 'kmeans':
        cluster_object = KMeans(n_clusters=args.numberOfClusters, random_state=0, n_jobs=args.threads, precompute_distances=True)
    elif args.clusterMethod.startswith('agglomerative'):
        for linkage in ['ward', 'complete', 'average', 'single']:
            if linkage in args.clusterMethod:
                cluster_object = AgglomerativeClustering(n_clusters=args.numberOfClusters, linkage=linkage)
                break
    elif args.clusterMethod == 'birch':
        cluster_object = Birch(n_clusters=args.numberOfClusters)
    else:
        log.error('No valid cluster method given: {}'.format(args.clusterMethod))

    umap_params_dict = {}

    if not args.noUMAP:
        for param in vars(args):
            if 'umap_' in param:
                umap_params_dict[param] = vars(args)[param]
        umap_params_dict['umap_random'] = 42
    # log.debug(umap_params_dict)

    if args.saveMemory:
        matrices_list = cell_name_list(args.matrix)
        max_nnz = 0
        for matrix in matrices_list:
            cooler_obj = cooler.Cooler(args.matrix + '::' + matrix)
            nnz = cooler_obj.info['nnz']
            if max_nnz < nnz:
                max_nnz = nnz
        minHash_object = None
        matricesPerRun = int(len(matrices_list) * args.shareOfMatrixToBeTransferred)
        if matricesPerRun < 1:
            matricesPerRun = 1
        chromosome_indices = None
        if args.intraChromosomalContactsOnly:
            cooler_obj = cooler.Cooler(args.matrix + '::' + matrices_list[0])
            binsDataFrame = cooler_obj.bins()[:]
            chromosome_indices = {}
            for chromosome in cooler_obj.chromnames:
                chromosome_indices[chromosome] = np.array(binsDataFrame.index[binsDataFrame['chrom'] == chromosome].tolist())

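        # stream the cells in chunks of matricesPerRun to bound memory and
        # fit the MinHash index incrementally via partial_fit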
        for j, i in enumerate(range(0, len(matrices_list), matricesPerRun)):
            if i < len(matrices_list) - 1:
                matrices_share = matrices_list[i:i + matricesPerRun]
            else:
                matrices_share = matrices_list[i:]
            neighborhood_matrix, matrices_list_share = open_and_store_matrix(args.matrix, matrices_share, 0, len(matrices_share),
                                                                             args.chromosomes, args.intraChromosomalContactsOnly, chromosome_indices)
            if minHash_object is None:
                minHash_object = MinHash(n_neighbors=args.numberOfNearestNeighbors, number_of_hash_functions=args.numberOfHashFunctions, number_of_cores=args.threads,
                                         shingle_size=0, fast=args.euclideanModeMinHash, maxFeatures=int(max_nnz), absolute_numbers=False)

            if j == 0:
                minHash_object.fit(neighborhood_matrix)
            else:
                minHash_object.partial_fit(X=neighborhood_matrix)

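        # the nearest-neighbors graph holds pairwise distances; NaN and inf
        # entries are zeroed before the optional PCA/UMAP reduction below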
        precomputed_graph = minHash_object.kneighbors_graph(mode='distance')
        precomputed_graph = np.nan_to_num(precomputed_graph)
        precomputed_graph.data[np.isinf(precomputed_graph.data)] = 0
        if not args.noPCA:

            pca = PCA(n_components=min(precomputed_graph.shape) - 1)
            precomputed_graph = np.nan_to_num(precomputed_graph.todense())
            precomputed_graph[np.isinf(precomputed_graph)] = 0
            precomputed_graph = pca.fit_transform(precomputed_graph)

            if args.dimensionsPCA:
                args.dimensionsPCA = min(args.dimensionsPCA, precomputed_graph.shape[0])
                precomputed_graph = precomputed_graph[:, :args.dimensionsPCA]
                # cluster_object.fit(precomputed_graph[:, :args.dimensionsPCA])
        if not args.noUMAP:

            if umap_params_dict is None:
                reducer = umap.UMAP()
            else:
                reducer = umap.UMAP(n_neighbors=umap_params_dict['umap_n_neighbors'], n_components=umap_params_dict['umap_n_components'], metric=umap_params_dict['umap_metric'],
                                    n_epochs=umap_params_dict['umap_n_epochs'],
                                    learning_rate=umap_params_dict['umap_learning_rate'], init=umap_params_dict['umap_init'], min_dist=umap_params_dict['umap_min_dist'], spread=umap_params_dict['umap_spread'],
                                    set_op_mix_ratio=umap_params_dict['umap_set_op_mix_ratio'], local_connectivity=umap_params_dict['umap_local_connectivity'],
                                    repulsion_strength=umap_params_dict['umap_repulsion_strength'], negative_sample_rate=umap_params_dict['umap_negative_sample_rate'], transform_queue_size=umap_params_dict['umap_transform_queue_size'],
                                    a=umap_params_dict['umap_a'], b=umap_params_dict['umap_b'], angular_rp_forest=umap_params_dict['umap_angular_rp_forest'],
                                    target_n_neighbors=umap_params_dict['umap_target_n_neighbors'], target_metric=umap_params_dict['umap_target_metric'],
                                    target_weight=umap_params_dict['umap_target_weight'], random_state=umap_params_dict['umap_random'],
                                    force_approximation_algorithm=umap_params_dict['umap_force_approximation_algorithm'], verbose=umap_params_dict['umap_verbose'], unique=umap_params_dict['umap_unique'])
            precomputed_graph = reducer.fit_transform(precomputed_graph)
        precomputed_graph = np.nan_to_num(precomputed_graph)
        precomputed_graph[np.isinf(precomputed_graph)] = 0

        try:
            cluster_object.fit(precomputed_graph)
        except Exception:
            cluster_object.fit(precomputed_graph.todense())

        minHashClustering = MinHashClustering(minHashObject=minHash_object, clusteringObject=cluster_object)
        minHashClustering._precomputed_graph = precomputed_graph

    else:
        neighborhood_matrix, matrices_list = create_csr_matrix_all_cells(args.matrix, args.threads, args.chromosomes, outputFolder, raw_file_name, args.intraChromosomalContactsOnly, pDistance=args.distance)

        if args.saveIntermediateRawMatrix:
            save_npz(args.saveIntermediateRawMatrix, neighborhood_matrix)

    if not args.saveMemory:
        minHash_object = MinHash(n_neighbors=args.numberOfNearestNeighbors, number_of_hash_functions=args.numberOfHashFunctions, number_of_cores=args.threads,
                                 shingle_size=5, fast=args.euclideanModeMinHash, maxFeatures=int(max(neighborhood_matrix.getnnz(1))), absolute_numbers=False, max_bin_size=100000,
                                 minimal_blocks_in_common=100, excess_factor=1, prune_inverse_index=False)
        minHashClustering = MinHashClustering(minHashObject=minHash_object, clusteringObject=cluster_object)
        minHashClustering.fit(X=neighborhood_matrix, pSaveMemory=args.shareOfMatrixToBeTransferred, pPca=(not args.noPCA), pPcaDimensions=args.dimensionsPCA, pUmap=(not args.noUMAP), pUmapDict=umap_params_dict)

    if args.noPCA and args.noUMAP:
        mask = np.isnan(minHashClustering._precomputed_graph.data)
        minHashClustering._precomputed_graph.data[mask] = 0

        mask = np.isinf(minHashClustering._precomputed_graph.data)
        minHashClustering._precomputed_graph.data[mask] = 0

    labels_clustering = minHashClustering.predict(minHashClustering._precomputed_graph, pPca=args.noPCA, pPcaDimensions=args.dimensionsPCA)

    if args.createScatterPlot:
        if args.noPCA and args.noUMAP:
            pca = PCA(n_components=min(minHashClustering._precomputed_graph.shape) - 1)
            neighborhood_matrix_knn = pca.fit_transform(minHashClustering._precomputed_graph.todense())
        else:
            neighborhood_matrix_knn = minHashClustering._precomputed_graph

        colors = process_cmap(args.colorMap)

        try:
            neighborhood_matrix_knn = neighborhood_matrix_knn.toarray()
        except Exception:
            pass

        label_x = 'PC1'
        label_y = 'PC2'
        if not (args.noUMAP):
            label_x = 'UMAP1'
            label_y = 'UMAP2'
        if args.cell_coloring_type:
            if len(colors) < len(cell_type_color_dict):
                log.error('The chosen colormap provides too few colors for the number of cell types.')
                exit(1)
            labels_clustering_cell_type = []
            for cell_name in matrices_list:
                labels_clustering_cell_type.append(cell_type_color_dict[cell_name_cell_type_dict[cell_name]])

            labels_clustering_cell_type = np.array(labels_clustering_cell_type)

            log.debug('labels_clustering_cell_type: {}'.format(len(labels_clustering_cell_type)))
            log.debug('matrices_list: {}'.format(len(matrices_list)))

            plt.figure(figsize=(args.figuresize[0], args.figuresize[1]))
            for i, color in enumerate(colors[:len(cell_type_color_dict)]):
                mask = labels_clustering_cell_type == i
                log.debug('plot cluster: {} {}'.format(color_cell_type_dict[i], np.sum(mask)))
                plt.scatter(neighborhood_matrix_knn[:, 0].T[mask], neighborhood_matrix_knn[:, 1].T[mask], color=color, label=str(color_cell_type_dict[i]), s=20, alpha=0.7)

            plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=args.fontsize)
            plt.xticks([])
            plt.yticks([])
            plt.xlabel(label_x, fontsize=args.fontsize)
            plt.ylabel(label_y, fontsize=args.fontsize)
            if '.' not in args.createScatterPlot:
                args.createScatterPlot += '.png'
            scatter_plot_name = '.'.join(args.createScatterPlot.split('.')[:-1]) + '_cell_color.' + args.createScatterPlot.split('.')[-1]
            plt.tight_layout()
            plt.savefig(scatter_plot_name, dpi=args.dpi)
            plt.close()

            # compute overlap of cell_type find found clusters
            computed_clusters = set(labels_clustering)
            cell_type_amounts_dict = {}
            percentage_threshold = 0.8
            if args.latexTable:

                for threshold in [0.7, 0.8, 0.9]:
                    cell_type_amounts_dict[threshold] = {}
                with open(args.latexTable, 'w') as matches_file:
                    header = '\\begin{table}[!htb]\n\\footnotesize\n\\begin{tabular}{|l'
                    body = '\\hline Cluster '
                    for i in range(len(color_cell_type_dict)):
                        mask_cell_type = labels_clustering_cell_type == i
                        header += '|c'
                        body += '& ' + str(color_cell_type_dict[i]) + ' (' + str(np.sum(mask_cell_type)) + ' cells)'
                    header += '|}\n'
                    body += '\\\\\n'
                    # body = ''
                    for i in computed_clusters:
                        body += '\\hline Cluster ' + str(i)
                        mask_computed_clusters = labels_clustering == i
                        body += ' (' + str(np.sum(mask_computed_clusters)) + ' cells)'
                        for j in range(len(cell_type_color_dict)):
                            mask_cell_type = labels_clustering_cell_type == j
                            mask = mask_computed_clusters & mask_cell_type
                            number_of_matches = np.sum(mask)
                            body += '& ' + str(number_of_matches)

                            if number_of_matches != 1:
                                body += ' cells / '
                            else:
                                body += ' cell / '

                            body += '{:.2f}'.format((number_of_matches / np.sum(mask_computed_clusters)) * 100) + ' \\% '
                            for threshold in [0.7, 0.8, 0.9]:

                                if number_of_matches / np.sum(mask_computed_clusters) >= threshold:
                                    if color_cell_type_dict[j] in cell_type_amounts_dict[threshold]:
                                        cell_type_amounts_dict[threshold][color_cell_type_dict[j]] += number_of_matches
                                    else:
                                        cell_type_amounts_dict[threshold][color_cell_type_dict[j]] = number_of_matches
                                else:
                                    if color_cell_type_dict[j] in cell_type_amounts_dict[threshold]:
                                        continue
                                    else:
                                        cell_type_amounts_dict[threshold][color_cell_type_dict[j]] = 0
                        body += '\\\\\n'
                    body += '\\hline ' + '&' * len(cell_type_color_dict) + '\\\\\n'

                    for threshold in [0.7, 0.8, 0.9]:
                        body += '\\hline Correctly identified $\\geq {}\\%$'.format(int(threshold * 100))
                        for i in range(len(cell_type_color_dict)):
                            mask_cell_type = labels_clustering_cell_type == i

                            if color_cell_type_dict[i] in cell_type_amounts_dict[threshold]:
                                body += '& ' + str(cell_type_amounts_dict[threshold][color_cell_type_dict[i]]) + ' / ' + str(np.sum(mask_cell_type)) + ' ('
                                body += '{:.2f}'.format((cell_type_amounts_dict[threshold][color_cell_type_dict[i]] / np.sum(mask_cell_type)) * 100)
                            else:
                                body += '& ' + str(0) + ' / ' + str(np.sum(mask_cell_type)) + ' ('
                                body += '0.00'  # zero matches; avoids dividing by an empty cell type

                            body += ' \\%)'
                        body += '\\\\\n'
                    body += '\\hline \n'
                    body += '\\end{tabular}\n\\caption{}\n\\end{table}'

                    matches_file.write(header)
                    matches_file.write(body)
            else:
                with open('matches.txt', 'w') as matches_file:
                    for i in computed_clusters:
                        mask_computed_clusters = labels_clustering == i
                        for j in range(len(cell_type_color_dict)):
                            mask_cell_type = labels_clustering_cell_type == j

                            mask = mask_computed_clusters & mask_cell_type

                            number_of_matches = np.sum(mask)
                            matches_file.write('Computed cluster {} (size: {}) matching with cell type {} (size: {}) {} times. Rate (matches/computed_clusters): {:.2f}%\n'.format(
                                i, np.sum(mask_computed_clusters), color_cell_type_dict[j], np.sum(mask_cell_type), number_of_matches, (number_of_matches / np.sum(mask_computed_clusters)) * 100))

                            if number_of_matches / np.sum(mask_computed_clusters) >= percentage_threshold:
                                if color_cell_type_dict[j] in cell_type_amounts_dict:
                                    cell_type_amounts_dict[color_cell_type_dict[j]] += number_of_matches
                                else:
                                    cell_type_amounts_dict[color_cell_type_dict[j]] = number_of_matches

                        matches_file.write('\n')
            if args.latexTable:
                # the latex branch collected the counts per threshold; use the
                # default threshold for the summary score written below
                cell_type_amounts_dict = cell_type_amounts_dict[percentage_threshold]
            for i in range(len(cell_type_color_dict)):
                mask_cell_type = labels_clustering_cell_type == i
                if color_cell_type_dict[i] in cell_type_amounts_dict:
                    cell_type_amounts_dict[color_cell_type_dict[i]] /= np.sum(mask_cell_type)
                else:
                    cell_type_amounts_dict[color_cell_type_dict[i]] = 0.0
            # average, over all cell types, the fraction of that type's cells
            # captured by clusters it dominates
            correct_associated = 0.0
            for cell_iterator in cell_type_color_dict:
                correct_associated += cell_type_amounts_dict[cell_iterator]
            correct_associated /= len(cell_type_amounts_dict)

            with open('correct_associated', 'w') as file:
                file.write(str(correct_associated))
        if args.cell_coloring_batch:
            if len(colors) < len(cell_type_color_dict_batch):
                log.error('The chosen colormap offers too few values for the number of clusters.')
                exit(1)
            labels_clustering_cell_type_batch = []
            for cell_name in matrices_list:
                labels_clustering_cell_type_batch.append(cell_type_color_dict_batch[cell_name_cell_type_dict_batch[cell_name]])

            labels_clustering_cell_type_batch = np.array(labels_clustering_cell_type_batch)

            log.debug('labels_clustering_cell_type_batch: {}'.format(len(labels_clustering_cell_type_batch)))
            log.debug('matrices_list: {}'.format(len(matrices_list)))

            plt.figure(figsize=(args.figuresize[0], args.figuresize[1]))
            for i, color in enumerate(colors[:len(cell_type_color_dict_batch)]):
                mask = labels_clustering_cell_type_batch == i
                log.debug('plot cluster: {} {}'.format(color_cell_type_dict_batch[i], np.sum(mask)))
                plt.scatter(neighborhood_matrix_knn[:, 0].T[mask], neighborhood_matrix_knn[:, 1].T[mask], color=color, label=str(color_cell_type_dict_batch[i]), s=20, alpha=0.7)

            plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=args.fontsize)
            plt.xticks([])
            plt.yticks([])
            plt.xlabel(label_x, fontsize=args.fontsize)
            plt.ylabel(label_y, fontsize=args.fontsize)
            if '.' not in args.createScatterPlot:
                args.createScatterPlot += '.png'
            scatter_plot_name = '.'.join(args.createScatterPlot.split('.')[:-1]) + '_cell_color_batch.' + args.createScatterPlot.split('.')[-1]
            plt.tight_layout()
            plt.savefig(scatter_plot_name, dpi=args.dpi)
            plt.close()

        plt.figure(figsize=(args.figuresize[0], args.figuresize[1]))
        for i, color in enumerate(colors[:args.numberOfClusters]):
            mask = labels_clustering == i
            plt.scatter(neighborhood_matrix_knn[:, 0].T[mask], neighborhood_matrix_knn[:, 1].T[mask], color=color, label=str(i), s=20, alpha=0.7)
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=args.fontsize)

        plt.xticks([])
        plt.yticks([])
        plt.xlabel(label_x, fontsize=args.fontsize)
        plt.ylabel(label_y, fontsize=args.fontsize)
        if '.' not in args.createScatterPlot:
            args.createScatterPlot += '.png'
        scatter_plot_name = '.'.join(args.createScatterPlot.split('.')[:-1]) + '.' + args.createScatterPlot.split('.')[-1]
        plt.tight_layout()
        plt.savefig(scatter_plot_name, dpi=args.dpi)
        plt.close()

    matrices_cluster = list(zip(matrices_list, labels_clustering))
    np.savetxt(args.outFileName, matrices_cluster, fmt="%s")
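
# The matching logic above reduces to: for every computed cluster, count its
# overlap with each known cell type and credit a type whenever it dominates
# the cluster. A minimal, self-contained sketch of that score (hypothetical
# helper name `matching_score`; not part of the tool):
import numpy as np

def matching_score(labels_clustering, labels_cell_type, threshold=0.8):
    labels_clustering = np.asarray(labels_clustering)
    labels_cell_type = np.asarray(labels_cell_type)
    cell_types = np.unique(labels_cell_type)
    detected = {cell_type: 0 for cell_type in cell_types}
    for cluster in np.unique(labels_clustering):
        cluster_mask = labels_clustering == cluster
        for cell_type in cell_types:
            matches = np.sum(cluster_mask & (labels_cell_type == cell_type))
            # a cluster 'matches' a type when that type holds >= threshold of it
            if matches / np.sum(cluster_mask) >= threshold:
                detected[cell_type] += matches
    # average, per cell type, the share of its cells inside matching clusters
    return float(np.mean([detected[t] / np.sum(labels_cell_type == t)
                          for t in cell_types]))

# two clusters that perfectly recover two cell types give a score of 1.0:
# matching_score([0, 0, 1, 1], ['A', 'A', 'B', 'B'])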


def main(args=None):
    args = parse_arguments().parse_args(args)
    matrices_name = args.matrix
    threads = args.threads
    matrices_list = cell_name_list(matrices_name)
    if args.createSubmatrix is not None and args.regions is None and args.chromosomes is None:
        for matrix in matrices_list[:args.createSubmatrix]:
            cooler.fileops.cp(args.matrix + '::' + matrix,
                              args.outFileName + '::' + matrix)
        exit(0)

    input_count_matrices = len(matrices_list)
    if threads > len(matrices_list):
        threads = len(matrices_list)

    # load bin ids only once
    cooler_obj_external = cooler.Cooler(matrices_name + '::' +
                                        matrices_list[0])
    bins = cooler_obj_external.bins()[:]

    # use the inverted membership test when it yields a shorter id list:
    # for
    #   indices = pixels['bin1_id'].apply(lambda x: x in pListIds)
    # the lookup is faster when pListIds is short, so if the complementary
    # list is shorter it is passed instead and the workers invert their
    # keep/drop logic accordingly
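    # worked example: with action == 'remove' and chromosomes == ['chr1'],
    # list_ids holds every non-chr1 bin while the inverted list holds only
    # the chr1 bins; the shorter chr1 list wins and the workers invert
    # their keep/drop test instead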
    apply_inverted = False
    if args.action == 'keep':
        list_ids = bins.index[bins['chrom'].apply(
            lambda x: x in args.chromosomes)].tolist()
        list_inverted_logic_ids = bins.index[bins['chrom'].apply(
            lambda x: x not in args.chromosomes)].tolist()

        bins_new = bins[bins['chrom'].apply(
            lambda x: x in args.chromosomes)].reset_index()

    else:
        list_ids = bins.index[bins['chrom'].apply(
            lambda x: x not in args.chromosomes)].tolist()
        list_inverted_logic_ids = bins.index[bins['chrom'].apply(
            lambda x: x in args.chromosomes)].tolist()
        bins_new = bins[bins['chrom'].apply(
            lambda x: x not in args.chromosomes)].reset_index()

    if len(list_inverted_logic_ids) < len(list_ids):
        apply_inverted = True
        list_ids = list_inverted_logic_ids

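    # after reset_index(), bins_new['index'] still holds the original bin
    # ids; inv_map sends every shifted original id to its new position,
    # e.g. keeping bins [0, 1, 4, 5] yields inv_map == {4: 2, 5: 3}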
    dict_values = bins_new['index'].to_dict()
    inv_map = {}
    for k, v in dict_values.items():
        if k == v:
            continue
        inv_map[v] = k
    bins_new.drop(['index'], axis=1, inplace=True)

    all_data_collected = False
    thread_done = [False] * threads
    pixels_thread = [None] * threads
    keep_thread = [None] * threads

    matricesPerThread = len(matrices_list) // threads
    queue = [None] * threads
    process = [None] * threads
    for i in range(threads):

        if i < threads - 1:
            matrices_name_list = matrices_list[i * matricesPerThread:(i + 1) *
                                               matricesPerThread]
        else:
            matrices_name_list = matrices_list[i * matricesPerThread:]

        queue[i] = Queue()
        process[i] = Process(target=compute_adjust_matrix,
                             kwargs=dict(pMatrixName=matrices_name,
                                         pMatricesList=matrices_name_list,
                                         pArgs=args,
                                         pListIds=list_ids,
                                         pInvertedMap=inv_map,
                                         pInvertedLogic=apply_inverted,
                                         pQueue=queue[i]))

        process[i].start()
    while not all_data_collected:
        for i in range(threads):
            if queue[i] is not None and not queue[i].empty():
                pixels_thread[i], keep_thread[i] = queue[i].get()

                queue[i] = None
                process[i].join()
                process[i].terminate()
                process[i] = None
                thread_done[i] = True
        all_data_collected = True
        for thread in thread_done:
            if not thread:
                all_data_collected = False
        time.sleep(1)

    pixels_list = [item for sublist in pixels_thread for item in sublist]
    keep_list = [item for sublist in keep_thread for item in sublist]

    matrices_list = np.array(matrices_list)
    mask = np.array(keep_list)
    matrices_list = matrices_list[mask]

    matrixFileHandler = MatrixFileHandler(pFileType='scool')
    matrixFileHandler.matrixFile.bins = bins_new
    matrixFileHandler.matrixFile.pixel_list = pixels_list
    matrixFileHandler.matrixFile.name_list = matrices_list

    matrixFileHandler.save(args.outFileName,
                           pSymmetric=True,
                           pApplyCorrection=False)
    broken_count = input_count_matrices - np.sum(np.array(keep_list))
    print(
        'Out of {} matrices, {} were removed because they were broken.'.format(
            input_count_matrices, broken_count))
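
# Every tool in this collection shares the same fan-out/fan-in pattern: split
# the cell list across processes, hand each worker a Queue, and poll until all
# workers have reported back. A stripped-down, runnable sketch of that pattern
# (generic names and a toy worker; not the tool's API):
import time
from multiprocessing import Process, Queue

def _toy_worker(pItems, pQueue):
    # stand-in for create_bulk_matrix / compute_adjust_matrix / ...
    pQueue.put(sum(pItems))

def fan_out_fan_in(items, threads):
    itemsPerThread = len(items) // threads
    queue = [None] * threads
    process = [None] * threads
    for i in range(threads):
        # the last worker takes the remainder of the list
        chunk = items[i * itemsPerThread:(i + 1) * itemsPerThread] \
            if i < threads - 1 else items[i * itemsPerThread:]
        queue[i] = Queue()
        process[i] = Process(target=_toy_worker, args=(chunk, queue[i]))
        process[i].start()
    results = [None] * threads
    thread_done = [False] * threads
    while not all(thread_done):
        for i in range(threads):
            if not thread_done[i] and not queue[i].empty():
                results[i] = queue[i].get()
                process[i].join()
                thread_done[i] = True
        time.sleep(0.1)
    return results

# fan_out_fan_in(list(range(100)), threads=4) returns four partial sums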


def main(args=None):

    args = parse_arguments().parse_args(args)

    matrices_name = args.matrix
    threads = args.threads
    matrices_list = cell_name_list(matrices_name)
    if threads > len(matrices_list):
        threads = len(matrices_list)
    compartments_matrix = None

    all_data_collected = False
    thread_done = [False] * threads
    length_index = [None] * threads
    length_index[0] = 0
    matricesPerThread = len(matrices_list) // threads
    queue = [None] * threads
    process = [None] * threads
    for i in range(threads):

        if i < threads - 1:
            matrices_name_list = matrices_list[i * matricesPerThread:(i + 1) *
                                               matricesPerThread]
            length_index[i + 1] = length_index[i] + len(matrices_name_list)
        else:
            matrices_name_list = matrices_list[i * matricesPerThread:]

        queue[i] = Queue()
        process[i] = Process(target=open_and_store_matrix,
                             kwargs=dict(pMatrixName=matrices_name,
                                         pMatricesList=matrices_name_list,
                                         pIndex=length_index[i],
                                         pXDimension=len(matrices_list),
                                         pChromosomes=args.chromosomes,
                                         pNorm=args.norm,
                                         pExtraTrack=args.extraTrack,
                                         pHistonMarkType=args.histonMarkType,
                                         pBinarization=args.binarization,
                                         pQueue=queue[i]))

        process[i].start()

    while not all_data_collected:
        for i in range(threads):
            if queue[i] is not None and not queue[i].empty():
                compartments_worker = queue[i].get()
                if compartments_matrix is None:
                    compartments_matrix = compartments_worker
                else:
                    compartments_matrix += compartments_worker

                queue[i] = None
                process[i].join()
                process[i].terminate()
                process[i] = None
                thread_done[i] = True
        all_data_collected = True
        for thread in thread_done:
            if not thread:
                all_data_collected = False
        time.sleep(1)

    if args.clusterMethod == 'spectral':
        spectral_clustering = SpectralClustering(
            n_clusters=args.numberOfClusters,
            n_jobs=args.threads,
            random_state=0)
        labels_clustering = spectral_clustering.fit_predict(
            compartments_matrix)
    elif args.clusterMethod == 'kmeans':
        # n_jobs and precompute_distances were removed from KMeans in
        # scikit-learn 1.0
        kmeans_object = KMeans(n_clusters=args.numberOfClusters,
                               random_state=0)
        labels_clustering = kmeans_object.fit_predict(compartments_matrix)

    matrices_cluster = list(zip(matrices_list, labels_clustering))
    np.savetxt(args.outFileName, matrices_cluster, fmt="%s")


def main(args=None):

    args = parse_arguments().parse_args(args)

    matrices_name = args.matrix
    threads = args.threads
    matrices_list = cell_name_list(matrices_name)
    svl_matrix = None

    all_data_collected = False
    thread_done = [False] * threads
    length_index = [None] * threads
    length_index[0] = 0
    matricesPerThread = len(matrices_list) // threads
    queue = [None] * threads
    process = [None] * threads
    for i in range(threads):

        if i < threads - 1:
            matrices_name_list = matrices_list[i * matricesPerThread:(i + 1) *
                                               matricesPerThread]
            length_index[i + 1] = length_index[i] + len(matrices_name_list)
        else:
            matrices_name_list = matrices_list[i * matricesPerThread:]

        queue[i] = Queue()
        process[i] = Process(target=create_svl_data,
                             kwargs=dict(pMatrixName=matrices_name,
                                         pMatricesList=matrices_name_list,
                                         pIndex=length_index[i],
                                         pXDimension=len(matrices_list),
                                         pDistanceMin=args.distanceShortRange,
                                         pDistanceMax=args.distanceLongRange,
                                         pQueue=queue[i]))

        process[i].start()

    while not all_data_collected:
        for i in range(threads):
            if queue[i] is not None and not queue[i].empty():
                csr_matrix_worker = queue[i].get()
                if svl_matrix is None:
                    svl_matrix = csr_matrix_worker
                else:
                    svl_matrix += csr_matrix_worker

                queue[i] = None
                process[i].join()
                process[i].terminate()
                process[i] = None
                thread_done[i] = True
        all_data_collected = True
        for thread in thread_done:
            if not thread:
                all_data_collected = False
        time.sleep(1)

    if args.clusterMethod == 'spectral':
        spectral_clustering = SpectralClustering(
            n_clusters=args.numberOfClusters,
            affinity='nearest_neighbors',
            n_jobs=args.threads,
            random_state=0)
        labels_clustering = spectral_clustering.fit_predict(svl_matrix)
    elif args.clusterMethod == 'kmeans':
        # n_jobs and precompute_distances were removed from KMeans in
        # scikit-learn 1.0
        kmeans_object = KMeans(n_clusters=args.numberOfClusters,
                               random_state=0)
        labels_clustering = kmeans_object.fit_predict(svl_matrix)

    matrices_cluster = list(zip(matrices_list, labels_clustering))
    np.savetxt(args.outFileName, matrices_cluster, fmt="%s")
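
# The KMeans parameters removed in scikit-learn 1.0 are patched out above; a
# minimal, self-contained version of the clustering step on dummy data (not
# the tool's real features) for sanity-checking both code paths:
import numpy as np
from sklearn.cluster import KMeans, SpectralClustering

rng = np.random.default_rng(0)
svl_like = rng.random((50, 20))  # stand-in for the per-cell feature matrix

# kmeans path, matching the calls above minus the removed parameters
labels_kmeans = KMeans(n_clusters=3, random_state=0,
                       n_init=10).fit_predict(svl_like)

# spectral path with the nearest-neighbors affinity used for the SVL features
labels_spectral = SpectralClustering(n_clusters=3,
                                     affinity='nearest_neighbors',
                                     n_jobs=-1,
                                     random_state=0).fit_predict(svl_like)
print(labels_kmeans[:10], labels_spectral[:10])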