def read_metrics_csv(self, cndata):
        """
        Read per-cell values from the metrics file at ``self.metrics``.

        Rows whose ``cell_id`` is not in ``cndata.index`` are skipped, and so
        are rows whose ``self.color_by_col`` value is not in
        ``self.cellcalls`` (only when ``self.cellcalls`` is non-empty).

        :param cndata: object whose ``index`` holds the cell ids to keep
            (presumably a pandas DataFrame -- confirm against the caller)
        :returns: tuple ``(data, sepdata, colordata, numread_data,
            reads_per_bin_data)``:

            * ``data``: cell id -> ``mad_neutral_state`` (NaN when "NA")
            * ``sepdata``: ``self.plot_by_col`` value (or ``'all'``) -> list
              of cell ids in that group
            * ``colordata``: cell id -> ``self.color_by_col`` value
            * ``numread_data``: cell id -> ``total_mapped_reads_hmmcopy``
            * ``reads_per_bin_data``: cell id ->
              ``median_hmmcopy_reads_per_bin`` (0 when "NA")
        :raises AssertionError: if the header row of the file does not match
            the columns reported by ``csvutils.get_metadata``
        """

        samples = cndata.index

        data = {}
        numread_data = {}
        reads_per_bin_data = {}

        sepdata = defaultdict(list)
        colordata = {}

        header, _, columns = csvutils.get_metadata(self.metrics)
        idxs = self.build_label_indices(columns)

        color_col = self.color_by_col
        sep_col = self.plot_by_col

        with helpers.getFileHandle(self.metrics) as freader:

            if header:
                # The header must be consumed outside of an ``assert``:
                # assert statements are removed under ``python -O``, which
                # would leave the header row to be parsed as data. It is
                # split with the same separator as the data rows.
                header_fields = freader.readline().strip().split(self.sep)
                if header_fields != columns:
                    # AssertionError is kept so callers see the same
                    # exception type as before.
                    raise AssertionError(
                        "header of {} does not match its metadata: "
                        "{} != {}".format(
                            self.metrics, header_fields, columns))

            for line in freader:
                fields = line.strip().split(self.sep)

                sample_id = fields[idxs['cell_id']]

                # skip cells that are absent from cndata (presumably dropped
                # upstream for being all na or inf)
                if sample_id not in samples:
                    continue

                val = fields[idxs["mad_neutral_state"]]
                val = float('nan') if val == "NA" else float(val)

                # group key: a single 'all' group unless a column was chosen
                ec = 'all' if sep_col == 'all' else fields[idxs[sep_col]]

                cc = fields[idxs[color_col]]

                numreads = float(fields[idxs['total_mapped_reads_hmmcopy']])

                reads_per_bin = fields[idxs['median_hmmcopy_reads_per_bin']]
                reads_per_bin = (
                    0 if reads_per_bin == "NA" else float(reads_per_bin))

                # optional whitelist on the colour-by value
                if self.cellcalls and cc not in self.cellcalls:
                    continue

                numread_data[sample_id] = numreads
                data[sample_id] = val
                reads_per_bin_data[sample_id] = reads_per_bin

                colordata[sample_id] = cc
                sepdata[ec].append(sample_id)

        return data, sepdata, colordata, numread_data, reads_per_bin_data
Exemplo n.º 2
0
    def parse_segs(self, segs, metrics):
        """parses hmmcopy segments data

        Rows belonging to cells whose ``metrics`` value is greater than
        ``self.quality_threshold`` are skipped.

        :param segs: path to hmmcopy segs file
        :param metrics: mapping indexed by cell id; each value is compared
            against ``self.quality_threshold``
        :returns: dict mapping cell id to
            ``[cell_id, chrom, start, end, segment_length, state]``, where
            ``start``, ``end`` and ``state`` are the raw strings from the
            file and ``segment_length`` is an int
        :raises AssertionError: if the header row of the file does not match
            the columns reported by ``csvutils.get_metadata``
        """
        header_flag, _, columns = csvutils.get_metadata(segs)

        header = {v: i for i, v in enumerate(columns)}

        segs_data = {}

        with helpers.getFileHandle(segs) as segfile:

            if header_flag:
                # The header must be consumed outside of an ``assert``:
                # assert statements are removed under ``python -O``, which
                # would leave the header row to be parsed as data.
                header_fields = segfile.readline().strip().split(',')
                if header_fields != columns:
                    # AssertionError is kept so callers see the same
                    # exception type as before.
                    raise AssertionError(
                        "header of {} does not match its metadata: "
                        "{} != {}".format(segs, header_fields, columns))

            for row in segfile:
                row = row.strip().split(',')

                chrom = row[header["chr"]]
                start = row[header["start"]]
                end = row[header["end"]]
                cell_id = row[header["cell_id"]]
                state = row[header["state"]]
                # float to handle scientific notation
                segment_length = int(float(end)) - int(float(start)) + 1

                if metrics[cell_id] > self.quality_threshold:
                    continue

                # NOTE(review): this assignment replaces any earlier row for
                # the same cell, so only the last retained segment per cell
                # is returned -- confirm this is intended.
                segs_data[cell_id] = [
                    cell_id, chrom, start, end, segment_length, state
                ]
        return segs_data
    def read_segs_csv(self):
        """
        Read ``self.column_name`` per cell and per bin from ``self.input``.

        Values equal to "NA" become NaN. When ``self.mappability_threshold``
        is set, values of bins whose ``map`` column is at or below the
        threshold are also replaced with NaN.

        :returns: the result of ``self.get_pandas_dataframe`` applied to the
            cell-by-bin matrix built by ``self.conv_to_matrix`` (presumably a
            pandas DataFrame -- confirm against those helpers)
        :raises AssertionError: if the header row of the file does not match
            the columns reported by ``csvutils.get_metadata``
        :raises Exception: if the same (chr, start, end) bin appears twice
            for one cell
        """
        data = {}

        bins = {}

        header, _, columns = csvutils.get_metadata(self.input)
        idxs = self.build_label_indices(columns)

        with helpers.getFileHandle(self.input, 'rt') as freader:

            if header:
                # The header must be consumed outside of an ``assert``:
                # assert statements are removed under ``python -O``, which
                # would leave the header row to be parsed as data. It is
                # split with the same separator as the data rows.
                header_fields = freader.readline().strip().split(self.sep)
                if header_fields != columns:
                    # AssertionError is kept so callers see the same
                    # exception type as before.
                    raise AssertionError(
                        "header of {} does not match its metadata: "
                        "{} != {}".format(
                            self.input, header_fields, columns))

            for line in freader:
                fields = line.strip().split(self.sep)

                sample_id = fields[idxs['cell_id']]

                val = fields[idxs[self.column_name]]
                val = float('nan') if val == "NA" else float(val)

                chrom = fields[idxs['chr']]
                # go through float so coordinates written in scientific
                # notation (e.g. "1e+06") are accepted as well
                start = int(float(fields[idxs['start']]))
                end = int(float(fields[idxs['end']]))

                seg = (chrom, start, end)

                # mask bins at or below the mappability threshold
                if self.mappability_threshold and float(
                        fields[idxs["map"]]) <= self.mappability_threshold:
                    val = float("nan")

                bins.setdefault(chrom, set()).add((start, end))

                cell_values = data.setdefault(sample_id, {})

                # just a sanity check, not required
                if seg in cell_values:
                    raise Exception("repeated val")

                cell_values[seg] = val

        # the file is no longer needed once every row has been read
        samples = sorted(data.keys())
        bins = self.sort_bins_csv(bins)

        data = self.conv_to_matrix(data, bins, samples)
        data = self.get_pandas_dataframe(data, bins)

        return data