Exemplo n.º 1
0
def sortMatrix(hm, regionsFileName, transcriptID, transcript_id_designator):
    """
    Reorder the rows of hm so they follow the region order found in the
    files listed in regionsFileName.

    hm:               Object exposing ``parameters`` (a dict holding
                      'group_labels' and 'group_boundaries') and ``matrix``
                      (with ``regions``, ``matrix``, ``group_labels`` and
                      ``group_boundaries`` attributes). It is modified in place.
    regionsFileName:  Iterable of BED/GTF file names (opened with
                      dti.openPossiblyCompressed).
    transcriptID, transcript_id_designator:
                      Passed through unchanged to loadGTF.

    Nothing is returned. A RuntimeError is raised for a file whose first data
    line has fewer than 3 columns, and sys.exit is called if a region group
    found in the files is absent from hm.parameters['group_labels'].
    """

    # Filled in by loadBED/loadGTF. From how they are consumed below:
    # labels maps group label -> group index and regions[idx] maps
    # region name -> position inside group idx.
    labels = dict()
    regions = []
    # A single regions file gets "genes" passed to the loaders as default group
    defaultGroup = "genes" if len(regionsFileName) == 1 else None

    for fname in regionsFileName:
        fp = dti.openPossiblyCompressed(fname)
        try:
            # Skip leading comment lines, remembering the first label found
            line = dti.getNext(fp)
            labelColumn = None
            while line.startswith("#"):
                if not labelColumn:
                    labelColumn = dti.getLabel(line)
                line = dti.getNext(fp)

            # A label column does not count towards the file-type column count
            subtract = 0 if labelColumn is None else 1

            # Determine the file type and load into a list (or list of lists)
            cols = line.strip().split("\t")
            nCols = len(cols) - subtract
            if nCols < 3:
                raise RuntimeError('{0} does not seem to be a recognized file type!'.format(fname))
            if nCols > 6 and dti.seemsLikeGTF(cols):
                loadGTF(line, fp, fname, labels, regions, transcriptID, transcript_id_designator, defaultGroup)
            else:
                loadBED(line, fp, fname, labelColumn, labels, regions, defaultGroup)
        finally:
            # Close the handle even when loading fails
            fp.close()

    # Do some sanity checking on the group labels and region names within them
    s1 = set(hm.parameters['group_labels'])
    for e in labels:
        if e not in s1:
            sys.exit("The computeMatrix output is missing the '{}' region group. It has {} but the specified regions have {}.\n".format(e, s1, labels.keys()))

    # Map group label -> {region name: row index in hm.matrix.regions}
    d = dict()
    for idx, label in enumerate(hm.parameters['group_labels']):
        s = hm.parameters['group_boundaries'][idx]
        e = hm.parameters['group_boundaries'][idx + 1]
        if label not in labels:
            continue
        # Index from s (the slice start) so that rows stay correct even when
        # an earlier group was skipped above. reg[2] is used as the region name.
        d[label] = {reg[2]: row for row, reg in enumerate(hm.matrix.regions[s:e], start=s)}

    # Convert labels to an ordered list
    labelsList = [""] * len(labels)
    for k, v in labels.items():
        labelsList[v] = k

    # Reorder
    order = []
    boundaries = [0]
    for idx, label in enumerate(labelsList):
        # Make an ordered list out of the region names in this region group
        names = [""] * len(regions[idx])
        for k, v in regions[idx].items():
            names[v] = k
        matched = 0
        for name in names:
            if name not in d[label]:
                sys.stderr.write("Skipping {}, due to being absent in the computeMatrix output.\n".format(name))
                continue
            order.append(d[label][name])
            matched += 1
        # Boundaries must describe the rows actually kept, not the original
        # group size, otherwise they disagree with the reordered matrix
        boundaries.append(matched + boundaries[-1])
    hm.matrix.regions = [hm.matrix.regions[i] for i in order]
    # An explicit integer dtype keeps an empty order usable as an index
    order = np.array(order, dtype=int)
    hm.matrix.matrix = hm.matrix.matrix[order, :]

    # Update the parameters
    hm.parameters["group_labels"] = labelsList
    hm.matrix.group_labels = labelsList
    hm.parameters["group_boundaries"] = boundaries
    hm.matrix.group_boundaries = boundaries
    def __init__(self, fnames, keepExons=False, labels=None, verbose=False):
        """
        Driver function to actually parse files. The steps are as follows:

        1) skip to the first non-comment line
        2) Infer the type from that
        3) Call a type-specific processing function accordingly
          * These call the underlying C code for storage
          * These handle chromosome name conversions (python-level)

        Required inputs are as follows:

        fnames:       A list of (possibly compressed with gzip or bzip2) GTF or BED files.
                      A single non-list value is wrapped into a one-element list.

        Optional input is:

        keepExons:    For BED12 files, exons are ignored by default.
        labels:       Override the feature labels supplied in the file(s).
                      Note that this might instead be replaced later in the .features attribute.
                      Not used for GTF files. Must have at least one entry per
                      file that reaches it, otherwise an AssertionError is raised.
        verbose:      Whether to print warnings (default: False)

        Raises RuntimeError if no valid regions or no feature labels were
        found after all files have been processed.
        """
        self.fname = []
        self.filename = ""
        self.chroms = []
        self.features = []
        self.tree = tree.initTree()
        self.keepExons = keepExons
        self.verbose = verbose

        if not isinstance(fnames, list):
            fnames = [fnames]

        # Load the files
        for labelIdx, fname in enumerate(fnames):
            self.filename = fname
            fp = openPossiblyCompressed(fname)
            # try/finally so the handle is also released for empty files
            # (the ``continue`` below) and when parsing raises
            try:
                line, labelColumn = self.firstNonComment(fp)
                if line is None:
                    # This will only ever happen if a file is empty or just has a header/comment
                    continue
                line = line.strip()

                ftype = self.inferType(fp, line, labelColumn)

                if ftype != 'GTF' and labels is not None:
                    assert len(labels) > labelIdx, "fewer labels than input files"
                    bname = labels[labelIdx]
                else:
                    bname = basename(fname)

                if ftype == 'GTF':
                    self.parseGTF(fp, line)
                else:
                    # Anything that is neither BED3 nor BED6 is parsed as BED12
                    ncols = {'BED3': 3, 'BED6': 6}.get(ftype, 12)
                    self.parseBED(fp, line, ncols, feature=bname, labelColumn=labelColumn)
            finally:
                fp.close()

        # Sanity check
        if self.tree.countEntries() == 0:
            raise RuntimeError("None of the input BED/GTF files had valid regions")

        if len(self.features) == 0:
            raise RuntimeError("There were no valid feature labels!")

        # vine -> tree
        self.tree.finish()
Exemplo n.º 3
0
    def __init__(self, fnames, keepExons=False, labels=None, verbose=False):
        """
        Driver function to actually parse files. The steps are as follows:

        1) skip to the first non-comment line
        2) Infer the type from that
        3) Call a type-specific processing function accordingly
          * These call the underlying C code for storage
          * These handle chromosome name conversions (python-level)

        Required inputs are as follows:

        fnames:       A list of (possibly compressed with gzip or bzip2) GTF or BED files.
                      A single non-list value is wrapped into a one-element list.

        Optional input is:

        keepExons:    For BED12 files, exons are ignored by default.
        labels:       Override the feature labels supplied in the file(s).
                      Note that this might instead be replaced later in the .features attribute.
                      Not used for GTF files. Must have at least one entry per
                      file that reaches it, otherwise an AssertionError is raised.
        verbose:      Whether to print warnings (default: False)

        Raises RuntimeError if no valid regions or no feature labels were
        found after all files have been processed.
        """
        self.fname = []
        self.filename = ""
        self.chroms = []
        self.features = []
        self.tree = tree.initTree()
        self.keepExons = keepExons
        self.verbose = verbose

        if not isinstance(fnames, list):
            fnames = [fnames]

        # Load the files
        for labelIdx, fname in enumerate(fnames):
            self.filename = fname
            fp = openPossiblyCompressed(fname)
            # try/finally so the handle is also released for empty files
            # (the ``continue`` below) and when parsing raises
            try:
                line, labelColumn = self.firstNonComment(fp)
                if line is None:
                    # This will only ever happen if a file is empty or just has a header/comment
                    continue
                line = line.strip()

                ftype = self.inferType(fp, line, labelColumn)

                if ftype != 'GTF' and labels is not None:
                    assert len(labels) > labelIdx, "fewer labels than input files"
                    bname = labels[labelIdx]
                else:
                    bname = basename(fname)

                if ftype == 'GTF':
                    self.parseGTF(fp, line)
                else:
                    # Anything that is neither BED3 nor BED6 is parsed as BED12
                    ncols = {'BED3': 3, 'BED6': 6}.get(ftype, 12)
                    self.parseBED(fp, line, ncols, feature=bname, labelColumn=labelColumn)
            finally:
                fp.close()

        # Sanity check
        if self.tree.countEntries() == 0:
            raise RuntimeError("None of the input BED/GTF files had valid regions")

        if len(self.features) == 0:
            raise RuntimeError("There were no valid feature labels!")

        # vine -> tree
        self.tree.finish()