def sortMatrix(hm, regionsFileName, transcriptID, transcript_id_designator):
    """
    Iterate through the files noted by regionsFileName and sort hm accordingly

    hm is modified in place: the rows of hm.matrix.matrix and the entries of
    hm.matrix.regions are reordered to follow the order of the regions in the
    given files, and the group labels/boundaries stored in both hm.parameters
    and hm.matrix are replaced to match.

    Parameters:
        hm: object exposing a ``parameters`` dict (with 'group_labels' and
            'group_boundaries') and a ``matrix`` attribute carrying
            ``regions`` and ``matrix``. The third element of each region
            entry is used as its name.
        regionsFileName: list of region file names. If exactly one file is
            given, "genes" is passed to the loaders as the default group.
        transcriptID, transcript_id_designator: only forwarded to loadGTF.

    Raises:
        RuntimeError: if the first data line of a file has fewer than 3
            columns (not counting a label column).
        SystemExit: if a region group from the files is missing in hm, or if
            none of the regions of a group can be found in hm.
    """
    labels = dict()
    regions = []
    defaultGroup = None
    if len(regionsFileName) == 1:
        defaultGroup = "genes"

    for fname in regionsFileName:
        fp = dti.openPossiblyCompressed(fname)
        # Make sure the file is closed even if parsing/loading fails
        try:
            line = dti.getNext(fp)
            labelColumn = None
            while line.startswith("#"):
                if not labelColumn:
                    labelColumn = dti.getLabel(line)
                line = dti.getNext(fp)

            # Find the label column
            subtract = 0
            if labelColumn is not None:
                subtract = 1

            # Determine the file type and load into a list (or list of lists)
            cols = line.strip().split("\t")
            if len(cols) - subtract < 3:
                raise RuntimeError('{0} does not seem to be a recognized file type!'.format(fname))
            elif len(cols) - subtract <= 6:
                loadBED(line, fp, fname, labelColumn, labels, regions, defaultGroup)
            elif len(cols) and dti.seemsLikeGTF(cols):
                loadGTF(line, fp, fname, labels, regions, transcriptID, transcript_id_designator, defaultGroup)
            else:
                loadBED(line, fp, fname, labelColumn, labels, regions, defaultGroup)
        finally:
            fp.close()

    # Do some sanity checking on the group labels and region names within them
    s1 = set(hm.parameters['group_labels'])
    for e in labels:
        if e not in s1:
            sys.exit("The computeMatrix output is missing the '{}' region group. It has {} but the specified regions have {}.\n".format(e, s1, labels.keys()))

    # Map each region name to its row in the current matrix, per group.
    # The row is the absolute index into hm.matrix.regions, so groups that
    # are skipped here do not shift the indices of the groups after them.
    rowIndex = dict()
    groupBoundaries = hm.parameters['group_boundaries']
    for idx, label in enumerate(hm.parameters['group_labels']):
        if label not in labels:
            continue
        s = groupBoundaries[idx]
        e = groupBoundaries[idx + 1]
        rowIndex[label] = dict()
        for offset, reg in enumerate(hm.matrix.regions[s:e]):
            rowIndex[label][reg[2]] = s + offset

    # Convert labels to an ordered list (labels maps a label to its position)
    labelsList = [""] * len(labels)
    for k, v in labels.items():
        labelsList[v] = k

    # Reorder
    order = []
    boundaries = [0]
    for idx, label in enumerate(labelsList):
        # Make an ordered list out of the region names in this region group
        orderedNames = [""] * len(regions[idx])
        for name, rank in regions[idx].items():
            orderedNames[rank] = name

        matched = 0  # Track the number of entries actually matched
        for name in orderedNames:
            if name not in rowIndex[label]:
                sys.stderr.write("Skipping {}, due to being absent in the computeMatrix output.\n".format(name))
                continue
            matched += 1
            order.append(rowIndex[label][name])
        if matched == 0:
            sys.exit("The region group {} had no matching entries!\n".format(label))

        # Boundaries follow the rows actually kept, not the original group size
        boundaries.append(matched + boundaries[-1])

    hm.matrix.regions = [hm.matrix.regions[i] for i in order]
    # An explicit integer dtype keeps the indexing valid when order is empty
    order = np.array(order, dtype=int)
    hm.matrix.matrix = hm.matrix.matrix[order, :]

    # Update the parameters
    hm.parameters["group_labels"] = labelsList
    hm.matrix.group_labels = labelsList
    hm.parameters["group_boundaries"] = boundaries
    hm.matrix.group_boundaries = boundaries
def sortMatrix(hm, regionsFileName, transcriptID, transcript_id_designator):
    """
    Iterate through the files noted by regionsFileName and sort hm accordingly

    hm is modified in place: the rows of hm.matrix.matrix and the entries of
    hm.matrix.regions are reordered to follow the order of the regions in the
    given files, and the group labels/boundaries stored in both hm.parameters
    and hm.matrix are replaced to match.

    Parameters:
        hm: object exposing a ``parameters`` dict (with 'group_labels' and
            'group_boundaries') and a ``matrix`` attribute carrying
            ``regions`` and ``matrix``. The third element of each region
            entry is used as its name.
        regionsFileName: list of region file names. If exactly one file is
            given, "genes" is passed to the loaders as the default group.
        transcriptID, transcript_id_designator: only forwarded to loadGTF.

    Raises:
        RuntimeError: if the first data line of a file has fewer than 3
            columns (not counting a label column).
        SystemExit: if a region group from the files is missing in hm, or if
            none of the regions of a group can be found in hm.
    """
    labels = dict()
    regions = []
    defaultGroup = None
    if len(regionsFileName) == 1:
        defaultGroup = "genes"

    for fname in regionsFileName:
        fp = dti.openPossiblyCompressed(fname)
        # Make sure the file is closed even if parsing/loading fails
        try:
            line = dti.getNext(fp)
            labelColumn = None
            while line.startswith("#"):
                if not labelColumn:
                    labelColumn = dti.getLabel(line)
                line = dti.getNext(fp)

            # Find the label column
            subtract = 0
            if labelColumn is not None:
                subtract = 1

            # Determine the file type and load into a list (or list of lists)
            cols = line.strip().split("\t")
            if len(cols) - subtract < 3:
                raise RuntimeError('{0} does not seem to be a recognized file type!'.format(fname))
            elif len(cols) - subtract <= 6:
                loadBED(line, fp, fname, labelColumn, labels, regions, defaultGroup)
            elif len(cols) and dti.seemsLikeGTF(cols):
                loadGTF(line, fp, fname, labels, regions, transcriptID, transcript_id_designator, defaultGroup)
            else:
                loadBED(line, fp, fname, labelColumn, labels, regions, defaultGroup)
        finally:
            fp.close()

    # Do some sanity checking on the group labels and region names within them
    s1 = set(hm.parameters['group_labels'])
    for e in labels:
        if e not in s1:
            sys.exit("The computeMatrix output is missing the '{}' region group. It has {} but the specified regions have {}.\n".format(e, s1, labels.keys()))

    # Map each region name to its row in the current matrix, per group.
    # The row is the absolute index into hm.matrix.regions, so groups that
    # are skipped here do not shift the indices of the groups after them.
    rowIndex = dict()
    groupBoundaries = hm.parameters['group_boundaries']
    for idx, label in enumerate(hm.parameters['group_labels']):
        if label not in labels:
            continue
        s = groupBoundaries[idx]
        e = groupBoundaries[idx + 1]
        rowIndex[label] = dict()
        for offset, reg in enumerate(hm.matrix.regions[s:e]):
            rowIndex[label][reg[2]] = s + offset

    # Convert labels to an ordered list (labels maps a label to its position)
    labelsList = [""] * len(labels)
    for k, v in labels.items():
        labelsList[v] = k

    # Reorder
    order = []
    boundaries = [0]
    for idx, label in enumerate(labelsList):
        # Make an ordered list out of the region names in this region group
        orderedNames = [""] * len(regions[idx])
        for name, rank in regions[idx].items():
            orderedNames[rank] = name

        matched = 0  # Track the number of entries actually matched
        for name in orderedNames:
            if name not in rowIndex[label]:
                sys.stderr.write("Skipping {}, due to being absent in the computeMatrix output.\n".format(name))
                continue
            matched += 1
            order.append(rowIndex[label][name])
        if matched == 0:
            sys.exit("The region group {} had no matching entries!\n".format(label))

        # Use the number of rows actually kept: the original group size
        # overcounts whenever a region is skipped or not listed in the file
        boundaries.append(matched + boundaries[-1])

    hm.matrix.regions = [hm.matrix.regions[i] for i in order]
    # An explicit integer dtype keeps the indexing valid when order is empty
    order = np.array(order, dtype=int)
    hm.matrix.matrix = hm.matrix.matrix[order, :]

    # Update the parameters
    hm.parameters["group_labels"] = labelsList
    hm.matrix.group_labels = labelsList
    hm.parameters["group_boundaries"] = boundaries
    hm.matrix.group_boundaries = boundaries