def _commonComplementGtrackFile(origFn, dbFn, intersectingFactor, gtrackColsToAdd, genome):
    """Complement the elements of one GTrack file with columns from another.

    Both files are opened as GtrackGenomeElementSource objects. The database
    file is indexed by IdFullInfoDict or TupleFullInfoDict depending on
    ``intersectingFactor``, the original elements are wrapped in an
    ElementComplementer, composed to a string and finally passed through
    expandHeadersOfGtrackFileAndReturnComposer.

    Parameters:
        origFn: file name of the GTrack file whose elements are complemented.
        dbFn: file name of the GTrack file supplying the extra columns.
        intersectingFactor: 'id' or 'position'; selects the lookup dict class.
        gtrackColsToAdd: GTrack column names to add. If 'value' and/or 'edges'
            are among them, the matching type/dimension headers of the
            database file are forced onto the composed file.
        genome: genome passed on to the element sources and header expansion.

    Returns:
        Whatever expandHeadersOfGtrackFileAndReturnComposer returns for the
        composed contents.

    Raises:
        ShouldNotOccurError: if ``intersectingFactor`` is neither 'id' nor
            'position'.
    """
    origGESource = GtrackGenomeElementSource(origFn, genome)
    dbGESource = GtrackGenomeElementSource(dbFn, genome)

    dbPrefixes = dbGESource.getPrefixList()

    if intersectingFactor == 'id':
        fullDbDict = IdFullInfoDict(dbGESource, dbPrefixes)
    elif intersectingFactor == 'position':
        fullDbDict = TupleFullInfoDict(dbGESource, dbPrefixes)
    else:
        # The original only named the exception class here without raising it,
        # so an invalid factor went on to fail on the unbound fullDbDict.
        raise ShouldNotOccurError()

    forcedHeaderDict = {}
    dbHeaderDict = dbGESource.getHeaderDict()

    # Copy the type/dimension headers of every added value-carrying column
    # from the database file so that the composed file declares them.
    for gtrackCol, headerPrefix in (('value', 'value'), ('edges', 'edge weight')):
        if gtrackCol in gtrackColsToAdd:
            for headerSuffix in ('type', 'dimension'):
                header = '%s %s' % (headerPrefix, headerSuffix)
                forcedHeaderDict[header] = dbHeaderDict[header]

    if origGESource.isExtendedGtrackFile():
        composerCls = ExtendedGtrackComposer
    else:
        composerCls = StdGtrackComposer

    complementer = ElementComplementer(origGESource, fullDbDict, gtrackColsToAdd)
    composedFile = composerCls(complementer, forcedHeaderDict=forcedHeaderDict).returnComposed()

    return expandHeadersOfGtrackFileAndReturnComposer('', genome, strToUseInsteadOfFn=composedFile)
def _determineHeaderLines(self, hbColumns, columns):
    """Populate the header dictionary for the file that is about to be composed.

    Header values are collected from the wrapped element source and from the
    column specification. For extended GTrack the fixed-size headers are set
    as well, the columns are adjusted to the headers, and subtype compliance
    is tested. Forced headers that are not already set are added at the end.

    Returns the (possibly rearranged) pair ``(hbColumns, columns)``.
    """
    setHeader = self._setHeaderDict

    # General headers; the call order is kept as it defines the order in
    # which the headers are registered.
    setHeader('track type', Gtrack.getTrackTypeFromColumnSpec(columns))
    setHeader('value type', self._getGtrackValueType())
    setHeader('value dimension', Gtrack.getGtrackValueDimension(self._geSource.getValDim()))
    setHeader('undirected edges', self._geSource.hasUndirectedEdges())
    setHeader('edge weights', ('weights' in hbColumns))
    setHeader('edge weight type', self._getGtrackEdgeWeightType())
    setHeader('edge weight dimension', Gtrack.getGtrackValueDimension(self._geSource.getEdgeWeightDim()))
    setHeader('uninterrupted data lines', not self._hasMoreThanOneBoundingRegion())
    setHeader('sorted elements', self._geSource.isSorted())
    setHeader('no overlapping elements', self._geSource.hasNoOverlappingElements())
    setHeader('circular elements', self._geSource.hasCircularElements())

    compliesToSubtype = False
    if self._USE_EXTENDED_GTRACK:
        setHeader('fixed length', self._geSource.getFixedLength())
        setHeader('fixed gap size', self._geSource.getFixedGapSize())
        setHeader('fixed-size data lines', self._determineIfFixedSizeDataLines(columns))
        # Read the value back from the header dict rather than reusing the
        # computed one, exactly as the stored header decides the next step.
        if self._headerDict['fixed-size data lines']:
            setHeader('data line size', self._geSource.getValDim())

        hbColumns, columns = self._adjustColumnsAccordingToHeaderLines(hbColumns, columns)
        subtypeResult = self._determineIfFileCompliesToSubtypes(hbColumns, columns)
        hbColumns, columns, compliesToSubtype = subtypeResult

    if not compliesToSubtype:
        # No subtype dictates the coordinate conventions; use those of the input.
        setHeader('1-indexed', self._geSource.inputIsOneIndexed())
        setHeader('end inclusive', self._geSource.inputIsEndInclusive())

    for header, val in self._forcedHeaderDict.iteritems():
        if header in self._headerDict:
            continue
        self._headerDict[header] = val

    return hbColumns, columns
def _checkValidEnd(self, chr, end, start=None):
    """Validate an end coordinate, noting circular elements on the way.

    When ``start`` is given and ``end <= start``, the 'circular elements'
    header is switched on (if it is not already truthy) and ``start`` is
    dropped before delegating to GtrackGenomeElementSource._checkValidEnd.
    """
    endNotAfterStart = start is not None and end <= start
    if endNotAfterStart:
        headers = self._headerDict
        if not headers['circular elements']:
            headers['circular elements'] = True
        # The parent check is run without the start coordinate in this case.
        start = None

    return GtrackGenomeElementSource._checkValidEnd(self, chr, end, start)
def _parseEdges(self, edgeStr):
    """Inspect the edge weights of an edges string, then parse it as usual.

    Every ';'-separated edge specification containing '=' switches the
    'edge weights' header on and has its weight passed through
    _getValInCorrectType with 'edge weight'. The actual parsing is delegated
    to GtrackGenomeElementSource._parseEdges.
    """
    # '.' is skipped entirely, as in the original condition.
    edgeSpecs = edgeStr.split(';') if edgeStr != '.' else []

    for edgeSpec in edgeSpecs:
        if '=' not in edgeSpec:
            continue
        if not self._headerDict['edge weights']:
            self._headerDict['edge weights'] = True
        weightStr = edgeSpec.split('=')[1]
        self._getValInCorrectType(weightStr, 'edge weight')

    return GtrackGenomeElementSource._parseEdges(self, edgeStr)
def __init__(self, geSource, fullDbDict, gtrackColsToAdd):
    """Wrap ``geSource`` so that extra columns can be filled in from ``fullDbDict``.

    ``gtrackColsToAdd`` holds GTrack column names; they are converted with
    GtrackGenomeElementSource.convertNameFromGtrack, and 'weights' is added
    whenever 'edges' is among the converted names. The resulting prefixes are
    appended to the prefix list of the wrapped source.
    """
    prefixesToAdd = []
    for gtrackCol in gtrackColsToAdd:
        prefixesToAdd.append(GtrackGenomeElementSource.convertNameFromGtrack(gtrackCol))
    if 'edges' in prefixesToAdd:
        prefixesToAdd.append('weights')
    # Assigned before the wrapper initialisation, as in the original order.
    self._prefixesToAdd = prefixesToAdd

    ElementModifierGESourceWrapper.__init__(self, geSource)

    self._fullDbDict = fullDbDict
    self._prefixList = geSource.getPrefixList() + self._prefixesToAdd
def _iter(self):
    """Reset the per-pass bookkeeping and optimistic headers, then start iterating.

    The three bookkeeping dicts are emptied, 'sorted elements' is set to True
    and, for track types starting with 'linked', 'undirected edges' is set to
    True as well. Iteration itself is delegated to
    GtrackGenomeElementSource._iter.
    """
    for attrName in ('_valTypeIndexDict', '_valLenDict', '_allMissingDict'):
        setattr(self, attrName, {})

    headers = self._headerDict
    # NOTE: a preset of 'no overlapping elements' to True used to sit here but
    # was commented out; that header is not modified in this method.
    headers['sorted elements'] = True
    trackType = headers['track type']
    if trackType.startswith('linked'):
        headers['undirected edges'] = True

    return GtrackGenomeElementSource._iter(self)
def _getGtrackValueDim(self, val, valTypeInfo, valueOrEdgeWeight):
    """Track the length of ``val`` and return the matching GTrack dimension.

    The length is the number of parts after splitting on ``valTypeInfo.delim``,
    or ``len(val)`` when the delimiter is the empty string. The first length
    seen for ``valueOrEdgeWeight`` is remembered; a later differing length
    resets the remembered length to 0. The remembered length is translated
    with GtrackGenomeElementSource.getGtrackValueDimension.
    """
    delim = valTypeInfo.delim
    if delim != '':
        valLen = len(val.split(delim))
    else:
        valLen = len(val)

    seenLengths = self._valLenDict
    if valueOrEdgeWeight not in seenLengths:
        seenLengths[valueOrEdgeWeight] = valLen
    elif seenLengths[valueOrEdgeWeight] != valLen:
        # Lengths disagree between elements: fall back to length 0.
        seenLengths[valueOrEdgeWeight] = 0

    return GtrackGenomeElementSource.getGtrackValueDimension(seenLengths[valueOrEdgeWeight])
def _handleEndOfFile(self):
    """Finish header detection with a second pass over the elements.

    After the regular end-of-file handling, a new iterator is run to the end
    with the value type indexes found so far, and its value lengths are
    adopted. 'undirected edges' is cleared when no unique edge ids were
    collected, and a value/edge weight whose entry in the second pass'
    all-missing dict equals True gets the type 'number'.
    """
    GtrackGenomeElementSource._handleEndOfFile(self)

    # To fix an issue where value dimension is "list" if the value type was
    # wrongly guessed for early elements.
    secondPass = self.__iter__()
    secondPass._valTypeIndexDict = self._valTypeIndexDict
    # Use the basic end-of-file handler on the second pass instead of this method.
    secondPass._handleEndOfFile = secondPass._basicHandleEndOfFile

    # Drain through explicit next() calls on this very object.
    while True:
        try:
            secondPass.next()
        except StopIteration:
            break

    self._valLenDict = secondPass._valLenDict

    if len(self._uniqueEdgeIds) == 0:
        self._headerDict['undirected edges'] = False

    for valueOrEdgeWeight in ('value', 'edge weight'):
        allMissing = secondPass._allMissingDict
        if valueOrEdgeWeight not in allMissing:
            continue
        if allMissing[valueOrEdgeWeight] == True:
            self._headerDict['%s type' % valueOrEdgeWeight] = 'number'
def _composeBoundingRegionLine(self, boundingRegionTuple):
    """Compose the '####' bounding region line for one bounding region.

    A copy of the region is shifted according to the '1-indexed' and
    'end inclusive' headers. The genome, chr, start and end attributes that
    are not None are written as '; '-separated ``key=value`` pairs, with the
    values passed through _formatPhraseWithCorrectChrUsage, and the line is
    terminated with os.linesep.
    """
    def _shifted(coordinate, delta):
        # None coordinates stay None; others are moved by delta.
        return coordinate + delta if coordinate is not None else None

    region = copy(boundingRegionTuple.region)

    if self._headerDict["1-indexed"]:
        region.start = _shifted(region.start, 1)
        region.end = _shifted(region.end, 1)

    if self._headerDict["end inclusive"]:
        region.end = _shifted(region.end, -1)

    brLinePartList = []
    for attr in ["genome", "chr", "start", "end"]:
        brLinePartList.append((Gtrack.convertNameToGtrack(attr), getattr(region, attr)))

    assignments = []
    for key, value in brLinePartList:
        if value is None:
            continue
        formatted = self._formatPhraseWithCorrectChrUsage(
            str(value), useUrlEncoding=True, notAllowedChars="=;#\t"
        )
        assignments.append(key + "=" + formatted)

    return "####" + "; ".join(assignments) + os.linesep
def _getValInCorrectType(self, val, valueOrEdgeWeight='value', isEmptyElement=False):
    """Detect the value type of ``val`` and update the headers before converting it.

    The types 'binary', 'number', 'category' and 'character' are tried in that
    order, skipping those that come before the type index already recorded for
    ``valueOrEdgeWeight``. For the first matching type the index is recorded,
    the dimension is updated, and the '<kind> type' / '<kind> dimension'
    headers are set unless they are present in the headers of the file. The
    conversion is delegated to GtrackGenomeElementSource._getValInCorrectType.

    Raises ShouldNotOccurError if no type matches.
    """
    headerDictInFile = self.getHeaderDictInFile()
    typeHeader = '%s type' % valueOrEdgeWeight
    dimensionHeader = '%s dimension' % valueOrEdgeWeight

    valTypeList = ['binary', 'number', 'category', 'character']
    for typeIndex, valueType in enumerate(valTypeList):
        typeIndexes = self._valTypeIndexDict
        if valueOrEdgeWeight in typeIndexes and typeIndexes[valueOrEdgeWeight] > typeIndex:
            # A later type in the list was already established; never go back.
            continue

        valTypeInfo = GtrackGenomeElementSource.VAL_TYPE_DICT[valueType]
        if not self._isValOfParticularType(val, valTypeInfo):
            continue

        self._noteIfAllValuesAreMissing(valueOrEdgeWeight, val, valTypeInfo)
        self._valTypeIndexDict[valueOrEdgeWeight] = typeIndex
        valueDim = self._getGtrackValueDim(val, valTypeInfo, valueOrEdgeWeight)

        # Headers stated explicitly in the file are not overridden.
        if typeHeader not in headerDictInFile:
            self._headerDict[typeHeader] = valueType
        if dimensionHeader not in headerDictInFile:
            self._headerDict[dimensionHeader] = valueDim

        return GtrackGenomeElementSource._getValInCorrectType(self, val, valueOrEdgeWeight, isEmptyElement)

    raise ShouldNotOccurError()
def testHeaderExpansion(self):
    """Check header expansion for every test case whose name starts with 'gtrack'.

    For each case the expandable header lines are removed, the file is written
    and passed to expandHeadersOfGtrackFileAndReturnContents (with and without
    onlyNonDefault), and the results are compared with the original header
    values and with the expected elements and bounding regions.

    Case name markers handled here:
      'no_expand'         - the case is skipped altogether.
      'no_types_expanded' - only the headers in EXPANDABLE_HEADERS are removed.
      'no_check_expand'   - the header value comparison is skipped.
    """
    geSourceTest = self._commonSetup()
    for caseName in geSourceTest.cases:
        if not caseName.startswith('gtrack'):
            continue
        if 'no_expand' in caseName:
            print 'Test case skipped: ' + caseName
            continue
        onlyGuaranteed = 'no_types_expanded' in caseName
        print caseName
        print '==========='
        case = geSourceTest.cases[caseName]
        # Rewrite header lines as '##key: value' with lowercased key and value;
        # other lines are kept as they are.
        headerLines = [line if not self._isHeaderLine(line) else
                       '##' + ': '.join([str(x).lower() for x in Gtrack.getHeaderKeyValue(line.strip())])
                       for line in case.headerLines]
        fullContents = os.linesep.join(headerLines + case.lines)
        print 'Original:\n\n' + fullContents
        # Strip the expandable headers; note that this mutates the test case.
        case.headerLines = [line for line in headerLines if not self._isExpandableHeader(line, onlyGuaranteed)]
        print '-----'
        print 'With headers removed:\n\n' + os.linesep.join(case.headerLines + case.lines)
        testFn = self._writeTestFile(case)
        expandedContents = expandHeadersOfGtrackFileAndReturnContents(testFn, case.genome, onlyNonDefault=False)
        print '-----'
        print 'With expanded headers:\n\n' + expandedContents
        expandedContentsOnlyNonDefaults = expandHeadersOfGtrackFileAndReturnContents(testFn, case.genome, onlyNonDefault=True)
        print '-----'
        print 'With expanded headers (only non-default headers):\n\n' + expandedContentsOnlyNonDefaults
        # Expected values: all expandable headers of the original file ...
        origExpandableHeaders = dict([Gtrack.getHeaderKeyValue(line) for line in headerLines \
                                      if self._isExpandableHeader(line, onlyGuaranteed=False)])
        # ... and the remaining headers, except those in VALUE_NOT_KEPT_HEADERS.
        notExpandableHeaders = dict([Gtrack.getHeaderKeyValue(line) for line in case.headerLines \
                                     if self._isHeaderLine(line) and not self._isValueNotKeptHeader(line)])
        # Only the full expansion (onlyNonDefault=False) is used for header checks.
        expandedHeaders = dict([Gtrack.getHeaderKeyValue(line) for line in expandedContents.split(os.linesep) \
                                if self._isHeaderLine(line)])
        if 'no_check_expand' in caseName:
            print 'No checks for case: ' + caseName
        else:
            for header in origExpandableHeaders:
                self.assertEquals(origExpandableHeaders[header], expandedHeaders[header])
            for header in notExpandableHeaders:
                self.assertEquals(notExpandableHeaders[header], expandedHeaders[header])
        # NOTE(review): this loop is taken to run for every case, i.e. outside
        # the 'else' branch above - confirm against the original indentation.
        # Both expanded variants must parse to the expected elements and
        # bounding regions.
        for contents in [expandedContents, expandedContentsOnlyNonDefaults]:
            sourceClass = GenomeElementSource if case.sourceClass is None else case.sourceClass
            forPreProcessor = True if case.sourceClass is None else False
            stdGeSource = GEDependentAttributesHolder(sourceClass('expanded.gtrack', case.genome, \
                                                                  forPreProcessor=forPreProcessor, \
                                                                  printWarnings=False, \
                                                                  strToUseInsteadOfFn=contents))
            self.assertEquals(case.assertElementList, [ge for ge in stdGeSource])
            self.assertEquals(case.boundingRegionsAssertList, [br for br in stdGeSource.getBoundingRegionTuples()])
def _isValueNotKeptHeader(self, line): return self._isHeaderLine(line) and \ Gtrack.getHeaderKeyValue(line)[0] in VALUE_NOT_KEPT_HEADERS
def _isExpandableHeader(self, line, onlyGuaranteed): return self._isHeaderLine(line) and \ ( (Gtrack.getHeaderKeyValue(line)[0] in EXPANDABLE_HEADERS) or \ (not onlyGuaranteed and Gtrack.getHeaderKeyValue(line)[0] in NOT_GUARANTEED_EXPANDABLE_HEADERS) )
def _getHbColumnsFromGtrackColumns(self, columns): return [Gtrack.convertNameFromGtrack(col) for col in columns]
def _determineIfFileCompliesToSubtypes(self, hbColumns, columns): if "subtype url" in self._forcedHeaderDict: subtypeUrlList = ( [self._forcedHeaderDict["subtype url"]] if self._forcedHeaderDict["subtype url"] != "" else [] ) else: subtypeUrlList = self.GTRACK_PRIORITIZED_SUBTYPE_LIST for subtypeUrl in subtypeUrlList: subtypeGESource = Gtrack.getSubtypeGESource(subtypeUrl) subtypeColumns = subtypeGESource.getColumns(orig=False) subtypeHeaders = subtypeGESource.getHeaderDict() numRepeats = 2 if subtypeHeaders["subtype adherence"] == "redefinable" else 1 for repeat in range(numRepeats): self._setHeaderDict("1-indexed", subtypeHeaders["1-indexed"]) self._setHeaderDict("end inclusive", subtypeHeaders["end inclusive"]) if subtypeHeaders["subtype adherence"] in ["reorderable", "free"]: rearrangedColumns = columns rearrangedHbColumns = hbColumns else: colSet = set(columns) subtypeColSet = set(subtypeColumns) if subtypeHeaders["subtype adherence"] == "redefinable": colsRemoved = list(subtypeColSet - colSet) colsAdded = list(colSet - subtypeColSet) if len(colsRemoved) != len(colsAdded) or len(colsRemoved) > 2: continue colsRedefinedTo = ["value", "edges"] if repeat == 1 else ["edges", "value"] rearrangedColumns = [] i, j = (0, 0) for col in subtypeColumns: if col in colsRemoved: rearrangedColumns.append(colsRedefinedTo[i]) i += 1 elif col in colsRedefinedTo: rearrangedColumns.append(colsAdded[j]) j += 1 else: rearrangedColumns.append(col) for col in columns: if col in colsAdded[j:]: rearrangedColumns.append(col) else: rearrangedColumns = [x for x in subtypeColumns if x in colSet] + [ x for x in columns if x not in subtypeColSet ] rearrangedHbColumns = self._getHbColumnsFromGtrackColumns(rearrangedColumns) try: tempFile = StringIO() self._composeContents( tempFile, rearrangedHbColumns, rearrangedColumns, deepcopy(self._geSource), onlyNonDefault=True, singleDataLine=True, ) gtrackGESource = Gtrack( "subtype.test." 
+ self.getDefaultFileNameSuffix(), printWarnings=False, strToUseInsteadOfFn=tempFile.getvalue(), ) tempFile.close() if gtrackGESource.compliesWithSubtype(subtypeUrl): gtrackGESource._headerDict["subtype url"] = subtypeUrl gtrackGESource._updateHeadersAccordingToSubtype() updatedHeaders = OrderedDict( [ (key, val) for key, val in gtrackGESource.getHeaderDict().iteritems() if val != Gtrack.DEFAULT_HEADER_DICT.get(key) ] ) for header in updatedHeaders: self._setHeaderDict(header, updatedHeaders[header]) return rearrangedHbColumns, rearrangedColumns, True except Exception, e: continue
def _checkUndirectedEdges(self): if self._headerDict['track type'].startswith('linked'): try: GtrackGenomeElementSource._checkUndirectedEdges(self) except InvalidFormatError: self._headerDict['undirected edges'] = False
def _basicHandleEndOfFile(self):
    """Run only the end-of-file handling of GtrackGenomeElementSource.

    The _handleEndOfFile method in this file installs this method as the
    end-of-file handler of the iterator it creates for its second pass.
    """
    GtrackGenomeElementSource._handleEndOfFile(self)
def _createColumnSpec(self, cols, addAnyExtraFixedCols=True):
    """Create the column specification and derive the 'track type' header from it.

    The specification itself is built by
    GtrackGenomeElementSource._createColumnSpec; afterwards the track type
    computed from ``self._columnSpec`` is stored in the header dict.
    """
    GtrackGenomeElementSource._createColumnSpec(self, cols, addAnyExtraFixedCols)

    trackType = GtrackGenomeElementSource.getTrackTypeFromColumnSpec(self._columnSpec)
    self._headerDict['track type'] = trackType
def _getGtrackColumnsFromHbColumns(self, hbColumns): return [Gtrack.convertNameToGtrack(col) for col in hbColumns if col != "weights"]
def __init__(self, *args, **kwArgs):
    """Initialise through GtrackGenomeElementSource and reset the overlap state.

    All positional and keyword arguments are forwarded unchanged to
    GtrackGenomeElementSource.__init__.
    """
    GtrackGenomeElementSource.__init__(self, *args, **kwArgs)
    # NOTE(review): None presumably means "not determined yet" - confirm
    # against the code that reads _noOverlappingElements.
    self._noOverlappingElements = None