예제 #1
0
def _commonComplementGtrackFile(origFn, dbFn, intersectingFactor, gtrackColsToAdd, genome):
    """
    Complement the elements of one GTrack file with columns from a second one.

    Both files are opened as GtrackGenomeElementSource objects. The second
    ("db") file is loaded into a lookup structure keyed according to
    intersectingFactor, and the elements of the first file are wrapped in an
    ElementComplementer that receives that lookup structure together with
    gtrackColsToAdd. The result is composed to GTrack text and then passed
    through expandHeadersOfGtrackFileAndReturnComposer.

    Parameters:
        origFn: file name of the GTrack file whose elements are complemented.
        dbFn: file name of the GTrack file the extra columns are taken from.
        intersectingFactor: 'id' or 'position'; selects IdFullInfoDict or
            TupleFullInfoDict as the lookup structure.
        gtrackColsToAdd: collection of GTrack column names to add. If it
            contains 'value' and/or 'edges', the matching type/dimension
            headers of the db file are forced onto the composed output.
        genome: genome identifier handed on to the element sources.

    Returns:
        Whatever expandHeadersOfGtrackFileAndReturnComposer returns for the
        composed contents.

    Raises:
        ShouldNotOccurError: if intersectingFactor is neither 'id' nor
            'position'.
    """
    origGESource = GtrackGenomeElementSource(origFn, genome)
    dbGESource = GtrackGenomeElementSource(dbFn, genome)

    dbPrefixes = dbGESource.getPrefixList()

    if intersectingFactor == 'id':
        fullDbDict = IdFullInfoDict(dbGESource, dbPrefixes)
    elif intersectingFactor == 'position':
        fullDbDict = TupleFullInfoDict(dbGESource, dbPrefixes)
    else:
        # The exception used to be named without being raised, so execution
        # continued and failed further down on the unbound 'fullDbDict'.
        raise ShouldNotOccurError()

    forcedHeaderDict = {}
    dbHeaderDict = dbGESource.getHeaderDict()

    # Columns copied from the db file must keep the db file's type headers.
    if 'value' in gtrackColsToAdd:
        forcedHeaderDict['value type'] = dbHeaderDict['value type']
        forcedHeaderDict['value dimension'] = dbHeaderDict['value dimension']
    if 'edges' in gtrackColsToAdd:
        forcedHeaderDict['edge weight type'] = dbHeaderDict['edge weight type']
        forcedHeaderDict['edge weight dimension'] = dbHeaderDict['edge weight dimension']

    composerCls = ExtendedGtrackComposer if origGESource.isExtendedGtrackFile() else StdGtrackComposer
    complementedSource = ElementComplementer(origGESource, fullDbDict, gtrackColsToAdd)
    composedFile = composerCls(complementedSource, forcedHeaderDict=forcedHeaderDict).returnComposed()

    # The composed text is handed over in place of a file name.
    return expandHeadersOfGtrackFileAndReturnComposer('', genome, strToUseInsteadOfFn=composedFile)
예제 #2
0
    def _determineHeaderLines(self, hbColumns, columns):
        """
        Fill in the header dictionary for the file that is being composed.

        Header values are taken from the wrapped genome element source and
        registered through self._setHeaderDict. When extended GTrack is in
        use, the columns may be adjusted and matched against subtypes.
        Finally, forced headers that are still absent from the header
        dictionary are copied in.

        Returns the (possibly rearranged) pair (hbColumns, columns).
        """
        setHeader = self._setHeaderDict

        # Headers that are always determined.
        setHeader("track type", Gtrack.getTrackTypeFromColumnSpec(columns))
        setHeader("value type", self._getGtrackValueType())
        setHeader("value dimension", Gtrack.getGtrackValueDimension(self._geSource.getValDim()))
        setHeader("undirected edges", self._geSource.hasUndirectedEdges())
        setHeader("edge weights", ("weights" in hbColumns))
        setHeader("edge weight type", self._getGtrackEdgeWeightType())
        setHeader("edge weight dimension", Gtrack.getGtrackValueDimension(self._geSource.getEdgeWeightDim()))
        setHeader("uninterrupted data lines", not self._hasMoreThanOneBoundingRegion())
        setHeader("sorted elements", self._geSource.isSorted())
        setHeader("no overlapping elements", self._geSource.hasNoOverlappingElements())
        setHeader("circular elements", self._geSource.hasCircularElements())

        # Headers that are only determined for extended GTrack.
        matchesSubtype = False
        if self._USE_EXTENDED_GTRACK:
            setHeader("fixed length", self._geSource.getFixedLength())
            setHeader("fixed gap size", self._geSource.getFixedGapSize())
            setHeader("fixed-size data lines", self._determineIfFixedSizeDataLines(columns))
            if self._headerDict["fixed-size data lines"]:
                setHeader("data line size", self._geSource.getValDim())

            hbColumns, columns = self._adjustColumnsAccordingToHeaderLines(hbColumns, columns)
            hbColumns, columns, matchesSubtype = self._determineIfFileCompliesToSubtypes(hbColumns, columns)

        # Without a matching subtype the indexing headers come from the source.
        if not matchesSubtype:
            setHeader("1-indexed", self._geSource.inputIsOneIndexed())
            setHeader("end inclusive", self._geSource.inputIsEndInclusive())

        # Forced headers only fill gaps; headers already present are kept.
        for forcedHeader, forcedVal in self._forcedHeaderDict.iteritems():
            if forcedHeader in self._headerDict:
                continue
            self._headerDict[forcedHeader] = forcedVal

        return hbColumns, columns
예제 #3
0
 def _checkValidEnd(self, chr, end, start=None):
     """
     Validate an end position, switching on 'circular elements' when needed.

     The first time an end at or before its start is seen while the
     'circular elements' header is still false, the header is set to True
     and the start is dropped (None) before the check is delegated to
     GtrackGenomeElementSource._checkValidEnd. In all other cases the
     arguments are passed on unchanged.
     """
     endNotAfterStart = start is not None and end <= start
     if endNotAfterStart and not self._headerDict['circular elements']:
         self._headerDict['circular elements'] = True
         start = None

     return GtrackGenomeElementSource._checkValidEnd(self, chr, end, start)
예제 #4
0
 def _parseEdges(self, edgeStr):
     """
     Inspect the edge weights of an edge string, then parse it as usual.

     For every ';'-separated edge specification that contains '=', the
     'edge weights' header is switched on (if not already) and the text
     after the first '=' is run through self._getValInCorrectType as an
     'edge weight'. An edge string equal to '.' is not inspected. The
     actual parsing is delegated to GtrackGenomeElementSource._parseEdges.
     """
     edgeSpecs = edgeStr.split(';') if edgeStr != '.' else []

     for spec in edgeSpecs:
         if '=' not in spec:
             continue

         if not self._headerDict['edge weights']:
             self._headerDict['edge weights'] = True

         weightStr = spec.split('=')[1]
         self._getValInCorrectType(weightStr, 'edge weight')

     return GtrackGenomeElementSource._parseEdges(self, edgeStr)
예제 #5
0
 def __init__(self, geSource, fullDbDict, gtrackColsToAdd):
     """
     Wrap geSource and register the prefixes that are to be added.

     The GTrack column names in gtrackColsToAdd are converted with
     GtrackGenomeElementSource.convertNameFromGtrack. If 'edges' is among
     the converted names, 'weights' is added as well. The resulting prefix
     list is the one of geSource followed by the added prefixes.
     """
     addedPrefixes = []
     for gtrackCol in gtrackColsToAdd:
         addedPrefixes.append(GtrackGenomeElementSource.convertNameFromGtrack(gtrackCol))

     # Edges are accompanied by their weights.
     if 'edges' in addedPrefixes:
         addedPrefixes.append('weights')

     self._prefixesToAdd = addedPrefixes

     ElementModifierGESourceWrapper.__init__(self, geSource)

     self._fullDbDict = fullDbDict
     self._prefixList = geSource.getPrefixList() + self._prefixesToAdd
예제 #6
0
 def _iter(self):
     """
     Reset the per-iteration bookkeeping and start iterating.

     The value type index, value length and all-missing dictionaries are
     emptied. 'sorted elements' is set to True, and for track types
     starting with 'linked' also 'undirected edges'. Iteration itself is
     delegated to GtrackGenomeElementSource._iter.
     """
     self._valTypeIndexDict, self._valLenDict, self._allMissingDict = {}, {}, {}

     headers = self._headerDict
     # NOTE: an assignment setting 'no overlapping elements' to True was
     # commented out at this point in the original code; it stays disabled.
     headers['sorted elements'] = True

     trackType = headers['track type']
     if trackType.startswith('linked'):
         headers['undirected edges'] = True

     return GtrackGenomeElementSource._iter(self)
예제 #7
0
 def _getGtrackValueDim(self, val, valTypeInfo, valueOrEdgeWeight):
     """
     Return the GTrack value dimension implied by the values seen so far.

     The length of val is the number of parts after splitting on
     valTypeInfo.delim, or the length of val itself when the delimiter is
     the empty string. The first length seen for valueOrEdgeWeight is
     remembered in self._valLenDict; once a differing length is seen the
     remembered length becomes 0. The remembered length is converted with
     GtrackGenomeElementSource.getGtrackValueDimension.
     """
     if valTypeInfo.delim != '':
         numParts = len(val.split(valTypeInfo.delim))
     else:
         numParts = len(val)

     seenLengths = self._valLenDict
     if valueOrEdgeWeight not in seenLengths:
         seenLengths[valueOrEdgeWeight] = numParts
     elif seenLengths[valueOrEdgeWeight] != numParts:
         # Lengths differ between elements.
         seenLengths[valueOrEdgeWeight] = 0

     return GtrackGenomeElementSource.getGtrackValueDimension(seenLengths[valueOrEdgeWeight])
예제 #8
0
 def _handleEndOfFile(self):
     """
     Finish the first pass and run a second pass to settle the headers.

     After the regular end-of-file handling, a new iterator is obtained from
     self.__iter__() and exhausted. It shares the value type indexes found so
     far, and its value lengths and all-missing flags are then used to update
     this object's state and headers.
     """
     GtrackGenomeElementSource._handleEndOfFile(self)
     
     # Fixes an issue where the value dimension ends up as "list" if the value
     # type was wrongly guessed for early elements.
     
     newIter = self.__iter__()
     # Share the value type indexes determined so far with the new iterator.
     newIter._valTypeIndexDict = self._valTypeIndexDict
     # Replace this method on the new iterator with _basicHandleEndOfFile, so
     # that its end-of-file handling does not go through this method again.
     newIter._handleEndOfFile = newIter._basicHandleEndOfFile
     
     # Exhaust the new iterator with explicit next() calls (Python 2 iterator
     # protocol); only its side effects are of interest.
     try:
         while True:
             newIter.next()
     except StopIteration:
         pass
     
     # Take over the value lengths recorded by the new iterator.
     self._valLenDict = newIter._valLenDict
     if len(self._uniqueEdgeIds) == 0:
         self._headerDict['undirected edges'] = False
     
     # Where the new iterator flagged every value as missing, the type header
     # is set to 'number'.
     for valueOrEdgeWeight in ['value', 'edge weight']:
         if valueOrEdgeWeight in newIter._allMissingDict and newIter._allMissingDict[valueOrEdgeWeight] == True:
             self._headerDict['%s type' % valueOrEdgeWeight] = 'number'
예제 #9
0
    def _composeBoundingRegionLine(self, boundingRegionTuple):
        """
        Build the '####' bounding region line for a bounding region tuple.

        Works on a copy of the region. With the "1-indexed" header set, start
        and end are increased by one; with the "end inclusive" header set,
        end is decreased by one. Positions that are None stay None. The
        attributes genome, chr, start and end that are not None are written
        as 'name=value' pairs joined with '; ', followed by os.linesep.
        """

        def shifted(position, delta):
            # Leave missing positions untouched.
            return position + delta if position is not None else None

        region = copy(boundingRegionTuple.region)

        if self._headerDict["1-indexed"]:
            region.start = shifted(region.start, 1)
            region.end = shifted(region.end, 1)
        if self._headerDict["end inclusive"]:
            region.end = shifted(region.end, -1)

        namedValues = []
        for attr in ["genome", "chr", "start", "end"]:
            namedValues.append((Gtrack.convertNameToGtrack(attr), getattr(region, attr)))

        renderedParts = []
        for gtrackName, attrVal in namedValues:
            if attrVal is None:
                continue
            encodedVal = self._formatPhraseWithCorrectChrUsage(
                str(attrVal), useUrlEncoding=True, notAllowedChars="=;#\t"
            )
            renderedParts.append(gtrackName + "=" + encodedVal)

        return "####" + "; ".join(renderedParts) + os.linesep
예제 #10
0
    def _getValInCorrectType(self, val, valueOrEdgeWeight='value', isEmptyElement=False):
        """
        Guess the type of val, record it in the headers and convert val.

        The candidate types 'binary', 'number', 'category' and 'character'
        are tried in that order, skipping candidates that come before the
        type index already recorded for valueOrEdgeWeight. For the first
        candidate that val fits, the type index is recorded, the value
        dimension is determined, and the '<valueOrEdgeWeight> type' and
        '<valueOrEdgeWeight> dimension' headers are set unless they are
        present in the headers of the file itself. The conversion is
        delegated to GtrackGenomeElementSource._getValInCorrectType.

        Raises ShouldNotOccurError if val fits none of the candidates.
        """
        headerDictInFile = self.getHeaderDictInFile()
        typeHeader = '%s type' % valueOrEdgeWeight
        dimHeader = '%s dimension' % valueOrEdgeWeight

        candidateTypes = ['binary', 'number', 'category', 'character']
        for typeIndex, typeName in enumerate(candidateTypes):
            # A type guessed for earlier values is never narrowed again.
            if (valueOrEdgeWeight in self._valTypeIndexDict
                    and self._valTypeIndexDict[valueOrEdgeWeight] > typeIndex):
                continue

            typeInfo = GtrackGenomeElementSource.VAL_TYPE_DICT[typeName]
            if not self._isValOfParticularType(val, typeInfo):
                continue

            self._noteIfAllValuesAreMissing(valueOrEdgeWeight, val, typeInfo)
            self._valTypeIndexDict[valueOrEdgeWeight] = typeIndex

            dimension = self._getGtrackValueDim(val, typeInfo, valueOrEdgeWeight)

            # Headers given in the file itself take precedence over guesses.
            if typeHeader not in headerDictInFile:
                self._headerDict[typeHeader] = typeName
            if dimHeader not in headerDictInFile:
                self._headerDict[dimHeader] = dimension

            return GtrackGenomeElementSource._getValInCorrectType(self, val, valueOrEdgeWeight, isEmptyElement)

        raise ShouldNotOccurError()
    def testHeaderExpansion(self):
        """
        Check that removed GTrack headers are restored by header expansion.

        For every test case whose name starts with 'gtrack' (except those
        containing 'no_expand'), the expandable headers are removed from the
        case, the case is written to a test file and the headers are
        expanded again, both with all headers and with only non-default
        headers. Unless the case name contains 'no_check_expand', the
        expanded headers are compared with the original ones, and the
        expanded contents are parsed and compared with the expected elements
        and bounding regions of the case.
        """
        geSourceTest = self._commonSetup()

        for caseName in geSourceTest.cases:
            if not caseName.startswith('gtrack'):
                continue

            if 'no_expand' in caseName:
                print('Test case skipped: ' + caseName)
                continue

            # Cases named 'no_types_expanded' only have the guaranteed
            # expandable headers removed.
            onlyGuaranteed = 'no_types_expanded' in caseName

            print(caseName)
            print('===========')
            case = geSourceTest.cases[caseName]

            # Normalise header lines to lower-case '##key: value' form.
            headerLines = [line if not self._isHeaderLine(line) else
                           '##' + ': '.join([str(x).lower() for x in Gtrack.getHeaderKeyValue(line.strip())])
                           for line in case.headerLines]

            fullContents = os.linesep.join(headerLines + case.lines)
            print('Original:\n\n' + fullContents)

            case.headerLines = [line for line in headerLines if not self._isExpandableHeader(line, onlyGuaranteed)]
            print('-----')
            print('With headers removed:\n\n' + os.linesep.join(case.headerLines + case.lines))

            testFn = self._writeTestFile(case)

            expandedContents = expandHeadersOfGtrackFileAndReturnContents(testFn, case.genome, onlyNonDefault=False)

            print('-----')
            print('With expanded headers:\n\n' + expandedContents)

            expandedContentsOnlyNonDefaults = expandHeadersOfGtrackFileAndReturnContents(testFn, case.genome, onlyNonDefault=True)

            print('-----')
            print('With expanded headers (only non-default headers):\n\n' + expandedContentsOnlyNonDefaults)

            origExpandableHeaders = dict([Gtrack.getHeaderKeyValue(line) for line in headerLines
                                          if self._isExpandableHeader(line, onlyGuaranteed=False)])
            notExpandableHeaders = dict([Gtrack.getHeaderKeyValue(line) for line in case.headerLines
                                         if self._isHeaderLine(line) and not self._isValueNotKeptHeader(line)])
            expandedHeaders = dict([Gtrack.getHeaderKeyValue(line) for line in expandedContents.split(os.linesep)
                                    if self._isHeaderLine(line)])

            if 'no_check_expand' in caseName:
                print('No checks for case: ' + caseName)
                continue

            # assertEqual replaces the deprecated assertEquals alias.
            for header in origExpandableHeaders:
                self.assertEqual(origExpandableHeaders[header], expandedHeaders[header])
            for header in notExpandableHeaders:
                self.assertEqual(notExpandableHeaders[header], expandedHeaders[header])

            # Both values are the same for the two kinds of expanded contents.
            sourceClass = GenomeElementSource if case.sourceClass is None else case.sourceClass
            forPreProcessor = case.sourceClass is None

            for contents in [expandedContents, expandedContentsOnlyNonDefaults]:
                stdGeSource = GEDependentAttributesHolder(sourceClass('expanded.gtrack', case.genome,
                                                                      forPreProcessor=forPreProcessor,
                                                                      printWarnings=False,
                                                                      strToUseInsteadOfFn=contents))

                self.assertEqual(case.assertElementList, [ge for ge in stdGeSource])
                self.assertEqual(case.boundingRegionsAssertList, [br for br in stdGeSource.getBoundingRegionTuples()])
 def _isValueNotKeptHeader(self, line):
     """
     Tell whether line is a header line whose key is in VALUE_NOT_KEPT_HEADERS.

     For lines that are not header lines, the (falsy) result of
     self._isHeaderLine is returned as is.
     """
     isHeader = self._isHeaderLine(line)
     if not isHeader:
         return isHeader

     headerKey = Gtrack.getHeaderKeyValue(line)[0]
     return headerKey in VALUE_NOT_KEPT_HEADERS
 def _isExpandableHeader(self, line, onlyGuaranteed):
     """
     Tell whether line is a header line with an expandable header key.

     Keys in EXPANDABLE_HEADERS always count. Keys in
     NOT_GUARANTEED_EXPANDABLE_HEADERS only count when onlyGuaranteed is
     false. For lines that are not header lines, the (falsy) result of
     self._isHeaderLine is returned as is.
     """
     isHeader = self._isHeaderLine(line)
     if not isHeader:
         return isHeader

     headerKey = Gtrack.getHeaderKeyValue(line)[0]
     if headerKey in EXPANDABLE_HEADERS:
         return True

     return not onlyGuaranteed and headerKey in NOT_GUARANTEED_EXPANDABLE_HEADERS
예제 #14
0
 def _getHbColumnsFromGtrackColumns(self, columns):
     """
     Convert each GTrack column name with Gtrack.convertNameFromGtrack.

     Returns a new list in the same order as columns.
     """
     hbColumns = []
     for gtrackCol in columns:
         hbColumns.append(Gtrack.convertNameFromGtrack(gtrackCol))
     return hbColumns
예제 #15
0
    def _determineIfFileCompliesToSubtypes(self, hbColumns, columns):
        """
        Find the first GTrack subtype that the composed file complies with.

        The candidate subtypes are the forced "subtype url" header if one is
        given (none at all if it is the empty string), otherwise
        self.GTRACK_PRIORITIZED_SUBTYPE_LIST. For each candidate the columns
        are rearranged according to the subtype's "subtype adherence", a
        single data line is composed into a temporary buffer and parsed again
        as a Gtrack source, and that source is asked whether it complies with
        the subtype. On the first match, the non-default headers of the
        parsed source are registered through self._setHeaderDict.

        Note that the "1-indexed" and "end inclusive" headers are set from
        every candidate that is tried, also when no candidate matches in the
        end.

        Parameters:
            hbColumns: the column names in internal form.
            columns: the corresponding GTrack column names.

        Returns:
            A tuple (hbColumns, columns, compliesToSubtype). On a match the
            rearranged columns and True are returned; otherwise the columns
            are returned as given together with False.
        """
        if "subtype url" in self._forcedHeaderDict:
            subtypeUrlList = (
                [self._forcedHeaderDict["subtype url"]] if self._forcedHeaderDict["subtype url"] != "" else []
            )
        else:
            subtypeUrlList = self.GTRACK_PRIORITIZED_SUBTYPE_LIST

        for subtypeUrl in subtypeUrlList:
            subtypeGESource = Gtrack.getSubtypeGESource(subtypeUrl)
            subtypeColumns = subtypeGESource.getColumns(orig=False)
            subtypeHeaders = subtypeGESource.getHeaderDict()

            # Redefinable subtypes are tried twice, once for each order of
            # the redefinition targets ("edges"/"value" and "value"/"edges").
            numRepeats = 2 if subtypeHeaders["subtype adherence"] == "redefinable" else 1

            for repeat in range(numRepeats):
                self._setHeaderDict("1-indexed", subtypeHeaders["1-indexed"])
                self._setHeaderDict("end inclusive", subtypeHeaders["end inclusive"])

                if subtypeHeaders["subtype adherence"] in ["reorderable", "free"]:
                    rearrangedColumns = columns
                    rearrangedHbColumns = hbColumns
                else:
                    colSet = set(columns)
                    subtypeColSet = set(subtypeColumns)

                    if subtypeHeaders["subtype adherence"] == "redefinable":
                        colsRemoved = list(subtypeColSet - colSet)
                        colsAdded = list(colSet - subtypeColSet)
                        if len(colsRemoved) != len(colsAdded) or len(colsRemoved) > 2:
                            continue

                        colsRedefinedTo = ["value", "edges"] if repeat == 1 else ["edges", "value"]

                        # NOTE(review): colsAdded[j] below is indexed outside
                        # the try block, so an IndexError raised here would
                        # propagate to the caller - confirm that this cannot
                        # happen for the supported subtypes.
                        rearrangedColumns = []
                        i, j = (0, 0)
                        for col in subtypeColumns:
                            if col in colsRemoved:
                                rearrangedColumns.append(colsRedefinedTo[i])
                                i += 1
                            elif col in colsRedefinedTo:
                                rearrangedColumns.append(colsAdded[j])
                                j += 1
                            else:
                                rearrangedColumns.append(col)

                        # Added columns that were not placed above go last.
                        for col in columns:
                            if col in colsAdded[j:]:
                                rearrangedColumns.append(col)
                    else:
                        # Subtype columns first (in subtype order), followed
                        # by the columns the subtype does not know about.
                        rearrangedColumns = [x for x in subtypeColumns if x in colSet] + [
                            x for x in columns if x not in subtypeColSet
                        ]
                    rearrangedHbColumns = self._getHbColumnsFromGtrackColumns(rearrangedColumns)

                try:
                    tempFile = StringIO()
                    self._composeContents(
                        tempFile,
                        rearrangedHbColumns,
                        rearrangedColumns,
                        deepcopy(self._geSource),
                        onlyNonDefault=True,
                        singleDataLine=True,
                    )

                    gtrackGESource = Gtrack(
                        "subtype.test." + self.getDefaultFileNameSuffix(),
                        printWarnings=False,
                        strToUseInsteadOfFn=tempFile.getvalue(),
                    )
                    tempFile.close()

                    if gtrackGESource.compliesWithSubtype(subtypeUrl):
                        gtrackGESource._headerDict["subtype url"] = subtypeUrl
                        gtrackGESource._updateHeadersAccordingToSubtype()
                        updatedHeaders = OrderedDict(
                            [
                                (key, val)
                                for key, val in gtrackGESource.getHeaderDict().iteritems()
                                if val != Gtrack.DEFAULT_HEADER_DICT.get(key)
                            ]
                        )
                        for header in updatedHeaders:
                            self._setHeaderDict(header, updatedHeaders[header])

                        return rearrangedHbColumns, rearrangedColumns, True
                except Exception:
                    # Best effort: a candidate that cannot be composed or
                    # parsed simply does not match; try the next one.
                    continue

        # No candidate matched. The caller unpacks three values, so the
        # result must be an explicit tuple rather than an implicit None.
        return hbColumns, columns, False
예제 #16
0
 def _checkUndirectedEdges(self):
     """
     Turn off 'undirected edges' if the parent check rejects the edges.

     Only applies to track types starting with 'linked'. An
     InvalidFormatError from GtrackGenomeElementSource._checkUndirectedEdges
     is caught and results in the 'undirected edges' header being set to
     False.
     """
     if not self._headerDict['track type'].startswith('linked'):
         return

     try:
         GtrackGenomeElementSource._checkUndirectedEdges(self)
     except InvalidFormatError:
         self._headerDict['undirected edges'] = False
예제 #17
0
 def _basicHandleEndOfFile(self):
     """Run only the end-of-file handling of GtrackGenomeElementSource."""
     # Calls the parent implementation directly, bypassing any
     # _handleEndOfFile defined on this class.
     GtrackGenomeElementSource._handleEndOfFile(self)
예제 #18
0
 def _createColumnSpec(self, cols, addAnyExtraFixedCols=True):
     """
     Create the column specification and derive the track type from it.

     After delegating to GtrackGenomeElementSource._createColumnSpec, the
     'track type' header is set to the track type that
     GtrackGenomeElementSource.getTrackTypeFromColumnSpec returns for
     self._columnSpec.
     """
     GtrackGenomeElementSource._createColumnSpec(self, cols, addAnyExtraFixedCols)

     derivedTrackType = GtrackGenomeElementSource.getTrackTypeFromColumnSpec(self._columnSpec)
     self._headerDict['track type'] = derivedTrackType
예제 #19
0
 def _getGtrackColumnsFromHbColumns(self, hbColumns):
     """
     Convert column names with Gtrack.convertNameToGtrack, skipping "weights".

     Returns a new list in the same order as hbColumns.
     """
     gtrackColumns = []
     for hbCol in hbColumns:
         if hbCol == "weights":
             # "weights" has no GTrack column of its own in the result.
             continue
         gtrackColumns.append(Gtrack.convertNameToGtrack(hbCol))
     return gtrackColumns
예제 #20
0
    def __init__(self, *args, **kwArgs):
        """Initialise the parent source and reset the overlap flag.

        All arguments are passed on unchanged to
        GtrackGenomeElementSource.__init__.
        """
        GtrackGenomeElementSource.__init__(self, *args, **kwArgs)

        # Starts as None; presumably filled in later once the overlap status
        # is known - TODO confirm against the rest of the class.
        self._noOverlappingElements = None