Пример #1
0
    def scan_dirs(self, dirs):
        """
        Recursively scan paths for data files and register each one found.

        Each entry of ``dirs`` is either a directory (every entry directly
        inside it is examined) or a glob pattern / file path.

        * ``.json`` files are parsed; when the result is an object holding a
          ``type`` key and a non-null ``name``, the file is registered with
          ``self.addFile(type, name, path)``.  Unparsable files are reported
          through ``CGData.error`` and skipped.
        * ``.cgz`` archives are listed with ``CGData.CGZ.list`` and every
          member is registered with the archive path as ``zipFile``.
        * Sub-directories are scanned recursively.
        """
        for directory in dirs:
            CGData.log("SCANNING DIR: %s" % (directory))
            if os.path.isdir(directory):
                filePath = os.path.join(directory, "*")
            else:
                filePath = directory

            for path in glob(filePath):
                if os.path.isfile(path):
                    if path.endswith(".json"):
                        data = None
                        # The context manager closes the handle even when
                        # reading or parsing fails.
                        with open(path) as handle:
                            try:
                                data = json.loads(handle.read())
                            except ValueError as e:
                                CGData.error("BAD JSON in " + path + " " + str(e))

                        # Only JSON objects can carry the name/type keys; a
                        # top-level list, string or number is ignored.
                        if (isinstance(data, dict)
                                and data.get('name') is not None
                                and 'type' in data):
                            self.addFile(data['type'], data['name'], path)

                    # The suffix test used to be "*.cgz"; str.endswith does
                    # no glob matching, so archives were never picked up.
                    if path.endswith(".cgz"):
                        cgzList = CGData.CGZ.list(path)
                        for ftype in cgzList:
                            for zPath in cgzList[ftype]:
                                self.addFile(ftype, cgzList[ftype][zPath], zPath, path)
                if os.path.isdir(path):
                    self.scan_dirs([path])
Пример #2
0
    def gen_sql_heatmap(self, id_table, opts):
        """
        Yield the heatmap SQL of the clinical matrix after applying the
        feature metadata of this track.

        The ``valueType`` entries of the clinical features are handed to
        ``matrix.feature_type_setup``.  For every matrix column that has a
        ``stateOrder`` feature, the column's ``enum_map`` is renumbered so the
        listed states come first, in the listed order, followed by any other
        states already present in the map.  ``opts`` is accepted but not used
        here.
        """
        CGData.log("ClincalTrack SQL " + self.get_name())

        features = self.members["clinicalFeature"].features
        matrix = self.members["clinicalMatrix"]

        # e.g. { 'HER2+': 'category', ...}
        explicit_types = {}
        for fname in features:
            if 'valueType' in features[fname]:
                explicit_types[fname] = features[fname]['valueType']

        matrix.feature_type_setup(explicit_types)
        for column in matrix.col_list:
            if column not in features or "stateOrder" not in features[column]:
                continue

            # stateOrder is fed to csv.reader; the first parsed row is the
            # ordered list of states.
            parsed = list(csv.reader(features[column]["stateOrder"],
                                     skipinitialspace=True))
            states = parsed[0]

            # position of the first occurrence of each state
            position = {}
            for idx, state in enumerate(states):
                if state not in position:
                    position[state] = idx

            codes = matrix.enum_map[column]

            # do not drop states in stateOrder
            for state in states:
                codes[state] = position[state]

            # states missing from stateOrder are numbered after the listed ones
            overflow = 0
            for state in codes:
                if state in position:
                    codes[state] = position[state]
                else:
                    codes[state] = len(states) + overflow
                    overflow += 1

        for statement in matrix.gen_sql_heatmap(id_table, features=features):
            yield statement
Пример #3
0
class SampleMap(CGData.CGDataSetObject):
    """
    Sample relationship table.

    ``read`` parses tab-separated lines into ``sample_hash``: the first field
    is the key, and the second field (when present) is recorded beneath it.
    ``get_children`` treats the recorded names as children of the key.
    """

    DATA_FORM = CGData.TABLE

    COLS = [
        CGData.Column('node_name', str, primary_key=True),
        CGData.Column('parent', str),
        CGData.Column('child', str)
    ]

    def __init__(self):
        CGData.CGDataSetObject.__init__(self)
        self.mhash = {}
        # read() and get_children() operate on sample_hash, which this class
        # never created itself.
        self.sample_hash = {}

    def read(self, handle):
        """Load the map from an iterable of tab-separated lines."""
        for line in handle:
            tmp = line.rstrip().split('\t')
            if tmp[0] not in self.sample_hash:
                self.sample_hash[tmp[0]] = {}
            if len(tmp) > 1:
                self.sample_hash[tmp[0]][tmp[1]] = True

    def get_children(self, sample):
        """
        Return the names of all direct and indirect children of ``sample``.

        The walk keeps track of visited names, so a cyclic map terminates
        instead of recursing without bound.
        """
        out = {}
        pending = list(self.sample_hash.get(sample, {}))
        while pending:
            node = pending.pop()
            if node in out:
                continue
            out[node] = True
            pending.extend(self.sample_hash.get(node, {}))
        return list(out)
Пример #4
0
    def gen_sql_heatmap(self, id_table, opts):
        """
        Generate the clinical matrix heatmap SQL for this track.

        Explicit ``valueType`` declarations from the clinical features are
        passed to ``matrix.feature_type_setup``.  Columns that declare a
        ``stateOrder`` get their ``enum_map`` codes reassigned: listed states
        take their position in the list, remaining states are appended after
        them.  The statements produced by ``matrix.gen_sql_heatmap`` are then
        yielded unchanged.  ``opts`` is not consulted.
        """
        CGData.log("ClincalTrack SQL " + self.get_name())

        features = self.members["clinicalFeature"].features
        matrix = self.members["clinicalMatrix"]

        # e.g. { 'HER2+': 'category', ...}
        typed = [f for f in features if 'valueType' in features[f]]
        explicit_types = dict((f, features[f]['valueType']) for f in typed)

        matrix.feature_type_setup(explicit_types)
        for col_name in matrix.col_list:
            has_order = (col_name in features
                         and "stateOrder" in features[col_name])
            if not has_order:
                continue

            rows = list(csv.reader(features[col_name]["stateOrder"],
                                   skipinitialspace=True))
            ordered_states = rows[0]

            # rank of the first occurrence of every listed state
            rank = {}
            for pos, state in enumerate(ordered_states):
                rank.setdefault(state, pos)

            enum_codes = matrix.enum_map[col_name]

            # do not drop states in stateOrder
            for state in ordered_states:
                enum_codes[state] = rank[state]

            # unlisted states follow the listed ones, in map iteration order
            unlisted = 0
            for state in enum_codes:
                if state in rank:
                    enum_codes[state] = rank[state]
                else:
                    enum_codes[state] = len(ordered_states) + unlisted
                    unlisted += 1

        for sql in matrix.gen_sql_heatmap(id_table, features=features):
            yield sql
Пример #5
0
 def gen_sql(self, id_table):
     """
     Yield the SQL of the clinical matrix after ordering its enum states.

     For every clinical feature carrying a ``stateOrder`` entry, the first
     element of that entry is split on commas (a backslash-escaped comma
     stays part of the state name) and the matching ``matrix.enum_map``
     column is renumbered: listed states get their list position, all other
     states are numbered after them.  The statements produced by
     ``matrix.gen_sql`` are yielded unchanged.
     """
     CGData.log("ClincalTrack SQL " + self.get_name())

     matrix = self.members["clinicalMatrix"]
     matrix.feature_type_setup()
     features = self.members["clinicalFeature"]
     #print features
     for a in features:
         if "stateOrder" in features[a]:
             #print features[a]["stateOrder"][0]

             # Split on ',' while respecting '\,'.  Because the pattern has a
             # capturing group, re.split returns the text pieces interleaved
             # with the captured character (the one right before each comma):
             # even positions are text, odd positions are that character.
             # If you can think of a better way, please replace this.
             tmp = re.split(r'([^,]),', features[a]["stateOrder"][0])
             enums = []
             word = True        # True when tmp[e] is a text piece
             appending = False  # True while continuing a state after '\,'
             e = 0
             while e < len(tmp):
                 if word:
                     if appending:
                         # continuation of the state that had an escaped comma
                         enums[-1] += tmp[e]
                     else:
                         enums.append(tmp[e])
                     word = False
                 else:
                     if tmp[e] != "\\":
                         # ordinary character before a separator comma: it
                         # belongs to the end of the current state
                         enums[-1] += tmp[e]
                         appending = False
                     else:
                         # backslash before the comma: keep a literal comma
                         # and glue the next piece onto the same state
                         enums[-1] += ","
                         appending = True
                     word = True
                 e += 1

             #print tmp
             #print enums
             #print matrix.enum_map[a]
             # Renumber: listed states by position, the rest after them.
             i = 0
             for e in matrix.enum_map[a]:
                 if e in enums:
                     matrix.enum_map[a][e] = enums.index(e)
                 else:
                     matrix.enum_map[a][e] = len(enums) + i
                     i += 1
             #print matrix.enum_map[a]
             #print "-=-=-=-=-"
     # feature setup already ran above, so tell the matrix to skip it
     for a in matrix.gen_sql(id_table, skip_feature_setup=True):
         yield a
Пример #6
0
    def gen_sql_heatmap(self, id_table, skip_feature_setup=False):
        CGData.log( "Writing Clinical %s SQL" % (self.attrs['name']))
        
        if not skip_feature_setup:
            self.feature_type_setup()

        table_name = self.attrs['name']

        yield "drop table if exists clinical_%s;" % ( table_name )

        yield """
CREATE TABLE clinical_%s (
\tsampleID int,
\tsampleName ENUM ('%s')""" % ( table_name, "','".join(sortedSamples(self.row_hash.keys())) )

        for col in self.col_order:
            if ( self.enum_map.has_key( col ) ):
                yield ",\n\t`%s` ENUM( '%s' ) default NULL" % (col.strip(), "','".join( sql_fix(a) for a in sorted(self.enum_map[ col ].keys(), lambda x,y: self.enum_map[col][x]-self.enum_map[col][y]) ) )
            else:
                yield ",\n\t`%s` FLOAT default NULL" % (col.strip())
        yield """
    ) engine 'MyISAM';
    """

        for target in sortedSamples(self.row_hash.keys()):
            a = []
            for col in self.orig_order:
                val = self.row_hash[ target ][ self.col_list[ col ] ]
                if val is None or val == "null" or len(val) == 0 :
                    a.append("\\N")
                else:
                    a.append( "'" + sql_fix( val.encode('string_escape') ) + "'" )
            yield u"INSERT INTO clinical_%s VALUES ( %d, '%s', %s );\n" % ( table_name, id_table.get( table_name + ':sample_id', target ), sql_fix(target), u",".join(a) )


        yield "drop table if exists clinical_%s_colDb;" % ( table_name )
        yield CREATE_COL_DB % ( "clinical_" + table_name + "_colDb" )

        yield "INSERT INTO clinical_%s_colDb(name, shortLabel,longLabel,valField,clinicalTable,filterType,visibility,priority) VALUES( '%s', '%s', '%s', '%s', '%s', '%s', 'on',1);\n" % \
                ( table_name, 'sampleName', 'sample name', 'sample name', 'sampleName', "clinical_" + table_name, 'coded' )

        i = 0;
        for name in self.col_order:
            filter = 'coded' if self.enum_map.has_key(name) else 'minMax'
            yield "INSERT INTO clinical_%s_colDb(name, shortLabel,longLabel,valField,clinicalTable,filterType,visibility,priority) VALUES( '%s', '%s', '%s', '%s', '%s', '%s', '%s',1);\n" % \
                    ( table_name, name, name, name, name, "clinical_" + table_name, filter, 'on' if i < 10 else 'off')
            i += 1
Пример #7
0
class DataSubType(CGData.CGDataSetObject):
    """Table-form data set whose only declared column is the primary key ``name``."""

    DATA_FORM = CGData.TABLE
    COLS = [CGData.Column('name', str, primary_key=True)]

    def __init__(self):
        """Run the base-class initialisation; no extra state is added."""
        CGData.CGDataSetObject.__init__(self)
Пример #8
0
    def addFile(self, type, name, path, zipFile=None):
        """
        Register the file at ``path`` under ``self[type][name]``.

        Types rejected by ``CGData.has_type`` only produce a warning.  A
        name already registered for the type is reported through
        ``CGData.error`` before the new entry, loaded with
        ``CGData.light_load(path, zipFile)``, is stored.
        """
        if not CGData.has_type(type):
            CGData.warn("Unknown file type: %s" % (path))
            return

        if type not in self:
            self[type] = {}

        duplicate = name in self[type]
        if duplicate:
            CGData.error("Duplicate %s file %s" % (type, name))

        self[type][name] = CGData.light_load(path, zipFile)
        CGData.log("FOUND: " + type + "\t" + name + "\t" + path)
Пример #9
0
    def addFile(self, type, name, path, zipFile=None):
        """
        Record a discovered file, keyed first by data type and then by name.

        Unknown types (per ``CGData.has_type``) are only warned about.  For
        a known type the per-type dictionary is created on first use, a
        repeated name is reported via ``CGData.error``, and the result of
        ``CGData.light_load(path, zipFile)`` is stored under that name.
        """
        known_type = CGData.has_type(type)
        if not known_type:
            CGData.warn("Unknown file type: %s" % (path))
            return

        if not (type in self):
            self[type] = {}

        if name in self[type]:
            CGData.error("Duplicate %s file %s" % (type, name))

        loaded = CGData.light_load(path, zipFile)
        self[type][name] = loaded
        CGData.log("FOUND: " + type + "\t" + name + "\t" + path)
Пример #10
0
class Assembly(CGData.CGObjectBase):
    """
    Placeholder class representing a genome assembly.

    It declares a table-form layout with a single primary-key column,
    ``name``, and adds no behaviour of its own.
    """

    DATA_FORM = CGData.TABLE

    COLS = [CGData.Column('name', str, primary_key=True)]
Пример #11
0
    def append(self, probe):
        """
        Add ``probe`` to ``self.chrom_map``, indexed by chromosome then name.

        Raises ``CGData.FormatException`` naming the first attribute from
        ``self.child_type.core_attr`` that ``probe`` lacks.  Probes sharing a
        chromosome and name accumulate in one list, in insertion order.
        """
        for required in self.child_type.core_attr:
            if hasattr(probe, required):
                continue
            raise CGData.FormatException("Missing %s" % (required))

        # the map is created lazily on the first append
        if self.chrom_map is None:
            self.chrom_map = {}

        by_name = self.chrom_map.setdefault(probe.chrom, {})
        by_name.setdefault(probe.name, []).append(probe)
Пример #12
0
    def gen_sql(self, id_table):
        gmatrix = self.members[ 'genomicMatrix' ]
        pmap = self.members[ 'probeMap' ].get( assembly="hg18" ) # BUG: hard coded to only producing HG18 tables
        if pmap is None:
            CGData.error("Missing HG18 %s" % ( self.members[ 'probeMap'].get_name() ))
            return
        
        table_base = self.get_name()
        CGData.log("Writing Track %s" % (table_base))
        
        clinical_table_base =  self.members[ "clinicalMatrix" ].get_name()

        yield "INSERT into raDb( name, sampleTable, clinicalTable, columnTable, aliasTable, shortLabel, expCount, dataType, platform, profile, security) VALUES ( '%s', '%s', '%s', '%s', '%s', '%s', '%d', '%s', '%s', '%s', '%s');\n" % \
            ( "genomic_" + table_base, "sample_" + table_base,
                "clinical_" + clinical_table_base, "clinical_" + clinical_table_base + "_colDb",
                "genomic_" + table_base + "_alias",
                table_base,
                len(gmatrix.get_sample_list()),
                'bed 15',
                gmatrix.attrs[':dataSubType'],
                'localDb',
                'public',
                )
        
        # write out the sample table
        yield "drop table if exists sample_%s;" % ( table_base )
        yield """
CREATE TABLE sample_%s (
    id           int,
    sampleName   varchar(255)
) engine 'MyISAM';
""" % ( table_base )

        for sample in gmatrix.get_sample_list():
            yield "INSERT INTO sample_%s VALUES( %d, '%s' );\n" % ( table_base, id_table.get( 'sample_id', sample), sample )

        # write out the BED table
        yield "drop table if exists %s;" % ( "genomic_" + table_base )
        yield CREATE_BED % ( "genomic_" + table_base )
        
        sample_ids = []
        for sample in gmatrix.get_sample_list():
            sample_ids.append( str( id_table.get( 'sample_id', sample ) ) )
        
        missingProbeCount = 0
        for probe_name in gmatrix.get_probe_list():
            exp_ids = ','.join( sample_ids )
            row = gmatrix.get_row_vals( probe_name )
            exps = ','.join( str(a) for a in row )
            probe = pmap.get( probe_name )
            if probe is not None:
                istr = "insert into %s(chrom, chromStart, chromEnd, strand,  name, expCount, expIds, expScores) values ( '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s' );\n" % \
                    ( "genomic_%s" % (table_base), probe.chrom, probe.chrom_start, probe.chrom_end, probe.strand, sql_fix(probe_name), len(sample_ids), exp_ids, exps )
                yield istr
            else:
                missingProbeCount += 1
        CGData.log("%s Missing probes %d" % (table_base, missingProbeCount))
Пример #13
0
 def gen_sql(self):
     """
     Write one ``<type>.<name>.sql`` file per SQL-capable compiled object.

     Nothing is done when the ``compiler.mode`` parameter is ``"scan"``.
     Otherwise ``self.out_dir`` is created if needed, a fresh ``CGIDTable``
     is stored in ``self.id_table``, and for every entry of
     ``self.compile_matrix`` whose type derives from ``CGData.CGSQLObject``
     the lines produced by its ``gen_sql(id_table)`` are written out.  Each
     object is told to ``unload()`` once it has been processed.
     """
     if "compiler.mode" in self.params and self.params[ "compiler.mode" ] == "scan":
         return
     log( "Writing SQL" )
     if not os.path.exists(self.out_dir):
         os.makedirs(self.out_dir)
     self.id_table = CGIDTable()
     for rtype in self.compile_matrix:
         if not issubclass( CGData.get_type( rtype ), CGData.CGSQLObject ):
             continue
         for rname in self.compile_matrix[ rtype ]:
             robj = self.compile_matrix[ rtype ][ rname ]
             shandle = robj.gen_sql( self.id_table )
             if shandle is not None:
                 out_path = os.path.join( self.out_dir, "%s.%s.sql" % (rtype, rname ) )
                 # the context manager closes the file even if the SQL
                 # generator raises part-way through
                 with open( out_path, "w" ) as ohandle:
                     for line in shandle:
                         ohandle.write( line )
             #tell the object to unload data, so we don't continually allocate over the compile
             robj.unload()
0
    def gen_sql_heatmap(self, id_table, features=None):
        """
        Yield the heatmap SQL for this clinical matrix.

        Output, in order: clean-up of the previous table together with its
        ``codes`` and ``colDb`` rows, one ``colDb`` row per column, one
        ``codes`` row per state of every enumerated column, the
        ``CREATE TABLE`` statement, and one ``INSERT`` per sample.

        ``features`` maps column names to metadata dictionaries
        (``shortTitle``, ``longTitle``, ``visibility``, ``priority``, each a
        list whose first element is used).  When it is omitted,
        ``feature_type_setup`` is run here and no metadata is applied.
        NOTE: a ``sampleName`` entry is written into the ``features`` object
        that was passed in.

        ``id_table`` supplies the numeric sample ids through
        ``id_table.get(<table>:sample_id, sample)``.
        """
        CGData.log( "Writing Clinical %s SQL" % (self['name']))

        if features is None:
            self.feature_type_setup()
            features = {}

        features['sampleName'] = { 'shortTitle': ['Sample name'], 'longTitle': ['Sample name'], 'visibility': ['on'], 'priority': [100.0] }

        table_name = self['name'].replace(".","_")
        clinical_table = 'clinical_' + table_name

        yield "DROP TABLE IF EXISTS %s;\n" % ( clinical_table )
        yield "DELETE codes FROM codes, colDb WHERE codes.feature = colDb.id AND colDb.clinicalTable = '%s';\n" % clinical_table
        yield "DELETE FROM colDb WHERE clinicalTable = '%s';\n" % clinical_table

        # colDb
        for i, name in enumerate(self.col_order):
            meta = features[name] if name in features else {}
            shortLabel = meta['shortTitle'][0] if 'shortTitle' in meta else name
            longLabel = meta['longTitle'][0] if 'longTitle' in meta else name
            filter_type = 'coded' if name in self.enum_map else 'minMax'
            if 'visibility' in meta:
                visibility = meta['visibility'][0]
            else:
                # without explicit metadata only the first ten columns show
                visibility = 'on' if i < 10 else 'off'
            priority = float(meta['priority'][0]) if 'priority' in meta else 1
            yield "INSERT INTO colDb(name, shortLabel,longLabel,valField,clinicalTable,filterType,visibility,priority) VALUES( '%s', '%s', '%s', '%s', '%s', '%s', '%s', %f);" % \
                    ( sql_fix(name), sql_fix(shortLabel), sql_fix(longLabel), sql_fix(name), clinical_table, filter_type, visibility, priority)
            # remember the colDb id in a session variable for the codes rows
            yield "SET @col%d=LAST_INSERT_ID();\n" % i

        # codes
        values = {}
        for i, col in enumerate(self.col_order):
            if col not in self.enum_map:
                continue
            codes = self.enum_map[col]
            values[col] = {}
            # key-based sort: same order as comparing the mapped codes,
            # without the cmp function that Python 3 no longer accepts
            ordered = sorted(codes.keys(), key=lambda state: codes[state])
            for j, a in enumerate(ordered):
                yield "INSERT INTO codes(feature,ordering,value) VALUES (@col%d, %d, '%s'); SET @val%d_%d=LAST_INSERT_ID();\n" % (i, j, sql_fix(a), i, j)
                # rows refer to the code through this session variable
                values[col][a] = "@val%d_%d" % (i, j)


        yield "CREATE TABLE %s (sampleID INT NOT NULL UNIQUE" % clinical_table

        for col in self.col_order:
            if col == 'sampleName':
                yield ",\n\tsampleName INT UNSIGNED NOT NULL UNIQUE"
            else:
                if col in self.enum_map:
                    yield ",\n\t`%s` INT UNSIGNED DEFAULT NULL" % (col.strip())
                else:
                    yield ",\n\t`%s` FLOAT DEFAULT NULL" % (col.strip())
        yield """
    ) engine 'MyISAM';
    """

        for target in sortedSamples(self.row_hash.keys()):
            a = []
            for col,orig in zip(self.col_order, self.orig_order):
                if col == 'sampleName':
                    val = target
                else:
                    val = self.row_hash[ target ][ self.col_list[ orig ] ]
                if val is None or val.upper() in NULL_VALUES:
                    a.append("\\N")
                else:
                    if col in self.enum_map:
                        a.append(values[col][val])
                    else:
                        a.append(val)
            yield u"INSERT INTO %s VALUES ( %d, %s );\n" % ( clinical_table, id_table.get( table_name + ':sample_id', target ), u",".join(a) )
Пример #15
0
    def link_objects(self):
        """
        Scan found object records and determine if the data they link to is
        available.

        Steps:

        1. Objects whose type derives from ``CGData.CGGroupMember`` are
           collected into group objects, one per ``get_group()`` value.
        2. Every object whose link map can be satisfied and which reports
           ``is_link_ready()`` goes into the ready matrix.
        3. For each type in ``CGData.MERGE_OBJECTS``, merged objects are built
           with ``self.set_enumerate`` from the ready members of the types it
           needs.
        4. ``self.compile_matrix`` receives every ready object that was not
           consumed as a member of a merged object.
        """
        omatrix = {}
        for otype in self.set_hash:
            if issubclass( CGData.get_type( otype ), CGData.CGGroupMember ):
                gmap = {}
                for oname in self.set_hash[ otype ]:
                    oobj = self.set_hash[ otype ][ oname ]
                    if oobj.get_group() not in gmap:
                        if issubclass(CGData.get_type(otype), CGData.CGGroupMemberSQL):
                            gmap[ oobj.get_group() ] = CGData.CGGroupBaseSQL( oobj.get_group() )
                        else:
                            gmap[ oobj.get_group() ] = CGData.CGGroupBase( oobj.get_group() )
                    gmap[ oobj.get_group() ].put( oobj )
                omatrix[ otype ] = gmap
            else:
                omatrix[ otype ] = self.set_hash[ otype ]

        # Now it's time to check objects for their dependencies
        ready_matrix = {}
        for stype in omatrix:
            for sname in omatrix[ stype ]:
                sobj = omatrix[ stype ][ sname ]
                lmap = sobj.get_link_map()
                is_ready = True
                for ltype in lmap:
                    if ltype not in omatrix:
                        warn( "%s missing data type %s" % (sname, ltype) )
                        is_ready = False
                    else:
                        for lname in lmap[ ltype ]:
                            if lname not in omatrix[ltype]:
                                warn( "%s %s missing data %s %s" % ( stype, sname, ltype, lname ) )
                                is_ready = False
                if not sobj.is_link_ready():
                    warn( "%s %s not LinkReady" % ( stype, sname ) )
                elif is_ready:
                    if stype not in ready_matrix:
                        ready_matrix[ stype ] = {}
                    ready_matrix[ stype ][ sname ] = sobj

        for rtype in ready_matrix:
            log( "READY %s: %s" % ( rtype, ",".join(ready_matrix[rtype].keys()) ) )

        for dType in ready_matrix:
            log("Found %s %d" % (dType, len(ready_matrix[dType])))

        # names of ready objects that end up inside a merged object, per type
        merge_children = {}

        for merge_type in CGData.MERGE_OBJECTS:
            mtype = CGData.get_type( merge_type )
            select_types = mtype.typeSet
            select_set = {}
            try:
                for stype in select_types:
                    select_set[ stype ] = ready_matrix[ stype ]
                    if stype not in merge_children:
                        merge_children[stype] = {}
            except KeyError:
                # a required member type has no ready objects: skip this merge
                error("missing data type %s" % (stype) )
                continue
            mobjlist = self.set_enumerate( mtype, select_set )
            for mobj in mobjlist:
                if merge_type not in ready_matrix:
                    ready_matrix[ merge_type ] = {}
                for cType in mobj:
                    merge_children[cType][mobj[cType].get_name()] = True
                ready_matrix[ merge_type ][ mobj.get_name() ] = mobj

        # objects consumed by a merge are compiled through the merged object
        self.compile_matrix = {}
        for sType in ready_matrix:
            self.compile_matrix[sType] = {}
            for name in ready_matrix[sType]:
                if sType not in merge_children or name not in merge_children[sType]:
                    self.compile_matrix[sType][name] = ready_matrix[sType][name]

        log("After Merge")
        for dType in ready_matrix:
            log("Found %s %d" % (dType, len(self.compile_matrix[dType])))
Пример #16
0
#!/usr/bin/env python

#note: this script assumes that the first alias in the probeMap aliaslist
#is a HUGO gene name...

import CGData
import sys

# usage: matrixProbeRemap.py <matrixFile> <probeFile> <outFile>

# All three paths are required; check before loading anything so a missing
# output path is reported up front rather than after both files are loaded.
if len(sys.argv) < 4:
    sys.exit("usage: %s <matrixFile> <probeFile> <outFile>" % (sys.argv[0]))

#load the matrix
matrix = CGData.load(sys.argv[1])

#load the probeMap
probeMap = CGData.load(sys.argv[2])

#remove null probes from the matrix
matrix.remove_null_probes()

#remap the matrix using the probe map
matrix.remap(probeMap, skip_missing=True)

matrix.add_history("Transformed from probespace %s to HUGO" %
                   (probeMap.get_name()))
matrix.attrs[":probeMap"] = "hugo"
#output the matrix
matrix.store(sys.argv[3])
Пример #17
0
import CGData
import CGData.NumpyMatrix
import sys

# matrixProbeRemap.py <matrixFile> <probeFile>

#load the matrix


matrix = CGData.NumpyMatrix.NumpyMatrix()
matrixHandle = open(sys.argv[1])
matrix.read(matrixHandle)
matrixHandle.close()

#load the probeMap
probeMap = CGData.load( sys.argv[2] )

#remove null probes from the matrix
matrix.remove_null_probes()

#remap the matrix using the probe map

valid_map = {}
for alt in probeMap.get_probes():
	valid_map[alt.aliases[0]] = True
	if alt.name in matrix.get_row_names():
		matrix.row_rename(alt.name, alt.aliases[0])

remove_list = []
for name in matrix.get_row_names():
	if not name in valid_map:
Пример #18
0
    def gen_sql_heatmap(self, id_table):
        #scan the children
        # XXX Handling of sql for children is broken if the child may appear
        # as part of multiple merge objects, such as TrackGenomic and TrackClinical.
        # A disgusting workaround for clinicalMatrix is to prevent the TrackGenomic from calling
        # it for gen_sql.
        clinical = self.members.pop("clinicalMatrix")
        for line in CGData.CGMergeObject.sql_pass(self, id_table, method="heatmap"):
            yield line
        self.members["clinicalMatrix"] = clinical

        gmatrix = self.members[ 'genomicMatrix' ]
        pmap = self.members[ 'probeMap' ].get( assembly="hg18" ) # BUG: hard coded to only producing HG18 tables
        if pmap is None:
            CGData.error("Missing HG18 %s" % ( self.members[ 'probeMap'].get_name() ))
            return
        
        table_base = self.get_name()
        CGData.log("Writing Track %s" % (table_base))
        
        clinical_table_base =  self.members[ "clinicalMatrix" ].get_name()

        yield "INSERT into raDb( name, sampleTable, clinicalTable, columnTable, aliasTable, shortLabel, longLabel, expCount, dataType, platform, profile, security) VALUES ( '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%d', '%s', '%s', '%s', '%s');\n" % \
            ( "genomic_" + table_base, "sample_" + table_base,
                "clinical_" + clinical_table_base, "clinical_" + clinical_table_base + "_colDb",
                "genomic_" + table_base + "_alias",
                sql_fix(gmatrix.attrs['shortTitle']),
                sql_fix(gmatrix.attrs['longTitle']),
                len(gmatrix.get_sample_list()),
                self.format,
                gmatrix.attrs[':dataSubType'],
                'localDb',
                'public',
                )
        
        # write out the sample table
        yield "drop table if exists sample_%s;" % ( table_base )
        yield """
CREATE TABLE sample_%s (
    id           int,
    sampleName   varchar(255)
) engine 'MyISAM';
""" % ( table_base )

        from CGData.ClinicalMatrix import sortedSamples
        for sample in sortedSamples(gmatrix.get_sample_list()):
	    yield "INSERT INTO sample_%s VALUES( %d, '%s' );\n" % ( table_base, id_table.get( clinical_table_base + ':sample_id', sample), sample )

        
        yield "drop table if exists genomic_%s_alias;" % ( table_base )
        yield """
CREATE TABLE genomic_%s_alias (
    name        varchar(255),
    alias         varchar(255)
) engine 'MyISAM';
""" % ( table_base )

        for pset in pmap:
            for probe in pset:
                for alias in probe.aliases:
                    yield "insert into genomic_%s_alias( name, alias ) values( '%s', '%s' );\n" % (table_base, sql_fix(probe.name), sql_fix(alias))

        # write out the BED table
        yield "drop table if exists %s;" % ( "genomic_" + table_base )
        yield CREATE_BED % ( "genomic_" + table_base + "_tmp")
        
        sample_ids = []
        samples = gmatrix.get_sample_list()

        # sort samples by sample_id, and retain the sort order for application to the genomic data, below
        tmp=sorted(zip(samples, range(len(samples))), cmp=lambda x,y: id_table.get(clinical_table_base + ':sample_id', x[0]) - id_table.get( clinical_table_base + ':sample_id', y[0]))
        samples, order = map(lambda t: list(t), zip(*tmp))

        for sample in samples:
            sample_ids.append( str( id_table.get( clinical_table_base + ':sample_id', sample ) ) )
        
        exp_ids = ','.join( sample_ids )
        missingProbeCount = 0
        for probe_name in gmatrix.get_probe_list():
            # get the genomic data and rearrange to match the sample_id order
            tmp = gmatrix.get_row_vals( probe_name )
            row = map(lambda i: tmp[order[i]], range(len(tmp)))

            pset = pmap.get( probe_name )
            if pset is not None:
                for probe in pset:
                    istr = "insert into %s(chrom, chromStart, chromEnd, strand,  name, expCount, expIds, expScores) values ( '%s', '%s', '%s', '%s', '%s', '%s', '%s', %s );\n" % \
                            ( "genomic_%s_tmp" % (table_base), probe.chrom, probe.chrom_start, probe.chrom_end, probe.strand, sql_fix(probe_name), len(sample_ids), exp_ids, self.scores(row) )
                    yield istr
            else:
                missingProbeCount += 1
        yield "create table genomic_%s like genomic_%s_tmp;" % (table_base, table_base)
        yield "insert into genomic_%s select * from genomic_%s_tmp order by chrom, chromStart;" % (table_base, table_base)
        yield "drop table genomic_%s_tmp;" % table_base
        CGData.log("%s Missing probes %d" % (table_base, missingProbeCount))
Пример #19
0
#!/usr/bin/env python

#note: this script assumes that the first alias in the probeMap aliaslist 
#is a HUGO gene name...

import CGData
import sys

# usage: matrixProbeRemap.py <matrixFile> <probeFile> <outFile>

# All three paths are required; check before loading anything so a missing
# output path is reported up front rather than after both files are loaded.
if len(sys.argv) < 4:
    sys.exit("usage: %s <matrixFile> <probeFile> <outFile>" % (sys.argv[0]))

#load the matrix
matrix = CGData.load( sys.argv[1] )

#load the probeMap
probeMap = CGData.load( sys.argv[2] )

#remove null probes from the matrix
matrix.remove_null_probes()

#remap the matrix using the probe map
matrix.remap( probeMap, skip_missing=True )

matrix.add_history( "Transformed from probespace %s to HUGO" % (probeMap.get_name() ) )
matrix.attrs[":probeMap"] = "hugo"
#output the matrix
matrix.store( sys.argv[3] )
Пример #20
0
    def link_objects(self):
        """
        Scan found object records and determine if the data they link to is
        available.

        Works on ``self.set_hash`` (type -> name -> object) in four steps:

        1. Objects whose type is a ``CGData.CGGroupMember`` subclass are
           collected into one ``CGData.CGGroupBase`` per group name.
        2. Every object's link map is checked; an object is "ready" when
           all types and names it links to exist and ``is_link_ready()``
           is true.  Problems are reported through ``warn``.
        3. For each type in ``CGData.MERGE_OBJECTS`` the merged objects
           returned by ``self.set_enumerate`` are added to the ready set,
           and the children they consume are remembered.
        4. ``self.compile_matrix`` is rebuilt from the ready set without
           the children that were consumed by a merge object.
        """
        omatrix = {}
        for otype in self.set_hash:
            if issubclass(CGData.get_type(otype), CGData.CGGroupMember):
                # Collapse group members into one group object per group name.
                gmap = {}
                for oname in self.set_hash[otype]:
                    oobj = self.set_hash[otype][oname]
                    if oobj.get_group() not in gmap:
                        gmap[oobj.get_group()] = CGData.CGGroupBase(
                            oobj.get_group())
                    gmap[oobj.get_group()].put(oobj)
                omatrix[otype] = gmap
            else:
                omatrix[otype] = self.set_hash[otype]

        # Now it's time to check objects for their dependencies
        ready_matrix = {}
        for stype in omatrix:
            for sname in omatrix[stype]:
                sobj = omatrix[stype][sname]
                lmap = sobj.get_link_map()
                is_ready = True
                for ltype in lmap:
                    if ltype not in omatrix:
                        warn("%s missing data type %s" % (sname, ltype))
                        is_ready = False
                    else:
                        for lname in lmap[ltype]:
                            if lname not in omatrix[ltype]:
                                warn("%s %s missing data %s %s" %
                                     (stype, sname, ltype, lname))
                                is_ready = False
                if not sobj.is_link_ready():
                    warn("%s %s not LinkReady" % (stype, sname))
                elif is_ready:
                    ready_matrix.setdefault(stype, {})[sname] = sobj

        for rtype in ready_matrix:
            log("READY %s: %s" % (rtype, ",".join(ready_matrix[rtype].keys())))

        for dType in ready_matrix:
            log("Found %s %d" % (dType, len(ready_matrix[dType])))

        # type -> {child name: True} for children absorbed by a merge object
        merge_children = {}

        for merge_type in CGData.MERGE_OBJECTS:
            mtype = CGData.get_type(merge_type)
            select_types = mtype.typeSet
            select_set = {}
            try:
                for stype in select_types:
                    select_set[stype] = ready_matrix[stype]
                    if stype not in merge_children:
                        merge_children[stype] = {}
            except KeyError:
                # One of the member types has no ready objects at all, so
                # this merge type cannot be built.
                error("missing data type %s" % (stype))
                continue
            mobjlist = self.set_enumerate(mtype, select_set)
            for mobj in mobjlist:
                ready_matrix.setdefault(merge_type, {})
                for cType in mobj:
                    merge_children[cType][mobj[cType].get_name()] = True
                ready_matrix[merge_type][mobj.get_name()] = mobj

        # Keep only the objects that were not consumed by a merge object.
        self.compile_matrix = {}
        for sType in ready_matrix:
            self.compile_matrix[sType] = {}
            for name in ready_matrix[sType]:
                consumed = (sType in merge_children
                            and name in merge_children[sType])
                if not consumed:
                    self.compile_matrix[sType][name] = ready_matrix[sType][
                        name]

        log("After Merge")
        for dType in ready_matrix:
            log("Found %s %d" % (dType, len(self.compile_matrix[dType])))
Пример #21
0
    def gen_sql_heatmap(self, id_table):
        #scan the children
        # XXX Handling of sql for children is broken if the child may appear
        # as part of multiple merge objects, such as TrackGenomic and TrackClinical.
        # A disgusting workaround for clinicalMatrix is to prevent the TrackGenomic from calling
        # it for gen_sql.
        clinical = self.members.pop("clinicalMatrix")
        for line in CGData.CGMergeObject.sql_pass(self, id_table, method="heatmap"):
            yield line
        self.members["clinicalMatrix"] = clinical

        gmatrix = self.members[ 'genomicMatrix' ]
        pmap = self.members[ 'probeMap' ].lookup( assembly="hg18" ) # BUG: hard coded to only producing HG18 tables
        if pmap is None:
            CGData.error("Missing HG18 %s" % ( self.members[ 'probeMap'].get_name() ))
            return
        
        table_base = self.get_name()
        CGData.log("Writing Track %s" % (table_base))
        
        clinical_table_base =  self.members[ "clinicalMatrix" ].get_name()

        other = {}
        for attr in ['wrangler', 'wrangling_procedure', 'url', 'citation', 'description']:
            if attr in gmatrix:
                other[attr] = gmatrix[attr]
        if 'dataProducer' in gmatrix:
            other['author_list'] = gmatrix['dataProducer']
        if 'articleTitle' in gmatrix:
            other['article_title'] = gmatrix['articleTitle']
        
        other['version'] = gmatrix.get('version', "")
        datetime.datetime.strptime(other['version'], "%Y-%m-%d") #if the version isn't properly formatted, though exception
        
        if 'owner' in gmatrix:
            other['owner'] = gmatrix['owner']
        other['colNormalization'] = gmatrix.get('colNormalization', False)
        if not isinstance(other['colNormalization'], bool):
            other['colNormalization']  = False
        other['redistribution'] = gmatrix.get('redistribution', False)
        if not isinstance(other['redistribution'], bool):
            other['redistribution']  = False
        other['security'] = gmatrix.get('security', "public")
        if other['security'] not in [ "public", "private" ]:
            other['security'] = "public"

        yield "DELETE from raDb where name = '%s';\n" % ("genomic_" + table_base)
        yield "INSERT into raDb( name, sampleTable, clinicalTable, columnTable, aliasTable, shortLabel, longLabel, expCount, dataType, platform, profile, security, priority, gain, groupName, wrangler, url, article_title, citation, author_list, wrangling_procedure, other) VALUES ( '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%d', '%s', '%s', '%s', '%s', %f, %f, '%s', %s, %s, %s, %s, %s, %s, '%s');\n" % \
            ( "genomic_" + table_base, "sample_" + table_base,
                "clinical_" + clinical_table_base, "colDb",
                "genomic_" + table_base + "_alias",
                sql_fix(gmatrix['shortTitle']),
                sql_fix(gmatrix['longTitle']),
                len(gmatrix.get_sample_list()),
                self.format,
                dataSubTypeMap[gmatrix[':dataSubType']] if gmatrix[':dataSubType'] in dataSubTypeMap else gmatrix[':dataSubType'],
                'localDb',
                'public',
                float(gmatrix.get('priority', 1.0)),
                float(gmatrix.get('gain', 1.0)),
                sql_fix(gmatrix.get('groupTitle', 'Misc.')),
                "'%s'"%sql_fix(gmatrix['wrangler']) if 'wrangler' in gmatrix else '\N',
                "'%s'"%sql_fix(gmatrix['url']) if 'url' in gmatrix else '\N',
                "'%s'"%sql_fix(gmatrix['articleTitle']) if 'articleTitle' in gmatrix else '\N',
                "'%s'"%sql_fix(gmatrix['citation']) if 'citation' in gmatrix else '\N',
                "'%s'"%sql_fix(gmatrix['dataProducer']) if 'dataProducer' in gmatrix else '\N',
                "'%s'"%sql_fix(gmatrix['wrangling_procedure']) if 'wrangling_procedure' in gmatrix else '\N',
                sql_fix(json.dumps(other)),
                )
        
        # write out the sample table
        yield "drop table if exists sample_%s;" % ( table_base )
        yield """
CREATE TABLE sample_%s (
    id           int,
    sampleName   varchar(255)
) engine 'MyISAM';
""" % ( table_base )

        from CGData.ClinicalMatrix import sortedSamples
        for sample in sortedSamples(gmatrix.get_sample_list()):
            yield "INSERT INTO sample_%s VALUES( %d, '%s' );\n" % ( table_base, id_table.get( clinical_table_base + ':sample_id', sample), sql_fix(sample) )

        
        yield "drop table if exists genomic_%s_alias;" % ( table_base )
        yield """
CREATE TABLE genomic_%s_alias (
    name        varchar(255),
    alias         varchar(255)
) engine 'MyISAM';
""" % ( table_base )

        for probe in pmap.get_probes():
            for alias in probe.aliases:
                yield "insert into genomic_%s_alias( name, alias ) values( '%s', '%s' );\n" % (table_base, sql_fix(probe.name), sql_fix(alias))

        # write out the BED table
        yield "drop table if exists %s;" % ( "genomic_" + table_base )
        yield CREATE_BED % ( "genomic_" + table_base + "_tmp")
        
        sample_ids = []
        samples = gmatrix.get_sample_list()

        # sort samples by sample_id, and retain the sort order for application to the genomic data, below
        tmp=sorted(zip(samples, range(len(samples))), cmp=lambda x,y: id_table.get(clinical_table_base + ':sample_id', x[0]) - id_table.get( clinical_table_base + ':sample_id', y[0]))
        samples, order = map(lambda t: list(t), zip(*tmp))

        for sample in samples:
            sample_ids.append( str( id_table.get( clinical_table_base + ':sample_id', sample ) ) )
        
        exp_ids = ','.join( sample_ids )
        missingProbeCount = 0
        for probe_name in gmatrix.get_probe_list():
            # get the genomic data and rearrange to match the sample_id order
            tmp = gmatrix.get_row_vals( probe_name )
            row = map(lambda i: tmp[order[i]], range(len(tmp)))

            pset = pmap.lookup( probe_name )
            if pset is not None:
                for probe in pset:
                    istr = "insert into %s(chrom, chromStart, chromEnd, strand,  name, expCount, expIds, expScores) values ( '%s', '%s', '%s', '%s', '%s', '%s', '%s', %s );\n" % \
                            ( "genomic_%s_tmp" % (table_base), probe.chrom, probe.chrom_start-1, probe.chrom_end, probe.strand, sql_fix(probe_name), len(sample_ids), exp_ids, self.scores(row) )
                    yield istr
            else:
                missingProbeCount += 1
        yield "# sort file by chrom position\n"
        yield "create table genomic_%s like genomic_%s_tmp;\n" % (table_base, table_base)
        yield "insert into genomic_%s select * from genomic_%s_tmp order by chrom, chromStart;\n" % (table_base, table_base)
        yield "drop table genomic_%s_tmp;\n" % table_base
        CGData.log("%s Missing probes %d" % (table_base, missingProbeCount))
Пример #22
0
#!/usr/bin/env python

import sys
import CGData
import CGData.Compiler


# Scan the directories given on the command line and report, for every
# matrix object found, the x/y namespaces it links to.
c = CGData.Compiler.BrowserCompiler()
c.scan_dirs(sys.argv[1:])

# namespace -> {"<type>:<name>": True} for every matrix using that namespace
linkSpace = {}

# 'type_name' rather than 'type' so the builtin is not shadowed.
for type_name in c.set_hash:
    if issubclass(CGData.get_type(type_name), CGData.CGDataMatrixObject):
        for name in c.set_hash[type_name]:
            current = "%s:%s" % (type_name, name)
            x_link = c.set_hash[type_name][name].get_x_namespace()
            y_link = c.set_hash[type_name][name].get_y_namespace()
            if x_link is not None:
                linkSpace.setdefault(x_link, {})[current] = True
                # single-argument print(...) behaves the same on Python 2 and 3
                print("%s x_link %s" % (current, x_link))
            if y_link is not None:
                linkSpace.setdefault(y_link, {})[current] = True
                print("%s y_link %s" % (current, y_link))

for ns in linkSpace:
    s = linkSpace[ns].keys()
Пример #23
0
    def gen_sql(self, id_table, skip_feature_setup=False):
        CGData.log( "Gen %s SQL" % (self.attrs['name']))
        
        if not skip_feature_setup:
            self.feature_type_setup()

        table_name = self.attrs['name']

        yield "drop table if exists clinical_%s;" % ( table_name )

        yield """
CREATE TABLE clinical_%s (
\tsampleID int,
\tsampleName ENUM ('%s')""" % ( table_name, "','".join(sortedSamples(self.row_hash.keys())) )

        for col in self.col_order:
            if ( self.enum_map.has_key( col ) ):
                yield ",\n\t`%s` ENUM( '%s' ) default NULL" % (col.strip(), "','".join( sql_fix(a) for a in self.enum_map[ col ].keys() ) )
            else:
                yield ",\n\t`%s` FLOAT default NULL" % (col.strip())
        yield """
    ) engine 'MyISAM';
    """

        for target in sortedSamples(self.row_hash.keys()):
            a = []
            for col in self.orig_order:
                val = self.row_hash[ target ][ self.col_list[ col ] ]
                #print target, col, val
                if val is None or val == "null" or len(val) == 0 :
                    a.append("\\N")
                else:
                    #a.append( "'" + sql_fix(val) + "'" )
                    a.append( "'" + sql_fix( val.encode('string_escape') ) + "'" )
            yield u"INSERT INTO clinical_%s VALUES ( %d, '%s', %s );\n" % ( table_name, id_table.get( 'sample_id', target ), sql_fix(target), u",".join(a) )
            #yield u"INSERT INTO clinical_%s VALUES ( %d, %s );\n" % ( table_name, id_table.get( 'sample_id', target ), u",".join(a) )


        yield "drop table if exists clinical_%s_colDb;" % ( table_name )
        yield CREATE_COL_DB % ( "clinical_" + table_name + "_colDb" ) 
        """
`id` int(10) unsigned NOT NULL default '0',
`name` varchar(255) default NULL,
`shortLabel` varchar(255) default NULL,
`longLabel` varchar(255) default NULL,
`valField` varchar(255) default NULL,
`clinicalTable` varchar(255) default NULL,
`priority` float default NULL,
`filterType` varchar(255) default NULL,
`visibility` varchar(255) default NULL,
`groupName` varchar(255) default NULL,
PRIMARY KEY  (`id`),
KEY `name` (`name`)
"""
        yield "INSERT INTO clinical_%s_colDb(name, shortLabel,longLabel,valField,clinicalTable,filterType,visibility,priority) VALUES( '%s', '%s', '%s', '%s', '%s', '%s', 'on',1);\n" % \
                ( table_name, 'sampleName', 'sample name', 'sample name', 'sampleName', "clinical_" + table_name, 'coded' )

        i = 0;
        for name in self.col_order:
            filter = 'coded' if self.enum_map.has_key(name) else 'minMax'
            yield "INSERT INTO clinical_%s_colDb(name, shortLabel,longLabel,valField,clinicalTable,filterType,visibility,priority) VALUES( '%s', '%s', '%s', '%s', '%s', '%s', '%s',1);\n" % \
                    ( table_name, name, name, name, name, "clinical_" + table_name, filter, 'on' if i < 10 else 'off')
            i += 1
Пример #24
0
    def gen_sql_heatmap(self, id_table, opts):
        """
        Generate the SQL statements that load this genomic track for the
        heatmap browser.

        Yields, in order: the SQL of the member objects (with the clinical
        matrix temporarily withheld), the raDb registration row, the
        sample_<track> table, the genomic_<track>_alias table and the
        genomic_<track> BED table.  The BED rows are first written to a
        _tmp table and then copied ordered by chrom, chromStart.

        id_table -- object whose get(namespace, sample) returns the numeric
                    id of a sample (it is formatted with %d below).
        opts     -- mapping of options; two keys are read here:
                    'save-ds' keeps the existing raDb downSampleTable value,
                    'no-genomic-matrix' stops after the raDb row.

        If the probe map has no hg18 lookup an error is logged and the
        generator stops after the member SQL.  A ValueError is raised when
        the selected version string is not formatted as YYYY-MM-DD.
        """
        #scan the children
        # XXX Handling of sql for children is broken if the child may appear
        # as part of multiple merge objects, such as TrackGenomic and TrackClinical.
        # A disgusting workaround for clinicalMatrix is to prevent the TrackGenomic from calling
        # it for gen_sql.
        clinical = self.members.pop("clinicalMatrix")
        try:
            for line in CGData.CGMergeObject.sql_pass(self, id_table, method="heatmap"):
                yield line
        finally:
            # Put the member back even when sql_pass raises or the consumer
            # stops iterating early; otherwise self.members stays mutated.
            self.members["clinicalMatrix"] = clinical

        gmatrix = self.members[ 'genomicMatrix' ]
        pmap = self.members[ 'probeMap' ].lookup( assembly="hg18" ) # BUG: hard coded to only producing HG18 tables
        if pmap is None:
            CGData.error("Missing HG18 %s" % ( self.members[ 'probeMap'].get_name() ))
            return

        savedownsample = 'save-ds' in opts and opts['save-ds']

        # Dots in object names are replaced with underscores for table names.
        table_base = self.get_name().replace(".", "_")
        CGData.log("Writing Track %s" % (table_base))

        clinical_table_base =  self.members[ "clinicalMatrix" ].get_name().replace(".", "_")
        # namespace under which id_table stores the sample ids of this track
        sample_id_space = clinical_table_base + ':sample_id'

        def quoted_or_null(key):
            # SQL literal for an optional matrix attribute: the quoted,
            # escaped value, or \N when the attribute is absent.
            return "'%s'" % sql_fix(gmatrix[key]) if key in gmatrix else '\\N'

        # Extra metadata stored as a JSON blob in the raDb 'other' column.
        other = {}
        for attr in ['wrangler', 'wrangling_procedure', 'url', 'citation', 'description']:
            if attr in gmatrix:
                other[attr] = gmatrix[attr]
        if 'dataProducer' in gmatrix:
            other['author_list'] = gmatrix['dataProducer']
        if 'articleTitle' in gmatrix:
            other['article_title'] = gmatrix['articleTitle']

        # The clinical version is used only when makeDate() could parse it
        # and it is later than the genomic one; otherwise the genomic
        # version is used.
        cVersion= self.members[ 'clinicalMatrix' ].get('version',"")
        gVersion= self.members[ 'genomicMatrix' ].get('version',"")
        dG= makeDate(gVersion)
        dC= makeDate(cVersion)
        if dC is None:
            other['version'] = gVersion
        elif dG<dC:
            other['version'] = cVersion
        else:
            other['version'] = gVersion
        # if the version isn't properly formatted, throw an exception
        datetime.datetime.strptime(other['version'], "%Y-%m-%d")

        if 'owner' in gmatrix:
            other['owner'] = gmatrix['owner']
        # Non-boolean / unknown values fall back to the defaults.
        other['colNormalization'] = gmatrix.get('colNormalization', False)
        if not isinstance(other['colNormalization'], bool):
            other['colNormalization']  = False
        other['redistribution'] = gmatrix.get('redistribution', False)
        if not isinstance(other['redistribution'], bool):
            other['redistribution']  = False
        security = gmatrix.get('security', "public")
        if security not in [ "public", "private" ]:
            security = "public"

        if savedownsample:
            # Remember the current downSampleTable before the row is deleted.
            yield "SET @ds=(SELECT downSampleTable FROM raDb WHERE name = '%s');\n" % ("genomic_" + table_base)
        yield "DELETE from raDb where name = '%s';\n" % ("genomic_" + table_base)
        yield "INSERT into raDb( name, downSampleTable, sampleTable, clinicalTable, columnTable, aliasTable, shortLabel, longLabel, expCount, dataType, platform, profile, security, priority, gain, groupName, wrangler, url, article_title, citation, author_list, wrangling_procedure, other) VALUES ( '%s', %s, '%s', '%s', '%s', '%s', '%s', '%s', '%d', '%s', '%s', '%s', '%s', %f, %f, '%s', %s, %s, %s, %s, %s, %s, '%s');\n" % \
            ( "genomic_" + table_base,
                "@ds" if savedownsample else "NULL",
                "sample_" + table_base,
                "clinical_" + clinical_table_base, "colDb",
                "genomic_" + table_base + "_alias",
                sql_fix(gmatrix['shortTitle']),
                sql_fix(gmatrix['longTitle']),
                len(gmatrix.get_sample_list()),
                self.format,
                dataSubTypeMap[gmatrix[':dataSubType']] if gmatrix[':dataSubType'] in dataSubTypeMap else gmatrix[':dataSubType'],
                'localDb',
                security,
                float(gmatrix.get('priority', 1.0)),
                float(gmatrix.get('gain', 1.0)),
                sql_fix(gmatrix.get('groupTitle', 'Misc.')),
                quoted_or_null('wrangler'),
                quoted_or_null('url'),
                quoted_or_null('articleTitle'),
                quoted_or_null('citation'),
                quoted_or_null('dataProducer'),
                quoted_or_null('wrangling_procedure'),
                sql_fix(json.dumps(other)),
                )

        if 'no-genomic-matrix' in opts and opts['no-genomic-matrix']:
            return

        # write out the sample table
        yield "drop table if exists sample_%s;" % ( table_base )
        yield """
        CREATE TABLE sample_%s (
        id           int,
        sampleName   varchar(255)
        ) engine 'MyISAM';
        """ % ( table_base )

        from CGData.ClinicalMatrix import sortedSamples
        for sample in sortedSamples(gmatrix.get_sample_list()):
            yield "INSERT INTO sample_%s VALUES( %d, '%s' );\n" % ( table_base, id_table.get( sample_id_space, sample), sql_fix(sample) )

        # write out the alias table
        yield "drop table if exists genomic_%s_alias;" % ( table_base )
        yield """
        CREATE TABLE genomic_%s_alias (
        name        varchar(255),
        alias         varchar(255)
        ) engine 'MyISAM';
        """ % ( table_base )

        for probe in pmap.get_probes():
            for alias in probe.aliases:
                yield "insert into genomic_%s_alias( name, alias ) values( '%s', '%s' );\n" % (table_base, sql_fix(probe.name), sql_fix(alias))

        # write out the BED table
        yield "drop table if exists %s;" % ( "genomic_" + table_base )
        yield CREATE_BED % ( "genomic_" + table_base + "_tmp")

        samples = gmatrix.get_sample_list()

        # sort samples by sample_id, and retain the sort order for application to the genomic data, below
        # (key= gives the same stable ordering as the former cmp= subtraction)
        by_id = sorted(zip(samples, range(len(samples))),
                       key=lambda pair: id_table.get(sample_id_space, pair[0]))
        # Built with comprehensions so that an empty sample list does not
        # fail on tuple unpacking.
        samples = [name for name, _ in by_id]
        order = [position for _, position in by_id]

        sample_ids = [str(id_table.get(sample_id_space, sample)) for sample in samples]

        exp_ids = ','.join( sample_ids )
        missingProbeCount = 0
        for probe_name in gmatrix.get_probe_list():
            # get the genomic data and rearrange to match the sample_id order
            vals = gmatrix.get_row_vals( probe_name )
            row = [vals[order[i]] for i in range(len(vals))]

            pset = pmap.lookup( probe_name )
            if pset is not None:
                for probe in pset:
                    # the bin is computed from the unshifted start; chrom_start - 1 is what is stored
                    istr = "insert into %s(bin, chrom, chromStart, chromEnd, strand,  name, expCount, expIds, expScores) values ( %d, '%s', '%s', '%s', '%s', '%s', '%s', '%s', %s );\n" % \
                            ( "genomic_%s_tmp" % (table_base), Binner.calcBin(probe.chrom_start, probe.chrom_end), probe.chrom, probe.chrom_start-1, probe.chrom_end, probe.strand, sql_fix(probe_name), len(sample_ids), exp_ids, self.scores(row) )
                    yield istr
            else:
                missingProbeCount += 1
        yield "# sort file by chrom position\n"
        yield "create table genomic_%s like genomic_%s_tmp;\n" % (table_base, table_base)
        yield "insert into genomic_%s(bin, chrom, chromStart, chromEnd, strand,  name, expCount, expIds, expScores) select bin, chrom, chromStart, chromEnd, strand,  name, expCount, expIds, expScores from genomic_%s_tmp order by chrom, chromStart;\n" % (table_base, table_base)
        yield "drop table genomic_%s_tmp;\n" % table_base
        CGData.log("%s Missing probes %d" % (table_base, missingProbeCount))
Пример #25
0
    def gen_sql_heatmap(self, id_table, features=None):
        """
        Generate the SQL that loads this clinical matrix for the heatmap
        browser.

        Yields statements that remove any previous data for the table,
        register every column in colDb, register the states of every
        enumerated column in codes, create clinical_<name> and insert one
        row per sample.  Enumerated values are stored as references to the
        SQL variables (@val<i>_<j>) set while filling codes.

        id_table -- object whose get(namespace, sample) returns the numeric
                    id of a sample (it is formatted with %d below).
        features -- optional mapping column name -> settings; the keys
                    'shortTitle', 'longTitle', 'visibility' and 'priority'
                    are read, each as a list whose first item is used.
                    When None, self.feature_type_setup() is called and an
                    empty mapping is used.
        """
        CGData.log("Writing Clinical %s SQL" % (self['name']))

        if features is None:
            self.feature_type_setup()
            features = {}

        # NOTE(review): this adds 'sampleName' to the mapping passed in by
        # the caller, i.e. the caller's object is modified.
        features['sampleName'] = {
            'shortTitle': ['Sample name'],
            'longTitle': ['Sample name'],
            'visibility': ['on'],
            'priority': [1]
        }

        table_name = self['name']
        clinical_table = 'clinical_' + table_name

        yield "DROP TABLE IF EXISTS %s;\n" % (clinical_table)
        yield "DELETE codes FROM codes, colDb WHERE codes.feature = colDb.id AND colDb.clinicalTable = '%s';\n" % clinical_table
        yield "DELETE FROM colDb WHERE clinicalTable = '%s';\n" % clinical_table

        # colDb: one row per column; @col<i> remembers the generated id
        for i, name in enumerate(self.col_order):
            settings = features[name] if name in features else {}
            if 'shortTitle' in settings:
                shortLabel = settings['shortTitle'][0]
            else:
                shortLabel = name
            if 'longTitle' in settings:
                longLabel = settings['longTitle'][0]
            else:
                longLabel = name
            filter_type = 'coded' if name in self.enum_map else 'minMax'
            if 'visibility' in settings:
                visibility = settings['visibility'][0]
            else:
                # by default only the first ten columns are shown
                visibility = 'on' if i < 10 else 'off'
            if 'priority' in settings:
                priority = float(settings['priority'][0])
            else:
                priority = 1
            yield "INSERT INTO colDb(name, shortLabel,longLabel,valField,clinicalTable,filterType,visibility,priority) VALUES( '%s', '%s', '%s', '%s', '%s', '%s', '%s', %f);" % \
                    ( sql_fix(name), sql_fix(shortLabel), sql_fix(longLabel), sql_fix(name), clinical_table, filter_type, visibility, priority)
            yield "SET @col%d=LAST_INSERT_ID();\n" % i

        # codes: one row per state of every enumerated column, ordered by
        # the state's value in enum_map
        values = {}
        for i, col in enumerate(self.col_order):
            if col in self.enum_map:
                states = self.enum_map[col]
                values[col] = {}
                for j, a in enumerate(sorted(states.keys(), key=lambda state: states[state])):
                    yield "INSERT INTO codes(feature,ordering,value) VALUES (@col%d, %d, '%s'); SET @val%d_%d=LAST_INSERT_ID();\n" % (
                        i, j, sql_fix(a), i, j)
                    values[col][a] = "@val%d_%d" % (i, j)

        yield "CREATE TABLE %s (sampleID INT NOT NULL UNIQUE" % clinical_table

        for col in self.col_order:
            if col == 'sampleName':
                yield ",\n\tsampleName INT UNSIGNED NOT NULL UNIQUE"
            elif col in self.enum_map:
                yield ",\n\t`%s` INT UNSIGNED DEFAULT NULL" % (col.strip())
            else:
                yield ",\n\t`%s` FLOAT DEFAULT NULL" % (col.strip())
        yield """
    ) engine 'MyISAM';
    """

        for target in sortedSamples(self.row_hash.keys()):
            a = []
            for col, orig in zip(self.col_order, self.orig_order):
                if col == 'sampleName':
                    val = target
                else:
                    val = self.row_hash[target][self.col_list[orig]]
                if val is None or val.upper() in NULL_VALUES:
                    a.append("\\N")
                elif col in self.enum_map:
                    # reference the code row created above
                    a.append(values[col][val])
                else:
                    a.append(val)
            yield u"INSERT INTO %s VALUES ( %d, %s );\n" % (
                clinical_table, id_table.get(table_name + ':sample_id',
                                             target), u",".join(a))
Пример #26
0
import CGData
import CGData.NumpyMatrix
import sys

# matrixProbeRemap.py <matrixFile> <probeFile>

# load the matrix; the with-block closes the file even if read() raises
matrix = CGData.NumpyMatrix.NumpyMatrix()
with open(sys.argv[1]) as matrixHandle:
    matrix.read(matrixHandle)

# load the probeMap
probeMap = CGData.load(sys.argv[2])

# remove null probes from the matrix
matrix.remove_null_probes()

# remap the matrix using the probe map: every row named after a probe is
# renamed to that probe's first alias, and each first alias is recorded in
# valid_map
valid_map = {}
for alt in probeMap.get_probes():
    valid_map[alt.aliases[0]] = True
    if alt.name in matrix.get_row_names():
        matrix.row_rename(alt.name, alt.aliases[0])

remove_list = []
for name in matrix.get_row_names():
    if not name in valid_map:
Пример #27
0
    def gen_sql_heatmap(self, id_table):
        # scan the children
        # XXX Handling of sql for children is broken if the child may appear
        # as part of multiple merge objects, such as TrackGenomic and TrackClinical.
        # A disgusting workaround for clinicalMatrix is to prevent the TrackGenomic from calling
        # it for gen_sql.
        clinical = self.members.pop("clinicalMatrix")
        for line in CGData.CGMergeObject.sql_pass(self, id_table, method="heatmap"):
            yield line
        self.members["clinicalMatrix"] = clinical

        gmatrix = self.members["genomicMatrix"]
        pmap = self.members["probeMap"].lookup(assembly="hg18")  # BUG: hard coded to only producing HG18 tables
        if pmap is None:
            CGData.error("Missing HG18 %s" % (self.members["probeMap"].get_name()))
            return

        table_base = self.get_name()
        CGData.log("Writing Track %s" % (table_base))

        clinical_table_base = self.members["clinicalMatrix"].get_name()

        other = {}
        for attr in ["wrangler", "wrangling_procedure", "url", "citation", "description"]:
            if attr in gmatrix:
                other[attr] = gmatrix[attr]
        if "dataProducer" in gmatrix:
            other["author_list"] = gmatrix["dataProducer"]
        if "articleTitle" in gmatrix:
            other["article_title"] = gmatrix["articleTitle"]

        other["version"] = gmatrix.get("version", "")
        datetime.datetime.strptime(
            other["version"], "%Y-%m-%d"
        )  # if the version isn't properly formatted, though exception

        if "owner" in gmatrix:
            other["owner"] = gmatrix["owner"]
        other["colNormalization"] = gmatrix.get("colNormalization", False)
        if not isinstance(other["colNormalization"], bool):
            other["colNormalization"] = False
        other["redistribution"] = gmatrix.get("redistribution", False)
        if not isinstance(other["redistribution"], bool):
            other["redistribution"] = False
        other["security"] = gmatrix.get("security", "public")
        if other["security"] not in ["public", "private"]:
            other["security"] = "public"

        yield "DELETE from raDb where name = '%s';\n" % ("genomic_" + table_base)
        yield "INSERT into raDb( name, sampleTable, clinicalTable, columnTable, aliasTable, shortLabel, longLabel, expCount, dataType, platform, profile, security, priority, gain, groupName, wrangler, url, article_title, citation, author_list, wrangling_procedure, other) VALUES ( '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%d', '%s', '%s', '%s', '%s', %f, %f, '%s', %s, %s, %s, %s, %s, %s, '%s');\n" % (
            "genomic_" + table_base,
            "sample_" + table_base,
            "clinical_" + clinical_table_base,
            "colDb",
            "genomic_" + table_base + "_alias",
            sql_fix(gmatrix["shortTitle"]),
            sql_fix(gmatrix["longTitle"]),
            len(gmatrix.get_sample_list()),
            self.format,
            dataSubTypeMap[gmatrix[":dataSubType"]]
            if gmatrix[":dataSubType"] in dataSubTypeMap
            else gmatrix[":dataSubType"],
            "localDb",
            "public",
            float(gmatrix.get("priority", 1.0)),
            float(gmatrix.get("gain", 1.0)),
            sql_fix(gmatrix.get("groupTitle", "Misc.")),
            "'%s'" % sql_fix(gmatrix["wrangler"]) if "wrangler" in gmatrix else "\N",
            "'%s'" % sql_fix(gmatrix["url"]) if "url" in gmatrix else "\N",
            "'%s'" % sql_fix(gmatrix["articleTitle"]) if "articleTitle" in gmatrix else "\N",
            "'%s'" % sql_fix(gmatrix["citation"]) if "citation" in gmatrix else "\N",
            "'%s'" % sql_fix(gmatrix["dataProducer"]) if "dataProducer" in gmatrix else "\N",
            "'%s'" % sql_fix(gmatrix["wrangling_procedure"]) if "wrangling_procedure" in gmatrix else "\N",
            sql_fix(json.dumps(other)),
        )

        # write out the sample table
        yield "drop table if exists sample_%s;" % (table_base)
        yield """
CREATE TABLE sample_%s (
    id           int,
    sampleName   varchar(255)
) engine 'MyISAM';
""" % (
            table_base
        )

        from CGData.ClinicalMatrix import sortedSamples

        for sample in sortedSamples(gmatrix.get_sample_list()):
            yield "INSERT INTO sample_%s VALUES( %d, '%s' );\n" % (
                table_base,
                id_table.get(clinical_table_base + ":sample_id", sample),
                sql_fix(sample),
            )

        yield "drop table if exists genomic_%s_alias;" % (table_base)
        yield """
CREATE TABLE genomic_%s_alias (
    name        varchar(255),
    alias         varchar(255)
) engine 'MyISAM';
""" % (
            table_base
        )

        for probe in pmap.get_probes():
            for alias in probe.aliases:
                yield "insert into genomic_%s_alias( name, alias ) values( '%s', '%s' );\n" % (
                    table_base,
                    sql_fix(probe.name),
                    sql_fix(alias),
                )

        # write out the BED table
        yield "drop table if exists %s;" % ("genomic_" + table_base)
        yield CREATE_BED % ("genomic_" + table_base + "_tmp")

        sample_ids = []
        samples = gmatrix.get_sample_list()

        # sort samples by sample_id, and retain the sort order for application to the genomic data, below
        tmp = sorted(
            zip(samples, range(len(samples))),
            cmp=lambda x, y: id_table.get(clinical_table_base + ":sample_id", x[0])
            - id_table.get(clinical_table_base + ":sample_id", y[0]),
        )
        samples, order = map(lambda t: list(t), zip(*tmp))

        for sample in samples:
            sample_ids.append(str(id_table.get(clinical_table_base + ":sample_id", sample)))

        exp_ids = ",".join(sample_ids)
        missingProbeCount = 0
        for probe_name in gmatrix.get_probe_list():
            # get the genomic data and rearrange to match the sample_id order
            tmp = gmatrix.get_row_vals(probe_name)
            row = map(lambda i: tmp[order[i]], range(len(tmp)))

            pset = pmap.lookup(probe_name)
            if pset is not None:
                for probe in pset:
                    istr = (
                        "insert into %s(chrom, chromStart, chromEnd, strand,  name, expCount, expIds, expScores) values ( '%s', '%s', '%s', '%s', '%s', '%s', '%s', %s );\n"
                        % (
                            "genomic_%s_tmp" % (table_base),
                            probe.chrom,
                            probe.chrom_start - 1,
                            probe.chrom_end,
                            probe.strand,
                            sql_fix(probe_name),
                            len(sample_ids),
                            exp_ids,
                            self.scores(row),
                        )
                    )
                    yield istr
            else:
                missingProbeCount += 1
        yield "# sort file by chrom position\n"
        yield "create table genomic_%s like genomic_%s_tmp;\n" % (table_base, table_base)
        yield "insert into genomic_%s select * from genomic_%s_tmp order by chrom, chromStart;\n" % (
            table_base,
            table_base,
        )
        yield "drop table genomic_%s_tmp;\n" % table_base
        CGData.log("%s Missing probes %d" % (table_base, missingProbeCount))
Пример #28
0
class ProbeMap(CGData.CGDataSetObject, CGData.CGGroupMember):
    """Collection of Probe records indexed by chromosome and probe name.

    Probes are kept in ``chrom_map`` as
    ``{chrom: {probe_name: [Probe, ...]}}``.  The aliases read from a file
    are kept in ``gene_map`` as ``{probe_name: [alias, ...]}``.  Both
    attributes are ``None`` until data has been read or appended.
    """

    child_type = Probe

    DATA_FORM = CGData.TABLE
    # NOTE(review): 'chrom_start' is declared as str although read() parses
    # that field with int() and 'chrom_end' is declared as int -- confirm
    # whether this is intended before changing it.
    COLS = [
        CGData.Column('name', str, primary_key=True),
        CGData.Column('chrom', str),
        CGData.Column('chrom_start', str),
        CGData.Column('chrom_end', int),
        CGData.Column('strand', str)
    ]

    def __init__(self):
        """Create an empty, not-yet-loaded probe map."""
        CGData.CGDataSetObject.__init__(self)
        self.gene_map = None
        self.chrom_map = None

    def read(self, handle):
        """Populate the map from a tab-delimited file handle.

        Each row is expected to hold, in order: probe name, comma-separated
        aliases, chrom, chrom_start, chrom_end, strand.  Any data already
        held is discarded.  Blank lines are skipped.  Rows that raise
        ValueError (e.g. a non-integer start/end) are dropped silently,
        although their aliases are still recorded in ``gene_map``.
        """
        self.gene_map = {}
        self.chrom_map = {}
        for row in csv.reader(handle, delimiter="\t"):
            if not row:
                # csv.reader yields an empty list for a blank line; indexing
                # it would raise IndexError, so skip it.
                continue
            name = row[0]
            aliases = row[1].split(',')
            self.gene_map[name] = aliases
            try:
                self.append(
                    Probe(name, row[2], int(row[3]), int(row[4]),
                          row[5], aliases))
            except ValueError:
                # Location int conversion failed: ignore this row silently.
                continue

    def append(self, probe):
        """Add ``probe`` to the map under its chrom and name.

        Raises CGData.FormatException if the probe lacks any attribute
        listed in ``child_type.core_attr``.
        """
        for attr in self.child_type.core_attr:
            if not hasattr(probe, attr):
                raise CGData.FormatException("Missing %s" % (attr))

        if self.chrom_map is None:
            self.chrom_map = {}
        by_name = self.chrom_map.setdefault(probe.chrom, {})
        by_name.setdefault(probe.name, []).append(probe)

    def write(self, handle):
        """Write every probe currently in ``chrom_map`` to ``handle``.

        One tab-delimited line per probe: name, comma-joined aliases, chrom,
        chrom_start, chrom_end, strand.  Does not trigger a load; the map
        must already have been populated.
        """
        for by_name in self.chrom_map.values():
            for probes in by_name.values():
                for probe in probes:
                    fields = [
                        probe.name,
                        ",".join(probe.aliases),
                        probe.chrom,
                        str(probe.chrom_start),
                        str(probe.chrom_end),
                        probe.strand,
                    ]
                    handle.write("%s\n" % ("\t".join(fields)))

    def _ensure_loaded(self):
        """Call ``self.load()`` only when no probe data is in memory yet.

        The check is on ``chrom_map`` (the structure the accessors read)
        rather than ``gene_map``: ``append()`` fills only ``chrom_map``, and
        a load triggered afterwards would go through ``read()``, which
        resets ``chrom_map`` and discards the appended probes.
        """
        if self.chrom_map is None:
            self.load()

    # XXX need a better name.
    def lookup(self, item):
        """Return the list of Probe objects named ``item``, or None.

        Chromosomes are searched in dict iteration order and the list from
        the first chromosome containing the name is returned.
        """
        self._ensure_loaded()
        for by_name in self.chrom_map.values():
            if item in by_name:
                return by_name[item]
        return None

    def row_iter(self):
        """Yield (name, chrom, chrom_start, chrom_end, strand) per probe."""
        self._ensure_loaded()
        for by_name in self.chrom_map.values():
            for probes in by_name.values():
                for p in probes:
                    yield (p.name, p.chrom, p.chrom_start, p.chrom_end,
                           p.strand)

    def get_probes(self):
        """Yield every Probe object held in the map, one at a time.

        A "probe set" here is the list of Probe objects that share one name
        on one chromosome; this flattens all such lists.
        """
        self._ensure_loaded()
        for by_name in self.chrom_map.values():
            for probe_set in by_name.values():
                for probe in probe_set:
                    yield probe