示例#1
0
    def scan_dirs(self, dirs):
        """Scan paths for CGData files and register each one with addFile.

        Every entry of ``dirs`` that is a directory is expanded to its
        immediate children; any other entry is handed to ``glob`` as a
        pattern.  For each matching path:

        * a ``.json`` file is parsed and, when it holds an object with a
          non-null ``name`` and a ``type`` key, registered through
          ``self.addFile(type, name, path)``; unparsable JSON is reported
          with ``CGData.error`` and skipped;
        * a ``.cgz`` file is listed with ``CGData.CGZ.list`` and every entry
          of that listing is registered, passing the archive path as the
          ``zipFile`` argument of ``addFile``;
        * a directory is scanned recursively.
        """
        for entry in dirs:
            CGData.log("SCANNING DIR: %s" % (entry))
            if os.path.isdir(entry):
                pattern = os.path.join(entry, "*")
            else:
                pattern = entry

            for path in glob(pattern):
                if os.path.isfile(path):
                    if path.endswith(".json"):
                        data = None
                        # The context manager closes the handle even when
                        # reading or parsing fails.
                        with open(path) as handle:
                            try:
                                data = json.loads(handle.read())
                            except ValueError as e:
                                CGData.error("BAD JSON in " + path + " " + str(e))

                        # Valid JSON need not be an object (it may be a list,
                        # a number, ...), so check before doing key lookups.
                        if (isinstance(data, dict)
                                and data.get('name') is not None
                                and 'type' in data):
                            self.addFile(data['type'], data['name'], path)

                    # endswith() compares a literal suffix, not a glob
                    # pattern, so the suffix has to be ".cgz" (not "*.cgz").
                    if path.endswith(".cgz"):
                        cgz_list = CGData.CGZ.list(path)
                        for file_type in cgz_list:
                            for z_path in cgz_list[file_type]:
                                self.addFile(file_type, cgz_list[file_type][z_path], z_path, path)
                if os.path.isdir(path):
                    self.scan_dirs([path])
示例#2
0
    def gen_sql_heatmap(self, id_table, opts):
        """Apply feature metadata to the clinical matrix, then yield its SQL.

        Explicit ``valueType`` entries of the clinical features are passed to
        the matrix's ``feature_type_setup``.  For every matrix column that
        has a ``stateOrder`` feature, the matrix ``enum_map`` of that column
        is renumbered: states listed in ``stateOrder`` get their position in
        that list, all remaining states are numbered after them.  Finally the
        statements of ``matrix.gen_sql_heatmap`` are yielded unchanged.

        ``opts`` is accepted but not used by this method.
        """
        CGData.log("ClincalTrack SQL " + self.get_name())

        clinical_features = self.members["clinicalFeature"].features
        matrix = self.members["clinicalMatrix"]

        # e.g. { 'HER2+': 'category', ...}
        explicit_types = {}
        for feature_name in clinical_features:
            if 'valueType' in clinical_features[feature_name]:
                explicit_types[feature_name] = clinical_features[feature_name]['valueType']

        matrix.feature_type_setup(explicit_types)

        for column in self.members['clinicalMatrix'].col_list:
            if column not in clinical_features:
                continue
            if "stateOrder" not in clinical_features[column]:
                continue

            # Each element of stateOrder is parsed as one CSV line; only the
            # first parsed row is used.
            parsed_rows = list(csv.reader(clinical_features[column]["stateOrder"],
                                          skipinitialspace=True))
            ordered_states = parsed_rows[0]

            # do not drop states in stateOrder
            for state in ordered_states:
                matrix.enum_map[column][state] = ordered_states.index(state)

            # States that are not part of stateOrder are appended after it,
            # in the iteration order of the enum map.
            extra = 0
            for state in matrix.enum_map[column]:
                if state not in ordered_states:
                    matrix.enum_map[column][state] = len(ordered_states) + extra
                    extra += 1
                else:
                    matrix.enum_map[column][state] = ordered_states.index(state)

        for statement in matrix.gen_sql_heatmap(id_table, features=clinical_features):
            yield statement
示例#3
0
    def gen_sql_heatmap(self, id_table, opts):
        """Yield the clinical matrix SQL after ordering its enumerated states.

        The features' explicit ``valueType`` settings are handed to the
        matrix via ``feature_type_setup``.  Each matrix column with a
        ``stateOrder`` feature then has its ``enum_map`` codes rewritten so
        that the listed states come first (in list order) and every other
        state follows.  The SQL itself is produced by
        ``matrix.gen_sql_heatmap``.

        The ``opts`` argument is not read here.
        """
        CGData.log("ClincalTrack SQL " + self.get_name())

        members = self.members
        features = members["clinicalFeature"].features
        matrix = members["clinicalMatrix"]

        # e.g. { 'HER2+': 'category', ...}
        typed_pairs = [(key, features[key]['valueType'])
                       for key in features
                       if 'valueType' in features[key]]
        matrix.feature_type_setup(dict(typed_pairs))

        for col in members['clinicalMatrix'].col_list:
            if not (col in features and "stateOrder" in features[col]):
                continue

            reader = csv.reader(features[col]["stateOrder"], skipinitialspace=True)
            enums = list(reader)[0]
            listed = len(enums)

            # do not drop states in stateOrder
            for state in enums:
                matrix.enum_map[col][state] = enums.index(state)

            unlisted_seen = 0
            for state in matrix.enum_map[col]:
                if state in enums:
                    code = enums.index(state)
                else:
                    # unlisted states are numbered after the listed ones
                    code = listed + unlisted_seen
                    unlisted_seen += 1
                matrix.enum_map[col][state] = code

        for sql in matrix.gen_sql_heatmap(id_table, features=features):
            yield sql
示例#4
0
    def gen_sql(self, id_table):
        gmatrix = self.members[ 'genomicMatrix' ]
        pmap = self.members[ 'probeMap' ].get( assembly="hg18" ) # BUG: hard coded to only producing HG18 tables
        if pmap is None:
            CGData.error("Missing HG18 %s" % ( self.members[ 'probeMap'].get_name() ))
            return
        
        table_base = self.get_name()
        CGData.log("Writing Track %s" % (table_base))
        
        clinical_table_base =  self.members[ "clinicalMatrix" ].get_name()

        yield "INSERT into raDb( name, sampleTable, clinicalTable, columnTable, aliasTable, shortLabel, expCount, dataType, platform, profile, security) VALUES ( '%s', '%s', '%s', '%s', '%s', '%s', '%d', '%s', '%s', '%s', '%s');\n" % \
            ( "genomic_" + table_base, "sample_" + table_base,
                "clinical_" + clinical_table_base, "clinical_" + clinical_table_base + "_colDb",
                "genomic_" + table_base + "_alias",
                table_base,
                len(gmatrix.get_sample_list()),
                'bed 15',
                gmatrix.attrs[':dataSubType'],
                'localDb',
                'public',
                )
        
        # write out the sample table
        yield "drop table if exists sample_%s;" % ( table_base )
        yield """
CREATE TABLE sample_%s (
    id           int,
    sampleName   varchar(255)
) engine 'MyISAM';
""" % ( table_base )

        for sample in gmatrix.get_sample_list():
            yield "INSERT INTO sample_%s VALUES( %d, '%s' );\n" % ( table_base, id_table.get( 'sample_id', sample), sample )

        # write out the BED table
        yield "drop table if exists %s;" % ( "genomic_" + table_base )
        yield CREATE_BED % ( "genomic_" + table_base )
        
        sample_ids = []
        for sample in gmatrix.get_sample_list():
            sample_ids.append( str( id_table.get( 'sample_id', sample ) ) )
        
        missingProbeCount = 0
        for probe_name in gmatrix.get_probe_list():
            exp_ids = ','.join( sample_ids )
            row = gmatrix.get_row_vals( probe_name )
            exps = ','.join( str(a) for a in row )
            probe = pmap.get( probe_name )
            if probe is not None:
                istr = "insert into %s(chrom, chromStart, chromEnd, strand,  name, expCount, expIds, expScores) values ( '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s' );\n" % \
                    ( "genomic_%s" % (table_base), probe.chrom, probe.chrom_start, probe.chrom_end, probe.strand, sql_fix(probe_name), len(sample_ids), exp_ids, exps )
                yield istr
            else:
                missingProbeCount += 1
        CGData.log("%s Missing probes %d" % (table_base, missingProbeCount))
示例#5
0
    def addFile(self, type, name, path, zipFile=None):
        """Register the file at ``path`` as ``self[type][name]``.

        Types unknown to ``CGData.has_type`` are only reported with a
        warning.  A name that is already registered for the type is reported
        as a duplicate, and the entry is then replaced by the new file.
        """
        if not CGData.has_type(type):
            CGData.warn("Unknown file type: %s" % (path))
            return

        if type not in self:
            self[type] = {}

        if name in self[type]:
            CGData.error("Duplicate %s file %s" % (type, name))

        # The newly loaded object replaces any earlier one of the same name.
        self[type][name] = CGData.light_load(path, zipFile)
        CGData.log("FOUND: " + type + "\t" + name + "\t" + path)
示例#6
0
    def addFile(self, type, name, path, zipFile=None):
        """Load ``path`` with ``CGData.light_load`` and index it by type/name.

        Nothing is stored for a type that ``CGData.has_type`` rejects; a
        warning naming the path is emitted instead.  Re-registering an
        existing name logs a duplicate error and overwrites the old entry.
        """
        known_type = CGData.has_type(type)
        if known_type:
            # make sure a container exists for this type
            if type not in self:
                self[type] = {}

            is_duplicate = name in self[type]
            if is_duplicate:
                CGData.error("Duplicate %s file %s" % (type, name))

            loaded = CGData.light_load(path, zipFile)
            self[type][name] = loaded
            CGData.log("FOUND: " + type + "\t" + name + "\t" + path)
        else:
            CGData.warn("Unknown file type: %s" % (path))
示例#7
0
 def gen_sql(self, id_table):
     """Order the enumerated clinical states, then yield the matrix SQL.

     For every clinical feature carrying a ``stateOrder`` entry, the first
     element of that entry is split on ',' (a comma preceded by a backslash
     is kept as a literal comma).  States of the matrix ``enum_map`` that
     appear in the resulting list are numbered by their position in it;
     all other states are numbered after the listed ones.  The statements
     of ``matrix.gen_sql`` are then yielded with feature setup skipped.
     """
     CGData.log("ClincalTrack SQL " + self.get_name())

     matrix = self.members["clinicalMatrix"]
     matrix.feature_type_setup()
     features = self.members["clinicalFeature"]

     for feature in features:
         if "stateOrder" not in features[feature]:
             continue

         # re.split() with a capturing group returns alternating pieces:
         # even positions hold the text in front of a "<char>," separator,
         # odd positions hold the single captured <char>.
         pieces = re.split(r'([^,]),', features[feature]["stateOrder"][0])
         enums = []
         appending = False
         for position, piece in enumerate(pieces):
             if position % 2 == 0:
                 # text piece: starts a new state unless the previous
                 # separator was an escaped comma
                 if appending:
                     enums[-1] += piece
                 else:
                     enums.append(piece)
             elif piece == "\\":
                 # "\," is an escaped comma: keep it and continue the state
                 enums[-1] += ","
                 appending = True
             else:
                 # ordinary separator: give the captured char back
                 enums[-1] += piece
                 appending = False

         unlisted = 0
         for state in matrix.enum_map[feature]:
             if state in enums:
                 matrix.enum_map[feature][state] = enums.index(state)
             else:
                 matrix.enum_map[feature][state] = len(enums) + unlisted
                 unlisted += 1

     for statement in matrix.gen_sql(id_table, skip_feature_setup=True):
         yield statement
示例#8
0
    def gen_sql_heatmap(self, id_table, skip_feature_setup=False):
        CGData.log( "Writing Clinical %s SQL" % (self.attrs['name']))
        
        if not skip_feature_setup:
            self.feature_type_setup()

        table_name = self.attrs['name']

        yield "drop table if exists clinical_%s;" % ( table_name )

        yield """
CREATE TABLE clinical_%s (
\tsampleID int,
\tsampleName ENUM ('%s')""" % ( table_name, "','".join(sortedSamples(self.row_hash.keys())) )

        for col in self.col_order:
            if ( self.enum_map.has_key( col ) ):
                yield ",\n\t`%s` ENUM( '%s' ) default NULL" % (col.strip(), "','".join( sql_fix(a) for a in sorted(self.enum_map[ col ].keys(), lambda x,y: self.enum_map[col][x]-self.enum_map[col][y]) ) )
            else:
                yield ",\n\t`%s` FLOAT default NULL" % (col.strip())
        yield """
    ) engine 'MyISAM';
    """

        for target in sortedSamples(self.row_hash.keys()):
            a = []
            for col in self.orig_order:
                val = self.row_hash[ target ][ self.col_list[ col ] ]
                if val is None or val == "null" or len(val) == 0 :
                    a.append("\\N")
                else:
                    a.append( "'" + sql_fix( val.encode('string_escape') ) + "'" )
            yield u"INSERT INTO clinical_%s VALUES ( %d, '%s', %s );\n" % ( table_name, id_table.get( table_name + ':sample_id', target ), sql_fix(target), u",".join(a) )


        yield "drop table if exists clinical_%s_colDb;" % ( table_name )
        yield CREATE_COL_DB % ( "clinical_" + table_name + "_colDb" )

        yield "INSERT INTO clinical_%s_colDb(name, shortLabel,longLabel,valField,clinicalTable,filterType,visibility,priority) VALUES( '%s', '%s', '%s', '%s', '%s', '%s', 'on',1);\n" % \
                ( table_name, 'sampleName', 'sample name', 'sample name', 'sampleName', "clinical_" + table_name, 'coded' )

        i = 0;
        for name in self.col_order:
            filter = 'coded' if self.enum_map.has_key(name) else 'minMax'
            yield "INSERT INTO clinical_%s_colDb(name, shortLabel,longLabel,valField,clinicalTable,filterType,visibility,priority) VALUES( '%s', '%s', '%s', '%s', '%s', '%s', '%s',1);\n" % \
                    ( table_name, name, name, name, name, "clinical_" + table_name, filter, 'on' if i < 10 else 'off')
            i += 1
示例#9
0
    def gen_sql_heatmap(self, id_table):
        #scan the children
        # XXX Handling of sql for children is broken if the child may appear
        # as part of multiple merge objects, such as TrackGenomic and TrackClinical.
        # A disgusting workaround for clinicalMatrix is to prevent the TrackGenomic from calling
        # it for gen_sql.
        clinical = self.members.pop("clinicalMatrix")
        for line in CGData.CGMergeObject.sql_pass(self, id_table, method="heatmap"):
            yield line
        self.members["clinicalMatrix"] = clinical

        gmatrix = self.members[ 'genomicMatrix' ]
        pmap = self.members[ 'probeMap' ].get( assembly="hg18" ) # BUG: hard coded to only producing HG18 tables
        if pmap is None:
            CGData.error("Missing HG18 %s" % ( self.members[ 'probeMap'].get_name() ))
            return
        
        table_base = self.get_name()
        CGData.log("Writing Track %s" % (table_base))
        
        clinical_table_base =  self.members[ "clinicalMatrix" ].get_name()

        yield "INSERT into raDb( name, sampleTable, clinicalTable, columnTable, aliasTable, shortLabel, longLabel, expCount, dataType, platform, profile, security) VALUES ( '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%d', '%s', '%s', '%s', '%s');\n" % \
            ( "genomic_" + table_base, "sample_" + table_base,
                "clinical_" + clinical_table_base, "clinical_" + clinical_table_base + "_colDb",
                "genomic_" + table_base + "_alias",
                sql_fix(gmatrix.attrs['shortTitle']),
                sql_fix(gmatrix.attrs['longTitle']),
                len(gmatrix.get_sample_list()),
                self.format,
                gmatrix.attrs[':dataSubType'],
                'localDb',
                'public',
                )
        
        # write out the sample table
        yield "drop table if exists sample_%s;" % ( table_base )
        yield """
CREATE TABLE sample_%s (
    id           int,
    sampleName   varchar(255)
) engine 'MyISAM';
""" % ( table_base )

        from CGData.ClinicalMatrix import sortedSamples
        for sample in sortedSamples(gmatrix.get_sample_list()):
	    yield "INSERT INTO sample_%s VALUES( %d, '%s' );\n" % ( table_base, id_table.get( clinical_table_base + ':sample_id', sample), sample )

        
        yield "drop table if exists genomic_%s_alias;" % ( table_base )
        yield """
CREATE TABLE genomic_%s_alias (
    name        varchar(255),
    alias         varchar(255)
) engine 'MyISAM';
""" % ( table_base )

        for pset in pmap:
            for probe in pset:
                for alias in probe.aliases:
                    yield "insert into genomic_%s_alias( name, alias ) values( '%s', '%s' );\n" % (table_base, sql_fix(probe.name), sql_fix(alias))

        # write out the BED table
        yield "drop table if exists %s;" % ( "genomic_" + table_base )
        yield CREATE_BED % ( "genomic_" + table_base + "_tmp")
        
        sample_ids = []
        samples = gmatrix.get_sample_list()

        # sort samples by sample_id, and retain the sort order for application to the genomic data, below
        tmp=sorted(zip(samples, range(len(samples))), cmp=lambda x,y: id_table.get(clinical_table_base + ':sample_id', x[0]) - id_table.get( clinical_table_base + ':sample_id', y[0]))
        samples, order = map(lambda t: list(t), zip(*tmp))

        for sample in samples:
            sample_ids.append( str( id_table.get( clinical_table_base + ':sample_id', sample ) ) )
        
        exp_ids = ','.join( sample_ids )
        missingProbeCount = 0
        for probe_name in gmatrix.get_probe_list():
            # get the genomic data and rearrange to match the sample_id order
            tmp = gmatrix.get_row_vals( probe_name )
            row = map(lambda i: tmp[order[i]], range(len(tmp)))

            pset = pmap.get( probe_name )
            if pset is not None:
                for probe in pset:
                    istr = "insert into %s(chrom, chromStart, chromEnd, strand,  name, expCount, expIds, expScores) values ( '%s', '%s', '%s', '%s', '%s', '%s', '%s', %s );\n" % \
                            ( "genomic_%s_tmp" % (table_base), probe.chrom, probe.chrom_start, probe.chrom_end, probe.strand, sql_fix(probe_name), len(sample_ids), exp_ids, self.scores(row) )
                    yield istr
            else:
                missingProbeCount += 1
        yield "create table genomic_%s like genomic_%s_tmp;" % (table_base, table_base)
        yield "insert into genomic_%s select * from genomic_%s_tmp order by chrom, chromStart;" % (table_base, table_base)
        yield "drop table genomic_%s_tmp;" % table_base
        CGData.log("%s Missing probes %d" % (table_base, missingProbeCount))
示例#10
0
    def gen_sql_heatmap(self, id_table, features=None):
        """Yield the SQL that loads this clinical matrix into the shared tables.

        Produces, in order: cleanup of earlier rows for this table, one
        ``colDb`` row per column (its id captured in ``@col<i>``), one
        ``codes`` row per enumerated state (id captured in
        ``@val<i>_<j>``), the ``clinical_<name>`` table definition and one
        INSERT per sample.  Enumerated cells reference the ``@val``
        variables; other cells are written as-is, null-like cells as ``\\N``.

        id_table -- object whose ``get('<name>:sample_id', sample)`` returns
                    the numeric id of a sample.
        features -- optional mapping of column name to display metadata
                    (``shortTitle``, ``longTitle``, ``visibility``,
                    ``priority``; each a sequence whose first item is used).
                    When omitted, ``feature_type_setup`` is run first.  The
                    mapping passed in is not modified.
        """
        CGData.log("Writing Clinical %s SQL" % (self['name']))

        if features is None:
            self.feature_type_setup()
            features = {}
        else:
            # Work on a shallow copy so the synthetic 'sampleName' entry
            # added below does not leak into the caller's mapping.
            features = dict(features)

        features['sampleName'] = {
            'shortTitle': ['Sample name'],
            'longTitle': ['Sample name'],
            'visibility': ['on'],
            'priority': [1]
        }

        table_name = self['name']
        clinical_table = 'clinical_' + table_name

        yield "DROP TABLE IF EXISTS %s;\n" % (clinical_table)
        yield "DELETE codes FROM codes, colDb WHERE codes.feature = colDb.id AND colDb.clinicalTable = '%s';\n" % clinical_table
        yield "DELETE FROM colDb WHERE clinicalTable = '%s';\n" % clinical_table

        # colDb
        for i, name in enumerate(self.col_order):
            meta = features.get(name, {})
            short_label = meta['shortTitle'][0] if 'shortTitle' in meta else name
            long_label = meta['longTitle'][0] if 'longTitle' in meta else name
            filter_type = 'coded' if name in self.enum_map else 'minMax'
            if 'visibility' in meta:
                visibility = meta['visibility'][0]
            else:
                # only the first ten columns are visible by default
                visibility = 'on' if i < 10 else 'off'
            priority = float(meta['priority'][0]) if 'priority' in meta else 1
            yield "INSERT INTO colDb(name, shortLabel,longLabel,valField,clinicalTable,filterType,visibility,priority) VALUES( '%s', '%s', '%s', '%s', '%s', '%s', '%s', %f);" % \
                (sql_fix(name), sql_fix(short_label), sql_fix(long_label), sql_fix(name), clinical_table, filter_type, visibility, priority)
            yield "SET @col%d=LAST_INSERT_ID();\n" % i

        # codes
        values = {}
        for i, col in enumerate(self.col_order):
            if col not in self.enum_map:
                continue
            codes = self.enum_map[col]
            values[col] = {}
            # emit the states ordered by their assigned code
            ordered = sorted(codes.keys(), key=lambda state: codes[state])
            for j, state in enumerate(ordered):
                yield "INSERT INTO codes(feature,ordering,value) VALUES (@col%d, %d, '%s'); SET @val%d_%d=LAST_INSERT_ID();\n" % (
                    i, j, sql_fix(state), i, j)
                values[col][state] = "@val%d_%d" % (i, j)

        yield "CREATE TABLE %s (sampleID INT NOT NULL UNIQUE" % clinical_table

        for col in self.col_order:
            if col == 'sampleName':
                yield ",\n\tsampleName INT UNSIGNED NOT NULL UNIQUE"
            elif col in self.enum_map:
                yield ",\n\t`%s` INT UNSIGNED DEFAULT NULL" % (col.strip())
            else:
                yield ",\n\t`%s` FLOAT DEFAULT NULL" % (col.strip())
        yield """
    ) engine 'MyISAM';
    """

        for target in sortedSamples(self.row_hash.keys()):
            cells = []
            for col, orig in zip(self.col_order, self.orig_order):
                if col == 'sampleName':
                    val = target
                else:
                    val = self.row_hash[target][self.col_list[orig]]
                if val is None or val.upper() in NULL_VALUES:
                    cells.append("\\N")
                elif col in self.enum_map:
                    # enumerated cells point at the matching codes row
                    cells.append(values[col][val])
                else:
                    cells.append(val)
            yield u"INSERT INTO %s VALUES ( %d, %s );\n" % (
                clinical_table, id_table.get(table_name + ':sample_id',
                                             target), u",".join(cells))
示例#11
0
    def gen_sql(self, id_table, skip_feature_setup=False):
        CGData.log( "Gen %s SQL" % (self.attrs['name']))
        
        if not skip_feature_setup:
            self.feature_type_setup()

        table_name = self.attrs['name']

        yield "drop table if exists clinical_%s;" % ( table_name )

        yield """
CREATE TABLE clinical_%s (
\tsampleID int,
\tsampleName ENUM ('%s')""" % ( table_name, "','".join(sortedSamples(self.row_hash.keys())) )

        for col in self.col_order:
            if ( self.enum_map.has_key( col ) ):
                yield ",\n\t`%s` ENUM( '%s' ) default NULL" % (col.strip(), "','".join( sql_fix(a) for a in self.enum_map[ col ].keys() ) )
            else:
                yield ",\n\t`%s` FLOAT default NULL" % (col.strip())
        yield """
    ) engine 'MyISAM';
    """

        for target in sortedSamples(self.row_hash.keys()):
            a = []
            for col in self.orig_order:
                val = self.row_hash[ target ][ self.col_list[ col ] ]
                #print target, col, val
                if val is None or val == "null" or len(val) == 0 :
                    a.append("\\N")
                else:
                    #a.append( "'" + sql_fix(val) + "'" )
                    a.append( "'" + sql_fix( val.encode('string_escape') ) + "'" )
            yield u"INSERT INTO clinical_%s VALUES ( %d, '%s', %s );\n" % ( table_name, id_table.get( 'sample_id', target ), sql_fix(target), u",".join(a) )
            #yield u"INSERT INTO clinical_%s VALUES ( %d, %s );\n" % ( table_name, id_table.get( 'sample_id', target ), u",".join(a) )


        yield "drop table if exists clinical_%s_colDb;" % ( table_name )
        yield CREATE_COL_DB % ( "clinical_" + table_name + "_colDb" ) 
        """
`id` int(10) unsigned NOT NULL default '0',
`name` varchar(255) default NULL,
`shortLabel` varchar(255) default NULL,
`longLabel` varchar(255) default NULL,
`valField` varchar(255) default NULL,
`clinicalTable` varchar(255) default NULL,
`priority` float default NULL,
`filterType` varchar(255) default NULL,
`visibility` varchar(255) default NULL,
`groupName` varchar(255) default NULL,
PRIMARY KEY  (`id`),
KEY `name` (`name`)
"""
        yield "INSERT INTO clinical_%s_colDb(name, shortLabel,longLabel,valField,clinicalTable,filterType,visibility,priority) VALUES( '%s', '%s', '%s', '%s', '%s', '%s', 'on',1);\n" % \
                ( table_name, 'sampleName', 'sample name', 'sample name', 'sampleName', "clinical_" + table_name, 'coded' )

        i = 0;
        for name in self.col_order:
            filter = 'coded' if self.enum_map.has_key(name) else 'minMax'
            yield "INSERT INTO clinical_%s_colDb(name, shortLabel,longLabel,valField,clinicalTable,filterType,visibility,priority) VALUES( '%s', '%s', '%s', '%s', '%s', '%s', '%s',1);\n" % \
                    ( table_name, name, name, name, name, "clinical_" + table_name, filter, 'on' if i < 10 else 'off')
            i += 1
示例#12
0
    def gen_sql_heatmap(self, id_table, features=None):
        """Yield the SQL that loads this clinical matrix into the shared tables.

        Produces, in order: cleanup of earlier rows for this table, one
        ``colDb`` row per column (its id captured in ``@col<i>``), one
        ``codes`` row per enumerated state (id captured in
        ``@val<i>_<j>``), the ``clinical_<name>`` table definition and one
        INSERT per sample.  Enumerated cells reference the ``@val``
        variables; other cells are written as-is, null-like cells as ``\\N``.
        Dots in the matrix name are replaced by underscores for the table
        name.

        id_table -- object whose ``get('<name>:sample_id', sample)`` returns
                    the numeric id of a sample.
        features -- optional mapping of column name to display metadata
                    (``shortTitle``, ``longTitle``, ``visibility``,
                    ``priority``; each a sequence whose first item is used).
                    When omitted, ``feature_type_setup`` is run first.  The
                    mapping passed in is not modified.
        """
        CGData.log("Writing Clinical %s SQL" % (self['name']))

        if features is None:
            self.feature_type_setup()
            features = {}
        else:
            # Work on a shallow copy so the synthetic 'sampleName' entry
            # added below does not leak into the caller's mapping.
            features = dict(features)

        features['sampleName'] = {
            'shortTitle': ['Sample name'],
            'longTitle': ['Sample name'],
            'visibility': ['on'],
            'priority': [100.0],
        }

        table_name = self['name'].replace(".", "_")
        clinical_table = 'clinical_' + table_name

        yield "DROP TABLE IF EXISTS %s;\n" % (clinical_table)
        yield "DELETE codes FROM codes, colDb WHERE codes.feature = colDb.id AND colDb.clinicalTable = '%s';\n" % clinical_table
        yield "DELETE FROM colDb WHERE clinicalTable = '%s';\n" % clinical_table

        # colDb
        for i, name in enumerate(self.col_order):
            meta = features.get(name, {})
            short_label = meta['shortTitle'][0] if 'shortTitle' in meta else name
            long_label = meta['longTitle'][0] if 'longTitle' in meta else name
            filter_type = 'coded' if name in self.enum_map else 'minMax'
            if 'visibility' in meta:
                visibility = meta['visibility'][0]
            else:
                # only the first ten columns are visible by default
                visibility = 'on' if i < 10 else 'off'
            priority = float(meta['priority'][0]) if 'priority' in meta else 1
            yield "INSERT INTO colDb(name, shortLabel,longLabel,valField,clinicalTable,filterType,visibility,priority) VALUES( '%s', '%s', '%s', '%s', '%s', '%s', '%s', %f);" % \
                (sql_fix(name), sql_fix(short_label), sql_fix(long_label), sql_fix(name), clinical_table, filter_type, visibility, priority)
            yield "SET @col%d=LAST_INSERT_ID();\n" % i

        # codes
        values = {}
        for i, col in enumerate(self.col_order):
            if col not in self.enum_map:
                continue
            codes = self.enum_map[col]
            values[col] = {}
            # emit the states ordered by their assigned code
            ordered = sorted(codes.keys(), key=lambda state: codes[state])
            for j, state in enumerate(ordered):
                yield "INSERT INTO codes(feature,ordering,value) VALUES (@col%d, %d, '%s'); SET @val%d_%d=LAST_INSERT_ID();\n" % (i, j, sql_fix(state), i, j)
                values[col][state] = "@val%d_%d" % (i, j)

        yield "CREATE TABLE %s (sampleID INT NOT NULL UNIQUE" % clinical_table

        for col in self.col_order:
            if col == 'sampleName':
                yield ",\n\tsampleName INT UNSIGNED NOT NULL UNIQUE"
            elif col in self.enum_map:
                yield ",\n\t`%s` INT UNSIGNED DEFAULT NULL" % (col.strip())
            else:
                yield ",\n\t`%s` FLOAT DEFAULT NULL" % (col.strip())
        yield """
    ) engine 'MyISAM';
    """

        for target in sortedSamples(self.row_hash.keys()):
            cells = []
            for col, orig in zip(self.col_order, self.orig_order):
                if col == 'sampleName':
                    val = target
                else:
                    val = self.row_hash[target][self.col_list[orig]]
                if val is None or val.upper() in NULL_VALUES:
                    cells.append("\\N")
                elif col in self.enum_map:
                    # enumerated cells point at the matching codes row
                    cells.append(values[col][val])
                else:
                    cells.append(val)
            yield u"INSERT INTO %s VALUES ( %d, %s );\n" % (clinical_table, id_table.get(table_name + ':sample_id', target), u",".join(cells))
示例#13
0
    def gen_sql_heatmap(self, id_table):
        # scan the children
        # XXX Handling of sql for children is broken if the child may appear
        # as part of multiple merge objects, such as TrackGenomic and TrackClinical.
        # A disgusting workaround for clinicalMatrix is to prevent the TrackGenomic from calling
        # it for gen_sql.
        clinical = self.members.pop("clinicalMatrix")
        for line in CGData.CGMergeObject.sql_pass(self, id_table, method="heatmap"):
            yield line
        self.members["clinicalMatrix"] = clinical

        gmatrix = self.members["genomicMatrix"]
        pmap = self.members["probeMap"].lookup(assembly="hg18")  # BUG: hard coded to only producing HG18 tables
        if pmap is None:
            CGData.error("Missing HG18 %s" % (self.members["probeMap"].get_name()))
            return

        table_base = self.get_name()
        CGData.log("Writing Track %s" % (table_base))

        clinical_table_base = self.members["clinicalMatrix"].get_name()

        other = {}
        for attr in ["wrangler", "wrangling_procedure", "url", "citation", "description"]:
            if attr in gmatrix:
                other[attr] = gmatrix[attr]
        if "dataProducer" in gmatrix:
            other["author_list"] = gmatrix["dataProducer"]
        if "articleTitle" in gmatrix:
            other["article_title"] = gmatrix["articleTitle"]

        other["version"] = gmatrix.get("version", "")
        datetime.datetime.strptime(
            other["version"], "%Y-%m-%d"
        )  # if the version isn't properly formatted, though exception

        if "owner" in gmatrix:
            other["owner"] = gmatrix["owner"]
        other["colNormalization"] = gmatrix.get("colNormalization", False)
        if not isinstance(other["colNormalization"], bool):
            other["colNormalization"] = False
        other["redistribution"] = gmatrix.get("redistribution", False)
        if not isinstance(other["redistribution"], bool):
            other["redistribution"] = False
        other["security"] = gmatrix.get("security", "public")
        if other["security"] not in ["public", "private"]:
            other["security"] = "public"

        yield "DELETE from raDb where name = '%s';\n" % ("genomic_" + table_base)
        yield "INSERT into raDb( name, sampleTable, clinicalTable, columnTable, aliasTable, shortLabel, longLabel, expCount, dataType, platform, profile, security, priority, gain, groupName, wrangler, url, article_title, citation, author_list, wrangling_procedure, other) VALUES ( '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%d', '%s', '%s', '%s', '%s', %f, %f, '%s', %s, %s, %s, %s, %s, %s, '%s');\n" % (
            "genomic_" + table_base,
            "sample_" + table_base,
            "clinical_" + clinical_table_base,
            "colDb",
            "genomic_" + table_base + "_alias",
            sql_fix(gmatrix["shortTitle"]),
            sql_fix(gmatrix["longTitle"]),
            len(gmatrix.get_sample_list()),
            self.format,
            dataSubTypeMap[gmatrix[":dataSubType"]]
            if gmatrix[":dataSubType"] in dataSubTypeMap
            else gmatrix[":dataSubType"],
            "localDb",
            "public",
            float(gmatrix.get("priority", 1.0)),
            float(gmatrix.get("gain", 1.0)),
            sql_fix(gmatrix.get("groupTitle", "Misc.")),
            "'%s'" % sql_fix(gmatrix["wrangler"]) if "wrangler" in gmatrix else "\N",
            "'%s'" % sql_fix(gmatrix["url"]) if "url" in gmatrix else "\N",
            "'%s'" % sql_fix(gmatrix["articleTitle"]) if "articleTitle" in gmatrix else "\N",
            "'%s'" % sql_fix(gmatrix["citation"]) if "citation" in gmatrix else "\N",
            "'%s'" % sql_fix(gmatrix["dataProducer"]) if "dataProducer" in gmatrix else "\N",
            "'%s'" % sql_fix(gmatrix["wrangling_procedure"]) if "wrangling_procedure" in gmatrix else "\N",
            sql_fix(json.dumps(other)),
        )

        # write out the sample table
        yield "drop table if exists sample_%s;" % (table_base)
        yield """
CREATE TABLE sample_%s (
    id           int,
    sampleName   varchar(255)
) engine 'MyISAM';
""" % (
            table_base
        )

        from CGData.ClinicalMatrix import sortedSamples

        for sample in sortedSamples(gmatrix.get_sample_list()):
            yield "INSERT INTO sample_%s VALUES( %d, '%s' );\n" % (
                table_base,
                id_table.get(clinical_table_base + ":sample_id", sample),
                sql_fix(sample),
            )

        yield "drop table if exists genomic_%s_alias;" % (table_base)
        yield """
CREATE TABLE genomic_%s_alias (
    name        varchar(255),
    alias         varchar(255)
) engine 'MyISAM';
""" % (
            table_base
        )

        for probe in pmap.get_probes():
            for alias in probe.aliases:
                yield "insert into genomic_%s_alias( name, alias ) values( '%s', '%s' );\n" % (
                    table_base,
                    sql_fix(probe.name),
                    sql_fix(alias),
                )

        # write out the BED table
        yield "drop table if exists %s;" % ("genomic_" + table_base)
        yield CREATE_BED % ("genomic_" + table_base + "_tmp")

        sample_ids = []
        samples = gmatrix.get_sample_list()

        # sort samples by sample_id, and retain the sort order for application to the genomic data, below
        tmp = sorted(
            zip(samples, range(len(samples))),
            cmp=lambda x, y: id_table.get(clinical_table_base + ":sample_id", x[0])
            - id_table.get(clinical_table_base + ":sample_id", y[0]),
        )
        samples, order = map(lambda t: list(t), zip(*tmp))

        for sample in samples:
            sample_ids.append(str(id_table.get(clinical_table_base + ":sample_id", sample)))

        exp_ids = ",".join(sample_ids)
        missingProbeCount = 0
        for probe_name in gmatrix.get_probe_list():
            # get the genomic data and rearrange to match the sample_id order
            tmp = gmatrix.get_row_vals(probe_name)
            row = map(lambda i: tmp[order[i]], range(len(tmp)))

            pset = pmap.lookup(probe_name)
            if pset is not None:
                for probe in pset:
                    istr = (
                        "insert into %s(chrom, chromStart, chromEnd, strand,  name, expCount, expIds, expScores) values ( '%s', '%s', '%s', '%s', '%s', '%s', '%s', %s );\n"
                        % (
                            "genomic_%s_tmp" % (table_base),
                            probe.chrom,
                            probe.chrom_start - 1,
                            probe.chrom_end,
                            probe.strand,
                            sql_fix(probe_name),
                            len(sample_ids),
                            exp_ids,
                            self.scores(row),
                        )
                    )
                    yield istr
            else:
                missingProbeCount += 1
        yield "# sort file by chrom position\n"
        yield "create table genomic_%s like genomic_%s_tmp;\n" % (table_base, table_base)
        yield "insert into genomic_%s select * from genomic_%s_tmp order by chrom, chromStart;\n" % (
            table_base,
            table_base,
        )
        yield "drop table genomic_%s_tmp;\n" % table_base
        CGData.log("%s Missing probes %d" % (table_base, missingProbeCount))
示例#14
0
    def gen_sql_heatmap(self, id_table):
        #scan the children
        # XXX Handling of sql for children is broken if the child may appear
        # as part of multiple merge objects, such as TrackGenomic and TrackClinical.
        # A disgusting workaround for clinicalMatrix is to prevent the TrackGenomic from calling
        # it for gen_sql.
        clinical = self.members.pop("clinicalMatrix")
        for line in CGData.CGMergeObject.sql_pass(self, id_table, method="heatmap"):
            yield line
        self.members["clinicalMatrix"] = clinical

        gmatrix = self.members[ 'genomicMatrix' ]
        pmap = self.members[ 'probeMap' ].lookup( assembly="hg18" ) # BUG: hard coded to only producing HG18 tables
        if pmap is None:
            CGData.error("Missing HG18 %s" % ( self.members[ 'probeMap'].get_name() ))
            return
        
        table_base = self.get_name()
        CGData.log("Writing Track %s" % (table_base))
        
        clinical_table_base =  self.members[ "clinicalMatrix" ].get_name()

        other = {}
        for attr in ['wrangler', 'wrangling_procedure', 'url', 'citation', 'description']:
            if attr in gmatrix:
                other[attr] = gmatrix[attr]
        if 'dataProducer' in gmatrix:
            other['author_list'] = gmatrix['dataProducer']
        if 'articleTitle' in gmatrix:
            other['article_title'] = gmatrix['articleTitle']
        
        other['version'] = gmatrix.get('version', "")
        datetime.datetime.strptime(other['version'], "%Y-%m-%d") #if the version isn't properly formatted, though exception
        
        if 'owner' in gmatrix:
            other['owner'] = gmatrix['owner']
        other['colNormalization'] = gmatrix.get('colNormalization', False)
        if not isinstance(other['colNormalization'], bool):
            other['colNormalization']  = False
        other['redistribution'] = gmatrix.get('redistribution', False)
        if not isinstance(other['redistribution'], bool):
            other['redistribution']  = False
        other['security'] = gmatrix.get('security', "public")
        if other['security'] not in [ "public", "private" ]:
            other['security'] = "public"

        yield "DELETE from raDb where name = '%s';\n" % ("genomic_" + table_base)
        yield "INSERT into raDb( name, sampleTable, clinicalTable, columnTable, aliasTable, shortLabel, longLabel, expCount, dataType, platform, profile, security, priority, gain, groupName, wrangler, url, article_title, citation, author_list, wrangling_procedure, other) VALUES ( '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%d', '%s', '%s', '%s', '%s', %f, %f, '%s', %s, %s, %s, %s, %s, %s, '%s');\n" % \
            ( "genomic_" + table_base, "sample_" + table_base,
                "clinical_" + clinical_table_base, "colDb",
                "genomic_" + table_base + "_alias",
                sql_fix(gmatrix['shortTitle']),
                sql_fix(gmatrix['longTitle']),
                len(gmatrix.get_sample_list()),
                self.format,
                dataSubTypeMap[gmatrix[':dataSubType']] if gmatrix[':dataSubType'] in dataSubTypeMap else gmatrix[':dataSubType'],
                'localDb',
                'public',
                float(gmatrix.get('priority', 1.0)),
                float(gmatrix.get('gain', 1.0)),
                sql_fix(gmatrix.get('groupTitle', 'Misc.')),
                "'%s'"%sql_fix(gmatrix['wrangler']) if 'wrangler' in gmatrix else '\N',
                "'%s'"%sql_fix(gmatrix['url']) if 'url' in gmatrix else '\N',
                "'%s'"%sql_fix(gmatrix['articleTitle']) if 'articleTitle' in gmatrix else '\N',
                "'%s'"%sql_fix(gmatrix['citation']) if 'citation' in gmatrix else '\N',
                "'%s'"%sql_fix(gmatrix['dataProducer']) if 'dataProducer' in gmatrix else '\N',
                "'%s'"%sql_fix(gmatrix['wrangling_procedure']) if 'wrangling_procedure' in gmatrix else '\N',
                sql_fix(json.dumps(other)),
                )
        
        # write out the sample table
        yield "drop table if exists sample_%s;" % ( table_base )
        yield """
CREATE TABLE sample_%s (
    id           int,
    sampleName   varchar(255)
) engine 'MyISAM';
""" % ( table_base )

        from CGData.ClinicalMatrix import sortedSamples
        for sample in sortedSamples(gmatrix.get_sample_list()):
            yield "INSERT INTO sample_%s VALUES( %d, '%s' );\n" % ( table_base, id_table.get( clinical_table_base + ':sample_id', sample), sql_fix(sample) )

        
        yield "drop table if exists genomic_%s_alias;" % ( table_base )
        yield """
CREATE TABLE genomic_%s_alias (
    name        varchar(255),
    alias         varchar(255)
) engine 'MyISAM';
""" % ( table_base )

        for probe in pmap.get_probes():
            for alias in probe.aliases:
                yield "insert into genomic_%s_alias( name, alias ) values( '%s', '%s' );\n" % (table_base, sql_fix(probe.name), sql_fix(alias))

        # write out the BED table
        yield "drop table if exists %s;" % ( "genomic_" + table_base )
        yield CREATE_BED % ( "genomic_" + table_base + "_tmp")
        
        sample_ids = []
        samples = gmatrix.get_sample_list()

        # sort samples by sample_id, and retain the sort order for application to the genomic data, below
        tmp=sorted(zip(samples, range(len(samples))), cmp=lambda x,y: id_table.get(clinical_table_base + ':sample_id', x[0]) - id_table.get( clinical_table_base + ':sample_id', y[0]))
        samples, order = map(lambda t: list(t), zip(*tmp))

        for sample in samples:
            sample_ids.append( str( id_table.get( clinical_table_base + ':sample_id', sample ) ) )
        
        exp_ids = ','.join( sample_ids )
        missingProbeCount = 0
        for probe_name in gmatrix.get_probe_list():
            # get the genomic data and rearrange to match the sample_id order
            tmp = gmatrix.get_row_vals( probe_name )
            row = map(lambda i: tmp[order[i]], range(len(tmp)))

            pset = pmap.lookup( probe_name )
            if pset is not None:
                for probe in pset:
                    istr = "insert into %s(chrom, chromStart, chromEnd, strand,  name, expCount, expIds, expScores) values ( '%s', '%s', '%s', '%s', '%s', '%s', '%s', %s );\n" % \
                            ( "genomic_%s_tmp" % (table_base), probe.chrom, probe.chrom_start-1, probe.chrom_end, probe.strand, sql_fix(probe_name), len(sample_ids), exp_ids, self.scores(row) )
                    yield istr
            else:
                missingProbeCount += 1
        yield "# sort file by chrom position\n"
        yield "create table genomic_%s like genomic_%s_tmp;\n" % (table_base, table_base)
        yield "insert into genomic_%s select * from genomic_%s_tmp order by chrom, chromStart;\n" % (table_base, table_base)
        yield "drop table genomic_%s_tmp;\n" % table_base
        CGData.log("%s Missing probes %d" % (table_base, missingProbeCount))
示例#15
0
    def gen_sql_heatmap(self, id_table, opts):
        """Yield the SQL statements that load this genomic track for the heatmap.

        Statements are produced in this order: whatever the member objects
        emit through ``sql_pass`` (with ``clinicalMatrix`` temporarily
        hidden), the ``raDb`` registration row, the ``sample_<name>`` table,
        the ``genomic_<name>_alias`` table, and finally the binned
        ``genomic_<name>`` table, which is filled through a ``_tmp`` table and
        then copied in ``chrom, chromStart`` order. Dots in the track and
        clinical matrix names are replaced by underscores in table names.

        If the probe map has no ``hg18`` assembly, an error is logged and the
        generator stops after the child pass.

        Args:
            id_table: object whose ``get(table, key)`` returns the id of a
                sample; ids are looked up under
                ``<clinicalMatrix name>:sample_id``.
            opts: mapping of options. A truthy ``'save-ds'`` carries the
                existing ``downSampleTable`` value of the raDb row over to
                the new row; a truthy ``'no-genomic-matrix'`` stops after the
                raDb row is written.

        Yields:
            str: one SQL statement (or SQL comment line) at a time.

        Raises:
            ValueError: if the selected ``version`` is not formatted as
                ``YYYY-MM-DD``.
        """
        # XXX Handling of sql for children is broken if the child may appear
        # as part of multiple merge objects, such as TrackGenomic and
        # TrackClinical. The workaround for clinicalMatrix is to hide it from
        # the child pass so TrackGenomic does not emit its SQL.
        clinical = self.members.pop("clinicalMatrix")
        try:
            for line in CGData.CGMergeObject.sql_pass(self, id_table, method="heatmap"):
                yield line
        finally:
            # Put the member back even if the child pass raises or the
            # consumer abandons this generator part-way through.
            self.members["clinicalMatrix"] = clinical

        gmatrix = self.members['genomicMatrix']
        # BUG: hard coded to only producing HG18 tables
        pmap = self.members['probeMap'].lookup(assembly="hg18")
        if pmap is None:
            CGData.error("Missing HG18 %s" % (self.members['probeMap'].get_name()))
            return

        savedownsample = 'save-ds' in opts and opts['save-ds']

        table_base = self.get_name().replace(".", "_")
        CGData.log("Writing Track %s" % (table_base))

        clinical_table_base = self.members["clinicalMatrix"].get_name().replace(".", "_")
        sample_key = clinical_table_base + ':sample_id'

        # Free-form metadata that is stored as a JSON blob in raDb.other.
        other = {}
        for attr in ['wrangler', 'wrangling_procedure', 'url', 'citation', 'description']:
            if attr in gmatrix:
                other[attr] = gmatrix[attr]
        if 'dataProducer' in gmatrix:
            other['author_list'] = gmatrix['dataProducer']
        if 'articleTitle' in gmatrix:
            other['article_title'] = gmatrix['articleTitle']

        # Use the later of the genomic and clinical versions; the genomic
        # one is used when the clinical version does not yield a date.
        cVersion = self.members['clinicalMatrix'].get('version', "")
        gVersion = self.members['genomicMatrix'].get('version', "")
        dG = makeDate(gVersion)
        dC = makeDate(cVersion)
        # NOTE(review): presumably makeDate returns None for an unparsable
        # string; dG is not checked before the comparison below - confirm.
        if dC is None:
            other['version'] = gVersion
        elif dG < dC:
            other['version'] = cVersion
        else:
            other['version'] = gVersion
        # Validation only: raises ValueError when the version is malformed.
        datetime.datetime.strptime(other['version'], "%Y-%m-%d")

        if 'owner' in gmatrix:
            other['owner'] = gmatrix['owner']
        # Anything that is not a real boolean is treated as False.
        for flag in ('colNormalization', 'redistribution'):
            value = gmatrix.get(flag, False)
            other[flag] = value if isinstance(value, bool) else False
        security = gmatrix.get('security', "public")
        if security not in ["public", "private"]:
            security = "public"

        # Backslash-N is written unquoted in place of a missing value
        # (MySQL reads it as NULL). The backslash is escaped so the literal
        # is the same two characters on every Python version.
        sql_null = '\\N'

        def quoted_or_null(key):
            # Quoted, escaped metadata value, or the NULL marker when absent.
            return "'%s'" % sql_fix(gmatrix[key]) if key in gmatrix else sql_null

        sub_type = gmatrix[':dataSubType']
        platform = dataSubTypeMap[sub_type] if sub_type in dataSubTypeMap else sub_type

        if savedownsample:
            # Remember the current downSampleTable before the row is deleted.
            yield "SET @ds=(SELECT downSampleTable FROM raDb WHERE name = '%s');\n" % ("genomic_" + table_base)
        yield "DELETE from raDb where name = '%s';\n" % ("genomic_" + table_base)
        yield "INSERT into raDb( name, downSampleTable, sampleTable, clinicalTable, columnTable, aliasTable, shortLabel, longLabel, expCount, dataType, platform, profile, security, priority, gain, groupName, wrangler, url, article_title, citation, author_list, wrangling_procedure, other) VALUES ( '%s', %s, '%s', '%s', '%s', '%s', '%s', '%s', '%d', '%s', '%s', '%s', '%s', %f, %f, '%s', %s, %s, %s, %s, %s, %s, '%s');\n" % (
            "genomic_" + table_base,
            "@ds" if savedownsample else "NULL",
            "sample_" + table_base,
            "clinical_" + clinical_table_base,
            "colDb",
            "genomic_" + table_base + "_alias",
            sql_fix(gmatrix['shortTitle']),
            sql_fix(gmatrix['longTitle']),
            len(gmatrix.get_sample_list()),
            self.format,
            platform,
            'localDb',
            security,
            float(gmatrix.get('priority', 1.0)),
            float(gmatrix.get('gain', 1.0)),
            sql_fix(gmatrix.get('groupTitle', 'Misc.')),
            quoted_or_null('wrangler'),
            quoted_or_null('url'),
            quoted_or_null('articleTitle'),
            quoted_or_null('citation'),
            quoted_or_null('dataProducer'),
            quoted_or_null('wrangling_procedure'),
            sql_fix(json.dumps(other)),
        )

        if 'no-genomic-matrix' in opts and opts['no-genomic-matrix']:
            return

        # write out the sample table
        yield "drop table if exists sample_%s;" % (table_base)
        yield """
        CREATE TABLE sample_%s (
        id           int,
        sampleName   varchar(255)
        ) engine 'MyISAM';
        """ % (table_base)

        from CGData.ClinicalMatrix import sortedSamples
        for sample in sortedSamples(gmatrix.get_sample_list()):
            yield "INSERT INTO sample_%s VALUES( %d, '%s' );\n" % (
                table_base, id_table.get(sample_key, sample), sql_fix(sample))

        # write out the alias table
        yield "drop table if exists genomic_%s_alias;" % (table_base)
        yield """
        CREATE TABLE genomic_%s_alias (
        name        varchar(255),
        alias         varchar(255)
        ) engine 'MyISAM';
        """ % (table_base)

        for probe in pmap.get_probes():
            for alias in probe.aliases:
                yield "insert into genomic_%s_alias( name, alias ) values( '%s', '%s' );\n" % (
                    table_base, sql_fix(probe.name), sql_fix(alias))

        # write out the BED table
        yield "drop table if exists %s;" % ("genomic_" + table_base)
        yield CREATE_BED % ("genomic_" + table_base + "_tmp")

        # Sort the matrix columns by sample id. ``order[k]`` is the original
        # column index of the k-th smallest id; the sort is stable, so equal
        # ids keep their matrix order. Works for an empty sample list too.
        samples = gmatrix.get_sample_list()
        ids = [id_table.get(sample_key, name) for name in samples]
        order = sorted(range(len(ids)), key=ids.__getitem__)
        sample_ids = [str(ids[pos]) for pos in order]

        exp_ids = ','.join(sample_ids)
        missingProbeCount = 0
        for probe_name in gmatrix.get_probe_list():
            # get the genomic data and rearrange to match the sample_id order
            vals = gmatrix.get_row_vals(probe_name)
            row = [vals[order[i]] for i in range(len(vals))]

            pset = pmap.lookup(probe_name)
            if pset is None:
                missingProbeCount += 1
                continue
            for probe in pset:
                # NOTE(review): the bin is computed from chrom_start while
                # chromStart is written as chrom_start - 1 - confirm that
                # Binner.calcBin expects this coordinate.
                yield "insert into %s(bin, chrom, chromStart, chromEnd, strand,  name, expCount, expIds, expScores) values ( %d, '%s', '%s', '%s', '%s', '%s', '%s', '%s', %s );\n" % (
                    "genomic_%s_tmp" % (table_base),
                    Binner.calcBin(probe.chrom_start, probe.chrom_end),
                    probe.chrom,
                    probe.chrom_start - 1,
                    probe.chrom_end,
                    probe.strand,
                    sql_fix(probe_name),
                    len(sample_ids),
                    exp_ids,
                    self.scores(row),
                )

        yield "# sort file by chrom position\n"
        yield "create table genomic_%s like genomic_%s_tmp;\n" % (table_base, table_base)
        yield "insert into genomic_%s(bin, chrom, chromStart, chromEnd, strand,  name, expCount, expIds, expScores) select bin, chrom, chromStart, chromEnd, strand,  name, expCount, expIds, expScores from genomic_%s_tmp order by chrom, chromStart;\n" % (table_base, table_base)
        yield "drop table genomic_%s_tmp;\n" % table_base
        CGData.log("%s Missing probes %d" % (table_base, missingProbeCount))