Example #1
 def get_datasetdata_column_fieldinformation(): # TODO: is this cached by Python?
     """
     returns a list of [('table.field', fieldinformation )]
     """
     # TODO: only include cell and protein fields when they are present in the dataset
     # NOTE: column-name case is lost when cursor.description is populated,
     # so use underscores here and apply the camel-casing later
     
     table_fields = ['datarecord.id',
                     'dataset.facility_id',
                     'smallmolecule.facility_id', 
                     'smallmolecule.salt_id', 
                     'smallmoleculebatch.facility_batch_id', 
                     'smallmolecule.lincs_id',
                     'smallmolecule.name',
                     'cell.name',
                     'cell.cl_id',
                     'cell.facility_id',
                     'protein.name',
                     'protein.lincs_id',
                     'datarecord.plate',
                     'datarecord.well',
                     'datarecord.control_type',
                     'datacolumn.name',
                     'datacolumn.unit',  ]
     tablefields_fi = []
     for tablefield in table_fields:
         table, field = tablefield.split('.')
         # get_fieldinformation takes the field name and a list of table scopes
         tablefields_fi.append((tablefield, get_fieldinformation(field, [table])))

     return tablefields_fi
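
As a rough usage sketch (get_camel_case_dwg_name() is the accessor the other examples call on these fieldinformation objects), the returned pairs can be collapsed into a name mapping:

    # Hypothetical usage: map raw 'table.field' names to camel-cased DWG names.
    name_map = dict(
        (tablefield, fi.get_camel_case_dwg_name())
        for tablefield, fi in DataSetDataResource.get_datasetdata_column_fieldinformation())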
Example #2
 def get_datasetdata_camel_case_columns(dataset_id):
     camel_columns = [x[1].get_camel_case_dwg_name() 
                      for x in DataSetDataResource.get_datasetdata_column_fieldinformation()]
     timepoint_columns = DataSetDataResource.get_dataset_columns(dataset_id, ['day','hour','minute','second'])
     for i in range(len(timepoint_columns)):
         # Label these manually: "timepoint" is an alias, not a real DataColumn
         suffix = '_%d' % i if i > 0 else ''
         camel_columns.append('timepoint' + suffix)
         camel_columns.append('timepoint_unit' + suffix)
         camel_columns.append('timepoint_description' + suffix)
     datapoint_value_fi = get_fieldinformation('datapoint_value', ['']) 
     camel_columns.append(datapoint_value_fi.get_camel_case_dwg_name())
     return camel_columns
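
To make the suffixing rule concrete, here is a standalone check (illustrative only) of what the loop appends for a dataset with two timepoint columns:

    # Standalone illustration of the suffix rule used above.
    def timepoint_labels(n):
        labels = []
        for i in range(n):
            suffix = '_%d' % i if i > 0 else ''
            labels.extend(['timepoint' + suffix,
                           'timepoint_unit' + suffix,
                           'timepoint_description' + suffix])
        return labels

    print(timepoint_labels(2))
    # ['timepoint', 'timepoint_unit', 'timepoint_description',
    #  'timepoint_1', 'timepoint_unit_1', 'timepoint_description_1']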
Example #3
    def build_schema(self):
        schema = super(DataSetResource, self).build_schema()
        original_dict = schema['fields'] # TODO: reincorporate this default information about the DB schema definition
        fields = get_detail_schema(DataSet(), 'dataset', lambda x: x.show_in_detail)
        # Custom fields for SAF. TODO: generate these names from the fieldinformation
        fields['datapointFile'] = get_schema_fieldinformation('datapoint_file', '')
        fields['safVersion'] = get_schema_fieldinformation('saf_version', '')
        fields['screeningFacility'] = get_schema_fieldinformation('screening_facility', '')
        schema['fields'] = OrderedDict(sorted(fields.items(), key=lambda x: x[0])) # sort alphabetically; TODO: sort on fi.order
        
        ds_fieldinformation = DataSetDataResource.get_datasetdata_column_fieldinformation()
        for name in ('datapoint_value', 'timepoint', 'timepoint_unit', 'timepoint_description'):
            ds_fieldinformation.append((name, get_fieldinformation(name, [''])))
        
        meta_field_info = get_listing(FieldInformation(),['fieldinformation'])
    
        fields = {}
        for _, fi in ds_fieldinformation:
            field_schema_info = {}
            for meta_fi_attr, meta in meta_field_info.items():
                meta_fi = meta['fieldinformation']
                field_schema_info[meta_fi.get_camel_case_dwg_name()] = getattr(fi, meta_fi_attr)
            fields[fi.get_camel_case_dwg_name()] = field_schema_info
        schema['datasetDataFile'] = OrderedDict(sorted(fields.items(), key=lambda x: x[0])) # sort alphabetically; TODO: sort on fi.order

        dc_fieldinformation = FieldInformation.objects.filter(table='datacolumn', show_in_detail=True)
        datapoint_fields = {}
        for fi in dc_fieldinformation:
            field_schema_info = {}
            for meta_fi_attr, meta in meta_field_info.items():
                meta_fi = meta['fieldinformation']
                field_schema_info[meta_fi.get_camel_case_dwg_name()] = getattr(fi, meta_fi_attr)
            datapoint_fields[fi.get_camel_case_dwg_name()] = field_schema_info
        schema['datapointInformation'] = OrderedDict(sorted(datapoint_fields.items(), key=lambda x: x[0])) # sort alphabetically; TODO: sort on fi.order
        
        return schema 
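
The inner loop that builds field_schema_info appears twice here and again in Example #4. As a sketch (assuming only the meta_field_info shape already visible above), it could be extracted into a helper; schema_info_for is a hypothetical name, not in the source:

    def schema_info_for(fi, meta_field_info):
        # Mirror of the inner loop above: build {camelCasedMetaName: value}
        # for one fieldinformation object.
        info = {}
        for meta_fi_attr, meta in meta_field_info.items():
            meta_fi = meta['fieldinformation']
            info[meta_fi.get_camel_case_dwg_name()] = getattr(fi, meta_fi_attr)
        return info

Each of the three loops would then reduce to a comprehension over schema_info_for.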
Example #4
    def build_schema(self):
        schema = super(DataSetDataResource, self).build_schema()
        original_dict = schema['fields'] # TODO: reincorporate this default information about the DB schema definition

        ds_fieldinformation = DataSetDataResource.get_datasetdata_column_fieldinformation()
        for name in ('datapoint_value', 'timepoint', 'timepoint_unit', 'timepoint_description'):
            ds_fieldinformation.append((name, get_fieldinformation(name, [''])))
        
        meta_field_info = get_listing(FieldInformation(),['fieldinformation'])
    
        fields = {}
        for _, fi in ds_fieldinformation:
            field_schema_info = {}
            for meta_fi_attr, meta in meta_field_info.items():
                meta_fi = meta['fieldinformation']
                field_schema_info[meta_fi.get_camel_case_dwg_name()] = getattr(fi, meta_fi_attr)
            fields[fi.get_camel_case_dwg_name()] = field_schema_info
        schema['fields'] = OrderedDict(sorted(fields.items(), key=lambda x: x[0])) # TODO: use the fieldinformation order
        
        return schema 
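
The trailing TODO asks for fieldinformation order rather than alphabetical order. A minimal sketch, assuming FieldInformation exposes the order attribute the comments refer to and reusing the hypothetical schema_info_for helper from Example #3:

    from collections import OrderedDict

    # Hypothetical: sort schema entries by fi.order instead of alphabetically.
    schema['fields'] = OrderedDict(
        (fi.get_camel_case_dwg_name(), schema_info_for(fi, meta_field_info))
        for _, fi in sorted(ds_fieldinformation, key=lambda pair: pair[1].order))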
Example #5
    def get_datasetdata_cursor(dataset_id):
        timepoint_columns = DataSetDataResource.get_dataset_columns(dataset_id, ['day','hour','minute','second'])
        logger.info(str(('timepoint_columns', timepoint_columns)))
        datapoint_columns = DataSetDataResource.get_dataset_columns(dataset_id)
        datapoint_columns = [col for col in datapoint_columns if col not in timepoint_columns]
        
        # pivot out the timepoint columns only
        timepoint_column_string = ''
        for i, dc in enumerate(timepoint_columns):
            alias = "dp_" + str(i)

            suffix = '_%d' % i if i > 0 else ''
            tp_name = "timepoint" + suffix
            tp_unit_name = "timepoint_unit" + suffix
            tp_desc_name = "timepoint_description" + suffix

            # NOTE: timepoint values are usually text, but check the declared type to be safe
            if dc.data_type in ('Numeric', 'omero_image'):
                if dc.precision == 0 or dc.data_type == 'omero_image':
                    column_to_select = "int_value"
                else:
                    # TODO: specify the precision in the fieldinformation for this column
                    column_to_select = "round( float_value::numeric, 2 )"
            else:
                column_to_select = "text_value"
            timepoint_column_string += (
                ",(SELECT " + column_to_select + " FROM db_datapoint " + alias +
                " where " + alias + ".datacolumn_id=" + str(dc.id) +
                " and " + alias + ".datarecord_id=datarecord.id) as " + tp_name)
            # NOTE: unit and description are interpolated literally into the SQL;
            # they come from the datacolumn row and are assumed to contain no quotes
            timepoint_column_string += ", '" + dc.unit + "' as " + tp_unit_name
            if dc.description:
                timepoint_column_string += ", '" + dc.description + "' as " + tp_desc_name

        sql = "select "
        # patterns to match the protein and cell fields, which will be handled differently below,
        # because they can be linked through the DataRecord or the DataColumn
        protein_pattern = re.compile(r'protein.(.*)') 
        cell_pattern = re.compile(r'cell.(.*)')
        meta_columns_to_fieldinformation = DataSetDataResource.get_datasetdata_column_fieldinformation()
        for i, (tablefield, fi) in enumerate(meta_columns_to_fieldinformation):
            if i != 0:
                sql += ', \n'
            # NOTE: due to an updated requirement, proteins and cells may be linked
            # to either datarecords or datacolumns, so the following ugliness ensues
            m = protein_pattern.match(tablefield)
            m2 = cell_pattern.match(tablefield)
            if m:
                sql += ('(select p.' + m.group(1) +
                    ' from db_protein p where p.id in (datacolumn.protein_id,datarecord.protein_id))' +
                    ' as "' + fi.get_camel_case_dwg_name() + '"')
            elif m2:
                sql += ('(select c.' + m2.group(1) +
                    ' from db_cell c where c.id in (datacolumn.cell_id,datarecord.cell_id))' +
                    ' as "' + fi.get_camel_case_dwg_name() + '"')
            else:
                # TODO: this information is parsed when deserializing, to create the camel-cased name
                sql += tablefield + ' as "' + fi.get_camel_case_dwg_name() + '"'
        datapoint_value_fi = get_fieldinformation('datapoint_value', [''])
        sql += ', coalesce(dp.int_value::TEXT, dp.float_value::TEXT, dp.text_value) as "' + datapoint_value_fi.get_camel_case_dwg_name() + '"\n'
            
        sql += timepoint_column_string
        
        # NOTE: the older simple left joins to proteins and cells:
        #     left join db_protein protein on (datarecord.protein_id=protein.id)
        #     left join db_cell cell on (datarecord.cell_id=cell.id)
        # have been replaced with:
        #     left join ( select datarecord.id as dr_id, * from db_protein p where p.id in (datarecord.protein_id,datacolumn.protein_id)) protein on(dr_id=datarecord.id)
        sql += """
            from db_dataset dataset 
            join db_datarecord datarecord on(datarecord.dataset_id=dataset.id) 
            join db_datacolumn datacolumn on(datacolumn.dataset_id=dataset.id) 
            left join db_smallmolecule smallmolecule on (datarecord.smallmolecule_id=smallmolecule.id) 
            left join db_smallmoleculebatch smallmoleculebatch 
                on(smallmoleculebatch.smallmolecule_id=smallmolecule.id 
                    and smallmoleculebatch.facility_batch_id=datarecord.batch_id)
            , db_datapoint dp 
            where dp.datarecord_id=datarecord.id and dp.datacolumn_id=datacolumn.id 
            and dataset.id = %s 
            """
        if len(timepoint_columns) > 0: 
            sql += " and datacolumn.id not in (" + ','.join([str(col.id) for col in timepoint_columns]) + ") "    
        sql += " order by datarecord.id, datacolumn.id "
        
        logger.info(str(('sql',sql)))
        cursor = connection.cursor()
        cursor.execute(sql, [dataset_id])
        return cursor
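
Consuming the returned cursor is plain DB-API; a minimal sketch, assuming get_datasetdata_cursor is reachable as the static-style call the other examples use:

    cursor = DataSetDataResource.get_datasetdata_cursor(dataset_id)
    columns = [col[0] for col in cursor.description]  # the aliases assigned in the SQL above
    for row in cursor.fetchall():
        record = dict(zip(columns, row))
        # each record maps a column alias to the value for one datapoint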