Exemplo n.º 1
0
    def get_datasetdata_cursor(dataset_id):
        
        dataset = DataSet.objects.get(id=dataset_id)
        
        timepoint_columns = ( 
            DataColumn.objects.all()
                .filter(dataset_id=dataset_id)
                .filter(unit__in=['day','hour','minute','second']) )
        
        reagent_columns = ( DataColumn.objects.filter(dataset_id=dataset_id)
            .filter(data_type__in=[
                'small_molecule','cell','primary_cell','protein','antibody',
                'otherreagent']).order_by('display_order') )
        
        dc_ids_to_exclude = [dc.id for dc in timepoint_columns]
        dc_ids_to_exclude.extend([dc.id for dc in reagent_columns])
        
        col_query_string = (
            ', (SELECT '
            ' {column_to_select} '
            ' FROM db_datapoint as {alias} '
            ' WHERE {alias}.datacolumn_id={dc_id} '
            ' AND {alias}.datarecord_id=datarecord.id ) as "{column_name}" '
            )
        timepoint_unit_string = ',$${dc_unit}$$ as "{dc_name}_timepointUnit" '
        timepoint_description_string = (
            ',$${dc_description}$$ as "{dc_name}_timepointDescription" ')
        
        query_string = (
            'WITH drs as ('
            ' SELECT '
            ' datarecord.id as "datarecordID"'
            ', dataset.facility_id as "hmsDatasetID"'
            ', datarecord.plate as "recordedPlate"'
            ', datarecord.well as "recordedWell"'
            ', datarecord.control_type as "controlType"'
            )        
        alias_count = 0
        for dc in timepoint_columns:
            column_to_select = None
            alias_count += 1
            alias = 'dp_%d'%alias_count
            column_name = '%s_timepoint' % camel_case_dwg(dc.name)
            if(dc.data_type == 'Numeric' or dc.data_type == 'omero_image'):
                if dc.precision == 0 or dc.data_type == 'omero_image':
                    column_to_select = "int_value"
                else:
                    column_to_select = "round( float_value::numeric, 2 )"
            else:
                column_to_select = "text_value"
            query_string += col_query_string.format(
                column_to_select=column_to_select,
                alias=alias,dc_id=dc.id,column_name=column_name)
            query_string += timepoint_unit_string.format(
                dc_unit=dc.unit, dc_name=camel_case_dwg(dc.name))

            if dc.description:
                query_string += timepoint_description_string.format(
                    dc_description=dc.description, 
                    dc_name=camel_case_dwg(dc.name))

        reagent_id_query = (
             ', (SELECT r.facility_id '
             ' FROM db_reagent r '
             ' JOIN db_reagentbatch rb on rb.reagent_id=r.id '
             ' JOIN db_datapoint {alias} on rb.id={alias}.reagent_batch_id '
             ' WHERE {alias}.datarecord_id=datarecord.id '
             ' AND {alias}.datacolumn_id={dc_id} ) as "{column_name}" '
            )
        sm_salt_query = (
             ', (SELECT r.salt_id'
             ' FROM db_reagent r '
             ' JOIN db_reagentbatch rb on rb.reagent_id=r.id '
             ' JOIN db_datapoint {alias} on rb.id={alias}.reagent_batch_id '
             ' WHERE {alias}.datarecord_id=datarecord.id '
             ' AND {alias}.datacolumn_id={dc_id} ) as "{column_name}" '
            )
        reagent_lincs_id_query = (
             ', (SELECT r.lincs_id'
             ' FROM db_reagent r '
             ' JOIN db_reagentbatch rb on rb.reagent_id=r.id '
             ' JOIN db_datapoint {alias} on rb.id={alias}.reagent_batch_id '
             ' WHERE {alias}.datarecord_id=datarecord.id '
             ' AND {alias}.datacolumn_id={dc_id} ) as "{column_name}" '
            )
        reagent_batch_id_query = (
             ', (SELECT '
             " CASE WHEN rb.batch_id = '0' THEN '' ELSE rb.batch_id END"
             ' FROM db_reagent r '
             ' JOIN db_reagentbatch rb on rb.reagent_id=r.id '
             ' JOIN db_datapoint {alias} on rb.id={alias}.reagent_batch_id '
             ' WHERE {alias}.datarecord_id=datarecord.id '
             ' AND {alias}.datacolumn_id={dc_id} ) as "{column_name}" '
            )
        reagent_name_query = (
             ', ( SELECT r.name '
             ' FROM db_reagent r '
             ' JOIN db_reagentbatch rb on rb.reagent_id=r.id '
             ' JOIN db_datapoint {alias} on rb.id={alias}.reagent_batch_id '
             ' WHERE {alias}.datarecord_id=datarecord.id '
             ' AND {alias}.datacolumn_id={dc_id}  ) as "{column_name}" '
            )
        
        prefixes = { 'small_molecule': 'sm', 'protein': 'pp', 'antibody': 'ab',
            'otherreagent': 'or', 'cell': 'cl', 'primary_cell': 'pc'}
        for dc in reagent_columns:
            prefix = prefixes[dc.data_type]
            if dc.data_type == 'small_molecule':
                alias_count += 1
                alias = 'dp_%d' % alias_count
                query_string += sm_salt_query.format(
                    alias=alias,
                    dc_id=dc.id,
                    column_name='%s_%s%s' 
                        % (camel_case_dwg(dc.name),prefix,'Salt'))

                alias_count += 1
                alias = 'dp_%d' % alias_count
                query_string += reagent_id_query.format(
                    alias=alias,
                    dc_id=dc.id,
                    column_name='%s_%s%s' 
                        % (camel_case_dwg(dc.name),prefix,'CenterCompoundID'))

                alias_count += 1
                alias = 'dp_%d' % alias_count
                query_string += reagent_lincs_id_query.format(
                    alias=alias,
                    dc_id=dc.id,
                    column_name='%s_%s%s' 
                        % (camel_case_dwg(dc.name),prefix,'LincsID'))
            
            else:
                alias_count += 1
                alias = 'dp_%d' % alias_count
                query_string += reagent_id_query.format(
                    alias=alias,
                    dc_id=dc.id,
                    column_name='%s_%s%s' 
                        % (camel_case_dwg(dc.name),prefix,'CenterCanonicalID'))
                alias_count += 1
                alias = 'dp_%d' % alias_count
                # Note: remove condition when new protein specification is released
                if dc.data_type != 'protein':
                    query_string += reagent_lincs_id_query.format(
                        alias=alias,
                        dc_id=dc.id,
                        column_name='%s_%s%s' 
                            % (camel_case_dwg(dc.name),prefix,'LincsID'))
                else:
                    query_string += (", '' as %s_%s%s" 
                        % (camel_case_dwg(dc.name),prefix,'LincsID'))
            
            alias_count += 1
            alias = 'dp_%d' % alias_count
            query_string += reagent_batch_id_query.format(
                alias=alias,
                dc_id=dc.id,
                column_name='%s_%s%s' 
                    % (camel_case_dwg(dc.name),prefix,'CenterBatchID'))
                
            alias_count += 1
            alias = 'dp_%d' % alias_count
            query_string += reagent_name_query.format(
                alias=alias,
                dc_id=dc.id,
                column_name='%s_%s%s' 
                    % (camel_case_dwg(dc.name),prefix,'Name'))
                
        query_string += """
            from db_dataset dataset 
            join db_datarecord datarecord on(datarecord.dataset_id=dataset.id) 
            and dataset.id = %s )  
            """

        query_string += 'SELECT drs.* '
        query_string += ', dc.name as "datapointName" '
        query_string += ', dc.unit as "datapointUnit" '
        query_string += (
            ', coalesce(dp.int_value::TEXT'
            ', dp.float_value::TEXT'
            ', dp.text_value) as "%s" '
                % 'datapointValue' )
        query_string += """ 
            FROM drs 
            JOIN db_datapoint dp on dp.datarecord_id=drs."datarecordID" 
            join db_datacolumn dc on dp.datacolumn_id=dc.id  
            where dp.dataset_id = %s """
        if dc_ids_to_exclude: 
            query_string += ( 
                " and dp.datacolumn_id not in (%s)" 
                    % ','.join([str(x) for x in dc_ids_to_exclude]))    
        query_string +=' order by "datarecordID",dc.id '
        
        logger.info('query_string: %s' % query_string)
        cursor = connection.cursor()
        cursor.execute(query_string, [dataset_id,dataset_id])
        return cursor
Exemplo n.º 2
0
    def get_datasetdata_cursor(dataset_id):
        
        dataset = DataSet.objects.get(id=dataset_id)
        
        timepoint_columns = ( 
            DataColumn.objects.all()
                .filter(dataset_id=dataset_id)
                .filter(unit__in=['day','hour','minute','second']) )
        
        reagent_columns = ( DataColumn.objects.filter(dataset_id=dataset_id)
            .filter(data_type__in=[
                'small_molecule','cell','protein','antibody','otherreagent']) )
        
        dc_ids_to_exclude = [dc.id for dc in timepoint_columns]
        dc_ids_to_exclude.extend([dc.id for dc in reagent_columns])
        
        col_query_string = (
            ', (SELECT '
            ' {column_to_select} '
            ' FROM db_datapoint as {alias} '
            ' WHERE {alias}.datacolumn_id={dc_id} '
            ' AND {alias}.datarecord_id=datarecord.id ) as "{column_name}" '
            )
        timepoint_unit_string = ',$${dc_unit}$$ as "{dc_name}_timepointUnit" '
        timepoint_description_string = (
            ',$${dc_description}$$ as "{dc_name}_timepointDescription" ')
        
        query_string = (
            'WITH drs as ('
            ' SELECT '
            ' datarecord.id as "datarecordID"'
            ', dataset.facility_id as "hmsDatasetID"'
            ', datarecord.plate as "recordedPlate"'
            ', datarecord.well as "recordedWell"'
            ', datarecord.control_type as "controlType"'
            )        
        alias_count = 0
        for dc in timepoint_columns:
            column_to_select = None
            alias_count += 1
            alias = 'dp_%d'%alias_count
            column_name = '%s_timepoint' % camel_case_dwg(dc.name)
            if(dc.data_type == 'Numeric' or dc.data_type == 'omero_image'):
                if dc.precision == 0 or dc.data_type == 'omero_image':
                    column_to_select = "int_value"
                else:
                    column_to_select = "round( float_value::numeric, 2 )"
            else:
                column_to_select = "text_value"
            query_string += col_query_string.format(
                column_to_select=column_to_select,
                alias=alias,dc_id=dc.id,column_name=column_name)
            query_string += timepoint_unit_string.format(
                dc_unit=dc.unit, dc_name=camel_case_dwg(dc.name))

            if dc.description:
                query_string += timepoint_description_string.format(
                    dc_description=dc.description, 
                    dc_name=camel_case_dwg(dc.name))

        reagent_id_query = (
             ', (SELECT r.facility_id '
             ' FROM db_reagent r '
             ' JOIN db_reagentbatch rb on rb.reagent_id=r.id '
             ' JOIN db_datapoint {alias} on rb.id={alias}.reagent_batch_id '
             ' WHERE {alias}.datarecord_id=datarecord.id '
             ' AND {alias}.datacolumn_id={dc_id} ) as "{column_name}" '
            )
        sm_salt_query = (
             ', (SELECT r.salt_id'
             ' FROM db_reagent r '
             ' JOIN db_reagentbatch rb on rb.reagent_id=r.id '
             ' JOIN db_datapoint {alias} on rb.id={alias}.reagent_batch_id '
             ' WHERE {alias}.datarecord_id=datarecord.id '
             ' AND {alias}.datacolumn_id={dc_id} ) as "{column_name}" '
            )
        sm_lincs_id_query = (
             ', (SELECT r.lincs_id'
             ' FROM db_reagent r '
             ' JOIN db_reagentbatch rb on rb.reagent_id=r.id '
             ' JOIN db_datapoint {alias} on rb.id={alias}.reagent_batch_id '
             ' WHERE {alias}.datarecord_id=datarecord.id '
             ' AND {alias}.datacolumn_id={dc_id} ) as "{column_name}" '
            )
        reagent_batch_id_query = (
             ', (SELECT '
             " CASE WHEN rb.batch_id = '0' THEN '' ELSE rb.batch_id END"
             ' FROM db_reagent r '
             ' JOIN db_reagentbatch rb on rb.reagent_id=r.id '
             ' JOIN db_datapoint {alias} on rb.id={alias}.reagent_batch_id '
             ' WHERE {alias}.datarecord_id=datarecord.id '
             ' AND {alias}.datacolumn_id={dc_id} ) as "{column_name}" '
            )
        reagent_name_query = (
             ', ( SELECT r.name '
             ' FROM db_reagent r '
             ' JOIN db_reagentbatch rb on rb.reagent_id=r.id '
             ' JOIN db_datapoint {alias} on rb.id={alias}.reagent_batch_id '
             ' WHERE {alias}.datarecord_id=datarecord.id '
             ' AND {alias}.datacolumn_id={dc_id}  ) as "{column_name}" '
            )
        
        prefixes = { 'small_molecule': 'sm', 'protein': 'pp', 'antibody': 'ab',
            'otherreagent': 'or', 'cell': 'cl'}
        for dc in reagent_columns:
            prefix = prefixes[dc.data_type]
            if dc.data_type == 'small_molecule':
                alias_count += 1
                alias = 'dp_%d' % alias_count
                query_string += sm_salt_query.format(
                    alias=alias,
                    dc_id=dc.id,
                    column_name='%s_%s%s' 
                        % (camel_case_dwg(dc.name),prefix,'Salt'))

                alias_count += 1
                alias = 'dp_%d' % alias_count
                query_string += reagent_id_query.format(
                    alias=alias,
                    dc_id=dc.id,
                    column_name='%s_%s%s' 
                        % (camel_case_dwg(dc.name),prefix,'CenterCompoundID'))

                alias_count += 1
                alias = 'dp_%d' % alias_count
                query_string += sm_lincs_id_query.format(
                    alias=alias,
                    dc_id=dc.id,
                    column_name='%s_%s%s' 
                        % (camel_case_dwg(dc.name),prefix,'LincsID'))
            
            else:
                alias_count += 1
                alias = 'dp_%d' % alias_count
                query_string += reagent_id_query.format(
                    alias=alias,
                    dc_id=dc.id,
                    column_name='%s_%s%s' 
                        % (camel_case_dwg(dc.name),prefix,'CenterSpecificID'))
            
            alias_count += 1
            alias = 'dp_%d' % alias_count
            query_string += reagent_batch_id_query.format(
                alias=alias,
                dc_id=dc.id,
                column_name='%s_%s%s' 
                    % (camel_case_dwg(dc.name),prefix,'CenterSampleID'))
                
            alias_count += 1
            alias = 'dp_%d' % alias_count
            query_string += reagent_name_query.format(
                alias=alias,
                dc_id=dc.id,
                column_name='%s_%s%s' 
                    % (camel_case_dwg(dc.name),prefix,'Name'))
                
        query_string += """
            from db_dataset dataset 
            join db_datarecord datarecord on(datarecord.dataset_id=dataset.id) 
            and dataset.id = %s )  
            """

        query_string += 'SELECT drs.* '
        query_string += ', dc.name as "datapointName" '
        query_string += ', dc.unit as "datapointUnit" '
        query_string += (
            ', coalesce(dp.int_value::TEXT, dp.float_value::TEXT, dp.text_value) as "%s" '
                % 'datapointValue' )
        query_string += """ 
            FROM drs 
            JOIN db_datapoint dp on dp.datarecord_id=drs."datarecordID" 
            join db_datacolumn dc on dp.datacolumn_id=dc.id  
            where dp.dataset_id = %s """
        if dc_ids_to_exclude: 
            query_string += ( 
                " and dp.datacolumn_id not in (%s)" 
                    % ','.join([str(x) for x in dc_ids_to_exclude]))    
        query_string +=' order by "datarecordID",dc.id '
        
        logger.info('query_string: %s' % query_string)
        cursor = connection.cursor()
        cursor.execute(query_string, [dataset_id,dataset_id])
        return cursor
Exemplo n.º 3
0
                
                final_val = util.convertdata(
                    val,type_lookup.get(recognized_label, None)) 
                
                if final_val != None:
                    dc_dict[recognized_label] = final_val
                    if recognized_label == 'display_order':
                        # add 10 to the order, so default reagent cols can go first
                        dc_dict['display_order'] = (
                            dc_dict['display_order'] + 10)
                    if recognized_label == 'name':
                        # split on non-alphanumeric chars
                        temp = re.split(r'[^a-zA-Z0-9]+',dc_dict['name'])
                        # convert, if needed
                        if len(temp) > 1:
                            dc_dict['name'] = camel_case_dwg(dc_dict['name'])
                else:
                    if recognized_label in required_labels:
                        raise Exception(
                            'Error, data column field is required: %s, col: %r'
                                % ( recognized_label, colname(j+1) ) )
        else:
            logger.debug(
                'unrecognized label in "Data Columns" sheet %r' % label_read)
    
    for dc_dict in dc_definitions:
        for label in required_labels:
            if label not in dc_dict:
                raise Exception(
                    'required "Data Column" label not defined %r' % label)
Exemplo n.º 4
0
def read_datacolumns(book):
    '''
    @return an array of data column definition dicts 
    '''

    data_column_sheet = book.sheet_by_name('Data Columns')

    labels = {
        'Worksheet Column': 'worksheet_column',
        '"Data" Worksheet Column': 'worksheet_column',
        'Display Order': 'display_order',
        'Display Name': 'display_name',
        'Name': 'name',
        'Data Type': 'data_type',
        'Decimal Places': 'precision',
        'Description': 'description',
        'Replicate Number': 'replicate',
        'Unit': 'unit',
        'Assay readout type': 'readout_type',
        'Comments': 'comments',
    }

    dc_definitions = []
    datacolumn_fields = util.get_fields(DataColumn)
    type_lookup = dict((f.name, iu.totype(f)) for f in datacolumn_fields)
    logger.debug('datacolumn type lookups: %s' % type_lookup)
    required_labels = ['name', 'data_type']

    logger.info('read the data column definitions...')
    for i in xrange(data_column_sheet.nrows):
        row_values = data_column_sheet.row_values(i)

        if i == 0:
            for val in row_values[1:]:
                dc_definitions.append({})

        label_read = row_values[0]

        recognized_label = next(
            (field_name for label, field_name in labels.items()
             if label_read and label.lower() == label_read.lower()), None)

        if recognized_label:

            logger.debug('label: %r, recognized_label: %r' %
                         (label_read, recognized_label))

            for j, val in enumerate(row_values[1:]):
                dc_dict = dc_definitions[j]

                logger.debug('data column %s:%d:%d:%r' %
                             (recognized_label, i, j, val))

                final_val = util.convertdata(
                    val, type_lookup.get(recognized_label, None))

                if final_val != None:
                    dc_dict[recognized_label] = final_val
                    if recognized_label == 'display_order':
                        # add 10 to the order, so default reagent cols can go first
                        dc_dict['display_order'] = (dc_dict['display_order'] +
                                                    10)
                    if recognized_label == 'name':
                        # split on non-alphanumeric chars
                        temp = re.split(r'[^a-zA-Z0-9]+', dc_dict['name'])
                        # convert, if needed
                        if len(temp) > 1:
                            dc_dict['name'] = camel_case_dwg(dc_dict['name'])
                else:
                    if recognized_label in required_labels:
                        raise Exception(
                            'Error, data column field is required: %s, col: %r'
                            % (recognized_label, colname(j + 1)))
        else:
            logger.debug('unrecognized label in "Data Columns" sheet %r' %
                         label_read)

    for dc_dict in dc_definitions:
        for label in required_labels:
            if label not in dc_dict:
                raise Exception('required "Data Column" label not defined %r' %
                                label)

    logger.info('find the data columns on the "Data" sheet...')

    data_sheet = book.sheet_by_name('Data')
    data_sheet_labels = data_sheet.row_values(0)
    dc_definitions_found = []
    data_labels_found = []
    for i, data_label in enumerate(data_sheet_labels):

        if not data_label or not data_label.strip():
            logger.info('break on data sheet col %d, blank' % i)
            break

        data_label = data_label.upper()
        col_letter = colname(i)

        for dc_dict in dc_definitions:
            _dict = None
            if 'worksheet_column' in dc_dict:

                v = dc_dict['worksheet_column']
                if v.upper() == col_letter:
                    data_labels_found.append(i)
                    dc_definitions_found.append(dc_dict)
                    _dict = dc_dict

            elif 'name' in dc_dict or 'display_name' in dc_dict:

                if (dc_dict.get('name', '').upper() == data_label or
                        dc_dict.get('display_name', '').upper() == data_label):

                    dc_dict['worksheet_column'] = col_letter
                    data_labels_found.append(i)
                    dc_definitions_found.append(dc_dict)
                    _dict = dc_dict

            if _dict and 'display_order' not in _dict:

                _dict['display_order'] = i + 10
                logger.warn('auto assigning "display_order" for col %r as %d' %
                            (_dict['name'], i + 10))

        if i not in data_labels_found:

            logger.debug(('Data sheet label not found %r,'
                          ' looking in default reagent definitions %s') %
                         (data_label, default_reagent_columns.keys()))

            for key, dc_dict in default_reagent_columns.items():
                if (key.upper() == data_label
                        or dc_dict.get('name', '').upper() == data_label or
                        dc_dict.get('display_name', '').upper() == data_label):

                    dc_dict['worksheet_column'] = col_letter
                    data_labels_found.append(i)
                    dc_definitions_found.append(dc_dict)

    data_labels_not_found = [
        data_label for i, data_label in enumerate(data_sheet_labels)
        if data_label and data_label.strip() and i not in data_labels_found
        and data_label not in meta_columns
    ]
    if data_labels_not_found:
        logger.warn('data sheet labels not recognized %s' %
                    data_labels_not_found)

    # for legacy datasets: make sure the small molecule column 1 is always created
    small_mol_col = None
    for dc_dict in dc_definitions_found:
        if dc_dict['data_type'] == 'small_molecule':
            small_mol_col = dc_dict
            break
    if not small_mol_col:
        dc_definitions_found.append(
            default_reagent_columns['Small Molecule Batch'])

    logger.info('data column definitions found: %s' %
                [x['display_name'] for x in dc_definitions_found])

    return dc_definitions_found
Exemplo n.º 5
0
def read_datacolumns(book):
    '''
    @return an array of data column definition dicts 
    '''
    
    data_column_sheet = book.sheet_by_name('Data Columns')
    
    labels = {
        'Worksheet Column':'worksheet_column',
        '"Data" Worksheet Column':'worksheet_column',
        'Display Order':'display_order',
        'Display Name':'display_name',
        'Name':'name',
        'Data Type':'data_type',
        'Decimal Places':'precision',
        'Description':'description',
        'Replicate Number':'replicate',
        'Unit':'unit', 
        'Assay readout type':'readout_type',
        'Comments':'comments',
    }

    dc_definitions = []
    datacolumn_fields = util.get_fields(DataColumn)
    type_lookup = dict((f.name, iu.totype(f)) for f in datacolumn_fields)
    logger.debug('datacolumn type lookups: %s' % type_lookup)
    required_labels = ['name', 'data_type']

    logger.info('read the data column definitions...')
    for i in xrange(data_column_sheet.nrows):
        row_values = data_column_sheet.row_values(i)
        
        if i == 0:
            for val in row_values[1:]:
                dc_definitions.append({})
        
        label_read = row_values[0]
        
        recognized_label = next(
            (field_name for label, field_name in labels.items() 
                if label_read and label.lower() == label_read.lower() ), None)
        
        if recognized_label:
            
            logger.debug(
                'label: %r, recognized_label: %r' % (label_read, recognized_label))
            
            for j,val in enumerate(row_values[1:]):
                dc_dict = dc_definitions[j]

                logger.debug('data column %s:%d:%d:%r' 
                    % ( recognized_label, i, j, val))
                
                final_val = util.convertdata(
                    val,type_lookup.get(recognized_label, None)) 
                
                if final_val != None:
                    dc_dict[recognized_label] = final_val
                    if recognized_label == 'display_order':
                        # add 10 to the order, so default reagent cols can go first
                        dc_dict['display_order'] = (
                            dc_dict['display_order'] + 10)
                    if recognized_label == 'name':
                        # split on non-alphanumeric chars
                        temp = re.split(r'[^a-zA-Z0-9]+',dc_dict['name'])
                        # convert, if needed
                        if len(temp) > 1:
                            dc_dict['name'] = camel_case_dwg(dc_dict['name'])
                else:
                    if recognized_label in required_labels:
                        raise Exception(
                            'Error, data column field is required: %s, col: %r'
                                % ( recognized_label, colname(j+1) ) )
        else:
            logger.debug(
                'unrecognized label in "Data Columns" sheet %r' % label_read)
    
    for dc_dict in dc_definitions:
        for label in required_labels:
            if label not in dc_dict:
                raise Exception(
                    'required "Data Column" label not defined %r' % label)

    logger.info('find the data columns on the "Data" sheet...')           

    data_sheet = book.sheet_by_name('Data')
    data_sheet_labels = data_sheet.row_values(0)
    dc_definitions_found = []
    data_labels_found = []
    for i,data_label in enumerate(data_sheet_labels):
        
        if not data_label or not data_label.strip():
            logger.info('break on data sheet col %d, blank' % i)
            break
        
        data_label = data_label.upper()
        col_letter = colname(i)
        
        for dc_dict in dc_definitions:
            _dict = None
            if 'worksheet_column' in dc_dict:
                
                v = dc_dict['worksheet_column']
                if v.upper() == col_letter:
                    data_labels_found.append(i)
                    dc_definitions_found.append(dc_dict)
                    _dict = dc_dict
                    
            elif 'name' in dc_dict or 'display_name' in dc_dict:
            
                if ( dc_dict.get('name', '').upper() == data_label
                    or dc_dict.get('display_name', '').upper() == data_label):
                    
                    dc_dict['worksheet_column'] = col_letter
                    data_labels_found.append(i)
                    dc_definitions_found.append(dc_dict)
                    _dict = dc_dict
                    
            if _dict and 'display_order' not in _dict:
            
                _dict['display_order'] = i+10
                logger.warn(
                    'auto assigning "display_order" for col %r as %d' 
                        % (_dict['name'], i+10))

        if i not in data_labels_found:
        
            logger.debug( ( 
                'Data sheet label not found %r,'
                ' looking in default reagent definitions %s' )
                     % ( data_label, default_reagent_columns.keys() ) )
            
            for key,dc_dict in default_reagent_columns.items():
                if (key.upper() == data_label 
                    or dc_dict.get('name', '').upper() == data_label
                    or dc_dict.get('display_name', '').upper() == data_label):
                    
                    dc_dict['worksheet_column'] = col_letter
                    data_labels_found.append(i)
                    dc_definitions_found.append(dc_dict)
    
    data_labels_not_found = [ 
        data_label for i,data_label in enumerate(data_sheet_labels) 
            if data_label and data_label.strip() 
            and i not in data_labels_found and data_label not in meta_columns ]
    if data_labels_not_found:
        logger.warn(
            'data sheet labels not recognized %s' % data_labels_not_found )

    # for legacy datasets: make sure the small molecule column 1 is always created
    small_mol_col = None
    for dc_dict in dc_definitions_found:
        if dc_dict['data_type'] == 'small_molecule':
            small_mol_col = dc_dict
            break
    if not small_mol_col:
        dc_definitions_found.append(default_reagent_columns['Small Molecule Batch'])
        
    logger.info('data column definitions found: %s' 
        % [x['display_name'] for x in dc_definitions_found])

    return dc_definitions_found