Exemplo n.º 1
0
def readDataColumns(path):
    '''
    Read the 'Data Columns' sheet and build one definition dict per data
    column.

    The sheet is read row by row: the first cell of a row is a field label
    and each following cell is that field's value for one data column, so
    cell N of a row is stored in definition N.

    @param path handed to iu.readtable together with the sheet name
    @return a list of dicts keyed by DataColumn field name, one dict per
        entry of the sheet's label row (its first cell excluded)
    '''
    sheetname = 'Data Columns'
    dataColumnSheet = iu.readtable([path, sheetname])

    # Type of every DataColumn model field; used to convert the cell text.
    _fields = util.get_fields(DataColumn)
    _typelookup = dict((f.name, iu.totype(f)) for f in _fields)

    # TODO: Use the import_utils methods here
    # TODO: compare and combine this with the fieldinformation entity
    # Sheet label -> DataColumn field name.  The labels are used as regex
    # patterns with re.match, i.e. a case-insensitive prefix match against
    # the first cell of each row.
    labels = {'Worksheet Column':'worksheet_column',
              'Display Order':'display_order',
              'Name':'name',
              'Data Type':'data_type',
              'Decimal Places':'precision',
              'Description':'description',
              'Replicate Number':'replicate',
              'Time point':'time_point', 
              'Assay readout type':'readout_type',
              'Comments':'comments'}

    # The label row seeds the definitions: one dict per column, holding the
    # worksheet column value.
    dataColumnDefinitions = [
        {labels['Worksheet Column']: v} for v in dataColumnSheet.labels[1:]]

    for row in dataColumnSheet:
        rowAsUnicode = util.make_row(row)
        keyRead = rowAsUnicode[0]
        cells = rowAsUnicode[1:]
        if not cells:
            # nothing to store for this row
            continue

        # Which field(s) a row feeds depends only on its first cell, so
        # resolve that once per row rather than once per cell and label.
        fieldNames = [fieldName for key, fieldName in labels.items()
                      if re.match(key, keyRead, re.M|re.I)]
        if not fieldNames:
            logger.debug(
                'Data Column definition row not used: %r' % (keyRead,))
            continue

        for i, cellText in enumerate(cells):
            for fieldName in fieldNames:
                # Note: convert the data to the model field type
                dataColumnDefinitions[i][fieldName] = util.convertdata(
                    cellText, _typelookup.get(fieldName, None))

    logger.debug(str(("definitions: ", dataColumnDefinitions)) )

    return dataColumnDefinitions
Exemplo n.º 2
0
def readDataColumns(path):
    '''
    Read the 'Data Columns' sheet and build one definition dict per data
    column.

    The sheet is read row by row: the first cell of a row is a field label
    and each following cell is that field's value for one data column, so
    cell N of a row is stored in definition N.

    @param path handed to iu.readtable together with the sheet name
    @return a list of dicts keyed by DataColumn field name, one dict per
        entry of the sheet's label row (its first cell excluded)
    '''
    sheetname = 'Data Columns'
    dataColumnSheet = iu.readtable([path, sheetname])

    # Type of every DataColumn model field; used to convert the cell text.
    _fields = util.get_fields(DataColumn)
    _typelookup = dict((f.name, iu.totype(f)) for f in _fields)

    # Sheet label -> DataColumn field name.  The labels are used as regex
    # patterns with re.match, i.e. a case-insensitive prefix match against
    # the first cell of each row.
    labels = {'Worksheet Column':'worksheet_column',
              'Name':'name',
              'Data Type':'data_type',
              'Decimal Places':'precision',
              'Description':'description',
              'Replicate Number':'replicate',
              'Time point':'time_point', 
              'Assay readout type':'readout_type',
              'Comments':'comments'}

    # The label row seeds the definitions: one dict per column, holding the
    # worksheet column value.
    dataColumnDefinitions = [
        {labels['Worksheet Column']: v} for v in dataColumnSheet.labels[1:]]

    for row in dataColumnSheet:
        rowAsUnicode = util.make_row(row)
        keyRead = rowAsUnicode[0]
        cells = rowAsUnicode[1:]
        if not cells:
            # nothing to store for this row
            continue

        # Which field(s) a row feeds depends only on its first cell, so
        # resolve that once per row rather than once per cell and label.
        # Rows whose label matches nothing are silently ignored.
        fieldNames = [fieldName for key, fieldName in labels.items()
                      if re.match(key, keyRead, re.M|re.I)]

        for i, cellText in enumerate(cells):
            for fieldName in fieldNames:
                # Note: convert the data to the model field type
                dataColumnDefinitions[i][fieldName] = util.convertdata(
                    cellText, _typelookup.get(fieldName, None))

    # Single-argument call: prints the same text whether `print` is the
    # Python 2 statement or the print function (the doubled space mirrors
    # the separator the old two-item print statement emitted).
    print("definitions:  %s" % (dataColumnDefinitions,))

    return dataColumnDefinitions
Exemplo n.º 3
0
def readDataColumns(path):
    '''
    Read the 'Data Columns' sheet and build one definition dict per data
    column.

    The sheet is read row by row: the first cell of a row is a field label
    and each following cell is that field's value for one data column, so
    cell N of a row is stored in definition N.  Rows labelled
    'Protein HMS LINCS ID' / 'Cell HMS LINCS ID' are converted to int and,
    when non-zero, replaced by the object returned from
    Protein.objects.get / Cell.objects.get.

    @param path handed to iu.readtable together with the sheet name
    @return a list of dicts keyed by DataColumn field name, one dict per
        entry of the sheet's label row (its first cell excluded)
    @raise Exception whatever is raised while converting or looking up a
        cell value; it is logged with the cell index and text, then re-raised
    '''
    sheetname = 'Data Columns'
    dataColumnSheet = iu.readtable([path, sheetname])

    # Lookup all of the field types of the Datacolumn table.  
    # These will be used to validate input type by converting on read
    _fields = util.get_fields(DataColumn)
    _typelookup = dict((f.name, iu.totype(f)) for f in _fields)

    # TODO: Use the import_utils methods here
    # TODO: compare and combine this with the fieldinformation entity
    # Sheet label -> DataColumn field name.  The labels are used as regex
    # patterns with re.match, i.e. a case-insensitive prefix match against
    # the first cell of each row.
    labels = {'Worksheet Column':'worksheet_column',
              'Display Order':'display_order',
              'Name':'name',
              'Display Name':'display_name',
              'Data Type':'data_type',
              'Decimal Places':'precision',
              'Description':'description',
              'Replicate Number':'replicate',
              'Unit':'unit', 
              'Assay readout type':'readout_type',
              'Comments':'comments',
              'Protein HMS LINCS ID': 'protein', 
              'Cell HMS LINCS ID': 'cell'}
    matchFlags = re.M|re.I

    # first the label row (it contains the worksheet column, it is unique)
    dataColumnDefinitions = [
        {labels['Worksheet Column']: v} for v in dataColumnSheet.labels[1:]]

    logger.debug(str(('========== datacolumns:',dataColumnDefinitions)))

    # for each row, create the dictionary entry in the dataColumnDefinitions
    for row in dataColumnSheet:
        rowAsUnicode = util.make_row(row)
        keyRead = rowAsUnicode[0]
        cells = rowAsUnicode[1:]
        if not cells:
            # nothing to store for this row
            continue

        # Which field(s) a row feeds depends only on its first cell, so
        # resolve that once per row rather than once per cell and label.
        fieldNames = [fieldName for key, fieldName in labels.items()
                      if re.match(key, keyRead, matchFlags)]
        if not fieldNames:
            logger.debug(
                'Data Column definition row not used: %r' % (keyRead,))
            continue

        isProteinRow = bool(
            re.match('Protein HMS LINCS ID', keyRead, matchFlags))
        isCellRow = (not isProteinRow and bool(
            re.match('Cell HMS LINCS ID', keyRead, matchFlags)))

        for i, cellText in enumerate(cells):
            try:
                for fieldName in fieldNames:
                    if isProteinRow:
                        facility_id = util.convertdata(cellText, int)
                        if facility_id:
                            dataColumnDefinitions[i][fieldName] = \
                                Protein.objects.get(lincs_id=facility_id)
                    elif isCellRow:
                        facility_id = util.convertdata(cellText, int)
                        if facility_id:
                            dataColumnDefinitions[i][fieldName] = \
                                Cell.objects.get(facility_id=facility_id)
                    else:
                        # Use the type from the fieldinformation table 
                        # to read in the data for each DC field
                        dataColumnDefinitions[i][fieldName] = \
                            util.convertdata(cellText,
                                             _typelookup.get(fieldName, None))
            except Exception as e:
                logger.error(str(('Exception reading data for cell', i, cellText, e)))
                # bare raise keeps the original traceback
                raise

    logger.debug(str(("definitions: ", dataColumnDefinitions)) )

    # Hand the definitions back to the caller, as the sibling versions of
    # this reader do; previously the result was built and then dropped.
    return dataColumnDefinitions
Exemplo n.º 4
0
def read_datacolumns(book):
    '''
    Read the data column definitions from the "Data Columns" sheet and match
    them to the columns present on the "Data" sheet.

    Step 1 reads the "Data Columns" sheet: the first cell of each row is a
    field label, each following cell is that field's value for one data
    column.  Step 2 walks the first row of the "Data" sheet and, for every
    label, picks the definition whose 'worksheet_column' equals the column
    letter or whose 'name'/'display_name' equals the label; labels with no
    match are looked up in default_reagent_columns.

    @param book workbook exposing sheet_by_name(); the sheets must provide
        nrows and row_values() (looks like the xlrd API - TODO confirm)
    @return an array of data column definition dicts, in the order they
        were matched on the "Data" sheet
    @raise Exception if a required field ('name', 'data_type') is empty or
        absent for any data column
    '''

    data_column_sheet = book.sheet_by_name('Data Columns')

    labels = {
        'Worksheet Column': 'worksheet_column',
        '"Data" Worksheet Column': 'worksheet_column',
        'Display Order': 'display_order',
        'Display Name': 'display_name',
        'Name': 'name',
        'Data Type': 'data_type',
        'Decimal Places': 'precision',
        'Description': 'description',
        'Replicate Number': 'replicate',
        'Unit': 'unit',
        'Assay readout type': 'readout_type',
        'Comments': 'comments',
    }
    # Sheet labels are compared case-insensitively.
    fields_by_label = dict(
        (label.lower(), field_name) for label, field_name in labels.items())

    dc_definitions = []
    datacolumn_fields = util.get_fields(DataColumn)
    type_lookup = dict((f.name, iu.totype(f)) for f in datacolumn_fields)
    logger.debug('datacolumn type lookups: %s' % type_lookup)
    required_labels = ['name', 'data_type']

    logger.info('read the data column definitions...')
    for i in range(data_column_sheet.nrows):
        row_values = data_column_sheet.row_values(i)

        if i == 0:
            # One (initially empty) definition per column after the label
            # column; the rows fill these in by position.
            dc_definitions.extend({} for _ in row_values[1:])

        label_read = row_values[0]

        recognized_label = None
        if label_read:
            recognized_label = fields_by_label.get(label_read.lower())

        if not recognized_label:
            logger.debug('unrecognized label in "Data Columns" sheet %r' %
                         label_read)
            continue

        logger.debug('label: %r, recognized_label: %r' %
                     (label_read, recognized_label))

        for j, val in enumerate(row_values[1:]):
            dc_dict = dc_definitions[j]

            logger.debug('data column %s:%d:%d:%r' %
                         (recognized_label, i, j, val))

            final_val = util.convertdata(
                val, type_lookup.get(recognized_label, None))

            if final_val is None:
                if recognized_label in required_labels:
                    # j is relative to the first value column, hence j + 1
                    raise Exception(
                        'Error, data column field is required: %s, col: %r'
                        % (recognized_label, colname(j + 1)))
                continue

            dc_dict[recognized_label] = final_val
            if recognized_label == 'display_order':
                # add 10 to the order, so default reagent cols can go first
                dc_dict['display_order'] = final_val + 10
            elif recognized_label == 'name':
                # names containing non-alphanumeric chars are converted
                if len(re.split(r'[^a-zA-Z0-9]+', final_val)) > 1:
                    dc_dict['name'] = camel_case_dwg(final_val)

    for dc_dict in dc_definitions:
        for label in required_labels:
            if label not in dc_dict:
                raise Exception('required "Data Column" label not defined %r' %
                                label)

    logger.info('find the data columns on the "Data" sheet...')

    data_sheet = book.sheet_by_name('Data')
    data_sheet_labels = data_sheet.row_values(0)
    dc_definitions_found = []
    data_labels_found = []
    for i, data_label in enumerate(data_sheet_labels):

        # the first blank header cell ends the scan
        if not data_label or not data_label.strip():
            logger.info('break on data sheet col %d, blank' % i)
            break

        data_label = data_label.upper()
        col_letter = colname(i)

        for dc_dict in dc_definitions:
            _dict = None
            if 'worksheet_column' in dc_dict:
                # an explicit worksheet column takes precedence over names
                v = dc_dict['worksheet_column']
                if v.upper() == col_letter:
                    data_labels_found.append(i)
                    dc_definitions_found.append(dc_dict)
                    _dict = dc_dict

            elif 'name' in dc_dict or 'display_name' in dc_dict:

                if (dc_dict.get('name', '').upper() == data_label or
                        dc_dict.get('display_name', '').upper() == data_label):

                    dc_dict['worksheet_column'] = col_letter
                    data_labels_found.append(i)
                    dc_definitions_found.append(dc_dict)
                    _dict = dc_dict

            if _dict and 'display_order' not in _dict:

                _dict['display_order'] = i + 10
                logger.warning(
                    'auto assigning "display_order" for col %r as %d' %
                    (_dict['name'], i + 10))

        if i not in data_labels_found:

            logger.debug(('Data sheet label not found %r,'
                          ' looking in default reagent definitions %s') %
                         (data_label, default_reagent_columns.keys()))

            for key, dc_dict in default_reagent_columns.items():
                if (key.upper() == data_label
                        or dc_dict.get('name', '').upper() == data_label or
                        dc_dict.get('display_name', '').upper() == data_label):

                    # NOTE(review): this writes into the dict held by
                    # default_reagent_columns itself, not a copy - confirm
                    # that sharing it across calls is intended.
                    dc_dict['worksheet_column'] = col_letter
                    data_labels_found.append(i)
                    dc_definitions_found.append(dc_dict)

    data_labels_not_found = [
        data_label for i, data_label in enumerate(data_sheet_labels)
        if data_label and data_label.strip() and i not in data_labels_found
        and data_label not in meta_columns
    ]
    if data_labels_not_found:
        logger.warning('data sheet labels not recognized %s' %
                       data_labels_not_found)

    # for legacy datasets: make sure the small molecule column 1 is always created
    has_small_molecule = any(
        dc_dict['data_type'] == 'small_molecule'
        for dc_dict in dc_definitions_found)
    if not has_small_molecule:
        dc_definitions_found.append(
            default_reagent_columns['Small Molecule Batch'])

    # 'display_name' is optional, so fall back to 'name' instead of raising
    # KeyError while logging.
    logger.info('data column definitions found: %s' %
                [x.get('display_name', x.get('name'))
                 for x in dc_definitions_found])

    return dc_definitions_found
Exemplo n.º 5
0
        '"Data" Worksheet Column':'worksheet_column',
        'Display Order':'display_order',
        'Display Name':'display_name',
        'Name':'name',
        'Data Type':'data_type',
        'Decimal Places':'precision',
        'Description':'description',
        'Replicate Number':'replicate',
        'Unit':'unit', 
        'Assay readout type':'readout_type',
        'Comments':'comments',
    }

    dc_definitions = []
    datacolumn_fields = util.get_fields(DataColumn)
    type_lookup = dict((f.name, iu.totype(f)) for f in datacolumn_fields)
    logger.debug('datacolumn type lookups: %s' % type_lookup)
    required_labels = ['name', 'data_type']

    logger.info('read the data column definitions...')
    for i in xrange(data_column_sheet.nrows):
        row_values = data_column_sheet.row_values(i)
        
        if i == 0:
            for val in row_values[1:]:
                dc_definitions.append({})
        
        label_read = row_values[0]
        
        recognized_label = next(
            (field_name for label, field_name in labels.items() 
Exemplo n.º 6
0
def read_datacolumns(book):
    '''
    Read the data column definitions from the "Data Columns" sheet and match
    them to the columns present on the "Data" sheet.

    Step 1 reads the "Data Columns" sheet: the first cell of each row is a
    field label, each following cell is that field's value for one data
    column.  Step 2 walks the first row of the "Data" sheet and, for every
    label, picks the definition whose 'worksheet_column' equals the column
    letter or whose 'name'/'display_name' equals the label; labels with no
    match are looked up in default_reagent_columns.

    @param book workbook exposing sheet_by_name(); the sheets must provide
        nrows and row_values() (looks like the xlrd API - TODO confirm)
    @return an array of data column definition dicts, in the order they
        were matched on the "Data" sheet
    @raise Exception if a required field ('name', 'data_type') is empty or
        absent for any data column
    '''
    
    data_column_sheet = book.sheet_by_name('Data Columns')
    
    labels = {
        'Worksheet Column':'worksheet_column',
        '"Data" Worksheet Column':'worksheet_column',
        'Display Order':'display_order',
        'Display Name':'display_name',
        'Name':'name',
        'Data Type':'data_type',
        'Decimal Places':'precision',
        'Description':'description',
        'Replicate Number':'replicate',
        'Unit':'unit', 
        'Assay readout type':'readout_type',
        'Comments':'comments',
    }
    # Sheet labels are compared case-insensitively.
    fields_by_label = dict(
        (label.lower(), field_name) for label, field_name in labels.items())

    dc_definitions = []
    datacolumn_fields = util.get_fields(DataColumn)
    type_lookup = dict((f.name, iu.totype(f)) for f in datacolumn_fields)
    logger.debug('datacolumn type lookups: %s' % type_lookup)
    required_labels = ['name', 'data_type']

    logger.info('read the data column definitions...')
    for i in range(data_column_sheet.nrows):
        row_values = data_column_sheet.row_values(i)
        
        if i == 0:
            # One (initially empty) definition per column after the label
            # column; the rows fill these in by position.
            dc_definitions.extend({} for _ in row_values[1:])
        
        label_read = row_values[0]
        
        recognized_label = None
        if label_read:
            recognized_label = fields_by_label.get(label_read.lower())
        
        if not recognized_label:
            logger.debug(
                'unrecognized label in "Data Columns" sheet %r' % label_read)
            continue
            
        logger.debug(
            'label: %r, recognized_label: %r' % (label_read, recognized_label))
        
        for j,val in enumerate(row_values[1:]):
            dc_dict = dc_definitions[j]

            logger.debug('data column %s:%d:%d:%r' 
                % ( recognized_label, i, j, val))
            
            final_val = util.convertdata(
                val,type_lookup.get(recognized_label, None)) 
            
            if final_val is None:
                if recognized_label in required_labels:
                    # j is relative to the first value column, hence j+1
                    raise Exception(
                        'Error, data column field is required: %s, col: %r'
                            % ( recognized_label, colname(j+1) ) )
                continue

            dc_dict[recognized_label] = final_val
            if recognized_label == 'display_order':
                # add 10 to the order, so default reagent cols can go first
                dc_dict['display_order'] = final_val + 10
            elif recognized_label == 'name':
                # names containing non-alphanumeric chars are converted
                if len(re.split(r'[^a-zA-Z0-9]+', final_val)) > 1:
                    dc_dict['name'] = camel_case_dwg(final_val)
    
    for dc_dict in dc_definitions:
        for label in required_labels:
            if label not in dc_dict:
                raise Exception(
                    'required "Data Column" label not defined %r' % label)

    logger.info('find the data columns on the "Data" sheet...')           

    data_sheet = book.sheet_by_name('Data')
    data_sheet_labels = data_sheet.row_values(0)
    dc_definitions_found = []
    data_labels_found = []
    for i,data_label in enumerate(data_sheet_labels):
        
        # the first blank header cell ends the scan
        if not data_label or not data_label.strip():
            logger.info('break on data sheet col %d, blank' % i)
            break
        
        data_label = data_label.upper()
        col_letter = colname(i)
        
        for dc_dict in dc_definitions:
            _dict = None
            if 'worksheet_column' in dc_dict:
                # an explicit worksheet column takes precedence over names
                v = dc_dict['worksheet_column']
                if v.upper() == col_letter:
                    data_labels_found.append(i)
                    dc_definitions_found.append(dc_dict)
                    _dict = dc_dict
                    
            elif 'name' in dc_dict or 'display_name' in dc_dict:
            
                if ( dc_dict.get('name', '').upper() == data_label
                    or dc_dict.get('display_name', '').upper() == data_label):
                    
                    dc_dict['worksheet_column'] = col_letter
                    data_labels_found.append(i)
                    dc_definitions_found.append(dc_dict)
                    _dict = dc_dict
                    
            if _dict and 'display_order' not in _dict:
            
                _dict['display_order'] = i+10
                logger.warning(
                    'auto assigning "display_order" for col %r as %d' 
                        % (_dict['name'], i+10))

        if i not in data_labels_found:
        
            logger.debug( ( 
                'Data sheet label not found %r,'
                ' looking in default reagent definitions %s' )
                     % ( data_label, default_reagent_columns.keys() ) )
            
            for key,dc_dict in default_reagent_columns.items():
                if (key.upper() == data_label 
                    or dc_dict.get('name', '').upper() == data_label
                    or dc_dict.get('display_name', '').upper() == data_label):
                    
                    # NOTE(review): this writes into the dict held by
                    # default_reagent_columns itself, not a copy - confirm
                    # that sharing it across calls is intended.
                    dc_dict['worksheet_column'] = col_letter
                    data_labels_found.append(i)
                    dc_definitions_found.append(dc_dict)
    
    data_labels_not_found = [ 
        data_label for i,data_label in enumerate(data_sheet_labels) 
            if data_label and data_label.strip() 
            and i not in data_labels_found and data_label not in meta_columns ]
    if data_labels_not_found:
        logger.warning(
            'data sheet labels not recognized %s' % data_labels_not_found )

    # for legacy datasets: make sure the small molecule column 1 is always created
    has_small_molecule = any(
        dc_dict['data_type'] == 'small_molecule'
        for dc_dict in dc_definitions_found)
    if not has_small_molecule:
        dc_definitions_found.append(default_reagent_columns['Small Molecule Batch'])
        
    # 'display_name' is optional, so fall back to 'name' instead of raising
    # KeyError while logging.
    logger.info('data column definitions found: %s' 
        % [x.get('display_name', x.get('name'))
            for x in dc_definitions_found])

    return dc_definitions_found