Exemplo n.º 1
0
    def _findFreeRow(self, record):
        """
        Find the free row to write the next record to.

        The sheet is selected by the month of ``record.getDate()``.  The first
        call for a sheet scans it for the records header (date, category,
        description, cost signature cells) and then for the start of the last
        run of empty rows below that header.  The result is cached in
        ``self.emptyRowsPerSheet``; later calls for the same sheet only advance
        the cached row by one.

        Side effect: a scan rebinds ``self.ws`` to the scanned sheet.

        Returns the ``(row, column_name)`` tuple stored in
        ``self.emptyRowsPerSheet`` for the sheet, where ``column_name`` is
        ``colname()`` of the column the header starts in.
        """
        sheetName = constants.MONTH_NUM_TO_STRING[record.getDate().tm_mon]

        # check if already found the empty row: the next free row is simply
        # the one after the row handed out last time
        if sheetName in self.emptyRowsPerSheet:
            cached = self.emptyRowsPerSheet[sheetName]
            self.emptyRowsPerSheet[sheetName] = (cached[0] + 1, cached[1])
            return self.emptyRowsPerSheet[sheetName]

        self.ws = self.wb.sheet_by_name(sheetName)
        recordsSignature = [constants.DATE_SIG, constants.CATEGORY_SIG,
                            constants.DESCRIPTION_SIG, constants.COST_SIG]
        signatureWidth = len(recordsSignature)
        num_rows = self.ws.nrows - 1
        expenseStartColumn, expenseStartRow, expenseEndColumn = 0, 0, 0

        # The header is matched against a window of `signatureWidth` cells, so
        # the window may start no later than `ncols - signatureWidth`;
        # starting further right reads cells past the end of the row and
        # raises IndexError.  The scan also never starts beyond
        # MAX_COL_NUM + 1, as before.
        lastStartColumn = min(self.ws.ncols - signatureWidth,
                              constants.MAX_COL_NUM + 1)

        for curr_row in range(self.ws.nrows):
            for curr_cell in range(lastStartColumn + 1):
                rowTuple = [self.ws.cell_value(curr_row, curr_cell + i)
                            for i in range(signatureWidth)]
                if rowTuple == recordsSignature:
                    # no break: if the signature occurs more than once, the
                    # last occurrence wins
                    expenseStartColumn = curr_cell
                    expenseStartRow = curr_row + 1
                    expenseEndColumn = curr_cell + signatureWidth

        # NOTE(review): this reads rows expenseStartRow .. nrows - 2, i.e. it
        # stops one short of the sheet's last row -- confirm that is intended.
        wsRows = [self.ws.row_values(rowx=expenseStartRow + i,
                                     start_colx=expenseStartColumn,
                                     end_colx=expenseEndColumn)
                  for i in range(num_rows - expenseStartRow)]

        # Remember where the most recent run of empty rows starts (1-based
        # offset from the first row read).  Stays -1 when no row is empty.
        emptyRow = -1
        previousLineEmpty = False
        for rowNumber, line in enumerate(wsRows):
            lineEmpty = all(item == "" for item in line)
            if lineEmpty and not previousLineEmpty:
                emptyRow = rowNumber + 1
            previousLineEmpty = lineEmpty
        firstEmptyRow = emptyRow + int(expenseStartRow)

        self.emptyRowsPerSheet[sheetName] = (firstEmptyRow,
                                             colname(colx=expenseStartColumn))
        return self.emptyRowsPerSheet[sheetName]
Exemplo n.º 2
0
def read_datacolumns(book):
    '''
    Read the data column definitions from the "Data Columns" sheet and match
    them to the columns of the "Data" sheet.

    On the "Data Columns" sheet the first cell of each row is a label and
    every following cell holds that field's value for one data column.  Each
    definition is then matched to a "Data" sheet column, either by its
    explicit worksheet column letter or by its name / display name.  "Data"
    sheet columns that match no definition are looked up in
    default_reagent_columns.

    Note: matched entries of default_reagent_columns are updated in place
    (their 'worksheet_column' is set).

    @param book the workbook; must provide sheet_by_name()
    @return an array of data column definition dicts
    @raise Exception if a required field ('name', 'data_type') is empty or
        missing for any data column
    '''

    data_column_sheet = book.sheet_by_name('Data Columns')

    labels = {
        'Worksheet Column': 'worksheet_column',
        '"Data" Worksheet Column': 'worksheet_column',
        'Display Order': 'display_order',
        'Display Name': 'display_name',
        'Name': 'name',
        'Data Type': 'data_type',
        'Decimal Places': 'precision',
        'Description': 'description',
        'Replicate Number': 'replicate',
        'Unit': 'unit',
        'Assay readout type': 'readout_type',
        'Comments': 'comments',
    }
    # Labels are matched case-insensitively; the lower-cased keys are all
    # distinct, so a single lookup table replaces a scan per sheet row.
    labels_by_lowercase = dict(
        (label.lower(), field_name) for label, field_name in labels.items())

    dc_definitions = []
    datacolumn_fields = util.get_fields(DataColumn)
    type_lookup = dict((f.name, iu.totype(f)) for f in datacolumn_fields)
    logger.debug('datacolumn type lookups: %s', type_lookup)
    required_labels = ['name', 'data_type']

    logger.info('read the data column definitions...')
    for i in range(data_column_sheet.nrows):
        row_values = data_column_sheet.row_values(i)

        if i == 0:
            # one (initially empty) definition per column after the label
            dc_definitions = [{} for _ in row_values[1:]]

        label_read = row_values[0]
        recognized_label = None
        if label_read:
            recognized_label = labels_by_lowercase.get(label_read.lower())

        if not recognized_label:
            logger.debug('unrecognized label in "Data Columns" sheet %r',
                         label_read)
            continue

        logger.debug('label: %r, recognized_label: %r',
                     label_read, recognized_label)
        field_type = type_lookup.get(recognized_label, None)

        for j, val in enumerate(row_values[1:]):
            dc_dict = dc_definitions[j]

            logger.debug('data column %s:%d:%d:%r',
                         recognized_label, i, j, val)

            final_val = util.convertdata(val, field_type)

            if final_val is None:
                if recognized_label in required_labels:
                    raise Exception(
                        'Error, data column field is required: %s, col: %r'
                        % (recognized_label, colname(j + 1)))
                continue

            dc_dict[recognized_label] = final_val
            if recognized_label == 'display_order':
                # add 10 to the order, so default reagent cols can go first
                dc_dict['display_order'] = dc_dict['display_order'] + 10
            if recognized_label == 'name':
                # split on non-alphanumeric chars
                temp = re.split(r'[^a-zA-Z0-9]+', dc_dict['name'])
                # convert, if needed
                if len(temp) > 1:
                    dc_dict['name'] = camel_case_dwg(dc_dict['name'])

    for dc_dict in dc_definitions:
        for label in required_labels:
            if label not in dc_dict:
                raise Exception('required "Data Column" label not defined %r' %
                                label)

    logger.info('find the data columns on the "Data" sheet...')

    data_sheet = book.sheet_by_name('Data')
    data_sheet_labels = data_sheet.row_values(0)
    dc_definitions_found = []
    data_labels_found = []
    for i, data_label in enumerate(data_sheet_labels):

        # the first blank header cell ends the scan of the header row
        if not data_label or not data_label.strip():
            logger.info('break on data sheet col %d, blank', i)
            break

        data_label = data_label.upper()
        col_letter = colname(i)

        for dc_dict in dc_definitions:
            matched = False
            if 'worksheet_column' in dc_dict:
                # an explicit (or previously assigned) column letter decides
                matched = dc_dict['worksheet_column'].upper() == col_letter

            elif 'name' in dc_dict or 'display_name' in dc_dict:
                if (dc_dict.get('name', '').upper() == data_label or
                        dc_dict.get('display_name', '').upper() == data_label):
                    dc_dict['worksheet_column'] = col_letter
                    matched = True

            if not matched:
                continue

            data_labels_found.append(i)
            dc_definitions_found.append(dc_dict)

            if 'display_order' not in dc_dict:
                dc_dict['display_order'] = i + 10
                logger.warning(
                    'auto assigning "display_order" for col %r as %d',
                    dc_dict['name'], i + 10)

        if i not in data_labels_found:

            logger.debug(('Data sheet label not found %r,'
                          ' looking in default reagent definitions %s'),
                         data_label, default_reagent_columns.keys())

            for key, dc_dict in default_reagent_columns.items():
                if (key.upper() == data_label
                        or dc_dict.get('name', '').upper() == data_label or
                        dc_dict.get('display_name', '').upper() == data_label):

                    dc_dict['worksheet_column'] = col_letter
                    data_labels_found.append(i)
                    dc_definitions_found.append(dc_dict)

    data_labels_not_found = [
        data_label for i, data_label in enumerate(data_sheet_labels)
        if data_label and data_label.strip() and i not in data_labels_found
        and data_label not in meta_columns
    ]
    if data_labels_not_found:
        logger.warning('data sheet labels not recognized %s',
                       data_labels_not_found)

    # for legacy datasets: make sure the small molecule column 1 is always created
    has_small_molecule_col = any(
        dc_dict['data_type'] == 'small_molecule'
        for dc_dict in dc_definitions_found)
    if not has_small_molecule_col:
        dc_definitions_found.append(
            default_reagent_columns['Small Molecule Batch'])

    # 'display_name' is optional, so fall back to the (required) 'name'
    # rather than letting a log statement raise KeyError.
    logger.info('data column definitions found: %s',
                [x.get('display_name', x.get('name'))
                 for x in dc_definitions_found])

    return dc_definitions_found
Exemplo n.º 3
0
            dc_definition['display_order'] = i
        datacolumn = DataColumn(**dc_definition)
        datacolumn.save()
        if not small_molecule_col and datacolumn.data_type == 'small_molecule':
            small_molecule_col = datacolumn
        logger.debug('datacolumn created: %r' % datacolumn)
        if datacolumn.worksheet_column:
            col_to_dc_map[int_for_col(
                datacolumn.worksheet_column)] = datacolumn
    logger.debug('final data columns: %s' % col_to_dc_map)

    logger.debug('read the Data sheet')
    data_sheet = book.sheet_by_name('Data')

    for i, label in enumerate(data_sheet.row_values(0)):
        logger.debug('find datasheet label %r:%r' % (colname(i), label))
        if label in meta_columns:
            meta_columns[label] = i
            continue

    logger.debug('meta_columns: %s, datacolumnList: %s' %
                 (meta_columns, col_to_dc_map))

    logger.debug('read the data sheet, save_interval: %d' % save_interval)
    loopStart = time.time()
    pointsSaved = 0
    rows_read = 0
    col_to_dc_items = col_to_dc_map.items()

    for i in xrange(data_sheet.nrows - 1):
        current_row = i + 2
Exemplo n.º 4
0
def read_data(book, col_to_dc_map, first_small_molecule_column, dataset):
    '''
    Read the rows of the "Data" sheet and bulk-create one DataRecord, with
    its datapoints, per row.

    The header row (row 0) is first scanned to record the column index of
    every label found in meta_columns (this updates the module-level
    meta_columns in place).  Each following row then yields a DataRecord and
    one datapoint per non-empty mapped column; records are flushed to the
    database every `save_interval` rows and once more at the end.

    @param book the workbook; must provide sheet_by_name()
    @param col_to_dc_map maps a "Data" sheet column index to its data column
    @param first_small_molecule_column passed through to _read_plate_well
    @param dataset the dataset the created records belong to
    '''

    datarecord_batch = []
    save_interval = 1000

    logger.debug('read the Data sheet')
    data_sheet = book.sheet_by_name('Data')

    for i, label in enumerate(data_sheet.row_values(0)):
        logger.debug('find datasheet label %r:%r', colname(i), label)
        if label in meta_columns:
            meta_columns[label] = i

    logger.debug('meta_columns: %s, datacolumnList: %s',
                 meta_columns, col_to_dc_map)
    logger.debug('read the data sheet, save_interval: %d', save_interval)
    loopStart = time.time()
    pointsSaved = 0
    rows_read = 0
    # data rows start below the header row
    for row_index in range(1, data_sheet.nrows):
        # 1-based row number, as shown in the spreadsheet
        current_row = row_index + 1

        r = util.make_row(data_sheet.row_values(row_index))
        datarecord = DataRecord(dataset=dataset)

        if meta_columns['Control Type'] > -1:
            datarecord.control_type = util.convertdata(
                r[meta_columns['Control Type']])

        datapoint_batch = []
        small_molecule_datapoint = None
        for col_index, dc in col_to_dc_map.items():
            value = r[col_index]
            logger.debug('reading column %r, %s, val: %r',
                         colname(col_index), dc, value)
            value = util.convertdata(value.strip())
            # NOTE(review): any falsy converted value is skipped -- confirm
            # that convertdata() cannot return a meaningful 0 here.
            if not value:
                continue
            datapoint = _create_datapoint(dc, dataset, datarecord, value)
            datapoint_batch.append(datapoint)
            pointsSaved += 1
            if (not small_molecule_datapoint
                    and dc.data_type == 'small_molecule'):
                small_molecule_datapoint = datapoint

        if meta_columns['Plate'] > -1:
            _read_plate_well(
                meta_columns['Plate'], r, current_row, datarecord,
                first_small_molecule_column, small_molecule_datapoint,
                datapoint_batch)

        datarecord_batch.append((datarecord, datapoint_batch))
        rows_read += 1

        if rows_read % save_interval == 0:
            bulk_create_datarecords(datarecord_batch)
            # NOTE(review): the message says "ms" but time.time() differences
            # are seconds; text left unchanged.
            logger.debug(
                'datarecord batch created, rows_read: %d , time (ms): %d',
                rows_read, time.time() - loopStart)
            count = bulk_create_datapoints(datarecord_batch)
            logger.debug('datapoints created in batch: %d ', count)
            datarecord_batch = []

    # flush whatever is left over from the last partial batch
    bulk_create_datarecords(datarecord_batch)
    et = time.time() - loopStart
    logger.debug(
        'final datarecord batch created, rows_read: %d, time (ms): %d',
        rows_read, et)

    count = bulk_create_datapoints(datarecord_batch)
    logger.debug('created dps %d', count)

    # A sheet with only a header row has no rows to average over; avoid
    # dividing by zero.
    avg = et / rows_read if rows_read else 0.0

    # Single-string print calls emit the same text as the former print
    # statements and parse on both Python 2 and Python 3.
    print('Finished reading, rows_read:  %s , points Saved:  %s'
          % (rows_read, pointsSaved))
    print('elapsed:  %s avg:  %s' % (et, avg))

    cleanup_unused_datacolumns(dataset)
Exemplo n.º 5
0
                    dc_dict[recognized_label] = final_val
                    if recognized_label == 'display_order':
                        # add 10 to the order, so default reagent cols can go first
                        dc_dict['display_order'] = (
                            dc_dict['display_order'] + 10)
                    if recognized_label == 'name':
                        # split on non-alphanumeric chars
                        temp = re.split(r'[^a-zA-Z0-9]+',dc_dict['name'])
                        # convert, if needed
                        if len(temp) > 1:
                            dc_dict['name'] = camel_case_dwg(dc_dict['name'])
                else:
                    if recognized_label in required_labels:
                        raise Exception(
                            'Error, data column field is required: %s, col: %r'
                                % ( recognized_label, colname(j+1) ) )
        else:
            logger.debug(
                'unrecognized label in "Data Columns" sheet %r' % label_read)
    
    for dc_dict in dc_definitions:
        for label in required_labels:
            if label not in dc_dict:
                raise Exception(
                    'required "Data Column" label not defined %r' % label)

    logger.info('find the data columns on the "Data" sheet...')           

    data_sheet = book.sheet_by_name('Data')
    data_sheet_labels = data_sheet.row_values(0)
    dc_definitions_found = []
Exemplo n.º 6
0
def read_datacolumns(book):
    '''
    Read the data column definitions from the "Data Columns" sheet and match
    them to the columns of the "Data" sheet.

    On the "Data Columns" sheet the first cell of each row is a label and
    every following cell holds that field's value for one data column.  Each
    definition is then matched to a "Data" sheet column, either by its
    explicit worksheet column letter or by its name / display name.  "Data"
    sheet columns that match no definition are looked up in
    default_reagent_columns.

    Note: matched entries of default_reagent_columns are updated in place
    (their 'worksheet_column' is set).

    @param book the workbook; must provide sheet_by_name()
    @return an array of data column definition dicts
    @raise Exception if a required field ('name', 'data_type') is empty or
        missing for any data column
    '''

    data_column_sheet = book.sheet_by_name('Data Columns')

    labels = {
        'Worksheet Column':'worksheet_column',
        '"Data" Worksheet Column':'worksheet_column',
        'Display Order':'display_order',
        'Display Name':'display_name',
        'Name':'name',
        'Data Type':'data_type',
        'Decimal Places':'precision',
        'Description':'description',
        'Replicate Number':'replicate',
        'Unit':'unit',
        'Assay readout type':'readout_type',
        'Comments':'comments',
    }
    # Labels are matched case-insensitively; the lower-cased keys are all
    # distinct, so a single lookup table replaces a scan per sheet row.
    labels_by_lowercase = dict(
        (label.lower(), field_name) for label, field_name in labels.items())

    dc_definitions = []
    datacolumn_fields = util.get_fields(DataColumn)
    type_lookup = dict((f.name, iu.totype(f)) for f in datacolumn_fields)
    logger.debug('datacolumn type lookups: %s', type_lookup)
    required_labels = ['name', 'data_type']

    logger.info('read the data column definitions...')
    for i in range(data_column_sheet.nrows):
        row_values = data_column_sheet.row_values(i)

        if i == 0:
            # one (initially empty) definition per column after the label
            dc_definitions = [{} for _ in row_values[1:]]

        label_read = row_values[0]
        recognized_label = None
        if label_read:
            recognized_label = labels_by_lowercase.get(label_read.lower())

        if not recognized_label:
            logger.debug(
                'unrecognized label in "Data Columns" sheet %r', label_read)
            continue

        logger.debug(
            'label: %r, recognized_label: %r', label_read, recognized_label)
        field_type = type_lookup.get(recognized_label, None)

        for j, val in enumerate(row_values[1:]):
            dc_dict = dc_definitions[j]

            logger.debug(
                'data column %s:%d:%d:%r', recognized_label, i, j, val)

            final_val = util.convertdata(val, field_type)

            if final_val is None:
                if recognized_label in required_labels:
                    raise Exception(
                        'Error, data column field is required: %s, col: %r'
                            % (recognized_label, colname(j + 1)))
                continue

            dc_dict[recognized_label] = final_val
            if recognized_label == 'display_order':
                # add 10 to the order, so default reagent cols can go first
                dc_dict['display_order'] = dc_dict['display_order'] + 10
            if recognized_label == 'name':
                # split on non-alphanumeric chars
                temp = re.split(r'[^a-zA-Z0-9]+', dc_dict['name'])
                # convert, if needed
                if len(temp) > 1:
                    dc_dict['name'] = camel_case_dwg(dc_dict['name'])

    for dc_dict in dc_definitions:
        for label in required_labels:
            if label not in dc_dict:
                raise Exception(
                    'required "Data Column" label not defined %r' % label)

    logger.info('find the data columns on the "Data" sheet...')

    data_sheet = book.sheet_by_name('Data')
    data_sheet_labels = data_sheet.row_values(0)
    dc_definitions_found = []
    data_labels_found = []
    for i, data_label in enumerate(data_sheet_labels):

        # the first blank header cell ends the scan of the header row
        if not data_label or not data_label.strip():
            logger.info('break on data sheet col %d, blank', i)
            break

        data_label = data_label.upper()
        col_letter = colname(i)

        for dc_dict in dc_definitions:
            matched = False
            if 'worksheet_column' in dc_dict:
                # an explicit (or previously assigned) column letter decides
                matched = dc_dict['worksheet_column'].upper() == col_letter

            elif 'name' in dc_dict or 'display_name' in dc_dict:
                if (dc_dict.get('name', '').upper() == data_label
                    or dc_dict.get('display_name', '').upper() == data_label):
                    dc_dict['worksheet_column'] = col_letter
                    matched = True

            if not matched:
                continue

            data_labels_found.append(i)
            dc_definitions_found.append(dc_dict)

            if 'display_order' not in dc_dict:
                dc_dict['display_order'] = i + 10
                logger.warning(
                    'auto assigning "display_order" for col %r as %d',
                    dc_dict['name'], i + 10)

        if i not in data_labels_found:

            logger.debug(
                ('Data sheet label not found %r,'
                 ' looking in default reagent definitions %s'),
                data_label, default_reagent_columns.keys())

            for key, dc_dict in default_reagent_columns.items():
                if (key.upper() == data_label
                    or dc_dict.get('name', '').upper() == data_label
                    or dc_dict.get('display_name', '').upper() == data_label):

                    dc_dict['worksheet_column'] = col_letter
                    data_labels_found.append(i)
                    dc_definitions_found.append(dc_dict)

    data_labels_not_found = [
        data_label for i, data_label in enumerate(data_sheet_labels)
            if data_label and data_label.strip()
            and i not in data_labels_found and data_label not in meta_columns ]
    if data_labels_not_found:
        logger.warning(
            'data sheet labels not recognized %s', data_labels_not_found)

    # for legacy datasets: make sure the small molecule column 1 is always created
    has_small_molecule_col = any(
        dc_dict['data_type'] == 'small_molecule'
        for dc_dict in dc_definitions_found)
    if not has_small_molecule_col:
        dc_definitions_found.append(default_reagent_columns['Small Molecule Batch'])

    # 'display_name' is optional, so fall back to the (required) 'name'
    # rather than letting a log statement raise KeyError.
    logger.info(
        'data column definitions found: %s',
        [x.get('display_name', x.get('name')) for x in dc_definitions_found])

    return dc_definitions_found
Exemplo n.º 7
0
                or dc_definition['display_order']==None): 
            dc_definition['display_order']=i
        datacolumn = DataColumn(**dc_definition)
        datacolumn.save()
        if not small_molecule_col and datacolumn.data_type == 'small_molecule':
            small_molecule_col = datacolumn
        logger.debug('datacolumn created: %r' % datacolumn)
        if datacolumn.worksheet_column:
            col_to_dc_map[int_for_col(datacolumn.worksheet_column)] = datacolumn    
    logger.debug('final data columns: %s' % col_to_dc_map)

    logger.debug('read the Data sheet')
    data_sheet = book.sheet_by_name('Data')
    
    for i,label in enumerate(data_sheet.row_values(0)):
        logger.debug('find datasheet label %r:%r' % (colname(i), label))
        if label in meta_columns: 
            meta_columns[label] = i
            continue
    
    logger.debug('meta_columns: %s, datacolumnList: %s' 
        % (meta_columns, col_to_dc_map) )
    
    logger.debug('read the data sheet, save_interval: %d' % save_interval)
    loopStart = time.time()
    pointsSaved = 0
    rows_read = 0
    col_to_dc_items = col_to_dc_map.items()

    for i in xrange(data_sheet.nrows-1):
        current_row = i + 2