Example #1
    def on_execute(self, evt=None):
        '''Run the query and show the results in a TableViewer.'''
        db = dbconnect.DBConnect.getInstance()
        q = self.query_textctrl.Value
        try:
            res = db.execute(q)
            if res is None:
                logging.info('Query successful. No data to return.')
                return
            res = np.array(res)  # reuse the result rather than re-running the query
            colnames = db.GetResultColumnNames()
            grid = tableviewer.TableViewer(self, title='query results')
            grid.table_from_array(res, colnames)
            grid.Show()
            logging.info('Query successful')
        except Exception as e:
            logging.error('Query failed:')
            logging.error(e)
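
The same query-and-display flow, minus the wx event plumbing, can be exercised on its own. A minimal sketch, assuming a configured dbconnect module as in CellProfiler Analyst (run_query and the db argument are illustrative names, not part of the original class):

import logging
import numpy as np

def run_query(db, q):
    '''Execute q; return (rows, colnames), or None when no data comes back.'''
    res = db.execute(q)
    if res is None:
        logging.info('Query successful. No data to return.')
        return None
    return np.array(res), db.GetResultColumnNames()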
Example #2
def score(properties,
          ts,
          nRules,
          filter_name=None,
          group='Image',
          show_results=False,
          results_table=None,
          overwrite=False):
    '''
    Trains a classifier on a training set and scores the experiment.
    Returns the table of scores as a numpy array.

    properties    -- Properties instance
    ts            -- TrainingSet instance
    nRules        -- number of rules to use
    filter_name   -- name of a filter to use from the properties file
    group         -- name of a group to use from the properties file
    show_results  -- whether or not to show the results in TableViewer
    results_table -- table name to save results to, or None
    overwrite     -- whether to overwrite an existing results table
    '''

    p = properties
    db = DBConnect.getInstance()
    dm = DataModel.getInstance()

    if group is None:
        group = 'Image'

    if results_table:
        if db.table_exists(results_table) and not overwrite:
            print('Table "%s" already exists. Delete this table before '
                  'running scoreall.' % results_table)
            return None

    print('')
    print('properties:    ', properties)
    print('training set:  ', ts)
    print('# rules:       ', nRules)
    print('filter:        ', filter_name)
    print('grouping by:   ', group)
    print('show results:  ', show_results)
    print('results table: ', results_table)
    print('overwrite:     ', overwrite)
    print('')

    nClasses = len(ts.labels)
    nKeyCols = len(image_key_columns())

    assert 200 > nRules > 0, \
        '# of rules must be between 1 and 200. Value was %s' % nRules
    assert filter_name in list(p._filters.keys()) + [None], \
        'Filter %s not found in properties file. Valid filters are: %s' % (
            filter_name, ','.join(p._filters.keys()))
    assert group in list(p._groups.keys()) + ['Image'], \
        'Group %s not found in properties file. Valid groups are: %s' % (
            group, ','.join(p._groups.keys()))

    output = StringIO()
    logging.info('Training classifier with %s rules...' % nRules)
    t0 = time()
    weaklearners = fastgentleboostingmulticlass.train(ts.colnames, nRules,
                                                      ts.label_matrix,
                                                      ts.values, output)
    logging.info('Training done in %f seconds' % (time() - t0))

    logging.info('Computing per-image class counts...')
    t0 = time()

    def update(frac):
        logging.info('%d%% ' % (frac * 100., ))

    keysAndCounts = multiclasssql.PerImageCounts(weaklearners,
                                                 filter_name=(filter_name
                                                              or None),
                                                 cb=update)
    keysAndCounts.sort()
    logging.info('Counts found in %f seconds' % (time() - t0))

    if not keysAndCounts:
        msg = ('No images are in filter "%s". Please check the filter '
               'definition in your properties file.' % filter_name)
        logging.error(msg)
        raise Exception(msg)

    # AGGREGATE PER_IMAGE COUNTS TO GROUPS IF NOT GROUPING BY IMAGE
    if group != 'Image':
        logging.info('Grouping %s counts by %s...' % (p.object_name[0], group))
        t0 = time()
        imData = {}
        for row in keysAndCounts:
            key = tuple(row[:nKeyCols])
            imData[key] = np.array([float(v) for v in row[nKeyCols:]])

        groupedKeysAndCounts = np.array(
            [list(k) + vals.tolist()
             for k, vals in dm.SumToGroup(imData, group).items()],
            dtype=object)
        nKeyCols = len(dm.GetGroupColumnNames(group))
        logging.info('Grouping done in %f seconds' % (time() - t0))
    else:
        groupedKeysAndCounts = np.array(keysAndCounts, dtype=object)

    # FIT THE BETA BINOMIAL
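    # (the beta-binomial models per-group class counts while allowing for
    #  overdispersion between groups)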
    logging.info('Fitting beta binomial distribution to data...')
    counts = groupedKeysAndCounts[:, -nClasses:]
    alpha, converged = polyafit.fit_betabinom_minka_alternating(counts)
    logging.info('   alpha = %s   converged = %s' % (alpha, converged))
    logging.info('   alpha/Sum(alpha) = %s' % ([a / sum(alpha)
                                                for a in alpha]))

    # CONSTRUCT ARRAY OF TABLE DATA
    logging.info('Computing enrichment scores for each group...')
    t0 = time()
    tableData = []
    for i, row in enumerate(groupedKeysAndCounts):
        # Start this row with the group key:
        tableRow = list(row[:nKeyCols])

        if group != 'Image':
            tableRow += [
                len(dm.GetImagesInGroup(group, tuple(row[:nKeyCols])))
            ]
        # Append the counts:
        countsRow = [int(v) for v in row[nKeyCols:nKeyCols + nClasses]]
        tableRow += [sum(countsRow)]
        tableRow += countsRow
        if p.area_scoring_column is not None:
            # Append the areas
            countsRow = [int(v) for v in row[-nClasses:]]
            tableRow += [sum(countsRow)]
            tableRow += countsRow

        # Append the scores:
        #   compute enrichment probabilities of each class for this image OR group
        scores = np.array(dirichletintegrate.score(alpha, np.array(countsRow)))
        #   clamp to [0,1] to fix any numerical precision errors
        scores[scores > 1.] = 1.
        scores[scores < 0.] = 0.
        tableRow += scores.tolist()
        # Append the logit scores:
        #   Special case: for 2 classes, only calculate the logit of the "positives"
        if nClasses == 2:
            tableRow += [np.log10(scores[0]) - np.log10(1 - scores[0])]
        else:
            tableRow += [np.log10(score) - np.log10(1 - score)
                         for score in scores]
        tableData.append(tableRow)
    tableData = np.array(tableData, dtype=object)
    logging.info('Enrichments computed in %f seconds' % (time() - t0))

    # CREATE COLUMN LABELS LIST
    # if grouping isn't per-image, then get the group key column names.
    if group != 'Image':
        colnames = dm.GetGroupColumnNames(group)
    else:
        colnames = list(image_key_columns())

    # record the column indices for the keys
    key_col_indices = list(range(len(colnames)))

    if group != 'Image':
        colnames += ['Number_of_Images']
    colnames += ['Total_%s_Count' % (p.object_name[0].capitalize())]
    for i in range(nClasses):
        colnames += [
            '%s_%s_Count' %
            (ts.labels[i].capitalize(), p.object_name[0].capitalize())
        ]
    if p.area_scoring_column is not None:
        colnames += ['Total_%s_Area' % (p.object_name[0].capitalize())]
        for i in range(nClasses):
            colnames += [
                '%s_%s_Area' %
                (ts.labels[i].capitalize(), p.object_name[0].capitalize())
            ]
    for i in range(nClasses):
        colnames += ['pEnriched_%s' % (ts.labels[i])]
    if nClasses == 2:
        colnames += ['Enriched_Score_%s' % (ts.labels[0])]
    else:
        for i in range(nClasses):
            colnames += ['Enriched_Score_%s' % (ts.labels[i])]

    title = results_table or "Enrichments_per_%s" % (group, )
    if filter_name:
        title += "_filtered_by_%s" % (filter_name, )
    title += ' (%s)' % (os.path.split(p._filename)[1])

    if results_table:
        print('Creating table %s' % results_table)
        success = db.CreateTableFromData(tableData,
                                         colnames,
                                         results_table,
                                         temporary=False)
        if not success:
            print('Failed to create results table :(')

    if show_results:
        import tableviewer
        tableview = tableviewer.TableViewer(None, title=title)
        if results_table and overwrite:
            tableview.load_db_table(results_table)
        else:
            tableview.table_from_array(tableData, colnames, group,
                                       key_col_indices)
        tableview.set_fitted_col_widths()
        tableview.Show()
    return tableData
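
A hypothetical invocation of score(); the Properties and TrainingSet loading calls below are assumptions modeled on the CellProfiler Analyst API and may differ between versions:

from cpa.properties import Properties      # assumed module path
from cpa.trainingset import TrainingSet    # assumed module path

p = Properties.getInstance()
p.LoadFile('example.properties')           # assumed loader name
ts = TrainingSet(p)
ts.Load('example_training_set.txt')        # assumed loader name
table = score(p, ts, nRules=50, group='Well',
              show_results=False, results_table=None, overwrite=False)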
Example #3
    def do_normalization(self):
        if not self.validate():
            # Should be unreachable
            wx.MessageBox('Your normalization settings are invalid. '
                          "Can't perform normalization.")
            return

        long_cols = [
            col for col in self.col_choices.GetCheckedStrings()
            if len(col) + 4 > 64
        ]
        if long_cols:
            dlg = wx.MessageDialog(
                self, 'The following columns contain more '
                'than 64 characters when a normalization suffix (4 '
                'characters) is appended. This may cause a problem when '
                'writing to the database.\n %s' % ('\n'.join(long_cols)),
                'Warning', wx.OK | wx.CANCEL | wx.ICON_EXCLAMATION)
            res = dlg.ShowModal()
            dlg.Destroy()
            if res == wx.ID_CANCEL:
                return

        imkey_cols = dbconnect.image_key_columns()
        obkey_cols = dbconnect.object_key_columns()
        wellkey_cols = dbconnect.well_key_columns()
        im_clause = dbconnect.UniqueImageClause
        well_clause = dbconnect.UniqueWellClause
        input_table = self.table_choice.GetStringSelection()
        meas_cols = self.col_choices.GetCheckedStrings()
        wants_norm_meas = self.norm_meas_checkbox.IsChecked()
        wants_norm_factor = self.norm_factor_checkbox.IsChecked()
        output_table = self.output_table.Value
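        # Measurement values start after the image-key columns (plus the
        # well-key columns, if any). MySQL accepts multi-row INSERTs, so
        # writes are batched there and done row-by-row otherwise.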
        FIRST_MEAS_INDEX = len(imkey_cols + (wellkey_cols or tuple()))
        if p.db_type == 'mysql':
            BATCH_SIZE = 100
        else:
            BATCH_SIZE = 1
        if input_table == p.object_table:
            FIRST_MEAS_INDEX += 1  # skip the extra object-key column in the object table
        if wellkey_cols:
            if input_table == p.image_table:
                WELL_KEY_INDEX = len(imkey_cols)
            else:
                WELL_KEY_INDEX = len(imkey_cols) + 1

        if db.table_exists(output_table):
            dlg = wx.MessageDialog(
                self, 'Are you sure you want to overwrite the table "%s"?' %
                (output_table), "Overwrite table?",
                wx.YES_NO | wx.NO_DEFAULT | wx.ICON_EXCLAMATION)
            if dlg.ShowModal() == wx.ID_NO:
                dlg.Destroy()
                return
            dlg.Destroy()

        #
        # First Get the data from the db.
        #
        if input_table == p.image_table:
            if wellkey_cols:
                # If there are well columns, fetch them.
                query = "SELECT %s, %s, %s FROM %s" % (im_clause(
                ), well_clause(), ', '.join(meas_cols), input_table)
            else:
                query = "SELECT %s, %s FROM %s" % (
                    im_clause(), ', '.join(meas_cols), input_table)
        elif input_table == p.object_table:
            if p.image_table and wellkey_cols:

                # If we have x and y locations from cells, the Classifier can use them.
                if p.cell_x_loc and p.cell_y_loc:
                    FIRST_MEAS_INDEX += 2  # the cell X and Y location columns precede the measurements
                    # If there are well columns, fetch them from the per-image table.
                    query = "SELECT %s, %s, %s, %s, %s FROM %s, %s WHERE %s" % (
                        dbconnect.UniqueObjectClause(
                            p.object_table), well_clause(p.image_table),
                        p.cell_x_loc, p.cell_y_loc, ', '.join([
                            '%s.%s' % (p.object_table, col)
                            for col in meas_cols
                        ]), p.image_table, p.object_table, ' AND '.join([
                            '%s.%s=%s.%s' %
                            (p.image_table, c, p.object_table, c)
                            for c in imkey_cols
                        ]))

                else:
                    # If there are well columns, fetch them from the per-image table.
                    query = "SELECT %s, %s, %s FROM %s, %s WHERE %s" % (
                        dbconnect.UniqueObjectClause(p.object_table),
                        well_clause(p.image_table), ', '.join([
                            '%s.%s' % (p.object_table, col)
                            for col in meas_cols
                        ]), p.image_table, p.object_table, ' AND '.join([
                            '%s.%s=%s.%s' %
                            (p.image_table, c, p.object_table, c)
                            for c in imkey_cols
                        ]))

            else:

                if p.cell_x_loc and p.cell_y_loc:
                    FIRST_MEAS_INDEX += 2  # the cell X and Y location columns precede the measurements

                    query = "SELECT %s, %s, %s, %s FROM %s" % (
                        im_clause(), p.cell_x_loc, p.cell_y_loc,
                        ', '.join(meas_cols), input_table)

                else:
                    query = "SELECT %s, %s FROM %s" % (
                        im_clause(), ', '.join(meas_cols), input_table)

        if p.negative_control:
            # The user defined a negative control; use it to fetch the control wells.
            # The per-image query above may lack a WHERE clause, so pick the connective.
            connective = ' AND ' if ' WHERE ' in query else ' WHERE '
            neg_query = query + connective + p.negative_control

        if wellkey_cols:
            query += " ORDER BY %s" % (well_clause(p.image_table))

        dlg = wx.ProgressDialog('Computing normalized values',
                                'Querying database for raw data.',
                                parent=self,
                                style=wx.PD_CAN_ABORT | wx.PD_APP_MODAL)
        dlg.Pulse()
        #
        # MAKE THE QUERY
        #

        input_data = np.array(db.execute(query), dtype=object)
        if p.negative_control:
            import pandas as pd
            negative_control = pd.DataFrame(db.execute(neg_query), dtype=float)
            logging.info("# of objects in negative control: " +
                         str(negative_control.shape[0]))
            logging.info("# of objects queried: " + str(input_data.shape[0]))
            neg_mean_plate = negative_control.groupby([WELL_KEY_INDEX]).mean()
            neg_std_plate = negative_control.groupby([WELL_KEY_INDEX]).std()

        meas_shape = input_data[:, FIRST_MEAS_INDEX:].shape
        output_columns = np.full(meas_shape, np.nan)
        output_factors = np.full(meas_shape, np.nan)
        for colnum, col in enumerate(input_data[:, FIRST_MEAS_INDEX:].T):
            keep_going, skip = dlg.Pulse("Normalizing column %d of %d" %
                                         (colnum + 1, len(meas_cols)))
            if not keep_going:
                dlg.Destroy()
                return
            norm_data = col.copy()
            for step_num, step_panel in enumerate(self.norm_steps):
                d = step_panel.get_configuration_dict()
                if d[norm.P_GROUPING] in (norm.G_QUADRANT,
                                          norm.G_WELL_NEIGHBORS):
                    # Reshape data if normalization step is plate sensitive.
                    assert p.plate_id and p.well_id
                    well_keys = input_data[:, WELL_KEY_INDEX:FIRST_MEAS_INDEX - 2]
                    wellkeys_and_vals = np.hstack(
                        (well_keys, np.array([norm_data]).T))
                    new_norm_data = []
                    for plate, plate_grp in groupby(wellkeys_and_vals,
                                                    lambda row: row[0]):
                        keys_and_vals = np.array(list(plate_grp))
                        plate_data, wks, ind = FormatPlateMapData(
                            keys_and_vals)
                        pnorm_data = norm.do_normalization_step(
                            plate_data, **d)
                        new_norm_data += pnorm_data.flatten()[
                            ind.flatten().tolist()].tolist()
                    norm_data = new_norm_data
                elif d[norm.P_GROUPING] == norm.G_PLATE:
                    assert p.plate_id and p.well_id

                    if d[norm.P_AGG_TYPE] == norm.M_NEGCTRL:
                        mean_plate_col = neg_mean_plate[colnum +
                                                        FIRST_MEAS_INDEX]
                        std_plate_col = neg_std_plate[colnum +
                                                      FIRST_MEAS_INDEX]
                        logging.debug(mean_plate_col)
                        logging.debug(std_plate_col)

                    well_keys = input_data[:, WELL_KEY_INDEX:FIRST_MEAS_INDEX - 2]
                    wellkeys_and_vals = np.hstack(
                        (well_keys, np.array([norm_data]).T))
                    new_norm_data = []
                    for plate, plate_grp in groupby(wellkeys_and_vals,
                                                    lambda row: row[0]):
                        plate_data = np.array(list(plate_grp))[:, -1].flatten()
                        pnorm_data = norm.do_normalization_step(
                            plate_data, **d)

                        if d[norm.P_AGG_TYPE] == norm.M_NEGCTRL:
                            try:
                                plate_mean = mean_plate_col[plate]
                                plate_std = std_plate_col[plate]
                            except KeyError:
                                # plate keys may come back as strings; retry as ints
                                plate_mean = mean_plate_col[int(plate)]
                                plate_std = std_plate_col[int(plate)]

                            try:
                                pnorm_data = (pnorm_data - plate_mean) / plate_std
                                logging.debug(pnorm_data)
                            except ZeroDivisionError:
                                logging.error('Plate std is zero, division by zero!')

                        new_norm_data += pnorm_data.tolist()
                    norm_data = new_norm_data
                else:
                    norm_data = norm.do_normalization_step(norm_data, **d)

            output_columns[:, colnum] = np.array(norm_data)
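            # The factor is raw / normalized, so dividing a raw measurement
            # by its factor reproduces the normalized value.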
            output_factors[:, colnum] = (col.astype(float) /
                                         np.array(norm_data, dtype=float))

        dlg.Destroy()

        norm_table_cols = []
        # Write new table
        db.execute('DROP TABLE IF EXISTS %s' % (output_table))
        if input_table == p.image_table:
            norm_table_cols += dbconnect.image_key_columns()
            col_defs = ', '.join([
                '%s %s' % (col, db.GetColumnTypeString(p.image_table, col))
                for col in dbconnect.image_key_columns()
            ])
        elif input_table == p.object_table:
            norm_table_cols += obkey_cols
            col_defs = ', '.join([
                '%s %s' % (col, db.GetColumnTypeString(p.object_table, col))
                for col in obkey_cols
            ])
        if wellkey_cols:
            norm_table_cols += wellkey_cols
            col_defs += ', ' + ', '.join([
                '%s %s' % (col, db.GetColumnTypeString(p.image_table, col))
                for col in wellkey_cols
            ])

        if input_table == p.object_table:
            if p.cell_x_loc and p.cell_y_loc:
                norm_table_cols += [p.cell_x_loc, p.cell_y_loc]
                col_defs += ', %s %s, %s %s' % (
                    p.cell_x_loc,
                    db.GetColumnTypeString(p.object_table, p.cell_x_loc),
                    p.cell_y_loc,
                    db.GetColumnTypeString(p.object_table, p.cell_y_loc))

        if wants_norm_meas:
            col_defs += ', ' + ', '.join([
                '%s_NmM %s' % (col, db.GetColumnTypeString(input_table, col))
                for col in meas_cols
            ])
        if wants_norm_factor:
            col_defs += ', ' + ', '.join([
                '%s_NmF %s' % (col, db.GetColumnTypeString(input_table, col))
                for col in meas_cols
            ])

        for col in meas_cols:
            if wants_norm_meas:
                norm_table_cols += ['%s_NmM' % (col)]
            if wants_norm_factor:
                norm_table_cols += ['%s_NmF' % (col)]
        db.execute('CREATE TABLE %s (%s)' % (output_table, col_defs))

        dlg = wx.ProgressDialog('Writing to "%s"' % (output_table),
                                "Writing normalized values to database",
                                maximum=output_columns.shape[0],
                                parent=self,
                                style=wx.PD_CAN_ABORT | wx.PD_APP_MODAL
                                | wx.PD_ELAPSED_TIME | wx.PD_ESTIMATED_TIME
                                | wx.PD_REMAINING_TIME)

        cmd = 'INSERT INTO %s VALUES ' % (output_table)
        cmdi = cmd
        for i, (val, factor) in enumerate(zip(output_columns, output_factors)):
            cmdi += '(' + ','.join(['"%s"'] * len(norm_table_cols)) + ')'
            if wants_norm_meas and wants_norm_factor:
                cmdi = cmdi % tuple(
                    list(input_data[i, :FIRST_MEAS_INDEX]) + [
                        'NULL' if (np.isnan(x) or np.isinf(x)) else x
                        for x in val
                    ] + [
                        'NULL' if (np.isnan(x) or np.isinf(x)) else x
                        for x in factor
                    ])
            elif wants_norm_meas:
                cmdi = cmdi % tuple(
                    list(input_data[i, :FIRST_MEAS_INDEX]) + [
                        'NULL' if (np.isnan(x) or np.isinf(x)) else x
                        for x in val
                    ])
            elif wants_norm_factor:
                cmdi = cmdi % tuple(
                    list(input_data[i, :FIRST_MEAS_INDEX]) + [
                        'NULL' if (np.isnan(x) or np.isinf(x)) else x
                        for x in factor
                    ])
            if (i + 1) % BATCH_SIZE == 0 or i == len(output_columns) - 1:
                db.execute(str(cmdi))
                cmdi = cmd
                # update status dialog
                (keep_going, skip) = dlg.Update(i)
                if not keep_going:
                    break
            else:
                cmdi += ',\n'
        dlg.Destroy()
        db.Commit()

        #
        # Update table linkage
        #
        if db.get_linking_tables(input_table, output_table) is not None:
            db.do_unlink_table(output_table)

        if input_table == p.image_table:
            db.do_link_tables(output_table, input_table, imkey_cols,
                              imkey_cols)
        elif input_table == p.object_table:
            db.do_link_tables(output_table, input_table, obkey_cols,
                              obkey_cols)

        #
        # Show the resultant table
        #
        import tableviewer
        tv = tableviewer.TableViewer(ui.get_main_frame_or_none())
        tv.Show()
        tv.load_db_table(output_table)
Example #4

    def save_settings(self):
        '''Returns a dictionary mapping setting names to values encoded as strings.'''
        return {
            'table': self.table_choice.GetStringSelection(),
            'columns': ','.join(self.col_choices.GetCheckedStrings()),
            'steps':
            repr([s.get_configuration_dict() for s in self.norm_steps]),
            'wants_meas': str(int(self.norm_meas_checkbox.IsChecked())),
            'wants_factor': str(int(self.norm_factor_checkbox.IsChecked())),
            'output_table': self.output_table.Value,
            'version': '1',
        }
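
A minimal sketch of decoding these settings again, assuming they were persisted exactly as the string-valued dictionary above (load_settings is an illustrative name, not part of the original dialog):

import ast

def load_settings(settings):
    '''Decode the string-encoded values produced by save_settings().'''
    return {
        'table': settings['table'],
        'columns': settings['columns'].split(','),
        'steps': ast.literal_eval(settings['steps']),  # repr()'d list of dicts
        'wants_meas': bool(int(settings['wants_meas'])),
        'wants_factor': bool(int(settings['wants_factor'])),
        'output_table': settings['output_table'],
    }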