示例#1
0
 def predict(self, predict_table_name, actual_label_col=''):
     '''
       Score the rows of a table with the coefficients held in self.model.
       Input:
       ======
       predict_table_name : (String) the name of the table to be used for prediction
       actual_label_col : (String) the name of the actual label column; it is only
                          passed on to pivotCategoricalColumns (default: '')

       Output:
       =======
       The cursor on which the prediction query was executed. The query selects every
       column of the table plus the dot product of the model coefficients and the
       independent variables as column 'prediction'.
     '''
     #Pivot categorical columns (if any) of the prediction table, handing over the
     #'col_distinct_vals_dict' kept in the model
     pivot_result = pivotCategoricalColumns(
         self.dbconn,
         predict_table_name,
         self.model['indep_org'],
         actual_label_col,
         self.model['col_distinct_vals_dict']
     )
     #Only the first of the four returned values (the table to query) is used below
     predict_table_name, _, _, _ = pivot_result

     query_template = '''
               select *, 
                      madlib.array_dot(array{coef}::real[],{indep}) as prediction 
               from {table_name}
            '''
     query = query_template.format(
         coef=self.model['coef'],
         indep=self.model['indep'],
         table_name=predict_table_name
     )

     result_cursor = self.dbconn.getCursor()
     result_cursor.execute(query)
     return result_cursor
示例#2
0
 def predict(self, predict_table_name, actual_label_col=''):
     '''
       Score the rows of a table with the coefficients held in self.model.
       Input:
       ======
       predict_table_name : (String) the name of the table to be used for prediction
       actual_label_col : (String) the name of the actual label column; it is only
                          passed on to pivotCategoricalColumns (default: '')

       Output:
       =======
       Whatever psql.read_frame returns for the prediction query (a dataframe of the
       prediction results). The query selects every column of the table plus the dot
       product of the model coefficients and the independent variables as 'prediction'.
     '''
     #Pivot categorical columns (if any) of the prediction table, handing over the
     #'col_distinct_vals_dict' kept in the model
     pivot_result = pivotCategoricalColumns(
         self.dbconn,
         predict_table_name,
         self.model['indep_org'],
         actual_label_col,
         self.model['col_distinct_vals_dict']
     )
     #Only the first of the four returned values (the table to query) is used below
     predict_table_name, _, _, _ = pivot_result

     #Values substituted into the query; the MADlib schema name comes from the connection
     sql_params = {
         'coef': self.model['coef'],
         'indep': self.model['indep'],
         'table_name': predict_table_name,
         'madlib_schema': self.dbconn.getMADlibSchema(),
     }
     query = '''
               select *, 
                      {madlib_schema}.array_dot(array{coef}::real[],{indep}) as prediction 
               from {table_name}
            '''.format(**sql_params)

     return psql.read_frame(query, self.dbconn.getConnection())
示例#3
0
 def train(self, table_name, indep, dep, numIter=100, optimizer='irls',precision=0.001):
     '''
       Train a logistic regression model on the specified table
       for the given set of independent and dependent variables.
       Inputs :
       ========
       table_name : (String) input table name
       indep : (String) column containing independent variables as an array, to be used to build the model on
                                              OR
               (list) a list of strings, where each element of the list is a column name of table_name or is a constant number
       dep : (string) the class label.
       numIter : value substituted as the 4th argument of madlib.logregr (default: 100)
       optimizer : (String) value substituted as the 5th argument of madlib.logregr (default: 'irls')
       precision : value substituted as the 6th argument of madlib.logregr (default: 0.001)

       Output :
       ========
       self.model, a dict holding 'indep_org' (indep exactly as passed in), 'indep', 'dep'
       and every entry of the mapping returned by dbconn.fetchModelParams for the result rows.
       As a side effect the statement is printed and the result rows are handed to dbconn.printModel.
     '''
     self.model = {}
     #Remember the independent variables exactly as the caller supplied them
     self.model['indep_org'] = indep
     #If indep is a list, then the input is specified as a list of columns in a table.
     #1) First, we will transform any categorical columns in this list.
     #2) We will marshal the values from the columns into an array, that can be passed on to MADlib's logistic regression algorithm.
     if isinstance(indep, list):
         table_name, indep, dep, _ = pivotCategoricalColumns(self.dbconn, table_name, indep, dep)
         #Convert transformed independent columns into an array
         table_name, indep = convertsColsToArray(self.dbconn, table_name, indep, dep)

     self.model['indep'] = indep
     self.model['dep'] = dep

     stmt = '''
               select * 
               from madlib.logregr('{table_name}','{dep}','{indep}',{numIter}, '{optimizer}', {precision}) 
            '''
     stmt = stmt.format(dep=dep,
                        indep=self.model['indep'],
                        table_name=table_name,
                        numIter=numIter,
                        optimizer=optimizer,
                        precision=precision
                       )

     #A single parenthesised argument is printed identically by the Python 2 print
     #statement and the Python 3 print function; the former two-item print statement
     #was a SyntaxError on Python 3
     print('\nstatement : ' + stmt)
     print('\n')

     cursor = self.dbconn.getCursor()
     cursor.execute(stmt)

     row_set = self.dbconn.fetchRowsFromCursor(cursor)
     mdl_params = self.dbconn.fetchModelParams(row_set)
     self.dbconn.printModel(row_set)

     #Copy every fitted parameter into the model next to the bookkeeping keys set above
     for param in mdl_params:
         self.model[param] = mdl_params[param]

     return self.model
示例#4
0
        def predict(self, predict_table_name,actual_label_col='',threshold=0.5):
            '''
              Apply the logistic function to the dot product of the model coefficients
              and the independent variables for every row of a table.
              Input:
              ======
              predict_table_name : (String) the name of the table to be used for prediction
              actual_label_col : (String) the name of the actual label column; it is only
                                 passed on to pivotCategoricalColumns (default: '').
              threshold : (float), the probability beyond which the predicted values will be considered +ve (default: 0.5).
                          Any false value (e.g. None or 0) makes the query return the probability itself.

              Output:
              =======
              The cursor on which the prediction query was executed. The query selects every
              column of the table plus column 'prediction': 1/0 when a threshold is given,
              otherwise the computed probability.
              As a side effect the statement is printed.
            '''
            #If the independent columns specified in the training method were a list (instead of a column name of type array)
            #We should transform the independent columns in the predict table as well
            if isinstance(self.model['indep_org'], list):
                predict_table_name, indep, dep, _ = pivotCategoricalColumns(self.dbconn,predict_table_name, self.model['indep_org'], actual_label_col)
                #Convert transformed independent columns into an array
                #(only the returned table name is used below; the query reads self.model['indep'])
                predict_table_name, indep = convertsColsToArray(self.dbconn, predict_table_name, indep, dep)

            if threshold:
                stmt = '''
                          select *,
                                case when (1.0/(1.0 + exp(-1.0*madlib.array_dot({indep}, array{coef}::real[])))) > {threshold} 
                                          THEN 1 ELSE 0 
                                end as prediction 
                          from {table_name}
                       '''.format(coef=self.model['coef'],
                                  indep=self.model['indep'],
                                  table_name=predict_table_name,
                                  threshold=threshold
                                 )
            else:
                #If threshold is not specified, we will return actual predictions
                stmt = '''
                          select *,
                                (1.0/(1.0 + exp(-1.0*madlib.array_dot({indep}, array{coef}::real[]))))  as prediction 
                         from {table_name}
                       '''.format(coef=self.model['coef'],
                                  indep=self.model['indep'],
                                  table_name=predict_table_name
                                 )

            #A single parenthesised argument is printed identically by the Python 2 print
            #statement and the Python 3 print function; the former two-item print statement
            #was a SyntaxError on Python 3
            print('\nstatement: ' + stmt)
            print('\n')

            cursor = self.dbconn.getCursor()
            cursor.execute(stmt)
            return cursor
示例#5
0
 def train(self, table_name, indep, dep, numIter=100, optimizer='irls',precision=0.001):
     '''
       Train a logistic regression model on the specified table
       for the given set of independent and dependent variables.
       Inputs :
       ========
       table_name : (String) input table name
       indep : (String) column containing independent variables as an array, to be used to build the model on
                                              OR
               (list) a list of strings, where each element of the list is a column name of table_name or is a constant number
       dep : (string) the class label.
       numIter : value substituted as the 4th argument of the logregr call (default: 100)
       optimizer : (String) value substituted as the 5th argument of the logregr call (default: 'irls')
       precision : value substituted as the 6th argument of the logregr call (default: 0.001)

       Output :
       ========
       A tuple (self.model, mdl_params):
         self.model : dict holding 'indep_org' (indep exactly as passed in), 'indep', 'dep'
                      and, for every column of mdl_params, the element at index 0 of that column
         mdl_params : the object returned by psql.read_frame for the training query
     '''
     #Start from a fresh model that remembers the independent variables as supplied
     self.model = {'indep_org': indep}

     #A list means the input is given as individual columns of the table:
     #1) transform any categorical columns in the list,
     #2) marshal the columns into one array that can be passed on to MADlib's logistic regression.
     if isinstance(indep, list):
         pivoted_table, pivoted_cols, dep, _ = pivotCategoricalColumns(self.dbconn, table_name, indep, dep)
         table_name, indep = convertsColsToArray(self.dbconn, pivoted_table, pivoted_cols, dep)

     self.model['indep'] = indep
     self.model['dep'] = dep

     #Values substituted into the query; the MADlib schema name is read from the connection
     query_args = {
         'dep': dep,
         'indep': self.model['indep'],
         'table_name': table_name,
         'numIter': numIter,
         'optimizer': optimizer,
         'precision': precision,
         'madlib_schema': self.dbconn.madlib_schema,
     }
     stmt = '''
               select * 
               from {madlib_schema}.logregr('{table_name}','{dep}','{indep}',{numIter}, '{optimizer}', {precision}) 
            '''.format(**query_args)

     log_line = 'statement :{0}'.format(stmt)
     logging.info(log_line)

     mdl_params = psql.read_frame(stmt, self.dbconn.getConnection())
     #Store the first value of each result column under the column's name
     for column_name in mdl_params.columns:
         self.model[column_name] = mdl_params.get(column_name)[0]

     return self.model, mdl_params
示例#6
0
        def predict(self, predict_table_name,actual_label_col='',threshold=0.5):
            '''
              Apply the logistic function to the dot product of the model coefficients
              and the independent variables for every row of a table.
              Input:
              ======
              predict_table_name : (String) the name of the table to be used for prediction
              actual_label_col : (String) the name of the actual label column; it is only
                                 passed on to pivotCategoricalColumns (default: '').
              threshold : (float), the probability beyond which the predicted values will be considered +ve (default: 0.5).
                          Any false value (e.g. None or 0) makes the query return the probability itself.

              Output:
              =======
              Whatever psql.read_frame returns for the prediction query. The query selects
              every column of the table plus column 'prediction': 1/0 when a threshold is
              given, otherwise the computed probability.
            '''
            #When training received a list of columns (rather than the name of an array column),
            #the prediction table has to go through the same two helper calls
            trained_on_column_list = isinstance(self.model['indep_org'], list)
            if trained_on_column_list:
                predict_table_name, pivoted_cols, label_col, _ = pivotCategoricalColumns(self.dbconn, predict_table_name, self.model['indep_org'], actual_label_col)
                #Pack the transformed columns into an array; only the table name is used below
                predict_table_name, pivoted_cols = convertsColsToArray(self.dbconn, predict_table_name, pivoted_cols, label_col)

            #Substitutions shared by both query variants
            sql_params = {
                'coef': self.model['coef'],
                'indep': self.model['indep'],
                'table_name': predict_table_name,
                'madlib_schema': self.dbconn.getMADlibSchema(),
            }

            if threshold:
                #Classify: 1 when the probability exceeds the threshold, 0 otherwise
                sql_params['threshold'] = threshold
                stmt = '''
                          select *,
                                case when (1.0/(1.0 + exp(-1.0*{madlib_schema}.array_dot({indep}, array{coef}::real[])))) > {threshold} 
                                          THEN 1 ELSE 0 
                                end as prediction 
                          from {table_name}
                       '''.format(**sql_params)
            else:
                #No threshold: return the probability itself
                stmt = '''
                          select *,
                                (1.0/(1.0 + exp(-1.0*{madlib_schema}.array_dot({indep}, array{coef}::real[]))))  as prediction 
                         from {table_name}
                       '''.format(**sql_params)

            log_line = 'statement:{0}'.format(stmt)
            logging.info(log_line)
            return psql.read_frame(stmt, self.dbconn.getConnection())
示例#7
0
 def train(self, table_name, indep, dep):
     '''
       Train a linear regression model on the specified table
       for the given set of independent and dependent variables.
       Inputs :
       ========
       table_name : (String) input table name
       indep : (list of strings) the independent variables to be used to build the model on
       dep : (string) the class label

       Output :
       ========
       self.model, a dict holding
         'indep' : the 'array[...]' expression built from the columns returned by pivotCategoricalColumns
         'indep_org' : indep exactly as passed in
         'col_distinct_vals_dict' : the fourth value returned by pivotCategoricalColumns
         'dep' : the dependent column returned by pivotCategoricalColumns
       plus every entry of the mapping returned by dbconn.fetchModelParams for the result rows.
       As a side effect the statement is printed and the result rows are handed to dbconn.printModel.
     '''
     #Keep the caller's column list; indep is rebound by the pivot step below
     indep_org = indep

     #Transform the columns if any of them are categorical
     table_name, indep, dep, col_distinct_vals_dict = pivotCategoricalColumns(self.dbconn,table_name, indep, dep)

     self.model = {}
     self.model['indep'] = 'array[{0}]'.format(','.join(indep))
     self.model['indep_org'] = indep_org
     self.model['col_distinct_vals_dict'] = col_distinct_vals_dict
     self.model['dep'] = dep

     stmt = '''
               select (madlib.linregr({dep},{indep})).* 
               from {table_name}
            '''
     stmt = stmt.format(dep=dep,indep=self.model['indep'], table_name=table_name)

     #A single parenthesised argument is printed identically by the Python 2 print
     #statement and the Python 3 print function; the former two-item print statement
     #was a SyntaxError on Python 3
     print('\nstatement : ' + stmt)
     print('\n')

     cursor = self.dbconn.getCursor()
     cursor.execute(stmt)
     row_set = self.dbconn.fetchRowsFromCursor(cursor)
     mdl_params = self.dbconn.fetchModelParams(row_set)
     self.dbconn.printModel(row_set)

     #Copy every fitted parameter into the model next to the bookkeeping keys set above
     for param in mdl_params:
         self.model[param] = mdl_params[param]

     return self.model
示例#8
0
 def train(self, table_name, indep, dep):
     '''
       Train a linear regression model on the specified table
       for the given set of independent and dependent variables.
       Inputs :
       ========
       table_name : (String) input table name
       indep : (list of strings) the independent variables to be used to build the model on
       dep : (string) the class label

       Output :
       ========
       A tuple (self.model, mdl_params):
         self.model : dict holding 'indep' (the 'array[...]' expression over the pivoted columns),
                      'indep_org' (indep as passed in), 'col_distinct_vals_dict', 'dep' and,
                      for every column of mdl_params, the element at index 0 of that column
         mdl_params : the object returned by psql.read_frame for the training query
     '''
     #Keep the caller's column list; indep is rebound by the pivot step below
     original_indep = indep

     #Transform the columns if any of them are categorical
     pivot_result = pivotCategoricalColumns(self.dbconn, table_name, indep, dep)
     table_name, indep, dep, distinct_vals = pivot_result

     self.model = {}
     indep_array_expr = 'array[{0}]'.format(','.join(indep))
     self.model['indep'] = indep_array_expr
     self.model['indep_org'] = original_indep
     self.model['col_distinct_vals_dict'] = distinct_vals
     self.model['dep'] = dep

     #Values substituted into the query; the MADlib schema name comes from the connection
     query_args = {
         'dep': dep,
         'indep': self.model['indep'],
         'table_name': table_name,
         'madlib_schema': self.dbconn.getMADlibSchema(),
     }
     stmt = '''
               select ({madlib_schema}.linregr({dep},{indep})).* 
               from {table_name}
     '''.format(**query_args)

     log_line = 'statement :{0}'.format(stmt)
     logging.info(log_line)

     mdl_params = psql.read_frame(stmt, self.dbconn.getConnection())
     #Store the first value of each result column under the column's name
     for column_name in mdl_params.columns:
         self.model[column_name] = mdl_params.get(column_name)[0]

     return self.model, mdl_params