Exemplo n.º 1
0
def verify_data( data, columns, hierarchy_indexes, start_ind ):
    ''' Detect eventual errors in the given file (parts inconsistent with
        columns and hierarchy). Return found errors in the list.

        Each row is checked for: wrong field count, fields inconsistent
        with the expected column types, and an empty hierarchy part.

        data              - iterable of rows (lists of fields)
        columns           - column definitions from metadata
        hierarchy_indexes - indexes of the hierarchy columns in a row
        start_ind         - index of the first row of data (used in error
                            messages so they point at the original file line)
    '''
    row_types = get_row_types( columns, hierarchy_indexes )
    expected_len = len( row_types )

    errors = []

    log.description('Verifying data')
    for (i, row) in enumerate(data, start_ind):
        # TODO log it per 1%
        if i % 1000 == 0:
            log.description(i)

        if len( row ) != expected_len:
            errors.append( bad_len( i, len(row), expected_len ) )

        if not are_fields_correct( row_types, row ):
            errors.append( bad_fields( i, row_types, row ) )

        if is_row_hierarchy_empty( row, hierarchy_indexes ):
            errors.append( empty_hierarchy( i ) )

    if errors:
        log.error('%s error(s) found' % len(errors))
    else:
        log.description('Finished with no errors')

    return errors
Exemplo n.º 2
0
    def check_db_counters( self, init_endpoint_id, init_dbtree_id, init_data_id ):
        '''Check consistency of db counters and data in the database(if no data
           has higher counter than db counter. If such a situation happens,
           ask if it should be removed.'''
        # Check db tree nodes
        if self.db.get_higher_dbtree( init_dbtree_id ) != []:
            log.warn('Found wrong dbtree nodes, higher than %d' % init_dbtree_id)
            log.question('Do you want to remove them? (Y/N)')
            dec = raw_input('Your decision: ')
            if dec.lower() == 'y':
                self.db.remove_higher_dbtree( init_dbtree_id )
                log.description('Removed wrong dbtree nodes')
        else:
            log.description('Dbtree correct')

        # Check hierarchy
        endpoint = 'data_' + str( init_endpoint_id )
        if self.db.get_higher_hierarchy( endpoint ) != []:
            print 'Found wrong hierarchy columns, higher than %d' % init_endpoint_id
            print 'Do you want to remove them? (Y/N)'
            dec = raw_input('Your decision: ')
            if dec.lower() == 'y':
                self.db.remove_higher_hierarchy( endpoint )
                print 'Removed wrong hierarchy columns'
        else:
            print 'Hierarchy correct'

        if self.db.get_higher_columns( endpoint ):
            print 'Found wrong columns, higher than %d' % init_endpoint_id
            print 'Do you want to remove them? (Y/N)'
            dec = raw_input('Your decision: ')
            if dec.lower() == 'y':
                self.db.remove_higher_columns( endpoint )
                print 'Removed wrong columns'
        else:
            print 'Columns correct'

        # Check relations in ptree
        if self.db.get_higher_ptree( init_data_id ) != []:
            print 'Found wrong ptree nodes, higher than %d' % init_data_id
            print 'Do you want to remove them? (Y/N)'
            dec = raw_input('Your_decision: ')
            if dec.lower() == 'y':
                self.db.remove_higher_ptree( init_data_id )
                print 'Removed wrong ptree nodes'
        else:
            print 'Ptree correct'

        # Remove tables with incorrect endpoints data
        tables_names = self.db.get_higher_datatables( init_endpoint_id )
        if tables_names != []:
            print 'Found too many tables, higher than %d' % init_endpoint_id
            print 'Do you want to remove them? (Y/N)'
            dec = raw_input('Your decision: ')
            if dec.lower() == 'y':
                self.db.drop_higher_datatables( init_endpoint_id )
                print 'Removed wrong data tables:'
                for tname in tables_names:
                    print 'Removed table', tname
        else:
            print 'Data tables correct'
        
        # Check user uploaded collections
        users = self.db.get_non_admin_users()
        for user in users:
            if self.db.has_old_collections( user, init_dbtree_id ):
                print 'Found old collections from user %s' % user
                print 'Do you want to remove them? (Y/N)'
                dec = raw_input('Your decision: ')
                if dec.lower() == 'y':
                    self.db.remove_old_collections( user, init_dbtree_id )
            else:
                print 'User %s correct' % user
Exemplo n.º 3
0
    def upload(self, has_header=True, visible=True, restore=False):
        '''Main method of Uploader. Checks db counters, if any inconsistency
           is found, then ask if it should be removed. After that, checks data
           that is about to be uploaded. After this attempts to upload data.
           If any error occurs during that process, then removes uploaded data to
           that moment. Returns tuple containing boolean value that tells if it
           succeeded and name of the new endpoint (or the error(s) on failure).
           There are 3 optional parameters: has_header - if data file comes with header,
           visible - if endpoint should be visible after upload,
           restore - if state of db should be restored to the state pointed in debug_restore()
                   method. Use with CAUTION!
        '''
        # restore db state to a state before a recent data insertion
        if restore:
            self.debug_restore()

        # Remember current db counters so a failed upload can be rolled back
        init_endpoint_id = self.db.get_max_endpoint()
        init_dbtree_id   = self.db.get_max_dbtree_id()
        init_data_id     = self.db.get_max_data_id()

        # TODO move it to db module. data from db module should come correct!
        log.section('DB counters correctness')
        self.check_db_counters(init_endpoint_id, init_dbtree_id, init_data_id)
        log.end_section()

        # TODO move it to Meta class constructor!
        # Check if parents, columns and hierarchy from meta is correct
        log.section('Metadata correctness')
        try:
            log.description('Verifying metadata')
            self.check_correctness()
        except UploadDataException as e:
            log.error(e.get_error())
            return ( False, e.get_error() )
        log.end_section()

        # Check data, if any error is in data, stop processing and return list with errors
        log.section('Data correctness')
        errors = self.find_errors(has_header)
        if errors:
            return (False, errors)
        log.end_section()

        endpoint = None
        log.section('Data insertion')
        if self.debug:
            # in debug mode let exceptions propagate for easier debugging
            endpoint = self.insert_data_into_db(has_header, visible)
        else:
            try:
                endpoint = self.insert_data_into_db( has_header, visible )
            except UploadDataException as e:
                log.error('Failed.')
                log.error(e)
                log.end_section()

                # cleanup after unsuccessful upload
                self.remove_uploaded( init_endpoint_id, init_dbtree_id, init_data_id )
                # Bug fix: this used to call exit(0), reporting success to the
                # calling shell even though the upload failed.
                exit(1)

        log.description('Done!')
        log.end_section()

        return (True, endpoint)
Exemplo n.º 4
0
    def upload_data( self, endpoint, has_header=True, sum_up=True):
        '''Remove table for endpoint = given endpoint(if exists) and create a new
           one for new data. Create IdMap to track parent-child relations between
           nodes. If has_header = True, then omit the first line. Transform rows
           from original data to rows without hierarchy, and create hierarchy
           rows. Return max id of nodes from the collection.

           Rows are uploaded in batches; numeric columns of non-leaf rows are
           accumulated from their leaf rows while processing. A synthetic
           "total" row summing all top-level rows is appended at the end.

           NOTE(review): the sum_up parameter is accepted but never used in
           this method.
        '''
        def db_type( col_type, col_format ):
            # Map a meta column type to a db column type; a 'number' column
            # becomes 'float' when its format contains a decimal point.
            if col_type == 'number':
                return 'float' if '.' in col_format else 'int'
            else:
                return col_type

        def type_fun( col_type, col_format ):
            # Python conversion function matching db_type()'s choice.
            return int if db_type( col_type, col_format ) == 'int' else float

        # UnicodeReader over the source data (header skipped if requested)
        bulk = self.get_data(has_header)

        # Recreate the endpoint's data table with columns built from metadata
        self.db.remove_table( endpoint )
        columns = [(t['key'], db_type(t['type'], t['format'])) for t in self.meta.get_columns()]
        self.db.create_table( endpoint, columns )

        # Collect (index, converter) pairs for numeric columns that get summed
        # up into parent rows. The +5 offset presumably skips leading service
        # columns in the flattened row -- TODO confirm against add_rows().
        summable_cols = []
        for (i, col) in enumerate( self.meta.get_columns() ):
            if col['type'] == 'number':
                summable_cols.append( (i+5, type_fun(col['type'], col['format'])) )

        # Ids for new rows continue after the current max data id
        start_id = self.db.get_max_data_id()
        id_map = IdMap( start_id )

        batch_size = self.count_batch_size()
        print 'BATCH_SIZE = ', batch_size  # debug trace of the chosen batch size
        # rows to be uploaded in one batch
        batch_rows = []
        # rows that are actually processed, they need to be remembered,
        # because numeric fields should be summed from many leaves
        proc_rows = []
        # hierarchy
        ptree_hier = []
        # ptree rows to be uploaded in one batch
        batch_ptree_rows = []
        # list representing values in hierarchy fields
        old_hierarchy_in_row = []
        total_row = self.create_total_row( None )

        for i, row in enumerate( bulk ):
            # progress trace every 1000 rows
            if i % 1000 == 0:
                log.description(i)
            # retrieve hierarchy from the row
            hierarchy_in_row = self.get_hierarchy_cols( row )
            # remove empty fields from hierarchy columns
            while len( hierarchy_in_row ) > 0 and hierarchy_in_row[-1][0] == '':
                hierarchy_in_row.pop()

            # depth up to which this row shares its hierarchy with the
            # previous row; rows below that depth are finished
            common_level = self.hierarchy_common_level( hierarchy_in_row,
                                                        old_hierarchy_in_row )
            # Replace empty numeric cells with Nones (NULL in Postgres)
            # NOTE(review): data_cells are zipped with columns starting from
            # index 0, not columns[hier_num:] -- verify that meta columns
            # exclude the hierarchy columns, otherwise types are misaligned.
            hier_num = len(hierarchy_in_row)
            hier_cells = row[:hier_num]
            data_cells = row[hier_num:]
            row = hier_cells + [None if c[1] != 'string' and e == '' else e for e, c in zip(data_cells, columns)]

            # Transform rows to non hierarchical form
            new_rows = self.add_rows(id_map, common_level, hierarchy_in_row, row)
            ptree_hier, new_ptree_rows = self.create_ptree_rows( common_level,
                                            len( hierarchy_in_row ),
                                            new_rows, ptree_hier )

            leaf_row = new_rows[-1]
            # remove rows that are not needed to sum values anymore
            # (all their children were added) and if there is top level
            # row in them, then add values from it to total row
            if i > 0:
                if common_level == 0:
                    self.sum_values( total_row, proc_rows[0], summable_cols )
                batch_rows += proc_rows[common_level:]
                proc_rows   = proc_rows[:common_level]

            proc_rows += new_rows
            # sum from last but one row using values from leaf row
            # NOTE(review): this inner loop reuses the outer loop variable i;
            # harmless because enumerate rebinds i each iteration, but fragile.
            for i in range( len(proc_rows) - 2, -1, -1):
                self.sum_values( proc_rows[i], leaf_row, summable_cols )

            old_hierarchy_in_row = hierarchy_in_row
            batch_ptree_rows += new_ptree_rows

            # flush a full batch of data and ptree rows to the database
            if len( batch_rows ) > batch_size:
                self.db.insert_data( batch_rows, endpoint )
                batch_rows = []
                self.db.insert_ptree_data( batch_ptree_rows )
                batch_ptree_rows = []


        # flush the rows still being processed after the last data row
        batch_rows += proc_rows
        # add values from the last top row to total row
        # NOTE(review): assumes at least one data row; proc_rows[-1] raises
        # IndexError on empty input -- confirm empty files are rejected earlier.
        self.sum_values( total_row, proc_rows[-1], summable_cols )

        # TODO: get rid of magic numbers (0, 1 below)
        # give the synthetic total row its own id and an empty parent list
        total_row_id = id_map.add_id( 0, 1 )[0]
        total_row[0] = total_row_id
        batch_rows.append( total_row )
        batch_ptree_rows.append( (total_row_id, []) )

        # final flush of the remaining rows
        self.db.insert_data( batch_rows, endpoint )
        self.db.insert_ptree_data( batch_ptree_rows )

        return id_map.get_last_id()
Exemplo n.º 5
0
    def insert_data_into_db( self, has_header, visible ):
        '''Insert the new collection into the database.

           Adds node(s) to the dbtree, uploads the new hierarchy and column
           definitions, then uploads the data itself (which also sums columns
           of higher level nodes and fills ptree). Registers the collection
           for non-admin users and bumps the db data-id counter. Returns the
           new endpoint's name.

           has_header - whether the data file's first line is a header
           visible    - whether the endpoint should be visible after upload
        '''
        log.description('Uploading...')

        new_endpoint, created_dbtree_ids = self.update_dbtree(visible)
        log.description('Dbtree uploaded')

        self.update_hierarchy(new_endpoint)
        log.description('Hierarchy uploaded')

        self.update_columns(new_endpoint)
        log.description('Columns uploaded')

        # upload_data returns the id of the last inserted row
        max_uploaded_id = self.upload_data(new_endpoint, has_header=has_header)
        log.description('Data uploaded')
        log.description('Columns summed up, ptree uploaded')

        # Non-admin uploads are additionally tracked per user
        if not self.db.is_admin( self.meta.get_user() ):
            self.db.add_user_collections( self.meta.get_user(), created_dbtree_ids )

        self.db.set_max_data_id( max_uploaded_id )

        log.description('Ptree uploaded')

        return new_endpoint