예제 #1
0
    def __init__(self, tables_not_to_include=None,
                 debug=True,
                 debugMore=True,
                 recalc_struct_to_upload=True):
        """Build (or reload) the table-metadata bulk payload and hand it to the ES uploader.

        Args:
            tables_not_to_include: entries to skip; each item yielded by
                ``dbm.open_db()`` is tested for membership in this collection.
                Defaults to skipping nothing.
            debug: forwarded to ``parse_row`` and ``apply_mapping``; when true
                the mappings are also dumped via ``log_mappings`` to
                ``logs/mappings_used``.
            debugMore: prints the parsed table info before the scale lookup
                and is forwarded as ``debug`` to ``get_info``.
            recalc_struct_to_upload: True to rebuild the payload from the
                database and write it to ``logs/bulk_import_data.json``;
                False to load that existing JSON file and upload it as is.
        """
        INDEX = 'metadata'
        DOCTYPE = 'tables'

        # Avoid a shared mutable default; a list (not a set) is kept because
        # the items yielded by open_db() may be unhashable rows.
        if tables_not_to_include is None:
            tables_not_to_include = []

        dbm = dbManager()   # initialize database manager

        if recalc_struct_to_upload:
            mm = MapManager(debug=False)   # initialize the map manager
            if debug:
                # dump the mappings so they can be checked by hand
                mm.log_mappings('logs/mappings_used')

            # file the bulk import statements are written to
            bulk_upload_fp = u.log_file('logs/bulk_import_data.json', keep_alive=True)

            # open the database and walk every table row it reports
            for i, tablename in enumerate(dbm.open_db()):
                if tablename in tables_not_to_include:
                    continue

                # break the table name into its individual parts ...
                table_info_raw = dbm.parse_row(tablename, debug=debug)
                # ... and apply the mappings to those parts
                table_info_raw = mm.apply_mapping(table_info_raw, debug=debug)

                # tables that belong to an instrument also get its scales and full name
                if 'instrument abbreviation' in table_info_raw:
                    if debugMore:
                        print('\n\nAbout to look at scales for %s' % u.prettify_str(table_info_raw))

                    scales, instrument_name = mm.get_info(
                        table_info_raw['instrument abbreviation'], debug=debugMore)

                    # get_info may return None for either value
                    if scales:
                        table_info_raw['scales'] = [
                            {'scale': scale_name, 'description': description}
                            for scale_name, description in scales.items()
                        ]
                    if instrument_name:
                        table_info_raw['instrument name'] = instrument_name

                # _id is the enumerate index, so skipped tables leave gaps in the ids
                table_indexing = {'index': {'_index': INDEX, '_type': DOCTYPE, '_id': i}}
                # NOTE(review): the action and the document are written back to
                # back with no separator -- confirm insert_struct_to_es expects
                # that (the ES bulk format is newline-delimited).
                json.dump(table_indexing, bulk_upload_fp)
                json.dump(table_info_raw, bulk_upload_fp)
        else:
            # reuse the previously written payload; close the file once parsed
            with open('logs/bulk_import_data.json') as saved_payload:
                bulk_upload_fp = json.load(saved_payload)

        dbm.close_db()
        dbm.open_es()
        print('exporting struct to es.... may take some time....')

        dbm.insert_struct_to_es(bulk_upload_fp)
        # NOTE(review): json.loads expects str/bytes, but bulk_upload_fp is
        # either the handle returned by u.log_file or the object returned by
        # json.load -- confirm this line does not raise TypeError.
        print(u.prettify_str(json.loads(bulk_upload_fp)))
예제 #2
0
 def log_mappings(self,path): # save the mappings to a file 
     """Dump the raw parameters, mappings and instrument parameters to a markdown file.

     The output file name is ``path`` plus a ``-<weekday>_<month>_<year>-<hour>``
     timestamp and the ``.md`` extension. The file is opened in ``'w'`` mode,
     so a later call that produces the same name overwrites the earlier file.

     Args:
         path: file path prefix (without timestamp or extension).
     """
     import time
     stamped_path = path + time.strftime("-%a_%b_%Y-%H") + '.md'
     # the context manager closes the file even if a write or prettify call raises
     with open(stamped_path, 'w') as f:
         f.write('Raw Parameters File :\n')
         f.write(u.prettify_str(self.json_raw))
         f.write('\n\n---------------------------------\n')
         f.write('Mappings Struct:\n')
         f.write(u.prettify_str(self.maps))
         f.write('\n\n---------------------------------\n')
         f.write('Instrument Parameters:\n')
         f.write(u.prettify_str(self.instr_params))
예제 #3
0
    def parse_row(self, row, debug = True): # get a dict with the attribute for this tablename
        """Split a table name into its named attributes.

        Args:
            row: one row as produced by the pypyodbc ``cursor.tables()`` call,
                e.g. ``['wtp_data', '', tablename, 'table', '']``. Only
                ``row[2]`` (the table name) is read.
            debug: when true, print the table name, the detected type and the
                resulting dict.

        Returns:
            A dict holding ``'type'`` (the value returned by
            ``self.get_table_type``) plus one entry per attribute defined for
            that type. For example, if ``'data_4_abes_m'`` is classified as
            ``'data'`` the result is::

                {
                    'tablename': 'data_4_abes_m',
                    'phase': '4',
                    'instrument abbreviation': 'abes',
                    'respondent': 'm',
                    'type': 'data',
                }

        Raises:
            KeyError: if ``get_table_type`` returns a type with no parsing map.
        """
        # --- Parsing maps -------------------------------------------------
        # For every table type: the attributes a table of that type has, and
        # which segment(s) of the '_'-split table name make up each attribute.
        # A slice selects a run of segments, which is re-joined with '_';
        # a plain index selects one segment, which is used as-is.
        whole_name = slice(0, None)
        phase_segment = slice(1, 2)
        instrument_segments = slice(2, -1)   # everything between phase and respondent
        respondent_segment = -1

        parsing_maps = {
            'data': {
                'tablename': whole_name,
                'phase': phase_segment,
                'instrument abbreviation': instrument_segments,
                'respondent': respondent_segment,
            },
            'calc': {
                'tablename': whole_name,
                'phase': phase_segment,
                'instrument abbreviation': instrument_segments,
                'respondent': respondent_segment,
            },
            'misc': {
                'tablename': whole_name,
            },
        }

        # --- Filters --------------------------------------------------------
        # What a table name needs in order to count as a given type; handed to
        # get_table_type. Per the original notes: the type is expected to be
        # the first segment of every table name in the WTP database, 'data'
        # needs at least 4 parts (type, phase, instrument, respondent), and
        # names that fail the filter (e.g. data_dates...) are categorized as
        # 'misc'. If these filters change, get_table_type needs updating too.
        filters = {
            'data': {
                'length': 4,
                'disallowed words': [
                    'dates',
                ],
            },
            'calc': {
                'length': 4,
                'disallowed words': [
                    'dates',
                ],
            },
        }

        tablename = row[2]   # the pypyodbc tables() row has tablename at index 2

        if debug:
            print('Parsing table %s' % tablename)
        table_split = tablename.split('_')   # the sections are separated by '_'
        table_type = self.get_table_type(table_split, filters)
        if debug:
            print('type determined to be %s' % table_type)

        attributes_dict = {}
        for attrib, selector in parsing_maps[table_type].items():
            parts = table_split[selector]
            # slices yield a list of segments; a single index yields a string
            if isinstance(parts, list):
                parts = '_'.join(parts)
            attributes_dict[attrib] = parts
        attributes_dict['type'] = table_type

        if debug:
            print('\tbecame->\n\t' + u.prettify_str(attributes_dict).replace('\n','\n\t'))

        return attributes_dict
예제 #4
0
    def get_info(self, instr_abbrev, instr_full=None, instr_list = None, 
        line ='', debug = True): # gets scales and name
        """Look up the scales and full name for an instrument abbreviation.

        The first instrument in ``instr_list`` whose abbreviation is a prefix
        of ``instr_abbrev`` is used. If it has ``'extensions'`` the remainder
        of the abbreviation is resolved recursively against them, and the
        scales found there are merged with the instrument's own scales.

        Args:
            instr_abbrev: the abbreviation of the instrument being looked up.
            instr_full: instrument name accumulated so far; each matched level
                appends its own name followed by a space.
            instr_list: the instrument maps to search. Defaults to
                ``self.instr_params`` when None. An empty mapping is searched
                as is (and so matches nothing).
            line: prefix for debug output; recursion appends a tab per level.
            debug: when true, print the lookup trace.

        Returns:
            ``(scales, instr_full)``: ``scales`` is a dict of the scales
            found, or None if there are none; ``instr_full`` is the instrument
            name as far as it could be determined (None if nothing matched and
            none was passed in).

        Instrument maps have the form::

            {
                'instr name': {
                    'abbreviation': '',
                    'extensions': {
                        'instrument name': {
                            'abbreviation': '',
                            'scales': {},
                            'memo': '',
                            'details': '',
                        }
                    },
                    'scales': {},
                }
            }
        """
        # Only fall back to the full parameter list when no list was given.
        # A falsy check here would make an empty 'extensions' dict restart the
        # search from the top-level instruments during recursion.
        if instr_list is None:
            instr_list = self.instr_params

        scales = None   # scales collected from the maps
        for instr, instr_dict in instr_list.items():
            abbreviation = instr_dict['abbreviation']
            # NOTE: prefix matching means the first matching entry wins, so
            # with abbreviations like 'au' and 'aub' the later one may never
            # be tried.
            if not instr_abbrev.startswith(abbreviation):
                continue

            if debug:
                print('%sFound beginning of %s in:' % (line, instr_abbrev))
                print('%s\"%s\" %s' % (line, instr, u.prettify_str(instr_dict).replace('\n', '\n\t' + line)))

            # track the full instrument name; lower levels append their additions
            instr_full = (instr_full or '') + instr + ' '
            if debug:
                print('%sinstrument name = %s...' % (line, instr_full))

            if 'extensions' in instr_dict:
                if debug:
                    print('%s---Extensions found: Recursing' % (line + '\t'))
                # whatever follows this level's abbreviation belongs to an extension
                instr_abbrev_suffix = instr_abbrev[len(abbreviation):]
                lower_scales, instr_full = self.get_info(
                    instr_abbrev_suffix,
                    instr_full=instr_full,
                    instr_list=instr_dict['extensions'],
                    line=line + '\t',
                    debug=debug)
                if lower_scales:
                    # copy so the maps themselves are never modified
                    scales = {}
                    scales.update(lower_scales)
            elif abbreviation != instr_abbrev:
                # no extensions and only a partial match: give up
                if debug:
                    print('%sNothing Found. The abbreviations don\'t match. .%s:%s.'
                          % (line + '\t', instr_abbrev, abbreviation))
                return None, instr_full

            if 'scales' in instr_dict:
                if debug:
                    print('%sScales found!' % line)
                if scales is None:
                    scales = {}
                scales.update(instr_dict['scales'])
            if debug:
                print('%sReturning scales as %s' % (line, str(scales)[:30]))
            return scales, instr_full

        # no instrument in the list matched
        return None, instr_full