Example #1
  def refine_schemas(self):
    """Try to discern relationships between tables by looking for primary key
    and column name and column help text similarities. Encode those relationships
    as the target foreign key table in the "is_key" element within the schemas
    to replace the True/False 1/0 value there currently"""
    if self.verbosity:
      print('Refining {0} schemas... '.format(len(self.schemas)))
    # Find all foreign key references even if the column is not labeled 'is_key',
    # because the FDA doesn't label the footnotes column for the NDB No as a
    # primary key, though maybe it should.
    Nkeys = {}
    Ncols = {}
    # count the keys before they begin being modified
    for k, schema in self.schemas.items():
      Nkeys[k] = schema['is_key'].count(True)
      Ncols[k] = len(schema['is_key'])  # already available within the schema
    for k, schema in self.schemas.items():
      if self.verbosity:
        print('Refining schema {0}'.format(k))
      for i, schema_is_key in enumerate(schema['is_key']):
        max_target_likelihood = 0.0
        max_target_key = None  # guard: stays None if no candidate ever beats the threshold
        source_likelihood = 0.3 * (Nkeys[k] - 1)
        source_likelihood += 0.5 * bool(schema_is_key)
        source_likelihood -= 0.5 * bool(schema['is_blank'][i])
        target_likelihoods = []
        for fsk, foreign_schema in self.schemas.items():
          # actHL: a kluge may still be needed here to catch the footnote table in the FDA database
          target_likelihood = source_likelihood
          if fsk == k and i == 0 and Nkeys[k] == 1:
            target_likelihood += 0.5
          target_likelihood += 0.5 * (schema['column_name'][i] == foreign_schema['column_name'][0])
          target_likelihood += nlp.similarity(nlp.essence(schema['help_text'][i][0:32]),
                                              nlp.essence(foreign_schema['help_text'][0][0:32]))
          target_likelihood += 0.2 * (schema['field_type'][i] == foreign_schema['field_type'][0])
          target_likelihood += 0.5 * foreign_schema['numcols'] / (2 * Nkeys[fsk] + 1)  # +1 to help avoid divide by zero
          # assumes that the key for each table is the first column, which is not the general case, but works for the FDA schema
          if self.verbosity:
            print('{0}.{1} -> {2}.{3} likelihood={4}'.format(
                k, schema['column_name'][i], fsk, foreign_schema['column_name'][0], target_likelihood))
          if target_likelihood > max_target_likelihood:
            max_target_likelihood = target_likelihood
            max_target_key = fsk
          target_likelihoods.append(target_likelihood)
        if self.verbosity:
          print('max likelihood={0}={1}'.format(max_target_likelihood, max(target_likelihoods)))
        if max_target_key == k:
          schema['is_key'][i] = 1  # actHL: should already be 1
        elif max_target_likelihood > 1:
          schema['is_key'][i] = str(max_target_key)  # actHL: consider appending '.'+column_name
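
A minimal sketch of the schema structure refine_schemas expects, inferred from the lookups above: each schema maps 'column_name', 'is_key', 'is_blank', 'field_type', and 'help_text' to parallel per-column lists, plus a 'numcols' count. The model names and the nlp stub below are hypothetical stand-ins; the real nlp.essence/nlp.similarity helpers are not shown in this section.

class nlp(object):
  """Hypothetical stand-in for the real nlp module used above."""
  @staticmethod
  def essence(s):
    return ' '.join(s.lower().split())  # crude normalization placeholder
  @staticmethod
  def similarity(a, b):
    return 1.0 if a == b else 0.0       # exact-match placeholder

schemas = {
  'FoodDescription': {   # hypothetical model name for an FDA table holding the primary key
    'column_name': ['NDB_No', 'Long_Desc'],
    'is_key':      [True, False],
    'is_blank':    [False, False],
    'field_type':  ['A', 'A'],
    'help_text':   ['5-digit Nutrient Databank number', 'Description of food item'],
    'numcols':     2,
  },
  'Weight': {            # hypothetical model name for a table referencing that key
    'column_name': ['NDB_No', 'Gm_Wgt'],
    'is_key':      [True, False],
    'is_blank':    [False, False],
    'field_type':  ['A', 'N'],
    'help_text':   ['5-digit Nutrient Databank number', 'Gram weight of the portion'],
    'numcols':     2,
  },
}

# The two strongest terms of the scoring loop above, applied by hand:
src, tgt = schemas['Weight'], schemas['FoodDescription']
score = 0.5 * (src['column_name'][0] == tgt['column_name'][0])   # matching column names
score += nlp.similarity(nlp.essence(src['help_text'][0][0:32]),
                        nlp.essence(tgt['help_text'][0][0:32]))  # matching help text
print(score)  # 1.5 -- above the >1 threshold, so Weight.NDB_No would point at FoodDescription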
Example #2
 def read_tables(self):
   """Read *.txt.table files and form a schema files in memory for each table file.\
   
   This is extremely brittle translation of .table files (created by cutting and 
   pasting text from FDA database documentation in sr22.pdf) into *.txt.schema files.
   Cut-and-paste must be done with Adobe Acrobat reader to gedit or similar text editor. 
   The gnome document viewer application garbles tables--shuffling the text order."""
   print('Reading {0} table files... '.format(len(self.file_names)))
   valid_fields = False  # flag to indicate when the first line containing a valid set of fields has been read
   #lre = re.compile(r'['+nlp.SPACE+']+') # use this to perform split if custom whitespace character definition is required
   for i, fn in enumerate(self.file_names):
     fmn = self.file_model_names[i]  # actHL: should be stored in the schemas dictionary of dictionaries
     table_path = os.path.join(self.full_path, fn + '.table')
     schema_path = os.path.join(self.full_path, fn + '.schema')
     if os.path.isfile(schema_path) and not self.force:
       print('Warning: {0} already exists and "--force" (overwrite) option not enabled. Skipping schema.'.format(schema_path))
       continue
     if not os.path.isfile(table_path): 
       continue
     with open(table_path) as infile:  # the 'U' mode flag was removed in Python 3; universal newlines are now the default
       print('Opened {0}'.format(table_path))
       for line in infile:
         line = line.strip()  # leading and trailing whitespace (including \r \n, regardless of eol for locale)
         if nlp.similarity(nlp.essence(line), nlp.essence('Field name Type Description')) > 0.7:
           print('Found a header row: "{0}"\n  so skipping to next line...'.format(line))
           continue  # header row detected, skip it
         # find or develop a regular expression language for tokenizing text lines into variables like this does manually
         fields = list(self.default_fields)  # without list(), assignment would merely bind fields to the same object as self.default_fields
         #s = lre.split(line, self.num_table_fields_before_asterisk_split - 1)  # use this to split on custom whitespace characters
         s = line.split(None, self.num_table_fields_before_asterisk_split - 1)  # careful: passing an explicit separator selects a different splitting algorithm
         # The following is unnecessary -- a check of the field contents and formatting occurs below.
         # All this would do is prevent some blank table rows being generated with help_text containing the short, invalid text lines from the file.
         if fmn not in self.schemas:
           self.schemas[fmn] = {}
           for k in self.schema_keys:
             self.schemas[fmn][k] = []
         if len(s) < self.num_table_fields_before_asterisk_split:
           if not valid_fields:
             print('Warning (RecipeParser.read_tables()): Row {0}, number of fields insufficient (<{1}), skipping to next line.'.format(
                 len(s), self.num_table_fields_before_asterisk_split))
             print('The faulty table line was: "{0}"'.format(line))
           else:
             self.schemas[fmn]['help_text'][-1] += ' ' + line.strip(nlp.SPACE + eol)
           continue
         # actHL: need some way of specifying generically this syntax where more than one schema field is contained in a single space-delimited token
         if s[2][0].isdigit() and s[2].endswith('*'):
           # insert a synthetic field between the second number and the asterisk (primary key indicator):
           # a new Y/N token representing the presence or absence of the asterisk
           s[2:3] = [s[2].rstrip('*'), 'Y']
         else:
           s[2:3] = [s[2], 'N']
         for j in range(len(self.field_order)):
           # self.field_order[j] may be a plain index or a slice object, so index directly
           # (assumes it is never a string; this replaces an eval() of a built-up 's[...]' expression)
           s2 = s[self.field_order[j]]
           if isinstance(s2, list):  # a slice index yields a list of tokens; rejoin them
             fields[j] = ' '.join(s2)
           else:
             fields[j] = s2
         if (fields[0][0].isalpha() and fields[0][0].isupper()
             and fields[1] in ('A', 'N') and fields[2][0].isdigit()
             and fields[3] in ('Y', 'N') and fields[4] in ('Y', 'N')
             and fields[5] in ('Y', 'N')):  # and len(fields[7])>10:
           # actHL: automate this with a loop over some translation definitions for strings into fields, like Y/N into 1/0 etc.; colname goes with schema_key[0] etc.
           valid_fields = True
           for j in range(len(self.schema_keys)):
             for k, v in self.schema_translations[j].items():
               fields[j] = fields[j].replace(k, str(v))  # actHL: should take into account the type of the variable being translated into, somehow
             self.schemas[fmn][self.schema_keys[j]].append(type(self.default_fields[j])(fields[j]))
         else:  # line is a continuation of the help text rather than a new column description
           self.schemas[fmn]['help_text'][-1] += ' ' + line.strip(nlp.SPACE + eol)
       self.schemas[fmn]['numcols'] = len(self.schemas[fmn][self.schema_keys[0]])  # not necessary, but convenient
       self.schemas[fmn]['file_name'] = fn  # necessary because the "variablize" conversion (title case, no punc) is not reversible
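
A minimal sketch of the asterisk-splitting step above. The sample row and the value num_table_fields_before_asterisk_split = 5 are hypothetical; only the split mechanics mirror the code, not the exact sr22.pdf table layout.

# A hypothetical documentation row: column name, field type, width with the
# primary-key asterisk fused on, a Y/N flag, then the free-text description.
line = 'NDB_No A 5* N N 5-digit Nutrient Databank number.'
num_table_fields_before_asterisk_split = 5  # assumed value for this sketch

s = line.split(None, num_table_fields_before_asterisk_split - 1)
print(s)  # ['NDB_No', 'A', '5*', 'N', 'N 5-digit Nutrient Databank number.']

# Strip the fused asterisk and record it as a synthetic Y/N key flag.
if s[2][0].isdigit() and s[2].endswith('*'):
  s[2:3] = [s[2].rstrip('*'), 'Y']
else:
  s[2:3] = [s[2], 'N']
print(s)  # ['NDB_No', 'A', '5', 'Y', 'N', 'N 5-digit Nutrient Databank number.']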