Example #1
def db_validate(arglist):
  saved_pwd = os.getcwd()
  opt = optparse.OptionParser()
  opt.add_option('--rf2',action='store')
  opt.add_option('--release_type', action='store', choices=['delta','snapshot','full'])
  opt.add_option('--neopw64', action='store')
  opt.add_option('--neopw', action='store')
  opt.add_option('--exceptions', action='store')
  opt.add_option('--logfile', action='store', default='-')
  opt.add_option('--output_dir', action='store', default='.') # referenced below (validate_status.db path, subprocess cwd)
  opt.add_option('--mode', action='store', default='validate') # steps below declare 'mode': ['validate']
  opts, args = opt.parse_args(arglist)
  if not (len(args)==0 and opts.rf2 and opts.release_type and (opts.neopw or opts.neopw64)):
    print('Usage: db_validate --rf2 <dir> --release_type delta/snapshot/full --neopw <pw>')
    sys.exit(1)
  if opts.neopw and opts.neopw64:
    print('Usage: only one of --neopw and --neopw64 may be specified')
    sys.exit(1)
  if opts.neopw64: # snomed_g v1.2, convert neopw64 to neopw
    opts.neopw = str(base64.b64decode(opts.neopw64),'utf-8') if sys.version_info[0]==3 else base64.decodestring(opts.neopw64) # py2
  # open logfile
  logfile = open(opts.logfile, 'w') if opts.logfile != '-' else sys.stdout
  #---------------------------------------------------------------------------
  # Determine SNOMED_G bin directory, where snomed_g_rf2_tools.py exists, et al.
  #---------------------------------------------------------------------------
  pathsep = '/'
  # determine snomed_g_bin -- bin directory where snomed_g_rf2_tools.py exists in, etc -- try SNOMED_G_HOME, SNOMED_G_BIN env vbls
  # ... ask directly if these variables don't exist
  snomed_g_bin = os.environ.get('SNOMED_G_BIN',None) # unlikely to exist, but great if it does
  if not snomed_g_bin:
    snomed_g_home = os.environ.get('SNOMED_G_HOME',None)
    if snomed_g_home:
      snomed_g_bin = snomed_g_home.rstrip(pathsep) + pathsep + 'bin'
    else:
      snomed_g_bin = raw_input('Enter SNOMED_G bin directory path where snomed_g_rf2_tools.py exists: ').rstrip(pathsep)
  validated = False
  while not validated:
    if len(snomed_g_bin)==0:
      snomed_g_bin = raw_input('Enter SNOMED_G bin directory path where snomed_g_rf2_tools.py exists: ').rstrip(pathsep)
    else: # try to validate, look for snomed_g_rf2_tools.py
      target_file = snomed_g_bin+pathsep+'snomed_g_rf2_tools.py'
      validated = os.path.isfile(target_file)
      if not validated: print('Cannot find [%s]' % target_file); snomed_g_bin = ''
  snomed_g_bin = os.path.abspath(snomed_g_bin)
  print('SNOMED_G bin directory [%s]' % snomed_g_bin)
  # connect to NEO4J, make sure information given is good
  neo4j = snomed_g_lib_neo4j.Neo4j_Access(opts.neopw) # opts.neopw is set above (given directly or decoded from --neopw64)
  # Connect to RF2 files, make sure rf2 directory given is good
  rf2_folders = snomed_g_lib_rf2.Rf2_Folders(opts.rf2, opts.release_type)
  # Build
  # open SQLITE database
  DB = StatusDb(os.path.abspath(opts.output_dir.rstrip(pathsep)+pathsep+'validate_status.db'))

  # create YYYYMMDD string
  d = datetime.datetime.now() # determine current date
  yyyymmdd = '%04d%02d%02d' % (d.year,d.month,d.day)
  job_start_datetime = datetime.datetime.now()

  # Commands needed to Create/Update a SNOMED_G Graph Database
  commands_d = {
      'JOB_START':
          {'stepname': 'JOB_START',
           'log':      'JOB-START(release_type:[%s], rf2:[%s], date:[%s])' \
                           % (opts.release_type, opts.rf2, yyyymmdd)},
      'VALIDATE_CONCEPTS':
          {'stepname': 'VALIDATE_CONCEPTS',
           'cmd':      'python %s/snomed_g_validate_graphdb_tools.py validate_graphdb --element concept      --release_type %s --rf2 %s --neopw %s' \
                           % (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw),
           'mode':     ['validate']},
      'VALIDATE_DESCRIPTIONS':
          {'stepname': 'VALIDATE_DESCRIPTIONS',
           'cmd':      'python %s/snomed_g_validate_graphdb_tools.py validate_graphdb --element description  --release_type %s --rf2 %s --neopw %s' \
                       % (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw),
                       'mode': ['validate']},
      'VALIDATE_ISA_RELS':
          {'stepname': 'VALIDATE_ISA_RELS',
           'cmd':      'python %s/snomed_g_validate_graphdb_tools.py validate_graphdb --element isa_rel      --release_type %s --rf2 %s --neopw %s' \
                           % (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw),
                       'mode': ['validate']},
      'VALIDATE_DEFINING_RELS':
          {'stepname': 'VALIDATE_DEFINING_RELS',
           'cmd':      'python %s/snomed_g_validate_graphdb_tools.py validate_graphdb --element defining_rel --release_type %s --rf2 %s --neopw %s' \
                           % (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw),
                       'mode': ['validate']},
      'JOB_END':
          {'stepname': 'JOB_END',
           'log':      'JOB-END'}
  }

  command_list_validate_build = \
      [ commands_d[x] for x in
        ['JOB_START',
         'VALIDATE_CONCEPTS',
         'VALIDATE_DESCRIPTIONS',
         'VALIDATE_ISA_RELS',
         'VALIDATE_DEFINING_RELS',
         'JOB_END'] ]
  command_list = command_list_validate_build
  stepnames = [x['stepname'] for x in command_list] # list of dictionaries
  seqnum = DB.get_next_sequence_number()
  # Execute commands (BUILD)
  results_d = {}
  for command_d in command_list:
    # extract step fields from the command dictionary
    stepname, cmd, logmsg, expected_status, mode_requirement = \
      command_d['stepname'], command_d.get('cmd',None), command_d.get('log',None), command_d.get('expected_status',0), command_d.get('mode', None)
    if mode_requirement and opts.mode not in mode_requirement: continue # eg: NEO4J execution only in build mode
    results_d[stepname] = {}
    cmd_start = datetime.datetime.now() if stepname!='JOB_END' else job_start_datetime  # start timer
    status = -1
    should_break = False
    output, err = '', '' # initialize here so the book-keeping below never sees them unbound
    results_d[stepname]['result'] = 'SUCCESS' # assumption of success until failure determined
    results_d[stepname]['expected_status'] = expected_status
    results_d[stepname]['command'] = cmd
    results_d[stepname]['error_count'] = 0 # default
    print(stepname)
    print(stepname, file=logfile) # indicate to user what step we are on
    if logmsg: # no command to execute in a separate process
      results_d[stepname]['status'] = 0
      results_d[stepname]['STDOUT'] = logmsg # LOG everything after 'LOG:'
    else: # execute command (cmd) in subprocess
      print(cmd, file=logfile)
      try:
        # SUBPROCESS creation
        cmd_as_list = cmd.split(' ')
        if opts.output_dir != '.': os.chdir(opts.output_dir) # move to output_dir, to start subprocess
        subprocess.check_call(cmd_as_list, stdout=logfile, stderr=logfile)
        if opts.output_dir !='.': os.chdir(saved_pwd) # get back (popd)
        status = 0 # if no exception -- status guaranteed to be zero
      except subprocess.CalledProcessError as e:
        status = e.returncode # by validate_graphdb convention, this code is the number of discrepancies found
        results_d[stepname]['status'] = status
        if status != expected_status:
          results_d[stepname]['result'] = 'FAILED (STATUS %d)' % status
          should_break = False # keep validating
        pass # might be fine, should_break controls termination
      except: # NOTE: result defaulted to -1 above
        results_d[stepname]['result'] = 'EXCEPTION occurred -- on step [%s], cmd [%s]' % (stepname,cmd)
        should_break = True
        pass
      else: # no exception
        results_d[stepname]['status'] = status
        if status != expected_status:
          results_d[stepname]['result'] = 'FAILED (STATUS %d)' % status
          results_d[stepname]['error_count'] = status # graphdb_validate convention is to return discrepancy count
          should_break = True # no steps are optional
    # Book-keeping
    cmd_end = datetime.datetime.now() # stop timer
    cmd_seconds = (cmd_end-cmd_start).seconds
    results_d[stepname]['elapsed_seconds'] = cmd_seconds
    if len(output) > 0: results_d[stepname]['STDOUT'] = output.replace('\n','<EOL>')
    if len(err) > 0: results_d[stepname]['STDERR'] = err.replace('\n','<EOL>')
    results_d[stepname]['cmd_start'] = cmd_start
    results_d[stepname]['cmd_end'] = cmd_end

    if should_break: break
  # Write results to the database
  save_and_report_results(DB, seqnum, stepnames, results_d)

  # Done
  sys.exit(0)
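
A minimal, standalone sketch of the step-driver pattern used above: a list of named steps, each either a log-only marker or a shell command run via subprocess.check_call, with per-step status and elapsed time collected into a results dict. The step contents below are hypothetical, not part of snomed_g:

import datetime, subprocess, sys

def run_steps(steps, logfile=sys.stdout):
  results = {}
  for step in steps:
    name = step['stepname']
    results[name] = {'result': 'SUCCESS', 'status': 0}
    start = datetime.datetime.now()
    if 'cmd' in step: # log-only steps carry no command
      try:
        subprocess.check_call(step['cmd'].split(' '), stdout=logfile, stderr=logfile)
      except subprocess.CalledProcessError as e:
        results[name]['status'] = e.returncode
        results[name]['result'] = 'FAILED (STATUS %d)' % e.returncode
    results[name]['elapsed_seconds'] = (datetime.datetime.now() - start).seconds
  return results

# eg: run_steps([{'stepname': 'JOB_START'}, {'stepname': 'SHOW_VERSION', 'cmd': 'python --version'}])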
Example #2
def validate_graphdb(arglist):

  def rf2_filename(element, view=None): # rf2_folders is set in validate_graphdb initialization
    return rf2_folders.rf2_file_path(element, view) # eg: 'concept'

  def old_compute_hist_changes(new_field_values, prev_field_values, field_names): # find map with only modified fields
    return { field_names[idx] : new_field_values[idx] for idx in range(len(field_names)) if db_data_prep(new_field_values[idx]) != db_data_prep(prev_field_values[idx]) }

  '''
  HISTORY COMPUTATION -- Example information for a concept:
  
  Information state example (need to understand for history computation)
    csv_fields  = ['id','effectiveTime','active','moduleId','definitionStatusId','FSN','history']
    field_names = ['id','effectiveTime','active','moduleId','definitionStatusId']
    renamed_fields = {}
    id -- '293672009'
    concepts_d[id]['20160301'] -- concepts_d[id] is a map keyed by effectiveTime,
                                  its value ==> list of attribute values for that time,
                                  in same order as in RF2 file
    graph_matches_d[id] (graph) --
              {u'nodetype': u'concept', u'effectiveTime': u'20060131', u'FSN': u'Antiemetic allergy (disorder)',
               u'definitionStatusId': u'900000000000073002', u'sctid': u'293672009', u'active': u'1',
               u'moduleId': u'900000000000207008', u'id': u'293672009',
               u'history': u'[{"active": "1", "effectiveTime": "20020131", ...}, ...]'}
  '''

  def compute_history_string(id, rf2_d, graph_matches_d, field_names, rf2_fields_d, renamed_fields):
    if opts.release_type == 'full': # compute history, have all information
      historical_effectiveTimes = sorted(rf2_d[id].keys())[:-1] # exclude 'current' (latest)
      hist = [ { nm: rf2_d[id][effTime][rf2_fields_d[renamed_fields.get(nm,nm)]] for nm in field_names } for effTime in historical_effectiveTimes ] \
             if len(rf2_d[id].keys()) > 1 else []
    else: # not FULL, can be missing historical info
      if id not in graph_matches_d:
        hist = []
      else:
        old_history =  graph_matches_d[id]['history'] # JSON string or empty string
        old_field_values = [ graph_matches_d[id][nm] for nm in field_names ]
        if len(old_history) == 0: # no prev history, old values ==> previous history)
          hist = [ { a:b for a,b in zip(field_names, old_field_values) } ]
        else: # existing history, not FULL release, append previous values from graph (previous history)
          hist = json.loads(old_history) + [ ( { a:b for a,b in zip(field_names, old_field_values) } ) ]
    return json.dumps(hist) if len(hist) > 0 else ''
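
  # Worked example for compute_history_string (hypothetical values): in a 'full'
  # release with rf2_d['293672009'] = {'20020131': [...], '20060131': [...]},
  # only the older '20020131' row becomes history, so the result is a JSON string
  # like '[{"id": "293672009", "effectiveTime": "20020131", "active": "1", ...}]';
  # an id with a single effectiveTime yields '' (no history).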

  def build_csv_output_line(id, non_rf2_fields, current_effTime, rf2_d, csv_fields_d, field_names, rf2_fields_d, renamed_fields, quoted_fields):
    csv_data = [None]*len(csv_fields_d.keys())
    for nm in field_names: csv_data[csv_fields_d[nm]] = db_data_prep(rf2_d[id][current_effTime][rf2_fields_d[renamed_fields.get(nm,nm)]])
    for k,v in non_rf2_fields: csv_data[csv_fields_d[k]] = db_data_prep(v) # eg: [('history','<hist-json-str>'),...]
    if None in csv_data: raise ValueError('csv_data %s' % str(csv_data))
    for nm in quoted_fields: csv_data[csv_fields_d[nm]] = csv_clean_str(csv_data[csv_fields_d[nm]]) # quote only necessary fields
    return db_data_prep( ','.join(csv_data) ) # output_line

  #------------------------------------------------------------------------------|
  #        CONCEPT CSV files creation -- concept_new.csv, concept_chg.csv        |
  #------------------------------------------------------------------------------|

  def validate_concepts():

    def concept_cb(fields, fields_d, hist):
      id = fields[ fields_d['id'] ]
      effTime = fields[ fields_d['effectiveTime'] ]
      if id not in concepts_d: concepts_d[id] = {} # not seen before -- empty dictionary (keyed by effectiveTime)
      else:
        if opts.release_type != 'full': raise ValueError('*** Concept id [%s] with multiple entries in [%s] release-type, should NOT occur ***' % (id,opts.release_type))
        if effTime in concepts_d[id]: raise ValueError('*** Concept id [%s] with duplicate effectiveTime [%s], should NOT occur ***' % (id, effTime))
      concepts_d[id][effTime] = fields[:] # attributes in RF2-defined order

    def Fsn_cb(fields, fields_d, hist):
      all_Fsn_in_Rf2_d[ db_data_prep(fields[ fields_d['conceptId'] ]) ] = db_data_prep(fields[ fields_d['term'] ]) # FSN

    def Fsn_filter(fields, fields_d, hist):
      return fields[ fields_d['typeId'] ] == snomedct_constants.SNOMEDCT_TYPEID_FSN

    # validate_concepts:
    # ==> generate concept_new.csv, concept_chg.csv -- from info in RF2 and NEO4J
    stats = { 'error_count': 0, 'new': 0, 'change': 0, 'no_change': 0 } # all counters used below -- a missing key would KeyError on '+='
    timing_d = { }
    timing_idx = 0
    timing_overall_nm = '{:04d}_validate_concepts'.format(timing_idx); timing_start(timing_d, timing_overall_nm)
    timing_idx += 1; timing_nm = '{:04d}_read_RF2_description'.format(timing_idx); timing_start(timing_d, timing_nm)
    all_Fsn_in_Rf2_d = {}
    snomed_g_lib_rf2.Process_Rf2_Release_File( rf2_filename('description') ).process_file(Fsn_cb, Fsn_filter, False)
    timing_end(timing_d, timing_nm)
    f_new, f_chg = io.open('concept_new.csv','w',encoding='utf8'),io.open('concept_chg.csv','w',encoding='utf8')
    outfile_list = [f_new,f_chg]
    rf2_fields = attributes_by_file.rf2_fields['concept']
    rf2_fields_d = { nm: idx for idx,nm in enumerate(rf2_fields) }
    csv_fields = attributes_by_file.csv_fields['concept'] # ['id','effectiveTime','active',...,'history']
    csv_fields_d = { nm: idx for idx,nm in enumerate(csv_fields) }
    field_names = [ x for x in csv_fields if x not in ['FSN','history'] ] # exclude non-RF2 history and FSN (external)
    renamed_fields = attributes_by_file.renamed_fields['concept'] # dictionary
    quoted_in_csv_fields = attributes_by_file.quoted_in_csv_fields['concept']
    csv_header = db_data_prep(','.join(csv_fields)) # "id,effectiveTime,..."
    for f in outfile_list: print(csv_header, file=f) # header
    # create concepts_d with information from DELTA/SNAPSHOT/FULL concept file
    timing_idx += 1; timing_nm = '{:04d}_read_RF2_concept'.format(timing_idx); timing_start(timing_d, timing_nm)
    concepts_d = {}
    snomed_g_lib_rf2.Process_Rf2_Release_File( rf2_filename('concept') ).process_file(concept_cb, None, False)
    timing_end(timing_d, timing_nm)
    rf2_idlist = concepts_d.keys()
    Fsn_d = { k: all_Fsn_in_Rf2_d[k] for k in list(set(all_Fsn_in_Rf2_d.keys()).intersection(set(rf2_idlist))) } # sets compare ascii+unicode
    print('count of RF2 ids: %d' % len(rf2_idlist))
    # Look for existing FSN values in graph
    print('count of FSNs in RF2: %d' % len(Fsn_d.keys()))
    if opts.action=='create':
      graph_matches_d = {}
    else:
      # NEO4J -- look for these concepts (N at a time)
      timing_idx += 1; timing_nm = '{:04d}_neo4j_lookup_concepts'.format(timing_idx); timing_start(timing_d, timing_nm)
      if opts.release_type=='delta':
        graph_matches_d = neo4j.lookup_concepts_for_ids(rf2_idlist) # This includes FSN values
      else:
        graph_matches_d = neo4j.lookup_all_concepts()
      timing_end(timing_d, timing_nm)
      print('Found %d of the IDs+FSNs in the graph DB:' % len(graph_matches_d.keys()))
      # Set any missing FSN values from the Graph
      target_id_set = set(graph_matches_d.keys()) - set(Fsn_d.keys())
      print('Filling in %d FSN values from the graph' % len(target_id_set))
      for id in list(target_id_set): Fsn_d[id] = graph_matches_d[id]['FSN']
      print('count of FSNs after merge with RF2 FSNs: %d' % len(Fsn_d.keys()))
    # Make sure all ids have an FSN
    if sorted(Fsn_d.keys()) != sorted(rf2_idlist): raise ValueError('*** (sanity check failure) Cannot find FSN for all IDs in release ***')
    # GENERATE CSV FILES
    timing_idx += 1; timing_nm = '{:04d}_generate_csvs'.format(timing_idx); timing_start(timing_d, timing_nm)
    for id in rf2_idlist:
      current_effTime = sorted(concepts_d[id].keys())[-1] # highest effectiveTime is current
      if id not in graph_matches_d:
        stats['new'] += 1
      elif concepts_d[id][current_effTime][rf2_fields_d['effectiveTime']] == graph_matches_d[id]['effectiveTime']:
        stats['no_change'] += 1; continue # NO CHANGE ==> SKIP
      else:
        stats['change'] += 1
      hist_str = compute_history_string(id, concepts_d, graph_matches_d, field_names, rf2_fields_d, renamed_fields)
      output_line = build_csv_output_line(id,[('FSN',Fsn_d[id]),('history',hist_str)],current_effTime, concepts_d, csv_fields_d, field_names, rf2_fields_d, renamed_fields, quoted_in_csv_fields)
      print(output_line,file=(f_new if not id in graph_matches_d else f_chg))
    # Done generating CSVs
    timing_end(timing_d, timing_nm)
    timing_end(timing_d, timing_overall_nm)
    # CLEANUP, DISPLAY RESULTS
    for f in outfile_list: f.close() # cleanup
    print('Total RF2 elements: {:d}, ERRORS: {:d}'.format(len(rf2_idlist), stats['error_count']))
    show_timings(timing_d)
    sys.exit(stats['error_count']) # CONVENTION - return number of errors as program code (zero ==> SUCCESS)
  # END validate_concepts

  #------------------------------------------------------------------------------|
  #        DESCRIPTION CSV files  -- descrip_new.csv, descrip_chg.csv            |
  #------------------------------------------------------------------------------|
  def validate_descriptions():

    def description_cb(fields, fields_d, hist):
      id = fields[ fields_d['id'] ]
      effTime = fields[ fields_d['effectiveTime'] ]
      if id not in description_d: description_d[id] = {} # not seen before -- empty dictionary (keyed by effectiveTime)
      else:
        if opts.release_type != 'full': raise ValueError('*** Description id [%s] with multiple entries in [%s] release-type, should NOT occur ***' % (id,opts.release_type))
        if effTime in description_d[id]: raise ValueError('*** Description id [%s] with duplicate effectiveTime [%s], should NOT occur ***' % (id, effTime))
      description_d[id][effTime] = fields[:] # attributes in RF2-defined order
    def language_cb(fields, fields_d, hist):
      id = fields[ fields_d['referencedComponentId'] ] # DONT USE "id", use the id associated with the Description
      if id in language_d and language_d[id]['refsetId']==snomedct_constants.SNOMEDCT_REFSETID_USA: return # PREFER US definition
      language_d[id] = { nm : fields[ fields_d[nm] ] for nm in fields_d.keys() }
    def snapshot_language_cb(fields, fields_d, hist):
      id = fields[ fields_d['referencedComponentId'] ]
      if id in snapshot_language_d and snapshot_language_d[id]['refsetId']==snomedct_constants.SNOMEDCT_REFSETID_USA: return # prefer US def
      snapshot_language_d[id] = { nm : fields[ fields_d[nm] ] for nm in fields_d.keys() }
    def compute_descriptionType(typeId,acceptabilityId):
      return 'FSN' if typeId=='900000000000003001' \
             else 'Preferred' if typeId=='900000000000013009' and acceptabilityId=='900000000000548007' \
             else 'Synonym'
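
    # Examples (SNOMED CT metadata SCTIDs): typeId 900000000000003001 ==> 'FSN';
    # typeId 900000000000013009 (synonym) with acceptabilityId 900000000000548007
    # (preferred) ==> 'Preferred'; any other combination ==> 'Synonym'.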

    # validate_descriptions:
    # ==> generate descrip_new.csv, descrip_chg.csv -- from info in RF2 and NEO4J
    stats = { 'error_count': 0, 'new': 0, 'change': 0, 'no_change': 0, 'no_language': 0 } # all counters used below
    timing_d = {}
    timing_idx = 0
    timing_overall_nm = '%04d_validate_descriptions' % timing_idx; timing_start(timing_d, timing_overall_nm)
    # READ RF2 DESCRIPTION FILE
    timing_idx += 1; timing_nm = '%04d_read_RF2_description' % timing_idx; timing_start(timing_d, timing_nm)
    description_d, language_d, snapshot_language_d = {}, {}, {}
    snomed_g_lib_rf2.Process_Rf2_Release_File( rf2_filename('description') ).process_file(description_cb, None, False)
    timing_end(timing_d, timing_nm)
    rf2_idlist = description_d.keys()
    print('count of RF2 ids: %d' % len(rf2_idlist))
    # READ RF2 LANGUAGE FILE
    timing_idx += 1; timing_nm = '%04d_read_RF2_language' % timing_idx; timing_start(timing_d, timing_nm)
    snomed_g_lib_rf2.Process_Rf2_Release_File( rf2_filename('language') ).process_file(language_cb, None, False)
    timing_end(timing_d, timing_nm)
    if opts.release_type=='delta': # need snapshot file for fallback of potential missing historical information
      print('read snapshot language values');
      timing_idx += 1; timing_nm = '%04d_read_rf2_language_snapshot' % timing_idx; timing_start(timing_d, timing_nm)
      snomed_g_lib_rf2.Process_Rf2_Release_File( rf2_filename('language','Snapshot') ).process_file(snapshot_language_cb, None, False); print('read')
      timing_end(timing_d, timing_nm)
    # CSV INIT, ATTRIBUTE NAMES MANAGEMENT
    f_new, f_chg = io.open('descrip_new.csv','w',encoding='utf8'),io.open('descrip_chg.csv','w',encoding='utf8')
    outfile_list = [f_new,f_chg]
    rf2_fields = attributes_by_file.rf2_fields['description']
    rf2_fields_d = { nm: idx for idx,nm in enumerate(rf2_fields) }
    csv_fields = attributes_by_file.csv_fields['description'] # ['id','effectiveTime','active',...,'history']
    csv_fields_d = { nm: idx for idx,nm in enumerate(csv_fields) }
    field_names = [ x for x in csv_fields if x not in ['id128bit','acceptabilityId','refsetId','descriptionType','history'] ]
    renamed_fields = attributes_by_file.renamed_fields['description'] # dictionary
    quoted_in_csv_fields = attributes_by_file.quoted_in_csv_fields['description']
    csv_header = db_data_prep(','.join(csv_fields)) # "id,effectiveTime,..."
    for f in outfile_list: print(csv_header, file=f) # header
    if opts.action=='create':
      graph_matches_d = {}
    else: # 'update' (compare vs Graph)
      # READ NEO4J DESCRIPTIONS
      timing_idx += 1; timing_nm = '%04d_neo4j_lookup_DESCRIPTIONS' % timing_idx; timing_start(timing_d, timing_nm)
      if opts.release_type=='delta':
        graph_matches_d = neo4j.lookup_descriptions_for_ids(rf2_idlist) # This includes FSN values
      else:
        graph_matches_d = neo4j.lookup_all_descriptions()
      timing_end(timing_d, timing_nm)
      print('count of Descriptions in NEO4J: %d' % len(graph_matches_d.keys()))
      print('count of Language Descriptions in RF2: %d' % len(list(set(language_d.keys()).intersection(set(rf2_idlist)))))
    # GENERATE CSV FILES
    timing_idx += 1; timing_nm = '%04d_generate_csvs' % timing_idx; timing_start(timing_d, timing_nm)
    for id in rf2_idlist:
      current_effTime = sorted(description_d[id].keys())[-1] # highest effectiveTime is current
      if id not in graph_matches_d:
        stats['new'] += 1
      elif description_d[id][current_effTime][rf2_fields_d['effectiveTime']] == graph_matches_d[id]['effectiveTime']:
        stats['no_change'] += 1; continue # NO CHANGE ==> NO ADDITIONAL PROCESSING FOR THIS ENTRY
      else:
        stats['change'] += 1
      hist_str = compute_history_string(id, description_d, graph_matches_d, field_names, rf2_fields_d, renamed_fields)
      # Need to add the following to the description_d definition ==>
      #  'id128bit','acceptabilityId','descriptionType' (compute from acceptabilityId),'refsetId'
      computed = {}
      current_typeId = description_d[id][current_effTime][rf2_fields_d['typeId']]
      if id in language_d:
        computed['id128bit']        = language_d[id]['id']
        computed['acceptabilityId'] = language_d[id]['acceptabilityId']
        computed['refsetId']        = language_d[id]['refsetId']
        computed['descriptionType'] = compute_descriptionType(current_typeId,language_d[id]['acceptabilityId'])
      elif id in snapshot_language_d: # empty unless view=='delta', things not necessarily in Graph (any missing releases in graph)
        computed['id128bit']        = snapshot_language_d[id]['id']
        computed['acceptabilityId'] = snapshot_language_d[id]['acceptabilityId']
        computed['refsetId']        = snapshot_language_d[id]['refsetId']
        computed['descriptionType'] = compute_descriptionType(current_typeId,snapshot_language_d[id]['acceptabilityId'])
      elif id in graph_matches_d:
        computed['id128bit']        = graph_matches_d[id]['id128bit']
        computed['acceptabilityId'] = graph_matches_d[id]['acceptabilityId']
        computed['refsetId']        = graph_matches_d[id]['refsetId']
        computed['descriptionType'] = graph_matches_d[id]['descriptionType']
      else:
        stats['no_language'] += 1
        computed['id128bit']        = '<NA>'
        computed['acceptabilityId'] = '<NA>'
        computed['refsetId']        = '<NA>'
        computed['descriptionType'] = '<NA>'
        if stats['no_language']<=1000: print('*** Missing LANGUAGE records for Description %s ***' % id)
        elif stats['no_language']==1001: print('*** Missing more than 1000 LANGUAGE records ***')
      non_rf2_fields = [(x,computed[x]) for x in ['id128bit','acceptabilityId','refsetId','descriptionType']]+[('history',hist_str)]
      output_line = build_csv_output_line(id, non_rf2_fields, current_effTime, description_d, csv_fields_d, field_names, rf2_fields_d, renamed_fields, quoted_in_csv_fields)
      print(output_line,file=(f_new if not id in graph_matches_d else f_chg))
    # Done generating CSVs
    timing_end(timing_d, timing_nm)
    timing_end(timing_d, timing_overall_nm)
    # CLEANUP, DISPLAY RESULTS
    for f in outfile_list: f.close() # cleanup
    if stats['no_language'] > 0: print('Missing %d LANGUAGE records' % stats['no_language'])
    print('Total RF2 elements: {:d}, ERRORS: {:d}'.format(len(rf2_idlist), stats['error_count']))
    show_timings(timing_d)
    sys.exit(stats['error_count']) # CONVENTION - return number of errors as program code (zero ==> SUCCESS)
  # END validate_descriptions

  #------------------------------------------------------------------------------|
  #            ISA_REL CSV files  -- isa_rel_new.csv, isa_rel_chg.csv            |
  #------------------------------------------------------------------------------|
  def validate_isa_rels():

    def isa_rel_cb(fields, fields_d, hist):
      id = fields[ fields_d['id'] ]
      effTime = fields[ fields_d['effectiveTime'] ]
      if id not in isa_rel_d: isa_rel_d[id] = {} # not seen before -- empty dictionary (keyed by effectiveTime)
      else:
        if opts.release_type != 'full': raise ValueError('*** ISA id [%s] with multiple entries in [%s] release-type, should NOT occur ***' % (id,opts.release_type))
        if effTime in isa_rel_d[id]: raise ValueError('*** ISA id [%s] with duplicate effectiveTime [%s], should NOT occur ***' % (id, effTime))
      isa_rel_d[id][effTime] = fields[:] # attributes in RF2-defined order
    def isa_rel_filter(fields, fields_d, hist):
      return fields[ fields_d['typeId'] ] == snomedct_constants.SNOMEDCT_TYPEID_ISA

    # validate_isa_rels:
    # ==> generate isa_rel_new.csv, isa_rel_chg.csv -- from info in RF2 and NEO4J
    stats = { 'error_count': 0, 'new': 0, 'change': 0, 'no_change': 0 } # all counters used below
    timing_d = {}
    timing_idx = 0
    timing_overall_nm = '%04d_make_isa_rels_csvs' % timing_idx; timing_start(timing_d, timing_overall_nm)
    # READ RF2 RELATIONSHIP FILE - EXTRACT ISA
    timing_idx += 1; timing_nm = '%04d_read_RF2_relationship' % timing_idx; timing_start(timing_d, timing_nm)
    isa_rel_d = {}
    snomed_g_lib_rf2.Process_Rf2_Release_File( rf2_filename('relationship') ).process_file(isa_rel_cb, isa_rel_filter, False)
    timing_end(timing_d, timing_nm)
    rf2_idlist = isa_rel_d.keys()
    print('count of ids in RF2: %d' % len(rf2_idlist))
    # CSV FILE INIT, ATTRIBUTE NAME MANAGEMENT
    f_new, f_chg = io.open('isa_rel_new.csv','w',encoding='utf8'),io.open('isa_rel_chg.csv','w',encoding='utf8')
    outfile_list = [f_new,f_chg]
    rf2_fields = attributes_by_file.rf2_fields['isa_rel']
    rf2_fields_d = { nm: idx for idx,nm in enumerate(rf2_fields) }
    csv_fields = attributes_by_file.csv_fields['isa_rel'] # ['id','effectiveTime','active',...,'history']
    csv_fields_d = { nm: idx for idx,nm in enumerate(csv_fields) }
    field_names = [ x for x in csv_fields if x not in ['history'] ]
    renamed_fields = attributes_by_file.renamed_fields['isa_rel'] # dictionary
    quoted_in_csv_fields = attributes_by_file.quoted_in_csv_fields['isa_rel']
    csv_header = db_data_prep(','.join(csv_fields)) # "id,effectiveTime,..."
    for f in outfile_list: print(csv_header, file=f) # header
    if opts.action=='create':
      graph_matches_d = {}
    else:
      # EXTRACT ISA RELATIONSHIPS FROM NEO4J
      timing_idx += 1; timing_nm = '%04d_get_neo4j_ISA' % timing_idx; timing_start(timing_d, timing_nm)
      all_in_graph = neo4j.lookup_all_isa_rels() # looking for ISA by its 'id' is SLOOOOOOW, get them ALL instead
      timing_end(timing_d, timing_nm)
      print('count of ALL ISA in NEO4J: %d' % len(all_in_graph.keys()))
      graph_matches_d = { x: all_in_graph[x] for x in list(set(all_in_graph.keys()).intersection(set(rf2_idlist))) } # successful compare ascii+unicode, way faster than "if" test
      print('count of ISA in NEO4J: %d' % len(graph_matches_d.keys()))
    # GENERATE CSV FILES FOR NEW AND CHG
    timing_idx += 1; timing_nm = '%04d_csv_generation' % timing_idx; timing_start(timing_d, timing_nm)
    for id in rf2_idlist: # must compute updated history for each
      current_effTime = sorted(isa_rel_d[id].keys())[-1] # highest effectiveTime is current
      if id not in graph_matches_d:
        stats['new'] += 1
      elif isa_rel_d[id][current_effTime][rf2_fields_d['effectiveTime']] == graph_matches_d[id]['effectiveTime']:
        stats['no_change'] += 1; continue # NO CHANGE ==> NO ADDITIONAL PROCESSING FOR THIS ENTRY
      else:
        stats['change'] += 1
      hist_str = compute_history_string(id, isa_rel_d, graph_matches_d, field_names, rf2_fields_d, renamed_fields)
      output_line = build_csv_output_line(id,[('history',hist_str)],current_effTime, isa_rel_d, csv_fields_d, field_names, rf2_fields_d, renamed_fields, quoted_in_csv_fields)
      print(output_line,file=(f_new if not id in graph_matches_d else f_chg))
    # Done generating CSVs
    timing_end(timing_d, timing_nm)
    timing_end(timing_d, timing_overall_nm)
    # CLEANUP, DISPLAY RESULTS
    for f in outfile_list: f.close() # cleanup
    print('Total RF2 elements: {:d}, ERRORS: {:d}'.format(len(rf2_idlist), stats['error_count']))
    show_timings(timing_d)
    sys.exit(stats['error_count']) # CONVENTION - return number of errors as program code (zero ==> SUCCESS)
  # END validate_isa_rels

  #------------------------------------------------------------------------------|
  #    DEFINING_REL CSV files  -- defining_rel_new.csv, defining_rel_chg.csv     |
  #------------------------------------------------------------------------------|
  def validate_defining_rels():

    def defining_rel_cb(fields, fields_d, hist):
      id = fields[ fields_d['id'] ]
      effTime = fields[ fields_d['effectiveTime'] ]
      if id not in defining_rel_d: defining_rel_d[id] = {} # not seen before -- empty dictionary (keyed by effectiveTime)
      else:
        if opts.release_type != 'full': raise ValueError('*** DEFINING-REL id [%s] with multiple entries in [%s] release-type, should NOT occur ***' % (id,opts.release_type))
        if effTime in defining_rel_d[id]: raise ValueError('*** DEFINING-REL id [%s] with duplicate effectiveTime [%s], should NOT occur ***' % (id, effTime))
      defining_rel_d[id][effTime] = fields[:] # attributes in RF2-defined order
    def defining_rel_filter(fields, fields_d, hist):
      return fields[ fields_d['typeId'] ] != snomedct_constants.SNOMEDCT_TYPEID_ISA

    # validate_defining_rels:
    # ==> generate defining_rel_new.csv, defining_rel_chg.csv -- from info in RF2 and NEO4J
    stats = { 'error_count': 0, 'new': 0, 'change': 0, 'no_change': 0 } # all counters used below
    timing_d = {}
    timing_idx = 0
    timing_overall_nm = '%04d_make_defining_rels_csvs' % timing_idx; timing_start(timing_d, timing_overall_nm)
    # READ all_roles.csv (tiny file)
    timing_idx += 1; timing_nm = '%04d_read_all_roles' % timing_idx; timing_start(timing_d, timing_nm)
    roleHash = {}
    with open('all_roles.csv') as f:
      for idx,line in enumerate(x.rstrip('\n').rstrip('\r') for x in f):
        if idx==0: continue # typeId,rolename
        typeId, rolename = line.split(',')
        roleHash[typeId] = rolename
    timing_end(timing_d, timing_nm)
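    # all_roles.csv is expected to look like (rolename values here are hypothetical):
    #   typeId,rolename
    #   363698007,Finding_site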
    # READ RF2 RELATIONSHIP FILE - EXTRACT DEFINING-RELS
    timing_idx += 1; timing_nm = '%04d_read_RF2_relationship' % timing_idx; timing_start(timing_d, timing_nm)
    defining_rel_d = {}
    snomed_g_lib_rf2.Process_Rf2_Release_File( rf2_filename('relationship') ).process_file(defining_rel_cb, defining_rel_filter, False)
    timing_end(timing_d, timing_nm)
    rf2_idlist = defining_rel_d.keys()
    print('count of ids in RF2: %d' % len(rf2_idlist))
    # CSV FILE INIT, ATTRIBUTE NAME MANAGEMENT
    f_new, f_chg = io.open('defining_rel_new.csv','w',encoding='utf8'),io.open('defining_rel_chg.csv','w',encoding='utf8')
    f_edge_rem = io.open('defining_rel_edge_rem.csv','w',encoding='utf8')
    print(db_data_prep('id,rolegroup,sourceId,destinationId'),file=f_edge_rem)
    outfile_list = [f_new,f_chg]
    f_DRs = {} # per-defining-relationship type
    rf2_fields = attributes_by_file.rf2_fields['defining_rel']
    rf2_fields_d = { nm: idx for idx,nm in enumerate(rf2_fields) }
    csv_fields = attributes_by_file.csv_fields['defining_rel'] # ['id','effectiveTime','active',...,'history']
    csv_fields_d = { nm: idx for idx,nm in enumerate(csv_fields) }
    field_names = [ x for x in csv_fields if x not in ['history'] ]
    renamed_fields = attributes_by_file.renamed_fields['defining_rel'] # dictionary
    quoted_in_csv_fields = attributes_by_file.quoted_in_csv_fields['defining_rel']
    csv_header = db_data_prep(','.join(csv_fields)) # "id,effectiveTime,..."
    for f in outfile_list: print(csv_header, file=f) # header
    if opts.action == 'create':
      graph_matches_d = {}
    else:
      # EXTRACT DEFINING RELATIONSHIPS FROM NEO4J
      timing_idx += 1; timing_nm = '%04d_get_neo4j_DEFINING_RELS' % timing_idx; timing_start(timing_d, timing_nm)
      all_in_graph = neo4j.lookup_all_defining_rels() # looking for rel by its 'id' is SLOOOOOOW, get them ALL instead
      timing_end(timing_d, timing_nm)
      print('count of ALL DEFINING-REL in NEO4J: %d' % len(all_in_graph.keys()))
      graph_matches_d = { x: all_in_graph[x] for x in list(set(all_in_graph.keys()).intersection(set(rf2_idlist))) } # successful compare ascii+unicode, way faster than "if" test
      print('count of DEFINING-REL in NEO4J: %d' % len(graph_matches_d.keys()))
    # GENERATE CSV FILES FOR NEW AND CHG
    timing_idx += 1; timing_nm = '%04d_csv_generation' % timing_idx; timing_start(timing_d, timing_nm)
    f_used_roles = open('used_roles.csv','w'); print('typeId,rolename',file=f_used_roles)
    for id in rf2_idlist: # must compute updated history for each
      current_effTime = sorted(defining_rel_d[id].keys())[-1] # highest effectiveTime is current
      current_typeId = defining_rel_d[id][current_effTime][rf2_fields_d['typeId']]
      rolegroup_changed = False # if this occurred, treat as create instead of change (as it requires edge remove+edge create)
      if id not in graph_matches_d:
        stats['new'] += 1
        if current_typeId not in f_DRs:
          f_DRs[current_typeId] = open('DR_%s_new.csv' % roleHash[current_typeId],'w'); print(csv_header, file=f_DRs[current_typeId])
          print('%s,%s' % (current_typeId, roleHash[current_typeId]), file=f_used_roles)
      elif defining_rel_d[id][current_effTime][rf2_fields_d['effectiveTime']] == graph_matches_d[id]['effectiveTime']:
        stats['no_change'] += 1; continue # NO CHANGE ==> NO ADDITIONAL PROCESSING FOR THIS ENTRY
      else:
        stats['change'] += 1
        # see if rolegroup changed
        if graph_matches_d[id]['rolegroup'] != defining_rel_d[id][current_effTime][ rf2_fields_d['relationshipGroup'] ]: # rolegroup change?
          print('%s,%s,%s,%s' % (id,graph_matches_d[id]['rolegroup'],graph_matches_d[id]['sctid'],graph_matches_d[id]['destinationId']),file=f_edge_rem)
          rolegroup_changed = True # treat this as an edge create case
          if current_typeId not in f_DRs: # this row is written to the per-role "new" file below -- make sure it exists
            f_DRs[current_typeId] = open('DR_%s_new.csv' % roleHash[current_typeId],'w'); print(csv_header, file=f_DRs[current_typeId])
            print('%s,%s' % (current_typeId, roleHash[current_typeId]), file=f_used_roles)
      hist_str = compute_history_string(id, defining_rel_d, graph_matches_d, field_names, rf2_fields_d, renamed_fields)
      output_line = build_csv_output_line(id,[('history',hist_str)],current_effTime, defining_rel_d, csv_fields_d, field_names, rf2_fields_d, renamed_fields, quoted_in_csv_fields)
      for f in ([f_chg] if rolegroup_changed==False and id in graph_matches_d else [f_new, f_DRs[current_typeId]]): print(output_line,file=f)
    # Done generating CSVs
    timing_end(timing_d, timing_nm)
    timing_end(timing_d, timing_overall_nm)
    # CLEANUP, DISPLAY RESULTS
    for f in outfile_list+[f_edge_rem]+[f_DRs[typeId] for typeId in f_DRs.keys()]+[f_used_roles]: f.close() # cleanup
    print('Total RF2 elements: {:d}, ERRORS: {:d}'.format(len(rf2_idlist), stats['error_count']))
    show_timings(timing_d)
    sys.exit(stats['error_count']) # CONVENTION - return number of errors as program code (zero ==> SUCCESS)
  # END validate_defining_rels

  # validate_graphdb:
  # Output: results displayed to STDOUT; exceptions appended to the exceptions file
  opt = optparse.OptionParser()
  opt.add_option('--verbose',action='store_true',dest='verbose')
  opt.add_option('--rf2',action='store',dest='rf2')
  opt.add_option('--element',action='store', choices=['concept','description','isa_rel','defining_rel'])
  opt.add_option('--release_type', action='store', dest='release_type', choices=['delta','snapshot','full'])
  opt.add_option('--exceptions_file', action='store', dest='exceptions_file')
  opt.add_option('--neopw64', action='store')
  opt.add_option('--neopw', action='store')
  opts, args = opt.parse_args(arglist)
  if not (len(args)==0 and opts.rf2 and opts.element and opts.release_type and (opts.neopw or opts.neopw64)):
    print('Usage: validate_graphdb --element concept/description/isa_rel/defining_rel --rf2 <dir> --release_type delta/snapshot/full [--verbose] --neopw <pw>')
    sys.exit(1)
  if opts.neopw and opts.neopw64:
    print('Usage: only one of --neopw and --neopw64 may be specified')
    sys.exit(1)
  if opts.neopw64: # snomed_g v1.2, convert neopw64 to neopw
    opts.neopw = str(base64.b64decode(opts.neopw64),'utf-8') if sys.version_info[0]==3 else base64.decodestring(opts.neopw64) # py2
  # Connect to NEO4J
  #neopw = base64.decodestring( json.loads(open('necares_config.json').read())['salt'] )
  neo4j = snomed_g_lib_neo4j.Neo4j_Access(opts.neopw)
  # Connect to RF2 files
  rf2_folders = snomed_g_lib_rf2.Rf2_Folders(opts.rf2, opts.release_type)
  # Information for comparing RF2 to Graph
  attributes_by_file = snomed_g_lib_rf2.Rf2_Attributes_per_File()
  # POSSIBILITY - open exceptions file (append if it exists, write header if it did not exist)
  fn = opts.exceptions_file
  exceptions_file = open(fn, 'a') if fn else None # --exceptions_file is optional
  if exceptions_file and exceptions_file.tell()==0: print('element,id,description',file=exceptions_file) # header
  # determine the field names, NOTE: history is assumed as added last field
  if   opts.element=='concept':      validate_concepts()
  elif opts.element=='description':  validate_descriptions()
  elif opts.element=='isa_rel':      validate_isa_rels()
  elif opts.element=='defining_rel': validate_defining_rels()
  else:
    print('unknown element [%s]' % opts.element); sys.exit(1)
  return
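
All four validate_* routines share the same triage of RF2 rows against the graph: the highest effectiveTime per id is "current", and an id is new (absent from the graph), unchanged (graph already holds that effectiveTime), or changed. A standalone sketch with hypothetical ids:

def triage(rf2_d, graph_d):
  stats = {'new': 0, 'change': 0, 'no_change': 0}
  for id in rf2_d:
    current_effTime = sorted(rf2_d[id].keys())[-1] # highest effectiveTime is current
    if id not in graph_d:
      stats['new'] += 1 # in RF2, not yet in the graph
    elif current_effTime == graph_d[id]['effectiveTime']:
      stats['no_change'] += 1 # graph already at this effectiveTime
    else:
      stats['change'] += 1 # graph holds an older effectiveTime
  return stats

# eg: triage({'100': {'20160131': []}, '200': {'20050131': [], '20160131': []}},
#            {'200': {'effectiveTime': '20050131'}}) ==> {'new': 1, 'change': 1, 'no_change': 0}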
Example #3
def TC_from_graph(arglist):
  #-------------------------------------------------------------------------------
  # build_ISA_graph(children,filename)
  # Concept: Reads ISA edges from relationships file, stores in the children hash
  #-------------------------------------------------------------------------------
  def build_ISA_graph(children,isa_rels):
    for idvalue in isa_rels.keys():
      isa_map = isa_rels[idvalue]
      active, sourceId, destinationId = isa_map['active'], isa_map['sourceId'], isa_map['destinationId']
      if active=='1': # active ISA relationship
        if destinationId not in children:            # parent discovered
          children[destinationId] = set([sourceId])  # 1st child, create set
        else:
          children[destinationId].add(sourceId)      # nth child, add to set
    return # done
  
  #-------------------------------------------------------------------------------
  # compute_TC_table(startnode,children,descendants,visited)
  #-------------------------------------------------------------------------------
  # Based on a method described in "Transitive Closure Algorithms
  # Based on Graph Traversal" by Yannis Ioannidis, Raghu Ramakrishnan, and Linda Winger,
  # ACM Transactions on Database Systems, Vol. 18, No. 3, September 1993,
  # Pages: 512 - 576.
  # Simplified version of their "DAG_DFTC" algorithm.
  #-------------------------------------------------------------------------------
  # 
  def compute_TC_table(startnode,children,descendants,visited): # recursively depth-first traverse the graph.
    visited.add(startnode)
    descendants[startnode] = set([]) # no descendants yet
    if startnode not in children: return # no children case, leaf nodes
    for childnode in children[startnode]: # for all the children of the startnode
      if childnode not in visited:  # if not yet visited (Note: DFS traversal)
        compute_TC_table(childnode,children,descendants,visited) # recursively visit the childnode, set descendants
      for descendant in list(descendants[childnode]): # each descendant of childnode
        descendants[startnode].add(descendant) # mark descendants of startnode
      descendants[startnode].add(childnode) # mark immediate child of startnode
    return
  
  def print_TC_table(descendants, outfile_name):
    fout = open(outfile_name, 'w')
    for startnode in descendants.keys():
      for endnode in list(descendants[startnode]):
        print('%s,%s' % (startnode,endnode), file = fout)
    fout.close()
    return

  def show_timings(t):
    print('NEO4J Graph DB open: %g' % (t['graph_open_end']-t['graph_open_start']))
    print('ISA extraction from NEO4J: %g' % (t['isa_get_end']-t['isa_get_start']))
    print('TC computation: %g' % (t['TC_end']-t['TC_start']))
    print('Output (csv): %g' % (t['output_write_end']-t['output_write_start']))
    print('Total time: %g' % (t['end']-t['start']))

  # TC_from_graph:
  # command line parsing
  opt = optparse.OptionParser()
  opt.add_option('--neopw64', action='store', dest='neopw64')
  opts, args = opt.parse_args(arglist)
  if not (len(args)==1 and opts.neopw64):
    print('Usage: cmd TC_from_graph <TCfile-out> --neopw64 <pw>'); sys.exit(1)
  output_TC_filename = args[0]
  # Extract ISA relationships from graph (active and inactive)
  timings = {}
  timings['start'] = timer()
  timings['graph_open_start'] = timer()
  neo4j = snomed_g_lib_neo4j.Neo4j_Access(base64.decodestring(opts.neopw64))
  timings['graph_open_end'] = timer()
  timings['isa_get_start'] = timer()
  isa_rels = neo4j.lookup_all_isa_rels()
  timings['isa_get_end'] = timer()
  print('Result class: %s' % str(type(isa_rels)))
  print('Returned %d objects' % len(isa_rels))

  # Compute TC table from ISA relationships, output to specified file.
  timings['TC_start'] = timer()
  children, visited, descendants, concept_node = ({}, set(), {}, "138875005") # init
  build_ISA_graph(children, isa_rels) # build 'children' hash
  compute_TC_table(concept_node, children, descendants, visited)
  timings['TC_end'] = timer()
  timings['output_write_start'] = timer()
  print_TC_table(descendants, output_TC_filename)
  timings['output_write_end'] = timer()
  timings['end'] = timer()
  show_timings(timings)

  # All done
  return
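
A self-contained sketch of the same DAG_DFTC-style traversal on a toy four-node DAG, to make the recursion concrete (node names are hypothetical, not SNOMED CT sctids):

def tc_demo():
  children = {'A': set(['B','C']), 'B': set(['D']), 'C': set(['D'])}
  descendants, visited = {}, set()
  def dftc(node):
    visited.add(node)
    descendants[node] = set()
    for child in children.get(node, set()):
      if child not in visited:
        dftc(child) # depth-first: the child's descendant set is complete afterwards
      descendants[node] |= descendants[child]
      descendants[node].add(child)
  dftc('A')
  return descendants # {'A': {'B','C','D'}, 'B': {'D'}, 'C': {'D'}, 'D': set()}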
Example #4
def db_build(arglist):
    saved_pwd = os.getcwd()
    opt = optparse.OptionParser()
    opt.add_option('--rf2', action='store', dest='rf2')
    opt.add_option('--release_type',
                   action='store',
                   dest='release_type',
                   choices=['delta', 'snapshot', 'full'])
    opt.add_option('--action',
                   action='store',
                   dest='action',
                   default='create',
                   choices=['create', 'update'])
    opt.add_option('--neopw64', action='store', dest='neopw64')
    opt.add_option(
        '--mode',
        action='store',
        dest='mode',
        default='build',
        choices=['build', 'prep', 'make_csvs', 'run_cypher',
                 'validate'])  # build is end-to-end, others are subsets
    opt.add_option('--logfile', action='store', dest='logfile')
    opt.add_option('--output_dir',
                   action='store',
                   dest='output_dir',
                   default='.')
    opt.add_option('--relationship_file',
                   action='store',
                   dest='relationship_file',
                   default='Relationship')
    opt.add_option('--language_code',
                   action='store',
                   dest='language_code',
                   default='en')
    opt.add_option('--language_name',
                   action='store',
                   dest='language_name',
                   default='Language')
    opt.add_option('--prep_only', action='store_true', dest='prep_only')
    opts, args = opt.parse_args(arglist)
    if not (len(args) == 0 and opts.rf2 and opts.release_type
            and opts.neopw64):
        print(
            'Usage: db_build --rf2 <dir> --release_type delta/snapshot --neopw64 <base64pw>'
        )
        sys.exit(1)
    # file path separator
    pathsep = '/'
    # make sure output directory exists and is empty
    opts.output_dir = get_path(opts.output_dir, pathsep)
    if not (os.path.isdir(opts.output_dir)
            and len(os.listdir(opts.output_dir)) == 0):
        print('*** Output directory is not an empty directory [%s] ***' %
              opts.output_dir)
        sys.exit(1)
    # open logfile
    logfile = open(opts.output_dir+'build.log', 'w') if not opts.logfile else \
              (sys.stdout if opts.logfile == '-' else open(opts.logfile, 'w'))
    #---------------------------------------------------------------------------
    # Determine SNOMED_G bin directory, where snomed_g_rf2_tools.py exists, et al.
    #---------------------------------------------------------------------------
    # determine snomed_g_bin -- bin directory where snomed_g_rf2_tools.py exists in, etc -- try SNOMED_G_HOME, SNOMED_G_BIN env vbls
    # ... ask directly if these variables don't exist
    snomed_g_bin = os.environ.get(
        'SNOMED_G_BIN', None)  # unlikely to exist, but great if it does
    if not snomed_g_bin:
        snomed_g_home = os.environ.get('SNOMED_G_HOME', None)
        if snomed_g_home:
            snomed_g_bin = get_path(snomed_g_home, pathsep) + 'bin'
        else:
            snomed_g_bin = get_path(os.path.dirname(os.path.abspath(__file__)),
                                    pathsep)  # default to python script dir
    validated = False
    while not validated:
        if len(snomed_g_bin) == 0:
            snomed_g_bin = raw_input(
                'Enter SNOMED_G bin directory path where snomed_g_rf2_tools.py exists: '
            ).rstrip(pathsep)
        else:  # try to validate, look for snomed_g_rf2_tools.py
            target_file = snomed_g_bin + pathsep + 'snomed_g_rf2_tools.py'
            validated = os.path.isfile(target_file)
            if not validated:
                print('Cannot find [%s]' % target_file)
                snomed_g_bin = ''
    snomed_g_bin = get_path(snomed_g_bin, pathsep)
    print('SNOMED_G bin directory [%s]' % snomed_g_bin)
    # db_build ==> connect to NEO4J, make sure information given is good
    if opts.mode == 'build':
        neo4j = snomed_g_lib_neo4j.Neo4j_Access(
            base64.decodestring(opts.neopw64))
    # Connect to RF2 files, make sure rf2 directory given is good
    rf2_folders = snomed_g_lib_rf2.Rf2_Folders(opts.rf2, opts.release_type,
                                               opts.relationship_file,
                                               opts.language_code)
    # Build
    # open SQLITE database
    DB = StatusDb(
        os.path.abspath(
            opts.output_dir.rstrip(pathsep) + pathsep + 'build_status.db'))

    # create YYYYMMDD string
    d = datetime.datetime.now()  # determine current date
    yyyymmdd = '%04d%02d%02d' % (d.year, d.month, d.day)
    job_start_datetime = datetime.datetime.now()

    # Commands needed to Create/Update a SNOMED_G Graph Database
    command_list_db_build = [{
        'stepname':
        'JOB_START',
        'log':
        'JOB-START(action:[%s], mode:[%s], release_type:[%s], rf2:[%s], date:[%s])'
        % (opts.action, opts.mode, opts.release_type, opts.rf2, yyyymmdd)
    }, {
        'stepname':
        'FIND_ROLENAMES',
        'cmd':
        'python %s/snomed_g_rf2_tools.py find_rolenames --release_type %s --rf2 %s --language_code %s --language_name %s'
        % (snomed_g_bin, opts.release_type, opts.rf2, opts.language_code,
           opts.language_name),
        'mode': ['build', 'prep', 'make_csvs', 'validate']
    }, {
        'stepname':
        'FIND_ROLEGROUPS',
        'cmd':
        'python %s/snomed_g_rf2_tools.py find_rolegroups --release_type %s --rf2 %s --language_code %s --language_name %s'
        % (snomed_g_bin, opts.release_type, opts.rf2, opts.language_code,
           opts.language_name),
        'mode': ['build', 'prep', 'make_csvs']
    }, {
        'stepname':
        'MAKE_CONCEPT_CSVS',
        'cmd':
        'python %s/snomed_g_rf2_tools.py make_csv --element concept --release_type %s --rf2 %s --neopw64 %s --action %s --relationship_file %s --language_code %s --language_name %s'
        %
        (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw64, opts.action,
         opts.relationship_file, opts.language_code, opts.language_name),
        'mode': ['build', 'prep', 'make_csvs', 'validate']
    }, {
        'stepname':
        'MAKE_DESCRIPTION_CSVS',
        'cmd':
        'python %s/snomed_g_rf2_tools.py make_csv --element description --release_type %s --rf2 %s --neopw64 %s --action %s --relationship_file %s --language_code %s --language_name %s'
        %
        (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw64, opts.action,
         opts.relationship_file, opts.language_code, opts.language_name),
        'mode': ['build', 'prep', 'make_csvs', 'validate']
    }, {
        'stepname':
        'MAKE_ISA_REL_CSVS',
        'cmd':
        'python %s/snomed_g_rf2_tools.py make_csv --element isa_rel --release_type %s --rf2 %s --neopw64 %s --action %s --relationship_file %s --language_code %s --language_name %s'
        %
        (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw64, opts.action,
         opts.relationship_file, opts.language_code, opts.language_name),
        'mode': ['build', 'prep', 'make_csvs', 'validate']
    }, {
        'stepname':
        'MAKE_DEFINING_REL_CSVS',
        'cmd':
        'python %s/snomed_g_rf2_tools.py make_csv --element defining_rel --release_type %s --rf2 %s --neopw64 %s --action %s --relationship_file %s --language_code %s --language_name %s'
        %
        (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw64, opts.action,
         opts.relationship_file, opts.language_code, opts.language_name),
        'mode': ['build', 'prep', 'make_csvs', 'validate']
    }, {
        'stepname':
        'TEMPLATE_PROCESSING',
        'cmd':
        'python %s/snomed_g_template_tools.py instantiate %s/snomed_g_graphdb_cypher_%s.template build.cypher --rf2 %s --release_type %s'
        % (snomed_g_bin, snomed_g_bin,
           ('create' if opts.action == 'create' else 'update'), opts.rf2,
           opts.release_type),
        'mode': ['build', 'prep']
    }, {
        'stepname':
        'CYPHER_EXECUTION',
        'cmd':
        'python %s/snomed_g_neo4j_tools.py run_cypher %s/build.cypher --verbose --neopw64 %s'
        % (snomed_g_bin, opts.output_dir, opts.neopw64),
        'mode': ['build', 'run_cypher']
    }, {
        'stepname':
        'CHECK_RESULT',
        'cmd':
        'python %s/snomed_g_neo4j_tools.py run_cypher %s/snomed_g_graphdb_update_failure_check.cypher --verbose --neopw64 %s'
        % (snomed_g_bin, snomed_g_bin, opts.neopw64),
        'mode': ['build', 'run_cypher']
    }, {
        'stepname': 'JOB_END',
        'log': 'JOB-END'
    }]
    command_list_db_build_prep = [{
        'stepname':
        'JOB_START',
        'log':
        'JOB-START(action:[%s], mode:[%s], release_type:[%s], rf2:[%s], date:[%s])'
        % (opts.action, opts.mode, opts.release_type, opts.rf2, yyyymmdd)
    }, {
        'stepname':
        'FIND_ROLENAMES',
        'cmd':
        'python %s/snomed_g_rf2_tools.py find_rolenames --release_type %s --rf2 %s --language_code %s --language_name %s'
        % (snomed_g_bin, opts.release_type, opts.rf2, opts.language_code,
           opts.language_name),
        'mode': ['build', 'prep', 'make_csvs', 'validate']
    }, {
        'stepname':
        'FIND_ROLEGROUPS',
        'cmd':
        'python %s/snomed_g_rf2_tools.py find_rolegroups --release_type %s --rf2 %s --language_code %s --language_name %s'
        % (snomed_g_bin, opts.release_type, opts.rf2, opts.language_code,
           opts.language_name),
        'mode': ['build', 'prep', 'make_csvs']
    }, {
        'stepname':
        'MAKE_CONCEPT_CSVS',
        'cmd':
        'python %s/snomed_g_rf2_tools.py make_csv --element concept --release_type %s --rf2 %s --neopw64 %s --action %s --relationship_file %s --language_code %s --language_name %s'
        %
        (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw64, opts.action,
         opts.relationship_file, opts.language_code, opts.language_name),
        'mode': ['build', 'prep', 'make_csvs', 'validate']
    }, {
        'stepname':
        'MAKE_DESCRIPTION_CSVS',
        'cmd':
        'python %s/snomed_g_rf2_tools.py make_csv --element description --release_type %s --rf2 %s --neopw64 %s --action %s --relationship_file %s --language_code %s --language_name %s'
        %
        (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw64, opts.action,
         opts.relationship_file, opts.language_code, opts.language_name),
        'mode': ['build', 'prep', 'make_csvs', 'validate']
    }, {
        'stepname':
        'MAKE_ISA_REL_CSVS',
        'cmd':
        'python %s/snomed_g_rf2_tools.py make_csv --element isa_rel --release_type %s --rf2 %s --neopw64 %s --action %s --relationship_file %s --language_code %s --language_name %s'
        %
        (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw64, opts.action,
         opts.relationship_file, opts.language_code, opts.language_name),
        'mode': ['build', 'prep', 'make_csvs', 'validate']
    }, {
        'stepname':
        'MAKE_DEFINING_REL_CSVS',
        'cmd':
        'python %s/snomed_g_rf2_tools.py make_csv --element defining_rel --release_type %s --rf2 %s --neopw64 %s --action %s --relationship_file %s --language_code %s --language_name %s'
        %
        (snomed_g_bin, opts.release_type, opts.rf2, opts.neopw64, opts.action,
         opts.relationship_file, opts.language_code, opts.language_name),
        'mode': ['build', 'prep', 'make_csvs', 'validate']
    }, {
        'stepname':
        'TEMPLATE_PROCESSING',
        'cmd':
        'python %s/snomed_g_template_tools.py instantiate %s/snomed_g_graphdb_cypher_%s.template build.cypher --rf2 %s --release_type %s'
        % (snomed_g_bin, snomed_g_bin,
           ('create' if opts.action == 'create' else 'update'), opts.rf2,
           opts.release_type),
        'mode': ['build', 'prep']
    }, {
        'stepname': 'JOB_END',
        'log': 'JOB-END'
    }]
    # OLD --     #{'stepname':'CYPHER_EXECUTION',       'cmd':'%s/neo4j-shell -localhost -file build.cypher' % neo4j_bin, 'mode':['build','run_cypher']},
    command_list = command_list_db_build if not opts.prep_only else command_list_db_build_prep
    stepnames = [x['stepname'] for x in command_list]  # list of dictionaries
    seqnum = DB.get_next_sequence_number()
    # Execute commands (BUILD)
    results_d = {}
    for command_d in command_list:
        # extract step fields from the command dictionary
        stepname, cmd, logmsg, expected_status, mode_requirement = \
          command_d['stepname'], command_d.get('cmd',None), command_d.get('log',None), command_d.get('expected_status',0), command_d.get('mode', None)
        if mode_requirement and opts.mode not in mode_requirement:
            continue  # eg: NEO4J execution only in build mode
        results_d[stepname] = {}
        cmd_start = datetime.datetime.now() if stepname != 'JOB_END' else job_start_datetime  # start timer
        status = -1
        should_break = False
        results_d[stepname]['result'] = 'SUCCESS'  # assumption of success until failure determined
        results_d[stepname]['expected_status'] = expected_status
        results_d[stepname]['command'] = cmd
        print(stepname)
        print(stepname, file=logfile)  # show the user which step we are on
        if logmsg:  # log-only step, no command to execute in a separate process
            results_d[stepname]['status'] = 0
            results_d[stepname]['STDOUT'] = logmsg  # record the log message as the step output
            output, err = '', ''
        else:  # execute command (cmd) in subprocess
            print(cmd, file=logfile)
            try:
                #p = subprocess.Popen(cmd, shell=True,stdin=PIPE, stdout=PIPE, stderr=PIPE)
                #output, err = p.communicate(b"")
                #status = p.returncode
                cmd_as_list = cmd.split(' ')
                if opts.output_dir != '.':
                    os.chdir(opts.output_dir)  # move to output_dir to start the subprocess there
                subprocess.check_call(cmd_as_list, stdout=logfile, stderr=logfile)
                if opts.output_dir != '.':
                    os.chdir(saved_pwd)  # return to the saved directory (popd)
                status = 0  # if no exception -- status is zero
            except subprocess.CalledProcessError as e:
                status = e.returncode
                results_d[stepname]['status'] = status
                if status != expected_status:
                    results_d[stepname]['result'] = 'FAILED (STATUS %d)' % status
                    should_break = True
                pass  # a nonzero status might be fine; should_break controls termination
            except:  # any other exception; NOTE: status stays at its -1 default from above
                # assumed handling (the original example is truncated at this point):
                # record the failure and stop, mirroring the CalledProcessError path
                results_d[stepname]['status'] = status
                results_d[stepname]['result'] = 'FAILED (EXCEPTION)'
                should_break = True
Example No. 5
def TC_fordate_from_graph(arglist):

  def active_at_date(datestring, isa_edge):
    active = '0' # if no information applies (possible), default to inactive
    # check the current definition, may be in effect at given date
    if isa_edge['effectiveTime'] <= datestring: # the current definition is in effect at the given date
      active = isa_edge['active']
    elif len(isa_edge['history']) > 2: # current def not in effect; check history ('[]' has length 2, so >2 means non-empty)
      # eg: datestring = 20050101 and current effectiveTime is 20160101 ==> not in effect
      #     hist items 20030101 and 20040101 exist ==> the 20040101 item is in effect at 20050101.
      # note: no need to check the current element again, already determined not in effect
      # JSON example: [{"typeId": "116680003", "sourceId": "900000000000441003", ...}, {...}]
      ordered_history_list = json.loads(isa_edge['history'])
      for hist_elem in ordered_history_list: # list of maps
        if hist_elem['effectiveTime'] > datestring: break # in future vs given date
        if 'active' in hist_elem: active = hist_elem['active']
    return active=='1'
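  # Worked example (hypothetical edge, not from the source): with
  # datestring='20050101' and an edge whose current effectiveTime is
  # '20160101' (not yet in effect at that date), a history of
  #   '[{"effectiveTime":"20030101","active":"1"},{"effectiveTime":"20040101","active":"0"}]'
  # makes active_at_date return False: the loop keeps the 20040101 entry, the
  # latest one on or before the given date, and its active flag is '0'.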
  
  #-------------------------------------------------------------------------------
  # build_ISA_graph(children, isa_rels, yyyymmdd)
  # Concept: walks the ISA edges extracted from the graph and stores, for each
  #          parent, the set of its direct children in the children hash
  #-------------------------------------------------------------------------------
  def build_ISA_graph(children,isa_rels,yyyymmdd):
    for idvalue in isa_rels.keys():
      isa_map = isa_rels[idvalue]
      sourceId, destinationId = isa_map['sourceId'], isa_map['destinationId']
      if active_at_date(yyyymmdd, isa_map):
        if destinationId not in children:            # parent discovered
          children[destinationId] = set([sourceId])  # 1st child, create set
        else:
          children[destinationId].add(sourceId)      # nth child, add to set
    return # done
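  # Shape of the result (hypothetical ids, for illustration only): if active
  # edges X ISA P and Y ISA P exist at the given date, build_ISA_graph leaves
  #   children == {'P': set(['X', 'Y'])}
  # i.e. a map from each parent concept to the set of its direct children.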
  
  #-------------------------------------------------------------------------------
  # compute_TC_table(startnode,children,descendants,visited)
  #-------------------------------------------------------------------------------
  # Based on a method described in "Transitive Closure Algorithms
  # Based on Graph Traversal" by Yannis Ioannidis, Raghu Ramakrishnan, and Linda Winger,
  # ACM Transactions on Database Systems, Vol. 18, No. 3, September 1993,
  # Pages: 512 - 576.
  # Simplified version of their "DAG_DFTC" algorithm.
  #-------------------------------------------------------------------------------
  # 
  def compute_TC_table(startnode,children,descendants,visited): # recursively depth-first traverse the graph.
    visited.add(startnode)
    descendants[startnode] = set([]) # no descendants yet
    if startnode not in children: return # no children case, leaf nodes
    for childnode in children[startnode]: # for all the children of the startnode
      if childnode not in visited:  # if not yet visited (Note: DFS traversal)
        compute_TC_table(childnode,children,descendants,visited) # recursively visit the childnode, set descendants
      for descendant in list(descendants[childnode]): # each descendant of childnode
        descendants[startnode].add(descendant) # mark descendants of startnode
      descendants[startnode].add(childnode) # mark immediate child of startnode
    return
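  # A minimal sanity check (hypothetical toy DAG, not SNOMED data):
  #   children = {'A': set(['B', 'C']), 'B': set(['C'])}
  #   descendants, visited = {}, set()
  #   compute_TC_table('A', children, descendants, visited)
  #   assert descendants['A'] == set(['B', 'C']) and descendants['B'] == set(['C'])
  # Each node ends up mapped to the full set of nodes reachable below it.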
  
  def print_TC_table(descendants, outfile_name):
    fout = open(outfile_name, 'w')
    for startnode in descendants.keys():
      for endnode in list(descendants[startnode]):
        print('%s,%s' % (startnode,endnode), file = fout)
    fout.close()
    return

  def show_timings(t):
    print('NEO4J Graph DB open: %g' % (t['graph_open_end']-t['graph_open_start']))
    print('ISA extraction from NEO4J: %g' % (t['isa_get_end']-t['isa_get_start']))
    print('TC computation: %g' % (t['TC_end']-t['TC_start']))
    print('Output (csv): %g' % (t['output_write_end']-t['output_write_start']))
    print('Total time: %g' % (t['end']-t['start']))

  # TC_fordate_from_graph:
  # command line parsing
  opt = optparse.OptionParser()
  opt.add_option('--neopw64', action='store', dest='neopw64')
  opts, args = opt.parse_args(arglist)
  if not (len(args)==2 and opts.neopw64):
    print('Usage: cmd TC_fordate_from_graph YYYYMMDD <TCfile-out> --neopw64 <pw>'); sys.exit(1)
  yyyymmdd, output_TC_filename = args[0], args[1]
  # Extract ISA relationships from graph (active and inactive)
  timings = {}
  timings['start'] = timer()
  timings['graph_open_start'] = timer()
  neo4j = snomed_g_lib_neo4j.Neo4j_Access(str(base64.b64decode(opts.neopw64),'utf-8') if sys.version_info[0]==3 else base64.decodestring(opts.neopw64)) # py2/py3
  timings['graph_open_end'] = timer()
  timings['isa_get_start'] = timer()
  isa_rels = neo4j.lookup_all_isa_rels()
  timings['isa_get_end'] = timer()
  print('Result class: %s' % str(type(isa_rels)))
  print('Returned %d objects' % len(isa_rels))

  # Compute TC table from ISA relationships, output to specified file.
  timings['TC_start'] = timer()
  children, visited, descendants, concept_node = ({}, set(), {}, "138875005") # init; 138875005 is the SNOMED CT root concept
  build_ISA_graph(children, isa_rels, yyyymmdd) # build 'children' hash
  compute_TC_table(concept_node, children, descendants, visited)
  timings['TC_end'] = timer()
  timings['output_write_start'] = timer()
  print_TC_table(descendants, output_TC_filename)
  timings['output_write_end'] = timer()
  timings['end'] = timer()
  show_timings(timings)
  return
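# Hypothetical invocation (the date and output filename are made-up examples),
# matching the usage string above:
#   TC_fordate_from_graph(['20180131', 'tc_20180131.csv', '--neopw64', '<base64-encoded-password>'])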