def parse_variable_metadata(varfile):
    """Parse a variable-metadata file into NLP-processed name/description tokens.

    Each line of *varfile* is expected to look like
    ``level1/level2/.../varname|||description text`` — a '/'-separated
    variable-name hierarchy, then '|||', then a free-text description.
    A line without '|||' raises IndexError (same as the original behavior).

    Returns a 3-tuple:
        varnamelist  -- per line, a list of NLP-processed name components
        varfieldlist -- per line, the NLP-processed description string
        orgvarlist   -- the raw input lines, unmodified
    """
    dic = load_acronym('acronym_list.txt')
    with open(varfile, 'rb') as f:
        varcontent = f.read().splitlines()
    varnamelist = []
    varfieldlist = []
    orgvarlist = []
    for line in varcontent:
        orgvarlist.append(line)
        cstr = line.split('|||')
        # process varname info: clean and stem/lemmatize each '/'-level
        cname = []
        for component in cstr[0].split('/'):
            cleaned = nlp_phrasecleaning(component, dic)
            cname.append(nlp_StopStemLemm(cleaned))
        varnamelist.append(cname)
        # process description info the same way (single phrase, no hierarchy)
        desc = nlp_phrasecleaning(cstr[1], dic)
        varfieldlist.append(nlp_StopStemLemm(desc))
    return varnamelist, varfieldlist, orgvarlist
def parse_GCMD_Keywords(keywordfile): dic = load_acronym('acronym_list.txt') #assuming keywords for a data set from a file such as xxx_keyword_original.txt with open(keywordfile, 'rb') as f: keylist = f.read().splitlines() keylist = list(set(keylist)) org_keywordlist = [] #get a set of keywords for this dataset fullkeywordlist = [] partialkeywordlist = [] for i in range(len(keylist)): ckeylist = keylist[i] ckeylist = ckeylist.rstrip() ckeylist = ckeylist.lstrip() org_keywordlist.append(ckeylist) strlist = ckeylist.split('->') for j in range(len(strlist)): slist = strlist[j].split('/') for k in range(len(slist)): fullkeywordlist.append(slist[k]) plist = slist[k].split() for m in range(len(plist)): partialkeywordlist.append(plist[m]) #nlp processing (removing stop word, stem and lemmatize) fullkeywordlist = list(set(fullkeywordlist)) partialkeywordlist = list(set(partialkeywordlist)) fullwordlist = [] for i in range(len(fullkeywordlist)): cstr = nlp_phrasecleaning(fullkeywordlist[i], dic) fullwordlist.append(nlp_StopStemLemm(cstr)) partialwordlist = [] for i in range(len(partialkeywordlist)): cstr = nlp_phrasecleaning(partialkeywordlist[i], dic) partialwordlist.append(nlp_StopStemLemm(cstr)) #cleaning original keywordlist for duplicate org_keywordlist = list(set(org_keywordlist)) print "FF: ", fullwordlist print "JJ: ", partialwordlist print "KK: ", org_keywordlist return list(set(fullwordlist)), list(set(partialwordlist)), org_keywordlist
def parse_GCMD_Keywords_by_list(keylist):
    """Parse an in-memory list of GCMD keyword chains into NLP-processed words.

    Same decomposition as parse_GCMD_Keywords ('->' hierarchy levels, then
    '/' sub-phrases, then whitespace-separated words), but takes the list
    directly instead of reading a file, does not strip whitespace, and
    does not de-duplicate the original list.

    Returns a 3-tuple:
        fullwordlist    -- unique NLP-processed '/'-level phrases
        partialwordlist -- unique NLP-processed single words
        org_keywordlist -- the input keywords in order (may contain duplicates)
    """
    dic = load_acronym('acronym_list.txt')
    org_keywordlist = []
    # get a set of keywords for this dataset
    fullkeywordlist = []     # every '->'/'/'-delimited component phrase
    partialkeywordlist = []  # every individual word within those phrases
    for ckey in keylist:
        org_keywordlist.append(ckey)
        for level in ckey.split('->'):
            for phrase in level.split('/'):
                fullkeywordlist.append(phrase)
                for word in phrase.split():
                    partialkeywordlist.append(word)
    # nlp processing (removing stop word, stem and lemmatize)
    fullkeywordlist = list(set(fullkeywordlist))
    partialkeywordlist = list(set(partialkeywordlist))
    fullwordlist = []
    for phrase in fullkeywordlist:
        cstr = nlp_phrasecleaning(phrase, dic)
        fullwordlist.append(nlp_StopStemLemm(cstr))
    partialwordlist = []
    for word in partialkeywordlist:
        cstr = nlp_phrasecleaning(word, dic)
        partialwordlist.append(nlp_StopStemLemm(cstr))
    return list(set(fullwordlist)), list(set(partialwordlist)), org_keywordlist
def map_var_2_keyword(keywordlist, varfield, keylist1, keylist2, var_meta, field_meta, \
        idf_varlist_k1, idf_fieldlist_k1, idf_varlist_k2, idf_fieldlist_k2):
    """Score every variable against every science keyword using tf-idf matching.

    For each keyword chain in *keywordlist* ('->'-separated hierarchy), four
    scores are computed per variable and combined with fixed weights:
      1. variable-name components vs. keyword phrases   (idf_varlist_k1)
      2. variable description  vs. keyword phrases      (idf_fieldlist_k1)
      3. variable-name components vs. individual words  (idf_varlist_k2)
      4. variable description  vs. individual words     (idf_fieldlist_k2)
    Phrase matches are weighted by their depth in both the keyword and the
    variable-name hierarchies (deeper / more specific matches score higher).

    Parameters varfield, keylist1 and keylist2 are accepted for interface
    compatibility but are not used by the scoring itself.

    Returns a list (one entry per keyword) of per-variable score lists.
    """
    dic = load_acronym('acronym_list.txt')
    num_var = len(var_meta)
    num_keywords = len(keywordlist)
    overall_score = []  # renamed from misspelled 'ovreall_score' (local only)
    for i in range(num_keywords):  # each original science keyword
        cscorelist = []  # score of each variable matched to this keyword
        ckey = keywordlist[i]
        keylist = ckey.split('->')  # hierarchy levels of the current keyword
        # NLP-process each level the same way the variable text was processed,
        # so the two sides are comparable
        kwd = []
        for j in range(len(keylist)):
            cstr = nlp_phrasecleaning(keylist[j], dic)
            kwd.append(nlp_StopStemLemm(cstr))
        nkwd = float(len(kwd))  # float so Py2 '/' below is true division

        # (1) variable name vs. keyword phrases, tf-idf weighted by hierarchy
        var_k1_score = []
        for k in range(num_var):
            cvar = var_meta[k]          # list of name components for variable k
            nlev = float(len(cvar))     # lowest level gets weight 1.0
            cscore = 0.0
            for m in range(len(cvar)):
                cstr = cvar[m]
                for n in range(len(kwd)):
                    tf = cstr.count(kwd[n])               # term frequency in name
                    idf = idf_varlist_k1.get(kwd[n], 0.0)  # idf, default 0.0
                    # weighted by keyword depth (n+1) and variable depth (m+1)
                    cscore = cscore + tf * idf * (m + 1) * (n + 1) / (nkwd * nlev)
            var_k1_score.append(cscore)

        # (2) variable description vs. keyword phrases (no variable hierarchy)
        field_k1_score = []
        for k in range(num_var):
            fvar = field_meta[k]
            cscore = 0.0
            for n in range(len(kwd)):
                tf = fvar.count(kwd[n])
                idf = idf_fieldlist_k1.get(kwd[n], 0.0)
                cscore = cscore + tf * idf * (n + 1) / nkwd
            field_k1_score.append(cscore)

        # decompose the keyword phrases into unique individual words
        elem_kwd = []
        for kk in range(len(kwd)):
            for word in kwd[kk].split():
                elem_kwd.append(word)
        elem_kwd = list(set(elem_kwd))

        # (3) variable name vs. individual keyword words
        var_k2_score = []
        for k in range(num_var):
            cvar = var_meta[k]
            nlev = float(len(cvar))
            cscore = 0.0
            for m in range(len(cvar)):
                cstr = cvar[m]
                for n in range(len(elem_kwd)):
                    tf = cstr.count(elem_kwd[n])
                    idf = idf_varlist_k2.get(elem_kwd[n], 0.0)
                    cscore = cscore + tf * idf * (m + 1) / nlev
            var_k2_score.append(cscore)

        # (4) variable description vs. individual keyword words (plain tf-idf)
        field_k2_score = []
        for k in range(num_var):
            fvar = field_meta[k]
            cscore = 0.0
            for n in range(len(elem_kwd)):
                tf = fvar.count(elem_kwd[n])
                idf = idf_fieldlist_k2.get(elem_kwd[n], 0.0)
                cscore = cscore + tf * idf
            field_k2_score.append(cscore)

        # combine the 4 categories with subjective weights:
        # wt1 = 0.5 -> description scores get half the credit of name scores
        # wt2 = 0.1 -> word (partial) matches get 1/10 the credit of phrase matches
        wt1 = 0.5
        wt2 = 0.1
        for kk in range(num_var):
            f_score = var_k1_score[kk] + wt1 * field_k1_score[kk] + \
                wt2 * var_k2_score[kk] + wt1 * wt2 * field_k2_score[kk]
            cscorelist.append(f_score)
        overall_score.append(cscorelist)
        # NOTE: the original computed a descending rank of cscorelist here, but
        # its only consumer (a debug print loop) was commented out, so the dead
        # computation has been removed.
    return overall_score