def compare_all(fin1, fdin2): # read input data if fin1 is None or fin1 =="": return False texts = ufile.read_csv(fin1) # a specific file or a directory # read input data if fdin2 is None or fdin2 =="": return False EDBlist = ufile.load_files (fdin2) # a specific file or a directory result = [] cur = 0 for text in texts: cur += 1 result_items_new =[] result_items = ast.literal_eval(text[2]) #print result_items for result_item in result_items: #print result_item[0] in EDBlist if result_item[0] in EDBlist: result_items_new.append(result_item) result.append((text[0], text[1], str(result_items_new))) # output result fout = os.path.splitext(fin1)[0] + "_EDB.csv" ufile.write_csv(fout, result) print 'saved result into: %s' % fout print ext_print ('all tasks completed\n') return True
def extract_variables (fdin, ffea, ffea2, var, cores):
    """Parallel driver: extract variable expressions from trial records.

    fdin  -- CSV of input data items (trials)
    ffea  -- CSV of domain-knowledge features; first row is assumed to be the
             header keyed "Variable name"
    ffea2 -- CSV of UMLS features
    var   -- a single variable name to extract, or "All"
    cores -- number of worker processes to split the trials across
    Returns True on success, False on missing/empty inputs.
    """
    # read input dataset
    if fdin is None or fdin =="":
        return False
    trials = ufile.read_csv (fdin)
    if trials is None or len(trials) <= 0:
        print ext_print ('input data error, please check either no such file or no data --- interrupting')
        return False
    print ext_print ('found a total of %d data items' % len(trials))

    # read feature list - domain knowledge
    if ffea is None or ffea =="":
        return False
    fea_dict_dk = ufile.read_csv_as_dict_with_multiple_items (ffea)
    if fea_dict_dk is None or len(fea_dict_dk) <= 0:
        print ext_print ('no feature data available --- interrupting')
        return False

    # get feature info: `features` maps variable name -> feature row,
    # `feature_dict_dk` maps each '|'-separated alias -> canonical name
    features, feature_dict_dk = {}, {}
    if var == "All":
        features = fea_dict_dk
        del features["Variable name"]  # drop the header row
    elif var in fea_dict_dk:
        features = {var: fea_dict_dk[var]}
    for key, value in fea_dict_dk.iteritems():
        names = value[0].lower().split('|')
        for name in names:
            if name.strip() != '':
                feature_dict_dk[name.strip()] = key

    # read feature list - umls
    if ffea2 is None or ffea2 =="":
        return False
    fea_dict_umls = ufile.read_csv_as_dict (ffea2)
    if fea_dict_umls is None or len(fea_dict_umls) <= 0:
        print ext_print ('no feature data available --- interrupting')
        return False

    # shared list so worker processes can append results concurrently
    output = Manager().list()
    jobs = []
    # each worker i handles the slice [len*(i-1)/cores, len*i/cores - 1];
    # NOTE: relies on Python 2 integer division -- under Python 3 these
    # bounds would become floats (use // if ever ported)
    for i in xrange(1,cores+1):
        t = Process(target=worker, args=(trials, len(trials)*(i-1)/cores, len(trials)*i/cores-1, var, features, feature_dict_dk, fea_dict_umls, output))
        jobs.append(t)
        t.start()
    for j in jobs:
        j.join()

    # output result
    fout = os.path.splitext(fdin)[0] + "_exp_%s.csv" % var
    ufile.write_csv (fout, output)
    print ext_print ('saved processed results into: %s' % fout)
    return True
def compare_all(fin1): # read input data if fin1 is None or fin1 == "": return False texts = ufile.read_csv(fin1) # a specific file or a directory result = [] start_time = time.time() cur = 0 for text in texts: simi_valuesList = [] cur += 1 if len(text[1].split('.')) > 1: target_word, pos = text[1].split('.')[0], text[1].split('.')[1] else: target_word, pos = text[1], None print "%d of %d" % (cur, len(texts)), target_word candidatewords = text[2] candidatewords = ast.literal_eval(candidatewords) simi_values = [] for candidate in candidatewords: #print "candidate:" #print candidate word2 = candidate[0] # print word2 try: simi_values = gensim_model.similarity(target_word, word2) except KeyError: simi_values = 0 # word_sim[word2] = round(float(simi_values), 5) simi_valuesList.append((word2, round(float(simi_values), 5))) simi_valuesList.sort(key=operator.itemgetter(1), reverse=True) # sort by rank value print "simi_valuesList:" print simi_valuesList[:30] result.append((text[0], text[1], simi_valuesList[:30])) print result print("--- %s seconds ---" % (time.time() - start_time)) fout = os.path.splitext(fin1)[0] + "_rank.csv" ufile.write_csv(fout, result) print 'saved result into: %s' % fout print ext_print('all tasks completed\n') return True
def CT_extractxml (fin, fout=None): processed_list = [] # set processed trials into here to avoid redundency for root, dir, files in os.walk(fin): for f in files: if not f.endswith(".xml") or f in processed_list: continue print ext_print (f) processed_list.append(f) if len(processed_list)%1000 == 0: print ('Processing %d' % len(processed_list)) output = [] # read input data fdin = os.path.join(root, f) text = ufile.read_file (fdin, 3, False) if text is not None: ct_xml = xml_parser.fromstring(text) blocks = ct_xml.findall('MedlineCitation') ct_xml = "" for block in blocks: (PMID, JournalTitle, PubDate, ArticleTitle, Abstract, Keywords) = extract_component(block) output.append((PMID, JournalTitle, PubDate, ArticleTitle, Abstract, Keywords)) blocks = [] # set output data file fout = os.path.splitext(fdin)[0] + "_extracted.csv" ufile.write_csv (fout, output) print ext_print ('saved result in: %s' % fout) print ext_print ('all tasks completed\n') return True
def file_merge(fdin, fout, columns, format): # read input data if fdin is None or fdin == "": return False texts = ufile.read(fdin) # a specific file or a directory result = [] print texts if columns == "all": result = texts else: cols = columns.split('|') for text in texts: if len(cols) == 1: result.append(text[int(cols) - 1]) else: for col in cols: result.append(text[int(col) - 1]) print ext_print('get %d in total' % len(result)) # get output data directory if fout is None: fout = os.path.splitext(fdin)[0] + "_merged" + format # output detailed result into file if format == "" or ".txt": ufile.write_file(fout, result, False) elif format == ".csv": ufile.write_csv(fout, result) print ext_print('saved result into: %s' % fout) print ext_print('all tasks completed\n') return True
def Insert_DB(fin):
    """Walk `fin` for CSVs of extracted citations and insert each row into
    the MySQL table for its publication year.

    Only rows with PubDate == '2009' are inserted; the year is hard-coded so
    that different machines can each process one year table in parallel.
    Returns True when the walk completes.
    """
    # NOTE: `dir` shadows the builtin of the same name (unused in the body)
    for root, dir, files in os.walk(fin):
        for f in files:
            if not f.endswith(".csv"):
                continue
            print ext_print(f)
            output = []  # NOTE(review): never used -- apparent copy-paste leftover
            # read input data
            fdin = os.path.join(root, f)
            rows = ufile.read_csv(fdin)
            for row in rows:
                # row layout: (PMID, JournalTitle, PubDate, ArticleTitle, Abstract, Keywords)
                PubDate = row[2]
                if PubDate != '2009': # To split tasks into differnt machines with MySQL, process tables separatly
                    continue
                table = "article_" + PubDate
                # Disabled alternative: route rows into year-range tables.
                # # if (row[2] == '' or row[2] is None): PubDate = 0
                # PubDate = int(PubDate)
                # table = 'article_0-1950'
                # if 2000 >= PubDate >= 1951:
                #     table = 'article_1951-2000'
                # if 2005 >= PubDate >= 2001:
                #     table = 'article_2001-2005'
                # elif PubDate > 2005:
                #     table = 'article_'+ str(PubDate)
                param = (row[0], row[1], PubDate, row[3], row[4], row[5])
                # Values are parameterized; the table name is interpolated but
                # derives from the constant '2009' check above, not user input.
                sql = "INSERT INTO `" + table + "` (`PMID`, `JournalTitle`, `PubDate`, `ArticleTitle`, `Abstract`, `Keywords`) VALUES(%s, %s, %s, %s, %s, %s);"
                msg = db.execute(sql, param)
                # presumably db.execute returns 1 on success -- verify against udb
                if msg != 1:
                    print msg
    print ext_print('all tasks completed\n')
    return True
def compare_all(fin1, fin2, n, fin3, fout1=None): # read Google 1T corpus print ext_print("start to load Google 1T grams") # load gram data Goole_grams, max_fre = {}, 0 if fin2 is None or fin2 == "": return False fid = open(fin2, 'r') for line in fid: line = line.strip() if len(line) > 0: max_fre += 1 if max_fre % 1000000 == 0: print max_fre tem = line.split('\t') if len(tem) <= 1: print ext_print("Data error! please check!" + str(tem)) fid.close() print ext_print("all files loaded" + str(max_fre)) return True
def temporal_processing(fin, fout=None, type="testing", fin_t=None, rep_enable=False, rep_word="", event=False, X3=False):
    """Train temporal patterns from a text file, or apply trained patterns.

    fin        -- input text file (one item per line)
    fout       -- output path; derived from `fin` when None/empty
    type       -- "training" (learn patterns) or "testing" (apply patterns);
                  NOTE: shadows the builtin `type`
    fin_t      -- pattern file, required in "testing" mode
    rep_enable -- enable replacement of matched expressions (testing mode)
    rep_word   -- replacement token used when rep_enable is set
    event      -- also process event information (testing mode)
    X3         -- emit TimeX3-style XML output (testing mode)
    Returns True on completion, None (implicit) on input errors.
    """
    # read the input data
    if (fin) is None:
        print ext_print('no input file found --- interrupting')
        return
    texts = ufile.read_file(fin, 1, False)
    if texts is None or len(texts) <= 0:
        print ext_print('no text available for processing --- interrupting')
        return
    print ext_print('start to process temporal information in text file %s' % fin)
    if type == "training":
        tpatts = temporal_training(texts)
        # output pattern result, most frequent pattern first
        if (fout is None) or (fout == ""):
            fout = os.path.splitext(fin)[0] + "_pat" + os.path.splitext(fin)[1]
        ufile.write_file(fout, sorted(tpatts, key=tpatts.get, reverse=True), False)
        print ext_print('saved trained patterns into: %s' % fout)
    elif type == "testing":
        # read the pattern data
        if (fin_t) is None:
            print ext_print('no pattern file found --- interrupting')
            return
        tpatts = ufile.read_file(fin_t, 1, False)
        if tpatts is None or len(tpatts) <= 0:
            print ext_print('no patterns available for processing --- interrupting')
            return
        result = temporal_testing(texts, tpatts, rep_enable, rep_word, event)
        if X3:
            result = using_TimeX3(result)
        # output result; the extension depends on whether TimeX3 XML was made
        if (fout is None) or (fout == ""):
            if X3:
                fout = os.path.splitext(fin)[0] + "_TEXer.xml"
            else:
                fout = os.path.splitext(fin)[0] + "_TEXer" + os.path.splitext(fin)[1]
        ufile.write_file(fout, result, False)
        print ext_print('saved processed results into: %s' % fout)
    print ext_print('all tasks completed\n')
    return True
def compare_all(fin1, fin2, n, fin3, fout1=None): # Referee related words of target words to reduce the size of loaded file into memory if fin1 is None or fin1 == "": return False orig_texts = ufile.read_csv(fin1) # a specific file or a directory print orig_texts Related_words = {} for text in orig_texts: target_word = text[2].split('.')[0] words = text[1].lower().split() # sentence words if (target_word in words) and len(words) > 1: temp_ngrams = find_ngrams( words, n) # get all of sentence ngram candidates for ngram in temp_ngrams: if target_word in ngram: # get target_word`s candidates for te in ngram: if te != target_word: Related_words[te] = 1 # key(te)value=1 print ext_print("Identified all related words") # Referee candidate words to reduce the size of loaded file into memory if fin3 is None or fin3 == "": return False candidate_words = {} for fin3_each in fin3.split(";"): test_data = ufile.read_csv(fin3_each) # a specific file or a directory for i in range(len(test_data)): can_words = ast.literal_eval( test_data[i][2]) # parse string to array for can_word in can_words: if can_word[0] not in candidate_words: candidate_words[can_word[0]] = 1 print ext_print("Identified all candidate words") # read Google 1T corpus print ext_print("start to load Google 1T grams") Goole_grams, count, max_fre, c1, c2 = {}, 0, 0, 0, 0 if fin2 is None or fin2 == "": return False fid = open(fin2, 'r') for line in fid: line = line.lower() count += 1 if count % 10000000 == 0: print count if len(line) > 0: tem = line.split('\t') '''if len(tem) > 1: if tem[0] not in Goole_grams: Goole_grams[tem[0]] = tem[1] if long(tem[1]) > max_fre: # reduce ordering calculations max_fre = long(tem[1])''' if len(tem) == 1: c1 += 1 if len(tem) > 1: c2 += 1 temws = tem[0].split() find_candidate, find_related = False, False # reduce memory usage for temw in temws: if temw in candidate_words: find_candidate = True elif temw in Related_words: find_related = True if find_candidate and find_related: 
Goole_grams[tem[0]] = tem[1] if long(tem[1]) > max_fre: # reduce ordering calculations max_fre = long(tem[1]) fid.close() print count print("c1=%d,c2=%d" % (c1, c2)) print ext_print("all files loaded") # max_fre = max(map(float, Goole_grams.values())) # reduce memory usage if max_fre == 0: print ext_print("Data error! please check!") return else: print ext_print("Total number is %d" % len(Goole_grams)) lemmatizer = WordNetLemmatizer() #betas = [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1] # betas = [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35] betas = [0.5] m, t, p = 0, 0, 0 for beta in betas: # read candidate words for fin3_each in fin3.split(";"): candidate_words = ufile.read_csv( fin3_each) # a specific file or a directory ranked_result = [] for i in xrange(len(orig_texts)): text = orig_texts[i] can_words = ast.literal_eval( candidate_words[i][2]) # parse string to array words = text[1].lower().split() target_word = text[2].split('.')[0] # print target_word if (target_word in words) and len(words) > 1: candiate_ngrams, temp_ngrams = [], find_ngrams( words, n) # get ngram candidates for ngram in temp_ngrams: if target_word in ngram: candiate_ngrams.append( (ngram, ngram.index(target_word))) ranks = {} for can_word in can_words: can_word, can_word_value, fre_can_word, max_context = can_word[ 0], float( can_word[1] ), 0.0, 0.0 # can_word is candidate_word,can_word[0] is delete value just key lemma_can_word = lemmatizer.lemmatize(can_word) for ( ngram, k ) in candiate_ngrams: # k is the site of target_word lst = list(ngram) le_lst = list(ngram) lst[k] = can_word can_context = ' '.join( lst ) # candidate_word replace ngram target_word le_lst[k] = lemma_can_word le_context = ''.join(le_lst) t += 1 if can_context in Goole_grams: m += 1 fre_can_word = float(Goole_grams[can_context]) max_context = max(max_context, fre_can_word) elif le_context in Goole_grams: p += 1 fre_can_word = float(Goole_grams[can_context]) 
max_context = max(max_context, fre_can_word) # change strategies for calculating 1gram, 2gram, 3gram, or their combination ranks[can_word] = ( 1 - beta) * can_word_value + beta * math.sqrt( max_context / float(max_fre)) sorted_ranks = sorted(ranks.items(), key=operator.itemgetter(1), reverse=True) # sort by rank value ranked_result.append((text[0], text[2], sorted_ranks)) # print ranked_result else: ranked_result.append((text[0], text[2], can_words)) # get output data directory fout1 = fin3_each.replace( ".csv", "_Rank" + str(n) + "gram+" + str(beta) + ".csv") ufile.write_csv(fout1, ranked_result) print ext_print('saved result into: %s' % fout1) return True
def extract_variables (fdin, ffea, ffea2, var):
    """Extract numeric variable expressions from eligibility criteria via Valx.

    fdin  -- CSV of trials; column 1 holds the eligibility criteria text
    ffea  -- CSV of domain-knowledge features (header row keyed "Variable name")
    ffea2 -- CSV of UMLS features (can be replaced by full UMLS)
    var   -- a single variable name to extract, or "All"
    Returns True on success, False on missing/empty inputs.
    """
    # read input data
    if fdin is None or fdin =="":
        return False
    trials = ufile.read_csv (fdin)
    if trials is None or len(trials) <= 0:
        print ext_print ('input data error, please check either no such file or no data --- interrupting')
        return False
    print ext_print ('found a total of %d data items' % len(trials))

    # read feature list - domain knowledge
    if ffea is None or ffea =="":
        return False
    fea_dict_dk = ufile.read_csv_as_dict_with_multiple_items (ffea)
    if fea_dict_dk is None or len(fea_dict_dk) <= 0:
        print ext_print ('no feature data available --- interrupting')
        return False

    # get feature info: `features` maps variable name -> feature row,
    # `feature_dict_dk` maps each '|'-separated alias -> canonical name
    if var == "All":
        features = fea_dict_dk
        del features["Variable name"]  # drop the header row
    elif var in fea_dict_dk:
        features = {var:fea_dict_dk[var]}
    for key, value in fea_dict_dk.iteritems():
        names = value[0].lower().split('|')
        for name in names:
            if name.strip() != '':
                feature_dict_dk[name.strip()] =key

    # read feature list - UMLS (can be replaced by full UMLS)
    if ffea2 is None or ffea2 =="":
        return False
    fea_dict_umls = ufile.read_csv_as_dict (ffea2)
    if fea_dict_umls is None or len(fea_dict_umls) <= 0:
        print ext_print ('no feature data available --- interrupting')
        return False

    #load numeric feature list
    Valx_core.init_features()
    output = []
    for i in xrange(len(trials)):
        if i%1000 == 0:
            print ('processing %d' % i)  # progress heartbeat
        # pre-processing eligibility criteria text
        text = Valx_core.preprocessing(trials[i][1]) # trials[i][1] is the eligibility criteria text
        (sections_num, candidates_num) = Valx_core.extract_candidates_numeric(text) # extract candidates containing numeric features
        for j in xrange(len(candidates_num)): # for each candidate
            exp_text = Valx_core.formalize_expressions(candidates_num[j]) # identify and formalize values
            (exp_text, key_ngrams) = Valx_core.identify_variable(exp_text, feature_dict_dk, fea_dict_umls) # identify variable mentions and map them to names
            (variables, vars_values) = Valx_core.associate_variable_values(exp_text)
            all_exps = []
            for k in xrange(len(variables)):
                curr_var = variables[k]
                curr_exps = vars_values[k]
                if curr_var in features:
                    # fea_list columns: [1] context terms, [2] exclusion terms,
                    # [3] unit table, [4]/[5] plausible min/max bounds --
                    # presumably; verify against the feature CSV layout
                    fea_list = features[curr_var]
                    curr_exps = Valx_core.context_validation(curr_exps, fea_list[1], fea_list[2])
                    curr_exps = Valx_core.normalization(fea_list[3], curr_exps) # unit conversion and value normalization
                    curr_exps = Valx_core.hr_validation (curr_exps, float(fea_list[4]), float(fea_list[5])) # heuristic rule-based validation
                if len(curr_exps) > 0:
                    # keep expressions for the requested variable (or all)
                    if var == "All" or var.lower() == curr_var.lower() or var.lower() in curr_var.lower():
                        all_exps += curr_exps
            if len(all_exps) > 0:
                # strip Python 2 unicode prefixes from the printable list
                output.append((trials[i][0], sections_num[j], candidates_num[j], exp_text, str(all_exps).replace("u'", "'"))) # output result

    # output result
    fout = os.path.splitext(fdin)[0] + "_exp_%s.csv" % var
    ufile.write_csv (fout, output)
    print ext_print ('saved processed results into: %s' % fout)
    return True
def Extract_nonGT(fdin, fout, fin_, fout_, c):
    """Run gender extraction over trials and also collect up to `c`
    non-transgender trials that are absent from a reference list.

    fdin  -- input .txt or .csv file of trials
    fout  -- output path for extraction results (derived when None/empty)
    fin_  -- reference file; its first column lists trial IDs to exclude
    fout_ -- output path for the sampled non-transgender trials
    c     -- maximum number of sampled trials to collect
    Returns True on completion, False/None on input errors.
    """
    #----------------------------------initialize and load supporting data
    # read input data
    all_texts = []
    if fdin is None or fdin == "":
        return False
    elif fdin.endswith(".txt"):
        # plain-text mode: one extraction result per line
        all_texts = ufile.load_files(fdin)
        if all_texts is None or len(all_texts) <= 0:
            print ext_print('input data error, please check either no such file or no data --- interrupting')
            return
        print ext_print('found a total of %d data items' % len(all_texts))
        output = []
        for text in all_texts:
            text = text.lower()
            result = GAXer_Ggender(text)
            output.append(result)
        # output result
        if (fout is None) or (fout == ""):
            fout = os.path.splitext(fdin)[0] + "_gender.txt"
        ufile.write_file(fout, output, False)
    elif fdin.endswith(".csv"):
        # CSV mode: columns are presumably
        # (trial id, gender label, title, summary, description, criteria) --
        # verify against the extraction CSVs produced upstream
        all_texts = ufile.load_files(fdin) # a specific file or a directory
        all_texts_ = ufile.load_files(fin_) # a specific file or a directory
        if all_texts is None or len(all_texts) <= 0:
            print ext_print('input data error, please check either no such file or no data --- interrupting')
            return
        print ext_print('found a total of %d data items' % len(all_texts))
        output = []
        output_ = []
        i = 0
        cnt = 0
        # cho/j/jump drive the (currently disabled) random sampling below
        cho = 0
        j = 100
        jump = int(j * random.random()) + 2
        goadList = {}  # trial IDs from the reference file, used as a set
        for t in all_texts_:
            goadList[t[0]] = 1
        for texts in all_texts:
            if i % 1000 == 0:
                print ext_print('processing %d' % i)  # progress heartbeat
            # if str(texts[0])<>'NCT00002967':
            #     continue
            i += 1
            cop = texts
            # keep only the inclusion part of the criteria text
            inclusive = texts[5].lower()
            inclusive = inclusive[0:inclusive.find('exclusi')]
            combine_texts = texts[2].lower() + ". " + texts[3].lower() + ". " + texts[4].lower() + ". " + inclusive
            pre_label = 'Biological ' + texts[1][0].upper() + texts[1][1:]
            result = GAXer_Ggender(combine_texts, pre_label)
            '''
            if 'Transgender' not in str(result):
                FindSame = texts[0] in goadList.keys()
                if not FindSame:
                    if cho==jump:
                        output_.append((cop[0],cop[1],cop[2],cop[3],cop[4],cop[5]))
                        cnt+=1
                        jump=int(j*random.random())+2
                        cho=0
                    cho+=1
            '''
            # collect non-transgender trials not already in the reference list
            if 'Transgender' not in str(result):
                FindSame = texts[0] in goadList.keys()
                if not FindSame:
                    output_.append((cop[0], cop[1], cop[2], cop[3], cop[4], cop[5]))
                    cnt += 1
            if cnt == c:
                break  # collected enough samples
            # skip rows where extraction added nothing beyond the prior label
            if len(result) == 0 or (len(texts[1]) > 0 and len(result) == 1 and pre_label in result):
                continue
            else:
                t = texts[0]
                t = t.replace('"', '')
                t = str(t)
                output.append((t, texts[1], str(result)))
        # output result
        if (fout is None) or (fout == ""):
            fout = os.path.splitext(fdin)[0] + "_gender.csv"
        ufile.write_csv(fout, output)
        ufile.write_csv(fout_, output_)
        print ext_print('saved processed results into: %s' % fout)
    print ext_print('all tasks completed\n')
    return True
def GAXer_wrapper(fdin, fout=None): #----------------------------------initialize and load supporting data # read input data all_texts = [] if fdin is None or fdin == "": return False elif fdin.endswith(".txt"): all_texts = ufile.load_files(fdin) if all_texts is None or len(all_texts) <= 0: print ext_print( 'input data error, please check either no such file or no data --- interrupting' ) return print ext_print('found a total of %d data items' % len(all_texts)) output = [] for text in all_texts: text = text.lower() result = GAXer_Ggender(text) output.append(result) # output result if (fout is None) or (fout == ""): fout = os.path.splitext(fdin)[0] + "_gender.txt" ufile.write_file(fout, output, False) elif fdin.endswith(".csv"): all_texts = ufile.load_files(fdin) # a specific file or a directory if all_texts is None or len(all_texts) <= 0: print ext_print( 'input data error, please check either no such file or no data --- interrupting' ) return print ext_print('found a total of %d data items' % len(all_texts)) output = [] i = 0 for texts in all_texts: if i % 1000 == 0: print ext_print('processing %d' % i) i += 1 # if str(texts[0])<>'NCT00002967': # continue inclusive = texts[5].lower() inclusive = inclusive[0:inclusive.find('exclusi')] # combine_texts = texts[2].lower() + ". " + texts[3].lower() + ". " + texts[4].lower() + ". " + inclusive combine_texts = texts[3].lower() + ". " + texts[4].lower( ) + ". 
" + inclusive pre_label = 'Biological ' + texts[1][0].upper() + texts[1][1:] result = GAXer_Ggender(combine_texts, pre_label) # print result # if len(result)==0 or (len(texts[1])>0 and len(result)==1 and pre_label in result): if len(result) == 0: continue else: t = texts[0] t = t.replace('"', '') t = str(t) output.append((t, texts[1], str(result))) # output result if (fout is None) or (fout == ""): fout = os.path.splitext(fdin)[0] + "_gender.csv" ufile.write_csv(fout, output) print ext_print('saved processed results into: %s' % fout) print ext_print('all tasks completed\n') return True
def compare_all(fin1, fdin2, fdin3): # read input data if fin1 is None or fin1 =="": return False texts = ufile.read_csv(fin1) # a specific file or a directory # read input data if fdin2 is None or fdin2 =="": return False EDBlist = ufile.load_files (fdin2) # a specific file or a directory # read input data if fdin3 is None or fdin3 =="": return False FreCorpus = ufile.read_file_dict_tokenized(fdin3, '\t') result = [] words_sims = {} cur = 0 for text in texts: cur += 1 if len(text[2].split('.')) > 1: target_word, pos = text[2].split('.')[0], text[2].split('.')[1] else: target_word, pos = text[2], None print "%d of %d" % (cur, len(texts)), target_word simi_values = [] if target_word not in words_sims: processed = [] processed.append(target_word) # step 1 ============== can_words =[] syn = wordnet.synsets(target_word) if len(syn) > 0: for l in syn[0].lemmas(): if l.name() not in can_words: can_words.append(l.name()) word_fre = {} for word_each in can_words: if word_each in EDBlist and word_each not in processed: word_each_fre = 0 if (word_each in FreCorpus): word_each_fre = int(FreCorpus[word_each]) word_fre[word_each] = word_each_fre processed.append(word_each) word_fre = sorted(word_fre.items(), key=operator.itemgetter(1), reverse=True) # sort by rank value simi_values.extend(word_fre) # step 2 ============== can_words =[] syn = wordnet.synsets(target_word) if len(syn) > 0: syn_word = syn[0].hypernyms() for l in syn_word: if (l.pos() in ['v', 'n', 'a']): for k in l.lemmas(): if k.name() not in can_words: can_words.append(k.name()) word_fre = {} for word_each in can_words: if word_each in EDBlist and word_each not in processed: word_each_fre = 0 if (word_each in FreCorpus): word_each_fre = int(FreCorpus[word_each]) word_fre[word_each] = word_each_fre processed.append(word_each) word_fre = sorted(word_fre.items(), key=operator.itemgetter(1), reverse=True) # sort by rank value simi_values.extend(word_fre) # step 3 ============== can_words =[] for syn in 
wordnet.synsets(target_word): for l in syn.lemmas(): if l.name() not in can_words: can_words.append(l.name()) word_fre = {} for word_each in can_words: if word_each in EDBlist and word_each not in processed: word_each_fre = 0 if (word_each in FreCorpus): word_each_fre = int(FreCorpus[word_each]) word_fre[word_each] = word_each_fre processed.append(word_each) word_fre = sorted(word_fre.items(), key=operator.itemgetter(1), reverse=True) # sort by rank value simi_values.extend(word_fre) # step 4 ============== can_words =[] for syn in wordnet.synsets(target_word): syn_word = syn.hypernyms() for l in syn_word: if (l.pos() in ['v', 'n', 'a']): for k in l.lemmas(): if k.name() not in can_words: can_words.append(k.name()) word_fre = {} for word_each in can_words: if word_each in EDBlist and word_each not in processed: word_each_fre = 0 if (word_each in FreCorpus): word_each_fre = int(FreCorpus[word_each]) word_fre[word_each] = word_each_fre processed.append(word_each) word_fre = sorted(word_fre.items(), key=operator.itemgetter(1), reverse=True) # sort by rank value simi_values.extend(word_fre) #================================= words_sims[target_word] = simi_values print simi_values[:2] else: simi_values = words_sims[target_word] result.append((text[0], text[2], simi_values)) # output result fout = os.path.splitext(fin1)[0] + "_4steps.csv" ufile.write_csv(fout, result) print 'saved result into: %s' % fout print ext_print ('all tasks completed\n') return True
def compare_all(fin1, fdin2, method, threasholds):
    """Rank EDB words against each target word by WordNet synset similarity.

    fin1        -- CSV with gold data in column 3 ("word:freq;..." items) and
                   the target word ("word.pos") in column 2
    fdin2       -- EDB word list (candidate vocabulary)
    method      -- similarity method name passed to compare_allsynsets
    threasholds -- ';'-separated similarity cut-offs; one output CSV is
                   written per (method, threshold) pair
    Returns True on success, False on missing inputs.
    """
    # read input data
    if fin1 is None or fin1 == "":
        return False
    texts = ufile.read_csv(fin1)  # a specific file or a directory
    # read input data
    if fdin2 is None or fdin2 == "":
        return False
    EDBlist = ufile.load_files(fdin2)  # a specific file or a directory
    threasholds = threasholds.split(';')
    # filter out candidates that share a stem or root form with the target word
    porter_stemmer = PorterStemmer()
    wnl = WordNetLemmatizer()
    # gold/fre accumulate the gold words and their frequencies;
    # NOTE(review): they are filled but never read afterwards
    gold, fre = [], []
    for threashold in threasholds:
        result = []
        words_sims = {}  # cache: target word -> ranked candidate list
        start_time = time.time()
        cur = 0
        for text in texts:
            cur += 1
            # column 3 holds "word:frequency" items separated by ';'
            for i in range(len(text[3].split(";"))):
                fre.append(text[3].split(";")[i].split(":")[1])
                gold.append(text[3].split(";")[i].split(":")[0])
            if len(text[2].split('.')) > 1:
                target_word, pos = text[2].split('.')[0], text[2].split('.')[1]
                stemming_tw = porter_stemmer.stem(target_word)
                lemma_tw = wordnet.morphy(target_word, pos=pos)
                #lemma_tw = wnl.lemmatize(target_word, pos)
                print lemma_tw
            else:
                target_word, pos = text[2], None
                stemming_tw = porter_stemmer.stem(target_word)
                lemma_tw = wordnet.morphy(target_word)
                #lemma_tw = wnl.lemmatize(target_word, pos)
            print("%d of %d" % (cur, len(texts)), target_word)
            simi_values = []
            if target_word not in words_sims:
                word_sim = {}
                for word2 in EDBlist:
                    stemming_cw = porter_stemmer.stem(word2)
                    lemma_word = wordnet.morphy(word2)
                    if word2 not in word_sim:
                        #if target_word !=word2:
                        # skip the target word itself and same-stem/same-lemma words
                        if target_word != word2 and stemming_cw != stemming_tw and lemma_word != lemma_tw:
                            # simi_value=compare_allsynsets(method, target_word, word2, TWpos, SYNpos, pos)
                            # simi_value = compare_allsynsets(method, target_word, word2, TWpos, pos)
                            # simi_value = compare_allsynsets(method, target_word, word2, SYNpos)
                            simi_value = compare_allsynsets(method, target_word, word2)
                            if simi_value > float(threashold):
                                word_sim[word2] = round(float(simi_value), 3)
                simi_values = sorted(word_sim.items(), key=operator.itemgetter(1), reverse=True) # sort by rank value
                words_sims[target_word] = simi_values
            else:
                simi_values = words_sims[target_word]
            result.append((text[0], text[2], simi_values))
        print("--- %s seconds ---" % (time.time() - start_time))
        # output result
        fout = os.path.splitext(fin1)[0] + "_%s_%s.csv" % (method, threashold)
        # if SYNpos:
        #     fout = fout.replace(".csv", "_SYNpos.csv")
        # if TWpos:
        #     fout = fout.replace(".csv", "_TWpos.csv")
        ufile.write_csv(fout, result)
        print('saved result into: %s' % fout)
        print(ext_print('all tasks completed\n'))
    return True