# -*- coding: utf-8 -*-
import codecs
import datetime
import json
import logging
import os
import re

import nltk

logger = logging.getLogger(__name__)


def compare_all(data_manual, data_auto):
    """Compare every definition in the manually tagged data against the
    automatically tagged data, keyed by (item index, definition index)."""
    def2result = {}
    for i in range(len(data_manual)):
        concept_manual, pronunciation_manual, pos2definition_manual = extract_item_properties(data_manual[i])
        concept_auto, pronunciation_auto, pos2definition_auto = extract_item_properties(data_auto[i])
        for j in range(len(pos2definition_manual)):
            attributes_manual = pos2definition_manual[j]["attributes"]
            attributes_auto = pos2definition_auto[j]["attributes"]
            key_similarity, attribute2value_similarity = attribute_compare(attributes_manual, attributes_auto)
            # locate the item and definition by position
            position = (i, j)
            def2result[position] = (key_similarity, attribute2value_similarity)
    return def2result

def compare_all_different(data_manual, data_auto):
    """Write every definition whose manual and automatic attribute sets
    differ to output/attribute_diff.txt."""
    path_project = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
    path_diff = path_project + os.sep + "output" + os.sep + "attribute_diff.txt"
    fp_diff = codecs.open(path_diff, 'w', 'utf-8')
    for i in range(len(data_manual)):
        concept_manual, pronunciation_manual, pos2definition_manual = extract_item_properties(data_manual[i])
        concept_auto, pronunciation_auto, pos2definition_auto = extract_item_properties(data_auto[i])
        for j in range(len(pos2definition_manual)):
            attributes_manual = pos2definition_manual[j]["attributes"]
            attributes_auto = pos2definition_auto[j]["attributes"]
            manual_diff, auto_diff = attribute_different(attributes_manual, attributes_auto)
            if len(manual_diff) != 0 or len(auto_diff) != 0:
                write_error((i, j), data_manual, data_auto, fp_diff)
    fp_diff.close()

def compare_all_by_definition(data_manual, data_auto):
    """Variant of compare_all that keys each result by the definition text
    instead of its (item, definition) position."""
    def2result = {}
    for i in range(len(data_manual)):
        concept_manual, pronunciation_manual, pos2definition_manual = extract_item_properties(data_manual[i])
        concept_auto, pronunciation_auto, pos2definition_auto = extract_item_properties(data_auto[i])
        for j in range(len(pos2definition_manual)):
            attributes_manual = pos2definition_manual[j]["attributes"]
            attributes_auto = pos2definition_auto[j]["attributes"]
            similarity, attribute2value_similarity = attribute_compare(attributes_manual, attributes_auto)
            def2result[pos2definition_manual[j]["definition"]] = (similarity, attribute2value_similarity)
    return def2result

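# A minimal usage sketch for the comparison helpers above. The two input file
# names are hypothetical, load_json is the loader defined elsewhere in this
# project, and attribute_compare is assumed to return numeric similarities.
def _example_compare_usage():
    path_project = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
    data_manual = load_json(path_project + os.sep + "input" + os.sep + "items_tagged_manual.json")
    data_auto = load_json(path_project + os.sep + "input" + os.sep + "items_tagged_auto.json")
    # average key similarity over every (item, definition) position
    def2result = compare_all(data_manual, data_auto)
    key_similarities = [result[0] for result in def2result.values()]
    print "mean key similarity: %f" % (sum(key_similarities) / float(len(key_similarities)))
    # write the definitions whose attribute sets disagree to attribute_diff.txt
    compare_all_different(data_manual, data_auto)
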
def extract_items_single_thread_timed(data, pattern2attribute, tagger):
    """Instrumented variant of extract_items_single_thread below: same
    extraction loop, plus per-definition timing written to the log."""
    data_new = []
    all_time = 0
    for item in data:
        concept, pronunciation, pos2definition = extract_item_properties(item)
        for pos2def in pos2definition:
            definition = pos2def["definition"]
            # strip parenthesised asides before matching patterns
            definition_pure = re.sub(r'\([\s\S]*?\)', "", definition)
            start = datetime.datetime.now()
            attributes2value = process_definition(definition_pure, pattern2attribute, tagger)
            end = datetime.datetime.now()
            elapsed = end - start
            # accumulate elapsed time in milliseconds
            all_time += elapsed.seconds * 1000 + elapsed.microseconds // 1000
            logger.info('process_definition time: %ds %dms' % (elapsed.seconds, elapsed.microseconds // 1000))
            pos2def["attributes"] = attributes2value
        logger.info("\n\n")
        data_new.append(item)
    logger.info("tag all time is: %d" % tag_time_all)
    logger.info("find candidate time is: %d" % find_candidate_time)
    logger.info("all time is: %dms" % all_time)
    return data_new

def upload(data):
    """Create one Neo4j node per concept and link cross_reference attributes
    between nodes (written against the py2neo v2-style Node/Graph API)."""
    from py2neo import Node, Relationship
    from py2neo import Graph
    graph = Graph("http://localhost:7474", username="******", password="******")
    graph.delete_all()
    nodes = []
    for item in data:
        concept, pronunciation, pos2definition = extract_item_properties(item)
        node_tmp = Node("Prosthodontics", name=concept)
        node_tmp["pronunciation"] = pronunciation
        cnt = 1
        for pos2def in pos2definition:
            node_tmp["pos " + str(cnt)] = pos2def["pos"]
            for attribute, value in pos2def["attributes"].iteritems():
                node_tmp["def " + str(cnt) + " : " + attribute] = value
            cnt += 1
        graph.create(node_tmp)
        nodes.append(node_tmp)
    print "nodes created, starting to create relationships"
    for node1 in nodes:
        for property_name in node1.properties.keys():
            # attribute keys look like "def 1 : cross_reference"
            if property_name.endswith("cross_reference"):
                for node2 in nodes:
                    if node2["name"] == node1[property_name]:
                        graph.create(Relationship(node1, "cross_reference", node2))
    print "graph creation finished"

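# Hedged usage sketch for upload(): reuses the items_tagged_modified.json path
# from the tests below and assumes a Neo4j server on localhost:7474 with the
# masked credentials above filled in.
def _example_upload_usage():
    path_project = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
    data = load_json(path_project + os.sep + "input" + os.sep + "items_tagged_modified.json")
    upload(data)
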
def extract_items_all(data, pattern2attribute):
    """Extraction driver that calls process_definition without a Stanford
    tagger, with per-definition timing written to the log."""
    data_new = []
    all_time = 0
    for item in data:
        concept, pronunciation, pos2definition = extract_item_properties(item)
        for pos2def in pos2definition:
            definition = pos2def["definition"]
            definition_pure = re.sub(r'\([\s\S]*?\)', "", definition)
            start = datetime.datetime.now()
            attributes2value = process_definition(definition_pure, pattern2attribute)
            end = datetime.datetime.now()
            elapsed = end - start
            # accumulate elapsed time in milliseconds
            all_time += elapsed.seconds * 1000 + elapsed.microseconds // 1000
            logger.info('process_definition time: %ds %dms' % (elapsed.seconds, elapsed.microseconds // 1000))
            pos2def["attributes"] = attributes2value
        logger.info("\n\n")
        data_new.append(item)
    logger.info("tag all time is: %d" % tag_time_all)
    logger.info("find candidate time is: %d" % find_candidate_time)
    logger.info("all time is: %dms" % all_time)
    return data_new

def modify_data(data):
    """Clean the raw items in place: normalise each concept via
    concept_analysis, drop "See ..." cross-reference definitions and
    non-noun senses, and restore the concept tokens inside each definition."""
    logger.info("starting to transfer the data")
    data_new = []
    cnt_item = 0
    print "data size: %d" % len(data)
    while cnt_item < len(data):
        item = data[cnt_item]
        logger.info('processing the %d item' % cnt_item)
        concept, pronunciation, pos2definition = extract_item_properties(item)
        concept_result = concept_analysis(concept)
        logger.info(concept + " : concept result is: " + str(concept_result) + "\n")
        concept_real = concept_result[0]
        item['concept'] = concept_real
        if len(concept_result[1]) > 0:
            item['abbr'] = concept_result[1]
        for i in range(len(pos2definition) - 1, -1, -1):
            pos2def = pos2definition[i]
            definition = pos2def["definition"]
            def_tokens = nltk.word_tokenize(re.sub(r'\([\s\S]*?\)', "", definition).strip())
            # guard against definitions that are empty after stripping parentheses
            if def_tokens:
                logger.info(def_tokens[0])
            if def_tokens and def_tokens[0] in ['See', 'see']:
                logger.info('concept: %s \n definition: %s \n is removed ' % (concept, definition))
                logger.info('\n')
                pos2definition.remove(pos2def)
                continue
            pos = pos2def["pos"]
            if 'n' not in pos:
                logger.info('concept: %s \n definition: %s \n is removed ' % (concept, definition))
                logger.info('\n')
                pos2definition.remove(pos2def)
                continue
            definition_new = definition_restore(nltk.word_tokenize(concept_real), def_tokens)
            pos2def["definition"] = definition_new
            logger.info("\n" + definition + "\n definition result is: \n" + definition_new)
        if len(pos2definition) == 0:
            # all senses were removed, so drop the whole item; the index is
            # not advanced because the next item shifts into this slot
            data.remove(item)
            logger.info('concept: %s is removed ' % concept)
            logger.info('\n')
            continue
        cnt_item += 1
        data_new.append(item)
    print "items left %d " % cnt_item
    return data_new

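# A sketch of the cleaning step in isolation: load the raw dictionary items,
# let modify_data() drop "See ..." cross-references and non-noun senses, and
# report how many items survive. Only module-level helpers are assumed.
def _example_modify_usage():
    path_project = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
    path_book = path_project + os.sep + "input" + os.sep + "prosthodontic_items_full.json"
    data = json.load(open(path_book, "r"), encoding="utf-8")
    size_before = len(data)
    data_clean = modify_data(data)
    print "kept %d of %d items" % (len(data_clean), size_before)
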
def analysis_data(data):
    cnt = 0
    for item in data:
        concept, pronunciation, pos2definition = extract_item_properties(item)
        if len(pos2definition) > 1:
            for pos2def in pos2definition:
                definition = pos2def["definition"]
                if definition[0:4] in ['See ', 'see ']:
                    print concept
                    cnt += 1
    print cnt

def extract_single_item(data, i, new_data, pattern2attribute, stanford_tagger):
    """Process one item (intended as a thread worker): extract attributes for
    every definition of data[i] and append the item to the shared new_data list."""
    print i, 'start'
    concept, pronunciation, pos2definition = extract_item_properties(data[i])
    for pos2def in pos2definition:
        definition = pos2def["definition"]
        definition_pure = re.sub(r'\([\s\S]*?\)', "", definition)
        attributes2value = process_definition(definition_pure, pattern2attribute, stanford_tagger)
        pos2def["attributes"] = attributes2value
    logger.info("\n\n")
    print i, 'over'
    new_data.append(data[i])

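# A rough sketch of fanning extract_single_item out over threads. One thread
# per item is an assumption for illustration (a pool would be kinder to the
# tagger); list.append is atomic under the GIL, but new_data's order is
# nondeterministic.
def _example_threaded_extraction(data, pattern2attribute):
    import threading
    stanford_tagger = get_tagger()
    new_data = []
    threads = []
    for i in range(len(data)):
        t = threading.Thread(target=extract_single_item,
                             args=(data, i, new_data, pattern2attribute, stanford_tagger))
        threads.append(t)
        t.start()
    for t in threads:
        t.join()
    return new_data
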
def acquire_patterns(data):
    """Learn attribute-extraction patterns: for every tagged attribute value,
    collect the token prefixes that precede it in its definition, filter the
    candidates per attribute, then keep only efficient single-attribute
    patterns."""
    attribute2patterns_can_all = {}
    for item in data:
        concept, pronunciation, pos2definition = extract_item_properties(item)
        for pos2def in pos2definition:
            definition = pos2def['definition']
            text = nltk.word_tokenize(definition)
            definition_pos = nltk.pos_tag(text)
            definition_tokens = get_tokens(definition_pos)
            attributes = pos2def['attributes']
            for attribute_name, attribute_value in attributes.iteritems():
                attribute_value_text = nltk.word_tokenize(attribute_value)
                attribute_value_tokens = nltk.pos_tag(attribute_value_text)
                attribute_tokens = get_tokens(attribute_value_tokens)
                prefixs_tokens2intersect = get_fix(prefix_window_size, attribute_tokens, definition_tokens)
                prefixs = get_combination_fix(prefixs_tokens2intersect)
                if attribute_name in attribute2patterns_can_all:
                    attribute2patterns_can_all[attribute_name].extend(list(set(prefixs)))
                else:
                    attribute2patterns_can_all[attribute_name] = list(set(prefixs))
    attribute2patterns_all = {}
    for attribute, patterns in attribute2patterns_can_all.iteritems():
        print 'attribute: ' + attribute
        logger.info('final attribute: %s\n' % attribute)
        logger.info('patterns original length: ' + str(len(patterns)))
        logger.info('patterns original: %s\n' % str(patterns))
        filter_patterns = get_filter_pattern_seq(patterns)
        attribute2patterns_all[attribute] = filter_patterns
        logger.info('final pattern: %s\n' % str(filter_patterns))
    pattern2single_attribute = filter_efficient(attribute2patterns_all)
    return pattern2single_attribute

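# End-to-end sketch of the pattern pipeline: learn extraction patterns from
# the manually tagged items, then apply them to untagged definitions. The
# file names reuse paths that appear elsewhere in this module; pairing these
# two particular files is an assumption.
def _example_pattern_pipeline():
    path_project = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
    tagged = load_json(path_project + os.sep + "input" + os.sep + "items_tagged_modified.json")
    pattern2attribute = acquire_patterns(tagged)
    raw = load_json(path_project + os.sep + "input" + os.sep + "tagged_items.json")
    return extract_items_single_thread(raw, pattern2attribute, get_tagger())
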
def test6():
    path_project = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
    path_data = path_project + os.sep + "input" + os.sep + "items_tagged_modified.json"
    data = load_json(path_data)
    attributes = set([])
    for item in data:
        concept, pronunciation, pos2definition = extract_item_properties(item)
        for pos2def in pos2definition:
            for attribute in pos2def['attributes'].keys():
                attributes.add(attribute)
    for x in sorted(list(attributes)):
        print x

def test7():
    path_project = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
    path_data = path_project + os.sep + "input" + os.sep + "items_tagged_modified.json"
    data = load_json(path_data)
    # chunk each definition with a simple noun-phrase grammar
    grammar = "NP: {<DT>?<JJ>*<NN>}"
    for item in data:
        concept, pronunciation, pos2definition = extract_item_properties(item)
        for pos2def in pos2definition:
            definition = pos2def['definition']
            tokens = nltk.word_tokenize(definition)
            tagged = nltk.pos_tag(tokens)
            print nltk.RegexpParser(grammar).parse(tagged)

def extract_items_single_thread(data, pattern2attribute, tagger):
    """Single-threaded extraction driver (see extract_items_single_thread_timed
    above for the instrumented variant)."""
    data_new = []
    for item in data:
        concept, pronunciation, pos2definition = extract_item_properties(item)
        for pos2def in pos2definition:
            definition = pos2def["definition"]
            definition_pure = re.sub(r'\([\s\S]*?\)', "", definition)
            attributes2value = process_definition(definition_pure, pattern2attribute, tagger)
            pos2def["attributes"] = attributes2value
        logger.info("\n\n")
        data_new.append(item)
    return data_new

def extract_all_items(data, patterns):
    """Extraction driver that builds its own Stanford tagger and applies the
    given patterns to every definition."""
    data_new = []
    stanford_tagger = get_tagger()
    for item in data:
        concept, pronunciation, pos2definition = extract_item_properties(item)
        for pos2def in pos2definition:
            definition = pos2def["definition"]
            definition_pure = re.sub(r'\([\s\S]*?\)', "", definition)
            attributes2value = process_definition(definition_pure, patterns, stanford_tagger)
            pos2def["attributes"] = attributes2value
        data_new.append(item)
    return data_new

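# Sketch: run the extraction above and persist the enriched items. The output
# file name is hypothetical; json.dumps with ensure_ascii=False keeps any
# non-ASCII pronunciation symbols readable.
def _example_extract_and_save(data, patterns):
    data_new = extract_all_items(data, patterns)
    path_project = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
    fp = codecs.open(path_project + os.sep + "output" + os.sep + "items_extracted.json", 'w', 'utf-8')
    fp.write(json.dumps(data_new, ensure_ascii=False))
    fp.close()
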
def acquire_patterns_verbose(data):
    """Verbose variant of acquire_patterns: logs every intermediate step,
    uses the prefix-only get_fix('pre', ...) interface, and returns the
    per-attribute filtered patterns without the final filter_efficient pass."""
    attribute2patterns_can_all = {}
    for item in data:
        concept, pronunciation, pos2definition = extract_item_properties(item)
        logger.info('concept: %s' % concept)
        for pos2def in pos2definition:
            definition = pos2def['definition']
            logger.info('definition: \n%s' % definition)
            text = nltk.word_tokenize(definition)
            definition_pos = nltk.pos_tag(text)
            logger.info('definition_pos: \n%s' % definition_pos)
            definition_tokens = get_tokens(definition_pos)
            attributes = pos2def['attributes']
            for attribute_name, attribute_value in attributes.iteritems():
                logger.info('attribute_name: %s' % attribute_name)
                attribute_value_text = nltk.word_tokenize(attribute_value)
                attribute_value_tokens = nltk.pos_tag(attribute_value_text)
                logger.info('attributes value token: %s' % attribute_value_tokens)
                attribute_tokens = get_tokens(attribute_value_tokens)
                prefixs_tokens = get_fix('pre', prefix_window_size, attribute_tokens, definition_tokens)
                logger.info('prefixs_token: \n%s' % str([(token.word, token.pos) for tokens in prefixs_tokens for token in tokens]))
                prefixs = get_combination_fix(prefixs_tokens)
                logger.info('prefix: \n%s' % str(prefixs))
                # accumulate the candidate prefixes for this attribute
                if attribute_name in attribute2patterns_can_all:
                    attribute2patterns_can_all[attribute_name].extend(prefixs)
                else:
                    attribute2patterns_can_all[attribute_name] = prefixs
            logger.info('\n')
        logger.info('\n')
    attribute2patterns_all = {}
    for attribute, patterns in attribute2patterns_can_all.iteritems():
        filter_patterns = get_filter_pattern(patterns)
        attribute2patterns_all[attribute] = filter_patterns
        logger.info('final attribute: %s\n' % attribute)
        logger.info('final pattern: %s\n' % str(filter_patterns))
    return attribute2patterns_all

def main():
    path_project = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
    path_book = path_project + os.sep + "input" + os.sep + "prosthodontic_items_full.json"
    path_stop_words = path_project + os.sep + "input" + os.sep + "stop_words"
    path_sent_words_freq = path_project + os.sep + "output" + os.sep + "sent_words_freq.txt"
    data = json.load(open(path_book, "r"), encoding="utf-8")
    definitions = []
    for item in data:
        concept, pronunciation, pos2definition = extract_item_properties(item)
        for pos2def in pos2definition:
            definitions.append(pos2def["definition"])
    uni_thre = 5
    bi_thre = 5
    tri_thre = 3
    uni_filter, bi_filter, tri_filter = filter_ngram(definitions, uni_thre, bi_thre, tri_thre)
    fp = codecs.open(path_sent_words_freq, 'w', 'utf-8')
    stop_words = [line.strip() for line in open(path_stop_words, "r").readlines()]
    ngrams_filter = get_ngram_filter(stop_words, uni_filter, bi_filter, tri_filter, uni_thre, bi_thre, tri_thre)
    del ngrams_filter[0]
    ngrams_filter_words = []
    ngrams_filter_freqs = []
    for ngram_filter in ngrams_filter:
        ngrams_filter_words.append(ngram_filter[0])
        ngrams_filter_freqs.append(ngram_filter[1])
    for definition in definitions:
        line_candidate = get_sent_high_freq_word(definition, ngrams_filter_words, ngrams_filter_freqs)
        fp.write(definition + "\n")
        fp.write(str(line_candidate) + "\n\n")
    fp.close()

def test2():
    path_project = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
    path_data = path_project + os.sep + "input" + os.sep + "tagged_items.json"
    data = load_json(path_data)
    tagger = get_tagger()
    cnt_same_pos_all = 0
    cnt_same_word_all = 0
    for item in data:
        concept, pronunciation, pos2definition = extract_item_properties(item)
        for pos2def in pos2definition:
            definition = pos2def["definition"]
            text = nltk.word_tokenize(definition)
            def_pos1 = tagger.tag(text)
            logger.info(def_pos1)
            def_pos2 = nltk.pos_tag(text)
            logger.info(def_pos2)
            similar, cnt_same_pos, cnt_same_word = compare_similar_pos(def_pos1, def_pos2)
            cnt_same_pos_all += cnt_same_pos
            cnt_same_word_all += cnt_same_word
    print float(cnt_same_pos_all) / cnt_same_word_all

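# Hypothetical entry point: running the module directly executes the n-gram
# frequency analysis in main(); the test helpers are left to manual invocation.
if __name__ == '__main__':
    main()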