def word_decompose():
    # def build_model():
    #     '''use neural network'''
    #     dir_path = '../segmentation/'
    #     frozen_graph_filename = os.path.join(dir_path, 'models/seg_model_140.pbtxt')
    #     vocab_path = os.path.join(dir_path, 'dictionary/basic_vocab.txt')
    #     user_dict_path = os.path.join('./usr', 'word_two_three.txt')
    #     seg = ChineseSegment(frozen_graph_filename, vocab_path, user_dict_path)
    #     return seg

    source_file = os.path.join('./usr', 'word_.txt')
    word_fre_pair = hf.read_word_fre(source_file)
    word_list = [w for w, fre in word_fre_pair]

    # keep the two- and three-character words and dump them as a jieba user dictionary
    two_three_len_word_list = [w for w in word_list if len(w) in [2, 3]]
    hf.write_data(
        os.path.join('./usr', 'word_two_three.txt'),
        ('{}\t{}'.format(w, int(1000)) for w in two_three_len_word_list))

    # seg = build_model()
    # result = seg.segment(word_list)
    # for ele in result:
    #     print(ele)

    print(len(two_three_len_word_list), len(word_list))

    # add the short words into jieba so they are kept intact during cutting
    for word in two_three_len_word_list:
        jieba.add_word(word)

    # cut every word with jieba and collect the distinct sub-words
    result_set = set()
    for word in word_list:
        for ele in jieba.cut(word):
            result_set.add(ele)
    hf.write_data(os.path.join('./usr', 'short_word.txt'), result_set)
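def _jieba_decompose_demo():
    """Illustration only, not part of the pipeline: registering a short word
    first makes jieba keep it intact when cutting a longer word that contains
    it. The example words are made up; the exact split of the longer word
    still depends on jieba's dictionary and HMM settings."""
    jieba.add_word('洗衣机')
    return list(jieba.cut('全自动洗衣机'))  # expected to contain '洗衣机' as one token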
def find_frequency_pattern(source_file, MI_entropy_path, neighbor_entropy_path,
                           detail_information_path):
    string_list = read_source_file(source_file)
    # approcimate_total_word_num = approcimate_total_world_num(string_list)
    approcimate_total_word_num = len(string_list)
    print('total word len', approcimate_total_word_num)

    # collect every distinct n-gram (up to max_n_gram) and count its frequency
    distict_substring_list_gen = get_all_distinct_substring(string_list, max_n_gram)
    pattern_dict = count_pattern_frequency(distict_substring_list_gen)

    # score candidates by mutual information and write them out, highest first
    MI_dict, detail_dict = calculate_mutual_entropy(pattern_dict,
                                                    approcimate_total_word_num)
    sort_entropy = sorted(MI_dict.items(), key=lambda x: x[1], reverse=True)
    write_data(MI_entropy_path,
               ('\t'.join([key, str(ent)]) for key, ent in sort_entropy))

    # score candidates by left/right neighbor entropy (free degree)
    entropy_dict = calculate_free_degree(string_list, MI_dict)
    sort_entropy = sorted(entropy_dict.items(), key=lambda x: x[1], reverse=True)
    write_data(neighbor_entropy_path,
               ('\t'.join([key, str(left), str(right)])
                for key, (left, right) in sort_entropy))

    del string_list, pattern_dict, sort_entropy
    combine_detail_information(detail_dict, entropy_dict)
    write_detail(detail_information_path, detail_dict)
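# A minimal sketch of the two scores used above, assuming pattern_dict maps an
# n-gram to its raw count and `total` approximates the corpus size.
# calculate_mutual_entropy and calculate_free_degree are defined elsewhere in
# this repo; the functions below only show the standard formulas used for
# unsupervised new-word discovery and may differ in detail from the actual
# implementation.
import math
from collections import Counter


def pmi_sketch(word, pattern_dict, total):
    """Lowest pointwise mutual information over all binary splits of `word`."""
    p = lambda s: pattern_dict.get(s, 0) / total
    scores = []
    for i in range(1, len(word)):
        left, right = word[:i], word[i:]
        if p(left) and p(right) and p(word):
            scores.append(math.log(p(word) / (p(left) * p(right))))
    return min(scores) if scores else 0.0


def neighbor_entropy_sketch(word, string_list):
    """Entropy of the characters seen immediately left/right of `word`."""
    left, right = Counter(), Counter()
    for line in string_list:
        start = line.find(word)
        while start != -1:
            if start > 0:
                left[line[start - 1]] += 1
            end = start + len(word)
            if end < len(line):
                right[line[end]] += 1
            start = line.find(word, start + 1)

    def entropy(counter):
        n = sum(counter.values())
        return -sum(c / n * math.log(c / n) for c in counter.values()) if n else 0.0

    return entropy(left), entropy(right)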
def delete_word_from_file():
    delete_word_list = hf.read_data(os.path.join('./usr', 'delete_product.txt'))
    word_fre_pair = hf.read_word_fre(os.path.join('./usr', 'word.txt'))
    print('before delete word len', len(word_fre_pair))
    word_fre_pair = [(w, fre) for w, fre in word_fre_pair
                     if w not in delete_word_list]
    print('after delete word len', len(word_fre_pair))
    hf.write_data(os.path.join('./usr', 'word_.txt'),
                  ('{}\t{}'.format(w, fre) for w, fre in word_fre_pair))
def decompose(input_p, output_p):
    word_list = read_file(input_p)
    len_of_two_and_three = [w for w in word_list if len(w) in [2, 3]]
    for word in len_of_two_and_three:
        jieba.add_word(word)
    result_set = set()
    for word in word_list:
        for ele in jieba.cut(word):
            result_set.add(ele)
    hf.write_data(output_p, result_set)
def start_tag(source_file_p, product_file_p, base_word_file_p, output_dir):
    product_list = hf.read_data(product_file_p)
    word_list = hf.read_data(base_word_file_p)
    source_list = hf.read_data(source_file_p)
    print('product_list len:{}'.format(len(product_list)))
    print('word_list len:{}'.format(len(word_list)))
    print('source_list (to be tagged) len:{}'.format(len(source_list)))

    # sentences = segment_source(string_list, word_list)
    # hf.write_data(os.path.join(data_dir, 'source_segement.txt'), sentences)

    postive_list, negative_list, segement_list = tag_source(
        source_list, product_list, word_list)
    hf.write_data(os.path.join(output_dir, 'source_tag.txt'), postive_list)
    hf.write_data(os.path.join(output_dir, 'source_tag_negative.txt'),
                  negative_list)
    hf.write_data(os.path.join(output_dir, 'source_segement.txt'), segement_list)
    write_tag2file(Tag_BIESO, os.path.join(output_dir, 'tag_vocab.txt'))
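# A minimal sketch of the BIESO scheme that tag_source presumably produces,
# assuming Tag_BIESO covers B (begin), I (inside), E (end), S (single-character)
# and O (outside). bieso_span() is illustrative only, not a repo function.
def bieso_span(name_len):
    """Character tags for a product name of `name_len` characters."""
    if name_len == 1:
        return ['S']
    return ['B'] + ['I'] * (name_len - 2) + ['E']

# e.g. a 3-character product name inside a sentence contributes the tags
# ['B', 'I', 'E'] while every character outside a product name is tagged 'O'.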
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'test_file',
        help='the segmented sentence file you want to recognize product names in',
        action='store')
    parser.add_argument('--recognize_result',
                        help='output path of the recognition result',
                        action='store',
                        dest='output_recognize_result',
                        default=os.path.join(sub_dir_path, 'recognize_result.txt'))
    parser.add_argument('--product_name_result',
                        help='output path of the recognized product names',
                        action='store',
                        dest='output_product_name_result',
                        default=os.path.join(sub_dir_path, 'discover_word.txt'))
    parser.add_argument('--frozen_graph',
                        help='recognition model path',
                        action='store',
                        dest='model_path',
                        default=os.path.join(root_dir, 'usr', 'training_model',
                                             'product_model.pbtxt'))
    parser.add_argument('--char_vec_index',
                        help='char vector index file path',
                        dest='char_index_path',
                        default=os.path.join(root_dir, 'usr', 'training_model',
                                             'char_vec_index.txt'))
    parser.add_argument('--word_vec_index',
                        help='word vector index file path',
                        dest='word_index_path',
                        default=os.path.join(root_dir, 'usr', 'training_model',
                                             'word_vec_index.txt'))
    parser.add_argument('--tag_vob',
                        help='tag scheme index path',
                        dest='tag_vob_path',
                        default=os.path.join(root_dir, 'usr', 'training_model',
                                             'tag_vocab.txt'))
    args = parser.parse_args()

    hf.check_dir_exist(sub_dir_path)
    input_list = load_unrecongnition_file(args.test_file)
    recognition_result_list = run_product_recognition(input_list, args.model_path,
                                                      args.char_index_path,
                                                      args.word_index_path,
                                                      args.tag_vob_path)
    print('input len :{} output len :{}'.format(len(input_list),
                                                len(recognition_result_list)))
    product_name_set = fetch_product_name(recognition_result_list)
    hf.write_data(args.output_product_name_result, product_name_set)
    write_recongnition_result(args.output_recognize_result,
                              recognition_result_list)
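# Example invocation, assuming this module is run directly (the script name
# below is a placeholder; only the positional `test_file` argument is required,
# the remaining paths default to files under usr/training_model):
#
#   python recognize_product.py usr/source_segement.txt \
#       --recognize_result usr/recognize_result.txt \
#       --product_name_result usr/discover_word.txt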