import argparse
import os

import jieba

# `hf` and the other undeclared names used below (read_source_file, write_data,
# tag_source, max_n_gram, ...) come from the project's own modules in the
# original files.


def word_decompose():
    # def build_model():
    # 	'''use neural network'''
    # 	dir_path = '../segmentation/'
    # 	frozen_graph_filename = os.path.join(dir_path, 'models/seg_model_140.pbtxt')
    # 	vocab_path = os.path.join(dir_path, 'dictionary/basic_vocab.txt')
    # 	user_dict_path = os.path.join('./usr', 'word_two_three.txt')
    # 	seg = ChineseSegment(frozen_graph_filename, vocab_path, user_dict_path)

    # 	return seg
    source_file = os.path.join('./usr', 'word_.txt')
    word_fre_pair = hf.read_word_fre(source_file)
    word_list = [w for w, fre in word_fre_pair]
    two_three_len_word_list = [w for w in word_list if len(w) in [2, 3]]
    # assign every 2/3-character word a default frequency of 1000
    hf.write_data(os.path.join('./usr', 'word_two_three.txt'),
                  ('{}\t{}'.format(w, 1000)
                   for w in two_three_len_word_list))

    # seg = build_model()
    # result = seg.segment(word_list)
    # for ele in result:
    # 	print(ele)

    print('2/3-character words: {} / total words: {}'.format(
        len(two_three_len_word_list), len(word_list)))
    # add the 2/3-character words to jieba's user dictionary
    for word in two_three_len_word_list:
        jieba.add_word(word)

    # re-segment every word with the enriched dictionary and collect the pieces
    result_set = set()
    for word in word_list:
        result_set.update(jieba.cut(word))
    hf.write_data(os.path.join('./usr', 'short_word.txt'), result_set)
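
# `hf` is the project's helper-function module and is not part of this listing.
# The sketch below only illustrates the behaviour the calls above assume
# (plain UTF-8 text files, tab-separated `word<TAB>frequency` lines); the
# `sketch_` names are hypothetical and avoid clashing with the real module.
import codecs


def sketch_read_data(path):
    # read a text file into a list of stripped, non-empty lines
    with codecs.open(path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]


def sketch_read_word_fre(path):
    # parse `word<TAB>frequency` lines into (word, int(frequency)) pairs
    return [(w, int(fre))
            for w, fre in (line.split('\t')
                           for line in sketch_read_data(path))]


def sketch_write_data(path, iterable):
    # write one item per line
    with codecs.open(path, 'w', encoding='utf-8') as f:
        for item in iterable:
            f.write('{}\n'.format(item))
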
def find_frequency_pattern(source_file, MI_entropy_path, neighbor_entropy_path,
                           detail_information_path):
    string_list = read_source_file(source_file)

    # approcimate_total_word_num = approcimate_total_world_num(string_list)
    approximate_total_word_num = len(string_list)
    print('approximate total word count', approximate_total_word_num)
    distinct_substring_list_gen = get_all_distinct_substring(
        string_list, max_n_gram)
    pattern_dict = count_pattern_frequency(distinct_substring_list_gen)
    MI_dict, detail_dict = calculate_mutual_entropy(
        pattern_dict, approximate_total_word_num)
    sort_entropy = sorted(MI_dict.items(), key=lambda x: x[1], reverse=True)
    write_data(MI_entropy_path,
               ('\t'.join([key, str(ent)]) for key, ent in sort_entropy))

    entropy_dict = calculate_free_degree(string_list, MI_dict)
    sort_entropy = sorted(entropy_dict.items(),
                          key=lambda x: x[1],
                          reverse=True)
    write_data(neighbor_entropy_path,
               ('\t'.join([key, str(left), str(right)])
                for key, (left, right) in sort_entropy))

    del string_list, pattern_dict, sort_entropy
    combine_detail_information(detail_dict, entropy_dict)
    write_detail(detail_information_path, detail_dict)
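
# calculate_free_degree is defined elsewhere in the project. As a rough
# illustration of the left/right neighbor ("free degree") entropy it is
# expected to return per candidate word, here is a minimal sketch; the real
# candidate selection, counting, and smoothing may well differ.
import math
from collections import Counter


def sketch_neighbor_entropy(string_list, candidates):
    left_counts = {w: Counter() for w in candidates}
    right_counts = {w: Counter() for w in candidates}
    for s in string_list:
        for w in candidates:
            # count the characters immediately left/right of every occurrence
            start = s.find(w)
            while start != -1:
                if start > 0:
                    left_counts[w][s[start - 1]] += 1
                end = start + len(w)
                if end < len(s):
                    right_counts[w][s[end]] += 1
                start = s.find(w, start + 1)

    def entropy(counter):
        total = float(sum(counter.values()))
        if not total:
            return 0.0
        return -sum(c / total * math.log(c / total)
                    for c in counter.values())

    # a string that occurs in many different contexts gets high entropy on
    # both sides, which is what marks it as a plausible stand-alone word
    return {w: (entropy(left_counts[w]), entropy(right_counts[w]))
            for w in candidates}
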
def delete_word_from_file():
    delete_word_set = set(
        hf.read_data(os.path.join('./usr', 'delete_product.txt')))
    word_fre_pair = hf.read_word_fre(os.path.join('./usr', 'word.txt'))
    print('word count before deletion', len(word_fre_pair))
    word_fre_pair = [(w, fre) for w, fre in word_fre_pair
                     if w not in delete_word_set]
    print('word count after deletion', len(word_fre_pair))
    hf.write_data(os.path.join('./usr', 'word_.txt'),
                  ('{}\t{}'.format(w, fre) for w, fre in word_fre_pair))
def decompose(input_p, output_p):
    word_list = read_file(input_p)
    # keep only the 2/3-character words; they act as segmentation anchors
    two_three_len_words = [w for w in word_list if len(w) in [2, 3]]

    for word in two_three_len_words:
        jieba.add_word(word)

    result_set = set()
    for word in word_list:
        result_set.update(jieba.cut(word))

    hf.write_data(output_p, result_set)
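
# Quick illustration of why the 2/3-character words are registered first: once
# a short word is in jieba's dictionary, longer strings tend to split along it.
# '保温杯' / '不锈钢保温杯' are made-up examples and the exact segmentation
# depends on jieba's bundled dictionary.
def sketch_jieba_demo():
    jieba.add_word('保温杯')           # register a 3-character word
    print(jieba.lcut('不锈钢保温杯'))  # e.g. ['不锈钢', '保温杯']
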
def start_tag(source_file_p, product_file_p, base_word_file_p, output_dir):

    product_list = hf.read_data(product_file_p)
    word_list = hf.read_data(base_word_file_p)
    source_list = hf.read_data(source_file_p)
    print('product_list len:{}'.format(len(product_list)))
    print('word_list len:{}'.format(len(word_list)))
    print('source_list (to be tagged) len:{}'.format(len(source_list)))
    # sentences = segment_source(string_list, word_list)
    # hf.write_data(os.path.join(data_dir, 'source_segement.txt'), sentences)

    positive_list, negative_list, segment_list = tag_source(
        source_list, product_list, word_list)

    hf.write_data(os.path.join(output_dir, 'source_tag.txt'), positive_list)
    hf.write_data(os.path.join(output_dir, 'source_tag_negative.txt'),
                  negative_list)
    hf.write_data(os.path.join(output_dir, 'source_segement.txt'),
                  segment_list)
    write_tag2file(Tag_BIESO, os.path.join(output_dir, 'tag_vocab.txt'))
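
# Tag_BIESO and tag_source are defined elsewhere in the project. To make the
# tagging scheme concrete, the sketch below shows character-level BIESO tags
# for a single known product span; the real tag_source also builds the
# negative/segmented lists and handles multiple matches.
def sketch_bieso_tags(sentence, product):
    tags = ['O'] * len(sentence)
    start = sentence.find(product)
    if start != -1:
        if len(product) == 1:
            tags[start] = 'S'                     # single-character product
        else:
            tags[start] = 'B'                     # begin
            for i in range(start + 1, start + len(product) - 1):
                tags[i] = 'I'                     # inside
            tags[start + len(product) - 1] = 'E'  # end
    return list(zip(sentence, tags))
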
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'test_file',
        help='the segmented sentence file in which to recognize product names',
        action='store')

    parser.add_argument('--recognize_result',
                        help='output path of the recognition result',
                        action='store',
                        dest='output_recognize_result',
                        default=os.path.join(sub_dir_path,
                                             'recognize_result.txt'))

    parser.add_argument('--product_name_result',
                        help='output path of the recognized product names',
                        action='store',
                        dest='output_product_name_result',
                        default=os.path.join(sub_dir_path,
                                             'discover_word.txt'))

    parser.add_argument('--frozen_graph',
                        help='recognition model (frozen graph) path',
                        action='store',
                        dest='model_path',
                        default=os.path.join(root_dir, 'usr', 'training_model',
                                             'product_model.pbtxt'))

    parser.add_argument('--char_vec_index',
                        help='char vector index file path',
                        dest='char_index_path',
                        default=os.path.join(root_dir, 'usr', 'training_model',
                                             'char_vec_index.txt'))

    parser.add_argument('--word_vec_index',
                        help='word vector index file path',
                        dest='word_index_path',
                        default=os.path.join(root_dir, 'usr', 'training_model',
                                             'word_vec_index.txt'))

    parser.add_argument('--tag_vob',
                        help='tag vocabulary (tagging scheme) file path',
                        dest='tag_vob_path',
                        default=os.path.join(root_dir, 'usr', 'training_model',
                                             'tag_vocab.txt'))

    args = parser.parse_args()
    hf.check_dir_exist(sub_dir_path)

    input_list = load_unrecongnition_file(args.test_file)
    recognition_result_list = run_product_recognition(input_list,
                                                      args.model_path,
                                                      args.char_index_path,
                                                      args.word_index_path,
                                                      args.tag_vob_path)

    print('input len :{} output len :{}'.format(len(input_list),
                                                len(recognition_result_list)))
    product_name_set = fetch_product_name(recognition_result_list)

    hf.write_data(args.output_product_name_result, product_name_set)
    write_recongnition_result(args.output_recognize_result,
                              recognition_result_list)
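
# The original module presumably ends with the usual entry-point guard; a
# typical invocation (script and file names here are illustrative) would be:
#   python recognize_product.py segmented_sentences.txt \
#       --recognize_result ./usr/recognize_result.txt
if __name__ == '__main__':
    main()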