Example #1
def convert(input_path, output_path):
    with open(input_path) as file:
        lines = file.readlines()
    processed_news = []
    for line in lines:
        news_piece = {}
        # each line: "<sina_id>\t<total:count> <emotion:votes> ...\t<text>"
        sina_id, emotions, text = line.split('\t')
        emotions = emotions.split(' ')
        news_piece['total_votes'] = int(emotions[0].split(':')[1])
        emotion_vec = []
        for e_text in emotions[1:]:
            e_type, e_votes = e_text.split(':')
            emotion_vec.append(int(e_votes))
        max_vote = max(emotion_vec)
        if emotion_vec.count(max_vote) > 1:
            # multiple emotions with highest votes. can't label, skip this entry
            continue
        news_piece['label'] = emotion_vec.index(max_vote)
        news_piece['emotions'] = emotion_vec
        text = remove_redundant(text)
        news_piece['text'] = text
        processed_news.append(news_piece)
    function.write_json_file(output_path, processed_news)
    print(
        f"Finished preprocessing {input_path}. {len(processed_news)} entries. "
        f"Saved at {output_path}.")
    return processed_news
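A self-contained sketch of the line format `convert` expects, with a made-up id, emotion names, and vote counts (the real Sina field names may differ):

# hypothetical input line: "<sina_id>\t<total:count> <emotion:votes> ...\t<text>"
sample = "news_001\ttotal:10 joy:6 anger:3 sadness:1\tSome news text."

sina_id, emotions, text = sample.split('\t')
votes = emotions.split(' ')
emotion_vec = [int(v.split(':')[1]) for v in votes[1:]]
assert emotion_vec == [6, 3, 1]
# the label is the index of the single highest-voted emotion
assert emotion_vec.index(max(emotion_vec)) == 0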
Example #2
def init_char(char_path):
    char_table = {}
    with open(char_path, encoding="gbk") as file:
        line = file.readline()
    for char in line:
        char_table[char] = True
    function.write_json_file(CHAR_TABLE_PATH, char_table)
    print("init char table done.")
Example #3
def build_embedding(word2id, ori_emb_path):
    embedding = {}
    with open(ori_emb_path) as file:
        lines = file.readlines()
        # header line of the embedding file: "<word_count> <dimension>"
        embedding['dimension'] = int(lines[0].split(' ')[1])
        # one zero row per vocabulary id; rows for words found in the file are reassigned below
        emb_list = [[0.0] * embedding['dimension']] * len(word2id)
        for line in lines[1:]:
            line_list = line.strip().split(' ')
            if line_list[0] in word2id:
                emb_list[word2id[line_list[0]]] = list(
                    map(float, line_list[1:]))
    embedding['list'] = emb_list
    function.write_json_file(EMBEDDING_PATH, embedding)
    print(f"Embedding built. Saved at {EMBEDDING_PATH}.")
Example #4
def build_vocabulary(texts):
    word_count = {}
    for sentence in texts:
        word_list = sentence.split(' ')
        for word in word_list:
            if word in word_count:
                word_count[word] += 1
            else:
                word_count[word] = 1
    function.write_json_file(WORD_COUNT_PATH, word_count)
    vocab_list = list(word_count.items())
    vocab_list.sort(key=lambda x: x[1], reverse=True)
    vocab_list.extend([(UNKNOWN, 0), (PADDING, 0)])
    word2id = {}
    for idx, word in enumerate(vocab_list):
        word2id[word[0]] = idx
    function.write_json_file(WORD2ID_PATH, word2id)
    print(f"{len(word_count)} words in vocabulary. "
          f"Saved at {WORD_COUNT_PATH} and {WORD2ID_PATH}.")
    return word2id
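A toy run of the id assignment above: ids follow descending frequency, and the UNKNOWN/PADDING sentinels (placeholder string values here; the real constants are defined elsewhere in the project) receive the two largest ids:

UNKNOWN, PADDING = "<unk>", "<pad>"  # placeholder values for illustration

word_count = {"cat": 3, "dog": 1, "fish": 2}
vocab_list = sorted(word_count.items(), key=lambda x: x[1], reverse=True)
vocab_list.extend([(UNKNOWN, 0), (PADDING, 0)])
word2id = {word: idx for idx, (word, _) in enumerate(vocab_list)}
assert word2id == {"cat": 0, "fish": 1, "dog": 2, "<unk>": 3, "<pad>": 4}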
Example #5
def init_pinyin(pinyin_path):
    pinyin2char = {}
    char2pinyin = {}
    # read file and generate a pinyin2char dict without homograph
    with open(pinyin_path, encoding="gbk") as file:
        lines = file.readlines()
    for line in lines:
        line_arr = line.strip().split(" ")
        pinyin2char[line_arr[0]] = line_arr[1:]
    # homograph init
    for pinyin, chars in pinyin2char.items():
        pinyin_new_chars = []
        for char in chars:
            if char in char2pinyin:
                char2pinyin[char][pinyin] = len(char2pinyin[char])
            else:
                char2pinyin[char] = {pinyin: 0}
            pinyin_new_chars.append(char + str(char2pinyin[char][pinyin]))
        pinyin2char[pinyin] = pinyin_new_chars
    function.write_json_file(HOMO_DIC_PATH, char2pinyin)
    function.write_json_file(PINYIN2CHAR_PATH, pinyin2char)
    print("init pinyin done.")
Example #6
def train(folder_path, n, model_name):
    model = Model(n)

    def train_file(file_path):
        data = function.read_json_file(file_path)
        for sentence in list(data):
            # add (n_gram - 1) 'bb' to the beginning of the sentence and 'ee' to the end
            sentence = ('bb' * (n - 1)) + sentence + 'ee'
            model.train(sentence)

    all_files_paths = os.listdir(folder_path)
    for rel_path in all_files_paths:
        path = folder_path + "/" + rel_path
        print(f"Begin training with {path}")
        try:
            train_file(path)
        except UnicodeDecodeError:
            print("Illegal file, skipping.")
            continue
        print(f"Training with {path} finished.")
    save_path = MODEL_PATH + f"/{model_name}.json"
    function.write_json_file(save_path, model.to_dict())
    print(f"Training finished. Model saved as {save_path}")
Example #7
def gen_test_set(file_path, test_count=500, only_long_sentence=True):
    homo_dic = function.read_json_file(HOMO_DIC_PATH)
    all_sentences = function.read_json_file(file_path)
    all_length = len(all_sentences)
    test_index = random.sample(range(0, all_length), test_count)
    answers = []
    inputs = []
    char_count = 0
    for index in test_index:
        sentence = all_sentences[index]
        length = len(sentence)
        if only_long_sentence and length < 10:
            continue
        chars = [sentence[i] for i in range(0, length, 2)]
        pinyin_ids = [int(sentence[i]) for i in range(1, length, 2)]
        pinyins = []
        for char, pinyin_id in zip(chars, pinyin_ids):
            for dic_pinyin, dic_id in homo_dic[char].items():
                if dic_id == pinyin_id:
                    pinyins.append(dic_pinyin)
                    break  # stop at the first matching pinyin id
        answers.append(''.join(chars) + '\n')
        inputs.append(' '.join(pinyins) + '\n')
        char_count += len(chars)
    new_all_sentences = []
    # delete test from training file
    for index, sentence in enumerate(all_sentences):
        if index not in test_index:
            new_all_sentences.append(sentence)

    function.write_json_file(file_path, new_all_sentences)
    with open(TEST_INPUT, "a") as file:
        file.writelines(inputs)
    with open(TEST_ANSWER, "a", encoding='gbk') as file:
        file.writelines(answers)
    print(
        f"Generated a test set with {len(inputs)} sentences and {char_count} characters. "
        f"Test input added at {TEST_INPUT}. Answers added at {TEST_ANSWER}.")
Example #8
def process_file(file_path, cnt, batch_name):
    all_sentences = []
    if batch_name == 'sina':
        with open(file_path, encoding="gbk") as file:
            lines = file.readlines()
        for line in lines:
            news_piece = json.loads(line)
            title = news_piece["title"]
            content = news_piece["html"]
            all_sentences += cut_sentences(title)
            all_sentences += cut_sentences(content)
    elif batch_name == 'weixin':
        with open(file_path) as file:
            lines = file.readlines()
        length = len(lines)
        for line_index in range(0, length, 3):  # keep every third line, i.e. 1/3 of the weixin corpus
            content = json.loads(lines[line_index])['content']
            all_sentences += cut_sentences(content, ignore_number=True)
    sentences_with_pinyin = []
    for sentence in all_sentences:
        sentences_with_pinyin.append(label_homo(sentence))
    save_path = TRAINING_DATA_PATH + f"/{batch_name}-{cnt}.json"
    function.write_json_file(save_path, sentences_with_pinyin)
    print(f"{file_path} processed. Saved as {save_path}")