# val = pickle.load(f) # #with open('../data/test.pkl', 'rb') as f: # test = pickle.load(f) t = time.time() train = add_chars(train) val = add_chars(val) test = add_chars(test) words, pos_tag_set, deprel_set, ner_set, labelSet = get_words_labels(train, val, test) label_index = get_label_index_matrix() pos_tag_index = get_pos_index_matrix(pos_tag_set) dep_index = get_deprel_index_matrix(deprel_set) ner_index = get_ner_index_matrix(ner_set) word_index, wordEmbeddings = get_word_embedding(config['embedding_file_path'],words) char_index = get_char_index_matrix() train_set = create_matrices(train, word_index, label_index, char_index, pos_tag_index, dep_index, ner_index) validation_set = create_matrices(val, word_index, label_index, char_index, pos_tag_index, dep_index, ner_index) test_set = create_matrices(test, word_index, label_index, char_index, pos_tag_index, dep_index, ner_index) batch_size = config['batch_size'] model = get_model(wordEmbeddings, char_index, pos_tag_index, dep_index, ner_index, config) train_steps, train_batches = create_batches(train_set, batch_size, pos_tag_index, dep_index, ner_index) idx2Label = {v: k for k, v in label_index.items()} metric = Metrics(validation_set, idx2Label, pos_tag_index, dep_index, ner_index)
def get_data(self):
    """
    Load mashup/api texts, build a word->index vocabulary, compute per-word
    IDF values, and fetch an embedding vector for every vocabulary word.

    Side effects (attributes set on ``self``):
      num_mashup / num_api  -- number of mashup / api descriptions
      mashup_descriptions / api_descriptions
                            -- texts re-encoded as lists of word indices
                               (stopwords/unknown words become ``None``)
      word2inedx            -- word -> integer index, new words appended at
                               the end (attribute name kept for callers)
      average_len           -- average token count per document
      wordindex2IDF         -- word index -> log(N / document frequency)
      wordindex2embedding   -- word index -> embedding vector

    Raises:
      TypeError -- if a description is neither ``str`` nor ``list``.
    """
    mashup_descriptions, api_descriptions, mashup_categories, api_categories = \
        meta_data.pd.get_all_texts()
    self.num_mashup = len(mashup_descriptions)
    self.num_api = len(api_descriptions)

    # Merge category tags into the description texts; each tag list is
    # appended tag_coefficient times to boost its weight.
    for index in range(self.num_mashup):
        for _ in range(self.tag_coefficient):
            mashup_descriptions[index] += mashup_categories[index]
    for index in range(self.num_api):
        for _ in range(self.tag_coefficient):
            api_descriptions[index] += api_categories[index]

    def _tokens(text):
        # Descriptions may arrive raw (str) or pre-tokenized (list).
        # FIX: the original left the token variable unbound for any other
        # type; fail loudly instead.
        if isinstance(text, str):
            return text.split()
        if isinstance(text, list):
            return text
        raise TypeError('description must be str or list, got %r' % type(text))

    # word -> set of document ids containing the word (document frequency).
    word2DF = {}
    word_count = 0

    def _index_text(tokens, doc_id):
        # Register unseen non-stopword tokens and record document frequency.
        for word in tokens:
            if word in self.stopwords:
                continue
            if word not in self.word2inedx:
                # word2DF and word2inedx keys are kept in sync; new words
                # are appended at the end of the index.
                word2DF[word] = set()
                self.word2inedx[word] = len(self.word2inedx)
            # FIX: document frequency must be updated on every occurrence
            # of the word, not only when it is first seen — otherwise every
            # DF set holds one element and all IDF values collapse to log(N).
            word2DF[word].add(doc_id)

    for text_index in range(self.num_mashup):
        tokens = _tokens(mashup_descriptions[text_index])
        word_count += len(tokens)
        _index_text(tokens, text_index)
    for text_index in range(self.num_api):
        tokens = _tokens(api_descriptions[text_index])
        word_count += len(tokens)
        # api documents are numbered after all mashup documents.
        _index_text(tokens, text_index + self.num_mashup)

    # Re-encode texts as lists of word indices; .get() maps stopwords and
    # unknown words to None.
    # FIX: the original tested api_descriptions[text_index] here — a stale
    # loop variable pointing at the LAST api element — while the `if`
    # branch tested element 0. Test the first element consistently.
    if isinstance(api_descriptions[0], str):
        self.mashup_descriptions = [
            [self.word2inedx.get(word) for word in text.split()]
            for text in mashup_descriptions
        ]
        self.api_descriptions = [
            [self.word2inedx.get(word) for word in text.split()]
            for text in api_descriptions
        ]
    else:
        self.mashup_descriptions = [
            [self.word2inedx.get(word) for word in text]
            for text in mashup_descriptions
        ]
        self.api_descriptions = [
            [self.word2inedx.get(word) for word in text]
            for text in api_descriptions
        ]

    # IDF: log(total documents / documents containing the word).
    num_all_texts = self.num_mashup + self.num_api
    self.average_len = word_count / num_all_texts
    self.wordindex2IDF = {
        self.word2inedx.get(word): log(num_all_texts / len(existed_docs))
        for word, existed_docs in word2DF.items()
    }

    # Embedding lookup per vocabulary word; words missing from the
    # pretrained table are initialized randomly by get_word_embedding.
    embedding = get_embedding(self.embedding_name, self.embedding_dim)
    self.wordindex2embedding = {
        self.word2inedx.get(word): get_word_embedding(
            embedding, self.embedding_name, word, self.embedding_dim,
            initize='random')
        for word in word2DF.keys()
    }
# NOTE(review): this `return` is the tail of a function whose `def` lies
# above this chunk; it is kept verbatim and cannot be documented in full here.
return predLabels, correctLabels

# --- top-level script: read CoNLL-style splits, build index/embedding ---
# --- tables, convert to padded matrices, and create training batches. ---
train = readfile("../data/train.txt")
validation = readfile("../data/valid.txt")
test = readfile("../data/test.txt")

# Attach character-level information to each token of every split.
train = add_chars(train)
validation = add_chars(validation)
test = add_chars(test)

# Collect vocabularies over all splits so lookup tables cover everything.
words, labelSet, pos_tag_set = get_words_and_labels(train, validation, test)

# Symbol -> integer index tables, plus the embedding matrices that the
# model's embedding layers are initialized from.
label_index = get_label_index_matrix(labelSet)
pos_tag_index, posTagEmbedding = get_pos_tag_embedding(pos_tag_set)
case_index, caseEmbeddings = get_case_embedding()
word_index, wordEmbeddings = get_word_embedding(words)
char_index = get_char_index_matrix()

# Convert each split to index matrices and pad to uniform length.
train_set = padding(
    create_matrices(train, word_index, label_index, case_index, char_index,
                    pos_tag_index))
validation_set = padding(
    create_matrices(validation, word_index, label_index, case_index,
                    char_index, pos_tag_index))
test_set = padding(
    create_matrices(test, word_index, label_index, case_index, char_index,
                    pos_tag_index))

# Reverse mapping (index -> label string) for decoding predictions.
idx2Label = {v: k for k, v in label_index.items()}

# Mini-batches over the padded training matrices.
train_batch, train_batch_len = create_batches(train_set)