Example #1
import pickle
import time

# Optionally restore the preprocessed splits from pickle files:
# with open('../data/train.pkl', 'rb') as f:
#     train = pickle.load(f)
# with open('../data/val.pkl', 'rb') as f:
#     val = pickle.load(f)
# with open('../data/test.pkl', 'rb') as f:
#     test = pickle.load(f)

t = time.time()
train = add_chars(train)
val = add_chars(val)
test = add_chars(test)

words, pos_tag_set, deprel_set, ner_set, labelSet = get_words_labels(train, val, test)
label_index = get_label_index_matrix()
pos_tag_index = get_pos_index_matrix(pos_tag_set)
dep_index = get_deprel_index_matrix(deprel_set)
ner_index = get_ner_index_matrix(ner_set)
word_index, wordEmbeddings = get_word_embedding(config['embedding_file_path'], words)
char_index = get_char_index_matrix()

train_set = create_matrices(train, word_index, label_index, char_index, pos_tag_index, dep_index, ner_index)
validation_set = create_matrices(val, word_index, label_index, char_index, pos_tag_index, dep_index, ner_index)
test_set = create_matrices(test, word_index, label_index, char_index, pos_tag_index, dep_index, ner_index)

batch_size = config['batch_size']
model = get_model(wordEmbeddings, char_index, pos_tag_index, dep_index, ner_index, config)

train_steps, train_batches = create_batches(train_set, batch_size, pos_tag_index, dep_index, ner_index)

idx2Label = {v: k for k, v in label_index.items()}

metric = Metrics(validation_set, idx2Label, pos_tag_index, dep_index, ner_index)
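
The snippet assumes `get_word_embedding` reads a pretrained embedding file (the path comes from `config['embedding_file_path']`) and returns a word-to-index map plus a matching embedding matrix. Its definition is not shown here; a minimal sketch of such a loader for a GloVe-style text file, with all names and the file format being assumptions, might look like this:

import numpy as np

# Hypothetical sketch (name and file format are assumptions): reads a
# GloVe-style text file with one "token v1 v2 ..." entry per line and keeps
# only tokens that occur in the corpus vocabulary.
def load_word_embedding(embedding_file_path, words):
    word_index = {'PADDING_TOKEN': 0, 'UNKNOWN_TOKEN': 1}
    vectors = None
    with open(embedding_file_path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            if vectors is None:
                dim = len(parts) - 1
                vectors = [np.zeros(dim),                        # padding vector
                           np.random.uniform(-0.25, 0.25, dim)]  # unknown-word vector
            if parts[0] in words and parts[0] not in word_index:
                word_index[parts[0]] = len(word_index)
                vectors.append(np.asarray(parts[1:], dtype='float32'))
    return word_index, np.asarray(vectors)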
Example #2
    def get_data(self):
        """
        获得文本,统计词汇,对文本用词index重新编码,获得词(index)的embedding
        :return:
        """
        mashup_descriptions, api_descriptions, mashup_categories, api_categories = \
            meta_data.pd.get_all_texts()
        self.num_mashup = len(mashup_descriptions)
        self.num_api = len(api_descriptions)
        # Merge the category tags into the description texts (repeated tag_coefficient times)
        for index in range(self.num_mashup):
            for i in range(self.tag_coefficient):
                mashup_descriptions[index] += mashup_categories[index]
        for index in range(self.num_api):
            for i in range(self.tag_coefficient):
                api_descriptions[index] += api_categories[index]

        # Build the vocabulary and per-word document sets (for IDF)
        word2DF = {}  # word -> set of mashup/api documents containing it

        word_count = 0
        for text_index in range(self.num_mashup):  # document ids 0..num_mashup-1 are mashups
            if isinstance(mashup_descriptions[text_index], str):
                mashup_text = mashup_descriptions[text_index].split()
            elif isinstance(mashup_descriptions[text_index], list):
                mashup_text = mashup_descriptions[text_index]
            word_count += len(mashup_text)
            for word in mashup_text:
                if word in self.stopwords:  # skip stopwords (punctuation is not stripped here)
                    continue
                if word not in self.word2inedx:  # word2DF and word2inedx keys stay in sync
                    word2DF[word] = set()
                    self.word2inedx[word] = len(self.word2inedx)  # word -> index; new words appended at the end
                word2DF[word].add(text_index)

        for text_index in range(self.num_api):  # api documents are indexed after the mashups
            if isinstance(api_descriptions[text_index], str):
                api_text = api_descriptions[text_index].split()
            elif isinstance(api_descriptions[text_index], list):
                api_text = api_descriptions[text_index]
            word_count += len(api_text)
            true_index = text_index + self.num_mashup
            for word in api_text:
                if word in self.stopwords:
                    continue
                if word not in self.word2inedx:
                    word2DF[word] = set()
                    self.word2inedx[word] = len(self.word2inedx)  # word -> index mapping
                word2DF[word].add(true_index)

        # Re-encode the descriptions as sequences of word indices
        # (stopwords were never indexed, so .get() maps them to None)
        if isinstance(api_descriptions[0], str):
            self.mashup_descriptions = [[self.word2inedx.get(word) for word in text.split()]
                                        for text in mashup_descriptions]
            self.api_descriptions = [[self.word2inedx.get(word) for word in text.split()]
                                     for text in api_descriptions]
        elif isinstance(api_descriptions[0], list):
            self.mashup_descriptions = [[self.word2inedx.get(word) for word in text]
                                        for text in mashup_descriptions]
            self.api_descriptions = [[self.word2inedx.get(word) for word in text]
                                     for text in api_descriptions]

        # print(mashup_descriptions) # self.
        # print(api_descriptions)
        # print(self.word2inedx)

        # Compute IDF: log(total number of documents / document frequency)
        num_all_texts = self.num_mashup + self.num_api
        self.average_len = word_count / num_all_texts
        self.wordindex2IDF = {
            self.word2inedx.get(word): log(num_all_texts / len(existed_docs))
            for word, existed_docs in word2DF.items()
        }

        # Look up the embedding of each word: index -> array
        embedding = get_embedding(self.embedding_name, self.embedding_dim)
        self.wordindex2embedding = {
            self.word2inedx.get(word): get_word_embedding(embedding,
                                                          self.embedding_name,
                                                          word,
                                                          self.embedding_dim,
                                                          initize='random')
            for word in word2DF.keys()
        }
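
The IDF computed above is the standard log(N / DF), where N is the total number of mashup plus api documents and DF is how many of them contain the word. A tiny self-contained check of that formula (toy numbers, not project data):

from math import log

num_all_texts = 4                             # e.g. 2 mashups + 2 apis
word2DF = {'map': {0, 1, 2}, 'weather': {3}}  # word -> documents containing it
idf = {w: log(num_all_texts / len(docs)) for w, docs in word2DF.items()}
print(idf)  # {'map': 0.287..., 'weather': 1.386...}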
Example #3
    return predLabels, correctLabels


train = readfile("../data/train.txt")
validation = readfile("../data/valid.txt")
test = readfile("../data/test.txt")

train = add_chars(train)
validation = add_chars(validation)
test = add_chars(test)

words, labelSet, pos_tag_set = get_words_and_labels(train, validation, test)
label_index = get_label_index_matrix(labelSet)
pos_tag_index, posTagEmbedding = get_pos_tag_embedding(pos_tag_set)
case_index, caseEmbeddings = get_case_embedding()
word_index, wordEmbeddings = get_word_embedding(words)
char_index = get_char_index_matrix()

train_set = padding(
    create_matrices(train, word_index, label_index, case_index, char_index,
                    pos_tag_index))
validation_set = padding(
    create_matrices(validation, word_index, label_index, case_index,
                    char_index, pos_tag_index))
test_set = padding(
    create_matrices(test, word_index, label_index, case_index, char_index,
                    pos_tag_index))

idx2Label = {v: k for k, v in label_index.items()}

train_batch, train_batch_len = create_batches(train_set)
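
`create_batches` is not defined in this excerpt. In tagger pipelines like this one, a common choice is to bucket sentences by token length so that every batch can be fed to the network without further padding; a sketch under that assumption, where each item of `train_set` is a per-sentence list whose first element is the token-index sequence:

# Sketch of a length-bucketing batcher (an assumption; the project's
# actual create_batches may differ).
def create_batches_sketch(data):
    lengths = sorted({len(sample[0]) for sample in data})
    batches, batch_len = [], []
    end = 0
    for n in lengths:
        for sample in data:
            if len(sample[0]) == n:
                batches.append(sample)
                end += 1
        batch_len.append(end)  # cumulative end offset of each length bucket
    return batches, batch_len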