def get_instances(self, mashup_id_instances, api_id_instances):
    """Build the model input tuple for the given (mashup, api) id pairs.

    :param mashup_id_instances: 1-D list of mashup ids (one per sample)
    :param api_id_instances:    1-D list of api ids, aligned with the mashups
    :return: tuple of np.ndarrays:
             (mashup_ids, api_ids,
              encoded mashup descriptions, encoded api descriptions,
              padded mashup categories, padded api categories)
    """
    pd = process_data(self.base_dir, False)
    mashup_id2info = pd.get_mashup_api_id2info('mashup')
    api_id2info = pd.get_mashup_api_id2info('api')

    # Category (tag) information for every mashup/api in the batch.
    mashup_categories = [get_mashup_api_allCategories('mashup', mashup_id2info, mashup_id, self.Category_type)
                         for mashup_id in mashup_id_instances]
    api_categories = [get_mashup_api_allCategories('api', api_id2info, api_id, self.Category_type)
                      for api_id in api_id_instances]

    # Lazily build the encoded texts so a pre-trained text/tag model can
    # reuse the same embedding without re-encoding up front.
    if self.encoded_texts is None:
        self.process_text()

    examples = (
        np.array(mashup_id_instances),
        np.array(api_id_instances),
        np.array(self.encoded_texts.get_texts_in_index(mashup_id_instances, 'keras_setting', 0)),
        # api texts are offset by num_users so mashup/api ids share one index space
        np.array(self.encoded_texts.get_texts_in_index(api_id_instances, 'keras_setting', self.num_users)),
        np.array(self.encoded_texts.get_texts_in_index(mashup_categories, 'self_padding')),
        np.array(self.encoded_texts.get_texts_in_index(api_categories, 'self_padding'))
    )
    return examples
def get_instances(self, mashup_id_instances, api_id_instances, mashup_only=False):
    """Build the sample arrays this model needs from id lists.

    Usable for both train and test samples, but expects flat 1-D id lists,
    so test instances must be split into 1-D lists first.

    :param mashup_id_instances: 1-D list of mashup ids
    :param api_id_instances:    1-D list of api ids, aligned with the mashups
    :param mashup_only: if True, return only the mashup-side arrays
                        (encoded descriptions + padded categories)
    :return: tuple of np.ndarrays (2 elements when mashup_only, else 4)
    """
    pd = process_data(self.base_dir, False)
    mashup_id2info = pd.get_mashup_api_id2info('mashup')
    mashup_categories = [get_mashup_api_allCategories('mashup', mashup_id2info, mashup_id, self.Category_type)
                         for mashup_id in mashup_id_instances]

    if mashup_only:
        # api-side info is not needed at all in this branch, so skip
        # building api_id2info/api_categories entirely.
        return (
            np.array(self.encoded_texts.get_texts_in_index(mashup_id_instances, 'keras_setting', 0)),
            np.array(self.encoded_texts.get_texts_in_index(mashup_categories, 'self_padding')),
        )

    api_id2info = pd.get_mashup_api_id2info('api')
    api_categories = [get_mashup_api_allCategories('api', api_id2info, api_id, self.Category_type)
                      for api_id in api_id_instances]
    return (
        np.array(self.encoded_texts.get_texts_in_index(mashup_id_instances, 'keras_setting', 0)),
        # api texts are offset by num_users so mashup/api ids share one index space
        np.array(self.encoded_texts.get_texts_in_index(api_id_instances, 'keras_setting', self.num_users)),
        np.array(self.encoded_texts.get_texts_in_index(mashup_categories, 'self_padding')),
        np.array(self.encoded_texts.get_texts_in_index(api_categories, 'self_padding'))
    )
def __init__(self, base_dir, mf_embedding_dim, mf_fc_unit_nums):
    """Common state for recommendation models.

    :param base_dir: root directory of the processed data
    :param mf_embedding_dim: MF embedding size — kept here because DHSR/NCF
        both contain an MF part; text-only models may leave it empty/unused
    :param mf_fc_unit_nums: unit counts of the MF fully-connected layers
    """
    self.base_dir = base_dir
    self.mf_embedding_dim = mf_embedding_dim
    self.mf_fc_unit_nums = mf_fc_unit_nums

    # Derive the user/item counts from the unsplit data set.
    self.pd = process_data(self.base_dir, False)
    mashup_index2name = self.pd.get_mashup_api_index2name('mashup')
    api_index2name = self.pd.get_mashup_api_index2name('api')
    self.num_users = len(mashup_index2name)
    self.num_items = len(api_index2name)
def __init__(self, data_dir, split_mannner, train_ratio=0.7, num_negatives=6):  # ,valid_ratio
    """Dataset splitter.

    Cross-validation is easy when splitting by mashup, but hard when
    splitting by a ratio of (mashup, api) pairs.

    :param data_dir: directory for reading/writing data files
    :param split_mannner: split strategy — by mashup, or by a ratio of pairs
        (NOTE(review): "mannner" typo is part of the public signature; kept)
    :param train_ratio: training share — a mashup ratio when splitting by
        mashup, a pair ratio when splitting by pairs
    :param num_negatives: negative samples per positive pair
    """
    self.data_dir = data_dir
    self.split_mannner = split_mannner
    # NOTE(review): attribute name "train_radio" is a typo for train_ratio,
    # but other code may read it — kept unchanged.
    self.train_radio = train_ratio
    # self.valid_ratio = valid_ratio
    self.num_negatives = num_negatives

    self.pd = process_data(self.data_dir)  # the unsplit data-set object
    self.result_path = self.data_dir + r'/split_data/' + self.split_mannner

    self.train_mashup_api_list = []
    self.test_mashup_api_list = []
def get_instances(self, mashup_id_instances, api_id_instances):
    """Build the model input tuple for the given (mashup, api) id pairs.

    :param mashup_id_instances: 1-D list of mashup ids (one per sample)
    :param api_id_instances:    1-D list of api ids, aligned with the mashups
    :return: tuple of np.ndarrays:
             (mashup_ids, api_ids,
              encoded mashup descriptions, encoded api descriptions,
              padded mashup categories, padded api categories)
    """
    pd = process_data(self.base_dir, False)
    mashup_id2info = pd.get_mashup_api_id2info('mashup')
    api_id2info = pd.get_mashup_api_id2info('api')

    # Category (tag) information for every mashup/api in the batch.
    mashup_categories = [get_mashup_api_allCategories('mashup', mashup_id2info, mashup_id, self.Category_type)
                         for mashup_id in mashup_id_instances]
    api_categories = [get_mashup_api_allCategories('api', api_id2info, api_id, self.Category_type)
                      for api_id in api_id_instances]

    # Consistency fix: the sibling get_instances lazily initializes the
    # encoded texts; without this guard the calls below fail when
    # self.encoded_texts is still None.
    if self.encoded_texts is None:
        self.process_text()

    examples = (
        np.array(mashup_id_instances),
        np.array(api_id_instances),
        np.array(self.encoded_texts.get_texts_in_index(mashup_id_instances, 'keras_setting', 0)),
        # api texts are offset by num_users so mashup/api ids share one index space
        np.array(self.encoded_texts.get_texts_in_index(api_id_instances, 'keras_setting', self.num_users)),
        np.array(self.encoded_texts.get_texts_in_index(mashup_categories, 'self_padding')),
        np.array(self.encoded_texts.get_texts_in_index(api_categories, 'self_padding'))
    )
    return examples
def get_data(self): """ 获得文本,统计词汇,对文本用词index重新编码,获得词(index)的embedding :return: """ pd = process_data(self.data_dir, False) mashup_descriptions, api_descriptions, mashup_categories, api_categories = pd.get_all_texts( ) self.num_mashup = len(mashup_descriptions) self.num_api = len(api_descriptions) # 整合文本 for index in range(self.num_mashup): for i in range(self.tag_coefficient): mashup_descriptions[index] += mashup_categories[index] for index in range(self.num_api): for i in range(self.tag_coefficient): api_descriptions[index] += api_categories[index] # 统计字符串 并统计IDF word2DF = {} # 词的出现mashup/api 的set word->set word_count = 0 for text_index in range(self.num_mashup): # 记录的是mashup inedx mashup_text = mashup_descriptions[text_index].split() word_count += len(mashup_text) for word in mashup_text: if word not in self.stopwords and word not in self.word2inedx.keys( ): # ???去标点符号?? word2DF[word] = set() # word2DF和word2inedx的key是同步更新的 self.word2inedx[word] = len( self.word2inedx) # 词到index的索引,新词加在末尾 word2DF[word].add(text_index) for text_index in range(self.num_api): # 记录的是mashup index api_text = api_descriptions[text_index].split() word_count += len(api_text) true_index = text_index + self.num_mashup for word in api_text: if word not in self.stopwords and word not in self.word2inedx.keys( ): word2DF[word] = set() self.word2inedx[word] = len(self.word2inedx) # 词到index的索引 word2DF[word].add(true_index) # 将mashup_descriptions 转化为 word index的形式 self.mashup_descriptions = [[ self.word2inedx.get(word) for word in text.split() ] for text in mashup_descriptions] self.api_descriptions = [[ self.word2inedx.get(word) for word in text.split() ] for text in api_descriptions] # print(mashup_descriptions) # self. 
# print(api_descriptions) # print(self.word2inedx) # 计算IDF num_all_texts = self.num_mashup + self.num_api self.average_len = word_count / num_all_texts self.wordindex2IDF = { self.word2inedx.get(word): log(num_all_texts / len(existed_docs)) for word, existed_docs in word2DF.items() } # 获得每个词的embedding: id->array embedding = get_embedding(self.embedding_name, self.embedding_dim) self.wordindex2embedding = { self.word2inedx.get(word): get_word_embedding(embedding, self.embedding_name, word, self.embedding_dim, initize='random') for word in word2DF.keys() }