예제 #1
0
 def process_texts(self):
     """
     Encode the description documents only; tags are not handled here.

     Reads ``meta_data.descriptions``, runs it through ``encoding_padding``
     with the configured ``remove_punctuation`` setting, and keeps the
     result on ``self.encoded_texts``.

     :return: None
     """
     documents = meta_data.descriptions
     strip_punctuation = new_Para.param.remove_punctuation
     # The returned object exposes the encoded form of every text.
     self.encoded_texts = encoding_padding(documents, strip_punctuation)
예제 #2
0
파일: baseline.py 프로젝트: ssea-lab/DLISR
def get_default_gd(default_encoding_texts=None, tag_times=2, mashup_only=False, strict_train=False):
    """
    Build a ``gensim_data`` object from the encoded mashup/API texts.

    The un-padded index sequences are cut into four consecutive segments:
    mashup descriptions, API descriptions, mashup categories and API
    categories. Segment boundaries come from ``meta_data.mashup_num`` and
    ``meta_data.api_num``.

    :param default_encoding_texts: optional, already-built encoding object
        exposing ``texts_in_index_nopadding``. When ``None`` a new one is
        created from ``meta_data.descriptions + meta_data.tags`` with
        ``True`` as the second ``encoding_padding`` argument.
    :param tag_times: forwarded to ``gensim_data``; per the original note it
        adjusts how many times the tags appear.
    :param mashup_only: forwarded to ``gensim_data`` unchanged.
    :param strict_train: forwarded to ``gensim_data`` unchanged.
    :return: the constructed ``gensim_data`` instance.
    """
    if default_encoding_texts is None:
        all_texts = meta_data.descriptions + meta_data.tags
        default_encoding_texts = encoding_padding(all_texts, True)

    indexed_texts = default_encoding_texts.texts_in_index_nopadding
    mashup_count = meta_data.mashup_num
    api_count = meta_data.api_num

    # Boundaries of the four back-to-back segments.
    descriptions_end = mashup_count + api_count
    mashup_categories_end = 2 * mashup_count + api_count

    mashup_descriptions = indexed_texts[:mashup_count]
    api_descriptions = indexed_texts[mashup_count:descriptions_end]
    mashup_categories = indexed_texts[descriptions_end:mashup_categories_end]
    api_categories = indexed_texts[mashup_categories_end:]

    return gensim_data(
        mashup_descriptions,
        api_descriptions,
        mashup_categories,
        api_categories,
        tag_times,
        mashup_only=mashup_only,
        strict_train=strict_train,
    )
예제 #3
0
    def process_text(self):
        """
        Encode the description documents only; tags/categories are skipped.

        Fetches all texts via ``self.pd.get_all_texts()``, joins the mashup
        descriptions with the API descriptions (mashups first), and stores
        the ``encoding_padding`` result on ``self.encoded_texts``.

        :return: None
        """
        mashup_docs, api_docs, _mashup_cats, _api_cats = self.pd.get_all_texts()

        # Mashups first, then APIs; the category texts are deliberately unused.
        documents = mashup_docs + api_docs

        # The returned object exposes the encoded form of every text.
        self.encoded_texts = encoding_padding(documents, self.remove_punctuation)
예제 #4
0
 def process_text(self):
     """
     Encode mashup and service texts together, categories included.

     Fetches all texts via ``self.pd.get_all_texts(self.Category_type)``,
     concatenates them and stores the ``encoding_padding`` result on
     ``self.encoded_texts``.

     :return: None
     """
     all_texts = self.pd.get_all_texts(self.Category_type)
     mashup_docs, api_docs, mashup_cats, api_cats = all_texts

     # Order: mashup descriptions, API descriptions, then the categories.
     corpus = mashup_docs + api_docs + mashup_cats + api_cats

     # A disabled debug aid used to dump `corpus` to ../data/all_texts here.

     # The returned object exposes the encoded form of every text.
     self.encoded_texts = encoding_padding(corpus, self.remove_punctuation)
예제 #5
0
    def __init__(self,
                 model_name='PasRec',
                 semantic_mode='HDP',
                 LDA_topic_num=None,
                 epoch_num=15,
                 neighbor_size=15,
                 topTopicNum=3,
                 cluster_mode='LDA',
                 cluster_mode_topic_num=100):
        """
        Configure the recommender and load its stored weights.

        Sets hyper-parameters and uniform meta-path weights, derives the
        model directory, collects the training mashup/API pairs restricted
        to historical mashups, prepares the text/HIN similarity helper,
        groups mashups by topic, and finally calls ``self.read_model()``.

        :param model_name: model variant. ``'IsRec_best'``, ``'PasRec_2path'``
            and ``'IsRec'`` use 3, 2 and 7 path weights respectively; any
            other name uses 6.
        :param semantic_mode: text feature mode handed to ``model_pcs`` and
            ``mashup_HIN_sims`` for the HIN text similarity.
        :param LDA_topic_num: second argument of the HIN ``model_pcs`` call;
            ``None`` is replaced by ``''``.
        :param epoch_num: stored on ``self.epoch_num``; part of the model name.
        :param neighbor_size: size used when looking for nearest neighbours
            (per the original note); part of the model name.
        :param topTopicNum: number of top topics requested from
            ``get_topTopics``. Per the original note, PasRec uses it for the
            content similarity and IsRec for finding neighbours among the
            K clusters.
        :param cluster_mode: first argument of the clustering ``model_pcs``.
        :param cluster_mode_topic_num: second argument of the clustering
            ``model_pcs``.
        """
        self.simple_name = model_name
        self.epoch_num = epoch_num
        self.neighbor_size = neighbor_size
        self.topTopicNum = topTopicNum

        # Every meta-path starts with the same weight: 1 / (number of paths).
        if self.simple_name == 'IsRec_best':
            weights = [1 / 3] * 3
            self.p1_weight, self.p2_weight, self.p3_weight = weights
        elif self.simple_name == 'PasRec_2path':
            weights = [1 / 2] * 2
            self.p1_weight, self.p2_weight = weights
        elif self.simple_name == 'IsRec':
            weights = [1 / 7] * 7
            (self.p1_weight, self.p2_weight, self.p3_weight, self.p4_weight,
             self.p5_weight, self.p6_weight, self.p7_weight) = weights
        else:
            weights = [1 / 6] * 6
            (self.p1_weight, self.p2_weight, self.p3_weight, self.p4_weight,
             self.p5_weight, self.p6_weight) = weights
        self.path_weights = weights

        self.learning_rate = 0.001
        self.reg = 0.001

        if LDA_topic_num is None:
            LDA_topic_num = ''
        # NOTE(review): LDA_topic_num is not one of the name components below.
        name_parts = (model_name, semantic_mode, epoch_num, neighbor_size,
                      topTopicNum, cluster_mode, cluster_mode_topic_num)
        self.model_name = '{}_{}_epoch{}_nbSize{}TopicNum{}{}{}NEW'.format(
            *name_parts)
        # Directory of this model.
        self.model_dir = dataset.crt_ds.model_path.format(self.model_name)
        # The weights are the only data that gets persisted.
        self.weight_path = os.path.join(self.model_dir, 'weights.npy')

        # --- data set ---
        self.all_mashup_num = meta_data.mashup_num
        self.all_api_num = meta_data.api_num
        self.his_m_ids = dataset.crt_ds.his_mashup_ids
        self.his_m_ids_set = set(self.his_m_ids)

        # Strict training set: keep only pairs whose mashup is a historical one.
        history_ids = self.his_m_ids_set
        self.train_mashup_api_list = [
            pair for pair in meta_data.mashup_api_list
            if pair[0] in history_ids
        ]
        pairs_by_mashup = meta_data.pd.get_mashup_api_pair('dict')
        self.train_mashup_api_dict = {
            m_id: apis
            for m_id, apis in pairs_by_mashup.items()
            if m_id in history_ids
        }
        print(len(self.train_mashup_api_dict))

        # Training data as api_id -> set(mashup_ids).
        self.train_aid2mids = {}
        for mashup_id, api_id in self.train_mashup_api_list:
            self.train_aid2mids.setdefault(api_id, set()).add(mashup_id)
        # API ids that occur in the training data.
        self.his_a_ids = list(self.train_aid2mids)
        # Base score for an API that no historical mashup ever invoked
        # (the original note remarks that 0.5 worked poorly).
        self.notInvokeScore = 0

        # --- text / HIN similarity ---
        # Root directory holding the individual HIN similarity files.
        self.HIN_path = os.path.join(self.model_dir, 'HIN_sims')
        self.semantic_mode = semantic_mode
        self.LDA_topic_num = LDA_topic_num
        # Text encoding object over descriptions followed by tags.
        encoded_texts = encoding_padding(
            meta_data.descriptions + meta_data.tags,
            new_Para.param.remove_punctuation)
        # Embedding of every encoded word.
        embedding_matrix = get_embedding_matrix(
            encoded_texts.word2index,
            new_Para.param.embedding_name,
            dimension=new_Para.param.embedding_dim)

        # Both gensim helpers below are built with the same options
        # (no tags added to the texts).
        gd_options = dict(tag_times=0, mashup_only=True, strict_train=True)

        # Text features for the HIN similarity. Per the original note these
        # only matter for IsRec_best; PasRec and IsRec use topics-as-tags or
        # EmbMax for their text similarity instead.
        HIN_gd = get_default_gd(encoded_texts, **gd_options)
        self._mashup_features, self._api_features = HIN_gd.model_pcs(
            self.semantic_mode, self.LDA_topic_num)
        # Helper that computes the HIN similarities from the text features.
        self.mhs = mashup_HIN_sims(
            embedding_matrix,
            encoded_texts,
            semantic_name=self.semantic_mode,
            HIN_path=self.HIN_path,
            features=(self._mashup_features, self._api_features),
            if_text_sem=True,
            if_tag_sem=False)
        # Per mashup id: similarities to historical mashups along each path.
        self.mID2PathSims = {}
        self.HIN_sims_changed_flag = False

        # --- topic clustering ---
        topic_gd = get_default_gd(encoded_texts, **gd_options)
        topic_gd.model_pcs(cluster_mode, cluster_mode_topic_num)
        top_topics = topic_gd.get_topTopics(self.topTopicNum)
        self.m_id2topic, self.a_id2topic = top_topics

        # topic -> mashup ids, i.e. mashups grouped by topic. Per the original
        # note this covers all mashups, training and test alike.
        self.topic2m_ids = {}
        for m_id, topic_indexes in enumerate(self.m_id2topic):
            for topic_index in topic_indexes:
                self.topic2m_ids.setdefault(topic_index, []).append(m_id)

        # Mainly loads the weight parameters.
        self.read_model()
예제 #6
0
 def process_tags(self):
     """
     Encode the tag texts and keep the result on ``self.encoded_tags``.

     :return: None
     """
     tag_texts = meta_data.tags
     strip_punctuation = new_Para.param.remove_punctuation
     # The returned object exposes the encoded form of every text.
     self.encoded_tags = encoding_padding(tag_texts, strip_punctuation)
예제 #7
0
 def process_texts(self):
     """
     Encode the description texts and keep the result on ``self.encoded_texts``.

     :return: None
     """
     description_texts = meta_data.descriptions
     strip_punctuation = new_Para.param.remove_punctuation
     # The returned object exposes the encoded form of every text.
     self.encoded_texts = encoding_padding(description_texts, strip_punctuation)