Example #1
    def __init__(self, archPath, model_weight=[0.25, 0.25, 0.25, 0.25]):
        """
        初始化模型, 初始化一个: 
            lmirBM25Model

        Input:
            archPath: archtectureDataset的数据位置
            modelWeight: list of model weight [BM25, JM, DIR, ABS]
    
        """
        self.archDataset = Arch(annotationFile=archPath)
        self.archDataset.reverseCharForAllContext()

        # generate annotation and corpora list
        self.annIdList = []
        self.corporaList = []
        self.notCutCorporaList = []
        for annotation, content in self.archDataset.anns.items():
            self.annIdList.append(annotation)
            self.corporaList.append(content["cutConcateText"])
            self.notCutCorporaList.append(content["concateText"])

        # model weight [BM25, JM, DIR, ABS]
        self.model_weight = model_weight

        self.model = lmirBm25Model(self.corporaList,
                                   modelWeight=self.model_weight)

    def __init__(self, archPath, modelPath):
        """
        Initialize the model. Builds one model:
            doc2vecModel

        Input:
            archPath: path to the architecture dataset
            modelPath: path to the trained doc2vec model
        """
        self.archDataset = Arch(annotationFile=archPath)
        self.archDataset.reverseCharForAllContext()

        # generate annotation and corpora list
        self.annIdList = []
        self.corporaList = []
        self.notCutCorporaList = []
        for annotation, content in self.archDataset.anns.items():
            self.annIdList.append(annotation)
            self.corporaList.append(content["cutConcateText"])
            self.notCutCorporaList.append(content["concateText"])

        # modelPath
        self.modelPath = modelPath

        self.model = doc2vecModel(self.corporaList, model_path=self.modelPath)
Example #3
    def __init__(self, archPath, modelPath, model_weight=[0.5, 0.5]):
        """
        初始化模型, 分别初始化两个: 
            1. lmirBM25
            2. Doc2Vec

        Input:
            archPath: archtectureDataset的数据位置
            modelPath: doc2vec训练好的模型位置
            modelWeight: list of model weight[staticModel, featureModel]
    
        """
        self.archDataset = Arch(annotationFile=archPath)
        self.archDataset.reverseCharForAllContext()

        # generate annotation and corpora list
        self.annIdList = []
        self.corporaList = []
        self.notCutCorporaList = []
        for annotation, content in self.archDataset.anns.items():
            self.annIdList.append(annotation)
            self.corporaList.append(content["cutConcateText"])
            self.notCutCorporaList.append(content["concateText"])

        # model weight [staticModel, featureModel]
        self.model_weight = model_weight

        self.staticModel = lmirBm25Model(self.corporaList)
        self.featureModel = doc2vecModel(self.corporaList, modelPath)
class archDoc2vecModel():
    def __init__(self, archPath, modelPath):
        """
        初始化模型, 初始化一个: 
            lmirBM25Model

        Input:
            archPath: archtectureDataset的数据位置
            modelPath: doc2vec训练好的模型位置
    
        """
        self.archDataset = Arch(annotationFile=archPath)
        self.archDataset.reverseCharForAllContext()

        # generate annotation and corpora list
        self.annIdList = []
        self.corporaList = []
        self.notCutCorporaList = []
        for annotation, content in self.archDataset.anns.items():
            self.annIdList.append(annotation)
            self.corporaList.append(content["cutConcateText"])
            self.notCutCorporaList.append(content["concateText"])

        # modelPath
        self.modelPath = modelPath

        self.model = doc2vecModel(self.corporaList, model_path=self.modelPath)
    
    def standardization(self, data):
        # min-max scale the scores into [0, 1]; when all values are equal,
        # return the data unchanged to avoid division by zero
        minValue = np.min(data)
        maxValue = np.max(data)
        return (data - minValue) / (maxValue - minValue) if (maxValue - minValue) != 0 else data
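
    # For example (assuming numpy is imported as np):
    #   standardization(np.array([2.0, 4.0, 6.0])) -> array([0. , 0.5, 1. ])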

    def searchSentence(self, listWords):
        """
        Search a sentence contains keyword

        Input:
            listWords: 未分好词的查询部分, 是一个或几个字符串的数组
        """
        # cut words
        to_search = []
        for i in listWords:
            to_search += list(jieba.cut(i, True))

        # search
        result = self.model.forward(to_search)

        # get index
        index = np.argsort(result["ALL"])[::-1]
        sortedResult = self.standardization(np.sort(result["ALL"])[::-1])
        
        # result
        imageId = []
        annoId = []
        corpora = []
        for i in index:
            annoId.append(self.annIdList[i])
            corpora.append(self.notCutCorporaList[i])
            imageId.append(self.archDataset.anns[self.annIdList[i]]["imageId"])

        return sortedResult, index, corpora, annoId, imageId

    def searchWords(self, listWords, weights):
        """
        Search a list of keyword

        Input:
            listWords: 未分好词的查询部分, 是一个或几个字符串的数组
        """
        # cut words
        to_search = []
        for i in listWords:
            to_search += list(jieba.cut(i, True))

        # search
        result = self.model.forwardWords(to_search, weights)

        # get index
        index = np.argsort(result["ALL"])[::-1]
        sortedResult = self.standardization(np.sort(result["ALL"])[::-1])

        # result
        imageId = []
        annoId = []
        corpora = []
        for i in index:
            annoId.append(self.annIdList[i])
            corpora.append(self.notCutCorporaList[i])
            imageId.append(self.archDataset.anns[self.annIdList[i]]["imageId"])

        return sortedResult, index, corpora, annoId, imageId

    def retrieve(self, test_text, model_dm, corpus):
        # score test_text against every document in corpus via cosine
        # similarity of inferred doc2vec vectors (cosine_similarity is
        # assumed to be sklearn.metrics.pairwise.cosine_similarity)
        test_vec = np.expand_dims(model_dm.infer_vector(test_text), axis=0)

        sim_array = np.zeros(len(corpus))
        for idx, sample in enumerate(corpus):
            sample_vec = np.expand_dims(model_dm.infer_vector(sample), axis=0)
            sim_array[idx] = cosine_similarity(test_vec, sample_vec)

        return sim_array
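
    # A minimal usage sketch (not in the original; model_dm is assumed to be
    # a trained gensim Doc2Vec instance, and the query must already be
    # tokenized):
    #   sims = self.retrieve(["中国", "红花"], model_dm, self.corporaList)
    #   best = np.argmax(sims)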


if __name__ == '__main__':
    # load archdataset
    ArchDataset = Arch(
        annotationFile="../../../Dataset/Arch/DemoData_20201228.json",
        imageFolder=None)
    ArchDataset.reverseCharForAllContext()

    # generate annotation and corpora list
    annIdList = []
    corporaList = []
    corporaList_d2v = []
    TaggedDocument = gensim.models.doc2vec.TaggedDocument  # document wrapper class used by gensim

    for i, (annotation, content) in enumerate(ArchDataset.anns.items()):
        annIdList.append(annotation)
        corporaList.append(content["cutConcateText"])
        document = TaggedDocument(content["cutConcateText"], tags=[i])
        corporaList_d2v.append(document)
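
    # A minimal continuation sketch (not in the original): train and save the
    # doc2vec model that archDoc2vecModel expects to load. The hyperparameters
    # and the output path "doc2vec.model" are illustrative assumptions.
    model_dm = gensim.models.doc2vec.Doc2Vec(documents=corporaList_d2v,
                                             vector_size=100,
                                             window=5,
                                             min_count=1,
                                             workers=4,
                                             epochs=20)
    model_dm.save("doc2vec.model")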
Example #6
class archLmirBm25Model():
    def __init__(self, archPath, model_weight=[0.25, 0.25, 0.25, 0.25]):
        """
        初始化模型, 初始化一个: 
            lmirBM25Model

        Input:
            archPath: archtectureDataset的数据位置
            modelWeight: list of model weight [BM25, JM, DIR, ABS]
    
        """
        self.archDataset = Arch(annotationFile=archPath)
        self.archDataset.reverseCharForAllContext()

        # generate annotation and corpora list
        self.annIdList = []
        self.corporaList = []
        self.notCutCorporaList = []
        for annotation, content in self.archDataset.anns.items():
            self.annIdList.append(annotation)
            self.corporaList.append(content["cutConcateText"])
            self.notCutCorporaList.append(content["concateText"])

        # model weight [BM25, JM, DIR, ABS]
        self.model_weight = model_weight

        self.model = lmirBm25Model(self.corporaList,
                                   modelWeight=self.model_weight)

    def standardization(self, data):
        minValue = np.min(data)
        maxValue = np.max(data)
        return (data - minValue) / (maxValue - minValue) if (
            maxValue - minValue) != 0 else data

    def searchSentence(self, listWords):
        """
        Search a sentence contains keyword

        Input:
            listWords: 未分好词的查询部分, 是一个或几个字符串的数组
        """
        # cut words
        to_search = []
        for i in listWords:
            to_search += list(jieba.cut(i, True))

        # search
        result = self.model.forward(to_search)

        # get index
        index = np.argsort(result["ALL"])[::-1]
        sortedResult = self.standardization(np.sort(result["ALL"])[::-1])

        # result
        imageId = []
        annoId = []
        corpora = []
        for i in index:
            annoId.append(self.annIdList[i])
            corpora.append(self.notCutCorporaList[i])
            imageId.append(self.archDataset.anns[self.annIdList[i]]["imageId"])

        return sortedResult, index, corpora, annoId, imageId

    def cut_words(self, listWords, weights):
        """
        预处理: 分词分weight

            Input:
                listWords: ["中国红花"]
                weights: ["1"]

            Return:
                cutWords:["中国", "红花"]
                weights: [0.5, 0.5]
        """
        cutWords = []
        newWeight = []
        for i in range(len(listWords)):
            # cut word into ["中国", "红花"]
            toCut = listWords[i]
            cutWord = list(jieba.cut(toCut, True))
            cutWords += cutWord

            # cal weight
            weight = weights[i]
            weight = float(weight) / len(cutWords)

            newWeight += [weight for i in range(len(cutWords))]
        return cutWords, newWeight
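
    # For example (a hypothetical call; assumes jieba's full mode splits the
    # strings as shown):
    #   cut_words(["中国红花", "建筑"], ["1", "2"])
    #   -> cutWords  = ["中国", "红花", "建筑"]
    #      newWeight = [0.5, 0.5, 2.0]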

    def searchWords(self, listWords, weights):
        """
        Search a list of keyword

        Input:
            listWords: 未分好词的查询部分, 是一个或几个字符串的数组
        """
        # cut words
        listWords, weights = self.cut_words(listWords, weights)

        # search
        result = self.model.forwardWords(listWords, weights)

        # get index
        index = np.argsort(result["ALL"])[::-1]
        sortedResult = self.standardization(np.sort(result["ALL"])[::-1])

        # result
        imageId = []
        annoId = []
        corpora = []
        for i in index:
            annoId.append(self.annIdList[i])
            corpora.append(self.notCutCorporaList[i])
            imageId.append(self.archDataset.anns[self.annIdList[i]]["imageId"])

        return sortedResult, index, corpora, annoId, imageId
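
# A minimal usage sketch (not in the original). The JSON path is the demo
# path from the __main__ block above and may need adjusting:
#   model = archLmirBm25Model("../../../Dataset/Arch/DemoData_20201228.json")
#   scores, index, corpora, annoIds, imageIds = model.searchWords(["中国红花"], ["1"])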
Example #7
class archMixModel():
    def __init__(self, archPath, modelPath, model_weight=[0.5, 0.5]):
        """
        初始化模型, 分别初始化两个: 
            1. lmirBM25
            2. Doc2Vec

        Input:
            archPath: archtectureDataset的数据位置
            modelPath: doc2vec训练好的模型位置
            modelWeight: list of model weight[staticModel, featureModel]
    
        """
        self.archDataset = Arch(annotationFile=archPath)
        self.archDataset.reverseCharForAllContext()

        # generate annotation and corpora list
        self.annIdList = []
        self.corporaList = []
        self.notCutCorporaList = []
        for annotation, content in self.archDataset.anns.items():
            self.annIdList.append(annotation)
            self.corporaList.append(content["cutConcateText"])
            self.notCutCorporaList.append(content["concateText"])

        # model weight [staticModel, featureModel]
        self.model_weight = model_weight

        self.staticModel = lmirBm25Model(self.corporaList)
        self.featureModel = doc2vecModel(self.corporaList, modelPath)


    def standardization(self, data):
        minValue = np.min(data)
        maxValue = np.max(data)
        return (data - minValue) / (maxValue - minValue) if (maxValue - minValue) != 0 else data

    def forward(self, X):
        """
        查询部分, 由于可以整个句子输入, 所以不加weight.

        Input:
            X: list 分好词的查询部分, 一个元素是一个词, 多个词组合在一起查询.
        
        Return:
            dict{
                “模型名”: [结果list]
                “ALL”: [加权结果list]
            }
        """
        resultStatic = self.staticModel.forward(X)["ALL"]
        resultFeature = self.featureModel.forward(X)["ALL"]

        resultWeighted = resultStatic * self.model_weight[0] + \
                         resultFeature * self.model_weight[1]

        resultWeighted = self.standardization(resultWeighted)

        return {
                "staticModel": resultStatic,
                "featureModel": resultFeature,
                "ALL": resultWeighted,
                }
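
    # For example (hypothetical numbers): with model_weight = [0.5, 0.5],
    # resultStatic = [0.2, 0.8] and resultFeature = [0.6, 0.4] fuse to
    # [0.4, 0.6], which standardization then rescales to [0.0, 1.0].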

    # TODO
    def forwardWords(self, X, weights):
        """
        查询部分, 输入为一系列的关键词, 无多余字符.

        Input:
            X: list 分好词的查询部分, 每一个元素是一个词.
            wordWeight: (list) 对于每一个词占算法重要性的比例, 在算法计算完成后最终融合时, 将会按照权重进行加权求和.
        Return:
            dict{
                "1": {
                        “模型名”: [结果nparray]
                        “ALL”: [加权结果nparray]
                        },
                "2": {
                        “模型名”: [结果nparray]
                        “ALL”: [加权结果nparray]
                        },
                "ALL": {
                        “模型名”: [结果nparray]
                        “ALL”: [加权结果nparray]
                        },
                }
        """
        raise NotImplementedError

    def searchSentence(self, listWords):
        """
        Search a sentence contains keyword

        Input:
            listWords: 未分好词的查询部分, 是一个或几个字符串的数组
        """
        # cut words
        to_search = []
        for i in listWords:
            to_search += list(jieba.cut(i, True))

        # search
        result = self.forward(to_search)

        # get index
        index = np.argsort(result["ALL"])[::-1]
        sortedResult = self.standardization(np.sort(result["ALL"])[::-1])
        
        # result
        imageId = []
        annoId = []
        corpora = []
        for i in index:
            annoId.append(self.annIdList[i])
            corpora.append(self.notCutCorporaList[i])
            imageId.append(self.archDataset.anns[self.annIdList[i]]["imageId"])

        return sortedResult, index, corpora, annoId, imageId
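
    # A minimal usage sketch (not in the original; the JSON path is the demo
    # path from the __main__ block above, and "doc2vec.model" is an assumed
    # trained model file):
    #   model = archMixModel("../../../Dataset/Arch/DemoData_20201228.json",
    #                        "doc2vec.model")
    #   scores, index, corpora, annoIds, imageIds = \
    #       model.searchSentence(["中国古典建筑"])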

    def searchWords(self, listWords, weights):
        """
        Search a list of keyword

        Input:
            listWords: 未分好词的查询部分, 是一个或几个字符串的数组
        """
        # cut words
        to_search = []
        for i in listWords:
            to_search += list(jieba.cut(i, True))

        # search
        result = self.forwardWords(to_search, weights)

        # get index
        index = np.argsort(result["ALL"])[::-1]
        sortedResult = self.standardization(np.sort(result["ALL"])[::-1])
        
        # result
        imageId = []
        annoId = []
        corpora = []
        for i in index:
            annoId.append(self.annIdList[i])
            corpora.append(self.notCutCorporaList[i])
            imageId.append(self.archDataset.anns[self.annIdList[i]]["imageId"])

        return sortedResult, index, corpora, annoId, imageId