示例#1
0
def getSlideRatingVecs(slideSearchIndex, slideRatingsData, slideHierarchy):
    """
    Converts all the downloaded slide rating data into vectors which can
    then be fed into pyltr to train a local PYLTR model for slide rankings.

    Parameters:
        slideSearchIndex: object whose features(queryJson, slides) method
            returns one feature vector per slide passed in.
        slideRatingsData: sequence of rated queries. Each one is indexed with
            "queryJson" (which has "RatingKeywords"), "results" and "id".
        slideHierarchy: mapping whose "Slides" entry lists slides, each of
            which has an "id".

    Returns:
        Dictionary keyed by "T", "V" and "E" (training, validation and
        evaluation). Each value is a dictionary with the lists "X" (features),
        "y" (average ratings), "qids" (query ids) and "resultIds".

    Raises:
        KeyError: if a rated query with results has no "id", or if a result
            refers to a slide id which is absent from slideHierarchy.
    """
    slidesDict = {slide["id"]: slide for slide in slideHierarchy["Slides"]}

    # Training, Validation and Evaluation.
    labels = ["T", "V", "E"]
    # NOTE(review): "resultIds" is created here but never filled in by this
    # function - confirm whether callers expect it to be populated.
    retval = {
        label: {"X": [], "y": [], "qids": [], "resultIds": []}
        for label in labels
    }

    for (index, ratedQuery) in enumerate(np.random.permutation(slideRatingsData)):
        # Pick label in a round robin manner on a random permutation.
        label = labels[index % len(labels)]

        # The template has to be formatted explicitly. Handing the values to
        # print() as extra arguments would emit the raw "{0}: ..." template.
        print("{0}: Processing query({1}) searching for keywords({2}) as {3}.".format(
            index,
            None if "id" not in ratedQuery else ratedQuery["id"],
            ratedQuery["queryJson"]["RatingKeywords"],
            label))

        # Build y and qids. Also collect the rated slides to build X later.
        selectedSlides = []
        for queryResult in ratedQuery["results"]:
            retval[label]["y"].append(queryResult["avgRating"])
            retval[label]["qids"].append(ratedQuery["id"])
            selectedSlides.append(slidesDict[queryResult["slide"]])

        with blockProfiler("buildSeedTrainingSet.FeatureComputation"):
            retval[label]["X"].extend(
                slideSearchIndex.features(ratedQuery["queryJson"], selectedSlides))

        print("Profiling data for query collation:\n {0}".format(json.dumps(lastCallProfile(), indent=4)))
    return retval
示例#2
0
 def __init__(self):
     """
     Load the word2vec vectors used by this model.

     The vectors are read in binary word2vec format from the path given by
     lisaConfig.word2vecModelPath and stored in self.word2vecModel. The load
     runs inside the "Word2vecDistanceModel.__init__" blockProfiler context.
     """
     with blockProfiler("Word2vecDistanceModel.__init__"):
         modelPath = lisaConfig.word2vecModelPath
         loadedVectors = gensim.models.KeyedVectors.load_word2vec_format(modelPath, binary=True)
         self.word2vecModel = loadedVectors
示例#3
0
    def dispatch(self, *args, **kwargs):
        """
        Dispatch through the parent class and attach profiling data.

        The parent dispatch runs inside the "view.dispatch" blockProfiler
        context. Afterwards the result of lastCallProfile(True) is stored under
        the "CallProfile" key of the response data.

        Parameters:
            *args, **kwargs: forwarded unchanged to the parent dispatch.

        Returns:
            The response object produced by the parent dispatch.
        """
        with blockProfiler("view.dispatch"):
            retval = super().dispatch(*args, **kwargs)

        # Attaching the profile is best effort: a response without a writable
        # "data" mapping is returned untouched. Only Exception is caught so
        # that KeyboardInterrupt and SystemExit are not swallowed here.
        try:
            retval.data["CallProfile"] = lastCallProfile(True)
        except Exception:
            pass
        return retval
示例#4
0
    def create(self, request):
        """
        Method called when we create a query instance using a POST method.

        The query is first inserted by the parent create(). If that does not
        answer with HTTP 201, its response is returned unchanged. Otherwise
        the search is executed, its result is saved on the query object and
        the response data is replaced with the serialized query.

        Parameters:
            request: the incoming request. It is forwarded to the parent
                create() and passed to the serializer as context.

        Returns:
            The response object returned by the parent create(), with its data
            replaced when the insert succeeded.
        """
        # Create the query instance by calling parent method.
        with blockProfiler("create.InsertSearchQuery"):
            retval = super().create(request)
        if retval.status_code != status.HTTP_201_CREATED:
            return retval

        with blockProfiler("create.GetSearchResults"):
            queryObj = SearchQuery.objects.get(pk=retval.data["id"])
            searchIndex = queryObj.index
            queryObj.resultJson = searchIndex.slideSearch(queryObj)
            queryObj.save()

        with blockProfiler("create.SerializeSearchResults"):
            # QueryObj may have now changed.
            queryObj = SearchQuery.objects.get(pk=retval.data["id"])

            # Update retval with new data and return.
            retval.data = SearchQuerySerializer(queryObj,
                                                context={
                                                    'request': request
                                                }).data

            # Next and previous URLs in the pagination class work for GET queries.
            # However, they are incorrect for post queries.
            # A "slight" hack to modify these links so that they work for the current POST query.
            paginatedResults = retval.data["results"]
            idSegment = str(queryObj.id) + "/?"
            for linkName in ("next", "previous"):
                link = paginatedResults[linkName]
                if link is not None:
                    # Only the first "?" starts the query string. Any later "?"
                    # belongs to a parameter value and must not be rewritten.
                    paginatedResults[linkName] = link.replace("?", idSegment, 1)
        return retval
示例#5
0
    def __init__(self, corpus, dictionary):
        """
        Build the TF-IDF and BM25 models for one section of the corpus.

        Parameters:
            corpus: corpus handed to both gensim TfidfModel and gensim BM25.
            dictionary: word dictionary handed to TfidfModel. It is supplied by
                the caller rather than being derived from corpus here.

        Sets self.corpus, self.dictionary, self.tfidfModel, self.bm25 and
        self.average_idf (the mean of the BM25 idf values).
        """
        with blockProfiler("SectionModel.__init__"):
            self.corpus = corpus
            self.dictionary = dictionary

            # TF-IDF model over the corpus, using the shared dictionary.
            self.tfidfModel = gensim.models.TfidfModel(corpus, dictionary=dictionary)

            # BM25 model over the same corpus.
            self.bm25 = gensim.summarization.bm25.BM25(corpus)

            # Mean idf over every term known to the BM25 model.
            idfTable = self.bm25.idf
            idfTotal = sum(map(float, idfTable.values()))
            self.average_idf = idfTotal / len(idfTable)
示例#6
0
    def __init__(self, dataForIndexing, config):
        """
        Constructor: builds the tag and construct path models for the slides.

        Parameters:
            dataForIndexing: forwarded to the base class constructor. After
                that call, self.dataForIndexing["Slides"] is read here.
            config: forwarded to the base class constructor. It must contain
                the "isDjangoModel" key.

        Sets self.dictionary, self.slideTagModel, self.constructPathList,
        self.constructPathToIndex and self.constructPathModel. When there are
        no slides, the models and the dictionary are None and the path
        containers are empty.
        """
        # Looked up before anything else, so a missing key fails early. The
        # value itself is not used any further in this constructor.
        isDjangoModel = config["isDjangoModel"]
        with blockProfiler("SlideSearchLambdaMart.__init__"):
            # Invoke base class constructor.
            super().__init__(dataForIndexing, config)

            allSlides = self.dataForIndexing["Slides"]
            if allSlides:
                # Word corpus: one list of tags per slide.
                completeCorpus = [list(self.getTags(slide)) for slide in allSlides]

                # Word dictionary for use in vector building.
                self.dictionary = gensim.corpora.Dictionary(completeCorpus)

                # Section wise corpora and model for slide tags.
                slideTagCorpora = [self.getTags(slide) for slide in allSlides]
                self.slideTagModel = SectionModel(slideTagCorpora, self.dictionary)

                # Distinct construct paths, each one stored as a list.
                uniquePaths = {self.getPath(slide) for slide in allSlides}
                self.constructPathList = [list(path) for path in uniquePaths]

                # Reverse lookup from a path (as tuple) to its list position.
                pathPositions = {}
                for (position, path) in enumerate(self.constructPathList):
                    pathPositions[tuple(path)] = position
                self.constructPathToIndex = pathPositions

                self.constructPathModel = SectionModel(self.constructPathList, self.dictionary)
            else:
                # Nothing to index.
                self.dictionary = None
                self.slideTagModel = None
                self.constructPathList = []
                self.constructPathToIndex = {}
                self.constructPathModel = None
示例#7
0
    def buildSeedTrainingSet(self, seedDataBuilder):
        """
        To train LambdaMART model, we need to first build a basic training set.
        This training set should work for the case when true rating data is not available.

        Every sufficiently rare tag becomes a simulated one word query. Slides
        carrying the tag are positive results. Slides which seedDataBuilder
        ranks for that query but which do not carry the tag are negative
        results.

        As a side effect, the word to slide id mapping is written to
        "trainingWords.json" under lisaConfig.dataFolderPath.

        Parameters:
            seedDataBuilder: object providing getPermittedSlides(queryJson) and
                slideSearch(queryJson, permittedSlides). Each search result is
                indexed with [1] to obtain the slide.

        Returns:
            List of simulated queries. Each one is a dictionary with "id" (the
            word), "queryJson" ({"RatingKeywords": [word]}) and "results" (a
            list of {"avgRating": ..., "slide": slideId} entries).
        """
        allSlides = self.dataForIndexing["Slides"]

        # Build a word occurrence dictionary mapping words to slides where they occur.
        slidesByWord = {}
        for slide in allSlides:
            for tag in self.getTags(slide):
                if re.search("[0-9]", tag):
                    # Tags with digits are not interesting for search.
                    continue
                slidesByWord.setdefault(tag, []).append(slide)

        # Sort words according to the # of slides they occur in.
        wordToMatchingSlides = sorted(slidesByWord.items(),
                                      key=lambda item: len(item[1]))

        # Save word occurrence dictionary. The mapping is built before the file
        # is opened, so a failure here does not leave an empty file behind.
        wordToMatchingSlideIds = {
            word: [slide["id"] for slide in matchingSlides]
            for (word, matchingSlides) in wordToMatchingSlides
        }
        with open(lisaConfig.dataFolderPath + "trainingWords.json", "w") as fp:
            json.dump(wordToMatchingSlideIds, fp, indent=4)

        # Only retain words with frequency less than 2% of total slides.
        freqThreshold = int(0.02 * len(allSlides))
        # Number of negative (non matching) slides wanted for every word.
        nonMatchingSlideCount = int(0.02 * len(allSlides))
        wordToMatchingSlides = [(word, matchingSlides)
                                for (word,
                                     matchingSlides) in wordToMatchingSlides
                                if len(matchingSlides) < freqThreshold]

        retval = []
        for (index, (word, matchingSlides)) in enumerate(wordToMatchingSlides):
            with blockProfiler("buildSeedTrainingSet." + word):
                simulatedQuery = {"id": word}
                simulatedQuery["queryJson"] = {"RatingKeywords": [word]}

                # Now, find slides, which are close but are not matching.
                permittedSlideList = list(
                    seedDataBuilder.getPermittedSlides(
                        simulatedQuery["queryJson"]))
                results = seedDataBuilder.slideSearch(
                    simulatedQuery["queryJson"], permittedSlideList)

                # Walk the results in their returned order and stop once enough
                # negatives are found. The walk also ends when the results run
                # out, so a short result list yields fewer negatives instead of
                # an IndexError.
                closeButNotMatchingSlides = []
                for result in results:
                    if len(closeButNotMatchingSlides) >= nonMatchingSlideCount:
                        break
                    candidate = result[1]
                    if candidate not in matchingSlides:
                        closeButNotMatchingSlides.append(candidate)

                simulatedQueryResults = []
                simulatedQuery["results"] = simulatedQueryResults

                # Largest download count over positives and negatives together.
                # matchingSlides is never empty, so max() always has input. The
                # small offset keeps the divisor non zero.
                maxDownloads = float(
                    max(slide["zeptoDownloads"]
                        for slide in matchingSlides + closeButNotMatchingSlides)
                    + 0.0001)

                # Build positive results.
                for slide in matchingSlides:
                    simulatedQueryResult = {
                        "avgRating":
                        5 + int(10 * slide["zeptoDownloads"] / maxDownloads),
                        "slide":
                        slide["id"],
                    }
                    simulatedQueryResults.append(simulatedQueryResult)

                # Build negative results.
                for slide in closeButNotMatchingSlides:
                    simulatedQueryResult = {
                        "avgRating":
                        -15 + int(10 * slide["zeptoDownloads"] / maxDownloads),
                        "slide":
                        slide["id"],
                    }
                    simulatedQueryResults.append(simulatedQueryResult)

                retval.append(simulatedQuery)
            print("{0}: Processed word {1}, occuring in {2}.".format(
                index, word, wordToMatchingSlideIds[word]))
        return retval