def getSlideRatingVecs(slideSearchIndex, slideRatingsData, slideHierarchy):
    """
    Converts all the downloaded slide rating data into vectors which can then
    be fed into pyltr to train a local PYLTR model for slide rankings.

    Returns a dict keyed by "T"/"V"/"E" (Training, Validation, Evaluation);
    each value holds parallel lists "X" (feature vectors), "y" (ratings),
    "qids" (query ids) and "resultIds".
    """
    retval = {}
    slidesDict = {slide["id"]: slide for slide in slideHierarchy["Slides"]}
    for label in ["T", "V", "E"]:  # Training, Validation and Evaluation.
        retval[label] = {"X": [], "y": [], "qids": [], "resultIds": []}
    for (index, ratedQuery) in enumerate(np.random.permutation(slideRatingsData)):
        # Pick label in a round robin manner on a random permutation.
        label = ["T", "V", "E"][index % 3]
        # BUG FIX: the format string was passed straight to print() with the
        # arguments as extra positional parameters, so it was never formatted.
        print("{0}: Processing query({1}) searching for keywords({2}) as {3}.".format(
            index,
            ratedQuery.get("id"),
            ratedQuery["queryJson"]["RatingKeywords"],
            label))
        # Build y and qids for this label. Also collect the selected slides so
        # that X (the feature matrix) can be computed in one batched call.
        selectedSlides = []
        for queryResult in ratedQuery["results"]:
            retval[label]["y"].append(queryResult["avgRating"])
            retval[label]["qids"].append(ratedQuery["id"])
            slideId = queryResult["slide"]
            selectedSlides.append(slidesDict[slideId])
        with blockProfiler("buildSeedTrainingSet.FeatureComputation"):
            retval[label]["X"].extend(
                slideSearchIndex.features(ratedQuery["queryJson"], selectedSlides))
        print("Profiling data for query collation:\n {0}".format(
            json.dumps(lastCallProfile(), indent=4)))
    return retval
def __init__(self):
    """ Load the pre-trained word2vec model used for word distance queries. """
    with blockProfiler("Word2vecDistanceModel.__init__"):
        # The binary word2vec model location comes from the global LISA config.
        modelPath = lisaConfig.word2vecModelPath
        self.word2vecModel = gensim.models.KeyedVectors.load_word2vec_format(
            modelPath, binary=True)
def dispatch(self, *args, **initkwargs):
    """
    Wrap the parent view dispatch with profiling and attach the collected
    call profile to the response payload (best effort).
    """
    with blockProfiler("view.dispatch"):
        retval = super().dispatch(*args, **initkwargs)
    try:
        retval.data["CallProfile"] = lastCallProfile(True)
    # BUG FIX: was a bare `except:`, which also swallows SystemExit and
    # KeyboardInterrupt. Narrowed to Exception; attaching the profile is a
    # deliberate best-effort (some responses have no mutable .data).
    except Exception:
        pass
    return retval
def create(self, request): """ Method called when we are create a query instance using a POST method. """ # Create the query instance by calling parent method. queryJson = request.data with blockProfiler("create.InsertSearchQuery"): retval = super().create(request) if retval.status_code != status.HTTP_201_CREATED: return retval else: with blockProfiler("create.GetSearchResults"): queryObj = SearchQuery.objects.get(pk=retval.data["id"]) searchIndex = queryObj.index queryObj.resultJson = searchIndex.slideSearch(queryObj) queryObj.save() with blockProfiler("create.SerializeSearchResults"): # QueryObj may have now changed. queryObj = SearchQuery.objects.get(pk=retval.data["id"]) # Update retval with new data and return. retval.data = SearchQuerySerializer(queryObj, context={ 'request': request }).data # Next and previous URLs in the pagination class work for GET queries. # However, they are incorrect for post queries. # A "slight" hack to modify these links so that they work for the current POST query. paginatedResults = retval.data["results"] if paginatedResults["next"] is not None: paginatedResults["next"] = paginatedResults[ "next"].replace("?", str(queryObj.id) + "/?") if paginatedResults["previous"] is not None: paginatedResults["previous"] = paginatedResults[ "previous"].replace("?", str(queryObj.id) + "/?") return retval
def __init__(self, corpus, dictionary):
    """
    Build the ranking models (TFIDF and BM25) for one section's corpus.

    corpus: list of token lists, one per document.
    dictionary: shared gensim dictionary used for vector building.
    """
    with blockProfiler("SectionModel.__init__"):
        self.corpus = corpus
        self.dictionary = dictionary
        # TFIDF model over the corpus, using the shared dictionary.
        self.tfidfModel = gensim.models.TfidfModel(self.corpus, dictionary=self.dictionary)
        # BM25 scorer plus the mean IDF, which BM25 scoring needs as a fallback.
        self.bm25 = gensim.summarization.bm25.BM25(self.corpus)
        idfTotal = sum(float(idf) for idf in self.bm25.idf.values())
        self.average_idf = idfTotal / len(self.bm25.idf)
def __init__(self, dataForIndexing, config):
    """
    Constructor for SlideSearchIndex takes the path of slide contents file as input.
    """
    isDjangoModel = config["isDjangoModel"]
    with blockProfiler("SlideSearchLambdaMart.__init__"):
        # Invoke base class constructor.
        super().__init__(dataForIndexing, config)

        if not self.dataForIndexing["Slides"]:
            # Nothing to index: leave all models unset.
            self.dictionary = None
            self.slideTagModel = None
            self.constructPathList = []
            self.constructPathToIndex = {}
            self.constructPathModel = None
        else:
            allSlides = self.dataForIndexing["Slides"]

            # Word dictionary over the tag corpus of every slide,
            # used for all vector building.
            completeCorpus = [list(self.getTags(slide)) for slide in allSlides]
            self.dictionary = gensim.corpora.Dictionary(completeCorpus)

            # Section-wise model for slide tags.
            slideTagCorpora = [self.getTags(slide) for slide in allSlides]
            self.slideTagModel = SectionModel(slideTagCorpora, self.dictionary)

            # De-duplicated construct paths, with an index lookup table,
            # and a section model over them.
            uniqueConstructPaths = {self.getPath(slide) for slide in allSlides}
            self.constructPathList = [list(path) for path in uniqueConstructPaths]
            self.constructPathToIndex = {
                tuple(path): pathIndex
                for (pathIndex, path) in enumerate(self.constructPathList)
            }
            self.constructPathModel = SectionModel(self.constructPathList, self.dictionary)
def buildSeedTrainingSet(self, seedDataBuilder):
    """
    To train LambdaMART model, we need to first build a basic training set.
    This training set should work for the case when true rating data is not
    available.

    For each sufficiently rare tag word, a query is simulated: slides carrying
    the tag become positive results, and the highest-ranked slides NOT
    carrying it become negative results. Ratings are biased by zeptoDownloads.

    Returns a list of simulated rated-query dicts with keys
    "id", "queryJson" and "results".
    """
    allSlides = self.dataForIndexing["Slides"]

    # Build a word occurrence dictionary mapping words to slides where they occur.
    wordToMatchingSlides = {}
    for slide in allSlides:
        for tag in self.getTags(slide):
            if re.search("[0-9]", tag):
                # Tags with digits are not interesting for search.
                continue
            wordToMatchingSlides.setdefault(tag, []).append(slide)

    wordToMatchingSlides = list(wordToMatchingSlides.items())
    # Sort words according to the # of slides they occur in.
    wordToMatchingSlides.sort(key=lambda kv: len(kv[1]))

    # Word -> matching slide ids. Hoisted to function scope (it used to leak
    # out of the `with open` block) because it is also needed for membership
    # tests and the progress message below.
    wordToMatchingSlideIds = {
        word: [slide["id"] for slide in matchingSlides]
        for (word, matchingSlides) in wordToMatchingSlides
    }

    # Save word occurrence dictionary.
    with open(lisaConfig.dataFolderPath + "trainingWords.json", "w") as fp:
        json.dump(wordToMatchingSlideIds, fp, indent=4)

    # Only retain words occurring in fewer than 2% of all slides; frequent
    # words discriminate poorly. (An older comment said 1%, but the threshold
    # has always been 0.02.)
    freqThreshold = int(0.02 * len(allSlides))
    # Number of negative (close-but-not-matching) slides per simulated query.
    nonMatchingSlideCount = int(0.02 * len(allSlides))
    wordToMatchingSlides = [
        (word, matchingSlides)
        for (word, matchingSlides) in wordToMatchingSlides
        if len(matchingSlides) < freqThreshold
    ]

    retval = []
    for (index, (word, matchingSlides)) in enumerate(wordToMatchingSlides):
        with blockProfiler("buildSeedTrainingSet." + word):
            simulatedQuery = {"id": word}
            simulatedQuery["queryJson"] = {"RatingKeywords": [word]}

            # Now, find slides, which are close but are not matching.
            permittedSlideList = list(
                seedDataBuilder.getPermittedSlides(simulatedQuery["queryJson"]))
            results = seedDataBuilder.slideSearch(
                simulatedQuery["queryJson"], permittedSlideList)

            # Id set for O(1) membership tests instead of O(n) dict-equality
            # scans over matchingSlides.
            matchingSlideIds = set(wordToMatchingSlideIds[word])
            closeButNotMatchingSlides = []
            # BUG FIX: the old `while` loop indexed past the end of `results`
            # (IndexError) when fewer than nonMatchingSlideCount non-matching
            # slides were available; iterate the result list instead.
            for result in results:
                if len(closeButNotMatchingSlides) >= nonMatchingSlideCount:
                    break
                slide = result[1]
                if slide["id"] not in matchingSlideIds:
                    closeButNotMatchingSlides.append(slide)

            simulatedQueryResults = []
            simulatedQuery["results"] = simulatedQueryResults

            # Normalize by the largest download count among all involved
            # slides (+epsilon so the divisor is never zero). default=0
            # guards the (now possible) empty negatives list.
            maxDownloads = float(max(
                [slide["zeptoDownloads"]
                 for slide in matchingSlides + closeButNotMatchingSlides],
                default=0) + 0.0001)

            # Build positive results.
            for slide in matchingSlides:
                simulatedQueryResults.append({
                    "avgRating": 5 + int(10 * slide["zeptoDownloads"] / maxDownloads),
                    "slide": slide["id"],
                })

            # Build negative results.
            for slide in closeButNotMatchingSlides:
                simulatedQueryResults.append({
                    "avgRating": -15 + int(10 * slide["zeptoDownloads"] / maxDownloads),
                    "slide": slide["id"],
                })

            retval.append(simulatedQuery)
        print("{0}: Processed word {1}, occuring in {2}.".format(
            index, word, wordToMatchingSlideIds[word]))
    return retval