import os

import numpy
from sklearn.cluster import KMeans

# `directory`, `numberOfClusters`, and `vocab` are supplied by the enclosing scope
embeddings = numpy.load(os.path.join(directory, 'features.npy'))
inputs = numpy.load(os.path.join(directory, 'inputs.npy'))
labels = numpy.load(os.path.join(directory, 'labels.npy'))

chunkCount = embeddings.shape[0]
chunkLength = embeddings.shape[1]

# flatten to (chunkCount * chunkLength, dim), cluster, then restore the chunk layout
clusters = numpy.reshape(
    KMeans(n_clusters=numberOfClusters).fit_predict(
        numpy.reshape(embeddings, (-1, embeddings.shape[-1]))),
    (chunkCount, chunkLength))

clusterMap = {i: [] for i in range(numberOfClusters)}

for chunk in range(chunkCount):
    chunkString = [
        vocab.getTokenString(labels[chunk, word])
        for word in range(chunkLength)
    ]
    for word in range(chunkLength):
        clusterId = clusters[chunk, word]
        wordString = vocab.getTokenString(labels[chunk, word])

        clusterMap[clusterId].append((wordString, chunkString))

for clusterId, words in clusterMap.items():
    print("Cluster", clusterId)
    for word, chunk in words:
        print(" ", "'" + word + "'", chunk)
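
# A minimal, self-contained sketch of the flatten -> cluster -> unflatten round
# trip the snippet above relies on. The shapes, the cluster count, and the
# _sketchClusterRoundTrip name are illustrative assumptions, not part of this
# codebase:
def _sketchClusterRoundTrip():
    import numpy
    from sklearn.cluster import KMeans

    # synthetic stand-in for the saved activations: (chunkCount, chunkLength, dim)
    embeddings = numpy.random.rand(4, 16, 8)

    # KMeans expects a 2D input, so merge the chunk and word axes
    flat = numpy.reshape(embeddings, (-1, embeddings.shape[-1]))  # (64, 8)
    clusterIds = KMeans(n_clusters=3, n_init=10).fit_predict(flat)

    # restore the (chunkCount, chunkLength) layout so each word keeps its cluster id
    return numpy.reshape(clusterIds, embeddings.shape[:2])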
def groupDataIntoClusters(self):
    kmeans = MiniBatchKMeans(n_clusters=self.numberOfClusters)
    featurizer = Featurizer(self.config, self.validationDataset)
    vocab = Vocab(self.config)

    if self.usePCA():
        pca = IncrementalPCA(n_components=32)

    logger.info("Reducing dimensionality...")

    # fit the pca model
    if self.usePCA():
        for iteration in range(self.getIterations()):
            if iteration % 10 == 0:
                logger.info(" " + str(iteration) + " / " +
                            str(self.getIterations()))

            inputs, labels, embeddings = featurizer.featurizeOneBatch()

            pca.partial_fit(
                numpy.reshape(embeddings, (-1, embeddings.shape[-1])))

        self.validationDataset.reset()

    logger.info("Fitting model...")

    # fit the kmeans model
    for iteration in range(self.getIterations()):
        if iteration % 10 == 0:
            inputs, labels, embeddings, dataTime, modelTime = featurizer.featurizeOneBatch(
                reportTime=True)
            logger.info(" " + str(iteration) + " / " +
                        str(self.getIterations()) + " data load time: " +
                        str(dataTime) + " model eval time: " + str(modelTime))
        else:
            inputs, labels, embeddings = featurizer.featurizeOneBatch()

        if self.usePCA():
            embeddings = pca.transform(
                numpy.reshape(embeddings, (-1, embeddings.shape[-1])))

        kmeans.partial_fit(
            numpy.reshape(embeddings, (-1, embeddings.shape[-1])))

    self.validationDataset.reset()

    # group into clusters
    # create a histogram of word frequencies per cluster
    clusterHistogram = {i: {} for i in range(self.numberOfClusters)}
    clusterWins = {i: 0 for i in range(self.numberOfClusters)}

    documentMap = {}

    logger.info("Clustering data...")

    for iteration in range(self.getIterations()):
        if iteration % 10 == 0:
            inputs, labels, embeddings, dataTime, modelTime = featurizer.featurizeOneBatch(
                reportTime=True)
            logger.info(" " + str(iteration) + " / " +
                        str(self.getIterations()) + " data load time: " +
                        str(dataTime) + " model eval time: " + str(modelTime))
        else:
            inputs, labels, embeddings = featurizer.featurizeOneBatch()

        chunkLength = embeddings.shape[1]
        batchSize = embeddings.shape[0]

        if self.usePCA():
            embeddings = pca.transform(
                numpy.reshape(embeddings, (-1, embeddings.shape[-1])))

        clusters = numpy.reshape(
            kmeans.predict(
                numpy.reshape(embeddings, (-1, embeddings.shape[-1]))),
            (batchSize, chunkLength))

        for batch in range(batchSize):
            documentId = labels[batch, 0]

            if documentId not in documentMap:
                documentMap[documentId] = []

            clusterIds = []

            for wordIndex in range(1, chunkLength):
                word = vocab.getTokenString(labels[batch, wordIndex])
                cluster = clusters[batch, wordIndex]

                clusterIds.append(cluster)

                if labels[batch, wordIndex] not in clusterHistogram[cluster]:
                    clusterHistogram[cluster][labels[batch, wordIndex]] = 0

                clusterHistogram[cluster][labels[batch, wordIndex]] += 1
                clusterWins[cluster] += 1

            documentMap[documentId].extend(clusterIds)

    if not os.path.exists(self.outputDirectory):
        os.makedirs(self.outputDirectory)

    # write histograms
    with open(self.getOutputHistogramFileName(), "w") as log:
        for clusterId, clusterCount in sorted(clusterWins.items(),
                                              key=lambda x: x[1],
                                              reverse=True):
            words = clusterHistogram[clusterId]
            log.write("Cluster, " + str(clusterId) + " (" +
                      str(clusterCount) + ")\n")
            for wordIndex, count in sorted(words.items(),
                                           key=lambda x: x[1],
                                           reverse=True):
                log.write(" '" + vocab.getTokenString(wordIndex) + "' " +
                          str(count) + "\n")

    # write document clusters
    for documentId, clusters in documentMap.items():
        histogram = {}
        for cluster in clusters:
            if cluster not in histogram:
                histogram[cluster] = 0
            histogram[cluster] += 1

        with open(self.getOutputDocumentClusterFileName(documentId),
                  "w") as log:
            for cluster, count in sorted(histogram.items(),
                                         key=lambda x: x[1],
                                         reverse=True):
                words = clusterHistogram[cluster]
                topWord = vocab.getTokenString(
                    sorted(words.items(), key=lambda x: x[1],
                           reverse=True)[0][0])
                log.write("Cluster, " + str(cluster) + ", " + topWord +
                          ", " + str(count) + "\n")
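
# A minimal sketch of the two-pass streaming pattern groupDataIntoClusters uses:
# IncrementalPCA.partial_fit over every batch first, then MiniBatchKMeans.partial_fit
# on the PCA-reduced embeddings. The `batches` argument (a list of
# (batchSize, chunkLength, dim) arrays), the component count, and the cluster
# count are illustrative assumptions; the real method streams batches from a
# Featurizer and resets the dataset between passes:
def _sketchStreamingClustering(batches, numberOfClusters=8, pcaComponents=4):
    import numpy
    from sklearn.cluster import MiniBatchKMeans
    from sklearn.decomposition import IncrementalPCA

    pca = IncrementalPCA(n_components=pcaComponents)
    kmeans = MiniBatchKMeans(n_clusters=numberOfClusters)

    # first pass: fit the dimensionality reducer one batch at a time
    for batch in batches:
        pca.partial_fit(numpy.reshape(batch, (-1, batch.shape[-1])))

    # second pass: fit k-means incrementally on the reduced embeddings
    for batch in batches:
        kmeans.partial_fit(
            pca.transform(numpy.reshape(batch, (-1, batch.shape[-1]))))

    return pca, kmeans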
class FallbackTokenEvaluator:
    def __init__(self, config):
        self.config = config
        self.vocab = Vocab(config)

    def initialize(self):
        self.perplexityStates = self.createPerplexityStates(
            self.getBatchSize())

    def evaluate(self, inputs, labels, predictions):
        inputIndices, predictions, vocabProbabilities = self.rewriteSplitTokens(
            inputs, labels, predictions)
        self.recordPredictions(predictions, vocabProbabilities, inputIndices,
                               inputs)

    def getRequestedPredictions(self, inputs, labels):
        return numpy.expand_dims(labels, axis=2)

    def finalize(self):
        return self.getPerplexity()

    def getBatchSize(self):
        if "adaptor" not in self.config:
            return 1
        if "batching" not in self.config["adaptor"]:
            return 1
        if "size" not in self.config["adaptor"]["batching"]:
            return 1
        return int(self.config["adaptor"]["batching"]["size"])

    def createPerplexityStates(self, count):
        return [PerplexityState(self.vocab) for i in range(count)]

    def getPerplexity(self):
        byteCount = sum(
            [state.getByteCount() for state in self.perplexityStates])
        totalEntropy = sum(
            [state.getEntropy() for state in self.perplexityStates])

        return 2.0**(totalEntropy / byteCount)

    def recordPredictions(self, predictions, vocabProbabilities, inputIndices,
                          inputs):
        # predictions is Tensor(batch-size, sequence-length, vocab-size)
        # inputs is Tensor(batch-size, sequence-length)
        batchSize = predictions.shape[0]
        sequenceLength = predictions.shape[1]

        # TODO: replace with something like batch gather
        for batch in range(batchSize):
            for element in range(sequenceLength):
                labelPrediction = predictions[batch, element]

                self.perplexityStates[batch].addPrediction(
                    inputs[batch, :], inputIndices[batch, element],
                    labelPrediction, vocabProbabilities[batch, element, :])

    def rewriteSplitTokens(self, inputs, labels, predictions):
        from functools import reduce

        newInputs = []
        newPredictions = []
        newVocabProbabilities = []

        batchSize = predictions.shape[0]
        sequenceLength = predictions.shape[1]

        # collapse expanded tokens
        for batch in range(batchSize):
            inputString = "".join([
                self.vocab.getTokenString(token)
                for token in labels[batch, :]
                if not Vocab.isReservedToken(token)
            ])

            reservedIndices = set([
                index for index, token in enumerate(labels[batch, :])
                if Vocab.isReservedToken(token)
            ])

            tokenizer = UnlimitedVocabTokenizerAdaptor(
                StringDataSource(inputString))

            completeTokens = [
                tokenizer.next() for i in range(tokenizer.size())
            ]

            logger.debug("Reformed input string: '" + str([
                self.vocab.getTokenString(token)
                for token in labels[batch, :]
                if not Vocab.isReservedToken(token)
            ]) + "'")
            logger.debug(" tokenized to: " + str(completeTokens))
            logger.debug(" tokens: " + str(
                [self.vocab.getToken(token) for token in completeTokens]))

            index = 0
            completeTokenIndex = 0

            newBatchInputs = []
            newBatchPredictions = []
            newBatchVocabProbabilities = []

            while index < sequenceLength:
                token = labels[batch, index]
                completeToken = completeTokens[completeTokenIndex]

                # get token end
                tokenEndIndex = index + 1

                if self.vocab.getToken(
                        completeToken
                ) != token and index not in reservedIndices:
                    while tokenEndIndex < sequenceLength:
                        possibleToken = labels[batch, tokenEndIndex]
                        if (completeTokenIndex + 1) < len(completeTokens):
                            if self.vocab.getToken(
                                    completeTokens[completeTokenIndex +
                                                   1]) == possibleToken:
                                break
                        tokenEndIndex += 1

                # add token
                newBatchInputs.append([index, tokenEndIndex])
                newBatchVocabProbabilities.append(
                    list(predictions[batch, index, :]))
                newBatchVocabProbabilities[-1][0] = 0.0

                # compute new probabilities for the merged token
                predictionValues = predictions[batch, index:tokenEndIndex, 0]
                newBatchPredictions.append(
                    reduce(lambda x, y: x * y, predictionValues))

                if tokenEndIndex > (index + 1):
                    logger.debug("Reformed split tokens: " + str([
                        self.vocab.getTokenString(token)
                        for token in labels[batch, index:tokenEndIndex]
                    ]) + (" with prob: %.4f" % newBatchPredictions[-1]))

                if index not in reservedIndices:
                    completeTokenIndex += 1

                index = tokenEndIndex

            newInputs.append(newBatchInputs)
            newPredictions.append(newBatchPredictions)
            newVocabProbabilities.append(newBatchVocabProbabilities)

        # pad each batch entry to the longest merged sequence
        maxLength = max([len(tokens) for tokens in newInputs])

        newInputs = [
            batchInputs +
            [self.getPadToken() for i in range(maxLength - len(batchInputs))]
            for batchInputs in newInputs
        ]
        newPredictions = [
            batchPredictions +
            [0.0 for i in range(maxLength - len(batchPredictions))]
            for batchPredictions in newPredictions
        ]
        newVocabProbabilities = [
            batchProbabilities +
            [0.0 for i in range(maxLength - len(batchProbabilities))]
            for batchProbabilities in newVocabProbabilities
        ]

        return numpy.array(newInputs), numpy.array(
            newPredictions), numpy.array(newVocabProbabilities)
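
# A minimal sketch of the byte-level perplexity that getPerplexity() computes.
# PerplexityState is assumed to accumulate entropy as -log2(p) per predicted
# token and to count the bytes those tokens cover, so the final value is
# 2 ** (total bits / total bytes), matching the formula above. The
# probabilities and text here are illustrative assumptions:
def _sketchBytePerplexity():
    import math

    tokenProbabilities = [0.25, 0.5, 0.125]  # model probability of each token
    text = "abcdefgh"                        # the decoded text those tokens cover

    totalEntropy = sum(-math.log2(p) for p in tokenProbabilities)  # bits
    byteCount = len(text.encode("utf-8"))

    # same formula as getPerplexity(): 2 ** (entropy per byte)
    return 2.0**(totalEntropy / byteCount)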