def convertDataToEmbeddingTensors(self, dataJSONFile, dataStats):
    """
    Read a JSON file with SNLI sentence pairs and convert them to embedding
    tensors of shape (maxSentLen, numSent, dimEmbeddings).

    Sentences shorter than the max length from the stats file are padded with
    zeros at the end; sentences longer than the max length are truncated
    (previously an over-length sentence raised a broadcast ValueError, e.g.
    when dev/test data was paired with train-set stats).

    :param dataJSONFile: Path to file with SNLI sentences ('train', 'dev', or 'test')
    :param dataStats: Path to JSON with max sent length/vocab size stats
    :return: (premiseTensor, hypothesisTensor) pair of float32 ndarrays
    """
    sentences = loadExampleSentences(dataJSONFile)
    with open(dataStats, "r") as statsFile:
        statsJSON = json.load(statsFile)

    maxSentLengthPremise = statsJSON["maxSentLenPremise"]
    maxSentLengthHypothesis = statsJSON["maxSentLenHypothesis"]
    numSent = len(sentences)

    premiseTensor = np.zeros(
        (maxSentLengthPremise, numSent, self.dimEmbeddings), dtype=np.float32)
    hypothesisTensor = np.zeros(
        (maxSentLengthHypothesis, numSent, self.dimEmbeddings), dtype=np.float32)

    for idx, (premiseSent, hypothesisSent) in enumerate(sentences):
        premiseEmbedList = self.convertIdxMatrixToEmbeddingList(
            self.convertSentListToIdxMatrix(premiseSent))
        # Clip to the tensor's first dimension so over-length sentences
        # truncate instead of raising; remaining rows stay zero (padding).
        pLen = min(len(premiseEmbedList), maxSentLengthPremise)
        premiseTensor[0:pLen, idx, :] = premiseEmbedList[0:pLen]

        hypothesisEmbedList = self.convertIdxMatrixToEmbeddingList(
            self.convertSentListToIdxMatrix(hypothesisSent))
        hLen = min(len(hypothesisEmbedList), maxSentLengthHypothesis)
        hypothesisTensor[0:hLen, idx, :] = hypothesisEmbedList[0:hLen]

    return premiseTensor, hypothesisTensor
def convertDataToIdxMatrices(self, dataJSONFile, dataStats, pad='right'):
    """
    Convert a data file to index matrices of dim (maxSentLen, numSamples, 1)
    where the last dimension stores the idx of the word embedding.

    Unfilled slots are left as NaN (not zero) so that words without an index
    can later be given random embeddings. Sentences longer than the max
    length from the stats file are truncated.

    :param dataJSONFile: File to data with sentences
    :param dataStats: Path to JSON with max sent length stats
    :param pad: Whether to pad at the beginning ('left') or end ('right')
    :return: (premiseIdxMatrix, hypothesisIdxMatrix)
    """
    sentences = loadExampleSentences(dataJSONFile)
    with open(dataStats, "r") as statsFile:
        statsJSON = json.load(statsFile)

    maxSentLengthPremise = statsJSON["maxSentLenPremise"]
    maxSentLengthHypothesis = statsJSON["maxSentLenHypothesis"]
    numSent = len(sentences)

    # float32 (not int) so the matrices can hold NaN for missing words.
    premiseIdxMatrix = np.zeros((maxSentLengthPremise, numSent, 1),
                                dtype=np.float32)
    hypothesisIdxMatrix = np.zeros((maxSentLengthHypothesis, numSent, 1),
                                   dtype=np.float32)

    # Fill with 'nan' so that we get random embeddings for words that don't exist
    premiseIdxMatrix.fill(np.nan)
    hypothesisIdxMatrix.fill(np.nan)

    for idx, (premiseSent, hypothesisSent) in enumerate(sentences):
        premiseIdxMat = np.array(
            self.convertSentListToIdxMatrix(premiseSent))[:, 0]
        hypothesisIdxMat = np.array(
            self.convertSentListToIdxMatrix(hypothesisSent))[:, 0]

        # Clip so over-length sentences truncate rather than raising a
        # broadcast error.
        pLen = min(len(premiseIdxMat), maxSentLengthPremise)
        hLen = min(len(hypothesisIdxMat), maxSentLengthHypothesis)

        if pad == 'right':
            # Pad (leave NaN) at the end
            premiseIdxMatrix[0:pLen, idx, 0] = premiseIdxMat[0:pLen]
            hypothesisIdxMatrix[0:hLen, idx, 0] = hypothesisIdxMat[0:hLen]
        else:
            # Pad (leave NaN) at the beginning. Bug fix: the old negative
            # slice arr[-len:] selected the WHOLE column when len == 0
            # (since -0 == 0) and the assignment of an empty array failed;
            # an explicit start index avoids the -0 pitfall.
            premiseIdxMatrix[maxSentLengthPremise - pLen:, idx, 0] = \
                premiseIdxMat[0:pLen]
            hypothesisIdxMatrix[maxSentLengthHypothesis - hLen:, idx, 0] = \
                hypothesisIdxMat[0:hLen]

    return premiseIdxMatrix, hypothesisIdxMatrix
def convertDataToIdxMatrices(self, dataJSONFile, dataStats, pad="right"):
    """
    Convert a data file to index matrices of dim (maxSentLen, numSamples, 1)
    where the last dimension stores the idx of the word embedding.

    Unfilled slots are left as NaN (not zero) so that words without an index
    can later be given random embeddings. Sentences longer than the max
    length from the stats file are truncated.

    :param dataJSONFile: File to data with sentences
    :param dataStats: Path to JSON with max sent length stats
    :param pad: Whether to pad at the beginning ('left') or end ('right')
    :return: (premiseIdxMatrix, hypothesisIdxMatrix)
    """
    sentences = loadExampleSentences(dataJSONFile)
    with open(dataStats, "r") as statsFile:
        statsJSON = json.load(statsFile)

    maxSentLengthPremise = statsJSON["maxSentLenPremise"]
    maxSentLengthHypothesis = statsJSON["maxSentLenHypothesis"]
    numSent = len(sentences)

    # float32 (not int) so the matrices can hold NaN for missing words.
    premiseIdxMatrix = np.zeros((maxSentLengthPremise, numSent, 1), dtype=np.float32)
    hypothesisIdxMatrix = np.zeros((maxSentLengthHypothesis, numSent, 1), dtype=np.float32)

    # Fill with 'nan' so that we get random embeddings for words that don't exist
    premiseIdxMatrix.fill(np.nan)
    hypothesisIdxMatrix.fill(np.nan)

    for idx, (premiseSent, hypothesisSent) in enumerate(sentences):
        premiseIdxMat = np.array(self.convertSentListToIdxMatrix(premiseSent))[:, 0]
        hypothesisIdxMat = np.array(self.convertSentListToIdxMatrix(hypothesisSent))[:, 0]

        # Clip so over-length sentences truncate rather than raising a
        # broadcast error.
        pLen = min(len(premiseIdxMat), maxSentLengthPremise)
        hLen = min(len(hypothesisIdxMat), maxSentLengthHypothesis)

        if pad == "right":
            # Pad (leave NaN) at the end
            premiseIdxMatrix[0:pLen, idx, 0] = premiseIdxMat[0:pLen]
            hypothesisIdxMatrix[0:hLen, idx, 0] = hypothesisIdxMat[0:hLen]
        else:
            # Pad (leave NaN) at the beginning. Bug fix: the old negative
            # slice arr[-len:] selected the WHOLE column when len == 0
            # (since -0 == 0) and the assignment of an empty array failed;
            # an explicit start index avoids the -0 pitfall.
            premiseIdxMatrix[maxSentLengthPremise - pLen :, idx, 0] = premiseIdxMat[0:pLen]
            hypothesisIdxMatrix[maxSentLengthHypothesis - hLen :, idx, 0] = hypothesisIdxMat[0:hLen]

    return premiseIdxMatrix, hypothesisIdxMatrix
def convertDataToEmbeddingTensors(self, dataJSONFile, dataStats):
    """
    Read a JSON file with SNLI sentence pairs and convert them to embedding
    tensors of shape (maxSentLen, numSent, dimEmbeddings).

    Sentences shorter than the max length from the stats file are padded with
    zeros at the end; sentences longer than the max length are truncated
    (previously an over-length sentence raised a broadcast ValueError, e.g.
    when dev/test data was paired with train-set stats).

    :param dataJSONFile: Path to file with SNLI sentences ('train', 'dev', or 'test')
    :param dataStats: Path to JSON with max sent length/vocab size stats
    :return: (premiseTensor, hypothesisTensor) pair of float32 ndarrays
    """
    sentences = loadExampleSentences(dataJSONFile)
    with open(dataStats, "r") as statsFile:
        statsJSON = json.load(statsFile)

    maxSentLengthPremise = statsJSON["maxSentLenPremise"]
    maxSentLengthHypothesis = statsJSON["maxSentLenHypothesis"]
    numSent = len(sentences)

    premiseTensor = np.zeros(
        (maxSentLengthPremise, numSent, self.dimEmbeddings), dtype=np.float32)
    hypothesisTensor = np.zeros(
        (maxSentLengthHypothesis, numSent, self.dimEmbeddings), dtype=np.float32)

    for idx, (premiseSent, hypothesisSent) in enumerate(sentences):
        premiseEmbedList = self.convertIdxMatrixToEmbeddingList(
            self.convertSentListToIdxMatrix(premiseSent))
        # Clip to the tensor's first dimension so over-length sentences
        # truncate instead of raising; remaining rows stay zero (padding).
        pLen = min(len(premiseEmbedList), maxSentLengthPremise)
        premiseTensor[0:pLen, idx, :] = premiseEmbedList[0:pLen]

        hypothesisEmbedList = self.convertIdxMatrixToEmbeddingList(
            self.convertSentListToIdxMatrix(hypothesisSent))
        hLen = min(len(hypothesisEmbedList), maxSentLengthHypothesis)
        hypothesisTensor[0:hLen, idx, :] = hypothesisEmbedList[0:hLen]

    return premiseTensor, hypothesisTensor