def split(self, filePath=None, numLabels=3, textPreprocess=False,
          dataDict=None, abbrCSV="", contrCSV="", ignoreCommon=100,
          removeStrings="[identifier deleted]", correctSpell=True):
  """
  Split all the comments in a file into tokens, with or without
  preprocessing. Specifying both filePath and dataDict will prefer filePath.

  @param filePath        (str)   Path to csv file.
  @param numLabels       (int)   Number of columns of category labels.
  @param textPreprocess  (bool)  True will preprocess text while tokenizing.
  @param dataDict        (dict)  Data as returned by readCSV().

  @return dataDict       (dict)  Data as read in from filePath.

  Please see TextPreprocess.tokenize() for the other parameters; they are
  only used when textPreprocess is True.
  """
  if filePath:
    dataDict = readCSV(filePath, numLabels=numLabels)

  if dataDict is None:
    raise Exception("No data given, or could not read CSV.")

  preprocessor = TextPreprocess(abbrCSV=abbrCSV, contrCSV=contrCSV)
  expandAbbr = (abbrCSV != "")
  expandContr = (contrCSV != "")

  for recordNum, record in dataDict.iteritems():
    comment, categories, uniqueID = record
    # Convert the categories to a string of their IDs.
    categories = string.join([str(self.categoryToId[c]) for c in categories])

    if textPreprocess:
      tokens = preprocessor.tokenize(comment, ignoreCommon, removeStrings,
                                     correctSpell, expandAbbr, expandContr)
    else:
      tokens = preprocessor.tokenize(comment)

    data = self._formatSequence(tokens, categories, recordNum, uniqueID)

    self.records.append(data)
    self.sequenceCount += 1

  return dataDict
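# A minimal usage sketch for split(), not a definitive example: the enclosing
# class is not shown above, so DataSplitter is a hypothetical stand-in for
# whatever class defines split(), and "comments.csv" is a hypothetical path.
# The CSV layout (comment text, unique ID, and numLabels category columns) is
# inferred from the loop above, not verified against readCSV().
splitter = DataSplitter()
dataDict = splitter.split(filePath="comments.csv", numLabels=3,
                          textPreprocess=True)
# dataDict maps recordNum -> (comment, categories, uniqueID); the formatted
# token sequences accumulate on splitter.records.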
def testTokenizeExpandContraction(self):
  """Tests contractions are expanded."""
  text = "I can't work at [identifier deleted] if you don't allw me to wfh"
  processor = TextPreprocess()

  expected_tokens = ["i", "can", "not", "work", "at", "identifier", "deleted",
                     "if", "you", "do", "not", "allw", "me", "to", "wfh"]

  tokens = processor.tokenize(text, expandContr=True)

  self.assertSequenceEqual(tokens, expected_tokens)
def testTokenizeRemoveString(self):
  """Tests a provided string is removed before tokenizing."""
  text = "I can't work at [identifier deleted] if you don't allw me to wfh"
  processor = TextPreprocess()

  expected_tokens = ["i", "can", "t", "work", "at", "if", "you", "don", "t",
                     "allw", "me", "to", "wfh"]

  tokens = processor.tokenize(text, removeStrings=["[identifier deleted]"])

  self.assertSequenceEqual(tokens, expected_tokens)
def testTokenizeNoPreprocess(self):
  """Tests none of the preprocessing methods are used."""
  text = "I can't work at [identifier deleted] if you don't allw me to wfh"
  processor = TextPreprocess()

  expected_tokens = ["i", "can", "t", "work", "at", "identifier", "deleted",
                     "if", "you", "don", "t", "allw", "me", "to", "wfh"]

  tokens = processor.tokenize(text)

  self.assertSequenceEqual(tokens, expected_tokens)
def trainModelWithText(model, trainingData):
  """
  Train the given model on trainingData. This is (essentially) the same
  training method as in the research repo's imbu_runner.py.
  """
  textPreprocessor = TextPreprocess()
  for seqId, (text, _, _) in enumerate(trainingData.values()):
    textTokens = textPreprocessor.tokenize(text)  # TODO: use model's tokenization method instead
    lastToken = len(textTokens) - 1
    for i, token in enumerate(textTokens):
      # Use the sequence's ID as the category label.
      model.trainText(token, [seqId], sequenceId=seqId,
                      reset=int(i == lastToken))
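# A minimal sketch of calling trainModelWithText(). StubModel is hypothetical
# -- it only records the trainText() calls the function makes -- and the
# trainingData values are made up; real data would come from readCSV()-style
# records of (text, categories, uniqueID).
from collections import OrderedDict

class StubModel(object):
  """Hypothetical stand-in exposing the trainText() interface used above."""
  def __init__(self):
    self.calls = []
  def trainText(self, token, labels, sequenceId=None, reset=0):
    self.calls.append((token, labels, sequenceId, reset))

trainingData = OrderedDict([
    (0, ("the quick brown fox", [], "doc-0")),
    (1, ("jumps over the lazy dog", [], "doc-1")),
])
model = StubModel()
trainModelWithText(model, trainingData)
# Each sequence trains with reset=0 on every token except the last, which
# gets reset=1 to mark the sequence boundary.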
def testFunctionsWithoutDataFiles(self):
  """
  Ensures a TextPreprocess object can be created, and can tokenize, when the
  text data files (corpus text, abbreviations, and contractions) are missing.
  """
  text = "I can't work at [identifier deleted] if you don't allw me to wfh"
  processor = TextPreprocess(corpusTxt="fake.txt",
                             abbrCSV="not_here.csv",
                             contrCSV="not_real.csv")
  tokens = processor.tokenize(text)

  expected_tokens = ["i", "can", "t", "work", "at", "identifier", "deleted",
                     "if", "you", "don", "t", "allw", "me", "to", "wfh"]

  self.assertSequenceEqual(tokens, expected_tokens)
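# A hedged sketch combining the two options exercised by the tests above:
# removeStrings strips the bracketed placeholder and expandContr expands
# "can't"/"don't". The expected output assumes the two options compose
# independently, which the tests above do not verify directly.
processor = TextPreprocess()
text = "I can't work at [identifier deleted] if you don't allw me to wfh"
tokens = processor.tokenize(text,
                            removeStrings=["[identifier deleted]"],
                            expandContr=True)
# Expected under that assumption:
# ["i", "can", "not", "work", "at", "if", "you", "do", "not", "allw",
#  "me", "to", "wfh"]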