Python TextPreprocess 예제들, fluent.utils.text_preprocess.TextPreprocess Python 예제들

예제 #1

0

파일 보기

파일: classification_model.py 프로젝트: numenta/nupic.fluent

    def queryModel(self, query, preprocess):
        """
        Tokenize the query, encode it, and rank the trained samples by distance.

        The tokens are encoded with self.encodeSample() and passed to
        self.infer(), which yields one distance per entry of
        self.sampleReference. An ID can appear several times in
        self.sampleReference (the model trains multiple times for multi-label
        samples), so only the smallest distance per ID is kept.

        @param query      (str)   Text to look up.
        @param preprocess (bool)  When truthy, tokenize with common-word
                                  filtering, string removal and spell
                                  correction; otherwise tokenize plainly.
        @return           (list)  Two-tuples of sample ID and distance, sorted
                                  closest to farthest from the query.
        @raise IndexError         If the number of distances differs from the
                                  number of entries in self.sampleReference.
        """
        tokenizer = TextPreprocess()
        if preprocess:
            sample = tokenizer.tokenize(
                query,
                ignoreCommon=100,
                removeStrings=["[identifier deleted]"],
                correctSpell=True)
        else:
            sample = tokenizer.tokenize(query)

        allDistances = self.infer(self.encodeSample(sample))

        if len(allDistances) != len(self.sampleReference):
            raise IndexError(
                "Number of prototype distances must match number of "
                "samples trained on.")

        # Single pass: keep the smallest distance seen so far for each ID,
        # rather than rescanning the whole reference list per entry.
        sampleDistances = {}
        for uniqueID, distance in zip(self.sampleReference, allDistances):
            if (uniqueID not in sampleDistances
                    or distance < sampleDistances[uniqueID]):
                sampleDistances[uniqueID] = distance

        return sorted(sampleDistances.items(), key=operator.itemgetter(1))

예제 #2

0

파일 보기

파일: baseline_experiment.py 프로젝트: mihail911/nupic.fluent

def setupData(args):
  """ Reads the labeled data and tokenizes every sample.

  Each sample's label strings are replaced by their integer positions in the
  label reference list, and each sample text is tokenized (with extra
  preprocessing when args.textPreprocess is truthy).

  @param args       (Namespace)     User-provided arguments via the cmd line;
      dataPath, numLabels and textPreprocess are read here.
  @return           (tuple)         Two items: a list of (tokens, label
      indices) pairs, one per sample, and the list of all distinct label
      strings.
  """
  dataDict = readCSV(args.dataPath, 2, args.numLabels)

  # A label's position in this list is its integer reference from here on.
  everyLabel = itertools.chain.from_iterable(dataDict.values())
  labelReference = list(set(everyLabel))

  # Only values of existing keys are replaced, so iterating meanwhile is safe.
  for text, labelStrings in dataDict.iteritems():
    positions = [labelReference.index(name) for name in labelStrings]
    dataDict[text] = numpy.array(positions, dtype="int8")

  texter = TextPreprocess()
  usePreprocessing = bool(args.textPreprocess)

  samples = []
  for text, labelIndices in dataDict.iteritems():
    if usePreprocessing:
      tokens = texter.tokenize(text,
                               ignoreCommon=100,
                               removeStrings=["[identifier deleted]"],
                               correctSpell=True)
    else:
      tokens = texter.tokenize(text)
    samples.append((tokens, labelIndices))

  return samples, labelReference

예제 #3

0

파일 보기

파일: text_preprocess_test.py 프로젝트: JunDeWinter/nupic.fluent

    def testFunctionsWithoutDataFiles(self):
        """
        Check that plain tokenization works when the TextPreprocess object is
        built with placeholder paths for the corpus text, abbreviations and
        contractions files.
        """
        processor = TextPreprocess(
            corpusTxt="fake.txt",
            abbrCSV="not_here.csv",
            contrCSV="not_real.csv")
        sentence = (
            "I can't work at [identifier deleted] if you don't allw me to wfh")

        # Same words as the sentence: lower-cased, punctuation dropped.
        wanted = (
            "i can t work at identifier deleted "
            "if you don t allw me to wfh").split()

        actual = processor.tokenize(sentence)
        self.assertSequenceEqual(actual, wanted)

예제 #4

0

파일 보기

파일: text_preprocess_test.py 프로젝트: JunDeWinter/nupic.fluent

 def testReadExpansionFileWithSuffixes(self):
     """Check readExpansionFile() yields one entry per requested suffix."""
     wanted = {
         "wfh": "work from home",
         "wfhs": "work from homes",
         "wfh's": "work from home's",
     }
     loaded = TextPreprocess().readExpansionFile(
         "abbreviations.csv", ["", "s", "'s"])
     self.assertEqual(loaded, wanted)

예제 #5

0

파일 보기

파일: multi_runner.py 프로젝트: akhilaananthram/nupic.fluent

  def _preprocess(self, preprocess):
    """Tokenize the samples, with or without preprocessing.

    Sets self.samples to a dict mapping each category of self.dataDict to a
    list of (tokens, data[1], idx) triples. When self.testDict is truthy,
    self.testSamples is set to a list of the same kind of triples; otherwise
    self.testSamples is left untouched.

    @param preprocess   (bool)    When truthy, tokenize with common-word
                                  filtering, removal of the
                                  "[identifier deleted]" marker and spell
                                  correction; otherwise tokenize plainly.
    """
    texter = TextPreprocess()

    def tokenize(text):
      # One place for the tokenizer options, so the training and test samples
      # cannot drift apart.
      if preprocess:
        return texter.tokenize(text,
                               ignoreCommon=100,
                               removeStrings=["[identifier deleted]"],
                               correctSpell=True)
      return texter.tokenize(text)

    self.samples = {
        category: [(tokenize(data[0]), data[1], idx)
                   for idx, data in samples.iteritems()]
        for category, samples in self.dataDict.iteritems()}

    if self.testDict:
      self.testSamples = [(tokenize(data[0]), data[1], idx)
                          for idx, data in self.testDict.iteritems()]

예제 #6

0

파일 보기

파일: text_preprocess_test.py 프로젝트: JunDeWinter/nupic.fluent

    def testTokenizeExpandAbbreviation(self):
        """Check that expandAbbr=True replaces "wfh" with its expansion."""
        sentence = (
            "I can't work at [identifier deleted] if you don't allw me to wfh")
        processor = TextPreprocess()

        # The trailing "wfh" is expected to come back as "work from home".
        wanted = (
            "i can t work at identifier deleted "
            "if you don t allw me to work from home").split()

        actual = processor.tokenize(sentence, expandAbbr=True)
        self.assertSequenceEqual(actual, wanted)

예제 #7

0

파일 보기

파일: text_preprocess_test.py 프로젝트: JunDeWinter/nupic.fluent

    def testTokenizeRemoveString(self):
        """Check that a substring listed in removeStrings yields no tokens."""
        sentence = (
            "I can't work at [identifier deleted] if you don't allw me to wfh")
        processor = TextPreprocess()

        # "identifier" and "deleted" must be absent from the result.
        wanted = "i can t work at if you don t allw me to wfh".split()

        actual = processor.tokenize(
            sentence, removeStrings=["[identifier deleted]"])
        self.assertSequenceEqual(actual, wanted)

예제 #8

0

파일 보기

파일: text_preprocess_test.py 프로젝트: lscheinkman/nupic.fluent

  def testTokenizeExpandContraction(self):
    """Check that expandContr=True rewrites "can't" and "don't"."""
    sentence = (
        "I can't work at [identifier deleted] if you don't allw me to wfh")
    processor = TextPreprocess()

    # "can't" -> "can not" and "don't" -> "do not" in the expected tokens.
    wanted = (
        "i can not work at identifier deleted "
        "if you do not allw me to wfh").split()

    actual = processor.tokenize(sentence, expandContr=True)
    self.assertSequenceEqual(actual, wanted)

예제 #9

0

파일 보기

파일: text_preprocess_test.py 프로젝트: lscheinkman/nupic.fluent

  def testTokenizeRemoveString(self):
    """Check that a substring listed in removeStrings yields no tokens."""
    unwanted = "[identifier deleted]"
    sentence = "I can't work at " + unwanted + " if you don't allw me to wfh"
    processor = TextPreprocess()

    wanted = [
        "i", "can", "t", "work", "at",
        "if", "you", "don", "t", "allw", "me", "to", "wfh",
    ]
    actual = processor.tokenize(sentence, removeStrings=[unwanted])
    self.assertSequenceEqual(actual, wanted)

예제 #10

0

파일 보기

파일: text_preprocess_test.py 프로젝트: lscheinkman/nupic.fluent

  def testTokenizeNoPreprocess(self):
    """Check the tokens produced when no preprocessing option is passed."""
    sentence = (
        "I can't work at [identifier deleted] if you don't allw me to wfh")

    # Expected: every word lower-cased, punctuation dropped, nothing expanded
    # or corrected ("allw" and "wfh" stay as written).
    wanted = (
        "i can t work at identifier deleted "
        "if you don t allw me to wfh").split()

    actual = TextPreprocess().tokenize(sentence)
    self.assertSequenceEqual(actual, wanted)

예제 #11

0

파일 보기

파일: text_preprocess_test.py 프로젝트: lscheinkman/nupic.fluent

 def testReadExpansionFileWithSuffixes(self):
   """Check readExpansionFile() yields one entry per requested suffix."""
   suffixes = ["", "s", "'s"]

   # Each suffix is expected on both the abbreviation and its expansion.
   wanted = {}
   for ending in suffixes:
     wanted["wfh" + ending] = "work from home" + ending

   loaded = TextPreprocess().readExpansionFile("abbreviations.csv", suffixes)
   self.assertEqual(loaded, wanted)

예제 #12

0

파일 보기

    def split(self,
              filePath,
              numLabels,
              textPreprocess=False,
              abbrCSV="",
              contrCSV="",
              ignoreCommon=100,
              removeStrings="[identifier deleted]",
              correctSpell=True):
        """
        Split all the comments in a file into tokens. Preprocess if necessary.

        One list of token records is appended to self.records per sample. Each
        record holds the sample's category IDs ("_categories"), its sequence
        number ("_sequenceID"), the token ("_token"), a reset flag ("_reset",
        1 on the first token of a sample and 0 afterwards) and the sample's
        unique ID ("ID").

        @param filePath        (str)    Path to csv file
        @param numLabels       (int)    Number of columns of category labels.
        @param textPreprocess  (bool)   True will preprocess text while
                                        tokenizing.
        @param abbrCSV         (str)    Passed to TextPreprocess; a non-empty
                                        value turns on abbreviation expansion.
        @param contrCSV        (str)    Passed to TextPreprocess; a non-empty
                                        value turns on contraction expansion.
        @param removeStrings   (str or list)  Substring(s) to strip; a single
                                        string is wrapped in a list.

        Please see TextPreprocess tokenize() for the other parameters; they're
        only used when textPreprocess is True.

        @raise Exception                If readCSV() returns None.
        """
        dataDict = readCSV(filePath, numLabels=numLabels)
        if dataDict is None:
            raise Exception("Could not read CSV.")

        # Every other caller hands tokenize() a *list* of substrings. A bare
        # string would presumably be walked character by character, so wrap it.
        if isinstance(removeStrings, str):
            removeStrings = [removeStrings]

        preprocessor = TextPreprocess(abbrCSV=abbrCSV, contrCSV=contrCSV)
        expandAbbr = (abbrCSV != "")
        expandContr = (contrCSV != "")

        for i, uniqueID in enumerate(dataDict):
            comment, categories = dataDict[uniqueID]
            # Convert the categories to a space-separated string of their IDs.
            categories = " ".join(
                [str(self.categoryToId[c]) for c in categories])

            if textPreprocess:
                tokens = preprocessor.tokenize(comment,
                                               ignoreCommon=ignoreCommon,
                                               removeStrings=removeStrings,
                                               correctSpell=correctSpell,
                                               expandAbbr=expandAbbr,
                                               expandContr=expandContr)
            else:
                tokens = preprocessor.tokenize(comment)

            # Build the sequence of data records for this sample; only the
            # first token of the sequence carries the reset flag.
            data = [{"_categories": categories,
                     "_sequenceID": i,
                     "_token": token,
                     "_reset": int(position == 0),
                     "ID": uniqueID}
                    for position, token in enumerate(tokens)]

            self.records.append(data)

예제 #13

0

파일 보기

파일: runner.py 프로젝트: akhilaananthram/nupic.fluent

 def _preprocess(self, preprocess):
   """Fill self.samples with (tokens, data[1]) pairs from self.dataDict.

   The text at data[0] of every entry is tokenized; when `preprocess` is
   truthy the tokenizer also filters common words, strips the
   "[identifier deleted]" marker and corrects spelling.
   """
   texter = TextPreprocess()
   tokenized = []
   for _, entry in self.dataDict.iteritems():
     if preprocess:
       tokens = texter.tokenize(entry[0],
                                ignoreCommon=100,
                                removeStrings=["[identifier deleted]"],
                                correctSpell=True)
     else:
       tokens = texter.tokenize(entry[0])
     tokenized.append((tokens, entry[1]))
   self.samples = tokenized

예제 #14

0

파일 보기

파일: runner.py 프로젝트: lscheinkman/nupic.fluent

 def _preprocess(self, preprocess):
     """Fill self.samples with (tokens, data[1]) pairs from self.dataDict.

     The text at data[0] of every entry is tokenized; when `preprocess` is
     truthy the tokenizer also filters common words, strips the
     "[identifier deleted]" marker and corrects spelling.
     """
     texter = TextPreprocess()

     def toTokens(text):
         # Choose between the full and the plain tokenizer call.
         if not preprocess:
             return texter.tokenize(text)
         return texter.tokenize(text,
                                ignoreCommon=100,
                                removeStrings=["[identifier deleted]"],
                                correctSpell=True)

     self.samples = [(toTokens(entry[0]), entry[1])
                     for _, entry in self.dataDict.iteritems()]

예제 #15

0

파일 보기

파일: text_preprocess_test.py 프로젝트: lscheinkman/nupic.fluent

 def testFunctionsWithoutDataFiles(self):
   """
   Check that plain tokenization works when the TextPreprocess object is
   built with placeholder paths for the corpus text, abbreviations and
   contractions files.
   """
   placeholderPaths = dict(corpusTxt="fake.txt",
                           abbrCSV="not_here.csv",
                           contrCSV="not_real.csv")
   sentence = "I can't work at [identifier deleted] if you don't allw me to wfh"
   processor = TextPreprocess(**placeholderPaths)

   wanted = [
       "i", "can", "t", "work", "at", "identifier", "deleted",
       "if", "you", "don", "t", "allw", "me", "to", "wfh",
   ]
   self.assertSequenceEqual(processor.tokenize(sentence), wanted)

예제 #16

0

파일 보기

파일: classification_model.py 프로젝트: numenta/nupic.fluent

    def prepText(text, preprocess=False):
        """
        Returns a list of the text tokens.

        @param text         (str)     Text to tokenize.
        @param preprocess   (bool)    When truthy, tokenize with common-word
                                      filtering, removal of the
                                      "[identifier deleted]" marker and spell
                                      correction; otherwise tokenize plainly.
        """
        tokenizer = TextPreprocess()
        if not preprocess:
            return tokenizer.tokenize(text)

        return tokenizer.tokenize(
            text,
            ignoreCommon=100,
            removeStrings=["[identifier deleted]"],
            correctSpell=True)

예제 #17

0

파일 보기

파일: cio_encoder.py 프로젝트: lscheinkman/nupic.fluent

    def getUnionEncoding(self, text):
        """
        Encode each token of the input text, take the union, and then sparsify.

        Every token's bitmap is fetched through self.client.getBitmap(), the
        bit positions are tallied across tokens, and self.sparseUnion() picks
        the positions kept in the result.

        @param  text    (str)             A non-tokenized sample of text.
        @return         (dict)            The bitmap encoding is at
                                          encoding["fingerprint"]["positions"].
        """
        # Tally how often each ON bit occurs over all token bitmaps.
        bitCounts = Counter()
        for token in TextPreprocess().tokenize(text):
            fingerprint = self.client.getBitmap(token)["fingerprint"]
            bitCounts.update(fingerprint["positions"])

        unionPositions = self.sparseUnion(bitCounts)

        # Percentage of the self.n bits that are ON in the union.
        sparsity = len(unionPositions) * 100 / float(self.n)

        return {
            "text": text,
            "sparsity": sparsity,
            "df": 0.0,
            "height": self.h,
            "width": self.w,
            "score": 0.0,
            "fingerprint": {"positions": sorted(unionPositions)},
            "pos_types": [],
        }

예제 #18

0

파일 보기

파일: network_data_generator.py 프로젝트: BoltzmannBrain/nupic.fluent

  def split(self, filePath, numLabels, textPreprocess=False, abbrCSV="",
            contrCSV="", ignoreCommon=100, removeStrings="[identifier deleted]",
            correctSpell=True):
    """
    Split all the comments in a file into tokens. Preprocess if necessary.

    One list of token records is appended to self.records per sample. Each
    record holds the sample's category IDs ("_categories"), its sequence
    number ("_sequenceID"), the token ("_token"), a reset flag ("_reset", 1 on
    the first token of a sample and 0 afterwards) and the sample's unique ID
    ("ID").

    @param filePath        (str)    Path to csv file
    @param numLabels       (int)    Number of columns of category labels.
    @param textPreprocess  (bool)   True will preprocess text while tokenizing.
    @param abbrCSV         (str)    Passed to TextPreprocess; a non-empty value
                                    turns on abbreviation expansion.
    @param contrCSV        (str)    Passed to TextPreprocess; a non-empty value
                                    turns on contraction expansion.
    @param removeStrings   (str or list)  Substring(s) to strip; a single
                                    string is wrapped in a list.

    Please see TextPreprocess tokenize() for the other parameters; they're only
    used when textPreprocess is True.

    @raise Exception                If readCSV() returns None.
    """
    dataDict = readCSV(filePath, numLabels=numLabels)
    if dataDict is None:
      raise Exception("Could not read CSV.")

    # Every other caller hands tokenize() a *list* of substrings. A bare string
    # would presumably be walked character by character, so wrap it.
    if isinstance(removeStrings, str):
      removeStrings = [removeStrings]

    preprocessor = TextPreprocess(abbrCSV=abbrCSV, contrCSV=contrCSV)
    expandAbbr = (abbrCSV != "")
    expandContr = (contrCSV != "")

    for i, uniqueID in enumerate(dataDict):
      comment, categories = dataDict[uniqueID]
      # Convert the categories to a space-separated string of their IDs.
      categories = " ".join([str(self.categoryToId[c]) for c in categories])

      if textPreprocess:
        tokens = preprocessor.tokenize(comment,
                                       ignoreCommon=ignoreCommon,
                                       removeStrings=removeStrings,
                                       correctSpell=correctSpell,
                                       expandAbbr=expandAbbr,
                                       expandContr=expandContr)
      else:
        tokens = preprocessor.tokenize(comment)

      # Build the sequence of data records for this sample; only the first
      # token of the sequence carries the reset flag.
      data = [{"_categories": categories,
               "_sequenceID": i,
               "_token": token,
               "_reset": int(position == 0),
               "ID": uniqueID}
              for position, token in enumerate(tokens)]

      self.records.append(data)

예제 #19

0

파일 보기

def setupData(args):
    """ Reads the labeled data and tokenizes every comment.

    Each entry's label strings are replaced by their integer positions in the
    label reference list, and each comment is tokenized (with extra
    preprocessing when args.textPreprocess is truthy). Abbreviation and
    contraction expansion are switched on by non-empty args.abbrCSV and
    args.contrCSV respectively.

    @param args       (Namespace)     User-provided arguments via the cmd line;
        dataPath, numLabels, abbrCSV, contrCSV and textPreprocess are read here.
    @return           (tuple)         Two items: a list of (tokens, label
        indices) pairs, one per entry, and the list of all distinct label
        strings.
    """
    dataDict = readCSV(args.dataPath, 2, args.numLabels)

    # A label's position in this list is its integer reference from here on.
    labelLists = [entry[1] for entry in dataDict.values()]
    labelReference = list(set(itertools.chain.from_iterable(labelLists)))

    # Only values of existing keys are replaced, so iterating meanwhile is safe.
    for idx, (comment, labels) in dataDict.iteritems():
        positions = [labelReference.index(name) for name in labels]
        dataDict[idx] = (comment, numpy.array(positions, dtype="int8"))

    texter = TextPreprocess(abbrCSV=args.abbrCSV, contrCSV=args.contrCSV)
    expandAbbr = (args.abbrCSV != "")
    expandContr = (args.contrCSV != "")
    usePreprocessing = bool(args.textPreprocess)

    samples = []
    for _, entry in dataDict.iteritems():
        if usePreprocessing:
            tokens = texter.tokenize(entry[0],
                                     ignoreCommon=100,
                                     removeStrings=["[identifier deleted]"],
                                     correctSpell=True,
                                     expandAbbr=expandAbbr,
                                     expandContr=expandContr)
        else:
            tokens = texter.tokenize(entry[0])
        samples.append((tokens, entry[1]))

    return samples, labelReference

예제 #20

0

파일 보기

파일: multi_runner.py 프로젝트: numenta/nupic.fluent

    def _preprocess(self, preprocess):
        """Tokenize the samples, with or without preprocessing.

        Sets self.samples to a dict mapping each category of self.dataDict to
        a list of (tokens, data[1], idx) triples. When self.testDict is truthy,
        self.testSamples is set to a list of the same kind of triples;
        otherwise self.testSamples is left untouched.

        @param preprocess   (bool)    When truthy, tokenize with common-word
                                      filtering, removal of the
                                      "[identifier deleted]" marker and spell
                                      correction; otherwise tokenize plainly.
        """
        texter = TextPreprocess()

        def tokenize(text):
            # One place for the tokenizer options, so the training and test
            # samples cannot drift apart.
            if preprocess:
                return texter.tokenize(text,
                                       ignoreCommon=100,
                                       removeStrings=["[identifier deleted]"],
                                       correctSpell=True)
            return texter.tokenize(text)

        self.samples = {
            category: [(tokenize(data[0]), data[1], idx)
                       for idx, data in samples.iteritems()]
            for category, samples in self.dataDict.iteritems()
        }

        if self.testDict:
            self.testSamples = [(tokenize(data[0]), data[1], idx)
                                for idx, data in self.testDict.iteritems()]

예제 #21

0

파일 보기

파일: network_data_generator.py 프로젝트: numenta/nupic.fluent

    def generateSequence(self, text, preprocess=False):
        """
        Return the text formatted by self._formatSequence() as a network data
        sequence. Does not preprocess the text: the `preprocess` argument is
        accepted but not used yet.

        Each call increments self.sequenceCount and uses the new value as the
        sequence number; the category is [-1] and the unique ID is "q".
        """
        # TODO: enable text preprocessing; abstract out the logic in split() into a common method.
        tokenList = TextPreprocess().tokenize(text)
        self.sequenceCount += 1
        return self._formatSequence(tokenList, [-1], self.sequenceCount, "q")

예제 #22

0

파일 보기

def run(args):
  """
  The experiment is configured to run on question response data.

  The runner sets up the data path such that the experiment runs on a single
  data file located in the nupic.fluent/data directory.
  The data path MUST BE SPECIFIED at the cmd line, e.g. from the fluent dir:

  python experiments/random_baseline_runner.py data/sample_reviews/sample_reviews_data_training.csv

  To run k-folds cross validation, arguments must be: kFolds > 1, train = False,
  test = False. To run either training or testing, kFolds = 1.

  @param args   (Namespace)   Command-line arguments. The attributes read here
      are dataFile, resultsDir, expName, modelName, kFolds, train, test, load,
      verbosity and expectationDataPath.
  @raise ValueError   If the data file does not exist, kFolds is not an int
      >= 1, both train and test are set, or train/test is combined with
      kFolds > 1.
  """
  start = time.time()

  # Setup directories.
  # The data file is resolved two directories above this file; the model
  # directory lives under this file's directory and is created if missing.
  root = os.path.dirname(__file__)
  dataPath = os.path.abspath(os.path.join(root, '../..', args.dataFile))
  modelPath = os.path.abspath(
    os.path.join(root, args.resultsDir, args.expName, args.modelName))
  if not os.path.exists(modelPath):
    os.makedirs(modelPath)

  # Verify input params.
  if not os.path.isfile(dataPath):
    raise ValueError("Invalid data path.")
  if (not isinstance(args.kFolds, int)) or (args.kFolds < 1):
    raise ValueError("Invalid value for number of cross-validation folds.")
  if args.train and args.test:
    raise ValueError("Run training and testing independently.")
  if (args.train or args.test) and args.kFolds > 1:
    raise ValueError("Experiment runs either k-folds CV or training/testing, "
                     "not both.")

  # Load or init model.
  # NOTE(review): unpickling executes code from the file; only load model files
  # from a trusted source.
  if args.load:
    with open(
      os.path.join(modelPath, "model.pkl"), "rb") as f:
      model = pkl.load(f)
    print "Model loaded from \'{0}\'.".format(modelPath)
  else:
    model = ClassificationModelRandomSDR(verbosity=args.verbosity)

  # Get and prep data.
  texter = TextPreprocess()
  samples, labels = readCSV(dataPath, 2, [3])  # Y data, [3] -> range(3,6)
  # Labels become integer indices into labelReference.
  labelReference = list(set(labels))
  labels = numpy.array([labelReference.index(l) for l in labels], dtype=int)
  # Fold size. With the Python 2 integer division used by this file, samples
  # beyond kFolds * split never land in an evaluation fold.
  split = len(samples)/args.kFolds
  samples = [texter.tokenize(sample,
                             ignoreCommon=100,
                             removeStrings=["[identifier deleted]"],
                             correctSpell=True)
             for sample in samples]
  if args.verbosity > 1:
    for i, s in enumerate(samples): print i, s, labelReference[labels[i]]
  # One list of encoded token patterns per sample.
  patterns = [[model.encodePattern(t) for t in tokens] for tokens in samples]

  # Either we train on all the data, test on all the data, or run k-fold CV.
  # With kFolds == 1 and neither train nor test set, none of the branches run.
  if args.train:
    training(model,
      [(p, labels[i]) for i, p in enumerate(patterns)])
  elif args.test:
    # The test results are assigned here but not used later in this function.
    trialResults = testing(model,
      [(p, labels[i]) for i, p in enumerate(patterns)])
  elif args.kFolds>1:
    intermResults = []
    predictions = []
    for k in range(args.kFolds):
      # Train the model on a subset, and hold the evaluation subset.
      model.resetModel()
      # Fold k evaluates on one contiguous slice and trains on everything else.
      evalIndices = range(k*split, (k+1)*split)
      trainIndices = [i for i in range(len(samples)) if not i in evalIndices]

      print "Training for CV fold {0}.".format(k)
      training(model,
        [(patterns[i], labels[i]) for i in trainIndices])

      print "Evaluating for trial {0}.".format(k)
      trialResults = testing(model,
        [(patterns[i], labels[i]) for i in evalIndices])

      if args.expectationDataPath:
        # Keep the predicted labels (top prediction only) for later.
        # Falsy (e.g. empty) predictions are replaced by [None] so that the
        # idx[0] lookup below still works; they are reported as '(none)'.
        p = [l if l else [None] for l in trialResults[0]]
        predictions.append([labelReference[idx[0]] if idx[0] != None else '(none)' for idx in p])

      print "Calculating intermediate results for this fold."
      result = model.evaluateTrialResults(
        trialResults, labelReference, evalIndices)
      intermResults.append(result)
      # result[1] is written out as this fold's CSV.
      result[1].to_csv(os.path.join(
        modelPath, "evaluation_fold_" + str(k) + ".csv"))

    print "Calculating cumulative results for {0} trials.".format(args.kFolds)
    results = model.evaluateFinalResults(intermResults)
    results["total_cm"].to_csv(os.path.join(modelPath, "evaluation_totals.csv"))
    if args.expectationDataPath:
      # The per-fold predictions are flattened into one list before comparison.
      computeExpectedAccuracy(list(itertools.chain.from_iterable(predictions)),
        os.path.abspath(os.path.join(root, '../..', args.expectationDataPath)))

  # The steps below run in every mode.
  print "Calculating random classifier results for comparison."
  print model.classifyRandomly(labels)

  print "Saving model to \'{0}\' directory.".format(modelPath)
  with open(
    os.path.join(modelPath, "model.pkl"), "wb") as f:
    pkl.dump(model, f)
  print "Experiment complete in {0:.2f} seconds.".format(time.time() - start)

예제 #23

0

파일 보기

파일: text_preprocess_test.py 프로젝트: JunDeWinter/nupic.fluent

 def testReadExpansionFileNoSuffixes(self):
     """Check readExpansionFile() output when no suffixes are passed."""
     wanted = dict(wfh="work from home")
     loaded = TextPreprocess().readExpansionFile("abbreviations.csv")
     self.assertEqual(loaded, wanted)

예제 #24

0

파일 보기

파일: text_preprocess_test.py 프로젝트: lscheinkman/nupic.fluent

 def testReadExpansionFileNoSuffixes(self):
   """Check readExpansionFile() output when no suffixes are passed."""
   csvName = "abbreviations.csv"
   # Without suffixes only the bare abbreviation is expected.
   wanted = {"wfh": "work from home"}
   self.assertEqual(TextPreprocess().readExpansionFile(csvName), wanted)

예제 #25

0

파일 보기

파일: baseline_runner.py 프로젝트: luisandresilva/nupic.fluent

def run(args):
  """
  The experiment is configured to run on question response data.

  To run k-folds cross validation, arguments must be: kFolds > 1, train = False,
  test = False. To run either training or testing, kFolds = 1.

  @param args   (Namespace)   Command-line arguments. The attributes read here
      are dataFile, resultsDir, expName, modelName, modelModuleName, kFolds,
      train, test, load, verbosity and expectationDataPath.
  @raise ValueError     If the data file does not exist, kFolds is not an int
      >= 1, both train and test are set, or train/test is combined with
      kFolds > 1.
  @raise RuntimeError   If importing the model module raises ImportError.
  """
  start = time.time()

  # Setup directories.
  # The data file is resolved two directories above this file; the model
  # directory lives under this file's directory and is created if missing.
  root = os.path.dirname(__file__)
  dataPath = os.path.abspath(os.path.join(root, '../..', args.dataFile))
  modelPath = os.path.abspath(
    os.path.join(root, args.resultsDir, args.expName, args.modelName))
  if not os.path.exists(modelPath):
    os.makedirs(modelPath)

  # Verify input params.
  if not os.path.isfile(dataPath):
    raise ValueError("Invalid data path.")
  if (not isinstance(args.kFolds, int)) or (args.kFolds < 1):
    raise ValueError("Invalid value for number of cross-validation folds.")
  if args.train and args.test:
    raise ValueError("Run training and testing independently.")
  if (args.train or args.test) and args.kFolds > 1:
    raise ValueError("Experiment runs either k-folds CV or training/testing, "
                     "not both.")

  # Load or init model.
  # NOTE(review): unpickling executes code from the file; only load model files
  # from a trusted source.
  if args.load:
    with open(
      os.path.join(modelPath, "model.pkl"), "rb") as f:
      model = pkl.load(f)
    print "Model loaded from \'{0}\'.".format(modelPath)
  else:
    # The model class is looked up by name in the module named on the command
    # line. Only ImportError is translated to RuntimeError; an AttributeError
    # from getattr() (class missing from the module) propagates unchanged.
    try:
      module = __import__(args.modelModuleName, {}, {}, args.modelName)
      modelClass = getattr(module, args.modelName)
      model = modelClass(verbosity=args.verbosity)
    except ImportError:
      raise RuntimeError("Could not find model class \'%s\' to import."
                         % args.modelName)

  print "Reading in data and preprocessing."
  texter = TextPreprocess()
  samples, labels = readCSV(dataPath, 2, [3])  # Y data, [3] -> range(3,6)
  # Labels become integer indices into labelReference.
  # NOTE(review): int8 tops out at 127; confirm the label count stays below it.
  labelReference = list(set(labels))
  labels = numpy.array([labelReference.index(l) for l in labels], dtype="int8")
  samples = [texter.tokenize(sample,
                             ignoreCommon=100,
                             removeStrings=["[identifier deleted]"],
                             correctSpell=True)
             for sample in samples]
  if args.verbosity > 1:
    for i, s in enumerate(samples): print i, s, labelReference[labels[i]]
  # One encoded pattern per sample (the whole token list is encoded at once).
  patterns = [model.encodePattern(s) for s in samples]

  # Either we train on all the data, test on all the data, or run k-fold CV.
  # With kFolds == 1 and neither train nor test set, none of the branches run.
  if args.train:
    training(model, [(p, labels[i]) for i, p in enumerate(patterns)])
  elif args.test:
    results = testing(model, [(p, labels[i]) for i, p in enumerate(patterns)])
    # Test results cover every sample and are written to test_results.csv.
    calculateTrialResults(model, results, labelReference, xrange(len(samples)),
      os.path.join(modelPath, "test_results.csv"))
  elif args.kFolds>1:
    # Run k-folds cross validation -- train the model on a subset, and evaluate
    # on the remaining subset.
    partitions = KFolds(args.kFolds).split(xrange(len(samples)))
    intermResults = []
    predictions = []
    for k in xrange(args.kFolds):
      print "Training and testing for CV fold {0}.".format(k)
      trialResults = runExperiment(model, patterns, labels, partitions[k])

      if args.expectationDataPath:
        # Keep the predicted labels (top prediction only) for later.
        # Falsy (e.g. empty) predictions are replaced by [None] so that the
        # idx[0] lookup below still works; they are reported as '(none)'.
        p = [l if l else [None] for l in trialResults[0]]
        predictions.append(
          [labelReference[idx[0]] if idx[0] != None else '(none)' for idx in p])

      print "Calculating intermediate results for this fold. Writing to CSV."
      # partitions[k][1] is presumably the fold's evaluation indices -- TODO
      # confirm against KFolds.split().
      intermResults.append(calculateTrialResults(model,
        trialResults, labelReference, partitions[k][1],
        os.path.join(modelPath, "evaluation_fold_" + str(k) + ".csv")))

    print "Calculating cumulative results for {0} trials.".format(args.kFolds)
    results = model.evaluateFinalResults(intermResults)
    results["total_cm"].to_csv(os.path.join(modelPath, "evaluation_totals.csv"))
    if args.expectationDataPath:
      # The per-fold predictions are flattened into one list before comparison.
      computeExpectedAccuracy(list(itertools.chain.from_iterable(predictions)),
        os.path.abspath(os.path.join(root, '../..', args.expectationDataPath)))

  # The steps below run in every mode.
  print "Calculating random classifier results for comparison."
  print model.classifyRandomly(labels)

  print "Saving model to \'{0}\' directory.".format(modelPath)
  with open(
    os.path.join(modelPath, "model.pkl"), "wb") as f:
    pkl.dump(model, f)
  print "Experiment complete in {0:.2f} seconds.".format(time.time() - start)