def main():
    """Preprocess the data, train or load a classifier, then evaluate it.

    Flags are read from ``sys.argv``. A flag is switched on when a single
    argument contains both the flag name and the substring ``yes``
    (for example ``--retrain=yes``):

    * ``--retrain``: train a new classifier even if a saved one exists.
    * ``--rebuildValidation``: parsed, but only consumed by the disabled
      ``organizeTrainWithValidation`` path below; currently has no effect.
    * ``--test``: evaluate on ``data/test/`` instead of ``data/validation/``.

    A newly trained classifier is saved as ``models/trainedModel<millis>``.
    """
    # process flags
    args = sys.argv[1:]
    do_retrain = any("--retrain" in arg and "yes" in arg for arg in args)
    do_rebuildValidation = any(
        "--rebuildValidation" in arg and "yes" in arg for arg in args)
    do_test = any("--test" in arg and "yes" in arg for arg in args)

    # preprocessing
    do = DataOrganizer()

    # __________________________________ TRAINING ________________________ #

    # use BoG to convert to frequency vector
    fe = FeatureExtractor(FeatureExtractor.ModelType.BagOfClusters)

    # Get the latest trained model. os.listdir() returns entries in arbitrary
    # order, so sort first: models are saved as "trainedModel<millis>", which
    # makes the lexicographically greatest name the most recent one.
    # A missing directory is treated the same as "no saved model yet".
    models_dir = "models/"
    if os.path.isdir(models_dir):
        filenames = sorted(os.listdir(models_dir))
    else:
        filenames = []
    clf_file = models_dir + filenames[-1] if filenames else None

    # get sets of tweets as training data
    # trainData0, trainData1, validation0, validation1 \
    #     = do.organizeTrainWithValidation("data/trainValidate/", do_rebuildValidation)
    trainData0, trainData1 = do.organizeTrain("data/train/")

    if do_retrain or not clf_file:
        # Train a fresh classifier on the two training sets.
        X0, X1 = fe.extractTrainFeatureVectors((trainData0, trainData1))
        clf = learn(X0, X1)

        # Timestamp the file name so that later models sort after earlier ones.
        millis = int(round(time.time() * 1000))
        clf_file = models_dir + "trainedModel" + str(millis)
        # Parenthesised single-string print behaves the same on Python 2 and 3.
        print("Saving model to file...")
        if not os.path.isdir(models_dir):
            os.makedirs(models_dir)
        joblib.dump(clf, clf_file, compress=1)
    else:
        print("Using trained model and BoG...")
        fe.bog = BagOfWords()
        fe.bog.getLatestBoG()
        clf = joblib.load(clf_file)

    # we're either validating or testing based on the passed flag
    if not do_test:
        # ____________________________________VALIDATION__________________________#
        # feed in the validation sets as one set
        # NOTE(review): here the labels are unpacked from
        # extractTestFeatureVectors(), while the testing branch unpacks them
        # from organizeTest() -- confirm both call shapes are really supported.
        validationData = do.organizeTest("data/validation/")
        validationFeatures, validationLabels = fe.extractTestFeatureVectors(
            validationData)
        test("Validation", clf, validationFeatures, validationLabels)
    else:
        # ____________________________________TESTING _______________________ #
        # extract test features and test
        print("Using testing")
        testData, testLabels = do.organizeTest("data/test/")
        testFeatures = fe.extractTestFeatureVectors(testData)
        test("Testing, Global Protests With Background Subtraction",
             clf, testFeatures, testLabels)