Example #1
def setUp(self):
    # create a dummy ClassifierModel object
    self.classifier_model = ClassifierModel.objects.create(
        version=1,
        data=b"This is just dummy data. PLEASE, DON'T UNPICKLE THIS !!",
        name="Dummy model")
    self.classifier_class = SKNaiveBayesClassifier
    self.data = get_processed_data(csv_path)
Example #2
import pickle

# ClassifierModel, SKNaiveBayesClassifier and create_train_test_data are
# assumed to be importable from the surrounding project.
def create_classifier_model(
        version,
        csv_path,
        classifier_class=SKNaiveBayesClassifier,
        confusion_matrix=True
        ):
    """
    Create a new classifier object to save to the database

    Parameters
    ----------
    @classifier_class : Classifier class to use to creat model
    @data : labeled data list [(text, classification), ...]
    @version : version of the classifier model
    """

    # check if version already exists
    try:
        ClassifierModel.objects.get(version=version)
    except ClassifierModel.DoesNotExist:
        pass
    else:
        raise Exception("Classifier version {} already exists".format(version))

    from helpers.deep import get_processed_data
    data = get_processed_data(csv_path)

    # get train, test data
    train, test = create_train_test_data(data)

    classifier = classifier_class.new(train)
    accuracy = classifier.get_accuracy(test)

    if confusion_matrix:
        classifier.calculate_confusion_matrix(test)

    pickle_data = pickle.dumps(classifier)

    modelobj = ClassifierModel(
        data=pickle_data,
        accuracy=accuracy,
        version=version,
        name=classifier_class.__name__
    )
    testfilename = 'test_data_v-{}.pkl'.format(version)
    filepath = 'model_test_datas/{}'.format(testfilename)
    with open(filepath, 'wb') as f:
        f.write(pickle.dumps(test))
    modelobj.test_file_path = filepath

    return modelobj
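A minimal usage sketch (hypothetical: the version number and CSV path are made up; note that the function writes the test split under model_test_datas/, which must already exist, and returns an unsaved instance, so the caller persists it):

model = create_classifier_model(2, 'path/to/labeled_data.csv')
model.save()  # persist the pickled classifier and its metadata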
Example #3
def main(*args, **kwargs):
    if not kwargs.get('model_version'):
        print("Version not provided. Provide it as --modelversion <version>")
        return
    csv_path = kwargs.get('path', '_playground/sample_data/processed_new_data.csv')
    # TODO: check for model name
    version = kwargs['model_version']

    from helpers.deep import get_processed_data

    # get data
    data = get_processed_data(csv_path)
    classifier_model = create_and_save_classifier_model(version, data)
    print('Classifier {} created successfully with test data'.format(
        classifier_model
    ))
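A hypothetical driver for this entry point (an assumption; the project presumably has its own command wrapper, and argparse is used here only for illustration):

import argparse

# Hypothetical: turn command-line flags into the kwargs main() expects.
parser = argparse.ArgumentParser()
parser.add_argument('--model_version')
parser.add_argument(
    '--path', default='_playground/sample_data/processed_new_data.csv')
args = parser.parse_args()
main(model_version=args.model_version, path=args.path)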
Example #4
def _get_model(self, version):
    # first create classifier (create_classifier_model loads the CSV itself,
    # so it is given the path rather than pre-processed data)
    csv_path = 'fixtures/processed_data_for_testing.csv'
    return create_classifier_model(version, csv_path)
Example #5
def setUp(self):
    self.test_data = get_processed_data(
        'fixtures/processed_data_for_testing.csv')
    self.train, self.test = create_train_test_data(self.test_data)
    self.classifier = SKNaiveBayesClassifier.new(self.train)
    self.classifier.calculate_confusion_matrix(self.test)
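A test method that could follow this setUp (hypothetical: the original tests are not shown; get_accuracy is taken from the create_classifier_model example above):

def test_accuracy_is_a_ratio(self):
    # hypothetical check: accuracy on the held-out set is a value in [0, 1]
    accuracy = self.classifier.get_accuracy(self.test)
    assert 0.0 <= accuracy <= 1.0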
Example #6
import logging
import random

# logfilepath and get_processed_data are assumed to be defined/imported
# earlier in the script
logger = logging.getLogger('myapp')
hdlr = logging.FileHandler(logfilepath)
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr)
# INFO, so that the logger.info() progress messages below are actually written
logger.setLevel(logging.INFO)


num_accuracy = []

try:
    logger.info('.. GETTING DEEP DATA\n')
    print('.. GETTING DEEP DATA\n')
    deepdata = get_processed_data(
        '_playground/sample_data/processed_sectors_subsectors.csv')
    logger.info('.. SHUFFLING DEEP DATA\n')
    print('.. SHUFFLING DEEP DATA\n')
    random.shuffle(deepdata)

    total = len(deepdata)

    logger.info('.. INITIALIZING DATASETSIZE TO 500\n')
    print('.. INITIALIZING DATASETSIZE TO 500\n')
    dataset_num = 500
    logger.info('.. SETTING SIZE INCREMENT TO 150\n')
    print('.. SETTING SIZE INCREMENT TO 150\n')
    increment = 150

    # first create dir to store accuracy vs size data
    logger.info('.. CREATING DIRECTORY `DEEP_DATA` FOR STORING DATA\n')
Example #7
def test_create_train_test_data():
    data = get_processed_data(csv_path)
    train, test = create_train_test_data(data)
    assert len(test) == int(len(data) / 4)
    assert len(train) == int(3 * len(data) / 4)
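This test pins down the split sizes; here is a sketch of create_train_test_data consistent with it (an assumption; the real helpers implementation may also shuffle):

def create_train_test_data(data):
    # Hypothetical sketch matching the sizes asserted above: the first
    # quarter is the test set and the next three quarters the training set
    # (a few trailing items are dropped when len(data) % 4 != 0).
    one_fourth = int(len(data) / 4)
    three_fourths = int(3 * len(data) / 4)
    return data[one_fourth:one_fourth + three_fourths], data[:one_fourth]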
Example #8
def test_get_processed_data():
    data = get_processed_data(csv_path)
    assert type(data) == list, "The resulting data should be a list"
    assert type(data[0]) == tuple, "Should be a tuple"
    assert len(data[0]) == 2, "Tuple size should be 2"
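The assertions imply get_processed_data yields a list of 2-tuples; a hypothetical minimal stand-in (the real helpers.deep version presumably does text preprocessing as well):

import csv

def get_processed_data(csv_path):
    # Hypothetical sketch: read (text, label) pairs from a two-column CSV,
    # matching the shape asserted in the test above.
    with open(csv_path) as f:
        return [(row[0], row[1]) for row in csv.reader(f)]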
Example #9
import datetime
import os
import random

import matplotlib.pyplot as plt

# logger, CLASSIFIER, get_processed_data and the num_accuracy list are
# assumed to be defined at module level, as in the logging example above.
def main(*args, **kwargs):

    try:
        logger.info('.. GETTING DEEP DATA\n')
        print('.. GETTING DEEP DATA\n')
        deepdata = get_processed_data(
            '_playground/sample_data/processed_sectors_subsectors.csv')
        logger.info('.. SHUFFLING DEEP DATA\n')
        print('.. SHUFFLING DEEP DATA\n')
        random.shuffle(deepdata)

        total = len(deepdata)

        logger.info('.. INITIALIZING DATASETSIZE TO 500\n')
        print('.. INITIALIZING DATASETSIZE TO 500\n')
        dataset_num = 500
        logger.info('.. SETTING SIZE INCREMENT TO 150\n')
        print('.. SETTING SIZE INCREMENT TO 150\n')
        increment = 150

        # first create dir to store accuracy vs size data
        logger.info('.. CREATING DIRECTORY `DEEP_DATA` FOR STORING DATA\n')
        print('.. CREATING DIRECTORY `DEEP_DATA` FOR STORING DATA\n')
        dirpath = os.path.join(os.path.expanduser('~'), 'data_DEEPL')
        os.makedirs(dirpath, exist_ok=True)  # portable equivalent of mkdir -p

        filepath = os.path.join(dirpath, 'accuracy_vs_size.txt')
        logger.info('.. RUNNING LOOP')
        print('.. RUNNING LOOP')
        sectors_accuracies = {}
        while dataset_num <= total:
            random.shuffle(deepdata)
            one_fourth = int(dataset_num / 4.0)
            train = deepdata[:dataset_num][one_fourth:]
            test = deepdata[:dataset_num][:one_fourth]
            logger.info('.. dataset_num:{}\n'.format(dataset_num))
            classifier = CLASSIFIER.new(train)
            classifier.calculate_confusion_matrix(test)

            # calculate accuracy for other
            indices = classifier.confusion_matrix._indices
            matrix = classifier.confusion_matrix._confusion
            if not sectors_accuracies:
                sectors_accuracies = {k: [] for k, v in indices.items()}
            for k, v in indices.items():
                # use a distinct name here: the outer `total` is the dataset
                # size that drives the while loop and must not be clobbered
                row_total = sum(matrix[v])
                correct = matrix[v][v]
                sectors_accuracies[k].append(
                    [dataset_num, correct / float(row_total)])

            accuracy = classifier.get_accuracy(test)
            num_accuracy.append((dataset_num, accuracy))
            logger.info('.. accuracy: {}\n'.format(accuracy))
            print('.. accuracy: {}\n'.format(accuracy))

            dataset_num += increment
        # now plot
        data = num_accuracy
        x = list(map(lambda x: x[0], data))
        y = list(map(lambda x: x[1], data))

        print("$$$$$$$$$$$$$$$$$$$")
        print(data)
        print("$$$$$$$$$$$$$$$$$$$")
        print(sectors_accuracies)
        print("$$$$$$$$$$$$$$$$$$$")

        fig = plt.figure(figsize=(15, 8))
        plt.xticks([x for x in range(500, 28000, 1500)])
        plt.xlabel('# of TRAINING SETS')
        plt.ylabel('ACCURACY')
        plt.grid(True)
        plt.plot(x, y, 'k')
        plt.savefig(str(datetime.datetime.now()) + ".png")

        logger.info('.. DONE!!!')
    except Exception as e:
        import traceback
        logger.info(traceback.format_exc())
        print(traceback.format_exc())
        logger.info('\n')