def select_matching(texts, labels, words, threshold=1):
    """Keep only the (label, text) pairs whose text contains at least
    `threshold` distinct words from `words`, and save them to CSV."""
    selected_content = []
    for text, label in zip(texts, labels):
        sent_list = text.split(' ')
        # count how many times each target word appears in this text
        word_count = np.array([sent_list.count(w) for w in words])
        # number of distinct target words that occur at least once
        occur_times = (word_count >= 1).sum()
        if occur_times >= threshold:
            print(label, text, np.array(words)[word_count >= 1])
            selected_content.append((label, text))
    csv_save(selected_content,
             './data/traindata/Sentiment140/pre-processed/anew_part_of_nostem_160000.csv')
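
A minimal usage sketch, assuming the project's load_processed_data and csv_save helpers are available; the word list shown here is purely illustrative, not the actual ANEW lexicon referenced by the output filename:

# Hypothetical usage sketch; the word list below is illustrative only.
texts, labels = load_processed_data(data_type='train', stem=False)
anew_words = ['happy', 'sad', 'angry', 'love', 'hate']
select_matching(texts, labels, anew_words, threshold=1)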
def preprocess_tweeets(tweets_list, tweets_labels, filename):
    """Drop non-ASCII (non-English) tweets, run the remaining ones through
    `preprocessor`, and save the (label, text) pairs under the OS-specific
    data directory."""
    def isEnglish(s):
        try:
            s.encode('ascii')
        except UnicodeEncodeError:
            return False
        else:
            return True

    processed_texts = []
    for line, l in zip(tweets_list, tweets_labels):
        if isEnglish(line):
            processed_texts.append((l, preprocessor(line)))

    os_name = get_os_name()
    if os_name == 'windows':
        file_dir = 'C:/Corpus/'
    elif os_name == 'ubuntu':
        file_dir = '/home/hs/Data/'
    else:
        return
    csv_save(processed_texts, file_dir + filename)
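
A hedged usage sketch, assuming the raw Sentiment140 tweets and labels have already been loaded; the load_raw_tweets helper and the output filename below are hypothetical, not part of this project:

# Hypothetical usage sketch; load_raw_tweets is an assumed helper.
raw_tweets, raw_labels = load_raw_tweets()
preprocess_tweeets(raw_tweets, raw_labels, 'sentiment140_preprocessed.csv')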
Example #5
def evaluate_lenet5(learning_rate=0.1, n_epoches=200,
                    nkerns=[20, 50], batch_size=500):
    # load data from dataset
    logging.info('... loading data')
    from load_data import load_qrcode
    from sklearn.cross_validation import train_test_split

    def upToInt(array):
        """Binarize the feature matrix: any positive entry becomes 1."""
        array = np.mat(array)
        m, n = np.shape(array)
        newArray = np.zeros((m, n))
        for i in range(m):
            for j in range(n):
                if array[i, j] > 0:
                    newArray[i, j] = 1
        return newArray

    features, labels = load_qrcode()
    features = upToInt(features)
    # train_test_split returns (train, test) splits per input array, in that order
    trainData, testData, trainLabel, testY = train_test_split(features, labels, test_size=0.2)
    # trainData, trainLabel = util.load_total_data()
    # testData = util.loadTestData()

    train_set_x = theano.shared(np.asarray(trainData,
                                           dtype=theano.config.floatX),
                                borrow=True)

    train_set_y = theano.shared(np.asarray(trainLabel,
                                           dtype=theano.config.floatX),
                                borrow=True)

    test_set_x = theano.shared(np.asarray(testData,
                                          dtype=theano.config.floatX),
                               borrow=True)

    train_set_y = T.cast(train_set_y, 'int32')
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = train_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]

    n_train_batches //= batch_size
    n_valid_batches //= batch_size
    n_test_batches //= batch_size

    rng = np.random.RandomState(23455)

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # start-snippet-1
    x = T.matrix('x')
    y = T.ivector('y')

    logging.info('... building the model')
    layer0_input = x.reshape((batch_size, 1, 28, 28))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24)
    # maxpooling reduces this further to (24/2, 24/2) = (12, 12)
    # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12)
    layer0 = LeNetConvPoolLayer(
        rng,
        input=layer0_input,
        image_shape=(batch_size, 1, 28, 28),
        filter_shape=(nkerns[0], 1, 5, 5),
        poolsize=(2, 2)
    )

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8)
    # maxpooling reduces this further to (8/2, 8/2) = (4, 4)
    # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4)
    layer1 = LeNetConvPoolLayer(
        rng,
        input=layer0.output,
        image_shape=(batch_size, nkerns[0], 12, 12),
        filter_shape=(nkerns[1], nkerns[0], 5, 5),
        poolsize=(2, 2)
    )

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4),
    # or (500, 50 * 4 * 4) = (500, 800) with the default values.
    layer2_input = layer1.output.flatten(2)

    # construct a fully-connected sigmoidal layer
    layer2 = HiddenLayer(
        rng,
        input=layer2_input,
        n_in=nkerns[1] * 4 * 4,
        n_out=500,
        activation=T.tanh
    )

    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)

    # create a function to compute the mistakes that are made by the model
    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size],
        }
    )

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params + layer0.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by
    # SGD Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i], grads[i]) pairs.
    updates = [
        (param_i, param_i - learning_rate * grad_i)
        for param_i, grad_i in zip(params, grads)
        ]

    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    validation_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    logging.info('... training')
    patience = 10000
    patience_increase = 2
    improvement_threshold = 0.995

    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = np.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epoches) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if iter % 100 == 0:
                logging.info('training @ iter = %d' % (iter))
            cost_ij = train_model(minibatch_index)

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [validation_model(i) for i
                                     in range(n_valid_batches)]
                this_validation_loss = np.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                if (this_validation_loss * 100.) < 0.001:
                    done_looping = True
                    break

    end_time = time.clock()
    logging.info('The code for file ' +
                 os.path.split(__file__)[1] +
                 ' ran for %.2fm' % ((end_time - start_time) / 60.))

    # make a prediction and save file
    # make a prediction
    predict_model = theano.function(
        inputs=[index],
        outputs=layer3.predict(),
        givens={
            x: test_set_x[index * batch_size: (index + 1) * batch_size]
        }
    )

    # save the result file
    testLabel = np.array([])
    for test_index in range(n_test_batches):
        tempLabel = predict_model(test_index)
        testLabel = np.hstack((testLabel, tempLabel))
    from save_data import csv_save

    csv_save(testLabel, './data/cnn_result.csv')
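
A minimal sketch of how this training routine might be invoked; the logging.basicConfig call is an assumption (the original file may configure logging elsewhere):

# Illustrative invocation sketch (assumes the Theano layers and helpers above are importable).
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    evaluate_lenet5(learning_rate=0.1, n_epoches=200, nkerns=[20, 50], batch_size=500)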
Example #7
from load_data import load_processed_data
from qrcode_generator import to_qrcode
from save_data import csv_save
import numpy as np

# Encode each pre-processed text as a QR-code image, binarize its pixels,
# and store each row as [label, pixel_0, pixel_1, ...] for later training.
texts, labels = load_processed_data(data_type='train', stem=False)
feature_vec = []
for text, label in zip(texts, labels):
    text_qrcode = to_qrcode(text)
    text_qrcode = np.array(list(text_qrcode.getdata()))
    text_qrcode[text_qrcode > 0] = 1  # binarize pixel values
    feature_vec.append(np.append(label, text_qrcode))

csv_save(feature_vec, './data/traindata/qrcode_20000.csv')
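
For completeness, a hedged sketch of how a load_qrcode-style reader could pull this CSV back apart into features and labels for evaluate_lenet5 above; the column layout is inferred from the script above, and the use of pandas and the function name are assumptions, not the project's actual load_qrcode:

# Hypothetical sketch of a reader for the CSV written above.
import numpy as np
import pandas as pd

def load_qrcode_sketch(path='./data/traindata/qrcode_20000.csv'):
    data = pd.read_csv(path, header=None).values
    labels = data[:, 0]      # first column holds the label
    features = data[:, 1:]   # remaining columns hold the binarized QR pixels
    return features, labels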