Example #1
def main():
    # load the tweet objects
    tweets = pickle.load(open('tweets.p', 'rb'))

    # preprocess each tweet's text
    preprocess(tweets)

    # perform 3-fold cross-validation to get a more reliable accuracy estimate
    kf = KFold(n=len(tweets), n_folds=3, shuffle=True)
    for train_indices, test_indices in kf:
        tweets_train = [tweets[i] for i in train_indices]
        tweets_test = [tweets[i] for i in test_indices]

        clf = BayesSentimentClassifier()
        clf.train(tweets_train)  # train the classifier, i.e. populate the sentiment dictionary
        clf.predict(tweets_test)  # predict sentiment of each tweet using Bayes Theorem

        # calculate accuracy
        print(accuracy_score(tweets_test))

        test = Tweet("", "", "I am skeptical about this result",
                     "Tue Oct 18 18:05:50 +0000 2011", "")
        clf.predict([test])
        print(test.prediction.sentiment)
        print(test.prediction.probabilities)
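The KFold(n=..., n_folds=...) call above is the older sklearn.cross_validation interface. A minimal sketch of the same loop against the current sklearn.model_selection API, with the project-specific classifier and accuracy_score helper kept as-is:

from sklearn.model_selection import KFold

kf = KFold(n_splits=3, shuffle=True)
for train_indices, test_indices in kf.split(tweets):
    tweets_train = [tweets[i] for i in train_indices]
    tweets_test = [tweets[i] for i in test_indices]
    clf = BayesSentimentClassifier()  # project-specific classifier, as above
    clf.train(tweets_train)
    clf.predict(tweets_test)
    print(accuracy_score(tweets_test))  # project-specific scorer, as above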
Example #2
def process_tweets(rank, input_file, processes):
    ht_occurences = Counter([])
    lang_occurences = Counter([])

    with open(input_file) as f:
        logging.info(f"Process: {rank} | Initiating processing task.")
        try:
            for i, line in enumerate(f):
                line = line.replace(",\n", "")
                if i % processes == rank:
                    try:
                        data = json.loads(line)
                        lang_occurences[data['doc']['lang']] += 1
                        hashtags = [
                            preprocess(tag['text'])
                            for tag in data['doc']['entities']['hashtags']
                        ]
                        for ht in hashtags:
                            ht_occurences[ht] += 1

                    except ValueError:
                        logging.info(
                            f"Process: {rank} | Malformed JSON on line: {i}")
        except Exception:
            logging.error(f"Process: {rank} | Problem reading file.")

    logging.info(f"Process: {rank} | I am done Processing.")

    return ht_occurences, lang_occurences
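The rank/processes pair splits the file round-robin across workers; the project itself appears to use MPI for this (see the master_worker example below), but as an illustration the same merge can be driven by the standard library's multiprocessing, with Counter addition combining the per-worker tallies (file name and process count are placeholders):

from collections import Counter
from multiprocessing import Pool

def run_parallel(input_file, processes=4):
    with Pool(processes) as pool:
        results = pool.starmap(process_tweets,
                               [(rank, input_file, processes) for rank in range(processes)])
    ht_total = sum((ht for ht, _ in results), Counter())
    lang_total = sum((lang for _, lang in results), Counter())
    return ht_total.most_common(10), lang_total.most_common(10)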
Example #3
    def __init__(self, image_path, transform=None):
        self.dataset = np.array([])
        self.label = []
        self.transform = transform

        # Initialize and read data in the specified path
        self._init_dataset(os.path.abspath(image_path))

        # Preprocess data
        self.dataset = preprocess(self.dataset)

        # For data augmentation
        if self.transform is not None:
            for i in range(len(self.dataset)):
                self.dataset[i] = self.transform(
                    Image.fromarray(self.dataset[i]))
        else:
            self.dataset = torch.FloatTensor(
                self.dataset)  # a tensor of shape (n, 36, 128)
        self.label = torch.LongTensor(
            self.label).squeeze()  # a tensor of shape (n, 5)

        if torch.cuda.is_available():
            self.dataset = self.dataset.cuda()
            self.label = self.label.cuda()
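This constructor reads like the __init__ of a PyTorch Dataset subclass; a minimal sketch of the rest of such a class and its typical DataLoader usage, where the class name and directory are assumptions and _init_dataset/preprocess come from the surrounding project:

from torch.utils.data import Dataset, DataLoader

class ImageDataset(Dataset):  # hypothetical name for the class this __init__ belongs to
    # ... the __init__ shown above goes here ...

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        return self.dataset[index], self.label[index]

# loader = DataLoader(ImageDataset('data/train'), batch_size=64, shuffle=True)  # path is illustrative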
Example #4
def main():
    print("Reading training data ...")
    train = data_io.read_train()
    train.fillna(0, inplace=True)

    train_sample = train.fillna(value=0)

    features = ut.preprocess(train_sample)
    target = ut.construct_target(train_sample)
    # target = train_sample["booking_bool"].values
    # save the processed data, which may be useful 
    # to test the performance of our model
    print("Saving processed training data ...")
    data_io.save_processed_data([features, target])

    print("Training the Regressor ...")
    regressor = RandomForestRegressor(n_estimators=10,  # RandomForestClassifier
                                      verbose=2,
                                      n_jobs=-1,
                                      max_features="sqrt",
                                      min_samples_split=10,
                                      random_state=1)
    regressor.fit(features, target)
    
    print("Saving the Regressor ...")
    data_io.save_model(regressor)
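A hypothetical follow-up for scoring: data_io.read_test() below is an assumed counterpart to read_train(), and the same ut.preprocess feature pipeline is reused at prediction time.

test = data_io.read_test()  # assumed helper, mirroring data_io.read_train()
test.fillna(0, inplace=True)
test_features = ut.preprocess(test)
predictions = regressor.predict(test_features)
print(predictions[:10])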
Example #6
def predict(event, context):
    """Makes inference on the passed data."""
    df = utilities.load_dataframe_from_sqs_event(event)
    X = utilities.preprocess(df)

    model = utilities.load_model(MODEL_URI)
    y = model.predict(X)
    results = utilities.postprocess(X, y)

    msg = utilities.SQSMessage()
    msg.dataframe = results
    msg.send(queue=WRITER_QUEUE)
    return {
        "status": "success",
    }
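For a quick local smoke test one might hand the handler a minimal SQS-shaped event; the record body below is purely illustrative and assumes utilities.load_dataframe_from_sqs_event reads the standard Records/body layout:

if __name__ == "__main__":
    fake_event = {
        "Records": [
            {"body": '[{"feature_a": 1.0, "feature_b": 2.0}]'}  # illustrative payload
        ]
    }
    print(predict(fake_event, context=None))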
Example #7
def main():
    model_file_path = "output" + os.sep + "linear_regression_model_mv.sav"

    ignored_columns = ['ZN', 'CHAS', 'NOX', 'RM', 'DIS', 'RAD', 'TAX', 'PIRATIO', 'B', 'LSTAT']
    X, Y = load_data('input' + os.sep + 'housing.csv', False, ignored_columns)

    X = preprocess(X, "normalize")

    X_train, y_train, X_test, y_test = split_dataset(X, Y)

    train(X_train, y_train, model_file_path)
    y_predicted = predict(X_test, model_file_path)

    rmse_ration = calculate_rmse_ration(y_test, y_predicted)

    print("rmse ratio:", rmse_ration)
Example #8
def main(input_path, output_path, ignored_columns, preprocess_type,
         training_data_rate, step_length, threshold_rate, max_loop_num,
         dynamic_step):
    print("input:", input_path)
    print("output:", output_path)
    print("\n")
    if ignored_columns is not None:
        print("ignored_columns:", ignored_columns)
    print("\n")
    print("preprocess_type:", preprocess_type)
    print("training_data_rate:", training_data_rate)
    print("\n")
    print("threshold_rate:", threshold_rate)
    print("max_loop_num:", max_loop_num)
    print("step_length:", step_length)
    if dynamic_step:
        print("dynamic stepping ...")
    else:
        print("static stepping ...")
    print("\n")
    start_time = datetime.now()

    X, Y = load_data(input_path, True, ignored_columns)

    X = preprocess(X, preprocess_type)

    X_train, y_train, X_test, y_test = split_dataset(X, Y, training_data_rate)

    threshold = gen_threshold(Y, threshold_rate)

    train(X_train, y_train, output_path, step_length, threshold, max_loop_num,
          dynamic_step)

    Y_pred = predict(output_path, X_test)

    rmse_ration = calculate_rmse_ration(y_test, Y_pred)
    print("rmse ratio (rmse / y_mean) is:", rmse_ration, "\n")

    end_time = datetime.now()

    execution_duration = end_time - start_time

    print("execution duration:", execution_duration, "\n")

    return
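An illustrative call to this main; every argument value below is a placeholder, since the real project presumably wires these up from the command line:

main(input_path='input' + os.sep + 'housing.csv',
     output_path='output' + os.sep + 'gradient_descent_model.sav',
     ignored_columns=None,
     preprocess_type='normalize',
     training_data_rate=0.8,
     step_length=0.01,
     threshold_rate=0.01,
     max_loop_num=100000,
     dynamic_step=True)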
Example #9
def master_worker(comm, input_file):
    # Read our tweets
    rank = comm.Get_rank()
    size = comm.Get_size()

    if size > 1:
        logging.info(f'Process: {rank} | I am Master!')
        ht_counts, lang_counts = process_tweets(rank, input_file, size)
        ht_temp, lang_temp = gather_tweets(comm)
        logging.info(f"Process: 0 (Master) | Shutting Down slave(s)")
        ht_counts += ht_temp
        lang_counts += lang_temp
        # Turn everything off
        for i in range(size - 1):
            comm.send('exit', dest=(i + 1), tag=(i + 1))

        return ht_counts.most_common(10), lang_counts.most_common(10)

    else:
        logging.info(f'Process: {rank} | I am processing alone!')
        ht_counts = Counter([])
        lang_counts = Counter([])
        with open(input_file) as f:
            logging.info(f"Process: {rank} | Initiating processing task.")
            try:
                for i, line in enumerate(f):
                    line = line.replace(",\n", "")
                    try:
                        data = json.loads(line)
                        lang_counts[data['doc']['lang']] += 1
                        hashtags = [
                            preprocess(tag['text'])
                            for tag in data['doc']['entities']['hashtags']
                        ]
                        for ht in hashtags:
                            ht_counts[ht] += 1
                    except ValueError:
                        logging.info(
                            f"Process: {rank} | Malformed JSON on line: {i}")
            except Exception:
                logging.error(f"Process: {rank} | Problem reading file.")

        logging.info(f"Process: {rank} | I am done Processing.")

        return ht_counts.most_common(10), lang_counts.most_common(10)
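A minimal sketch of the MPI entry point that would drive this, assuming mpi4py; slave_worker stands in for whatever routine the other ranks run in the real project, and the file name is illustrative:

from mpi4py import MPI

if __name__ == "__main__":
    comm = MPI.COMM_WORLD
    if comm.Get_rank() == 0:
        top_hashtags, top_languages = master_worker(comm, "bigTwitter.json")
        print(top_hashtags)
        print(top_languages)
    else:
        slave_worker(comm, "bigTwitter.json")  # hypothetical routine for non-root ranks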
Example #10
def main():

    ignored_columns = [
        'ZN', 'CHAS', 'NOX', 'RM', 'DIS', 'RAD', 'TAX', 'PIRATIO', 'B', 'LSTAT'
    ]
    X, Y = load_data('input' + os.sep + 'housing.csv', True, ignored_columns)

    X = preprocess(X, "normalize")

    X_train, y_train, X_test, y_test = split_dataset(X, Y)

    path = 'output' + os.sep + 'lsm_multivariant.csv'

    lsm(X_train, y_train, path)
    y_predicted = predict(path, X_test)

    rmse_ration = calculate_rmse_ration(y_test, y_predicted)
    print("rmse ratio:", rmse_ration)
    return
Example #11
def cross_vad(examples, num_folds = 10):
    data = ut.dataCrossSplit(examples, num_folds, False)
    errorRates = []
    for i in range(num_folds):
        egs = data[i]
        dt = DTree(SelectAtt)
        dt.training(egs[0], 1)
        # calculate error rate
        error = [0.] * 2
        for j in range(2):
            for x in egs[j]:
                if dt.predict(x) != x[0]:
                    error[j] = error[j] + 1
            error[j] = error[j] / len(egs[j])
        print "Fold ", i, " trainingData errorRate: ", error[0], " testData errorRate:", error[1]
        errorRates.append(error)
    arr = np.array(errorRates)
    print "Train Mean ErrorRate:", np.mean(arr[:,0]), " Test Mean ErrorRate:", np.mean(arr[:,1])
    print "Train StdVar ErrorRate:", np.sqrt(np.var(arr[:,0])), " Test Mean ErrorRate:", np.sqrt(np.var(arr[:, 1]))

if __name__ == '__main__':
    filename = sys.argv[1]
    egs = ut.importRawData(filename)
    egs = ut.preprocess(egs)
    cross_vad(egs)
    dt = DTree(SelectAtt)
    dt.training(egs)
    print "========= the decision tree ============"
    dt.printTree()
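For reference, the standard-deviation bookkeeping at the end of cross_vad can use np.std directly, since np.sqrt(np.var(x)) equals np.std(x) with the default ddof=0; an equivalent version of its last three lines:

    arr = np.array(errorRates)  # shape (num_folds, 2): train and test error per fold
    print("Train Mean ErrorRate:", np.mean(arr[:, 0]), "Test Mean ErrorRate:", np.mean(arr[:, 1]))
    print("Train StdVar ErrorRate:", np.std(arr[:, 0]), "Test StdVar ErrorRate:", np.std(arr[:, 1]))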
Example #12
def find_math_elements(image, arrow_c, bound=20, show=False):
    """Detect math elements in the image.

    Args:
        image: The input image.
        arrow_c: Center of the red arrow, as (x, y).
        bound: Half-size, in pixels, of the crop taken around each element.
        show: Whether to show the results.
    Returns:
        list: Images of all math elements.
        list: Center coordinates of math elements.
    """
    image_original = image.copy()
    image_copy = image.copy()

    # cover red arrow with white rectangle
    cv2.rectangle(image_copy, (arrow_c[0] - 60, arrow_c[1] - 60),
                  (arrow_c[0] + 60, arrow_c[1] + 60), (255, 255, 255), -1)

    value_threshold = int(
        cv2.cvtColor(image, cv2.COLOR_RGB2HSV)[:, :, 2].mean() * 0.9)
    masked = mask_image(image_copy, np.array([0, 0, 0]),
                        np.array([180, 255, value_threshold]),
                        np.array([100, 100, 0]), np.array([140, 255, 255]))
    preprocessed = preprocess(masked, 10, False)

    # get the contours of all shapes
    contours, _ = cv2.findContours(preprocessed, cv2.RETR_LIST,
                                   cv2.CHAIN_APPROX_SIMPLE)

    centers = []
    elements = []

    for i, c in enumerate(contours):
        # compute the centroid of the shapes
        M = cv2.moments(c)

        area = M['m00']
        elongation = compute_elongation(M)
        # these are either too small or too big or too elongated
        if area < 40 or area > 400 or elongation > 3000: continue

        cY = int(M['m01'] / M['m00'])
        cX = int(M['m10'] / M['m00'])
        center = (cX, cY)

        # if it is too close to a known element, it is not a valid element
        too_close = False
        for center_ in centers:
            d = (center_[0] - center[0])**2 + (center_[1] - center[1])**2
            if d < 4000:
                too_close = True
                break
        if too_close: continue

        # save element and center
        element = image[cY - bound:cY + bound, cX - bound:cX + bound]
        element = cv2.resize(element, (28, 28))
        elements.append(element)
        centers.append(center)

        # visualize the result on the image
        label_color = (214, 39, 40)
        cv2.rectangle(image_original, (cX - bound, cY - bound),
                      (cX + bound, cY + bound), label_color, 2)
        cv2.putText(image_original, f'{len(elements) - 1}', (cX, cY + 40),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, label_color, 2)

    if show:
        imshow(image_original)

    return elements, centers
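One portability note: cv2.findContours returns two values in OpenCV 4.x (and 2.4) but three in OpenCV 3.x, so the unpacking above is version-sensitive. A version-agnostic variant, if needed:

result = cv2.findContours(preprocessed, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
contours = result[0] if len(result) == 2 else result[1]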
Example #13
def find_red_arrow(image, show=False):
    """Detect the red arrow in the image.

    Args:
        image: The input image.
        show: Whether to show the results.
    Returns:
        tuple: Tip coordinates.
        tuple: Center coordinates.
    """
    image_copy = image.copy()

    masked = mask_image(image_copy, np.array([0, 100, 0]),
                        np.array([20, 255, 255]), np.array([160, 100, 0]),
                        np.array([180, 255, 255]))
    preprocessed = preprocess(masked, 100)

    contours, _ = cv2.findContours(preprocessed, cv2.RETR_LIST,
                                   cv2.CHAIN_APPROX_SIMPLE)

    for c in contours:
        # compute the centroid of the shapes
        M = cv2.moments(c)

        area = M['m00']
        elongation = compute_elongation(M)
        # these will not be the arrow (too small, too big, or too elongated)
        if area < 1000 or area > 10000 or elongation > 100: continue

        cX = int(M['m10'] / area)
        cY = int(M['m01'] / area)
        center = (cX, cY)

        # Not sure whether this is needed
        # if abs(M['mu20'] - M['mu02']) > 420000: continue

        # find the corners of the arrow
        points = cv2.approxPolyDP(c, 4.7, True).squeeze(1)

        tip_idx = 0
        cand_tips = []
        angles = []

        # find tip candidates
        for i in range(len(points)):
            # get the current point and the surrounding points
            x = points[i - 1] if i != 0 else points[-1]
            y = points[i]
            z = points[i + 1] if i != len(points) - 1 else points[0]
            # get the lengths between the current point and the surrounding points
            l1 = np.linalg.norm(np.array(x) - np.array(y))
            l2 = np.linalg.norm(np.array(y) - np.array(z))

            ang = compute_angle(x, y, z)
            angles.append(ang)
            # save candidates
            if abs(ang - 100) < 15 and (l1 + l2 > 30):
                cand_tips.append(len(angles) - 1)

        # choose the correct tip
        for i in cand_tips:
            pang = angles[i - 1] if i != 0 else angles[-1]
            nang = angles[i + 1] if i != len(angles) - 1 else angles[0]
            if pang + nang < 300 and pang + nang > 200:
                tip_idx = i

        # visualize the result on the image
        cv2.drawContours(image_copy, [c], 0, (214, 39, 40), 2)
        cv2.circle(image_copy, tuple(center), 5, (0, 255, 0), -1)
        cv2.circle(image_copy, tuple(points[tip_idx]), 5, (0, 0, 255), -1)

        break

    if show:
        imshow(image_copy)

    return points[tip_idx], center
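An illustrative way the two detectors fit together; the file name is a placeholder, and the BGR-to-RGB conversion is an assumption based on the COLOR_RGB2HSV calls inside these helpers:

import cv2

image = cv2.cvtColor(cv2.imread("worksheet.png"), cv2.COLOR_BGR2RGB)
tip, arrow_center = find_red_arrow(image, show=False)
elements, centers = find_math_elements(image, arrow_center, bound=20, show=False)
print(f"arrow tip at {tuple(tip)}; found {len(elements)} candidate math elements")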
Example #14
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--load',
                        type=str,
                        help='Checkpoint to load all weights from.')
    parser.add_argument('--load-gen',
                        type=str,
                        help='Checkpoint to load generator weights only from.')
    parser.add_argument('--name', type=str, help='Name of experiment.')
    parser.add_argument('--overfit',
                        action='store_true',
                        help='Overfit to a single image.')
    parser.add_argument('--batch-size',
                        type=int,
                        default=16,
                        help='Mini-batch size.')
    parser.add_argument(
        '--log-freq',
        type=int,
        default=10000,
        help='How many training iterations between validation/checkpoints.')
    parser.add_argument('--learning-rate',
                        type=float,
                        default=1e-4,
                        help='Learning rate for Adam.')
    parser.add_argument('--content-loss',
                        type=str,
                        default='mse',
                        choices=['mse', 'vgg22', 'vgg54'],
                        help='Metric to use for content loss.')
    parser.add_argument(
        '--use-gan',
        action='store_true',
        help='Add an adversarial loss term to the generator and train the discriminator.'
    )
    parser.add_argument('--image-size',
                        type=int,
                        default=96,
                        help='Size of random crops used for training samples.')
    parser.add_argument('--vgg-weights',
                        type=str,
                        default='vgg_19.ckpt',
                        help='File containing VGG19 weights (tf.slim)')
    parser.add_argument('--train-dir',
                        type=str,
                        help='Directory containing training images')
    parser.add_argument(
        '--validate-benchmarks',
        action='store_true',
        help=
        'If set, validates that the benchmarking metrics are correct for the images provided by the authors of the SRGAN paper.'
    )
    parser.add_argument('--gpu',
                        type=str,
                        default='0',
                        help='Which GPU to use')
    parser.add_argument('--epoch',
                        type=int,
                        default=1000000,
                        help='How many training iterations.')

    args = parser.parse_args()
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    # Set up models
    d_training = tf.placeholder(tf.bool, name='d_training')
    g_training = tf.placeholder(tf.bool, name='g_training')
    discriminator = srgan.SRGanDiscriminator(training=g_training,
                                             image_size=args.image_size)
    generator = srgan.SRGanGenerator(discriminator=discriminator,
                                     training=d_training,
                                     learning_rate=args.learning_rate,
                                     content_loss=args.content_loss,
                                     use_gan=args.use_gan)
    # Generator
    g_x = tf.placeholder(tf.float32, [None, None, None, 3],
                         name='input_lowres')
    g_y = tf.placeholder(tf.float32, [None, None, None, 3],
                         name='input_highres')
    g_y_pred = generator.forward(g_x)
    g_loss = generator.loss_function(g_y, g_y_pred)
    g_train_step = generator.optimize(g_loss)
    # Discriminator
    d_x_real = tf.placeholder(tf.float32, [None, None, None, 3],
                              name='input_real')
    d_y_real_pred, d_y_real_pred_logits = discriminator.forward(d_x_real)
    d_y_fake_pred, d_y_fake_pred_logits = discriminator.forward(g_y_pred)
    d_loss = discriminator.loss_function(d_y_real_pred, d_y_fake_pred,
                                         d_y_real_pred_logits,
                                         d_y_fake_pred_logits)
    d_train_step = discriminator.optimize(d_loss)

    # Set up benchmarks
    benchmarks = [
        Benchmark('Benchmarks/Set5', name='Set5'),
        Benchmark('Benchmarks/Set14', name='Set14'),
        Benchmark('Benchmarks/BSD100', name='BSD100')
    ]
    if args.validate_benchmarks:
        for benchmark in benchmarks:
            benchmark.validate()

    # Create log folder
    if args.load and not args.name:
        log_path = os.path.dirname(args.load)
    else:
        log_path = build_log_dir(args, sys.argv)

    with tf.Session() as sess:
        # Build input pipeline
        get_train_batch, get_val_batch, get_eval_batch = input_setup(
            args, sess)
        # Initialize
        sess.run(tf.local_variables_initializer())
        sess.run(tf.global_variables_initializer())
        # Start input pipeline thread(s)
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        # Load saved weights
        iteration = 0
        saver = tf.train.Saver()
        # Load generator
        if args.load_gen:
            gen_saver = tf.train.Saver(
                tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                  scope='generator'))
            iteration = int(args.load_gen.split('-')[-1])
            gen_saver.restore(sess, args.load_gen)
        # Load all
        if args.load:
            iteration = int(args.load.split('-')[-1])
            saver.restore(sess, args.load)
            print(saver)
            print("load_process_DEBUG")
        # Load VGG
        if 'vgg' in args.content_loss:
            vgg_saver = tf.train.Saver(
                tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                  scope='vgg_19'))
            vgg_saver.restore(sess, args.vgg_weights)

        # Train
        while True:
            if iteration % args.log_freq == 0:
                # Test every log-freq iterations
                val_error = evaluate_model(g_loss, get_val_batch, sess, 119,
                                           args.batch_size)
                eval_error = evaluate_model(g_loss, get_eval_batch, sess, 119,
                                            args.batch_size)
                # Log error
                print('[%d] Test: %.7f, Train: %.7f' %
                      (iteration, val_error, eval_error),
                      end='')
                # Evaluate benchmarks
                log_line = ''
                for benchmark in benchmarks:
                    psnr, ssim, _, _ = benchmark.evaluate(
                        sess, g_y_pred, log_path, iteration)
                    print(' [%s] PSNR: %.2f, SSIM: %.4f' %
                          (benchmark.name, psnr, ssim),
                          end='')
                    log_line += ',%.7f, %.7f' % (psnr, ssim)
                print()
                # Write to log
                with open(log_path + '/loss.csv', 'a') as f:
                    f.write('%d, %.15f, %.15f%s\n' %
                            (iteration, val_error, eval_error, log_line))
                # Save checkpoint
                saver.save(sess,
                           os.path.join(log_path, 'weights'),
                           global_step=iteration,
                           write_meta_graph=False)

            # Train discriminator
            if args.use_gan:
                batch_hr = sess.run(get_train_batch)
                batch_lr = downsample_batch(batch_hr, factor=4)
                batch_lr, batch_hr = preprocess(batch_lr, batch_hr)
                sess.run(d_train_step,
                         feed_dict={
                             d_training: True,
                             g_training: True,
                             g_x: batch_lr,
                             g_y: batch_hr,
                             d_x_real: batch_hr
                         })
            # Train generator
            batch_hr = sess.run(get_train_batch)
            batch_lr = downsample_batch(batch_hr, factor=4)
            batch_lr, batch_hr = preprocess(batch_lr, batch_hr)
            sess.run(g_train_step,
                     feed_dict={
                         d_training: True,
                         g_training: True,
                         g_x: batch_lr,
                         g_y: batch_hr
                     })

            iteration += 1

        # Stop queue threads
        coord.request_stop()
        coord.join(threads)
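This script targets TensorFlow 1.x (tf.placeholder, tf.Session, queue runners). Under TensorFlow 2 the usual way to run such graph-mode code largely unchanged is the compat shim below, though pieces that depend on tf.slim/tf.contrib (such as loading the VGG19 weights) would still need separate attention:

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()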
Example #15
from keras import backend as K
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD

K.tensorflow_backend._get_available_gpus()
from matplotlib.pyplot import imshow
import numpy as np
import scipy.io
import matplotlib.pyplot as plt
import utilities

#load our dataset
train_data = scipy.io.loadmat('train_32x32.mat')
test_data = scipy.io.loadmat('test_32x32.mat')

#x_train : 73257x1024 , y_train : 73257 x 11
x_train, y_train = utilities.preprocess(train_data)
x_test, y_test = utilities.preprocess(test_data)

imshow(x_test[4].reshape(32, 32))
plt.show()
print(y_test[4])

#training
model = Sequential()
model.add(Dense(input_dim=32 * 32, units=633, activation='relu'))
#training set performance is not great, so skip dropout for now
#model.add(Dropout(0.5))
model.add(Dense(units=633, activation='relu'))
model.add(Dense(units=633, activation='relu'))
model.add(Dense(units=633, activation='sigmoid'))
model.add(Dense(units=633, activation='sigmoid'))
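The snippet stops after stacking hidden layers; a hypothetical continuation consistent with the 73257 x 11 label shape noted above, where the output width, loss, and hyperparameters are all assumptions:

model.add(Dense(units=11, activation='softmax'))  # 11 columns in y_train, per the comment above
model.compile(loss='categorical_crossentropy',
              optimizer=SGD(lr=0.01),
              metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=128, epochs=20,
          validation_data=(x_test, y_test))
print(model.evaluate(x_test, y_test))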
Example #16
raw_text1 = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold."""

raw_text = preprocess(raw_text1)
data = prepareData(raw_text)

vocab = set(raw_text)
vocab_size = len(vocab)
print(vocab_size)

word_to_ix = {word: i for i, word in enumerate(vocab)}

print(len(data))

losses = []
loss_function = nn.NLLLoss()
model = CBOW(len(vocab), EMBEDDING_DIM, 2 * CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.001)
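The setup ends before the optimisation loop; a generic sketch of what typically follows, assuming prepareData yields (context_words, target_word) pairs and that CBOW takes a tensor of context word indices (both assumptions, since those definitions are not shown):

import torch

for epoch in range(100):
    total_loss = 0.0
    for context, target in data:
        context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)
        model.zero_grad()
        log_probs = model(context_idxs)
        loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    losses.append(total_loss)
print(losses[0], "->", losses[-1])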