Example #1
import pandas as pd
from sklearn.model_selection import train_test_split
# preprocessor, encoder and model_classifier are project-local classes (not shown here)


def main():
    # load dataset
    df = pd.read_csv('movies_metadata.csv')
    df = df.loc[:, ['title', 'genres', 'overview']]
    df = df[pd.notnull(df.overview)]
    df = df[pd.notnull(df.title)]
    df = df[pd.notnull(df.genres)]

    # Training parameters
    max_len_desc = 300
    max_len_title = 50
    max_input_len = max_len_title + max_len_desc
    genres_to_be_predicted = [
        'Drama', 'Comedy', 'Documentary', 'Science Fiction', 'Romance'
    ]

    num_classes = len(genres_to_be_predicted)

    params = {
        'GENRES': genres_to_be_predicted,
        'VOCABULARY_SIZE': 20000,
        'EMBEDDING_DIM': 100,
        'MAX_LEN_DESC': max_len_desc,
        'MAX_LEN_TITLE': max_len_title,
        'INPUT_LEN': max_input_len,
        'NUM_DENSE_1': 512,
        'NUM_CLASSES': num_classes,
        'NUM_EPOCHS': 4,
        'BATCH_DIM': 64
    }

    # init custom classes
    p = preprocessor(genres=params['GENRES'])
    e = encoder(max_words=params['VOCABULARY_SIZE'],
                maxlen_desc=params['MAX_LEN_DESC'],
                maxlen_title=params['MAX_LEN_TITLE'])
    m = model_classifier()

    # prepare data for training
    df = p.preprocess(df)
    X, y = e.encode(df)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.30,
                                                        random_state=1000)
    e.save()

    # create and train model
    model = m.define_model(params)
    history = m.train_model(X_train, X_test, y_train, y_test)

    # save
    m.save_model()
    m.save_params()
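The preprocessor and encoder classes above are project-local and not shown. As a rough sketch of what the label side of such a pipeline likely produces, a multi-hot vector over the selected genres, scikit-learn's MultiLabelBinarizer does the same job (the sample movies below are invented for illustration):

# sketch: multi-hot encoding of genre labels with scikit-learn
from sklearn.preprocessing import MultiLabelBinarizer

genres = ['Drama', 'Comedy', 'Documentary', 'Science Fiction', 'Romance']
mlb = MultiLabelBinarizer(classes=genres)
# one movie tagged Drama and Romance, another tagged Comedy
y = mlb.fit_transform([['Drama', 'Romance'], ['Comedy']])
print(y)  # [[1 0 0 0 1]
          #  [0 1 0 0 0]]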
Example #2
def reliability_label_predictor(attributes):
    # parse_url, preprocess, Clustering, Neutrality, strong_words_score,
    # reliability_finder and the df of source scores are project-local (not shown)
    source_name, topic_name, url = attributes[:3]
    print(source_name, topic_name, url)
    article = parse_url(url)
    cleaned_article = preprocess.preprocessor(article)
    cluster_number, flag = Clustering.cluster_new_article(
        cleaned_article, topic_name)
    neutrality_score = Neutrality.neutrality_score_finder(
        cluster_number, flag, cleaned_article)
    print(neutrality_score)
    source_score = df.loc[df['Source'] == source_name, 'SourceScore'].iloc[0]
    print(source_score)
    score_label = strong_words_score(cleaned_article)
    print(score_label)
    reliability_label = reliability_finder(source_score, neutrality_score,
                                           score_label)
    print(reliability_label)
    return reliability_label
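The combination step, reliability_finder, is project-specific and not shown. A purely hypothetical illustration of how three such signals could be combined is a weighted sum with a threshold (weights and threshold below are invented):

# hypothetical illustration only -- the real reliability_finder is not shown
def reliability_finder(source_score, neutrality_score, score_label,
                       weights=(0.5, 0.3, 0.2), threshold=0.6):
    combined = (weights[0] * source_score +
                weights[1] * neutrality_score +
                weights[2] * score_label)
    return 'Reliable' if combined >= threshold else 'Unreliable'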
Example #3
# softmax is only used later to compute perplexity; no gradients flow through it
softmax = tf.nn.softmax(predictions)

# summed cross-entropy loss; kept separate so it can be reused at test time
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=predictions, labels=labels)
loss = tf.reduce_sum(cross_entropy)

# optimizer: Adam with global-norm gradient clipping
adam = tf.train.AdamOptimizer(conf.lr)
gradients, variables = zip(*adam.compute_gradients(loss))
gradients, _ = tf.clip_by_global_norm(gradients, 10.0)
train_step = adam.apply_gradients(zip(gradients, variables))

# preprocessing
print("Starting preprocessing")
preproc = preprocessor()
preproc.preprocess("../data/sentences.train")

# training
print("Start training")
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
if not os.path.exists(conf.ckpt_dir):
    os.makedirs(conf.ckpt_dir)
saver = tf.train.Saver()

with tf.Session(config=config) as sess:
    if conf.mode == "TRAIN":
        print("Mode set to TRAIN")
        sess.run(tf.global_variables_initializer())
        for i in range(conf.num_epochs):
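The snippet breaks off at the start of the epoch loop. As the first comment says, the softmax is kept around only for perplexity; a sketch of that computation, assuming perplexity is taken as the exponential of the mean per-token cross-entropy:

# sketch (assumed setup): sentence perplexity from the per-token cross-entropy
mean_ce = tf.reduce_mean(cross_entropy)  # average per-token negative log-likelihood
perplexity = tf.exp(mean_ce)             # perplexity = exp(mean cross-entropy)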
Example #4
def get_input_image(imagename, train_mode):
    # pick train-time or eval-time preprocessing based on the boolean train_mode tensor
    pp = preprocess.preprocessor()
    image = pp._parse_function(imagename)
    cimage = tf.cond(train_mode, lambda: pp.training_preprocess(image),
                     lambda: pp.val_preprocess(image))
    return cimage
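A possible way to call this in TF1, assuming train_mode is fed as a scalar boolean tensor (the placeholder name and image path below are hypothetical):

# hypothetical usage: toggle train/eval preprocessing at session run time
train_mode = tf.placeholder_with_default(False, shape=[], name='train_mode')
image = get_input_image('example.jpg', train_mode)
with tf.Session() as sess:
    train_img = sess.run(image, feed_dict={train_mode: True})
    eval_img = sess.run(image)  # defaults to validation preprocessing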
Example #5
import numpy as np
import sklearn.datasets
from sklearn.model_selection import KFold
from preprocess import preprocessor
import pandas as pd

import optuna.integration.lightgbm as lgb

if __name__ == "__main__":

    ### Load the data ###
    train = pd.read_csv("train.tsv", sep='\t')
    test = pd.read_csv("test.tsv", sep='\t')

    ### preprocess the data ###
    prep = preprocessor()
    train = prep.fit_transform(train)
    test = prep.transform(test)
    data_test = test.drop(['revenue'], axis=1)
    logtarget_test = np.log1p(test.revenue)
    data = train.drop(['revenue'], axis=1)
    target = train.revenue
    logtarget = np.log1p(target)
    dtrain = lgb.Dataset(data, label=logtarget)

    ### set the parameters and optimize the hyper-parameters ###
    params = {
        "objective": "rmse",
        "metric": "rmse",
        "verbosity": -1,
        "boosting_type": "gbdt",
Example #6
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Created on 26 Jul 2019

@author: Ajay
'''
from preprocess import preprocessor
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import time

pp = preprocessor(1500, "sentiment", "mysentiment")
X_train = pp.X_train
X_test = pp.X_test
y_train = pp.y_train
y_test = pp.y_test

clf = MultinomialNB(alpha=1.1)
start = time.time()
model = clf.fit(X_train, y_train)
stop = time.time()

predicted_y = model.predict(X_test)

# expected results vs predicted results
# print(y_test, predicted_y)
# print("Predict:   ", model.predict_proba(X_test))
print("Accuracy:  ", accuracy_score(y_test, predicted_y))
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Created on 28 Jul 2019

@author: Ajay
'''
from preprocess import preprocessor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn import tree
import time

# DT uses max_features=200 in addition to normal CountVectorizer arguments
pp = preprocessor(1500, "topic", "dt")
X_train = pp.X_train
X_test = pp.X_test
y_train = pp.y_train
y_test = pp.y_test

# if random_state is not set, feature selection at each split is randomised,
# so the resulting tree may differ between runs
clf = tree.DecisionTreeClassifier(criterion='entropy',
                                  random_state=0,
                                  min_samples_leaf=20)
start = time.time()
model = clf.fit(X_train, y_train)
stop = time.time()

predicted_y = model.predict(X_test)

# expected results vs predicted results
# print(y_test, predicted_y)
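The decision-tree snippet stops before printing any results; a sketch of the evaluation, using the metrics already imported above:

print("Accuracy: ", accuracy_score(y_test, predicted_y))
print("Macro F1: ", f1_score(y_test, predicted_y, average='macro'))
print("Training time: %.3fs" % (stop - start))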
Example #8
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Created on 26 Jul 2019

@author: Ajay
'''
from preprocess import preprocessor
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import time

divider = 1500
pp = preprocessor(divider, "sentiment", "bnb")
X_train = pp.X_train
X_test = pp.X_test
y_train = pp.y_train
y_test = pp.y_test

clf = BernoulliNB()
start = time.time()
model = clf.fit(X_train, y_train)
stop = time.time()
predicted_y = model.predict(X_test)

# expected results vs predicted results
# print(y_test, predicted_y)
# print("Predict:   ", model.predict_proba(X_test))
print("Accuracy:  ", accuracy_score(y_test, predicted_y))
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Created on 26 Jul 2019

@author: Ajay
'''
from preprocess import preprocessor
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import time

pp = preprocessor(1500, "topic", "mytopic")
X_train = pp.X_train
X_test = pp.X_test
y_train = pp.y_train
y_test = pp.y_test

clf = MultinomialNB(alpha=.77)
start = time.time()
model = clf.fit(X_train, y_train)
stop = time.time()
predicted_y = model.predict(X_test)

# expected results vs predicted results
# print(y_test, predicted_y)
# print("Predict:   ", model.predict_proba(X_test))
print("Accuracy:  ", accuracy_score(y_test, predicted_y))
print("Precision (array): ", precision_score(y_test, predicted_y, average=None))
        return out  # tail of a class definition truncated by the snippet


if __name__ == "__main__":
    # assumes argparse and pandas (pd) are imported and NeuralNet is defined above
    parser = argparse.ArgumentParser(description='Enhanced Neural Network')
    parser.add_argument("--hidden",
                        type=int,
                        help="number of hidden neurons",
                        default=5)
    parser.add_argument("--activation",
                        help="activation for neural network",
                        default="sigmoid")

    args = parser.parse_args()
    h = args.hidden  # already an int thanks to type=int above
    activation = args.activation

    dataset_url = "https://raw.githubusercontent.com/ronakHegde98/CS-4372-Computational-Methods-for-Data-Scientists/master/data/diabetic_data.csv"
    df = pd.read_csv(dataset_url)
    X_train, X_test, y_train, y_test = preprocessor(df)

    # reshape the train and test targets into column vectors
    y_train = y_train.values.reshape(-1, 1)
    y_test = y_test.values.reshape(-1, 1)

    nn_model = NeuralNet(X_train.T, y_train.T, h)
    nn_model.train(activation)

    predictions = nn_model.predict(X_test.T, y_test.T, activation)
    # predictions = np.around(predictions, 0).astype(np.int32)
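Assuming binary targets (they are reshaped into single-column vectors above) and sigmoid-style outputs, a hypothetical way to finish the evaluation:

# hypothetical: threshold the network outputs at 0.5 and compare to the labels
import numpy as np
predicted_labels = (predictions >= 0.5).astype(np.int32)
print("Test accuracy: %.4f" % np.mean(predicted_labels == y_test.T))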
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Created on 26 Jul 2019

@author: Ajay
'''
from preprocess import preprocessor
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import time

pp = preprocessor(1500, "topic", "mnb")
X_train = pp.X_train
X_test = pp.X_test
y_train = pp.y_train
y_test = pp.y_test

clf = MultinomialNB()
start = time.time()
model = clf.fit(X_train, y_train)
stop = time.time()
predicted_y = model.predict(X_test)

# expected results vs predicted results
# print(y_test, predicted_y)
# print("Predict:   ", model.predict_proba(X_test))
print("Accuracy:  ", accuracy_score(y_test, predicted_y))
print("Precision (array): ", precision_score(y_test, predicted_y,