Example #1
    def __getitem__(self, item):
        data = process_data(self.tweet[item], self.selected_text[item],
                            self.sentiment[item])

        return {
            'ids': torch.tensor(data["ids"], dtype=torch.long),
            'mask': torch.tensor(data["mask"], dtype=torch.long),
            'token_type_ids': torch.tensor(data["token_type_ids"], dtype=torch.long),
            'targets_start': torch.tensor(data["targets_start"], dtype=torch.long),
            'targets_end': torch.tensor(data["targets_end"], dtype=torch.long),
            'orig_tweet': data["orig_tweet"],
            'orig_selected': data["orig_selected"],
            'sentiment': data["sentiment"],
            'offsets': torch.tensor(data["offsets"], dtype=torch.long),
            'padding_len': data["padding_len"]
        }
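For context, a `__getitem__` like this is what `torch.utils.data.DataLoader` calls when it assembles a batch. Below is a minimal, self-contained sketch of the same pattern; `DummyTweetDataset` and its fixed-length dummy tensors are illustrative stand-ins, not the original class.

import torch
from torch.utils.data import Dataset, DataLoader

class DummyTweetDataset(Dataset):
    """Illustrative stand-in that returns a dict of fixed-length dummy tensors per item."""
    def __init__(self, num_items=8, max_len=16):
        self.num_items = num_items
        self.max_len = max_len

    def __len__(self):
        return self.num_items

    def __getitem__(self, item):
        return {
            'ids': torch.zeros(self.max_len, dtype=torch.long),
            'mask': torch.ones(self.max_len, dtype=torch.long),
            'targets_start': torch.tensor(0, dtype=torch.long),
            'targets_end': torch.tensor(1, dtype=torch.long),
        }

# The default collate function stacks each dict entry across the batch dimension.
loader = DataLoader(DummyTweetDataset(), batch_size=4)
for batch in loader:
    print(batch['ids'].shape)  # torch.Size([4, 16])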
Example #2
def getInfoFromParameters(input_file, parameters, estimator):
    Corpus = preprocessing.process_data(
        input_file,
        to_lower_case=parameters.lowerCaseFlag,
        remove_stop_words=parameters.removeStopWordsFlag,
        stem=parameters.stemFlag)
    pipeline = preprocessing.vectorize(estimator,
                                       max_features=parameters.maxFeatures,
                                       ngram_range=parameters.ngramRange,
                                       tf=parameters.tfidfFlags[0],
                                       tfidf=parameters.tfidfFlags[1])

    return Corpus, pipeline
Example #3
def getInfoFromParameters(input_file, parameters):
    Corpus = preprocessing.process_data(
        input_file,
        to_lower_case=parameters.lowerCaseFlag,
        remove_stop_words=parameters.removeStopWordsFlag,
        stem=parameters.stemFlag)
    counts_by_comment, names = preprocessing.vectorize(
        Corpus,
        max_features=parameters.maxFeatures,
        ngram_range=parameters.ngramRange,
        tf=parameters.tfidfFlags[0],
        tfidf=parameters.tfidfFlags[1])

    return Corpus, counts_by_comment, names
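For illustration, the `parameters` object these helpers expect could be built as in the sketch below. The attribute names mirror the calls above; the file name and concrete values are hypothetical, and running it requires the project's own `preprocessing` module.

from types import SimpleNamespace

params = SimpleNamespace(lowerCaseFlag=True,
                         removeStopWordsFlag=True,
                         stemFlag=False,
                         maxFeatures=5000,
                         ngramRange=(1, 2),
                         tfidfFlags=(False, True))  # (tf, tfidf)

# Hypothetical input file name, for illustration only.
corpus, counts_by_comment, names = getInfoFromParameters("comments.csv", params)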
Example #4
def process_and_send_flows(combined_flows, attributes, normalization_values):
    # normalization_values holds an offset and a scale, applied below as (x - offset) / scale.
    normalization_values = np.asarray(normalization_values).astype(np.float32)
    flow_list = []
    # Map each flow attribute name to its column index in `attributes`.
    flow_names = [flow_attribute[0] for flow_attribute in flow.get_flow_attributes()]
    indexes = []
    for name in flow_names:
        indexes.append([i for i in range(len(attributes)) if attributes[i][0] == name][0])
    # Template row: "None" for list-valued (categorical) attributes, 0 for numeric ones.
    temp_list = ["None" if isinstance(attribute[1], list) else 0 for attribute in attributes]
    for key in combined_flows:
        # Append a copy of the template row; appending temp_list itself would make every
        # row alias the same list, so later assignments would overwrite earlier rows.
        flow_list.append(temp_list.copy())
        new_values = combined_flows[key].get_flow_as_list()
        for i in range(len(new_values)):
            flow_list[-1][indexes[i]] = new_values[i]

    # process_data also returns predictions, which are not needed in the online setting.
    data, _ = process_data(flow_list, attributes)
    data = np.asarray(data).astype(np.float32)
    data = (data - normalization_values[0]) / normalization_values[1]
    data = np.nan_to_num(data)
    send_data(data)
Example #5
          "keep_prob": keep_prob,\
          "batch_size": batch_size,\
          "epochs": epochs,\
          "max_to_keep": max_to_keep,\
          "no_imprv_tolerance": no_imprv_tolerance,\
          "checkpoint_path": checkpoint_path,\
          "summary_path": summary_path,\
          "model_name": model_name}

# alpha & gamma for focal loss (tunable hyperparameters)
alpha = 0.1
gamma = 0.5
import os
if not os.path.exists(config["save_path"]):
    os.mkdir(config["save_path"])
    process_data(config)

print("Load datasets...")
# used for training
train_set = batchnize_dataset(config["train_set"],
                              config["batch_size"],
                              shuffle=True)
# used for computing validate loss
valid_set = batchnize_dataset(config["dev_set"], batch_size=100, shuffle=False)

import tensorflow as tf
tf.reset_default_graph()
print("Build models...")
model = BiLSTM_Attention_model(config, alpha, gamma)
model.train(train_set, valid_set)
# used for computing test precision, recall and F1 scores
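As an aside, the `alpha` and `gamma` defined near the top of this example parameterize the focal loss, FL(p_t) = -alpha_t * (1 - p_t)^gamma * log(p_t). A minimal NumPy sketch of the binary form is shown below, for illustration only; it is not the project's implementation.

import numpy as np

def focal_loss(p, y, alpha=0.1, gamma=0.5, eps=1e-7):
    """Binary focal loss: down-weights well-classified examples via (1 - p_t)**gamma."""
    p = np.clip(p, eps, 1 - eps)
    p_t = np.where(y == 1, p, 1 - p)              # probability assigned to the true class
    alpha_t = np.where(y == 1, alpha, 1 - alpha)  # class-balancing weight
    return np.mean(-alpha_t * (1 - p_t) ** gamma * np.log(p_t))

print(focal_loss(np.array([0.9, 0.2]), np.array([1, 1])))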
Example #6
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
from preprocessing import process_data

TWEETS = 'tweet'
LABEL = 'user'

# fix random seed for reproducibility
seed = 7
np.random.seed(seed)

# Read data
df = pd.read_csv('../train_data.csv', index_col=0)
df.reset_index(inplace=True)
dataset, num_features = process_data(df[TWEETS])
y = df[LABEL]

# Split data
X_train, X_test, y_train, y_test = train_test_split(dataset, y, train_size=0.8)

# Random Forest
estimator = RandomForestClassifier(n_estimators=200,
                                   n_jobs=4,
                                   verbose=True,
                                   random_state=0)

estimator.fit(X_train, y_train)
y_hat = estimator.predict(X_test)
print("Accuracy of model: " + str(sum(y_hat == y_test) / len(y_test)))
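As a small aside (not part of the original script), the same accuracy, plus per-class precision and recall, can be read off scikit-learn's metrics:

from sklearn.metrics import accuracy_score, classification_report

print("Accuracy of model:", accuracy_score(y_test, y_hat))
print(classification_report(y_test, y_hat))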
Example #7
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
from preprocessing import process_data

TWEETS = 'tweet'
LABEL = 'user'

# fix random seed for reproducibility
seed = 7
np.random.seed(seed)

# Read data
df = pd.read_csv('../train_data.csv', index_col=0)
df.reset_index(inplace=True)
dataset = process_data(df[TWEETS])
y = df[LABEL]

# Split data
X_train, X_test, y_train, y_test = train_test_split(dataset, y, train_size=0.8)

# Linear classifier trained with SGD (default hinge loss; pass loss='log_loss',
# or 'log' on older scikit-learn, to get logistic regression)
estimator = SGDClassifier(verbose=True)

estimator.fit(X_train, y_train)
y_hat = estimator.predict(X_test)
print("Accuracy of model: " + str(sum(y_hat == y_test) / len(y_test)))
Example #8
import numpy as np
import visualisation
import model_functions as modf
import preprocessing
import config
import build
import pickle

np.random.seed(8888)

model_type = config.model

if __name__ == '__main__':
    df = preprocessing.download_data()
    df = preprocessing.process_data(df)

    model = build.create_model()

    build.build_model(model, df)
Example #9
        'stars': 1
    }).limit(500000)
    reviews_4 = db.reviews.find({
        'stars': 4
    }, {
        'text': 1,
        'stars': 1
    }).limit(500000)
    reviews_5 = db.reviews.find({
        'stars': 5
    }, {
        'text': 1,
        'stars': 1
    }).limit(500000)
    reviews = chain(reviews_1, reviews_2, reviews_3, reviews_4, reviews_5)

    result, stars = preprocessing.process_data(reviews, lexicon='save')

    svm.train_model(result, stars)
    # yelp_neural_networks.train_model('lstm', result, stars)
    # yelp_neural_networks.train_model('cnn', result, stars)

    # svm.evaluate_model(result, stars)
    # yelp_neural_networks.evaluate_model('lstm', result, stars)
    # yelp_neural_networks.evaluate_model('cnn', result, stars)

    # predictions = yelp_neural_networks.predict_model('lstm', result)

    # print error_cost(predictions, stars)
    # print confusion_matrix(stars, predictions, labels=[1, 2, 3, 4, 5])
Example #10


from itertools import chain

from pymongo import MongoClient

import preprocessing            # project-local modules used below
import yelp_neural_networks

if __name__ == '__main__':

    client = MongoClient('localhost', 27017)
    db = client.Yelp

    reviews_1 = db.reviews.find({'stars': 1}, {'text': 1, 'stars': 1}).skip(100000).limit(10000)
    reviews_2 = db.reviews.find({'stars': 2}, {'text': 1, 'stars': 1}).skip(50000).limit(10000)
    reviews_3 = db.reviews.find({'stars': 3}, {'text': 1, 'stars': 1}).skip(100000).limit(10000)
    reviews_4 = db.reviews.find({'stars': 4}, {'text': 1, 'stars': 1}).skip(100000).limit(10000)
    reviews_5 = db.reviews.find({'stars': 5}, {'text': 1, 'stars': 1}).skip(100000).limit(10000)
    reviews = chain(reviews_1, reviews_2, reviews_3, reviews_4, reviews_5)

    result, stars = preprocessing.process_data(reviews, lexicon='load', using='tokenizer')
    # result_svm, stars = preprocessing.process_data(reviews, lexicon='load', using='tf-idf')


    # svm.train_model(result, stars)
    # yelp_neural_networks.train_model('lstm', result, stars)
    # yelp_neural_networks.train_model('cnn', result, stars)

    # svm.evaluate_model(result, stars)
    # yelp_neural_networks.evaluate_model('lstm', result, stars)
    # yelp_neural_networks.evaluate_model('cnn', result, stars)

    predictions_cnn = yelp_neural_networks.predict_model('cnn', result)
    predictions_lstm = yelp_neural_networks.predict_model('lstm', result)
    # predictions_svm = svm.predict_model(result_svm)
Example #11
import torch
import torch.nn as nn
import torch.utils.data as data_utils
import torch.nn.functional as F
from preprocessing import process_data

# Seed for reproducibility
torch.manual_seed(12)

# Parameters
epochs = 100
batch_size = 20
learn_rate = .01
input_size = 36
num_classes = 3

# Data
X, Y, _ = process_data()

# Convert to tensor
X = torch.Tensor(X).float()
Y = torch.Tensor(Y).long()

# Initialize data loader
dataset = data_utils.TensorDataset(X, Y)
data_loader = data_utils.DataLoader(dataset, batch_size=batch_size)


# Network
class Classifier(nn.Module):
    def __init__(self, input_size, num_classes):
        super().__init__()
        self.h1 = nn.Linear(input_size, 100)