Example #1
def main(argv):
    if len(argv) not in range(2, 6):
        programme_name = "lingofunk_classify_sentiment.model.hnatt.run"
        print(f"usage: PYTHONPATH=. python -m {programme_name} "
              "<category> <quantity> <embedding_name> <input_size> "
              "<learning rate>")
        sys.exit(2)
    category = argv[0]
    quantity = int(argv[1])

    embeddings_path = None
    if len(argv) >= 3:
        embeddings_name = argv[2]
        embeddings_path = fetch(
            f'{config["embeddings"][embeddings_name]["basepath"]}.txt')
        if not os.path.isfile(embeddings_path):
            download_embedding(embeddings_name)

    # Fall back to the configured defaults (INPUT_SIZE / LEARNING_RATE, defined
    # at module level alongside the path templates) so that h.train() below
    # never raises a NameError when the optional arguments are omitted.
    input_size = INPUT_SIZE if len(argv) < 4 else int(argv[3])
    learning_rate = LEARNING_RATE if len(argv) < 5 else float(argv[4])

    preprocessor_path = fetch(config["models"]["hnatt"]["preprocessor"])
    preprocessor_dir = os.path.dirname(preprocessor_path)
    if not os.path.exists(preprocessor_dir):
        os.makedirs(preprocessor_dir)

    joblib.dump(normalize, preprocessor_path, compress=0)

    (train_X, train_y), (test_X, test_y) = load_balanced_train_and_test_dataframes(
        category, quantity, normalize, save_reviews)

    # initialize HNATT
    h = HNATT()
    h.train(
        train_X,
        train_y,
        batch_size=64,
        epochs=10,
        embeddings_path=embeddings_path,
        input_size=input_size,
        learning_rate=learning_rate,
    )
    quantity = len(train_y)
    tag = str(date.today())
    h.load_weights(
        weights_path=WEIGHTS_PATH_TEMPLATE.substitute(quantity=quantity,
                                                      tag=tag),
        tokenizer_path=TOKENIZER_PATH_TEMPLATE.substitute(quantity=quantity,
                                                          tag=tag),
    )

    activation_maps = h.activation_maps(
        "they have some pretty interesting things here. i will definitely go back again."
    )
    print(activation_maps)
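The example above is meant to be launched as a module with positional arguments in the order given by the usage string. A minimal entry-point sketch (not part of the original listing; the argument values are placeholders, and only <category> and <quantity> are required):

import sys

if __name__ == "__main__":
    # Hypothetical invocation, e.g.:
    #   PYTHONPATH=. python -m lingofunk_classify_sentiment.model.hnatt.run \
    #       Restaurants 1000 glove 200 0.001
    main(sys.argv[1:])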
Example #2
def main(argv):
    if len(argv) != 2:
        programme_name = "lingofunk_classify_sentiment.model.naive_bayes.run"
        print(
            f"usage: PYTHONPATH=. python -m {programme_name} <category> <quantity>"
        )
        sys.exit(2)
    category = argv[0]
    quantity = int(argv[1])

    try:
        pos_words, neg_words = load_samples(
            category, quantity, remove_stopwords_and_include_bigrams, save_reviews)
    except Exception:
        print("The data for this category and quantity have not been found.")
        sys.exit(2)

    preprocessor_path = fetch(config["models"]["naive_bayes"]["preprocessor"])
    joblib.dump(remove_stopwords_and_include_bigrams,
                preprocessor_path,
                compress=0)

    print(f"Category: {category}")
    (accuracy, classifier, train_set, test_set) = train(pos_words, neg_words)
    classifier.show_most_informative_features()
Example #3
def train(pos_samples, neg_samples):
    model_path = fetch(config["models"]["naive_bayes"]["weights"])

    samples = np.array(pos_samples + neg_samples)

    train_samples, test_samples = train_test_split(samples,
                                                   test_size=0.2,
                                                   random_state=42)

    if os.path.isfile(model_path):
        # NaiveBayesClassifier.train is a class-level constructor, so this
        # builds a fresh classifier from train_samples rather than updating
        # the previously saved model.
        classifier = joblib.load(model_path).train(train_samples)
    else:
        classifier = nltk.NaiveBayesClassifier.train(train_samples)
        joblib.dump(classifier, model_path, compress=0)

    accuracy = nltk.classify.util.accuracy(classifier, test_samples)
    print(f"Finished training. The accuracy is {accuracy}.")
    test_trained_classifier(classifier, test_samples)

    return (accuracy, classifier, train_samples, test_samples)
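A short follow-up sketch of how the values returned by train() can be probed; each sample is assumed to be a (featureset, label) pair, which is what nltk.NaiveBayesClassifier.train expects:

# Hypothetical usage after loading pos_words/neg_words as in the example above.
accuracy, classifier, train_samples, test_samples = train(pos_words, neg_words)
features, label = test_samples[0]
print("predicted:", classifier.classify(features), "actual:", label)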
Example #4
def download_embedding(embedding):
    settings = config["embeddings"][embedding]

    glove_basepath = fetch(settings["basepath"])
    glove_zip_path = f"{glove_basepath}.zip"
    glove_unzip_path = f"{glove_basepath}.txt"

    embedding_dirs = map(os.path.dirname, [glove_zip_path, glove_unzip_path])
    for embedding_dir in embedding_dirs:
        if not os.path.exists(embedding_dir):
            os.makedirs(embedding_dir)

    glove_url = settings["url"]
    # Download the GloVe data if applicable
    if os.path.isfile(glove_zip_path):
        logger.info("GloVe data already exists, skipping download.")
    else:
        logger.info("Downloading GloVe data to {}".format(glove_zip_path))
        try:
            args = ["wget", "-O", glove_zip_path, glove_url]
            output = subprocess.Popen(args, stdout=subprocess.PIPE)
            out, err = output.communicate()
        except:
            logger.info("Couldn't download GloVe data with wget, "
                        "falling back to (slower) Python downloading.")
            glove_response = requests.get(glove_url, stream=True)
            with open(glove_zip_path, "wb") as glove_file:
                for chunk in glove_response.iter_content(chunk_size=1024 *
                                                         1024):
                    # Filter out keep-alive new chunks.
                    if chunk:
                        glove_file.write(chunk)

    # Extract the GloVe data if it does not already exist.
    if os.path.exists(glove_unzip_path):
        logger.info("Unzipped GloVe data already exists, skipping unzip.")
    else:
        logger.info("Unzipping GloVe archive to {}".format(glove_unzip_path))
        zip_ref = zipfile.ZipFile(glove_zip_path, "r")
        zip_ref.extractall(os.path.dirname(glove_unzip_path))
        zip_ref.close()
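Assuming the configuration defines an embeddings entry with "basepath" and "url" keys (the key name below is a placeholder), downloading and unpacking then reduces to a single call:

# Hypothetical usage; "glove" must exist as a key under config["embeddings"].
download_embedding("glove")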
Example #5
from keras import backend as K
from keras import initializers, regularizers
from keras.callbacks import *
from keras.engine.topology import Layer
from keras.layers import *
from keras.models import *
from keras.optimizers import *
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import CustomObjectScope

from lingofunk_classify_sentiment.config import config, fetch
from lingofunk_classify_sentiment.data.load import load_glove_embedding
from lingofunk_classify_sentiment.model.hnatt.preprocess import normalize

WEIGHTS_PATH_TEMPLATE = Template(fetch(config["models"]["hnatt"]["weights"]))
TOKENIZER_PATH_TEMPLATE = Template(
    fetch(config["models"]["hnatt"]["tokenizer"]))
MAX_VOCABULARY_SIZE = config["constants"]["max_vocabulary_size"]
INPUT_SIZE = config["constants"]["input_size"]
LEARNING_RATE = config["constants"]["learning_rate"]

# Uncomment below for debugging
# from tensorflow.python import debug as tf_debug
# sess = K.get_session()
# sess = tf_debug.LocalCLIDebugWrapperSession(sess)
# K.set_session(sess)


def dot_with_kernel(x, kernel):
    """
Example #6
# based on https://github.com/sfotiadis/yenlp/blob/master/extract_reviews.py

import json
import os
import sys
from string import Template

from lingofunk_classify_sentiment.config import config, fetch

business_data_filename = fetch(config["datasets"]["yelp"]["ids"])
reviews_data_filename = fetch(config["datasets"]["yelp"]["reviews"])
sample_template_filename = Template(
    fetch(config["datasets"]["yelp"]["sample_format"]))


def get_business_ids(category):
    """Gets the business ids for the given category"""
    with open(business_data_filename) as businesses:
        business_ids = []
        for business in businesses:
            business = json.loads(business)
            if business["categories"] and category in business[
                    "categories"].split():
                business_ids.append(business["business_id"])
    return business_ids
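A hedged usage sketch for the helper above (the category string is a placeholder and must match a token of the dataset's "categories" field):

# Hypothetical call: collect the ids of businesses tagged "Restaurants".
restaurant_ids = get_business_ids("Restaurants")
print(f"{len(restaurant_ids)} matching businesses")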


def save_reviews(category, quantity):
    """Saves the given number of reviews of a specific category to two files,
    one for each class (pos/neg)."""
    pos_reviews_filename = sample_template_filename.substitute(
Example #7
import json
import os
from string import Template

import numpy as np
from tqdm import tqdm

import pandas as pd

from lingofunk_classify_sentiment.config import config, fetch

tqdm.pandas()
sample_template_filename = Template(fetch(config["datasets"]["yelp"]["sample_format"]))


def load_samples(category, quantity, preprocess, save=False):
    pos_reviews_fn = sample_template_filename.substitute(
        category=category.lower(), quantity=quantity, label="pos"
    )
    neg_reviews_fn = sample_template_filename.substitute(
        category=category.lower(), quantity=quantity, label="neg"
    )

    both_exist = os.path.isfile(pos_reviews_fn) and os.path.isfile(neg_reviews_fn)

    if not both_exist and save:
        save(category, quantity)

    pos_reviews = open(pos_reviews_fn, "r")
    neg_reviews = open(neg_reviews_fn, "r")