Example No. 1
def main(network_type):
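    """
    Restore a previously trained model of the requested type and evaluate it
    on the test file named in the corresponding params module.
    """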
    if network_type == "cnn":
        print("Testing CNN network")
        from cnn_params import params
    elif network_type == "lstm":
        print("Testing LSTM network")
        from params import params
    train_data = ATEDataProcessor(params["train_file"], **params)
    test_data = ATEDataProcessor(params["test_file"],
                                 pos_id=get_count(
                                     train_data.annotated_sentences),
                                 **params)

    test_set = test_data.annotated_sentences
    test = DataIterator(test_set,
                        word_file=params["word_file"],
                        char_file=params["char_file"])
    if network_type == "cnn":
        model = CNNNetwork(max_sentence_length=train_data.max_sentence_len,
                           **params)
    elif network_type == "lstm":
        model = LSTMNetwork(**params)
    model.build()
    model.restore_session(model.model_directory)
    model.evaluate(test)
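
# A hypothetical command-line wrapper for the evaluation main() above; the
# original example defines only the function, so the argparse wiring here is
# an assumption, not code from the repository.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Evaluate a trained ATE model")
    parser.add_argument("network_type", choices=["cnn", "lstm"],
                        help="which network's params and saved weights to use")
    args = parser.parse_args()
    main(args.network_type)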
def generate_data(kwargs):
  """
  Generate data based on params given as input
  Args:
      kwargs (dict): Params required for building data set.
  """
  train = ATEDataProcessor(
    data_file=kwargs["train_file"], **kwargs
  )
  test = ATEDataProcessor(
    data_file=kwargs["test_file"], **kwargs
  )
  w_dim = kwargs["w_dim"]
  vector_text_file = kwargs["raw_vectors"]
  word_to_id, _ = dump_vocab([train, test], kwargs["word_file"],
                             kwargs["char_file"], vector_text_file)
  store_word_vectors(vector_file=vector_text_file, word_to_id=word_to_id,
                     stored_vectors=kwargs["vector_file"])
  if kwargs.get("multi_rnn") or kwargs.get("use_additional_embeddings"):
    vector_text_file_2 = kwargs["raw_vectors_2"]
    store_word_vectors(vector_file=vector_text_file_2, word_to_id=word_to_id, 
                       stored_vectors=kwargs["vector_file_2"])
  if kwargs.get("use_pos"):
    print("building pos embeddings array")
    train_tags = np.array(train.pos_tags)
    test_tags = np.array(test.pos_tags)
    tags = np.concatenate([train_tags, test_tags])
    np.save(kwargs["pos_file"], tags)
Example No. 3
def kfold_validate(model, k, kwargs):
  """
  This function does something similar to k-fold validation: we train and test
  our model k times, each time randomly splitting the entire data set into
  three parts (train, dev and test), and return the average of the k runs.
  Args:
      model (str): What kind of model to use. It can be either lstm or cnn
      k (int): Number of iterations over which to average
      kwargs (dict): The parameters that define the model
  
  Returns:
      dict: A dictionary of results containing the keys precision, recall and
        fscore.
  """
  p_1 = 0.0
  r_1 = 0.0
  f_1 = 0.0
  train_data = ATEDataProcessor(kwargs["train_file"], **kwargs)
  test_data = ATEDataProcessor(kwargs["test_file"],
                               pos_id=get_count(train_data.annotated_sentences),
                               **kwargs)
  sentences = train_data.annotated_sentences + test_data.annotated_sentences
  for i in range(k):
    print("Run number: {}".format(i))
    # Use a different seed each run so the train/dev/test split actually varies.
    train_set, test_set = split(sentences, test_size=0.2, random_state=i)
    train_set, dev_set = split(train_set, test_size=kwargs["test_size"],
                               random_state=i)
    train = DataIterator(train_set, **kwargs)
    dev = DataIterator(dev_set, **kwargs)
    test = DataIterator(test_set, **kwargs)
    # Bind the network to a separate name so the model type string in `model`
    # is still a string on the next iteration.
    if model == "lstm":
      net = LSTMNetwork(**kwargs)
    elif model == "cnn":
      net = CNNNetwork(max_sentence_length=train_data.max_sentence_len,
                       **kwargs)
    net.build()
    net.train(train, dev)
    results = net.evaluate(test)
    p_1 += float(results["p_1"])
    r_1 += float(results["r_1"])
    f_1 += float(results["f_1"])
    net.close_session()
  print("p_1: {}\nr_1: {}\nf_1: {}".format(p_1/k, r_1/k, f_1/k))
  return {
    "precision": p_1/k,
    "recall": r_1/k,
    "fscore": f_1/k
  }
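
# A hedged usage sketch for kfold_validate(); the params import mirrors the
# other examples on this page, and k=5 is an arbitrary illustrative choice.
if __name__ == "__main__":
    from params import params
    results = kfold_validate("lstm", 5, params)
    print(results["precision"], results["recall"], results["fscore"])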
def average_calculator(model, k, kwargs, gen_data=True):
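    """
    Train and evaluate the chosen model k times against a fixed test set,
    optionally regenerating the vocabulary and vector files first, and report
    the metrics averaged over the runs plus a confidence interval over the
    per-run F1 scores.

    Args:
        model (str): Which network to use, either "lstm" or "cnn".
        k (int): Number of runs to average over.
        kwargs (dict): The parameters that define the model.
        gen_data (bool): If True, rebuild the data files via generate_data().

    Returns:
        dict: Averaged precision, recall and fscore.
    """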
    if gen_data:
        generate_data(kwargs)
    p_1 = 0.0
    r_1 = 0.0
    f_1 = 0.0
    f_scores = []
    train_data = ATEDataProcessor(kwargs["train_file"], **kwargs)
    test_data = ATEDataProcessor(kwargs["test_file"],
                                 pos_id=get_count(
                                     train_data.annotated_sentences),
                                 **kwargs)
    for i in range(k):
        print("Run number: {}".format(i))
        test_set = test_data.annotated_sentences
        train_set, dev_set = split(train_data.annotated_sentences,
                                   test_size=kwargs["test_size"])
        train = DataIterator(train_set, **kwargs)
        dev = DataIterator(dev_set, **kwargs)
        test = DataIterator(test_set, **kwargs)
        # As in kfold_validate, keep the model type string intact and bind the
        # network object to a separate name.
        if model == "lstm":
            net = LSTMNetwork(**kwargs)
        elif model == "cnn":
            net = CNNNetwork(max_sentence_length=train_data.max_sentence_len,
                             **kwargs)
        net.build()
        net.train(train, dev)
        net.restore_session(net.model_directory)
        results = net.evaluate(test)
        f_scores.append(results["f_1"])
        p_1 += float(results["p_1"])
        r_1 += float(results["r_1"])
        f_1 += float(results["f_1"])
        net.close_session()
    print("p_1: {}\nr_1: {}\nf_1: {}".format(p_1 / k, r_1 / k, f_1 / k))
    print(mean_confidence_interval(f_scores))
    return {"precision": p_1 / k, "recall": r_1 / k, "fscore": f_1 / k}
def main(network_type):
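    """
    Train a model of the requested type on the train file named in the
    corresponding params module, holding out part of the data as a dev set.
    """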
    if network_type == "cnn":
        print("Training CNN network")
        from cnn_params import params
    elif network_type == "lstm":
        print("Training LSTM network")
        from params import params
    train_data = ATEDataProcessor(params["train_file"], **params)
    sentences = train_data.annotated_sentences
    train_set, dev_set = split(sentences, test_size=params["test_size"])
    train = DataIterator(train_set,
                         word_file=params["word_file"],
                         char_file=params["char_file"])
    dev = DataIterator(dev_set,
                       word_file=params["word_file"],
                       char_file=params["char_file"])
    if network_type == "cnn":
        model = CNNNetwork(max_sentence_length=train_data.max_sentence_len,
                           **params)
    elif network_type == "lstm":
        model = LSTMNetwork(**params)
    model.build()
    model.train(train, dev)
Example No. 6
import shelve
from crf_params import params, data_set
from data_processor import ATEDataProcessor

train = ATEDataProcessor(data_file=params["train_file"], **params)
test = ATEDataProcessor(data_file=params["test_file"], **params)
sentences = train.annotated_sentences + test.annotated_sentences
db = shelve.open("word_counts/word_counts_{}.db".format(data_set))
for i, sentence in enumerate(sentences):
    print(i, len(sentences))
    for word, _, _ in sentence:
        # shelve keys must be str in Python 3; no byte-encoding round trip needed.
        db[word] = db.get(word, 0) + 1
db.close()
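
# A companion sketch: reading the stored counts back. shelve returns a
# dict-like object, so ordinary lookups and iteration work.
import shelve
from crf_params import data_set

with shelve.open("word_counts/word_counts_{}.db".format(data_set)) as db:
    most_common = sorted(db.items(), key=lambda kv: kv[1], reverse=True)[:10]
    for word, count in most_common:
        print(word, count)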
import numpy as np
import os
from params import params
from data_processor import ATEDataProcessor
from stanfordcorenlp import StanfordCoreNLP

print("Connecting to CoreNLP server.....")
nlp = StanfordCoreNLP("{}/stanford-corenlp".format(os.path.expanduser("~")))
print("Connected!")
train_data = ATEDataProcessor(params["train_file"],
                              pos_source_file=params.get("pos_train_file"),
                              small_pos=params.get("small_pos"))
test_data = ATEDataProcessor(params["test_file"],
                             pos_source_file=params.get("pos_test_file"),
                             small_pos=params.get("small_pos"))

sentences = test_data.raw_sentences
dependencies = []
pos = []
i = 0
j = len(sentences)
'''
opinion_words = []
with open(params["opinion_words_file"]) as f:
     for line in f:
         word = line.split("\n")
         opinion_words.append(word)

stop_words = []
with open(params["stop_words_file"]) as f:
  for line in f:
'''

"""
A helper script to generate raw text files from xml files.
"""
from data_processor import ATEDataProcessor
from params import params

if __name__ == "__main__":
  # escape_word is the character that separates two sentences in the final
  # output. Remember that some of the raw sentences don't end with
  # punctuation and don't begin with capital letters, so it might be better to
  # use an explicit end-of-sentence symbol (a variant is sketched after this
  # script).
  escape_word = "\n"
  dp_train = ATEDataProcessor(params["train_file"])
  dp_test = ATEDataProcessor(params["test_file"])
  raw_sentences = dp_train.raw_sentences + dp_test.raw_sentences
  raw_sentences = escape_word.join(raw_sentences)
  with open("raw_rest.txt", "w")as fp:
    fp.write(raw_sentences)
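  # As the comment above notes, some raw sentences lack final punctuation and
  # capitalization, so an explicit end-of-sentence marker may work better than
  # a bare newline. A hedged variant; "<eos>" and the output file name are
  # illustrative choices, not names used elsewhere in the repository.
  escape_word = " <eos>\n"
  with open("raw_rest_eos.txt", "w") as fp:
    fp.write(escape_word.join(dp_train.raw_sentences + dp_test.raw_sentences))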
Example No. 9
"""
Module to save dependencies of each sentence in given dataset to a npy array
"""
import numpy as np
import os
from cnn_params import params, data_set
from data_processor import ATEDataProcessor
from stanfordcorenlp import StanfordCoreNLP

print("Connecting to CoreNLP server..")
nlp = StanfordCoreNLP("{}/stanford-corenlp".format(os.path.expanduser("~")))
print("Connected!")
train_data = ATEDataProcessor("./train_data/{}_train_rules.xml".format(data_set), **params)
test_data = ATEDataProcessor("./test_data/{}_test_rules.xml".format(data_set), **params)

sentences = train_data.raw_sentences + test_data.raw_sentences
dependencies = []
pos = []
# ner = []
i = 0
j = len(sentences)
for sentence in sentences:
  print(i, j)
  i += 1
  dependencies.append(list(nlp.dependency_parse(sentence)))
  pos.append(list(nlp.pos_tag(sentence)))
#  ner.append(list(nlp.ner(sentence)))

nlp.close()
np.save(params["dependencies"], dependencies)
np.save(params["dependencies_pos"], pos)