Created on Wed Oct 30 21:47:48 2019

@author: tanma
"""

import matplotlib.pyplot as plt
import numpy as np
from IPython.core.display import HTML
from itertools import chain
from collections import Counter, defaultdict, namedtuple

from helpers import show_model, Dataset
from pomegranate import State, HiddenMarkovModel, DiscreteDistribution

data = Dataset("tags-universal.txt", "brown-universal.txt", train_test_split=0.8)

assert len(data) == len(data.training_set) + len(data.testing_set), \
    "The number of sentences in the training set + testing set should sum to the number of sentences in the corpus"
assert data.N == data.training_set.N + data.testing_set.N, \
    "The number of training + test samples should sum to the total number of samples"


def pair_counts(sequences_A, sequences_B):
    """Return a dictionary keyed to each unique value in the first sequence list
    that counts the number of occurrences of the corresponding value from the
    second sequences list.

    For example, if sequences_A is tags and sequences_B is the corresponding
        acc += 1
    return float(acc) / len(val_data)


if __name__ == '__main__':
    tf.logging.set_verbosity(tf.logging.ERROR)
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

    argv = sys.argv[1:]
    args, _ = Parser().getParser().parse_known_args(argv)

    random.seed()
    logging.basicConfig(filename=args.log, level=logging.INFO, format='%(message)s')
    logger = logging.getLogger(__name__)

    dataset = Dataset(args.dataset, logger)
    train_data, dev_data = dataset.getdata(args.maxlenth)
    word_vector = dataset.get_wordvector(args.word_vector)

    ###
    train_text_num = 500
    dev_text_num = 20
    if args.smalldata == 1:
        train_data = train_data[:train_text_num]
        dev_data = dev_data[:dev_text_num]
    print("train_data ", len(train_data))
    print("dev_data", len(dev_data))
    ###

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
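
    # Usage sketch (an assumption -- the original script's continuation is not
    # shown above): in TF 1.x the ConfigProto built here is normally handed to
    # the Session that runs the graph.
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        # ... build or restore the model and run the training loop here ...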
import matplotlib.pyplot as plt
import numpy as np
from IPython.core.display import HTML
from itertools import chain
from collections import Counter, defaultdict

from helpers import show_model, Dataset
from pomegranate import State, HiddenMarkovModel, DiscreteDistribution

data = Dataset("tags-universal.txt", "brown-universal.txt", train_test_split=0.8)

print("There are {} sentences in the corpus.".format(len(data)))
print("There are {} sentences in the training set.".format(len(data.training_set)))
print("There are {} sentences in the testing set.".format(len(data.testing_set)))

assert len(data) == len(data.training_set) + len(data.testing_set), \
    "The number of sentences in the training set + testing set should sum to the number of sentences in the corpus"

key = 'b100-38532'
print("Sentence: {}".format(key))
print("words:\n\t{!s}".format(data.sentences[key].words))
print("tags:\n\t{!s}".format(data.sentences[key].tags))

print("There are a total of {} samples of {} unique words in the corpus."
      .format(data.N, len(data.vocab)))
print("There are {} samples of {} unique words in the training set."
      .format(data.training_set.N, len(data.training_set.vocab)))
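
# A small usage sketch, not part of the original script: assuming
# data.training_set.Y is the collection of tag sequences (as it is used further
# below), the chain/Counter imports above can tally tag unigrams for the tagger.
tag_unigrams = Counter(chain(*data.training_set.Y))
print("Most common tags:", tag_unigrams.most_common(3))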
    res = pd.DataFrame([" ".join(tup) for tup in bigram_sequence_set],
                       columns=["bigram_sequence"])
    res = res.groupby("bigram_sequence").size().reset_index(name="Count")
    res.to_csv(bigram_training_path)

    df = pd.read_csv(bigram_training_path)
    df.drop(columns=["Unnamed: 0"], inplace=True)
    # to_dict expects the lowercase orient name "records"
    dct = df.set_index("bigram_sequence").T.to_dict("records")[0]
    return {key_to_tuple(k): dct[k] for k in dct}


def tag_aggregate(sequences):
    if not os.path.exists(TAG_TRAINING_PATH):
        start_end_frame = []
        for i, seq in enumerate(sequences):
            tup = (seq[0], seq[-1])
            start_end_frame.append(tup)
        df = pd.DataFrame(start_end_frame, columns=["start_type", "end_type"])
        df.to_csv(TAG_TRAINING_PATH)

    df = pd.read_csv(TAG_TRAINING_PATH)
    start_frame, end_frame = df.start_type.value_counts(), df.end_type.value_counts()
    # both counts need to be called -- the original returned the bound
    # start_frame.to_dict method instead of its result
    return start_frame.to_dict(), end_frame.to_dict()


data = Dataset(TAG_PATH, BROWN_PATH, train_test_split=0.8)
tag_starts, tag_end = tag_aggregate(data.training_set.Y)
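
# Usage sketch (an assumption, not the original script's next step): the
# start-tag counts returned by tag_aggregate can be normalized into the
# start-transition probabilities an HMM tagger needs.
total_starts = sum(tag_starts.values())
tag_start_probs = {tag: count / total_starts for tag, count in tag_starts.items()}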
import matplotlib.pyplot as plt
import numpy as np
from IPython.core.display import HTML
from itertools import chain
from collections import Counter, defaultdict

from helpers import show_model, Dataset
from pomegranate import State, HiddenMarkovModel, DiscreteDistribution

data = Dataset("tags-universal.txt", "brown-universal.txt", train_test_split=0.8)

print("There are {} sentences in the corpus.".format(len(data)))
print("There are {} sentences in the training set.".format(len(data.training_set)))
print("There are {} sentences in the testing set.".format(len(data.testing_set)))

assert len(data) == len(data.training_set) + len(data.testing_set), \
    "The number of sentences in the training set + testing set should sum to the number of sentences in the corpus"

key = 'b100-38532'
print("Sentence: {}".format(key))
print("words:\t{!s}".format(data.sentences[key].words))
print("tags:\t{!s}".format(data.sentences[key].tags))

print("There are a total of {} samples of {} unique words in the corpus."
      .format(data.N, len(data.vocab)))
print("There are {} samples of {} unique words in the training set."
      .format(data.training_set.N, len(data.training_set.vocab)))
print("There are {} samples of {} unique words in the testing set."
      .format(data.testing_set.N, len(data.testing_set.vocab)))
print("There are {} words in the test set that are missing in the training set."
      .format(len(data.testing_set.vocab - data.training_set.vocab)))
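
# A minimal pomegranate sketch, not the tagger trained on this corpus: the toy
# emission and transition probabilities below are made-up numbers, shown only to
# illustrate how the imported DiscreteDistribution / State / HiddenMarkovModel
# classes fit together.
noun_emissions = DiscreteDistribution({"time": 0.7, "flies": 0.3})
verb_emissions = DiscreteDistribution({"flies": 0.8, "time": 0.2})
noun_state = State(noun_emissions, name="NOUN")
verb_state = State(verb_emissions, name="VERB")

toy_model = HiddenMarkovModel(name="toy-tagger")
toy_model.add_states(noun_state, verb_state)
toy_model.add_transition(toy_model.start, noun_state, 0.8)
toy_model.add_transition(toy_model.start, verb_state, 0.2)
toy_model.add_transition(noun_state, verb_state, 0.6)
toy_model.add_transition(noun_state, toy_model.end, 0.4)
toy_model.add_transition(verb_state, noun_state, 0.3)
toy_model.add_transition(verb_state, toy_model.end, 0.7)
toy_model.bake()

# viterbi() returns (log probability, state path) for an observation sequence
logp, path = toy_model.viterbi(["time", "flies"])
print([state.name for _, state in path])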