Created on Wed Oct 30 21:47:48 2019

@author: tanma
"""

import matplotlib.pyplot as plt
import numpy as np
from IPython.core.display import HTML
from itertools import chain
from collections import Counter, defaultdict, namedtuple

from helpers import show_model, Dataset
from pomegranate import State, HiddenMarkovModel, DiscreteDistribution

data = Dataset("tags-universal.txt", "brown-universal.txt", train_test_split=0.8)

assert len(data) == len(data.training_set) + len(data.testing_set), \
    "The number of sentences in the training set + testing set should sum to the number of sentences in the corpus"
assert data.N == data.training_set.N + data.testing_set.N, \
    "The number of training + test samples should sum to the total number of samples"


def pair_counts(sequences_A, sequences_B):
    """Return a dictionary keyed to each unique value in the first sequence list
    that counts the number of occurrences of the corresponding value from the
    second sequences list.

    For example, if sequences_A is tags and sequences_B is the corresponding
        acc += 1
    return float(acc) / len(val_data)


if __name__ == '__main__':
    tf.logging.set_verbosity(tf.logging.ERROR)
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

    argv = sys.argv[1:]
    args, _ = Parser().getParser().parse_known_args(argv)

    random.seed()
    logging.basicConfig(filename=args.log, level=logging.INFO, format='%(message)s')
    logger = logging.getLogger(__name__)

    dataset = Dataset(args.dataset, logger)
    train_data, dev_data = dataset.getdata(args.maxlenth)
    word_vector = dataset.get_wordvector(args.word_vector)

    ###
    train_text_num = 500
    dev_text_num = 20
    if args.smalldata == 1:
        train_data = train_data[:train_text_num]
        dev_data = dev_data[:dev_text_num]
    print("train_data ", len(train_data))
    print("dev_data", len(dev_data))
    ###

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
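
    # Usage sketch (an assumption -- the original script's continuation is not
    # shown above): in TF 1.x the ConfigProto built here is normally handed to
    # the Session that runs the graph.
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        # ... build or restore the model and run the training loop here ...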
import matplotlib.pyplot as plt
import numpy as np
from IPython.core.display import HTML
from itertools import chain
from collections import Counter, defaultdict

from helpers import show_model, Dataset
from pomegranate import State, HiddenMarkovModel, DiscreteDistribution

data = Dataset("tags-universal.txt", "brown-universal.txt", train_test_split=0.8)

print("There are {} sentences in the corpus.".format(len(data)))
print("There are {} sentences in the training set.".format(len(data.training_set)))
print("There are {} sentences in the testing set.".format(len(data.testing_set)))

assert len(data) == len(data.training_set) + len(data.testing_set), \
    "The number of sentences in the training set + testing set should sum to the number of sentences in the corpus"

key = 'b100-38532'
print("Sentence: {}".format(key))
print("words:\n\t{!s}".format(data.sentences[key].words))
print("tags:\n\t{!s}".format(data.sentences[key].tags))

print("There are a total of {} samples of {} unique words in the corpus."
      .format(data.N, len(data.vocab)))
print("There are {} samples of {} unique words in the training set."
      .format(data.training_set.N, len(data.training_set.vocab)))
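
# A small usage sketch, not part of the original script: assuming
# data.training_set.Y is the collection of tag sequences (as it is used further
# below), the chain/Counter imports above can tally tag unigrams for the tagger.
tag_unigrams = Counter(chain(*data.training_set.Y))
print("Most common tags:", tag_unigrams.most_common(3))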
    res = pd.DataFrame([" ".join(tup) for tup in bigram_sequence_set],
                       columns=["bigram_sequence"])
    res = res.groupby("bigram_sequence").size().reset_index(name="Count")
    res.to_csv(bigram_training_path)

    df = pd.read_csv(bigram_training_path)
    df.drop(columns=["Unnamed: 0"], inplace=True)
    # to_dict expects the lowercase orient name "records"
    dct = df.set_index("bigram_sequence").T.to_dict("records")[0]
    return {key_to_tuple(k): dct[k] for k in dct}


def tag_aggregate(sequences):
    if not os.path.exists(TAG_TRAINING_PATH):
        start_end_frame = []
        for i, seq in enumerate(sequences):
            tup = (seq[0], seq[-1])
            start_end_frame.append(tup)
        df = pd.DataFrame(start_end_frame, columns=["start_type", "end_type"])
        df.to_csv(TAG_TRAINING_PATH)

    df = pd.read_csv(TAG_TRAINING_PATH)
    start_frame, end_frame = df.start_type.value_counts(), df.end_type.value_counts()
    # both counts need to be called -- the original returned the bound
    # start_frame.to_dict method instead of its result
    return start_frame.to_dict(), end_frame.to_dict()


data = Dataset(TAG_PATH, BROWN_PATH, train_test_split=0.8)
tag_starts, tag_end = tag_aggregate(data.training_set.Y)
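
# Usage sketch (an assumption, not the original script's next step): the
# start-tag counts returned by tag_aggregate can be normalized into the
# start-transition probabilities an HMM tagger needs.
total_starts = sum(tag_starts.values())
tag_start_probs = {tag: count / total_starts for tag, count in tag_starts.items()}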
import matplotlib.pyplot as plt
import numpy as np
from IPython.core.display import HTML
from itertools import chain
from collections import Counter, defaultdict

from helpers import show_model, Dataset
from pomegranate import State, HiddenMarkovModel, DiscreteDistribution

data = Dataset("tags-universal.txt", "brown-universal.txt", train_test_split=0.8)

print("There are {} sentences in the corpus.".format(len(data)))
print("There are {} sentences in the training set.".format(len(data.training_set)))
print("There are {} sentences in the testing set.".format(len(data.testing_set)))

assert len(data) == len(data.training_set) + len(data.testing_set), \
    "The number of sentences in the training set + testing set should sum to the number of sentences in the corpus"

key = 'b100-38532'
print("Sentence: {}".format(key))
print("words:\t{!s}".format(data.sentences[key].words))
print("tags:\t{!s}".format(data.sentences[key].tags))

print("There are a total of {} samples of {} unique words in the corpus."
      .format(data.N, len(data.vocab)))
print("There are {} samples of {} unique words in the training set."
      .format(data.training_set.N, len(data.training_set.vocab)))
print("There are {} samples of {} unique words in the testing set."
      .format(data.testing_set.N, len(data.testing_set.vocab)))
print("There are {} words in the test set that are missing in the training set."
      .format(len(data.testing_set.vocab - data.training_set.vocab)))
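
# A minimal pomegranate sketch, not the tagger trained on this corpus: the toy
# emission and transition probabilities below are made-up numbers, shown only to
# illustrate how the imported DiscreteDistribution / State / HiddenMarkovModel
# classes fit together.
noun_emissions = DiscreteDistribution({"time": 0.7, "flies": 0.3})
verb_emissions = DiscreteDistribution({"flies": 0.8, "time": 0.2})
noun_state = State(noun_emissions, name="NOUN")
verb_state = State(verb_emissions, name="VERB")

toy_model = HiddenMarkovModel(name="toy-tagger")
toy_model.add_states(noun_state, verb_state)
toy_model.add_transition(toy_model.start, noun_state, 0.8)
toy_model.add_transition(toy_model.start, verb_state, 0.2)
toy_model.add_transition(noun_state, verb_state, 0.6)
toy_model.add_transition(noun_state, toy_model.end, 0.4)
toy_model.add_transition(verb_state, noun_state, 0.3)
toy_model.add_transition(verb_state, toy_model.end, 0.7)
toy_model.bake()

# viterbi() returns (log probability, state path) for an observation sequence
logp, path = toy_model.viterbi(["time", "flies"])
print([state.name for _, state in path])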