def load_sentences(text_file, stopwords, lang):
    path, f = ntsplit(text_file)
    reader = PlaintextCorpusReader(path, f)
    sentences = [sent for sent in reader.sents()]
    clean = []
    originalSentenceOf = {}
    if lang == "fr":
        stemmer = FrenchStemmer()
    elif lang == "en":
        stemmer = SnowballStemmer("english")
    # Data cleansing
    for sent in sentences:
        s = stemmize(stemmer, sent, stopwords)
        clean.append(" ".join(s))
        originalSentenceOf[clean[-1]] = sent
    setClean = set(clean)
    return setClean, originalSentenceOf, sentences, clean
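# Hypothetical usage sketch (not part of the original): it assumes the helpers used
# above (ntsplit, stemmize) are defined and that "article_fr.txt" exists on disk;
# the file name and stopword source are illustrative only.
from nltk.corpus import stopwords
fr_stopwords = stopwords.words('french')
setClean, originalSentenceOf, sentences, clean = load_sentences("article_fr.txt", fr_stopwords, "fr")
print(len(setClean), "unique cleaned sentences")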
# replace=True, stem=True)]

# List of words spoken by the child with POS tags
child_words_tagged_xml = corpus_xml.tagged_words(text_xml, speaker=['CHI'])
# List of sentences/utterances spoken by the child
child_sents_xml = corpus_xml.sents(text_xml, speaker=['CHI'])
# List of sentences/utterances spoken by the investigator
inv_sents_xml = corpus_xml.sents(text_xml, speaker=['INV', 'CLN', 'MOT', 'CLI'])

# List of sentences spoken by the child in plain text with all annotations included
child_sents_plain = []
s = corpus_plain.sents(text_plain)
for k in range(len(s)):
    for w in range(len(s[k])):
        try:
            if s[k][w] == '*' and s[k][w + 1] == 'CHI':
                child_sents_plain.append(s[k][w:])
        except IndexError:
            continue

# current_text = Text()

"""
Extracts features for some text.

Feature names: total number of words, number of different words,
total number of utterances, mean length of utterance,
average number of syllables per word, Flesch-Kincaid score,
ratio of raw verbs to total number of verbs, number of different POS tags,
number of repeated words/phrases, number of partial words,
number of filler words
"""
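# Sketch of one of the features listed above, mean length of utterance (MLU), added
# here for illustration and not taken from the original; it reuses child_sents_xml,
# where each utterance is a list of word tokens.
if len(child_sents_xml) > 0:
    mlu = sum(len(utt) for utt in child_sents_xml) / len(child_sents_xml)
    print("Mean length of utterance (words):", mlu)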
# load the model
filename = opts['-i']
f = open(filename, 'rb')
model = pickle.load(f)
f.close()

# load the data
# WORK HERE!! LOAD YOUR EVALUATION CORPUS
# sents = gutenberg.sents('austen-persuasion.txt')
corpora_dir = find(os.path.join(os.getcwd(), 'corpora'))
custom_tokenizer = RegexpTokenizer('[^.!?]+')
reader = PlaintextCorpusReader(corpora_dir, r'.*\.txt',
                               sent_tokenizer=custom_tokenizer)
sents = reader.sents('test-utf8.txt')

# compute the cross entropy
# WORK HERE!!
log_prob = model.log_prob(sents)
e = model.cross_entropy(sents)
p = model.perplexity(sents)

print('Log probability: {}'.format(log_prob))
print('Cross entropy: {}'.format(e))
print('Perplexity: {}'.format(p))
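# Sanity-check sketch (an assumption about these custom model classes, not part of
# the original): if cross entropy is reported in bits per word, perplexity should
# equal 2 ** cross_entropy, so the value below should match the printed perplexity.
print('Check: 2 ** cross entropy = {}'.format(2 ** e))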
sample = gutenberg.raw("bible-kjv.txt")
sent = sent_tokenize(sample)
for x in range(5):
    print("Sentence - %s\n" % (sent[x]))
    print("Words - %s\n" % (nltk.word_tokenize(sent[x])))

## Reading corpora from text files ############
## No POS tags, chunks or categories ##########
reader = PlaintextCorpusReader("/Users/atul/nltk_data/corpora/gutenberg", r'^.*\.txt')
files = reader.fileids()
print("File IDs:", files)
print("Number of files:", len(files))
print(reader.words(files[0]))
print(reader.sents(files[0]))

## Reading tagged corpora #####################
reader = TaggedCorpusReader('/Users/atul/nltk_data', r'brown.pos', tagset='en-brown')
reader1 = TaggedCorpusReader('/Users/atul/nltk_data', r'brown.pos', word_tokenizer=SpaceTokenizer())
print(reader.words())
print(reader.sents())
print(reader.tagged_words())
print(reader.tagged_sents())
print(reader.tagged_words(tagset='universal'))
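# Small follow-on sketch (not in the original): count the most frequent universal
# tags from the tagged reader above, assuming brown.pos loaded correctly.
from nltk import FreqDist
fd = FreqDist(tag for (word, tag) in reader.tagged_words(tagset='universal'))
print("Most common universal tags:", fd.most_common(5))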
out_file.write("")

# Create a corpus from the files using NLTK
corpus = PlaintextCorpusReader("./Part1/", r".*\.txt")

# Loop through each file in the corpus
for fileid in corpus.fileids():
    # Set flags to 0
    org_found = 0       # Flag for when the NSF organization name has been found in the file
    amt_found = 0       # Flag for when the award amount has been found in the file
    abstract_found = 0  # Flag for when the abstract has been found in the file

    # Try to loop through each sentence in the file and apply GetOrg and GetAmt functions.
    try:
        for sent in corpus.sents(fileid):
            GetOrg()
            GetAmt()
    # If a file cannot be decoded to utf-8, add it to the problem file list and skip it.
    except UnicodeDecodeError:
        problem_files.append(fileid)
        continue

    # If there is missing data, add the file to the problem file list and skip it.
    if org == [] or org == ['null'] or amt == [] or amt == ['null']:
        problem_files.append(fileid)
        continue

    # Extract single values from list objects
    org = org[0]
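# Purely illustrative sketch (not the author's GetOrg/GetAmt, which are defined
# elsewhere in this script): an award amount could, for example, be matched in a
# sentence with a simple regex over the joined tokens.
# import re
# amount_pattern = re.compile(r'\$[\d,]+')
# amounts = amount_pattern.findall(" ".join(sent))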
models = {
    'ngram': NGram,
    'addone': AddOneNGram,
    'inter': InterpolatedNGram,
    'backoff': BackOffNGram,
}

if __name__ == '__main__':
    opts = docopt(__doc__)

    # load the data
    # TODO: Corpus must be larger than 5MB
    corpora_dir = find(os.path.join(os.getcwd(), 'corpora'))
    custom_tokenizer = RegexpTokenizer('[^.!?]+')
    reader = PlaintextCorpusReader(corpora_dir, r'.*\.txt',
                                   sent_tokenizer=custom_tokenizer)
    sents = reader.sents('corpus-utf8.txt')

    # train the model
    n = int(opts['-n'])
    model_class = models[opts['-m']]
    model = model_class(n, sents)

    # save it
    filename = opts['-o']
    f = open(filename, 'wb')
    pickle.dump(model, f)
    f.close()
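# Example invocation (assumed from the opts used above; the script and file names
# are illustrative only, not taken from the original):
#   python train.py -n 2 -m addone -o addone_2_model.pkl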
directory = "D:/Eigene Dateien_rklein/z_Forschung/_Konferenzen/_79_ICFCA - Dresden - Concept Analysis/Data/" input_directory = directory + "Input/_Product_Management/" output_directory = directory + "1_POS/" if not os.path.exists(output_directory): os.mkdir(output_directory) # reading stuff file_list = os.listdir(input_directory) print file_list # just for testing create a corpus reader from nltk.corpus.reader import PlaintextCorpusReader reader = PlaintextCorpusReader(input_directory,'.*.txt') reader.fileids() reader.raw() reader.sents() reader.words() ## default POS tagger from NLTK ## import nltk # import pprint # sent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle') pos = "nltk" path = output_directory + pos if not os.path.exists(path): os.mkdir(path) for i in range(len(file_list)): # posting = [] output = path + "/" + str(file_list[i]) jfile=open (output,"w") reader = PlaintextCorpusReader(input_directory,str(file_list[i])) text = str(reader.raw())