Example #1
from nltk.corpus.reader import PlaintextCorpusReader
from nltk.stem.snowball import FrenchStemmer, SnowballStemmer


def load_sentences(text_file, stopwords, lang):
    # Split the file path into its directory and filename so the corpus reader
    # can be pointed at the right file (ntsplit is a path helper defined elsewhere).
    path, f = ntsplit(text_file)
    reader = PlaintextCorpusReader(path, f)
    sentences = [sent for sent in reader.sents()]
    clean = []
    originalSentenceOf = {}
    # Pick a stemmer for the requested language
    if lang == "fr":
        stemmer = FrenchStemmer()
    elif lang == "en":
        stemmer = SnowballStemmer("english")
    # Data cleansing: stem each sentence and remember which original it came from
    for sent in sentences:
        s = stemmize(stemmer, sent, stopwords)  # stemmize is a helper defined elsewhere
        clean.append(" ".join(s))
        originalSentenceOf[clean[-1]] = sent
    setClean = set(clean)
    return setClean, originalSentenceOf, sentences, clean
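
# A minimal sketch of the stemmize helper assumed above (hypothetical; the actual helper
# may differ): lowercase each token, drop stopwords and non-alphabetic tokens, then stem.
def stemmize(stemmer, sent, stopwords):
    return [stemmer.stem(w.lower()) for w in sent
            if w.isalpha() and w.lower() not in stopwords]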
Example #2
        # replace=True, stem=True)]

        # list of words spoken by the child with POS tags
        child_words_tagged_xml = corpus_xml.tagged_words(text_xml,
                                                         speaker=['CHI'])

        # List of sentences/utterances spoken by the child
        child_sents_xml = corpus_xml.sents(text_xml, speaker=['CHI'])

        # List of sentences/utterances spoken by the investigator
        inv_sents_xml = corpus_xml.sents(text_xml,
                                         speaker=['INV', 'CLN', 'MOT', 'CLI'])

        # List of sentences spoken by the child in plain text with all annotations included
        # (a sentence is attributed to the child when a '*' token is immediately
        # followed by the speaker code 'CHI')
        child_sents_plain = []
        s = corpus_plain.sents(text_plain)
        for sent in s:
            for w in range(len(sent) - 1):
                if sent[w] == '*' and sent[w + 1] == 'CHI':
                    child_sents_plain.append(sent[w:])
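
        # Note: corpus_xml and corpus_plain are assumed to have been constructed earlier,
        # e.g. (hypothetically) as an nltk CHILDESCorpusReader over the XML transcripts and
        # a PlaintextCorpusReader over the plain-text transcripts:
        #   corpus_xml = CHILDESCorpusReader(xml_root, r'.*\.xml')
        #   corpus_plain = PlaintextCorpusReader(plain_root, r'.*\.cha')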

        # current_text = Text()
        """
        Extracts features for some text
        features names: total number of words, number of different words, total number of utterances, mean length of
        utterance, average number of syllables per word, Flesch-Kincaid score, ratio of raw-verbs to total number of
        verbs, number of different POS tags, number of repeated words/phrases, number of partial words,
        number of filler words
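        # A hedged illustration of how a few of the features listed above could be computed
        # from the child utterances (not the original implementation):
        child_words = [w for sent in child_sents_xml for w in sent]
        total_words = len(child_words)                               # total number of words
        different_words = len(set(w.lower() for w in child_words))   # number of different words
        total_utterances = len(child_sents_xml)                      # total number of utterances
        mlu = total_words / total_utterances if total_utterances else 0.0  # mean length of utterance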
Example #3
    # load the model
    filename = opts['-i']
    with open(filename, 'rb') as f:
        model = pickle.load(f)

    # load the data
    # WORK HERE!! LOAD YOUR EVALUATION CORPUS
    #sents = gutenberg.sents('austen-persuasion.txt')
    corpora_dir = find(os.path.join(os.getcwd(), 'corpora'))
    custom_tokenizer = RegexpTokenizer('[^.!?]+')
    reader = PlaintextCorpusReader(corpora_dir,
                                   r'.*\.txt',
                                   sent_tokenizer=custom_tokenizer)
    sents = reader.sents('test-utf8.txt')

    # compute the cross entropy and perplexity
    # WORK HERE!!
    log_prob = model.log_prob(sents)
    e = model.cross_entropy(sents)
    p = model.perplexity(sents)

    print('Log probability: {}'.format(log_prob))
    print('Cross entropy: {}'.format(e))
    print('Perplexity: {}'.format(p))
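    # A hedged note on how these quantities typically relate for n-gram models
    # (assuming base-2 logs; this project's model classes may differ):
    #   cross_entropy = -log_prob / M        # M = total number of tokens scored
    #   perplexity    = 2 ** cross_entropy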
Example #4
import nltk
from nltk.corpus import gutenberg
from nltk.corpus.reader import PlaintextCorpusReader, TaggedCorpusReader
from nltk.tokenize import sent_tokenize, SpaceTokenizer

sample = gutenberg.raw("bible-kjv.txt")
sent = sent_tokenize(sample)

for x in range(5):
    print("Sentence - %s\n" % (sent[x]))
    print("Words - %s\n" % (nltk.word_tokenize(sent[x])))

## Reading corpora from text files ############
## No POS tags, chunks or categories ##########
reader = PlaintextCorpusReader("/Users/atul/nltk_data/corpora/gutenberg",
                               r'^.*\.txt')
files = reader.fileids()
print("File IDs:", files)
print("Number of files:", len(files))
print(reader.words(files[0]))
print(reader.sents(files[0]))

## Reading tagged corpora #####################
reader = TaggedCorpusReader('/Users/atul/nltk_data',
                            r'brown.pos',
                            tagset='en-brown')
reader1 = TaggedCorpusReader('/Users/atul/nltk_data',
                             r'brown.pos',
                             word_tokenizer=SpaceTokenizer())

print(reader.words())
print(reader.sents())
print(reader.tagged_words())
print(reader.tagged_sents())
print(reader.tagged_words(tagset='universal'))

Example #5
# Create a corpus from the files using NLTK
corpus = PlaintextCorpusReader("./Part1/", r".*\.txt")

# Loop through each file in the corpus
for fileid in corpus.fileids():

    # Set flags to 0
    org_found = 0           # Flag for when the NSF organization name has been found in the file
    amt_found = 0           # Flag for when the award amount has been found in the file
    abstract_found = 0      # Flag for when the abstract has been found in the file

    # Try to loop through each sentence in the file and apply the GetOrg and GetAmt
    # functions (helpers defined elsewhere in this script that fill the org and amt lists).
    try:
        for sent in corpus.sents(fileid):
            GetOrg()
            GetAmt()

    # If a file cannot be decoded to utf-8, add it to the problem file list and skip it.
    except UnicodeDecodeError:
        problem_files.append(fileid)
        continue

    # If there is missing data, add the file to the problem file list and skip it.
    if org == [] or org == ['null'] or amt == [] or amt == ['null']:
        problem_files.append(fileid)
        continue

    # Extract single values from list objects
    org = org[0]
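
# A purely hypothetical sketch of what the GetOrg / GetAmt helpers used above might do
# (the real functions are defined elsewhere in this script and communicate through the
# global org and amt lists); written here with an explicit sentence argument for clarity:
def get_org_from(sent):
    # e.g. a header line tokenized as ['NSF', 'Org', ':', 'DMI'] would yield 'DMI'
    if 'NSF' in sent and 'Org' in sent:
        return sent[-1]
    return None

def get_amt_from(sent):
    # return the token following a '$' sign, e.g. ['Total', 'Amt', ':', '$', '250000']
    for i, tok in enumerate(sent):
        if tok == '$' and i + 1 < len(sent):
            return sent[i + 1]
    return None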
Example #6
models = {
    'ngram': NGram,
    'addone': AddOneNGram,
    'inter': InterpolatedNGram,
    'backoff': BackOffNGram
}

if __name__ == '__main__':
    opts = docopt(__doc__)

    # load the data
    # TODO: Corpus must be larger than 5MB
    corpora_dir = find(os.path.join(os.getcwd(), 'corpora'))
    custom_tokenizer = RegexpTokenizer('[^.!?]+')
    reader = PlaintextCorpusReader(corpora_dir,
                                   r'.*\.txt',
                                   sent_tokenizer=custom_tokenizer)
    sents = reader.sents('corpus-utf8.txt')

    # train the model
    n = int(opts['-n'])
    model_class = models[opts['-m']]
    model = model_class(n, sents)

    # save it
    filename = opts['-o']
    with open(filename, 'wb') as f:
        pickle.dump(model, f)
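# The script above relies on docopt(__doc__), so the original module docstring must define
# the -n, -m and -o options; a plausible (hypothetical) usage string would look like:
#
#   """Train an n-gram language model.
#
#   Usage:
#     train.py -n <n> [-m <model>] -o <file>
#
#   Options:
#     -n <n>      Order of the model.
#     -m <model>  Model type: ngram, addone, inter or backoff.
#     -o <file>   Where to pickle the trained model.
#   """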
directory = "D:/Eigene Dateien_rklein/z_Forschung/_Konferenzen/_79_ICFCA - Dresden - Concept Analysis/Data/"
input_directory = directory + "Input/_Product_Management/"
output_directory = directory + "1_POS/"
if not os.path.exists(output_directory): os.mkdir(output_directory)

# reading stuff
file_list = os.listdir(input_directory)
print(file_list)

# just for testing create a corpus reader
from nltk.corpus.reader import PlaintextCorpusReader
reader = PlaintextCorpusReader(input_directory, r'.*\.txt')

print(reader.fileids())
print(reader.raw())
print(reader.sents())
print(reader.words())

## default POS tagger from NLTK ##
import nltk
# import pprint
# sent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')
pos = "nltk"
path = output_directory + pos
if not os.path.exists(path): os.mkdir(path)
for i in range(len(file_list)):
#    posting = []
    output = path + "/" + str(file_list[i])
    jfile=open (output,"w")
    reader = PlaintextCorpusReader(input_directory,str(file_list[i]))
    text = str(reader.raw())
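    # A minimal sketch of how the loop presumably continues (the original snippet is
    # truncated here): tokenize the raw text, tag it with NLTK's default POS tagger,
    # write the word/tag pairs to the output file and close it.
    tokens = nltk.word_tokenize(text)
    tagged = nltk.pos_tag(tokens)
    for word, tag in tagged:
        jfile.write("%s/%s " % (word, tag))
    jfile.close()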