def load_sentences(text_file, stopwords, lang):
    path, f = ntsplit(text_file)
    reader = PlaintextCorpusReader(path, f)
    sentences = [sent for sent in reader.sents()]
    clean = []
    originalSentenceOf = {}
    if lang == "fr":
        stemmer = FrenchStemmer()
    elif lang == "en":
        stemmer = SnowballStemmer("english")
    # Data cleansing
    for sent in sentences:
        s = stemmize(stemmer, sent, stopwords)
        clean.append(" ".join(s))
        originalSentenceOf[clean[-1]] = sent
    setClean = set(clean)
    return setClean, originalSentenceOf, sentences, clean
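# Hypothetical usage sketch (not part of the original): it assumes the helpers used
# above (ntsplit, stemmize) are defined and that "article_fr.txt" exists on disk;
# the file name and stopword source are illustrative only.
from nltk.corpus import stopwords
fr_stopwords = stopwords.words('french')
setClean, originalSentenceOf, sentences, clean = load_sentences("article_fr.txt", fr_stopwords, "fr")
print(len(setClean), "unique cleaned sentences")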
# replace=True, stem=True)]

# List of words spoken by the child with POS tags
child_words_tagged_xml = corpus_xml.tagged_words(text_xml, speaker=['CHI'])
# List of sentences/utterances spoken by the child
child_sents_xml = corpus_xml.sents(text_xml, speaker=['CHI'])
# List of sentences/utterances spoken by the investigator
inv_sents_xml = corpus_xml.sents(text_xml, speaker=['INV', 'CLN', 'MOT', 'CLI'])

# List of sentences spoken by the child in plain text with all annotations included
child_sents_plain = []
s = corpus_plain.sents(text_plain)
for k in range(len(s)):
    for w in range(len(s[k])):
        try:
            if s[k][w] == '*' and s[k][w + 1] == 'CHI':
                child_sents_plain.append(s[k][w:])
        except IndexError:
            continue

# current_text = Text()

"""
Extracts features for some text.

Feature names: total number of words, number of different words,
total number of utterances, mean length of utterance,
average number of syllables per word, Flesch-Kincaid score,
ratio of raw verbs to total number of verbs, number of different POS tags,
number of repeated words/phrases, number of partial words,
number of filler words
"""
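# Sketch of one of the features listed above, mean length of utterance (MLU), added
# here for illustration and not taken from the original; it reuses child_sents_xml,
# where each utterance is a list of word tokens.
if len(child_sents_xml) > 0:
    mlu = sum(len(utt) for utt in child_sents_xml) / len(child_sents_xml)
    print("Mean length of utterance (words):", mlu)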
# load the model
filename = opts['-i']
f = open(filename, 'rb')
model = pickle.load(f)
f.close()

# load the data
# WORK HERE!! LOAD YOUR EVALUATION CORPUS
# sents = gutenberg.sents('austen-persuasion.txt')
corpora_dir = find(os.path.join(os.getcwd(), 'corpora'))
custom_tokenizer = RegexpTokenizer('[^.!?]+')
reader = PlaintextCorpusReader(corpora_dir, r'.*\.txt',
                               sent_tokenizer=custom_tokenizer)
sents = reader.sents('test-utf8.txt')

# compute the cross entropy
# WORK HERE!!
log_prob = model.log_prob(sents)
e = model.cross_entropy(sents)
p = model.perplexity(sents)

print('Log probability: {}'.format(log_prob))
print('Cross entropy: {}'.format(e))
print('Perplexity: {}'.format(p))
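# Sanity-check sketch (an assumption about these custom model classes, not part of
# the original): if cross entropy is reported in bits per word, perplexity should
# equal 2 ** cross_entropy, so the value below should match the printed perplexity.
print('Check: 2 ** cross entropy = {}'.format(2 ** e))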
sample = gutenberg.raw("bible-kjv.txt")
sent = sent_tokenize(sample)
for x in range(5):
    print("Sentence - %s\n" % (sent[x]))
    print("Words - %s\n" % (nltk.word_tokenize(sent[x])))

## Reading corpora from text files ############
## No POS tags, chunks or categories ##########
reader = PlaintextCorpusReader("/Users/atul/nltk_data/corpora/gutenberg", r'^.*\.txt')
files = reader.fileids()
print("File IDs:", files)
print("Number of files:", len(files))
print(reader.words(files[0]))
print(reader.sents(files[0]))

## Reading tagged corpora #####################
reader = TaggedCorpusReader('/Users/atul/nltk_data', r'brown.pos', tagset='en-brown')
reader1 = TaggedCorpusReader('/Users/atul/nltk_data', r'brown.pos', word_tokenizer=SpaceTokenizer())
print(reader.words())
print(reader.sents())
print(reader.tagged_words())
print(reader.tagged_sents())
print(reader.tagged_words(tagset='universal'))
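# Small follow-on sketch (not in the original): count the most frequent universal
# tags from the tagged reader above, assuming brown.pos loaded correctly.
from nltk import FreqDist
fd = FreqDist(tag for (word, tag) in reader.tagged_words(tagset='universal'))
print("Most common universal tags:", fd.most_common(5))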
out_file.write("")

# Create a corpus from the files using NLTK
corpus = PlaintextCorpusReader("./Part1/", r".*\.txt")

# Loop through each file in the corpus
for fileid in corpus.fileids():
    # Set flags to 0
    org_found = 0       # Flag for when the NSF organization name has been found in the file
    amt_found = 0       # Flag for when the award amount has been found in the file
    abstract_found = 0  # Flag for when the abstract has been found in the file

    # Try to loop through each sentence in the file and apply GetOrg and GetAmt functions.
    try:
        for sent in corpus.sents(fileid):
            GetOrg()
            GetAmt()
    # If a file cannot be decoded to utf-8, add it to the problem file list and skip it.
    except UnicodeDecodeError:
        problem_files.append(fileid)
        continue

    # If there is missing data, add the file to the problem file list and skip it.
    if org == [] or org == ['null'] or amt == [] or amt == ['null']:
        problem_files.append(fileid)
        continue

    # Extract single values from list objects
    org = org[0]
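# Purely illustrative sketch (not the author's GetOrg/GetAmt, which are defined
# elsewhere in this script): an award amount could, for example, be matched in a
# sentence with a simple regex over the joined tokens.
# import re
# amount_pattern = re.compile(r'\$[\d,]+')
# amounts = amount_pattern.findall(" ".join(sent))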
models = {
    'ngram': NGram,
    'addone': AddOneNGram,
    'inter': InterpolatedNGram,
    'backoff': BackOffNGram,
}

if __name__ == '__main__':
    opts = docopt(__doc__)

    # load the data
    # TODO: Corpus must be larger than 5MB
    corpora_dir = find(os.path.join(os.getcwd(), 'corpora'))
    custom_tokenizer = RegexpTokenizer('[^.!?]+')
    reader = PlaintextCorpusReader(corpora_dir, r'.*\.txt',
                                   sent_tokenizer=custom_tokenizer)
    sents = reader.sents('corpus-utf8.txt')

    # train the model
    n = int(opts['-n'])
    model_class = models[opts['-m']]
    model = model_class(n, sents)

    # save it
    filename = opts['-o']
    f = open(filename, 'wb')
    pickle.dump(model, f)
    f.close()
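# Example invocation (assumed from the opts used above; the script and file names
# are illustrative only, not taken from the original):
#   python train.py -n 2 -m addone -o addone_2_model.pkl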
directory = "D:/Eigene Dateien_rklein/z_Forschung/_Konferenzen/_79_ICFCA - Dresden - Concept Analysis/Data/" input_directory = directory + "Input/_Product_Management/" output_directory = directory + "1_POS/" if not os.path.exists(output_directory): os.mkdir(output_directory) # reading stuff file_list = os.listdir(input_directory) print file_list # just for testing create a corpus reader from nltk.corpus.reader import PlaintextCorpusReader reader = PlaintextCorpusReader(input_directory,'.*.txt') reader.fileids() reader.raw() reader.sents() reader.words() ## default POS tagger from NLTK ## import nltk # import pprint # sent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle') pos = "nltk" path = output_directory + pos if not os.path.exists(path): os.mkdir(path) for i in range(len(file_list)): # posting = [] output = path + "/" + str(file_list[i]) jfile=open (output,"w") reader = PlaintextCorpusReader(input_directory,str(file_list[i])) text = str(reader.raw())