def read_emails(path):
    """Load every regular file directly under *path* and split it into emails.

    The files are read as one corpus through ``WordListCorpusReader``; the
    combined raw text is passed through ``clean`` and then handed, together
    with the reader's file ids, to ``split_emails``.

    Args:
        path: Directory containing the email files.

    Returns:
        Whatever ``split_emails`` returns for the cleaned text and file ids.
    """
    # '.DS_Store' is macOS Finder metadata, not an email.  Filtering it here
    # replaces a try/del/bare-except that would also have hidden unrelated
    # errors (a bare ``except`` even catches KeyboardInterrupt).
    files = [
        f for f in listdir(path)
        if f != '.DS_Store' and isfile(join(path, f))
    ]
    reader = WordListCorpusReader(path, files)
    text = clean(reader.raw())
    return split_emails(text, reader.fileids())
def main():
    """Print the date and text of one post taken from 'banbagsfb.txt'.

    The file is read through ``WordListCorpusReader`` rooted at the
    module-level ``path``.  The fifth line of the file is treated as the page
    to process: an item number, a date and a URL are pulled out of it, its
    markup is stripped with BeautifulSoup, the module-level ``reps``
    replacements are applied, and the resulting date and text are printed.
    """
    reader = WordListCorpusReader(path, ['banbagsfb.txt'])
    pages = line_tokenize(reader.raw())
    # line_tokenize yields plain strings, so the page is used as-is (a str
    # has no ``.raw()`` method to call).
    thispage = pages[4]

    # The patterns below are raw strings (r"...") so that backslash
    # sequences such as \d reach the regex engine literally instead of being
    # interpreted as Python string escapes.
    m = re.search(r"(\d)", thispage)
    thisitem = m.group(0) if m else None      # first digit on the page
    m = re.search(r"(\d\d\D\d\d)", thispage)
    thisdate = m.group(0) if m else None      # two digits, separator, two digits

    # The URL runs from the first 'http' up to two characters before the
    # next space.
    starturl = thispage.find('http')
    endurl = thispage.find(' ', starturl) - 2
    thisurl = thispage[starturl:endurl]
    # NOTE(review): thisitem, thisdate and thisurl are not used further in
    # this function -- presumably kept for later work; confirm.

    soup = BeautifulSoup(thispage)
    # findAll(text=True) returns a list of text nodes; join them so that the
    # string replacements and slicing below operate on one string.
    newpage = "".join(soup.findAll(text=True))
    html = replace_all(newpage, reps)

    # NOTE(review): the fixed offsets (11, then 5) assume a constant-width
    # prefix followed by a five-character date -- confirm against the data.
    html = html[11:]
    postdate = html[0:5]
    posttext = html[5:]

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("post date = " + postdate)
    print("post text = " + posttext)


def replace_all(txt, reps):
    """Return *txt* with each key of *reps* replaced by its value.

    Replacements are applied one after another in the mapping's iteration
    order, so a later replacement sees the result of the earlier ones.

    Args:
        txt: The string to transform.
        reps: Mapping of substring -> replacement string.

    Returns:
        The transformed string (``txt`` unchanged when ``reps`` is empty).
    """
    # .items() exists on both Python 2 and 3 dicts, unlike .iteritems().
    for old, new in reps.items():
        txt = txt.replace(old, new)
    return txt


if __name__ == "__main__":
    main()
import nltk
from nltk.corpus import brown
from nltk.corpus.reader import WordListCorpusReader
from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer

# Script: tokenise 'files/computerscience.txt' and print, for every token,
# its Porter stem followed by its WordNet lemma.

# Loaded as plain text; not referenced again in this part of the file.
x = nltk.data.load('files/big.txt', format='text')

reader = WordListCorpusReader('files/', ['computerscience.txt'])
cs_text = reader.raw()

# word_tokenize already returns a list, so no placeholder list is needed
# beforehand.
cs_words = nltk.word_tokenize(cs_text)
print(cs_words)

stemmer = PorterStemmer()
wnl = WordNetLemmatizer()

# One pass over the tokens: stem first, then lemma, each on its own line.
for word in cs_words:
    print(stemmer.stem(word))
    print(wnl.lemmatize(word))
# Locations of the assignment data on the author's machine.
tagged_data_filepath = "/Users/aledjackson/Documents/University/Second Year/Modules/Natural Language Processing/Assignments/assignment1/training"
untagged_data_filepath = "/Users/aledjackson/Documents/University/Second Year/Modules/Natural Language Processing/Assignments/assignment1/seminar_testdata/test_untagged"
general_data_filepath = "/Users/aledjackson/Documents/University/Second Year/Modules/Natural Language Processing/Assignments/assignment1/Data"

# Word list read from the 'names.family' file in the general data directory.
l_names = WordListCorpusReader(general_data_filepath, ["names.family"]).words()

# Regular files directly inside the untagged-data directory.
file_names = [
    f for f in listdir(untagged_data_filepath)
    if isfile(join(untagged_data_filepath, f))
]
# NOTE(review): drops the first directory entry -- presumably a hidden file
# such as '.DS_Store'.  listdir() order is arbitrary, so this relies on that
# entry happening to come first; confirm and consider filtering by name.
file_names = file_names[1:]

# Work on the first remaining file only.
reader = WordListCorpusReader(untagged_data_filepath, [file_names[0]])
corpus = reader.raw()
words = reader.words()


def get_tags_by_name(corpus, name):
    """Return every ``<name>...</name>`` span found in *corpus*.

    Args:
        corpus: Text to search.
        name: Tag name.  It is interpolated into the pattern unescaped, so
            regex metacharacters in it are interpreted as regex syntax.

    Returns:
        A list of the matched spans, each including its opening and closing
        tag.  ``.`` does not match a newline here, so a tag whose content
        spans several lines is not returned.
    """
    # Non-greedy '.+?' stops at the first closing tag; a greedy '.+' would
    # merge several same-name tags on one line into a single match.
    return re.findall(r"<" + name + r">.+?</" + name + r">", corpus)


def tokenise(corpus):
    """Split *corpus* into tokens separated by whitespace or angle brackets.

    Args:
        corpus: Text to tokenise.

    Returns:
        A list of the maximal runs of characters that are neither whitespace
        nor ``<`` / ``>``, in order of appearance.
    """
    # Matching the runs directly (rather than "run followed by a delimiter")
    # keeps the final token even when the text does not end with a delimiter.
    return re.findall(r"[^\s<>]+", corpus)


def get_name_of_poster(corpus):
    """Return the name of the poster found in *corpus* (not implemented yet).

    Raises:
        NotImplementedError: Always.  The extraction pattern has not been
            written; the previous body called ``re.findall()`` without
            arguments, which could only fail with a TypeError.
    """
    raise NotImplementedError("get_name_of_poster has no extraction pattern yet")