def get(self):
    """Demo GET handler: stems a fixed word list with PorterStemmer and
    POS-tags a sample paragraph with NLPlib, writing the results as HTML."""
    # tokenList = word_tokenize("John's big idea isn't all that bad.")
    # tokenList = pos_tag(word_tokenize("John's big idea isn't all that bad."))
    stemmer = PorterStemmer()
    plurals = ['caresses', 'flies', 'dies', 'mules', 'denied', 'died',
               'agreed', 'owned', 'humbled', 'sized', 'meeting', 'stating',
               'siezing', 'itemization', 'sensational', 'traditional',
               'reference', 'colonizer', 'plotted']
    # Stem every sample word in a single pass.
    singles = [word for word in (stemmer.stem(p) for p in plurals)]
    self.response.headers['Content-Type'] = 'text/html'
    self.response.out.write('Hello test!')
    self.response.out.write(singles)
    nlProcessor = NLPlib()
    s = "Very little is known about Beethoven's childhood. He was baptized on December 17, 1770 and was probably born a few days before that. [1][4][5][6] Beethoven's parents were Johann van Beethoven (1740 in Bonn - December 18, 1792) and Maria Magdalena Keverich (1744 in Ehrenbreitstein - July 17, 1787)."
    words = nlProcessor.tokenize(s)
    labels = nlProcessor.tag(words)
    # Emit each token followed by its POS tag, one per HTML line.
    for idx, word in enumerate(words):
        self.response.out.write(word + "(" + labels[idx] + ")<br/>")
def test(): tweets = get_file_data(sys.argv[1]) posts = [] sys.path.append("/home/nsatvik/twitminer/miner") print "1-Sports 2-Politics" tagger = NLPlib() for t in tweets: posts.append(tweet(t, 1)) print posts[-1].get_text() a = input("1 to display tags") if a == 1: words = tagger.tokenize(posts[-1].get_text()) tags = tagger.tag(words) for i in range(len(words)): print words[i], " ", tags[i] else: continue
def post(self):
    """Turn the posted 'content' into a cloze test: every token tagged
    VBD or VBZ is replaced by an HTML <select> whose correct option is
    the original verb, then the rebuilt text is written back as HTML."""
    nlProcessor = NLPlib()
    content = self.request.get('content')
    tokens = nlProcessor.tokenize(content)
    taggedContent = nlProcessor.tag(tokens)
    for i in range(len(taggedContent)):
        # Past-tense (VBD) and 3rd-person-singular present (VBZ) verbs
        # become the blanks of the cloze test.
        if taggedContent[i] in ("VBD", "VBZ"):
            correctVerb = tokens[i]
            # NOTE(review): every widget reuses id "clozefox_answer";
            # duplicate ids are invalid HTML when several verbs match —
            # confirm whether the client script relies on this.
            tokens[i] = "<select id=\"clozefox_answer\">"
            tokens[i] += "<option value=\"wrongAnswer\">loves</option>"
            tokens[i] += "<option value=\"wrongAnswer\">hates</option>"
            tokens[i] += "<option value=\"trueAnswer\">" + correctVerb + "</option>"
            tokens[i] += "</select>"
    # Bug fix: always rebuild the output from the token list. Previously
    # content stayed bound to the tag list when no verb was found, so the
    # raw list repr was written to the client instead of the text.
    content = ' '.join(tokens)
    self.response.headers['Content-Type'] = 'text/html'
    self.response.out.write(content)
newlinearray = re.findall("[^\r\n]+?[\r\n]+?", line) #separate sentences in the tweet #output_fpntr.write(str(newlinearray)+"\n") tokens = [re.split("[ \t]+", line.strip()) for line in newlinearray] # separate every word using space as separator # print tokens # #extract words, punctiation and clitics nopunctiation = [] for sentence in tokens: newsentence = [] for word in sentence: newtokens = re.compile("('(?:m|re|s|ll|ve|t)|n't|#\S+|\.[ \t]?\.(?:[ \t]*\.)*|-[ \t]*-(?:[ \t]*-)*|[!?\s]+|[^\w\s])", re.IGNORECASE).split(word) noempty = [newtoken.strip() for newtoken in newtokens if newtoken.strip() != ''] newsentence = newsentence + noempty if (newsentence != []): nopunctiation.append(newsentence) sent = nopunctiation tags = [tagger.tag(sent) for sent in nopunctiation ] zipper = lambda x, y, z: [x[i]+y[i]+z[i] for i in range(0, len(x))] zipped = [zipper(nopunctiation[i], ['/']*len(tags[i]), tags[i]) for i in range(0, len(tags))] for sentence in zipped: for i in range (0, len(sentence)): output_fpntr.write(sentence[i]) if i != len(sentence) - 1: output_fpntr.write(' ') output_fpntr.write('\n') output_fpntr.write('|\n') #output_fpntr.write('##########################################################\n') #close file pointers input_fpntr.close() output_fpntr.close()