Example #1
    def get(self):
        # Alternative NLTK-based tokenizing/tagging, kept for reference:
        # tokenList = word_tokenize("John's big idea isn't all that bad.")
        # tokenList = pos_tag(word_tokenize("John's big idea isn't all that bad."))

        # Stem a demo word list with NLTK's Porter stemmer.
        stemmer = PorterStemmer()
        plurals = ['caresses', 'flies', 'dies', 'mules', 'denied',
                   'died', 'agreed', 'owned', 'humbled', 'sized',
                   'meeting', 'stating', 'siezing', 'itemization',
                   'sensational', 'traditional', 'reference', 'colonizer',
                   'plotted']
        singles = [stemmer.stem(plural) for plural in plurals]

        self.response.headers['Content-Type'] = 'text/html'
        self.response.out.write('Hello test!')
        self.response.out.write(singles)

        # Tokenize and POS-tag a sample passage with NLPlib, then write
        # each token followed by its tag.
        nlProcessor = NLPlib()

        s = "Very little is known about Beethoven's childhood. He was baptized on December 17, 1770 and was probably born a few days before that. [1][4][5][6] Beethoven's parents were Johann van Beethoven (1740 in Bonn - December 18, 1792) and Maria Magdalena Keverich (1744 in Ehrenbreitstein - July 17, 1787)."

        v = nlProcessor.tokenize(s)
        t = nlProcessor.tag(v)
        for i in range(len(v)):
            self.response.out.write(v[i] + "(" + t[i] + ")<br/>")
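Example #1 combines NLTK's Porter stemmer with NLPlib's tokenize/tag pair inside a Google App Engine GET handler. Below is a minimal standalone sketch of the same flow; it assumes NLPlib.py is on the import path and defines an NLPlib class with the tokenize()/tag() methods used above (the import line is an assumption, since the handlers above pull the class in elsewhere in their project).

# Minimal sketch of the Example #1 flow outside App Engine.
# Assumes NLPlib.py is importable and exposes the NLPlib class with the
# tokenize()/tag() methods used above (the import path is an assumption).
from nltk.stem.porter import PorterStemmer
from NLPlib import NLPlib

stemmer = PorterStemmer()
print(stemmer.stem('flies'))        # Porter stemming strips suffixes, e.g. 'fli'
print(stemmer.stem('traditional'))  # e.g. 'tradit'

tagger = NLPlib()
tokens = tagger.tokenize("Very little is known about Beethoven's childhood.")
tags = tagger.tag(tokens)
for token, tag in zip(tokens, tags):
    print(token + '/' + tag)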
Example #2
def test():
    tweets = get_file_data(sys.argv[1])
    posts = []
    # Make the NLPlib module importable from the project directory.
    sys.path.append("/home/nsatvik/twitminer/miner")
    print "1-Sports 2-Politics"
    tagger = NLPlib()
    for t in tweets:
        posts.append(tweet(t, 1))
        print posts[-1].get_text()
        # Optionally show the POS tag for each token in the current tweet.
        a = input("1 to display tags")
        if a == 1:
            words = tagger.tokenize(posts[-1].get_text())
            tags = tagger.tag(words)
            for i in range(len(words)):
                print words[i], " ", tags[i]
Example #3
    def post(self):
        nlProcessor = NLPlib()
        content = self.request.get('content')
        tokens = nlProcessor.tokenize(content)
        taggedContent = nlProcessor.tag(tokens)

        # Replace every past-tense (VBD) or 3rd-person singular present (VBZ)
        # verb with an HTML <select> offering two distractors plus the original verb.
        for i in range(len(taggedContent)):
            isVerb = (taggedContent[i] == "VBD" or taggedContent[i] == "VBZ")
            if isVerb:
                correctVerb = tokens[i]
                tokens[i] = "<select id=\"clozefox_answer\">"
                tokens[i] += "<option value=\"wrongAnswer\">loves</option>"
                tokens[i] += "<option value=\"wrongAnswer\">hates</option>"
                tokens[i] += "<option value=\"trueAnswer\">" + correctVerb + "</option>"
                tokens[i] += "</select>"

        content = ' '.join(tokens)

        self.response.headers['Content-Type'] = 'text/html'
        self.response.out.write(content)
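Example #3 turns tagged text into a cloze exercise: each token tagged VBD or VBZ is swapped for an HTML <select> holding two hard-coded distractors and the original verb as the true answer. The inline string concatenation could be pulled into a small helper; this is only a sketch, and build_cloze_select is a hypothetical name (the distractors mirror the hard-coded ones above).

# Hypothetical helper mirroring the inline <select> markup built in Example #3.
def build_cloze_select(correct_verb, distractors=('loves', 'hates')):
    options = ['<option value="wrongAnswer">%s</option>' % d for d in distractors]
    options.append('<option value="trueAnswer">%s</option>' % correct_verb)
    return '<select id="clozefox_answer">' + ''.join(options) + '</select>'

# e.g. tokens[i] = build_cloze_select(correctVerb)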
Example #4
        newlinearray = re.findall("[^\r\n]+?[\r\n]+?", line)  # separate the sentences in the tweet
        #output_fpntr.write(str(newlinearray)+"\n")
        tokens = [re.split("[ \t]+", line.strip()) for line in newlinearray]  # split every sentence into words on whitespace
#        print tokens

        # Extract words, punctuation and clitics.
        clitic_re = re.compile("('(?:m|re|s|ll|ve|t)|n't|#\S+|\.[ \t]?\.(?:[ \t]*\.)*|-[ \t]*-(?:[ \t]*-)*|[!?\s]+|[^\w\s])", re.IGNORECASE)
        nopunctuation = []
        for sentence in tokens:
            newsentence = []
            for word in sentence:
                newtokens = clitic_re.split(word)
                noempty = [newtoken.strip() for newtoken in newtokens if newtoken.strip() != '']
                newsentence = newsentence + noempty
            if newsentence != []:
                nopunctuation.append(newsentence)

        # Tag each sentence, then join every token with its tag as word/TAG.
        tags = [tagger.tag(sent) for sent in nopunctuation]

        zipper = lambda x, y, z: [x[i] + y[i] + z[i] for i in range(0, len(x))]
        zipped = [zipper(nopunctuation[i], ['/'] * len(tags[i]), tags[i]) for i in range(0, len(tags))]

        for sentence in zipped:
            for i in range(0, len(sentence)):
                output_fpntr.write(sentence[i])
                if i != len(sentence) - 1:
                    output_fpntr.write(' ')
            output_fpntr.write('\n')

        output_fpntr.write('|\n')
        #output_fpntr.write('##########################################################\n')
    #close file pointers
    input_fpntr.close()
    output_fpntr.close()
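Example #4 writes space-separated word/TAG pairs, one sentence per line, with a lone '|' line between tweets. The index-based zipper lambda pairs each token with its tag; the same step can be expressed more directly with zip, as in this sketch reusing the names from the example.

# Equivalent of the zipper/zipped step above, expressed with zip.
# 'nopunctuation' and 'tags' are the per-sentence token and tag lists from Example #4.
zipped = [[word + '/' + tag for word, tag in zip(sentence, sentence_tags)]
          for sentence, sentence_tags in zip(nopunctuation, tags)]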