def do_GET(s):
    """Respond to a GET request by rating the text carried in the URL path.

    The request path is URL-decoded, passed through
    ``normalise.normalise_post`` and the resulting words are scored with
    ``ngram.rate``.  The response is always a 200 whose body is
    ``"<is_abusive>,<top_score>"``.
    """
    # Imported locally so this handler does not depend on a file-level
    # import; the name lives in different modules on Python 2 and 3.
    try:
        from urllib import unquote  # Python 2
    except ImportError:
        from urllib.parse import unquote  # Python 3

    s.send_response(200)
    s.send_header("Content-type", "text/html")
    s.end_headers()

    # If someone went to "http://something.somewhere.net/foo/bar/",
    # then s.path equals "/foo/bar/".
    # Decode every percent-escape rather than only "%20", so punctuation
    # and non-ASCII characters reach the normaliser as the client typed
    # them instead of as literal "%27", "%2C", ...
    post = unquote(s.path)

    # The interest flag is not consulted here; the words are always rated.
    _of_interest, words = normalise.normalise_post(post)
    is_abusive, top_score = ngram.rate(words)
    s.wfile.write(str(is_abusive) + ',' + str(top_score))
def check_mention(self, post, shoot, rate_verbose):
    """Rate a post that mentions us with the unigram, bigram and trigram raters.

    Args:
        post: dict describing the post.  Reads ``post['user']['screen_name']``,
            ``post['text']``, ``post['retweet_count']`` and, when present,
            ``post['retweeted_status']['user']['id']``.
        shoot: when true the raters are actually consulted (always verbosely);
            when false every rating is the placeholder ``1.0``.
        rate_verbose: when true the post text is echoed even if
            ``self.verbose`` is off.

    Returns:
        ``(banter_rating, sender)``.  ``banter_rating`` is a four-element list
        indexed by n-gram order: slots 1..3 hold the unigram, bigram and
        trigram ratings, slot 0 is never written and stays ``0.0``.  All slots
        stay ``0.0`` when the post is a pure retweet of our own account or
        the normaliser reports it as not of interest.
    """
    sender = post['user']['screen_name']
    post_text = post['text'].encode('ascii', 'ignore')

    if self.verbose or rate_verbose:
        print("Post: {0} ".format(post['text'].encode('utf-8')))
    if self.verbose > 2:
        print("Sender: {0} ".format(sender))

    banter_rating = [0.0, 0.0, 0.0, 0.0]

    pure_retweet = (
        post['retweet_count'] > 0
        and 'retweeted_status' in post
        and post['retweeted_status']['user']['id'] == self.twitter_id
    )
    if pure_retweet:
        print("This post is a pure retweet of {0} .. ignoring".format(
            self.screen_name))
        of_interest = False
    else:
        of_interest, words = normalise.normalise_post(post_text)

    if of_interest:
        # Test the verbosity level once instead of once per word.
        if self.verbose > 2:
            for position, word in enumerate(words):
                print("{0} {1}".format(position, word))

        # Slot ng of banter_rating belongs to the rater of n-gram order ng.
        raters = (self.unigram, self.bigram, self.trigram)
        for ng, rater in enumerate(raters, 1):
            # Harvest the maximum rating.
            if shoot:
                # Verbose rating is always on for mentions.
                _alert, rating = rater.rate(words, True)
            else:
                rating = 1.0  # HACK
            banter_rating[ng] = rating

    if self.verbose:
        print("Banter ratings: {0}".format(banter_rating[1:4]))
    return banter_rating, sender
def frequency( self, corpus_file ):
    """Count the n-grams that occur in the lines of *corpus_file*.

    Lines beginning with ``"# "`` are skipped.  Every other line is passed to
    ``normalise.normalise_post``; when it reports the line as of interest,
    the normalised line is split into n-grams of size ``self.ngrams`` and
    each n-gram, joined with single spaces, is counted once per occurrence.

    Args:
        corpus_file: iterable of lines that also supports ``seek``; it is
            rewound to offset 0 before returning.

    Returns:
        The ``nltk.FreqDist`` mapping each joined n-gram to its count.
    """
    freq = nltk.FreqDist()

    for line in corpus_file:
        # startswith() copes with lines shorter than two characters, where
        # indexing line[1] would raise IndexError.
        if line.startswith('# '):
            continue

        of_interest, normalised_line = normalise.normalise_post( line )
        if not of_interest:
            if self.verbose > 3:
                # Printed as a 2-tuple, exactly as this diagnostic has
                # always been emitted.
                print(("Discarded line from corpus: ", line))
            continue

        for gram in nltk.ngrams(normalised_line, self.ngrams):
            # Item assignment works on NLTK 2 and 3 alike, whereas
            # FreqDist.inc() no longer exists in NLTK 3.
            freq[" ".join( gram )] += 1

    # Ensure we are at the start of file.
    corpus_file.seek(0, 0)
    return freq
dest="verbose", required=False, help="Print status messages to stdout") args = parser.parse_args() assert ( args.neutral_corpus or args.restore), \ "Must supply either neutral corpus or a pickled version" ngram = ngram.Ngram( "Ngram Classifier", args.bad_corpus, args.neutral_corpus, args.verbose, args.top_words, args.pickle, args.restore, args.ngram_count, args.abuse_freq ) server_class = BaseHTTPServer.HTTPServer httpd = server_class(('localhost', 50007), MyHandler) print("Serving") httpd.serve_forever() s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) s.bind(('localhost', 50007)) s.listen(1) conn, addr = s.accept() print('Connected by', addr) while True: post = conn.recv(1024) if post == '': continue of_interest, words = normalise.normalise_post( post ) is_abusive, top_score = ngram.rate(words) conn.sendall(str(is_abusive) + ', ' + str(top_score) + '\n')
def analyse(self, idx, post):
    """Fold one post into the running statistics and the banter maxima.

    First updates the counters in ``self.stats`` (totals, favourites,
    retweets, replies, and whether the post carries URLs, mentions or media).
    Then normalises the post text and, if it is of interest, rates it with
    each configured n-gram rater.  A rating that exceeds the stored value is
    saved in ``self.retweeted_banter``, ``self.directed_banter`` or
    ``self.ranting_banter`` depending on whether the post is a retweet,
    mentions other users, or neither.

    Args:
        idx: position of the post, used only in verbose output.
        post: dict describing the post.  Reads ``text``, ``favorited``,
            ``entities`` and, when present, ``retweeted_status``,
            ``retweet_count``, ``favorite_count`` and
            ``in_reply_to_status_id``.
    """
    if self.verbose > 1:
        print(" " + str(idx) + " " + post['text'].encode('utf-8'))

    # Harvest basic statistics
    stats = self.stats
    stats['total'] += 1
    if post['favorited']:
        stats['i_favorited'] += 1

    # Is this post an original?
    is_retweet = 'retweeted_status' in post
    if is_retweet:
        stats['i_retweeted'] += 1
    else:
        # An original tweet, now these interest us the most.
        if 'retweet_count' in post:
            stats['others_retweeted'] += post['retweet_count']
        # The Twitter field is spelled "favorite_count"; the hyphenated
        # spelling used previously never matched, so the counter never
        # moved.  It is still accepted as a fallback.
        for key in ('favorite_count', 'favorite-count'):
            if key in post:
                stats['others_favourited'] += post[key]
                break

    # Is this post a reply?
    if post.get('in_reply_to_status_id'):
        stats['in_reply'] += 1

    entities = post['entities']
    mentions = entities['user_mentions']
    if len(entities['urls']):
        stats['contains_urls'] += 1
    if len(mentions):
        stats['contains_mentions'] += 1
    if 'media' in entities:
        stats['contains_media'] += 1

    # Now parse this post using a number of raters.
    post_text = post['text'].encode('ascii', 'ignore')
    of_interest, words = normalise.normalise_post(post_text)
    if not of_interest:
        if self.verbose:
            print("No interest in this post")
        return

    rate_verbose = self.verbose > 2
    if rate_verbose:
        for position, word in enumerate(words):
            print("{0} {1}".format(position, word))

    # The kind of post does not depend on the rater, so choose the list of
    # maxima (and its log label) once, before the rating loop.
    if is_retweet:
        label, best = "Retweeted banter: ", self.retweeted_banter
    elif len(mentions):
        label, best = "Directed banter: ", self.directed_banter
    else:
        label, best = "Ranting banter: ", self.ranting_banter

    # Slot ng belongs to the rater of n-gram order ng.  All three orders are
    # visited: the previous range(1, 3) stopped at the bigram, leaving the
    # trigram branch unreachable.
    # NOTE(review): assumes the banter lists have slots 1..3 -- TODO confirm.
    raters = (self.unigram, self.bigram, self.trigram)
    for ng, rater in enumerate(raters, 1):
        if rater:
            _alert, rating = rater.rate(words, rate_verbose)
        else:
            # No rater configured for this order.
            rating = 0.0

        # Save the highest ratings we find.
        # FIXME: Do we also save the mean, median, standard distribution?
        if rating > best[ng]:
            if self.verbose > 1:
                print("{0} {1}  ->  {2}".format(label, best[ng], rating))
            best[ng] = rating