Пример #1
0
 def do_GET(s):
     """Respond to a GET request by rating the text carried in the URL path.

     The request path is URL-decoded, passed through
     ``normalise.normalise_post`` and rated with ``ngram.rate``.  The
     response body is ``"<is_abusive>,<top_score>"`` with a 200 status and
     a ``text/html`` content type.

     Relies on the names ``normalise`` and ``ngram`` being available at
     module level.
     """
     # Function-scope import so the handler runs on Python 2 and 3 alike.
     try:
         from urllib.parse import unquote
     except ImportError:
         from urllib import unquote

     s.send_response(200)
     s.send_header("Content-type", "text/html")
     s.end_headers()
     # If someone went to "http://something.somewhere.net/foo/bar/",
     # then s.path equals "/foo/bar/".
     # Decode every percent-escape (%27, %2C, ...), not only "%20".
     post = unquote(s.path)
     # The of_interest flag is not consulted here; the words are always rated.
     of_interest, words = normalise.normalise_post(post)
     is_abusive, top_score = ngram.rate(words)
     s.wfile.write(str(is_abusive) + ',' + str(top_score))
Пример #2
0
    def check_mention(self, post, shoot, rate_verbose):
        """Rate a post and report who sent it.

        Args:
            post: Tweet dictionary.  Reads 'user', 'text', 'retweet_count'
                and, when present, 'retweeted_status'.
            shoot: When true, each n-gram model rates the post; otherwise
                every slot receives the placeholder rating 1.0.
            rate_verbose: When true, the post text is echoed.  It is forced
                to True for the rating calls whenever ``shoot`` is set.

        Returns:
            A ``(banter_rating, sender)`` tuple.  ``banter_rating`` is a
            four-element list whose indices 1..3 hold the unigram, bigram
            and trigram ratings (index 0 is never written); ``sender`` is
            the poster's screen name.  A pure retweet of our own account,
            or a post the normaliser rejects, leaves all ratings at 0.0.
        """
        sender = post['user']['screen_name']
        post_text = post['text'].encode('ascii', 'ignore')

        if self.verbose or rate_verbose:
            print("Post:   {0} ".format(post['text'].encode('utf-8')))
        if self.verbose > 2:
            print("Sender: {0} ".format(sender))

        banter_rating = [0.0, 0.0, 0.0, 0.0]

        is_own_retweet = (
            post['retweet_count'] > 0
            and 'retweeted_status' in post
            and post['retweeted_status']['user']['id'] == self.twitter_id)

        if is_own_retweet:
            print("This post is a pure retweet of {0} .. ignoring".format(
                self.screen_name))
            of_interest = False
        else:
            of_interest, words = normalise.normalise_post(post_text)

        if of_interest:
            if self.verbose > 2:
                for position, word in enumerate(words):
                    print("{0} {1}".format(position, word))

            if shoot:
                rate_verbose = True  # Always on for mentions..

            # Slots 1..3 of banter_rating map to unigram, bigram, trigram.
            models = (self.unigram, self.bigram, self.trigram)
            for ng, model in enumerate(models, 1):
                # Harvest the maximum rating.
                if shoot:
                    _alert, rating = model.rate(words, rate_verbose)
                else:
                    rating = 1.0  # HACK

                banter_rating[ng] = rating

        if self.verbose:
            print("Banter ratings: {0}".format(banter_rating[1:4]))
        return banter_rating, sender
Пример #3
0
    def frequency( self, corpus_file ):
        """Count n-gram occurrences over the usable lines of a corpus.

        Args:
            corpus_file: Seekable iterable of text lines.  Lines starting
                with "# " are skipped as comments.

        Returns:
            An ``nltk.FreqDist`` keyed by the space-joined n-gram, built
            with n-grams of size ``self.ngrams``.

        The file is rewound to its start before returning.
        """
        # use nltk.ngrams( line, self.ngrams )
        freq = nltk.FreqDist()
        for line in corpus_file:
            # startswith() is safe on empty or one-character lines, where
            # indexing line[0] / line[1] would raise IndexError.
            if line.startswith('# '):
                continue

            of_interest, normalised_line = normalise.normalise_post( line )
            if not of_interest:
                if self.verbose > 3:
                    print("Discarded line from corpus: {0!r}".format(line))
                continue

            for ngram in nltk.ngrams(normalised_line, self.ngrams):
                # NOTE(review): FreqDist.inc() is the NLTK 2.x API; confirm
                # the NLTK version this project pins before upgrading.
                freq.inc( " ".join( ngram ), 1 )

        # Ensure we are at the start of file.
        corpus_file.seek(0, 0)
        return freq
Пример #4
0
                      dest="verbose",
                      required=False,
                      help="Print status messages to stdout")

    args = parser.parse_args()

    assert ( args.neutral_corpus or args.restore), \
             "Must supply either neutral corpus or a pickled version"

    ngram = ngram.Ngram( "Ngram Classifier", args.bad_corpus, args.neutral_corpus,
                         args.verbose, args.top_words,
                         args.pickle, args.restore, args.ngram_count, args.abuse_freq )


    server_class = BaseHTTPServer.HTTPServer
    httpd = server_class(('localhost', 50007), MyHandler)
    print("Serving")
    httpd.serve_forever()


    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.bind(('localhost', 50007))
    s.listen(1)
    conn, addr = s.accept()
    print('Connected by', addr)
    while True:
      post = conn.recv(1024)
      if post == '': continue
      of_interest, words = normalise.normalise_post( post )
      is_abusive, top_score = ngram.rate(words)
      conn.sendall(str(is_abusive) + ', ' + str(top_score) + '\n')
Пример #5
0
    def analyse(self, idx, post):
        """Fold one post into the running statistics and banter maxima.

        Args:
            idx: Index of the post; used only in verbose output.
            post: Tweet dictionary.  Reads 'text', 'favorited' and
                'entities', plus the optional 'retweeted_status',
                'retweet_count', 'favorite_count' and
                'in_reply_to_status_id' keys.

        Side effects:
            Increments counters in ``self.stats`` and raises the matching
            entry of ``self.retweeted_banter``, ``self.directed_banter`` or
            ``self.ranting_banter`` when a higher rating is seen.

        Returns:
            None.
        """
        if self.verbose > 1:
            print("    " + str(idx) + "          " + post['text'].encode(
                'utf-8'))

        stats = self.stats
        is_retweet = 'retweeted_status' in post

        # Harvest basic statistics
        stats['total'] += 1
        if post['favorited']:
            stats['i_favorited'] += 1

        # Is this post an original?
        if is_retweet:
            stats['i_retweeted'] += 1
        else:
            # An original tweet, now these interest us the most.
            if 'retweet_count' in post:
                stats['others_retweeted'] += post['retweet_count']

            # The Twitter field is spelled 'favorite_count'; the hyphenated
            # spelling is kept only as a fallback for existing callers.
            favourites = post.get('favorite_count',
                                  post.get('favorite-count'))
            if favourites:
                stats['others_favourited'] += favourites

        # Is this post a reply?
        if post.get('in_reply_to_status_id'):
            stats['in_reply'] += 1

        entities = post['entities']
        if len(entities['urls']):
            stats['contains_urls'] += 1

        has_mentions = len(entities['user_mentions'])
        if has_mentions:
            stats['contains_mentions'] += 1

        if 'media' in entities:
            stats['contains_media'] += 1

        # Now parse this post using a number of raters.
        post_text = post['text'].encode('ascii', 'ignore')
        of_interest, words = normalise.normalise_post(post_text)

        if not of_interest:
            if self.verbose:
                print("No interest in this post")
            return

        rate_verbose = self.verbose > 2
        if rate_verbose:
            for position, word in enumerate(words):
                print("{0} {1}".format(position, word))

        # NOTE(review): range(1, 3) yields only 1 and 2, so the trigram
        # branch below is never reached.  Confirm whether trigram rating
        # should be enabled (and that the banter tables have a slot 3)
        # before widening the range.
        for ng in range(1, 3):
            if ng == 1:
                ngram = self.unigram
            elif ng == 2:
                ngram = self.bigram
            else:
                ngram = self.trigram

            if ngram:
                _alert, rating = ngram.rate(words, rate_verbose)
            else:
                rating = 0.0

            # Save the highest ratings we find.
            # FIXME: Do we also save the mean, median, standard distribution?
            if is_retweet:
                label, best = "Retweeted banter: ", self.retweeted_banter
            elif has_mentions:
                label, best = "Directed banter: ", self.directed_banter
            else:
                label, best = "Ranting banter: ", self.ranting_banter

            if rating > best[ng]:
                if self.verbose > 1:
                    # Space-joined to match the spacing of the former
                    # comma-separated print statement.
                    print(" ".join(
                        [label, str(best[ng]), " -> ", str(rating)]))
                best[ng] = rating