Exemplo n.º 1
0
    def is_tweet_valid(self, tweet):
        """Tell whether ``tweet`` passes every filter.

        The checks run in order and the first one that fails decides:

        1. the tweet is falsy or contains a ``'delete'`` key;
        2. ``'lang'`` is missing or is not ``'en'``;
        3. ``'text'`` is missing or starts with ``'RT'``;
        4. the text, after ``TwitterMixin.word_map``, contains both the
           ``__h__`` and the ``__s__`` token.

        A rejected tweet is reported through ``logger.debug`` and ``False``
        is returned; otherwise the result is ``True``.
        """
        reason = None

        if not tweet or 'delete' in tweet:
            reason = 'Empty tweet - skipping'
        elif 'lang' not in tweet or tweet['lang'] != 'en':
            reason = 'Non EN - skipping'
        elif 'text' not in tweet or tweet['text'].startswith('RT'):
            reason = 'RE-Tweet found - skipping'
        else:
            # Only tweets that survived the cheap checks get word-mapped.
            tokens = TwitterMixin.word_map(tweet['text']).split()
            if '__h__' in tokens and '__s__' in tokens:
                reason = 'Tweet with double emoicons found - skipping'

        if reason is None:
            return True

        logger.debug(reason)
        return False
Exemplo n.º 2
0
    def is_tweet_valid(self, tweet):
        """Return ``True`` when ``tweet`` should be kept, ``False`` otherwise.

        A tweet is dropped (with a debug log entry) when it is falsy or has a
        ``"delete"`` key, when its ``"lang"`` is absent or not ``"en"``, when
        its ``"text"`` is absent or begins with ``"RT"``, or when the
        word-mapped text holds both the ``__h__`` and ``__s__`` tokens.
        """

        def reject(message):
            # Single place that logs the reason and yields the negative answer.
            logger.debug(message)
            return False

        if not tweet or "delete" in tweet:
            return reject("Empty tweet - skipping")

        if "lang" not in tweet or tweet["lang"] != "en":
            return reject("Non EN - skipping")

        if "text" not in tweet or tweet["text"].startswith("RT"):
            return reject("RE-Tweet found - skipping")

        mapped_tokens = set(TwitterMixin.word_map(tweet["text"]).split())
        # Both markers present at once means the tweet is skipped.
        if {"__h__", "__s__"} <= mapped_tokens:
            return reject("Tweet with double emoicons found - skipping")

        return True
Exemplo n.º 3
0
    def is_tweet_valid(self, tweet):
        """Filter out tweets that must not be processed.

        Returns ``False`` (after a debug log line naming the reason) for:
        a falsy tweet or one carrying a ``'delete'`` key; a tweet whose
        ``'lang'`` is missing or differs from ``'en'``; a tweet whose
        ``'text'`` is missing or starts with ``'RT'``; a tweet whose
        word-mapped text contains both ``__h__`` and ``__s__``.
        Returns ``True`` for everything else.
        """
        missing_or_deleted = not tweet or 'delete' in tweet
        if missing_or_deleted:
            logger.debug('Empty tweet - skipping')
            return False

        not_english = 'lang' not in tweet or tweet['lang'] != 'en'
        if not_english:
            logger.debug('Non EN - skipping')
            return False

        is_retweet = 'text' not in tweet or tweet['text'].startswith('RT')
        if is_retweet:
            logger.debug('RE-Tweet found - skipping')
            return False

        words = TwitterMixin.word_map(tweet['text']).split()
        # Checked in the same order as the markers are listed: __h__, then __s__.
        if all(marker in words for marker in ('__h__', '__s__')):
            logger.debug('Tweet with double emoicons found - skipping')
            return False

        return True
Exemplo n.º 4
0
 def test_remove_usernames(self):
     """``remove_usernames`` must leave only ``hello`` and ``xxx`` as tokens."""
     cleaned = TwitterMixin.remove_usernames("hello @username xxx")
     # Compare token lists so the amount of whitespace left behind is irrelevant.
     self.assertEqual("hello xxx".split(), cleaned.split())
Exemplo n.º 5
0
 def test_word_map(self):
     """``word_map`` must turn ``:)``, ``:(`` and ``not`` into their markers."""
     mapped = TwitterMixin.word_map("hello :) :( not xxx :)")
     wanted_tokens = "hello __h__ __s__ __not__ xxx __h__".split()
     # Token-wise comparison; spacing in the mapped string does not matter.
     self.assertEqual(wanted_tokens, mapped.split())
Exemplo n.º 6
0
 def test_char_fold(self):
     """``char_fold`` must shorten ``loooooooool`` to ``lool``."""
     folded = TwitterMixin.char_fold("hello loooooooool")
     # The first word has no long character run and must come through as is.
     self.assertEqual("hello lool".split(), folded.split())
Exemplo n.º 7
0
 def test_remove_numbers(self):
     """``remove_numbers`` must drop the ``12456`` token from the text."""
     cleaned = TwitterMixin.remove_numbers("hello 12456 xxx")
     # Compare token lists so leftover whitespace does not affect the result.
     self.assertEqual("hello xxx".split(), cleaned.split())
Exemplo n.º 8
0
 def test_remove_urls(self):
     """``remove_urls`` must drop the ``http://cyhex.com`` token from the text."""
     cleaned = TwitterMixin.remove_urls("hello http://cyhex.com xxx")
     # Compare token lists so leftover whitespace does not affect the result.
     self.assertEqual("hello xxx".split(), cleaned.split())
Exemplo n.º 9
0
 def test_remove_hashtags(self):
     """``remove_hashtags`` must drop the ``#hashtag`` token from the text."""
     cleaned = TwitterMixin.remove_hashtags("hello #hashtag xxx")
     # Compare token lists so leftover whitespace does not affect the result.
     self.assertEqual("hello xxx".split(), cleaned.split())
Exemplo n.º 10
0
 def test_remove_usernames(self):
     """Check that the ``@username`` mention disappears from the sample text."""
     sample = "hello @username xxx"
     wanted_tokens = ["hello", "xxx"]
     actual_tokens = TwitterMixin.remove_usernames(sample).split()
     self.assertEqual(wanted_tokens, actual_tokens)
Exemplo n.º 11
0
 def test_word_map(self):
     """Check the marker tokens produced for ``:)``, ``:(`` and ``not``."""
     sample = "hello :) :( not xxx :)"
     wanted_tokens = ["hello", "__h__", "__s__", "__not__", "xxx", "__h__"]
     actual_tokens = TwitterMixin.word_map(sample).split()
     self.assertEqual(wanted_tokens, actual_tokens)
Exemplo n.º 12
0
 def test_char_fold(self):
     """Check that the long run of ``o`` in the sample is folded to ``lool``."""
     sample = "hello loooooooool"
     wanted_tokens = ["hello", "lool"]
     actual_tokens = TwitterMixin.char_fold(sample).split()
     self.assertEqual(wanted_tokens, actual_tokens)
Exemplo n.º 13
0
 def test_remove_numbers(self):
     """Check that the numeric token disappears from the sample text."""
     sample = "hello 12456 xxx"
     wanted_tokens = ["hello", "xxx"]
     actual_tokens = TwitterMixin.remove_numbers(sample).split()
     self.assertEqual(wanted_tokens, actual_tokens)
Exemplo n.º 14
0
 def test_remove_urls(self):
     """Check that the URL token disappears from the sample text."""
     sample = "hello http://cyhex.com xxx"
     wanted_tokens = ["hello", "xxx"]
     actual_tokens = TwitterMixin.remove_urls(sample).split()
     self.assertEqual(wanted_tokens, actual_tokens)
Exemplo n.º 15
0
 def test_remove_hashtags(self):
     """Check that the ``#hashtag`` token disappears from the sample text."""
     sample = "hello #hashtag xxx"
     wanted_tokens = ["hello", "xxx"]
     actual_tokens = TwitterMixin.remove_hashtags(sample).split()
     self.assertEqual(wanted_tokens, actual_tokens)
    return (feature_extractor(row.text), row.get_label())


if args.source:
    # Build ``featureset`` from the tab-separated file named by ``args.source``.
    # Every row after the first must hold six fields; a row is labelled
    # positive or negative according to which of its two scores is larger,
    # and rows with equal scores are left out.
    featureset = []
    c = 0  # 1-based number of the last line read (0 when the file is empty)

    # The context manager guarantees the file is closed, even when a row
    # fails to parse; iterating the handle avoids loading the whole file.
    with io.open(args.source) as f:
        for c, line in enumerate(f, 1):
            # The first line is skipped before it is parsed, so it does not
            # need to have six fields (presumably a header row -- confirm).
            if c == 1:
                continue

            pos, synset_id, posScore, negScore, synsetTerm, gloss = line.split('\t')

            gloss = TwitterMixin.make_plain(gloss)
            # Parenthesised form prints the same text under Python 2 and 3.
            print(negScore)
            negScore = float(negScore)
            posScore = float(posScore)

            if posScore > negScore:
                label = labels.positive
            elif posScore < negScore:
                label = labels.negative
            else:
                # Equal scores give no polarity to learn from.
                continue

            featureset.append((tokenizer.getFeatures(gloss), label))

else: