def extract_more_decep_tech_features(self, tweets, vocab_file):
    """Build a sparse feature matrix: five surface features plus unigrams.

    Args:
        tweets: Sequence of raw tweet strings.
        vocab_file: Path to a text file holding one vocabulary term per
            line. A term's 0-based line position becomes its column index
            in the bag-of-words part (a repeated term keeps the position
            of its last occurrence).

    Returns:
        The result of ``hstack`` over two matrices, in this column order:
        the five per-tweet features ``[n_elong, n_caps, n_punct,
        n_hashtag, emoticon_mood]`` followed by the binary unigram
        features produced by ``CountVectorizer`` with the fixed vocabulary.
    """
    # Read the training vocabulary; the context manager closes the file.
    with open(vocab_file) as vocab_handle:
        train_vocab = {
            line.strip(): index for index, line in enumerate(vocab_handle)
        }

    # The keyword must be spelled ``vocabulary``; any other name makes
    # CountVectorizer raise TypeError before a single tweet is processed.
    cv = CountVectorizer(ngram_range=(1, 1),
                         binary=True,
                         vocabulary=train_vocab)
    train_features_bow = cv.fit_transform(tweets)

    # Raw strings compile to the same patterns as before without relying on
    # invalid escape sequences inside ordinary string literals.
    hash_pattern = re.compile(r"\#+[\w_]+[\w'_\-]*[\w_]+")
    # A letter immediately followed by two or more copies of itself.
    elong_pattern = re.compile(r"([a-zA-Z])\1{2,}")
    # An uppercase letter followed by at least one uppercase letter or digit.
    caps_pattern = re.compile(r"[A-Z][A-Z\d]+")
    # NOTE: matches every run of . , ! ? including runs of length one, so
    # this counts punctuation runs rather than strictly repeated marks.
    punc_pattern = re.compile(r"([.,!?]+)")

    # Numeric codes for the labels returned by emoticons.analyze_tweet.
    mood_codes = {'NA': 0, 'SAD': 1, 'HAPPY': 2, 'BOTH_HS': 4}

    rows = []
    for tweet in tweets:
        mood = emoticons.analyze_tweet(tweet.strip())
        rows.append([
            len(elong_pattern.findall(tweet)),
            len(caps_pattern.findall(tweet)),
            len(punc_pattern.findall(tweet)),
            len(hash_pattern.findall(tweet)),
            # A label outside the table is passed through unchanged, as the
            # original if/elif chain did.
            mood_codes.get(mood, mood),
        ])

    # reshape(-1, 5) keeps the column count at five even when ``tweets`` is
    # empty; an empty list on its own would become a (1, 0) matrix.
    surface_features = sparse.csr_matrix(np.asarray(rows).reshape(-1, 5))
    return hstack([surface_features, train_features_bow])
def extractEmoticons(tweets):
    """One-hot encode the emoticon label of each tweet.

    Args:
        tweets: Iterable of tweets; each one is passed unchanged to
            ``analyze_tweet``.

    Returns:
        A ``(vects, vocab)`` tuple. ``vects`` is a list with one length-4
        NumPy array per tweet, holding a 1 in the column of the label that
        ``analyze_tweet`` returned (all zeros when the label is not in
        ``vocab``). ``vocab`` names the label of each column, in order.
    """
    # The labels must be listed in the order the columns are filled:
    # NA -> 0, HAPPY -> 1, SAD -> 2, BOTH_HS -> 3. The list previously
    # started with BOTH_HS and ended with NA, mislabelling columns 0 and 3.
    vocab = ["NA", "HAPPY", "SAD", "BOTH_HS"]
    vects = []
    for tweet in tweets:
        vect = np.zeros(len(vocab))
        emo = analyze_tweet(tweet)
        if emo in vocab:
            vect[vocab.index(emo)] = 1
        vects.append(vect)
    return vects, vocab
def extractEmoticons(tweets):
    """Turn each tweet's emoticon label into a one-hot vector.

    Args:
        tweets: Iterable of tweets, each handed as-is to ``analyze_tweet``.

    Returns:
        ``(vects, vocab)``: ``vects`` is a list containing one length-4
        NumPy array per tweet, with a 1 at the position of the label
        returned by ``analyze_tweet`` and zeros elsewhere (the array stays
        all zeros for a label missing from ``vocab``); ``vocab`` gives the
        label that each position stands for.
    """
    # Position i of every vector corresponds to vocab[i]. The vectors have
    # always used NA=0, HAPPY=1, SAD=2, BOTH_HS=3, so the label list is
    # written in that order instead of the former BOTH_HS-first order.
    vocab = ["NA", "HAPPY", "SAD", "BOTH_HS"]
    vects = []
    for tweet in tweets:
        one_hot = np.zeros(len(vocab))
        label = analyze_tweet(tweet)
        if label in vocab:
            one_hot[vocab.index(label)] = 1
        vects.append(one_hot)
    return vects, vocab
# Example #4
# 0
def tweet_features(tweet, bigrams=True):
    """Yield feature strings for a single tweet.

    This is a generator. Features are produced in the following order:

    - singletons: every token of ``transform(tweet)`` that
      ``ONLY_PUNCTUATION_RE`` does not match, yielded unchanged
      (the original notes say hashtags are already among these tokens);
    - bigrams (only when ``bigrams`` is true): ``<2>a,b</2>`` for each pair
      of adjacent tokens where neither matches ``ONLY_PUNCTUATION_RE``;
      the smaller token is written first, so word order within the pair
      is not preserved;
    - emoticons: ``<e>...</e>`` for each item that
      ``emoticons.analyze_tweet`` returns for the raw text;
    - repeated punctuation: ``<rp>!</rp>`` if ``REPEATED_PUNCTUATION_RE``
      is found in the raw text;
    - dialog: ``<d>!</d>`` if ``DIALOG_RE`` is found in the raw text;
    - all caps: ``<ac>!</ac>`` if ``ALL_CAPS_RE`` is found in the raw text.

    The sentiwordnet and slang / proper-English features mentioned in the
    earlier notes are not produced by this function.

    Args:
        tweet: Raw tweet text.
        bigrams: Whether to emit the bigram features.

    Yields:
        Feature strings as described above.
    """
    rawtext = tweet
    tokens = transform(rawtext)
    is_punctuation = ONLY_PUNCTUATION_RE.match

    # singletons
    for tok in tokens:
        if not is_punctuation(tok):
            yield tok

    # bigrams
    if bigrams:
        # Built-in zip works on both Python 2 and 3; itertools.izip was
        # removed in Python 3.
        for tok1, tok2 in zip(tokens[:-1], tokens[1:]):
            if is_punctuation(tok1) or is_punctuation(tok2):
                continue
            if tok1 < tok2:
                first, second = tok1, tok2
            else:
                first, second = tok2, tok1
            yield "<2>{},{}</2>".format(first, second)

    # emoticons
    for emoticon in emoticons.analyze_tweet(rawtext):
        yield "<e>{}</e>".format(emoticon)

    # repeated punctuation
    if REPEATED_PUNCTUATION_RE.search(rawtext):
        yield "<rp>!</rp>"

    # dialog
    if DIALOG_RE.search(rawtext):
        yield "<d>!</d>"

    # all caps
    if ALL_CAPS_RE.search(rawtext):
        yield "<ac>!</ac>"