# numpy and gensim are used throughout; tokenize, filterStopwords, readTweets,
# readTweetsOfficial, learnMultiword, insertIntoVect, the tokenize_tweets
# module and the KEYWORDS_NEUT/POS/NEG dicts are project-local helpers assumed
# to be importable from the surrounding repository.
import numpy as np
from collections import Counter
from gensim.models import word2vec
from gensim.models.phrases import Phrases


def extractW2VAggrFeatures(w2vmodel, phrasemodel, tweets, targets, labels):

    feats = []
    # for each tweet, average the word vectors of its phrase-merged tokens
    for i, tweet in enumerate(tweets):
        tokenised_tweet = tokenize(tweet.lower())
        words = filterStopwords(tokenised_tweet)
        numvects = 0
        vect = []
        for token in phrasemodel[words]:
            try:
                vect.append(w2vmodel[token])
                numvects += 1
            except KeyError:
                pass  # token not in the word2vec vocabulary, skip it
        if len(vect) > 0:
            mtrmean = np.average(vect, axis=0)
            if i == 0:
                feats = mtrmean
            else:
                feats = np.vstack((feats, mtrmean))
        else:
            # no in-vocabulary tokens: fall back to a zero vector
            zeros = np.zeros(300)  # 300-dimensional vectors assumed for now
            feats = zeros if i == 0 else np.vstack((feats, zeros))

    return feats
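# Usage sketch for extractW2VAggrFeatures: it averages the word2vec vectors of
# a tweet's phrase-merged tokens into one feature row per tweet. The model
# file names and the wrapper function below are assumptions, not from the repo.
def _demo_w2v_aggr_features(tweets, targets, labels):
    w2v = word2vec.Word2Vec.load("word2vec.model")   # assumed model path
    phrases = Phrases.load("phrase.model")
    feats = extractW2VAggrFeatures(w2v, phrases, tweets, targets, labels)
    print(feats.shape)  # one averaged vector per tweet
    return feats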
def prepData(stopfilter, multiword, useDev=False):
    print("Preparing data...")

    ret = []  # list of lists

    print("Reading data...")
    tweets = readTweets()
    tweets_train, targets_train, labels_train = readTweetsOfficial(
        tokenize_tweets.FILETRAIN, 'windows-1252', 2)
    tweets_trump, targets_trump, labels_trump = readTweetsOfficial(
        tokenize_tweets.FILETRUMP, 'utf-8', 1)
    print(len(tweets))
    tweets.extend(tweets_train)
    print(len(tweets_train), "\t", len(tweets))
    tweets.extend(tweets_trump)
    print(len(tweets_trump), "\t", len(tweets))
    if useDev:
        tweets_dev, targets_dev, labels_dev = readTweetsOfficial(
            tokenize_tweets.FILEDEV, 'windows-1252', 2)
        tweets.extend(tweets_dev)
        print(len(tweets_dev), "\t", len(tweets))

    print("Tokenising...")
    for tweet in tweets:
        tokenised_tweet = tokenize(tweet.lower())
        if stopfilter:
            words = filterStopwords(tokenised_tweet)
            ret.append(words)
        else:
            ret.append(tokenised_tweet)

    if multiword:
        return learnMultiword(ret)
    else:
        return ret
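# Usage sketch for prepData: it builds the tokenised corpus that the phrase and
# word2vec models are trained on. The training parameters and file names below
# are illustrative assumptions (old-style gensim `size` argument, matching the
# 300-dimensional vectors assumed elsewhere).
def _demo_prep_and_train_models():
    corpus = prepData(stopfilter=True, multiword=False)
    phrases = Phrases(corpus)  # learn frequent bigrams from the corpus
    w2v = word2vec.Word2Vec(phrases[corpus], size=300, min_count=5)
    phrases.save("phrase.model")
    w2v.save("word2vec.model")
    return corpus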
def extractFeatureVocab(tweets, keyword="all", usephrasemodel=True, phrasemodel="phrase.model", anon_targets=False):
    tokencounts = Counter()
    features_final = []
    bigram = Phrases.load(phrasemodel)  # load the saved phrase (bigram) model
    #tokens_topic = []

    #if keyword == "all":
    #    for top in tokenize_tweets.TOPICS:
    #        if top != 'clinton':
    #            for tok in tokenize(tokenize_tweets.TOPICS_LONG[top]):
    #                tokens_topic.append(tok)
    #else:
    #    tokens_topic = tokenize(tokenize_tweets.TOPICS_LONG[keyword])

    for tweet in tweets:
        if not usephrasemodel:
            tokenised_tweet = tokenize(tweet)
            for token in tokenised_tweet:  #unigram features
                tokencounts[token] += 1
                #for toktopic in tokens_topic:
                #    tokencounts[toktopic + '|' + token] += 1
            for l in zip(*[tokenised_tweet[i:] for i in range(2)]): #bigram features
                tokencounts["_".join(l)] += 1
                #for ltop in zip(*[tokens_topic[i:] for i in range(2)]):
                #    tokencounts["_".join(ltop) + '|' + "_".join(l)] += 1
        else:
            # this includes unigrams and frequent bigrams
            tokens = filterStopwords(tokenize(tweet.lower()))  #For Trump it's [1]
            phrasetoks = bigram[tokens]
            target_keywords = []
            if anon_targets:
                for top in tokenize_tweets.TOPICS:
                    if top == "climate": # hack, this is the only non-list value
                        target_keywords.append("climate")
                    else:
                        #for keyw in tokenize_tweets.KEYWORDS[top]:
                        target_keywords.extend(tokenize_tweets.KEYWORDS[top])

                phrasetoks_new = []
                for token in phrasetoks:
                    for keyw in target_keywords:
                        if keyw in token:
                            token = token.replace(keyw, "TARGET")
                    phrasetoks_new.append(token)
                phrasetoks = phrasetoks_new

            for token in phrasetoks:
                tokencounts[token] += 1
            for l in zip(*[phrasetoks[i:] for i in range(2)]):
                tokencounts["_".join(l)] += 1

    for token, count in tokencounts.most_common():
        if count > 1:
            features_final.append(token)
            #print token, count

    return features_final
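# Usage sketch for extractFeatureVocab: it returns every unigram and bigram
# that occurs more than once as the bag-of-words feature vocabulary. The
# wrapper function below is hypothetical.
def _demo_build_vocab(tweets_train):
    vocab = extractFeatureVocab(tweets_train, usephrasemodel=True,
                                phrasemodel="phrase.model")
    print(len(vocab), "features, e.g.", vocab[:5])
    return vocab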
def extractFeaturesBOW(tweets,
                       targets,
                       features_final,
                       anon_targets=False,
                       usephrasemodel=False,
                       phrasemodel="phrase.model"):

    bigram = Phrases.load(phrasemodel)  # load the saved phrase (bigram) model

    matrix = []  # np.zeros((len(features_final), len(tweets)))

    for i, tweet in enumerate(tweets):
        vect = np.zeros((len(features_final)))
        if not usephrasemodel:
            tokenised_tweet = tokenize(tweet)
            for token in tokenised_tweet:
                insertIntoVect(features_final, vect, token)
                #for toktopic in tokens_topic:
                #    insertIntoVect(features_final, vect, toktopic + '|' + token)
            for l in zip(*[tokenised_tweet[i:] for i in range(2)]):
                insertIntoVect(features_final, vect, "_".join(l))
                #for ltop in zip(*[tokens_topic[i:] for i in range(2)]):
                #    insertIntoVect(features_final, vect, "_".join(ltop) + '|' + "_".join(l))
        else:
            inv_topics = {v: k for k, v in tokenize_tweets.TOPICS_LONG.items()}
            target_keywords = tokenize_tweets.KEYWORDS.get(
                inv_topics.get(targets[i]))

            tokens = filterStopwords(tokenize(
                tweet.lower()))  #For Trump it's [1]
            phrasetoks = bigram[tokens]

            if anon_targets:
                phrasetoks_new = []
                for token in phrasetoks:
                    if target_keywords == "climate":  # the only non-list value
                        if target_keywords in token:
                            token = token.replace(target_keywords, "TARGET")
                    else:
                        for keyw in target_keywords:
                            if keyw in token:
                                token = token.replace(keyw, "TARGET")
                    phrasetoks_new.append(token)
                phrasetoks = phrasetoks_new

            for token in phrasetoks:
                insertIntoVect(features_final, vect, token)
            for l in zip(*[phrasetoks[i:] for i in range(2)]):
                insertIntoVect(features_final, vect, "_".join(l))

        matrix.append(vect)
        #print " ".join(str(v) for v in vect), "\n"

    return matrix
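# Usage sketch for extractFeaturesBOW: it maps each tweet onto a count vector
# over the vocabulary built by extractFeatureVocab, so the two functions are
# normally used together. The wrapper below is hypothetical.
def _demo_bow_matrix(tweets_train, targets_train):
    vocab = extractFeatureVocab(tweets_train, usephrasemodel=True)
    matrix = extractFeaturesBOW(tweets_train, targets_train, vocab,
                                anon_targets=False, usephrasemodel=True)
    print(len(matrix), "tweets x", len(vocab), "features")
    return matrix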
def extractW2VFeaturesSim(w2vmodelfile, phrasemodel, tweets, targets, labels):
    phmodel = Phrases.load(phrasemodel)
    w2vmodel = word2vec.Word2Vec.load(w2vmodelfile)

    inv_topics = {v: k for k, v in tokenize_tweets.TOPICS_LONG.items()}


    for i, tweet in enumerate(tweets):

        # get the neut/pos/neg hashtags
        neut = KEYWORDS_NEUT[inv_topics[targets[i]]]
        pos = KEYWORDS_POS[inv_topics[targets[i]]]
        neg = KEYWORDS_NEG[inv_topics[targets[i]]]

        tokenised_tweet = tokenize(tweet.lower())
        words = filterStopwords(tokenised_tweet)

        neutcnt, poscnt, negcnt = 0, 0, 0
        neutsc, possc, negsc = 0.0, 0.0, 0.0


        # transform, as earlier, with the phrase model
        for token in phmodel[words]:
            try:
                neutsim = w2vmodel.similarity(neut, token)
                neutcnt += 1
                neutsc += neutsim
            except KeyError:
                neutsim = 0
            try:
                possim = w2vmodel.similarity(pos, token)
                possc += possim
                poscnt += 1
            except KeyError:
                possim = 0
            try:
                negsim = w2vmodel.similarity(neg, token)
                negsc += negsim
                negcnt += 1
            except KeyError:
                negsim = 0
            #print targets[i], "\t", token, "\t", neutsim, "\t", possim, "\t", negsim
        # guard against division by zero when no token was in the vocabulary
        neutsc_tweet = neutsc / neutcnt if neutcnt > 0 else 0.0
        possc_tweet = possc / poscnt if poscnt > 0 else 0.0
        negsc_tweet = negsc / negcnt if negcnt > 0 else 0.0
        print(targets[i], "\t", labels[i], "\t", neutsc_tweet, "\t", possc_tweet, "\t", negsc_tweet)
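# Usage sketch for extractW2VFeaturesSim: for each tweet it prints the mean
# word2vec similarity of its tokens to the target's neutral/positive/negative
# hashtags; it returns nothing. The model file names are assumptions.
def _demo_similarity_report(tweets, targets, labels):
    extractW2VFeaturesSim("word2vec.model", "phrase.model",
                          tweets, targets, labels)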
def extractW2VHashFeatures(w2vmodel, phrasemodel, mode, tweets, targets,
                           labels):
    features = []

    inv_topics = {v: k for k, v in tokenize_tweets.TOPICS_LONG.items()}

    for i, tweet in enumerate(tweets):

        # get the neut/pos/neg hashtags
        neut = KEYWORDS_NEUT[inv_topics[targets[i]]]
        pos = KEYWORDS_POS[inv_topics[targets[i]]]
        neg = KEYWORDS_NEG[inv_topics[targets[i]]]

        neutsim = w2vmodel.most_similar(neut, topn=60)
        possim = w2vmodel.most_similar(pos, topn=60)
        negsim = w2vmodel.most_similar(neg, topn=60)

        tokenised_tweet = tokenize(tweet.lower())
        words = filterStopwords(tokenised_tweet)

        neutcnt, poscnt, negcnt, neutsimp, possimp, negsimp = 0, 0, 0, 0, 0, 0

        # transform, as earlier, with the phrase model
        for token in phrasemodel[words]:
            if neut == token:
                neutsimp = 1
            if pos == token:
                possimp = 1
            if neg == token:
                negsimp = 1
            for n, sc in neutsim:
                if sc >= 0.4 and n == token:
                    neutcnt += 1
            for n, sc in possim:
                if sc >= 0.4 and n == token:
                    poscnt += 1
            for n, sc in negsim:
                if sc >= 0.4 and n == token:
                    negcnt += 1

        #print targets[i], "\t", labels[i], "\t", neutcnt, "\t", poscnt, "\t", negcnt, "\t", neutsimp, "\t", possimp, "\t", negsimp
        #featint = [neutcnt, poscnt, negcnt, neutsimp, possimp, negsimp]
        # when both the positive and the negative hashtag occur in the tweet,
        # collapse them into the combined posneg feature
        pn = 0
        if possimp and negsimp:
            pn = 1
            possimp = 0
            negsimp = 0
        if mode == "hash":
            featint = [neutsimp, possimp, negsimp, pn]
            features.append(featint)
        if mode == "w2v_hash":
            featint = [neutcnt, poscnt, negcnt, neutsimp, possimp, negsimp, pn]
            features.append(featint)

    featlabels = []
    if mode == "hash":
        featlabels = ["neut_hash", "pos_hash", "neg_hash", "posneg_hash"]
    if mode == "w2v_hash":
        featlabels = [
            "neut_extw2v", "pos_extw2v", "neg_extw2v", "neut_hash", "pos_hash",
            "neg_hash", "posneg_hash"
        ]

    return features, featlabels
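# Usage sketch for extractW2VHashFeatures: it counts tweet tokens that are
# close (similarity >= 0.4) to the target's stance hashtags and flags whether
# the hashtags themselves occur. Model file names and wrapper are assumptions.
def _demo_hash_features(tweets, targets, labels):
    w2v = word2vec.Word2Vec.load("word2vec.model")
    phrases = Phrases.load("phrase.model")
    feats, featlabels = extractW2VHashFeatures(w2v, phrases, "w2v_hash",
                                               tweets, targets, labels)
    print(featlabels)   # the seven feature column names for mode "w2v_hash"
    print(feats[0])     # feature vector of the first tweet
    return feats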