Example #1
def writeToksToFile():

    tokens, tweets_on_topic, tweets = readToks()

    for topic in TOPICS:

        tokenized_tweets = Tweets()

        for index in tweets_on_topic[topic]:

            tweet = tweets[index]

            tokenized = tokenized_tweets.tweets.add()
            tokenized.tweet = tweet['text']
            for token in tokenize(tweet['text']):
                # Store the token's vocabulary index, or -1 if the token is unknown
                try:
                    tokenized.tokens.append(tokens.index(token))
                except ValueError:
                    tokenized.tokens.append(-1)

            print(tokenized.tokens)

        # Write the serialised tweets for this topic once, after all of them have
        # been added, instead of reopening and rewriting the file for every tweet
        with open(topic + '.tweets', "wb") as f:
            f.write(tokenized_tweets.SerializeToString())
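Reading one of the generated files back is the same protobuf round trip in reverse. Below is a minimal sketch, assuming the Tweets message class used above is importable from the project's generated protobuf module (the module name tweets_pb2 is only a placeholder):

from tweets_pb2 import Tweets  # placeholder module name; use the project's generated module

def readTweetsForTopic(topic):
    tokenized_tweets = Tweets()
    with open(topic + '.tweets', "rb") as f:
        tokenized_tweets.ParseFromString(f.read())
    # Each entry carries the raw tweet text plus the vocabulary indices appended above
    for tokenized in tokenized_tweets.tweets:
        print(tokenized.tweet, list(tokenized.tokens))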
Example #2
def convertTweetsOfficialToVec(numtoks,
                               tokens,
                               tweets,
                               filtering=False,
                               phrasemodelpath="phrase.model"):

    tokens_sub = tokens[:numtoks]
    tokenized_tweets = Tweets()
    vects = []
    norm_tweets = []

    if filtering:
        bigram = Phrases(phrasemodelpath)

    for tweet in tweets:

        # One dimension per vocabulary token; the most frequent tokens have the lowest
        # indices, so truncating to numtoks acts as a frequency cutoff (original: 93988)
        vect = np.zeros(numtoks)
        norm_tweet = []

        tokenized = tokenized_tweets.tweets.add()
        tokenized.tweet = tweet
        if not filtering:
            tokenised_tweet = tokenize(tokenized.tweet)
        else:
            # Use a local name so the `tokens` argument is not overwritten
            filtered = filterStopwords(tokenize(tokenized.tweet.lower()))
            tokenised_tweet = bigram[filtered]
        for token in tokenised_tweet:
            # Look up the token in the truncated vocabulary; -1 marks out-of-vocabulary tokens
            try:
                index = tokens_sub.index(token)
            except ValueError:
                index = -1
            if index > -1:
                vect[index] = 1
                norm_tweet.append(token)
            else:
                norm_tweet.append('NULL')

        #print(norm_tweet)
        norm_tweets.append(norm_tweet)
        vects.append(vect)

    return vects, norm_tweets
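One possible way to call the function, assuming readToks() from the other examples supplies the frequency-ordered vocabulary and that the raw tweet texts are passed in as plain strings; the cutoff of 50000 tokens is only illustrative:

import numpy as np

tokens, tweets_on_topic, tweets_json = readToks()
raw_texts = [t['text'] for t in tweets_json]

vects, norm_tweets = convertTweetsOfficialToVec(50000, tokens, raw_texts)
X = np.array(vects)  # bag-of-words matrix of shape (num_tweets, 50000)
print(X.shape, norm_tweets[0])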
Example #3

print(len(tokens))

sys.exit()  # note: the code below never runs while this early exit is in place

# Collect, for each topic, the indices of tweets whose text contains one of the topic's keywords
tweets_on_topic = defaultdict(list)
for topic in topics:
    for index, tweet in enumerate(tweets):
        for keyword in keywords[topic]:
            if keyword in tweet['text'].lower():
                tweets_on_topic[topic].append(index)
                break


for topic in topics:

    tokenized_tweets = Tweets()

    for index in tweets_on_topic[topic]:

        tweet = tweets[index]

        tokenized = tokenized_tweets.tweets.add()
        tokenized.tweet = tweet['text']
        for token in tokenize(tweet['text']):
            # Store the token's vocabulary index, or -1 if the token is unknown
            try:
                tokenized.tokens.append(tokens.index(token))
            except ValueError:
                tokenized.tokens.append(-1)

    # Write the serialised tweets for this topic once all of them have been added,
    # mirroring the write in Example #1
    with open(topic + '.tweets', "wb") as f:
        f.write(tokenized_tweets.SerializeToString())
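The keyword matching above can also be wrapped into a small helper. This is only an illustrative sketch (the name groupTweetsByTopic is made up), but the case-insensitive containment test is the same as in the loop above:

from collections import defaultdict

def groupTweetsByTopic(tweets, topics, keywords):
    tweets_on_topic = defaultdict(list)
    for topic in topics:
        for index, tweet in enumerate(tweets):
            # A tweet belongs to a topic if any of the topic's keywords occurs in its text
            if any(keyword in tweet['text'].lower() for keyword in keywords[topic]):
                tweets_on_topic[topic].append(index)
    return tweets_on_topic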
Example #4
def convertTweetsToVec(topic="all",
                       numtoks='all',
                       phrasemodel=False,
                       phrasemodelpath="phrase.model"):

    print("Reading tokens")
    tokens, tweets_on_topic, tweets = readToks(phrasemodel)

    if phrasemodel:
        bigram = Phrases(phrasemodelpath)

    if numtoks != "all":
        tokens_sub = tokens[:numtoks]
    else:
        tokens_sub = tokens
        numtoks = len(tokens)  # vocabulary size; __sizeof__() would return the size in bytes

    tokenized_tweets = Tweets()
    vects = []
    norm_tweets = []

    print("Converting JSON tweets")
    if topic == 'all':
        #for topic in TOPICS:
        for tweet in tweets:

            # One dimension per vocabulary token; the most frequent tokens have the lowest
            # indices, so truncating to numtoks acts as a frequency cutoff (original: 93988)
            vect = np.zeros(numtoks, dtype=bool)
            norm_tweet = []

            tokenized = tokenized_tweets.tweets.add()
            tokenized.tweet = tweet['text']
            if not phrasemodel:
                tokenised_tweet = tokenize(tweet['text'])
            else:
                # Use a local name so the vocabulary list `tokens` is not overwritten
                filtered = filterStopwords(tokenize(tweet['text'].lower()))
                tokenised_tweet = bigram[filtered]
            for token in tokenised_tweet:
                try:
                    index = tokens_sub.index(token)
                except ValueError:
                    index = -1
                if index > -1:
                    vect[index] = 1
                    norm_tweet.append(token)
                else:
                    norm_tweet.append('NULL')

            #print(norm_tweet)
            norm_tweets.append(norm_tweet)
            vects.append(vect)
    else:  # discouraged, needs to be updated
        for index in tweets_on_topic[topic]:

            tweet = tweets[index]
            # One dimension per vocabulary token; the most frequent tokens have the lowest
            # indices, so truncating to numtoks acts as a frequency cutoff (original: 93988)
            vect = np.zeros(numtoks)
            norm_tweet = []

            tokenized = tokenized_tweets.tweets.add()
            tokenized.tweet = tweet['text']
            for token in tokenize(tweet['text']):
                try:
                    index = tokens_sub.index(token)
                except ValueError:
                    index = -1
                if index > -1:
                    vect[index] = 1
                    norm_tweet.append(token)
                else:
                    norm_tweet.append('NULL')

            print(norm_tweet)
            norm_tweets.append(norm_tweet)
            vects.append(vect)

    print("Finished converting JSON tweets")
    return tokens_sub, vects, norm_tweets
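One possible way to call the function and stack the boolean vectors into a single feature matrix; the cutoff of 50000 tokens is only illustrative:

import numpy as np

tokens_sub, vects, norm_tweets = convertTweetsToVec(topic="all", numtoks=50000)
X = np.array(vects)  # shape (num_tweets, 50000), dtype=bool
print(X.shape, X.sum(axis=1)[:5])  # number of known tokens in the first five tweets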