def gen_json_for_tweets_of_interest(data, identity_list):
    of_id, uid_list = data
    json_of_name = os.path.join(JSON_OUTPUT_DIRECTORY, str(of_id) + ".json.gz")

    print 'inp: ', json_of_name, len(uid_list), uid_list[0:2]
    tweets_to_write = []

    if not os.path.exists(json_of_name):
        for i, uid in enumerate(uid_list):
            if i % 25 == 0:
                print i, len(tweets_to_write)
            try:
                u = TwitterUser()
                u.populate_tweets_from_file(os.path.join(
                    JSON_INPUT_DIRECTORY, uid + ".json.gz"),
                                            store_json=True)
                tweets_to_keep = []
                for t in u.tweets:
                    if not t.retweeted and len(t.tokens) > 4:
                        expanded_token_set = copy(t.tokens)
                        for token in t.tokens:
                            expanded_token_set += get_alternate_wordforms(
                                token)
                        if len(set(expanded_token_set) & identity_list):
                            tweets_to_keep.append(t)
                tweets_to_write += tweets_to_keep
            except:
                print 'FAILED JSON FOR USER: ', uid

        print 'WRITING JSON'
        out_fil = gzip.open(json_of_name, "wb")
        for tweet in tweets_to_write:
            out_fil.write(
                json.dumps(tweet.raw_json).strip().encode("utf8") + "\n")
        out_fil.close()
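# Hedged usage sketch, not part of the original source: the function above appears
# designed to be applied to (output_file_id, user_id_list) work items, with the
# JSON_INPUT_DIRECTORY / JSON_OUTPUT_DIRECTORY constants defined elsewhere.
# All ids and identity terms below are placeholders.
example_identity_terms = {'mother', 'teacher', 'nurse'}
example_work_items = [(0, ['12345', '67890']),  # (output_file_id, list_of_user_ids)
                      (1, ['13579', '24680'])]
for item in example_work_items:
    gen_json_for_tweets_of_interest(item, example_identity_terms)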
def gen_conll_file(fil,ptb_dir, dp_dir):
    user = TwitterUser()
    user.populate_tweets_from_file(fil, do_tokenize=False)

    if 50 <= user.n_total_tweets <= 15000 and\
       user.followers_count <= 25000 and user.creation_date <= MIN_ACCOUNT_AGE:

        dp_filename = os.path.join(dp_dir,str(user.user_id)+".gz")
        ptb_filename = os.path.join(ptb_dir,str(user.user_id)+".txt.gz")

        if not os.path.exists(dp_filename) or not os.path.exists(ptb_filename):
            return ['no_dp_ptb',[user.user_id,os.path.exists(dp_filename),os.path.exists(ptb_filename)]]

        penntreebank = {x[0] : x[1:] for x in read_grouped_by_newline_file(ptb_filename)}
        dependency_parse =  read_grouped_by_newline_file(dp_filename)

        tweet_set = [(i,t) for i,t in enumerate(user.tweets) if t.retweeted is None and\
                       len(t.urls) == 0 and 'http:' not in t.text and\
                       langid.classify(t.text)[0] == 'en']

        # non english speaker or spam
        if len(tweet_set) < 40:
            return ['notweets',user.user_id]


        data_to_return = []
        for twit_it, tweet in tweet_set:

            data_for_tweet = []

            ptb_for_tweet = penntreebank[str(tweet.id)]
            dp_for_tweet = dependency_parse[twit_it]

            # sanity check: the surface form of the first PTB token should match
            # the first dependency-parse token; skip tweets where they disagree
            if ptb_for_tweet[0].split("\t")[2] != DependencyParseObject(dp_for_tweet[0]).text:
                print 'PTB / dependency parse mismatch, skipping tweet', tweet.id
                continue

            for i, p in enumerate(dp_for_tweet):
                d = DependencyParseObject(tsn([p,tweet.id,user.user_id,tweet.created_at.strftime("%m-%d-%y")],newline=False))
                # get java features
                spl_java = ptb_for_tweet[i].split("\t")
                java_id, penn_pos_tag,word = spl_java[:3]
                java_features = '' if len(spl_java) == 3 else spl_java[3]
                d.features += [x for x in java_features.split("|") if x != '']
                d.features.append("penn_treebank_pos="+penn_pos_tag)
                data_for_tweet.append(d)
            data_to_return.append(data_for_tweet)

        return ['success', [user.user_id,data_to_return]]
    else:
        return ['baduser',user.user_id]
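# Hedged sketch, not from the original source, of consuming gen_conll_file's
# [status, payload] return value; the file name and directories below are
# placeholders, and the helpers / constants the function relies on
# (read_grouped_by_newline_file, DependencyParseObject, tsn, langid,
# MIN_ACCOUNT_AGE) must already be importable.
status, payload = gen_conll_file('1234567.json.gz', 'ptb_dir/', 'dp_dir/')
if status == 'success':
    user_id, conll_data = payload
    print 'parsed', len(conll_data), 'tweets for user', user_id
elif status == 'no_dp_ptb':
    print 'missing PTB and/or dependency parse file for user:', payload[0]
else:
    # 'notweets' (too few usable English tweets) or 'baduser' (account filtered out)
    print status, payload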
def gen_output(data, json_data_dir):

    term, is_reply, tweets_needed = data

    dataset = []

    # get all user files
    files = glob.glob(os.path.join(json_data_dir, "*"))
    random.shuffle(files)

    for f in files:
        user = TwitterUser()
        user.populate_tweets_from_file(f,
                                       store_json=True,
                                       do_arabic_stemming=False,
                                       lemmatize=False)

        if 50 <= user.n_total_tweets <= 10000 and\
           user.followers_count <= 25000 and user.creation_date <= MIN_ACCOUNT_AGE:

            tweet_set = [t for t in user.tweets if t.retweeted is None and\
                           len(t.urls) == 0 and 'http:' not in t.text and\
                           len(t.tokens) > 5 and\
                           t.created_at >= MIN_TWEET_DATE and\
                           (term == '' or term in t.tokens) and\
                           langid.classify(t.text)[0] == 'en' and\
                           sentiment(t.text)['compound'] != 0]

            if is_reply:
                tweet_set = [t for t in tweet_set if t.reply_to]
            else:
                tweet_set = [t for t in tweet_set if not t.reply_to]

            if len(tweet_set) == 0:
                print 'size 0', term, tweets_needed, is_reply
                continue

            tweet = random.sample(tweet_set, 1)[0]
            print user.screen_name, term, tweets_needed, is_reply, "::::  ", tweet.text
            dataset.append(tweet)
            tweets_needed -= 1
            if tweets_needed == 0:
                name = term if term != '' else 'random'
                name += '_reply' if is_reply else '_non_reply'
                pickle.dump(dataset, open(name + ".p", 'wb'))
                print 'done with: ', name, is_reply
                return

        else:
            print 'failed user'
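# Hedged usage sketch, not part of the original source: `data` unpacks to
# (term, is_reply, tweets_needed), and the json directory below is a placeholder.
# The function samples at most one qualifying tweet per user until tweets_needed
# hits zero, then pickles the sample to '<term>_reply.p' or '<term>_non_reply.p'.
gen_output(('teacher', False, 100), 'json_data/')
gen_output(('', True, 100), 'json_data/')  # term == '' falls back to the 'random' sample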
Example #4
def gen_json_for_tweets_of_interest(input_filename,
                                    output_filename,
                                    keep_only_tweets_with_terms=None):
    """
    This function generates a cleaned json file so that the identity
    extraction only happens on "interesting" tweets.  Right now,
    interesting is defined as non-retweets that have >4 tokens. Feel free to redefine
    as you feel is suitable

    :param input_filename: input json file name (Can be gzipped)
    :param output_filename: cleaned output json filename
    :param keep_only_tweets_with_terms: If you only want to keep tweets containing a specific
            set of terms, you can use this argument and pass in a set of terms here
    :return:
    """
    tweets_to_write = []

    if not os.path.exists(output_filename):
        try:
            u = TwitterUser()
            u.populate_tweets_from_file(input_filename, store_json=True)
            tweets = [
                t for t in u.tweets if not t.retweeted and len(t.tokens) > 4
            ]
            tweets_to_keep = []

            if keep_only_tweets_with_terms:
                for t in tweets:
                    expanded_token_set = copy(t.tokens)
                    for token in t.tokens:
                        expanded_token_set += get_alternate_wordforms(token)
                    if len(
                            set(expanded_token_set)
                            & keep_only_tweets_with_terms):
                        tweets_to_keep.append(t)
            else:
                tweets_to_keep = tweets

            tweets_to_write += tweets_to_keep
        except:
            print 'FAILED TO PARSE JSON FILE: ', input_filename

        print 'WRITING JSON'
        out_fil = gzip.open(output_filename, "wb")
        for tweet in tweets_to_write:
            out_fil.write(
                json.dumps(tweet.raw_json).strip().encode("utf8") + "\n")
        out_fil.close()
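# Hedged usage sketch based on the docstring above; the file names and the
# term set are placeholders, not from the original source.
gen_json_for_tweets_of_interest('raw/12345.json.gz',
                                'cleaned/12345.json.gz',
                                keep_only_tweets_with_terms={'nurse', 'doctor'})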
def gen_output(data, json_data_dir):

    term,is_reply,tweets_needed = data

    dataset = []

    # get all user files
    files = glob.glob(os.path.join(json_data_dir,"*"))
    random.shuffle(files)

    for f in files:
        user = TwitterUser()
        user.populate_tweets_from_file(f,store_json=True,do_arabic_stemming=False,lemmatize=False)

        if 50 <= user.n_total_tweets <= 10000 and\
           user.followers_count <= 25000 and user.creation_date <= MIN_ACCOUNT_AGE:

            tweet_set = [t for t in user.tweets if t.retweeted is None and\
                           len(t.urls) == 0 and 'http:' not in t.text and\
                           len(t.tokens) > 5 and\
                           t.created_at >= MIN_TWEET_DATE and\
                           (term == '' or term in t.tokens) and\
                           langid.classify(t.text)[0] == 'en' and\
                           sentiment(t.text)['compound'] != 0]

            if is_reply:
                tweet_set = [t for t in tweet_set if t.reply_to]
            else:
                tweet_set = [t for t in tweet_set if not t.reply_to]

            if len(tweet_set) == 0:
                print 'size 0', term, tweets_needed, is_reply
                continue

            tweet = random.sample(tweet_set, 1)[0]
            print user.screen_name, term, tweets_needed, is_reply, "::::  ", tweet.text
            dataset.append(tweet)
            tweets_needed -= 1
            if tweets_needed == 0:
                name = term if term != '' else 'random'
                name += '_reply' if is_reply else '_non_reply'
                pickle.dump(dataset,open(name+".p",'wb'))
                print 'done with: ',name, is_reply
                return

        else:
            print 'failed user'
def gen_json_for_tweets_of_interest(input_filename, output_filename,keep_only_tweets_with_terms=None):
    """
    This function generates a cleaned json file so that the identity
    extraction only happens on "interesting" tweets.  Right now,
    interesting is defined as non-retweets that have >4 tokens. Feel free to redefine
    as you feel is suitable

    :param input_filename: input json file name (Can be gzipped)
    :param output_filename: cleaned output json filename
    :param keep_only_tweets_with_terms: If you only want to keep tweets containing a specific
            set of terms, you can use this argument and pass in a set of terms here
    :return:
    """
    tweets_to_write = []

    if not os.path.exists(output_filename):
        try:
            u = TwitterUser()
            u.populate_tweets_from_file(input_filename,store_json=True)
            tweets = [t for t in u.tweets if not t.retweeted and len(t.tokens) > 4]
            tweets_to_keep = []

            if keep_only_tweets_with_terms:
                for t in tweets:
                    expanded_token_set = copy(t.tokens)
                    for token in t.tokens:
                        expanded_token_set += get_alternate_wordforms(token)
                    if len(set(expanded_token_set) & keep_only_tweets_with_terms):
                        tweets_to_keep.append(t)
            else:
                tweets_to_keep = tweets


            tweets_to_write += tweets_to_keep
        except:
            print 'FAILED TO PARSE JSON FILE: ', input_filename

        print 'WRITING JSON'
        out_fil = gzip.open(output_filename, "wb")
        for tweet in tweets_to_write:
            out_fil.write(json.dumps(tweet.raw_json).strip().encode("utf8") + "\n")
        out_fil.close()
Example #7
def getTweets(twitterid):
    '''
    Function to get the twitter data for an individual twitter ID.
    This function is written to work with Kenny's github example here: https://github.com/kennyjoseph/twitter_dm
    
    Input: string of twitterID
    Output: list of the raw string of all tweets for twitterID
    '''
    from twitter_dm.TwitterUser import TwitterUser

    tweets = []
    u = TwitterUser()
    u.populate_tweets_from_file(
        twitterid + '.json'
    )  # Need to figure out if we can use numeric ID (123456789.json) or name (kenny_joseph.json)

    for t in u.tweets:
        tweets.append(
            t.tokens
        )  # not sure if tokens is exactly what we want; we want the raw words, not necessarily tokens. We'll check this.
    #
    # texts={}
    # source_filename='Datasets/Twitter/members.zip'
    # parser = etree.XMLParser(encoding='utf8',recover=True)
    # with zipfile.ZipFile(source_filename) as zf:
    #     for i,member in enumerate(zf.infolist()):
    #         name=member.filename.split('/')[1].split('.')[0]    #filename is Raw3/name.csv
    #         if idx ==name:
    #             #print idx, name
    #             raw=zf.open(member)
    #             data=csv.reader(raw)
    #             for j,line in enumerate(data):
    #                 if j>0:
    #                     texts[idx+'_'+str(j)]=line[0]
    # if texts=={}:
    #     print 'no tweets for ', idx

    return tweets
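# Hedged usage sketch, not part of the original source: expects a
# '<twitterid>.json' file in the working directory; the id is a placeholder.
token_lists = getTweets('123456789')
print 'read', len(token_lists), 'tweets'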
Example #8
from twitter_dm.TwitterUser import TwitterUser
import datetime
from time import mktime
u = TwitterUser()
u.populate_tweets_from_file("/Users/kennyjoseph/git/thesis/twitter_dm/examples/2431225676.json.gz")

# print each tweet's creation time as a Unix timestamp (seconds since the epoch)
for t in u.tweets:
    print mktime(t.created_at.timetuple())
Example #9
def gen_conll_file(fil, ptb_dir, dp_dir):
    user = TwitterUser()
    user.populate_tweets_from_file(fil, do_tokenize=False)

    if 50 <= user.n_total_tweets <= 15000 and\
       user.followers_count <= 25000 and user.creation_date <= MIN_ACCOUNT_AGE:

        dp_filename = os.path.join(dp_dir, str(user.user_id) + ".gz")
        ptb_filename = os.path.join(ptb_dir, str(user.user_id) + ".txt.gz")

        if not os.path.exists(dp_filename) or not os.path.exists(ptb_filename):
            return [
                'no_dp_ptb',
                [
                    user.user_id,
                    os.path.exists(dp_filename),
                    os.path.exists(ptb_filename)
                ]
            ]

        penntreebank = {
            x[0]: x[1:]
            for x in read_grouped_by_newline_file(ptb_filename)
        }
        dependency_parse = read_grouped_by_newline_file(dp_filename)

        tweet_set = [(i,t) for i,t in enumerate(user.tweets) if t.retweeted is None and\
                       len(t.urls) == 0 and 'http:' not in t.text and\
                       langid.classify(t.text)[0] == 'en']

        # non english speaker or spam
        if len(tweet_set) < 40:
            return ['notweets', user.user_id]

        data_to_return = []
        for twit_it, tweet in tweet_set:

            data_for_tweet = []

            ptb_for_tweet = penntreebank[str(tweet.id)]
            dp_for_tweet = dependency_parse[twit_it]

            # sanity check: the surface form of the first PTB token should match
            # the first dependency-parse token; skip tweets where they disagree
            if ptb_for_tweet[0].split("\t")[2] != DependencyParseObject(
                    dp_for_tweet[0]).text:
                print 'PTB / dependency parse mismatch, skipping tweet', tweet.id
                continue

            for i, p in enumerate(dp_for_tweet):
                d = DependencyParseObject(
                    tsn([
                        p, tweet.id, user.user_id,
                        tweet.created_at.strftime("%m-%d-%y")
                    ],
                        newline=False))
                # get java features
                spl_java = ptb_for_tweet[i].split("\t")
                java_id, penn_pos_tag, word = spl_java[:3]
                java_features = '' if len(spl_java) == 3 else spl_java[3]
                d.features += [x for x in java_features.split("|") if x != '']
                d.features.append("penn_treebank_pos=" + penn_pos_tag)
                data_for_tweet.append(d)
            data_to_return.append(data_for_tweet)

        return ['success', [user.user_id, data_to_return]]
    else:
        return ['baduser', user.user_id]