예제 #1
0
def twitterToCsv(inputfolder, outputFolder):
    """Convert every file in ``inputfolder`` into a CSV holding the tweet texts.

    Each regular file ``<name>`` located directly inside ``inputfolder`` is
    handed to ``common.json2csv`` and exported, with the single field
    ``text``, to ``<outputFolder>/tweets_<name>.csv``.  ``outputFolder`` is
    created when it does not exist yet.

    Args:
        inputfolder: Directory containing the input files.
        outputFolder: Directory that receives the generated CSV files.
    """
    os.makedirs(outputFolder, exist_ok=True)

    for name in os.listdir(inputfolder):
        # os.listdir() yields bare entry names, so the entry has to be
        # resolved against the folder it was listed from.
        source = os.path.join(inputfolder, name)
        if not os.path.isfile(source):
            # Sub-directories (and other non-files) cannot be converted.
            continue
        print(name)
        # Write into the folder created above instead of the working directory.
        target = os.path.join(outputFolder, 'tweets_' + name + '.csv')
        common.json2csv(source, target, ['text'])
예제 #2
0
 def convert(self, input_file, output_file):
     """Export a fixed set of tweet fields from ``input_file`` to ``output_file``.

     The input is opened in text mode and the open handle is passed to
     ``json2csv`` together with the output name and the list of fields.

     Returns:
         Always the integer ``1``.
     """
     columns = [
         'created_at',
         'favorite_count',
         'id',
         'in_reply_to_status_id',
         'in_reply_to_user_id',
         'retweet_count',
         'retweeted',
         'text',
         'truncated',
         'user.id',
     ]
     with open(input_file) as source:
         json2csv(source, output_file, columns)
     return 1
예제 #3
0
def test_user_metadata(tmp_path, infile):
    """Export tweet and user fields to CSV and compare with the reference file.

    The CSV is written below ``tmp_path``; the test passes when
    ``files_are_identical`` reports the output and the ``.ref`` file found
    under ``subdir`` as identical.
    """
    wanted = [
        "id",
        "text",
        "user.id",
        "user.followers_count",
        "user.friends_count",
    ]
    expected = subdir / "tweets.20150430-223406.user.csv.ref"
    produced = tmp_path / "tweets.20150430-223406.user.csv"

    json2csv(infile, produced, wanted, gzip_compress=False)
    assert files_are_identical(produced, expected)
예제 #4
0
 def test_textoutput(self):
     """Export only the ``text`` field and compare the CSV with its reference.

     The output is written inside a temporary directory; the assertion
     passes when ``are_files_identical`` returns a true value.
     """
     expected = os.path.join(
         self.subdir, 'tweets.20150430-223406.text.csv.ref')
     with TemporaryDirectory() as workdir:
         produced = os.path.join(workdir, 'tweets.20150430-223406.text.csv')
         json2csv(self.infile, produced, ['text'], gzip_compress=False)
         matches = are_files_identical(produced, expected)
         self.assertTrue(matches, msg=self.msg)
예제 #5
0
def find_matching_tweets(num_tweets=100,
                         fname="matching_tweets.csv",
                         shownum=50):
    """Stream tweets tracked by "trump", export id/text to CSV and preview them.

    A ``Streamer`` is created from the stored credentials and a
    ``TweetWriter`` built with ``limit=num_tweets`` is registered on it.  The
    JSON file reported by the writer is then converted with ``json2csv`` into
    the CSV file ``fname`` holding the ``id`` and ``text`` fields.  Retweets
    are not filtered out.

    Args:
        num_tweets: Limit passed to the ``TweetWriter``.
        fname: Name of the CSV file to write and read back.
        shownum: Number of leading rows to return.

    Returns:
        The first ``shownum`` rows of the CSV as a pandas DataFrame.
    """
    credentials = credsfromfile()

    # Build the streamer and attach the writer that stores incoming tweets.
    streamer = Streamer(**credentials)
    tweet_writer = TweetWriter(limit=num_tweets)
    streamer.register(tweet_writer)

    # Path of the JSON file the writer has just created.
    json_path = tweet_writer.timestamped_file()

    # The track filter is case-insensitive (per the original author's note).
    # NOTE(review): sample() is invoked right after filter(); confirm that
    # both calls are intended.
    streamer.filter(track="trump")
    streamer.sample()

    # Only these two fields are exported for now.
    columns = ['id', 'text']
    with open(json_path) as handle:
        json2csv(handle, fname, columns)

    # Load the CSV with pandas and hand back its leading rows.
    frame = pd.read_csv(fname, encoding="utf8")
    return frame.head(shownum)
    def test_user_metadata(self):
        """Export tweet and user fields and compare the CSV with its reference.

        The CSV is produced inside a temporary directory; the assertion
        passes when ``are_files_identical`` returns a true value.
        """
        wanted = [
            'id',
            'text',
            'user.id',
            'user.followers_count',
            'user.friends_count',
        ]
        expected = os.path.join(
            self.subdir, 'tweets.20150430-223406.user.csv.ref')

        with TemporaryDirectory() as workdir:
            produced = os.path.join(workdir, 'tweets.20150430-223406.user.csv')
            json2csv(self.infile, produced, wanted, gzip_compress=False)
            matches = are_files_identical(produced, expected)
            self.assertTrue(matches, msg=self.msg)
예제 #7
0
def test_file_is_wrong(tmp_path, infile):
    """Guard against false positives of the file comparison.

    The ``text`` export is compared with the *retweet* reference file, so
    ``files_are_identical`` is expected to return a false value.
    """
    produced = tmp_path / "tweets.20150430-223406.text.csv"
    mismatching_ref = subdir / "tweets.20150430-223406.retweet.csv.ref"

    json2csv(infile, produced, ["text"], gzip_compress=False)
    identical = files_are_identical(produced, mismatching_ref)
    assert not identical
    def test_user_metadata(self):
        """Check the CSV export of tweet and user fields against its reference.

        ``json2csv`` writes into a temporary directory and the result must be
        reported as identical to the ``.ref`` file by ``are_files_identical``.
        """
        reference = os.path.join(
            self.subdir,
            'tweets.20150430-223406.user.csv.ref',
        )
        columns = ['id', 'text', 'user.id',
                   'user.followers_count', 'user.friends_count']

        with TemporaryDirectory() as scratch:
            result = os.path.join(scratch, 'tweets.20150430-223406.user.csv')
            json2csv(self.infile, result, columns, gzip_compress=False)
            same = are_files_identical(result, reference)
            self.assertTrue(same, msg=self.msg)
 def test_file_is_wrong(self):
     """Guard against false positives of the file comparison.

     The ``text`` export is compared with the *retweet* reference file, so
     ``are_files_identical`` is expected to return a false value.
     """
     mismatching_ref = os.path.join(
         self.subdir, 'tweets.20150430-223406.retweet.csv.ref')
     with TemporaryDirectory() as workdir:
         produced = os.path.join(workdir, 'tweets.20150430-223406.text.csv')
         json2csv(self.infile, produced, ['text'], gzip_compress=False)
         identical = are_files_identical(produced, mismatching_ref)
         self.assertFalse(identical, msg=self.msg)
 def test_file_is_wrong(self):
     """Verify that differing files are not reported as identical.

     A ``text``-only CSV is generated in a temporary directory and compared
     with the reference of the *retweet* export; the comparison must fail.
     """
     reference = os.path.join(
         self.subdir,
         'tweets.20150430-223406.retweet.csv.ref',
     )
     with TemporaryDirectory() as scratch:
         result = os.path.join(scratch, 'tweets.20150430-223406.text.csv')
         json2csv(self.infile, result, ['text'], gzip_compress=False)
         same = are_files_identical(result, reference)
         self.assertFalse(same, msg=self.msg)
예제 #11
0
def test_user_metadata(tmp_path, infile):
    """Check the CSV export of tweet and user fields against its reference.

    ``json2csv`` writes below ``tmp_path`` and the result must be reported as
    identical to the ``.ref`` file under ``subdir`` by ``files_are_identical``.
    """
    columns = ['id', 'text', 'user.id',
               'user.followers_count', 'user.friends_count']
    result = tmp_path / 'tweets.20150430-223406.user.csv'
    reference = subdir / 'tweets.20150430-223406.user.csv.ref'

    json2csv(infile, result, columns, gzip_compress=False)
    same = files_are_identical(result, reference)
    assert same
    def test_tweet_metadata(self):
        """Export the tweet-level fields and compare the CSV with its reference.

        The CSV is produced inside a temporary directory; the assertion
        passes when ``are_files_identical`` returns a true value.
        """
        wanted = [
            'created_at',
            'favorite_count',
            'id',
            'in_reply_to_status_id',
            'in_reply_to_user_id',
            'retweet_count',
            'retweeted',
            'text',
            'truncated',
            'user.id',
        ]
        expected = os.path.join(
            self.subdir, 'tweets.20150430-223406.tweet.csv.ref')

        with TemporaryDirectory() as workdir:
            produced = os.path.join(workdir, 'tweets.20150430-223406.tweet.csv')
            json2csv(self.infile, produced, wanted, gzip_compress=False)
            matches = are_files_identical(produced, expected)
            self.assertTrue(matches, msg=self.msg)
예제 #13
0
def test_tweet_metadata(tmp_path, infile):
    """Export the tweet-level fields and compare the CSV with its reference.

    The CSV is written below ``tmp_path``; the test passes when
    ``files_are_identical`` reports it as identical to the ``.ref`` file
    found under ``subdir``.
    """
    wanted = [
        'created_at', 'favorite_count', 'id',
        'in_reply_to_status_id', 'in_reply_to_user_id', 'retweet_count',
        'retweeted', 'text', 'truncated', 'user.id',
    ]
    produced = tmp_path / 'tweets.20150430-223406.tweet.csv'
    expected = subdir / 'tweets.20150430-223406.tweet.csv.ref'

    json2csv(infile, produced, wanted, gzip_compress=False)
    matches = files_are_identical(produced, expected)
    assert matches
예제 #14
0
def test_tweet_metadata(tmp_path, infile):
    """Check the CSV export of the tweet-level fields against its reference.

    ``json2csv`` writes below ``tmp_path`` and the result must be reported as
    identical to the ``.ref`` file under ``subdir`` by ``files_are_identical``.
    """
    columns = [
        "created_at", "favorite_count", "id",
        "in_reply_to_status_id", "in_reply_to_user_id",
        "retweet_count", "retweeted",
        "text", "truncated", "user.id",
    ]
    reference = subdir / "tweets.20150430-223406.tweet.csv.ref"
    result = tmp_path / "tweets.20150430-223406.tweet.csv"

    json2csv(infile, result, columns, gzip_compress=False)
    same = files_are_identical(result, reference)
    assert same
예제 #15
0
def read_single_tweets(path):
    """Return the tokenized English tweets found in the JSON file at ``path``.

    The ``text`` field of every tweet is first exported with ``json2csv`` to
    ``<dirname(path)>_text/<basename(path)>.csv`` (the directory is created
    on demand).  The CSV is then read back and every tweet that ``detect``
    classifies as ``'en'`` is passed through ``clean_and_tokenize`` and
    collected.

    Processing is best effort: when converting or reading fails, ``path`` is
    printed and whatever was collected up to that point is returned.

    Args:
        path: Location of the tweets file.

    Returns:
        A list with one ``clean_and_tokenize`` result per English tweet; an
        empty list when ``path`` is not an existing regular file.
    """
    word_list = []
    if not os.path.isfile(path):
        return word_list

    input_tweets = twitter_samples.abspath(os.path.abspath(path))
    output_tweets = os.path.join(
        os.path.dirname(path) + '_text',
        os.path.basename(path) + '.csv')
    os.makedirs(os.path.dirname(output_tweets), exist_ok=True)
    try:
        with open(input_tweets) as fp:
            json2csv(fp, output_tweets, ['text'])
        # json2csv writes UTF-8 unless told otherwise, so read the file back
        # with the same encoding instead of the platform default; newline=''
        # lets the csv module handle line breaks embedded in tweet texts.
        with open(output_tweets, 'r', encoding='utf8', newline='') as fp:
            for row in csv.DictReader(fp):
                try:
                    tweet = row['text']
                    if detect(tweet) == 'en':
                        word_list.append(clean_and_tokenize(tweet))
                except lang_detect_exception.LangDetectException:
                    # The language could not be determined: skip the tweet.
                    continue
    except Exception:
        # Keep the best-effort behaviour, but no longer swallow
        # KeyboardInterrupt / SystemExit as the bare ``except`` did.
        print(path)
    return word_list
 def test_textoutput(self):
     """Check the ``text``-only CSV export against its reference file.

     ``json2csv`` writes into a temporary directory and the result must be
     reported as identical to the ``.ref`` file by ``are_files_identical``.
     """
     reference = os.path.join(
         self.subdir,
         'tweets.20150430-223406.text.csv.ref',
     )
     with TemporaryDirectory() as scratch:
         result = os.path.join(scratch, 'tweets.20150430-223406.text.csv')
         json2csv(self.infile, result, ['text'], gzip_compress=False)
         same = are_files_identical(result, reference)
         self.assertTrue(same, msg=self.msg)
예제 #17
0
def test_textoutput(tmp_path, infile):
    """Export only the ``text`` field and compare the CSV with its reference.

    The CSV is written below ``tmp_path``; the test passes when
    ``files_are_identical`` reports it as identical to the ``.ref`` file
    found under ``subdir``.
    """
    produced = tmp_path / "tweets.20150430-223406.text.csv"
    expected = subdir / "tweets.20150430-223406.text.csv.ref"

    json2csv(infile, produced, ["text"], gzip_compress=False)
    matches = files_are_identical(produced, expected)
    assert matches
예제 #18
0
n = 10  # number of tweets to fetch
username = '******'

# Query
client = Query(**oauth)  # historical data
client.register(TweetWriter())  # write the fetched tweets out
client.user_tweets(username, n)  # fetch the tweets (n of them)

# English rendering of the note in the string below: use json2csv to extract
# the tweets data (`text` field); the path given for input_file has to be
# adjusted to wherever the Query above wrote its data.
'''
使用 json2csv 存取 tweets 資料 (text欄位)
input_file 的 abspath 需參考上述 Query 寫入資料的路徑做修改
'''

input_file = twitter_samples.abspath('/Users/youngmihuang/twitter-files/tweets.20180726-155316.json')
with open(input_file) as fp:
    json2csv(fp, 'tweets_text.csv', ['text'])

# Read the CSV back and print every tweet text
data = pd.read_csv('tweets_text.csv')
for line in data.text:
    print('Trump tweets content: ')
    print(line)

# Tokenization: print the first five tokenized tweets
tokenized = twitter_samples.tokenized(input_file)
for tok in tokenized[:5]:
    print('tokenized: ')
    print(tok)

# tweets 資料處理
with open(input_file) as fp:
예제 #19
0
from nltk.corpus import twitter_samples
from nltk.twitter.common import json2csv

# The twitter_samples corpus holds roughly 20k tweets (per the original
# author's note); work on the last file id the corpus lists.
jsonfile = twitter_samples.fileids()[-1]

# Resolve the file through the corpus reader.  According to the original
# author's note, os.path.abspath(jsonfile) would point into the virtualenv,
# whereas the corpus reader returns the system-wide /usr/share/ location.
input_file = twitter_samples.abspath(jsonfile)

# json2csv(file pointer, output file name, [field1, field2, ...]).
# Plan noted by the author: pick the attributes to import, load them into a
# pandas DataFrame, apply stemming to the tweet texts and save the result.
with open(input_file) as fp:
    json2csv(
        fp,
        'tweets_dataframe.csv',
        [
            'id',
            'text',
            'user.favourites_count',
            'user.id',
            'lang',
            'user.followers_count',
            'user.verified',
            'truncated',
        ],
    )
#json, csv
from nltk.twitter.common import json2csv

# Absolute location of the tweets file.  According to the original author's
# note, os.path.abspath(jsonfile) would return a path inside the virtualenv,
# so the path is spelled out here.
input_file = '/home/mradul/twitter-files/Tweets.json'

# json2csv(file pointer, output file name, [field1, field2, ...]).
# Plan noted by the author: pick the attributes to import, load them into a
# pandas DataFrame, apply stemming to the tweet texts and save the result.
with open(input_file) as fp:
    json2csv(
        fp,
        'boston.csv',
        [
            'text',
            'id',
            'from_user',
            'iso_language_code',
        ],
    )
#json, csv