示例#1
0
def process_tweets_from_file(fin, fout):
    valid_count = 0
    invalid_count = 0
    sia = SIA()

    list_of_tweets = []
    with open(fin) as f:
        for line in f:
            j = json.loads(line)
            try:
                # validate date format
                created_at = datetime.strptime(j['created_at'],
                                               '%Y-%m-%dT%H:%M:%S')

                # text formatting
                formatted_text = format.format_full(j['text'])
                j['formatted_text'] = formatted_text

                # sentiment
                sent = sia.polarity_scores(formatted_text)
                j['sentiment'] = sent

                list_of_tweets.append(json.dumps(j))
                valid_count += 1
                if (valid_count % 25000 == 0):
                    print(valid_count)
            except ValueError:
                invalid_count += 1
                if (invalid_count % 100 == 0):
                    print('Invalid:' + str(created_at), j['created_at'])
                continue

    with open(fout, 'w') as f:
        for tweet in list_of_tweets:
            f.write(tweet + '\n')
示例#2
0
def process_tweets_from_file(fin, fout):
    valid_count = 0
    invalid_count = 0
    sia = SIA()
    list_of_tweets = []
    
    reader = csv.reader(open('/home/amit/Desktop/Sentiment_analysis/Bitcoin-price-prediction-model-master/coindesk-bpi-USD-close_data-2010-07-18_2018-06-07.csv', 'r'))
    d = {}
    for row in reader:
        k, v = row
        print(k)
        k = datetime.datetime.strptime(k, '%Y-%m-%d %H:%M:%S').date()
        #print(k)
        d[k] = v
    print(d)
        
    with open(fin) as f:
        for line in f:
            j = json.loads(line)
            try:
                # validate date format
                created_at = datetime.datetime.strptime(j['created_at'], '%Y-%m-%dT%H:%M:%S')
                #sprint(type(created_at))
                this_date = created_at.date()
                print(this_date)
                next_date = this_date + datetime.timedelta(days=1)
                j['price'] = 0
                if(next_date > datetime.datetime.today().date()):
                	j['price'] = d[this_date]
                else:
	                j['price'] = d[next_date]

                # datetime.strptime('2016-10-27 22:58:14', '%Y-%m-%d %H:%M:%S')

                # text formatting
                formatted_text = format.format_full(j['text'])
                j['formatted_text'] = formatted_text

                # sentiment
                sent = sia.polarity_scores(formatted_text)
                j['sentiment'] = sent

                list_of_tweets.append(json.dumps(j))
                valid_count += 1
                if (valid_count % 25000 == 0):
                    print(valid_count)
            except ValueError:
                invalid_count += 1
                if (invalid_count % 100 == 0):
                    print('Invalid:' + str(created_at), j['created_at'])
                continue

            
    
    with open(fout, 'w') as f:
        for tweet in list_of_tweets:
            f.write(tweet+'\n')
    
    print("Successfully processed", len(list_of_tweets), "tweets")