Example #1
def insert_record(connection, tweet, word_list):
    """Insert one tweet into the Original_tweets table."""
    try:
        with connection.cursor() as cursor:
            # Extract the fields for the new record
            tweet_id = tweet['id']  # renamed so it doesn't shadow builtin id()
            tweet_txt = tweet['text']
            cleaned_text = sh.parse_sentence(tweet_txt, word_list)
            tweet_url = tweet['extended_entities']['media'][0]['media_url']
            timestamp = convert_twitter_date_to_datetime(tweet['created_at'])
            username = tweet['user']['screen_name']
            sql = "INSERT INTO Original_tweets (tweet_id, username, text, " \
                  "processed_text, image_url, created_ts) " \
                  "VALUES (%s, %s, %s, %s, %s, %s)"
            # Parameterized query keeps the values safely escaped
            cursor.execute(sql, (tweet_id, username, tweet_txt, cleaned_text,
                                 tweet_url, timestamp))
        connection.commit()  # assumes autocommit is off (e.g. PyMySQL default)
    except Exception as err:
        # Catch Exception rather than a bare except that silently swallows
        # everything, including KeyboardInterrupt
        print(err)
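
A minimal usage sketch, assuming a PyMySQL connection (the host, credentials, and database name below are placeholders) and a tweet dict already parsed from the Twitter streaming API; sh.english_word_list() is the project's own helper:

import pymysql

# Placeholder connection details; swap in your own host/credentials/database
connection = pymysql.connect(host='localhost', user='user',
                             password='secret', database='tweets',
                             charset='utf8mb4')
word_list = sh.english_word_list()
insert_record(connection, tweet, word_list)
connection.close()
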
Example #2
import json
import time

# sh, cs, ts, prt and dedupe are helper modules from the surrounding project;
# IMAGE_SAVE_PATH and fetch_image are assumed to be defined at module level
def fetchsamples(needed_sent_val=None, max_iters=1000):
    """Stream sample tweets, keeping those whose sentiment is consistent
    across the VADER, AFINN and Hu-Liu lexicons."""
    word_list = sh.english_word_list()
    afinn_dict = cs.load_afinn_dictionary('text_sentiment/AFINN-111.txt')
    huliu_dict = \
        cs.load_huliu_dict('text_sentiment/hu_liu/opinion-lexicon-English/')
    url = "https://stream.twitter.com/1/statuses/sample.json"
    parameters = []
    response = ts.twitterreq(url, "GET", parameters)
    num_iters = 0

    for line in response:

        # Stop once max_iters records have been saved (>= avoids the
        # off-by-one that > allowed)
        if num_iters >= max_iters:
            break

        if isinstance(line, bytes):
            line = line.decode('utf-8')

        # Parse the JSON payload; on failure (keep-alive line or error
        # message) wait 1 sec to avoid hitting rate limits
        try:
            tweet = json.loads(line.strip())
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            time.sleep(1)
            print('waiting....')
            continue

        # Skip tweets that don't meet basic criteria
        if not prt.decide_to_include_tweet(tweet):
            continue
        if not prt.image_is_original(tweet):
            continue

        # Calculate tweet sentiment
        tweet_txt = tweet['text']
        cleaned_text = sh.parse_sentence(tweet_txt, word_list)
        vader_sent = cs.calculate_vader(cleaned_text)
        afinn_sent = cs.calculate_simple_sentiment(cleaned_text, afinn_dict)
        huliu_sent = cs.calculate_simple_sentiment(cleaned_text, huliu_dict)
        consistent = vader_sent == afinn_sent == huliu_sent
        if not consistent:
            continue
        # Explicit None check so a falsy sentiment value still filters correctly
        if needed_sent_val is not None and (vader_sent != needed_sent_val):
            continue

        # retrieve and hash image
        image_url = tweet['extended_entities']['media'][0]['media_url']
        img = fetch_image(image_url)
        image_hash = dedupe.calculate_image_hash(img)

        # Ensure not an exact duplicate
        match = dedupe.find_matching_hash(image_hash, tweet['id'])
        if match:
            try:
                add_dupe_to_db(tweet, match, vader_sent,
                               image_hash, cleaned_text)
            except Exception as err:
                print(err)
            continue

        # Save image and write info to db
        try:
            add_new_record_to_db(tweet, vader_sent, image_hash, cleaned_text)
            img.save(IMAGE_SAVE_PATH + tweet['id_str'] + '.jpg')
        except Exception as err:
            print(err)
            continue
        num_iters += 1

    return
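
A hedged usage sketch; OAuth setup is assumed to live inside ts.twitterreq, and the 'positive' literal is an assumption about the value type the cs sentiment helpers return:

# Collect up to 500 tweets whose sentiment is consistently positive
fetchsamples(needed_sent_val='positive', max_iters=500)

# Accept any consistent sentiment, up to the default 1000 saved tweets
fetchsamples()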