Example #1

import pymysql

# sql_vals (MySQL connection settings) and the helper functions used below
# (sh.english_word_list, subset_tweets, image_is_original, insert_record)
# are project-local and assumed importable.

def send_to_database(tweet_file):
    english_words = sh.english_word_list()
    tweet_list = subset_tweets(tweet_file)

    # Open database connection
    connection = pymysql.connect(host=sql_vals.host,
                                 password=sql_vals.password,
                                 port=sql_vals.port,
                                 user=sql_vals.user,
                                 db=sql_vals.db,
                                 cursorclass=pymysql.cursors.DictCursor,
                                 charset='utf8mb4')

    # save relevant data on tweets with original images; close the
    # connection even if an insert fails partway through
    insert_count = 0
    try:
        for tweet in tweet_list:
            if image_is_original(tweet):
                insert_record(connection, tweet, english_words)
                insert_count += 1
        connection.commit()
        print(f'{insert_count} records inserted\n')
    finally:
        connection.close()
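
insert_record is defined elsewhere in the project. As a rough illustration of the pymysql pattern the function above relies on (one cursor per insert, a single commit after the loop), here is a minimal sketch; the tweets table name and column list are assumptions, not taken from the source:

def insert_record(connection, tweet, english_words):
    # Hypothetical schema: the real column set is not shown in the source.
    cleaned_text = sh.parse_sentence(tweet['text'], english_words)
    sql = ('INSERT INTO tweets (id, tweet_text, cleaned_text, media_url) '
           'VALUES (%s, %s, %s, %s)')
    with connection.cursor() as cursor:
        cursor.execute(sql, (
            tweet['id'],
            tweet['text'],
            cleaned_text,
            tweet['extended_entities']['media'][0]['media_url'],
        ))

Because pymysql leaves autocommit off by default, the single commit in send_to_database makes the whole batch atomic: an exception partway through discards every pending insert.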
Example #2

import json
import time

# sh, cs, ts, prt, and dedupe are project-local helper modules; fetch_image,
# add_dupe_to_db, add_new_record_to_db, and IMAGE_SAVE_PATH are likewise
# defined elsewhere in the project.

def fetchsamples(needed_sent_val=None, max_iters=1000):
    word_list = sh.english_word_list()
    afinn_dict = cs.load_afinn_dictionary('text_sentiment/AFINN-111.txt')
    huliu_dict = \
        cs.load_huliu_dict('text_sentiment/hu_liu/opinion-lexicon-English/')
    url = "https://stream.twitter.com/1/statuses/sample.json"
    parameters = []
    response = ts.twitterreq(url, "GET", parameters)
    num_iters = 0

    for line in response:

        # stop once max_iters tweets have been saved; num_iters counts
        # successful saves, not raw stream lines
        if num_iters >= max_iters:
            break

        if isinstance(line, bytes):
            line = line.decode('utf-8')

        # parse the line as JSON; on failure (keep-alive line or error
        # message) wait 1 second to avoid hitting rate limits
        try:
            tweet = json.loads(line.strip())
        except json.JSONDecodeError:
            time.sleep(1)
            print('waiting....')
            continue

        # stop processing if tweet doesn't meet basic criteria
        if not prt.decide_to_include_tweet(tweet):
            continue
        if not prt.image_is_original(tweet):
            continue

        # Calculate tweet sentiment
        tweet_txt = tweet['text']
        cleaned_text = sh.parse_sentence(tweet_txt, word_list)
        vader_sent = cs.calculate_vader(cleaned_text)
        afinn_sent = cs.calculate_simple_sentiment(cleaned_text, afinn_dict)
        huliu_sent = cs.calculate_simple_sentiment(cleaned_text, huliu_dict)
        # keep only tweets where all three sentiment scorers agree
        consistent = vader_sent == afinn_sent == huliu_sent
        if not consistent:
            continue
        # "is not None" so a neutral sentiment value of 0 can still be requested
        if needed_sent_val is not None and vader_sent != needed_sent_val:
            continue

        # retrieve and hash image
        image_url = tweet['extended_entities']['media'][0]['media_url']
        img = fetch_image(image_url)
        image_hash = dedupe.calculate_image_hash(img)

        # Ensure not an exact duplicate
        match = dedupe.find_matching_hash(image_hash, tweet['id'])
        if match:
            try:
                add_dupe_to_db(tweet, match, vader_sent,
                               image_hash, cleaned_text)
            except Exception as err:
                print(err)
            continue

        # Save image and write info to db
        try:
            add_new_record_to_db(tweet, vader_sent, image_hash, cleaned_text)
            img.save(IMAGE_SAVE_PATH + tweet['id_str'] + '.jpg')
        except Exception as err:
            print(err)
            continue
        num_iters += 1

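fetch_image and dedupe.calculate_image_hash are likewise defined elsewhere. A minimal sketch of the retrieve-and-hash step, assuming requests and Pillow for the download and the third-party imagehash package for the perceptual hash (whether the original project uses imagehash is an assumption):

import io

import imagehash
import requests
from PIL import Image

def fetch_image(image_url):
    # Download the tweet's image into memory and open it with Pillow.
    resp = requests.get(image_url, timeout=10)
    resp.raise_for_status()
    return Image.open(io.BytesIO(resp.content))

def calculate_image_hash(img):
    # Perceptual hash: re-encoded or resized copies of the same picture
    # map to the same (or a very close) hash value.
    return str(imagehash.phash(img))

Using a perceptual rather than a cryptographic hash is what lets find_matching_hash flag re-uploads of the same image as duplicates even when the file bytes differ.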