Example #1
import time

from twarc import Twarc


def test_since_id():
    t = Twarc()
    for tweet in t.search('obama'):
        id = tweet['id_str']
        break
    assert id
    time.sleep(5)
    for tweet in t.search('obama', since_id=id):
        # Compare as integers: id_str is a decimal string, and lexicographic
        # comparison breaks once ids differ in length.
        assert int(tweet['id_str']) > int(id)
Example #2
import time

from twarc import Twarc


def test_max_id():
    t = Twarc()
    for tweet in t.search('obama'):
        id = tweet['id_str']
        break
    assert id
    time.sleep(5)
    count = 0
    for tweet in t.search('obama', max_id=id):
        count += 1
        assert int(tweet['id_str']) <= int(id)  # numeric, not lexicographic, comparison
        if count > 100:
            break
Example #3
from twarc import Twarc


def test_max_and_since_ids():
    t = Twarc()
    max_id = since_id = None
    count = 0
    for tweet in t.search('obama'):
        count += 1
        if not max_id:
            max_id = tweet['id_str']
        since_id = tweet['id_str']
        if count > 500:
            break
    count = 0
    for tweet in t.search('obama', max_id=max_id, since_id=since_id):
        count += 1
        assert int(tweet['id_str']) <= int(max_id)
        assert int(tweet['id_str']) > int(since_id)
Example #4
from twarc import Twarc


def test_paging():
    # pages are 100 tweets each, so collecting 500 proves paging is working
    t = Twarc()
    count = 0
    for tweet in t.search('obama'):
        count += 1
        if count == 500:
            break
    assert count == 500
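The comment in test_paging above refers to the Search API returning results in pages of 100 tweets, with twarc following the max_id cursor internally. A rough sketch of that cursoring technique (illustrative only, not twarc's literal code; fetch_one_page and process are hypothetical helpers):

# Illustrative cursoring: fetch pages until one comes back empty, moving
# max_id just below the oldest id seen so pages never overlap.
max_id = None
while True:
    page = fetch_one_page('obama', max_id=max_id)  # hypothetical helper
    if not page:
        break
    for tweet in page:
        process(tweet)  # hypothetical
    max_id = str(int(page[-1]['id_str']) - 1)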
Example #5
from twarc import Twarc


def test_search():
    count = 0
    t = Twarc()
    for tweet in t.search('obama'):
        assert tweet['id_str']
        count += 1
        if count == 10:
            break
    assert count == 10
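The tests above call Twarc() with no arguments, which only works when credentials are already available (for example from a ~/.twarc config file). A minimal sketch with explicit placeholder keys:

from twarc import Twarc

# Placeholder credentials -- substitute your own application's keys.
t = Twarc('consumer_key', 'consumer_secret',
          'access_token', 'access_token_secret')

for tweet in t.search('obama'):
    print(tweet['id_str'])
    break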
Example #6
import datetime
import json
import os
import sys
import traceback

from twarc import Twarc

# consumer_key, consumer_secret, access_token, access_token_secret and
# language are assumed to be defined elsewhere at module level.


def main(get_method=None, input_hashtags=None, storage_location=None):
    os.makedirs(storage_location, exist_ok=True)

    hashtag_query = input_hashtags.strip().replace(",", "+OR+")

    try:
        tweets = 0
        t = Twarc(
            consumer_key,
            consumer_secret,
            access_token,
            access_token_secret,
            tweet_mode="extended",
        )

        print(
            "Started storing tweets related to "
            + input_hashtags
            + " at "
            + storage_location
            + " since "
            + str(datetime.datetime.now())
        )

        if get_method == "populate":
            for tweet in t.search(hashtag_query, lang=language):
                with open(
                    os.path.join(
                        storage_location, "tweet" + str(tweet["id"]) + ".json"
                    ),
                    "w",
                    encoding="utf8",
                ) as file:
                    json.dump(tweet, file)
                    tweets += 1

        elif get_method == "track":
            for tweet in t.filter(hashtag_query):
                with open(
                    storage_location + "/tweet" + str(tweet["id"]) + ".json",
                    "w",
                    encoding="utf8",
                ) as file:
                    json.dump(tweet, file)
                    tweets += 1
        else:
            print("No method defined, exiting...")

    except KeyboardInterrupt:
        print("Shutdown requested...successfully stored " + str(tweets) + " tweets")
    except BaseException:
        traceback.print_exc(file=sys.stdout)

    sys.exit(0)
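A hedged usage sketch for main() above; the hashtag list and storage path are made-up values, and the module-level credentials are assumed to be set:

# "populate" backfills matching tweets via the Search API; "track"
# follows the streaming API until interrupted with Ctrl-C.
main(get_method="populate",
     input_hashtags="#python,#twarc",
     storage_location="./tweets/")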
Example #7
from datetime import datetime, timedelta

from twarc import Twarc


def count_tweets(app_auth):
    """
    Search for covid_19 in tweets using the given context and return the number
    of tweets that were fetched in 10 minutes.
    """
    count = 0
    t = Twarc(app_auth=app_auth)
    start = None
    for tweet in t.search('covid_19'):
        # start the timer when we get the first tweet
        if start is None:
            start = datetime.now()
        count += 1
        if datetime.now() - start > timedelta(minutes=10):
            break
    t.client.close()
    return count
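A possible driver for the benchmark above, assuming the consumer keys are available from the twarc config; app-only and user auth have different search rate limits, which is what the 10-minute count makes visible:

# Compare 10-minute search throughput under app-only vs. user auth.
for mode in (True, False):
    print("app_auth={}: {} tweets".format(mode, count_tweets(mode)))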
Example #8
def crawl_feed(feed_dict, credentials):
    twarc = Twarc(credentials['consumer_key'], credentials['consumer_secret'],
                  credentials['access_token'],
                  credentials['access_token_secret'])
    crawl_time = datetime.datetime.now()
    crawl_time_filename = crawl_time.strftime('%Y%m%d%I%M%S')
    crawl_time_html = crawl_time.strftime('%B %d, %Y')
    crawl_name = feed_dict['crawl_name']
    crawl_type = feed_dict['crawl_type']
    short_name = feed_dict['short_name']
    search_string = feed_dict['search_string']

    feed_dir = feed_dict['feed_dir']
    json_dir = join(feed_dir, 'json')
    html_dir = join(feed_dir, 'html')
    media_dir = join(feed_dir, 'media')
    logs_dir = join(feed_dir, 'logs')

    for directory in [feed_dir, json_dir, html_dir, media_dir, logs_dir]:
        if not os.path.exists(directory):
            os.makedirs(directory)

    log_file = join(logs_dir, 'twarc.log')

    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    logger = logging.getLogger(crawl_name)
    handler = logging.FileHandler(log_file)
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)

    base_filename = short_name + '-' + crawl_time_filename
    json_file = join(json_dir, base_filename + '.json')

    print("Searching Twitter API for {0}".format(search_string))
    print("Writing JSON and HTML files...")

    logger.info("starting search for %s", search_string)
    tweet_count = 0

    if crawl_type == "timeline":
        for tweet in twarc.timeline(screen_name=search_string):
            with open(json_file, 'a') as json_out:
                json_out.write("{}\n".format(json.dumps(tweet)))

            if "id_str" in tweet:
                logger.info("archived https://twitter.com/%s/status/%s",
                            tweet['user']['screen_name'], tweet["id_str"])
            elif 'limit' in tweet:
                logger.warning("%s tweets undelivered", tweet["limit"]["track"])
            elif 'warning' in tweet:
                logger.warning(tweet['warning']['message'])
            else:
                logger.warning(json.dumps(tweet))

            tweet_count += 1

    else:
        for tweet in twarc.search(search_string):
            with open(json_file, 'a') as json_out:
                json_out.write("{}\n".format(json.dumps(tweet)))

            if "id_str" in tweet:
                logger.info("archived https://twitter.com/%s/status/%s",
                            tweet['user']['screen_name'], tweet["id_str"])
            elif 'limit' in tweet:
                logger.warning("%s tweets undelivered", tweet["limit"]["track"])
            elif 'warning' in tweet:
                logger.warning(tweet['warning']['message'])
            else:
                logger.warning(json.dumps(tweet))

            tweet_count += 1

    if tweet_count == 0:
        logger.info("no new tweets matching %s", search_string)

        # Write an empty json file. Maybe don't do this?
        with open(json_file, 'w'):
            pass

    return base_filename, tweet_count, crawl_time_html
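crawl_feed() reads both of its dictionary arguments by key, so a caller must supply at least the shape below; every value shown is a hypothetical example:

credentials = {
    'consumer_key': 'insert key', 'consumer_secret': 'insert secret',
    'access_token': 'insert token', 'access_token_secret': 'insert token secret',
}
feed_dict = {
    'crawl_name': 'examplefeed',        # names the logger
    'crawl_type': 'search',             # anything other than "timeline" runs a search
    'short_name': 'example',            # prefix for output filenames
    'search_string': '#example',        # query, or a screen name for timelines
    'feed_dir': '/data/feeds/example',  # json/, html/, media/ and logs/ are created here
}
base_filename, tweet_count, crawl_time_html = crawl_feed(feed_dict, credentials)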
 print "Signing in as: " + acct_name
 search = "\"" + search + "\""
 print(str(count) + "/" + str(max_s) + " searching: " + search)
 current_label = "search_" + str(count)
 output_dir = output_dir_base + str(count) + "/"
 if not os.path.exists(output_dir):
     print("Created directory: " + output_dir)
     os.makedirs(output_dir)
 fn = os.path.join(output_dir, "target.txt")
 with open(fn, "w") as f:
     f.write(search + "\n")
 dump_filename = output_dir + "raw.json"
 dump_file_handle = open(dump_filename, "a")
 data = {}
 set_counters()
 for status in t.search(search):
     captured_status = {}
     increment_counter("tweets_encountered")
     if "lang" in status:
         lang = status["lang"]
         increment_counter("tweets_" + lang)
         captured_status = capture_status_items(status)
         process_status(captured_status)
         if captured_status is not None:
             increment_counter("tweets_captured")
             increment_counter("tweets_processed")
             increment_counter("tweets_processed_this_interval")
             dump_file_handle.write(json.dumps(captured_status))
             dump_file_handle.write("\n")
             periodic_events()
         sys.stdout.write("#")
Example #10
class AnalyzerProcess():
    def __init__(self, config, loggerObject, alertLoggerObject, rules,
                 executionMode):
        self.logger = loggerObject
        self.alertLogger = alertLoggerObject
        self.rules = rules
        self.config = config
        self.executionMode = executionMode
        self.access_token = "insert Twitter API access token"
        self.access_token_secret = "insert Twitter API token secret"
        self.consumer_key = "insert Twitter API consumer key"
        self.consumer_secret = "insert Twitter API consumer secret"
        self.twarc = Twarc(self.consumer_key, self.consumer_secret,
                           self.access_token, self.access_token_secret)
        self.currdir = "/home/centos/modosint-python3" + path.dirname(__file__)
        self.wcloud = ""
        self.stop_words = get_stop_words('spanish')
        newStopWords = ["http", "https", "co", "n'", "'", '"']
        self.stop_words.extend(newStopWords)

# Search tweets that contain the term in a different language

    def searchDifLanguage(self, text, language, ruleId):
        fichero = open("/var/log/modosint/analyzer-twitter/graylog.txt",
                       "+a",
                       encoding='utf8')
        with io.open("/var/log/modosint/analyzer-twitter/cache.txt",
                     'a+') as f:
            os.chmod("/var/log/modosint/analyzer-twitter/cache.txt", 0o777)
            traductor = Translator()
            translatedText = traductor.translate(text, dest=language)
            repeated = False
            if self.executionMode == "daemon":
                searchDif = self.twarc.search(translatedText.text)
                for tweet in searchDif:
                    tweetTime = parser.parse(''.join(tweet['created_at']))
                    timeFormed = time.strptime(
                        str(tweetTime.time()).split(',')[0], '%H:%M:%S')
                    createdAtSeconds = datetime.timedelta(
                        hours=timeFormed.tm_hour,
                        minutes=timeFormed.tm_min,
                        seconds=timeFormed.tm_sec).total_seconds()
                    nowTimeUtc = datetime.datetime.utcnow().time()
                    nowTimeFormed = time.strptime(
                        str(nowTimeUtc).split('.')[0], '%H:%M:%S')
                    nowTimeSeconds = datetime.timedelta(
                        hours=nowTimeFormed.tm_hour,
                        minutes=nowTimeFormed.tm_min,
                        seconds=nowTimeFormed.tm_sec).total_seconds()
                    if (nowTimeSeconds - createdAtSeconds <
                            300):  #time in 5 minutes
                        if 'retweeted_status' not in tweet:  #avoid RT
                            f.seek(0)  #read temporary file (cache)
                            content = f.readlines()
                            content = [
                                x.strip('\n').strip('u') for x in content
                            ]
                            repeated = tweet['id_str'] in content
                            if not repeated:
                                f.seek(0, 2)  #write temporary file (cache)
                                f.write(tweet['id_str'])
                                f.write('\n')

                                texto = tweet['full_text']

                                for c in texto:
                                    if c in emoji.UNICODE_EMOJI:
                                        texto = texto.replace(c, "")
                                texto = u'' + texto
                                try:
                                    emoji_pattern = re.compile(
                                        u"(\ud83d[\ude00-\ude4f])|"  # emoticons
                                        u"(\ud83c[\udf00-\uffff])|"  # symbols & pictographs (1 of 2)
                                        u"(\ud83d[\u0000-\uddff])|"  # symbols & pictographs (2 of 2)
                                        u"(\ud83d[\ude80-\udeff])|"  # transport & map symbols
                                        u"([\U0001F1E0-\U0001F1FF])|"
                                        u"([\U0001F600-\U0001F64F])|"  # emoticons 2
                                        u"([\U0001F300-\U0001F5FF])|"  # symbols & pictographs
                                        u"([\U0001F680-\U0001F6FF])|"
                                        u"([\u2600-\u26FF])|"
                                        u"(\U0001F1F2\U0001F1F4)|"  # Macau flag
                                        u"([\U0001F1E6-\U0001F1FF]{2})|"  # flags
                                        u"([\U0001F600-\U0001F64F])|"  # emoticons 3
                                        u"(\ud83c[\udde0-\uddff])"  # flags (iOS)
                                        "+",
                                        flags=re.UNICODE)
                                    resultesp = traductor.translate(
                                        emoji_pattern.sub(r'', texto),
                                        dest='es')
                                except ValueError:
                                    self.logger.debug(
                                        '[Emoji Error] Tweet cannot be translated. Unrecognized emoji in tweet.'
                                    )
                                    continue  # resultesp would be undefined below; skip this tweet
                                tweetdata = {
                                    "CreatedTime":
                                    tweet['created_at'],
                                    "short_message":
                                    tweet['full_text'],
                                    "TranslatedTweet":
                                    resultesp.text,
                                    "Author":
                                    tweet['user']['screen_name'],
                                    "Retweets":
                                    tweet['retweet_count'],
                                    "Likes":
                                    tweet['favorite_count'],
                                    "Location":
                                    tweet['user']['location'],
                                    "Rule":
                                    ruleId,
                                    "full_message":
                                    "Tweet matched with RULE: " + ruleId
                                }
                                autotweet = json.dumps(tweetdata)
                                fichero.write(autotweet + '\n')
                                self.wcloud.write(resultesp.text + '\n')
                                os.chmod(
                                    "/var/log/modosint/analyzer-twitter/wcloudRule"
                                    + ruleId + ".txt", 0o777)
                                os.chmod(
                                    "/var/log/modosint/analyzer-twitter/graylog.txt",
                                    0o777)
                    else:
                        break

            else:
                searchDif = self.twarc.search(translatedText.text)
                for tweet in searchDif:
                    tweetTime = ''.join(tweet['created_at'])
                    datetweet = parser.parse(tweetTime)
                    if (datetweet.date() == datetime.datetime.now().date()
                            or datetweet.date()
                            == (datetime.datetime.now().date() -
                                timedelta(1))):
                        if 'retweeted_status' not in tweet:  #avoid RT
                            f.seek(0)  #read temporary file (cache)
                            content = f.readlines()
                            content = [
                                x.strip('\n').strip('u') for x in content
                            ]
                            repeated = tweet['id_str'] in content
                            if not repeated:
                                f.seek(0, 2)  #write temporary file (cache)
                                f.write(tweet['id_str'])
                                f.write('\n')

                                texto = tweet['full_text']

                                for c in texto:
                                    if c in emoji.UNICODE_EMOJI:
                                        texto = texto.replace(c, "")
                                texto = u'' + texto
                                try:
                                    emoji_pattern = re.compile(
                                        u"(\ud83d[\ude00-\ude4f])|"  # emoticons
                                        u"(\ud83c[\udf00-\uffff])|"  # symbols & pictographs (1 of 2)
                                        u"(\ud83d[\u0000-\uddff])|"  # symbols & pictographs (2 of 2)
                                        u"(\ud83d[\ude80-\udeff])|"  # transport & map symbols
                                        u"([\U0001F1E0-\U0001F1FF])|"
                                        u"([\U0001F600-\U0001F64F])|"  # emoticons 2
                                        u"([\U0001F300-\U0001F5FF])|"  # symbols & pictographs
                                        u"([\U0001F680-\U0001F6FF])|"
                                        u"([\u2600-\u26FF])|"
                                        u"(\U0001F1F2\U0001F1F4)|"  # Macau flag
                                        u"([\U0001F1E6-\U0001F1FF]{2})|"  # flags
                                        u"([\U0001F600-\U0001F64F])|"  # emoticons 3
                                        u"(\ud83c[\udde0-\uddff])"  # flags (iOS)
                                        "+",
                                        flags=re.UNICODE)
                                    resultesp = traductor.translate(
                                        emoji_pattern.sub(r'', texto),
                                        dest='es')
                                except ValueError:
                                    self.logger.debug(
                                        '[Emoji Error] Tweet cannot be translated. Unrecognized emoji in tweet.'
                                    )
                                    continue  # resultesp would be undefined below; skip this tweet
                                tweetdata = {
                                    "CreatedTime":
                                    tweet['created_at'],
                                    "short_message":
                                    tweet['full_text'],
                                    "TranslatedTweet":
                                    resultesp.text,
                                    "Author":
                                    tweet['user']['screen_name'],
                                    "Retweets":
                                    tweet['retweet_count'],
                                    "Likes":
                                    tweet['favorite_count'],
                                    "Location":
                                    tweet['user']['location'],
                                    "Rule":
                                    ruleId,
                                    "full_message":
                                    "Tweet matched with RULE: " + ruleId
                                }
                                autotweet = json.dumps(tweetdata)
                                fichero.write(autotweet + '\n')
                                self.wcloud.write(resultesp.text + '\n')
                                os.chmod(
                                    "/var/log/modosint/analyzer-twitter/wcloudRule"
                                    + ruleId + ".txt", 0o777)
                                os.chmod(
                                    "/var/log/modosint/analyzer-twitter/graylog.txt",
                                    0o777)
                    else:
                        break

# Search tweets that contain a term or hashtag

    def searchTweetOrHashtag(self, text, ruleId):
        fichero = open("/var/log/modosint/analyzer-twitter/graylog.txt",
                       "+a",
                       encoding='utf8')
        with io.open("/var/log/modosint/analyzer-twitter/cache.txt",
                     'a+') as f:
            os.chmod("/var/log/modosint/analyzer-twitter/cache.txt", 0o777)
            repeated = False
            if self.executionMode == "daemon":
                tweets = self.twarc.search(text)
                for tweet in tweets:
                    tweetTime = parser.parse(''.join(tweet['created_at']))
                    timeFormed = time.strptime(
                        str(tweetTime.time()).split(',')[0], '%H:%M:%S')
                    createdAtSeconds = datetime.timedelta(
                        hours=timeFormed.tm_hour,
                        minutes=timeFormed.tm_min,
                        seconds=timeFormed.tm_sec).total_seconds()
                    nowTimeUtc = datetime.datetime.utcnow().time()
                    nowTimeFormed = time.strptime(
                        str(nowTimeUtc).split('.')[0], '%H:%M:%S')
                    nowTimeSeconds = datetime.timedelta(
                        hours=nowTimeFormed.tm_hour,
                        minutes=nowTimeFormed.tm_min,
                        seconds=nowTimeFormed.tm_sec).total_seconds()
                    if (nowTimeSeconds - createdAtSeconds <
                            300):  #time in 5 minutes
                        if 'retweeted_status' not in tweet:  #avoid RT
                            f.seek(0)  #read temporary file (cache)
                            content = f.readlines()
                            content = [
                                x.strip('\n').strip('u') for x in content
                            ]
                            repeated = tweet['id_str'] in content
                            if not repeated:
                                f.seek(0, 2)  #write temporary file (cache)
                                f.write(tweet['id_str'] + '\n')
                                tweetdata = {
                                    "CreatedTime":
                                    tweet['created_at'],
                                    "short_message":
                                    tweet['full_text'],
                                    "Author":
                                    tweet['user']['screen_name'],
                                    "Retweets":
                                    tweet['retweet_count'],
                                    "Likes":
                                    tweet['favorite_count'],
                                    "Location":
                                    tweet['user']['location'],
                                    "Rule":
                                    ruleId,
                                    "full_message":
                                    "Tweet matched with RULE: " + ruleId
                                }
                                autotweet = json.dumps(tweetdata)
                                fichero.write(autotweet + '\n')
                                self.wcloud.write(tweet['full_text'] + '\n')
                                os.chmod(
                                    "/var/log/modosint/analyzer-twitter/wcloudRule"
                                    + ruleId + ".txt", 0o777)
                                os.chmod(
                                    "/var/log/modosint/analyzer-twitter/graylog.txt",
                                    0o777)
                    else:
                        break

            else:
                tweets = self.twarc.search(text)
                for tweet in tweets:  # not daemon mode: tweets from today and yesterday
                    tweetTime = ''.join(tweet['created_at'])
                    datetweet = parser.parse(tweetTime)
                    if (datetweet.date() == datetime.datetime.now().date()
                            or datetweet.date()
                            == (datetime.datetime.now().date() -
                                timedelta(1))):
                        if 'retweeted_status' not in tweet:  #avoid RT
                            f.seek(0)  #read temporary file (cache)
                            content = f.readlines()
                            content = [
                                x.strip('\n').strip('u') for x in content
                            ]
                            repeated = tweet['id_str'] in content
                            if not repeated:
                                f.seek(0, 2)  #write temporary file (cache)
                                f.write(tweet['id_str'] + '\n')
                                tweetdata = {
                                    "CreatedTime":
                                    tweet['created_at'],
                                    "short_message":
                                    tweet['full_text'],
                                    "Author":
                                    tweet['user']['screen_name'],
                                    "Retweets":
                                    tweet['retweet_count'],
                                    "Likes":
                                    tweet['favorite_count'],
                                    "Location":
                                    tweet['user']['location'],
                                    "Rule":
                                    ruleId,
                                    "full_message":
                                    "Tweet matched with RULE: " + ruleId
                                }
                                autotweet = json.dumps(tweetdata)
                                fichero.write(autotweet + '\n')
                                self.wcloud.write(tweet['full_text'] + '\n')
                                os.chmod(
                                    "/var/log/modosint/analyzer-twitter/wcloudRule"
                                    + ruleId + ".txt", 0o777)
                                os.chmod(
                                    "/var/log/modosint/analyzer-twitter/graylog.txt",
                                    0o777)
                    else:
                        break

    # Search all tweets from the timeline of @user
    def searchUserTweets(self, user, ruleId, fullstring):
        fichero = open("/var/log/modosint/analyzer-twitter/graylog.txt",
                       "+a",
                       encoding='utf8')
        with io.open("/var/log/modosint/analyzer-twitter/cache.txt",
                     'a+') as f:
            os.chmod("/var/log/modosint/analyzer-twitter/cache.txt", 0o777)
            tweets = self.twarc.timeline(screen_name=user)  # keyword arg instead of positional Nones
            repeated = False
            t_end = time.time() + 30
            for tweet in tweets:
                if time.time() < t_end:
                    for text in fullstring:
                        if text in tweet['full_text']:
                            f.seek(0)  #read temporary file (cache)
                            content = f.readlines()
                            content = [
                                x.strip('\n').strip('u') for x in content
                            ]
                            repeated = tweet['id_str'] in content
                            if not repeated:
                                f.seek(0, 2)  #write temporary file (cache)
                                f.write(tweet['id_str'])
                                f.write('\n')
                                tweetdata = {
                                    "CreatedTime":
                                    tweet['created_at'],
                                    "short_message":
                                    tweet['full_text'],
                                    "Author":
                                    tweet['user']['screen_name'],
                                    "Retweets":
                                    tweet['retweet_count'],
                                    "Likes":
                                    tweet['favorite_count'],
                                    "Location":
                                    tweet['user']['location'],
                                    "Rule":
                                    ruleId,
                                    "full_message":
                                    "Tweet matched with RULE: " + ruleId
                                }
                                autotweet = json.dumps(tweetdata)
                                fichero.write(autotweet + '\n')
                                self.wcloud.write(tweet['full_text'] + '\n')
                                os.chmod(
                                    "/var/log/modosint/analyzer-twitter/wcloudRule"
                                    + ruleId + ".txt", 0o777)
                                os.chmod(
                                    "/var/log/modosint/analyzer-twitter/graylog.txt",
                                    0o777)
                else:
                    break

    def create_wordcloud(self, text, ruleId):
        mask = np.array(Image.open(path.join(self.currdir,
                                             "twitter_mask.png")))
        # create wordcloud object
        wc = WordCloud(background_color="white",
                       max_words=200,
                       mask=mask,
                       stopwords=self.stop_words)
        try:
            # generate wordcloud
            wc.generate(text)
            # save wordcloud
            wc.to_file(
                path.join(self.currdir + "/WordCloud/Twitter/",
                          "wcTwitterRule" + ruleId + ".png"))
            os.chmod(
                path.join(self.currdir + "/WordCloud/Twitter/",
                          "wcTwitterRule" + ruleId + ".png"), 0o777)
        except ValueError:
            pass  # wordcloud generation can fail on empty text; ignore and continue

    # custom functionality
    def run(self):
        self.logger.info("working...")
        OSINTRules = self.rules
        for element in OSINTRules:
            ruleId = element.get('metadata', {}).get('id', False)
            self.wcloud = open(
                "/var/log/modosint/analyzer-twitter/wcloudRule" + ruleId +
                ".txt", "a+")
            checkUsername = element.get('_username', False)
            checkString = element.get('_string', False)
            if checkUsername:
                user = (''.join(element['_username']))
            if checkString:
                string = (','.join(element['_string']))
                fullstring = element['_string']
                checkLanguage = element.get('_language', False)
                if checkLanguage:
                    language = (''.join(element['_language']))
                    self.searchDifLanguage(string, language, ruleId)
                else:
                    self.searchTweetOrHashtag(string, ruleId)
                if checkUsername:
                    self.searchUserTweets(user, ruleId, fullstring)
        if not os.path.exists(self.currdir + "/WordCloud"):
            os.makedirs(self.currdir + "/WordCloud/")
            os.chmod(self.currdir + "/WordCloud/", 0o777)
        if not os.path.exists(self.currdir + "/WordCloud/Twitter"):
            os.makedirs(self.currdir + "/WordCloud/Twitter/")
            os.chmod(self.currdir + "/WordCloud/Twitter/", 0o777)
        for element in OSINTRules:
            ruleId = element.get('metadata', {}).get('id', False)
            with open(
                    "/var/log/modosint/analyzer-twitter/wcloudRule" + ruleId +
                    ".txt", "r") as wcloud_file:
                file_content = wcloud_file.readlines()
            self.create_wordcloud(str(file_content), ruleId)
        self.createPlotMentions()
        self.createPlotHashtag()
        self.alertLogger.info("Twitter Analyzer Job Finished succesfully.")

    def exportReferenceHashtag(self, mensaje):
        lista = re.findall(r'#\w+', mensaje)
        return lista if lista != [] else np.NaN

    def exportReferenceMentions(self, mensaje):
        lista = re.findall(r'@\w+', mensaje)
        return lista if lista != [] else np.NaN

    def createPlotMentions(self):
        with io.open('/var/log/modosint/analyzer-twitter/graylog.txt',
                     'r') as f:
            dataMentions = f.readlines()
            data_json = json.dumps(
                list(map(lambda entry: eval(entry[:-1]), dataMentions)))
            data_twitter = pd.read_json(data_json)
            referenceMentions = data_twitter.short_message.map(
                self.exportReferenceMentions)
            referenceMentions.dropna(inplace=True)
            referenceMentions.head()
            referenceMentions = list(referenceMentions)
            referenceMentions_list = list(itertools.chain(*referenceMentions))
            count_referenceMentions = pd.Series(
                referenceMentions_list).value_counts()
            fig = plt.figure(figsize=(12, 8))
            sns.barplot(y=count_referenceMentions.iloc[:20].index,
                        x=count_referenceMentions.iloc[:20].values)
            fig.savefig(path.join(self.currdir, 'mentionsPlot.png'))
            os.chmod(path.join(self.currdir, 'mentionsPlot.png'), 0o777)

    def createPlotHashtag(self):
        with io.open('/var/log/modosint/analyzer-twitter/graylog.txt',
                     'r') as f:
            dataHashtag = f.readlines()
            data_json = json.dumps(
                list(map(lambda entry: eval(entry[:-1]), dataHashtag)))
            data_twitter = pd.read_json(data_json)
            referenceHash = data_twitter.short_message.map(
                self.exportReferenceHashtag)
            referenceHash.dropna(inplace=True)
            referenceHash.head()
            referenceHash = list(referenceHash)
            referenceHash_list = list(itertools.chain(*referenceHash))
            count_referenceHash = pd.Series(referenceHash_list).value_counts()
            fig = plt.figure(figsize=(12, 8))
            sns.barplot(y=count_referenceHash.iloc[:20].index,
                        x=count_referenceHash.iloc[:20].values)
            fig.savefig(path.join(self.currdir, 'mentionsHashtag.png'))
            os.chmod(path.join(self.currdir, 'mentionsHashtag.png'), 0o777)
Example #11
class TwitterHarvester(BaseHarvester):
    def __init__(self,
                 working_path,
                 stream_restart_interval_secs=30 * 60,
                 mq_config=None,
                 debug=False,
                 connection_errors=5,
                 http_errors=5,
                 debug_warcprox=False,
                 tries=3):
        BaseHarvester.__init__(
            self,
            working_path,
            mq_config=mq_config,
            stream_restart_interval_secs=stream_restart_interval_secs,
            debug=debug,
            debug_warcprox=debug_warcprox,
            tries=tries)
        self.twarc = None
        self.connection_errors = connection_errors
        self.http_errors = http_errors
        self.extract_media = False
        self.extract_web_resources = False
        self.extract_user_profile_images = False

    def harvest_seeds(self):
        # Create a twarc
        self._create_twarc()

        # Get harvest extract options.
        self.extract_media = self.message.get("options",
                                              {}).get("media", False)
        self.extract_web_resources = self.message.get("options", {}).get(
            "web_resources", False)
        self.extract_user_profile_images = self.message.get("options", {}).get(
            "user_images", False)

        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.search()
        elif harvest_type == "twitter_filter":
            self.filter()
        elif harvest_type == "twitter_sample":
            self.sample()
        elif harvest_type == "twitter_user_timeline":
            self.user_timeline()
        else:
            raise KeyError

    def _create_twarc(self):
        self.twarc = Twarc(self.message["credentials"]["consumer_key"],
                           self.message["credentials"]["consumer_secret"],
                           self.message["credentials"]["access_token"],
                           self.message["credentials"]["access_token_secret"],
                           http_errors=self.http_errors,
                           connection_errors=self.connection_errors)

    def search(self):
        assert len(self.message.get("seeds", [])) == 1

        incremental = self.message.get("options", {}).get("incremental", False)
        query = self.message["seeds"][0]["token"]

        since_id = self.state_store.get_state(
            __name__, u"{}.since_id".format(query)) if incremental else None

        self._harvest_tweets(self.twarc.search(query, since_id=since_id))

    def filter(self):
        assert len(self.message.get("seeds", [])) == 1

        track = self.message["seeds"][0]["token"].get("track")
        follow = self.message["seeds"][0]["token"].get("follow")
        locations = self.message["seeds"][0]["token"].get("locations")

        self._harvest_tweets(
            self.twarc.filter(track=track,
                              follow=follow,
                              locations=locations,
                              event=self.stop_harvest_seeds_event))

    def sample(self):
        self._harvest_tweets(self.twarc.sample(self.stop_harvest_seeds_event))

    def user_timeline(self):
        incremental = self.message.get("options", {}).get("incremental", False)

        for seed in self.message.get("seeds", []):
            seed_id = seed["id"]
            screen_name = seed.get("token")
            user_id = seed.get("uid")
            log.debug(
                "Processing seed (%s) with screen name %s and user id %s",
                seed_id, screen_name, user_id)
            assert screen_name or user_id

            # If there is not a user_id, look it up.
            if screen_name and not user_id:
                user_id = self._lookup_user_id(screen_name)
                if user_id:
                    # Report back if nsid found
                    self.result.uids[seed_id] = user_id
                else:
                    msg = "User id not found for user {} because account is not found or suspended".format(
                        screen_name)
                    log.exception(msg)
                    self.result.warnings.append(
                        Msg(CODE_TOKEN_NOT_FOUND, msg, seed_id=seed_id))
            # Otherwise, get the current screen_name
            else:
                new_screen_name = self._lookup_screen_name(user_id)
                # if can't find the screen_name, ignore get timeline
                if not new_screen_name:
                    msg = "Screen name not found for user id {} because account is not found or suspended".format(
                        user_id)
                    log.exception(msg)
                    self.result.warnings.append(
                        Msg(CODE_TOKEN_NOT_FOUND, msg, seed_id=seed_id))
                    # reset the user_id, ignore the get timeline
                    user_id = None
                if new_screen_name and new_screen_name != screen_name:
                    self.result.token_updates[seed_id] = new_screen_name
                    screen_name = new_screen_name

            if user_id:
                try:
                    # Get since_id from state_store
                    since_id = self.state_store.get_state(
                        __name__, "timeline.{}.since_id".format(
                            user_id)) if incremental else None

                    self._harvest_tweets(
                        self.twarc.timeline(user_id=user_id,
                                            since_id=since_id))

                except HTTPError as e:
                    if e.response.status_code == 401:
                        account = "user {} (User ID: {})".format(
                            screen_name, user_id
                        ) if screen_name else "user ID: {}".format(user_id)
                        msg = "Unauthorized for {} because account is suspended or protected".format(
                            account)
                        log.exception(msg)
                        self.result.warnings.append(
                            Msg(CODE_TOKEN_UNAUTHORIZED, msg, seed_id=seed_id))
                    else:
                        raise e

    def _lookup_screen_name(self, user_id):
        """
        Lookup a screen name given a user id.
        """
        try:
            users = list(self.twarc.user_lookup(user_ids=(user_id, )))
            assert len(users) in (0, 1)
            if users:
                return users[0]["screen_name"]
        except HTTPError as e:
            if e.response.status_code != 404:
                raise e
        return None

    def _lookup_user_id(self, screen_name):
        """
        Lookup a user id given a screen name.
        """
        try:
            users = list(self.twarc.user_lookup(screen_names=(screen_name, )))
            assert len(users) in (0, 1)
            if users:
                return users[0]["id_str"]
        except HTTPError as e:
            if e.response.status_code != 404:
                raise e
        return None

    def _harvest_tweets(self, tweets):
        # max_tweet_id = None
        for count, tweet in enumerate(tweets):
            if not count % 100:
                log.debug("Harvested %s tweets", count)
            self.result.harvest_counter["tweets"] += 1
            if self.stop_harvest_seeds_event.is_set():
                log.debug("Stopping since stop event set.")
                break

    def _process_entities(self, entities):
        if self.extract_web_resources:
            for url in entities.get("urls", []):
                # Exclude links for tweets
                if url["expanded_url"] and not status_re.match(
                        url["expanded_url"]):
                    self.result.urls.append(url["expanded_url"])
        if self.extract_media:
            for media in entities.get("media", []):
                if media["media_url"]:
                    self.result.urls.append(media["media_url"])

    def process_warc(self, warc_filepath):
        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.process_search_warc(warc_filepath)
        elif harvest_type == "twitter_filter":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_sample":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_user_timeline":
            self.process_user_timeline_warc(warc_filepath)
        else:
            raise KeyError

    def process_search_warc(self, warc_filepath):
        incremental = self.message.get("options", {}).get("incremental", False)
        query = self.message["seeds"][0]["token"]

        since_id = self.state_store.get_state(
            __name__, u"{}.since_id".format(query)) if incremental else None

        max_tweet_id = self._process_tweets(TwitterRestWarcIter(warc_filepath))

        # Update state store
        if incremental and max_tweet_id and max_tweet_id > (since_id or 0):
            self.state_store.set_state(__name__, u"{}.since_id".format(query),
                                       max_tweet_id)

    def process_user_timeline_warc(self, warc_filepath):
        incremental = self.message.get("options", {}).get("incremental", False)

        for count, status in enumerate(TwitterRestWarcIter(warc_filepath)):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet:
                user_id = tweet["user"]["id_str"]
                if incremental:
                    # Update state
                    key = "timeline.{}.since_id".format(user_id)
                    self.state_store.set_state(
                        __name__, key,
                        max(self.state_store.get_state(__name__, key) or 0,
                            tweet.get("id")))
                self._process_tweet(tweet)

    def _process_tweets(self, warc_iter):
        max_tweet_id = None
        for count, status in enumerate(warc_iter):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet:
                max_tweet_id = max(max_tweet_id, tweet.get("id"))
                self._process_tweet(tweet)
        return max_tweet_id

    def _process_tweet(self, tweet):
        self.result.increment_stats("tweets")
        # For more info, see https://dev.twitter.com/overview/api/entities-in-twitter-objects
        statuses = [tweet]
        if "retweeted_status" in tweet:
            statuses.append(tweet["retweeted_status"])
        elif "quoted_status" in tweet:
            statuses.append(tweet["quoted_status"])
        for status in statuses:
            self._process_entities(status.get("entities", {}))
            self._process_entities(status.get("extended_entities", {}))
        if self.extract_user_profile_images:
            self.result.urls.append(tweet["user"]["profile_image_url"])
            self.result.urls.append(
                tweet["user"]["profile_background_image_url"])
            if "profile_banner_url" in tweet["user"]:
                self.result.urls.append(tweet["user"]["profile_banner_url"])
Example #12
import json

from models import *
from textblob import TextBlob
from twarc import Twarc

init_db()

t = Twarc('insert Twitter API consumer key',
          'insert Twitter API consumer secret',
          'insert Twitter API access token',
          'insert Twitter API token secret')

pos_sum = 0
count = 0

tweets = list()

for tweet in t.search("to:realdonaldtrump"):
    print(tweet['id'])
    print(tweet['in_reply_to_status_id'])
    print(tweet['created_at'])
    print(tweet['text'].encode('utf-8'))
    blob = TextBlob(tweet['text'])
    print(blob.sentiment.polarity)
    print(pos_sum)
    tweet['sentiment'] = blob.sentiment.polarity
    tweets.append(tweet)
    if '#yayfortrump' in tweet['text'] and '#nayfortrump' not in tweet['text']:
        vote = 1
    elif '#yayfortrump' not in tweet['text'] and '#nayfortrump' in tweet[
            'text']:
        vote = -1
    elif blob.sentiment.polarity > 0:
Example #13
from twarc import Twarc

client_key = 'client_key'
client_secret = 'client_secret'
access_token = 'access_token'
access_token_secret = 'access_token_secret'

t = Twarc(client_key, client_secret, access_token, access_token_secret)
for tweet in t.search("resigncameron"):
    print(tweet["text"])
Example #14
class TwitterRelationships():
    # Cut-down code to get twitter relationships for a set of hashtags.
    # Adapted from https://labsblog.f-secure.com/2018/02/16/searching-twitter-with-twarc/

    def __init__(self, secretsfile='/Users/sara/twittersecrets.txt'):

        with open(secretsfile, 'r') as fsecret:
            secrets = fsecret.readline()
        access_token, access_token_secret, consumer_key, consumer_secret = \
            [x.strip() for x in secrets.split(',')]

        self.twarc = Twarc(consumer_key, consumer_secret, access_token,
                           access_token_secret)

    # Helper functions for saving csv and formatted txt files
    def write_data(self, data, filename, filetype='txt'):
        with io.open(filename, "w", encoding="utf-8") as handle:
            if filetype == 'txt':
                for item, count in data.most_common():
                    handle.write(str(count) + "\t" + item + "\n")

            else:  #write to csv
                handle.write(u"Source,Target,Weight\n")
                for source, targets in sorted(data.items()):
                    for target, count in sorted(targets.items()):
                        if source != target and source is not None and target is not None:
                            handle.write(source + u"," + target + u"," +
                                         str(count) + u"\n")
        return

    # Returns the screen_name of the user retweeted, or None
    def retweeted_user(self, status):
        if "retweeted_status" in status:
            orig_tweet = status["retweeted_status"]
            if "user" in orig_tweet and orig_tweet["user"] is not None:
                user = orig_tweet["user"]
                if "screen_name" in user and user["screen_name"] is not None:
                    return user["screen_name"]
        return

    # Returns a list of screen_names that the user interacted with in this Tweet
    def get_interactions(self, status):
        interactions = []
        if "in_reply_to_screen_name" in status:
            replied_to = status["in_reply_to_screen_name"]
            if replied_to is not None and replied_to not in interactions:
                interactions.append(replied_to)

        if "retweeted_status" in status:
            orig_tweet = status["retweeted_status"]
            if "user" in orig_tweet and orig_tweet["user"] is not None:
                user = orig_tweet["user"]
                if "screen_name" in user and user["screen_name"] is not None:
                    if user["screen_name"] not in interactions:
                        interactions.append(user["screen_name"])

        if "quoted_status" in status:
            orig_tweet = status["quoted_status"]
            if "user" in orig_tweet and orig_tweet["user"] is not None:
                user = orig_tweet["user"]
                if "screen_name" in user and user["screen_name"] is not None:
                    if user["screen_name"] not in interactions:
                        interactions.append(user["screen_name"])

        if "entities" in status:
            entities = status["entities"]
            if "user_mentions" in entities:
                for item in entities["user_mentions"]:
                    if item is not None and "screen_name" in item:
                        mention = item['screen_name']
                        if mention is not None and mention not in interactions:
                            interactions.append(mention)
        return interactions

    # Returns a list of hashtags found in the tweet
    def get_hashtags(self, status):
        hashtags = []
        if "entities" in status:
            entities = status["entities"]
            if "hashtags" in entities:
                for item in entities["hashtags"]:
                    if item is not None and "text" in item:
                        hashtag = item['text']
                        if hashtag is not None and hashtag not in hashtags:
                            hashtags.append(hashtag)
        return hashtags

    # Returns a list of URLs found in the Tweet
    def get_urls(self, status):
        urls = []
        if "entities" in status:
            entities = status["entities"]
            if "urls" in entities:
                for item in entities["urls"]:
                    if item is not None and "expanded_url" in item:
                        url = item['expanded_url']
                        if url is not None and url not in urls:
                            urls.append(url)
        return urls

    def get_image_urls(self, status):
        # Returns the URLs to any images found in the Tweet
        urls = []
        if "entities" in status:
            entities = status["entities"]
            if "media" in entities:
                for item in entities["media"]:
                    if item is not None:
                        if "media_url" in item:
                            murl = item["media_url"]
                            if murl not in urls:
                                urls.append(murl)
        return urls

    def fetch_images(self):
        # Iterate through image URLs, fetching each image if we haven't already
        pictures_dir = os.path.join(self.save_dir,
                                    self.dataname + '_' + "images")
        if not os.path.exists(pictures_dir):
            print("Creating directory: " + pictures_dir)
            os.makedirs(pictures_dir)
        for url in self.all_image_urls:
            m = re.search("^http:\/\/pbs\.twimg\.com\/media\/(.+)$", url)
            if m is not None:
                filename = m.group(1)
                print("Getting picture from: " + url)
                save_path = os.path.join(pictures_dir, filename)
                if not os.path.exists(save_path):
                    response = requests.get(url, stream=True)
                    with open(save_path, 'wb') as out_file:
                        shutil.copyfileobj(response.raw, out_file)
                    del response

        return

    def writedf(self, dataset, name, columns):
        filename = os.path.join(self.save_dir, self.dataname + '_' + name)
        with io.open(filename, "w", encoding="utf-8") as handle:
            handle.write('\t'.join(columns) + u"\n")
            for row in dataset:
                handle.write('\t'.join(row) + u"\n")
        return

    def save_datasets(self, fetch_images=True):

        csv_outputs = {
            "user_user_graph.csv": self.user_user_graph,
            "user_hashtag_graph.csv": self.user_hashtag_graph,
            "hashtag_hashtag_graph.csv": self.hashtag_hashtag_graph
        }
        for name, dataset in csv_outputs.items():
            filename = os.path.join(self.save_dir, self.dataname + '_' + name)
            self.write_data(dataset, filename, 'csv')

        text_outputs = {
            "hashtags.txt": self.hashtag_frequency_dist,
            "influencers.txt": self.influencer_frequency_dist,
            "mentioned.txt": self.mentioned_frequency_dist,
            "urls.txt": self.url_frequency_dist
        }
        for name, dataset in text_outputs.items():
            filename = os.path.join(self.save_dir, self.dataname + '_' + name)
            self.write_data(dataset, filename, 'txt')

        self.writedf(self.url_refs, "url_refs.csv", ['url', 'tweeturl'])
        self.writedf(self.image_refs, "image_refs.csv", ['url', 'tweeturl'])
        self.writedf(self.tweets, "tweets.csv",
                     ['url', 'screen_name', 'id', 'created_at', 'text'])

        if fetch_images:
            self.fetch_images()

        return

    def make_directories(self, target, rootdir='../data/twitter'):
        # Create a separate save directory for each search query
        # Since search queries can be a whole sentence, we'll check the length
        # and simply number it if the query is overly long

        self.dataname = datetime.now().strftime(
            "%Y%m%d%H%M%S") + '_' + target.replace(" ", "_")

        self.save_dir = rootdir
        if not os.path.exists(rootdir):
            os.makedirs(rootdir)
        if len(target) < 30:
            self.save_dir += "/" + self.dataname
        else:
            self.save_dir += "/target_" + str(count + 1)
        if not os.path.exists(self.save_dir):
            print("Creating directory: " + self.save_dir)
            os.makedirs(self.save_dir)

        return

    def get_target_data(self, target):

        # Variables for capturing stuff
        self.tweets_captured = 0
        self.influencer_frequency_dist = Counter()
        self.mentioned_frequency_dist = Counter()
        self.hashtag_frequency_dist = Counter()
        self.url_frequency_dist = Counter()
        self.user_user_graph = {}
        self.user_hashtag_graph = {}
        self.hashtag_hashtag_graph = {}
        self.all_image_urls = []
        self.tweets = []
        self.tweet_count = 0
        self.url_refs = []
        self.image_refs = []

        # Start the search
        for status in self.twarc.search(target):

            # Output some status as we go, so we know something is happening
            self.tweet_count += 1
            sys.stdout.write("\rCollected " + str(self.tweet_count) + " tweets.")
            sys.stdout.flush()

            screen_name = None
            if "user" in status:
                if "screen_name" in status["user"]:
                    screen_name = status["user"]["screen_name"]

            retweeted = self.retweeted_user(status)
            if retweeted is not None:
                self.influencer_frequency_dist[retweeted] += 1
            else:
                self.influencer_frequency_dist[screen_name] += 1

            # Tweet text can be in either "text" or "full_text" field...
            text = None
            if "full_text" in status:
                text = status["full_text"]
            elif "text" in status:
                text = status["text"]

            id_str = None
            if "id_str" in status:
                id_str = status["id_str"]

            # Assemble the URL to the tweet we received...
            tweet_url = None
            if id_str is not None and screen_name is not None:
                tweet_url = "https://twitter.com/" + screen_name + "/status/" + id_str
            created_at = None
            if "created_at" in status:
                created_at = status["created_at"]
            # Capture everything we pulled out of this status
            self.tweets.append([tweet_url, screen_name, id_str, created_at,
                                text])

            # Record mapping graph between users
            interactions = self.get_interactions(status)
            if interactions is not None:
                for user in interactions:
                    self.mentioned_frequency_dist[user] += 1
                    if screen_name not in self.user_user_graph:
                        self.user_user_graph[screen_name] = {}
                    if user not in self.user_user_graph[screen_name]:
                        self.user_user_graph[screen_name][user] = 1
                    else:
                        self.user_user_graph[screen_name][user] += 1

            # Record mapping graph between users and hashtags
            hashtags = self.get_hashtags(status)
            if hashtags is not None:
                # When a tweet contains more than one hashtag, pair them up
                # (in sorted order) to build the hashtag-hashtag
                # co-occurrence graph; e.g. ["a", "b", "c"] yields
                # ("a", "b"), ("a", "c"), ("b", "c")
                if len(hashtags) > 1:
                    for item1, item2 in combinations(sorted(hashtags), 2):
                        if item1 not in self.hashtag_hashtag_graph:
                            self.hashtag_hashtag_graph[item1] = {}
                        if item2 not in self.hashtag_hashtag_graph[item1]:
                            self.hashtag_hashtag_graph[item1][item2] = 1
                        else:
                            self.hashtag_hashtag_graph[item1][item2] += 1
                # Count every hashtag, even when a tweet contains only one
                for hashtag in hashtags:
                    self.hashtag_frequency_dist[hashtag] += 1
                    if screen_name not in self.user_hashtag_graph:
                        self.user_hashtag_graph[screen_name] = {}
                    if hashtag not in self.user_hashtag_graph[screen_name]:
                        self.user_hashtag_graph[screen_name][hashtag] = 1
                    else:
                        self.user_hashtag_graph[screen_name][hashtag] += 1

            urls = self.get_urls(status)
            if urls is not None:
                for url in urls:
                    self.url_refs += [[url, tweet_url]]
                    self.url_frequency_dist[url] += 1

            image_urls = self.get_image_urls(status)
            if image_urls is not None:
                for url in image_urls:
                    self.image_refs += [[url, tweet_url]]
                    if url not in self.all_image_urls:
                        self.all_image_urls.append(url)

        self.save_datasets(fetch_images=True)

        return


class TwitterHarvester(BaseHarvester):
    def __init__(self, process_interval_secs=1200, mq_config=None, debug=False):
        BaseHarvester.__init__(self, mq_config=mq_config, process_interval_secs=process_interval_secs, debug=debug)
        self.twarc = None

    def harvest_seeds(self):
        # Create a twarc
        self._create_twarc()

        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.search()
        elif harvest_type == "twitter_filter":
            self.filter()
        else:
            raise KeyError(harvest_type)

    def _create_twarc(self):
        self.twarc = Twarc(self.message["credentials"]["consumer_key"],
                           self.message["credentials"]["consumer_secret"],
                           self.message["credentials"]["access_token"],
                           self.message["credentials"]["access_token_secret"])

    def search(self):
        incremental = self.message.get("options", {}).get("incremental", False)

        for seed in self.message.get("seeds", []):
            query = seed.get("token")
            # Get since_id from state_store
            since_id = self.state_store.get_state(__name__, "{}.since_id".format(query)) if incremental else None

            max_tweet_id = self._process_tweets(self.twarc.search(query, since_id=since_id))
            log.debug("Searching on %s since %s returned %s tweets.", query,
                      since_id, self.harvest_result.summary.get("tweet"))

            # Update state store
            if incremental and max_tweet_id:
                self.state_store.set_state(__name__, "{}.since_id".format(query), max_tweet_id)

    def filter(self):
        assert len(self.message.get("seeds", [])) == 1

        track = self.message["seeds"][0]["token"]

        self._process_tweets(self.twarc.stream(track))

    def _process_tweets(self, tweets):
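        # Walk the tweet iterator, tallying each tweet in the shared harvest
        # result and tracking the largest id seen so the caller can persist
        # it as the next since_id.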
        max_tweet_id = None
        for count, tweet in enumerate(tweets):
            if not count % 100:
                log.debug("Processed %s tweets", count)
            if self.stop_event.is_set():
                log.debug("Stopping since stop event set.")
                break
            if "text" in tweet:
                with self.harvest_result_lock:
                    # max() would fail comparing None on the first tweet
                    max_tweet_id = max(max_tweet_id or 0, tweet.get("id"))
                    self.harvest_result.increment_summary("tweet")
                    if "urls" in tweet["entities"]:
                        for url in tweet["entities"]["urls"]:
                            self.harvest_result.urls.append(url["expanded_url"])
                    if "media" in tweet["entities"]:
                        for media in tweet["entities"]["media"]:
                            self.harvest_result.urls.append(media["media_url"])
        return max_tweet_id
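
# A minimal standalone sketch of the incremental pattern used above: persist
# the largest tweet id seen, then pass it back as since_id on the next run.
# Assumes twarc credentials are already configured (e.g. via ~/.twarc); the
# state-file name below is illustrative, not part of the original code.
import json
import os

from twarc import Twarc

STATE_FILE = "since_id_state.json"


def load_since_id(query):
    # Return the stored since_id for this query, if any
    if os.path.exists(STATE_FILE):
        with open(STATE_FILE) as f:
            return json.load(f).get(query)
    return None


def save_since_id(query, since_id):
    # Merge this query's since_id into the state file
    state = {}
    if os.path.exists(STATE_FILE):
        with open(STATE_FILE) as f:
            state = json.load(f)
    state[query] = since_id
    with open(STATE_FILE, "w") as f:
        json.dump(state, f)


t = Twarc()
query = "obama"
max_id_seen = None
for tweet in t.search(query, since_id=load_since_id(query)):
    max_id_seen = max(max_id_seen or 0, tweet["id"])
if max_id_seen:
    save_since_id(query, max_id_seen)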
Example #16
'''
NAME THE TWEET DOCUMENTS THAT YOU
WOULD LIKE THE TWEETS TO BE DUMPED IN
'''

file_dir = 'tweet_files'
if not os.path.isdir(file_dir):
    os.mkdir(file_dir)
'''
HERE IS WHERE THE TWITTER SCRAPING STARTS. PLEASE DO NOT TOUCH
ANYTHING BELOW. IF YOU WANT TO CHANGE THE TWARC FUNCTION, YOU MAY DO SO AS
LONG AS YOU MAKE SURE THE CORRESPONDING PARAMETERS ARE CHANGED.
'''
t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret)

for topic in topic_list:

    # Remove spaces and hashtags to name the output file
    temp_str = topic.replace('#', '')
    temp_str = temp_str.replace(' ', '_')
    file_name = temp_str + '.txt'
    file_write = os.path.join(file_dir, file_name)
    print('Started scraping tweets for the topic {0}'.format(topic))

    # Write matching tweets to the output file
    with open(file_write, 'w') as writefile:
        for tweet in t.search(topic, lang=language):
            line = tweet['full_text']

            # Write the tweet text to the file, separated by newlines
            writefile.write(line + '\n')
        print('Just finished scraping tweets for the topic {0}'.format(topic))
Example #17
from twarc import Twarc
import json

# Placeholders: substitute your own application credentials here
CONSUMER_KEY = 'YOUR_CONSUMER_KEY'
CONSUMER_SECRET = 'YOUR_CONSUMER_SECRET'
ACCESS_TOKEN = 'YOUR_ACCESS_TOKEN'
ACCESS_TOKEN_SECRET = 'YOUR_ACCESS_TOKEN_SECRET'

t = Twarc(CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
count = 1
# Use a context manager so the file is closed even if the search fails
with open('tweetDay5.json', 'w') as w:
    for tweet in t.search("google fi"):
        w.write(json.dumps(tweet))
        w.write('\n')
        print(count)
        count += 1
        if count > 1000:
            break
Example #18
auth.set_access_token(keys.access_token, keys.access_token_secret)

api = tweepy.API(auth)

arguments = sys.argv  # Get parameters from command line

if len(arguments) > 1:
    # Join the hashtag arguments (skipping the script name) with OR
    hashtags = ' OR '.join(arguments[1:])
else:
    # If no arguments don't run
    print("No arguments passed")
    sys.exit(0)

# Search Twitter for tweets containing the hashtags
tweets = twarc.search(hashtags)

for tweet in tweets:
    user = tweet['user']

    # Check if tweet (not retweet) and determine if the user is an influencer
    if utils.not_retweet(tweet) and utils.is_influencer(tweet):

        # Check if you follow the influencer and if you've already sent a follow request
        if not user['following'] and not user['follow_request_sent']:
            # Follow
            print('following: ' + user['name'])
            api.create_friendship(user['id'])

        # Check if they have more retweets than likes, like the tweet if not liked yet and viceversa
        if tweet['retweet_count'] >= tweet['favorite_count'] and tweet[
Example #19
            print("Creating directory: " + save_dir)
            os.makedirs(save_dir)
        # Variables for capturing stuff
        tweets_captured = 0
        influencer_frequency_dist = Counter()
        mentioned_frequency_dist = Counter()
        hashtag_frequency_dist = Counter()
        url_frequency_dist = Counter()
        user_user_graph = {}
        user_hashtag_graph = {}
        hashtag_hashtag_graph = {}
        all_image_urls = []
        tweets = {}
        tweet_count = 0
        # Start the search
        for status in twarc.search(target):
            # Output some status as we go, so we know something is happening
            sys.stdout.write("\r")
            sys.stdout.flush()
            sys.stdout.write("Collected " + str(tweet_count) + " tweets.")
            sys.stdout.flush()
            tweet_count += 1

            screen_name = None
            if "user" in status:
                if "screen_name" in status["user"]:
                    screen_name = status["user"]["screen_name"]

            retweeted = retweeted_user(status)
            if retweeted is not None:
                influencer_frequency_dist[retweeted] += 1


class TwitterHarvester(BaseHarvester):
    def __init__(self, working_path, stream_restart_interval_secs=30 * 60, mq_config=None, debug=False,
                 connection_errors=5, http_errors=5, debug_warcprox=False, tries=3):
        BaseHarvester.__init__(self, working_path, mq_config=mq_config,
                               stream_restart_interval_secs=stream_restart_interval_secs,
                               debug=debug, debug_warcprox=debug_warcprox, tries=tries)
        self.twarc = None
        self.connection_errors = connection_errors
        self.http_errors = http_errors
        self.extract_media = False
        self.extract_web_resources = False
        self.extract_user_profile_images = False

    def harvest_seeds(self):
        # Create a twarc
        self._create_twarc()

        # Get harvest extract options.
        self.extract_media = self.message.get("options", {}).get("media", False)
        self.extract_web_resources = self.message.get("options", {}).get("web_resources", False)
        self.extract_user_profile_images = self.message.get("options", {}).get("user_images", False)

        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.search()
        elif harvest_type == "twitter_filter":
            self.filter()
        elif harvest_type == "twitter_sample":
            self.sample()
        elif harvest_type == "twitter_user_timeline":
            self.user_timeline()
        else:
            raise KeyError(harvest_type)

    def _create_twarc(self):
        self.twarc = Twarc(self.message["credentials"]["consumer_key"],
                           self.message["credentials"]["consumer_secret"],
                           self.message["credentials"]["access_token"],
                           self.message["credentials"]["access_token_secret"],
                           http_errors=self.http_errors,
                           connection_errors=self.connection_errors)

    def search(self):
        assert len(self.message.get("seeds", [])) == 1

        incremental = self.message.get("options", {}).get("incremental", False)
        query = self.message["seeds"][0]["token"]

        since_id = self.state_store.get_state(__name__, u"{}.since_id".format(query)) if incremental else None

        self._harvest_tweets(self.twarc.search(query, since_id=since_id))

    def filter(self):
        assert len(self.message.get("seeds", [])) == 1

        track = self.message["seeds"][0]["token"].get("track")
        follow = self.message["seeds"][0]["token"].get("follow")
        locations = self.message["seeds"][0]["token"].get("locations")

        self._harvest_tweets(self.twarc.filter(track=track, follow=follow, locations=locations))

    def sample(self):
        self._harvest_tweets(self.twarc.sample())

    def user_timeline(self):
        incremental = self.message.get("options", {}).get("incremental", False)

        for seed in self.message.get("seeds", []):
            seed_id = seed["id"]
            screen_name = seed.get("token")
            user_id = seed.get("uid")
            log.debug("Processing seed (%s) with screen name %s and user id %s", seed_id, screen_name, user_id)
            assert screen_name or user_id

            # If there is not a user_id, look it up.
            if screen_name and not user_id:
                user_id = self._lookup_user_id(screen_name)
                if user_id:
                    # Report the discovered user id back to the caller
                    self.result.uids[seed_id] = user_id
                else:
                    msg = "User id not found for user {}".format(screen_name)
                    log.exception(msg)
                    self.result.warnings.append(Msg(CODE_TOKEN_NOT_FOUND, msg))
            # Otherwise, get the current screen_name
            else:
                new_screen_name = self._lookup_screen_name(user_id)
                if new_screen_name != screen_name:
                    self.result.token_updates[seed_id] = new_screen_name
                    screen_name = new_screen_name

            if user_id:
                try:
                    # Get since_id from state_store
                    since_id = self.state_store.get_state(__name__,
                                                          "timeline.{}.since_id".format(
                                                              user_id)) if incremental else None

                    self._harvest_tweets(self.twarc.timeline(user_id=user_id, since_id=since_id))

                except HTTPError as e:
                    if e.response.status_code == 401:
                        msg = "Unauthorized for user {} (User ID: {}) because account is suspended or private".format(screen_name, user_id)
                        log.exception(msg)
                        self.result.warnings.append(Msg(CODE_TOKEN_UNAUTHORIZED, msg))
                    else:
                        raise e

    def _lookup_screen_name(self, user_id):
        """
        Lookup a screen name given a user id.
        """
        users = list(self.twarc.user_lookup(user_ids=(user_id,)))
        assert len(users) in (0, 1)
        if users:
            return users[0]["screen_name"]
        return None

    def _lookup_user_id(self, screen_name):
        """
        Lookup a user id given a screen name.
        """
        users = list(self.twarc.user_lookup(screen_names=(screen_name,)))
        assert len(users) in (0, 1)
        if users:
            return users[0]["id_str"]
        return None

    def _harvest_tweets(self, tweets):
        for count, tweet in enumerate(tweets):
            if not count % 100:
                log.debug("Harvested %s tweets", count)
            self.result.harvest_counter["tweets"] += 1
            if self.stop_harvest_seeds_event.is_set():
                log.debug("Stopping since stop event set.")
                break

    def _process_entities(self, entities):
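        # Collect outbound URLs and media URLs from a tweet's entities,
        # honoring the harvest's extract options; tweet-permalink URLs are
        # skipped so the harvester doesn't recursively archive tweets.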
        if self.extract_web_resources:
            for url in entities.get("urls", []):
                # Exclude links for tweets
                if url["expanded_url"] and not status_re.match(url["expanded_url"]):
                    self.result.urls.append(url["expanded_url"])
        if self.extract_media:
            for media in entities.get("media", []):
                if media["media_url"]:
                    self.result.urls.append(media["media_url"])

    def process_warc(self, warc_filepath):
        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.process_search_warc(warc_filepath)
        elif harvest_type == "twitter_filter":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_sample":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_user_timeline":
            self.process_user_timeline_warc(warc_filepath)
        else:
            raise KeyError

    def process_search_warc(self, warc_filepath):
        incremental = self.message.get("options", {}).get("incremental", False)
        query = self.message["seeds"][0]["token"]

        since_id = self.state_store.get_state(__name__, u"{}.since_id".format(query)) if incremental else None

        max_tweet_id = self._process_tweets(TwitterRestWarcIter(warc_filepath))

        # Update state store
        if incremental and (max_tweet_id or 0) > (since_id or 0):
            self.state_store.set_state(__name__, u"{}.since_id".format(query), max_tweet_id)

    def process_user_timeline_warc(self, warc_filepath):
        incremental = self.message.get("options", {}).get("incremental", False)

        for count, status in enumerate(TwitterRestWarcIter(warc_filepath)):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet:
                user_id = tweet["user"]["id_str"]
                if incremental:
                    # Update state
                    key = "timeline.{}.since_id".format(user_id)
                    self.state_store.set_state(__name__, key,
                                               max(self.state_store.get_state(__name__, key) or 0,
                                                   tweet.get("id")))
                self._process_tweet(tweet)

    def _process_tweets(self, warc_iter):
        max_tweet_id = None
        for count, status in enumerate(warc_iter):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet:
                max_tweet_id = max(max_tweet_id or 0, tweet.get("id"))
                self._process_tweet(tweet)
        return max_tweet_id

    def _process_tweet(self, tweet):
        self.result.increment_stats("tweets")
        # For more info, see https://dev.twitter.com/overview/api/entities-in-twitter-objects
        statuses = [tweet]
        if "retweeted_status" in tweet:
            statuses.append(tweet["retweeted_status"])
        elif "quoted_status" in tweet:
            statuses.append(tweet["quoted_status"])
        for status in statuses:
            self._process_entities(status.get("entities", {}))
            self._process_entities(status.get("extended_entities", {}))
        if self.extract_user_profile_images:
            self.result.urls.append(tweet["user"]["profile_image_url"])
            self.result.urls.append(tweet["user"]["profile_background_image_url"])
            if "profile_banner_url" in tweet["user"]:
                self.result.urls.append(tweet["user"]["profile_banner_url"])


class TwitterHarvester(BaseHarvester):
    def __init__(self,
                 working_path,
                 stream_restart_interval_secs=30 * 60,
                 mq_config=None,
                 debug=False,
                 connection_errors=5,
                 http_errors=5,
                 debug_warcprox=False,
                 tries=3):
        BaseHarvester.__init__(
            self,
            working_path,
            mq_config=mq_config,
            stream_restart_interval_secs=stream_restart_interval_secs,
            debug=debug,
            debug_warcprox=debug_warcprox,
            tries=tries)
        self.twarc = None
        self.connection_errors = connection_errors
        self.http_errors = http_errors

    def harvest_seeds(self):
        # Create a twarc
        self._create_twarc()

        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.search()
        elif harvest_type == "twitter_filter":
            self.filter()
        elif harvest_type == "twitter_sample":
            self.sample()
        elif harvest_type == "twitter_user_timeline":
            self.user_timeline()
        else:
            raise KeyError(harvest_type)

    def _create_twarc(self):
        self.twarc = Twarc(self.message["credentials"]["consumer_key"],
                           self.message["credentials"]["consumer_secret"],
                           self.message["credentials"]["access_token"],
                           self.message["credentials"]["access_token_secret"],
                           http_errors=self.http_errors,
                           connection_errors=self.connection_errors,
                           tweet_mode="extended")

    def search(self):
        assert len(self.message.get("seeds", [])) == 1

        incremental = self.message.get("options", {}).get("incremental", False)

        since_id = self.state_store.get_state(
            __name__, u"{}.since_id".format(
                self._search_id())) if incremental else None

        query, geocode = self._search_parameters()
        self._harvest_tweets(
            self.twarc.search(query, geocode=geocode, since_id=since_id))

    def _search_parameters(self):
        if type(self.message["seeds"][0]["token"]) is dict:
            query = self.message["seeds"][0]["token"].get("query")
            geocode = self.message["seeds"][0]["token"].get("geocode")
        else:
            query = self.message["seeds"][0]["token"]
            geocode = None
        return query, geocode

    def _search_id(self):
        query, geocode = self._search_parameters()
        if query and not geocode:
            return query
        if geocode and not query:
            return geocode
        return ":".join([query, geocode])

    def filter(self):
        assert len(self.message.get("seeds", [])) == 1

        track = self.message["seeds"][0]["token"].get("track")
        follow = self.message["seeds"][0]["token"].get("follow")
        locations = self.message["seeds"][0]["token"].get("locations")
        language = self.message["seeds"][0]["token"].get("language")

        self._harvest_tweets(
            self.twarc.filter(track=track,
                              follow=follow,
                              locations=locations,
                              lang=language,
                              event=self.stop_harvest_seeds_event))

    def sample(self):
        self._harvest_tweets(self.twarc.sample(self.stop_harvest_seeds_event))

    def user_timeline(self):
        incremental = self.message.get("options", {}).get("incremental", False)

        for seed in self.message.get("seeds", []):
            seed_id = seed["id"]
            screen_name = seed.get("token")
            user_id = seed.get("uid")
            log.debug(
                "Processing seed (%s) with screen name %s and user id %s",
                seed_id, screen_name, user_id)
            assert screen_name or user_id

            # If there is not a user_id, look it up.
            if screen_name and not user_id:
                result, user = self._lookup_user(screen_name, "screen_name")
                if result == "OK":
                    user_id = user["id_str"]
                    self.result.uids[seed_id] = user_id
                else:
                    msg = u"User id not found for {} because account is {}".format(
                        screen_name, self._result_to_reason(result))
                    log.exception(msg)
                    self.result.warnings.append(
                        Msg("token_{}".format(result), msg, seed_id=seed_id))
            # Otherwise, get the current screen_name
            else:
                result, user = self._lookup_user(user_id, "user_id")
                if result == "OK":
                    new_screen_name = user["screen_name"]
                    if new_screen_name and new_screen_name != screen_name:
                        self.result.token_updates[seed_id] = new_screen_name
                else:
                    msg = u"User {} (User ID: {}) not found because account is {}".format(
                        screen_name, user_id, self._result_to_reason(result))
                    log.exception(msg)
                    self.result.warnings.append(
                        Msg("uid_{}".format(result), msg, seed_id=seed_id))
                    user_id = None

            if user_id:
                # Get since_id from state_store
                since_id = self.state_store.get_state(
                    __name__, "timeline.{}.since_id".format(
                        user_id)) if incremental else None

                self._harvest_tweets(
                    self.twarc.timeline(user_id=user_id, since_id=since_id))

    def _lookup_user(self, id, id_type):
        url = "https://api.twitter.com/1.1/users/show.json"
        params = {id_type: id}

        # USER_DELETED: 404 and {"errors": [{"code": 50, "message": "User not found."}]}
        # USER_PROTECTED: 200 and user object with "protected": true
        # USER_SUSPENDED: 403 and {"errors":[{"code":63,"message":"User has been suspended."}]}
        result = "OK"
        user = None
        try:
            resp = self.twarc.get(url, params=params, allow_404=True)
            user = resp.json()
            if user['protected']:
                result = "unauthorized"
        except requests.exceptions.HTTPError as e:
            try:
                resp_json = e.response.json()
            except json.decoder.JSONDecodeError:
                raise e
            if e.response.status_code == 404 and self._has_error_code(
                    resp_json, 50):
                result = "not_found"
            elif e.response.status_code == 403 and self._has_error_code(
                    resp_json, 63):
                result = "suspended"
            else:
                raise e
        return result, user

    @staticmethod
    def _has_error_code(resp, code):
        if isinstance(code, int):
            code = (code, )
        for error in resp['errors']:
            if error['code'] in code:
                return True
        return False

    @staticmethod
    def _result_to_reason(result):
        if result == "unauthorized":
            return "protected"
        elif result == "suspended":
            return "suspended"
        return "not found or deleted"

    def _harvest_tweets(self, tweets):
        for count, tweet in enumerate(tweets):
            if not count % 100:
                log.debug("Harvested %s tweets", count)
            self.result.harvest_counter["tweets"] += 1
            if self.stop_harvest_seeds_event.is_set():
                log.debug("Stopping since stop event set.")
                break

    def process_warc(self, warc_filepath):
        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.process_search_warc(warc_filepath)
        elif harvest_type == "twitter_filter":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_sample":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_user_timeline":
            self.process_user_timeline_warc(warc_filepath)
        else:
            raise KeyError(harvest_type)

    def process_search_warc(self, warc_filepath):
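        # Replay the archived API responses from the WARC and, when this is
        # an incremental harvest, advance the stored since_id to the largest
        # tweet id that was captured.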
        incremental = self.message.get("options", {}).get("incremental", False)

        since_id = self.state_store.get_state(
            __name__, u"{}.since_id".format(
                self._search_id())) if incremental else None

        max_tweet_id = self._process_tweets(TwitterRestWarcIter(warc_filepath))

        # Update state store
        if incremental and (max_tweet_id or 0) > (since_id or 0):
            self.state_store.set_state(
                __name__, u"{}.since_id".format(self._search_id()),
                max_tweet_id)

    def process_user_timeline_warc(self, warc_filepath):
        incremental = self.message.get("options", {}).get("incremental", False)

        for count, status in enumerate(TwitterRestWarcIter(warc_filepath)):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet or "full_text" in tweet:
                user_id = tweet["user"]["id_str"]
                if incremental:
                    # Update state
                    key = "timeline.{}.since_id".format(user_id)
                    self.state_store.set_state(
                        __name__, key,
                        max(
                            self.state_store.get_state(__name__, key) or 0,
                            tweet.get("id")))
                self._process_tweet(tweet)

    def _process_tweets(self, warc_iter):
        max_tweet_id = None

        for count, status in enumerate(warc_iter):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet or "full_text" in tweet:
                max_tweet_id = max(max_tweet_id or 0, tweet.get("id"))
                self._process_tweet(tweet)
        return max_tweet_id

    def _process_tweet(self, _):
        self.result.increment_stats("tweets")
Example #22
from twarc import Twarc

tw = Twarc()
#get training data
for tweet in tw.search("covid-19", lang='en'):
    try:
        screen_name = None
        if "screen_name" in tweet["user"]:
            screen_name = tweet["user"]["screen_name"]
        id_str = tweet["id_str"]
        tweet_url = None
        if "id_str" != None and "screen_name" != None:
            tweet_url = "https://twitter.com/" + screen_name + "/status/" + id_str
        #put training data into a txt file
        with open("trainingcovid-19.txt", "a+") as f:
            # Move read cursor to the start of file.
            f.seek(0)
            # If file is not empty then append '\n'
            data = f.read(100)
            if len(data) > 0:
                f.write("\n")
            # Append text at the end of file
            f.write(tweet['full_text'])
            f.write("\n")
            # tweet_url is None when the screen name was missing
            if tweet_url is not None:
                f.write(tweet_url)

    except UnicodeEncodeError:
        print("UnicodeEncodeError in finding training data")

#now we have to manually sort training data
Example #23
__location__ = os.path.dirname(os.path.realpath(__file__))

users = os.path.join(__location__, "apostrophe", "tweets.csv")

userList = []
with open(users, 'r', encoding='utf-8') as f:
	reader = csv.reader(f)
	rowCount = 0
	for row in reader:
		rowCount += 1
		if rowCount > 1:
			if not row[3] in userList:
				userList.append(row[3])

tweets = []
tweetContent = ""
t = Twarc()  # Create the client once rather than per user
for user in userList:
	for tweet in t.search("from:" + user):
		print(tweet["full_text"])
		tweetContent += "%s\n" % str(tweet["full_text"])
		tweets.append(tweet)
		
outputFile = os.path.join(__location__, "possibleBotTweets.jsonl")
with open(outputFile, "w", encoding='utf-8') as output:
	for line in tweets:
		output.write("%s\n" % str(json.dumps(line)))
		
contentOutput = os.path.join(__location__, "possibleBotTweetContent.txt")
with open(contentOutput, "w", encoding='utf-8') as output2:
	output2.write(tweetContent)
Example #24
            tweet_dic['poi_id'] = 18839785
            tweet_dic['user']['verified'] = False                                #CHANGE
            print(tweet_dic['poi_name'])
            tweet_dic['country'] = "India"
            full_text = tweet_dic['full_text']
            tweet_dic['text_copy'] = demoji.replace(full_text)
            tweet_dic['tweet_emotions'] = list(demoji.findall(full_text).keys())
            json.dump(tweet_dic, fp, ensure_ascii=False)
            fp.write("\n")


t = Twarc(consumer_key, consumer_secret, access_key, access_secret)
max_number = 0
for name in name_list:
    print(name)
    with open(file_name, "a", encoding='utf-8') as file:
        for tweet in t.search(q=str(name), lang='hi'):
            # Skip retweets; only original tweets are kept
            if 'retweeted_status' in tweet:
                print("It's a retweet")
                continue
            json.dump(tweet, file, ensure_ascii=False)
            file.write("\n")
            max_number += 1
            print("{} number {}".format(name, max_number))
            if max_number > 2500:
                break
time.sleep(10)
preprocessing(file_name, file_processed)
Example #25
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from twarc import Twarc

t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret)
for tweet in t.search("ferguson"):
    print(tweet["text"])


def crawl_feed(feed_dict, credentials):
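    # Run one twarc search for feed_dict['search_string'], appending each
    # tweet as a line of JSON under the feed directory, and log what was
    # archived; returns the base filename, tweet count, and crawl date.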
    twarc = Twarc(credentials['consumer_key'], credentials['consumer_secret'], credentials['access_token'], credentials['access_token_secret'])
    crawl_time = datetime.datetime.now()
    crawl_time_filename = crawl_time.strftime('%Y%m%d%I%M%S')
    crawl_time_html = crawl_time.strftime('%B %d, %Y')
    crawl_name = feed_dict['crawl_name']
    crawl_type = feed_dict['crawl_type']
    short_name = feed_dict['short_name']
    search_string = feed_dict['search_string']

    feed_dir = feed_dict['feed_dir']
    json_dir = join(feed_dir, 'json')
    html_dir = join(feed_dir, 'html')
    media_dir = join(feed_dir, 'media')
    logs_dir = join(feed_dir, 'logs')

    for directory in [feed_dir, json_dir, html_dir, media_dir, logs_dir]:
        if not os.path.exists(directory):
            os.makedirs(directory)

    log_file = join(logs_dir, 'twarc.log')

    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    logger = logging.getLogger(crawl_name)
    handler = logging.FileHandler(log_file)
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)

    base_filename = short_name + '-' + crawl_time_filename
    json_file = join(json_dir, base_filename + '.json')

    print("Searching Twitter API for {0}".format(search_string))
    print("Writing JSON and HTML files...")

    logger.info("starting search for %s", search_string)
    tweet_count = 0

    for tweet in twarc.search(search_string):
        with open(json_file, 'a') as json_out:
            json_out.write("{}\n".format(json.dumps(tweet)))

        if "id_str" in tweet:
            logger.info("archived https://twitter.com/%s/status/%s", tweet['user']['screen_name'], tweet["id_str"])
        elif 'limit' in tweet:
            logger.warning("%s tweets undelivered", tweet["limit"]["track"])
        elif 'warning' in tweet:
            logger.warning(tweet['warning']['message'])
        else:
            logger.warning(json.dumps(tweet))

        tweet_count += 1

    if tweet_count == 0:
        logger.info("no new tweets matching %s", search_string)

        # Write an empty json file. Maybe don't do this?
        with open(json_file, 'w'):
            pass

    return base_filename, tweet_count, crawl_time_html
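
# A hedged usage sketch for crawl_feed: the dict keys mirror the ones the
# function reads above, and every value here is a placeholder rather than
# anything from the original deployment.
if __name__ == "__main__":
    feed = {
        'crawl_name': 'ferguson-search',    # label for the logger
        'crawl_type': 'search',
        'short_name': 'ferguson',           # prefix for output filenames
        'search_string': '#ferguson',
        'feed_dir': '/tmp/feeds/ferguson',  # subdirectories are created as needed
    }
    creds = {
        'consumer_key': 'YOUR_CONSUMER_KEY',
        'consumer_secret': 'YOUR_CONSUMER_SECRET',
        'access_token': 'YOUR_ACCESS_TOKEN',
        'access_token_secret': 'YOUR_ACCESS_TOKEN_SECRET',
    }
    base_name, n_tweets, crawl_date = crawl_feed(feed, creds)
    print("{}: archived {} tweets ({})".format(base_name, n_tweets, crawl_date))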