Пример #1
0
def main():
    """Download the timeline of every screen name listed in a file.

    Takes one positional command-line argument: the screen-name file.  The
    tweets for each name are saved to the relative path
    ``<screen_name>.tweets``; a name whose file already exists is skipped.
    A TwythonError with error code 404 or 401 is logged as a warning and the
    name is skipped; any other TwythonError is re-raised.
    """
    # Make stdout output UTF-8, preventing "'ascii' codec can't encode" errors
    utf8_writer = codecs.getwriter('utf8')
    sys.stdout = utf8_writer(sys.stdout)

    arg_parser = argparse.ArgumentParser(description="")
    arg_parser.add_argument('screen_name_file')
    options = arg_parser.parse_args()

    logger = get_console_info_logger()

    # Exchange the consumer credentials for an OAuth 2 access token, then
    # build the client that the crawler will use.
    bearer_token = Twython(consumer_key, consumer_secret, oauth_version=2).obtain_access_token()
    twython = Twython(consumer_key, access_token=bearer_token)
    crawler = CrawlTwitterTimelines(twython, logger)

    # Error codes that only cost us one user; anything else aborts the run.
    skippable = {
        404: "HTTP 404 error - Most likely, Twitter user '%s' no longer exists",
        401: "HTTP 401 error - Most likely, Twitter user '%s' no longer publicly accessible",
    }

    for screen_name in get_screen_names_from_file(options.screen_name_file):
        tweet_filename = "%s.tweets" % screen_name

        if os.path.exists(tweet_filename):
            logger.info("File '%s' already exists - will not attempt to download Tweets for '%s'" % (tweet_filename, screen_name))
            continue

        try:
            tweets = crawler.get_all_timeline_tweets_for_screen_name(screen_name)
        except TwythonError as err:
            print("TwythonError: %s" % err)
            template = skippable.get(err.error_code)
            if template is None:
                # Unhandled exception
                raise err
            logger.warn(template % screen_name)
            continue

        save_tweets_to_json_file(tweets, tweet_filename)
Пример #2
0
def main():
    """Download the timelines of the screen names listed in a file.

    Command-line options:
      --input   text file with one screen name per line
      --token   token file whose name ends in ``yaml`` or ``py`` and that
                provides ``consumer_key`` and ``consumer_secret``
      --output  directory that receives one ``<screen_name>.tweets.gz`` file
                per screen name (created if missing)

    Screen names whose output file already exists are skipped.  The process
    exits with status 1 if the output directory cannot be created.

    Raises:
        ValueError: if the token file name ends in neither ``yaml`` nor ``py``.
    """
    # Make stdout output UTF-8, preventing "'ascii' codec can't encode" errors
    sys.stdout = codecs.getwriter('utf8')(sys.stdout)

    # Parse and document command line options
    parser = argparse.ArgumentParser(description="")
    parser.add_argument('--input', dest='screen_name_file', default="example_screen_names.txt",
                        help='A text file with one screen name per line.')
    parser.add_argument('--token', dest='token_file', default=os.path.expanduser("~") + "/.trawler/default.yaml",
                        help='A configuration file with Twitter API access tokens. See example_token_file.yaml or twitter_oauth_settings.sample.py')
    parser.add_argument('--output', dest='output', default='./',
                        help='Where to output the resulting data.')
    args = parser.parse_args()

    # Set up loggers and output directory
    logger = get_console_info_logger()
    output_directory = args.output
    if not os.path.isdir(output_directory):
        try:
            os.makedirs(output_directory)
        except OSError as e:
            print("Could not create directory: %s (%s)" % (output_directory, e))
            # Non-zero status so calling scripts can detect the failure.
            sys.exit(1)
        logger.info("Created directory: %s" % output_directory)

    # Set up API access
    if args.token_file.endswith('yaml'):
        # YAML file
        with open(args.token_file) as token_fh:
            tokens = yaml.safe_load(token_fh)
    elif args.token_file.endswith('py'):
        # .py file: read simple `name = "value"` assignments without
        # executing the file.
        tokens = {}
        with open(args.token_file) as token_fh:
            for line in token_fh:
                line = line.strip()
                # Skip blank lines, comments and anything that is not an
                # assignment instead of crashing on the tuple unpacking.
                if not line or line.startswith('#') or '=' not in line:
                    continue
                # Split on the first '=' only, so values may contain '='.
                k, v = [x.strip() for x in line.split("=", 1)]
                # Drop the surrounding quote characters of the value.
                tokens[k] = v[1:-1]
    else:
        raise ValueError("Unrecognized token file type -- please use a .yaml or .py file following the examples")

    twython = get_connection(tokens['consumer_key'], tokens['consumer_secret'])
    crawler = get_timeline_crawler(twython, logger=logger)

    # Gather unique screen names
    screen_names = get_screen_names_from_file(args.screen_name_file)

    # Gather tweets for each of the unique screen names
    # NB: in production, one should use `id` as an identifier (which does not change)
    # rather than the `screen_name`, which can be changed at the user's whim.
    for screen_name in screen_names:
        # os.path.join keeps the path correct whether or not --output ends
        # with a path separator.
        tweet_filename = os.path.join(output_directory, screen_name + ".tweets.gz")
        if os.path.exists(tweet_filename):
            logger.info("File '%s' already exists - will not attempt to download Tweets for '%s'" % (tweet_filename, screen_name))
            continue
        tweets = crawler.get_all_timeline_tweets_for_screen_name(screen_name)
        # Write them out as one-JSON-object-per-line in a gzipped file
        save_tweets_to_json_file(tweets, tweet_filename)
def main():
    """Download timelines for the friend/follower names of each listed user.

    Takes one positional command-line argument: the screen-name file.  For
    each screen name, the names returned by
    ``FindFriendFollowers.get_ff_screen_names_for_screen_name`` are saved to
    ``<screen_name>.ff``, and each of those names then has its timeline saved
    to ``<ff_screen_name>.tweets`` unless that file already exists.  A
    TwythonError with error code 404 or 401 is logged as a warning and the
    name is skipped; any other TwythonError is re-raised.
    """
    # Make stdout output UTF-8, preventing "'ascii' codec can't encode" errors
    utf8_writer = codecs.getwriter('utf8')
    sys.stdout = utf8_writer(sys.stdout)

    arg_parser = argparse.ArgumentParser(description="")
    arg_parser.add_argument('screen_name_file')
    options = arg_parser.parse_args()

    logger = get_console_info_logger()

    # Exchange the consumer credentials for an OAuth 2 access token, then
    # build the client shared by both helpers below.
    bearer_token = Twython(consumer_key, consumer_secret, oauth_version=2).obtain_access_token()
    twython = Twython(consumer_key, access_token=bearer_token)

    timeline_crawler = CrawlTwitterTimelines(twython, logger)
    ff_finder = FindFriendFollowers(twython, logger)

    # Error codes that only cost us one user; anything else aborts the run.
    skippable = {
        404: "HTTP 404 error - Most likely, Twitter user '%s' no longer exists",
        401: "HTTP 401 error - Most likely, Twitter user '%s' no longer publicly accessible",
    }

    for screen_name in get_screen_names_from_file(options.screen_name_file):
        ff_screen_names = ff_finder.get_ff_screen_names_for_screen_name(screen_name)
        save_screen_names_to_file(ff_screen_names, "%s.ff" % screen_name, logger)

        for ff_screen_name in ff_screen_names:
            tweet_filename = "%s.tweets" % ff_screen_name

            if os.path.exists(tweet_filename):
                logger.info("File '%s' already exists - will not attempt to download Tweets for '%s'" % (tweet_filename, ff_screen_name))
                continue

            try:
                tweets = timeline_crawler.get_all_timeline_tweets_for_screen_name(ff_screen_name)
            except TwythonError as err:
                print("TwythonError: %s" % err)
                template = skippable.get(err.error_code)
                if template is None:
                    # Unhandled exception
                    raise err
                logger.warn(template % ff_screen_name)
                continue

            save_tweets_to_json_file(tweets, tweet_filename)
Пример #4
0
def main():
    """Fetch tweets posted since a previous crawl for each listed user.

    Positional command-line arguments: the screen-name file, the directory
    holding the earlier ``<screen_name>.tweets`` files, and the directory
    that receives the new ones.  A user is skipped when the old file is
    missing (logged as an error) or the new file already exists.  Otherwise
    the ID obtained from ``get_most_recent_tweet_id_from_json_tweet_file`` is
    passed to the crawler's ``..._since`` call and the result is saved to the
    new path.  A TwythonError with error code 404 or 401 is logged as a
    warning; any other TwythonError is re-raised.
    """
    # Make stdout output UTF-8, preventing "'ascii' codec can't encode" errors
    utf8_writer = codecs.getwriter('utf8')
    sys.stdout = utf8_writer(sys.stdout)

    arg_parser = argparse.ArgumentParser(description="")
    for positional in ('screen_name_file', 'old_tweet_path', 'new_tweet_path'):
        arg_parser.add_argument(positional)
    options = arg_parser.parse_args()

    logger = get_console_info_logger()

    # Exchange the consumer credentials for an OAuth 2 access token, then
    # build the client that the crawler will use.
    bearer_token = Twython(consumer_key, consumer_secret, oauth_version=2).obtain_access_token()
    twython = Twython(consumer_key, access_token=bearer_token)
    crawler = CrawlTwitterTimelines(twython, logger)

    # Error codes that only cost us one user; anything else aborts the run.
    skippable = {
        404: "HTTP 404 error - Most likely, Twitter user '%s' no longer exists",
        401: "HTTP 401 error - Most likely, Twitter user '%s' no longer publicly accessible",
    }

    for screen_name in get_screen_names_from_file(options.screen_name_file):
        basename = "%s.tweets" % screen_name
        old_tweet_filename = os.path.join(options.old_tweet_path, basename)
        new_tweet_filename = os.path.join(options.new_tweet_path, basename)

        if not os.path.exists(old_tweet_filename):
            logger.error("Older Tweet file '%s' does not exist - will not attempt to download Tweets for '%s'" % (old_tweet_filename, screen_name))
        elif os.path.exists(new_tweet_filename):
            logger.info("File '%s' already exists - will not attempt to download Tweets for '%s'" % (new_tweet_filename, screen_name))
        else:
            most_recent_tweet_id = get_most_recent_tweet_id_from_json_tweet_file(old_tweet_filename)
            try:
                tweets = crawler.get_all_timeline_tweets_for_screen_name_since(screen_name, most_recent_tweet_id)
            except TwythonError as err:
                print("TwythonError: %s" % err)
                template = skippable.get(err.error_code)
                if template is None:
                    # Unhandled exception
                    raise err
                logger.warn(template % screen_name)
            else:
                save_tweets_to_json_file(tweets, new_tweet_filename)
Пример #5
0
def main():
    """Fetch one ``statuses/user_timeline`` page for every listed screen name.

    Takes one positional command-line argument: the screen-name file.  Each
    name is requested once through a ``RateLimitedTwitterEndpoint`` with
    ``count=200`` and the result is saved to ``<screen_name>.tweets``; a name
    whose file already exists is skipped.  A TwythonError with error code 404
    or 401 is logged as a warning and the name is skipped; any other
    TwythonError is re-raised.
    """
    # Make stdout output UTF-8, preventing "'ascii' codec can't encode" errors
    utf8_writer = codecs.getwriter('utf8')
    sys.stdout = utf8_writer(sys.stdout)

    arg_parser = argparse.ArgumentParser(description="")
    arg_parser.add_argument('screen_name_file')
    options = arg_parser.parse_args()

    logger = get_console_info_logger()

    # Exchange the consumer credentials for an OAuth 2 access token, then
    # build the client that the endpoint wrapper will use.
    bearer_token = Twython(consumer_key, consumer_secret, oauth_version=2).obtain_access_token()
    twython = Twython(consumer_key, access_token=bearer_token)
    crawler = RateLimitedTwitterEndpoint(twython, "statuses/user_timeline", logger)

    # Error codes that only cost us one user; anything else aborts the run.
    skippable = {
        404: "HTTP 404 error - Most likely, Twitter user '%s' no longer exists",
        401: "HTTP 401 error - Most likely, Twitter user '%s' no longer publicly accessible",
    }

    for screen_name in get_screen_names_from_file(options.screen_name_file):
        tweet_filename = "%s.tweets" % screen_name

        if os.path.exists(tweet_filename):
            logger.info("File '%s' already exists - will not attempt to download Tweets for '%s'" % (tweet_filename, screen_name))
            continue

        try:
            logger.info("Retrieving Tweets for user '%s'" % screen_name)
            tweets = crawler.get_data(screen_name=screen_name, count=200)
        except TwythonError as err:
            print("TwythonError: %s" % err)
            template = skippable.get(err.error_code)
            if template is None:
                # Unhandled exception
                raise err
            logger.warn(template % screen_name)
            continue

        save_tweets_to_json_file(tweets, tweet_filename)
def main():
    """Download the most recent tweets for every user ID listed in a file.

    Takes one positional command-line argument: the ID file.  For each ID the
    crawler's ``get_most_recent_tweets_by_id`` result is saved to
    ``testdata/<user_id>.json``.  A TwythonError with error code 404 or 401
    is logged as a warning and the ID is skipped; any other TwythonError is
    re-raised with its original traceback.
    """
    # Make stdout output UTF-8, preventing "'ascii' codec can't encode" errors
    sys.stdout = codecs.getwriter('utf8')(sys.stdout)

    parser = argparse.ArgumentParser(description="")
    parser.add_argument('id_file')
    args = parser.parse_args()

    logger = get_console_info_logger()

    twython = get_connection(consumer_key, consumer_secret)
    crawler = get_timeline_crawler(twython, logger)

    ids = get_ids_from_file(args.id_file)

    for user_id in ids:
        # NOTE(review): the skip check looks at "<id>.tweets" while results
        # are written to "testdata/<id>.json", so a completed download is not
        # detected on the next run -- confirm which path is intended.
        tweet_filename = "%s.tweets" % user_id
        if os.path.exists(tweet_filename):
            logger.info("File '%s' already exists - will not attempt to download Tweets for '%s'" % (tweet_filename, user_id))
            continue

        try:
            tweets = crawler.get_most_recent_tweets_by_id(user_id)
        except TwythonError as e:
            print("TwythonError: %s" % e)
            # The warnings must use `user_id`; there is no `screen_name` in
            # this function, so referencing it raised NameError.
            if e.error_code == 404:
                logger.warn("HTTP 404 error - Most likely, Twitter user '%s' no longer exists" % user_id)
            elif e.error_code == 401:
                logger.warn("HTTP 401 error - Most likely, Twitter user '%s' no longer publicly accessible" % user_id)
            else:
                # Unhandled exception: bare raise keeps the original traceback
                raise
        else:
            save_tweets_to_json_file(tweets, 'testdata/%s.json' % user_id)
Пример #7
0
def main():
    """Download the timelines of the screen names listed in a file.

    Command-line options:
      --input   text file with one screen name per line
      --token   token file whose name ends in ``yaml`` or ``py`` and that
                provides ``consumer_key`` and ``consumer_secret``
      --output  directory that receives one ``<screen_name>.tweets.gz`` file
                per screen name (created if missing)

    Screen names whose output file already exists are skipped.  The process
    exits with status 1 if the output directory cannot be created.

    Raises:
        ValueError: if the token file name ends in neither ``yaml`` nor ``py``.
    """
    # Make stdout output UTF-8, preventing "'ascii' codec can't encode" errors
    sys.stdout = codecs.getwriter('utf8')(sys.stdout)

    # Parse and document command line options
    parser = argparse.ArgumentParser(description="")
    parser.add_argument('--input',
                        dest='screen_name_file',
                        default="example_screen_names.txt",
                        help='A text file with one screen name per line.')
    parser.add_argument(
        '--token',
        dest='token_file',
        default=os.path.expanduser("~") + "/.trawler/default.yaml",
        help=
        'A configuration file with Twitter API access tokens. See example_token_file.yaml or twitter_oauth_settings.sample.py'
    )
    parser.add_argument('--output',
                        dest='output',
                        default='./',
                        help='Where to output the resulting data.')
    args = parser.parse_args()

    # Set up loggers and output directory
    logger = get_console_info_logger()
    output_directory = args.output
    if not os.path.isdir(output_directory):
        try:
            os.makedirs(output_directory)
        except OSError as e:
            print("Could not create directory: %s (%s)" %
                  (output_directory, e))
            # Non-zero status so calling scripts can detect the failure.
            sys.exit(1)
        logger.info("Created directory: %s" % output_directory)

    # Set up API access
    if args.token_file.endswith('yaml'):
        # YAML file
        with open(args.token_file) as token_fh:
            tokens = yaml.safe_load(token_fh)
    elif args.token_file.endswith('py'):
        # .py file: read simple `name = "value"` assignments without
        # executing the file.
        tokens = {}
        with open(args.token_file) as token_fh:
            for line in token_fh:
                line = line.strip()
                # Skip blank lines, comments and anything that is not an
                # assignment instead of crashing on the tuple unpacking.
                if not line or line.startswith('#') or '=' not in line:
                    continue
                # Split on the first '=' only, so values may contain '='.
                k, v = [x.strip() for x in line.split("=", 1)]
                # Drop the surrounding quote characters of the value.
                tokens[k] = v[1:-1]
    else:
        raise ValueError(
            "Unrecognized token file type -- please use a .yaml or .py file following the examples"
        )

    twython = get_connection(tokens['consumer_key'], tokens['consumer_secret'])
    crawler = get_timeline_crawler(twython, logger=logger)

    # Gather unique screen names
    screen_names = get_screen_names_from_file(args.screen_name_file)

    # Gather tweets for each of the unique screen names
    # NB: in production, one should use `id` as an identifier (which does not change)
    # rather than the `screen_name`, which can be changed at the user's whim.
    for screen_name in screen_names:
        # os.path.join keeps the path correct whether or not --output ends
        # with a path separator.
        tweet_filename = os.path.join(output_directory,
                                      screen_name + ".tweets.gz")
        if os.path.exists(tweet_filename):
            logger.info(
                "File '%s' already exists - will not attempt to download Tweets for '%s'"
                % (tweet_filename, screen_name))
            continue
        tweets = crawler.get_all_timeline_tweets_for_screen_name(screen_name)
        # Write them out as one-JSON-object-per-line in a gzipped file
        save_tweets_to_json_file(tweets, tweet_filename)
Пример #8
0
def main():
    """Crawl ``statuses/user_timeline`` for listed users and, optionally, the
    names ``get_ff`` derives from them.

    Command-line options:
      -sn  text file with one screen name per line
      -t   YAML file providing ``consumer_key`` and ``consumer_secret``
      -d   friend/follower depth (integer, default 0); when greater than 0
           the names to crawl come from ``get_ff``

    Output goes to a fresh ``data/<ISO timestamp>/`` directory: the set of
    crawled names in ``screen_names`` and one ``<screen_name>.tweets`` file
    per user.  The process exits with status 1 if that directory cannot be
    created.  A TwythonError with error code 404 or 401 is logged as a
    warning and the user is skipped; any other TwythonError is re-raised.
    """
    # Make stdout output UTF-8, preventing "'ascii' codec can't encode" errors
    sys.stdout = codecs.getwriter('utf8')(sys.stdout)

    # Parse and document command line options
    parser = argparse.ArgumentParser(description="")
    parser.add_argument('-sn', dest='screen_name_file', default="example_screen_names.txt",
                        help='A text file with one screen name per line.')
    parser.add_argument('-t', dest='token_file', default=os.path.expanduser("~") + "/.trawler/default.yaml",
                        help='A configuration file with Twitter API access tokens. See example_token_file.yaml.')
    # type=int lets argparse reject a non-numeric depth with a usage error
    # instead of a ValueError traceback further down.
    parser.add_argument('-d', dest='depth', default=0, type=int,
                        help='Friend and follower depth. A value of 1 will gather all tweets for users '
                             'in the file as well as all tweets from their friends and followers. Default is 0.')
    args = parser.parse_args()

    # Set up loggers and output directory
    logger = get_console_info_logger()
    output_directory = "data/" + datetime.datetime.now().isoformat() + "/"
    if not os.path.isdir(output_directory):
        try:
            os.makedirs(output_directory)
        except OSError as e:
            print("Could not create directory: %s (%s)" % (output_directory, e))
            # Non-zero status so calling scripts can detect the failure.
            sys.exit(1)
        logger.info("Created directory: %s" % output_directory)

    # Set up API access
    with open(args.token_file) as token_fh:
        tokens = yaml.safe_load(token_fh)
    access_token = Twython(tokens['consumer_key'], tokens['consumer_secret'], oauth_version=2).obtain_access_token()
    twython = Twython(tokens['consumer_key'], access_token=access_token)
    crawler = RateLimitedTwitterEndpoint(twython, "statuses/user_timeline", logger)

    # Gather unique screen names
    screen_names = get_screen_names_from_file(args.screen_name_file)
    depth = args.depth
    if depth > 0:  # don't initiate ff_finder unless we have to
        ff_finder = FindFriendFollowers(twython, logger)
        ff_screen_names = get_ff(screen_names, depth, ff_finder, logger)
        unique_screen_names = set(ff_screen_names)
    else:
        unique_screen_names = set(screen_names)  # assume the list has redundant names
    save_screen_names_to_file(unique_screen_names, output_directory + 'screen_names')

    # Gather tweets for each of the unique screen names
    for screen_name in unique_screen_names:
        tweet_filename = output_directory + screen_name + ".tweets"
        if os.path.exists(tweet_filename):
            logger.info("File '%s' already exists - will not attempt to download Tweets for '%s'" % (tweet_filename, screen_name))
            continue

        try:
            logger.info("Retrieving Tweets for user " + screen_name + " writing to file " + tweet_filename)
            tweets = crawler.get_data(screen_name=screen_name, count=200)
        except TwythonError as e:
            print("TwythonError: %s" % e)
            if e.error_code == 404:
                logger.warn("HTTP 404 error - Most likely, Twitter user '%s' no longer exists" % screen_name)
            elif e.error_code == 401:
                logger.warn("HTTP 401 error - Most likely, Twitter user '%s' no longer publicly accessible" % screen_name)
            else:
                # Unhandled exception: bare raise keeps the original traceback
                raise
        else:
            save_tweets_to_json_file(tweets, tweet_filename)
def main():
    """Download full timelines for the Twitter IDs listed in a file.

    Positional command-line arguments are the ID file and the output
    location; ``--token_file`` optionally names a Python settings file that
    is executed before authenticating.  The IDs are processed in reverse file
    order and each timeline is saved gzipped to
    ``<output_loc><id>.tweets.gz``; an ID whose file already exists is
    skipped.

    IDs that fail with HTTP 404 or 401 are appended to ``tmp/404d`` /
    ``tmp/401d`` and are skipped without a request on later runs.  Any other
    TwythonError is printed, the connection is rebuilt and the run continues
    with the next ID.
    """
    # Make stdout output UTF-8, preventing "'ascii' codec can't encode" errors
    sys.stdout = codecs.getwriter('utf8')(sys.stdout)

    parser = argparse.ArgumentParser(description="")
    parser.add_argument('id_file')
    parser.add_argument('output_loc')
    parser.add_argument('--token_file', dest='token_file', default=None)
    args = parser.parse_args()

    logger = get_console_info_logger()

    # Optionally pass as a parameter
    # There has to be a more elegant way to combine this with the default behavior -- tomorrow's problem though
    oauth_settings_file_loc = args.token_file
    if oauth_settings_file_loc:
        print("Using tokens from: %s" % oauth_settings_file_loc)
        with open(oauth_settings_file_loc) as settings_fh:
            settings_source = settings_fh.read()
        # SECURITY: this runs the token file as arbitrary Python code -- only
        # pass files you trust.  Presumably the file assigns consumer_key and
        # consumer_secret, which are used just below; confirm against the
        # sample settings file.
        exec(settings_source)

    access_token = Twython(consumer_key, consumer_secret, oauth_version=2).obtain_access_token()
    twython = Twython(consumer_key, access_token=access_token)
    crawler = CrawlTwitterTimelines(twython, logger)

    twitter_ids = get_screen_names_from_file(args.id_file)
    twitter_ids.reverse()  # HARDCODE
    # NOTE: output_loc is used as a raw prefix, so it needs a trailing path
    # separator to name a directory.
    output_loc = args.output_loc

    tempfile_loc = 'tmp/'
    if not os.path.isdir(tempfile_loc):
        os.makedirs(tempfile_loc)

    # Load the IDs recorded as inaccessible by earlier runs so we don't try
    # them again.  They are kept as stripped text, exactly as written by the
    # handlers below; the previous `long(x).strip()` always raised and left
    # this set empty.
    broken_ids = set()
    for marker_name in ('404d', '401d'):
        try:
            with open(tempfile_loc + marker_name) as marker_fh:
                for line in marker_fh:
                    line = line.strip()
                    if line:
                        broken_ids.add(line)
        except IOError:
            # No marker file yet: nothing recorded for this status code.
            pass

    for twitter_id in twitter_ids:
        # Compare using the same text form that gets written to the marker
        # files, whatever type the ID has in memory.
        if ("%s" % twitter_id) in broken_ids:
            print('%s was previously inaccessible, not trying to download.' % twitter_id)
            continue
        tweet_filename = output_loc + "%s.tweets.gz" % twitter_id
        if os.path.exists(tweet_filename):
            logger.info("File '%s' already exists - will not attempt to download Tweets for '%s'" % (tweet_filename, twitter_id))
            continue

        try:
            tweets = crawler.get_all_timeline_tweets_for_id(twitter_id)
        except TwythonError as e:
            print("TwythonError: %s" % e)
            if e.error_code == 404:
                logger.warn("HTTP 404 error - Most likely, Twitter user '%s' no longer exists" % twitter_id)
                with open(tempfile_loc + '404d', 'a') as OUT:
                    OUT.write('%s\n' % twitter_id)
            elif e.error_code == 401:
                logger.warn("HTTP 401 error - Most likely, Twitter user '%s' no longer publicly accessible" % twitter_id)
                with open(tempfile_loc + '401d', 'a') as OUT:
                    OUT.write('%s\n' % twitter_id)
            else:
                # Unhandled exception
                print(e)
                # Rebuild the connection for the remaining IDs.  The current
                # ID is not retried in this run; it is picked up again on the
                # next run because no output file was written for it.
                twython = Twython(consumer_key, access_token=access_token)
                crawler = CrawlTwitterTimelines(twython, logger)
        else:
            save_tweets_to_json_file(tweets, tweet_filename, gzip_out=True)
Пример #10
0
def main():
    """Download full timelines for the Twitter IDs listed in a file.

    Positional command-line arguments are the ID file and the output
    location; ``--token_file`` optionally names a Python settings file that
    is executed before authenticating.  The IDs are processed in reverse file
    order and each timeline is saved gzipped to
    ``<output_loc><id>.tweets.gz``; an ID whose file already exists is
    skipped.

    IDs that fail with HTTP 404 or 401 are appended to ``tmp/404d`` /
    ``tmp/401d`` and are skipped without a request on later runs.  Any other
    TwythonError is printed, the connection is rebuilt and the run continues
    with the next ID.
    """
    # Make stdout output UTF-8, preventing "'ascii' codec can't encode" errors
    sys.stdout = codecs.getwriter('utf8')(sys.stdout)

    parser = argparse.ArgumentParser(description="")
    parser.add_argument('id_file')
    parser.add_argument('output_loc')
    parser.add_argument('--token_file', dest='token_file', default=None)
    args = parser.parse_args()

    logger = get_console_info_logger()

    # Optionally pass as a parameter
    # There has to be a more elegant way to combine this with the default behavior -- tomorrow's problem though
    oauth_settings_file_loc = args.token_file
    if oauth_settings_file_loc:
        print("Using tokens from: %s" % oauth_settings_file_loc)
        with open(oauth_settings_file_loc) as settings_fh:
            settings_source = settings_fh.read()
        # SECURITY: this runs the token file as arbitrary Python code -- only
        # pass files you trust.  Presumably the file assigns consumer_key and
        # consumer_secret, which are used just below; confirm against the
        # sample settings file.
        exec(settings_source)

    access_token = Twython(consumer_key, consumer_secret,
                           oauth_version=2).obtain_access_token()
    twython = Twython(consumer_key, access_token=access_token)
    crawler = CrawlTwitterTimelines(twython, logger)

    twitter_ids = get_screen_names_from_file(args.id_file)
    twitter_ids.reverse()  # HARDCODE
    # NOTE: output_loc is used as a raw prefix, so it needs a trailing path
    # separator to name a directory.
    output_loc = args.output_loc

    tempfile_loc = 'tmp/'
    if not os.path.isdir(tempfile_loc):
        os.makedirs(tempfile_loc)

    # Load the IDs recorded as inaccessible by earlier runs so we don't try
    # them again.  They are kept as stripped text, exactly as written by the
    # handlers below; the previous `long(x).strip()` always raised and left
    # this set empty.
    broken_ids = set()
    for marker_name in ('404d', '401d'):
        try:
            with open(tempfile_loc + marker_name) as marker_fh:
                for line in marker_fh:
                    line = line.strip()
                    if line:
                        broken_ids.add(line)
        except IOError:
            # No marker file yet: nothing recorded for this status code.
            pass

    for twitter_id in twitter_ids:
        # Compare using the same text form that gets written to the marker
        # files, whatever type the ID has in memory.
        if ("%s" % twitter_id) in broken_ids:
            print('%s was previously inaccessible, not trying to download.' %
                  twitter_id)
            continue
        tweet_filename = output_loc + "%s.tweets.gz" % twitter_id
        if os.path.exists(tweet_filename):
            logger.info(
                "File '%s' already exists - will not attempt to download Tweets for '%s'"
                % (tweet_filename, twitter_id))
            continue

        try:
            tweets = crawler.get_all_timeline_tweets_for_id(twitter_id)
        except TwythonError as e:
            print("TwythonError: %s" % e)
            if e.error_code == 404:
                logger.warn(
                    "HTTP 404 error - Most likely, Twitter user '%s' no longer exists"
                    % twitter_id)
                with open(tempfile_loc + '404d', 'a') as OUT:
                    OUT.write('%s\n' % twitter_id)
            elif e.error_code == 401:
                logger.warn(
                    "HTTP 401 error - Most likely, Twitter user '%s' no longer publicly accessible"
                    % twitter_id)
                with open(tempfile_loc + '401d', 'a') as OUT:
                    OUT.write('%s\n' % twitter_id)
            else:
                # Unhandled exception
                print(e)
                # Rebuild the connection for the remaining IDs.  The current
                # ID is not retried in this run; it is picked up again on the
                # next run because no output file was written for it.
                twython = Twython(consumer_key, access_token=access_token)
                crawler = CrawlTwitterTimelines(twython, logger)
        else:
            save_tweets_to_json_file(tweets, tweet_filename, gzip_out=True)