Example #1
# Imports assumed by this snippet; make_column_mapping is defined in Example #3 below
from pyspark.sql import functions as F
from twarc import json2csv

def extract_csv(df):
    '''Creates a CSV extract in which each row is a Tweet document, using the schema in the twarc.json2csv module.
    :param df: Spark DataFrame'''
    column_mapping = make_column_mapping(df.columns, array_fields=['text'])
    # The hashtags and urls fields are handled differently in the Elasticsearch index
    # and in the CSV (per the twarc.json2csv spec), so drop the ES columns before
    # renaming the CSV versions of these columns.
    df = df.drop('hashtags', 'urls')
    for k, v in column_mapping.items():
        # Convert fields stored as arrays, concatenating elements with whitespace
        if v[1]:
            df = df.withColumn(k, F.concat_ws(' ', df[k]))
        # Rename columns where the CSV heading differs from the DataFrame column
        if k != v[0]:
            df = df.withColumnRenamed(k, v[0])
    # Select only the columns identified in json2csv; the user_urls column (which may have been deprecated) is not among its headings
    csv_columns = list(json2csv.get_headings())
    df_csv = df.select(csv_columns)
    # Remove newlines in the text and user_location fields
    df_csv = df_csv.withColumn('text', F.regexp_replace('text', '\n|\r', ' '))
    df_csv = df_csv.withColumn('user_location', F.regexp_replace('user_location', '\n|\r', ' '))
    # Swap the date fields back so that the created_at column contains the unparsed version
    date_mapping = {'created_at': 'parsed_created_at',
                    'parsed_created_at': 'created_at'}
    df_csv = df_csv.select([F.col(c).alias(date_mapping.get(c, c)) for c in df_csv.columns])
    # Note: when writing this DataFrame to CSV, set the escape character to the double quote; otherwise the default escaping causes problems for applications reading the CSV.
    # Get rid of duplicate tweets
    df_csv = df_csv.dropDuplicates(['id'])
    return df_csv
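
A minimal usage sketch (not part of the original example): `transformed_df` is a stand-in for the DataFrame produced upstream by the project's SQL transform, and the write step applies the double-quote escape mentioned in the note above.

# Hypothetical usage; transformed_df and the output path are assumptions
df_csv = extract_csv(transformed_df)
# Set the escape character to the double quote when writing, per the note in extract_csv
df_csv.write.csv('/tmp/tweets-csv', header=True, escape='"')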
Example #2
def on_hit(self, hit, tweet_count):
    # Rotate to a new tweet CSV file every max_per_file rows
    if tweet_count % self.max_per_file == 0:
        if self.file:
            self.file.close()
        self.file = open(
            os.path.join(
                self.dataset_path,
                'tweets-{}.csv'.format(str(self.file_count).zfill(3))),
            'w')
        self.sheet = csv.writer(self.file)
        self.sheet.writerow(json2csv.get_headings())
        self.file_count += 1
    # Write the tweet row to the current CSV file
    self.sheet.writerow(json2csv.get_row(json.loads(hit.tweet),
                                         excel=True))
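
This method is excerpted from a larger class; `hit` appears to be an Elasticsearch-style hit whose `tweet` attribute holds raw tweet JSON. The attribute names below are implied by the method body, while the class name and constructor are a hypothetical sketch added for context.

import csv
import json
import os

from twarc import json2csv

class TweetCsvWriter:  # hypothetical name; only the attributes are implied by on_hit
    def __init__(self, dataset_path, max_per_file=250000):  # default is illustrative
        self.dataset_path = dataset_path
        self.max_per_file = max_per_file  # rows per CSV file before rotating
        self.file = None                  # currently open CSV file
        self.sheet = None                 # csv.writer over self.file
        self.file_count = 0               # suffix for numbering output files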
Example #3
def make_column_mapping(df_columns, array_fields):
    '''Creates a mapping from TweetSets fields to CSV column headings, using headings
    derived from twarc.json2csv. Each key is a column name in the DataFrame created from
    Tweet JSON by the SQL transform; each value is a tuple whose first element is the CSV
    column heading and whose second element is a Boolean flag indicating whether the field
    is an array. (Arrays need to be converted to strings before being written to CSV.)
    :param df_columns: list of columns in the transformed Spark DataFrame (includes some fields required by json2csv that are not indexed in Elasticsearch)
    :param array_fields: list of fields in df_columns stored as arrays'''
    # Map TweetSets fields to their CSV column names
    column_mapping = {'retweet_quoted_status_id': 'retweet_or_quote_id',
                      'retweeted_quoted_screen_name': 'retweet_or_quote_screen_name',
                      'tweet_id': 'id',
                      'user_follower_count': 'user_followers_count',
                      'language': 'lang',
                      'retweeted_quoted_user_id': 'retweet_or_quote_user_id',
                      'hashtags_csv': 'hashtags',
                      'urls_csv': 'urls'}
    # Add remaining fields from the DataFrame if they are used by json2csv
    column_mapping.update({k: k for k in df_columns if k in json2csv.get_headings()})
    # Set array flag for those fields that require it
    column_mapping = {k: (v, k in array_fields) for k, v in column_mapping.items()}
    return column_mapping
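
An illustrative call (column names chosen for the example; it assumes 'text' and 'lang' appear in json2csv.get_headings(), while 'tweet_id' is renamed via the explicit mapping):

mapping = make_column_mapping(['tweet_id', 'text', 'lang'], array_fields=['text'])
# The result keeps all of the explicit TweetSets renames plus the matched columns, e.g.:
# mapping['tweet_id'] == ('id', False)
# mapping['text']     == ('text', True)   # flagged as an array field
# mapping['lang']     == ('lang', False)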
Example #4
File: command.py  Project: rongpenl/twarc
def main():
    parser = get_argparser()
    args = parser.parse_args()

    command = args.command
    query = args.query or ""

    logging.basicConfig(filename=args.log,
                        level=logging.INFO,
                        format="%(asctime)s %(levelname)s %(message)s")

    # log and stop when the process receives SIGINT
    def stop(signal, frame):
        log.warning('process received SIGINT, stopping')
        sys.exit(0)

    signal.signal(signal.SIGINT, stop)

    if command == "version":
        print("twarc v%s" % __version__)
        sys.exit()
    elif command == "help" or not command:
        parser.print_help()
        print("\nPlease use one of the following commands:\n")
        for cmd in commands:
            print(" - %s" % cmd)
        print("\nFor example:\n\n    twarc search blacklivesmatter")
        sys.exit(1)

    # Don't validate the keys if the command is "configure"
    if command == "configure" or args.skip_key_validation:
        validate_keys = False
    else:
        validate_keys = True

    t = Twarc(consumer_key=args.consumer_key,
              consumer_secret=args.consumer_secret,
              access_token=args.access_token,
              access_token_secret=args.access_token_secret,
              connection_errors=args.connection_errors,
              http_errors=args.http_errors,
              config=args.config,
              profile=args.profile,
              tweet_mode=args.tweet_mode,
              protected=args.protected,
              validate_keys=validate_keys,
              app_auth=args.app_auth,
              gnip_auth=args.gnip_auth)

    # calls that return tweets
    if command == "search":
        if len(args.lang) > 0:
            lang = args.lang[0]
        else:
            lang = None

        # if not using a premium endpoint do a standard search
        if not args.thirtyday and not args.fullarchive and not args.gnip_fullarchive:
            things = t.search(query,
                              since_id=args.since_id,
                              max_id=args.max_id,
                              lang=lang,
                              result_type=args.result_type,
                              geocode=args.geocode)
        else:
            # parse the dates if given
            from_date = parse_dt(args.from_date) if args.from_date else None
            to_date = parse_dt(args.to_date) if args.to_date else None
            if args.gnip_fullarchive:
                env = args.gnip_fullarchive
                product = 'gnip_fullarchive'
            elif args.thirtyday:
                env = args.thirtyday
                product = '30day'
            else:
                env = args.fullarchive
                product = 'fullarchive'
            things = t.premium_search(
                query,
                product,
                env,
                from_date=from_date,
                to_date=to_date,
                sandbox=args.sandbox,
                limit=args.limit,
            )

    elif command == "filter":
        things = t.filter(track=query,
                          follow=args.follow,
                          locations=args.locations,
                          lang=args.lang)

    elif command == "dehydrate":
        input_iterator = fileinput.FileInput(
            query,
            mode='r',
            openhook=fileinput.hook_compressed,
        )
        things = t.dehydrate(input_iterator)

    elif command == "hydrate":
        input_iterator = fileinput.FileInput(
            query,
            mode='r',
            openhook=fileinput.hook_compressed,
        )
        things = t.hydrate(input_iterator)

    elif command == "tweet":
        things = [t.tweet(query)]

    elif command == "sample":
        things = t.sample()

    elif command == "timeline":
        kwargs = {"max_id": args.max_id, "since_id": args.since_id}
        if re.match('^[0-9]+$', query):
            kwargs["user_id"] = query
        elif query:
            kwargs["screen_name"] = query
        things = t.timeline(**kwargs)

    elif command == "retweets":
        if os.path.isfile(query):
            iterator = fileinput.FileInput(
                query,
                mode='r',
                openhook=fileinput.hook_compressed,
            )
            things = t.retweets(tweet_ids=iterator)
        else:
            things = t.retweets(tweet_ids=query.split(','))

    elif command == "users":
        if os.path.isfile(query):
            iterator = fileinput.FileInput(
                query,
                mode='r',
                openhook=fileinput.hook_compressed,
            )
            if re.match('^[0-9,]+$', next(open(query))):
                id_type = 'user_id'
            else:
                id_type = 'screen_name'
            things = t.user_lookup(ids=iterator, id_type=id_type)
        elif re.match('^[0-9,]+$', query):
            things = t.user_lookup(ids=query.split(","))
        else:
            things = t.user_lookup(ids=query.split(","), id_type='screen_name')

    elif command == "followers":
        things = t.follower_ids(query)

    elif command == "friends":
        things = t.friend_ids(query)

    elif command == "trends":
        # lookup woeid for geo-coordinate if appropriate
        geo = re.match('^([0-9-.]+),([0-9-.]+)$', query)
        if geo:
            lat, lon = map(float, geo.groups())
            if lat > 180 or lat < -180 or lon > 180 or lon < -180:
                parser.error('LAT and LONG must be within [-180.0, 180.0]')
            places = list(t.trends_closest(lat, lon))
            if len(places) == 0:
                parser.error("Couldn't find WOE ID for %s" % query)
            query = places[0]["woeid"]

        if not query:
            things = t.trends_available()
        else:
            trends = t.trends_place(query)
            if trends:
                things = trends[0]['trends']

    elif command == "replies":
        tweet = t.tweet(query)
        if not tweet:
            parser.error("tweet with id %s does not exist" % query)
        things = t.replies(tweet, args.recursive)

    elif command == "listmembers":
        list_parts = re.match('^https://twitter.com/(.+)/lists/(.+)$', query)
        if not list_parts:
            parser.error(
                "provide the url for the list, e.g., https://twitter.com/USAFacts/lists/us-armed-forces"
            )
        things = t.list_members(slug=list_parts.group(2),
                                owner_screen_name=list_parts.group(1))

    elif command == "configure":
        t.configure()
        sys.exit()

    else:
        parser.print_help()
        print("\nPlease use one of the following commands:\n")
        for cmd in commands:
            print(" - %s" % cmd)
        print("\nFor example:\n\n    twarc search blacklivesmatter")
        sys.exit(1)

    # get the output filehandle
    if args.output:
        if pyv == 3:
            fh = codecs.open(args.output, 'wb', 'utf8')
        else:
            fh = open(args.output, 'w')
    else:
        fh = sys.stdout

    # optionally create a csv writer
    csv_writer = None
    if args.format in ("csv", "csv-excel") and command not in [
            "filter", "hydrate", "replies", "retweets", "sample", "search",
            "timeline", "tweet"
    ]:
        parser.error("csv output not available for %s" % command)
    elif args.format in ("csv", "csv-excel"):
        csv_writer = csv.writer(fh)
        csv_writer.writerow(get_headings())

    line_count = 0
    file_count = 0
    for thing in things:

        # rotate the files if necessary
        if args.output and args.split and line_count % args.split == 0:
            file_count += 1
            fh = codecs.open(numbered_filepath(args.output, file_count), 'wb',
                             'utf8')
            if csv_writer:
                csv_writer = csv.writer(fh)
                csv_writer.writerow(get_headings())

        line_count += 1

        # ready to output

        kind_of = type(thing)
        if kind_of == str_type:
            # user or tweet IDs
            print(thing, file=fh)
            log.info("archived %s" % thing)
        elif 'id_str' in thing:
            # tweets and users
            if (args.format == "json"):
                print(json.dumps(thing), file=fh)
            elif (args.format == "csv"):
                csv_writer.writerow(get_row(thing))
            elif (args.format == "csv-excel"):
                csv_writer.writerow(get_row(thing, excel=True))
            log.info("archived %s", thing['id_str'])
        elif 'woeid' in thing:
            # places
            print(json.dumps(thing), file=fh)
        elif 'tweet_volume' in thing:
            # trends
            print(json.dumps(thing), file=fh)
        elif 'limit' in thing:
            # rate limits
            t = datetime.datetime.utcfromtimestamp(
                float(thing['limit']['timestamp_ms']) / 1000)
            t = t.isoformat("T") + "Z"
            log.warning("%s tweets undelivered at %s", thing['limit']['track'],
                        t)
            if args.warnings:
                print(json.dumps(thing), file=fh)
        elif 'warning' in thing:
            # other warnings
            log.warning(thing['warning']['message'])
            if args.warnings:
                print(json.dumps(thing), file=fh)
        elif 'data' in thing:
            # Labs style JSON schema.
            print(json.dumps(thing), file=fh)
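
The file-rotation branch above calls numbered_filepath(), which is defined elsewhere in command.py. A plausible sketch of its behavior (a reconstruction, not the project's verbatim code):

import os

def numbered_filepath(filepath, num):
    # e.g. tweets.jsonl -> tweets-001.jsonl
    path, ext = os.path.splitext(filepath)
    return '{}-{}{}'.format(path, str(num).zfill(3), ext)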
Example #5
def main():
    parser = get_argparser()
    args = parser.parse_args()

    command = args.command
    query = args.query or ""

    logging.basicConfig(
        filename=args.log,
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s"
    )

    # catch ctrl-c so users don't see a stack trace
    signal.signal(signal.SIGINT, lambda signal, frame: sys.exit(0))

    if command == "version":
        print("twarc v%s" % __version__)
        sys.exit()
    elif command == "help" or not command:
        parser.print_help()
        print("\nPlease use one of the following commands:\n")
        for cmd in commands:
            print(" - %s" % cmd)
        print("\nFor example:\n\n    twarc search blacklivesmatter")
        sys.exit(1)

    # Don't validate the keys if the command is "configure"
    if command == "configure" or args.skip_key_validation:
        validate_keys = False
    else:
        validate_keys = True


    t = Twarc(
        consumer_key=args.consumer_key,
        consumer_secret=args.consumer_secret,
        access_token=args.access_token,
        access_token_secret=args.access_token_secret,
        connection_errors=args.connection_errors,
        http_errors=args.http_errors,
        config=args.config,
        profile=args.profile,
        tweet_mode=args.tweet_mode,
        protected=args.protected,
        validate_keys=validate_keys,
    )

    # calls that return tweets
    if command == "search":
        things = t.search(
            query,
            since_id=args.since_id,
            max_id=args.max_id,
            lang=args.lang,
            result_type=args.result_type,
            geocode=args.geocode
        )

    elif command == "filter":
        things = t.filter(
            track=query,
            follow=args.follow,
            locations=args.locations
        )

    elif command == "dehydrate":
        input_iterator = fileinput.FileInput(
            query,
            mode='r',
            openhook=fileinput.hook_compressed,
        )
        things = t.dehydrate(input_iterator)

    elif command == "hydrate":
        input_iterator = fileinput.FileInput(
            query,
            mode='r',
            openhook=fileinput.hook_compressed,
        )
        things = t.hydrate(input_iterator)

    elif command == "tweet":
        things = [t.tweet(query)]

    elif command == "sample":
        things = t.sample()

    elif command == "timeline":
        kwargs = {"max_id": args.max_id, "since_id": args.since_id}
        if re.match('^[0-9]+$', query):
            kwargs["user_id"] = query
        else:
            kwargs["screen_name"] = query
        things = t.timeline(**kwargs)

    elif command == "retweets":
        things = t.retweets(query)

    elif command == "users":
        if os.path.isfile(query):
            iterator = fileinput.FileInput(
                query,
                mode='r',
                openhook=fileinput.hook_compressed,
            )
            if re.match('^[0-9,]+$', next(open(query))):
                id_type = 'user_id'
            else:
                id_type = 'screen_name'
            things = t.user_lookup(ids=iterator, id_type=id_type)
        elif re.match('^[0-9,]+$', query):
            things = t.user_lookup(ids=query.split(","))
        else:
            things = t.user_lookup(ids=query.split(","), id_type='screen_name')

    elif command == "followers":
        things = t.follower_ids(query)

    elif command == "friends":
        things = t.friend_ids(query)

    elif command == "trends":
        # lookup woeid for geo-coordinate if appropriate
        geo = re.match(r'^([0-9\-\.]+),([0-9\-\.]+)$', query)
        if geo:
            lat, lon = map(float, geo.groups())
            if lat > 180 or lat < -180 or lon > 180 or lon < -180:
                parser.error('LAT and LONG must be within [-180.0, 180.0]')
            places = list(t.trends_closest(lat, lon))
            if len(places) == 0:
                parser.error("Couldn't find WOE ID for %s" % query)
            query = places[0]["woeid"]

        if not query:
            things = t.trends_available()
        else:
            trends = t.trends_place(query)
            if trends:
                things = trends[0]['trends']

    elif command == "replies":
        tweet = t.tweet(query)
        if not tweet:
            parser.error("tweet with id %s does not exist" % query)
        things = t.replies(tweet, args.recursive)

    elif command == "listmembers":
        list_parts = re.match('^https://twitter.com/(.+)/lists/(.+)$', query)
        if not list_parts:
            parser.error("provide the url for the list, e.g., https://twitter.com/USAFacts/lists/us-armed-forces")
        things = t.list_members(slug=list_parts.group(2),
                                owner_screen_name=list_parts.group(1))

    elif command == "configure":
        t.configure()
        sys.exit()

    else:
        parser.print_help()
        print("\nPlease use one of the following commands:\n")
        for cmd in commands:
            print(" - %s" % cmd)
        print("\nFor example:\n\n    twarc search blacklivesmatter")
        sys.exit(1)

    # get the output filehandle
    if args.output:
        if pyv == 3:
            fh = codecs.open(args.output, 'wb', 'utf8')
        else:
            fh = open(args.output, 'w')
    else:
        fh = sys.stdout

    # optionally create a csv writer
    csv_writer = None
    if args.format in ("csv", "csv-excel") and command not in ["filter", "hydrate", "replies",
            "retweets", "sample", "search", "timeline", "tweet"]:
        parser.error("csv output not available for %s" % command)
    elif args.format in ("csv", "csv-excel"):
        csv_writer = csv.writer(fh)
        csv_writer.writerow(get_headings())

    line_count = 0
    file_count = 0
    for thing in things:

        # rotate the files if necessary
        if args.output and args.split and line_count % args.split == 0:
            file_count += 1
            fh = codecs.open(numbered_filepath(args.output, file_count), 'wb', 'utf8')
            if csv_writer:
                csv_writer = csv.writer(fh)
                csv_writer.writerow(get_headings())

        line_count += 1

        # ready to output

        kind_of = type(thing)
        if kind_of == str_type:
            # user or tweet IDs
            print(thing, file=fh)
            logging.info("archived %s" % thing)
        elif 'id_str' in thing:
            # tweets and users
            if (args.format == "json"):
                print(json.dumps(thing), file=fh)
            elif (args.format == "csv"):
                csv_writer.writerow(get_row(thing))
            elif (args.format == "csv-excel"):
                csv_writer.writerow(get_row(thing, excel=True))
            logging.info("archived %s", thing['id_str'])
        elif 'woeid' in thing:
            # places
            print(json.dumps(thing), file=fh)
        elif 'tweet_volume' in thing:
            # trends
            print(json.dumps(thing), file=fh)
        elif 'limit' in thing:
            # rate limits
            t = datetime.datetime.utcfromtimestamp(
                float(thing['limit']['timestamp_ms']) / 1000)
            t = t.isoformat("T") + "Z"
            logging.warning("%s tweets undelivered at %s",
                            thing['limit']['track'], t)
            if args.warnings:
                print(json.dumps(thing), file=fh)
        elif 'warning' in thing:
            # other warnings
            logging.warning(thing['warning']['message'])
            if args.warnings:
                print(json.dumps(thing), file=fh)
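
The main() variants in Examples #4, #5, and #7 rely on module-level compatibility shims (pyv and str_type) defined elsewhere in command.py. A plausible reconstruction, assuming they distinguish Python 2 and 3 string types:

import sys

pyv = sys.version_info[0]
if pyv == 3:
    str_type = str
else:
    str_type = unicode  # noqa: F821 -- Python 2 only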
Example #6
File: json2csv.py  Project: zwytop/twarc
def get_headings(extra_headings=None):
    fields = json2csv.get_headings()
    if extra_headings:
        fields.extend(extra_headings)
    return fields
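
Usage sketch (the extra column names are hypothetical):

headings = get_headings(extra_headings=['dataset_id', 'retrieved_at'])
# -> twarc's standard CSV headings followed by 'dataset_id' and 'retrieved_at'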
Example #7
def main():
    parser = get_argparser()
    args = parser.parse_args()

    command = args.command
    query = args.query or ""

    logging.basicConfig(filename=args.log,
                        level=logging.INFO,
                        format="%(asctime)s %(levelname)s %(message)s")

    if command == "version":
        print("twarc v%s" % __version__)
        sys.exit()
    elif command == "help" or not command:
        parser.print_help()
        print("\nPlease use one of the following commands:\n")
        for cmd in commands:
            print(" - %s" % cmd)
        print("\nFor example:\n\n    twarc search blacklivesmatter")
        sys.exit(1)

    t = Twarc(consumer_key=args.consumer_key,
              consumer_secret=args.consumer_secret,
              access_token=args.access_token,
              access_token_secret=args.access_token_secret,
              connection_errors=args.connection_errors,
              http_errors=args.http_errors,
              config=args.config,
              profile=args.profile,
              tweet_mode=args.tweet_mode)

    # calls that return tweets
    if command == "search":
        things = t.search(query,
                          since_id=args.since_id,
                          max_id=args.max_id,
                          lang=args.lang,
                          result_type=args.result_type,
                          geocode=args.geocode)

    elif command == "filter":
        things = t.filter(track=query,
                          follow=args.follow,
                          locations=args.locations)

    elif command == "dehydrate":
        input_iterator = fileinput.FileInput(
            query,
            mode='r',  # 'rU' was removed in Python 3.11; 'r' reads universal newlines by default
            openhook=fileinput.hook_compressed,
        )
        things = t.dehydrate(input_iterator)

    elif command == "hydrate":
        input_iterator = fileinput.FileInput(
            query,
            mode='r',
            openhook=fileinput.hook_compressed,
        )
        things = t.hydrate(input_iterator)

    elif command == "tweet":
        things = [t.tweet(query)]

    elif command == "sample":
        things = t.sample()

    elif command == "timeline":
        kwargs = {"max_id": args.max_id, "since_id": args.since_id}
        if re.match('^[0-9]+$', query):
            kwargs["user_id"] = query
        else:
            kwargs["screen_name"] = query
        things = t.timeline(**kwargs)

    elif command == "retweets":
        things = t.retweets(query)

    elif command == "users":
        if os.path.isfile(query):
            iterator = fileinput.FileInput(
                query,
                mode='r',
                openhook=fileinput.hook_compressed,
            )
            things = t.user_lookup(iterator=iterator)
        elif re.match('^[0-9,]+$', query):
            things = t.user_lookup(user_ids=query.split(","))
        else:
            things = t.user_lookup(screen_names=query.split(","))

    elif command == "followers":
        things = t.follower_ids(query)

    elif command == "friends":
        things = t.friend_ids(query)

    elif command == "trends":
        # lookup woeid for geo-coordinate if appropriate
        geo = re.match(r'^([0-9\-\.]+),([0-9\-\.]+)$', query)
        if geo:
            lat, lon = map(float, geo.groups())
            if lat > 180 or lat < -180 or lon > 180 or lon < -180:
                parser.error('LAT and LONG must be within [-180.0, 180.0]')
            places = list(t.trends_closest(lat, lon))
            if len(places) == 0:
                parser.error("Couldn't find WOE ID for %s" % query)
            query = places[0]["woeid"]

        if not query:
            things = t.trends_available()
        else:
            trends = t.trends_place(query)
            if trends:
                things = trends[0]['trends']

    elif command == "replies":
        tweet = t.tweet(query)
        if not tweet:
            parser.error("tweet with id %s does not exist" % query)
        things = t.replies(tweet, args.recursive)

    elif command == "configure":
        t.input_keys()
        sys.exit()

    else:
        parser.print_help()
        print("\nPlease use one of the following commands:\n")
        for cmd in commands:
            print(" - %s" % cmd)
        print("\nFor example:\n\n    twarc search blacklivesmatter")
        sys.exit(1)

    # get the output filehandle
    if args.output:
        fh = codecs.open(args.output, 'wb', 'utf8')
    else:
        fh = sys.stdout

    # optionally create a csv writer
    csv_writer = None
    if args.format == "csv" and command not in [
            "filter", "hydrate", "replies", "retweets", "sample", "search",
            "timeline", "tweet"
    ]:
        parser.error("csv output not available for %s" % command)
    elif args.format == "csv":
        csv_writer = csv.writer(fh)
        csv_writer.writerow(get_headings())

    line_count = 0
    file_count = 0
    for thing in things:

        # rotate the files if necessary
        if args.output and args.split and line_count % args.split == 0:
            file_count += 1
            fh = codecs.open(numbered_filepath(args.output, file_count), 'wb',
                             'utf8')
            if csv_writer:
                csv_writer = csv.writer(fh)
                csv_writer.writerow(get_headings())

        line_count += 1

        # ready to output

        kind_of = type(thing)
        if kind_of == str_type:
            # user or tweet IDs
            print(thing, file=fh)
            logging.info("archived %s" % thing)
        elif 'id_str' in thing:
            # tweets and users
            if (args.format == "json"):
                print(json.dumps(thing), file=fh)
            elif (args.format == "csv"):
                csv_writer.writerow(get_row(thing))
            logging.info("archived %s", thing['id_str'])
        elif 'woeid' in thing:
            # places
            print(json.dumps(thing), file=fh)
        elif 'tweet_volume' in thing:
            # trends
            print(json.dumps(thing), file=fh)
        elif 'limit' in thing:
            # rate limits
            t = datetime.datetime.utcfromtimestamp(
                float(thing['limit']['timestamp_ms']) / 1000)
            t = t.isoformat("T") + "Z"
            logging.warning("%s tweets undelivered at %s",
                            thing['limit']['track'], t)
            if args.warnings:
                print(json.dumps(thing), file=fh)
        elif 'warning' in thing:
            # other warnings
            logging.warning(thing['warning']['message'])
            if args.warnings:
                print(json.dumps(thing), file=fh)
Example #8
import csv
import sys

from twarc import Twarc, json2csv

def main(warc_file):
    twitter = Twarc()
    out = csv.writer(sys.stdout)
    out.writerow(json2csv.get_headings())
    # tweet_ids() is defined elsewhere in this script; it yields the tweet ids found in the WARC file
    for tweet in twitter.hydrate(tweet_ids(warc_file)):
        out.writerow(json2csv.get_row(tweet))
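
A minimal wiring sketch (not in the original): running the script with the WARC path as its first argument.

if __name__ == '__main__':
    main(sys.argv[1])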