예제 #1
0
def authenticate_tweepy():
    """Authenticate against the Twitter API and publish the client globally.

    Consumer and access credentials are pulled from the ``credentials``
    helper module; the resulting ``tweepy.API`` client is bound to the
    module-level ``twitter`` name so other functions can reuse it.
    """
    global twitter
    consumer_key = credentials.get_consumer_key()
    consumer_secret = credentials.get_consumer_secret()
    handler = tweepy.OAuthHandler(consumer_key, consumer_secret)
    handler.set_access_token(credentials.get_access_token(),
                             credentials.get_access_secret())
    twitter = tweepy.API(handler)
예제 #2
0
# Spark + S3 setup example using the (legacy) s3n connector.
#
# FIX: ``findspark`` must be imported before ``findspark.init()`` is
# called; the original example invoked init() without the import, which
# raises NameError when this snippet is run on its own.
import findspark

findspark.init()

import util
import credentials

import pyspark

# Local Spark context using all available cores.
sc = pyspark.SparkContext('local[*]')

from pyspark.sql import SparkSession
import configparser

# NOTE(review): ``config`` is created but never used in this snippet;
# kept because later code outside this view may rely on it — confirm.
config = configparser.ConfigParser()
access_id = credentials.get_access_key()
access_key = credentials.get_access_secret()

# Wire the s3n filesystem into the JVM-side Hadoop configuration so Spark
# can read/write S3 buckets in eu-central-1 with the fetched credentials.
hadoop_conf = sc._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3n.impl",
                "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
hadoop_conf.set("fs.s3n.awsAccessKeyId", access_id)
hadoop_conf.set("fs.s3n.awsSecretAccessKey", access_key)
hadoop_conf.set("fs.s3n.endpoint", "s3.eu-central-1.amazonaws.com")
예제 #3
0
# Spark + S3 setup example using the s3a connector.
import os
#os.environ['PYSPARK_SUBMIT_ARGS'] = "--packages=org.apache.hadoop:hadoop-aws:2.7.3 pyspark-shell"

# Ask spark-submit to fetch the AWS SDK and hadoop-aws jars.  Set here,
# before findspark/pyspark are imported, so the JVM sees it at startup.
os.environ[
    'PYSPARK_SUBMIT_ARGS'] = "--packages=com.amazonaws:aws-java-sdk:1.7.4,org.apache.hadoop:hadoop-aws:2.7.3 pyspark-shell"

import findspark
findspark.init()

import util
import credentials

import pyspark

# AWS credentials come from the project's credentials helper (Vault-backed
# elsewhere in this file), never hard-coded.
AWS_ACCESS_KEY = credentials.get_access_key()
AWS_SECRET_KEY = credentials.get_access_secret()

# SparkConf for a local run with the s3a filesystem wired up.
# NOTE(review): both the fs.s3a.access.key / fs.s3a.secret.key pair and the
# older awsAccessKeyId / awsSecretAccessKey pair are set — presumably
# belt-and-braces for different hadoop-aws versions; confirm which pair the
# pinned hadoop-aws 2.7.3 actually reads before removing either.
conf = (pyspark.SparkConf().setAppName('test').setMaster('local[*]').set(
    "fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"
).set('fs.s3a.access.key',
      AWS_ACCESS_KEY).set('fs.s3a.secret.key', AWS_SECRET_KEY).set(
          "fs.s3a.awsAccessKeyId",
          AWS_ACCESS_KEY).set("fs.s3a.awsSecretAccessKey", AWS_SECRET_KEY).set(
              'fs.s3a.endpoint', "s3.eu-central-1.amazonaws.com").set(
                  'com.amazonaws.services.s3a.enableV4', "true").set(
                      "fs.s3a.aws.credentials.provider",
                      "org.apache.hadoop.fs.s3a.BasicAWSCredentialsProvider"))

sc = pyspark.SparkContext(conf=conf)

# Smoke test: read a JSON file straight from S3 via the s3a scheme.
s3File = sc.textFile("s3a://welcome12345/student.json")
            }
            tweet_list.append(entry)
        print("...tweets fetched")
        utils.write_to_json(json_file_name, tweet_list)
    except tweepy.TweepError as e:
        raise HistoricTweetException(str(e))


if __name__ == "__main__":
    # CLI entry point: <keyword> <number_of_tweets> <json_file_name>
    # Guard against missing arguments instead of dying with IndexError.
    if len(sys.argv) < 4:
        raise HistoricTweetException(
            "Usage: <keyword> <num_of_tweets> <json_file_name>")
    # sys.argv[1] is already a str — the original "".join() was a no-op.
    arg_keyword = sys.argv[1]
    arg_num_of_tweets = int(sys.argv[2])
    arg_json_file_name = sys.argv[3]
    # Get credentials from the Vault-backed helper.
    try:
        consumer_key = credentials.get_consumer_key()
        consumer_secret = credentials.get_consumer_secret()
        access_token = credentials.get_access_token()
        access_token_secret = credentials.get_access_secret()
    except credentials.VaultException as error:
        # Chain the cause so the Vault failure stays in the traceback.
        raise HistoricTweetException("Vault Exception: " + str(error)) from error
    # Set up Twitter OAuth and build the API client.
    try:
        auth = OAuthHandler(consumer_key, consumer_secret)
        auth.set_access_token(access_token, access_token_secret)
        api = API(auth)
    except TweepError as err:
        # Wrap in the module's own exception type, consistent with the
        # Vault handling above (the original re-raised a bare TweepError,
        # dropping the module convention and the exception chain).
        raise HistoricTweetException("Authentication Failed: " + str(err)) from err

    _get_historic_tweets(api, arg_keyword, arg_json_file_name,
                         arg_num_of_tweets)