def main():
    """Copy ``test_database.transactions`` into ``test_database.transactions_copy``.

    Creates a Spark context named ``pyspark_test``, loads the source MongoDB
    collection as an RDD and writes it to the copy collection, printing a
    marker before and after.
    """
    print('Start!')

    spark_conf = SparkConf().setAppName("pyspark_test")
    context = SparkContext(conf=spark_conf)

    # Read every document of the source collection, then write them back out
    # under the copy collection's name.
    transactions = context.mongoRDD('mongodb://localhost:27017/test_database.transactions')
    transactions.saveToMongoDB('mongodb://localhost:27017/test_database.transactions_copy')

    print('Completed!')
def main():
    """Create a Spark context and an RDD over the ``db.contextizer`` collection."""
    spark_conf = SparkConf().setAppName("pyspark test")
    context = SparkContext(conf=spark_conf)

    # mongoRDD yields plain documents, not key/value pairs; mongoPairRDD is
    # the method to use when key/value pairs are wanted instead.
    documents = context.mongoRDD('mongodb://localhost:27017/db.contextizer')
def main():
    """Create a Spark context and an RDD over the ``db.contextizer`` collection."""
    spark_conf = SparkConf().setAppName("pyspark test")
    context = SparkContext(conf=spark_conf)

    # mongoRDD yields plain documents, not key/value pairs; mongoPairRDD is
    # the method to use when key/value pairs are wanted instead.
    documents = context.mongoRDD('mongodb://localhost:27017/db.contextizer')
def main(args):
    """Run the f_download -> f_parse -> f_cellIndex pipeline over the URL collection.

    Settings come from ``getConf()``. The location bounds are taken from the
    ``bounded_locs`` setting, or from ``d.getBoundaries`` when that setting is
    the empty string. The URL collection is then read from MongoDB on a local
    Spark context and pushed through the three mapping steps.

    Args:
        args: Not used by this function.

    Raises:
        KeyError: If ``getConf()`` lacks one of the settings read below
            (assuming it returns a plain mapping).
        ValueError: If a numeric setting cannot be converted with ``int``.
    """
    # --- settings -----------------------------------------------------------
    settings = getConf()
    db_host = settings['host']
    db_port = int(settings['port'])
    # `directory` and `phase1_n_threads` are only needed by the disabled
    # link-extraction phase; they are still read so a broken configuration
    # fails here exactly as it did before.
    directory = settings['txt_directory']
    db_name = settings['db_name']
    collection_name_urls = settings['url_collection']
    collection_name_dbstat = settings['dbstat_collection']
    phase1_n_threads = int(settings['geo_indexing_nthread'])
    max_waiting_time = int(settings['max_waiting_time_http'])
    s = int(settings['s'])

    if settings['bounded_locs'] != "":
        bounded_locs = settings['bounded_locs']
        min_loc, max_loc = bounded_locs[0], bounded_locs[1]
    else:
        # No explicit bounds configured: ask d.getBoundaries, handing it the
        # connection settings read above (these must be the local names, the
        # bare host / port / dbstat_collection_name do not exist here).
        min_loc, max_loc = d.getBoundaries(db_host, db_port, db_name, collection_name_dbstat)

    # Link extraction is disabled here; it would be
    # m1.run(db_host, db_port, directory, db_name, collection_name_urls,
    #        collection_name_dbstat, phase1_n_threads)

    # --- Spark context ------------------------------------------------------
    spark_conf = SparkConf()
    spark_conf.setMaster("local")
    spark_conf.setAppName("Test Spark")
    spark_conf.set("spark.executor.memory", "1g")
    sc = SparkContext(conf=spark_conf)

    # --- input: MongoDB URI of the URL collection ---------------------------
    db_conf = "mongodb://" + db_host + ":" + str(db_port) + "/" + db_name + "."
    db_conf_clicks = db_conf + collection_name_urls
    print(db_conf_clicks)

    urlsRDD = sc.mongoRDD(db_conf_clicks)

    # --- pipeline -----------------------------------------------------------
    # collect() is what forces the lazy maps to run; its result is not used.
    (urlsRDD
        .map(lambda x: f_download(x, max_waiting_time))
        .map(lambda x: f_parse(x))
        .map(lambda x: f_cellIndex(x, min_loc, max_loc, s))
        .collect())

    print('\n\n\n\n\n\nFINITO\n\n\n\n\n\n\n')
def main():
    """Exercise pymongo and pymongo_spark against the ``test`` database.

    Steps, in order:
      1. Print every document of ``test.tabla1`` through a plain pymongo client.
      2. Read ``test.tabla1`` as an RDD and save it as ``test.tabla2``.
      3. Load the sample JSON file ``ficheroSalidaAire.txt`` and save it as
         ``test.tabla3``.

    The MongoDB URL prefix is read from ``configuration.cfg`` (section
    ``BatchProperties``, option ``URLMongoDB``).
    """
    spark_conf = SparkConf().setAppName("pyspark test")
    sc = SparkContext(conf=spark_conf)
    sqlContext = SQLContext(sc)

    parser = ConfigParser.ConfigParser()
    parser.read('configuration.cfg')
    mongodb_connection = parser.get('BatchProperties', 'URLMongoDB')

    #######################################################
    # USING THE PYMONGO LIBRARY
    #######################################################
    mongo_client = MongoClient()
    test_db = mongo_client.test
    for document in test_db.tabla1.find():
        print(document)

    #######################################################
    # USING THE pymongo_spark LIBRARY
    #######################################################
    # Read a MongoDB collection (db: test; collection: tabla1)
    table_rdd = sc.mongoRDD(mongodb_connection + 'test.tabla1')

    # Save the RDD just read back to MongoDB (db: test; collection: tabla2)
    table_rdd.saveToMongoDB(mongodb_connection + 'test.tabla2')

    # Resolve the project root: two directory levels above this file,
    # e.g. /Users/akash/PycharmProjects/masterbigdata
    project_root = os.path.dirname(os.path.dirname(__file__))

    # Load a sample file
    sample_path = os.path.join(project_root + '/datasets/batch/air', 'ficheroSalidaAire.txt')
    sample_data = sqlContext.jsonFile(sample_path)

    # Store the file contents in MongoDB (db: test; collection: tabla3)
    sample_data.saveToMongoDB(mongodb_connection + 'test.tabla3')
def main():
    """Load events from MongoDB and materialise them as Hive tables.

    Documents are read with ``mongoRDD``, stripped of ``_id``, serialised to
    ASCII JSON lines and registered as the temp table ``events_temp``. The
    tables ``events`` and ``clean_table`` are then dropped (if present) and
    recreated from it.
    """
    def _to_json_line(document):
        # Drop the '_id' field, dump to JSON, force ASCII (replacing anything
        # that does not fit), then remove every literal backslash-n sequence.
        without_id = {key: document[key] for key in document if key != '_id'}
        encoded = json.dumps(without_id, ensure_ascii=False).encode('ascii', 'replace')
        return encoded.replace("\\n", "")

    spark_conf = SparkConf().setAppName("transform")
    context = SparkContext(conf=spark_conf)
    hive = HiveContext(context)

    # The password is URL-quoted before being embedded in the connection URI.
    mongo_uri = "mongodb://{mongo_user}:{mongo_pass}@{mongo_host}:{mongo_port}/{mongo_db}.{mongo_collection}".format(
        mongo_user=MONGO_USER,
        mongo_pass=urllib.quote_plus(MONGO_PASSWORD),
        mongo_host=MONGO_HOST,
        mongo_port=MONGO_PORT,
        mongo_db=DB_NAME,
        mongo_collection=COLLECTION_NAME)

    json_lines = context.mongoRDD(mongo_uri).map(_to_json_line)

    events_frame = hive.jsonRDD(json_lines)
    events_frame.registerTempTable('events_temp')

    # Executed strictly in this order: drop both tables, then rebuild them.
    statements = (
        'DROP TABLE IF EXISTS default.events',
        'DROP TABLE IF EXISTS default.clean_table',
        'CREATE TABLE events AS SELECT * FROM events_temp',
        "CREATE TABLE clean_table AS SELECT description AS event_desc, id AS event_id, yes_rsvp_count, group.category.name AS cat_name, group.category.shortname AS cat_short, group.category.id AS cat_id, group.name AS group_name, group.topics.name AS topic_name, name AS event_name, time AS start_time, utc_offset AS timezone_offset, venue.state AS venue_state, venue.city AS venue_city, venue.zip AS venue_zip, fee.amount AS fee_amt, fee.required AS req_fee FROM events",
    )
    for statement in statements:
        hive.sql(statement)
# Script setup: connect to the local 'disaster' MongoDB database, start Spark
# and build an RDD of (text, created_at) pairs restricted to a single day.
start_time = time.time()
client = MongoClient('localhost', 27017)
utc = pytz.UTC
# datetime.datetime.now().replace(tzinfo=utc)
db = client['disaster']
minuteAnalysisLatest = db['minute']

pymongo_spark.activate()
from pyspark import SparkContext, SparkConf

conf = SparkConf().setAppName("pyspark test")
sc = SparkContext(conf=conf)
rdd = sc.mongoRDD('mongodb://localhost:27017/disaster.analysisData')

#Objective 1: Get the number of times the key words(222) are used for 1 particular day , For every minute.
dayOne = datetime.datetime(2016, 3, 24, 0, 0, 0).replace(tzinfo=utc)
incrementByAMinute = datetime.timedelta(minutes=1)
incrementByADay = datetime.timedelta(days=1)
# datetime.replace returns a new object, so its result has to be bound; a bare
# `dayOneEnd.replace(tzinfo=utc)` statement changes nothing.
dayOneEnd = (dayOne + incrementByADay).replace(tzinfo=utc)

# Keep only documents created strictly inside (dayOne, dayOneEnd). The pair is
# indexed instead of unpacked in the lambda signature, because tuple
# parameters are a Python 2-only syntax.
contentRdd = rdd.map(lambda x: (x['text'], x['created_at'])).filter(
    lambda pair: pair[1] > dayOne and pair[1] < dayOneEnd).persist()
#count = {'bomb': 0, 'violent storm': 0, 'hijacker': 0, 'bombed': 0, 'sunk': 0, 'avalanche': 0, 'debris': 0, 'body bag': 0, 'battle': 0, 'fear': 0, 'weapons': 0, 'catastrophe': 0, 'forest fire': 0, 'ruin': 0, 'buildings burning': 0, 'blaze': 0, 'fatal': 0, 'airplane accident': 0, 'sinking': 0, 'electrocute': 0, 'rescue': 0, 'hostage': 0, 'massacre': 0, 'traumatised': 0, 'trouble': 0, 'screaming': 0, 'suicide bomb': 0, 'annihilated': 0, 'loud bang': 0, 'floods': 0, 'quarantine': 0, 'obliterate': 0, 'cliff fall': 0, 'body bagging': 0, 'snowstorm': 0, 'whirlwind': 0, 'disaster': 0, 'bleeding': 0, 'razed': 0, 'famine': 0, 'armageddon': 0, 'wreck': 0, 'thunder': 0, 'wrecked': 0, 'crush': 0, 'burned': 0, 'sirens': 0, 'explosion': 0, 'screams': 0, 'rescuers': 0, 'bridge collapse': 0, 'survivors': 0, 'fatality': 0, 'earthquake': 0, 'accident': 0, 'flames': 0, 'detonate': 0, 'mass murderer': 0, 'smoke': 0, 'military': 0, 'stretcher': 0, 'blizzard': 0, 'danger': 0, 'bloody': 0, 'panicking': 0, 'drowned': 0, 'eyewitness': 0, 'devastation': 0, 'bush fires': 0, 'army': 0, 'heat wave': 0, 'emergency plan': 0, 'tragedy': 0, 'collided': 0, 
'survive': 0, 'injury': 0, 'riot': 0, 'attacked': 0, 'fire': 0, 'bioterrorism': 0, 'wounds': 0, 'quarantined': 0, 'drown': 0, 'hailstorm': 0, 'casualties': 0, 'mass murder': 0, 'demolish': 0, 'collision': 0, 'pandemonium': 0, 'sandstorm': 0, 'electrocuted': 0, 'landslide': 0, 'flooding': 0, 'mayhem': 0, 'rainstorm': 0, 'demolition': 0, 'blew up': 0, 'hijacking': 0, 'siren': 0, 'terrorist': 0, 'inundated': 0, 'damage': 0, 'lava': 0, 'devastated': 0, 'forest fires': 0, 'outbreak': 0, 'terrorism': 0, 'panic': 0, 'detonation': 0, 'injured': 0, 'deluged': 0, 'windstorm': 0, 'thunderstorm': 0, 'hazard': 0, 'crushed': 0, 'crashed': 0, 'blood': 0, 'buildings on fire': 0, 'destruction': 0, 'deluge': 0, 'weapon': 0, 'sinkhole': 0, 'aftershock': 0, 'ambulance': 0, 'wreckage': 0, 'desolate': 0, 'blown up': 0, 'fatalities': 0, 'injuries': 0, 'bombing': 0, 'structural failure': 0, 'death': 0, 'police': 0, 'destroyed': 0, 'engulfed': 0, 'crash': 0, 'emergency': 0, 'inundation': 0, 'collide': 0, 'blight': 0, 'destroy': 0, 'dust storm': 0, 'mudslide': 0, 'displaced': 0, 'arsonist': 0, 'nuclear reactor': 0, 'blazing': 0, 'lightning': 0, 'explode': 0, 'tsunami': 0, 'burning buildings': 0, 'volcano': 0, 'hijack': 0, 'refugees': 0, 'derailment': 0, 'harm': 0, 'hail': 0, 'bioterror': 0, 'hurricane': 0, 'trauma': 0, 'evacuation': 0, 'cyclone': 0, 'epicentre': 0, 'nuclear disaster': 0, 'hostages': 0, 'obliteration': 0, 'suicide bomber': 0, 'drowning': 0, 'derailed': 0, 'threat': 0, 'apocalypse': 0, 'chemical emergency': 0, 'burning': 0, 'obliterated': 0, 'screamed': 0, 'fire truck': 0, 'seismic': 0, 'wildfire': 0, 'emergency services': 0, 'attack': 0, 'storm': 0, 'catastrophic': 0, 'twister': 0, 'evacuated': 0, 'natural disaster': 0, 'collapse': 0, 'trapped': 0, 'war zone': 0, 'exploded': 0, 'collapsed': 0, 'oil spill': 0, 'evacuate': 0, 'typhoon': 0, 'dead': 0, 'survived': 0, 'first responders': 0, 'keyword': 0, 'radiation emergency': 0, 'annihilation': 0, 'deaths': 0, 'rubble': 0, 
'ablaze': 0, 'meltdown': 0, 'casualty': 0, 'body bags': 0, 'upheaval': 0, 'flood': 0, 'demolished': 0, 'rioting': 0, 'hellfire': 0, 'curfew': 0, 'hazardous': 0, 'tornado': 0, 'desolation': 0, 'flattened': 0, 'drought': 0, 'derail': 0, 'arson': 0, 'rescued': 0, 'suicide bombing': 0, 'wild fires': 0, 'wounded': 0} # for issue in count.keys(): # print issue def getCount(content):
import pytz
import time
from operator import add
from pymongo import MongoClient

# Script: walk forward through 288 consecutive time windows starting at
# 2016-03-24 00:00 UTC. For each window, collect the keys that exceed 8 in
# more than 8 documents and, if there are any, store them in MongoDB.
start_time = time.time()
client = MongoClient('localhost', 27017)
utc = pytz.UTC
db = client['disaster']
# NOTE(review): this handle points at the 'minute' collection and is never
# used below; the inserts go to db.threeHourlyAlert instead. Confirm which
# collection is intended.
threeHourlyAlert = db['minute']

pymongo_spark.activate()
from pyspark import SparkContext, SparkConf

conf = SparkConf().setAppName("pyspark test")
sc = SparkContext(conf=conf)
rdd = sc.mongoRDD('mongodb://localhost:27017/disaster.overAll10MinuteAverage').persist()

dayOne = datetime.datetime(2016, 3, 24, 0, 0, 0).replace(tzinfo=utc)
# NOTE(review): the names say "3 hour" but the step is 2 hours; value kept
# as-is, confirm which one is intended.
incrementBy3Hour = datetime.timedelta(hours=2)

for x in range(288):
    # Adding a timedelta to an aware datetime keeps its tzinfo, so no further
    # .replace(tzinfo=utc) is needed on the window end.
    dayOneIncrementBy3Hour = dayOne + incrementBy3Hour

    # Pairs are indexed instead of unpacked in the lambda signature, because
    # tuple parameters are a Python 2-only syntax.
    output = (
        rdd.filter(lambda doc: doc['date'] >= dayOne and doc['date'] < dayOneIncrementBy3Hour)
        .flatMap(lambda doc: doc['average'].items())
        .filter(lambda kv: kv[1] > 8)        # keep entries whose value exceeds 8
        .map(lambda kv: (kv[0], 1))
        .reduceByKey(add)                    # occurrences per key in this window
        .filter(lambda kv: kv[1] > 8)        # keep keys seen more than 8 times
        .map(lambda kv: kv[0])
        .collect()
    )

    if output:
        result = db.threeHourlyAlert.insert_one({"date": dayOne, "count": output})

    # The next window starts where this one ended.
    dayOne = dayOneIncrementBy3Hour
# Script setup: connect to the local 'disaster' MongoDB database, start Spark
# and build an RDD of (text, created_at) pairs restricted to a single day.
start_time = time.time()
client = MongoClient('localhost', 27017)
utc = pytz.UTC
# datetime.datetime.now().replace(tzinfo=utc)
db = client['disaster']
minuteAnalysisLatest = db['minute']

pymongo_spark.activate()
from pyspark import SparkContext, SparkConf

conf = SparkConf().setAppName("pyspark test")
sc = SparkContext(conf=conf)
rdd = sc.mongoRDD('mongodb://localhost:27017/disaster.analysisData')

#Objective 1: Get the number of times the key words(222) are used for 1 particular day , For every minute.
dayOne = datetime.datetime(2016, 3, 24, 0, 0, 0).replace(tzinfo=utc)
incrementByAMinute = datetime.timedelta(minutes=1)
incrementByADay = datetime.timedelta(days=1)
# datetime.replace returns a new object, so its result has to be bound; a bare
# `dayOneEnd.replace(tzinfo=utc)` statement changes nothing.
dayOneEnd = (dayOne + incrementByADay).replace(tzinfo=utc)

# Keep only documents created strictly inside (dayOne, dayOneEnd). The pair is
# indexed instead of unpacked in the lambda signature, because tuple
# parameters are a Python 2-only syntax.
contentRdd = rdd.map(lambda x: (x['text'], x['created_at'])).filter(
    lambda pair: pair[1] > dayOne and pair[1] < dayOneEnd).persist()
#count = {'bomb': 0, 'violent storm': 0, 'hijacker': 0, 'bombed': 0, 'sunk': 0, 'avalanche': 0, 'debris': 0, 'body bag': 0, 'battle': 0, 'fear': 0, 'weapons': 0, 'catastrophe': 0, 'forest fire': 0, 'ruin': 0, 'buildings burning': 0, 'blaze': 0, 'fatal': 0, 'airplane accident': 0, 'sinking': 0, 'electrocute': 0, 'rescue': 0, 'hostage': 0, 'massacre': 0, 'traumatised': 0, 'trouble': 0, 'screaming': 0, 'suicide bomb': 0, 'annihilated': 0, 'loud bang': 0, 'floods': 0, 'quarantine': 0, 'obliterate': 0, 'cliff fall': 0, 'body bagging': 0, 'snowstorm': 0, 'whirlwind': 0, 'disaster': 0, 'bleeding': 0, 'razed': 0, 'famine': 0, 'armageddon': 0, 'wreck': 0, 'thunder': 0, 'wrecked': 0, 'crush': 0, 'burned': 0, 'sirens': 0, 'explosion': 0, 'screams': 0, 'rescuers': 0, 'bridge collapse': 0, 'survivors': 0, 'fatality': 0, 'earthquake': 0, 'accident': 0, 'flames': 0, 'detonate': 0, 'mass murderer': 0, 'smoke': 0, 'military': 0, 'stretcher': 0, 'blizzard': 0, 'danger': 0, 'bloody': 0, 'panicking': 0, 'drowned': 0, 'eyewitness': 0, 'devastation': 0, 'bush fires': 0, 'army': 0, 'heat wave': 0, 'emergency plan': 0, 'tragedy': 0, 
'collided': 0, 'survive': 0, 'injury': 0, 'riot': 0, 'attacked': 0, 'fire': 0, 'bioterrorism': 0, 'wounds': 0, 'quarantined': 0, 'drown': 0, 'hailstorm': 0, 'casualties': 0, 'mass murder': 0, 'demolish': 0, 'collision': 0, 'pandemonium': 0, 'sandstorm': 0, 'electrocuted': 0, 'landslide': 0, 'flooding': 0, 'mayhem': 0, 'rainstorm': 0, 'demolition': 0, 'blew up': 0, 'hijacking': 0, 'siren': 0, 'terrorist': 0, 'inundated': 0, 'damage': 0, 'lava': 0, 'devastated': 0, 'forest fires': 0, 'outbreak': 0, 'terrorism': 0, 'panic': 0, 'detonation': 0, 'injured': 0, 'deluged': 0, 'windstorm': 0, 'thunderstorm': 0, 'hazard': 0, 'crushed': 0, 'crashed': 0, 'blood': 0, 'buildings on fire': 0, 'destruction': 0, 'deluge': 0, 'weapon': 0, 'sinkhole': 0, 'aftershock': 0, 'ambulance': 0, 'wreckage': 0, 'desolate': 0, 'blown up': 0, 'fatalities': 0, 'injuries': 0, 'bombing': 0, 'structural failure': 0, 'death': 0, 'police': 0, 'destroyed': 0, 'engulfed': 0, 'crash': 0, 'emergency': 0, 'inundation': 0, 'collide': 0, 'blight': 0, 'destroy': 0, 'dust storm': 0, 'mudslide': 0, 'displaced': 0, 'arsonist': 0, 'nuclear reactor': 0, 'blazing': 0, 'lightning': 0, 'explode': 0, 'tsunami': 0, 'burning buildings': 0, 'volcano': 0, 'hijack': 0, 'refugees': 0, 'derailment': 0, 'harm': 0, 'hail': 0, 'bioterror': 0, 'hurricane': 0, 'trauma': 0, 'evacuation': 0, 'cyclone': 0, 'epicentre': 0, 'nuclear disaster': 0, 'hostages': 0, 'obliteration': 0, 'suicide bomber': 0, 'drowning': 0, 'derailed': 0, 'threat': 0, 'apocalypse': 0, 'chemical emergency': 0, 'burning': 0, 'obliterated': 0, 'screamed': 0, 'fire truck': 0, 'seismic': 0, 'wildfire': 0, 'emergency services': 0, 'attack': 0, 'storm': 0, 'catastrophic': 0, 'twister': 0, 'evacuated': 0, 'natural disaster': 0, 'collapse': 0, 'trapped': 0, 'war zone': 0, 'exploded': 0, 'collapsed': 0, 'oil spill': 0, 'evacuate': 0, 'typhoon': 0, 'dead': 0, 'survived': 0, 'first responders': 0, 'keyword': 0, 'radiation emergency': 0, 'annihilation': 0, 'deaths': 0, 
'rubble': 0, 'ablaze': 0, 'meltdown': 0, 'casualty': 0, 'body bags': 0, 'upheaval': 0, 'flood': 0, 'demolished': 0, 'rioting': 0, 'hellfire': 0, 'curfew': 0, 'hazardous': 0, 'tornado': 0, 'desolation': 0, 'flattened': 0, 'drought': 0, 'derail': 0, 'arson': 0, 'rescued': 0, 'suicide bombing': 0, 'wild fires': 0, 'wounded': 0} # for issue in count.keys(): # print issue
def main():
    """Print the first document of the ``estreaming.splash`` MongoDB collection."""
    spark_conf = SparkConf().setAppName("pyspark read")
    context = SparkContext(conf=spark_conf)

    splash_documents = context.mongoRDD('mongodb://localhost:27017/estreaming.splash')
    first_document = splash_documents.first()
    print(first_document)
"#AugmentedReality", "#BigData", "#DevOps" ] hashtags_lowercased = [ "#ai", "#artificialintelligence", "#machinelearning", "#ml", "#deeplearning", "#dl", "#datamining", "#vr", "#virtualreality", "#ar", "#augmentedreality", "#bigdata", "#devops" ] def get_hashtag(x): if x.lower() in hashtags_lowercased: for hashtag in hashtags: if hashtag.lower() == x.lower(): return hashtag # Important: activate pymongo_spark. pymongo_spark.activate() conf = SparkConf().setAppName('SparkBatch').setMaster('local[2]') sc = SparkContext(conf=conf) #Reading mongo_rdd = sc.mongoRDD('mongodb://*****:*****@ds129540.mlab.com:29540/bigdata.tweets')\ .map(lambda x: json.loads(x['value'])['full_text'])\ .flatMap(lambda x: x.split()) \ .filter(lambda x: x.lower() in hashtags_lowercased) \ .map(lambda x: (get_hashtag(x),1)) \ .reduceByKey(lambda x,y:x+y) \ .saveToMongoDB('mongodb://*****:*****@ds129540.mlab.com:29540/bigdata.hashtags')
print doc["_id"] if __name__ == '__main__': pymongo_spark.activate() start_time=datetime.time() conf = (SparkConf() .setAppName("LinkingPipeLine")) sc = SparkContext(conf=conf,pyFiles=['/home/naveen/spark-1.6.0-bin-hadoop2.6/linkPipe/LinkPipeMethods.py', '/home/naveen/spark-1.6.0-bin-hadoop2.6/linkPipe/tagText.py']) #setting up RDD rdd = sc.mongoRDD('mongodb://10.1.1.5:27017/GaugeDB.test_judgments') #Filtering Criteria for RDD filterRDD = rdd.filter(lambda x : True if x["pipefinal"] == 1 else False) #Config paths path1 = "/usr/linkPipModels/CRF-Model-OnlyCodes" path2 = "/usr/linkPipModels/CRF-Model-OnlyTitles" path3 = "/usr/linkPipModels/VectorSpaceTitles_word.p" path4 = "/usr/linkPipModels/VectorSpaceCodes.p" path5 = "/usr/linkPipModels/Tf-IdfOnlytitles.p" path6 = "/usr/linkPipModels/Tf-IdfCitationCodes.p" path7 = "/usr/linkPipModels/TitleClassifier.p" path8 ="/usr/linkPipModels/JournalDictForStep1.p" path9 ="/usr/linkPipModels/JournalDictForStep2.p"