def ingestMongoEvents():
    """Ingest every event document from the MongoDB event database into MySQL.

    MongoDB connection details are read from
    ``APP_CONFIG['MongoDB-Event-Database']``.  Every known collection is
    walked and each document is handed to the per-source ingest function,
    which performs the MySQL inserts through the shared cursor/connection.

    Side effects: prints progress per collection, writes rows via the
    per-source ingest modules, closes the MySQL cursor and connection on
    exit.
    """
    # Retrieve MongoDB info from the config file.
    mongoConnection = APP_CONFIG['MongoDB-Event-Database']['address']
    databaseName = APP_CONFIG['MongoDB-Event-Database']['name']

    # MySQL cursor/connection shared by all per-source ingest functions.
    connection = databaseConnection()
    cursor = connection.cursor()

    # Connect to MongoDB and reference the event database.
    myclient = pymongo.MongoClient(mongoConnection)
    database = myclient[databaseName]

    # Collection name -> ingest function.  Replaces the original 13-way
    # if/elif chain, whose branches were identical except for these pairs.
    # NOTE(review): keys must match the collection names written by the
    # harvester -- confirm against that code.
    ingesters = {
        "Cambia-Lens": cambiaLens.cambiaLensIngest,
        "Crossref": crossref.crossrefIngest,
        "Datacite": datacite.dataciteIngest,
        "F1000": f1000.F1000Ingest,
        "Hypothesis": hypothesis.hypothesisIngest,
        "Newsfeed": newsfeed.newsfeedIngest,
        "Reddit": reddit.redditIngest,
        "Reddit-Links": redditLinks.redditLinksIngest,
        "StackExchange": stackExchange.stackExchangeIngest,
        "Twitter": twitter.twitterIngest,
        "Web": web.webIngest,
        "Wikipedia": wikipedia.wikipediaIngest,
        "WordPressDotCom": wordpress.wordpressIngest,
    }

    try:
        for coll in database.list_collection_names():
            ingest = ingesters.get(coll)
            if ingest is None:
                # Unknown collection: original chain silently skipped it too.
                continue
            # Original printed e.g. "Crossref Ingest!" -- label equals the
            # collection name in every branch.
            print(coll + " Ingest!")
            # For all events in the collection, iterate and ingest them.
            for uniqueEvent in ingesters and database[coll].find({}):
                ingest(uniqueEvent, cursor, connection)
    except Exception as e:
        # Was a bare `except:` that swallowed everything (incl. SystemExit);
        # narrowed and the actual error surfaced.
        print("Ingest failed!")
        print("The error was " + str(e))
    finally:
        # Always release the MySQL resources, success or failure.
        cursor.close()
        connection.close()
def main():
    """Re-ingest every row of paperbuzz ``event_data_json`` into the
    per-source MySQL tables, dispatching on each event's ``source_id``.

    Walks ids 1..MAX(id) inclusive, decodes each row's ``json`` column into
    a dict, and hands it to the matching ingest function.  On an ingest
    error, logs it, closes the connection, and exits the process (same as
    the original).  Prints total elapsed time at the end.
    """
    connection = mysql.connector.connect(user=str(mysql_username),
                                         password=str(mysql_password),
                                         host='127.0.0.1',
                                         database='crossrefeventdatamain')
    # dictionary=True: rows come back as dicts instead of tuples.
    cursor = connection.cursor(dictionary=True)

    # Highest id in the source table bounds the scan below.
    cursor.execute(
        "Select MAX(id) as id FROM paperbuzzeventdata.event_data_json;")
    maxAI = cursor.fetchone()["id"]

    # source_id value -> (label printed, ingest function).  Replaces the
    # original 13-way if/elif chain and the per-key scan of the dict.
    ingesters = {
        "cambia-lens": ("CambiaLens", cambiaLens.cambiaLensIngest),
        "crossref": ("Crossref", crossref.crossrefIngest),
        "datacite": ("Datacite", datacite.dataciteIngest),
        "f1000": ("F1000", f1000.F1000Ingest),
        "hypothesis": ("Hypothesis", hypothesis.hypothesisIngest),
        "newsfeed": ("Newsfeed", newsfeed.newsfeedIngest),
        "reddit": ("Reddit", reddit.redditIngest),
        "reddit-links": ("RedditLinks", redditLinks.redditLinksIngest),
        "stackexchange": ("Stackexchange", stackExchange.stackExchangeIngest),
        "twitter": ("Twitter", twitter.twitterIngest),
        "web": ("Web", web.webIngest),
        "wikipedia": ("Wikipedia", wikipedia.wikipediaIngest),
        "wordpressdotcom": ("Wordpress", wordpress.wordpressIngest),
    }

    # BUG FIX: original `while AI < maxAI` never processed the last row
    # (id == maxAI); range upper bound is now inclusive.
    for AI in range(1, maxAI + 1):
        # Parameterized query -- original concatenated AI into the SQL text.
        cursor.execute(
            "select json from paperbuzzeventdata.event_data_json "
            "where id = %s;", (AI,))
        eventRow = cursor.fetchone()
        if eventRow is None:
            # Robustness: id gaps in the table no longer crash the loop.
            continue

        # The "json" column is bytes; decode into a dict.
        eventBytes = eventRow.get("json")
        eventDict = json.loads(eventBytes.decode('utf-8'))

        entry = ingesters.get(eventDict.get("source_id"))
        if entry is None:
            # Unknown/missing source_id: original chain skipped it too.
            continue
        label, ingest = entry
        print(label)
        try:
            ingest(eventDict, cursor, connection)
        except Exception as e:
            # Original behavior preserved: log, close, exit the process.
            logging.info("Failed Ingest. Failed on file")
            logging.info("The error was " + str(e))
            cursor.close()
            connection.close()
            sys.exit()

    cursor.close()
    connection.close()
    # start_time is a module-level timestamp set elsewhere in the file.
    print("--- %s seconds ---" % (time.time() - start_time))
def main():
    """Walk ``dataDirectory``, load each JSON events file, and ingest every
    event into MySQL based on its ``source_id``.

    Each file is expected to hold Crossref Event Data output: a dict with
    ``message.events`` as a list of event dicts.  On an ingest error, logs
    the failing filename, closes the connection, and exits the process
    (same as the original).  Prints total elapsed time at the end.

    NOTE(review): this is the second ``def main()`` in this file -- it
    silently shadows the one defined above it.  Confirm which entry point
    is actually intended and rename the other.
    """
    restorePoint = ""  # Last filename being ingested; restart point after a crash.
    files = []
    global dataDirectory

    connection = mysql.connector.connect(user=str(mysql_username),
                                         password=str(mysql_password),
                                         host='127.0.0.1',
                                         database='crossrefeventdatamain')
    cursor = connection.cursor()

    # Collect every file under dataDirectory, sorted within each directory.
    for (path, dirnames, filenames) in os.walk(dataDirectory):
        files.extend(os.path.join(path, name) for name in sorted(filenames))

    # source_id value -> (label printed, ingest function).  Replaces the
    # original 13-way if/elif chain and the per-key scan of each event.
    ingesters = {
        "cambia-lens": ("cambia", cambiaLens.cambiaLensIngest),
        "crossref": ("crossref", crossref.crossrefIngest),
        "datacite": ("datacite", datacite.dataciteIngest),
        "f1000": ("F1000", f1000.F1000Ingest),
        "hypothesis": ("hypothesis", hypothesis.hypothesisIngest),
        "newsfeed": ("newsfeed", newsfeed.newsfeedIngest),
        "reddit": ("reddit", reddit.redditIngest),
        "reddit-links": ("redditlinks", redditLinks.redditLinksIngest),
        "stackexchange": ("stackexchange", stackExchange.stackExchangeIngest),
        "twitter": ("twitter", twitter.twitterIngest),
        "web": ("web", web.webIngest),
        "wikipedia": ("wikipedia", wikipedia.wikipediaIngest),
        "wordpressdotcom": ("wordpress.com", wordpress.wordpressIngest),
    }

    for i in files:
        restorePoint = i  # Filename to resume from when rerunning.
        with open(i) as json_file:
            data = json.load(json_file)
        # "message" -> "events" is a LIST of event dicts.
        events = data.get("message").get("events")

        for uniqueEvent in events:
            entry = ingesters.get(uniqueEvent.get("source_id"))
            if entry is None:
                # Unknown/missing source_id: original chain skipped it too.
                continue
            label, ingest = entry
            print(label)
            try:
                ingest(uniqueEvent, cursor, connection)
            except Exception as e:
                # Original behavior preserved: log the file, close, exit.
                logging.info("Failed Ingest. Failed on file" + i)
                logging.info("The error was " + str(e))
                cursor.close()
                connection.close()
                sys.exit()

    cursor.close()
    connection.close()
    # start_time is a module-level timestamp set elsewhere in the file.
    print("--- %s seconds ---" % (time.time() - start_time))