def persistFinData(mongo, mongo_colln, source, json_data, data, dataset_code,
                   description, meta_updated, data_mode, qcode_name):
    """Collects the Quandl JSON response data, inserts it into the Mongo
    collection, and updates CKAN."""
    logging.basicConfig(format='%(asctime)s %(levelname)s '
                               '%(module)s.%(funcName)s :: %(message)s',
                        level=logging.INFO)
    # Fetching arguments from config.
    mongo_uri = mongo_config.get('mongo_uri')
    meta_col_name = mongo_config.get('meta_colln_name')
    try:
        mongo.bulk_mongo_insert(mongo_colln, data)
        if data_mode == "initial" and meta_updated:
            # Metadata collection.
            meta_mongo_colln = mongo.initialize_mongo(meta_col_name)
            meta_feedObj = json_data["dataset"]
            meta_feedObj['_id'] = source + "." + dataset_code.split("/")[1]
            mongo.insert_into_mongo(meta_mongo_colln, meta_feedObj)
            # CKAN.
            refresh_rate = json_data["dataset"]["frequency"]
            insert_into_ckan(mongo_uri, source, qcode_name, description,
                             refresh_rate)
    except Exception:
        # Log first, then re-raise: the original `raise logging.error(...)`
        # re-raised before the call, so the message was never logged.
        logging.error("Error while inserting Quandl data into Mongo.")
        raise
    return meta_updated
def __init__(self):
    mongo_uri = mongo_config.get('mongo_uri')
    ssl_required = mongo_config.get('ssl_required')
    requires_auth = mongo_config.get('requires_auth')
    mongo_username = mongo_config.get('mongo_username')
    mongo_password = mongo_config.get('mongo_password')
    mongo_auth_source = mongo_config.get('mongo_auth_source')
    mongo_auth_mech = mongo_config.get('mongo_auth_mechanism')
    db_name = mongo_config.get('db_name')
    self.mongo_index_name = mongo_config.get('mongo_index_name')
    try:
        # Instantiating MongoClient.
        client = MongoClient(mongo_uri, ssl=ssl_required,
                             replicaSet='Cluster0-shard-0')
        # Authenticate to MongoDB (optional).
        if requires_auth == 'true':
            client.the_database.authenticate(mongo_username,
                                             mongo_password,
                                             source=mongo_auth_source,
                                             mechanism=mongo_auth_mech)
        self.mongo_inst = client[db_name]
    except IOError:
        # Log first, then re-raise; the original raised the None return
        # value of logging.error() instead of the caught exception.
        logging.error("Could not connect to Mongo Server")
        raise
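# The functions in this module read their connection settings from a shared
# `mongo_config` mapping. A minimal sketch of the keys they expect, with
# illustrative placeholder values only (the real values live in the
# project's config, which is not shown here):
mongo_config = {
    'mongo_uri': 'mongodb://localhost:27017',  # placeholder URI
    'ssl_required': True,
    'requires_auth': 'true',                   # compared as a string above
    'mongo_username': 'user',                  # placeholder credentials
    'mongo_password': 'secret',
    'mongo_auth_source': 'admin',
    'mongo_auth_mechanism': 'SCRAM-SHA-1',
    'db_name': 'reviews_db',                   # hypothetical database name
    'mongo_index_name': 'checksum',            # hypothetical index field
}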
def model_input():
    input_list = []
    source = mongo_config.get('col_name')  # Constant: Mongo collection name.
    mongo_colln = initialize_mongo(source)
    try:
        logging.info("Building dataframe of input.")
        for documents in mongo_colln.find():
            # Only pick up reviews extracted today.
            if documents["Extracted_Date"] == str(datetime.today().date()):
                input_list.append([
                    documents['checksum'],
                    str(documents['Review_Text']).replace("\n", ''),
                    documents['Restaurent_id/Hotel_id'],
                    documents['Country'],
                    documents['Restaurent_name/Hotel_name'],
                    documents['User_Name'],
                    documents['Rating'],
                    documents['Source'],
                    documents['Rating_text'],
                    documents['Posted_date'],
                    documents['User_id'],
                    documents['City'],
                    documents['Area'],
                    documents['User_Age'],
                    documents['User_Location'],
                    documents['User_Gender'],
                    documents['Max_Rating'],
                ])
    except Exception:
        logging.error("Building the input dataframe raised an error.")
    input_dataframe = pd.DataFrame(
        input_list,
        columns=[
            'Review_id', 'Review_Text', 'Restaurent_id/Hotel_id', 'Country',
            'Restaurent_name/Hotel_name', 'User_Name', 'Rating', 'Source',
            'Rating_text', 'Posted_date', 'User_id', 'City', 'Area',
            'User_Age', 'User_Location', 'User_Gender', 'Max_Rating'
        ])
    return input_dataframe
def load_sentiments_to_mongo():
    logging.basicConfig(filename='ABSA_uptd-logs.log',
                        filemode='w',
                        format='%(asctime)s %(levelname)s '
                               '%(module)s.%(funcName)s :: %(message)s',
                        datefmt='%d:%m:%Y %H:%M:%S',
                        level=logging.DEBUG)
    try:
        logging.info("Fetching Input Reviews from Database.")
        source = mongo_config.get('col_name1')
        mongo_colln = initialize_mongo(source)
        input_dataframe = model_input()
        logging.info("Creating Output Object Structure.")
        logging.info("Getting Aspects and Sentiments for Review.")
        logging.info("Triggering Model for aspects and sentiments.")
        for _, sentence in input_dataframe.iterrows():
            json_obj = {}
            json_obj["REVIEW"] = str(sentence["Review_Text"])
            json_obj["REVIEW_ID"] = str(sentence["Review_id"])
            json_obj["Restaurent_id/Hotel_id"] = str(
                sentence["Restaurent_id/Hotel_id"])
            json_obj["Country"] = sentence["Country"]
            json_obj["Restaurent_name/Hotel_name"] = sentence[
                "Restaurent_name/Hotel_name"]
            json_obj["User_Name"] = str(sentence["User_Name"])
            json_obj["Rating"] = str(sentence["Rating"])
            json_obj["Source"] = sentence["Source"]
            json_obj["Rating_text"] = sentence["Rating_text"]
            json_obj["Posted_date"] = str(sentence["Posted_date"])
            json_obj["User_id"] = str(sentence["User_id"])
            json_obj["City"] = sentence["City"]
            json_obj["Area"] = sentence["Area"]
            json_obj["User_Age"] = str(sentence["User_Age"])
            json_obj["User_Location"] = sentence["User_Location"]
            json_obj["User_Gender"] = sentence["User_Gender"]
            json_obj["Max_Rating"] = str(sentence["Max_Rating"])
            try:
                # Each polarity is a dict mapping an aspect to its sentiment.
                # Copying the items directly replaces the original chain of
                # str(...).replace(...) calls that scrubbed punctuation out of
                # the dict_keys()/dict_values() reprs (and mangled multi-key
                # dicts into a single concatenated key).
                for sentiment in get_polarities(str(sentence["Review_Text"])):
                    for aspect, polarity in sentiment.items():
                        json_obj[str(aspect)] = str(polarity)
            except Exception:
                logging.error(
                    "Issue in loading sentiment to Database Collection")
                continue
            mongo_id = hashlib.md5(str(json_obj).encode('utf-8')).hexdigest()
            update_mongo_collection(mongo_colln, mongo_id, json_obj)
    except Exception:
        logging.error(
            "Fetching input from Mongo or getting aspects/sentiments failed.")
        raise
def main():
    logging.basicConfig(filename='Reviews-Extractor-logs.log',
                        filemode='w',
                        format='%(asctime)s %(levelname)s '
                               '%(module)s.%(funcName)s :: %(message)s',
                        datefmt='%d:%m:%y %H:%M:%S',
                        level=logging.INFO)
    col_name = mongo_config.get('col_name')
    try:
        logging.info("Starting Hotel Reviews load to Mongo")
        hotel_mongo_load()
    except Exception:
        logging.warning("Issue while loading Hotel Reviews to Mongo")
    logging.info("Starting Restaurant Reviews load to Mongo")
    # Initializing Mongo with the collection name.
    mongo_colln = initialize_mongo(col_name)
    try:
        # Check for the latest Posted_date in Mongo: if the collection has
        # data, extract incrementally from the newest document; otherwise
        # start from the beginning with obj = None. This folds together the
        # two near-identical branches of the original if/elif.
        obj = None
        for obj in mongo_colln.find().sort("Posted_date", -1).limit(1):
            logging.info("Found latest date in Database; "
                         "inserting incrementally.")
        if obj is None:
            logging.info("No latest date found; collection is empty. "
                         "Inserting from the beginning.")
        logging.info("Calling source functions for extraction")
        scrapers = [
            google_reviews(obj),
            zomato_reviews(obj),
            citysearch_reviews(obj),
            twitter_reviews(obj)
        ]
        for scraper in scrapers:
            logging.info("Adding data to database collections.")
            for each in scraper:
                update_mongo_collection(mongo_colln, each["checksum"], each)
    except Exception:
        logging.error("Restaurant source extractions threw an error.")
def saveCodesInMongo(qcode_name):
    quandl_codes_colln_name = mongo_config.get('quandl_codes_colln_name')
    q_data_base_URL = "https://www.quandl.com/api/v3/datasets/{0}"
    filenamesList = []
    for (dirpath, dirnames, filenames) in walk(DEFAULT_DATA_PATH):
        filenamesList.extend(filenames)
    qcodes_colln = mongo.initialize_mongo(quandl_codes_colln_name)
    for fn in filenamesList:
        # Resolve the path before the try block so the finally clause can
        # always remove the file.
        codesFile = os.path.abspath(os.path.join(DEFAULT_DATA_PATH, fn))
        try:
            dataset_qcodes = []
            logging.info(fn + " extracted.")
            dataset = fn.replace('-datasets-codes.csv', '')
            qcode_cursor = qcodes_colln.find_one({'dataset': dataset})
            # Parse the codes file unconditionally. The original only parsed
            # when the dataset was new, which made the update branch below a
            # no-op on an empty list.
            with open(codesFile, 'r') as csv_file:
                csvlines = csv_file.readlines()
            for line in csvlines:
                codeline = line.split(',')
                if len(codeline) > 1:
                    dataset_code = codeline[0]
                    dataset_descrpn = codeline[1]
                    created_time = datetime.now().strftime("%Y-%m-%d")
                    code_doc = {
                        "dataset": dataset,
                        "dataset_code": dataset_code,
                        "description": dataset_descrpn,
                        "base_url": q_data_base_URL.format(dataset_code),
                        "created_time": created_time,
                        "name": qcode_name,
                        "_id": dataset_code,
                    }
                    dataset_qcodes.append(code_doc)
            if qcode_cursor:
                # Dataset already known: refresh the stored codes.
                mongo.bulk_mongo_update(qcodes_colln, dataset_qcodes)
            else:
                mongo.bulk_mongo_insert(qcodes_colln, dataset_qcodes)
        except Exception:
            # Log and move on to the next file. The original re-raised and
            # then fell through to an unreachable print/continue.
            logging.error("Error while saving codes for %s.", fn)
            continue
        finally:
            os.remove(codesFile)
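# For reference: the saver above assumes each "*-datasets-codes.csv" file
# holds one "code,description" record per line, with Quandl codes of the
# form DATABASE/DATASET. An illustrative (not real) line:
#
#   WIKI/AAPL,Apple Inc. (AAPL) Stock Prices
#
# Note that line.split(',') keeps only the text before the second comma as
# the description, so descriptions that themselves contain commas get
# truncated.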
def main():
    logging.basicConfig(format='%(asctime)s %(levelname)s '
                               '%(module)s.%(funcName)s :: %(message)s',
                        level=logging.INFO)
    source = mongo_config.get('col_name')
    mongo_colln = initialize_mongo(source)
    archive_url = argument_config.get('stl_batting')
    a_team_list = []
    peoria_chiefs_batting = argument_config.get('peoria_chiefs_batting')
    springfield_cardinals_batting = argument_config.get(
        'springfield_cardinals_batting')
    memphis_redbirds_batting = argument_config.get('memphis_redbirds_batting')
    minor_url_list = [
        peoria_chiefs_batting, springfield_cardinals_batting,
        memphis_redbirds_batting
    ]
    try:
        logging.info("Starting Data Extraction for St Louis Cardinals Batters")
        stats_empty_list = []
        stat_list = stl_batting_stats(archive_url, stats_empty_list)
        stat_objects = json_object_building(stat_list)
        logging.info("Loading Data To Mongo")
        for obj in stat_objects:
            mongo_id = obj["NAME"] + "-" + obj["TEAM"]
            update_mongo_collection(mongo_colln, mongo_id, obj)
        logging.info("Starting Data Extraction for St Louis Cardinals "
                     "Minor League Batters")
        for url in minor_url_list:
            if url == peoria_chiefs_batting:
                team = "PEORIA CHIEFS"
            elif url == springfield_cardinals_batting:
                team = "SPRINGFIELD CARDINALS"
            elif url == memphis_redbirds_batting:
                # The original wrote `team == "MEMPHIS REDBIRDS"`, comparing
                # instead of assigning and leaving `team` stale for this URL.
                team = "MEMPHIS REDBIRDS"
            data_list = a_team_batting_stats(a_team_list, url)
            stat_objects = minor_json_object_building(data_list, team)
            logging.info("Loading Data To Mongo")
            for obj in stat_objects:
                mongo_id = obj["NAME"] + "-" + obj["TEAM"]
                update_mongo_collection(mongo_colln, mongo_id, obj)
    except Exception:
        logging.error("Error occurred while scraping and loading; "
                      "re-raising to surface the exact error.")
        raise
def sentiment_generator(input_dataframe):
    try:
        dataset_list = []
        logging.info("Getting Sentiments of Loaded Reviews.")
        source2 = mongo_config.get("col_name3")
        mongo_colln2 = initialize_mongo(source2)
        logging.info("Loading Sentiments to Mongo")
        for index, i in input_dataframe.iterrows():
            review_dict = {}
            review_dict['REVIEW_ID'] = str(i['Review_id'])
            review_dict['Area'] = str(i['Area'])
            review_dict['City'] = str(i['City'])
            review_dict['Country'] = str(i['Country'])
            review_dict['Posted_date'] = str(i['Posted_date'])
            review_dict['REVIEW'] = str(i['Review_Text'])
            review_dict['Rating'] = str(i['Rating'])
            review_dict['Restaurent_id/Hotel_id'] = str(
                i['Restaurent_id/Hotel_id'])
            review_dict['Restaurent_name/Hotel_name'] = str(
                i['Restaurent_name/Hotel_name'])
            review_dict['Source'] = i['Source']
            review_dict['User_Name'] = str(i['User_Name'])
            review_dict['User_id'] = str(i['User_id'])
            review_dict['User_Gender'] = str(i['User_Gender'])
            review_dict['User_Age'] = str(i['User_Age'])
            review_dict['Max_Rating'] = str(i['Max_Rating'])
            review_dict['User_Location'] = str(i['User_Location'])
            try:
                sentiments, sentences = flask_entry(i["Review_Text"])
                # Copy aspect/polarity pairs directly instead of scrubbing
                # punctuation out of the dict_keys()/dict_values() reprs.
                for sentiment in sentiments:
                    for aspect, polarity in sentiment.items():
                        review_dict[str(aspect)] = str(polarity)
            except Exception:
                continue
            review_dict['_id'] = hashlib.md5(
                json.dumps(review_dict,
                           sort_keys=True).encode('utf8')).hexdigest()
            dataset_list.append(review_dict)
        bulk_mongo_update(mongo_colln2, dataset_list)
    except Exception:
        logging.error(
            "Error in getting sentiments of reviews; unable to load to Mongo.")
def make_mongo_connection(collection_name):
    """Establishes a connection to MongoDB with the configured credentials."""
    # Fetching config parameters.
    mongo_uri = mongo_config.get('mongo_uri')
    requires_auth = mongo_config.get('requires_auth')
    mongo_username = mongo_config.get('mongo_username')
    mongo_password = mongo_config.get('mongo_password')
    mongo_auth_source = mongo_config.get('mongo_auth_source')
    mongo_auth_mechanism = mongo_config.get('mongo_auth_mechanism')
    db_name = mongo_config.get('db_name')
    ssl_required = mongo_config.get('ssl_required')
    replicaSet = mongo_config.get('replicaSet')
    client = MongoClient(mongo_uri, ssl=ssl_required, replicaSet=replicaSet)
    if requires_auth == 'true':
        client.the_database.authenticate(mongo_username,
                                         mongo_password,
                                         source=mongo_auth_source,
                                         mechanism=mongo_auth_mechanism)
    db = client[db_name]
    mongo_colln = db[collection_name]
    # Insert and delete a throwaway document keyed by a unique uuid so the
    # collection exists before any index creation; creating an index on a
    # collection that does not exist yet can otherwise fail.
    test_uuid = str(uuid1())
    try:
        mongo_colln.insert_one({'uuid': test_uuid})
        mongo_colln.delete_one({'uuid': test_uuid})
    except Exception:
        logging.debug("Collection %s already exists" % collection_name)
    return mongo_colln
def initialize_mongo(source):
    """Initializes the MongoDB connection and returns the collection for the
    given index."""
    # Fetching config parameters.
    mongo_index_name = mongo_config.get('mongo_index_name')
    try:
        # Creating the Mongo collection.
        mongo_colln = make_mongo_connection(source)
        # Create the index if it is not already present. Note that
        # index_information() keys are index names (e.g. "field_1"), not
        # field names, so this check is conservative; create_index is a
        # no-op for an index that already exists.
        if mongo_index_name not in mongo_colln.index_information():
            mongo_colln.create_index(mongo_index_name, unique=False)
        return mongo_colln
    except IOError:
        # Re-raise so callers do not receive an unbound name, as the
        # original `return mongo_colln` after a failed connection would.
        logging.error("Could not connect to Mongo Server")
        raise
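# A minimal usage sketch for the two helpers above. The collection name and
# document fields here are illustrative placeholders, not values from the
# project's config:
if __name__ == '__main__':
    colln = initialize_mongo('reviews')  # hypothetical collection name
    colln.insert_one({'checksum': 'abc123',  # hypothetical document
                      'Review_Text': 'Great food.'})
    print(colln.find_one({'checksum': 'abc123'}))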
def make_mongo_connection(collection_name):
    """Establishes a connection to MongoDB with the configured credentials."""
    # Fetching config parameters.
    mongo_uri = mongo_config.get('mongo_uri')
    requires_auth = mongo_config.get('requires_auth')
    mongo_username = mongo_config.get('mongo_username')
    mongo_password = mongo_config.get('mongo_password')
    mongo_auth_source = mongo_config.get('mongo_auth_source')
    mongo_auth_mechanism = mongo_config.get('mongo_auth_mechanism')
    db_name = mongo_config.get('db_name')
    ssl_required = mongo_config.get('ssl_required')
    client = MongoClient(mongo_uri,
                         ssl=ssl_required,
                         replicaSet='MLBStats-shard-0',
                         connect=False)
    if requires_auth == 'true':
        client.the_database.authenticate(mongo_username,
                                         mongo_password,
                                         source=mongo_auth_source,
                                         mechanism=mongo_auth_mechanism)
    db = client[db_name]
    mongo_colln = db[collection_name]
    # Insert and delete a throwaway document keyed by a unique uuid so the
    # collection exists before any index creation; creating an index on a
    # collection that does not exist yet can otherwise fail.
    test_uuid = str(uuid1())
    try:
        mongo_colln.insert_one({'uuid': test_uuid})
        mongo_colln.delete_one({'uuid': test_uuid})
    except Exception:
        logging.debug("Collection %s already exists" % collection_name)
    return mongo_colln
def hotel_mongo_load():
    col_name = mongo_config.get('col_name_hotel')
    # Initializing Mongo with the collection name.
    mongo_colln = initialize_mongo(col_name)
    try:
        # Check for the latest Posted_date in Mongo: if the collection has
        # data, extract incrementally from the newest document; otherwise
        # start from the beginning with obj = None. This folds together the
        # two near-identical branches of the original if/elif.
        obj = None
        for obj in mongo_colln.find().sort("Posted_date", -1).limit(1):
            logging.info("Found latest date in Database; "
                         "inserting incrementally.")
        if obj is None:
            logging.info("No latest date found; collection is empty. "
                         "Inserting from the beginning.")
        logging.info("Calling source functions for extraction")
        scrapers = [
            google_reviews_hotels(obj),
            twitter_reviews_hotels(obj)
        ]
        for scraper in scrapers:
            logging.info("Adding data to database collections.")
            for each in scraper:
                update_mongo_collection(mongo_colln, each["checksum"], each)
    except Exception:
        logging.error("Hotel source extractions threw an error.")
def main():
    db_name = mongo_config.get('db_name')
    mongodb = mongodbConnector()
    colln_names = mongodb.get_collection_names()
    # The original passed the string literal '__file__' to os.path.dirname,
    # which always yields '' and silently fell back to the current directory.
    root_folder = os.path.abspath(
        os.path.join(os.path.dirname(os.path.abspath(__file__)), db_name))
    if not os.path.isdir(root_folder):
        os.mkdir(root_folder)
    for colln in colln_names:
        os.chdir(root_folder)
        mongo_colln = mongodb.initialize_mongo(colln)
        doc_count = mongo_colln.count()
        print(colln + " (" + str(doc_count) + ")")
        if doc_count == 0:
            continue
        # Directory names are capped at 60 characters.
        dir_name = colln[:60]
        if not os.path.isdir(dir_name):
            os.mkdir(dir_name)
        os.chdir(dir_name)
        files_in_dir = os.listdir(os.getcwd())
        if files_in_dir:
            # Already exported; skip this collection.
            print(str(len(files_in_dir)) + " files present")
            continue
        # Dump the collection in chunks of 100,000 documents per file.
        data_list = []
        has_written = False
        j = 0
        for doc in mongo_colln.find():
            data_list.append(doc)
            if len(data_list) == 100000:
                with open(dir_name + '_' + str(j) + '.json',
                          'w') as colln_file:
                    colln_file.write(
                        json.dumps(data_list,
                                   indent=4,
                                   default=json_util.default))
                data_list = []
                has_written = True
                j += 1
        # Write whatever is left over (or everything, for collections that
        # never reached the chunk size).
        if data_list:
            if not has_written:
                out_name = dir_name + '.json'
            else:
                out_name = dir_name + '_' + str(j) + '.json'
            with open(out_name, 'w') as colln_file:
                colln_file.write(
                    json.dumps(data_list,
                               indent=4,
                               default=json_util.default))
def upload_files(df):
    try:
        dataset_list = []
        logging.info("Starting the loading Process.")
        source = mongo_config.get("col_name2")
        mongo_colln = initialize_mongo(source)
        logging.info("Loading input dataset to mongo.")
        for index, i in df.iterrows():
            resp_dict = {}
            resp_dict['Area'] = i["Area"]
            resp_dict['City'] = i["City"]
            resp_dict['Country'] = i["Country"]
            resp_dict['Extracted_Date'] = str(datetime.today().date())
            # Normalize relative posted dates ("3 days", "2 months",
            # "yesterday", "5 hours", ...) to absolute YYYY-MM-DD strings.
            s = str(i["Posted_date"])
            days = ['day', 'days', 'Days', 'Day']
            months = ["months", "month", "Months", "Month"]
            yesterday = ["yesterday", "Yesterday"]
            today = [
                "hours", "hour", "Hours", "Hour", "Minutes", "Minute",
                "minutes", "minute", "mins", "min", "secs", "Seconds",
                "Second", "sec", "Hrs", "hrs", "Today", "today", "seconds",
                "second"
            ]
            if any(x in s for x in days) and s not in (
                    'yesterday', 'Yesterday', 'Today', 'today'):
                parsed_s = s.split()[:1]
                past_time = datetime.today() - timedelta(days=int(parsed_s[0]))
                resp_dict['Posted_date'] = str(past_time)[:10]
            elif any(x in s for x in months):
                parsed_s = s.split()[:1]
                past_time = datetime.today() - relativedelta(
                    months=int(parsed_s[0]))
                resp_dict['Posted_date'] = str(past_time)[:10]
            elif any(x in s for x in yesterday):
                past_time = datetime.today() - timedelta(days=1)
                resp_dict['Posted_date'] = str(past_time)[:10]
            elif any(x in s for x in today):
                resp_dict['Posted_date'] = str(datetime.today().date())[:10]
            else:
                resp_dict['Posted_date'] = str(s)
            resp_dict['Rating'] = str(i["Rating"])
            resp_dict['Rating_text'] = str(i["Rating_text"])
            resp_dict['Restaurent_id/Hotel_id'] = str(
                i["Restaurent_id/Hotel_id"])
            resp_dict['Restaurent_name/Hotel_name'] = str(
                i["Restaurent_name/Hotel_name"])
            if str(i["Review_Text"]) == "":
                # Skip rows without review text; the original `break` also
                # silently dropped every remaining row in the file.
                continue
            resp_dict['Review_Text'] = str(i["Review_Text"])
            resp_dict['Source'] = "file_upload"
            resp_dict['User_Name'] = str(i["User_Name"])
            resp_dict['User_id'] = str(i["User_id"])
            resp_dict['User_Gender'] = str(i["User_Gender"])
            resp_dict['User_Age'] = str(i["User_Age"])
            resp_dict['Max_Rating'] = str(i["Max_Rating"])
            resp_dict['User_Location'] = str(i["User_Location"])
            resp_dict['checksum'] = resp_dict['_id'] = hashlib.md5(
                json.dumps(resp_dict,
                           sort_keys=True).encode('utf8')).hexdigest()
            dataset_list.append(resp_dict)
        bulk_mongo_update(mongo_colln, dataset_list)
    except Exception:
        flash("Error while loading Dataset to mongo: invalid data in the "
              "input file or an empty Review. Please check the schema for "
              "the input file.")
        logging.error("Error while loading Dataset to mongo: invalid data "
                      "in the input file or an empty Review.")
        return render_template('UploadFiles.html')
    try:
        input_list = []
        for documents in mongo_colln.find():
            # Only pick up documents uploaded today from a file. The original
            # checked resp_dict['Source'] (the last row processed above)
            # rather than the document being read back.
            if documents["Extracted_Date"] == str(
                    datetime.today().date()) and documents[
                        'Source'] == "file_upload":
                input_list.append([
                    documents['checksum'],
                    str(documents['Review_Text']).replace("\n", ''),
                    documents['Restaurent_id/Hotel_id'],
                    documents['Country'],
                    documents['Restaurent_name/Hotel_name'],
                    documents['User_Name'], documents['Rating'],
                    documents['Source'], documents['Rating_text'],
                    documents['Posted_date'], documents['User_id'],
                    documents['City'], documents['Area'],
                    documents['User_Gender'], documents['User_Age'],
                    documents['Max_Rating'], documents['User_Location']
                ])
    except Exception:
        flash("Error while creating Dataframe for Sentiment.")
        logging.error("Error while creating Dataframe for Sentiment.")
        return render_template('UploadFiles.html')
    df1 = pd.DataFrame(input_list,
                       columns=[
                           'Review_id', 'Review_Text',
                           'Restaurent_id/Hotel_id', 'Country',
                           'Restaurent_name/Hotel_name', 'User_Name',
                           'Rating', 'Source', 'Rating_text', 'Posted_date',
                           'User_id', 'City', 'Area', 'User_Gender',
                           'User_Age', 'Max_Rating', 'User_Location'
                       ])
    input_dataframe = df1.replace(np.nan, '', regex=True)
    flash("File uploaded successfully; getting sentiments for the input "
          "dataset...")
    return input_dataframe
def main(): """Initiates the Financial news extraction from Quandl using API calls.""" t1 = time.time() logging.basicConfig(format='%(asctime)s %(levelname)s \ %(module)s.%(funcName)s :: %(message)s', level=logging.INFO) # fetching arguments from config. quandl_apikey = argument_config.get('quandl_apikey') meta_col_name = mongo_config.get('meta_colln_name') quandl_codes_colln_name = mongo_config.get('quandl_codes_colln_name') qcodes_colln = mongo.initialize_mongo(quandl_codes_colln_name) meta_mongo_colln = mongo.initialize_mongo(meta_col_name) # Executes code uninterrupted. while True: try: # Fetching dataset codes from Quandl qcodes_cursor = qcodes_colln.find() for qcur in qcodes_cursor: # Redownload the codes for every 30 days curr_date = datetime.now().strftime("%Y-%m-%d") codes_dt = datetime(*map(int, (qcur['created_time'])\ .split("-"))) curr_dt = datetime(*map(int, curr_date.split("-"))) if (curr_dt - codes_dt).days > 30: getCodesInCSVsForAllDatasets(quandl_apikey) break else: # Downloading the Quandl codes for the first time. getCodesInCSVsForAllDatasets(quandl_apikey) # Fetch the Quandl codes from mongo collection to extract data, qcodes_cursor = qcodes_colln.find() src_colln_list = [] for qcur in qcodes_cursor: base_url = qcur['base_url'] data_URL = base_url + "?api_key={0}" dataset_code = qcur['dataset_code'] dataset_descrpn = qcur['description'] qcode_name = qcur['name'] src_colln_name = dataset_code.lower().split("/")[0] meta_obj_name = src_colln_name + "." + dataset_code.split( "/")[1] if src_colln_name not in src_colln_list: src_colln_list.append(src_colln_name) else: continue logging.info("Executing dataset code :: " + dataset_code) src_colln = mongo.initialize_mongo(src_colln_name) resp_data = '' mongo_id = '' data_mode = '' prev_count = 0 # Check if Collection already exists in MongoDB. metadata_count = src_colln.count() if metadata_count == 0: time.sleep(3) resp = os.popen("curl " + data_URL.format(quandl_apikey)) resp_data = resp.read() data_mode = "initial" # Persisting functionality to Mongo. saveQuandlData(resp_data, src_colln, src_colln_name, dataset_descrpn, dataset_code, data_mode, prev_count, qcode_name) except: raise logging.info("Total time taken to fetch data from Quandl : " + str(round(float((time.time() - t1) / 60), 1)) + " minutes")
def insert_into_ckan(mongo_uri, source, qcode_name, description,
                     refresh_rate):
    """CKAN holds the meta information about the saved data of MongoDB."""
    logging.basicConfig(format='%(asctime)s %(levelname)s '
                               '%(module)s.%(funcName)s :: %(message)s',
                        level=logging.INFO)
    # Fetch config params.
    ckan_host = argument_config.get('ckan_host')
    api_key = argument_config.get('api_key')
    owner_org = argument_config.get('owner_org')
    publisher = argument_config.get('publisher')
    ckan_private = argument_config.get('ckan_private')
    db_name = mongo_config.get('db_name')
    ckan_ckan = ckanapi.RemoteCKAN(ckan_host, apikey=api_key)
    # CKAN package names allow only lowercase alphanumerics and dashes.
    package_name = source.lower().replace("_", "-")\
                                 .replace("(", "-")\
                                 .replace(")", "-")\
                                 .replace("/", "-")\
                                 .replace(".", "")\
                                 .replace("&", "")\
                                 .replace(":", "")\
                                 .replace("---", "-")\
                                 .replace("--", "-")
    package_name = package_name[:99].rstrip('-')
    package_title = qcode_name
    dict_additional_fields = {
        'Title': package_title,
        'Sourcing date': datetime.now().strftime("%B %d, %Y, %H:%M"),
        'Source': source,
        'Datastore': mongo_uri,
        'Database name': db_name,
        'Collection': source,
        'Description': description,
        'Refresh rate': refresh_rate,
    }
    additional_fields = [{'key': k, 'value': v}
                         for k, v in dict_additional_fields.items()]
    tags = buildTags(package_name)
    try:
        ckan_ckan.action.package_create(
            name=package_name,
            title=package_title,
            maintainer=publisher,
            tags=tags,
            notes=description,
            private=ckan_private,
            owner_org=owner_org,
            extras=additional_fields,
        )
    except Exception:
        # The package likely exists already; fall back to updating it.
        try:
            ckan_ckan.action.package_update(
                id=package_name,
                title=package_title,
                maintainer=publisher,
                tags=tags,
                notes=description,
                private=ckan_private,
                owner_org=owner_org,
                extras=additional_fields,
            )
        except Exception:
            logging.error("CKAN package creation/update failed: " +
                          package_name)
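# buildTags() is referenced above but not defined in this module. A minimal
# sketch of what it could look like, assuming CKAN's usual tag format of
# {'name': ...} dicts derived from the dashed package name; this is a
# hypothetical helper, not the project's actual implementation:
def buildTags(package_name):
    """Splits a dashed package name into CKAN tag dicts."""
    return [{'name': part} for part in package_name.split('-') if part]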