def main():
    """Scrapes batting stats for the St Louis Cardinals and their minor league
    affiliates and loads the results into MongoDB."""
    logging.basicConfig(format='%(asctime)s %(levelname)s '
                               '%(module)s.%(funcName)s :: %(message)s',
                        level=logging.INFO)
    source = mongo_config.get('col_name')
    mongo_colln = initialize_mongo(source)
    archive_url = argument_config.get('stl_batting')
    a_team_list = []
    json_objects = []
    peoria_chiefs_batting = argument_config.get('peoria_chiefs_batting')
    springfield_cardinals_batting = argument_config.get(
        'springfield_cardinals_batting')
    memphis_redbirds_batting = argument_config.get('memphis_redbirds_batting')
    minor_url_list = [
        peoria_chiefs_batting,
        springfield_cardinals_batting,
        memphis_redbirds_batting,
    ]

    try:
        logging.info("Starting Data Extraction for St Louis Cardinals Batters")
        stats_empty_list = []
        stat_list = stl_batting_stats(archive_url, stats_empty_list)
        json_objects = json_object_building(stat_list)

        logging.info("Loading Data To Mongo")
        for obj in json_objects:
            mongo_id = obj["NAME"] + "-" + obj["TEAM"]
            update_mongo_collection(mongo_colln, mongo_id, obj)

        logging.info(
            "Starting Data Extraction for St Louis Cardinals Minor League Batters")
        for url in minor_url_list:
            if url == peoria_chiefs_batting:
                team = "PEORIA CHIEFS"
            elif url == springfield_cardinals_batting:
                team = "SPRINGFIELD CARDINALS"
            elif url == memphis_redbirds_batting:
                team = "MEMPHIS REDBIRDS"  # was a no-op comparison (==)
            data_list = a_team_batting_stats(a_team_list, url)
            json_objects = minor_json_object_building(data_list, team)
            data_list = []

            logging.info("Loading Data To Mongo")
            for obj in json_objects:
                mongo_id = obj["NAME"] + "-" + obj["TEAM"]
                update_mongo_collection(mongo_colln, mongo_id, obj)
    except Exception:
        logging.error(
            "Error occurred while scraping and loading; re-raising to surface "
            "the exact error")
        raise
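# The helpers used above (initialize_mongo, update_mongo_collection and the
# scraping/JSON-building functions) are defined elsewhere in the repo. As a
# point of reference, a minimal sketch of what update_mongo_collection is
# assumed to do with pymongo -- upsert one document keyed by the derived
# mongo_id -- is shown below; the name and behaviour are assumptions, not the
# repo's actual implementation.
def update_mongo_collection_sketch(mongo_colln, mongo_id, feed_object):
    """Upsert a single stats document, keyed by "<NAME>-<TEAM>"."""
    feed_object["_id"] = mongo_id
    # replace_one with upsert=True inserts the document when the _id is new,
    # otherwise it overwrites the existing stats for that player/team.
    mongo_colln.replace_one({"_id": mongo_id}, feed_object, upsert=True)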
def twitter_reviews_hotels(obj):
    """Search Twitter for original tweets (not replies) posted by users against
    the configured hotel handles/hashtags."""
    logging.info(
        "Searching Twitter for original tweets posted by users for specific handles")
    feed_obj = []
    auth = tweepy.OAuthHandler(argument_config.get('consumer_key'),
                               argument_config.get('consumer_secret'))
    auth.set_access_token(argument_config.get('access_token'),
                          argument_config.get('access_token_secret'))
    api = tweepy.API(auth, wait_on_rate_limit=True)
    twitter_hashtags = argument_config.get('twitter_hashtags_hotels')
    try:
        for handle in twitter_hashtags.keys():
            time.sleep(60)
            tweets = tweepy.Cursor(api.search, q=handle, rpp=100).items(300)
            for tweet in tweets:
                # Keep only original tweets, i.e. skip replies.
                if (tweet._json["in_reply_to_status_id"] is None
                        and tweet._json["in_reply_to_status_id_str"] is None):
                    json_dict = {}
                    json_dict["Extracted_Date"] = str(datetime.today().date())
                    json_dict["Source"] = "Twitter"
                    json_dict["Review_Text"] = remove_emoji(
                        tweet._json["text"].strip()).replace("\n", "").replace("\r", "")
                    json_dict["User_Name"] = tweet._json["user"]["name"]
                    json_dict["User_Age"] = ""
                    json_dict["User_Location"] = ""
                    json_dict["User_Gender"] = ""
                    json_dict["User_id"] = tweet._json["user"]["id_str"]
                    dt = parse(tweet._json["user"]["created_at"])
                    json_dict["Posted_date"] = dt.date()
                    if tweet._json["user"]["location"]:
                        json_dict["Country"] = tweet._json["user"]["location"]
                    else:
                        json_dict["Country"] = "India"
                    json_dict["City"] = ""
                    json_dict["Area"] = "India"
                    json_dict["Rating_text"] = ""
                    json_dict["Rating"] = ""
                    json_dict["Max_Rating"] = "5"
                    for user in tweet._json["entities"]["user_mentions"]:
                        json_dict["Restaurent_name/Hotel_name"] = user["name"]
                        json_dict["Restaurent_id/Hotel_id"] = user["id_str"]
                    if obj is not None:
                        # Incremental run: keep only tweets newer than the last load.
                        if dt.date() > datetime.strptime(
                                obj['Posted_date'], '%Y-%m-%d').date():
                            json_dict["Posted_date"] = str(dt.date())
                            checksum = hashlib.md5(
                                json.dumps(json_dict, sort_keys=True)
                                .encode('utf8')).hexdigest()
                            json_dict['checksum'] = checksum
                            feed_obj.append(json_dict.copy())
                    else:
                        json_dict["Posted_date"] = str(dt.date())
                        checksum = hashlib.md5(
                            json.dumps(json_dict, sort_keys=True)
                            .encode('utf8')).hexdigest()
                        json_dict['checksum'] = checksum
                        feed_obj.append(json_dict.copy())
    except Exception:
        logging.warning(
            "Issue while extracting data from Twitter handle, please recheck "
            "expiry/limits of keys.")
    return feed_obj
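# remove_emoji() is imported from elsewhere in the repo. A minimal sketch of
# the usual approach -- stripping emoji/pictograph code points with a compiled
# regex before the review text is flattened to a single line -- is shown
# below; the exact ranges covered by the real helper are an assumption.
import re

_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags
    "]+",
    flags=re.UNICODE)


def remove_emoji_sketch(text):
    """Drop emoji code points so the review text stores cleanly in Mongo."""
    return _EMOJI_PATTERN.sub("", text)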
def google_reviews_hotels(obj):
    """Extract Google reviews via the Places API by searching for the nearest
    hotels around a coordinate."""
    logging.info(
        "Extracting Google reviews via the Places API by searching for the "
        "nearest hotels around a coordinate")
    feed_obj = []
    apiKey = argument_config.get('Google')
    api = SentimentAnalysis(apiKey)
    places = api.search_places_by_coordinate("17.450500,78.380890", "1000", "hotels")
    fields = [
        'name', 'formatted_address', 'international_phone_number', 'website',
        'rating', 'review'
    ]
    for place in places:
        details = api.get_place_details(place['place_id'], fields)
        try:
            for review in details['result']['reviews']:
                googledict = {}
                googledict["Extracted_Date"] = str(datetime.today().date())
                googledict["Source"] = "Google"
                googledict["City"] = place["vicinity"].split(",")[-1].strip()
                googledict["Country"] = "India"
                googledict["Restaurent_name/Hotel_name"] = details['result']['name']
                googledict["Restaurent_id/Hotel_id"] = place['place_id']
                googledict["User_Name"] = review['author_name']
                googledict["Rating"] = str(review['rating'])
                googledict["Max_Rating"] = "5"
                googledict["Review_Text"] = remove_emoji(
                    review['text']).replace("\n", "").replace("\r", "")
                # Convert the relative description ("a month ago") into a day
                # count and derive the posting date from it.
                days_ago = api.dateconverter(review['relative_time_description'])
                Posted_date = datetime.now() - timedelta(days=days_ago)
                googledict["User_id"] = ""
                googledict["User_Age"] = ""
                googledict["User_Location"] = ""
                googledict["User_Gender"] = ""
                googledict["Rating_text"] = ""
                googledict["Area"] = place["vicinity"]
                if obj is not None:
                    # Incremental run: keep only reviews newer than the last load.
                    if Posted_date.date() > datetime.strptime(
                            obj['Posted_date'], '%Y-%m-%d').date():
                        googledict["Posted_date"] = str(Posted_date.date())
                        checksum = hashlib.md5(
                            json.dumps(googledict, sort_keys=True)
                            .encode('utf8')).hexdigest()
                        googledict['checksum'] = checksum
                        feed_obj.append(googledict.copy())
                else:
                    googledict["Posted_date"] = str(Posted_date.date())
                    checksum = hashlib.md5(
                        json.dumps(googledict, sort_keys=True)
                        .encode('utf8')).hexdigest()
                    googledict['checksum'] = checksum
                    feed_obj.append(googledict.copy())
        except Exception:
            logging.warning("Issue while getting Google place details/reviews")
    return feed_obj
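# SentimentAnalysis.search_places_by_coordinate() wraps the Google Places
# "Nearby Search" endpoint. A rough sketch of such a wrapper, built on the
# requests library, is given below for context; the function name, signature
# and return shape mirror how it is called above but are assumptions, not the
# repo's actual implementation.
import requests


def search_places_by_coordinate_sketch(api_key, location, radius, keyword):
    """Return the raw Places results (each carrying place_id and vicinity)."""
    url = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"
    params = {
        "location": location,   # e.g. "17.450500,78.380890"
        "radius": radius,       # metres, e.g. "1000"
        "keyword": keyword,     # e.g. "hotels"
        "key": api_key,
    }
    resp = requests.get(url, params=params)
    resp.raise_for_status()
    return resp.json().get("results", [])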
def zomato_reviews(obj):
    """Extract restaurant reviews provided by the Zomato API."""
    logging.info("Extracting reviews provided by Zomato via API")
    apiKey = argument_config.get('Zomato')
    api = SentimentAnalysis(apiKey)
    reviewslist = []
    locations = [
        "madhapur", "kondapur", "kphp", "banjarahills", "secunderabad",
        "Gachibowli", "Miyapur"
    ]
    for location in locations:
        places = api.get_location_id(location, 17.3850, 78.4867, 5)
        for place in places:
            resid = api.get_restaurent_id(place["entity_id"], place["entity_type"])
            try:
                for res in resid:
                    # A restaurant id is mandatory to fetch reviews for a
                    # specific restaurant.
                    reviews = api.get_zomato_reviews(res["restaurant"]["id"], "1", "20")
                    for everyreview in reviews["user_reviews"]:
                        ratingdict = {}
                        ratingdict["Extracted_Date"] = str(datetime.today().date())
                        ratingdict["Source"] = "Zomato"
                        ratingdict["City"] = "hyderabad"
                        ratingdict["Country"] = "India"
                        ratingdict["Restaurent_name/Hotel_name"] = res["restaurant"]["name"]
                        ratingdict["Restaurent_id/Hotel_id"] = res["restaurant"]["id"]
                        ratingdict["Area"] = res["restaurant"]["location"]["locality"]
                        ratingdict["Rating"] = str(everyreview["review"]["rating"])
                        ratingdict["Max_Rating"] = "5"
                        ratingdict["Review_Text"] = remove_emoji(
                            str(everyreview["review"]["review_text"])).replace(
                                "\n", "").replace("\r", "")
                        ratingdict["User_id"] = everyreview["review"]["id"]
                        ratingdict["Rating_text"] = everyreview["review"]["rating_text"]
                        posted_date = api.date_conv(
                            str(everyreview["review"]["review_time_friendly"]))
                        ratingdict["User_Name"] = everyreview["review"]["user"]["name"]
                        ratingdict["User_Age"] = ""
                        ratingdict["User_Location"] = ""
                        ratingdict["User_Gender"] = ""
                        if obj is not None:
                            # Incremental run: keep only reviews newer than the last load.
                            if posted_date > datetime.strptime(
                                    obj['Posted_date'], '%Y-%m-%d').date():
                                ratingdict["Posted_date"] = str(posted_date)
                                checksum = hashlib.md5(
                                    json.dumps(ratingdict, sort_keys=True)
                                    .encode('utf8')).hexdigest()
                                ratingdict['checksum'] = checksum
                                reviewslist.append(ratingdict.copy())
                        else:
                            ratingdict["Posted_date"] = str(posted_date)
                            checksum = hashlib.md5(
                                json.dumps(ratingdict, sort_keys=True)
                                .encode('utf8')).hexdigest()
                            ratingdict['checksum'] = checksum
                            reviewslist.append(ratingdict.copy())
            except Exception:
                logging.warning(
                    "Issue while getting Zomato restaurant id, please check the "
                    "API limit available for the day.")
    return reviewslist
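# get_zomato_reviews() wraps the (since retired) Zomato developer API's
# /reviews endpoint. A sketch of such a wrapper is shown for context; the
# endpoint and parameter names are recalled from the v2.1 API and should be
# treated as assumptions rather than the repo's actual implementation.
import requests


def get_zomato_reviews_sketch(api_key, res_id, start, count):
    """Fetch one page of user reviews for a single restaurant id."""
    url = "https://developers.zomato.com/api/v2.1/reviews"
    headers = {"user-key": api_key}
    params = {"res_id": res_id, "start": start, "count": count}
    resp = requests.get(url, headers=headers, params=params)
    resp.raise_for_status()
    return resp.json()  # contains the "user_reviews" list iterated above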
def main(): """Initiates the Financial news extraction from Quandl using API calls.""" t1 = time.time() logging.basicConfig(format='%(asctime)s %(levelname)s \ %(module)s.%(funcName)s :: %(message)s', level=logging.INFO) # fetching arguments from config. quandl_apikey = argument_config.get('quandl_apikey') meta_col_name = mongo_config.get('meta_colln_name') quandl_codes_colln_name = mongo_config.get('quandl_codes_colln_name') qcodes_colln = mongo.initialize_mongo(quandl_codes_colln_name) meta_mongo_colln = mongo.initialize_mongo(meta_col_name) # Executes code uninterrupted. while True: try: # Fetching dataset codes from Quandl qcodes_cursor = qcodes_colln.find() for qcur in qcodes_cursor: # Redownload the codes for every 30 days curr_date = datetime.now().strftime("%Y-%m-%d") codes_dt = datetime(*map(int, (qcur['created_time'])\ .split("-"))) curr_dt = datetime(*map(int, curr_date.split("-"))) if (curr_dt - codes_dt).days > 30: getCodesInCSVsForAllDatasets(quandl_apikey) break else: # Downloading the Quandl codes for the first time. getCodesInCSVsForAllDatasets(quandl_apikey) # Fetch the Quandl codes from mongo collection to extract data, qcodes_cursor = qcodes_colln.find() src_colln_list = [] for qcur in qcodes_cursor: base_url = qcur['base_url'] data_URL = base_url + "?api_key={0}" dataset_code = qcur['dataset_code'] dataset_descrpn = qcur['description'] qcode_name = qcur['name'] src_colln_name = dataset_code.lower().split("/")[0] meta_obj_name = src_colln_name + "." + dataset_code.split( "/")[1] if src_colln_name not in src_colln_list: src_colln_list.append(src_colln_name) else: continue logging.info("Executing dataset code :: " + dataset_code) src_colln = mongo.initialize_mongo(src_colln_name) resp_data = '' mongo_id = '' data_mode = '' prev_count = 0 # Check if Collection already exists in MongoDB. metadata_count = src_colln.count() if metadata_count == 0: time.sleep(3) resp = os.popen("curl " + data_URL.format(quandl_apikey)) resp_data = resp.read() data_mode = "initial" # Persisting functionality to Mongo. saveQuandlData(resp_data, src_colln, src_colln_name, dataset_descrpn, dataset_code, data_mode, prev_count, qcode_name) except: raise logging.info("Total time taken to fetch data from Quandl : " + str(round(float((time.time() - t1) / 60), 1)) + " minutes")
def insert_into_ckan(mongo_uri, source, qcode_name, description, refresh_rate):
    """CKAN holds the meta information about the data saved in MongoDB."""
    logging.basicConfig(format='%(asctime)s %(levelname)s '
                               '%(module)s.%(funcName)s :: %(message)s',
                        level=logging.INFO)
    # Fetch config params.
    ckan_host = argument_config.get('ckan_host')
    api_key = argument_config.get('api_key')
    owner_org = argument_config.get('owner_org')
    publisher = argument_config.get('publisher')
    ckan_private = argument_config.get('ckan_private')
    db_name = mongo_config.get('db_name')
    ckan_ckan = ckanapi.RemoteCKAN(ckan_host, apikey=api_key)

    # CKAN package names must be lowercase alphanumerics and dashes.
    package_name = source.lower().replace("_", "-")\
                                 .replace("(", "-")\
                                 .replace(")", "-")\
                                 .replace("/", "-")\
                                 .replace(".", "")\
                                 .replace("&", "")\
                                 .replace(":", "")\
                                 .replace("---", "-")\
                                 .replace("--", "-")
    package_name = package_name[:99]
    if package_name.endswith("-"):
        package_name = package_name.rstrip('-')
    package_title = qcode_name
    dict_additional_fields = {
        'Title': package_title,
        'Sourcing date': datetime.now().strftime("%B %d, %Y, %H:%M"),
        'Source': source,
        'Datastore': mongo_uri,
        'Database name': db_name,
        'Collection': source,
        'Description': description,
        'Refresh rate': refresh_rate,
    }
    additional_fields = [{'key': k, 'value': v}
                         for k, v in dict_additional_fields.items()]
    tags = buildTags(package_name)
    try:
        # Create the package; if it already exists, fall back to an update.
        ckan_ckan.action.package_create(
            name=package_name,
            title=package_title,
            maintainer=publisher,
            tags=tags,
            notes=description,
            private=ckan_private,
            owner_org=owner_org,
            extras=additional_fields,
        )
    except Exception:
        try:
            ckan_ckan.action.package_update(
                id=package_name,
                title=package_title,
                maintainer=publisher,
                tags=tags,
                notes=description,
                private=ckan_private,
                owner_org=owner_org,
                extras=additional_fields,
            )
        except Exception:
            logging.error("CKAN package creation/update failed: " + package_name)
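# buildTags() is defined elsewhere in the repo. CKAN's package_create/update
# actions expect tags as a list of {'name': ...} dicts, so a plausible sketch
# -- splitting the sanitised package name into hyphen-separated tokens -- is
# given below; this is an assumption, not the repo's actual implementation.
def buildTags_sketch(package_name):
    """Turn the sanitised package name into CKAN-style tag dicts."""
    tokens = [t for t in package_name.split("-") if t]
    return [{'name': token} for token in tokens]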