Example #1
File: Main.py Project: aghogre/mlb
def main():
    logging.basicConfig(format='%(asctime)s %(levelname)s '
                               '%(module)s.%(funcName)s :: %(message)s',
                        level=logging.INFO)
    source = mongo_config.get('col_name')
    mongo_colln = initialize_mongo(source)
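    # Resolve source URLs from config: the St. Louis Cardinals batting page plus three minor-league affiliates.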
    archive_url = argument_config.get('stl_batting')
    a_team_list = []
    obj_list = []
    peoria_chiefs_batting = argument_config.get('peoria_chiefs_batting')
    springfield_cardinals_batting = argument_config.get(
        'springfield_cardinals_batting')
    memphis_redbirds_batting = argument_config.get('memphis_redbirds_batting')
    minor_url_list = [
        peoria_chiefs_batting, springfield_cardinals_batting,
        memphis_redbirds_batting
    ]
    #url_list = [archive_url1, archive_url2, archive_url3, archive_url10, archive_url20, archive_url30, archive_url100, archive_url200, archive_url300]
    try:
        logging.info("Starting Data Extraction for St Louis Cardinals Batters")
        stats_empty_list = []
        stat_list = stl_batting_stats(archive_url, stats_empty_list)
        obj_list = json_object_building(stat_list)
        logging.info("Loading Data To Mongo")
        for obj in obj_list:
            mongo_id = obj["NAME"] + "-" + obj["TEAM"]
            feed_object = obj
            update_mongo_collection(mongo_colln, mongo_id, feed_object)

        logging.info(
            "Starting Data Extraction for St Louis Cardinals Minor League Batters"
        )
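        # Map each minor-league URL to its team name, then scrape and build JSON objects per team.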
        for i in minor_url_list:
            if i == peoria_chiefs_batting:
                team = "PEORIA CHIEFS"
            elif i == springfield_cardinals_batting:
                team = "SPRINGFIELD CARDINALS"
            elif i == memphis_redbirds_batting:
                team == "MEMPHIS REDBIRDS"
            data_list = a_team_batting_stats(a_team_list, i)
            obj_list = minor_json_object_building(data_list, team)
            data_list = []
        logging.info("Loading Data To Mongo")
        for obj in obj_list:
            mongo_id = obj["NAME"] + "-" + obj["TEAM"]
            feed_object = obj
            update_mongo_collection(mongo_colln, mongo_id, feed_object)
    except Exception:
        logging.error(
            "Error Occurs while scraping and loading, raise exception to check exact error"
        )
        raise
def twitter_reviews_hotels(obj):
    '''Search Twitter for reviews, i.e. original tweets (not replies) posted against the configured handles.'''
    logging.info(
        "Searching Twitter for reviews, i.e. original tweets posted against the configured handles"
    )
    feed_obj = []
    auth = tweepy.OAuthHandler(argument_config.get('consumer_key'),
                               argument_config.get('consumer_secret'))
    auth.set_access_token(argument_config.get('access_token'),
                          argument_config.get('access_token_secret'))
    api = tweepy.API(auth, wait_on_rate_limit=True)
    twitter_hashtags = argument_config.get('twitter_hashtags_hotels')
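    # Search each configured handle/hashtag, sleeping between queries to stay within Twitter rate limits.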
    try:
        for handle in twitter_hashtags.keys():
            time.sleep(60)
            tweets = tweepy.Cursor(api.search, q=handle, rpp=100).items(300)
            for tweet in tweets:
                if tweet._json["in_reply_to_status_id"] == None and tweet._json[
                        "in_reply_to_status_id_str"] == None:  #and tweet._json["retweet_count"] == 0
                    json_dict = {}
                    json_dict["Extracted_Date"] = str(datetime.today().date())
                    json_dict["Source"] = "Twitter"
                    json_dict["Review_Text"] = remove_emoji(
                        tweet._json["text"].strip()).replace("\n", "").replace(
                            "\r", "")
                    json_dict["User_Name"] = tweet._json["user"]["name"]
                    json_dict["User_Age"] = ""
                    json_dict["User_Location"] = ""
                    json_dict["User_Gender"] = ""
                    json_dict["User_id"] = tweet._json["user"]["id_str"]
                    json_dict["Posted_date"] = tweet._json["user"][
                        "created_at"]
                    dt = parse(tweet._json["user"]["created_at"])
                    json_dict["Posted_date"] = dt.date()
                    if tweet._json["user"]["location"]:
                        json_dict["Country"] = tweet._json["user"]["location"]
                    else:
                        json_dict["Country"] = "India"
                    json_dict["City"] = ""
                    json_dict["Area"] = "India"
                    json_dict["Rating_text"] = ""
                    json_dict["Rating"] = ""
                    json_dict["Max_Rating"] = "5"
                    for user in tweet._json["entities"]["user_mentions"]:
                        json_dict["Restaurent_name/Hotel_name"] = user["name"]
                        json_dict["Restaurent_id/Hotel_id"] = user["id_str"]
                    if obj is not None:
                        if dt.date() > datetime.strptime(
                                obj['Posted_date'], '%Y-%m-%d').date():
                            json_dict["Posted_date"] = str(dt.date())
                            checksum = hashlib.md5(
                                json.dumps(json_dict, sort_keys=True).encode(
                                    'utf8')).hexdigest()
                            json_dict['checksum'] = checksum
                            feed_obj.append(json_dict.copy())
                    else:
                        json_dict["Posted_date"] = str(dt.date())
                        checksum = hashlib.md5(
                            json.dumps(
                                json_dict,
                                sort_keys=True).encode('utf8')).hexdigest()
                        json_dict['checksum'] = checksum
                        feed_obj.append(json_dict.copy())
    except Exception:
        logging.warning(
            "Issue while extracting data from Twitter handle, please recheck expiry/limits of keys."
        )
    return feed_obj
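All three extractors share the same incremental-load pattern: a record is appended only when its Posted_date is newer than the last persisted one, and an MD5 checksum of the sorted JSON is attached so downstream loads can deduplicate. A minimal standalone sketch of that pattern (the helper name and arguments are illustrative, not part of the project):

import hashlib
import json
from datetime import datetime

def attach_checksum(record, last_posted_date=None):
    """Return a copy of record with an MD5 checksum, or None if it is not newer than the watermark."""
    if last_posted_date is not None:
        new = datetime.strptime(record["Posted_date"], "%Y-%m-%d").date()
        old = datetime.strptime(last_posted_date, "%Y-%m-%d").date()
        if new <= old:
            return None                      # not newer than what is already stored
    record = dict(record)                    # copy before adding the checksum field
    record["checksum"] = hashlib.md5(
        json.dumps(record, sort_keys=True).encode("utf8")).hexdigest()
    return record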
def google_reviews_hotels(obj):
    '''Extract Google reviews via the Places API, searching the nearest restaurants by coordinate.'''
    logging.info(
        "Extracting Google reviews via the Places API; searching the nearest restaurants by coordinate"
    )
    feed_obj = []
    apiKey = argument_config.get('Google')
    api = SentimentAnalysis(apiKey)

    places = api.search_places_by_coordinate("17.450500,78.380890", "1000",
                                             "hotels")
    fields = [
        'name', 'formatted_address', 'international_phone_number', 'website',
        'rating', 'review'
    ]
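    # For each nearby place, fetch its details and flatten every review into a feed record.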
    for place in places:
        details = api.get_place_details(place['place_id'], fields)
        try:
            for review in details['result']['reviews']:
                googledict = {}
                googledict["Extracted_Date"] = str(datetime.today().date())
                googledict["Source"] = "Google"
                googledict["City"] = place["vicinity"].split(",")[-1].strip()
                googledict["Country"] = "India"
                googledict["Restaurent_name/Hotel_name"] = details['result'][
                    'name']
                googledict["Restaurent_id/Hotel_id"] = place['place_id']
                googledict["User_Name"] = review['author_name']
                googledict["Rating"] = str(review['rating'])
                googledict["Max_Rating"] = "5"
                googledict["Review_Text"] = remove_emoji(
                    review['text']).replace("\n", "").replace("\r", "")
                date1 = api.dateconverter(review['relative_time_description'])
                Posted_date = datetime.now() - timedelta(days=date1)

                googledict["User_id"] = ""
                googledict["User_Age"] = ""
                googledict["User_Location"] = ""
                googledict["User_Gender"] = ""
                googledict["Rating_text"] = ""
                googledict["Area"] = place["vicinity"]
                if obj is not None:
                    if Posted_date.date() > datetime.strptime(
                            obj['Posted_date'], '%Y-%m-%d').date():
                        googledict["Posted_date"] = str(Posted_date.date())
                        checksum = hashlib.md5(
                            json.dumps(
                                googledict,
                                sort_keys=True).encode('utf8')).hexdigest()
                        googledict['checksum'] = checksum
                        feed_obj.append(googledict.copy())
                else:
                    googledict["Posted_date"] = str(Posted_date.date())
                    checksum = hashlib.md5(
                        json.dumps(googledict,
                                   sort_keys=True).encode('utf8')).hexdigest()
                    googledict['checksum'] = checksum
                    feed_obj.append(googledict.copy())
        except Exception:
            logging.warning("Issue while getting Google place reviews")
    return feed_obj
def zomato_reviews(obj):
    logging.info("Extracting reviews provided by zomato via api")
    '''this function extracts reviews provided by zomato via api.'''
    apiKey = argument_config.get('Zomato')
    api = SentimentAnalysis(apiKey)
    reviewslist = []
    locations = [
        "madhapur", "kondapur", "kphp", "banjarahills", "secunderabad",
        "Gachibowli", "Miyapur"
    ]
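    # For each locality, resolve the Zomato location id, list its restaurants, then pull their reviews.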
    for location in locations:
        places = api.get_location_id(location, 17.3850, 78.4867, 5)
        for place in places:
            resid = api.get_restaurent_id(place["entity_id"],
                                          place["entity_type"])
            try:
                for res in resid:
                    # The restaurant id is mandatory for fetching reviews of a specific restaurant.
                    reviews = api.get_zomato_reviews(res["restaurant"]["id"],
                                                     "1", "20")
                    for everyreview in reviews["user_reviews"]:
                        ratingdict = {}
                        ratingdict["Extracted_Date"] = str(
                            datetime.today().date())
                        ratingdict["Source"] = "Zomato"
                        ratingdict["City"] = "hyderabad"
                        ratingdict["Country"] = "India"
                        ratingdict["Restaurent_name/Hotel_name"] = res[
                            "restaurant"]["name"]
                        ratingdict["Restaurent_id/Hotel_id"] = res[
                            "restaurant"]["id"]
                        ratingdict["Area"] = res["restaurant"]["location"][
                            "locality"]
                        ratingdict["Rating"] = str(
                            everyreview["review"]["rating"])
                        ratingdict["Max_Rating"] = "5"
                        ratingdict["Review_Text"] = remove_emoji(
                            str(everyreview["review"]["review_text"])).replace(
                                "\n", "").replace("\r", "")
                        ratingdict["User_id"] = everyreview["review"]["id"]
                        ratingdict["Rating_text"] = everyreview["review"][
                            "rating_text"]
                        date1 = api.date_conv(
                            str(everyreview["review"]["review_time_friendly"]))
                        ratingdict["User_Name"] = everyreview["review"][
                            "user"]["name"]
                        ratingdict["User_Age"] = ""
                        ratingdict["User_Location"] = ""
                        ratingdict["User_Gender"] = ""

                        if obj is not None:
                            if date1 > datetime.strptime(
                                    obj['Posted_date'], '%Y-%m-%d').date():
                                ratingdict["Posted_date"] = str(date1)
                                checksum = hashlib.md5(
                                    json.dumps(ratingdict,
                                               sort_keys=True).encode(
                                                   'utf8')).hexdigest()
                                ratingdict['checksum'] = checksum
                                reviewslist.append(ratingdict.copy())
                        else:
                            ratingdict["Posted_date"] = str(date1)
                            checksum = hashlib.md5(
                                json.dumps(ratingdict, sort_keys=True).encode(
                                    'utf8')).hexdigest()
                            ratingdict['checksum'] = checksum
                            reviewslist.append(ratingdict.copy())
            except Exception:
                logging.warning(
                    "Issue while getting Zomato restaurant IDs; please check the daily API limit."
                )
    return reviewslist
def main():
    """Initiates the Financial news extraction from Quandl using API calls."""

    t1 = time.time()
    logging.basicConfig(format='%(asctime)s %(levelname)s '
                               '%(module)s.%(funcName)s :: %(message)s',
                        level=logging.INFO)

    # fetching arguments from config.
    quandl_apikey = argument_config.get('quandl_apikey')
    meta_col_name = mongo_config.get('meta_colln_name')
    quandl_codes_colln_name = mongo_config.get('quandl_codes_colln_name')

    qcodes_colln = mongo.initialize_mongo(quandl_codes_colln_name)
    meta_mongo_colln = mongo.initialize_mongo(meta_col_name)

    # Executes code uninterrupted.
    while True:
        try:
            # Fetching dataset codes from Quandl
            qcodes_cursor = qcodes_colln.find()
            for qcur in qcodes_cursor:
                # Re-download the codes every 30 days.
                curr_date = datetime.now().strftime("%Y-%m-%d")
                codes_dt = datetime.strptime(qcur['created_time'], "%Y-%m-%d")
                curr_dt = datetime.strptime(curr_date, "%Y-%m-%d")

                if (curr_dt - codes_dt).days > 30:
                    getCodesInCSVsForAllDatasets(quandl_apikey)
                break
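            # for/else: the else branch runs only when the cursor yields no documents,
            # i.e. the Quandl codes have never been downloaded before.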
            else:
                # Downloading the Quandl codes for the first time.
                getCodesInCSVsForAllDatasets(quandl_apikey)

            # Fetch the Quandl codes from mongo collection to extract data,
            qcodes_cursor = qcodes_colln.find()

            src_colln_list = []
            for qcur in qcodes_cursor:
                base_url = qcur['base_url']
                data_URL = base_url + "?api_key={0}"
                dataset_code = qcur['dataset_code']
                dataset_descrpn = qcur['description']
                qcode_name = qcur['name']

                src_colln_name = dataset_code.lower().split("/")[0]
                meta_obj_name = src_colln_name + "." + dataset_code.split(
                    "/")[1]

                if src_colln_name not in src_colln_list:
                    src_colln_list.append(src_colln_name)
                else:
                    continue
                logging.info("Executing dataset code :: " + dataset_code)

                src_colln = mongo.initialize_mongo(src_colln_name)
                resp_data = ''
                mongo_id = ''
                data_mode = ''
                prev_count = 0

                # Check if Collection already exists in MongoDB.
                metadata_count = src_colln.count_documents({})
                if metadata_count == 0:

                    time.sleep(3)
                    resp = os.popen("curl " + data_URL.format(quandl_apikey))
                    resp_data = resp.read()
                    data_mode = "initial"

                    # Persisting functionality to Mongo.
                    saveQuandlData(resp_data, src_colln, src_colln_name,
                                   dataset_descrpn, dataset_code, data_mode,
                                   prev_count, qcode_name)
        except:
            raise

    logging.info("Total time taken to fetch data from Quandl : " +
                 str(round(float((time.time() - t1) / 60), 1)) + " minutes")
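The 30-day refresh check above can be isolated into a small helper (a sketch using only the standard library; the surrounding download call, getCodesInCSVsForAllDatasets, is the project's own function):

from datetime import datetime

def codes_need_refresh(created_time, max_age_days=30):
    """True when the stored Quandl code list (created_time as 'YYYY-MM-DD') is older than max_age_days."""
    created = datetime.strptime(created_time, "%Y-%m-%d")
    return (datetime.now() - created).days > max_age_days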
Example #6
def insert_into_ckan(mongo_uri, source, qcode_name, description, refresh_rate):
    """"CKAN holds the meta information about the saved data of MongoDB."""

    logging.basicConfig(format='%(asctime)s %(levelname)s '
                               '%(module)s.%(funcName)s :: %(message)s',
                        level=logging.INFO)

    # Fetch config params.
    ckan_host = argument_config.get('ckan_host')
    api_key = argument_config.get('api_key')
    owner_org = argument_config.get('owner_org')
    publisher = argument_config.get('publisher')
    ckan_private = argument_config.get('ckan_private')
    db_name = mongo_config.get('db_name')

    ckan_ckan = ckanapi.RemoteCKAN(ckan_host, apikey=api_key)

    package_name = source.lower().replace("_", "-")\
                                 .replace("(", "-")\
                                 .replace(")", "-")\
                                 .replace("/", "-")\
                                 .replace(".", "")\
                                 .replace("&", "")\
                                 .replace(":", "")\
                                 .replace("---", "-")\
                                 .replace("--", "-")

    package_name = package_name[:99]
    if package_name.endswith("-"):
        package_name = package_name.rstrip('-')

    # package_title = source.replace("_", " ")
    package_title = qcode_name

    dict_additional_fields = {
        'Title': package_title,
        'Sourcing date': datetime.now().strftime("%B %d, %Y, %H:%M"),
        'Source': source,
        'Datastore': mongo_uri,
        'Database name': db_name,
        'Collection': source,
        'Description': description,
        'Refresh rate': refresh_rate,
    }
    additional_fields = []
    for k, v in dict_additional_fields.items():
        additional_fields.append({'key': k, 'value': v})

    tags = buildTags(package_name)
    try:
        ckan_ckan.action.package_create(
            name=package_name,
            title=package_title,
            maintainer=publisher,
            tags=tags,
            notes=description,
            private=ckan_private,
            owner_org=owner_org,
            extras=additional_fields,
        )
    except Exception:
        try:
            ckan_ckan.action.package_update(
                id=package_name,
                title=package_title,
                maintainer=publisher,
                tags=tags,
                notes=description,
                private=ckan_private,
                owner_org=owner_org,
                extras=additional_fields,
            )
        except Exception:
            logging.error("CKAN package creation/update failed: " +
                          package_name)
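For reference, the chained str.replace calls that build the CKAN package slug can be written more compactly with regular expressions. The sketch below is a rough equivalent (it fully collapses runs of dashes, which the chained replaces only approximate) and is not part of the project:

import re

def ckan_slug(source, max_len=99):
    """Build a CKAN-safe package name: lower-case, separators to '-', punctuation dropped, dashes collapsed."""
    slug = source.lower()
    slug = re.sub(r"[_()/]", "-", slug)   # separators become dashes
    slug = re.sub(r"[.&:]", "", slug)     # dots, ampersands, colons dropped
    slug = re.sub(r"-{2,}", "-", slug)    # collapse runs of dashes
    return slug[:max_len].rstrip("-")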