Example #1
def persistFinData(mongo, mongo_colln, source, json_data, data, dataset_code,
                   description, meta_updated, data_mode, qcode_name):
    """Collects the Quandl JSON response data and inserts into mongo collection
    and updates CKAN."""

    logging.basicConfig(format='%(asctime)s %(levelname)s \
                        %(module)s.%(funcName)s :: %(message)s',
                        level=logging.INFO)

    # fetching arguments from config.
    mongo_uri = mongo_config.get('mongo_uri')
    meta_col_name = mongo_config.get('meta_colln_name')

    try:
        mongo.bulk_mongo_insert(mongo_colln, data)

        if data_mode == "initial" and meta_updated:
            # METADATA collection
            meta_mongo_colln = mongo.initialize_mongo(meta_col_name)
            meta_feedObj = json_data["dataset"]
            meta_feedObj['_id'] = source + "." + dataset_code.split("/")[1]
            mongo.insert_into_mongo(meta_mongo_colln, meta_feedObj)

            # CKAN
            refresh_rate = json_data["dataset"]["frequency"]
            insert_into_ckan(mongo_uri, source, qcode_name, description,
                             refresh_rate)
    except Exception:
        # Log the failure, then propagate it to the caller.
        logging.error("Error while inserting data into Mongo.")
        raise

    return meta_updated
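
A minimal usage sketch, assuming the mongo helper module used above; sample_response (parsed Quandl JSON) and sample_rows (documents to insert) are hypothetical stand-ins, not part of the example:

# Hedged sketch: "wiki", sample_response, and sample_rows are assumptions.
colln = mongo.initialize_mongo("wiki")
persistFinData(mongo, colln, "wiki", sample_response, sample_rows,
               "WIKI/AAPL", "Apple Inc. EOD prices", True, "initial",
               "WIKI AAPL")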
Example #2
    def __init__(self):
        mongo_uri = mongo_config.get('mongo_uri')
        ssl_required = mongo_config.get('ssl_required')
        requires_auth = mongo_config.get('requires_auth')
        mongo_username = mongo_config.get('mongo_username')
        mongo_password = mongo_config.get('mongo_password')
        mongo_auth_source = mongo_config.get('mongo_auth_source')
        mongo_auth_mech = mongo_config.get('mongo_auth_mechanism')
        db_name = mongo_config.get('db_name')
        self.mongo_index_name = mongo_config.get('mongo_index_name')

        try:
            # Instantiating MongoClient
            client = MongoClient(mongo_uri,
                                 ssl=ssl_required,
                                 replicaSet='Cluster0-shard-0')

            # Authenticate MongoDB (Optional)
            if requires_auth == 'true':
                client.the_database.authenticate(mongo_username,
                                                 mongo_password,
                                                 source=mongo_auth_source,
                                                 mechanism=mongo_auth_mech)
            self.mongo_inst = client[db_name]

        except IOError:
            # Log the connection failure, then propagate it.
            logging.error("Could not connect to Mongo Server")
            raise
Example #3
def model_input():
    input_list = []
    source = mongo_config.get('col_name')  # Constant: Mongo collection name
    mongo_colln = initialize_mongo(source)
    try:
        logging.info("Building dataframe of input.")
        for documents in mongo_colln.find():
            if documents["Extracted_Date"] == str(datetime.today().date()):
                input_list.append([
                    documents['checksum'],
                    str(documents['Review_Text']).replace("\n", ''),
                    documents['Restaurent_id/Hotel_id'], documents['Country'],
                    documents['Restaurent_name/Hotel_name'],
                    documents['User_Name'], documents['Rating'],
                    documents['Source'], documents['Rating_text'],
                    documents['Posted_date'], documents['User_id'],
                    documents['City'], documents['Area'],
                    documents['User_Age'], documents['User_Location'],
                    documents['User_Gender'], documents['Max_Rating']
                ])
    except Exception:
        logging.error("Error while building the input dataframe.")
    input_dataframe = pd.DataFrame(
        input_list,
        columns=[
            'Review_id', 'Review_Text', 'Restaurent_id/Hotel_id', 'Country',
            'Restaurent_name/Hotel_name', 'User_Name', 'Rating', 'Source',
            'Rating_text', 'Posted_date', 'User_id', 'City', 'Area',
            'User_Age', 'User_Location', 'User_Gender', 'Max_Rating'
        ])
    return input_dataframe
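
A minimal usage sketch; the returned dataframe holds only the reviews extracted today:

df = model_input()
print(df.shape)
print(df[['Review_id', 'Rating', 'Posted_date']].head())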
Example #4
def load_sentiments_to_mongo():
    logging.basicConfig(filename='ABSA_uptd-logs.log',
                        filemode='w',
                        format='%(asctime)s %(levelname)s \
                        %(module)s.%(funcName)s :: %(message)s',
                        datefmt='%d:%m:%Y %H:%M:%S',
                        level=logging.DEBUG)
    try:
        logging.info("Fetching Input Reviews from Database.")
        source = mongo_config.get('col_name1')
        mongo_colln = initialize_mongo(source)
        input_dataframe = model_input()
        logging.info("Creating Output Object Stricture.")
        logging.info("Getting Aspects and Sentiments for Review.")
        logging.info("Triggering Model for aspects and sentiments.")
        for r, sentence in input_dataframe.iterrows():
            json_obj = {}
            json_obj["REVIEW"] = str(sentence["Review_Text"])
            json_obj["REVIEW_ID"] = str(sentence["Review_id"])
            json_obj["Restaurent_id/Hotel_id"] = str(
                sentence["Restaurent_id/Hotel_id"])
            json_obj["Country"] = sentence["Country"]
            json_obj["Restaurent_name/Hotel_name"] = sentence[
                "Restaurent_name/Hotel_name"]
            json_obj["User_Name"] = str(sentence["User_Name"])
            json_obj["Rating"] = str(sentence["Rating"])
            json_obj["Source"] = sentence["Source"]
            json_obj["Rating_text"] = sentence["Rating_text"]
            json_obj["Posted_date"] = str(sentence["Posted_date"])
            json_obj["User_id"] = str(sentence["User_id"])
            json_obj["City"] = sentence["City"]
            json_obj["Area"] = sentence["Area"]
            json_obj["User_Age"] = str(sentence["User_Age"])
            json_obj["User_Location"] = sentence["User_Location"]
            json_obj["User_Gender"] = sentence["User_Gender"]
            json_obj["Max_Rating"] = str(sentence["Max_Rating"])
            try:
                # Flatten each {aspect: polarity} dict returned by the
                # model into plain string key/value pairs.
                for sentiment in get_polarities(str(sentence["Review_Text"])):
                    for aspect, polarity in sentiment.items():
                        json_obj[str(aspect)] = str(polarity)
            except Exception:
                logging.error(
                    "Issue while loading sentiment to Database Collection")
                continue
            mongo_id = hashlib.md5(str(json_obj).encode('utf-8')).hexdigest()
            update_mongo_collection(mongo_colln, mongo_id, json_obj)
    except Exception:
        logging.error(
            "Error while fetching input from Mongo or getting aspects "
            "and sentiments.")
        raise
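
For reference, a small self-contained demo of the flattening step above, using a hypothetical get_polarities() result:

# Hypothetical model output: one {aspect: polarity} dict per detected aspect.
polarities = [{'food': 'positive'}, {'service': 'negative'}]
json_obj = {}
for sentiment in polarities:
    for aspect, polarity in sentiment.items():
        json_obj[str(aspect)] = str(polarity)
print(json_obj)  # {'food': 'positive', 'service': 'negative'}
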
def main():
    logging.basicConfig(filename='Reviews-Extractor-logs.log',
                        filemode='w',
                        format='%(asctime)s %(levelname)s \
                        %(module)s.%(funcName)s :: %(message)s',
                        datefmt='%d:%m:%y %H:%M:%S',
                        level=logging.INFO)
    col_name = mongo_config.get('col_name')
    # Initializing Mongo With Collection Name
    try:
        logging.info("Starting Hotel Reviews load to Mongo")
        hotel_mongo_load()
    except Exception:
        logging.warning("Issue while loading Hotel Reviews to Mongo")
    logging.info("Starting Restaurant Reviews load to Mongo")
    mongo_colln = initialize_mongo(col_name)
    try:
        # Checking for the latest date for data in Mongo;
        # count_documents replaces the deprecated cursor.count().
        if mongo_colln.count_documents({}) > 0:
            logging.info(
                "Checking for latest date for data in Database for Incremental Inserting"
            )
            for obj in mongo_colln.find().sort("Posted_date", -1).limit(1):
                logging.info("Calling Sources functions for Extraction")
                scrapers = [
                    google_reviews(obj),
                    zomato_reviews(obj),
                    citysearch_reviews(obj),
                    twitter_reviews(obj)
                ]
                for scraper in scrapers:
                    logging.info("Adding data to database collections.")
                    for each in scraper:
                        update_mongo_collection(mongo_colln, each["checksum"],
                                                each)
        else:
            logging.info(
                "No latest date found; collection is empty. Inserting from the beginning."
            )
            obj = None
            scrapers = [
                google_reviews(obj),
                zomato_reviews(obj),
                citysearch_reviews(obj),
                twitter_reviews(obj)
            ]
            for scraper in scrapers:
                logging.info("Adding data to database collections.")
                for each in scraper:
                    update_mongo_collection(mongo_colln, each["checksum"],
                                            each)
    except Exception:
        logging.error("Restaurant Source Extractions threw an error.")
Example #6
def saveCodesInMongo(qcode_name):

    quandl_codes_colln_name = mongo_config.get('quandl_codes_colln_name')

    q_data_base_URL = "https://www.quandl.com/api/v3/datasets/{0}"

    filenamesList = []
    for (dirpath, dirnames, filenames) in walk(DEFAULT_DATA_PATH):
        filenamesList.extend(filenames)

    qcodes_colln = mongo.initialize_mongo(quandl_codes_colln_name)
    for fn in filenamesList:
        try:
            dataset_qcodes = []
            logging.info(fn + " extracted.")
            codesFile = os.path.abspath(os.path.join(DEFAULT_DATA_PATH, fn))
            dataset = fn.replace('-datasets-codes.csv', '')

            qcode_cursor = qcodes_colln.find_one({'dataset': dataset})
            with open(codesFile, 'r') as csv_file:
                csvlines = csv_file.readlines()

                for num, line in enumerate(csvlines):
                    codeline = line.split(',')
                    if len(codeline) > 1:
                        dataset_code = codeline[0]
                        dataset_descrpn = codeline[1]
                        created_time = datetime.now().strftime("%Y-%m-%d")

                        code_doc = {
                            "dataset": dataset,
                            "dataset_code": dataset_code,
                            "description": dataset_descrpn,
                            "base_url": q_data_base_URL.format(dataset_code),
                            "created_time": created_time,
                            "name": qcode_name,
                            "_id": dataset_code,
                        }
                        dataset_qcodes.append(code_doc)

            # Update existing codes for the dataset; insert new ones otherwise.
            if qcode_cursor:
                mongo.bulk_mongo_update(qcodes_colln, dataset_qcodes)
            else:
                mongo.bulk_mongo_insert(qcodes_colln, dataset_qcodes)

        except Exception:
            # Log the failure for this file, then propagate it.
            logging.error("Error while saving codes from " + fn)
            raise
        finally:
            os.remove(codesFile)
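
A hedged demo of the per-line transformation above; the code and description values are hypothetical:

# One line of a *-datasets-codes.csv file, split exactly as in the loop above.
line = "WIKI/AAPL,Apple Inc. (AAPL) Stock Prices\n"
codeline = line.split(',')
print(codeline[0])  # dataset_code and _id: WIKI/AAPL
print("https://www.quandl.com/api/v3/datasets/{0}".format(codeline[0]))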
Example #7
File: Main.py Project: aghogre/mlb
def main():
    logging.basicConfig(format='%(asctime)s %(levelname)s \
                        %(module)s.%(funcName)s :: %(message)s',
                        level=logging.INFO)
    source = mongo_config.get('col_name')
    mongo_colln = initialize_mongo(source)
    archive_url = argument_config.get('stl_batting')
    a_team_list = []
    obj_list = []
    peoria_chiefs_batting = argument_config.get('peoria_chiefs_batting')
    springfield_cardinals_batting = argument_config.get(
        'springfield_cardinals_batting')
    memphis_redbirds_batting = argument_config.get('memphis_redbirds_batting')
    minor_url_list = [
        peoria_chiefs_batting, springfield_cardinals_batting,
        memphis_redbirds_batting
    ]
    try:
        logging.info("Starting Data Extraction for St Louis Cardinals Batters")
        stats_empty_list = []
        stat_list = stl_batting_stats(archive_url, stats_empty_list)
        obj_list = json_object_building(stat_list)
        logging.info("Loading Data To Mongo")
        for obj in obj_list:
            mongo_id = obj["NAME"] + "-" + obj["TEAM"]
            update_mongo_collection(mongo_colln, mongo_id, obj)

        logging.info(
            "Starting Data Extraction for St Louis Cardinals Minor League Batters"
        )
        obj_list = []
        for i in minor_url_list:
            if i == peoria_chiefs_batting:
                team = "PEORIA CHIEFS"
            elif i == springfield_cardinals_batting:
                team = "SPRINGFIELD CARDINALS"
            elif i == memphis_redbirds_batting:
                team = "MEMPHIS REDBIRDS"
            data_list = a_team_batting_stats(a_team_list, i)
            # Accumulate objects across all teams instead of keeping
            # only the last team's list.
            obj_list.extend(minor_json_object_building(data_list, team))
            data_list = []
        logging.info("Loading Data To Mongo")
        for obj in obj_list:
            mongo_id = obj["NAME"] + "-" + obj["TEAM"]
            update_mongo_collection(mongo_colln, mongo_id, obj)
    except Exception:
        logging.error(
            "Error occurred while scraping and loading; re-raising to "
            "surface the exact error.")
        raise
Example #8
def sentiment_generator(input_dataframe):
    try:
        dataset_list = []
        logging.info("Getting Sentiments of Loaded Reviews.")
        source2 = mongo_config.get("col_name3")
        mongo_colln2 = initialize_mongo(source2)
        logging.info("Loading Sentiments to Mongo")
        for index, i in input_dataframe.iterrows():
            review_dict = {}
            review_dict['REVIEW_ID'] = str(i['Review_id'])
            review_dict['Area'] = str(i['Area'])
            review_dict['City'] = str(i['City'])
            review_dict['Country'] = str(i['Country'])
            review_dict['Posted_date'] = str(i['Posted_date'])
            review_dict['REVIEW'] = str(i['Review_Text'])
            review_dict['Rating'] = str(i['Rating'])
            review_dict['Restaurent_id/Hotel_id'] = str(
                i['Restaurent_id/Hotel_id'])
            review_dict['Restaurent_name/Hotel_name'] = str(
                i['Restaurent_name/Hotel_name'])
            review_dict['Source'] = i['Source']
            review_dict['User_Name'] = str(i['User_Name'])
            review_dict['User_id'] = str(i['User_id'])
            review_dict['User_Gender'] = str(i['User_Gender'])
            review_dict['User_Age'] = str(i['User_Age'])
            review_dict['Max_Rating'] = str(i['Max_Rating'])
            review_dict['User_Location'] = str(i['User_Location'])
            try:
                sentiments, sentences = flask_entry(i["Review_Text"])
                # Flatten each {aspect: polarity} dict into plain string
                # key/value pairs on the output record.
                for sentiment in sentiments:
                    for aspect, polarity in sentiment.items():
                        review_dict[str(aspect)] = str(polarity)
            except Exception:
                continue

            review_dict['_id'] = hashlib.md5(
                json.dumps(review_dict,
                           sort_keys=True).encode('utf8')).hexdigest()
            dataset_list.append(review_dict)
        bulk_mongo_update(mongo_colln2, dataset_list)
    except Exception:
        logging.error(
            "Error in getting sentiments of reviews. Unable to load to Mongo.")
Example #9
def make_mongo_connection(collection_name):
    """This is to establish connection with MongoDB with desired Credentials"""

    # Fetching config parameters.
    mongo_uri = mongo_config.get('mongo_uri')
    requires_auth = mongo_config.get('requires_auth')
    mongo_username = mongo_config.get('mongo_username')
    mongo_password = mongo_config.get('mongo_password')
    mongo_auth_source = mongo_config.get('mongo_auth_source')
    mongo_auth_mechanism = mongo_config.get('mongo_auth_mechanism')
    db_name = mongo_config.get('db_name')
    ssl_required = mongo_config.get('ssl_required')
    replicaSet = mongo_config.get('replicaSet')

    client = MongoClient(mongo_uri, ssl=ssl_required,
                         replicaSet=replicaSet)

    if requires_auth == 'true':
        client.the_database.authenticate(mongo_username,
                                         mongo_password,
                                         source=mongo_auth_source,
                                         mechanism=mongo_auth_mechanism)

    db = client[db_name]
    mongo_colln = db[collection_name]

    # Insert and delete a throwaway document so the collection exists,
    # avoiding index-creation failure when the collection does not exist.
    test_uuid = str(uuid1())
    try:
        mongo_colln.insert_one({'uuid': test_uuid})
        mongo_colln.delete_one({'uuid': test_uuid})
    except Exception:
        logging.debug("Collection %s already exists" % collection_name)

    return mongo_colln
def initialize_mongo(source):
    """Initializes MongoDB Connection and returns MongoCollection for the
    given Index."""

    # Fetching config parameters.
    mongo_index_name = mongo_config.get('mongo_index_name')

    try:
        # Creating Mongo Collection
        mongo_colln = make_mongo_connection(source)

        # Create index, if it is not available.
        if mongo_index_name not in mongo_colln.index_information():
            mongo_colln.create_index(mongo_index_name, unique=False)

    except IOError:
        logging.error("Could not connect to Mongo Server")
        raise

    return mongo_colln
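
A hedged usage sketch of the two helpers above; 'reviews' is a hypothetical collection name:

colln = initialize_mongo('reviews')
colln.insert_one({'_id': 'example-1', 'Rating': '4'})
print(colln.find_one({'_id': 'example-1'}))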
Example #11
def make_mongo_connection(collection_name):
    """This is to establish connection with MongoDB with desired Credentials"""

    # Fetching config parameters.
    mongo_uri = mongo_config.get('mongo_uri')
    requires_auth = mongo_config.get('requires_auth')
    mongo_username = mongo_config.get('mongo_username')
    mongo_password = mongo_config.get('mongo_password')
    mongo_auth_source = mongo_config.get('mongo_auth_source')
    mongo_auth_mechanism = mongo_config.get('mongo_auth_mechanism')
    db_name = mongo_config.get('db_name')
    ssl_required = mongo_config.get('ssl_required')
    
    client = MongoClient(mongo_uri, ssl=ssl_required,
                         replicaSet='MLBStats-shard-0', connect=False)
    if requires_auth == 'true':
        client.the_database.authenticate(mongo_username,
                                         mongo_password,
                                         source=mongo_auth_source,
                                         mechanism=mongo_auth_mechanism)

    db = client[db_name]
    mongo_colln = db[collection_name]
    
    #client = pymongo.MongoClient("mongodb://*****:*****@mlbstats-shard-00-00-clx9y.mongodb.net:27017,mlbstats-shard-00-01-clx9y.mongodb.net:27017,mlbstats-shard-00-02-clx9y.mongodb.net:27017/test?ssl=true&replicaSet=MLBStats-shard-0&authSource=admin&retryWrites=true")
    


    # Insert and delete a throwaway document so the collection exists,
    # avoiding index-creation failure when the collection does not exist.
    test_uuid = str(uuid1())
    try:
        mongo_colln.insert_one({'uuid': test_uuid})
        mongo_colln.delete_one({'uuid': test_uuid})
    except Exception:
        logging.debug("Collection %s already exists" % collection_name)

    return mongo_colln
def hotel_mongo_load():
    col_name = mongo_config.get('col_name_hotel')
    # Initializing Mongo With Collection Name
    mongo_colln = initialize_mongo(col_name)
    try:
        # Checking for the latest date for data in Mongo;
        # count_documents replaces the deprecated cursor.count().
        if mongo_colln.count_documents({}) > 0:
            logging.info(
                "Checking for latest date for data in Database for Incremental Inserting"
            )

            for obj in mongo_colln.find().sort("Posted_date", -1).limit(1):
                logging.info("Calling Sources functions for Extraction")
                scrapers = [
                    google_reviews_hotels(obj),
                    twitter_reviews_hotels(obj)
                ]
                for scraper in scrapers:
                    logging.info("Adding data to database collections.")
                    for each in scraper:
                        update_mongo_collection(mongo_colln, each["checksum"],
                                                each)
        else:
            logging.info(
                "No latest date found; collection is empty. Inserting from the beginning."
            )
            obj = None
            scrapers = [
                google_reviews_hotels(obj),
                twitter_reviews_hotels(obj)
            ]

            for scraper in scrapers:
                logging.info("Adding data to database collections.")
                for each in scraper:
                    update_mongo_collection(mongo_colln, each["checksum"],
                                            each)
    except Exception:
        logging.error("Hotel Source Extractions threw an error.")
def main():
    db_name = mongo_config.get('db_name')

    mongodb = mongodbConnector()
    colln_names = mongodb.get_collection_names()

    root_folder = os.path.abspath(
        os.path.join(os.path.dirname(__file__), db_name))

    if not os.path.isdir(db_name):
        os.mkdir(db_name)

    for colln in colln_names:
        os.chdir(root_folder)
        mongo_colln = mongodb.initialize_mongo(colln)
        doc_count = mongo_colln.count_documents({})
        print(colln + " (" + str(doc_count) + ")")

        if doc_count == 0:
            continue

        file_name = colln[:60]
        if not os.path.isdir(file_name):
            os.mkdir(file_name)
        os.chdir(file_name)
        cwd = os.getcwd()
        filesInDir = os.listdir(cwd)
        if len(filesInDir) > 0:
            print(str(len(filesInDir)) + " files present")
            continue

        try:
            data_list = []
            has_written = False
            j = 0
            for i in mongo_colln.find():
                data_list.append(i)
                if len(data_list) == 100000:
                    with open(file_name + '_' + str(j) + '.json',
                              'w') as colln_file:
                        colln_file.write(
                            json.dumps(data_list,
                                       indent=4,
                                       default=json_util.default))
                    data_list = []
                    has_written = True
                    j += 1

            # Write any remaining documents that did not fill a full batch.
            if data_list:
                if not has_written:
                    with open(file_name + '.json', 'w') as colln_file:
                        colln_file.write(
                            json.dumps(data_list,
                                       indent=4,
                                       default=json_util.default))
                else:
                    with open(file_name + '_' + str(j) + '.json',
                              'w') as colln_file:
                        colln_file.write(
                            json.dumps(data_list,
                                       indent=4,
                                       default=json_util.default))
        except Exception:
            logging.error("Error while exporting collection %s." % colln)
            raise
Example #14
def upload_files(df):
    try:
        dataset_list = []
        logging.info("Starting the loading Process.")
        source = mongo_config.get("col_name2")
        mongo_colln = initialize_mongo(source)
        logging.info("Loading input dataset to mongo.")
        for index, i in df.iterrows():
            resp_dict = {}
            resp_dict['Area'] = i["Area"]
            resp_dict['City'] = i["City"]
            resp_dict['Country'] = i["Country"]
            resp_dict['Extracted_Date'] = str(datetime.today().date())
            s = str(i["Posted_date"])
            days = ['day', 'days', 'Days', 'Day']
            months = ["months", "month", "Months", "Month"]
            yesterday = ["yesterday", "Yesterday"]
            today = [
                "hours", "hour", "Hours", "Hour", "Minutes", "Minute",
                "minutes", "minute", "mins", "min", "secs", "Seconds",
                "Second", "sec", "Hrs", "hrs", "Today", "today", "seconds",
                "second"
            ]
            # "N days" style strings; exclude exact values that merely
            # contain "day" and are handled by the branches below.
            if any(x in s for x in days) and \
                    s not in ('yesterday', 'Yesterday', 'Today', 'today'):
                parsed_s = s.split()[:1]
                past_time = datetime.today() - timedelta(days=int(parsed_s[0]))
                resp_dict['Posted_date'] = str(past_time)[:10]
            elif any(x in s for x in months):
                parsed_s = s.split()[:1]
                past_time = datetime.today() - relativedelta(
                    months=int(parsed_s[0]))
                resp_dict['Posted_date'] = str(past_time)[:10]
            elif any(x in s for x in yesterday):
                past_time = datetime.today() - timedelta(days=1)
                resp_dict['Posted_date'] = str(past_time)[:10]
            elif any(x in s for x in today):
                past_time = datetime.today().date()
                resp_dict['Posted_date'] = str(past_time)[:10]
            else:
                resp_dict['Posted_date'] = str(s)
            resp_dict['Rating'] = str(i["Rating"])
            resp_dict['Rating_text'] = str(i["Rating_text"])
            resp_dict['Restaurent_id/Hotel_id'] = str(
                i["Restaurent_id/Hotel_id"])
            resp_dict['Restaurent_name/Hotel_name'] = str(
                i["Restaurent_name/Hotel_name"])
            if str(i["Review_Text"]) == "":
                break
            else:
                resp_dict['Review_Text'] = str(i["Review_Text"])
            resp_dict['Source'] = "file_upload"
            resp_dict['User_Name'] = str(i["User_Name"])
            resp_dict['User_id'] = str(i["User_id"])
            resp_dict['User_Gender'] = str(i["User_Gender"])
            resp_dict['User_Age'] = str(i["User_Age"])
            resp_dict['Max_Rating'] = str(i["Max_Rating"])
            resp_dict['User_Location'] = str(i["User_Location"])
            resp_dict['checksum'] = resp_dict['_id'] = hashlib.md5(
                json.dumps(resp_dict,
                           sort_keys=True).encode('utf8')).hexdigest()
            dataset_list.append(resp_dict)
        bulk_mongo_update(mongo_colln, dataset_list)
    except Exception:
        flash(
            "Error while loading Dataset to mongo, invalid data in input file or Review is empty. Please check the schema for input file."
        )
        logging.error(
            "Error while loading Dataset to mongo, invalid data in input file or Review is empty. Please check the schema for input file."
        )
        return render_template('UploadFiles.html')

    try:
        input_list = []
        for documents in mongo_colln.find():
            if documents["Extracted_Date"] == str(datetime.today().date(
            )) and resp_dict['Source'] == "file_upload":
                input_list.append([
                    documents['checksum'],
                    str(documents['Review_Text']).replace("\n", ''),
                    documents['Restaurent_id/Hotel_id'], documents['Country'],
                    documents['Restaurent_name/Hotel_name'],
                    documents['User_Name'], documents['Rating'],
                    documents['Source'], documents['Rating_text'],
                    documents['Posted_date'], documents['User_id'],
                    documents['City'], documents['Area'],
                    documents['User_Gender'], documents['User_Age'],
                    documents['Max_Rating'], documents['User_Location']
                ])
    except Exception:
        flash("Error while creating Dataframe for Sentiment.")
        logging.error("Error while creating Dataframe for Sentiment.")
        return render_template('UploadFiles.html')

    df1 = pd.DataFrame(input_list,
                       columns=[
                           'Review_id', 'Review_Text',
                           'Restaurent_id/Hotel_id', 'Country',
                           'Restaurent_name/Hotel_name', 'User_Name', 'Rating',
                           'Source', 'Rating_text', 'Posted_date', 'User_id',
                           'City', 'Area', 'User_Gender', 'User_Age',
                           'Max_Rating', 'User_Location'
                       ])
    input_dataframe = df1.replace(np.nan, '', regex=True)
    flash(
        "File Uploaded Successfully, getting sentiments for input dataset...")
    return input_dataframe
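
A small self-contained demo of the relative-date branch above: strings like "3 days" become absolute dates.

from datetime import datetime, timedelta

s = "3 days"  # hypothetical Posted_date value
parsed_s = s.split()[:1]
past_time = datetime.today() - timedelta(days=int(parsed_s[0]))
print(str(past_time)[:10])  # the date three days before today, YYYY-MM-DD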
def main():
    """Initiates the Financial news extraction from Quandl using API calls."""

    t1 = time.time()
    logging.basicConfig(format='%(asctime)s %(levelname)s \
                        %(module)s.%(funcName)s :: %(message)s',
                        level=logging.INFO)

    # fetching arguments from config.
    quandl_apikey = argument_config.get('quandl_apikey')
    meta_col_name = mongo_config.get('meta_colln_name')
    quandl_codes_colln_name = mongo_config.get('quandl_codes_colln_name')

    qcodes_colln = mongo.initialize_mongo(quandl_codes_colln_name)
    meta_mongo_colln = mongo.initialize_mongo(meta_col_name)

    # Runs continuously; an unhandled exception is the only way out.
    while True:
        try:
            # Fetching dataset codes from Quandl.
            qcodes_cursor = qcodes_colln.find()
            for qcur in qcodes_cursor:
                # Re-download the codes every 30 days, checking only the
                # first document's created_time.
                curr_date = datetime.now().strftime("%Y-%m-%d")
                codes_dt = datetime(*map(int, (qcur['created_time'])
                                              .split("-")))
                curr_dt = datetime(*map(int, curr_date.split("-")))

                if (curr_dt - codes_dt).days > 30:
                    getCodesInCSVsForAllDatasets(quandl_apikey)
                break
            else:
                # for/else: runs only when the cursor is empty, i.e. the
                # Quandl codes are being downloaded for the first time.
                getCodesInCSVsForAllDatasets(quandl_apikey)

            # Fetch the Quandl codes from mongo collection to extract data,
            qcodes_cursor = qcodes_colln.find()

            src_colln_list = []
            for qcur in qcodes_cursor:
                base_url = qcur['base_url']
                data_URL = base_url + "?api_key={0}"
                dataset_code = qcur['dataset_code']
                dataset_descrpn = qcur['description']
                qcode_name = qcur['name']

                src_colln_name = dataset_code.lower().split("/")[0]
                meta_obj_name = src_colln_name + "." + dataset_code.split(
                    "/")[1]

                if src_colln_name not in src_colln_list:
                    src_colln_list.append(src_colln_name)
                else:
                    continue
                logging.info("Executing dataset code :: " + dataset_code)

                src_colln = mongo.initialize_mongo(src_colln_name)
                resp_data = ''
                mongo_id = ''
                data_mode = ''
                prev_count = 0

                # Check whether the collection already holds data in MongoDB.
                metadata_count = src_colln.count_documents({})
                if metadata_count == 0:

                    time.sleep(3)
                    resp = os.popen("curl " + data_URL.format(quandl_apikey))
                    resp_data = resp.read()
                    data_mode = "initial"

                    # Persisting functionality to Mongo.
                    saveQuandlData(resp_data, src_colln, src_colln_name,
                                   dataset_descrpn, dataset_code, data_mode,
                                   prev_count, qcode_name)
        except Exception:
            # The loop above only exits via an exception, so log the
            # elapsed time here before propagating it.
            logging.info("Total time taken to fetch data from Quandl : " +
                         str(round(float((time.time() - t1) / 60), 1)) +
                         " minutes")
            raise
Example #16
def insert_into_ckan(mongo_uri, source, qcode_name, description, refresh_rate):
    """"CKAN holds the meta information about the saved data of MongoDB."""

    logging.basicConfig(format='%(asctime)s %(levelname)s \
                        %(module)s.%(funcName)s :: %(message)s',
                        level=logging.INFO)

    # Fetch config params.
    ckan_host = argument_config.get('ckan_host')
    api_key = argument_config.get('api_key')
    owner_org = argument_config.get('owner_org')
    publisher = argument_config.get('publisher')
    ckan_private = argument_config.get('ckan_private')
    db_name = mongo_config.get('db_name')

    ckan_ckan = ckanapi.RemoteCKAN(ckan_host, apikey=api_key)

    package_name = source.lower().replace("_", "-")\
                                 .replace("(", "-")\
                                 .replace(")", "-")\
                                 .replace("/", "-")\
                                 .replace(".", "")\
                                 .replace("&", "")\
                                 .replace(":", "")\
                                 .replace("---", "-")\
                                 .replace("--", "-")

    package_name = package_name[:99]
    if package_name.endswith("-"):
        package_name = package_name.rstrip('-')

    package_title = qcode_name

    dict_additional_fields = {
        'Title': package_title,
        'Sourcing date': datetime.now().strftime("%B %d, %Y, %H:%M"),
        'Source': source,
        'Datastore': mongo_uri,
        'Database name': db_name,
        'Collection': source,
        'Description': description,
        'Refresh rate': refresh_rate,
    }
    additional_fields = []
    for k, v in dict_additional_fields.items():
        additional_fields.append({'key': k, 'value': v})

    tags = buildTags(package_name)
    try:
        ckan_ckan.action.package_create(
            name=package_name,
            title=package_title,
            maintainer=publisher,
            tags=tags,
            notes=description,
            private=ckan_private,
            owner_org=owner_org,
            extras=additional_fields,
        )
    except Exception:
        # The package may already exist; fall back to updating it.
        try:
            ckan_ckan.action.package_update(
                id=package_name,
                title=package_title,
                maintainer=publisher,
                tags=tags,
                notes=description,
                private=ckan_private,
                owner_org=owner_org,
                extras=additional_fields,
            )
        except Exception:
            logging.error("CKAN package creation/update failed: " +
                          package_name)
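
A hedged usage sketch; all argument values are hypothetical:

insert_into_ckan("mongodb://localhost:27017", "wiki_aapl",
                 "WIKI AAPL", "Apple Inc. EOD prices", "daily")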