Example #1
def login():
    login_form = LoginForm(request.form)
    if request.method == "POST":
        db = DBUtils()

        username = login_form.username.data
        password = login_form.password.data
        print(username, password)

        ret = db.check_users(username=username, password=password)
        if ret == 1:
            # download the zip file
            print("downloading")
            return send_from_directory(directory=app.config['CONTAINER'],
                                       filename=app.config['file'])
        elif ret == 0:
            error = 'Invalid credentials'

        elif ret == -1:
            error = "cannot connect to the DB"

        else:
            error = ""
        return render_template('login.html',
                               error=error,
                               login_form=login_form)

    return render_template('login.html', login_form=login_form, error=None)
def parallel_function(server):
    db = DBUtils(db_name='JobListings',
                 host='master.mongodb.d.int.zippia.com',
                 connect=False)
    collection_name = db.fetch_data('MetaCollection')[0]['current']
    hit_time = time.time()
    jobs = read_test_data(db, collection_name=collection_name)
    update_jobs(db, jobs, collection_name, server)
    print "{} hit time {}s".format('job', time.time() - hit_time)
    del jobs
    del db
def read_local_skill_master():
    db_zippia2 = DBUtils(db_name='zippia2', host='localhost')
    skill_master_dict = {}
    cursor = db_zippia2.fetch_data(configurator.commons.SKILL_MASTER, 'cursor',
                                   {},
                                   {'lay_title': 1,
                                    'median_time_to_reach': 1})
    for elem in cursor:
        if "median_time_to_reach" in elem:
            skill_master_dict[elem['lay_title']] = {}
            skill_master_dict[elem['lay_title']] = elem['median_time_to_reach']
    return skill_master_dict
def read_skill_master():
    dbutils = DBUtils(db_name='zippia', host='master.mongodb.d.int.zippia.com')
    skill_master_dict = {}
    cursor = dbutils.fetch_data(
        configurator.commons.SKILL_MASTER, 'cursor', {},
        {'lay_title': 1,
         'most_popular_soc_codes': 1,
         'skill_set': 1})
    for elem in cursor:
        skill_master_dict[elem['lay_title']] = {}
        skill_master_dict[elem['lay_title']]['soc_code'] = elem[
            'most_popular_soc_codes'][0]
        skill_master_dict[elem['lay_title']][
            'skill_set'] = [skill[0] for skill in elem['skill_set'][:30]]
    return skill_master_dict
Example #5
class ChatBot:
    allowed_resources = ['entrance door', 'reception area', 'pool', 'gym', 'garage']
    allowed_times = ['morning', 'evening', 'afternoon', 'night']

    sql_builder = None
    db_utils = None

    def __init__(self):
        self.sql_builder = SqlBuilder()
        self.db_utils = DBUtils()

    def process_question(self, question):
        query_parameters = {}
        blob = TextBlob(question)
        noun_phrases = blob.noun_phrases
        if len(noun_phrases) > 0:
            self.process_noun_phrases(noun_phrases, query_parameters)
        for word, tag in blob.pos_tags:
            if tag == 'NN' or tag == 'NNS':
                self.process_noun(word.lower(), query_parameters)
            self.column_name(query_parameters, word)

        while 'column' not in query_parameters.keys():
            text = input("Looking for a count or people? ['How many'/'Who']")
            self.column_name(query_parameters, text.lower())

        while 'resource_name' not in query_parameters.keys():
            text = input("Where do you want to look?? {0}\n".format(self.allowed_resources))
            if text.lower() in self.allowed_resources:
                query_parameters['resource_name'] = text.lower()

        query = self.sql_builder.build_query(column=query_parameters['column'],
                                             resource_name=query_parameters['resource_name'],
                                             time_identifier=query_parameters[
                                                 'time'] if 'time' in query_parameters.keys() else None)
        result = self.db_utils.query(query)
        if 'count' in query_parameters['column']:
            print("Total of ", result[0][0])
        else:
            print("It's ", ', '.join(str(r[0]) for r in result))

    def column_name(self, query_parameters, word):
        if 'how' in word.lower():
            query_parameters['column'] = 'count(*)'
        if 'who' in word.lower():
            query_parameters['column'] = 'distinct(credential_holder_name)'

    def process_noun_phrases(self, noun_phrases, query_parameters):
        count = 0
        while count < len(noun_phrases) and noun_phrases[count] and noun_phrases[count].lower() != 'entrance door':
            count = count + 1
        if count < len(noun_phrases):
            query_parameters['resource_name'] = noun_phrases[count]

    def process_noun(self, word, query_parameters):
        if word.lower() in self.allowed_times:
            query_parameters['time'] = word
        elif word.lower() in self.allowed_resources:
            query_parameters['resource_name'] = word
Example #6
def process():
    db = DBUtils()
    db.create()
    data = read_from_queue()
    if len(data) > 0:
        for msg in data:
            for key, val in msg.items():
                db_record = dict(zip(["desc", "src"], [key, val]))
                db.save_data(db_record)
        print("Data received, msg_cnt: %d" % len(data))
    db.close()
def create_graph_for_majors(major, degrees, work_meta_info, min_conf,
                            depriotize_starts):
    dbutils = DBUtils(db_name='zippia', host='master.mongodb.d.int.zippia.com')
    edges = {}
    nodes = set()
    resume_count = 0
    resumes = dbutils.fetch_data(
        'resume', "cursor",
        {'latest_ed_major': major,
         'latest_ed_degree': {
             '$in': list(degrees)
         }})
    for resume in resumes:
        ''' Look at the last education info '''
        resume_count += 1
        create_edges_from_resume(resume, work_meta_info, min_conf, edges,
                                 nodes, resume["latest_ed_major"])
        if resume_count % 10000 == 0:
            print "Processed {} resumes".format(resume_count)
    print "Statistics for major:", major

    edges = manage_edges(edges, resume_count, depriotize_starts)
    print "The graph has: {} edges and {} nodes".format(len(edges), len(nodes))
    return edges
Example #8
def db_info():
    db = DBUtils()
    db.create()
    db.load_data(settings.DB_RECORDS_COUNT)
Example #9
def import_dataset(dataset_name, namespace, csv_path, default_variable_display,
                   source_name):
    with connection as c:
        db = DBUtils(c)

        # Check whether the database is up to date, by checking the
        # - last modified date of the Grapher file
        # - last modified date of the database row
        #
        # This is not bulletproof, but it allows for flexibility – authors could manually update
        # the repo, and that would trigger a database update too.

        (db_dataset_id, db_dataset_modified_time) = db.fetch_one(
            """
            SELECT id, dataEditedAt
            FROM datasets
            WHERE name = %s
            AND namespace = %s
        """, [dataset_name, namespace])

        db_dataset_modified_time = db_dataset_modified_time.replace(
            tzinfo=tz_db)
        file_modified_time = datetime.fromtimestamp(
            os.stat(csv_path).st_mtime).replace(tzinfo=tz_local)

        if file_modified_time <= db_dataset_modified_time:
            print(f"Dataset is up to date: {dataset_name}")
            sys.exit(0)

        print("Updating database...")

        # Load dataset data frame

        df = pd.read_csv(csv_path)

        # Check whether all entities exist in the database.
        # If some are missing, report & quit.

        entity_names = list(df['Country'].unique())

        db_entities_query = db.fetch_many(
            """
            SELECT id, name
            FROM entities
            WHERE name IN %s
        """, [entity_names])

        db_entity_id_by_name = {name: id for id, name in db_entities_query}

        # Terminate if some entities are missing from the database
        missing_entity_names = set(entity_names) - set(
            db_entity_id_by_name.keys())
        if len(missing_entity_names) > 0:
            print_err(
                f"Entity names missing from database: {str(missing_entity_names)}"
            )
            sys.exit(1)

        # Fetch the source

        (db_source_id, ) = db.fetch_one(
            """
            SELECT id
            FROM sources
            WHERE datasetId = %s
        """, db_dataset_id)

        # Check whether all variables match database variables.

        id_names = ["Country", "Year"]
        variable_names = list(set(df.columns) - set(id_names))

        db_variables_query = db.fetch_many(
            """
            SELECT id, name
            FROM variables
            WHERE datasetId = %s
        """, [db_dataset_id])

        db_variable_id_by_name = {name: id for id, name in db_variables_query}

        # Remove any variables no longer in the dataset. This is safe because any variables used in
        # charts won't be deleted because of database constraint checks.

        variable_names_to_remove = list(
            set(db_variable_id_by_name.keys()) - set(variable_names))
        if len(variable_names_to_remove):
            print(f"Removing variables: {str(variable_names_to_remove)}")
            variable_ids_to_remove = [
                db_variable_id_by_name[n] for n in variable_names_to_remove
            ]
            db.execute(
                """
                DELETE FROM data_values
                WHERE variableId IN %(ids)s;
                DELETE FROM variables
                WHERE id IN %(ids)s;
            """, {'ids': variable_ids_to_remove})

        # Add variables that didn't exist before. Make sure to set yearIsDay.

        variable_names_to_insert = list(
            set(variable_names) - set(db_variable_id_by_name.keys()))
        if len(variable_names_to_insert):
            print(f"Inserting variables: {str(variable_names_to_insert)}")
            for name in variable_names_to_insert:
                db_variable_id_by_name[name] = db.upsert_variable(
                    name=name,
                    code=None,
                    unit='',
                    short_unit=None,
                    source_id=db_source_id,
                    dataset_id=db_dataset_id,
                    display=default_variable_display)

        # Delete all data_values in dataset

        print("Deleting all data_values...")

        db.execute(
            """
            DELETE FROM data_values
            WHERE variableId IN %s
        """, [tuple(db_variable_id_by_name.values())])

        # Insert new data_values

        print("Inserting new data_values...")

        df_data_values = df.melt(id_vars=id_names,
                                 value_vars=variable_names,
                                 var_name='variable',
                                 value_name='value').dropna(how='any')

        for df_chunk in chunk_df(df_data_values, 50000):
            data_values = [(row['value'], int(row['Year']),
                            db_entity_id_by_name[row['Country']],
                            db_variable_id_by_name[row['variable']])
                           for _, row in df_chunk.iterrows()]
            db.upsert_many(
                """
                INSERT INTO
                    data_values (value, year, entityId, variableId)
                VALUES
                    (%s, %s, %s, %s)
            """, data_values)

        # Update dataset dataUpdatedAt time & dataUpdatedBy

        db.execute(
            """
            UPDATE datasets
            SET
                dataEditedAt = NOW(),
                dataEditedByUserId = %s
            WHERE id = %s
        """, [USER_ID, db_dataset_id])

        # Update source name ("last updated at")

        db.execute(
            """
            UPDATE sources
            SET name = %s
            WHERE id = %s
        """, [source_name, db_source_id])

        # Update chart versions to trigger rebake

        db.execute(
            """
            UPDATE charts
            SET config = JSON_SET(config, "$.version", config->"$.version" + 1)
            WHERE id IN (
                SELECT DISTINCT chart_dimensions.chartId
                FROM chart_dimensions
                JOIN variables ON variables.id = chart_dimensions.variableId
                WHERE variables.datasetId = %s
            )
        """, [db_dataset_id])

        # Enqueue deploy

        if DEPLOY_QUEUE_PATH:
            with open(DEPLOY_QUEUE_PATH, 'a') as f:
                f.write(
                    json.dumps({
                        'message':
                        f"Automated dataset update: {dataset_name}"
                    }) + "\n")

    print("Database update successful.")
    send_success(channel='corona-data-updates'
                 if not os.getenv('IS_DEV') else 'bot-testing',
                 title=f'Updated Grapher dataset: {dataset_name}')
Example #10
# - `standardized_name` (the database entity name)
# - `db_entity_id` (the database entity id)
# 
# **There will likely be entities that were not matched with a database entity, those will be inserted below.**

# In[ ]:


entities = pd.read_csv('./entities_standardized.csv', index_col='id')


# In[ ]:


with connection as c:
    db = DBUtils(c)
    new_entities = entities[entities['db_entity_id'].isnull()]
    for _, entity in new_entities.iterrows():
        entity_id = entity.name
        entity_name = entity['name']
        db_entity_id = db.get_or_create_entity(entity_name)
        entities.loc[entity_id, 'db_entity_id'] = db_entity_id
        print(db_entity_id, entity['name'])


# In[ ]:


len(entities[entities['db_entity_id'].isnull()])

Example #11
def main():

    bp_entities = pd.read_csv("./standardization/entities.csv")
    std_entities = pd.read_csv("./standardization/entities-standardized.csv")
    new_entities = bp_entities[~bp_entities.name.isin(std_entities.name)]

    if len(new_entities) > 0:
        print(
            "The following entities do not exist yet in entities-standardized.csv"
        )
        print(new_entities)
        print(
            "Press CTRL+C to cancel and match+add them yourself to entities-standardized.csv"
        )
        _ = input(
            "or press ENTER to proceed and look them up (or create) in the database: "
        )

    std_entities = pd.concat([std_entities,
                              new_entities]).reset_index(drop=True)
    std_entities.loc[std_entities.standardized_name.isnull(),
                     "standardized_name"] = std_entities.name

    with connection.cursor() as cursor:

        db = DBUtils(cursor)

        for _, row in std_entities[
                std_entities.db_entity_id.isnull()].iterrows():
            std_entities.loc[std_entities.standardized_name ==
                             row["standardized_name"],
                             "db_entity_id"] = db.get_or_create_entity(
                                 row["standardized_name"])

        db_entity_id_by_bp_name = dict(
            zip(std_entities.name, std_entities.db_entity_id))

        # Inserting the dataset
        db_dataset_id = db.upsert_dataset(
            name="BP Statistical Review of Global Energy",
            namespace="bpstatreview_2020",
            user_id=USER_ID)

        #Inserting the source
        db_source_id = db.upsert_source(
            name="BP Statistical Review of Global Energy (2020)",
            description=json.dumps({
                "dataPublishedBy": "BP",
                "dataPublisherSource": "Statistical Review of World Energy",
                "link":
                "https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy.html",
                "retrievedDate": "August 1, 2020"
            }),
            dataset_id=db_dataset_id)

        # Inserting variables
        variables = pd.read_csv("output/variables.csv")
        variables["notes"] = variables.notes.fillna("")

        for _, variable in variables.iterrows():

            print("Inserting variable: %s" % variable["name"])
            db_variable_id = db.upsert_variable(name=variable["name"],
                                                code=None,
                                                unit=variable["unit"],
                                                short_unit=None,
                                                source_id=db_source_id,
                                                dataset_id=db_dataset_id,
                                                description=variable["notes"])

            data_values = pd.read_csv("./output/datapoints/datapoints_%d.csv" %
                                      variable.id)

            values = [(float(row["value"]), int(row["year"]),
                       db_entity_id_by_bp_name[row["country"]], db_variable_id)
                      for _, row in data_values.iterrows()]

            print("Inserting values...")
            db.upsert_many(
                """
                INSERT INTO data_values (value, year, entityId, variableId)
                VALUES (%s, %s, %s, %s)
            """, values)
            print("Inserted %d values for variable" % len(values))
Example #12
        print(df[pd.to_numeric(df['value'], errors='coerce').isnull()])
        errors += 1

if errors != 0:
    print_err("\nIntegrity checks failed. There were %s errors.\n" %
              str(errors))
    sys.exit(1)
else:
    print("\nIntegrity checks passed.\n")

# ## Insert database rows

# In[1]:

with connection as c:
    db = DBUtils(c)

    for _, dataset in tqdm(datasets.iterrows(), total=len(datasets)):

        # Insert the dataset
        print("Inserting dataset: %s" % dataset['name'])
        db_dataset_id = db.upsert_dataset(name=dataset['name'],
                                          description=dataset['description'],
                                          namespace=NAMESPACE,
                                          user_id=USER_ID)

        # Insert the source
        source = sources[sources['dataset_id'] == dataset.id].iloc[0]
        print("Inserting source: %s" % source['name'])
        db_source_id = db.upsert_source(name=source['name'],
                                        description=source['description'],
Example #13
'''
end title generation with time threshold fall-back and median time
'''
from db_utils import DBUtils
import utils
from configurator import configurator
from collections import OrderedDict
import re
import csv
import numpy as np

dbutils = DBUtils(db_name='zippia', host='master.mongodb.d.int.zippia.com')
# dbutils= DBUtils(db_name='zippia2', host='localhost')


def is_valid_work_info(resume, work_info_obj, min_conf=0):

    valid = False

    if (work_info_obj["title"] in resume and resume[work_info_obj["title"]]
            and work_info_obj["match"] in resume
            and resume[work_info_obj["match"]]
            and work_info_obj["confidence"] in resume
            and resume[work_info_obj["confidence"]] >= min_conf
            and work_info_obj["from"] in resume
            and resume[work_info_obj["from"]] and work_info_obj["to"] in resume
            and resume[work_info_obj["to"]]):
        ''' Valid Work Experience '''

        valid = True
Example #14
class norm_job(object):
    api_start_time = time.time()
    SKILLSET_SIZE = 30
    CONFIDENCE_THRESHOLD = 75
    dbutils = DBUtils(configurator.commons.MONGODB_HOST)
    universal_skill_set = dbutils.create_resume_posting_universal_skill_set(
        SKILLSET_SIZE)
    ngram_limit = 1
    npe = nounphrase_extractor()
    sf = similarity_finder()

    JOBS_PARAMETER = "jobs"
    JOB_TITLE_PARAMETER = "title"
    JOB_DESCRIPTION_PARAMETER = "description"
    PREVIOUS_JOB_TITLE_PARAMETER = "previous_title"
    PREVIOUS_JOB_DESCRIPTION_PARAMETER = "previous_description"
    SOC_HINT_PARAMETER = "soc_hint"
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

    for skill in universal_skill_set:
        length = len(skill.split())
        if length > ngram_limit:
            ngram_limit = length

    def __init__(self):
        LAY_TITLE_LIST_NAME = 'lay_title_list'
        LAY_TITLE_DICT_NAME = 'lay_title_dict'
        SIMILAR_TITLE_DICT_NAME = 'title_to_similar_title_dict'
        CITY_LIST = 'city_list'
        STATE_LIST = 'state_list'
        STATE_CODES = 'state_codes'
        SOC_MASTER_DICT_NAME = 'soc_master_dict'
        SOC_MAPPING_NAME = 'soc_mapping'
        SIMILAR_DICT_NAME = 'similar_title_dict'

        try:
            self.soc_master_dict = load_file(SOC_MASTER_DICT_NAME)
            self.soc_mapping = load_file(SOC_MAPPING_NAME)
            self.similar_title_dict = load_file(SIMILAR_DICT_NAME)
            self.lay_title_list = load_file(LAY_TITLE_LIST_NAME)
            self.title_to_similar_title_dict = load_file(
                SIMILAR_TITLE_DICT_NAME)
            self.lay_title_dict = load_file(LAY_TITLE_DICT_NAME)
            self.city_list = load_file(CITY_LIST)
            self.state_list = load_file(STATE_LIST)
            self.state_codes = load_file(STATE_CODES)
        except:
            [self.soc_master_dict, self.lay_title_list, self.soc_mapping
             ] = norm_job.dbutils.create_all_lay_title_mappings()
            [self.similar_title_dict, self.title_to_similar_title_dict
             ] = norm_job.dbutils.create_all_similar_title_mappings()

            [self.lay_title_dict, self.lay_title_list
             ] = create_lay_title_dict_and_lower_list(self.lay_title_list)
            res = create_lay_title_dict_and_lower_list(
                self.title_to_similar_title_dict, stem_key=True)
            self.lay_title_dict.update(res[0])
            self.title_to_similar_title_dict = res[1]
            del res

            results = create_location_maps(norm_job.dbutils)
            self.city_list = results["city_list"]
            self.state_list = results["state_list"]
            self.state_codes = results["state_codes"]

            try:
                folder = configurator.commons.JOB_API_INIT_FILES_PATH
                if folder:
                    if folder.strip()[-1] != '/':
                        folder = folder.strip() + "/"
                    if not os.path.exists(folder):
                        os.makedirs(folder)
                temp_folder_path = folder + str(os.getpid()) + "_" + str(
                    time.time())
                try:
                    logging.info(
                        save_to_file(LAY_TITLE_DICT_NAME, self.lay_title_dict,
                                     temp_folder_path))
                    logging.info(
                        save_to_file(LAY_TITLE_LIST_NAME, self.lay_title_list,
                                     temp_folder_path))
                    logging.info(
                        save_to_file(SIMILAR_TITLE_DICT_NAME,
                                     self.title_to_similar_title_dict,
                                     temp_folder_path))
                    logging.info(
                        save_to_file(CITY_LIST, self.city_list,
                                     temp_folder_path))
                    logging.info(
                        save_to_file(STATE_LIST, self.state_list,
                                     temp_folder_path))
                    logging.info(
                        save_to_file(STATE_CODES, self.state_codes,
                                     temp_folder_path))
                    logging.info(
                        save_to_file(SOC_MASTER_DICT_NAME,
                                     self.soc_master_dict, temp_folder_path))
                    logging.info(
                        save_to_file(SOC_MAPPING_NAME, self.soc_mapping,
                                     temp_folder_path))
                    logging.info(
                        save_to_file(SIMILAR_DICT_NAME,
                                     self.similar_title_dict,
                                     temp_folder_path))
                except Exception as e:
                    root.exception(e)
                # if folder exists (may be due to parallel processes) then remove current temporary folder
                ''' Rename directory '''
                if os.path.exists(temp_folder_path):
                    for filename in listdir(join(folder, temp_folder_path)):
                        move(join(folder, temp_folder_path, filename),
                             join(folder, filename))
                    rmdir(temp_folder_path)
            except Exception as e:
                root.exception(e)
                pass

        remove_cities = [
            'teller', 'home', 'cook', 'grill', 'helper', 'industrial', 'mobile'
        ]
        for city in remove_cities:
            if city in self.city_list:
                del self.city_list[city]

        self.soc_lay_title_token_list = {}
        for soc, lts in self.lay_title_list.items():
            self.soc_lay_title_token_list[soc] = {}
            for lt in lts:
                for token in set(lt.split()):
                    if token not in self.soc_lay_title_token_list[soc]:
                        self.soc_lay_title_token_list[soc][token] = set()
                    self.soc_lay_title_token_list[soc][token].add(lt)
                if lt in self.title_to_similar_title_dict:
                    for st in self.title_to_similar_title_dict[lt]:
                        for token in set(st.split()):
                            if token not in self.soc_lay_title_token_list[soc]:
                                self.soc_lay_title_token_list[soc][
                                    token] = set()
                            self.soc_lay_title_token_list[soc][token].add(st)
        '''Load Model'''
        try:
            self.model = SOCClassifierFactory.create_classifier(
                configurator.commons.JOB_POSTING_CLF_NAME)
        except Exception as e:
            root.exception(e)
            exit()

        f = open('dictionaries/selected_ngrams_for_driver.csv', 'rb')
        fr = csv.reader(f, delimiter='\t')
        self.driver_ngrams_set = set(
            [row[0] for row in fr if (row[0] and row[0].strip())])
        ltm_cursor = norm_job.dbutils.fetch_data(
            configurator.commons.LAY_TITLE_MASTER, 'cursor',
            {'soc_code': {
                '$regex': '^53-'
            }}, {'soc_code': 1})
        self.driver_soc_codes = [(ltm_elem['soc_code'], 100)
                                 for ltm_elem in ltm_cursor]

        root.info("API Start Time= {}s".format(time.time() -
                                               norm_job.api_start_time))

    def extract_soc_code(self, text, prefix, suffix):
        if text.startswith(prefix) and text.endswith(suffix):
            return text[len(prefix):-len(suffix)]
        return None

    @staticmethod
    def fetch_closest_lay_title(lay_title_list, soc_lay_title_token_list,
                                soc_code_tuple, job_title, job_description):
        closest_lay_title = configurator.commons.DEFAULT_CLOSEST_LAY_TITLE
        default_soc = soc_code_tuple[0]
        top_soc = soc_code_tuple[0]
        valid_lay_titles = set()
        tokens = job_title.split()
        for soc_code in soc_code_tuple:
            if soc_code in soc_lay_title_token_list:
                for token in tokens:
                    if token in soc_lay_title_token_list[soc_code]:
                        valid_lay_titles = valid_lay_titles.union(
                            soc_lay_title_token_list[soc_code][token])
        if len(valid_lay_titles):
            closest_lay_title = norm_job.sf.find_closest_lay_title(
                valid_lay_titles, job_title, job_description)
            for soc in soc_code_tuple:
                if soc in lay_title_list and closest_lay_title in set(
                        lay_title_list[soc]):
                    default_soc = soc
                    break
        return closest_lay_title, default_soc, top_soc

    @cherrypy.expose
    @cherrypy.tools.json_out()
    @cherrypy.tools.json_in()
    def normalize(self, **other_params):
        cherrypy.response.headers['Content-Type'] = "application/json"
        params = {}
        if cherrypy.request.method == "POST":
            params = cherrypy.request.json
        error_message = str()
        error_flag = False
        job_description = ""
        batch_size = 0
        total_time = time.time()

        if norm_job.JOBS_PARAMETER not in params:
            error_flag = True
            error_message = configurator.commons.MALFORMED_REQUEST_ERROR_MESSAGE
        else:
            jobs = params[norm_job.JOBS_PARAMETER]
            job_array = []
            skill_array = []
            responses = []
            bypass_array = []
            batch_size = len(jobs)
            for job in jobs:
                try:
                    filtered_title = job[norm_job.JOB_TITLE_PARAMETER]
                    if "instead of" in filtered_title.lower():
                        filtered_title = filtered_title[:filtered_title.lower(
                        ).find("instead of")].strip()
                    filtered_title = create_key(filtered_title, self.city_list,
                                                self.state_list,
                                                self.state_codes)
                    job[norm_job.JOB_TITLE_PARAMETER] = filtered_title
                except:
                    filtered_title = ""
                job_description = ""
                if norm_job.JOB_DESCRIPTION_PARAMETER in job:
                    job_description = job[norm_job.JOB_DESCRIPTION_PARAMETER]
                title_ngrams = find_all_ngrams_upto(filtered_title.lower(), 4)
                if title_ngrams.intersection(self.driver_ngrams_set):
                    bypass_array.append(1)
                else:
                    job_array.append((filtered_title, job_description))
                    bypass_array.append(0)
                imp_skills = set()

                if job_description:
                    sentences = norm_job.sent_detector.tokenize(
                        job_description)
                    for sentence in sentences:
                        lower_sentence = sentence.lower()
                        sentence_n_grams = find_all_ngrams_upto(
                            lower_sentence, norm_job.ngram_limit)
                        imp_skills.update(
                            sentence_n_grams.intersection(
                                norm_job.universal_skill_set))
                skill_array.append(imp_skills)

            start_time = time.time()
            prediction_array = self.model.predict(job_array)
            root.info(
                "Context Free classification for {0} points done in {1}s".
                format(len(prediction_array),
                       time.time() - start_time))
            del job_array
            #             root.info(prediction_array)

            start_time = time.time()
            for point_index, selector_value in enumerate(bypass_array):
                if selector_value:
                    soc_codes_with_conf = self.driver_soc_codes
                else:
                    soc_codes_with_conf = prediction_array.pop(0)
                soc_codes = [
                    soc[0] for soc in sorted(
                        soc_codes_with_conf, key=lambda k: k[1], reverse=True)
                ]
                try:
                    job_title = jobs[point_index][norm_job.JOB_TITLE_PARAMETER]
                    if "instead of" in job_title.lower():
                        job_title = job_title[:job_title.lower().
                                              find("instead of")].strip()
                except:
                    error_flag = True
                    error_message = configurator.commons.MALFORMED_REQUEST_ERROR_MESSAGE
                if not error_flag:
                    response_json = {}
                    response_json["index"] = point_index
                    response_json["clean_original_title"] = format_skills(
                        jobs[point_index][norm_job.JOB_TITLE_PARAMETER])
                    response_json["soc_code"] = ''
                    response_json["confidence"] = 0
                    response_json["closest_lay_title"] = ''
                    response_json["major_group_string"] = ''
                    response_json["skills"] = list(skill_array[point_index])

                    if not soc_codes:
                        ''' The given job posting could not be normalized using our standard algorithm.
                        We should use the soc_hint parameter present here to see if we can find a nearby
                        title in the given hint SOC code.'''
                        if norm_job.SOC_HINT_PARAMETER in jobs[point_index]:
                            soc_hint = jobs[point_index][
                                norm_job.SOC_HINT_PARAMETER]
                            if soc_hint in self.soc_mapping:
                                ''' This is a valid SOC Code '''
                                associated_soc_codes = self.soc_mapping[
                                    soc_hint]
                                soc_codes = list(associated_soc_codes)
                                root.info(
                                    "Hinted {} hence, Comparing Against Codes {}"
                                    .format(soc_hint, soc_codes))
                            else:
                                ''' This is an invalid SOC Code and we can't do much about it. '''
                                root.info(
                                    "No matching SOC Code found in soc_hint {}. Cannot normalize."
                                    .format(soc_hint))
                    if soc_codes:
                        key_string = filter_chain.apply(
                            convert_encoding(job_title), is_title=True)[1]
                        closest_lay_title_tuple = norm_job.fetch_closest_lay_title(
                            self.lay_title_list, self.soc_lay_title_token_list,
                            soc_codes, key_string, "")
                        major_group_string = configurator.commons.DEFAULT_MAJOR_GROUP_STRING
                        if closest_lay_title_tuple[1] in self.soc_master_dict:
                            major_group_string = self.soc_master_dict[
                                closest_lay_title_tuple[1]][
                                    'major_group_string']
                        lay_title = convert_encoding(
                            closest_lay_title_tuple[0])
                        if lay_title in self.lay_title_dict:
                            lay_title = self.lay_title_dict[lay_title]
                            if lay_title in self.similar_title_dict:
                                lay_title = self.similar_title_dict[lay_title]
                        response_json["soc_code"] = closest_lay_title_tuple[1]
                        response_json["confidence"] = int(
                            dict(soc_codes_with_conf)[
                                closest_lay_title_tuple[1]])
                        response_json['top_soc'] = closest_lay_title_tuple[2]
                        response_json["closest_lay_title"] = lay_title
                        response_json[
                            "major_group_string"] = major_group_string
                else:
                    response_json = {
                        "error_code":
                        configurator.commons.MALFORMED_REQUEST_ERROR_STATUS,
                        "message": error_message
                    }
                responses.append(response_json)
                error_flag = False
                if (point_index + 1) % 1000 == 0:
                    root.info("{0} points done in {1}s".format(
                        point_index,
                        time.time() - start_time))
                    start_time = time.time()
            responses_object = {"normalized_jobs": responses}
        if error_flag:
            cherrypy.response.status = configurator.commons.MALFORMED_REQUEST_ERROR_STATUS
            responses_object = {
                "error_code":
                configurator.commons.MALFORMED_REQUEST_ERROR_STATUS,
                "message": error_message
            }

        root.info("{0} points done in {1}s".format(batch_size,
                                                   time.time() - total_time))

        return responses_object
Example #15
def import_csv_files(measure_names, age_names, metric_names, sex_names,
                     parent_tag_name, namespace, csv_dir,
                     default_source_description, get_key, get_var_name,
                     get_var_code):

    logging.basicConfig(
        filename=os.path.join(CURRENT_PATH, '..', 'logs', '%s-%s.log' %
                              (os.environ['DB_NAME'], namespace)),
        level=logging.INFO,
        format='%(asctime)s [%(levelname)s] %(name)s: %(message)s')

    logger = logging.getLogger('importer')

    try:

        with connection as c:

            db = DBUtils(c)

            total_data_values_upserted = 0

            def get_state():
                return {
                    **db.get_counts(),
                    'data_values_upserted':
                    total_data_values_upserted,
                }

            def log_state():
                message = " · ".join(
                    str(key) + ": " + str(value)
                    for key, value in get_state().items())
                print(message)
                logger.info(message)

            # The user ID that gets assigned in every user ID field
            (user_id, ) = db.fetch_one("""
                SELECT id FROM users WHERE email = '*****@*****.**'
            """)

            # Create the parent tag
            parent_tag_id = db.upsert_parent_tag(parent_tag_name)

            tag_id_by_name = {
                name: i
                for name, i in db.fetch_many(
                    """
                    SELECT name, id
                    FROM tags
                    WHERE parentId = %s
                """, parent_tag_id)
            }

            # Intentionally kept empty in order to force sources to be updated (in order
            # to update `retrievedDate`)
            dataset_id_by_name = {}

            var_id_by_code = {
                code: i
                for code, i in db.fetch_many(
                    """
                    SELECT variables.code, variables.id
                    FROM variables
                    LEFT JOIN datasets ON datasets.id = variables.datasetId
                    WHERE datasets.namespace = %s
                """, [namespace])
            }

            # Keep track of variables that we have changed the `updatedAt` column for
            touched_var_codes = set()

            # Keep track of which variables have had their data_values removed
            cleared_var_codes = set()

            # Intentionally kept empty in order to force sources to be updated (in order
            # to update `retrievedDate`)
            source_id_by_name = {}

            data_values_to_insert = []
            insert_data_value_sql = """
                INSERT INTO data_values
                    (value, year, entityId, variableId)
                VALUES
                    (%s, %s, %s, %s)
                ON DUPLICATE KEY UPDATE
                    value = VALUES(value)
            """
            # We need an ON DUPLICATE handler here because the GBD dataset has
            # South Asia under two codes: 158 and 159. Both have the same values
            # in our extract, so we can safely ignore overwrites.

            for filename in glob.glob(os.path.join(csv_dir, '*.csv')):
                with open(filename, 'r', encoding='utf8') as f:
                    print('Processing: %s' % filename)
                    logger.info('Processing: %s' % filename)
                    reader = csv.DictReader(f)
                    row_number = 0
                    for row in reader:
                        row_number += 1

                        if row_number % 100 == 0:
                            # Sleep 1 ms after every 100th row so the import does not keep the CPU busy the whole time.
                            time.sleep(0.001)

                        key = get_key(row)

                        # Skip rows we don't want to import
                        if row['sex_name'] not in sex_names \
                            or row['age_name'] not in age_names \
                            or row['metric_name'] not in metric_names \
                            or row['measure_name'] not in measure_names:
                            continue

                        if key not in tag_id_by_name:
                            tag_id_by_name[key] = db.upsert_tag(
                                key, parent_tag_id)

                        if key not in dataset_id_by_name:
                            dataset_id_by_name[key] = db.upsert_dataset(
                                name=key,
                                namespace=namespace,
                                tag_id=tag_id_by_name[key],
                                user_id=user_id)

                        if key not in source_id_by_name:
                            source_id_by_name[key] = db.upsert_source(
                                name=key,
                                description=json.dumps(
                                    default_source_description),
                                dataset_id=dataset_id_by_name[key])

                        var_name = get_var_name(row)

                        var_code = get_var_code(row)

                        if var_code not in var_id_by_code:
                            var_id_by_code[var_code] = db.upsert_variable(
                                name=var_name,
                                code=var_code,
                                unit=row['metric_name'],
                                short_unit=extract_short_unit(
                                    row['metric_name']),
                                dataset_id=dataset_id_by_name[key],
                                source_id=source_id_by_name[key])
                            touched_var_codes.add(var_code)
                        elif var_code not in touched_var_codes:
                            db.touch_variable(var_id_by_code[var_code])
                            touched_var_codes.add(var_code)

                        var_id = var_id_by_code[var_code]
                        entity_name = get_standard_name(row['location_name'])
                        entity_id = db.get_or_create_entity(entity_name)
                        value = get_metric_value(row)
                        year = int(row['year'])

                        if var_code not in cleared_var_codes:
                            db.execute_until_empty(
                                """
                                DELETE FROM data_values
                                WHERE data_values.variableId = %s
                                LIMIT 100000
                            """, [var_id])
                            cleared_var_codes.add(var_code)

                        data_values_to_insert.append(
                            (value, year, entity_id, var_id))

                        if len(data_values_to_insert) >= 50000:
                            db.upsert_many(insert_data_value_sql,
                                           data_values_to_insert)
                            total_data_values_upserted += len(
                                data_values_to_insert)
                            data_values_to_insert = []
                            log_state()

                # Insert any leftover data_values
                if len(data_values_to_insert):
                    db.upsert_many(insert_data_value_sql,
                                   data_values_to_insert)
                    total_data_values_upserted += len(data_values_to_insert)
                    data_values_to_insert = []
                    log_state()

            db.note_import(import_type=namespace,
                           import_notes='A gbd import was performed',
                           import_state=json.dumps(get_state()))

    except:
        logger.exception("error")
        raise
Example #16
    return print(*args, file=sys.stderr, **kwargs)


def chunk_df(df, n):
    """Yield successive n-sized chunks from data frame."""
    for i in range(0, df.shape[0], n):
        yield df[i:i + n]
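
# Usage sketch (illustration only; assumes pandas is imported as pd, as in the other
# examples): a 120-row frame split into chunks of 50 yields slices of 50, 50 and 20 rows,
# which is how the importers above batch their data_values before calling db.upsert_many.
_demo_df = pd.DataFrame({'value': range(120)})
assert [len(chunk) for chunk in chunk_df(_demo_df, 50)] == [50, 50, 20]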


tz_utc = tz_db = timezone.utc
tz_local = datetime.now(tz_utc).astimezone().tzinfo
tz_london = pytz.timezone('Europe/London')

if __name__ == "__main__":
    with connection as c:
        db = DBUtils(c)

        # Check whether the database is up to date, by checking the
        # - last modified date of the Grapher file
        # - last modified date of the database row
        #
        # This is not bulletproof, but it allows for flexibility – authors could manually update
        # the repo, and that would trigger a database update too.

        (db_dataset_id, db_dataset_modified_time) = db.fetch_one(
            """
            SELECT id, dataEditedAt
            FROM datasets
            WHERE name = %s
            AND namespace = %s
        """, [ecdc.DATASET_NAME, NAMESPACE])
Example #17
from glob import glob

from db import connection
from db_utils import DBUtils

# ## Database importer

# In[2]:

entities = pd.read_csv('./standardization/entities-standardized.csv')

# In[3]:

with connection as c:

    db = DBUtils(c)

    all_entities = entities.copy()
    new_entities = all_entities[all_entities['db_entity_id'].isnull()]

    for _, entity in new_entities.iterrows():
        entity_id = entity.name
        entity_name = entity['name']
        db_entity_id = db.get_or_create_entity(entity_name)
        all_entities.loc[entity_id, 'db_entity_id'] = db_entity_id

    db_entity_id_by_name = {
        row['name']: int(row['db_entity_id'])
        for _, row in all_entities.iterrows()
    }
Example #18
            label_wise_removal[label]['removed'] += 1
        count += 1
        if count % 10000 == 0:
            print(count, len(removable_ids))
    print(count, len(removable_ids))
    for _id in removable_ids:
        db.remove_entry(collection_name, query_dict={"_id": _id})
    print "DONE CLEANING ON COLLECTION: {}, REMOVED {} ENTRIES".format(
        collection_name, len(removable_ids))
    for label, removed_object in label_wise_removal.items():
        print "Label: {}, Total: {}, Removed: {}".format(
            label, removed_object['total'], removed_object['removed'])


if __name__ == "__main__":
    db = DBUtils(db_name='test_db', host="localhost")
    collection_name = 'paragraphs'  #"responsibilities_paragraphs_3"
    training_collection_name = "training_paragraphs"
    test_collection_name = "test_paragraphs"
    non_labeled_collection_name = "non_labeled_paragraphs"
    training_part_fraction = 0.7

#     training_test_spliter(collection_name, training_collection_name, test_collection_name, non_labeled_collection_name, training_part_fraction)
#     clean_data_set(training_collection_name)
#     clean_data_set(test_collection_name)

#     token_label_map = read_label_tokens()
#     train_model(training_collection_name)
#     test_model(test_collection_name)

#     test_data = [
Example #19
 def __init__(self):
     self.sql_builder = SqlBuilder()
     self.db_utils = DBUtils()
Example #20
from db_utils import DBUtils
from utils import find_all_ngrams_upto
import re
import csv

db = DBUtils(db_name='JobListings', host='master.mongodb.d.int.zippia.com')

collection_name = db.fetch_data('MetaCollection')[0]['current']
major_group = 53


def print_ngrams_for_mg():
    cursor = db.fetch_data(
        collection_name, 'cursor',
        {'socCode': {
            '$regex': '^' + str(major_group) + '-'
        }}, {'titleDisplay': 1})
    n_gram_freq = {}
    count = 0
    for elem in cursor:
        title = elem['titleDisplay']
        title = re.sub(r"\W+", " ", title)
        n_grams = find_all_ngrams_upto(title.lower(), n=4)
        for ng in n_grams:
            if ng not in n_gram_freq:
                n_gram_freq[ng] = 0
            n_gram_freq[ng] += 1
        count += 1
        if count % 10000 == 0:
            print "{} entries processed".format(count)
    print "{} entries processed".format(count)
import networkx as nx
from db_utils import DBUtils
import csv
import time
import re
import numpy as np
from configurator import configurator
import utils
from filter_chain import remove_stop_words
from sklearn.externals import joblib
from functools import partial
from multiprocessing import Pool
import math

dbutils = DBUtils(db_name='zippia', host='master.mongodb.d.int.zippia.com')
db_local = DBUtils(db_name='test', host='localhost')
db_zippia2 = DBUtils(db_name='zippia2', host='localhost')


def format_date(date_string):
    date_string = re.sub(r"\W+", " ", date_string.lower())
    date_string = re.sub(r"\d+[ ]*((year)|(yr))([s]{0,1})", " ", date_string)
    date_string = re.sub(r"\d+[ ]*((month)|(mt))([s]{0,1})", " ", date_string)
    date_string = re.sub(" +", " ", date_string).strip()
    return date_string


class valid_edge_checker:
    level_dict = {
        "Lead": set([
def print_paths(depriotize_starts, major_title_dict1, major_title_dict2,
                degrees, work_meta_info, min_conf, required_skills_threshold,
                top_k, edge_count_threshold, index_val):
    collection_name = "careerPathsForMajors_test_time"
    db_local = DBUtils(db_name='test', host='localhost')
    title_skills = read_skill_master()
    title_time_dict = read_local_skill_master()
    if index_val == 1:
        major_title_dict = major_title_dict1
    else:
        major_title_dict = major_title_dict2
    for major, titles_dict in major_title_dict.items():
        start_time = time.time()
        final_valid_paths = {}
        final_valid_paths['name'] = major
        final_valid_paths['version'] = 4
        final_valid_paths['graduate_paths'] = []
        final_valid_paths['under_graduate_paths'] = []
        final_valid_paths['other_paths'] = []
        iterate_over_degrees = ["Graduate", "Under Graduate", "Other"]
        major_edge_count_threshold = 0.1  #edge_count_threshold
        [sts, ets] = combine_start_and_end_titles(titles_dict)
        edges = create_graph_for_majors(major, degrees, work_meta_info,
                                        min_conf, depriotize_starts)
        joblib.dump(
            edges, '/mnt/data/rohit/major_edge_count/' +
            "_".join(re.sub(r"\W+", " ", major).split()) + '_edge_counts.pkl')
        while major_edge_count_threshold > 0 and iterate_over_degrees:
            career_graph = create_graph(edges, sts, ets,
                                        major_edge_count_threshold)
            [result_paths, edges] = print_paths_iterator(
                major, titles_dict, title_skills, required_skills_threshold,
                career_graph, edges, title_time_dict, iterate_over_degrees)
            if len(result_paths['graduate_paths']
                   ) >= 15 and not final_valid_paths['graduate_paths']:
                final_valid_paths['graduate_paths'].extend(result_paths[
                    'graduate_paths'])
                iterate_over_degrees.remove("Graduate")
            elif result_paths['graduate_paths'] and len(final_valid_paths[
                    'graduate_paths']) < 15:
                append_new_paths(
                    result_paths,
                    edges,
                    final_valid_paths,
                    path_name='graduate_paths')
                if len(final_valid_paths['graduate_paths']) >= 15:
                    iterate_over_degrees.remove("Graduate")

            if len(result_paths['under_graduate_paths']
                   ) >= 15 and not final_valid_paths['under_graduate_paths']:
                final_valid_paths['under_graduate_paths'].extend(result_paths[
                    'under_graduate_paths'])
                iterate_over_degrees.remove("Under Graduate")
            elif result_paths['under_graduate_paths'] and len(
                    final_valid_paths['under_graduate_paths']) < 15:
                append_new_paths(
                    result_paths,
                    edges,
                    final_valid_paths,
                    path_name='under_graduate_paths')
                if len(final_valid_paths['under_graduate_paths']) >= 15:
                    iterate_over_degrees.remove("Under Graduate")

            if len(result_paths['other_paths']
                   ) >= 15 and not final_valid_paths['other_paths']:
                final_valid_paths['other_paths'].extend(result_paths[
                    'other_paths'])
                iterate_over_degrees.remove("Other")
            elif result_paths['other_paths'] and len(final_valid_paths[
                    'other_paths']) < 15:
                append_new_paths(
                    result_paths,
                    edges,
                    final_valid_paths,
                    path_name='other_paths')
                if len(final_valid_paths['other_paths']) >= 15:
                    iterate_over_degrees.remove("Other")
            major_edge_count_threshold -= 1

        db_local.insert_records(collection_name, final_valid_paths)

        print "Done Major: {} in {}s".format(major, time.time() - start_time)