def login():
    login_form = LoginForm(request.form)
    if request.method == "POST":
        db = DBUtils()
        username = login_form.username.data
        password = login_form.password.data
        print(username, password)
        ret = db.check_users(username=username, password=password)
        if ret == 1:
            # credentials check out: download the zip file
            print("downloading")
            return send_from_directory(directory=app.config['CONTAINER'],
                                       filename=app.config['file'])
        elif ret == 0:
            error = 'Invalid credentials'
        elif ret == -1:
            error = "cannot connect to the DB"
        else:
            error = ""
        return render_template('login.html', error=error, login_form=login_form)
    return render_template('login.html', login_form=login_form, error=None)
def parallel_function(server):
    db = DBUtils(db_name='JobListings',
                 host='master.mongodb.d.int.zippia.com',
                 connect=False)
    collection_name = db.fetch_data('MetaCollection')[0]['current']
    hit_time = time.time()
    jobs = read_test_data(db, collection_name=collection_name)
    update_jobs(db, jobs, collection_name, server)
    print "{} hit time {}s".format('job', time.time() - hit_time)
    del jobs
    del db
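# Illustrative only: one way parallel_function might be fanned out across servers
# with multiprocessing (the server list and pool size below are assumptions, not
# part of the original script; DBUtils is created with connect=False above, which
# is the usual pattern when forking worker processes).
from multiprocessing import Pool

if __name__ == "__main__":
    servers = ["server1", "server2", "server3"]  # hypothetical server names
    pool = Pool(processes=len(servers))
    pool.map(parallel_function, servers)
    pool.close()
    pool.join()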
def read_local_skill_master():
    db_zippia2 = DBUtils(db_name='zippia2', host='localhost')
    skill_master_dict = {}
    cursor = db_zippia2.fetch_data(configurator.commons.SKILL_MASTER, 'cursor', {},
                                   {'lay_title': 1, 'median_time_to_reach': 1})
    for elem in cursor:
        if "median_time_to_reach" in elem:
            skill_master_dict[elem['lay_title']] = elem['median_time_to_reach']
    return skill_master_dict
def read_skill_master():
    dbutils = DBUtils(db_name='zippia', host='master.mongodb.d.int.zippia.com')
    skill_master_dict = {}
    cursor = dbutils.fetch_data(
        configurator.commons.SKILL_MASTER, 'cursor', {},
        {'lay_title': 1, 'most_popular_soc_codes': 1, 'skill_set': 1})
    for elem in cursor:
        skill_master_dict[elem['lay_title']] = {
            'soc_code': elem['most_popular_soc_codes'][0],
            'skill_set': [skill[0] for skill in elem['skill_set'][:30]],
        }
    return skill_master_dict
class ChatBot:
    allowed_resources = ['entrance door', 'reception area', 'pool', 'gym', 'garage']
    allowed_times = ['morning', 'evening', 'afternoon', 'night']

    def __init__(self):
        self.sql_builder = SqlBuilder()
        self.db_utils = DBUtils()

    def process_question(self, question):
        query_parameters = {}
        blob = TextBlob(question)
        noun_phrases = blob.noun_phrases
        if len(noun_phrases) > 0:
            self.process_noun_phrases(noun_phrases, query_parameters)
        for word, tag in blob.pos_tags:
            if tag == 'NN' or tag == 'NNS':
                self.process_noun(word.lower(), query_parameters)
            self.column_name(query_parameters, word)
        while 'column' not in query_parameters:
            text = input("Looking for a count or people? ['How many'/'Who']")
            self.column_name(query_parameters, text.lower())
        while 'resource_name' not in query_parameters:
            text = input("Where do you want to look? {0}\n".format(
                self.allowed_resources))
            if text.lower() in self.allowed_resources:
                query_parameters['resource_name'] = text.lower()
        query = self.sql_builder.build_query(
            column=query_parameters['column'],
            resource_name=query_parameters['resource_name'],
            time_identifier=query_parameters.get('time'))
        result = self.db_utils.query(query)
        if 'count' in query_parameters['column']:
            print("Total of ", result[0][0])
        else:
            print("It's ", ', '.join(str(r[0]) for r in result))

    def column_name(self, query_parameters, word):
        if 'how' in word.lower():
            query_parameters['column'] = 'count(*)'
        if 'who' in word.lower():
            query_parameters['column'] = 'distinct(credential_holder_name)'

    def process_noun_phrases(self, noun_phrases, query_parameters):
        count = 0
        while (count < len(noun_phrases) and noun_phrases[count]
               and noun_phrases[count].lower() != 'entrance door'):
            count = count + 1
        # guard against running past the end when no matching phrase is found
        if count < len(noun_phrases):
            query_parameters['resource_name'] = noun_phrases[count]

    def process_noun(self, word, query_parameters):
        if word.lower() in self.allowed_times:
            query_parameters['time'] = word
        elif word.lower() in self.allowed_resources:
            query_parameters['resource_name'] = word
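# Illustrative only: a minimal driver loop for ChatBot.process_question, assuming
# SqlBuilder and DBUtils are configured elsewhere in this project. The prompt text
# and the 'quit' command are hypothetical, not part of the original class.
if __name__ == "__main__":
    bot = ChatBot()
    while True:
        question = input("Ask about access logs (or 'quit' to exit): ")
        if question.strip().lower() == 'quit':
            break
        bot.process_question(question)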
def process():
    db = DBUtils()
    db.create()
    data = read_from_queue()
    if len(data) > 0:
        for msg in data:
            for key, val in msg.items():
                db_record = dict(zip(["desc", "src"], [key, val]))
                db.save_data(db_record)
        print("Data received, msg_cnt: %d" % len(data))
    db.close()
def create_graph_for_majors(major, degrees, work_meta_info, min_conf,
                            depriotize_starts):
    dbutils = DBUtils(db_name='zippia', host='master.mongodb.d.int.zippia.com')
    edges = {}
    nodes = set()
    resume_count = 0
    resumes = dbutils.fetch_data(
        'resume', "cursor",
        {'latest_ed_major': major,
         'latest_ed_degree': {'$in': list(degrees)}})
    for resume in resumes:
        ''' Look at the last education info '''
        resume_count += 1
        create_edges_from_resume(resume, work_meta_info, min_conf, edges, nodes,
                                 resume["latest_ed_major"])
        if resume_count % 10000 == 0:
            print "Processed {} resumes".format(resume_count)
    print "Statistics for major:", major
    edges = manage_edges(edges, resume_count, depriotize_starts)
    print "The graph has: {} edges and {} nodes".format(len(edges), len(nodes))
    return edges
def db_info():
    db = DBUtils()
    db.create()
    db.load_data(settings.DB_RECORDS_COUNT)
def import_dataset(dataset_name, namespace, csv_path, default_variable_display,
                   source_name):
    with connection as c:
        db = DBUtils(c)

        # Check whether the database is up to date, by checking the
        # - last modified date of the Grapher file
        # - last modified date of the database row
        #
        # This is not bulletproof, but it allows for flexibility – authors could
        # manually update the repo, and that would trigger a database update too.
        (db_dataset_id, db_dataset_modified_time) = db.fetch_one(
            """
            SELECT id, dataEditedAt
            FROM datasets
            WHERE name = %s
            AND namespace = %s
        """, [dataset_name, namespace])

        db_dataset_modified_time = db_dataset_modified_time.replace(tzinfo=tz_db)
        file_modified_time = datetime.fromtimestamp(
            os.stat(csv_path).st_mtime).replace(tzinfo=tz_local)

        if file_modified_time <= db_dataset_modified_time:
            print(f"Dataset is up to date: {dataset_name}")
            sys.exit(0)

        print("Updating database...")

        # Load dataset data frame
        df = pd.read_csv(csv_path)

        # Check whether all entities exist in the database.
        # If some are missing, report & quit.
        entity_names = list(df['Country'].unique())

        db_entities_query = db.fetch_many(
            """
            SELECT id, name
            FROM entities
            WHERE name IN %s
        """, [entity_names])
        db_entity_id_by_name = {name: id for id, name in db_entities_query}

        # Terminate if some entities are missing from the database
        missing_entity_names = set(entity_names) - set(db_entity_id_by_name.keys())
        if len(missing_entity_names) > 0:
            print_err(
                f"Entity names missing from database: {str(missing_entity_names)}")
            sys.exit(1)

        # Fetch the source
        (db_source_id, ) = db.fetch_one(
            """
            SELECT id
            FROM sources
            WHERE datasetId = %s
        """, db_dataset_id)

        # Check whether all variables match database variables.
        id_names = ["Country", "Year"]
        variable_names = list(set(df.columns) - set(id_names))
        db_variables_query = db.fetch_many(
            """
            SELECT id, name
            FROM variables
            WHERE datasetId = %s
        """, [db_dataset_id])
        db_variable_id_by_name = {name: id for id, name in db_variables_query}

        # Remove any variables no longer in the dataset. This is safe because any
        # variables used in charts won't be deleted, thanks to database constraint
        # checks.
        variable_names_to_remove = list(
            set(db_variable_id_by_name.keys()) - set(variable_names))
        if len(variable_names_to_remove):
            print(f"Removing variables: {str(variable_names_to_remove)}")
            variable_ids_to_remove = [
                db_variable_id_by_name[n] for n in variable_names_to_remove
            ]
            db.execute(
                """
                DELETE FROM data_values
                WHERE variableId IN %(ids)s;
                DELETE FROM variables
                WHERE id IN %(ids)s;
            """, {'ids': variable_ids_to_remove})

        # Add variables that didn't exist before. Make sure to set yearIsDay.
        variable_names_to_insert = list(
            set(variable_names) - set(db_variable_id_by_name.keys()))
        if len(variable_names_to_insert):
            print(f"Inserting variables: {str(variable_names_to_insert)}")
            for name in variable_names_to_insert:
                db_variable_id_by_name[name] = db.upsert_variable(
                    name=name,
                    code=None,
                    unit='',
                    short_unit=None,
                    source_id=db_source_id,
                    dataset_id=db_dataset_id,
                    display=default_variable_display)

        # Delete all data_values in dataset
        print("Deleting all data_values...")
        db.execute(
            """
            DELETE FROM data_values
            WHERE variableId IN %s
        """, [tuple(db_variable_id_by_name.values())])

        # Insert new data_values
        print("Inserting new data_values...")
        df_data_values = df.melt(id_vars=id_names,
                                 value_vars=variable_names,
                                 var_name='variable',
                                 value_name='value').dropna(how='any')

        for df_chunk in chunk_df(df_data_values, 50000):
            data_values = [(row['value'], int(row['Year']),
                            db_entity_id_by_name[row['Country']],
                            db_variable_id_by_name[row['variable']])
                           for _, row in df_chunk.iterrows()]
            db.upsert_many(
                """
                INSERT INTO data_values (value, year, entityId, variableId)
                VALUES (%s, %s, %s, %s)
            """, data_values)

        # Update dataset dataEditedAt time & dataEditedByUserId
        db.execute(
            """
            UPDATE datasets
            SET dataEditedAt = NOW(), dataEditedByUserId = %s
            WHERE id = %s
        """, [USER_ID, db_dataset_id])

        # Update source name ("last updated at")
        db.execute(
            """
            UPDATE sources
            SET name = %s
            WHERE id = %s
        """, [source_name, db_source_id])

        # Update chart versions to trigger rebake
        db.execute(
            """
            UPDATE charts
            SET config = JSON_SET(config, "$.version", config->"$.version" + 1)
            WHERE id IN (
                SELECT DISTINCT chart_dimensions.chartId
                FROM chart_dimensions
                JOIN variables ON variables.id = chart_dimensions.variableId
                WHERE variables.datasetId = %s
            )
        """, [db_dataset_id])

        # Enqueue deploy
        if DEPLOY_QUEUE_PATH:
            with open(DEPLOY_QUEUE_PATH, 'a') as f:
                f.write(
                    json.dumps({
                        'message': f"Automated dataset update: {dataset_name}"
                    }) + "\n")

        print("Database update successful.")

        send_success(
            channel='corona-data-updates'
            if not os.getenv('IS_DEV') else 'bot-testing',
            title=f'Updated Grapher dataset: {dataset_name}')
# - `standardized_name` (the database entity name)
# - `db_entity_id` (the database entity id)
#
# **There will likely be entities that were not matched with a database entity,
# those will be inserted below.**

# In[ ]:

entities = pd.read_csv('./entities_standardized.csv', index_col='id')

# In[ ]:

with connection as c:
    db = DBUtils(c)

    new_entities = entities[entities['db_entity_id'].isnull()]
    for _, entity in new_entities.iterrows():
        entity_id = entity.name
        entity_name = entity['name']
        db_entity_id = db.get_or_create_entity(entity_name)
        entities.loc[entity_id, 'db_entity_id'] = db_entity_id
        print(db_entity_id, entity['name'])

# In[ ]:

len(entities[entities['db_entity_id'].isnull()])
def main():
    bp_entities = pd.read_csv("./standardization/entities.csv")
    std_entities = pd.read_csv("./standardization/entities-standardized.csv")

    new_entities = bp_entities[~bp_entities.name.isin(std_entities.name)]
    if len(new_entities) > 0:
        print("The following entities do not exist yet in entities-standardized.csv")
        print(new_entities)
        print("Press CTRL+C to cancel and match+add them yourself to "
              "entities-standardized.csv")
        _ = input("or press ENTER to proceed and look them up (or create) "
                  "in the database: ")
        std_entities = pd.concat([std_entities,
                                  new_entities]).reset_index(drop=True)

    std_entities.loc[std_entities.standardized_name.isnull(),
                     "standardized_name"] = std_entities.name

    with connection.cursor() as cursor:
        db = DBUtils(cursor)

        for _, row in std_entities[std_entities.db_entity_id.isnull()].iterrows():
            std_entities.loc[std_entities.standardized_name ==
                             row["standardized_name"],
                             "db_entity_id"] = db.get_or_create_entity(
                                 row["standardized_name"])

        db_entity_id_by_bp_name = dict(
            zip(std_entities.name, std_entities.db_entity_id))

        # Inserting the dataset
        db_dataset_id = db.upsert_dataset(
            name="BP Statistical Review of Global Energy",
            namespace="bpstatreview_2020",
            user_id=USER_ID)

        # Inserting the source
        db_source_id = db.upsert_source(
            name="BP Statistical Review of Global Energy (2020)",
            description=json.dumps({
                "dataPublishedBy": "BP",
                "dataPublisherSource": "Statistical Review of World Energy",
                "link": "https://www.bp.com/en/global/corporate/energy-economics/statistical-review-of-world-energy.html",
                "retrievedDate": "August 1, 2020"
            }),
            dataset_id=db_dataset_id)

        # Inserting variables
        variables = pd.read_csv("output/variables.csv")
        variables["notes"] = variables.notes.fillna("")

        for _, variable in variables.iterrows():
            print("Inserting variable: %s" % variable["name"])
            db_variable_id = db.upsert_variable(name=variable["name"],
                                                code=None,
                                                unit=variable["unit"],
                                                short_unit=None,
                                                source_id=db_source_id,
                                                dataset_id=db_dataset_id,
                                                description=variable["notes"])

            data_values = pd.read_csv("./output/datapoints/datapoints_%d.csv" %
                                      variable.id)

            values = [(float(row["value"]), int(row["year"]),
                       db_entity_id_by_bp_name[row["country"]], db_variable_id)
                      for _, row in data_values.iterrows()]

            print("Inserting values...")
            db.upsert_many(
                """
                INSERT INTO data_values (value, year, entityId, variableId)
                VALUES (%s, %s, %s, %s)
            """, values)
            print("Inserted %d values for variable" % len(values))
        print(df[pd.to_numeric(df['value'], errors='coerce').isnull()])
        errors += 1

if errors != 0:
    print_err("\nIntegrity checks failed. There were %s errors.\n" % str(errors))
    sys.exit(1)
else:
    print("\nIntegrity checks passed.\n")

# ## Insert database rows

# In[1]:

with connection as c:
    db = DBUtils(c)

    for _, dataset in tqdm(datasets.iterrows(), total=len(datasets)):
        # Insert the dataset
        print("Inserting dataset: %s" % dataset['name'])
        db_dataset_id = db.upsert_dataset(name=dataset['name'],
                                          description=dataset['description'],
                                          namespace=NAMESPACE,
                                          user_id=USER_ID)

        # Insert the source
        source = sources[sources['dataset_id'] == dataset.id].iloc[0]
        print("Inserting source: %s" % source['name'])
        db_source_id = db.upsert_source(name=source['name'],
                                        description=source['description'],
'''
end title generation with time threshold fall-back and median time
'''
from db_utils import DBUtils
import utils
from configurator import configurator
from collections import OrderedDict
import re
import csv
import numpy as np

dbutils = DBUtils(db_name='zippia', host='master.mongodb.d.int.zippia.com')
# dbutils = DBUtils(db_name='zippia2', host='localhost')


def is_valid_work_info(resume, work_info_obj, min_conf=0):
    valid = False
    if (work_info_obj["title"] in resume and resume[work_info_obj["title"]] and
            work_info_obj["match"] in resume and resume[work_info_obj["match"]] and
            work_info_obj["confidence"] in resume and
            resume[work_info_obj["confidence"]] >= min_conf and
            work_info_obj["from"] in resume and resume[work_info_obj["from"]] and
            work_info_obj["to"] in resume and resume[work_info_obj["to"]]):
        ''' Valid Work Experience '''
        valid = True
class norm_job(object):
    api_start_time = time.time()
    SKILLSET_SIZE = 30
    CONFIDENCE_THRESHOLD = 75
    dbutils = DBUtils(configurator.commons.MONGODB_HOST)
    universal_skill_set = dbutils.create_resume_posting_universal_skill_set(
        SKILLSET_SIZE)
    ngram_limit = 1
    npe = nounphrase_extractor()
    sf = similarity_finder()
    JOBS_PARAMETER = "jobs"
    JOB_TITLE_PARAMETER = "title"
    JOB_DESCRIPTION_PARAMETER = "description"
    PREVIOUS_JOB_TITLE_PARAMETER = "previous_title"
    PREVIOUS_JOB_DESCRIPTION_PARAMETER = "previous_description"
    SOC_HINT_PARAMETER = "soc_hint"
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

    for skill in universal_skill_set:
        l = len(skill.split())
        if l > ngram_limit:
            ngram_limit = l

    def __init__(self):
        LAY_TITLE_LIST_NAME = 'lay_title_list'
        LAY_TITLE_DICT_NAME = 'lay_title_dict'
        SIMILAR_TITLE_DICT_NAME = 'title_to_similar_title_dict'
        CITY_LIST = 'city_list'
        STATE_LIST = 'state_list'
        STATE_CODES = 'state_codes'
        SOC_MASTER_DICT_NAME = 'soc_master_dict'
        SOC_MAPPING_NAME = 'soc_mapping'
        SIMILAR_DICT_NAME = 'similar_title_dict'

        try:
            self.soc_master_dict = load_file(SOC_MASTER_DICT_NAME)
            self.soc_mapping = load_file(SOC_MAPPING_NAME)
            self.similar_title_dict = load_file(SIMILAR_DICT_NAME)
            self.lay_title_list = load_file(LAY_TITLE_LIST_NAME)
            self.title_to_similar_title_dict = load_file(SIMILAR_TITLE_DICT_NAME)
            self.lay_title_dict = load_file(LAY_TITLE_DICT_NAME)
            self.city_list = load_file(CITY_LIST)
            self.state_list = load_file(STATE_LIST)
            self.state_codes = load_file(STATE_CODES)
        except:
            [self.soc_master_dict, self.lay_title_list,
             self.soc_mapping] = norm_job.dbutils.create_all_lay_title_mappings()
            [self.similar_title_dict, self.title_to_similar_title_dict
             ] = norm_job.dbutils.create_all_similar_title_mappings()
            [self.lay_title_dict, self.lay_title_list
             ] = create_lay_title_dict_and_lower_list(self.lay_title_list)
            res = create_lay_title_dict_and_lower_list(
                self.title_to_similar_title_dict, stem_key=True)
            self.lay_title_dict.update(res[0])
            self.title_to_similar_title_dict = res[1]
            del res
            results = create_location_maps(norm_job.dbutils)
            self.city_list = results["city_list"]
            self.state_list = results["state_list"]
            self.state_codes = results["state_codes"]
            try:
                folder = configurator.commons.JOB_API_INIT_FILES_PATH
                if folder:
                    if folder.strip()[-1] != '/':
                        folder = folder.strip() + "/"
                    if not os.path.exists(folder):
                        os.makedirs(folder)
                    temp_folder_path = folder + str(os.getpid()) + "_" + str(
                        time.time())
                    try:
                        logging.info(
                            save_to_file(LAY_TITLE_DICT_NAME,
                                         self.lay_title_dict, temp_folder_path))
                        logging.info(
                            save_to_file(LAY_TITLE_LIST_NAME,
                                         self.lay_title_list, temp_folder_path))
                        logging.info(
                            save_to_file(SIMILAR_TITLE_DICT_NAME,
                                         self.title_to_similar_title_dict,
                                         temp_folder_path))
                        logging.info(
                            save_to_file(CITY_LIST, self.city_list,
                                         temp_folder_path))
                        logging.info(
                            save_to_file(STATE_LIST, self.state_list,
                                         temp_folder_path))
                        logging.info(
                            save_to_file(STATE_CODES, self.state_codes,
                                         temp_folder_path))
                        logging.info(
                            save_to_file(SOC_MASTER_DICT_NAME,
                                         self.soc_master_dict, temp_folder_path))
                        logging.info(
                            save_to_file(SOC_MAPPING_NAME, self.soc_mapping,
                                         temp_folder_path))
                        logging.info(
                            save_to_file(SIMILAR_DICT_NAME,
                                         self.similar_title_dict,
                                         temp_folder_path))
                    except Exception as e:
                        root.exception(e)
                        # if folder exists (may be due to parallel processes)
                        # then remove current temporary folder

                    ''' Rename directory '''
                    if os.path.exists(temp_folder_path):
                        for filename in listdir(join(folder, temp_folder_path)):
                            move(join(folder, temp_folder_path, filename),
                                 join(folder, filename))
                        rmdir(temp_folder_path)
            except Exception as e:
                root.exception(e)
                pass

        remove_cities = [
            'teller', 'home', 'cook', 'grill', 'helper', 'industrial', 'mobile'
        ]
        for city in remove_cities:
            if city in self.city_list:
                del self.city_list[city]

        self.soc_lay_title_token_list = {}
        for soc, lts in self.lay_title_list.items():
            self.soc_lay_title_token_list[soc] = {}
            for lt in lts:
                for token in set(lt.split()):
                    if token not in self.soc_lay_title_token_list[soc]:
                        self.soc_lay_title_token_list[soc][token] = set()
                    self.soc_lay_title_token_list[soc][token].add(lt)
                if lt in self.title_to_similar_title_dict:
                    for st in self.title_to_similar_title_dict[lt]:
                        for token in set(st.split()):
                            if token not in self.soc_lay_title_token_list[soc]:
                                self.soc_lay_title_token_list[soc][token] = set()
                            self.soc_lay_title_token_list[soc][token].add(st)

        '''Load Model'''
        try:
            self.model = SOCClassifierFactory.create_classifier(
                configurator.commons.JOB_POSTING_CLF_NAME)
        except Exception as e:
            root.exception(e)
            exit()

        f = open('dictionaries/selected_ngrams_for_driver.csv', 'rb')
        fr = csv.reader(f, delimiter='\t')
        self.driver_ngrams_set = set(
            [row[0] for row in fr if (row[0] and row[0].strip())])
        ltm_cursor = norm_job.dbutils.fetch_data(
            configurator.commons.LAY_TITLE_MASTER, 'cursor',
            {'soc_code': {'$regex': '^53-'}}, {'soc_code': 1})
        self.driver_soc_codes = [(ltm_elem['soc_code'], 100)
                                 for ltm_elem in ltm_cursor]
        root.info("API Start Time= {}s".format(time.time() -
                                               norm_job.api_start_time))

    def extract_soc_code(self, text, prefix, suffix):
        if text.startswith(prefix) and text.endswith(suffix):
            return text[len(prefix):-len(suffix)]
        return None

    @staticmethod
    def fetch_closest_lay_title(lay_title_list, soc_lay_title_token_list,
                                soc_code_tuple, job_title, job_description):
        closest_lay_title = configurator.commons.DEFAULT_CLOSEST_LAY_TITLE
        default_soc = soc_code_tuple[0]
        top_soc = soc_code_tuple[0]
        valid_lay_titles = set()
        tokens = job_title.split()
        for soc_code in soc_code_tuple:
            if soc_code in soc_lay_title_token_list:
                for token in tokens:
                    if token in soc_lay_title_token_list[soc_code]:
                        valid_lay_titles = valid_lay_titles.union(
                            soc_lay_title_token_list[soc_code][token])
        if len(valid_lay_titles):
            closest_lay_title = norm_job.sf.find_closest_lay_title(
                valid_lay_titles, job_title, job_description)
        for soc in soc_code_tuple:
            if soc in lay_title_list and closest_lay_title in set(
                    lay_title_list[soc]):
                default_soc = soc
                break
        return closest_lay_title, default_soc, top_soc

    @cherrypy.expose
    @cherrypy.tools.json_out()
    @cherrypy.tools.json_in()
    def normalize(self, **other_params):
        cherrypy.response.headers['Content-Type'] = "application/json"
        params = {}
        if cherrypy.request.method == "POST":
            params = cherrypy.request.json
        error_message = str()
        error_flag = False
        job_description = ""
        batch_size = 0
        responses = []  # initialized up front so the error path below still works
        total_time = time.time()
        if norm_job.JOBS_PARAMETER not in params:
            error_flag = True
            error_message = configurator.commons.MALFORMED_REQUEST_ERROR_MESSAGE
        else:
            jobs = params[norm_job.JOBS_PARAMETER]
            job_array = []
            skill_array = []
            bypass_array = []
            batch_size = len(jobs)
            for job in jobs:
                try:
                    filtered_title = job[norm_job.JOB_TITLE_PARAMETER]
                    if "instead of" in filtered_title.lower():
                        filtered_title = filtered_title[:filtered_title.lower()
                                                        .find("instead of")].strip()
                    filtered_title = create_key(filtered_title, self.city_list,
                                                self.state_list, self.state_codes)
                    job[norm_job.JOB_TITLE_PARAMETER] = filtered_title
                except:
                    filtered_title = ""
                job_description = ""
                if norm_job.JOB_DESCRIPTION_PARAMETER in job:
                    job_description = job[norm_job.JOB_DESCRIPTION_PARAMETER]
                title_ngrams = find_all_ngrams_upto(filtered_title.lower(), 4)
                if title_ngrams.intersection(self.driver_ngrams_set):
                    bypass_array.append(1)
                else:
                    job_array.append((filtered_title, job_description))
                    bypass_array.append(0)
                imp_skills = set()
                if job_description:
                    sentences = norm_job.sent_detector.tokenize(job_description)
                    for sentence in sentences:
                        lower_sentence = sentence.lower()
                        sentence_n_grams = find_all_ngrams_upto(
                            lower_sentence, norm_job.ngram_limit)
                        imp_skills.update(
                            sentence_n_grams.intersection(
                                norm_job.universal_skill_set))
                skill_array.append(imp_skills)

            start_time = time.time()
            prediction_array = self.model.predict(job_array)
            root.info("Context Free classification for {0} points done in {1}s".
                      format(len(prediction_array), time.time() - start_time))
            del job_array
            # root.info(prediction_array)
            start_time = time.time()
            for point_index, selector_value in enumerate(bypass_array):
                if selector_value:
                    soc_codes_with_conf = self.driver_soc_codes
                else:
                    soc_codes_with_conf = prediction_array.pop(0)
                soc_codes = [
                    soc[0] for soc in sorted(
                        soc_codes_with_conf, key=lambda k: k[1], reverse=True)
                ]
                try:
                    job_title = jobs[point_index][norm_job.JOB_TITLE_PARAMETER]
                    if "instead of" in job_title.lower():
                        job_title = job_title[:job_title.lower()
                                              .find("instead of")].strip()
                except:
                    error_flag = True
                    error_message = configurator.commons.MALFORMED_REQUEST_ERROR_MESSAGE
                if not error_flag:
                    response_json = {}
                    response_json["index"] = point_index
                    response_json["clean_original_title"] = format_skills(
                        jobs[point_index][norm_job.JOB_TITLE_PARAMETER])
                    response_json["soc_code"] = ''
                    response_json["confidence"] = 0
                    response_json["closest_lay_title"] = ''
                    response_json["major_group_string"] = ''
                    response_json["skills"] = list(skill_array[point_index])
                    if not soc_codes:
                        ''' The given job posting could not be normalized using
                        our standard algorithm. We should use the soc_hint
                        parameter present here to see if we can find a nearby
                        title in the given hint SOC code. '''
                        if norm_job.SOC_HINT_PARAMETER in jobs[point_index]:
                            soc_hint = jobs[point_index][
                                norm_job.SOC_HINT_PARAMETER]
                            if soc_hint in self.soc_mapping:
                                ''' This is a valid SOC Code '''
                                associated_soc_codes = self.soc_mapping[soc_hint]
                                soc_codes = list(associated_soc_codes)
                                root.info(
                                    "Hinted {} hence, Comparing Against Codes {}"
                                    .format(soc_hint, soc_codes))
                            else:
                                ''' This is an invalid SOC Code and we can't do
                                much about it. '''
                                root.info(
                                    "No matching SOC Code found in soc_hint {}. "
                                    "Cannot normalize.".format(soc_hint))
                    if soc_codes:
                        key_string = filter_chain.apply(
                            convert_encoding(job_title), is_title=True)[1]
                        closest_lay_title_tuple = norm_job.fetch_closest_lay_title(
                            self.lay_title_list, self.soc_lay_title_token_list,
                            soc_codes, key_string, "")
                        major_group_string = configurator.commons.DEFAULT_MAJOR_GROUP_STRING
                        if closest_lay_title_tuple[1] in self.soc_master_dict:
                            major_group_string = self.soc_master_dict[
                                closest_lay_title_tuple[1]]['major_group_string']
                        lay_title = convert_encoding(closest_lay_title_tuple[0])
                        if lay_title in self.lay_title_dict:
                            lay_title = self.lay_title_dict[lay_title]
                        if lay_title in self.similar_title_dict:
                            lay_title = self.similar_title_dict[lay_title]
                        response_json["soc_code"] = closest_lay_title_tuple[1]
                        response_json["confidence"] = int(
                            dict(soc_codes_with_conf)[closest_lay_title_tuple[1]])
                        response_json['top_soc'] = closest_lay_title_tuple[2]
                        response_json["closest_lay_title"] = lay_title
                        response_json["major_group_string"] = major_group_string
                else:
                    response_json = {
                        "error_code":
                        configurator.commons.MALFORMED_REQUEST_ERROR_STATUS,
                        "message": error_message
                    }
                responses.append(response_json)
                error_flag = False
                if (point_index + 1) % 1000 == 0:
                    root.info("{0} points done in {1}s".format(
                        point_index, time.time() - start_time))
                    start_time = time.time()

        responses_object = {"normalized_jobs": responses}
        if error_flag:
            cherrypy.response.status = configurator.commons.MALFORMED_REQUEST_ERROR_STATUS
            responses_object = {
                "error_code": configurator.commons.MALFORMED_REQUEST_ERROR_STATUS,
                "message": error_message
            }
        root.info("{0} points done in {1}s".format(batch_size,
                                                   time.time() - total_time))
        return responses_object
def import_csv_files(measure_names, age_names, metric_names, sex_names,
                     parent_tag_name, namespace, csv_dir,
                     default_source_description, get_key, get_var_name,
                     get_var_code):
    logging.basicConfig(
        filename=os.path.join(
            CURRENT_PATH, '..', 'logs',
            '%s-%s.log' % (os.environ['DB_NAME'], namespace)),
        level=logging.INFO,
        format='%(asctime)s [%(levelname)s] %(name)s: %(message)s')
    logger = logging.getLogger('importer')

    try:
        with connection as c:
            db = DBUtils(c)
            total_data_values_upserted = 0

            def get_state():
                return {
                    **db.get_counts(),
                    'data_values_upserted': total_data_values_upserted,
                }

            def log_state():
                message = " · ".join(
                    str(key) + ": " + str(value)
                    for key, value in get_state().items())
                print(message)
                logger.info(message)

            # The user ID that gets assigned in every user ID field
            (user_id, ) = db.fetch_one("""
                SELECT id FROM users WHERE email = '*****@*****.**'
            """)

            # Create the parent tag
            parent_tag_id = db.upsert_parent_tag(parent_tag_name)

            tag_id_by_name = {
                name: i
                for name, i in db.fetch_many(
                    """
                    SELECT name, id
                    FROM tags
                    WHERE parentId = %s
                """, parent_tag_id)
            }

            # Intentionally kept empty in order to force sources to be updated
            # (in order to update `retrievedDate`)
            dataset_id_by_name = {}

            var_id_by_code = {
                code: i
                for code, i in db.fetch_many(
                    """
                    SELECT variables.code, variables.id
                    FROM variables
                    LEFT JOIN datasets ON datasets.id = variables.datasetId
                    WHERE datasets.namespace = %s
                """, [namespace])
            }

            # Keep track of variables that we have changed the `updatedAt` column for
            touched_var_codes = set()
            # Keep track of which variables have had their data_values removed
            cleared_var_codes = set()

            # Intentionally kept empty in order to force sources to be updated
            # (in order to update `retrievedDate`)
            source_id_by_name = {}

            data_values_to_insert = []
            insert_data_value_sql = """
                INSERT INTO data_values (value, year, entityId, variableId)
                VALUES (%s, %s, %s, %s)
                ON DUPLICATE KEY UPDATE value = VALUES(value)
            """
            # We need an ON DUPLICATE handler here because the GBD dataset has
            # South Asia under two codes: 158 and 159. Both have the same values
            # in our extract, so we can safely ignore overwrites.

            for filename in glob.glob(os.path.join(csv_dir, '*.csv')):
                with open(filename, 'r', encoding='utf8') as f:
                    print('Processing: %s' % filename)
                    logger.info('Processing: %s' % filename)
                    reader = csv.DictReader(f)
                    row_number = 0
                    for row in reader:
                        row_number += 1
                        if row_number % 100 == 0:
                            # Sleep for 1 ms after every 100th row so the import
                            # does not keep the CPU busy all the time.
                            time.sleep(0.001)

                        key = get_key(row)

                        # Skip rows we don't want to import
                        if row['sex_name'] not in sex_names \
                           or row['age_name'] not in age_names \
                           or row['metric_name'] not in metric_names \
                           or row['measure_name'] not in measure_names:
                            continue

                        if key not in tag_id_by_name:
                            tag_id_by_name[key] = db.upsert_tag(key, parent_tag_id)

                        if key not in dataset_id_by_name:
                            dataset_id_by_name[key] = db.upsert_dataset(
                                name=key,
                                namespace=namespace,
                                tag_id=tag_id_by_name[key],
                                user_id=user_id)

                        if key not in source_id_by_name:
                            source_id_by_name[key] = db.upsert_source(
                                name=key,
                                description=json.dumps(
                                    default_source_description),
                                dataset_id=dataset_id_by_name[key])

                        var_name = get_var_name(row)
                        var_code = get_var_code(row)

                        if var_code not in var_id_by_code:
                            var_id_by_code[var_code] = db.upsert_variable(
                                name=var_name,
                                code=var_code,
                                unit=row['metric_name'],
                                short_unit=extract_short_unit(row['metric_name']),
                                dataset_id=dataset_id_by_name[key],
                                source_id=source_id_by_name[key])
                            touched_var_codes.add(var_code)
                        elif var_code not in touched_var_codes:
                            db.touch_variable(var_id_by_code[var_code])
                            touched_var_codes.add(var_code)

                        var_id = var_id_by_code[var_code]

                        entity_name = get_standard_name(row['location_name'])
                        entity_id = db.get_or_create_entity(entity_name)

                        value = get_metric_value(row)
                        year = int(row['year'])

                        if var_code not in cleared_var_codes:
                            db.execute_until_empty(
                                """
                                DELETE FROM data_values
                                WHERE data_values.variableId = %s
                                LIMIT 100000
                            """, [var_id])
                            cleared_var_codes.add(var_code)

                        data_values_to_insert.append(
                            (value, year, entity_id, var_id))

                        if len(data_values_to_insert) >= 50000:
                            db.upsert_many(insert_data_value_sql,
                                           data_values_to_insert)
                            total_data_values_upserted += len(
                                data_values_to_insert)
                            data_values_to_insert = []
                            log_state()

            if len(data_values_to_insert):  # insert any leftover data_values
                db.upsert_many(insert_data_value_sql, data_values_to_insert)
                total_data_values_upserted += len(data_values_to_insert)
                data_values_to_insert = []
                log_state()

            db.note_import(import_type=namespace,
                           import_notes='A gbd import was performed',
                           import_state=json.dumps(get_state()))
    except:
        logger.exception("error")
        raise
    return print(*args, file=sys.stderr, **kwargs)


def chunk_df(df, n):
    """Yield successive n-sized chunks from data frame."""
    for i in range(0, df.shape[0], n):
        yield df[i:i + n]


tz_utc = tz_db = timezone.utc
tz_local = datetime.now(tz_utc).astimezone().tzinfo
tz_london = pytz.timezone('Europe/London')

if __name__ == "__main__":
    with connection as c:
        db = DBUtils(c)

        # Check whether the database is up to date, by checking the
        # - last modified date of the Grapher file
        # - last modified date of the database row
        #
        # This is not bulletproof, but it allows for flexibility – authors could
        # manually update the repo, and that would trigger a database update too.
        (db_dataset_id, db_dataset_modified_time) = db.fetch_one(
            """
            SELECT id, dataEditedAt
            FROM datasets
            WHERE name = %s
            AND namespace = %s
        """, [ecdc.DATASET_NAME, NAMESPACE])
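# Illustrative only: how chunk_df above is typically consumed (mirroring the
# 50,000-row batches used by import_dataset); the DataFrame contents below are
# made up for the sake of the example.
#
#     import pandas as pd
#     example_df = pd.DataFrame({'value': range(120000)})
#     for chunk in chunk_df(example_df, 50000):
#         print(len(chunk))  # prints 50000, 50000, 20000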
from glob import glob

from db import connection
from db_utils import DBUtils

# ## Database importer

# In[2]:

entities = pd.read_csv('./standardization/entities-standardized.csv')

# In[3]:

with connection as c:
    db = DBUtils(c)

    all_entities = entities.copy()
    new_entities = all_entities[all_entities['db_entity_id'].isnull()]
    for _, entity in new_entities.iterrows():
        entity_id = entity.name
        entity_name = entity['name']
        db_entity_id = db.get_or_create_entity(entity_name)
        all_entities.loc[entity_id, 'db_entity_id'] = db_entity_id

    db_entity_id_by_name = {
        row['name']: int(row['db_entity_id'])
        for _, row in all_entities.iterrows()
    }
            label_wise_removal[label]['removed'] += 1
        count += 1
        if count % 10000 == 0:
            print count, len(removable_ids)
    print count, len(removable_ids)
    for _id in removable_ids:
        db.remove_entry(collection_name, query_dict={"_id": _id})
    print "DONE CLEANING ON COLLECTION: {}, REMOVED {} ENTRIES".format(
        collection_name, len(removable_ids))
    for label, removed_object in label_wise_removal.items():
        print "Label: {}, Total: {}, Removed: {}".format(
            label, removed_object['total'], removed_object['removed'])


if __name__ == "__main__":
    db = DBUtils(db_name='test_db', host="localhost")
    collection_name = 'paragraphs'  # "responsibilities_paragraphs_3"
    training_collection_name = "training_paragraphs"
    test_collection_name = "test_paragraphs"
    non_labeled_collection_name = "non_labeled_paragraphs"
    training_part_fraction = 0.7

    # training_test_spliter(collection_name, training_collection_name, test_collection_name, non_labeled_collection_name, training_part_fraction)
    # clean_data_set(training_collection_name)
    # clean_data_set(test_collection_name)
    # token_label_map = read_label_tokens()
    # train_model(training_collection_name)
    # test_model(test_collection_name)
    # test_data = [
    def __init__(self):
        self.sql_builder = SqlBuilder()
        self.db_utils = DBUtils()
from db_utils import DBUtils
from utils import find_all_ngrams_upto
import re
import csv

db = DBUtils(db_name='JobListings', host='master.mongodb.d.int.zippia.com')
collection_name = db.fetch_data('MetaCollection')[0]['current']
major_group = 53


def print_ngrams_for_mg():
    cursor = db.fetch_data(
        collection_name, 'cursor',
        {'socCode': {'$regex': '^' + str(major_group) + '-'}},
        {'titleDisplay': 1})
    n_gram_freq = {}
    count = 0
    for elem in cursor:
        title = elem['titleDisplay']
        title = re.sub("\W+", " ", title)
        n_grams = find_all_ngrams_upto(title.lower(), n=4)
        for ng in n_grams:
            if ng not in n_gram_freq:
                n_gram_freq[ng] = 0
            n_gram_freq[ng] += 1
        count += 1
        if count % 10000 == 0:
            print "{} entries processed".format(count)
    print "{} entries processed".format(count)
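# Illustrative only: a plausible stand-in for utils.find_all_ngrams_upto as it is
# used above (the real implementation lives in this project's utils module). It is
# assumed to return the set of all whitespace-delimited n-grams of length 1..n,
# which is consistent with the .intersection() calls made on its result elsewhere.
def find_all_ngrams_upto_sketch(text, n=4):
    tokens = text.split()
    ngrams = set()
    for size in range(1, n + 1):
        for start in range(len(tokens) - size + 1):
            ngrams.add(" ".join(tokens[start:start + size]))
    return ngrams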
import networkx as nx
from db_utils import DBUtils
import csv
import time
import re
import numpy as np
from configurator import configurator
import utils
from filter_chain import remove_stop_words
from sklearn.externals import joblib
from functools import partial
from multiprocessing import Pool
import math

dbutils = DBUtils(db_name='zippia', host='master.mongodb.d.int.zippia.com')
db_local = DBUtils(db_name='test', host='localhost')
db_zippia2 = DBUtils(db_name='zippia2', host='localhost')


def format_date(date_string):
    date_string = re.sub("\W+", " ", date_string.lower())
    date_string = re.sub("\d+[ ]*((year)|(yr))([s]{0,1})", " ", date_string)
    date_string = re.sub("\d+[ ]*((month)|(mt))([s]{0,1})", " ", date_string)
    date_string = re.sub(" +", " ", date_string).strip()
    return date_string


class valid_edge_checker:
    level_dict = {
        "Lead": set([
def print_paths(depriotize_starts, major_title_dict1, major_title_dict2, degrees,
                work_meta_info, min_conf, required_skills_threshold, top_k,
                edge_count_threshold, index_val):
    collection_name = "careerPathsForMajors_test_time"
    db_local = DBUtils(db_name='test', host='localhost')
    title_skills = read_skill_master()
    title_time_dict = read_local_skill_master()
    if index_val == 1:
        major_title_dict = major_title_dict1
    else:
        major_title_dict = major_title_dict2
    for major, titles_dict in major_title_dict.items():
        start_time = time.time()
        final_valid_paths = {}
        final_valid_paths['name'] = major
        final_valid_paths['version'] = 4
        final_valid_paths['graduate_paths'] = []
        final_valid_paths['under_graduate_paths'] = []
        final_valid_paths['other_paths'] = []
        iterate_over_degrees = ["Graduate", "Under Graduate", "Other"]
        major_edge_count_threshold = 0.1  # edge_count_threshold
        [sts, ets] = combine_start_and_end_titles(titles_dict)
        edges = create_graph_for_majors(major, degrees, work_meta_info, min_conf,
                                        depriotize_starts)
        joblib.dump(
            edges, '/mnt/data/rohit/major_edge_count/' +
            "_".join(re.sub("\W+", " ", major).split()) + '_edge_counts.pkl')
        while major_edge_count_threshold > 0 and iterate_over_degrees:
            career_graph = create_graph(edges, sts, ets,
                                        major_edge_count_threshold)
            [result_paths, edges] = print_paths_iterator(
                major, titles_dict, title_skills, required_skills_threshold,
                career_graph, edges, title_time_dict, iterate_over_degrees)

            if (len(result_paths['graduate_paths']) >= 15
                    and not final_valid_paths['graduate_paths']):
                final_valid_paths['graduate_paths'].extend(
                    result_paths['graduate_paths'])
                iterate_over_degrees.remove("Graduate")
            elif (result_paths['graduate_paths']
                  and len(final_valid_paths['graduate_paths']) < 15):
                append_new_paths(result_paths, edges, final_valid_paths,
                                 path_name='graduate_paths')
                if len(final_valid_paths['graduate_paths']) >= 15:
                    iterate_over_degrees.remove("Graduate")

            if (len(result_paths['under_graduate_paths']) >= 15
                    and not final_valid_paths['under_graduate_paths']):
                final_valid_paths['under_graduate_paths'].extend(
                    result_paths['under_graduate_paths'])
                iterate_over_degrees.remove("Under Graduate")
            elif (result_paths['under_graduate_paths']
                  and len(final_valid_paths['under_graduate_paths']) < 15):
                append_new_paths(result_paths, edges, final_valid_paths,
                                 path_name='under_graduate_paths')
                if len(final_valid_paths['under_graduate_paths']) >= 15:
                    iterate_over_degrees.remove("Under Graduate")

            if (len(result_paths['other_paths']) >= 15
                    and not final_valid_paths['other_paths']):
                final_valid_paths['other_paths'].extend(
                    result_paths['other_paths'])
                iterate_over_degrees.remove("Other")
            elif (result_paths['other_paths']
                  and len(final_valid_paths['other_paths']) < 15):
                append_new_paths(result_paths, edges, final_valid_paths,
                                 path_name='other_paths')
                if len(final_valid_paths['other_paths']) >= 15:
                    iterate_over_degrees.remove("Other")

            major_edge_count_threshold -= 1
        db_local.insert_records(collection_name, final_valid_paths)
        print "Done Major: {} in {}s".format(major, time.time() - start_time)