import os
import sys

# make the custom tools importable (path used by the deployed versions below)
sys.path.append('/home/ubuntu/job-classifier/tools/')

import bototools as bt
import xmltools as xt
import nlp_preprocessing as nlp

# load_pickle & get_city_df are repo helpers not shown here -- assumed
# sketches follow this version


def main():
    '''Run classification on job data

    Process directly from partner feeds

    Current configuration:
    + Load csv data from xml feed
    + Count Vectorize with Bag of Words
    + Label using Naive Bayes Classifier
    + Write results back to s3
    '''
    # define paths
    key_path = '/home/ubuntu/job-classifier/.keys/'
    model_path = '/home/ubuntu/job-classifier/models/'

    # dict with partner:url pairs
    partners = load_pickle(os.path.join(key_path, 'partners.pickle'))
    s3_details = load_pickle(os.path.join(key_path, 's3_config.pickle'))

    bucket = s3_details['csv_bucket']
    file_to_write = s3_details['target']

    cv_model = load_pickle(os.path.join(model_path, 'CV_nb_bow_model.pckl'))
    # update cv model above
    #mnb_model = load_pickle(os.path.join(model_path, 'multi_nb_model.pckl'))
    cnb_model = load_pickle(
        os.path.join(model_path, 'complement_nb_model.pckl'))

    # cities & roles
    city_roles = {
        'driver': ['new york', 'los angeles'],
        'nurse': ['dallas', 'los angeles'],
        'tech': ['san francisco', 'boston']
    }

    # pull xml from url and parse into df
    for partner in partners:
        print('\nProcessing:', partner.upper())
        url = partners[partner]
        if url.endswith('.xml.gz'):
            df = xt.xml_from_url_compressed(url)
        else:
            df = xt.xml_from_url(url)

        # standardize text format
        df = nlp.standardize_text(df, 'title')

        # select data to predict from
        X_classify = df['title'].tolist()

        # get count vec
        X_classify_counts = nlp.get_cv_test_counts(X_classify, cv_model)

        # predict with model
        #y_label = mnb_model.predict(X_classify_counts)
        y_label = cnb_model.predict(X_classify_counts)

        # assign predictions to jobs & prune dataframe
        df['label'] = y_label
        label_cols = ['label', 'company', 'title', 'city', 'state', 'url']
        #labels_to_drop = ['ignore','driver','service']
        labels_to_drop = ['ignore', 'service']
        df_tmp = df[~(df['label'].isin(labels_to_drop))][label_cols]

        for role in city_roles:
            print('processing:', role.upper())
            for city in city_roles[role]:
                print('city:', city.upper())
                # filter from the pruned frame (df_tmp), not the raw df
                df_to_write = get_city_df(df_tmp, city, role)

                # write labeled roles
                city_file = city.replace(' ', '_')
                label_key = partner + '/' + role + '/' + city_file + '/' + 'jobs.csv'
                bt.write_df_to_s3(df_to_write, bucket, label_key, comp=False)
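
# ---------------------------------------------------------------------------
# `load_pickle` and `get_city_df` are called above but defined elsewhere in
# the repo. The sketches below are assumptions inferred from the call sites,
# not confirmed implementations: the pickle loader is standard, but the exact
# city/role matching rules (exact match vs. substring, casing) are guesses.
# ---------------------------------------------------------------------------

import pickle


def load_pickle(path):
    '''Assumed helper: load and return one pickled object from `path`.'''
    with open(path, 'rb') as f:
        return pickle.load(f)


def get_city_df(df, city, role):
    '''Assumed helper: select rows for one predicted role in one city.

    Column names ('label', 'city') come from the calling code; `city` is
    lowercase there ('new york'), so the frame's city is lowered to match.
    '''
    mask = (df['label'] == role) & (df['city'].str.lower() == city)
    return df[mask]
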
import os
import sys

sys.path.append('/home/ubuntu/job-classifier/tools/')

import bototools as bt
import xmltools as xt
import nlp_preprocessing as nlp


def main():
    '''Run classification on job data

    Process directly from partner feeds

    Current configuration:
    + Load csv data from xml feed
    + Count Vectorize with Bag of Words
    + Label using Naive Bayes Classifier
    + Write results back to s3
    '''
    # define paths
    key_path = '/home/ubuntu/job-classifier/.keys/'
    model_path = '/home/ubuntu/job-classifier/models/'

    # dict with partner:url pairs
    partners = load_pickle(os.path.join(key_path, 'partners.pickle'))
    s3_details = load_pickle(os.path.join(key_path, 's3_config.pickle'))

    bucket = s3_details['csv_bucket']
    file_to_write = s3_details['target']

    cv_model = load_pickle(os.path.join(model_path, 'CV_nb_bow_model.pckl'))
    # update cv model above
    #clf_model = load_pickle(os.path.join(model_path, 'lr_bow_train_only_model.pckl'))
    #mnb_model = load_pickle(os.path.join(model_path, 'multi_nb_model.pckl'))
    cnb_model = load_pickle(
        os.path.join(model_path, 'complement_nb_model.pckl'))

    # pull xml from url and parse into df
    for partner in partners:
        url = partners[partner]
        if url.endswith('.xml.gz'):
            df = xt.xml_from_url_compressed(url)
        else:
            df = xt.xml_from_url(url)

        # standardize text format
        df = nlp.standardize_text(df, 'title')

        # select data to predict from
        X_classify = df['title'].tolist()

        # get count vec
        X_classify_counts = nlp.get_cv_test_counts(X_classify, cv_model)

        # predict with model
        #y_label = mnb_model.predict(X_classify_counts)
        y_label = cnb_model.predict(X_classify_counts)

        # assign predictions to jobs & prune dataframe
        df['label'] = y_label
        label_cols = ['label', 'company', 'title', 'city', 'state', 'url']
        labels_to_drop = ['ignore', 'driver', 'service']
        df_to_write = df[~(df['label'].isin(labels_to_drop))][label_cols]
        #df_to_write = df[~(df['label']=='ignore')][label_cols]

        # SAMPLE DF_TO_WRITE for smaller dataset
        df_to_write = df_to_write.sample(n=100)

        # write labeled roles
        label_key = partner + '/' + 'labeled_jobs.csv'
        bt.write_df_to_s3(df_to_write, bucket, label_key, comp=False)
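
# ---------------------------------------------------------------------------
# `bototools.write_df_to_s3` is not included in this snippet. A minimal
# sketch of what it plausibly does, assuming csv serialization and gzip when
# `comp` is truthy (both inferred from the call sites, not confirmed):
# ---------------------------------------------------------------------------

import gzip

import boto3


def write_df_to_s3(df, bucket, key, comp=False):
    '''Assumed helper: serialize `df` as csv and upload to s3://bucket/key.'''
    body = df.to_csv(index=False).encode('utf-8')
    if comp:
        body = gzip.compress(body)
    boto3.client('s3').put_object(Bucket=bucket, Key=key, Body=body)
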
import os
import sys
import pickle
import time


def main():
    '''Run classification on job data

    Current configuration:
    + Load csv data from xml feed
    + Count Vectorize with Bag of Words
    + Predict with Logistic Regression
    + Select results on LogReg class probability
    + Write results back to s3
    '''
    # add custom modules to path
    path_to_module = '/home/ubuntu/job-classifier/tools/'
    sys.path.append(path_to_module)

    # load s3 connector & preprocessing functions
    import bototools as bt
    import xmltools as xt
    import nlp_preprocessing as nlp

    # load s3 & feed location details
    path_to_data = '/home/ubuntu/job-classifier/.keys/'
    file_name = 'cron_data_file.json'
    bucket, key, url, target = bt.load_s3_location(path_to_data, file_name)

    # pull xml from url and parse into df
    df = xt.xml_from_url(url)

    # standardize text format
    cols_to_model = ['title']
    for col in cols_to_model:
        df = nlp.standardize_text(df, col)

    # select data to predict from
    X_classify = df['title'].tolist()

    # define model path
    path_to_models = '/home/ubuntu/job-classifier/models/'

    # load count vectorizer & transform data to predict
    cv_pickle = 'CV_lr_bow_train_only_model.pckl'
    cv_path = os.path.join(path_to_models, cv_pickle)
    with open(cv_path, 'rb') as f:
        cv_model = pickle.load(f)
    X_classify_counts = nlp.get_cv_test_counts(X_classify, cv_model)

    # load pre-trained model & predict
    clf_pickle = 'lr_bow_train_only_model.pckl'
    clf_path = os.path.join(path_to_models, clf_pickle)
    with open(clf_path, 'rb') as f:
        clf_model = pickle.load(f)
    y_predicted = clf_model.predict(X_classify_counts)
    y_prob = clf_model.predict_proba(X_classify_counts)

    # assign predictions to jobs & prune dataframe
    df['gig'] = y_predicted
    cols_to_write = ['company', 'title', 'city', 'state', 'url']
    df_to_write = df[df['gig'] == 1][cols_to_write]

    # write jobs to accessible location on s3
    # custom name by date -- test overlap between days
    timestr = time.strftime("%Y-%m-%d")
    prefix, fn = target.split('/')
    file_to_write = prefix + '/' + timestr + '-' + fn
    bt.write_df_to_s3(df_to_write, bucket, file_to_write, comp=False)

    # add labeled samples to validate for future training
    df_positive = df[df['gig'] == 1].sample(1000)
    file_positive = 'positive' + '/' + timestr + '-' + fn
    bt.write_df_to_s3(df_positive, bucket, file_positive, comp=False)

    df_negative = df[df['gig'] == 0].sample(1000)
    file_negative = 'negative' + '/' + timestr + '-' + fn
    bt.write_df_to_s3(df_negative, bucket, file_negative, comp=False)
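
# ---------------------------------------------------------------------------
# `bototools.load_s3_location` is not shown in this snippet. Judging only by
# the call sites, it reads a small json config from the .keys directory and
# returns (bucket, key, url, target). The json field names below are pure
# assumptions for illustration; only the return order comes from the code:
# ---------------------------------------------------------------------------

import os
import json


def load_s3_location(path_to_data, file_name):
    '''Assumed helper: read s3 bucket/key plus feed url and output target.'''
    with open(os.path.join(path_to_data, file_name)) as f:
        cfg = json.load(f)
    return cfg['bucket'], cfg['key'], cfg['url'], cfg['target']
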
import os
import sys
import pickle


def main():
    '''Run classification on job data

    Current configuration:
    + Load csv data from xml feed
    + Count Vectorize with Bag of Words
    + Predict with Logistic Regression
    + Select results on LogReg class probability
    + Write results back to s3
    '''
    # add custom modules to path
    path_to_module = '../tools/'
    sys.path.append(path_to_module)

    # load s3 connector & preprocessing functions
    import bototools as bt
    import xmltools as xt
    import nlp_preprocessing as nlp

    # load s3 & feed location details
    path_to_data = '../.keys/'
    file_name = 'eda_data_file.json'
    bucket, key, url, target = bt.load_s3_location(path_to_data, file_name)

    # pull xml from url and parse into df
    #df = bt.load_df_from_s3(bucket, key, comp='gzip')
    df = xt.xml_from_url(url)

    # standardize text format
    cols_to_model = ['title']
    for col in cols_to_model:
        df = nlp.standardize_text(df, col)

    # select data to predict from
    X_classify = df['title'].tolist()

    # define model path
    path_to_models = '../models/'

    # load count vectorizer & transform data to predict
    cv_pickle = 'CV_lr_bow_train_only_model.pckl'
    cv_path = os.path.join(path_to_models, cv_pickle)
    with open(cv_path, 'rb') as f:
        cv_model = pickle.load(f)
    X_classify_counts = nlp.get_cv_test_counts(X_classify, cv_model)

    # load pre-trained model & predict
    clf_pickle = 'lr_bow_train_only_model.pckl'
    clf_path = os.path.join(path_to_models, clf_pickle)
    with open(clf_path, 'rb') as f:
        clf_model = pickle.load(f)
    y_predicted = clf_model.predict(X_classify_counts)
    #y_prob = clf_model.predict_proba(X_classify_counts)

    # assign predictions to jobs & prune dataframe
    df['gig'] = y_predicted
    #df['prob'] = y_prob[:,0]  # failed last test -- see note below
    cols_to_write = ['company', 'title', 'city', 'state', 'url']
    #cols_to_write = ['company','title','city','state','posted_at','url']

    # only keep listings with over 95% probability of being a gig job
    # tighten/loosen requirement depending on model
    #df_to_write = df[(df['gig']==1) & (df['prob']==0.95)][cols_to_write]
    df_to_write = df[df['gig'] == 1][cols_to_write]

    # write jobs to accessible location on s3
    #file_to_write = 'gigs/streamed_full_daily_job_list.csv'
    file_to_write = target
    bt.write_df_to_s3(df_to_write, bucket, file_to_write, comp=False)
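
# ---------------------------------------------------------------------------
# The commented-out probability filter in the version above has two likely
# bugs, which would explain the "failed last test" note: `y_prob[:,0]` is the
# probability of the *negative* class (column 1 holds P(class 1), the gig
# class), and `df['prob']==0.95` tests exact equality where the comment asks
# for "over 95%". A self-contained sketch of the intended selection, using a
# toy model in place of the real pickled vectorizer and classifier:
# ---------------------------------------------------------------------------

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

titles = ['delivery driver gig', 'registered nurse',
          'courier gig work', 'staff accountant']
labels = [1, 0, 1, 0]  # 1 = gig job

cv = CountVectorizer()
X = cv.fit_transform(titles)
clf = LogisticRegression().fit(X, labels)

toy_df = pd.DataFrame({'title': titles})
toy_df['gig'] = clf.predict(X)
toy_df['prob'] = clf.predict_proba(X)[:, 1]  # P(gig): columns follow clf.classes_

# keep only confident positives; tighten/loosen the threshold per model
confident = toy_df[(toy_df['gig'] == 1) & (toy_df['prob'] >= 0.95)]
print(confident)
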
import os
import sys
import pickle


def main():
    '''Run classification
    '''
    path_to_module = '../tools/'
    sys.path.append(path_to_module)

    # load s3 read & write functions
    import bototools as bt  # add these to actual path

    print('classifying new jobs...\n')

    path_to_data = '../.keys/'
    #file_name = 'csv_to_classify.json'
    file_name = 'eda_data_file.json'
    #bucket, key = bt.load_s3_location(path_to_data, file_name)

    # use a sample file for testing -- avoid large files for now
    # (starred unpack tolerates load_s3_location returning extra values)
    bucket, *_ = bt.load_s3_location(path_to_data, file_name)
    key = 'eda_sample_data_file.csv'
    df = bt.load_df_from_s3(bucket, key)  # sample not gzipped
    #df = bt.load_df_from_s3(bucket, key, comp='gzip')

    # import preprocessing tools
    import nlp_preprocessing as nlp

    # cleanup the dataframe to prepare for classification
    # while only SOME columns are used, ALL need to be returned for ops team
    cols_to_model = ['title']
    for col in cols_to_model:
        df = nlp.standardize_text(df, col)
    X_classify = df['title'].tolist()

    # load count vectorizer
    path_to_models = '../models/'
    cv_pickle = 'CV_lr_bow_train_only_model.pckl'  # use private file too
    cv_path = os.path.join(path_to_models, cv_pickle)
    with open(cv_path, 'rb') as f:
        cv_model = pickle.load(f)
    X_classify_counts = nlp.get_cv_test_counts(X_classify, cv_model)

    from sklearn.linear_model import LogisticRegression  # move outside main

    # load pre-trained model
    #path_to_model = '../models/'  # functionalize loading & training
    clf_pickle = 'lr_bow_train_only_model.pckl'  # use private file too
    clf_path = os.path.join(path_to_models, clf_pickle)
    with open(clf_path, 'rb') as f:
        clf_model = pickle.load(f)
    y_predicted = clf_model.predict(X_classify_counts)

    df['gig'] = y_predicted
    cols_to_write = ['company', 'title', 'city', 'state', 'posted_at', 'url']
    df_to_write = df[df['gig'] == 1][cols_to_write]
    #df_to_write = df[df['gig']==1]
    #df[df['gig']==1].to_csv('../data/classified_gig_jobs.csv', index=False)
    #print('Gig jobs found: {}'.format(df[df['gig']==1].shape[0]))

    # write output (use a prefix!)
    file_to_write = 'gigs/sample_daily_job_list.csv'
    bt.write_df_to_s3(df_to_write, bucket, file_to_write)
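
# ---------------------------------------------------------------------------
# `nlp_preprocessing` is not included in this snippet. Minimal sketches of
# the two helpers every version calls, inferred from the call sites; the
# actual cleaning rules (casing, punctuation handling) are assumptions:
# ---------------------------------------------------------------------------

def standardize_text(df, col):
    '''Assumed helper: normalize one text column and return the frame.'''
    df[col] = (df[col].astype(str)
                      .str.lower()
                      .str.replace(r'[^a-z0-9\s]', ' ', regex=True)
                      .str.replace(r'\s+', ' ', regex=True)
                      .str.strip())
    return df


def get_cv_test_counts(X, cv_model):
    '''Assumed helper: apply the already-fitted CountVectorizer. Transform
    only (no re-fit), so the vocabulary matches what the classifier saw.'''
    return cv_model.transform(X)
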