def main():
    '''Create and save model'''
    path_to_module = '../tools/'
    sys.path.append(path_to_module)

    # load s3 read & write functions
    import bototools as bt
    import nlp_preprocessing as nlp

    # if pulling from s3
    #path_to_data = '../.keys/'
    #file_name = 'csv_to_classify.json'
    #bucket, key = bt.load_s3_location(path_to_data, file_name)
    #df = bt.load_df_from_s3(bucket, key, compression='gzip')

    # local data
    path_to_data = '../data/multiclass/'
    data_file = 'labeled_large.csv'
    df = pd.read_csv(os.path.join(path_to_data, data_file))
    print('loaded CSV')

    # remove samples not of interest
    # map roles to general labels
    key_path = '../.keys'
    role_mapper = load_pickle(os.path.join(key_path, 'roles.pickle'))
    role_names = [n for n in role_mapper]

    df = df[df['role'].isin(role_names)]
    df['label'] = df['role'].map(role_mapper)
    df = df.dropna()

    cols_to_train = ['title', 'label']
    df = df[cols_to_train]

    # standardize text format
    df = nlp.standardize_text(df, 'title')

    # select data to train from
    X = df['title'].tolist()
    y = df['label'].tolist()
    #y = df['gig'].tolist()

    print('vectorizing bag of words model')
    X_counts, count_vectorizer = nlp.create_count_vectorizer(X)  # count vec

    cv_to_write = '../models/CV_nb_bow_model.pckl'
    pickle.dump(count_vectorizer, open(cv_to_write, 'wb'))
    print('successfully serialized CV model')

    # training a Naive Bayes classifier
    print('training model')
    #model = MultinomialNB().fit(X_counts, y)
    model = ComplementNB().fit(X_counts, y)

    # save model for later use (locally & on s3)
    file_to_write = '../models/complement_nb_model.pckl'
    pickle.dump(model, open(file_to_write, 'wb'))
    print('successfully serialized NB model')
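# The scripts in this section call a load_pickle() helper that is defined
# elsewhere in the repo. A minimal sketch is included here for reference,
# assuming it is a thin wrapper around pickle.load; the name and signature
# are inferred from how it is called above.
import pickle

def load_pickle(path):
    '''Load and return a pickled object from the given file path.'''
    with open(path, 'rb') as f:
        return pickle.load(f)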
def main():
    '''Run labeling on job data

    Process directly from partner feeds
    ** NOTE - this is a "dumb" method without NLP/ML **

    Current configuration:
    + Load csv data from xml feed
    + Label based on job title
    + Write results to s3
    '''
    # define paths
    key_path = '/home/ubuntu/job-classifier/.keys/'
    #model_path = '/home/ubuntu/job-classifier/models/'

    # dict with partner:url pairs
    partners = load_pickle(os.path.join(key_path, 'partners.pickle'))
    s3_details = load_pickle(os.path.join(key_path, 's3_config.pickle'))

    bucket = s3_details['csv_bucket']
    file_to_write = s3_details['target']

    # pull xml from url and parse into df
    for partner in partners:
        url = partners[partner]
        if url.endswith('.xml.gz'):
            df = xt.xml_from_url_compressed(url)
        else:
            df = xt.xml_from_url(url)

        # standardize text format
        df = nlp.standardize_text(df, 'title')

        # apply label function
        df['label'] = df.title.apply(get_role)

        label_cols = ['label', 'company', 'title', 'city', 'state', 'url']
        df_to_keep = df[~(df['label'] == 'ignore')][label_cols]

        role_dfs = {}
        for label in df_to_keep.label.unique():
            tmp_df = df_to_keep[df_to_keep['label'] == label]
            if len(tmp_df) > 100:
                tmp_df = tmp_df.sample(n=100)
            role_dfs[label] = tmp_df

        df_to_write = pd.concat([role_dfs[x] for x in role_dfs])

        # write labeled roles
        label_key = partner + '/' + 'labeled_jobs.csv'
        print('Writing {} jobs'.format(len(df_to_write)))
        bt.write_df_to_s3(df_to_write, bucket, label_key, comp=False)
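# get_role() is applied to each job title above but defined elsewhere in the
# repo. A minimal keyword-matching sketch is shown here; the keyword lists are
# illustrative assumptions, not the production mapping.
def get_role(title):
    '''Map a standardized job title to a coarse role label via keyword matching.'''
    words = set(str(title).lower().split())
    keywords = {
        'driver': {'driver', 'delivery', 'courier'},
        'nurse': {'nurse', 'rn', 'lpn'},
        'tech': {'engineer', 'developer', 'software'},
        'service': {'cashier', 'server', 'retail'},
    }
    for role, terms in keywords.items():
        if words & terms:
            return role
    return 'ignore'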
def main():
    # text cleaning
    df = pd.read_csv()  # or something
    df = nlp.standardize_text(df, 'text')
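# nlp.standardize_text() is used throughout but lives in the external
# nlp_preprocessing module. A minimal sketch of typical behavior is given
# below (lowercasing and stripping URLs and punctuation); the real helper
# may differ in the details.
def standardize_text(df, text_field):
    '''Lowercase a text column and strip URLs and most punctuation.'''
    df[text_field] = (
        df[text_field]
        .astype(str)
        .str.lower()
        .str.replace(r'http\S+', ' ', regex=True)
        .str.replace(r'[^a-z0-9\s]', ' ', regex=True)
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
    )
    return df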
def main():
    '''Run classification on job data

    Process directly from partner feeds

    Current configuration:
    + Load csv data from xml feed
    + Count Vectorize with Bag of Words
    + Label using Naive Bayes Classifier
    + Write results back to s3
    '''
    # define paths
    key_path = '/home/ubuntu/job-classifier/.keys/'
    model_path = '/home/ubuntu/job-classifier/models/'

    # dict with partner:url pairs
    partners = load_pickle(os.path.join(key_path, 'partners.pickle'))
    s3_details = load_pickle(os.path.join(key_path, 's3_config.pickle'))

    bucket = s3_details['csv_bucket']
    file_to_write = s3_details['target']

    cv_model = load_pickle(os.path.join(model_path, 'CV_nb_bow_model.pckl'))
    # update cv model above
    #mnb_model = load_pickle(os.path.join(model_path, 'multi_nb_model.pckl'))
    cnb_model = load_pickle(
        os.path.join(model_path, 'complement_nb_model.pckl'))

    # cities & roles
    city_roles = {
        'driver': ['new york', 'los angeles'],
        'nurse': ['dallas', 'los angeles'],
        'tech': ['san francisco', 'boston']
    }

    # pull xml from url and parse into df
    for partner in partners:
        print('\nProcessing:', partner.upper())
        url = partners[partner]
        if url.endswith('.xml.gz'):
            df = xt.xml_from_url_compressed(url)
        else:
            df = xt.xml_from_url(url)

        # standardize text format
        df = nlp.standardize_text(df, 'title')

        # select data to predict from
        X_classify = df['title'].tolist()

        # get count vec
        X_classify_counts = nlp.get_cv_test_counts(X_classify, cv_model)

        # predict with model
        #y_label = mnb_model.predict(X_classify_counts)
        y_label = cnb_model.predict(X_classify_counts)

        # assign predictions to jobs & prune dataframe
        df['label'] = y_label
        label_cols = ['label', 'company', 'title', 'city', 'state', 'url']
        #labels_to_drop = ['ignore','driver','service']
        labels_to_drop = ['ignore', 'service']
        df_tmp = df[~(df['label'].isin(labels_to_drop))][label_cols]

        for role in city_roles:
            print('processing:', role.upper())
            for city in city_roles[role]:
                print('city:', city.upper())
                df_to_write = get_city_df(df, city, role)

                # write labeled roles
                city_file = city.replace(' ', '_')
                label_key = partner + '/' + role + '/' + city_file + '/' + 'jobs.csv'
                bt.write_df_to_s3(df_to_write, bucket, label_key, comp=False)
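# get_city_df() is called above but defined elsewhere in the repo. A minimal
# sketch of the assumed behavior: filter the labeled frame down to one role in
# one city, with a case-insensitive match on the 'city' column. The exact
# matching logic is an assumption.
def get_city_df(df, city, role):
    '''Return rows whose label matches the role and whose city matches city.'''
    mask = (df['label'] == role) & (df['city'].str.lower() == city.lower())
    return df[mask]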
def main():
    '''Run classification on job data

    Process directly from partner feeds

    Current configuration:
    + Load csv data from xml feed
    + Count Vectorize with Bag of Words
    + Label using Naive Bayes Classifier
    + Write results back to s3
    '''
    # define paths
    key_path = '/home/ubuntu/job-classifier/.keys/'
    model_path = '/home/ubuntu/job-classifier/models/'

    # dict with partner:url pairs
    partners = load_pickle(os.path.join(key_path, 'partners.pickle'))
    s3_details = load_pickle(os.path.join(key_path, 's3_config.pickle'))

    bucket = s3_details['csv_bucket']
    file_to_write = s3_details['target']

    cv_model = load_pickle(os.path.join(model_path, 'CV_nb_bow_model.pckl'))
    # update cv model above
    #clf_model = load_pickle(os.path.join(model_path, 'lr_bow_train_only_model.pckl'))
    #mnb_model = load_pickle(os.path.join(model_path, 'multi_nb_model.pckl'))
    cnb_model = load_pickle(os.path.join(model_path, 'complement_nb_model.pckl'))

    # pull xml from url and parse into df
    for partner in partners:
        url = partners[partner]
        if url.endswith('.xml.gz'):
            df = xt.xml_from_url_compressed(url)
        else:
            df = xt.xml_from_url(url)

        # standardize text format
        df = nlp.standardize_text(df, 'title')

        # select data to predict from
        X_classify = df['title'].tolist()

        # get count vec
        X_classify_counts = nlp.get_cv_test_counts(X_classify, cv_model)

        # predict with model
        #y_label = mnb_model.predict(X_classify_counts)
        y_label = cnb_model.predict(X_classify_counts)

        # assign predictions to jobs & prune dataframe
        df['label'] = y_label
        label_cols = ['label', 'company', 'title', 'city', 'state', 'url']
        labels_to_drop = ['ignore', 'driver', 'service']
        df_to_write = df[~(df['label'].isin(labels_to_drop))][label_cols]
        #df_to_write = df[~(df['label']=='ignore')][label_cols]

        # SAMPLE DF_TO_WRITE for smaller dataset
        df_to_write = df_to_write.sample(n=100)

        # write labeled roles
        label_key = partner + '/' + 'labeled_jobs.csv'
        bt.write_df_to_s3(df_to_write, bucket, label_key, comp=False)
def main():
    '''Run classification on job data

    Process directly from partner feeds
    ** NOTE - this is a "dumb" method without NLP/ML **

    Current configuration:
    + Load csv data from xml feed
    + Label based on job title
    + Write results to s3
    '''
    # define paths
    key_path = '/home/ubuntu/job-classifier/.keys/'

    # dict with partner:url pairs
    partners = load_pickle(os.path.join(key_path, 'partners.pickle'))
    s3_details = load_pickle(os.path.join(key_path, 's3_config.pickle'))
    city_roles = load_pickle(os.path.join(key_path, 'city_roles.pickle'))

    bucket = s3_details['csv_bucket']
    file_to_write = s3_details['target']

    # pull xml from url and parse into df
    for partner in partners:
        print('\nProcessing:', partner.upper())
        url = partners[partner]
        if url.endswith('.xml.gz'):
            df = xt.xml_from_url_compressed(url)
        else:
            df = xt.xml_from_url(url)

        # standardize text format
        df = nlp.standardize_text(df, 'title')

        # assign labels to jobs & prune dataframe
        df['label'] = df.title.apply(get_role)
        label_cols = ['label', 'company', 'title', 'city', 'state', 'url']
        df = df[~(df['label'] == 'ignore')][label_cols]

        for role in city_roles:
            print('processing:', role.upper())
            for city in city_roles[role]:
                print('city:', city.upper())
                df_to_write = get_city_df(df, city, role)

                # write labeled roles
                city_file = city.replace(' ', '_')
                label_key = partner + '/' + role + '/' + city_file + '/' + 'jobs.csv'
                if len(df_to_write) > 0:
                    #print('write: {} to s3'.format(len(df_to_write)))
                    bt.write_df_to_s3(df_to_write, bucket, label_key, comp=False)
                else:
                    print('No matches found')
def main():
    '''Run classification on job data

    Current configuration:
    + Load csv data from xml feed
    + Count Vectorize with Bag of Words
    + Predict with Logistic Regression
    + Select results on LogReg class probability
    + Write results back to s3
    '''
    # add custom modules to path
    path_to_module = '../tools/'
    sys.path.append(path_to_module)

    # load s3 connector & preprocessing functions
    import bototools as bt
    import xmltools as xt
    import nlp_preprocessing as nlp

    # load job data from s3 csv to dataframe
    path_to_data = '../.keys/'
    file_name = 'eda_data_file.json'
    bucket, key, url, target = bt.load_s3_location(path_to_data, file_name)

    # pull xml from url and parse into df
    #df = bt.load_df_from_s3(bucket, key, comp='gzip')
    df = xt.xml_from_url(url)

    # standardize text format
    cols_to_model = ['title']
    for col in cols_to_model:
        df = nlp.standardize_text(df, col)

    # select data to predict from
    X_classify = df['title'].tolist()

    # define model path
    path_to_models = '../models/'

    # load count vectorizer & transform data to predict
    cv_pickle = 'CV_lr_bow_train_only_model.pckl'
    cv_path = os.path.join(path_to_models, cv_pickle)
    cv_model = pickle.load(open(cv_path, 'rb'))
    X_classify_counts = nlp.get_cv_test_counts(X_classify, cv_model)

    # load pre-trained model & predict
    clf_pickle = 'lr_bow_train_only_model.pckl'
    clf_path = os.path.join(path_to_models, clf_pickle)
    clf_model = pickle.load(open(clf_path, 'rb'))
    y_predicted = clf_model.predict(X_classify_counts)
    #y_prob = clf_model.predict_proba(X_classify_counts)

    # assign predictions to jobs & prune dataframe
    df['gig'] = y_predicted
    #df['prob'] = y_prob[:,0]  # failed last test
    cols_to_write = ['company', 'title', 'city', 'state', 'url']
    #cols_to_write = ['company','title','city','state','posted_at','url']

    # only keep listings with over 95% probability of being a gig job
    # tighten/loosen requirement depending on model
    #df_to_write = df[(df['gig']==1) & (df['prob']==0.95)][cols_to_write]
    df_to_write = df[df['gig']==1][cols_to_write]

    # write jobs to accessible location on s3
    #file_to_write = 'gigs/streamed_full_daily_job_list.csv'
    file_to_write = target
    bt.write_df_to_s3(df_to_write, bucket, file_to_write, comp=False)
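# The nlp_preprocessing vectorization helpers used above are not shown in this
# section. Minimal sketches, assuming create_count_vectorizer() fits a
# scikit-learn CountVectorizer on the training titles and get_cv_test_counts()
# reuses an already-fitted vectorizer at prediction time; signatures are
# inferred from the call sites.
from sklearn.feature_extraction.text import CountVectorizer

def create_count_vectorizer(texts):
    '''Fit a bag-of-words CountVectorizer and return (counts, vectorizer).'''
    count_vectorizer = CountVectorizer()
    counts = count_vectorizer.fit_transform(texts)
    return counts, count_vectorizer

def get_cv_test_counts(texts, count_vectorizer):
    '''Transform new texts with an already-fitted CountVectorizer.'''
    return count_vectorizer.transform(texts)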
def main():
    '''Test & Evaluate multiple models to select best option

    Considerations:
    + Single-feature
    + Multiple features
    + Count vectorizer - BoW/Tfidf
    + k-fold x-validation
    + Choice of classifier (LR, RF, etc)

    Metrics:
    + Accuracy
    + Variance
    + F1
    + Confusion matrix
    + Create graphics
    + Feature importance
    '''
    path_to_module = '../tools/'
    sys.path.append(path_to_module)

    # load s3 read & write functions
    import bototools as bt
    import nlp_preprocessing as nlp

    print('classifying new jobs...\n')

    # if pulling from s3
    #path_to_data = '../.keys/'
    #file_name = 'csv_to_classify.json'
    #bucket, key = bt.load_s3_location(path_to_data, file_name)
    #df = bt.load_df_from_s3(bucket, key, compression='gzip')

    # local data
    path_to_data = '../data/multiclass/'
    data_file = 'labeled_large.csv'
    df = pd.read_csv(os.path.join(path_to_data, data_file))
    print('loaded CSV')

    # map roles to general labels
    key_path = '../.keys'
    role_mapper = load_pickle(os.path.join(key_path, 'roles.pickle'))
    role_names = [n for n in role_mapper]

    df = df[df['role'].isin(role_names)]
    df['label'] = df['role'].map(role_mapper)
    df = df.dropna()

    cols_to_train = ['title', 'label']
    df = df[cols_to_train]

    # standardize text format
    df = nlp.standardize_text(df, 'title')

    ## run a 10-fold stratified cross validation
    skf = StratifiedKFold(n_splits=10, random_state=0)
    df = df.reset_index(drop=True)
    X = df['title'].values.astype('U')
    y = df['label']

    current_split = 1

    acc_lr = []
    prec_lr = []
    rec_lr = []
    f1_lr = []

    acc_mnb = []
    prec_mnb = []
    rec_mnb = []
    f1_mnb = []

    acc_cnb = []
    prec_cnb = []
    rec_cnb = []
    f1_cnb = []

    for train_index, test_index in skf.split(X, y):
        print('CURRENT SPLIT:', current_split)

        # get splits & assign data
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        #print('successful split')

        # vectorize word counts
        X_train_counts, count_vectorizer = nlp.create_count_vectorizer(X_train)
        X_test_counts = count_vectorizer.transform(X_test)

        # Logistic regression
        lr = LogisticRegression(C=30.0, class_weight='balanced',
                                solver='newton-cg', multi_class='multinomial',
                                n_jobs=-1, random_state=40)
        lr.fit(X_train_counts, y_train)
        lr_predictions = lr.predict(X_test_counts)
        accuracy, precision, recall, f1 = get_metrics(y_test, lr_predictions)
        print("LR: accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f"
              % (accuracy, precision, recall, f1))
        acc_lr.append(accuracy)
        prec_lr.append(precision)
        rec_lr.append(recall)
        f1_lr.append(f1)

        # multinomial NB
        mnb = MultinomialNB().fit(X_train_counts, y_train)
        mnb_predictions = mnb.predict(X_test_counts)
        accuracy, precision, recall, f1 = get_metrics(y_test, mnb_predictions)
        print("MNB: accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f"
              % (accuracy, precision, recall, f1))
        acc_mnb.append(accuracy)
        prec_mnb.append(precision)
        rec_mnb.append(recall)
        f1_mnb.append(f1)

        # complement NB
        cnb = ComplementNB().fit(X_train_counts, y_train)
        cnb_predictions = cnb.predict(X_test_counts)
        accuracy, precision, recall, f1 = get_metrics(y_test, cnb_predictions)
        print("CNB: accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f"
              % (accuracy, precision, recall, f1))
        acc_cnb.append(accuracy)
        prec_cnb.append(precision)
        rec_cnb.append(recall)
        f1_cnb.append(f1)

        current_split += 1

    # Summarize
    print('\nFinal Performance')

    print('\nLogistic Regression Performance')
    print('Accuracy: mean %.3f, variance %.3f' % (np.mean(acc_lr), np.var(acc_lr)))
    print('Precision: mean %.3f, variance %.3f' % (np.mean(prec_lr), np.var(prec_lr)))
    print('Recall: mean %.3f, variance %.3f' % (np.mean(rec_lr), np.var(rec_lr)))
    print('F1 Score: mean %.3f, variance %.3f' % (np.mean(f1_lr), np.var(f1_lr)))

    print('\nMultinomial Naive Bayes Performance')
    print('Accuracy: mean %.3f, variance %.3f' % (np.mean(acc_mnb), np.var(acc_mnb)))
    print('Precision: mean %.3f, variance %.3f' % (np.mean(prec_mnb), np.var(prec_mnb)))
    print('Recall: mean %.3f, variance %.3f' % (np.mean(rec_mnb), np.var(rec_mnb)))
    print('F1 Score: mean %.3f, variance %.3f' % (np.mean(f1_mnb), np.var(f1_mnb)))

    print('\nComplement Naive Bayes Performance')
    print('Accuracy: mean %.3f, variance %.3f' % (np.mean(acc_cnb), np.var(acc_cnb)))
    print('Precision: mean %.3f, variance %.3f' % (np.mean(prec_cnb), np.var(prec_cnb)))
    print('Recall: mean %.3f, variance %.3f' % (np.mean(rec_cnb), np.var(rec_cnb)))
    print('F1 Score: mean %.3f, variance %.3f' % (np.mean(f1_cnb), np.var(f1_cnb)))
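# get_metrics() is called in the evaluation scripts but defined elsewhere in
# the repo. A minimal sketch assuming weighted-average scores from
# scikit-learn; the real helper may use a different averaging strategy.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def get_metrics(y_true, y_pred):
    '''Return (accuracy, precision, recall, f1) with weighted averaging.'''
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    return accuracy, precision, recall, f1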
def main():
    '''Run classification on job data

    Current configuration:
    + Load csv data from s3
    + Count Vectorize with Bag of Words
    + Predict with Logistic Regression
    + Select results on LogReg class probability
    + Write results back to s3
    '''
    # add custom modules to path
    path_to_module = '/home/ubuntu/job-classifier/tools/'
    sys.path.append(path_to_module)

    # load s3 connector & preprocessing functions
    import bototools as bt
    import xmltools as xt
    import nlp_preprocessing as nlp

    # load job data from s3 csv to dataframe
    path_to_data = '/home/ubuntu/job-classifier/.keys/'
    file_name = 'cron_data_file.json'
    bucket, key, url, target = bt.load_s3_location(path_to_data, file_name)

    # pull xml from url and parse into df
    df = xt.xml_from_url(url)

    # standardize text format
    cols_to_model = ['title']
    for col in cols_to_model:
        df = nlp.standardize_text(df, col)

    # select data to predict from
    X_classify = df['title'].tolist()

    # define model path
    path_to_models = '/home/ubuntu/job-classifier/models/'

    # load count vectorizer & transform data to predict
    cv_pickle = 'CV_lr_bow_train_only_model.pckl'
    cv_path = os.path.join(path_to_models, cv_pickle)
    cv_model = pickle.load(open(cv_path, 'rb'))
    X_classify_counts = nlp.get_cv_test_counts(X_classify, cv_model)

    # load pre-trained model & predict
    clf_pickle = 'lr_bow_train_only_model.pckl'
    clf_path = os.path.join(path_to_models, clf_pickle)
    clf_model = pickle.load(open(clf_path, 'rb'))
    y_predicted = clf_model.predict(X_classify_counts)
    y_prob = clf_model.predict_proba(X_classify_counts)

    # assign predictions to jobs & prune dataframe
    df['gig'] = y_predicted
    cols_to_write = ['company', 'title', 'city', 'state', 'url']
    df_to_write = df[df['gig']==1][cols_to_write]

    # write jobs to accessible location on s3
    # custom name by date -- test overlap between days
    timestr = time.strftime("%Y-%m-%d")
    prefix, fn = target.split('/')
    file_to_write = prefix + '/' + timestr + '-' + fn
    bt.write_df_to_s3(df_to_write, bucket, file_to_write, comp=False)

    # add labeled samples to validate for future training
    df_positive = df[df['gig']==1].sample(1000)
    file_positive = 'positive' + '/' + timestr + '-' + fn
    bt.write_df_to_s3(df_positive, bucket, file_positive, comp=False)

    df_negative = df[df['gig']==0].sample(1000)
    file_negative = 'negative' + '/' + timestr + '-' + fn
    bt.write_df_to_s3(df_negative, bucket, file_negative, comp=False)
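# bt.write_df_to_s3() comes from the external bototools module. A minimal
# boto3-based sketch of the assumed behavior: serialize the frame to CSV and
# upload it under the given key, with the comp flag assumed to toggle gzip.
import gzip
import boto3

def write_df_to_s3(df, bucket, key, comp=False):
    '''Upload a DataFrame to s3://bucket/key as CSV, optionally gzipped.'''
    body = df.to_csv(index=False).encode('utf-8')
    if comp:
        body = gzip.compress(body)
    boto3.client('s3').put_object(Bucket=bucket, Key=key, Body=body)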
def main():
    '''Test & Evaluate multiple models to select best option

    Considerations:
    + Single-feature
    + Multiple features
    + Count vectorizer - BoW/Tfidf
    + k-fold x-validation
    + Choice of classifier (LR, RF, etc)

    Metrics:
    + Accuracy
    + Variance
    + F1
    + Confusion matrix
    + Create graphics
    + Feature importance
    '''
    path_to_module = '../tools/'
    sys.path.append(path_to_module)

    # load s3 read & write functions
    import bototools as bt
    import nlp_preprocessing as nlp

    print('classifying new jobs...\n')

    # if pulling from s3
    #path_to_data = '../.keys/'
    #file_name = 'csv_to_classify.json'
    #bucket, key = bt.load_s3_location(path_to_data, file_name)
    #df = bt.load_df_from_s3(bucket, key, compression='gzip')

    # local data
    path_to_data = '../data/multiclass/'
    data_file = 'labeled_large.csv'
    df = pd.read_csv(os.path.join(path_to_data, data_file))
    print('loaded CSV')

    # map roles to general labels
    key_path = '../.keys'
    role_mapper = load_pickle(os.path.join(key_path, 'roles.pickle'))
    role_names = [n for n in role_mapper]

    df = df[df['role'].isin(role_names)]
    df['label'] = df['role'].map(role_mapper)
    df = df.dropna()

    cols_to_train = ['title', 'label']
    df = df[cols_to_train]

    # standardize text format
    df = nlp.standardize_text(df, 'title')

    # select data to predict from
    X = df['title'].tolist()
    y = df['label'].tolist()

    # dividing X, y into train and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    print('vectorizing bag of words model')
    X_train_counts, count_vectorizer = nlp.create_count_vectorizer(X_train)  # count vec
    X_test_counts = count_vectorizer.transform(X_test)

    # train the candidate classifiers
    print('testing model')

    # Logistic regression
    lr = LogisticRegression(C=30.0, class_weight='balanced',
                            solver='newton-cg', multi_class='multinomial',
                            n_jobs=-1, random_state=40)
    lr.fit(X_train_counts, y_train)
    lr_predictions = lr.predict(X_test_counts)

    # multinomial NB
    mnb = MultinomialNB().fit(X_train_counts, y_train)
    mnb_predictions = mnb.predict(X_test_counts)

    # complement NB
    cnb = ComplementNB().fit(X_train_counts, y_train)
    cnb_predictions = cnb.predict(X_test_counts)

    # evaluate performance
    from sklearn.metrics import confusion_matrix
    roles = ['tech', 'nurse', 'service', 'driver', 'ignore']
    cm_lr = confusion_matrix(y_test, lr_predictions, labels=roles)
    cm_mnb = confusion_matrix(y_test, mnb_predictions, labels=roles)
    cm_cnb = confusion_matrix(y_test, cnb_predictions, labels=roles)

    print('\n--- Logistic Regression Confusion Matrix ---')
    print(roles)
    print(cm_lr)
    get_metrics(y_test, lr_predictions)

    print('\n--- Multinomial Bayes Confusion Matrix ---')
    print(roles)
    print(cm_mnb)
    get_metrics(y_test, mnb_predictions)

    print('\n--- Complement Bayes Confusion Matrix ---')
    print(roles)
    print(cm_cnb)
    get_metrics(y_test, cnb_predictions)
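# The docstring above lists "Create graphics" among the metrics work; a
# minimal matplotlib sketch for rendering one of the confusion matrices is
# included here as an illustration. The function name and styling are
# assumptions, not part of the original script.
import matplotlib.pyplot as plt
import numpy as np

def plot_confusion_matrix(cm, labels, title='Confusion matrix'):
    '''Render a confusion matrix as a labeled heatmap and return the figure.'''
    fig, ax = plt.subplots()
    im = ax.imshow(cm, cmap='Blues')
    ax.set_xticks(np.arange(len(labels)))
    ax.set_xticklabels(labels)
    ax.set_yticks(np.arange(len(labels)))
    ax.set_yticklabels(labels)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    ax.set_title(title)
    fig.colorbar(im)
    return fig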
def main():
    '''Run classification'''
    path_to_module = '../tools/'
    sys.path.append(path_to_module)

    # load s3 read & write functions
    import bototools as bt  # add these to actual path

    print('classifying new jobs...\n')

    path_to_data = '../.keys/'
    #file_name = 'csv_to_classify.json'
    file_name = 'eda_data_file.json'
    #bucket, key = bt.load_s3_location(path_to_data, file_name)

    # use a sample file for testing -- avoid large files for now
    bucket, _ = bt.load_s3_location(path_to_data, file_name)
    key = 'eda_sample_data_file.csv'

    df = bt.load_df_from_s3(bucket, key)  # sample not gzipped
    #df = bt.load_df_from_s3(bucket, key, comp='gzip')

    # import preprocessing tools
    import nlp_preprocessing as nlp

    # cleanup the dataframe to prepare for classification
    # while only SOME columns are used, ALL need to be returned for ops team
    cols_to_model = ['title']
    for col in cols_to_model:
        df = nlp.standardize_text(df, col)

    X_classify = df['title'].tolist()

    # load count vectorizer
    path_to_models = '../models/'
    cv_pickle = 'CV_lr_bow_train_only_model.pckl'  # use private file too
    cv_path = os.path.join(path_to_models, cv_pickle)
    cv_model = pickle.load(open(cv_path, 'rb'))
    X_classify_counts = nlp.get_cv_test_counts(X_classify, cv_model)

    from sklearn.linear_model import LogisticRegression  # move outside main

    # load pre-trained model
    #path_to_model = '../models/'  # functionalize loading & training
    clf_pickle = 'lr_bow_train_only_model.pckl'  # use private file too
    clf_path = os.path.join(path_to_models, clf_pickle)
    clf_model = pickle.load(open(clf_path, 'rb'))
    y_predicted = clf_model.predict(X_classify_counts)

    df['gig'] = y_predicted
    cols_to_write = ['company', 'title', 'city', 'state', 'posted_at', 'url']
    df_to_write = df[df['gig']==1][cols_to_write]
    #df_to_write = df[df['gig']==1]
    #df[df['gig']==1].to_csv('../data/classified_gig_jobs.csv', index=False)
    #print('Gig jobs found: {}'.format(df[df['gig']==1].shape[0]))

    # write output (use a prefix!)
    file_to_write = 'gigs/sample_daily_job_list.csv'
    bt.write_df_to_s3(df_to_write, bucket, file_to_write)
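# bt.load_df_from_s3() is the read-side counterpart from bototools. A minimal
# boto3/pandas sketch of the assumed behavior; the comp argument is assumed to
# map onto pandas' compression option ('gzip' or None).
import io
import boto3
import pandas as pd

def load_df_from_s3(bucket, key, comp=None):
    '''Read s3://bucket/key into a DataFrame, optionally gzip-compressed.'''
    obj = boto3.client('s3').get_object(Bucket=bucket, Key=key)
    return pd.read_csv(io.BytesIO(obj['Body'].read()), compression=comp)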