def main():
    '''Create and save model
    '''

    path_to_module = '../tools/'
    sys.path.append(path_to_module)
    # load s3 read & write functions
    import bototools as bt
    import nlp_preprocessing as nlp

    # if pulling from s3
    #path_to_data = '../.keys/'
    #file_name = 'csv_to_classify.json'
    #bucket, key = bt.load_s3_location(path_to_data, file_name)
    #df = bt.load_df_from_s3(bucket, key, compression='gzip')

    # local data
    path_to_data = '../data/multiclass/'
    data_file = 'labeled_large.csv'
    df = pd.read_csv(os.path.join(path_to_data, data_file))
    print('loaded CSV')
    # remove samples not of interest

    # map roles to general labels
    key_path = '../.keys'
    role_mapper = load_pickle(os.path.join(key_path, 'roles.pickle'))

    role_names = [n for n in role_mapper]
    df = df[df['role'].isin(role_names)]

    df['label'] = df['role'].map(role_mapper)
    df = df.dropna()

    cols_to_train = ['title', 'label']
    df = df[cols_to_train]

    # standardize text format
    df = nlp.standardize_text(df, 'title')

    # select data to train on
    X = df['title'].tolist()
    y = df['label'].tolist()
    #y = df['gig'].tolist()

    # vectorize the full dataset (no train/test split here; this script fits the final model)
    print('vectorizing bag of words model')
    X_counts, count_vectorizer = nlp.create_count_vectorizer(X) # count vec
    cv_to_write = '../models/CV_nb_bow_model.pckl'
    with open(cv_to_write, 'wb') as f:
        pickle.dump(count_vectorizer, f)
    print('successfully serialized CV model')

    # training a Naive Bayes classifier
    print('training model')
    #model = MultinomialNB().fit(X_counts, y)
    model = ComplementNB().fit(X_counts, y)

    # save model for later use (locally & on s3)
    file_to_write = '../models/complement_nb_model.pckl'
    with open(file_to_write, 'wb') as f:
        pickle.dump(model, f)
    print('successfully serialized NB model')
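# The example above (and several below) calls a `load_pickle` helper that is not
# shown in this listing; a minimal sketch of what it presumably does:
import pickle

def load_pickle(path):
    '''Load and return a pickled object from disk.'''
    with open(path, 'rb') as f:
        return pickle.load(f)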
Example #2
def main():
    '''Run labeling on job data

    Process directly from partner feeds

    ** NOTE - this is a "dumb" method without NLP/ML **

    Current configuration:

    + Load job data from partner xml feed
    + Label based on job title
    + Write results to s3
    '''

    # define paths
    key_path = '/home/ubuntu/job-classifier/.keys/'
    #model_path = '/home/ubuntu/job-classifier/models/'

    # dict with partner:url pairs
    partners = load_pickle(os.path.join(key_path, 'partners.pickle'))
    s3_details = load_pickle(os.path.join(key_path, 's3_config.pickle'))

    bucket = s3_details['csv_bucket']
    file_to_write = s3_details['target']

    # pull xml from url and parse into df
    for partner in partners:
        url = partners[partner]
        if url.endswith('.xml.gz'):
            df = xt.xml_from_url_compressed(url)
        else:
            df = xt.xml_from_url(url)

        # standardize text format
        df = nlp.standardize_text(df, 'title')

        # apply label function
        df['label'] = df.title.apply(get_role)

        label_cols = ['label', 'company','title','city','state','url']
        df_to_keep = df[~(df['label']=='ignore')][label_cols]

        role_dfs = {}
        for label in df_to_keep.label.unique():
            tmp_df = df_to_keep[df_to_keep['label']==label]

            if len(tmp_df) > 100:
                tmp_df = tmp_df.sample(n=100)

            role_dfs[label] = tmp_df

        df_to_write = pd.concat([role_dfs[x] for x in role_dfs])

        # write labeled roles
        label_key = partner + '/' + 'labeled_jobs.csv'
        print('Writing {} jobs'.format(len(df_to_write)))
        bt.write_df_to_s3(df_to_write, bucket, label_key, comp=False)
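# `get_role` is not defined in this example; a hypothetical keyword-based labeler,
# consistent with the "dumb" (no NLP/ML) approach described in the docstring, could be:
def get_role(title):
    '''Map a job title to a coarse role label via keyword matching (keywords are illustrative).'''
    keywords = {
        'driver': ['driver', 'delivery', 'courier'],
        'nurse': ['nurse', ' rn ', 'lpn'],
        'tech': ['engineer', 'developer', 'software'],
    }
    title = ' ' + str(title).lower() + ' '
    for role, words in keywords.items():
        if any(w in title for w in words):
            return role
    return 'ignore'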
Example #3
def main():

    # text cleaning
    df = pd.read_csv()  # or something
    df = nlp.standardize_text(df, 'text')
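# `nlp.standardize_text` comes from the project's nlp_preprocessing module and is not
# shown here; a plausible minimal version (an assumption, not the actual implementation):
import re

def standardize_text(df, col):
    '''Lowercase a text column and strip URLs, punctuation, and extra whitespace.'''
    df[col] = (df[col].astype(str)
                      .str.lower()
                      .apply(lambda s: re.sub(r'http\S+', '', s))
                      .apply(lambda s: re.sub(r'[^a-z0-9\s]', ' ', s))
                      .apply(lambda s: re.sub(r'\s+', ' ', s).strip()))
    return df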
Example #4
def main():
    '''Run classification on job data

    Process directly from partner feeds

    Current configuration:

    + Load job data from partner xml feed
    + Count Vectorize with Bag of Words
    + Label using Naive Bayes Classifier
    + Write results back to s3
    '''

    # define paths
    key_path = '/home/ubuntu/job-classifier/.keys/'
    model_path = '/home/ubuntu/job-classifier/models/'

    # dict with partner:url pairs
    partners = load_pickle(os.path.join(key_path, 'partners.pickle'))
    s3_details = load_pickle(os.path.join(key_path, 's3_config.pickle'))

    bucket = s3_details['csv_bucket']
    file_to_write = s3_details['target']

    cv_model = load_pickle(os.path.join(model_path, 'CV_nb_bow_model.pckl'))
    # update cv model above
    #mnb_model = load_pickle(os.path.join(model_path, 'multi_nb_model.pckl'))
    cnb_model = load_pickle(
        os.path.join(model_path, 'complement_nb_model.pckl'))

    # cities & roles
    city_roles = {
        'driver': ['new york', 'los angeles'],
        'nurse': ['dallas', 'los angeles'],
        'tech': ['san francisco', 'boston']
    }

    # pull xml from url and parse into df
    for partner in partners:
        print('\nProcessing:', partner.upper())
        url = partners[partner]
        if url.endswith('.xml.gz'):
            df = xt.xml_from_url_compressed(url)
        else:
            df = xt.xml_from_url(url)

        # standardize text format
        df = nlp.standardize_text(df, 'title')

        # select data to predict from
        X_classify = df['title'].tolist()

        # get count vec
        X_classify_counts = nlp.get_cv_test_counts(X_classify, cv_model)

        # predict with model
        #y_label = mnb_model.predict(X_classify_counts)
        y_label = cnb_model.predict(X_classify_counts)

        # assign predictions to jobs & prune dataframe
        df['label'] = y_label

        label_cols = ['label', 'company', 'title', 'city', 'state', 'url']
        #labels_to_drop = ['ignore','driver','service']
        labels_to_drop = ['ignore', 'service']

        df_tmp = df[~(df['label'].isin(labels_to_drop))][label_cols]

        for role in city_roles:
            print('processing:', role.upper())

            for city in city_roles[role]:
                print('city:', city.upper())

                df_to_write = get_city_df(df_tmp, city, role)

                # write labeled roles
                city_file = city.replace(' ', '_')
                label_key = partner + '/' + role + '/' + city_file + '/' + 'jobs.csv'
                bt.write_df_to_s3(df_to_write, bucket, label_key, comp=False)
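# `get_city_df` is defined elsewhere in the project; a hypothetical version that matches
# how it is called here (filter a labeled dataframe down to one role in one city):
def get_city_df(df, city, role):
    '''Return rows whose label equals `role` and whose city matches `city` (case-insensitive).'''
    mask = (df['label'] == role) & (df['city'].astype(str).str.lower() == city)
    return df[mask]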
Example #5
def main():
    '''Run classification on job data

    Process directly from partner feeds

    Current configuration:

    + Load job data from partner xml feed
    + Count Vectorize with Bag of Words
    + Label using Naive Bayes Classifier
    + Write results back to s3
    '''

    # define paths
    key_path = '/home/ubuntu/job-classifier/.keys/'
    model_path = '/home/ubuntu/job-classifier/models/'

    # dict with partner:url pairs
    partners = load_pickle(os.path.join(key_path, 'partners.pickle'))
    s3_details = load_pickle(os.path.join(key_path, 's3_config.pickle'))

    bucket = s3_details['csv_bucket']
    file_to_write = s3_details['target']

    cv_model = load_pickle(os.path.join(model_path,'CV_nb_bow_model.pckl'))
    # update cv model above
    #clf_model = load_pickle(os.path.join(model_path, 'lr_bow_train_only_model.pckl'))
    #mnb_model = load_pickle(os.path.join(model_path, 'multi_nb_model.pckl'))
    cnb_model = load_pickle(os.path.join(model_path, 'complement_nb_model.pckl'))

    # pull xml from url and parse into df
    for partner in partners:
        url = partners[partner]
        if url.endswith('.xml.gz'):
            df = xt.xml_from_url_compressed(url)
        else:
            df = xt.xml_from_url(url)

        # standardize text format
        df = nlp.standardize_text(df, 'title')

        # select data to predict from
        X_classify = df['title'].tolist()

        # get count vec
        X_classify_counts = nlp.get_cv_test_counts(X_classify, cv_model)

        # predict with model
        #y_label = mnb_model.predict(X_classify_counts)
        y_label = cnb_model.predict(X_classify_counts)

        # assign predictions to jobs & prune dataframe
        df['label'] = y_label

        label_cols = ['label', 'company','title','city','state','url']
        labels_to_drop = ['ignore','driver','service']
        df_to_write = df[~(df['label'].isin(labels_to_drop))][label_cols]
        #df_to_write = df[~(df['label']=='ignore')][label_cols]

        # SAMPLE DF_TO_WRITE for smaller dataset
        df_to_write = df_to_write.sample(n=min(100, len(df_to_write)))

        # write labeled roles
        label_key = partner + '/' + 'labeled_jobs.csv'
        bt.write_df_to_s3(df_to_write, bucket, label_key, comp=False)
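# `nlp.create_count_vectorizer` and `nlp.get_cv_test_counts` appear to be thin wrappers
# around scikit-learn's CountVectorizer; minimal sketches consistent with how they are called:
from sklearn.feature_extraction.text import CountVectorizer

def create_count_vectorizer(texts):
    '''Fit a bag-of-words CountVectorizer and return (document-term counts, fitted vectorizer).'''
    count_vectorizer = CountVectorizer()
    counts = count_vectorizer.fit_transform(texts)
    return counts, count_vectorizer

def get_cv_test_counts(texts, count_vectorizer):
    '''Transform new texts with an already-fitted CountVectorizer.'''
    return count_vectorizer.transform(texts)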
Example #6
def main():
    '''Run classification on job data

    Process directly from partner feeds

    ** NOTE - this is a "dumb" method without NLP/ML **

    Current configuration:

    + Load job data from partner xml feed
    + Label based on job title
    + Write results to s3
    '''

    # define paths
    key_path = '/home/ubuntu/job-classifier/.keys/'

    # dict with partner:url pairs
    partners = load_pickle(os.path.join(key_path, 'partners.pickle'))
    s3_details = load_pickle(os.path.join(key_path, 's3_config.pickle'))
    city_roles = load_pickle(os.path.join(key_path, 'city_roles.pickle'))

    bucket = s3_details['csv_bucket']
    file_to_write = s3_details['target']

    # pull xml from url and parse into df
    for partner in partners:
        print('\nProcessing:', partner.upper())
        url = partners[partner]
        if url.endswith('.xml.gz'):
            df = xt.xml_from_url_compressed(url)
        else:
            df = xt.xml_from_url(url)

        # standardize text format
        df = nlp.standardize_text(df, 'title')

        # assign labels to jobs & prune dataframe
        df['label'] = df.title.apply(get_role)

        label_cols = ['label', 'company', 'title', 'city', 'state', 'url']

        df = df[~(df['label'] == 'ignore')][label_cols]

        for role in city_roles:
            print('processing:', role.upper())

            for city in city_roles[role]:
                print('city:', city.upper())

                df_to_write = get_city_df(df, city, role)

                # write labeled roles
                city_file = city.replace(' ', '_')
                label_key = partner + '/' + role + '/' + city_file + '/' + 'jobs.csv'
                if len(df_to_write) > 0:
                    #print('write: {} to s3'.format(len(df_to_write)))
                    bt.write_df_to_s3(df_to_write,
                                      bucket,
                                      label_key,
                                      comp=False)
                else:
                    print('No matches found')
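# `bt.write_df_to_s3` lives in the project's bototools module; a minimal sketch using
# boto3 (an assumption about its implementation, not the actual code):
import gzip
import boto3

def write_df_to_s3(df, bucket, key, comp=False):
    '''Serialize a dataframe to CSV (optionally gzipped) and upload it to S3.'''
    body = df.to_csv(index=False).encode('utf-8')
    if comp:
        body = gzip.compress(body)
    boto3.client('s3').put_object(Bucket=bucket, Key=key, Body=body)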
Example #7
def main():
    '''Run classification on job data

    Current configuration:

    + Load job data from partner xml feed
    + Count Vectorize with Bag of Words
    + Predict with Logistic Regression
    + Select results on LogReg class probability
    + Write results back to s3
    '''

    # add custom modules to path
    path_to_module = '../tools/'
    sys.path.append(path_to_module)

    # load s3 connector & preprocessing functions
    import bototools as bt
    import xmltools as xt
    import nlp_preprocessing as nlp

    # load job data from s3 csv to dataframe
    path_to_data = '../.keys/'
    file_name = 'eda_data_file.json'
    bucket, key, url, target = bt.load_s3_location(path_to_data, file_name)

    # pull xml from url and parse into df
    #df = bt.load_df_from_s3(bucket, key, comp='gzip')
    df = xt.xml_from_url(url)

    # standardize text format
    cols_to_model = ['title']
    for col in cols_to_model:
        df = nlp.standardize_text(df, col)

    # select data to predict from
    X_classify = df['title'].tolist()

    # define model path
    path_to_models = '../models/'

    # load count vectorizer & transform data to predict
    cv_pickle = 'CV_lr_bow_train_only_model.pckl'
    cv_path = os.path.join(path_to_models, cv_pickle)
    cv_model = pickle.load(open(cv_path, 'rb'))
    X_classify_counts = nlp.get_cv_test_counts(X_classify, cv_model)

    # load pre-trained model and predict
    clf_pickle = 'lr_bow_train_only_model.pckl'
    clf_path = os.path.join(path_to_models, clf_pickle)
    clf_model = pickle.load(open(clf_path, 'rb'))

    y_predicted = clf_model.predict(X_classify_counts)
    #y_prob = clf_model.predict_proba(X_classify_counts)

    # assign predictions to jobs & prune dataframe
    df['gig'] = y_predicted
    #df['prob'] = y_prob[:,0] # failed last test
    cols_to_write = ['company','title','city','state','url']
    #cols_to_write = ['company','title','city','state','posted_at','url']

    # only keep listings with over 95% probability of being a gig job
    # tighten/loosen requirement depending on model
    #df_to_write = df[(df['gig']==1) & (df['prob']>=0.95)][cols_to_write]
    df_to_write = df[df['gig']==1][cols_to_write]

    # write jobs to accessible location on s3
    #file_to_write = 'gigs/streamed_full_daily_job_list.csv'
    file_to_write = target
    bt.write_df_to_s3(df_to_write, bucket, file_to_write, comp=False)
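# `bt.load_s3_location` reads feed/bucket details from a local JSON config; a hypothetical
# version matching the 4-tuple unpacked above (the JSON field names are assumptions; note
# that elsewhere in this listing the same helper is unpacked into only two values):
import json
import os

def load_s3_location(path_to_data, file_name):
    '''Read bucket, key, feed url, and target key from a JSON config file.'''
    with open(os.path.join(path_to_data, file_name)) as f:
        cfg = json.load(f)
    return cfg['bucket'], cfg['key'], cfg['url'], cfg['target']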
Example #8
def main():
    '''Test & Evaluate multiple models to select best option

    Considerations:
    + Single-feature
    + Multiple features
    + Count vectorizer - BoW/Tfidf
    + k-fold x-validation
    + Choice of classifier (LR, RF, etc)

    Metrics:
    + Accuracy
    + Variance
    + F1
    + Confusion matrix
    + Create graphics
    + Feature importance
    '''

    path_to_module = '../tools/'
    sys.path.append(path_to_module)
    # load s3 read & write functions
    import bototools as bt
    import nlp_preprocessing as nlp

    print('evaluating candidate models...\n')

    # if pulling from s3
    #path_to_data = '../.keys/'
    #file_name = 'csv_to_classify.json'
    #bucket, key = bt.load_s3_location(path_to_data, file_name)
    #df = bt.load_df_from_s3(bucket, key, compression='gzip')

    # local data
    path_to_data = '../data/multiclass/'
    data_file = 'labeled_large.csv'
    df = pd.read_csv(os.path.join(path_to_data, data_file))
    print('loaded CSV')

    # map roles to general labels
    key_path = '../.keys'
    role_mapper = load_pickle(os.path.join(key_path, 'roles.pickle'))

    role_names = [n for n in role_mapper]
    df = df[df['role'].isin(role_names)]

    df['label'] = df['role'].map(role_mapper)
    df = df.dropna()

    cols_to_train = ['title', 'label']
    df = df[cols_to_train]

    # standardize text format
    df = nlp.standardize_text(df, 'title')

    ## run a 10-fold stratified cross validation

    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)  # shuffle so random_state takes effect

    df = df.reset_index(drop=True)
    X = df['title'].values.astype('U')
    y = df['label']

    current_split = 1
    acc_lr = []
    prec_lr = []
    rec_lr = []
    f1_lr = []
    acc_mnb = []
    prec_mnb = []
    rec_mnb = []
    f1_mnb = []
    acc_cnb = []
    prec_cnb = []
    rec_cnb = []
    f1_cnb = []

    for train_index, test_index in skf.split(X, y):
        print('CURRENT SPLIT:', current_split)

        # get splits & assign data
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        #print('successful split')

        # vectorize word counts
        X_train_counts, count_vectorizer = nlp.create_count_vectorizer(X_train)
        X_test_counts = count_vectorizer.transform(X_test)

        # Logistic regression
        lr = LogisticRegression(C=30.0,
                                class_weight='balanced',
                                solver='newton-cg',
                                multi_class='multinomial',
                                n_jobs=-1,
                                random_state=40)
        lr.fit(X_train_counts, y_train)
        lr_predictions = lr.predict(X_test_counts)
        accuracy, precision, recall, f1 = get_metrics(y_test, lr_predictions)
        print(
            "LR: accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" %
            (accuracy, precision, recall, f1))

        acc_lr.append(accuracy)
        prec_lr.append(precision)
        rec_lr.append(recall)
        f1_lr.append(f1)

        # multinomial NB
        mnb = MultinomialNB().fit(X_train_counts, y_train)
        mnb_predictions = mnb.predict(X_test_counts)
        accuracy, precision, recall, f1 = get_metrics(y_test, mnb_predictions)
        print(
            "MNB: accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f"
            % (accuracy, precision, recall, f1))

        acc_mnb.append(accuracy)
        prec_mnb.append(precision)
        rec_mnb.append(recall)
        f1_mnb.append(f1)

        # complement NB
        cnb = ComplementNB().fit(X_train_counts, y_train)
        cnb_predictions = cnb.predict(X_test_counts)
        accuracy, precision, recall, f1 = get_metrics(y_test, cnb_predictions)
        print(
            "CNB: accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f"
            % (accuracy, precision, recall, f1))

        acc_cnb.append(accuracy)
        prec_cnb.append(precision)
        rec_cnb.append(recall)
        f1_cnb.append(f1)

        current_split += 1

    # Summarize
    print('\nFinal Performance')

    print('\nLogistic Regression Performance')
    print('Accuracy: mean %.3f, variance %.3f' %
          (np.mean(acc_lr), np.var(acc_lr)))
    print('Precision: mean %.3f, variance %.3f' %
          (np.mean(prec_lr), np.var(prec_lr)))
    print('Recall: mean %.3f, variance %.3f' %
          (np.mean(rec_lr), np.var(rec_lr)))
    print('F1 Score: mean %.3f, variance %.3f' %
          (np.mean(f1_lr), np.var(f1_lr)))

    print('\nMultinomial Naive Bayes Performance')
    print('Accuracy: mean %.3f, variance %.3f' %
          (np.mean(acc_mnb), np.var(acc_mnb)))
    print('Precision: mean %.3f, variance %.3f' %
          (np.mean(prec_mnb), np.var(prec_mnb)))
    print('Recall: mean %.3f, variance %.3f' %
          (np.mean(rec_mnb), np.var(rec_mnb)))
    print('F1 Score: mean %.3f, variance %.3f' %
          (np.mean(f1_mnb), np.var(f1_mnb)))

    print('\nComplement Naive Bayes Performance')
    print('Accuracy: mean %.3f, variance %.3f' %
          (np.mean(acc_cnb), np.var(acc_cnb)))
    print('Precision: mean %.3f, variance %.3f' %
          (np.mean(prec_cnb), np.var(prec_cnb)))
    print('Recall: mean %.3f, variance %.3f' %
          (np.mean(rec_cnb), np.var(rec_cnb)))
    print('F1 Score: mean %.3f, variance %.3f' %
          (np.mean(f1_cnb), np.var(f1_cnb)))
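# `get_metrics` is defined elsewhere; a sketch consistent with its use above, built on
# scikit-learn's weighted-average scores (the averaging choice is an assumption):
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def get_metrics(y_test, y_predicted):
    '''Return accuracy, precision, recall, and F1 score (weighted across classes).'''
    accuracy = accuracy_score(y_test, y_predicted)
    precision = precision_score(y_test, y_predicted, average='weighted')
    recall = recall_score(y_test, y_predicted, average='weighted')
    f1 = f1_score(y_test, y_predicted, average='weighted')
    return accuracy, precision, recall, f1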
Example #9
def main():
    '''Run classification on job data

    Current configuration:

    + Load job data from partner xml feed
    + Count Vectorize with Bag of Words
    + Predict with Logistic Regression
    + Select results on LogReg class probability
    + Write results back to s3
    '''

    # add custom modules to path
    path_to_module = '/home/ubuntu/job-classifier/tools/'
    sys.path.append(path_to_module)

    # load s3 connector & preprocessing functions
    import bototools as bt
    import xmltools as xt
    import nlp_preprocessing as nlp

    # load job data from s3 csv to dataframe
    path_to_data = '/home/ubuntu/job-classifier/.keys/'
    file_name = 'cron_data_file.json'
    bucket, key, url, target = bt.load_s3_location(path_to_data, file_name)

    # pull xml from url and parse into df
    df = xt.xml_from_url(url)

    # standardize text format
    cols_to_model = ['title']
    for col in cols_to_model:
        df = nlp.standardize_text(df, col)

    # select data to predict from
    X_classify = df['title'].tolist()

    # define model path
    path_to_models = '/home/ubuntu/job-classifier/models/'

    # load count vectorizer & transform data to predict
    cv_pickle = 'CV_lr_bow_train_only_model.pckl'
    cv_path = os.path.join(path_to_models, cv_pickle)
    cv_model = pickle.load(open(cv_path, 'rb'))
    X_classify_counts = nlp.get_cv_test_counts(X_classify, cv_model)

    # load pre-trained model and predict
    clf_pickle = 'lr_bow_train_only_model.pckl'
    clf_path = os.path.join(path_to_models, clf_pickle)
    clf_model = pickle.load(open(clf_path, 'rb'))
    y_predicted = clf_model.predict(X_classify_counts)
    y_prob = clf_model.predict_proba(X_classify_counts)

    # assign predictions to jobs & prune dataframe
    df['gig'] = y_predicted
    cols_to_write = ['company','title','city','state','url']

    df_to_write = df[df['gig']==1][cols_to_write]

    # write jobs to accessible location on s3
    # custom name by date -- test overlap between days
    timestr = time.strftime("%Y-%m-%d")
    prefix, fn = target.split('/')
    file_to_write = prefix + '/' + timestr + '-' + fn
    bt.write_df_to_s3(df_to_write, bucket, file_to_write, comp=False)

    # add labeled samples to validate for future training
    positives = df[df['gig']==1]
    df_positive = positives.sample(n=min(1000, len(positives)))
    file_positive = 'positive' + '/' + timestr + '-' + fn
    bt.write_df_to_s3(df_positive, bucket, file_positive, comp=False)

    negatives = df[df['gig']==0]
    df_negative = negatives.sample(n=min(1000, len(negatives)))
    file_negative = 'negative' + '/' + timestr + '-' + fn
    bt.write_df_to_s3(df_negative, bucket, file_negative, comp=False)
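# `xt.xml_from_url` and `xt.xml_from_url_compressed` come from the project's xmltools
# module; hypothetical minimal versions, assuming a flat feed of <job> elements:
import gzip
import urllib.request
import xml.etree.ElementTree as ET
import pandas as pd

def xml_from_url(url):
    '''Download an XML job feed and flatten each child element into a dataframe row.'''
    with urllib.request.urlopen(url) as resp:
        root = ET.fromstring(resp.read())
    return pd.DataFrame([{field.tag: field.text for field in job} for job in root])

def xml_from_url_compressed(url):
    '''Same as xml_from_url, but for gzip-compressed feeds (*.xml.gz).'''
    with urllib.request.urlopen(url) as resp:
        root = ET.fromstring(gzip.decompress(resp.read()))
    return pd.DataFrame([{field.tag: field.text for field in job} for job in root])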
Example #10
def main():
    '''Test & Evaluate multiple models to select best option

    Considerations:
    + Single-feature
    + Multiple features
    + Count vectorizer - BoW/Tfidf
    + k-fold x-validation
    + Choice of classifier (LR, RF, etc)

    Metrics:
    + Accuracy
    + Variance
    + F1
    + Confusion matrix
    + Create graphics
    + Feature importance
    '''

    path_to_module = '../tools/'
    sys.path.append(path_to_module)
    # load s3 read & write functions
    import bototools as bt
    import nlp_preprocessing as nlp

    print('evaluating candidate models...\n')

    # if pulling from s3
    #path_to_data = '../.keys/'
    #file_name = 'csv_to_classify.json'
    #bucket, key = bt.load_s3_location(path_to_data, file_name)
    #df = bt.load_df_from_s3(bucket, key, compression='gzip')

    # local data
    path_to_data = '../data/multiclass/'
    data_file = 'labeled_large.csv'
    df = pd.read_csv(os.path.join(path_to_data, data_file))
    print('loaded CSV')

    # map roles to general labels
    key_path = '../.keys'
    role_mapper = load_pickle(os.path.join(key_path, 'roles.pickle'))

    role_names = [n for n in role_mapper]
    df = df[df['role'].isin(role_names)]

    df['label'] = df['role'].map(role_mapper)
    df = df.dropna()

    cols_to_train = ['title', 'label']
    df = df[cols_to_train]

    # standardize text format
    df = nlp.standardize_text(df, 'title')

    # select data to train and evaluate on
    X = df['title'].tolist()
    y = df['label'].tolist()

    # dividing X, y into train and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    print('vectorizing bag of words model')
    X_train_counts, count_vectorizer = nlp.create_count_vectorizer(
        X_train)  # count vec
    X_test_counts = count_vectorizer.transform(X_test)

    # train and compare candidate classifiers (LR, MultinomialNB, ComplementNB)
    print('training and testing models')

    # Logistic regression
    lr = LogisticRegression(C=30.0,
                            class_weight='balanced',
                            solver='newton-cg',
                            multi_class='multinomial',
                            n_jobs=-1,
                            random_state=40)
    lr.fit(X_train_counts, y_train)
    lr_predictions = lr.predict(X_test_counts)

    # multinomial NB
    mnb = MultinomialNB().fit(X_train_counts, y_train)
    mnb_predictions = mnb.predict(X_test_counts)

    # complement NB
    cnb = ComplementNB().fit(X_train_counts, y_train)
    cnb_predictions = cnb.predict(X_test_counts)

    # evaluate performance
    from sklearn.metrics import confusion_matrix
    roles = ['tech', 'nurse', 'service', 'driver', 'ignore']

    cm_lr = confusion_matrix(y_test, lr_predictions, labels=roles)
    cm_mnb = confusion_matrix(y_test, mnb_predictions, labels=roles)
    cm_cnb = confusion_matrix(y_test, cnb_predictions, labels=roles)

    print('\n--- Logistic Regression Confusion Matrix ---')
    print(roles)
    print(cm_lr)
    get_metrics(y_test, lr_predictions)
    print('\n--- Multinomial Bayes Confusion Matrix ---')
    print(roles)
    print(cm_mnb)
    get_metrics(y_test, mnb_predictions)
    print('\n--- Complement Bayes Confusion Matrix ---')
    print(roles)
    print(cm_cnb)
    get_metrics(y_test, cnb_predictions)
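# `bt.load_df_from_s3` is the read-side counterpart of write_df_to_s3; a hypothetical
# sketch using boto3 and pandas:
import io
import boto3
import pandas as pd

def load_df_from_s3(bucket, key, comp=None):
    '''Download a CSV object from S3 and return it as a dataframe.'''
    obj = boto3.client('s3').get_object(Bucket=bucket, Key=key)
    return pd.read_csv(io.BytesIO(obj['Body'].read()), compression=comp)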
Example #11
def main():
    '''Run classification
    '''

    path_to_module = '../tools/'
    sys.path.append(path_to_module)
    # load s3 read & write functions
    import bototools as bt # add these to actual path

    print('classifying new jobs...\n')

    path_to_data = '../.keys/'
    #file_name = 'csv_to_classify.json'
    file_name = 'eda_data_file.json'
    #bucket, key = bt.load_s3_location(path_to_data, file_name)
    # use a sample file for testing -- avoid large files for now
    bucket, _ = bt.load_s3_location(path_to_data, file_name)
    key = 'eda_sample_data_file.csv'
    df = bt.load_df_from_s3(bucket, key) # sample not gzipped
    #df = bt.load_df_from_s3(bucket, key, comp='gzip')

    # import preprocessing tools
    import nlp_preprocessing as nlp
    # cleanup the dataframe to prepare for classification
    # while only SOME columns are used, ALL need to be returned for ops team

    cols_to_model = ['title']
    for col in cols_to_model:
        df = nlp.standardize_text(df, col)

    X_classify = df['title'].tolist()

    # load count vectorizer
    path_to_models = '../models/'

    cv_pickle = 'CV_lr_bow_train_only_model.pckl' # use private file too
    cv_path = os.path.join(path_to_models, cv_pickle)

    cv_model = pickle.load(open(cv_path, 'rb'))

    X_classify_counts = nlp.get_cv_test_counts(X_classify, cv_model)

    from sklearn.linear_model import LogisticRegression # move outside main

    # load pre-trained model
    #path_to_model = '../models/'

    # functionalize loading & training
    clf_pickle = 'lr_bow_train_only_model.pckl' # use private file too
    clf_path = os.path.join(path_to_models, clf_pickle)

    clf_model = pickle.load(open(clf_path, 'rb'))
    y_predicted = clf_model.predict(X_classify_counts)

    df['gig'] = y_predicted
    cols_to_write = ['company','title','city','state','posted_at','url']
    df_to_write = df[df['gig']==1][cols_to_write]
    #df_to_write = df[df['gig']==1]
    #df[df['gig']==1].to_csv('../data/classified_gig_jobs.csv', index=False)
    #print('Gig jobs found: {}'.format(df[df['gig']==1].shape[0]))

    # write output (use a prefix!)
    file_to_write = 'gigs/sample_daily_job_list.csv'
    bt.write_df_to_s3(df_to_write, bucket, file_to_write)
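# All of the main() snippets above are excerpts from larger scripts and assume
# module-level imports roughly like the following (a reconstruction, not verbatim):
import os
import sys
import time
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, ComplementNB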