Exemplo n.º 1
0
def synonym_relation(text1, text2):
    """Return the fraction of words in the shorter (token-wise) text that
    appear in the WordNet synonym set of the longer text.

    Both inputs are raw strings; the 'no info' sentinel or an empty token
    list after stop-word removal yields 0.  Returns a float in [0, 1], or
    None when `handle` swallows an exception.
    """
    try:
        if text1 == 'no info' or text2 == 'no info':
            return 0

        tokens1 = stopwordsremove(text1)
        tokens2 = stopwordsremove(text2)
        if not tokens1 or not tokens2:
            return 0

        # Build the synonym set from the longer token list and score the
        # shorter one against it (the original duplicated this logic in
        # two symmetric branches).  Tie goes to tokens2 as the "shorter"
        # side, matching the original else-branch behavior.
        if len(tokens1) < len(tokens2):
            shorter, longer = tokens1, tokens2
        else:
            shorter, longer = tokens2, tokens1

        syn_set = set()
        for word in longer:
            for syn in wordnet.synsets(word):
                for lemma in syn.lemmas():
                    syn_set.add(lemma.name())

        matches = sum(1 for word in shorter if word in syn_set)
        return matches / len(shorter)
    except Exception:
        handle('synonym relation finding process')
Exemplo n.º 2
0
def missing_values(data):
    """Fill missing values column-by-column and derive helper columns.

    - 'location' -> new 'country' column (text before the first comma,
      stripped); 'location' is dropped at the end.
    - 'salary_range' -> 'minimum_salary' / 'maximum_salary'; any range
      containing letters (or a comma) is collapsed to '0-0';
      'salary_range' is dropped at the end.
    - Any remaining object column with NaNs is filled with 'no info' and
      lower-cased; remaining numeric columns are filled with 0.

    Mutates `data` in place and returns it.  Assumes a default RangeIndex
    (rows are addressed via data.loc[i, ...]).
    """
    print('Handling Missing Data')
    try:
        # 1. location -> country
        data['location'] = data['location'].fillna('no info')
        withoutcomma = data[~data['location'].str.contains(",")].index
        withcomma = data[data['location'].str.contains(",")].index

        for i in withcomma:
            data.loc[i, 'country'] = (
                data.loc[i, 'location'].split(',')[0].strip())

        for i in withoutcomma:
            data.loc[i, 'country'] = data.loc[i, 'location'].strip()

        # 2. salary range -> min/max salary
        data['salary_range'] = data['salary_range'].fillna('0-0')

        for i in range(data.shape[0]):
            # NOTE(review): the pattern '[a-z,A-Z]' also matches a literal
            # comma, so "10,000-20,000" is zeroed too — kept as-is to
            # preserve behavior; confirm whether that is intended.
            value = data.loc[i, 'salary_range']  # was `str` — shadowed builtin
            if re.search(r'[a-z,A-Z]', value):
                data.loc[i, 'salary_range'] = '0-0'

            if data.loc[i, 'salary_range'].find("-") != -1:
                low, high = data.loc[i, 'salary_range'].split('-')[:2]
                data.loc[i, 'minimum_salary'] = low
                data.loc[i, 'maximum_salary'] = high
            else:
                data.loc[i, 'minimum_salary'] = data.loc[i, 'salary_range']
                data.loc[i, 'maximum_salary'] = data.loc[i, 'salary_range']

        # 3. all other categorical and numeric columns
        for col in data.columns:
            if data[col].isna().any():
                if data[col].dtypes == 'object':
                    data[col] = data[col].fillna('no info')
                    data[col] = data[col].str.lower()
                else:
                    data[col] = data[col].fillna(0)

        data.drop(['salary_range', 'location'], axis=1, inplace=True)
        return data
    except Exception:
        handle('missing data handling process')
Exemplo n.º 3
0
def categorical_cols_test(data):
    """Apply the binary encoder fitted during training to test data.

    Loads the encoder pickled at model/encoder.p and returns the
    transformed DataFrame (or None if loading/transforming fails).
    """
    print('Categorical Encoding')
    try:
        # with-block closes the handle; the original open(...) leaked it.
        # NOTE(review): unpickling is unsafe on untrusted files — this
        # assumes model/encoder.p is produced by our own training run.
        with open("model/encoder.p", "rb") as f:
            encoder = pickle.load(f)
        return encoder.transform(data)
    except Exception:
        handle('categorical columns handling for testing process')
Exemplo n.º 4
0
def training():
    """Run the full training pipeline.

    Reads data/train.csv, then pipes it through missing-value handling,
    text feature extraction, categorical encoding, and model training.
    """
    try:
        data = read_csv('data/train.csv')

        (data.pipe(missing_values)
             .pipe(texthandling)
             .pipe(categorical_cols_train)
             .pipe(train_and_save_model))
    except Exception:
        # fixed typo in the error label: was 'Training piepline'
        handle("Training pipeline")
Exemplo n.º 5
0
def testing():
    """Run the prediction pipeline on data/test.csv."""
    try:
        frame = read_csv('data/test.csv')
        # Same chain as training's preprocessing, ending in prediction.
        steps = (missing_values, texthandling,
                 categorical_cols_test, load_model_predict)
        for step in steps:
            frame = frame.pipe(step)
    except Exception:
        handle('testing process')
Exemplo n.º 6
0
def read_csv(path):
    """Read a CSV file into a pandas DataFrame.

    Returns the DataFrame on success; returns None when the path does not
    end in '.csv' or when reading fails.  (The original left `data`
    unbound on the non-CSV path, raising NameError at `return data`.)
    """
    data = None
    try:
        if path.split(".")[-1] == 'csv':
            data = pd.read_csv(path)
        else:
            print("The file is not a CSV file")
    except Exception:
        handle('file reading')
    return data
Exemplo n.º 7
0
def stopwordsremove(text):
    """Tokenize `text`, drop stop words, and Porter-stem the remainder.

    Returns a list of lower-cased stems (or None if tokenization fails).
    Relies on a module-level `stop_words` collection — the membership
    test runs on the raw token, before lower-casing, as in the original.
    """
    try:
        stemmer = PorterStemmer()
        return [
            stemmer.stem(token.lower())
            for token in word_tokenize(text)
            if token not in stop_words  # was the non-idiomatic `not w in`
        ]
    except Exception:
        handle('stop words removing')
Exemplo n.º 8
0
def categorical_cols_train(data):
    """Fit a binary encoder on the categorical columns and persist it.

    Fits category_encoders.BinaryEncoder on the listed columns, saves the
    fitted encoder to model/encoder.p for reuse at test time, and returns
    the transformed DataFrame.
    """
    try:
        print('Categorical Encoding')
        encoder = ce.BinaryEncoder(cols=[
            'employment_type', 'required_experience', 'required_education',
            'country'
        ])
        newdata = encoder.fit_transform(data)
        # with-block closes the handle; the original open(...) leaked it.
        with open("model/encoder.p", "wb") as f:
            pickle.dump(encoder, f)
        return newdata
    except Exception:
        handle('categorical column handling')
Exemplo n.º 9
0
def load_model_predict(data):
    """Scale features with the saved scaler and predict with the saved model.

    Drops the 'fraudulent' label column, applies model/scaler.p, runs
    model/finalized_model.p, and forwards predictions to score_and_save.
    """
    try:
        X_test = data.drop('fraudulent', axis=1)
        # NOTE: the original also extracted y_test here but never used it;
        # score_and_save re-reads the labels itself.

        # with-blocks close the handles; the original open(...) calls
        # leaked both file descriptors.
        with open("model/scaler.p", "rb") as f:
            scaler = pickle.load(f)
        X_test = scaler.transform(X_test)

        with open('model/finalized_model.p', 'rb') as f:
            model = pickle.load(f)

        y_pred = model.predict(X_test)
        score_and_save(y_pred)
    except Exception:
        handle('prediction process')
Exemplo n.º 10
0
def removeuncessary(text):
    """Clean a free-text value for downstream word counting.

    Coerces `text` to str, then:
      1. removes all punctuation characters,
      2. removes any word containing a digit,
      3. collapses each remaining run of non-ASCII-letter characters
         (excluding space) into a single space.

    Returns the cleaned string.
    """
    try:
        text = re.sub('[%s]' % re.escape(string.punctuation), '', str(text))
        # raw strings: the originals ('\w*\d\w*') relied on invalid escape
        # sequences that are a DeprecationWarning and will break eventually
        text = re.sub(r'\w*\d\w*', '', text)
        text = re.sub(r'[^a-zA-Z ]+', ' ', text)
        return text
    except Exception:
        handle('removing unnecessary text')
Exemplo n.º 11
0
def score_and_save(y_pred):
    """Print evaluation metrics for `y_pred` against the labels in
    data/test.csv, then save the predictions alongside the test data to
    predictionoutput/testsetprediction.csv."""
    try:
        data = read_csv('data/test.csv')
        y_test = data['fraudulent']

        print("\n" + "SCORES")
        print("confusion matrix")
        print(confusion_matrix(y_test, y_pred))

        # Each metric printed as "<label> = <value rounded to 4 places>".
        scorers = (('F1-Score', f1_score),
                   ('Precision', precision_score),
                   ('Recall', recall_score),
                   ('Accuracy', accuracy_score))
        for label, scorer in scorers:
            print(label + ' = ' + str(round(scorer(y_test, y_pred), 4)))

        data['fraud_prediction'] = y_pred
        data.to_csv('predictionoutput/testsetprediction.csv')
    except Exception:
        handle('scoring and saving process')
Exemplo n.º 12
0
def train_and_save_model(data):
    """Standard-scale the features, train a random forest, persist both.

    Splits off the 'fraudulent' label, fits a StandardScaler (saved to
    model/scaler.p) and a RandomForestClassifier (saved to
    model/finalized_model.p).  Returns None.
    """
    try:
        print("Model Training")
        X_train = data.drop('fraudulent', axis=1)
        y_train = data['fraudulent']

        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        # with-blocks close the handles; the original open(...) calls
        # leaked both file descriptors.
        with open("model/scaler.p", "wb") as f:
            pickle.dump(sc, f)

        from sklearn.ensemble import RandomForestClassifier
        model = RandomForestClassifier(n_estimators=100,
                                       criterion='entropy',
                                       random_state=1)

        model.fit(X_train, y_train)

        with open('model/finalized_model.p', 'wb') as f:
            pickle.dump(model, f)
    except Exception:
        handle('Model Creation and training')
Exemplo n.º 13
0
def texthandling(data):
    """Clean free-text columns and derive numeric text features.

    For each row: normalises every text column via removeuncessary(),
    adds word counts for company_profile/benefits ('no info' counts as 0),
    and adds WordNet synonym-overlap similarity features between pairs of
    text columns via synonym_relation().  Finally the raw text columns and
    job_id are dropped, leaving only numeric features.

    Assumes a default RangeIndex.  Mutates `data` in place and returns it.
    """
    print('Text Handling')
    try:
        text_columns = ('company_profile', 'description', 'requirements',
                        'benefits', 'title', 'department', 'industry',
                        'function')

        # (new_column, left_column, right_column).  The misspelled name
        # 'profiel_and_req_similarity' is kept deliberately: it is a
        # feature column name that previously-trained encoders/models
        # expect, so renaming it would break the saved pipeline.
        similarity_specs = (
            ('title_and_job_similarity', 'title', 'description'),
            ('title_and_req_similarity', 'title', 'requirements'),
            ('profile_and_job_similarity', 'company_profile', 'description'),
            ('profiel_and_req_similarity', 'company_profile', 'requirements'),
            ('title_and_department_syn_similarity', 'title', 'department'),
            ('title_and_industry_syn_similarity', 'title', 'industry'),
            ('title_and_function_syn_similarity', 'title', 'function'),
            ('industry_and_department_syn_similarity', 'industry',
             'department'),
            ('function_and_department_syn_similarity', 'function',
             'department'),
            ('industry_and_function_syn_similarity', 'industry', 'function'),
        )

        # NOTE: the original built a local `stop_words` set here that this
        # function never used (stopwordsremove reads its own module-level
        # stop_words); the dead local has been removed.
        for i in range(data.shape[0]):
            # 1. clean every text column in place
            for col in text_columns:
                data.loc[i, col] = removeuncessary(data.loc[i, col])

            # 2. word counts ('no info' sentinel -> 0)
            for col in ('company_profile', 'benefits'):
                words = str(data.loc[i, col])
                count = 0 if words == 'no info' else len(words.split())
                data.loc[i, col + '_word_count'] = count

            # 3. pairwise synonym-overlap features
            for new_col, left, right in similarity_specs:
                data.loc[i, new_col] = synonym_relation(
                    data.loc[i, left], data.loc[i, right])

        # synonym_relation returns None when it swallows an exception;
        # treat those as zero similarity.
        for new_col, _, _ in similarity_specs:
            data[new_col] = data[new_col].fillna(0)

        data.drop(list(text_columns) + ['job_id'], axis=1, inplace=True)
        return data
    except Exception:
        handle('Text handling process')
Exemplo n.º 14
0
import argparse
from Allcodefiles.training import training
from Allcodefiles.testing import testing
from Allcodefiles.Exceptionhandling import handle

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Fake Job Prediction')
    parser.add_argument('-r', '--return_object', choices=['train', 'test'],
                        default='train', type=str,
                        help='Select what task to be done')

    args = parser.parse_args()

    # Bug fix: this dispatch used to live at module level OUTSIDE the
    # __main__ guard while var_args was defined inside it, so merely
    # importing this module raised NameError.  It now runs only when the
    # file is executed as a script.
    try:
        if args.return_object == 'train':
            training()
        else:
            testing()
    except Exception:
        handle('Main file')