Пример #1
0
def synonym_relation(text1, text2):
    """Score how many words of the shorter text appear among WordNet
    synonyms of the longer text.

    Both inputs are raw strings; 'no info' is the dataset's missing-value
    sentinel and yields 0 immediately.  Returns a float in [0, 1]
    (matched words / length of the shorter tokenized text), 0 for empty
    texts, or None if `handle` swallows an exception.
    """
    try:
        if text1 == 'no info' or text2 == 'no info':
            return 0

        text1 = stopwordsremove(text1)
        text2 = stopwordsremove(text2)
        if len(text1) == 0 or len(text2) == 0:
            return 0

        # The two original branches were mirror images: always build the
        # synonym set from the LONGER token list and score the shorter one.
        if len(text1) < len(text2):
            shorter, longer = text1, text2
        else:
            # On a tie the original built synonyms from text1 and scored
            # text2, which this ordering preserves.
            shorter, longer = text2, text1

        syn_set = {
            lemma.name()
            for word in longer
            for syn in wordnet.synsets(word)
            for lemma in syn.lemmas()
        }
        count = sum(1 for word in shorter if word in syn_set)
        return count / len(shorter)
    except Exception as e:
        handle('synonym relation finding process')
Пример #2
0
def missing_values(data):
    """Fill missing values in the job-postings frame and derive columns.

    1. ``location`` -> ``country`` (text before the first comma, or the
       whole value when there is no comma).
    2. ``salary_range`` -> ``minimum_salary``/``maximum_salary`` (split on
       '-'; any value containing letters is reset to '0-0' first).
    3. Remaining columns: object columns get 'no info' and are lowercased,
       numeric columns get 0.

    ``location`` and ``salary_range`` are dropped afterwards.  Mutates and
    returns *data* (returns None if an exception is swallowed by `handle`).
    Assumes a default RangeIndex 0..n-1 — TODO confirm against callers.
    """
    print('Handling Missing Data')
    try:
        # Assign back instead of chained fillna(..., inplace=True): the
        # chained form is unreliable under pandas copy-on-write.
        data['location'] = data['location'].fillna('no info')
        withoutcomma = data[~data['location'].str.contains(",")].index
        withcomma = data[data['location'].str.contains(",")].index

        for i in withcomma:
            data.loc[i, 'country'] = data.loc[i, 'location'].split(',')[0].strip()

        for i in withoutcomma:
            data.loc[i, 'country'] = data.loc[i, 'location'].strip()

        # 2. salary range
        data['salary_range'] = data['salary_range'].fillna('0-0')

        for i in range(data.shape[0]):
            # Renamed from `str`, which shadowed the builtin.
            salary = data.loc[i, 'salary_range']
            # NOTE: the comma inside the class also matches ',' — kept for
            # behavioral compatibility with the original pattern.
            if re.search(r'[a-z,A-Z]', salary):
                data.loc[i, 'salary_range'] = '0-0'

            if data.loc[i, 'salary_range'].find("-") != -1:
                parts = data.loc[i, 'salary_range'].split('-')
                data.loc[i, 'minimum_salary'] = parts[0]
                data.loc[i, 'maximum_salary'] = parts[1]
            else:
                data.loc[i, 'minimum_salary'] = data.loc[i, 'salary_range']
                data.loc[i, 'maximum_salary'] = data.loc[i, 'salary_range']

        # 3. All other categorical columns and remaining numeric columns.
        for col in data.columns:
            if data[col].isna().any():
                if data[col].dtype == 'object':
                    data[col] = data[col].fillna('no info').str.lower()
                else:
                    data[col] = data[col].fillna(0)

        data.drop(['salary_range', 'location'], axis=1, inplace=True)
        return data
    except Exception as e:
        handle('missing data handling process')
Пример #3
0
def categorical_cols_test(data):
    """Apply the binary encoder fitted at training time to *data*.

    Loads the encoder pickled by `categorical_cols_train` and returns the
    transformed frame (None if `handle` swallows an exception).
    """
    print('Categorical Encoding')
    try:
        # Use a context manager so the file handle is closed (the original
        # pickle.load(open(...)) leaked it).  NOTE: pickle.load is only
        # safe on trusted files — this one is produced by our own training.
        with open("model/encoder.p", "rb") as fh:
            encoder = pickle.load(fh)
        newdata = encoder.transform(data)
        return newdata
    except Exception as e:
        handle('categorical columns handling for testing process')
Пример #4
0
def training():
    """Run the full training pipeline on data/train.csv.

    Reads the CSV, then chains missing-value handling, text handling,
    categorical encoding, and model training via DataFrame.pipe.
    """
    try:
        data = read_csv('data/train.csv')

        (data.pipe(missing_values).pipe(texthandling)
             .pipe(categorical_cols_train).pipe(train_and_save_model))

    except Exception as e:
        # Fixed typo in the error label ("piepline" -> "pipeline").
        handle("Training pipeline")
Пример #5
0
def testing():
    """Run the full test-time pipeline on data/test.csv.

    Reads the CSV, then chains missing-value handling, text handling,
    categorical encoding, and prediction via DataFrame.pipe.
    """
    try:
        dataset = read_csv('data/test.csv')

        (dataset
            .pipe(missing_values)
            .pipe(texthandling)
            .pipe(categorical_cols_test)
            .pipe(load_model_predict))

    except Exception as e:
        handle('testing process')
Пример #6
0
def read_csv(path):
    """Read *path* as a pandas DataFrame when it has a .csv extension.

    Returns the DataFrame, or None when the extension is not csv or the
    read fails (the original raised UnboundLocalError at `return data`
    in both of those cases because `data` was never assigned).
    """
    data = None
    try:
        if path.split(".")[-1] == 'csv':
            data = pd.read_csv(path)
        else:
            print("The file is not a CSV file")
    except Exception as e:
        handle('file reading')
    return data
Пример #7
0
def stopwordsremove(text):
    """Tokenize *text*, drop stop words, and return lowercased stems.

    Filtering compares the ORIGINAL token against the module-level
    `stop_words` set; only the kept tokens are lowercased and stemmed
    (Porter).  Returns a list of stems, or None if `handle` swallows
    an exception.
    """
    try:
        stemmer = PorterStemmer()
        # `w not in` replaces the non-idiomatic `not w in` of the original.
        return [
            stemmer.stem(w.lower())
            for w in word_tokenize(text)
            if w not in stop_words
        ]
    except Exception as e:
        handle('stop words removing')
Пример #8
0
def categorical_cols_train(data):
    """Fit a binary encoder on the categorical columns and persist it.

    Encodes employment_type, required_experience, required_education and
    country, saves the fitted encoder to model/encoder.p for use by
    `categorical_cols_test`, and returns the transformed frame.
    """
    try:
        print('Categorical Encoding')
        encoder = ce.BinaryEncoder(cols=[
            'employment_type', 'required_experience', 'required_education',
            'country'
        ])
        newdata = encoder.fit_transform(data)
        # Context manager guarantees the pickle is flushed and the handle
        # closed (the original pickle.dump(..., open(...)) never closed it).
        with open("model/encoder.p", "wb") as fh:
            pickle.dump(encoder, fh)
        return newdata
    except Exception as e:
        handle('categorical column handling')
Пример #9
0
def load_model_predict(data):
    """Load the persisted scaler and model, predict on *data*, and score.

    Drops the 'fraudulent' target column, scales the features with the
    training-time scaler, predicts with the saved model, and forwards the
    predictions to `score_and_save`.
    """
    try:
        X_test = data.drop('fraudulent', axis=1)
        # (The original also extracted y_test here but never used it;
        # a missing 'fraudulent' column still raises via the drop above.)

        # Context managers close the handles the original open() calls leaked.
        # NOTE: pickle.load is only safe on trusted files — these are
        # produced by our own training run.
        with open("model/scaler.p", "rb") as fh:
            scaler = pickle.load(fh)
        X_test = scaler.transform(X_test)

        with open('model/finalized_model.p', 'rb') as fh:
            model = pickle.load(fh)

        y_pred = model.predict(X_test)
        score_and_save(y_pred)
    except Exception as e:
        handle('prediction process')
Пример #10
0
def removeuncessary(text):
    """Clean free text for downstream similarity/word-count features.

    Applies, in order:
    1. remove all punctuation characters,
    2. remove any word containing a digit,
    3. collapse every remaining non-alphabetic run into a single space.

    *text* is coerced with str() first, so non-string values (e.g. NaN)
    are handled.  Returns the cleaned string.
    """
    try:
        text = str(text)
        text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
        # Raw strings: '\w' / '\d' in plain literals are invalid escapes
        # (DeprecationWarning, SyntaxError in future Python versions).
        text = re.sub(r'\w*\d\w*', '', text)
        text = re.sub(r'[^a-zA-Z ]+', ' ', text)

        return text
    except Exception as e:
        handle('removing unnecessary text')
Пример #11
0
def score_and_save(y_pred):
    """Print evaluation metrics for *y_pred* against the test-set labels
    and save the predictions.

    Re-reads data/test.csv for the ground-truth 'fraudulent' column,
    prints the confusion matrix plus F1/precision/recall/accuracy
    (rounded to 4 places), then writes the test set with a new
    'fraud_prediction' column to predictionoutput/testsetprediction.csv.
    """
    try:
        data = read_csv('data/test.csv')
        y_test = data['fraudulent']

        print("\nSCORES")
        print("confusion matrix")
        print(confusion_matrix(y_test, y_pred))
        print(f"F1-Score = {round(f1_score(y_test, y_pred), 4)}")
        print(f"Precision = {round(precision_score(y_test, y_pred), 4)}")
        print(f"Recall = {round(recall_score(y_test, y_pred), 4)}")
        print(f"Accuracy = {round(accuracy_score(y_test, y_pred), 4)}")

        data['fraud_prediction'] = y_pred
        data.to_csv('predictionoutput/testsetprediction.csv')
    except Exception as e:
        handle('scoring and saving process')
Пример #12
0
def train_and_save_model(data):
    """Scale the features, train a random forest, and persist both.

    Drops the 'fraudulent' target, fits a StandardScaler (saved to
    model/scaler.p) and a 100-tree entropy RandomForestClassifier with
    random_state=1 (saved to model/finalized_model.p).
    """
    try:
        print("Model Training")
        X_train = data.drop('fraudulent', axis=1)
        y_train = data['fraudulent']

        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        # Context managers guarantee the pickles are flushed and closed
        # (the original pickle.dump(..., open(...)) never closed the files,
        # risking truncated artifacts on interpreter exit).
        with open("model/scaler.p", "wb") as fh:
            pickle.dump(sc, fh)

        from sklearn.ensemble import RandomForestClassifier
        model = RandomForestClassifier(n_estimators=100,
                                       criterion='entropy',
                                       random_state=1)

        model.fit(X_train, y_train)

        with open('model/finalized_model.p', 'wb') as fh:
            pickle.dump(model, fh)
    except Exception as e:
        handle('Model Creation and training')
Пример #13
0
def texthandling(data):
    """Derive word-count and synonym-similarity features from text columns.

    For every row: cleans the free-text columns with `removeuncessary`,
    computes word counts for company_profile/benefits ('no info' -> 0),
    and adds ten pairwise `synonym_relation` similarity columns.  Finally
    fills NaN similarities with 0 and drops the raw text columns plus
    job_id.  Mutates and returns *data* (None if `handle` swallows an
    exception).  Assumes a default RangeIndex 0..n-1 — TODO confirm.
    """
    print('Text Handling')
    try:
        '''
            This function is for handling text data columns company profile,
            description, requirements, benefits are there is multiple text in
            those columns we need to do something about them.
            '''
        # NOTE(review): this local is never used in this function;
        # presumably it was meant for `stopwordsremove` — verify.
        stop_words = set(stopwords.words('english'))
        for i in range(0, data.shape[0]):

            # Clean every free-text column in place.
            data.loc[i, 'company_profile'] = removeuncessary(
                data.loc[i, 'company_profile'])
            data.loc[i,
                     'description'] = removeuncessary(data.loc[i,
                                                               'description'])
            data.loc[i, 'requirements'] = removeuncessary(
                data.loc[i, 'requirements'])
            data.loc[i, 'benefits'] = removeuncessary(data.loc[i, 'benefits'])
            data.loc[i, 'title'] = removeuncessary(data.loc[i, 'title'])
            data.loc[i, 'department'] = removeuncessary(data.loc[i,
                                                                 'department'])
            data.loc[i, 'industry'] = removeuncessary(data.loc[i, 'industry'])
            data.loc[i, 'function'] = removeuncessary(data.loc[i, 'function'])

            # Word-count features; the 'no info' sentinel counts as 0.
            words = str(data.loc[i, 'company_profile'])
            if (words == 'no info'):
                data.loc[i, 'company_profile_word_count'] = 0
            else:
                data.loc[i, 'company_profile_word_count'] = len(words.split())

            words = str(data.loc[i, 'benefits'])
            if (words == 'no info'):
                data.loc[i, 'benefits_word_count'] = 0
            else:
                data.loc[i, 'benefits_word_count'] = len(words.split())

            # Pairwise synonym-overlap similarity features.
            data.loc[i, 'title_and_job_similarity'] = synonym_relation(
                data.loc[i, 'title'], data.loc[i, 'description'])

            data.loc[i, 'title_and_req_similarity'] = synonym_relation(
                data.loc[i, 'title'], data.loc[i, 'requirements'])

            data.loc[i, 'profile_and_job_similarity'] = synonym_relation(
                data.loc[i, 'company_profile'], data.loc[i, 'description'])

            # NOTE(review): 'profiel' is a typo, but it is used consistently
            # below and is part of the trained model's schema — do not rename.
            data.loc[i, 'profiel_and_req_similarity'] = synonym_relation(
                data.loc[i, 'company_profile'], data.loc[i, 'requirements'])

            data.loc[i,
                     'title_and_department_syn_similarity'] = synonym_relation(
                         data.loc[i, 'title'], data.loc[i, 'department'])

            data.loc[i,
                     'title_and_industry_syn_similarity'] = synonym_relation(
                         data.loc[i, 'title'], data.loc[i, 'industry'])

            data.loc[i,
                     'title_and_function_syn_similarity'] = synonym_relation(
                         data.loc[i, 'title'], data.loc[i, 'function'])

            data.loc[
                i,
                'industry_and_department_syn_similarity'] = synonym_relation(
                    data.loc[i, 'industry'], data.loc[i, 'department'])

            data.loc[
                i,
                'function_and_department_syn_similarity'] = synonym_relation(
                    data.loc[i, 'function'], data.loc[i, 'department'])
            data.loc[
                i, 'industry_and_function_syn_similarity'] = synonym_relation(
                    data.loc[i, 'industry'], data.loc[i, 'function'])

        # synonym_relation may return None (its handler path); treat as 0.
        for i in [
                'title_and_job_similarity', 'title_and_req_similarity',
                'profile_and_job_similarity', 'profiel_and_req_similarity',
                'title_and_department_syn_similarity',
                'title_and_industry_syn_similarity',
                'title_and_function_syn_similarity',
                'function_and_department_syn_similarity',
                'industry_and_department_syn_similarity',
                'industry_and_function_syn_similarity'
        ]:

            data[i].fillna(0, inplace=True)

        # Raw text columns are no longer needed once features are derived.
        data.drop([
            'company_profile', 'benefits', 'description', 'requirements',
            'title', 'department', 'industry', 'function', 'job_id'
        ],
                  axis=1,
                  inplace=True)
        return data
    except Exception as e:
        handle('Text handling process')
Пример #14
0
import argparse
from Allcodefiles.training import training
from Allcodefiles.testing import testing
from Allcodefiles.Exceptionhandling import handle

if __name__ == '__main__':
    # CLI entry point: choose between the training and testing pipelines.
    parser = argparse.ArgumentParser(description='Fake Job Prediction')
    parser.add_argument('-r', '--return_object', choices=['train', 'test'],
                        default='train', type=str,
                        help='Select what task to be done')

    args = parser.parse_args()
    var_args = vars(args)

    # The try/except was previously dedented to module level, so merely
    # importing this module raised NameError on `var_args` (silently
    # swallowed by handle).  It belongs inside the __main__ guard.
    try:
        if var_args['return_object'] == 'train':
            training()
        else:
            testing()

    except Exception as e:
        handle('Main file')