class JobAdClassification:
    """Classification of job ads using R and rpy2.

    This class provides training of a machine learning model for
    recommendations for new job ads, and determination of languages of
    job ads.

    Arguments
    ---------
    Rlibpath : str
        Path to local R libraries.
    search_terms : list[str]
        All search terms used in job ad collections. Needed to include all
        factor levels in the machine learning model.
    sites : list[str]
        All job sites used in job ad collections. Needed to include all
        factor levels in the model.
    language : str
        Language of job ads / machine learning model. Currently only Finnish
        and English supported.
    """

    # Functions which were easier to implement in pure R than using rpy2.
    # NOTE: this is a regular (non-raw) Python string, so backslashes are
    # processed by Python first and by R second.
    __R_functions_str = """
    cleanJobAds <- function(class_data, search_terms, sites) {
        # Cleans and transforms job ads.
        # More precisely:
        # - Joins title and description columns (title is then dropped)
        # - Removes rows with an empty or NA value in any column
        # - Removes duplicate rows
        # - Trims and collapses whitespace in the description
        # - Adds all factor levels to search terms and sites
        #
        # Returns the cleaned data frame (all input columns except title).
        #
        # Arguments:
        # class_data   - dataframe with columns for site, title, description,
        #                searchterm and, depending on the caller, relevant
        #                or id.
        # search_terms - Character vector of all search terms used, needed for
        #                factor levels.
        # sites        - Character vector of all job ad sites, needed for
        #                factor levels.

        class_data$description <- paste(class_data$title, class_data$description)
        class_data$title <- NULL

        #get rid of rows with empty column(s)
        for (col in colnames(class_data)) {
            values <- class_data[[col]]
            keep <- !is.na(values) & values != ""
            class_data <- class_data[keep, , drop = FALSE]
        }
        class_data <- unique(class_data)

        #get rid of extra whitespace in description
        class_data$description <- str_trim(class_data$description)
        class_data$description <- gsub("\\\\s+", " ", class_data$description)

        #assign proper factor levels
        class_data$site <- as.factor(class_data$site)
        levels(class_data$site) <- c(
            levels(class_data$site),
            sites[!(sites %in% levels(class_data$site))])
        class_data$searchterm <- as.factor(class_data$searchterm)
        levels(class_data$searchterm) <- c(
            levels(class_data$searchterm),
            search_terms[!(search_terms %in% levels(class_data$searchterm))])

        return(class_data)
    }

    createJoinDTM <- function(class_data, lang) {
        # Transforms and parses the description column for words.
        # Words are cleaned and stemmed, and finally added as columns
        # to the dataframe.
        #
        # Returns dataframe with columns for all parsed words in the
        # description column. The description column itself is removed.
        #
        # Arguments:
        # class_data - Dataframe of cleaned job ads using R_function
        #              cleanJobAds.
        # lang       - Language of job ads, needed for stemming and removing
        #              stopwords.

        #create corpus from descriptions
        corpus <- Corpus(VectorSource(class_data$description))

        #cleaning and stemming operations applied to corpus
        corpus <- tm_map(corpus, tolower)
        corpus <- tm_map(corpus, PlainTextDocument) #need to convert after tolower
        corpus <- tm_map(corpus, removePunctuation)
        corpus <- tm_map(corpus, removeWords, stopwords(lang))
        corpus <- tm_map(corpus, stemDocument, lang)

        #create document term matrix and remove sparse terms
        dtm <- DocumentTermMatrix(corpus)
        dtm <- removeSparseTerms(dtm, 0.98)
        dtm <- as.data.frame(as.matrix(dtm))

        class_data <- cbind(class_data, dtm)
        class_data$description <- NULL
        #words become column names, so make them syntactically valid
        colnames(class_data) <- make.names(colnames(class_data))

        return(class_data)
    }

    RFmodel <- function(train_data, cutoff) {
        # Trains random forest binary classification model using the provided
        # cutoffs.
        #
        # Returns model.
        #
        # Arguments:
        # train_data - Dataframe containing parsed words from job ads.
        # cutoff     - Threshold for determining whether relevant or not.

        train_data$relevant <- as.factor(train_data$relevant)
        model <- randomForest(relevant ~ ., data=train_data, cutoff = cutoff)
        return(model)
    }

    RFpred <- function(RFmodel, test_data) {
        # Classifies job ads as relevant or not using provided model.
        #
        # Returns factor of classifications.
        #
        # Arguments:
        # RFmodel   - Model to use.
        # test_data - Dataframe containing parsed words from job ads as
        #             columns.

        return(predict(RFmodel, newdata=test_data))
    }

    splitTerms <- function(terms_data, bool) {
        #Helper function for splitting data into training and testing sets.
        return(terms_data == bool)
    }

    model_eval <- function(predictions, actual, thold, printb=0) {
        # Calculates and optionally prints characteristics of model.
        # Returns the following in a column vector:
        # accuracy, sensitivity, RMSE,
        # true positives, true negatives,
        # false positives, false negatives,
        # fscore
        #
        # Arguments:
        # predictions - Predicted classes (factor or numeric).
        # actual      - Actual classes coded 0/1.
        # thold       - Threshold applied to predictions, or -1 for none.
        # printb      - Set to 1 to print the characteristics.

        predictions <- as.numeric(predictions)
        actual <- as.numeric(actual)

        #default: predictions are already coded 0/1
        preds <- predictions
        if (thold != -1) {
            preds <- predictions >= thold
        }

        #if factor levels are 2,1 instead of 0,1 (ambiguous if
        #actual levels are 0,1 but there are no 0 predictions)
        if (max(predictions) == 2 || min(predictions) == 1) {
            preds <- predictions - 1
        }

        TP <- sum(actual + preds == 2)
        TN <- sum(actual + preds == 0)
        FP <- sum(actual - preds == -1)
        FN <- sum(actual - preds == 1)

        #accuracy
        acc <- (TP + TN) / (TP + TN + FP + FN)
        #sensitivity
        sens <- (TP / (TP + FN))
        #fscore
        fscore <- 2*TP/(2*TP+FP+FN)
        #error measure: root mean squared error
        err <- sqrt(mean((actual - preds)^2))

        if (printb == 1) {
            cat("Model characteristics:", "\n")
            cat("Accuracy", acc, "\n")
            cat("Sensitivity", sens, "\n")
            cat("Fscore", fscore, "\n")
            cat("Error (RMSE)", err, "\n")
        }

        return(c(acc, sens, err, TP, TN, FP, FN, fscore))
    }

    prepNewAds <- function(RFmodel, new_ads) {
        #Prepares new ads for classification by model.
        #Looks for words used by model and discards
        #words not in model. Model columns missing from the
        #new ads are added and filled with zeros.

        model_columns <- as.character(attr(RFmodel$terms, "variables"))
        new_ads <- new_ads[(names(new_ads) %in% model_columns)]

        for (col in model_columns[!(model_columns %in% names(new_ads))]) {
            new_ads[col] <- rep(0, nrow(new_ads))
        }
        return(new_ads)
    }

    saveFile <- function(object, filename) {
        #Saves object to file under the name "object".
        save(object, file=filename)
    }
    """

    # columns needed for training model
    _train_columns = ["site", "searchterm", "title", "description", "relevant"]
    # columns needed for classifying new job ads
    _class_columns = ["id", "site", "searchterm", "title", "description"]

    # (diacritic, replacement) pairs used by _remove_diacritics
    _diacritic_replacements = (
        ("Ä", "A"), ("ä", "a"),
        ("Ö", "O"), ("ö", "o"),
        ("Å", "A"), ("å", "a"),
    )

    def __init__(self, Rlibpath, search_terms, sites, language):
        self._RFmodel = None
        self._language = language
        self._search_terms = search_terms
        self._sites = sites

        # random forest model parameters
        self._threshold = 0.3
        self._splitratio = 0.7

        # base R assets
        self._utils = robjects.packages.importr("utils")
        self._utils.chooseCRANmirror(ind=5)  # randomly chosen mirror
        self._base = robjects.packages.importr("base")

        # local library path
        self._base._libPaths(Rlibpath)

        # Sets the character-type locale to "C" before r_repr() is used.
        # NOTE(review): the original comment said this switches to utf-8,
        # but "C" is not a UTF-8 locale -- confirm the intent.
        robjects.r['Sys.setlocale']("LC_CTYPE", "C")

        # tm           - Framework for text mining.
        # SnowballC    - Stemming.
        # textcat      - Determining language of text.
        # randomForest - Random forest.
        # caTools      - Splitting data into training and test sets
        #                intelligently.
        # stringr      - String manipulation
        needed_packages = ["tm", "SnowballC", "textcat", "randomForest",
                           "caTools", "stringr"]

        # install packages which are not yet available
        to_install = [package for package in needed_packages
                      if not robjects.packages.isinstalled(package)]
        if len(to_install) > 0:
            self._utils.install_packages(StrVector(to_install))

        # load packages, keyed by package name
        self._loaded_packages = {}
        for package in needed_packages:
            self._loaded_packages[package] = robjects.packages.importr(package)

        # load R functions
        self._R_functions = STAP(self.__R_functions_str, "R_functions")

    def _remove_diacritics(self, string):
        """Removes all Swedish (Finnish) diacritics from a string.

        Values which are not strings are returned unchanged.

        Arguments
        ----------
        string : str
            String to remove diacritics from.

        Returns
        ----------
        clean_string : str
            String without diacritics.
        """
        if isinstance(string, str):
            for diacritic, replacement in self._diacritic_replacements:
                string = string.replace(diacritic, replacement)
        return string

    def _create_R_dataframe(self, job_ads, include_columns):
        """Converts job ads to R dataframe.

        The "relevant" column is converted to an integer vector; every other
        column becomes a character vector wrapped in R's ``I()``.

        Arguments
        ----------
        job_ads : list[:class:`JobAd`]
            List of :class:`JobAd` instances.
        include_columns : list[str]
            Defines which columns are included in the dataframe.

        Returns
        ----------
        dataf : :class:`robjects.DataFrame`
            :class:`robjects.DataFrame` representing job ads.

        Raises
        ----------
        ValueError
            If `job_ads` is empty.
        """
        if len(job_ads) == 0:
            raise ValueError("No job ads to convert to R dataframe.")

        # modify structure to type {column: [rows]}
        job_ads_dataf = {}
        for column in include_columns:
            values = [self._remove_diacritics(ad[column]) for ad in job_ads]
            if column == "relevant":
                job_ads_dataf[column] = IntVector(values)
            else:
                job_ads_dataf[column] = self._base.I(StrVector(values))

        return robjects.DataFrame(job_ads_dataf)

    def train_model(self, class_ads):
        """Trains a random forest model for classification of job ad relevance.

        Model is stored in the :class:`JobAdClassification` instance. Unless
        the split ratio is 1.0, part of the ads is held out and the model
        characteristics on that part are printed.

        Arguments
        ----------
        class_ads : list[:class:`JobAd`]
            List of :class:`JobAd` instances used to train model. Each
            instance should have site, searchterm, title, description and
            relevant defined.
        """
        # parameters for training
        splitratio = self._splitratio  # typical value
        # gave best F-score during parameter sweeping
        threshold = self._threshold

        # convert to dataframe and clean ads
        dataf = self._create_R_dataframe(class_ads, self._train_columns)
        dataf = self._R_functions.cleanJobAds(dataf,
                                              StrVector(self._search_terms),
                                              StrVector(self._sites))
        dataf = self._R_functions.createJoinDTM(dataf, self._language.lower())

        # create training and testing data sets
        use_test_set = (splitratio != 1.0)
        if use_test_set:
            split = robjects.r['sample.split'](dataf.rx2('relevant'),
                                               splitratio)
            train = robjects.r['subset'](
                dataf, self._R_functions.splitTerms(split, 'TRUE'))
            test = robjects.r['subset'](
                dataf, self._R_functions.splitTerms(split, 'FALSE'))
        else:
            train = dataf

        # train model
        self._RFmodel = self._R_functions.RFmodel(
            train, FloatVector([1-threshold, threshold]))

        # test on testing set; model_eval prints the characteristics
        if use_test_set:
            pred = self._R_functions.RFpred(self._RFmodel, test)
            self._R_functions.model_eval(pred, test.rx2('relevant'), -1, 1)

    def save_model(self, filename):
        """Saves :class:`JobAdClassification` instance model to file for
        later use.

        Arguments
        ----------
        filename : str
            Name of file to save model in.

        Raises
        ----------
        RuntimeError
            If no model has been trained or loaded.
        """
        if self._RFmodel is None:
            raise RuntimeError("No model to save; train or load a model "
                               "first.")
        self._R_functions.saveFile(self._RFmodel, filename)

    def load_model(self, filename):
        """Loads random forest classification model from file.

        Model is stored in :class:`JobAdClassification` instance.

        Arguments
        ----------
        filename : str
            Name of file to load model from.
        """
        # load() returns the names of the restored objects, get() fetches
        # the object itself
        self._RFmodel = robjects.r['get'](robjects.r['load'](filename))

    def recommend_ads(self, job_ads):
        """Provides recommendations for ads using instance model.

        Ads which are removed during cleaning (empty fields, duplicates) get
        no recommendation and are absent from the result.

        Arguments
        ----------
        job_ads : list[:class:`JobAd`]
            Each instance should have id, site, searchterm, title and
            description defined.

        Returns
        ----------
        results : list[:class:`JobAd`]
            Each instance has id and recommendation defined.

        Raises
        ----------
        RuntimeError
            If no model has been trained or loaded.
        """
        if self._RFmodel is None:
            raise RuntimeError("No model available; train or load a model "
                               "first.")

        # convert to dataframe and clean ads
        dataf = self._create_R_dataframe(job_ads, self._class_columns)
        dataf = self._R_functions.cleanJobAds(dataf,
                                              StrVector(self._search_terms),
                                              StrVector(self._sites))
        # ids must be read after cleaning: cleaning can drop rows, and the
        # predictions below are made for the remaining rows only
        ids = dataf.rx2('id')
        dataf = self._R_functions.createJoinDTM(dataf, self._language.lower())
        dataf = self._R_functions.prepNewAds(self._RFmodel, dataf)

        # classify ads
        pred = self._R_functions.RFpred(self._RFmodel, dataf)

        # combine predictions with ids; factor codes 1/2 become 0/1
        results = [JobAd.create({"id": ids[i],
                                 "recommendation": int(pred[i]) - 1})
                   for i in range(len(ids))]
        return results

    def _determine_lang(self, title, description):
        """Tries to determine which language a job ad is using the textcat
        package.

        Only differentiates between Finnish and English; returns English if
        another language is recognized.

        Arguments
        ----------
        title : str
            Title of job ad.
        description : str
            Description of job ad.

        Returns
        ----------
        language : str
            Determined language of job ad.
        """
        textcat = self._loaded_packages["textcat"]

        def detect(text):
            # R representation of the result with the quotes stripped
            return textcat.textcat(text).r_repr().replace("\"", "")

        language_both = detect(" ".join([title, description]))
        language_title = detect(title)
        language_descrip = detect(description)

        supported = ("english", "finnish")
        # English job titles with Finnish text is sometimes mistaken
        # as danish, frisian or middle_frisian
        false_finnish = ["danish", "frisian", "middle_frisian"]

        # combined text has priority, then title, then description
        candidates = (language_both, language_title, language_descrip)
        for language in candidates:
            if language in supported:
                return language[0].upper() + language[1:]

        if any(language in false_finnish for language in candidates):
            return "Finnish"
        return "English"

    def det_lang_ads(self, job_ads):
        """Attempts to determine language of job ads.

        Arguments
        ----------
        job_ads : list[:class:`JobAd`]
            List of :class:`JobAd` instances. Each instance should have id,
            title and description defined.

        Returns
        ----------
        results : list[dict]
            One dictionary per job ad with the keys "id" and "language".
        """
        results = []
        for ad in job_ads:
            language = self._determine_lang(ad["title"], ad["description"])
            results.append({"id": ad["id"], "language": language})
        return results