def get_data_loader(rp: str, data_type: str):
    """
    Reads the labels for the training data and converts them into tensors of
    (features, target) for the Neural Network using PyTorch.

    1. Loads the training classes, both coarse and fine, and creates a label for
       each row by concatenating "coarse_class :: fine_class".
    2. Converts labels_numpy to labels_bin - binarized form to be used in the NN.
    3. Loads the features into numpy arrays using the `get_ft_obj` function.
    4. Gets the number of features used.
    5. Converts labels_bin into a PyTorch tensor - labels.
    6. Converts x_ft (features - independent variables) into a PyTorch tensor - data.

    :argument:
        :param rp: Absolute path of the root directory of the project.
        :param data_type: String either `training` or `test`.
    :return:
        feat_size: Number of features being used, so that the input layer of the
                   Neural Network can be sized accordingly.
        data_loader: Loader object containing train data and labels used to train
                     the Neural Network.
    """
    labels_numpy = []
    crf, coarse = read_file("coarse_classes_{0}".format(data_type), rp)
    frf, fine = read_file("fine_classes_{0}".format(data_type), rp)
    c_lb = [remove_endline_char(c).strip() for c in coarse]
    f_lb = [remove_endline_char(f).strip() for f in fine]
    if not crf:
        print("Error in reading actual ({0}) coarse classes".format(data_type))
        exit(-11)
    if not frf:
        print("Error in reading actual ({0}) fine classes".format(data_type))
        exit(-11)
    label_len = len(f_lb)
    for i in range(0, label_len):
        labels_numpy.append(c_lb[i] + " :: " + f_lb[i])
    # The binarizer is fitted on the training data and read back from disk for the test data.
    mlb = MultiLabelBinarizer().fit(labels_numpy) if data_type == "training" \
        else read_obj("label_binarizer", rp + "/{0}".format(nn_model_str))[1]
    labels_bin = mlb.transform(labels_numpy)
    write_obj(mlb, "label_binarizer", rp + "/{0}".format(nn_model_str))
    print("- Labels loading into numpy done.")
    x_ft = get_ft_obj(data_type, rp, "{0}".format(nn_model_str), "coarse").toarray()
    feat_size = x_ft.shape[1]
    print("- Features loading into numpy done.")
    labels = torch.from_numpy(labels_bin)
    data = torch.from_numpy(x_ft).float()
    print("- Features and labels as tensors, done.")
    train_data = TensorDataset(data, labels)
    data_loader = DataLoader(train_data, batch_size=batch_size)
    print("- {0} loader done.".format(data_type))
    return feat_size, data_loader
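
# A minimal usage sketch of the loader above. `project_root`, the hidden-layer
# width of 64, and the `_example_` helper itself are illustrative assumptions,
# not something prescribed by this module.
def _example_build_net(project_root: str):
    import torch
    feat_size, loader = get_data_loader(project_root, "training")
    features, labels = next(iter(loader))  # peek at one batch to size the output layer
    return torch.nn.Sequential(
        torch.nn.Linear(feat_size, 64),
        torch.nn.ReLU(),
        torch.nn.Linear(64, labels.shape[1]),  # one output per binarized label
    )
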
def get_vect(data_type: str, rp: str, prop_type: str, ml_algo: str, cat_type: str, text_data):
    """
    This method takes the list of text data and fits the word vectorizer
    (CountVectorizer) over the list of text data.

    :argument:
        :param data_type: String either `training` or `test`.
        :param rp: Absolute path of the root directory of the project.
        :param prop_type: Natural language property either `word` (from spaCy)
                          or `ner` (from StanfordNER).
        :param ml_algo: Machine learning algorithm for which the dataprep is running.
        :param cat_type: Type of categorical class `coarse` or any of the 6 main classes.
                         (`abbr` | `desc` | `enty` | `hum` | `loc` | `num`)
        :param text_data: Data on which the CountVectorizer is fitted while training.
    :return:
        boolean_flag: True for successful operation.
        count_vec: CountVectorizer object.
    """
    # ----------------------------- Experimental code -----------------------------
    # Other word embedding techniques can also be tried out - e.g. GloVe.
    if data_type == "training":
        count_vec = CountVectorizer(ngram_range=(1, 2)).fit(text_data)
        wflag = write_obj(count_vec, "{0}_{1}_vec".format(cat_type, prop_type),
                          rp + "/{0}".format(ml_algo))
        return wflag, count_vec
    elif data_type == "test":
        rflag, count_vec = read_obj("{0}_{1}_vec".format(cat_type, prop_type),
                                    rp + "/{0}".format(ml_algo))
        return rflag, count_vec
    else:
        print("Error: Wrong `data_type` param to function `dataprep.text.get_vect`")
        # Keep the return arity consistent with the success branches.
        return False, None
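
# Usage sketch: fit the vectorizer on the training split, then reuse the pickled
# object for the test split. `rp`, `train_texts`, and `test_texts` are
# hypothetical placeholders introduced for illustration only.
def _example_vectorize(rp: str, train_texts, test_texts):
    ok, vec = get_vect("training", rp, "word", "svm", "coarse", train_texts)
    if not ok:
        return None, None
    x_train = vec.transform(train_texts)  # sparse document-term matrix
    ok, vec = get_vect("test", rp, "word", "svm", "coarse", None)  # `text_data` is unused at test time
    x_test = vec.transform(test_texts) if ok else None
    return x_train, x_test
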
def coarse_ann_computations(data_type: str, rp: str):
    """
    This method handles the process to compute text annotations using the spaCy
    lib, and does so for the test/train text data as per the arguments passed.

    :argument:
        :param data_type: String either `training` or `test`
        :param rp: Absolute path of the root directory of the project
    :return:
        boolean_flag: True for successful operation.
    """
    data = "training" if data_type == "training" else "test"
    doc_flag, doc_annot = com_annotations(data, rp)
    if doc_flag:
        doc_w_flag = write_obj(doc_annot, "coarse_{0}_doc".format(data), rp)
        if doc_w_flag:
            print("- Computing annotations for {0} data done.".format(data))
            return True
        else:
            print("\n- ERROR: While writing annotations for {0} data.".format(data))
            return False
    else:
        print("\n- ERROR: While computing annotations for {0} data.".format(data))
        return False
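
# Usage sketch: the annotation step is typically run once per data split before
# feature extraction. `rp` is a hypothetical project root path.
def _example_annotate_both_splits(rp: str) -> bool:
    return all(coarse_ann_computations(split, rp) for split in ("training", "test"))
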
def fine_prop_separation(data_type: str, rp: str, prop_type: str):
    """
    This method handles the process to generate separate files (data) for the
    fine class prediction models. Questions having the same coarse class
    (`abbr` | `desc` | `enty` | `hum` | `loc` | `num`) are combined together
    and stored in the respective files.

    :argument:
        :param data_type: String either `training` or `test`
        :param rp: Absolute path of the root directory of the project
        :param prop_type: Natural language property either `doc` (from spaCy)
                          or `ner` (from StanfordNER)
    :return:
        boolean_flag: True for successful operation.
    """
    data = "training" if data_type == "training" else "test"
    prop_flag, abbr_prop, desc_prop, enty_prop, hum_prop, loc_prop, num_prop = \
        sep_lang_prop(data, rp, prop_type)
    if prop_flag:
        wf_1 = write_obj(abbr_prop, "abbr_{0}_{1}".format(data, prop_type), rp)
        wf_2 = write_obj(desc_prop, "desc_{0}_{1}".format(data, prop_type), rp)
        wf_3 = write_obj(enty_prop, "enty_{0}_{1}".format(data, prop_type), rp)
        wf_4 = write_obj(hum_prop, "hum_{0}_{1}".format(data, prop_type), rp)
        wf_5 = write_obj(loc_prop, "loc_{0}_{1}".format(data, prop_type), rp)
        wf_6 = write_obj(num_prop, "num_{0}_{1}".format(data, prop_type), rp)
        if wf_1 and wf_2 and wf_3 and wf_4 and wf_5 and wf_6:
            print("- Separating {1} tags for {0} data done.".format(data, prop_type))
            return True
        else:
            print("\n- ERROR: While writing {1} tags for {0} data.".format(data, prop_type))
            return False
    else:
        print("\n- ERROR: While computing {1} tags for {0} data.".format(data, prop_type))
        return False
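
# Usage sketch: after separation, each coarse class has its own serialized file
# which can be read back with `read_obj`. The "hum_training_doc" name follows
# the "{cat}_{data}_{prop}" pattern used above; `rp` is a placeholder.
def _example_read_separated(rp: str):
    if not fine_prop_separation("training", rp, "doc"):
        return None
    r_flag, hum_docs = read_obj("hum_training_doc", rp)
    return hum_docs if r_flag else None
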
def train_one_node(rp: str, cat_type: str, ml_algo: str):
    """
    Gets data in the form of a sparse matrix from the `qc.dataprep.feature_stack`
    module, which is ready for use in a machine learning model. Using the data,
    trains a ml node and serializes the trained object to secondary memory (hard-disk).

    :argument:
        :param rp: Absolute path of the root directory of the project.
        :param cat_type: Type of categorical class `coarse` or any of the 6 main classes.
                         (`abbr` | `desc` | `enty` | `hum` | `loc` | `num`)
        :param ml_algo: The type of machine learning model to be used. (svm | lr | linear_svm)
    :return:
        boolean_flag: True for successful operation.
    """
    x_ft = get_ft_obj("training", rp, ml_algo, cat_type)
    labels = read_file("{0}_classes_training".format(cat_type), rp)[1]
    y_lb = [remove_endline_char(c).strip() for c in labels]
    machine = None
    # ----------------------------- Experimental code -----------------------------
    # 1. This is the part where you can experiment and play with the parameters.
    # 2. If you want to add more models or combinations, you just need to add an
    #    `elif` condition and provide the condition value in the argument from the
    #    shell, e.g. `train svm`; here `svm` will be in the variable {ml_algo}.
    if ml_algo == "svm":
        machine = svm.SVC()
    elif ml_algo == "linear_svm":
        machine = svm.LinearSVC()
    elif ml_algo == "lr":
        machine = linear_model.LogisticRegression(solver="newton-cg")
    else:
        print("- Error while training {0} model. {0} is an unexpected ML algorithm"
              .format(ml_algo))
        # Bail out here instead of calling `.fit` on None below.
        return False
    # Parameter tuning ends here.
    # ------------------------------------------------------------------------------
    model = machine.fit(x_ft, y_lb)
    mw_flag = write_obj(model, "{0}_model".format(cat_type), rp + "/{0}".format(ml_algo))
    if mw_flag:
        print("- Training done for {0} model of {1}".format(cat_type, ml_algo))
        return True
    else:
        print("- Error in writing trained {0} model of {1}".format(cat_type, ml_algo))
        return False
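
# Usage sketch: train the coarse node plus one fine node per coarse class with
# the same ML algorithm. The category list mirrors the classes named in the
# docstring above; `rp` and the default algorithm choice are placeholders.
def _example_train_all(rp: str, ml_algo: str = "linear_svm") -> bool:
    return all(
        train_one_node(rp, cat, ml_algo)
        for cat in ("coarse", "abbr", "desc", "enty", "hum", "loc", "num")
    )
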
def coarse_ner_computations(data_type: str, rp: str):
    """
    This method handles the process to compute NER tags using StanfordNER, and
    does so for the test/train text data as per the arguments passed.

    :argument:
        :param data_type: String either `training` or `test`
        :param rp: Absolute path of the root directory of the project
    :return:
        boolean_flag: True for successful operation.
    """
    data = "training" if data_type == "training" else "test"
    ner_flag, ner_tags = com_ner(data, rp)
    if ner_flag:
        ner_w_flag = write_obj(ner_tags, "coarse_{0}_ner".format(data), rp)
        if ner_w_flag:
            print("- Computing NER tags for {0} data done.".format(data))
            return True
        else:
            print("\n- ERROR: While writing NER tags for {0} data.".format(data))
            return False
    else:
        print("\n- ERROR: While computing NER tags for {0} data.".format(data))
        return False
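
# Usage sketch: the NER computation feeds the `ner` property separation above,
# so the two steps are usually chained per split. `rp` is a placeholder, and
# the chaining order is an assumption based on the data dependency, not an
# interface this module enforces.
def _example_ner_pipeline(rp: str, data_type: str = "training") -> bool:
    return coarse_ner_computations(data_type, rp) and \
        fine_prop_separation(data_type, rp, "ner")
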