def pre_process_text(self):
    # Create the pre-processed training and test sets by lemmatizing
    # each description sentence. (No validation set is built here.)
    self.trainset_pre_processed = [
        process(sentence).lemmatize
        for sentence in self.trainset["Description"].values
    ]
    self.testset_pre_processed = [
        process(sentence).lemmatize
        for sentence in self.testset["Description"].values
    ]
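# A minimal sketch of the process() helper assumed above, built on NLTK.
# The ProcessedSentence wrapper is hypothetical; it assumes .lemmatize is
# a property returning lemmatized tokens (plain attribute access, not a
# call), which is what the attribute-style usage above suggests.
import nltk
from nltk.stem import WordNetLemmatizer

class ProcessedSentence:
    def __init__(self, sentence):
        self._tokens = nltk.word_tokenize(sentence.lower())

    @property
    def lemmatize(self):
        lemmatizer = WordNetLemmatizer()
        return [lemmatizer.lemmatize(token) for token in self._tokens]

def process(sentence):
    return ProcessedSentence(sentence)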
def run(self):
    # Read raw documents and split them into title and abstract parts.
    raw_documents = self.reader.read()
    title_docs, abstract_docs = self.data_manager.parse_documents(
        raw_documents)
    # Pre-process titles and abstracts separately, then merge them back
    # into single Document objects.
    title_doc_objs = pre_process.process(title_docs, self.pre_config,
                                         constants.SENTENCE_TYPE_TITLE)
    abs_doc_objs = pre_process.process(abstract_docs, self.pre_config,
                                       constants.SENTENCE_TYPE_ABSTRACT)
    doc_objects = self.data_manager.merge_documents(
        title_doc_objs, abs_doc_objs)
    # Run named-entity recognition and write the results out.
    dict_nern = ner.process(doc_objects, self.nern_config)
    self.writer.write(self.output_file, raw_documents, dict_nern)
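# Hedged wiring sketch for the run() pipeline above. Every class and
# argument name below is hypothetical; the real project's constructor
# and configuration objects may differ.
pipeline = NerPipeline(
    reader=BioCreativeReader("cdr_train.txt"),  # hypothetical input file
    writer=BioCreativeWriter(),                 # hypothetical writer
    data_manager=DataManager(),                 # hypothetical manager
    pre_config=pre_config,
    nern_config=nern_config,
    output_file="cdr_train_ner.txt",            # hypothetical output path
)
pipeline.run()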
def get_sents():
    sents = get_sent_dict()
    all_sents = []
    print("Creating Document objects...\n")
    documents = pre_process.process(sents, pre_config,
                                    constants.SENTENCE_TYPE_ABSTRACT)
    for doc in documents:
        for sentence in doc.sentences:
            token_list = [t.content for t in sentence.tokens]
            all_sents.append(token_list)
    print("Get all sentences complete.\n")
    return all_sents
import os

def get_sents():
    all_sents = []  # create list of tokens in sentences
    for dataset in datasets:
        print('Process dataset: ' + dataset)
        reader = BioCreativeReader(
            os.path.join(input_path, "cdr_" + dataset + ".txt"))
        raw_documents = reader.read()
        title_docs, abstract_docs = data_manager.parse_documents(raw_documents)
        title_doc_objs = pre_process.process(title_docs, pre_config,
                                             constants.SENTENCE_TYPE_TITLE)
        abs_doc_objs = pre_process.process(abstract_docs, pre_config,
                                           constants.SENTENCE_TYPE_ABSTRACT)
        documents = data_manager.merge_documents(title_doc_objs, abs_doc_objs)
        for doc in documents:
            for sentence in doc.sentences:
                token_list = [t.content for t in sentence.tokens]
                all_sents.append(token_list)
    return all_sents
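# Token lists like the ones both get_sents() variants return are commonly
# fed to a word-embedding trainer. A minimal sketch using gensim (>= 4.0);
# the hyperparameters and output path are illustrative assumptions, not
# values from the original project.
from gensim.models import Word2Vec

sentences = get_sents()
model = Word2Vec(sentences=sentences, vector_size=100, window=5,
                 min_count=1, workers=4)
model.save("word2vec_cdr.model")  # hypothetical output path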
def runHazus():
    # Run the Hazus pre-processing script with input gathered from the
    # user via the GUI.
    entries = []
    entries.extend(root.fields.values())
    haz = pre_process.process(root.filename, entries)
    print('Pre-Process RUN', haz, entries)
    # haz[0] is the success flag; the remaining entries hold record and
    # DDF counts that are reported back to the user.
    if haz[0]:
        popupmsg(str(haz[1][0]) + ' records successfully processed of '
                 + str(haz[1][1]) + ' records total.\n'
                 + str(haz[2][1]) + ' Building DDFs assigned.\n'
                 + str(haz[2][2]) + ' Content DDFs assigned.\n'
                 + str(haz[2][3]) + ' Inventory DDFs assigned.\n'
                 + str(haz[4][1]) + ' Building DDFs checked and '
                 + str(haz[3][1]) + ' found valid.\n'
                 + str(haz[4][2]) + ' Content DDFs checked and '
                 + str(haz[3][2]) + ' found valid.\n'
                 + str(haz[4][3]) + ' Inventory DDFs checked and '
                 + str(haz[3][3]) + ' found valid.\n'
                 + 'File saved to: ' + root.filename)
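# A minimal sketch of the popupmsg() helper assumed above, using
# tkinter's standard messagebox; the original GUI may implement this
# dialog differently.
import tkinter as tk
from tkinter import messagebox

def popupmsg(msg):
    messagebox.showinfo("Hazus Pre-Process", msg)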
import time
import numpy as np

def make_vocabulary(self, train_x):
    # Build a per-document vocabulary mapping each unique word to its
    # count within that document.
    for document_index, words_in_document in enumerate(train_x):
        words_in_document = np.asarray(process(words_in_document))
        unique_words, count_of_words = np.unique(words_in_document,
                                                 return_counts=True)
        self.vocabulary[document_index] = {}
        for i, word in enumerate(unique_words):
            self.vocabulary[document_index][word] = count_of_words[i]

# The original snippet trailed off into the tail of a separate evaluation
# routine (max_group, correct, start_time, and test_y are not defined in
# make_vocabulary). A hedged reconstruction of that routine, with the
# surrounding loop and the predict() call assumed:
def evaluate(self, test_x, test_y):
    start_time = time.time()
    correct = 0
    for x, y in zip(test_x, test_y):
        max_group = self.predict(x)  # assumed classifier method
        if max_group == y:
            correct += 1
    # Timekeeping
    timed = int(time.time() - start_time)
    print("Evaluation finished in ", timed, "seconds.")
    # Accuracy
    accuracy = (correct / len(test_y)) * 100
    print("Accuracy:", accuracy)
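# The counting idiom used in make_vocabulary, shown standalone:
# np.unique with return_counts=True yields each distinct token (sorted)
# and its frequency, which zip together into a count dictionary.
import numpy as np

tokens = np.asarray(["the", "cat", "sat", "on", "the", "mat"])
unique_words, counts = np.unique(tokens, return_counts=True)
print(dict(zip(unique_words, counts)))
# {'cat': 1, 'mat': 1, 'on': 1, 'sat': 1, 'the': 2}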
options = {
    # ... earlier menu entries elided in the original snippet ...
    7: ridge,
    8: stacking_linear,
    9: stacking_averaging,
    10: adaBoost,
    11: gradBoost,
}

print("Movie Lens Dataset")
print("Operations")
print("0. View Users Data")
print("1. View Movies Data")
print("2. View Ratings Data")
print("3. View Preprocessed Data")
print("4. View Correlation between different columns")
print("5. Create Train and Test Dataset")
print("6. Random Forest Regression for Movies")
print("7. Ridge Regression for Movie Ratings")
print("--------------------------------------")
print(" Ensembling Techniques")
print("8. Linear Stacking For Movie Ratings")
print("9. Average Stacking for Movie Ratings")
print("10. AdaBoosting for Movie Ratings")
print("11. Gradient Boosting For Movie Ratings")
print("Enter your choice of operations:")
choice = int(input())  # cast needed on Python 3, where input() returns str
if choice >= 3:
    data_frame1 = pre_process.process()
options[choice]()
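# A hypothetical sketch of one menu handler, ridge(), using scikit-learn.
# The real project's implementation is unknown; the 'rating' column name
# and reliance on the global data_frame1 are assumptions for illustration.
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

def ridge():
    # Assumes data_frame1 holds preprocessed numeric features plus a
    # 'rating' target column.
    X = data_frame1.drop(columns=["rating"])
    y = data_frame1["rating"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)
    model = Ridge(alpha=1.0)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    print("Ridge MSE:", mean_squared_error(y_test, preds))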