def generate_sentences(data_files):
    """
    Generates SentenceRecord objects using the given datafiles.

    Each report is tokenised into sentences with the NLTK punkt tokenizer and
    every sentence is preprocessed (negations are kept). The report id and the
    report class are stored on each record.

    Note: Currently report class is just the index of the datafile the report
    was extracted from; a global mapping would be better.

    :param data_files: List of paths to the raw csv datafiles, each in the
                       format 'report_id, report' with a single header line.
    :return: List of SentenceRecord objects extracted from the files, in file
             order, then row order, then sentence order.
    """
    list_sentence_record = []
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    for class_index, file_name in enumerate(data_files):
        # Binary mode is what the Python 2 csv module expects for its input.
        with open(file_name, 'rb') as csv_file:
            csv_file.readline()  # skip header line
            reader = csv.reader(csv_file)
            for report in reader:
                # csv.reader yields an empty list for a blank line (e.g. a
                # trailing newline at the end of the file); indexing it would
                # raise IndexError and abort the whole extraction.
                if not report:
                    continue

                report_id = report[0]
                report_text = report[1]
                for sentence in tokenizer.tokenize(report_text):
                    record = SentenceRecord(sentence)
                    record.processed_sentence = " ".join(
                        preprocess.textPreprocess(
                            sentence, removeNegationsFromSentences=False))
                    record.report_id = report_id
                    record.report_class = class_index
                    list_sentence_record.append(record)

    return list_sentence_record
def search(model, numResults, searchTerm):
    """
    Finds the reports most similar to a free-text search term.

    The search term is preprocessed, converted into the representation used by
    the chosen model, and queried against the matching pre-built index stored
    under ./model_files/.

    :param model: Name of the model to query with. One of "bow", "tfidf",
                  "lsi", "lda", "doc2vec" or "rnn".
    :param numResults: Maximum number of similar reports to ask for.
    :param searchTerm: Raw search string entered by the user.
    :return: The result of the selected similarity query; an empty list if the
             search term is empty after preprocessing; 0 if the model name is
             not recognised (kept for backward compatibility with callers).
    """
    origSearchTerm = searchTerm
    searchTerm = preprocess.textPreprocess(searchTerm)
    # NOTE: expansion via preprocess.getDerivations(searchTerm) is currently
    # disabled.
    if not searchTerm:
        return []

    if model in ("bow", "tfidf", "lsi", "lda"):
        # Only these models work on a bag-of-words vector, so the dictionary
        # is loaded here rather than unconditionally on every call.
        dictionary = gensim.corpora.Dictionary.load('./model_files/reports.dict')
        searchTerm_bow = dictionary.doc2bow(searchTerm)

        if model == "bow":
            index = gensim.similarities.SparseMatrixSimilarity.load(
                './model_files/reports.index')
            index.num_best = numResults
            similarReports = index[searchTerm_bow]
        elif model == "tfidf":
            tfidf_model = gensim.models.TfidfModel.load(
                './model_files/reports.tfidf_model')
            tfidf_index = gensim.similarities.SparseMatrixSimilarity.load(
                './model_files/reports_tfidf.index')
            tfidf_index.num_best = numResults
            searchTerm_tfidf = tfidf_model[searchTerm_bow]
            similarReports = tfidf_index[searchTerm_tfidf]
        elif model == "lsi":
            tfidf_model = gensim.models.TfidfModel.load(
                './model_files/reports.tfidf_model')
            lsi_model = gensim.models.LsiModel.load(
                './model_files/reports.lsi_model')
            lsi_index = gensim.similarities.MatrixSimilarity.load(
                './model_files/reports_lsi.index')
            lsi_index.num_best = numResults
            # The LSI query is built from the tf-idf weighted vector.
            searchTerm_tfidf = tfidf_model[searchTerm_bow]
            searchTerm_lsi = lsi_model[searchTerm_tfidf]
            similarReports = lsi_index[searchTerm_lsi]
        else:  # model == "lda"
            lda_model = gensim.models.LdaModel.load(
                './model_files/reports.lda_model')
            lda_index = gensim.similarities.MatrixSimilarity.load(
                './model_files/reports_lda.index')
            lda_index.num_best = numResults
            searchTerm_lda = lda_model[searchTerm_bow]
            similarReports = lda_index[searchTerm_lda]
    elif model == "doc2vec":
        # Use a separate local so the `model` argument is not overwritten.
        doc2vec_model = gensim.models.Doc2Vec.load(
            "./model_files/reports.doc2vec_model")
        searchTerm_docvec = doc2vec_model.infer_vector(searchTerm)
        similarReports = doc2vec_model.docvecs.most_similar(
            [searchTerm_docvec], topn=numResults)
    elif model == "rnn":
        # The rnn helper receives the original, unprocessed search string.
        searchTerm_rnn = rnn.getReportSearchTerm(origSearchTerm)
        similarReports = rnn.most_similar_reports(searchTerm_rnn, topn=numResults)
    else:
        return 0

    return similarReports
def input_game():
    """
    Interactive game: reads one 'report sentence' from stdin and prints
    whether the classifier considers it diagnostic.

    The raw input is echoed, then the preprocessed sentence and the predicted
    class probabilities are printed, followed by the verdict. Nothing further
    is printed if the input is blank or is empty after preprocessing.

    :return: None
    """
    print("Write a sentence to see if the system thinks it's diagnostic:")
    sys.stdout.write("> ")
    raw_line = sys.stdin.readline()
    print(raw_line)

    sentence = raw_line.rstrip()
    if not sentence:
        return

    tokens = preprocess.textPreprocess(sentence, removeNegationsFromSentences=False)
    processed_sentence = " ".join(tokens)
    if processed_sentence == "":
        return

    print(processed_sentence)
    probs = pipe.predict_proba([processed_sentence])[0]
    print(probs)

    # Index 0 is reported as the negative class, index 1 as the positive one.
    if probs[0] > probs[1]:
        verdict = "[Negative] That output is not diagnostic with a confidence of " + str(probs[0])
    else:
        verdict = "[Positive] That output is diagnostic with a confidence of " + str(probs[1])
    print(verdict)