def log(wavs_test, texts_test, texts_pred, log_folder):
    """Write the WER score and the recognition results into ``log_folder``.

    Three artefacts are produced:

    * ``result.json`` -- the mean word error rate, rounded to 4 decimals;
    * ``wav/`` -- a freshly created folder holding a copy of every test
      recording (any previous ``wav`` folder is removed first);
    * ``speechrecognition.json`` -- the reference texts, the predicted texts
      and the paths of the copied recordings relative to ``log_folder``.

    :param wavs_test: paths of the test audio files
    :param texts_test: reference transcripts; split on whitespace before
        being scored
    :param texts_pred: predicted transcripts, paired with ``texts_test`` by
        position
    :param log_folder: directory that receives the report files
    """
    wer = np.mean([
        calculate_wer(test.split(), pred.split())
        for test, pred in zip(texts_test, texts_pred)
    ])
    wer = np.round(wer, 4)
    result = {"WER": wer}
    content = json.dumps(result, ensure_ascii=False)
    log_file = join(log_folder, "result.json")
    write(log_file, content)

    # Start from an empty "wav" folder. Removal stays best effort (the folder
    # is usually missing on a first run), but unlike a bare ``except`` this
    # does not swallow KeyboardInterrupt or SystemExit.
    wav_folder = join(log_folder, "wav")
    shutil.rmtree(wav_folder, ignore_errors=True)
    os.mkdir(wav_folder)

    # Copy each recording and remember its path relative to ``log_folder``
    # in a single pass over ``wavs_test``.
    wavs_test_new_path = []
    for wav in wavs_test:
        name = basename(wav)
        shutil.copyfile(wav, join(wav_folder, name))
        wavs_test_new_path.append(join("wav", name))

    speech_recognition = {
        "texts_test": texts_test,
        "texts_pred": texts_pred,
        "wavs_test": wavs_test_new_path,
    }
    content = json.dumps(speech_recognition, ensure_ascii=False)
    log_file = join(log_folder, "speechrecognition.json")
    write(log_file, content)

    print("Result is written in {}".format(log_file))
    print("WER: {}%".format(wer * 100))
def convert_data(raw_folder, corpus_folder):
    """Re-write every file of ``raw_folder`` into ``corpus_folder``.

    ``raw_folder`` is expected to contain one sub-folder per topic. For each
    topic a folder of the same name is created under ``corpus_folder``, and
    every file of the topic is read with ``read_utf16`` and written back
    under the same file name with ``write``.

    :param raw_folder: directory holding the topic folders to read
    :param corpus_folder: directory that receives the converted topic folders
    """
    print(raw_folder)
    for topic in listdir(raw_folder):
        print(topic)
        target_dir = join(corpus_folder, topic)
        mkdir(target_dir)
        source_dir = join(raw_folder, topic)
        for name in listdir(source_dir):
            source_path = join(raw_folder, topic, name)
            target_path = join(corpus_folder, topic, name)
            text = read_utf16(source_path)
            write(target_path, text)
def save(self, folder):
    """Save the corpus to files, one file per document.

    Each document in ``self.documents`` is written to
    ``<folder>/<document.id>`` with its sentences joined by newlines.

    :param str folder: path to the output directory; ``mkdir`` is attempted
        on it before any document is written
    """
    try:
        mkdir(folder)
    except OSError:
        # Best effort: the directory normally exists already. Only
        # filesystem errors are ignored here, so unrelated bugs are not
        # hidden; a directory that is really unusable fails in the first
        # write below.
        pass
    for document in self.documents:
        filename = join(folder, document.id)
        content = u"\n".join(document.sentences)
        write(filename, content)
def save(self, folder, format):
    """Save the word-segmentation corpus to files, one file per document.

    Each document in ``self.documents`` is written to
    ``<folder>/<document.id>`` with its sentences joined by newlines.

    :param str folder: path to the output directory; ``mkdir`` is attempted
        on it before any document is written
    :param str format: either TEXT or COLUMN. The value is currently not
        read by this method: the output is the same for every format.
    """
    try:
        mkdir(folder)
    except OSError:
        # Best effort: the directory normally exists already. Only
        # filesystem errors are ignored here, so unrelated bugs are not
        # hidden; a directory that is really unusable fails in the first
        # write below.
        pass
    for document in self.documents:
        filename = join(folder, document.id)
        content = u"\n".join(document.sentences)
        write(filename, content)
def fit(self, X, y, model_filename=None):
    """Fit FastText according to X, y

    The samples are dumped to a temporary training file, one
    ``__label__<label> , <text>`` line per sample, which is handed to
    ``ft.supervised`` and removed afterwards. The trained model is stored in
    ``self.estimator``.

    Parameters:
    ----------
    X : list of text
        each item is a text; newlines inside a text are replaced by spaces
    y : list
        each item is either a label (in multi class problem) or list of
        labels (in multi label problem); spaces inside a label are replaced
        by ``-``
    model_filename : str, optional
        when given, passed to ``ft.supervised`` as its second argument
    """
    train_file = "temp.train"
    X = [x.replace("\n", " ") for x in X]
    # NOTE(review): only the first element of each item is kept. For a list
    # of labels that is the first label; for a plain string label it is the
    # first character -- confirm callers always pass lists.
    y = [item[0] for item in y]
    y = [label.replace(" ", "-") for label in y]
    lines = ["__label__{} , {}".format(label, text)
             for text, label in zip(X, y)]
    content = "\n".join(lines)
    write(train_file, content)
    try:
        if model_filename:
            self.estimator = ft.supervised(train_file, model_filename)
        else:
            self.estimator = ft.supervised(train_file)
    finally:
        # Remove the training file even when training fails, so a crashed
        # run does not leave it behind in the working directory.
        os.remove(train_file)
def save_temp(id, output):
    """Write ``output`` to ``<samples_dir>/<id>.correct``.

    Every item of ``output`` becomes one line of the file, with the
    elements of the item separated by tabs.

    :param id: identifier used to build the file name
    :param output: iterable of string sequences, one per output line
    """
    target = join(samples_dir, "%s.correct" % id)
    rows = [u"\t".join(fields) for fields in output]
    write(target, u"\n".join(rows))
text = "\n".join(lines) return text def extract_sentence(content): return "# " + " ".join( [token.split("\t")[0] for token in content.split("\n")]) if __name__ == '__main__': test_dir = join(dirname(__file__), "test_set") files = [f for f in listdir(test_dir) if isfile(join(test_dir, f))] model_id = "1" try: shutil.rmtree(join(test_dir, model_id)) except: pass mkdir(join(test_dir, model_id)) for f in files: input = load_input(join(test_dir, f)) output = chunk(input) actual = "\n".join(["\t".join(tokens) for tokens in chunk(input)]) expected = load_output(join(test_dir, f)) if actual != expected: print("\n{}".format(f)) diff = '\n'.join(ndiff(expected.splitlines(), actual.splitlines())) write(join(test_dir, model_id, f), "\n".join([extract_sentence(actual), actual])) write(join(test_dir, model_id, f + ".diff"), "\n".join([extract_sentence(actual), diff]))
import json
from os.path import join

import requests
from underthesea.feature_engineering.text import Text
from underthesea.util.file_io import write

# Fetch the corpora listing from the local API as JSON and store it,
# re-serialized without ASCII escaping, in data/20171017.json.
url = "http://localhost:8000/api/corpora/"
headers = {
    'Content-type': 'application/json',
    'Accept': 'application/json'}
response = requests.get(url, headers=headers)
payload = response.json()
serialized = json.dumps(payload, ensure_ascii=False)
content = Text(serialized)
output_path = join("data", "20171017.json")
write(output_path, content)
TP += 1 else: FN += 1 else: if label in y_pred[i]: FP += 1 else: TN += 1 score[label] = { "TP": TP, "FP": FP, "TN": TN, "FN": FN, "accuracy": accuracy_score(TP, FP, TN, FN), "precision": precision_score(TP, FP, TN, FN), "recall": recall_score(TP, FP, TN, FN), "f1": f1_score(TP, FP, TN, FN), } df = pd.DataFrame.from_dict(score) df.T.to_excel( "inspect/score.xlsx", columns=["TP", "TN", "FP", "FN", "accuracy", "precision", "recall", "f1"]) # generate result result = {"X_test": X_test, "y_test": y_test, "y_pred": y_pred, "score": score} print(score) content = json.dumps(result, ensure_ascii=False) write("inspect/result.json", content)
def save_temp(id, output):
    """Write ``output`` to ``samples/accuracy/<id>.tmp`` next to this module.

    The items of ``output`` are joined with newlines, one item per line.

    :param id: identifier used to build the file name
    :param output: iterable of strings, one per output line
    """
    module_dir = dirname(__file__)
    target = join(join(module_dir, "samples", "accuracy"), "%s.tmp" % id)
    lines = u"\n".join(output)
    write(target, lines)