import os

import joblib
import pandas as pd
from sklearn.svm import SVC

import azure_helper  # project-local wrapper around Azure Blob Storage


def ask_for_labels(folder, filename):
    # Fetch the raw CSV, label part of it by hand, then propagate
    # those labels automatically over the remaining rows.
    df = get_csv(folder, filename)
    df, index = hand_label(df)
    df = auto_label(df, index)
    # splitext is safer than split(".") for names containing extra dots
    name, ext = os.path.splitext(filename)
    filename = name + "_with_labels" + ext
    write_csv(df, folder, filename)
    azure_helper.upload_to_blob(filename, folder + "/" + filename)
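# `azure_helper` is not shown in this snippet. Below is a minimal sketch of
# what it might contain, assuming the `azure-storage-blob` SDK, a single
# container, and a connection string in an environment variable -- the
# container name and env var are assumptions, not the project's actual
# configuration:
#
#     # azure_helper.py (hypothetical)
#     import os
#     from azure.storage.blob import BlobServiceClient
#
#     CONTAINER = "labeling-data"  # assumed container name
#
#     def _container():
#         service = BlobServiceClient.from_connection_string(
#             os.environ["AZURE_STORAGE_CONNECTION_STRING"])
#         return service.get_container_client(CONTAINER)
#
#     def upload_to_blob(local_path, blob_name):
#         # overwrite=True so re-runs replace the previous blob
#         with open(local_path, "rb") as f:
#             _container().upload_blob(blob_name, f, overwrite=True)
#
#     def download_from_blob(blob_name, local_path):
#         with open(local_path, "wb") as f:
#             f.write(_container().download_blob(blob_name).readall())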
def generate_predictions():
    data = load_data_from_blob("comments.csv")  # e.g. pd.read_csv("wikipedia/comments.csv")
    features = generate_features(data)
    features = feature_size_pad(features)
    # Pull the trained classifier and label encoder down from blob storage.
    model_file = "clf.joblib"
    azure_helper.download_from_blob(model_file, model_file)
    label_encoder_file = "label_encoder.joblib"
    azure_helper.download_from_blob(label_encoder_file, label_encoder_file)
    clf = joblib.load(model_file)
    labels = clf.predict(features)
    # Map the integer predictions back to their original text labels.
    label_encoder = joblib.load(label_encoder_file)
    text_labels = label_encoder.inverse_transform(labels)
    data["labels"] = text_labels
    data.to_csv("predicted_labels.csv", index=False)
    azure_helper.upload_to_blob("predicted_labels.csv", "predicted_labels.csv")
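# `load_data_from_blob` isn't defined above. Judging from the inline note
# (pd.read_csv on the downloaded file), a minimal sketch might look like
# this -- the choice of local path is an assumption:
def load_data_from_blob(blob_name):
    # Download the blob next to the script, then read it with pandas.
    azure_helper.download_from_blob(blob_name, blob_name)
    return pd.read_csv(blob_name)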
def generate_clf(csv):
    df = load_data_from_blob(csv)
    # Turn the raw comments into word2vec feature vectors.
    features = do_word2vec(df["comments"])
    features = feature_post_processing(features)
    labels, encoder = transform_labels(df["labels"])
    clf = SVC(class_weight="balanced", tol=1e-5, gamma="scale",
              kernel="sigmoid", random_state=42, C=0.8)
    clf.fit(features, labels)
    # Persist both the model and the encoder, so predictions can be
    # decoded back to text labels later.
    model_file_name = "clf.joblib"
    joblib.dump(clf, model_file_name)
    encoder_file_name = "label_encoder.joblib"
    joblib.dump(encoder, encoder_file_name)
    azure_helper.upload_to_blob(model_file_name, model_file_name)
    azure_helper.upload_to_blob(encoder_file_name, encoder_file_name)
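# `transform_labels` isn't shown either. Since generate_predictions() calls
# inverse_transform on the saved encoder, it is plausibly a thin wrapper
# around scikit-learn's LabelEncoder -- a sketch, not the confirmed
# implementation:
from sklearn.preprocessing import LabelEncoder

def transform_labels(label_series):
    # Encode text labels as integers and return the fitted encoder so the
    # mapping can be reversed after prediction.
    encoder = LabelEncoder()
    labels = encoder.fit_transform(label_series)
    return labels, encoder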
def _save_data(df, file_name):
    # Merge with any rows already saved locally before writing.
    # DataFrame.append was removed in pandas 2.0, so use pd.concat.
    if os.path.exists(file_name):
        tmp = pd.read_csv(file_name)
        df = pd.concat([df, tmp], ignore_index=True)
    df.to_csv(file_name, index=False)
    azure_helper.upload_to_blob(file_name, file_name)
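# Example end-to-end run; the folder/file names here are illustrative only:
if __name__ == "__main__":
    ask_for_labels("wikipedia", "comments.csv")         # writes comments_with_labels.csv
    generate_clf("wikipedia/comments_with_labels.csv")  # trains and uploads clf.joblib
    generate_predictions()                              # labels comments.csv, uploads predictions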