def __init__(self, models_folder, essays_folder, spell_check_dict):
    """Load config, essays, spelling corrector, feature extractors and pickled models.

    models_folder    -- directory holding the pickled transformer and classifiers
    essays_folder    -- directory holding the bratt-annotated essays and config
    spell_check_dict -- folder passed to the spelling-corrector builder
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    # Normalize both directory paths so later string concatenation is safe.
    if not models_folder.endswith("/"):
        models_folder += "/"
    if not essays_folder.endswith("/"):
        essays_folder += "/"

    self.logger = logging.getLogger()

    config = get_config(essays_folder)
    self.config = config
    self.essays_folder = essays_folder

    # Create spell checker.
    # Need annotations here purely to load the tags.
    annotated_essays = load_bratt_essays(
        essays_folder,
        include_vague=config["include_vague"],
        include_normal=config["include_normal"],
        load_annotations=True)
    self.__set_tags_(annotated_essays)

    self.wd_sent_freq = defaultdict(int)
    self.spelling_corrector = build_spelling_corrector(
        annotated_essays, self.config["lower_case"], self.wd_sent_freq,
        folder=spell_check_dict)

    # Has to be an int as it is used in slices; under Python 3.x plain
    # division would produce a float.
    window_offset = int((self.config["window_size"] - 1) / 2)

    unigram_extractor = fact_extract_positional_word_features_stemmed(window_offset)
    bigram_extractor = fact_extract_ngram_features_stemmed(window_offset, 2)
    feature_extractors = [unigram_extractor, bigram_extractor]

    # Most params below exist ONLY for the purposes of the hashing to and
    # from disk.
    self.feature_extractor = FeatureExtractorTransformer(feature_extractors)

    # Load the pickled models from disk.
    self.logger.info("Loading pickled models")
    store = ModelStore(models_folder=models_folder)

    self.feature_transformer = store.get_transformer()
    self.logger.info("Loaded Transformer")
    self.tag_2_wd_classifier = store.get_tag_2_wd_classifier()
    self.logger.info("Loaded word tagging model")
    self.tag_2_sent_classifier = store.get_tag_2_sent_classifier()
    self.logger.info("Loaded sentence classifier")
def __init__(self, models_folder, essays_folder, spell_check_dict):
    """Initialize the tagger: config, essays, spell checker, extractors, models.

    models_folder    -- directory containing the pickled models
    essays_folder    -- directory containing the bratt essays and config
    spell_check_dict -- folder used when building the spelling corrector
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    # Ensure a trailing slash on both folders before any path concatenation.
    if not models_folder.endswith("/"):
        models_folder += "/"
    if not essays_folder.endswith("/"):
        essays_folder += "/"

    self.logger = logging.getLogger()

    cfg = get_config(essays_folder)
    self.config = cfg
    self.essays_folder = essays_folder

    # Create spell checker.
    # Need annotations here purely to load the tags.
    essays_with_tags = load_bratt_essays(
        essays_folder,
        include_vague=cfg["include_vague"],
        include_normal=cfg["include_normal"],
        load_annotations=True)
    self.__set_tags_(essays_with_tags)

    self.wd_sent_freq = defaultdict(int)
    self.spelling_corrector = build_spelling_corrector(
        essays_with_tags,
        self.config["lower_case"],
        self.wd_sent_freq,
        folder=spell_check_dict)

    # Cast to int: the value is used in slices, and Python 3 division
    # would otherwise yield a float.
    half_window = int((self.config["window_size"] - 1) / 2)

    word_extractor = fact_extract_positional_word_features_stemmed(half_window)
    ngram_extractor = fact_extract_ngram_features_stemmed(half_window, 2)

    # Most params below exist ONLY for the purposes of the hashing to and
    # from disk.
    self.feature_extractor = FeatureExtractorTransformer(
        [word_extractor, ngram_extractor])

    # Load models.
    self.logger.info("Loading pickled models")
    model_store = ModelStore(models_folder=models_folder)

    self.feature_transformer = model_store.get_transformer()
    self.logger.info("Loaded Transformer")
    self.tag_2_wd_classifier = model_store.get_tag_2_wd_classifier()
    self.logger.info("Loaded word tagging model")
    self.tag_2_sent_classifier = model_store.get_tag_2_sent_classifier()
    self.logger.info("Loaded sentence classifier")
# # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. import json import model_store from model_store import ModelStore _MODEL_STORE = ModelStore(model_store.PRODUCTION) _PETAL_WIDTH = 'petal_width' def predict(input_json): try: input_dict = json.loads(input_json) model, version = _MODEL_STORE.load_latest_model() result = str( model.predict_proba([[float(input_dict[_PETAL_WIDTH])]])[0][1]) return json.dumps({"result": result, "version": version}) except IndexError: return 'Failure: the model is not ready yet' except Exception as e:
# not hashed as don't affect persistence of feature processing
SPARSE_WD_FEATS = True
SPARSE_SENT_FEATS = True

MIN_FEAT_FREQ = 5  # 5 best so far
CV_FOLDS = 5

MIN_TAG_FREQ = 5
LOOK_BACK = 0  # how many sentences to look back when predicting tags
# end not hashed

# construct unique key using settings for pickling
settings = Settings.Settings()
model_store = ModelStore()

folder = settings.data_directory + "SkinCancer/Merged/"
config = get_config(folder)

""" FEATURE EXTRACTION """
# BUG FIX: cast to int — under Python 3, "/" yields a float, but the offset
# is used in slice indices by the extractors (the other scripts in this
# project apply the same int() cast for exactly this reason).
offset = int((config["window_size"] - 1) / 2)

unigram_window_stemmed = fact_extract_positional_word_features_stemmed(offset)
biigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 2)
#pos_tag_window = fact_extract_positional_POS_features(offset)
#pos_tag_plus_wd_window = fact_extract_positional_POS_features_plus_word(offset)
#head_wd_window = fact_extract_positional_head_word_features(offset)
#pos_dep_vecs = fact_extract_positional_dependency_vectors(offset)
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import sys
import time

import model_store
import numpy as np
from model_store import ModelStore
from sklearn import datasets
from sklearn.linear_model import LogisticRegression

_CANDIDATE_MODEL_STORE = ModelStore(model_store.CANDIDATE)
_PRODUCTION_MODEL_STORE = ModelStore(model_store.PRODUCTION)


def train_model():
    """Train a logistic regression on iris petal width and save it.

    Fits a binary classifier (is the sample class 2?) on a single feature,
    stamps it with the current unix time as its version, and writes it to
    the candidate model store.
    """
    iris = datasets.load_iris()
    X = iris["data"][:, 3:]  # petal width
    # BUG FIX: np.int was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin int is the documented replacement and is equivalent here.
    y = (iris["target"] == 2).astype(int)

    model = LogisticRegression()
    model.fit(X, y)

    version = round(time.time())
    print(f'Saving model with version {version} to candidate model store.')
    _CANDIDATE_MODEL_STORE.save_model(model, version)
# not hashed as don't affect persistence of feature processing
SPARSE_WD_FEATS = True
SPARSE_SENT_FEATS = True

MIN_FEAT_FREQ = 5  # 5 best so far
CV_FOLDS = 5

MIN_TAG_FREQ = 5
LOOK_BACK = 0  # how many sentences to look back when predicting tags
# end not hashed

# construct unique key using settings for pickling
settings = Settings.Settings()
model_store = ModelStore()

folder = settings.data_directory + "CoralBleaching/BrattData/EBA1415_Merged/"
config = get_config(folder)

""" FEATURE EXTRACTION """
# int cast: the offset is used in slices, and / yields a float on Python 3.
offset = int((config["window_size"] - 1) / 2)

unigram_window_stemmed = fact_extract_positional_word_features_stemmed(offset)
biigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 2)

extractors = [unigram_window_stemmed, biigram_window_stemmed]
# Same mapping as config, plus the extractor list (later entry wins on clash).
feat_config = {**config, "extractors": extractors}

""" LOAD DATA """
tagged_essays = load_process_essays(**config)
biigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 2)

extractors = [unigram_window_stemmed, biigram_window_stemmed]
# BUG FIX: under Python 3, dict_items does not support "+", so
# dict(config.items() + [...]) raises TypeError.  Materialize a list first
# (same pattern as the other feature-extraction scripts in this project).
feat_config = dict(list(config.items()) + [("extractors", extractors)])

""" LOAD DATA """
tagged_essays = load_process_essays_without_annotations(**config)
logger.info("Essays loaded")

# most params below exist ONLY for the purposes of the hashing to and from disk
feature_extractor = FeatureExtractorTransformer(extractors)
essay_feats = feature_extractor.transform(tagged_essays)
logger.info("Features loaded")

""" LOAD MODELS """
model_store = ModelStore()
feature_transformer = model_store.get_transformer()
tag2word_classifier = model_store.get_tag_2_wd_classifier()
tag2sent_classifier = model_store.get_tag_2_sent_classifier()

""" DEFINE TAGS """
CAUSE_TAGS = ["Causer", "Result", "explicit"]
CAUSAL_REL_TAGS = [CAUSAL_REL, CAUSE_RESULT, RESULT_REL]# + ["explicit"]

""" works best with all the pair-wise causal relation codes """
# Regular tags are the numeric concept codes; keep the causal tags as well
# for word-level training.
all_tags = tag2sent_classifier.keys()
regular_tags = [t for t in all_tags if t[0].isdigit()]
wd_train_tags = regular_tags + CAUSE_TAGS
# not hashed as don't affect persistence of feature processing
SPARSE_WD_FEATS = True
SPARSE_SENT_FEATS = True

MIN_FEAT_FREQ = 5  # 5 best so far
CV_FOLDS = 5

MIN_TAG_FREQ = 5
LOOK_BACK = 0  # how many sentences to look back when predicting tags
# end not hashed

# construct unique key using settings for pickling
settings = Settings.Settings()
model_store = ModelStore()

""" PETER - CHANGE THESE FILE PATHS """
# Location where the training data is, use EBA_Pre and Post test essays preferably
folder = settings.data_directory + "CoralBleaching/BrattData/EBA1415_Merged/"
# Location where the new essays to tag are located
test_folder = settings.data_directory + "CoralBleaching/BrattData/Merged/"
# File to dump the predictions to
out_predictions_file = settings.data_directory + "CoralBleaching/Results/predictions.txt"

config = get_config(folder)

""" FEATURE EXTRACTION """
# BUG FIX: cast to int — under Python 3, "/" yields a float, but the offset
# is used in slice indices by the extractors (the companion scripts in this
# file apply the same int() cast).
offset = int((config["window_size"] - 1) / 2)

unigram_window_stemmed = fact_extract_positional_word_features_stemmed(offset)
biigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 2)
#pos_tag_window = fact_extract_positional_POS_features(offset)
#pos_tag_plus_wd_window = fact_extract_positional_POS_features_plus_word(offset)
#head_wd_window = fact_extract_positional_head_word_features(offset)