Example #1
    def __init__(self, models_folder, essays_folder, spell_check_dict):

        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                            level=logging.INFO)
        if not models_folder.endswith("/"):
            models_folder += "/"
        if not essays_folder.endswith("/"):
            essays_folder += "/"

        self.logger = logging.getLogger()
        cfg = get_config(essays_folder)
        self.config = cfg
        self.essays_folder = essays_folder

        # Create spell checker
        # Need annotations here purely to load the tags
        tagged_essays = load_bratt_essays(essays_folder,
                                          include_vague=cfg["include_vague"],
                                          include_normal=cfg["include_normal"],
                                          load_annotations=True)
        self.__set_tags_(tagged_essays)
        self.wd_sent_freq = defaultdict(int)
        self.spelling_corrector = build_spelling_corrector(
            tagged_essays,
            self.config["lower_case"],
            self.wd_sent_freq,
            folder=spell_check_dict)

        # must be an int because it is used in slices; in Python 3.x, / returns a float,
        # hence the int() cast (a short illustration follows this example)
        offset = int((self.config["window_size"] - 1) / 2)

        unigram_window_stemmed = fact_extract_positional_word_features_stemmed(
            offset)
        biigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 2)

        extractors = [unigram_window_stemmed, biigram_window_stemmed]

        # most params below exist ONLY for the purposes of hashing to and from disk
        self.feature_extractor = FeatureExtractorTransformer(extractors)

        # load models
        self.logger.info("Loading pickled models")
        store = ModelStore(models_folder=models_folder)

        self.feature_transformer = store.get_transformer()
        self.logger.info("Loaded Transformer")
        self.tag_2_wd_classifier = store.get_tag_2_wd_classifier()
        self.logger.info("Loaded word tagging model")
        self.tag_2_sent_classifier = store.get_tag_2_sent_classifier()
        self.logger.info("Loaded sentence classifier")
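
The comment above notes that offset must be an int because it feeds slice indices, and that plain division yields a float in Python 3. A small illustration, assuming window_size is 7 purely for the sake of the example:

# window_size = 7 is an assumed value, used only for this illustration
window_size = 7
offset = int((window_size - 1) / 2)                     # 3; without int() this would be 3.0 in Python 3
tokens = ["a", "b", "c", "d", "e", "f", "g"]
center = 3
window = tokens[center - offset:center + offset + 1]    # slice bounds must be ints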
Example #3
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import json

import model_store
from model_store import ModelStore

_MODEL_STORE = ModelStore(model_store.PRODUCTION)
_PETAL_WIDTH = 'petal_width'


def predict(input_json):
    try:
        input_dict = json.loads(input_json)
        model, version = _MODEL_STORE.load_latest_model()
        result = str(
            model.predict_proba([[float(input_dict[_PETAL_WIDTH])]])[0][1])
        return json.dumps({"result": result, "version": version})

    except IndexError:
        return 'Failure: the model is not ready yet'

    except Exception as e:
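
A minimal sketch of how the predict() helper above might be invoked; the petal-width value is arbitrary, the surrounding service wiring is assumed, and the printed output is only indicative:

# Hypothetical call; assumes a model has already been saved to the production store.
payload = json.dumps({"petal_width": 1.7})
print(predict(payload))    # e.g. '{"result": "0.9...", "version": 1700000000}'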
Example #4
# not hashed, as these don't affect persistence of feature processing
SPARSE_WD_FEATS     = True
SPARSE_SENT_FEATS   = True

MIN_FEAT_FREQ       = 5        # 5 best so far
CV_FOLDS            = 5

MIN_TAG_FREQ        = 5
LOOK_BACK           = 0     # how many sentences to look back when predicting tags
# end not hashed

# construct unique key using settings for pickling
settings = Settings.Settings()

model_store = ModelStore()

folder = settings.data_directory + "SkinCancer/Merged/"
config = get_config(folder)

""" FEATURE EXTRACTION """
offset = int((config["window_size"] - 1) / 2)  # int() needed: / yields a float in Python 3 and offset is used in slices

unigram_window_stemmed = fact_extract_positional_word_features_stemmed(offset)
biigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 2)

#pos_tag_window = fact_extract_positional_POS_features(offset)
#pos_tag_plus_wd_window = fact_extract_positional_POS_features_plus_word(offset)
#head_wd_window = fact_extract_positional_head_word_features(offset)
#pos_dep_vecs = fact_extract_positional_dependency_vectors(offset)
Example #5
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import sys
import time

import model_store
import numpy as np
from model_store import ModelStore
from sklearn import datasets
from sklearn.linear_model import LogisticRegression

_CANDIDATE_MODEL_STORE = ModelStore(model_store.CANDIDATE)
_PRODUCTION_MODEL_STORE = ModelStore(model_store.PRODUCTION)

def train_model():
    iris = datasets.load_iris()

    X = iris["data"][:, 3:]  # petal width
    y = (iris["target"] == 2).astype(int)  # np.int is deprecated/removed in recent NumPy

    model = LogisticRegression()
    model.fit(X, y)

    version = round(time.time())

    print(f'Saving model with version {version} to candidate model store.')
    _CANDIDATE_MODEL_STORE.save_model(model, version)
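
train_model() above writes only to the candidate store. As a rough sketch, promotion to production could reuse the two ModelStore methods that appear in these examples (load_latest_model and save_model); the promotion step itself is an assumption, not part of the original code:

# Hypothetical promotion step, not taken from the source examples.
candidate_model, version = _CANDIDATE_MODEL_STORE.load_latest_model()
_PRODUCTION_MODEL_STORE.save_model(candidate_model, version)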
Example #6
# not hashed, as these don't affect persistence of feature processing
SPARSE_WD_FEATS     = True
SPARSE_SENT_FEATS   = True

MIN_FEAT_FREQ       = 5        # 5 best so far
CV_FOLDS            = 5

MIN_TAG_FREQ        = 5
LOOK_BACK           = 0     # how many sentences to look back when predicting tags
# end not hashed

# construct unique key using settings for pickling
settings = Settings.Settings()

model_store = ModelStore()

folder = settings.data_directory + "CoralBleaching/BrattData/EBA1415_Merged/"
config = get_config(folder)

""" FEATURE EXTRACTION """
offset = int((config["window_size"] - 1) / 2)

unigram_window_stemmed = fact_extract_positional_word_features_stemmed(offset)
biigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 2)

extractors = [unigram_window_stemmed, biigram_window_stemmed]
feat_config = dict(list(config.items()) + [("extractors", extractors)])

""" LOAD DATA """
tagged_essays = load_process_essays(**config)
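
On Python 3.5+, the feat_config line above can also be written with dict unpacking; an equivalent sketch, not a change to the example:

# Equivalent to dict(list(config.items()) + [("extractors", extractors)])
feat_config = {**config, "extractors": extractors}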
Example #7
biigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 2)

extractors = [unigram_window_stemmed, biigram_window_stemmed]
feat_config = dict(list(config.items()) + [("extractors", extractors)])  # list() needed in Python 3, where items() returns a view

""" LOAD DATA """
tagged_essays = load_process_essays_without_annotations(**config)
logger.info("Essays loaded")
# most params below exist ONLY for the purposes of hashing to and from disk
feature_extractor = FeatureExtractorTransformer(extractors)

essay_feats = feature_extractor.transform(tagged_essays)
logger.info("Features loaded")

""" LOAD MODELS """
model_store = ModelStore()

feature_transformer = model_store.get_transformer()
tag2word_classifier = model_store.get_tag_2_wd_classifier()
tag2sent_classifier = model_store.get_tag_2_sent_classifier()

""" DEFINE TAGS """
CAUSE_TAGS = ["Causer", "Result", "explicit"]
CAUSAL_REL_TAGS = [CAUSAL_REL, CAUSE_RESULT, RESULT_REL]  # + ["explicit"]

""" works best with all the pair-wise causal relation codes """

all_tags = tag2sent_classifier.keys()
regular_tags = [t for t in all_tags if t[0].isdigit()]

wd_train_tags = regular_tags + CAUSE_TAGS
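
The filter above keeps only tags whose names start with a digit (the numeric concept codes) and then appends the causal tags. A small illustration with a made-up tag set, since the real keys of tag2sent_classifier are not shown here:

# Hypothetical tag set, for illustration only.
all_tags = ["1", "3", "50", "Causer", "Result", "explicit"]
regular_tags = [t for t in all_tags if t[0].isdigit()]    # ['1', '3', '50']
wd_train_tags = regular_tags + CAUSE_TAGS                 # numeric codes plus the causal tags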
Example #8
# not hashed, as these don't affect persistence of feature processing
SPARSE_WD_FEATS = True
SPARSE_SENT_FEATS = True

MIN_FEAT_FREQ = 5  # 5 best so far
CV_FOLDS = 5

MIN_TAG_FREQ = 5
LOOK_BACK = 0  # how many sentences to look back when predicting tags
# end not hashed

# construct unique key using settings for pickling
settings = Settings.Settings()

model_store = ModelStore()
""" PETER - CHANGE THESE FILE PATHS """
folder = settings.data_directory + "CoralBleaching/BrattData/EBA1415_Merged/"  # Location where the training data is, use EBA_Pre and Post test essays preferably
test_folder = settings.data_directory + "CoralBleaching/BrattData/Merged/"  # Location where the new essays to tag are located
out_predictions_file = settings.data_directory + "CoralBleaching/Results/predictions.txt"  # File to dump the predictions to

config = get_config(folder)
""" FEATURE EXTRACTION """
offset = int((config["window_size"] - 1) / 2)  # int() needed: / yields a float in Python 3 and offset is used in slices

unigram_window_stemmed = fact_extract_positional_word_features_stemmed(offset)
biigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 2)

#pos_tag_window = fact_extract_positional_POS_features(offset)
#pos_tag_plus_wd_window = fact_extract_positional_POS_features_plus_word(offset)
#head_wd_window = fact_extract_positional_head_word_features(offset)
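
Pulling together the pieces that recur across these examples, a condensed sketch of the shared feature-extraction flow; folder, config, and tagged_essays are assumed to be set up as in the examples above, and error handling is omitted:

# Condensed from the examples above; not a standalone script.
config = get_config(folder)
offset = int((config["window_size"] - 1) / 2)               # window radius; must be an int for slicing
extractors = [
    fact_extract_positional_word_features_stemmed(offset),  # stemmed unigrams in a positional window
    fact_extract_ngram_features_stemmed(offset, 2),         # stemmed bigrams
]
feature_extractor = FeatureExtractorTransformer(extractors)
essay_feats = feature_extractor.transform(tagged_essays)    # essays loaded as in the examples above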