def __init__(self):
    # Preprocessing pipeline
    self.data = self.prepare_data(Parameters.num_words, Parameters.seq_len)

    # Initialize the model
    self.model = TextClassifier(Parameters)

    # Training - Evaluation pipeline
    Run().train(self.model, self.data, Parameters)
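# A minimal sketch of the Parameters container the constructor above assumes.
# Only num_words and seq_len appear in the original snippet; every other
# field here is a hypothetical placeholder, not the project's actual config.
# Class attributes fit the usage above, since the class itself is read
# directly (Parameters.num_words) and passed whole to TextClassifier.
class Parameters:
    num_words: int = 10000   # vocabulary size passed to prepare_data
    seq_len: int = 128       # padded sequence length passed to prepare_data
    # Illustrative training settings (assumed, not from the original code)
    batch_size: int = 32
    epochs: int = 5
    learning_rate: float = 1e-3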
def test_dataset_values_assertion(self, tmp_path):
    runtask = Run(tmp_path)
    export_dict = {"era5": "not a list"}

    with pytest.raises(Exception) as exception_info:
        runtask.export(export_dict)

    error_message_contains = 'values to be a list. Got'
    assert error_message_contains in str(exception_info), \
        f'Got unexpected error message: {exception_info}'
def test_dataset_assertion(self, tmp_path):
    runtask = Run(tmp_path)
    export_dict = {"era42": ["towels"]}

    with pytest.raises(Exception) as exception_info:
        runtask.export(export_dict)

    error_message_contains = 'is not supported! Supported datasets are'
    assert error_message_contains in str(exception_info), \
        f'Got unexpected error message: {exception_info}'
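# Both tests above expect Run.export to validate its argument before doing
# any work. A minimal sketch of validation that would raise those two error
# messages; the supported-dataset list and the class layout are assumptions,
# not the project's actual API.
class Run:
    SUPPORTED_DATASETS = ["era5", "vhi"]  # assumed registry

    def __init__(self, data_path):
        self.data_path = data_path

    def export(self, export_dict):
        for dataset, variables in export_dict.items():
            assert dataset in self.SUPPORTED_DATASETS, (
                f"{dataset} is not supported! Supported datasets are "
                f"{self.SUPPORTED_DATASETS}"
            )
            assert isinstance(variables, list), (
                f"Expected the values to be a list. Got {type(variables)}"
            )
            # ...dispatch to the dataset-specific exporter here...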
def rank_article(self, query):
    config = self.conf.getConfig()
    config = namedtuple("Conf", config['run'].keys())(*config['run'].values())
    articles = self.__articles(query)
    for i in articles:
        run = Run("".join(config.staff), config.step, config.num,
                  config.weighting, config.granularity,
                  ['k' + str(config.k), 'b' + str(config.b)])
        file = run.createRun("../runs", i[1], i[0][0])
        self.conf.incrementRun()
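# createRun presumably serializes the ranked articles to a file under
# ../runs. A hedged sketch of a writer emitting the common TREC run format
# (<query-id> Q0 <doc-id> <rank> <score> <run-tag>); the real method's
# layout and signature are unknown, so everything below is an assumption.
from pathlib import Path

def create_run(out_dir, ranked, query_id, tag="bm25"):
    out_path = Path(out_dir) / f"run_{tag}_{query_id}.txt"
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, "w") as f:
        for pos, (doc_id, score) in enumerate(ranked, start=1):
            f.write(f"{query_id} Q0 {doc_id} {pos} {score:.4f} {tag}\n")
    return out_path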
def rank_elem(self, query):
    config = self.conf.getConfig()
    config = namedtuple("Conf", config['run'].keys())(*config['run'].values())

    # Flatten the per-document element maps into one
    # {doc_id + element_path: text} dict
    docsEl = dict()
    for (k, v) in self.docs.items():
        for (w, x) in v.items():
            key = (k + w)
            docsEl[key] = x

    rank = XmlRank(docsEl)
    func = getattr(importlib.import_module("src.rank"), config.weighting)
    for q in query:
        req = clean_text(" ".join(q[1])).split()
        r = rank.getBm25(request=req, func=func, k=config.k, b=config.b, d=0)

        # Deduplicate overlapping elements: keep an element only if no
        # already-retained element of the same document contains it (or is
        # contained by it), up to config.limit results per query.
        red = dict()
        fres = []
        lid = 0
        for t in r:
            if len(fres) == config.limit:
                break
            id, res = t[0].split("/", 1)
            if red.get(id) is None:
                red[id] = [res]
                fres.append((id, res, t[1]))
                lid = id
            else:
                rec = False
                for z in red[id]:
                    if res in z or z in res:
                        rec = True
                if not rec and id == lid:
                    red[id].append(res)
                    fres.append((id, res, t[1]))
                    lid = id

        run = Run("".join(config.staff), config.step, config.num,
                  config.weighting, config.granularity,
                  ['k' + str(config.k), 'b' + str(config.b)])
        file = run.createRunElem("../runs", fres, q[0])
        self.conf.incrementRun()
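# config.weighting is resolved by name from src.rank and handed to getBm25
# as a pluggable term-weighting function. A sketch of what such a function
# could look like, using the standard Okapi BM25 formula; the signature is
# an assumption, not the project's actual interface.
import math

def bm25_weight(tf, df, n_docs, doc_len, avg_doc_len, k=1.2, b=0.75):
    # Contribution of one query term to one document's BM25 score:
    #   idf(t) * tf * (k + 1) / (tf + k * (1 - b + b * dl / avgdl))
    idf = math.log((n_docs - df + 0.5) / (df + 0.5) + 1.0)
    norm = tf + k * (1.0 - b + b * doc_len / avg_doc_len)
    return idf * tf * (k + 1.0) / norm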
def main(input_args):
    base_dir = Path(os.path.dirname(os.path.abspath(__file__)))
    default_config_file = base_dir / "pipeline_config/minimal.json"
    with open(default_config_file, "r") as f:
        default_config = json.load(f)

    if input_args.config is not None:
        with open(input_args.config, "r") as f:
            user_config = json.load(f)
    else:
        user_config = {}

    config = DictWithDefaults(user_config, default_config)
    data_path = Path(config["data"])
    runtask = Run(data_path)
    runtask.run(config, input_args.run_from)
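# main expects an object exposing .config and .run_from. A minimal argparse
# entry point that would supply them; the flag names mirror those attributes
# but are assumptions about the real CLI.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run the pipeline")
    parser.add_argument("--config", default=None,
                        help="path to a user config JSON overriding the defaults")
    parser.add_argument("--run-from", dest="run_from", default=None,
                        help="pipeline stage to resume from")
    main(parser.parse_args())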
from pathlib import Path
import os
import json

from src import DictWithDefaults, Run
from src.preprocess import VHIPreprocessor

base_dir = Path(os.path.dirname(os.path.abspath("./ml_drought")))
default_config_file = base_dir / "pipeline_config/minimal.json"
with open(default_config_file, "r") as f:
    default_config = json.load(f)

user_config = {}
config = DictWithDefaults(user_config, default_config)
data_path = Path(config["data"])

preprocess_args = config["preprocess"]
dataset2preprocessor = {"vhi": VHIPreprocessor}

r = Run(data_path)
# Take the first (dataset, variables) pair from the preprocess config
dataset, variables = next(iter(preprocess_args.items()))
preprocessor = dataset2preprocessor[dataset](r.data)
preprocessor.preprocess(**variables[0])
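# DictWithDefaults is used above to overlay the user config on the shipped
# defaults. A minimal sketch of that fallback behaviour, assuming only
# __getitem__ semantics are needed; the real class may do deep merging.
class DictWithDefaults:
    def __init__(self, user_config, default_config):
        self.user_config = user_config
        self.default_config = default_config

    def __getitem__(self, key):
        try:
            return self.user_config[key]
        except KeyError:
            return self.default_config[key]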