Пример #1
0
def prepare():
    """ Prepares a vectorize run

    Parses the command line, loads the configuration and adds the logger,
    the source file, the labels, the stop words and the payload field name
    to it. Exits with status 1 if the hash of the stop words in use differs
    from the configured one.

    # Returns
        config: dict The loaded configuration plus the keys "logger", "src",
                     "labels", "stop_words" and "payload"
    """
    cli = argparse.ArgumentParser(description='VECTORIZE the sampled data')
    cli.add_argument(
        '--config',
        required=True,
        help="File with the configuration, must contain key 'vectorize'")
    cliArgs = cli.parse_args()

    config = util.loadConfig(cliArgs.config)
    startMessage = "Starting vectorize with config {}"
    print(startMessage.format(config["vectorize"]["hash"]))

    config["logger"] = util.setupLogging(config, "vectorize")
    config["src"] = os.path.join(
        config["clean"]["baseDir"],
        config["vectorize"]["cleanHash"],
        "useable.csv",
    )
    # The first label returned by util.getLabels is dropped
    allLabels = util.getLabels(config)
    config["labels"] = allLabels[1:]
    config["stop_words"] = util.getStopWords(config)

    # Refuse to run with a stop word list other than the configured one
    usedHash = util.getDictHash(config["stop_words"])
    vectorizeCfg = config["vectorize"]
    if usedHash != vectorizeCfg["stopWordsHash"]:
        mismatch = (
            "Hash of used and configured stop words differ:"
            "\n\t{} (used)"
            "\n\t{} (configured)"
            "\n\tRestore old stop word list or change config")
        config["logger"].error(
            mismatch.format(usedHash, vectorizeCfg["stopWordsHash"]))
        os.sys.exit(1)

    # Without stemming the field "payload" is used, otherwise the field
    # named like the stemming
    stemming = vectorizeCfg["stemming"]
    config["payload"] = "payload" if stemming == "none" else stemming
    return config
Пример #2
0
def prepare():
    """ Prepares an analysis of cleaned records

    Parses the command line and copies the chosen options into the loaded
    configuration.

    # Returns
        config: dict The loaded configuration plus the keys "type", "label"
                     and "scheme" taken from the command line
    """
    cli = argparse.ArgumentParser(
        description='CLEAN analyze cleaned records')
    cli.add_argument(
        '--config',
        required=True,
        help="File with the configuration for the cleaning run")
    cli.add_argument(
        '--type',
        default="subjectScheme",
        choices=("subjectScheme", "schemeURI", "scheme2label"),
        help="Display subject schemes")
    cli.add_argument(
        '--label',
        default="1",
        help="Display scheme hits for this label (scheme2label)")
    cli.add_argument(
        '--scheme',
        default="all",
        help="Display scheme hits for this scheme (scheme2label)")
    cliArgs = cli.parse_args()

    config = util.loadConfig(cliArgs.config)
    # Each of these options is stored under the key of the same name
    for option in ("type", "label", "scheme"):
        config[option] = getattr(cliArgs, option)
    return config
Пример #3
0
def prepare():
    """ Prepares an evaluation run

    Parses the command line, loads the configuration, announces the run on
    stdout and adds the logger, the device, the target file and the source
    directory to the configuration.

    # Returns
        config: dict The loaded configuration plus the keys "logger",
                     "device", "target" and "srcDir"
    """
    cli = argparse.ArgumentParser(
        description='EVALUATE a model/param_grid/data bundle')
    cli.add_argument(
        '--config',
        required=True,
        help="File with the configuration, must contain key 'evaluate'")
    cli.add_argument(
        '--device',
        default="default",
        help="Device name used to train the models ('/device:GPU:0', '/device:GPU:1')")
    cliArgs = cli.parse_args()

    config = loadConfig(cliArgs.config)
    announcement = "Starting with config {}\n\ttail -f {}"
    print(announcement.format(config["evaluate"]["hash"],
                              config["evaluate"]["logFile"]))

    config["logger"] = setupLogging(config, "evaluate")
    config["device"] = cliArgs.device

    # The default device writes to "0_evaluation.csv", any other device to
    # "1_evaluation.csv"
    prefix = "0" if config["device"] == "default" else "1"
    targetName = prefix + "_evaluation.csv"
    config["target"] = os.path.join(config["evaluate"]["baseDir"], targetName)
    config["srcDir"] = os.path.join(config["vectorize"]["baseDir"],
                                    config["evaluate"]["vectorizeHash"])
    return config
Пример #4
0
def testLoadConfig():
    """ Checks that util.loadConfig completes the config, recreates the
    directories and writes a hashed copy that loads to an equal config
    """
    testConfigFile = getTestConfig()

    # Autocompletion: the derived keys have to be present
    config = util.loadConfig(testConfigFile)
    completedKeys = ("hash", "rawDataDir", "processedDataDir", "configDir")
    assert all(key in config for key in completedKeys)

    # Remove the hashed copy and the processed data dir so that the next
    # load has to create them again
    hashedConfig = os.path.join(os.path.dirname(testConfigFile),
                                config["hash"] + ".json")
    if os.path.isfile(hashedConfig):
        os.remove(hashedConfig)
    shutil.rmtree(config["processedDataDir"])

    config = util.loadConfig(testConfigFile)

    # Autocreation: top level directories and one directory per step
    assert all(os.path.isdir(config[key])
               for key in ("processedDataDir", "configDir"))
    steps = ("retrieve", "clean", "sample", "train", "evaluate", "use")
    for step in steps:
        stepDir = os.path.join(config["processedDataDir"], step)
        assert os.path.isdir(stepDir)

    # Backup: the hashed copy exists and loads to an equal config
    assert os.path.exists(hashedConfig)
    reloaded = util.loadConfig(hashedConfig)
    assert config == reloaded
Пример #5
0
def prepare():
    """ Prepares a retrieve run

    Parses the command line, loads the configuration and attaches the sleep
    period and a logger to it.

    # Returns
        config: dict The loaded configuration; config["retrieve"]["sleep"]
                     holds the sleep period as int and config["logger"] the
                     logger returned by util.setupLogging
    """
    parser = argparse.ArgumentParser(
        description='RETRIEVE: retrieve all raw data.')
    parser.add_argument(
        '--config',
        required=True,
        help="File with the configuration, must contain key 'retrieve'")
    # type=int makes argparse reject a non-numeric value with a usage error
    # before the configuration is loaded, instead of failing afterwards with
    # a ValueError traceback.
    parser.add_argument(
        '--sleep',
        type=int,
        default=20,
        help="Time period to sleep until a harvester is checked "
             "during harvesting")
    args = parser.parse_args()

    config = util.loadConfig(args.config)
    config["retrieve"]["sleep"] = args.sleep
    config["logger"] = util.setupLogging(config, "retrieve")
    return config
Пример #6
0
def prepare():
    """ Prepares a cleaning run

    Parses the command line, loads the configuration and adds the logger,
    the number of workers, the labels and the compiled regexes to it. Exits
    with status 1 if the hash of "clean/cleanDataHelpers.py" differs from
    the configured mapping hash.

    # Returns
        config: dict The loaded configuration plus the keys "logger",
                     "worker" (int), "labels" and "regex" (dict of compiled
                     patterns)
    """
    parser = argparse.ArgumentParser(
        description='CLEAN retrieved metadata records')
    parser.add_argument(
        '--config',
        required=True,
        help="File with the configuration for the cleaning run")
    # type=int makes argparse reject a non-numeric value with a usage error
    # before the configuration is loaded, instead of failing afterwards with
    # a ValueError traceback.
    parser.add_argument(
        '--worker', type=int, default=3, help="Number of workers")
    args = parser.parse_args()

    config = util.loadConfig(args.config)
    config["logger"] = util.setupLogging(config, "clean")
    config["worker"] = args.worker

    # Refuse to run with a mapping other than the configured one.
    # NOTE(review): the path is relative, so it is resolved against the
    # current working directory -- confirm the script is always started
    # from the expected directory.
    usedMappingHash = util.getFileHash("clean/cleanDataHelpers.py")
    if usedMappingHash != config["clean"]["mappingHash"]:
        config["logger"].error(
            "Hash of used and configured mapping differ:"
            "\n\t{} (used)"
            "\n\t{} (configured)"
            "\n\tRestore old mapping or change config".format(
                usedMappingHash, config["clean"]["mappingHash"]))
        os.sys.exit(1)
    config["labels"] = util.getLabels(config)

    # Compile every configured pattern once
    patterns = config["clean"]["regex"]
    regexNames = (
        "ddcValue", "ddcSchemeURI", "special", "dataInput", "dataOutput")
    config["regex"] = {
        name: re.compile(patterns[name]) for name in regexNames
    }

    return config
Пример #7
0
def prepare():
    """ Prepares a grep run over the subjects of retrieved records

    Parses the command line, loads the configuration and adds the field to
    grep on, the grep expression and, if configured, the compiled data
    input regex to it.

    # Returns
        config: dict The loaded configuration plus the keys "field" and
                     "grep"; the key "regex" (with the compiled "dataInput"
                     pattern) is only added if the clean section contains
                     "regex" or "dataInputRegex"
    """
    cli = argparse.ArgumentParser(
        description='Grep over subjects of retrieved metadata records')
    cli.add_argument(
        '--config',
        required=True,
        help="File with the configuration for the cleaning run")
    cli.add_argument(
        '--field',
        default="value",
        choices=("value", "subjectScheme", "schemeURI"),
        help="On which field to grep on")
    cli.add_argument('--grep', required=True, help="Grep expression")
    cliArgs = cli.parse_args()

    config = util.loadConfig(cliArgs.config)
    config["field"] = cliArgs.field
    config["grep"] = cliArgs.grep

    # The pattern is stored either under clean.regex.dataInput or under
    # clean.dataInputRegex; the former wins if both are present
    cleanCfg = config["clean"]
    if "regex" in cleanCfg:
        pattern = cleanCfg["regex"]["dataInput"]
    elif "dataInputRegex" in cleanCfg:
        pattern = cleanCfg["dataInputRegex"]
    else:
        return config

    config["regex"] = {"dataInput": re.compile(pattern)}
    return config
Пример #8
0
from PyQt5.QtWidgets import QApplication, QWidget
from PyQt5.QtWidgets import QLabel, QPushButton, QLineEdit, QTextBrowser
from PyQt5.QtGui import QValidator, QDoubleValidator
from PyQt5.QtWidgets import QVBoxLayout, QHBoxLayout, QFormLayout
from gui_util.file_handler import FileHandler

# Directory containing this file (symlinks resolved) and its parent directory
TABS_DIR = os.path.dirname(os.path.realpath(__file__))
SRC_DIR = os.path.dirname(TABS_DIR)
# Extend the module search path before the two project imports below;
# presumably they only resolve because of these entries, so the order of
# these statements must be kept.
sys.path.append(SRC_DIR)
sys.path.append(os.path.join(SRC_DIR, 'util'))
sys.path.append(os.path.join(SRC_DIR, 'database'))
from util import util
from database.ft_db import get_firstrade_db

# Configuration and database handle are created once, at import time.
# The database file name is read from the "database" key of config.json and
# is looked up inside DATA_DIR.
DATA_DIR = os.path.join(SRC_DIR, 'data')
CONFIG = util.loadConfig(os.path.join(DATA_DIR, 'config.json'))
FTDB = get_firstrade_db(db_fpath=os.path.join(DATA_DIR, CONFIG['database']))

# Default height and width passed to setGeometry by ImportTab
STANDARD_H = 600
STANDARD_W = 800


class ImportTab(QWidget):
    """Widget that sets a fixed default geometry and then builds its UI."""

    def __init__(self, parent=None):
        """Initialise the widget, apply the default geometry and build the UI.

        # Arguments
            parent: Optional parent passed on to the QWidget constructor
        """
        # Name the class explicitly: super(self.__class__, self) resolves to
        # the subclass as soon as ImportTab is subclassed and then recurses
        # into this __init__ forever.
        super(ImportTab, self).__init__(parent)
        self.left = 10
        self.top = 10
        # NOTE(review): these two attributes shadow the QWidget.width() and
        # QWidget.height() methods on this instance. They are kept because
        # other code may read them -- confirm before renaming.
        self.width = STANDARD_W
        self.height = STANDARD_H
        self.setGeometry(self.left, self.top, self.width, self.height)
        self.setUI()
Пример #9
0
import shutil
from train.mlp import train_ngram_model


################################################################################
# TEST PREPARATION
################################################################################
def getTestConfig():
    """ Returns the path of the configuration file used by the tests

    # Returns
        path: str "config/config.json" below the parent of the directory
                  that contains this file (symlinks resolved)
    """
    thisDir = os.path.dirname(os.path.realpath(__file__))
    parentDir = os.path.dirname(thisDir)
    return os.path.join(parentDir, "config/config.json")


# Module-level fixtures.
# NOTE(review): payload and subdir are not used in the visible part of this
# file -- presumably they are consumed by tests further down; confirm.
payload = {"test": [1, 2, 3], "test2": {"test3": "abc"}}
subdir = "retrieve"
# Loaded once at import time from the test configuration file
config = util.loadConfig(getTestConfig())


################################################################################
# TESTS
################################################################################
def testLoadConfig():
    # Check autocompletion of config works
    config = util.loadConfig(getTestConfig())
    for key in ("hash", "rawDataDir", "processedDataDir", "configDir"):
        assert key in config.keys()

    # Check autocreations work (therefore delete the dirs first)
    hashedConfig = os.path.join(os.path.dirname(getTestConfig()),
                                config["hash"] + ".json")
    if os.path.isfile(hashedConfig):
Пример #10
0
    df = util.cfm2df(cfm, range(len(shortAnzsrc)))
    df_cfm = pd.DataFrame(data=df.values,
                          index=shortAnzsrc,
                          columns=shortAnzsrc)
    plt.figure(figsize=(40, 28))
    sn.heatmap(df_cfm, annot=True)
    return plt.plot()


# Command line: the only option is the mandatory path to the configuration
parser = argparse.ArgumentParser(
    description='Plot evaluations for a trained model.')

parser.add_argument('--config',
                    required=True,
                    help="File with the configuration for the training run")

args = parser.parse_args()

config = util.loadConfig(args.config)

# The model is read from <processedDataDir>/train/mlp_model.h5
model_file = os.path.join(config["processedDataDir"], "train", "mlp_model.h5")
print("Loading model: {}".format(model_file))
model = models.load_model(model_file)

# presumably reads "test.json" from the "train" sub directory -- confirm
# against util.loadJsonFromFile
(test_texts, test_labels) = util.loadJsonFromFile(config, "test.json", "train")

cfm = util.getConfusionMatrix(config, model, test_texts, test_labels)
# Divide by the sums along axis 1 (assuming cfm is a 2-D array: every row is
# scaled by its own row sum).
# NOTE(review): a row that sums to zero would divide by zero here -- confirm
# that this cannot happen for the test data.
perCfm = cfm / cfm.sum(axis=1, keepdims=True)
plotConfusionMatrix(config, perCfm)
# Target file: <processedDataDir>/evaluate/cfm.png
# (savefig is imported outside this view -- presumably matplotlib's)
savefig(os.path.join(config["processedDataDir"], "evaluate", "cfm.png"))
Пример #11
0
import os, sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
from util.util import loadConfig, loadBinary, loadJsonFromFile
from tensorflow.python.keras import models
import numpy as np
import argparse

# Command line: the only option is the mandatory path to the configuration
parser = argparse.ArgumentParser(
    description='TEST: test a model with a given configuration.')
parser.add_argument('--config',
                    required=True,
                    help="File with the configuration for the training run")
args = parser.parse_args()

config = loadConfig(args.config)

# The model is read from <processedDataDir>/train/mlp_model.h5
model_file = os.path.join(config["processedDataDir"], "train", "mlp_model.h5")
print("Loading model: {}".format(model_file))
model = models.load_model(model_file)

tests = [
    "mathematics proof theorem lemma number topology deduction",
    "particle physics theoretical physics experimental physics atom mass motion star nova",
    "chemistry liquid acid protein reaction",
    "earth science atmosphere geochemistry geology oceanography hydrology",
    "ecology soil environmental sciences",
    "biology species population life organism evolution",
    "agriculture veterinary crop cattle forest animal slaughterhouse",
    "computer science information science library Memory Computation IT programming language code",
    "engineering construction electronic structure applied",
    "technology nanotechnology biotechnology hardware",
Пример #12
0
import os, sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
import util.util as util
import json

# Suffix inserted into the config file name; the empty string selects
# "config.json"
mode = ""

# NOTE(review): the path is relative, so it is resolved against the current
# working directory rather than against the location of this file -- confirm
# that the script is always started from the expected directory.
config = util.loadConfig("../config/config{}.json".format(mode))

selections = [
        { 
            "feature_selection": {"mode": "multipleOfLabels", "value": 1000},
            "stemming": "none"
        },
        { 
            "feature_selection": {"mode": "multipleOfLabels", "value": 2500},
            "stemming": "none"
        },
        { 
            "feature_selection": {"mode": "multipleOfLabels", "value": 5000},
            "stemming": "none",
        },
        { 
            "feature_selection": {"mode": "multipleOfLabels", "value": 1000},
            "stemming": "lancaster"
        },
        { 
            "feature_selection": {"mode": "multipleOfLabels", "value": 2500},
            "stemming": "lancaster"
        },
        {