Example No. 1
def __init__(self, fastTextModel=None):
    if fastTextModel is None:
        fileType = ".ftz" if tools.config()["model"]["quantize"] is True else ".bin"
        modelFile = tools.config()["model"]["model-path"] + fileType
        print("model under test:" + modelFile)
        self.model = fasttext.load_model(modelFile)
    else:
        self.model = fastTextModel
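A brief usage sketch, assuming this constructor belongs to the FastTextTest class referenced in Example No. 5; the preloaded model variable is hypothetical:

tester = FastTextTest()                               # loads <model-path>.bin or .ftz from the config
tester = FastTextTest(fastTextModel=preloadedModel)   # or wrap an already loaded fastText model (hypothetical variable)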
Example No. 2
def push_an(dest, amd):
    '''
    Wrapper for pushing data to the Storer.

    :param dest: Service hosting the data pushed: 'storer'
    :type dest: str
    :param amd: Amendment to be added to the destination service
    :type amd: dict
    :return: Raw response content from the service
    :rtype: bytes
    '''

    host = config().get(section=dest, option='server')
    port = config().get(section=dest, option='port')
    hostname = '{}:{}'.format(host, port)
    status = requests.post(url='http://{0}/an'.format(hostname), json=amd)
    return status.content
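A minimal usage sketch, assuming a 'storer' section in the config and a POST /an endpoint on that service; the amendment fields shown are placeholders, not the real schema:

amendment = {"proj_id": "P-001", "text": "example amendment"}  # placeholder fields
response_body = push_an(dest='storer', amd=amendment)
print(response_body)  # raw response content returned by the Storer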
Example No. 3
def pull_an(source, proj_id=None, exam_id=None):
    '''
    Wrapper for pulling data from the Storer or the Crawler.

    :param source: Service provider for the info pulled: 'storer' or 'crawler'
    :type source: str
    :param proj_id: Reference ID of the project.
    :type proj_id: str
    :param exam_id: Reference ID of the exam.
    :type exam_id: str
    :return: Raw data provided by the service called
    :rtype: dict
    '''

    host = config().get(section=source, option='server')
    port = config().get(section=source, option='port')
    hostname = '{0}:{1}'.format(host, port)
    data = {"list_n_project": [],
            "list_project": [],
            "all_exams": [],
            "all_amds": [],
            }
    if proj_id:
        if exam_id:
            data0 = requests.get(url='http://{0}/an/{1}/{2}'.format(hostname, proj_id, exam_id))
            try:
                data = data0.json()
            except Exception as err:
                logger.error('Error pulling amd:\t{} -\t{}'.format(err, data0.content))
        else:
            data0 = requests.get(url='http://{0}/an/{1}'.format(hostname, proj_id))
            try:
                data = data0.json()
            except Exception as err:
                logger.error('Error pulling exam:\t{} -\t{}'.format(err, data0.content))
    else:
        data0 = requests.get(url='http://{0}/an'.format(hostname))
        try:
            data = data0.json()
        except Exception as err:
            logger.error('Error pulling amd:\t{} -\t{}'.format(err, data0.content))

    return data
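A usage sketch of the three call patterns above; the 'storer' section and the IDs are placeholders:

everything = pull_an(source='storer')                               # GET /an
per_project = pull_an('storer', proj_id='P-001')                    # GET /an/P-001
single_exam = pull_an('storer', proj_id='P-001', exam_id='E-042')   # GET /an/P-001/E-042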
Example No. 4
def check_dev(name):
    global dev_checked
    global found_dev
    if not dev_checked:
        found_dev, board = iio_scanner.find_device(name)
        if found_dev:
            global URI
            global devices_us
            global devices
            global boot_bin
            global board_name
            global board_config
            URI = board.uri
            board_config = tools.config(board_name, boot_bin,
                                        board.uri[3:], devices, devices_us)
        dev_checked = True
    return found_dev
Example No. 5
def run(fastTextModel, printErrors=False):

    model = FastTextTest(fastTextModel)

    data = readTestData(tools.config()["model"]["test-file"])

    table = []

    all_truth = []
    all_prediction = []
    for row in data:
        truth, prediction = model.predict(row)
        result = stat_fscore(truth, prediction)
        table.append([row[0]] + result)
        all_truth.extend(truth)
        all_prediction.extend(prediction)

    table.append(["all"] + stat_fscore(all_truth, all_prediction))

    sumRow = ["sum"]
    for col in range(1, len(table[0])):
        rowSum = sum(map(lambda x: x[col], table))
        sumRow.append(rowSum)
    table.append(sumRow)

    headers = [
        "file", "precisionMicro", "recallMicro", "fscoreMicro",
        "precisionMacro", "recallMacro", "fscoreMacro"
    ]
    print(tabulate(table, headers, tablefmt="pipe", floatfmt=".4f"))

    title_description = "FastText"

    plt = printcm.plot_confusion_matrix(
        all_truth,
        all_prediction,
        classes=["negative", "neutral", "positive"],
        normalize=True,
        title=title_description)
    plt.savefig("models/sentiment-cm.pdf")

    return table
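A usage sketch; passing None makes FastTextTest fall back to the model referenced in the config (see Example No. 1), and an already loaded fastText model can be passed instead (hypothetical variable below):

results = run(fastTextModel=None)            # evaluate the model named in the config
# results = run(fastTextModel=loaded_model)  # or evaluate a model loaded elsewhere (hypothetical)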
Example No. 6
def run():
    if (tools.config()["preprocessing"]["use-cache"] is False):
        subprocess.call(f"rm -rf {pathForSets}* {pathForTestsets}* ",
                        shell=True)

    dataLoaders = [
        ["emotions", lambda: tools.loadData(source_data + "emotions")],
        [
            "germeval", lambda: tools.loadGermeval2017(
                source_data + "germeval2017/set_v1.4.tsv")
        ],
        [
            "sb10k", lambda: tools.loadData(
                source_data + "SB10k/not-preprocessed/corpus_label_text.tsv",
                "\t")
        ],
        [
            "PotTS", lambda: tools.loadData(
                source_data + "PotTS/not-preprocessed/corpus_label_text.tsv",
                "\t")
        ],
        [
            "filmstarts", lambda: tools.loadFilmstarts(
                source_data + "filmstarts/filmstarts.tsv")
        ],
        [
            "scare", lambda: tools.loadScareSet(source_data +
                                                "scare_v1.0.0_data/reviews/")
        ],
        [
            "holidaycheck", lambda: tools.loadHolidaycheck(
                source_data + "holidaycheck/holidaycheck.clean.filtered.tsv")
        ],
        [
            "leipzig-mixed-typical-2011",
            lambda: tools.loadData(source_data + "leipzig/deu-mixed-labeled")
        ],
        [
            "leipzig-newscrawl-2017", lambda: tools.loadData(
                source_data + "leipzig/deu-newscrawl-2017-labeled")
        ],
        [
            "leipzig-deu-wikipedia-2016", lambda: tools.loadData(
                source_data + "leipzig/deu-wikipedia-2016-labeled")
        ]
    ]
    dataSets = []
    table = []
    dataSetsToLoad = tools.config()["datasets"]

    for dataSet in dataSetsToLoad:
        if dataSet["train"] is True or dataSet["test"] is True:
            # if this fails, a loader named in the config is not defined in the code above
            loader = next(
                filter(lambda x: x[0] == dataSet["name"], dataLoaders))

            # split every set in its 3 classes
            meta_info = cleanAndSplit(*loader)

            if dataSet["train"] is True:
                dataSets.append(loader)

            table.append(list(dataSet.values()) + meta_info)

    headers = [
        "set name", "training", "test", "from cache", "positiv", "neutral",
        "negative", "total"
    ]
    print(tabulate(table, headers, tablefmt="pipe", floatfmt=".4f"))

    trainSets = [
        dataset["name"] for dataset in dataSetsToLoad
        if dataset["train"] is True
    ]

    # combine single datasets into one set per class
    neutralSamples = createSetForClass("neutral", trainSets)
    positiveSamples = createSetForClass("positive", trainSets)
    negativeSamples = createSetForClass("negative", trainSets)

    print("\nclass distribution in data set:")
    print("neutral \t{}\npostitive\t{}\nnegative\t{}".format(
        neutralSamples, positiveSamples, negativeSamples))

    # balance classes
    if (tools.config()['preprocessing']['balance'] == 'down'):
        print("\nbalance classes with downsampling")
        samplesPerClass = min(neutralSamples, positiveSamples, negativeSamples)
        print("random sampels per class: {}".format(samplesPerClass))
        print("total sampels: {}".format(samplesPerClass * 3))
        # train / test split per class
        split(samplesPerClass, "neutral")
        split(samplesPerClass, "positive")
        split(samplesPerClass, "negative")
    else:
        split(neutralSamples, "neutral")
        split(positiveSamples, "positive")
        split(negativeSamples, "negative")
        print(f"random sampels per class neutral: {neutralSamples}")
        print(f"random sampels per class positiv: {positiveSamples}")
        print(f"random sampels per class negative:{negativeSamples}")
        print(
            f"total sampels: {neutralSamples +positiveSamples + negativeSamples}"
        )

    # combine classes to set
    trainFile = path + "model.train"
    validFile = path + "model.valid"
    testFile = path + "model.test"

    executeToFile(f"cat {pathForSets}all.train.* | cut -f2,3",
                  trainFile,
                  shellMode=True)
    executeToFile(f"cat {pathForSets}all.valid.* | cut -f2,3",
                  validFile,
                  shellMode=True)
    executeToFile(f"cat {pathForSets}all.test.*", testFile, shellMode=True)

    totalTrain = tools.lineCount(trainFile)
    totalValid = tools.lineCount(validFile)
    totalTest = tools.lineCount(testFile)
    totalLines = float(totalTrain + totalValid + totalTest)

    print("\nsamples in:\ntrain\t{}\nvalid\t{}\ntest\t{}\nsum\t{}".format(
        totalTrain, totalValid, totalTest, totalLines))

    print("\npercentage in:\ntrain\t{}\nvalid\t{}\ntest\t{}".format(
        totalTrain / totalLines, totalValid / totalLines,
        totalTest / totalLines))

    test_sets = [
        dataset["name"] for dataset in dataSetsToLoad
        if dataset["train"] is False and dataset["test"] is True
    ]
    print(f"datasets just for testing {test_sets}")
    if os.path.exists(testFile + ".extra"): os.remove(testFile + ".extra")

    for test_set in test_sets:
        executeToFile(f"cat {pathForSets}{test_set}.* ",
                      testFile + ".extra",
                      mode="a",
                      shellMode=True)

    executeToFile(f"cat {testFile} {testFile}.extra",
                  path + "model.test.full",
                  shellMode=True)

    executeToFile(
        f"cat {pathForSets}all.negative {pathForSets}all.neutral {pathForSets}all.positive | cut -f3 ",
        path + "wordvecc.train",
        shellMode=True)
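For orientation, a sketch of the configuration structure this function reads; the key names mirror the tools.config() lookups above and the dataset names come from the loader table, but the concrete values are illustrative assumptions:

# illustrative config sketch; keys mirror the lookups in run(), values are assumptions
example_config = {
    "preprocessing": {"use-cache": False, "balance": "down"},  # 'down' enables downsampling
    "datasets": [
        {"name": "germeval", "train": True, "test": True},     # used for training and testing
        {"name": "filmstarts", "train": False, "test": True},  # test-only set (goes into the .extra test file)
    ],
}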
Example No. 7
#!/usr/bin/env python
# coding: utf-8
# __author__ = 'Benjamin'

import flask
import json
import tools

app = flask.Flask(__name__)


@app.route('/')
def status():
    return json.dumps({'Service': 'transformer', 'Status': 'Alive'})


if __name__ == '__main__':
    hostname = tools.config().get(section='transformer', option='server')
    port = tools.config().get(section='transformer', option='port')

    app.run(host='0.0.0.0', port=int(port), debug=True)
Example No. 8
import re
from multiprocessing import Pool
from typing import List
import tools
from string import digits

from tqdm import tqdm


cleanChars = re.compile(r'[^A-Za-züöäÖÜÄß.!? ]', re.MULTILINE)
cleanHttpUrls = re.compile(r'https*\S+', re.MULTILINE)
cleanAtMentionsTwitter = re.compile(r'@\S+', re.MULTILINE)
        
config = tools.config()["preprocessing"]

def cleanData(data: List[str]) -> List[str]:
    with Pool(6) as p:
        data = p.map(cleanRow, data)

    #for row in tqdm(data):
    #    row[1] = cleanText(row[1])

    data = [row for row in data if row[1]] # filter out empty rows
    return data

def cleanRow(row):
    row[1] = cleanText(row[1])
    return row

def replaceNumbers(text: str) -> str:
    text = text.replace("0", " null")
Example No. 9

import os

import flask
from flask import request

import tools

app = flask.Flask(__name__)


@app.route('/')
def status():
    return flask.jsonify({'Status': 'Alive'})


@app.route('/merge', methods=['POST'])
def upload():
    filenames = tools.get_files(request.files)
    if filenames:
        merged = tools.merge_files(filenames)
        uploaded = tools.serve_file(merged)
        os.remove(merged)
        return uploaded
    # a Flask view must not return None; report the missing files explicitly
    return 'no files provided', 400

# @app.route('/uploads/<filename>')
# def uploaded_file(filename):
#     return send_from_directory(app.config['UPLOAD_FOLDER'],
#                                filename)

hostname = tools.config().get(section='server', option='address')
port = tools.config().get(section='server', option='port')

app.run(host=hostname, port=int(port), debug=True)


Example No. 10
# __author__ = 'Benjamin'

import flask
import json
import tools

app = flask.Flask(__name__)


@app.route('/')
def status():
    return json.dumps({'Status': 'Alive'})


@app.route('/files', methods=['POST'])
def merge_and_send():
    # Potentially long processing time: watch out for the HTTP request timeout!
    # TODO: Request files (1 by 1) and merge them
    # TODO: Upload to goploader (depado or self hosted) and mail the link
    # TODO: Check timeout issues
    local_files = tools.get_files(flask.request.files)
    new_file = tools.merge_files(local_files)
    new_link = tools.serve_file(new_file)
    return json.dumps({'link': new_link})


hostname = tools.config().get(section='merger', option='server')
port = tools.config().get(section='merger', option='port')

app.run(host='0.0.0.0', port=int(port), debug=True)
Example No. 11
def train(saveModel=True):
    train_data = tools.config()["model"]["train-file"]
    valid_data = tools.config()["model"]["valid-file"]
    quantizeModel = tools.config()["model"]["quantize"]
    extendedValidation = tools.config()["model"]["print-confusion-matrix"]
    
    # quick train with hs
    # trainingParameters = {'input': train_data, 'epoch': 10, 'lr': 0.25, 'wordNgrams': 3, 'verbose': 2, 'minCount': 1, 'loss': "ns", "neg": 5,
    #                       'lrUpdateRate': 100, 'thread': 8, 'ws': 5, 'dim': 100, 'pretrainedVectors': "model/sentiment.vector.d100.vec"}
    # trainingParameters = {'input': train_data, 'epoch': 50, 'lr': 0.05, 'wordNgrams': 3, 'verbose': 2, 'minCount': 1, 'loss': "ns",
    #                       'lrUpdateRate': 100, 'thread': 8, 'ws': 5, 'dim': 300, 'pretrainedVectors': "cc.de.300.vec"}
    trainingParameters = tools.config()["model"]["fasttext"]
    trainingParameters["input"] = train_data

    pp = pprint.PrettyPrinter(depth=1)
    print("\ntraining with the following parameters")
    pp.pprint(trainingParameters)

    model = train_supervised(**trainingParameters)

    if quantizeModel:
        print("quantize model")
        model.quantize(input=train_data, thread=16, qnorm=True, retrain=True, cutoff=400000)

    if saveModel:
        path = tools.config()["model"]["model-path"]
        if quantizeModel:
            model.save_model(path+".ftz")
        else:
            model.save_model(path+".bin")        
        with open(path+".params", "w") as text_file:
            print(yaml.dump(tools.config()), file=text_file)  

    # validation
    if extendedValidation is False:
        print_results(*model.test(valid_data))
    else:
        data = loadValidData(valid_data)

        truth = [row[0].replace("__label__", "") for row in data]        
        texts = [row[1] for row in data]

        predictions = model.predict(texts)
        predictions = tools.flatmap(predictions[0])
        predicted = [x.replace("__label__", "") for x in predictions]
 
        precision, recall, fscore, support = score(
            truth, predicted)  # , average='macro')


        headers = ["metric", "negative", "neutral","positive"] # todo: check if headers a right
        table = []
        table.append(['precision']+[x for x in precision])
        table.append(['recall']+[x for x in recall])
        table.append(['fscore']+[x for x in fscore])
        table.append(['sample count']+[x for x in support])
        print(tabulate(table, headers, tablefmt="pipe", floatfmt=".4f"))
  

        precision, recall, fscore, support = score(
            truth, predicted, average='macro')
        print('macro fscore: {}'.format(fscore))
        precision, recall, fscore, support = score(
            truth, predicted, average='micro')
        print('micro fscore: {}'.format(fscore))

        cm = confusion_matrix(truth, predicted, labels=[
                              "negative", "neutral", "positive"])
        printcm.plot_confusion_matrix(cm=cm, target_names=[
                                      "negative", "neutral", "positive"], normalize=True, title="sentiment classification")

    return model
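Similarly, a sketch of the "model" config section that train() and Examples No. 1 and 5 read; the key names come from the lookups above, while the paths and parameter values are illustrative assumptions:

# illustrative sketch of the "model" config section; paths and parameter values are assumptions
example_model_config = {
    "model": {
        "train-file": "models/model.train",
        "valid-file": "models/model.valid",
        "test-file": "models/model.test",
        "model-path": "models/sentiment",      # ".bin" or ".ftz" is appended on save/load
        "quantize": False,
        "print-confusion-matrix": True,
        "fasttext": {"epoch": 50, "lr": 0.05, "wordNgrams": 3, "dim": 300},  # forwarded to train_supervised(**...)
    }
}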
Example No. 12
#!/usr/bin/env python
# coding: utf-8
# __author__ = 'Benjamin'

from tools import config
from celery import Celery

rmq_ip = config().get(section='rabbitmq', option='server')
rmq_port = config().get(section='rabbitmq', option='port')

celeri = Celery(
    'distributer',
    # user, password, hostname, port, vhost
    broker='amqp://*****:*****@{0}:{1}/ansene'.format(rmq_ip, rmq_port),
    backend='amqp://*****:*****@{0}:{1}//'.format(rmq_ip, rmq_port),
    include=['update_an'])

if __name__ == '__main__':
    celeri.start()
Example No. 13
#!/usr/bin/env python
# coding: utf-8
# __author__ = 'Benjamin'

import flask
import tools
import models
import dbhandler
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

app = flask.Flask(__name__)

# ------ DB setup ------

db_host = tools.config().get(section='postgre', option='server')
db_port = tools.config().get(section='postgre', option='port')
db_user = '******'
db_pwd = 'benjamin'
db_name = 'ansene'

engine = create_engine('postgresql://{0}:{1}@{2}:{3}/{4}'.format(
    db_user, db_pwd, db_host, db_port, db_name))

models.Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
conn = Session()

# ------ Web Server ------