def __init__(self, fastTextModel=None):
    if fastTextModel is None:
        # load the model from disk; quantized models use the .ftz extension
        fileType = ".ftz" if tools.config()["model"]["quantize"] is True else ".bin"
        modelFile = tools.config()["model"]["model-path"] + fileType
        print("model under test: " + modelFile)
        self.model = fasttext.load_model(modelFile)
    else:
        self.model = fastTextModel
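# Usage sketch for the wrapper class (FastTextTest is the class name used by
# the test runner below; passing no model loads the file named in the config):
#
#     tester = FastTextTest()       # loads model-path + (.ftz|.bin) from config
#     tester = FastTextTest(model)  # or wraps an already-loaded fastText model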
def push_an(dest, amd):
    '''
    Wrapper for data pushing toward the Storer.

    :param dest: Service hosting the received info: 'storer'
    :type dest: str
    :param amd: Amendment to be added to the destination service
    :type amd: dict
    :return: Status code
    :rtype: str
    '''
    host = config().get(section=dest, option='server')
    port = config().get(section=dest, option='port')
    hostname = '{}:{}'.format(host, port)
    status = requests.post(url='http://{0}/an'.format(hostname), json=amd)
    return status.content
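# Usage sketch for push_an; the amendment's field names are assumptions,
# not taken from the Storer's actual schema:
#
#     status = push_an('storer', {'proj_id': 'P-1', 'text': 'new amendment'})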
def pull_an(source, proj_id=None, exam_id=None):
    '''
    Wrapper for data pulling from Storer or Crawler.

    :param source: Service providing the pulled info: 'storer' or 'crawler'
    :type source: str
    :param proj_id: Reference ID of the project
    :type proj_id: str
    :param exam_id: Reference ID of the exam
    :type exam_id: str
    :return: Raw data provided by the called service
    :rtype: dict
    '''
    host = config().get(section=source, option='server')
    port = config().get(section=source, option='port')
    hostname = '{0}:{1}'.format(host, port)
    data = {"list_n_project": [],
            "list_project": [],
            "all_exams": [],
            "all_amds": []}
    if proj_id:
        if exam_id:
            # one specific exam of one project
            data0 = requests.get(url='http://{0}/an/{1}/{2}'.format(hostname, proj_id, exam_id))
            try:
                data = data0.json()
            except Exception as err:
                logger.error('Error pulling amd:\t{} -\t{}'.format(err, data0.content))
        else:
            # all exams of one project
            data0 = requests.get(url='http://{0}/an/{1}'.format(hostname, proj_id))
            try:
                data = data0.json()
            except Exception as err:
                logger.error('Error pulling exam:\t{} -\t{}'.format(err, data0.content))
    else:
        # everything the service holds
        data0 = requests.get(url='http://{0}/an'.format(hostname))
        try:
            data = data0.json()
        except Exception as err:
            logger.error('Error pulling amd:\t{} -\t{}'.format(err, data0.content))
    return data
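# Usage sketch for pull_an; the IDs are hypothetical placeholders:
#
#     everything = pull_an('storer')                          # all stored data
#     project = pull_an('storer', proj_id='P-1')              # one project
#     exam = pull_an('storer', proj_id='P-1', exam_id='E-1')  # one exam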
def check_dev(name):
    global dev_checked
    global found_dev
    if not dev_checked:
        found_dev, board = iio_scanner.find_device(name)
        if found_dev:
            global URI
            global devices_us
            global devices
            global boot_bin
            global board_name
            global board_config
            URI = board.uri
            board_config = tools.config(board_name, boot_bin, board.uri[3:],
                                        devices, devices_us)
        dev_checked = True
    return found_dev
def run(fastTextModel, printErrors=False):
    model = FastTextTest(fastTextModel)
    data = readTestData(tools.config()["model"]["test-file"])
    table = []
    all_truth = []
    all_prediction = []
    for row in data:
        truth, prediction = model.predict(row)
        result = stat_fscore(truth, prediction)
        table.append([row[0]] + result)
        all_truth.extend(truth)
        all_prediction.extend(prediction)
    table.append(["all"] + stat_fscore(all_truth, all_prediction))
    sumRow = ["sum"]
    for col in range(1, len(table[0])):
        rowSum = sum(map(lambda x: x[col], table))
        sumRow.append(rowSum)
    table.append(sumRow)
    headers = [
        "file", "precisionMicro", "recallMicro", "fscoreMicro",
        "precisionMacro", "recallMacro", "fscoreMacro"
    ]
    print(tabulate(table, headers, tablefmt="pipe", floatfmt=".4f"))
    title_description = "FastText"
    plt = printcm.plot_confusion_matrix(
        all_truth,
        all_prediction,
        classes=["negative", "neutral", "positive"],
        normalize=True,
        title=title_description)
    plt.savefig("models/sentiment-cm.pdf")
    return table
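# Usage sketch for the evaluation runner (assumes the config's "model" section
# names a trained model and a test file):
#
#     table = run(None)   # load the model from the configured path
#     table = run(model)  # or evaluate an already-loaded fastText model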
def run():
    if tools.config()["preprocessing"]["use-cache"] is False:
        subprocess.call(f"rm -rf {pathForSets}* {pathForTestsets}*", shell=True)

    dataLoaders = [
        ["emotions", lambda: tools.loadData(source_data + "emotions")],
        ["germeval", lambda: tools.loadGermeval2017(
            source_data + "germeval2017/set_v1.4.tsv")],
        ["sb10k", lambda: tools.loadData(
            source_data + "SB10k/not-preprocessed/corpus_label_text.tsv", "\t")],
        ["PotTS", lambda: tools.loadData(
            source_data + "PotTS/not-preprocessed/corpus_label_text.tsv", "\t")],
        ["filmstarts", lambda: tools.loadFilmstarts(
            source_data + "filmstarts/filmstarts.tsv")],
        ["scare", lambda: tools.loadScareSet(
            source_data + "scare_v1.0.0_data/reviews/")],
        ["holidaycheck", lambda: tools.loadHolidaycheck(
            source_data + "holidaycheck/holidaycheck.clean.filtered.tsv")],
        ["leipzig-mixed-typical-2011", lambda: tools.loadData(
            source_data + "leipzig/deu-mixed-labeled")],
        ["leipzig-newscrawl-2017", lambda: tools.loadData(
            source_data + "leipzig/deu-newscrawl-2017-labeled")],
        ["leipzig-deu-wikipedia-2016", lambda: tools.loadData(
            source_data + "leipzig/deu-wikipedia-2016-labeled")],
    ]

    dataSets = []
    table = []
    dataSetsToLoad = tools.config()["datasets"]
    for dataSet in dataSetsToLoad:
        if dataSet["train"] is True or dataSet["test"] is True:
            # if this fails, a loader named in the config is not defined in the code
            loader = next(
                filter(lambda x: x[0] == dataSet["name"], dataLoaders))
            # split every set into its 3 classes
            meta_info = cleanAndSplit(*loader)
            if dataSet["train"] is True:
                dataSets.append(loader)
            table.append(list(dataSet.values()) + meta_info)

    headers = [
        "set name", "training", "test", "from cache", "positive", "neutral",
        "negative", "total"
    ]
    print(tabulate(table, headers, tablefmt="pipe", floatfmt=".4f"))

    trainSets = [
        dataset["name"] for dataset in dataSetsToLoad
        if dataset["train"] is True
    ]

    # combine single datasets into one set per class
    neutralSamples = createSetForClass("neutral", trainSets)
    positiveSamples = createSetForClass("positive", trainSets)
    negativeSamples = createSetForClass("negative", trainSets)
    print("\nclass distribution in data set:")
    print("neutral \t{}\npositive\t{}\nnegative\t{}".format(
        neutralSamples, positiveSamples, negativeSamples))

    # balance classes
    if tools.config()['preprocessing']['balance'] == 'down':
        print("\nbalance classes with downsampling")
        samplesPerClass = min(neutralSamples, positiveSamples, negativeSamples)
        print("random samples per class: {}".format(samplesPerClass))
        print("total samples: {}".format(samplesPerClass * 3))
        # train / test split per class
        split(samplesPerClass, "neutral")
        split(samplesPerClass, "positive")
        split(samplesPerClass, "negative")
    else:
        split(neutralSamples, "neutral")
        split(positiveSamples, "positive")
        split(negativeSamples, "negative")
        print(f"random samples per class neutral: {neutralSamples}")
        print(f"random samples per class positive: {positiveSamples}")
        print(f"random samples per class negative: {negativeSamples}")
        print(f"total samples: {neutralSamples + positiveSamples + negativeSamples}")

    # combine the classes into one set
    trainFile = path + "model.train"
    validFile = path + "model.valid"
    testFile = path + "model.test"
    executeToFile(f"cat {pathForSets}all.train.* | cut -f2,3",
                  trainFile, shellMode=True)
    executeToFile(f"cat {pathForSets}all.valid.* | cut -f2,3",
                  validFile, shellMode=True)
    executeToFile(f"cat {pathForSets}all.test.*", testFile, shellMode=True)

    totalTrain = tools.lineCount(trainFile)
    totalValid = tools.lineCount(validFile)
    totalTest = tools.lineCount(testFile)
    totalLines = float(totalTrain + totalValid + totalTest)
    print("\nsamples in:\ntrain\t{}\nvalid\t{}\ntest\t{}\nsum\t{}".format(
        totalTrain, totalValid, totalTest, totalLines))
    print("\npercentage in:\ntrain\t{}\nvalid\t{}\ntest\t{}".format(
        totalTrain / totalLines, totalValid / totalLines,
        totalTest / totalLines))

    test_sets = [
        dataset["name"] for dataset in dataSetsToLoad
        if dataset["train"] is False and dataset["test"] is True
    ]
    print(f"datasets just for testing {test_sets}")
    if os.path.exists(testFile + ".extra"):
        os.remove(testFile + ".extra")
    for test_set in test_sets:
        executeToFile(f"cat {pathForSets}{test_set}.*",
                      testFile + ".extra", mode="a", shellMode=True)
    executeToFile(f"cat {testFile} {testFile}.extra",
                  path + "model.test.full", shellMode=True)
    executeToFile(
        f"cat {pathForSets}all.negative {pathForSets}all.neutral {pathForSets}all.positive | cut -f3",
        path + "wordvecc.train", shellMode=True)
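# Sketch of one entry in tools.config()["datasets"]; the key names are taken
# from the lookups in run() above ("name", "train", "test"), the values are
# illustrative:
#
#     {"name": "germeval", "train": True, "test": False}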
#!/usr/bin/env python
# coding: utf-8
# __author__ = 'Benjamin'

import flask
import json

import tools

app = flask.Flask(__name__)


@app.route('/')
def status():
    return json.dumps({'Service': 'transformer', 'Status': 'Alive'})


if __name__ == '__main__':
    hostname = tools.config().get(section='transformer', option='server')
    port = tools.config().get(section='transformer', option='port')
    app.run(host='0.0.0.0', port=int(port), debug=True)
import re
from multiprocessing import Pool
from typing import List

import tools
from string import digits
from tqdm import tqdm

cleanChars = re.compile(r'[^A-Za-züöäÖÜÄß.!? ]', re.MULTILINE)
cleanHttpUrls = re.compile(r'https*\S+', re.MULTILINE)
cleanAtMentionsTwitter = re.compile(r'@\S+', re.MULTILINE)

config = tools.config()["preprocessing"]


def cleanData(data: List[str]) -> List[str]:
    with Pool(6) as p:
        data = p.map(cleanRow, data)
    # for row in tqdm(data):
    #     row[1] = cleanText(row[1])
    data = [row for row in data if row[1]]  # filter out empty rows
    return data


def cleanRow(row):
    row[1] = cleanText(row[1])
    return row


def replaceNumbers(text: str) -> str:
    # spell out digits as German number words
    text = text.replace("0", " null")
    text = text.replace("1", " eins")
    text = text.replace("2", " zwei")
    text = text.replace("3", " drei")
    text = text.replace("4", " vier")
    text = text.replace("5", " fünf")
    text = text.replace("6", " sechs")
    text = text.replace("7", " sieben")
    text = text.replace("8", " acht")
    text = text.replace("9", " neun")
    return text
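# Usage sketch for cleanData; the row layout (text at index 1) is inferred
# from cleanRow, and the sample rows are made up:
#
#     rows = [["positive", "Das ist super! https://example.com"],
#             ["neutral", "@user 123"]]
#     rows = cleanData(rows)  # cleans each text, drops rows left empty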
import os

import flask
from flask import request

import tools

app = flask.Flask(__name__)


@app.route('/')
def status():
    return flask.jsonify({'Status': 'Alive'})


@app.route('/merge', methods=['POST'])
def upload():
    filenames = tools.get_files(request.files)
    if filenames:
        merged = tools.merge_files(filenames)
        uploaded = tools.serve_file(merged)
        os.remove(merged)
        return uploaded

# @app.route('/uploads/<filename>')
# def uploaded_file(filename):
#     return send_from_directory(app.config['UPLOAD_FOLDER'], filename)


hostname = tools.config().get(section='server', option='address')
port = tools.config().get(section='server', option='port')
app.run(host=hostname, port=int(port), debug=True)
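# Client-side sketch for the /merge endpoint; the multipart field name and the
# port are assumptions (the real port comes from the config above):
#
#     import requests
#     with open('a.pdf', 'rb') as f:
#         r = requests.post('http://localhost:5000/merge', files={'file': f})
#     print(r.content)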
# __author__ = 'Benjamin'

import flask
import json

import tools

app = flask.Flask(__name__)


@app.route('/')
def status():
    return json.dumps({'Status': 'Alive'})


@app.route('/files', methods=['POST'])
def merge_and_send():
    # Processing may take a long time; watch out for the HTTP request timeout!
    # TODO: Request files (one by one) and merge them
    # TODO: Upload to goploader (depado or self-hosted) and mail the link
    # TODO: Check timeout issues
    local_files = tools.get_files(flask.request.files)
    new_file = tools.merge_files(local_files)
    new_link = tools.serve_file(new_file)
    return json.dumps({'link': new_link})


hostname = tools.config().get(section='merger', option='server')
port = tools.config().get(section='merger', option='port')
app.run(host='0.0.0.0', port=int(port), debug=True)
def train(saveModel=True):
    train_data = tools.config()["model"]["train-file"]
    valid_data = tools.config()["model"]["valid-file"]
    quantizeModel = tools.config()["model"]["quantize"]
    extendedValidation = tools.config()["model"]["print-confusion-matrix"]

    # quick train with hs
    # trainingParameters = {'input': train_data, 'epoch': 10, 'lr': 0.25,
    #                       'wordNgrams': 3, 'verbose': 2, 'minCount': 1,
    #                       'loss': "ns", "neg": 5, 'lrUpdateRate': 100,
    #                       'thread': 8, 'ws': 5, 'dim': 100,
    #                       'pretrainedVectors': "model/sentiment.vector.d100.vec"}
    # trainingParameters = {'input': train_data, 'epoch': 50, 'lr': 0.05,
    #                       'wordNgrams': 3, 'verbose': 2, 'minCount': 1,
    #                       'loss': "ns", 'lrUpdateRate': 100, 'thread': 8,
    #                       'ws': 5, 'dim': 300, 'pretrainedVectors': "cc.de.300.vec"}
    trainingParameters = tools.config()["model"]["fasttext"]
    trainingParameters["input"] = train_data

    pp = pprint.PrettyPrinter(depth=1)
    print("\ntraining with the following parameters:")
    pp.pprint(trainingParameters)

    model = train_supervised(**trainingParameters)

    if quantizeModel:
        print("quantize model")
        model.quantize(input=train_data, thread=16, qnorm=True,
                       retrain=True, cutoff=400000)

    if saveModel:
        path = tools.config()["model"]["model-path"]
        if quantizeModel:
            model.save_model(path + ".ftz")
        else:
            model.save_model(path + ".bin")
        with open(path + ".params", "w") as text_file:
            print(yaml.dump(tools.config()), file=text_file)

    # validation
    if extendedValidation is False:
        print_results(*model.test(valid_data))
    else:
        data = loadValidData(valid_data)
        truth = [row[0].replace("__label__", "") for row in data]
        texts = [row[1] for row in data]
        predictions = model.predict(texts)
        predictions = tools.flatmap(predictions[0])
        predicted = [x.replace("__label__", "") for x in predictions]

        precision, recall, fscore, support = score(truth, predicted)  # , average='macro')
        headers = ["metric", "negative", "neutral", "positive"]  # todo: check if the headers are right
        table = []
        table.append(['precision'] + [x for x in precision])
        table.append(['recall'] + [x for x in recall])
        table.append(['fscore'] + [x for x in fscore])
        table.append(['sample count'] + [x for x in support])
        print(tabulate(table, headers, tablefmt="pipe", floatfmt=".4f"))

        precision, recall, fscore, support = score(truth, predicted, average='macro')
        print('macro fscore: {}'.format(fscore))
        precision, recall, fscore, support = score(truth, predicted, average='micro')
        print('micro fscore: {}'.format(fscore))

        cm = confusion_matrix(truth, predicted,
                              labels=["negative", "neutral", "positive"])
        printcm.plot_confusion_matrix(cm=cm,
                                      target_names=["negative", "neutral", "positive"],
                                      normalize=True,
                                      title="sentiment classification")
    return model
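# Sketch of the expected tools.config()["model"]["fasttext"] section; the keys
# mirror the commented-out parameter dicts above, the values are illustrative
# only, not the project's defaults:
#
#     {'epoch': 50, 'lr': 0.05, 'wordNgrams': 3, 'minCount': 1, 'loss': 'ns',
#      'lrUpdateRate': 100, 'thread': 8, 'ws': 5, 'dim': 300}
#
# A typical experiment call that skips writing model files:
#
#     model = train(saveModel=False)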
#!/usr/bin/env python
# coding: utf-8
# __author__ = 'Benjamin'

from tools import config
from celery import Celery

rmq_ip = config().get(section='rabbitmq', option='server')
rmq_port = config().get(section='rabbitmq', option='port')

celeri = Celery(
    'distributer',
    # user, password, hostname, port, vhost
    broker='amqp://*****:*****@{0}:{1}/ansene'.format(rmq_ip, rmq_port),
    backend='amqp://*****:*****@{0}:{1}//'.format(rmq_ip, rmq_port),
    include=['update_an'])

if __name__ == '__main__':
    celeri.start()
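# Sketch of a task module picked up via include=['update_an']; the import path
# and task body are assumptions, only the Celery wiring above is given:
#
#     from distributer import celeri
#
#     @celeri.task
#     def update_an(amd):
#         ...  # e.g. forward the amendment to the Storer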
#!/usr/bin/env python
# coding: utf-8
# __author__ = 'Benjamin'

import flask
import tools
import models
import dbhandler
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

app = flask.Flask(__name__)

# ------ DB setup ------
db_host = tools.config().get(section='postgre', option='server')
db_port = tools.config().get(section='postgre', option='port')
db_user = '******'
db_pwd = 'benjamin'
db_name = 'ansene'

engine = create_engine('postgresql://{0}:{1}@{2}:{3}/{4}'.format(
    db_user, db_pwd, db_host, db_port, db_name))
models.Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
conn = Session()

# ------ Web Server ------