def outputCsv(c: classifier.Classifier) -> None:
    """Export training samples plus model predictions to a CSV under /tmp.

    Columns: sharedId, sequence, training_labels, predicted_sure,
    predicted_unsure, revised_training_labels (left blank for manual review).
    With --csv_diff_only, only rows whose sure predictions differ from the
    training-label string are written.
    """
    suffix = '_diff' if FLAGS.csv_diff_only else ''
    filename = '/tmp/%s_%d%s.csv' % (FLAGS.model, FLAGS.limit, suffix)
    with open(filename, 'w') as out:
        writer = csv.writer(out)
        writer.writerow([
            'sharedId', 'sequence', 'training_labels', 'predicted_sure',
            'predicted_unsure', 'revised_training_labels'
        ])
        with sessionLock:
            samples: List[ClassificationSample] = list(
                ClassificationSample.query.find(
                    dict(model=FLAGS.model, use_for_training=True)).sort(
                        [('seqHash', -1)]).limit(FLAGS.limit))
            predicted = c.classify([s.seq for s in samples])
            for sample, pred in zip(samples, predicted):
                train_str = ';'.join(
                    lbl.topic for lbl in sample.training_labels)
                # Highest probability first; sorted() is stable for ties.
                by_score: List[Tuple[str, float]] = sorted(
                    pred.items(), key=lambda kv: kv[1], reverse=True)
                pred_sure_str = ';'.join(
                    t for t, q in by_score if q >= FLAGS.csv_sure)
                pred_unsure_str = ';'.join(
                    t for t, q in by_score if q < FLAGS.csv_sure)
                if not FLAGS.csv_diff_only or train_str != pred_sure_str:
                    writer.writerow([
                        sample.sharedId, sample.seq, train_str, pred_sure_str,
                        pred_unsure_str, ''
                    ])
    print('Wrote %s.' % filename)
def test_invalid_bert(self, fs: FakeFilesystem) -> None:
    """A config pointing at a missing BERT model must fail at classify time."""
    bad_bert_path = './bad/path/to/bert'
    # JSON whitespace is irrelevant to the parser; only values matter here.
    config = """
        {
            "bert": "%s",
            "labels": "label.vocab",
            "is_released": true,
            "description": "This is the latest model from Sascha.",
            "metadata": {
                "thesaurus": "issues"
            }
        }
    """ % (bad_bert_path)
    fs.add_real_directory('./testdata/test_model/test_instance')
    # Swap the instance's real config for one with a broken bert path.
    fs.remove_object('./testdata/test_model/test_instance/config.json')
    fs.create_file('./testdata/test_model/test_instance/config.json',
                   contents=config)
    with pytest.raises(Exception, match='SavedModel file does not exist at'):
        c = Classifier(self.BASE_CLASSIFIER_PATH, 'test_model')
        # Bad bert is only used on uncached embed.
        c.classify(['some string'])
def test_classify(self, fs: FakeFilesystem) -> None:
    """A health-related sentence classifies to the expected topic."""
    fs.add_real_directory('./testdata/test_model/test_instance')
    fs.add_real_directory('./testdata/test_model/test_instance_unreleased')
    c = Classifier('./testdata', 'test_model')
    result = c.classify(['Increase access to health care'])
    assert c.labels is not None
    assert c.embedder is not None
    assert c.predictor is not None
    # The released instance must be picked over the unreleased one.
    assert c.instance == 'test_instance'
    assert result
    # result ~ [{topic: probability, topic2: probability, ...}, ...]
    scores = result[0]
    for topic in scores:
        assert topic in c.labels
    assert len(result) == 1
    assert scores['Right to health'] >= 0.8
def test_classify(self, fs: FakeFilesystem) -> None:
    """A sample sentence classifies to a known vocab topic with confidence."""
    fs.add_real_directory('./testdata/test_model/test_instance')
    fs.add_real_directory('./testdata/test_model/test_instance_unreleased')
    c = Classifier('./testdata', 'test_model')
    result = c.classify(['Where is my medical book?'])
    assert c.vocab is not None
    assert c.embedder is not None
    assert c.predictor is not None
    # The released instance must be picked over the unreleased one.
    assert c.instance == 'test_instance'
    print(result)
    assert result
    # result ~ [{topic: probability, topic2: probability, ...}, ...]
    scores = result[0]
    for topic in scores:
        assert topic in c.vocab
    assert scores['Right to education'] >= 0.7
def outputCsv(c: classifier.Classifier) -> None:
    """Write non-training samples and their predictions to a local CSV.

    When --csv is given, only samples whose text contains one of the
    sequences from that file's --text_col column are exported. With
    --csv_diff_only, rows whose prediction string equals the sorted
    training-label string are skipped.
    """
    suffix = '_diff' if FLAGS.csv_diff_only else ''
    filename = './%s_%d%s.csv' % (FLAGS.model, FLAGS.limit, suffix)
    if FLAGS.csv:
        with open(FLAGS.csv, 'r') as f, sessionLock:
            subset_seqs: List[str] = [
                row[FLAGS.text_col] for row in csv.DictReader(f)
            ]
        print(subset_seqs[:10])
    with open(filename, 'w') as out:
        writer = csv.writer(out)
        writer.writerow([
            'sharedId', 'sequence', 'training_labels', 'predictions',
            'probabilities'
        ])
        with sessionLock:
            samples: List[ClassificationSample] = list(
                ClassificationSample.query.find(
                    dict(model=FLAGS.model, use_for_training=False)).sort(
                        [('seqHash', -1)]).limit(FLAGS.limit))
            if FLAGS.csv:
                # Substring match against the subset sequences read above.
                samples = [
                    s for s in samples
                    if any(x in s.seq for x in subset_seqs)
                ]
            predicted = c.classify([s.seq for s in samples])
            for sample, pred in zip(samples, predicted):
                train_str = ';'.join(
                    sorted(lbl.topic for lbl in sample.training_labels))
                # Alphabetical by topic so both columns stay aligned.
                ordered: List[Tuple[str, float]] = sorted(pred.items())
                predictions = ';'.join(t for t, _ in ordered)
                probabilities = ';'.join(str(q) for _, q in ordered)
                if not FLAGS.csv_diff_only or train_str != predictions:
                    writer.writerow([
                        sample.sharedId, sample.seq, train_str, predictions,
                        probabilities
                    ])
    print('Wrote %s.' % filename)
def call_cls(urls, callback, kws, labels):
    """
    Provides the communication of the status and results between the
    classifier process, the main process and the client (when using via
    web interface).

    # Input:
    - urls (list): a list of urls to be classified.
    - callback (str): the callback url, or None for interactive ZMQ mode.
    - kws (list): list of pre-defined keywords in the database.
    - labels (list): list of pre-defined labels in the database.
    """
    context = zmq.Context()
    socket = context.socket(zmq.REP)
    socket.bind('tcp://*:{PORT}'.format(PORT=ZMQ_LISTENING_PORT))
    poller = zmq.Poller()
    poller.register(socket, zmq.POLLIN)
    print("calling classifier")
    # A REP socket must strictly alternate recv/send, so in interactive mode
    # wait for the client's first request before sending anything.
    msg = None
    while msg is None and callback is None:
        socks = dict(poller.poll())
        if socket in socks:
            msg = socket.recv()
    print("callback: ", callback)
    cls = Classifier(model=model)
    if callback is None:
        # Interactive mode: stream per-step status over the REP socket for
        # the first url only.
        url = urls[0]
        for status in cls.classify(url, kws, labels):
            print("status:", status)
            if isinstance(status, str):
                # Progress (or 'error') message.
                socket.send_string(json.dumps({'status': status, 'url': url}))
                if status == 'error':
                    break
                gevent.sleep(0.1)
                # Client ack keeps the REP recv/send state machine valid.
                msg = socket.recv()
            else:
                # Final result payload (non-string) ends the exchange.
                socket.send_string(json.dumps(status))
    else:
        print("calculating for direct post")
        results = []
        for url in urls:
            for status in cls.classify(url, kws, labels):
                if isinstance(status, str):
                    data = json.dumps({'status': status + " in url " + url})
                    print("sending status to callback")
                    # NOTE(review): posting a pre-serialized string via
                    # `json=` double-encodes it; kept as-is since the
                    # receiver presumably expects this — confirm.
                    requests.post(callback, json=data)
                    if status == 'error':
                        results += [{
                            'url': url,
                            'restrict': False,
                            'reasons': ['error']
                        }]
                        break
                else:
                    results += [status]
        # TODO call update db
        data = json.dumps({'sites': results})
        print("sending results to callback")
        requests.post(callback, json=data)
    sys.exit()