Exemplo n.º 1
0
def outputCsv(c: classifier.Classifier) -> None:
    """Write training samples and their predicted topics to a CSV file.

    Fetches up to FLAGS.limit samples of FLAGS.model that are marked
    use_for_training=True (sorted by seqHash, descending), classifies their
    sequences with `c`, and writes one row per sample to
    /tmp/<model>_<limit>[_diff].csv.

    Predicted topics are ordered by descending score and split at
    FLAGS.csv_sure: scores >= the threshold go into 'predicted_sure', the
    rest into 'predicted_unsure'. When FLAGS.csv_diff_only is set, only rows
    whose joined training labels differ from the joined sure predictions are
    written. The 'revised_training_labels' column is always written empty.

    Args:
        c: classifier whose classify() is called with the list of sample
            sequences and is expected to return one {topic: score} mapping
            per sequence.
    """
    filename = '/tmp/%s_%d%s.csv' % (FLAGS.model, FLAGS.limit,
                                     '_diff' if FLAGS.csv_diff_only else '')
    # newline='' hands line-ending control to the csv module; without it,
    # platforms that translate '\n' end up with '\r\r\n' row terminators.
    with open(filename, 'w', newline='') as csvFile:
        writer = csv.writer(csvFile)
        writer.writerow([
            'sharedId', 'sequence', 'training_labels', 'predicted_sure',
            'predicted_unsure', 'revised_training_labels'
        ])
        # The query is materialized into a list while the lock is held so
        # the lock can be released before the classification call.
        with sessionLock:
            samples: List[ClassificationSample] = list(
                ClassificationSample.query.find(
                    dict(model=FLAGS.model, use_for_training=True)).sort([
                        ('seqHash', -1)
                    ]).limit(FLAGS.limit))
        predicted = c.classify([s.seq for s in samples])
        for sample, pred in zip(samples, predicted):
            train_str = ';'.join([l.topic for l in sample.training_labels])
            # Highest score first.
            sorted_pred: List[Tuple[str, float]] = sorted(pred.items(),
                                                          key=lambda e: -e[1])
            pred_sure_str = ';'.join(
                [t for t, q in sorted_pred if q >= FLAGS.csv_sure])
            pred_unsure_str = ';'.join(
                [t for t, q in sorted_pred if q < FLAGS.csv_sure])
            if not FLAGS.csv_diff_only or train_str != pred_sure_str:
                writer.writerow([
                    sample.sharedId, sample.seq, train_str, pred_sure_str,
                    pred_unsure_str, ''
                ])
    print('Wrote %s.' % filename)
Exemplo n.º 2
0
 def test_invalid_bert(self, fs: FakeFilesystem) -> None:
     """Expect a 'SavedModel file does not exist' error for a bad bert path.

     The real test instance directory is mapped into the fake filesystem and
     its config.json is replaced by one whose "bert" entry points at a path
     that does not exist.
     """
     config_path = './testdata/test_model/test_instance/config.json'
     missing_bert = './bad/path/to/bert'
     config_template = """
     {
         "bert": "%s",
         "labels": "label.vocab",
         "is_released": true,
         "description": "This is the latest model from Sascha.",
         "metadata": {
             "thesaurus": "issues"
         }
     }
     """
     fs.add_real_directory('./testdata/test_model/test_instance')
     # Swap the real config for the one referencing the missing bert path.
     fs.remove_object(config_path)
     fs.create_file(config_path, contents=config_template % missing_bert)
     with pytest.raises(Exception,
                        match='SavedModel file does not exist at'):
         broken = Classifier(self.BASE_CLASSIFIER_PATH, 'test_model')
         # Bad bert is only used on uncached embed, so classify is called
         # inside the raises block as well.
         broken.classify(['some string'])
Exemplo n.º 3
0
    def test_classify(self, fs: FakeFilesystem) -> None:
        """Classify one health-related sentence and check the prediction."""
        for instance_dir in (
                './testdata/test_model/test_instance',
                './testdata/test_model/test_instance_unreleased'):
            fs.add_real_directory(instance_dir)
        clf = Classifier('./testdata', 'test_model')

        predictions = clf.classify(['Increase access to health care'])

        # The classifier's components must be populated.
        assert clf.labels is not None
        assert clf.embedder is not None
        assert clf.predictor is not None

        # Of the two instances made available, 'test_instance' is selected.
        assert clf.instance == 'test_instance'
        assert predictions
        # predictions ~ [{topic: probability, topic2: probability, ...}, ...]
        first = predictions[0]
        for topic in first:
            assert topic in clf.labels
        assert len(predictions) == 1
        assert first['Right to health'] >= 0.8
    def test_classify(self, fs: FakeFilesystem) -> None:
        """Classify one education-related sentence and check the prediction."""
        for instance_dir in (
                './testdata/test_model/test_instance',
                './testdata/test_model/test_instance_unreleased'):
            fs.add_real_directory(instance_dir)
        clf = Classifier('./testdata', 'test_model')

        predictions = clf.classify(['Where is my medical book?'])

        # The classifier's components must be populated.
        assert clf.vocab is not None
        assert clf.embedder is not None
        assert clf.predictor is not None

        # Of the two instances made available, 'test_instance' is selected.
        assert clf.instance == 'test_instance'
        print(predictions)
        assert predictions
        # predictions ~ [{topic: probability, topic2: probability, ...}, ...]
        first = predictions[0]
        for topic in first:
            assert topic in clf.vocab
        assert first['Right to education'] >= 0.7
Exemplo n.º 5
0
def outputCsv(c: classifier.Classifier) -> None:
    """Write non-training samples and their predictions to a CSV file.

    Fetches up to FLAGS.limit samples of FLAGS.model that are marked
    use_for_training=False (sorted by seqHash, descending), classifies their
    sequences with `c`, and writes one row per sample to
    ./<model>_<limit>[_diff].csv.

    If FLAGS.csv is set, that file is read first and the values of its
    FLAGS.text_col column are used as a filter: only samples whose sequence
    contains at least one of those values (substring match) are kept. The
    filter is applied after the FLAGS.limit cut-off of the query.

    Predicted topics are written sorted by topic name, with their scores in
    the same order in the 'probabilities' column. When FLAGS.csv_diff_only is
    set, only rows whose sorted training labels differ from the predicted
    topics are written.

    Args:
        c: classifier whose classify() is called with the list of sample
            sequences and is expected to return one {topic: score} mapping
            per sequence.
    """
    filename = './%s_%d%s.csv' % (FLAGS.model, FLAGS.limit,
                                  '_diff' if FLAGS.csv_diff_only else '')
    subset_seqs: List[str] = []
    if FLAGS.csv:
        # newline='' is what the csv module expects for files it reads, so
        # newlines embedded in quoted fields are interpreted correctly.
        with open(FLAGS.csv, 'r', newline='') as csvFile, sessionLock:
            for row in csv.DictReader(csvFile):
                subset_seqs.append(row[FLAGS.text_col])
            print(subset_seqs[:10])

    # newline='' hands line-ending control to the csv module; without it,
    # platforms that translate '\n' end up with '\r\r\n' row terminators.
    with open(filename, 'w', newline='') as csvFile:
        writer = csv.writer(csvFile)
        writer.writerow([
            'sharedId', 'sequence', 'training_labels', 'predictions',
            'probabilities'
        ])
        with sessionLock:
            samples: List[ClassificationSample] = list(
                ClassificationSample.query.find(
                    dict(model=FLAGS.model, use_for_training=False)).sort([
                        ('seqHash', -1)
                    ]).limit(FLAGS.limit))
            if FLAGS.csv:
                # Keep a sample if any requested text occurs inside its seq.
                samples = [
                    s for s in samples if any(x in s.seq for x in subset_seqs)
                ]
        predicted = c.classify([s.seq for s in samples])
        for sample, pred in zip(samples, predicted):
            training_labels = [l.topic for l in sample.training_labels]
            train_str = ';'.join(sorted(training_labels))

            # Sorting the (topic, score) pairs orders them by topic name, so
            # the joined predictions are comparable with the sorted labels.
            sorted_pred: List[Tuple[str, float]] = sorted(pred.items())
            predictions = ';'.join([t for t, q in sorted_pred])
            probabilities = ';'.join([str(q) for t, q in sorted_pred])

            if not FLAGS.csv_diff_only or train_str != predictions:
                writer.writerow([
                    sample.sharedId, sample.seq, train_str, predictions,
                    probabilities
                ])
    print('Wrote %s.' % filename)
Exemplo n.º 6
0
def call_cls(urls, callback, kws, labels):
    """
    Provides the communication of the status and results between the
    classifier process, the main process and the client (when using via web
    interface).

    Two modes, selected by `callback`:

    - callback is None: waits for a first request on a ZMQ REP socket bound
      to ZMQ_LISTENING_PORT, classifies only urls[0], and sends every item
      yielded by the classifier back over the socket as JSON. String items
      are sent as {'status': ..., 'url': ...}; any other item is sent as is.
    - callback is set: classifies every url, POSTs each string status to the
      callback url, collects the non-string items (or an error entry for a
      url whose status is 'error') and finally POSTs them as {'sites': [...]}.

    The function ends by calling sys.exit(), so it never returns normally.

    # Input:
        - urls (list): a list of urls to be classified.
        - callback(str): the callback url, or None to reply over ZMQ.
        - kws (list): list of pre-defined keywords in the database.
        - labels (list): list of pre-defined labels in the database.
    """
    context = zmq.Context()
    socket = context.socket(zmq.REP)
    socket.bind('tcp://*:{PORT}'.format(PORT=ZMQ_LISTENING_PORT))
    poller = zmq.Poller()
    poller.register(socket, zmq.POLLIN)
    print("calling classifier")

    # In ZMQ mode, block until the first request arrives: a REP socket has
    # to receive a message before it is allowed to send one.
    msg = None
    while msg is None and callback is None:
        socks = dict(poller.poll())
        if socket in socks:
            msg = socket.recv()

    print("callback: ", callback)
    # NOTE(review): `model` is not defined in this function; presumably a
    # module-level name - confirm.
    clf = Classifier(model=model)
    if callback is None:
        url = urls[0]
        for status in clf.classify(url, kws, labels):
            print("status:", status)
            if isinstance(status, str):
                socket.send_string(json.dumps({'status': status, 'url': url}))
                if status == 'error':
                    break
                gevent.sleep(0.1)
                # Wait for the next request before the next reply is sent.
                socket.recv()
            else:
                # Non-string items are sent without waiting for a further
                # request afterwards.
                socket.send_string(json.dumps(status))
    else:
        print("calculating for direct post")
        results = []
        for url in urls:
            for status in clf.classify(url, kws, labels):
                if isinstance(status, str):
                    # NOTE(review): `data` is already a JSON string and is
                    # passed via json=, so it is encoded a second time. Left
                    # unchanged because the receiver may rely on it - confirm.
                    data = json.dumps({'status': status + " in url " + url})
                    print("sending status to callback")
                    requests.post(callback, json=data)
                    if status == 'error':
                        results.append({'url': url,
                                        'restrict': False,
                                        'reasons': ['error']})
                        break
                else:
                    results.append(status)

        #TODO call update db
        data = json.dumps({'sites': results})
        print("sending results to callback")
        requests.post(callback, json=data)
    sys.exit()