示例#1
0
    def _analyze(self, text, project, params):
        """Analyze text by POSTing it to a REST endpoint and return hits.

        Sends `text` (and optionally a project identifier) to the endpoint
        given in params['endpoint']. Any HTTP, JSON-decoding or
        response-structure problem is logged as a warning and an empty
        ListAnalysisResult is returned instead of raising.
        """
        data = {'text': text}
        if 'project' in params:
            data['project'] = params['project']

        try:
            # Fixed timeout so a stalled endpoint cannot block the
            # analysis pipeline indefinitely (requests has no default).
            req = requests.post(params['endpoint'], data=data, timeout=60.0)
            req.raise_for_status()
        except requests.exceptions.RequestException as err:
            self.warning("HTTP request failed: {}".format(err))
            return ListAnalysisResult([], project.subjects)

        try:
            response = req.json()
        except ValueError as err:
            self.warning("JSON decode failed: {}".format(err))
            return ListAnalysisResult([], project.subjects)

        # Some endpoints wrap the hit list in a 'results' field.
        if 'results' in response:
            results = response['results']
        else:
            results = response

        try:
            # KeyError covers hit dicts missing 'uri'/'label'/'score';
            # TypeError/ValueError cover non-dict hits and bad scores.
            return ListAnalysisResult([AnalysisHit(uri=h['uri'],
                                                   label=h['label'],
                                                   score=h['score'])
                                       for h in results
                                       if h['score'] > 0.0],
                                      project.subjects)
        except (TypeError, ValueError, KeyError) as err:
            self.warning("Problem interpreting JSON data: {}".format(err))
            return ListAnalysisResult([], project.subjects)
示例#2
0
def test_analysishits_vector_notfound(document_corpus):
    """A hit whose URI is absent from the subject index must leave the
    result vector all zeros."""
    index = SubjectIndex(document_corpus)
    missing_hit = AnalysisHit(uri='http://example.com/notfound',
                              label='not found',
                              score=1.0)
    result = ListAnalysisResult([missing_hit], index)
    assert result.vector.sum() == 0
示例#3
0
def generate_hits(n, subject_index):
    """Return a ListAnalysisResult with *n* synthetic hits.

    Hit *i* gets URI http://example.org/i, label "hit i" and a harmonically
    decreasing score 1/(i+1), so scores are strictly ordered and positive.
    """
    # Comprehension instead of a manual append loop (pure construction).
    hits = [AnalysisHit(uri='http://example.org/{}'.format(i),
                        label='hit {}'.format(i),
                        score=1.0 / (i + 1))
            for i in range(n)]
    return ListAnalysisResult(hits, subject_index)
示例#4
0
def test_hitfilter_zero_score(subject_index):
    """HitFilter must drop hits whose score is exactly zero."""
    zero_hit = AnalysisHit(uri='uri', label='label', score=0.0)
    unfiltered = ListAnalysisResult([zero_hit], subject_index)
    filtered = HitFilter()(unfiltered)
    assert isinstance(filtered, AnalysisResult)
    assert len(filtered) == 0
示例#5
0
 def _analyze_chunks(self, chunktexts, project):
     """Predict each chunk with the model and average the resulting
     score vectors into a single VectorAnalysisResult.

     Chunks that yield no example text are skipped; if nothing could be
     predicted an empty ListAnalysisResult is returned instead.
     """
     vectors = []
     for chunk in chunktexts:
         exampletext = self._inputs_to_exampletext(project, chunk)
         if not exampletext:
             continue
         # NOTE: the model appears to expect a leading space before the
         # example text — preserved from the original implementation.
         prediction = self._model.predict(' {}'.format(exampletext))
         vectors.append(self._convert_result(prediction, project))
     if not vectors:
         # nothing predictable -> empty result
         return ListAnalysisResult(hits=[], subject_index=project.subjects)
     return VectorAnalysisResult(
         np.array(vectors).mean(axis=0), project.subjects)
示例#6
0
 def _analyze(self, text, project, params):
     """Tokenize *text* into sentences, group them into chunks of
     params['chunksize'] sentences, and delegate to _analyze_chunks.

     Returns an empty ListAnalysisResult when there is nothing to analyze.
     """
     self.initialize()
     self.debug('Analyzing text "{}..." (len={})'.format(
         text[:20], len(text)))
     sentences = project.analyzer.tokenize_sentences(text)
     self.debug('Found {} sentences'.format(len(sentences)))
     chunksize = int(params['chunksize'])
     # group consecutive sentences into chunks of at most `chunksize`
     chunktexts = [' '.join(sentences[pos:pos + chunksize])
                   for pos in range(0, len(sentences), chunksize)]
     self.debug('Split sentences into {} chunks'.format(len(chunktexts)))
     if not chunktexts:  # nothing to analyze, empty result
         return ListAnalysisResult(hits=[], subject_index=project.subjects)
     return self._analyze_chunks(chunktexts, project)
示例#7
0
    def _analyze_chunks(self, chunktexts, project):
        """Predict labels for all chunks at once, sum each label's score
        across chunks, and return the top `limit` labels as hits whose
        scores are normalized by the number of chunks."""
        limit = int(self.params['limit'])
        chunklabels, chunkscores = self._model.predict(chunktexts, limit)

        # accumulate each label's total score over all chunks
        totals = collections.defaultdict(float)
        for labels, scores in zip(chunklabels, chunkscores):
            for label, score in zip(labels, scores):
                totals[label] += score

        # (score, label) tuples sort primarily by score; reverse=True
        # puts the highest-scoring labels first
        ranked = sorted(((score, label) for label, score in totals.items()),
                        reverse=True)

        results = []
        for score, label in ranked[:limit]:
            subject = self._label_to_subject(project, label)
            results.append(AnalysisHit(uri=subject[0],
                                       label=subject[1],
                                       score=score / len(chunktexts)))
        return ListAnalysisResult(results, project.subjects)
示例#8
0
def test_analysishits_vector(document_corpus):
    """The vector property maps each hit's score onto the position of its
    subject in the index, leaving all other positions at zero."""
    subjects = SubjectIndex(document_corpus)
    result = ListAnalysisResult([
        AnalysisHit(uri='http://www.yso.fi/onto/yso/p7141',
                    label='sinetit',
                    score=1.0),
        AnalysisHit(uri='http://www.yso.fi/onto/yso/p6479',
                    label='viikingit',
                    score=0.5)
    ], subjects)
    vector = result.vector
    assert isinstance(vector, np.ndarray)
    assert len(vector) == len(subjects)
    assert vector.sum() == 1.5
    expected = {'sinetit': 1.0, 'viikingit': 0.5}
    for subject_id, score in enumerate(vector):
        label = subjects[subject_id][1]
        assert score == expected.get(label, 0.0)
示例#9
0
 def _analyze(self, text, project, params):
     """Return a single hit for this backend's fixed subject (self.uri,
     self.label), scored by params['score'] (default 1.0)."""
     hit_score = float(params.get('score', 1.0))
     hit = AnalysisHit(uri=self.uri, label=self.label, score=hit_score)
     return ListAnalysisResult([hit], project.subjects)
示例#10
0
 def _analyze(self, text, project, params):
     """Return a single dummy hit with a fixed URI/label and a score taken
     from params['score'] (default 1.0)."""
     hit_score = float(params.get('score', 1.0))
     dummy = AnalysisHit(uri='http://example.org/dummy',
                         label='dummy',
                         score=hit_score)
     return ListAnalysisResult([dummy], project.subjects)