def _analyze(self, text, project, params):
    data = {'text': text}
    if 'project' in params:
        data['project'] = params['project']
    try:
        resp = requests.post(params['endpoint'], data=data)
        resp.raise_for_status()
    except requests.exceptions.RequestException as err:
        self.warning("HTTP request failed: {}".format(err))
        return ListAnalysisResult([], project.subjects)
    try:
        response = resp.json()
    except ValueError as err:
        self.warning("JSON decode failed: {}".format(err))
        return ListAnalysisResult([], project.subjects)
    # The endpoint may return either a bare list of hit objects or an
    # object wrapping the list under a "results" key.
    if 'results' in response:
        results = response['results']
    else:
        results = response
    try:
        return ListAnalysisResult(
            [AnalysisHit(uri=h['uri'], label=h['label'], score=h['score'])
             for h in results
             if h['score'] > 0.0],
            project.subjects)
    except (TypeError, ValueError) as err:
        self.warning("Problem interpreting JSON data: {}".format(err))
        return ListAnalysisResult([], project.subjects)

def test_analysishits_vector_notfound(document_corpus):
    subjects = SubjectIndex(document_corpus)
    hits = ListAnalysisResult([
        AnalysisHit(
            uri='http://example.com/notfound',
            label='not found',
            score=1.0)
    ], subjects)
    assert hits.vector.sum() == 0

def generate_hits(n, subject_index):
    hits = []
    for i in range(n):
        hits.append(AnalysisHit(uri='http://example.org/{}'.format(i),
                                label='hit {}'.format(i),
                                score=1.0 / (i + 1)))
    return ListAnalysisResult(hits, subject_index)

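# Hypothetical usage of the helper above (not part of the original code):
# build ten synthetic hits with decreasing scores 1.0, 0.5, 0.33, ... and
# check that they all end up in the resulting ListAnalysisResult. The test
# name is made up; it assumes the same subject_index fixture and len()
# support shown in the other tests.
def test_generate_hits_count(subject_index):
    hits = generate_hits(10, subject_index)
    assert len(hits) == 10
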
def test_hitfilter_zero_score(subject_index):
    orighits = ListAnalysisResult(
        [AnalysisHit(uri='uri', label='label', score=0.0)],
        subject_index)
    hits = HitFilter()(orighits)
    assert isinstance(hits, AnalysisResult)
    assert len(hits) == 0

def _analyze_chunks(self, chunktexts, project):
    results = []
    for chunktext in chunktexts:
        exampletext = self._inputs_to_exampletext(project, chunktext)
        if not exampletext:
            continue
        example = ' {}'.format(exampletext)
        result = self._model.predict(example)
        results.append(self._convert_result(result, project))
    if not results:  # empty result
        return ListAnalysisResult(hits=[], subject_index=project.subjects)
    # Aggregate the per-chunk score vectors into a single result by
    # taking their element-wise mean.
    return VectorAnalysisResult(
        np.array(results).mean(axis=0), project.subjects)

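# Illustration of the aggregation step above, with made-up score vectors
# (three subjects, two chunks); the result is the element-wise mean.
import numpy as np

chunk_vectors = [np.array([0.2, 0.8, 0.0]),
                 np.array([0.4, 0.4, 0.2])]
print(np.array(chunk_vectors).mean(axis=0))  # [0.3 0.6 0.1]
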
def _analyze(self, text, project, params):
    self.initialize()
    self.debug('Analyzing text "{}..." (len={})'.format(
        text[:20], len(text)))
    sentences = project.analyzer.tokenize_sentences(text)
    self.debug('Found {} sentences'.format(len(sentences)))
    chunksize = int(params['chunksize'])
    # Group consecutive sentences into chunks of at most chunksize
    # sentences each; the chunks are analyzed separately.
    chunktexts = []
    for i in range(0, len(sentences), chunksize):
        chunktexts.append(' '.join(sentences[i:i + chunksize]))
    self.debug('Split sentences into {} chunks'.format(len(chunktexts)))
    if len(chunktexts) == 0:  # nothing to analyze, empty result
        return ListAnalysisResult(hits=[], subject_index=project.subjects)
    return self._analyze_chunks(chunktexts, project)

def _analyze_chunks(self, chunktexts, project):
    limit = int(self.params['limit'])
    chunklabels, chunkscores = self._model.predict(chunktexts, limit)
    # Sum the scores of each label across all chunks.
    label_scores = collections.defaultdict(float)
    for labels, scores in zip(chunklabels, chunkscores):
        for label, score in zip(labels, scores):
            label_scores[label] += score
    best_labels = sorted([(score, label)
                          for label, score in label_scores.items()],
                         reverse=True)
    # Keep the top-scoring labels, normalizing each summed score by the
    # number of chunks.
    results = []
    for score, label in best_labels[:limit]:
        subject = self._label_to_subject(project, label)
        results.append(
            AnalysisHit(uri=subject[0], label=subject[1],
                        score=score / len(chunktexts)))
    return ListAnalysisResult(results, project.subjects)

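# Illustration of the label-score aggregation above, using made-up labels
# and scores for two chunks; scores for the same label are summed across
# chunks before ranking.
import collections

chunklabels = [['label_a', 'label_b'], ['label_a']]
chunkscores = [[0.5, 0.25], [0.5]]
label_scores = collections.defaultdict(float)
for labels, scores in zip(chunklabels, chunkscores):
    for label, score in zip(labels, scores):
        label_scores[label] += score
print(dict(label_scores))  # {'label_a': 1.0, 'label_b': 0.25}
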
def test_analysishits_vector(document_corpus):
    subjects = SubjectIndex(document_corpus)
    hits = ListAnalysisResult([
        AnalysisHit(
            uri='http://www.yso.fi/onto/yso/p7141',
            label='sinetit',
            score=1.0),
        AnalysisHit(
            uri='http://www.yso.fi/onto/yso/p6479',
            label='viikingit',
            score=0.5)
    ], subjects)
    assert isinstance(hits.vector, np.ndarray)
    assert len(hits.vector) == len(subjects)
    assert hits.vector.sum() == 1.5
    for subject_id, score in enumerate(hits.vector):
        if subjects[subject_id][1] == 'sinetit':
            assert score == 1.0
        elif subjects[subject_id][1] == 'viikingit':
            assert score == 0.5
        else:
            assert score == 0.0

def _analyze(self, text, project, params):
    score = float(params.get('score', 1.0))
    return ListAnalysisResult(
        [AnalysisHit(uri=self.uri, label=self.label, score=score)],
        project.subjects)

def _analyze(self, text, project, params):
    score = float(params.get('score', 1.0))
    return ListAnalysisResult([
        AnalysisHit(
            uri='http://example.org/dummy', label='dummy', score=score)
    ], project.subjects)