def score(
        self, hypothesis: List[str], references: List[List[str]],
        tags: Optional[List[List[str]]] = None
) -> VizSeqScore:
    """Compute CIDEr scores.

    Args:
        hypothesis: system outputs, one string per example.
        references: reference sets; ``references[k][i]`` is the k-th
            reference for example i.
        tags: optional per-example tag lists used for group-level scores.

    Returns:
        A ``VizSeqScore`` with corpus/sentence/group scores, each ``None``
        when the corresponding level is disabled or not requested.
    """
    self._update_n_workers(len(hypothesis))
    # Per-sentence CIDEr is always computed: the corpus score and the
    # tag-group scores are both derived from it by averaging.
    all_sent_scores = _get_sent_cider(
        hypothesis, references,
        extra_args={'n_workers': self.n_workers, 'verbose': self.verbose}
    )
    corpus_score, group_scores = None, None
    if self.corpus_level:
        corpus_score = np.mean(all_sent_scores)
    if tags is not None:
        tag_set = self._unique(tags)
        group_scores = {}
        for t in tag_set:
            indices = [i for i, cur in enumerate(tags) if t in cur]
            # BUG FIX: the original nulled `sent_scores` (when
            # sent_level was False) *before* this loop and then indexed
            # it, raising TypeError. Average the local list instead.
            group_scores[t] = np.mean([all_sent_scores[i] for i in indices])
    # Only expose per-sentence scores when sentence-level is enabled.
    sent_scores = all_sent_scores if self.sent_level else None
    return VizSeqScore.make(
        corpus_score=corpus_score, sent_scores=sent_scores,
        group_scores=group_scores
    )
def score(
        self, hypothesis: List[str], references: List[List[str]],
        tags: Optional[List[List[str]]] = None
) -> VizSeqScore:
    """Compute BLEU at corpus, sentence and (optional) tag-group level.

    Sentence scores come from a multiprocess sentence-BLEU pass; the
    corpus score and each group score come from corpus-level BLEU over
    the corresponding subset of hypothesis/reference pairs.
    """
    self._update_n_workers(len(hypothesis))
    corpus_score, sent_scores, group_scores = None, None, None

    if self.sent_level:
        sent_scores = self._score_sentences_multiprocess(
            hypothesis, references, _get_sent_bleu
        )
    if self.corpus_level:
        corpus_score = self.score_corpus_multiprocess(hypothesis, references)

    if tags is not None:
        group_scores = {}
        for tag in self._unique(tags):
            idx = [i for i, cur in enumerate(tags) if tag in cur]
            # Slice every reference set and the hypotheses down to the
            # examples carrying this tag, then score that sub-corpus.
            sub_refs = [[ref_set[i] for i in idx] for ref_set in references]
            sub_hypo = [hypothesis[i] for i in idx]
            group_scores[tag] = self.score_corpus_multiprocess(
                sub_hypo, sub_refs
            )

    return VizSeqScore.make(
        corpus_score=corpus_score, sent_scores=sent_scores,
        group_scores=group_scores
    )
def score(
        self, hypothesis: List[str], references: List[List[str]],
        tags: Optional[List[List[str]]] = None
) -> VizSeqScore:
    """Compute BERTScore (F1) per sentence, plus mean corpus/group scores.

    Only the first reference set (``references[0]``) is scored against.
    The language passed to ``bert_score`` is auto-detected from the first
    reference string via ``langid``.
    """
    corpus_score, sent_scores, group_scores = None, None, None

    # Heavy third-party deps are imported lazily, only when scoring.
    import bert_score as bs
    import langid
    import logging

    # Silence chatty INFO logs from the underlying libraries.
    for noisy in ('pytorch_pretrained_bert', 'langid'):
        logging.getLogger(noisy).setLevel(logging.WARNING)

    detected_lang = langid.classify(references[0][0])[0]
    # bs.score returns (precision, recall, f1); index 2 picks F1.
    sent_scores = bs.score(
        hypothesis, references[0], nthreads=self.n_workers,
        lang=detected_lang, verbose=self.verbose
    )[2].tolist()

    if self.corpus_level:
        corpus_score = np.mean(sent_scores)

    if tags is not None:
        group_scores = {}
        for tag in self._unique(tags):
            idx = [i for i, cur in enumerate(tags) if tag in cur]
            group_scores[tag] = np.mean([sent_scores[i] for i in idx])

    return VizSeqScore.make(
        corpus_score=corpus_score, sent_scores=sent_scores,
        group_scores=group_scores
    )
def score(
        self, hypothesis: List[str], references: List[List[str]],
        tags: Optional[List[List[str]]] = None
) -> VizSeqScore:
    """Compute word error rate (WER).

    Per-sentence scores are sentence-level WER. Corpus and group scores
    are length-weighted: total erroneous words divided by total reference
    length of the sentences involved.

    Args:
        hypothesis: system outputs, one string per example.
        references: reference sets; only used via the project helpers.
        tags: optional per-example tag lists for group-level scores.
    """
    self._update_n_workers(len(hypothesis))
    corpus_score, group_scores, sent_scores = None, None, None
    sent_scores = self._score_sentences_multiprocess(
        hypothesis, references, _get_sent_wer)
    # Reference lengths, computed lazily: only needed for the weighted
    # (corpus / group) aggregations.
    sent_lens = None
    if self.corpus_level:
        sent_lens = self._score_sentences_multiprocess(
            hypothesis, references, _get_sent_len_r)
        n_incorrect = np.sum(
            [s * l for s, l in zip(sent_scores, sent_lens)])
        corpus_score = n_incorrect / np.sum(sent_lens)
    if tags is not None:
        tag_set = self._unique(tags)
        group_scores = {}
        if sent_lens is None:
            sent_lens = self._score_sentences_multiprocess(
                hypothesis, references, _get_sent_len_r)
        for t in tag_set:
            indices = [i for i, cur in enumerate(tags) if t in cur]
            cur_sent_scores = [sent_scores[i] for i in indices]
            cur_sent_lens = [sent_lens[i] for i in indices]
            n_incorrect = np.sum(
                [s * l for s, l in zip(cur_sent_scores, cur_sent_lens)])
            # BUG FIX: the original divided by np.sum(sent_lens) — the
            # TOTAL corpus length — which deflates every group's WER.
            # The denominator must be the group's own reference length.
            group_scores[t] = n_incorrect / np.sum(cur_sent_lens)
    return VizSeqScore.make(
        corpus_score=corpus_score, sent_scores=sent_scores,
        group_scores=group_scores
    )
def score(
        self, hypothesis: List[str],
        references: Optional[List[List[str]]] = None,
        tags: Optional[List[List[str]]] = None
) -> VizSeqScore:
    """Compute self-BLEU (diversity) scores; references are unused.

    Args:
        hypothesis: system outputs, one string per example.
        references: ignored — self-BLEU scores hypotheses against each
            other.
        tags: optional per-example tag lists for group-level scores.

    Returns:
        A ``VizSeqScore`` with corpus/sentence/group scores, each ``None``
        when the corresponding level is disabled or not requested.
    """
    corpus_score, group_scores, sent_scores = None, None, None
    selfbleu_scores = compute_self_bleu(hypothesis)
    if self.corpus_level:
        corpus_score = np.mean(selfbleu_scores)
    if self.sent_level:
        sent_scores = selfbleu_scores
    if tags is not None:
        # Group-level score: mean self-BLEU over the examples carrying
        # each tag, matching the aggregation used by the sibling
        # sentence-level scorers (previously raised NotImplementedError).
        tag_set = self._unique(tags)
        group_scores = {}
        for t in tag_set:
            indices = [i for i, cur in enumerate(tags) if t in cur]
            group_scores[t] = np.mean([selfbleu_scores[i] for i in indices])
    return VizSeqScore.make(
        corpus_score=corpus_score, sent_scores=sent_scores,
        group_scores=group_scores
    )
def score(
        self, hypothesis: List[str], references: List[List[str]],
        tags: Optional[List[List[str]]] = None
) -> VizSeqScore:
    """Compute LASER similarity scores per sentence, averaging for the
    corpus score and for each optional tag group."""
    corpus_score, group_scores, sent_scores = None, None, None
    sent_scores = _get_sent_laser(hypothesis, references)

    if self.corpus_level:
        corpus_score = np.mean(sent_scores)

    if tags is not None:
        group_scores = {}
        for tag in self._unique(tags):
            idx = [i for i, cur in enumerate(tags) if tag in cur]
            group_scores[tag] = np.mean([sent_scores[i] for i in idx])

    return VizSeqScore.make(
        corpus_score=corpus_score, sent_scores=sent_scores,
        group_scores=group_scores
    )
def score(
    self,
    hypothesis: List[str],
    references: List[List[str]],
    tags: Optional[List[List[str]]] = None,
    sources: Optional[List[List[str]]] = None,
) -> VizSeqScore:
    """Score how well each hypothesis covers its essential requirements.

    ``sources[0]`` supplies the requirement strings; each hypothesis is
    scored against its requirement string, with unparsable pairs scored
    as 0. The corpus score is the mean coverage scaled to a percentage.
    """
    problem = self.extra_args["problem"]
    # Only relevant if predicting tasks
    assert problem in (
        "TargetProductAndRequirements_TO_Tasks",
        "Requirements_TO_TargetProductAndTasks",
        "TargetProductAndRequirementsAndTasks",
        "RequirementsAndTargetProductAndTasks",
    )
    corpus_score, sent_scores, group_scores = None, None, None

    requirement_strs = sources[0]
    sent_scores = []
    for requirement, prediction in zip(requirement_strs, hypothesis):
        try:
            coverage = compute_requirement_coverage(
                prediction, requirement, essential=True, problem=problem
            )
        except ValueError:
            # Unparsable hypothesis/requirement pair counts as zero
            # coverage rather than aborting the whole evaluation.
            coverage = 0
        sent_scores.append(coverage)

    if self.corpus_level:
        corpus_score = np.mean(sent_scores) * 100

    return VizSeqScore.make(
        corpus_score=corpus_score,
        sent_scores=sent_scores,
        group_scores={},
    )
def score(
    self,
    hypothesis: List[str],
    references: List[List[str]],
    tags: Optional[List[List[str]]] = None,
    sources: Optional[List[List[str]]] = None,
) -> VizSeqScore:
    """Score task-ordering agreement between hypotheses and references.

    Parses task lists out of each reference/hypothesis pair (optionally
    stripping a leading target-product segment, depending on ``problem``)
    and computes an order score per pair. Pairs whose hypothesis cannot
    be parsed, or whose order score cannot be computed, are skipped —
    so ``sent_scores`` may be shorter than ``hypothesis``.
    """
    problem = self.extra_args["problem"]
    # Only relevant if predicting tasks
    assert problem in (
        "TargetProductAndRequirements_TO_Tasks",
        "Requirements_TO_TargetProductAndTasks",
        "TargetProductAndRequirementsAndTasks",
        "RequirementsAndTargetProductAndTasks",
    )
    corpus_score, sent_scores, group_scores = None, None, None
    first_refs = references[0]
    sent_scores = []
    for ref, hypo in zip(first_refs, hypothesis):
        if problem in (
            "Requirements_TO_TargetProductAndTasks",
            "RequirementsAndTargetProductAndTasks",
        ):
            # Output begins with the target product; drop it and keep
            # only the task list for both sides.
            tgt_prod_and_tasks_gt = string_to_tasks(ref, parse_tp=True)
            tasks_gt = tgt_prod_and_tasks_gt[1:]
            try:
                tgt_prod_and_tasks_pred = string_to_tasks(hypo, parse_tp=True)
                tasks_pred = tgt_prod_and_tasks_pred[1:]
            except ValueError:
                # Unparsable hypothesis: skip this pair entirely.
                continue
        else:
            # TargetProductAndRequirements_TO_Tasks or
            # TargetProductAndRequirementsAndTasks: output is tasks only.
            tasks_gt = string_to_tasks(ref)
            tasks_pred = string_to_tasks(hypo)
        try:
            sent_scores.append(compute_task_order_score(tasks_gt, tasks_pred))
        except ScoreComputationError:
            continue
    if self.corpus_level:
        # FIX: removed leftover debug print of len(sent_scores), and
        # guard the all-pairs-skipped case — np.mean([]) would emit a
        # RuntimeWarning and yield NaN; report no score instead.
        corpus_score = np.mean(sent_scores) if sent_scores else None
    return VizSeqScore.make(
        corpus_score=corpus_score,
        sent_scores=sent_scores,
        group_scores={},
    )