class BM25Score(summaryrank.Feature):
    """
    BM25 score for the sentence

    Each sentence is scored against the stemmed query sharing its 'qid'.
    For every query term occurrence that appears in the sentence the
    contribution is

        idf(t) * tf * (k1 + 1) / (tf + k1 * (1 - b + b * len / avgdl))

    with idf(t) = log((N - df + 0.5) / (df + 0.5)), where N and df are read
    from the background frequency statistics.
    """

    def __init__(self, args):
        """Read the BM25 parameters (and the optional index path) from args."""
        super(BM25Score, self).__init__(args)
        self.k1 = args.bm25_k1
        self.b = args.bm25_b
        self.avgdl = args.bm25_avgdl
        # When no --index is given, the statistics are loaded lazily in
        # compute() from the model's 'freq_stats' entry instead.
        self._freq_stats = GalagoIndex(args.index, 'postings.krovetz') if args.index else None

    @classmethod
    def init_parser(cls, parser, group):
        """Register --index (unless already present) and the BM25 options."""
        # be warned, using the secret API
        # (_get_option_tuples is a private argparse method; it is used here
        # to avoid registering --index a second time)
        if not parser._get_option_tuples('--index'):
            group.add_argument('--index', metavar='PATH',
                               help='the background Galago index')

        group.add_argument('--bm25-k1', type=float, metavar='NUM',
                           help='parameter k1 (default: %(default)s)')
        group.add_argument('--bm25-b', type=float, metavar='NUM',
                           help='parameter b (default: %(default)s)')
        group.add_argument('--bm25-avgdl', type=float, metavar='NUM',
                           help='parameter avgdl (default: %(default)s)')
        group.set_defaults(bm25_k1=1.2, bm25_b=0.75, bm25_avgdl=25)

    @classmethod
    def check_parser_args(cls, parser, args):
        """No additional argument validation is performed."""
        pass

    def check(self, model):
        """
        Assert that the model provides the stemmed topics and sentences,
        plus 'freq_stats' when no background index was supplied.
        """
        assert model.contains(['topics_stem', 'sentences_stem'])
        if not self._freq_stats:
            assert model.contains(['freq_stats'])

    def compute(self, model):
        """
        Return a list with one BM25 score (float) per sentence, in the order
        yielded by model.load_sentences('sentences_stem').
        """
        if not self._freq_stats:
            self._freq_stats = GalagoIndexDump.load(model.get_path('freq_stats'))

        N = self._freq_stats.num_docs()

        topics_stem = model.load_topics('topics_stem')
        queries = dict((m['qid'], text.split()) for text, m in topics_stem)

        # IDF depends only on the term, so it is computed at most once per
        # distinct term instead of once per (sentence, query term) pair.
        idf_cache = {}

        result = []
        for text, m in model.load_sentences('sentences_stem'):
            stems = text.split()
            sentence_tf = collections.Counter(stems)
            sentence_len = len(stems)

            score = 0.0
            # Iterate the query as a list: a term repeated in the query
            # contributes once per occurrence.
            for query_stem in queries[m['qid']]:
                tf = sentence_tf[query_stem]
                if not tf:
                    # An absent term contributes exactly zero. Skipping it
                    # avoids a needless df lookup and a 0/0 division when the
                    # denominator vanishes (k1 == 0, or b == 1 with an empty
                    # sentence).
                    continue

                if query_stem not in idf_cache:
                    df = self._freq_stats.df(query_stem)
                    idf_cache[query_stem] = math.log(float(N - df + 0.5) / (df + 0.5))

                comp1 = idf_cache[query_stem]
                comp2 = float(tf * (self.k1 + 1))
                comp3 = tf + \
                    self.k1 * (1 - self.b + float(self.b * sentence_len) / self.avgdl)
                score += comp1 * comp2 / comp3

            result.append(score)

        return result