示例#1
0
 def __init__(self, args):
     """ Copy the BM25 parameters from ``args`` and set up the statistics source """
     super(BM25Score, self).__init__(args)
     self.k1 = args.bm25_k1
     self.b = args.bm25_b
     self.avgdl = args.bm25_avgdl
     # A Galago index is opened only when a path was supplied; otherwise
     # the attribute is left as None.
     if args.index:
         self._freq_stats = GalagoIndex(args.index, 'postings.krovetz')
     else:
         self._freq_stats = None
示例#2
0
class LanguageModelScore(summaryrank.Feature):
    """ Query likelihood of the sentence language model using Dirichlet smoothing

    For each sentence the score is the sum, over the stems of the sentence's
    query, of ``log((tf + mu * cf / C) / (len + mu))`` where ``tf`` is the
    stem's count in the sentence, ``cf`` its collection frequency, ``C`` the
    collection length and ``len`` the number of stems in the sentence.
    Query stems whose collection frequency is zero are skipped.
    """

    def __init__(self, args):
        """ Read ``lm_mu`` from ``args`` and open the Galago index if ``args.index`` is set """
        super(LanguageModelScore, self).__init__(args)
        self.mu = args.lm_mu
        # Without an index path the statistics are loaded later, in
        # compute(), from the model's 'freq_stats' file.
        self._freq_stats = GalagoIndex(
            args.index, 'postings.krovetz') if args.index else None

    @classmethod
    def init_parser(cls, parser, group):
        """ Add the ``--index`` and ``--lm-mu`` options to ``group`` """
        # be warned, using the secret API
        # Only add --index when the parser reports no matching option yet.
        if not parser._get_option_tuples('--index'):
            group.add_argument('--index',
                               metavar='PATH',
                               help='the background Galago index')

        group.add_argument(
            '--lm-mu',
            type=int,
            metavar='NUM',
            help='mu in Dirichlet smoothing (default: %(default)s)')
        group.set_defaults(lm_mu=10)

    @classmethod
    def check_parser_args(cls, parser, args):
        """ No argument validation is performed for this feature """
        pass

    def check(self, model):
        """ Assert that ``model`` contains the data that compute() reads """
        assert model.contains(['topics_stem', 'sentences_stem'])
        if not self._freq_stats:
            # No live index: the pre-generated dump must be present.
            assert model.contains(['freq_stats'])

    def compute(self, model):
        """ Return a list with one query-likelihood score per sentence

        Sentences are read from ``sentences_stem`` and matched to their query
        in ``topics_stem`` through the ``qid`` metadata field.  The scores are
        returned in the order the sentences are loaded.
        """
        if not self._freq_stats:
            self._freq_stats = IndexDump.load(model.get_path('freq_stats'))

        collection_len = self._freq_stats.collection_length()

        topics_stem = model.load_topics('topics_stem')
        queries = dict((m['qid'], text.split()) for text, m in topics_stem)

        # stem -> mu * cf / collection_len, or None when cf == 0.  The value
        # depends only on the stem, so it is looked up once instead of once
        # per (sentence, query stem) pair.
        smoothing = {}

        result = []
        sentences_stem = model.load_sentences('sentences_stem')
        for text, m in sentences_stem:
            stems = text.split()
            sentence_tf = collections.Counter(stems)
            # Same for every query stem of this sentence.
            denominator = len(stems) + self.mu
            score = float(0)
            for query_stem in queries[m['qid']]:
                if query_stem not in smoothing:
                    cf = self._freq_stats.cf(query_stem)
                    if cf == 0:
                        smoothing[query_stem] = None
                    else:
                        smoothing[query_stem] = \
                            self.mu * float(cf) / collection_len
                prior = smoothing[query_stem]
                if prior is None:
                    # Stem unseen in the collection: contributes nothing.
                    continue
                score += math.log(
                    float(sentence_tf[query_stem] + prior) / denominator)
            result.append(score)
        return result
示例#3
0
class BM25Score(summaryrank.Feature):
    """ BM25 score for the sentence

    For each sentence the score is the sum, over the stems of the sentence's
    query, of ``idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * len / avgdl))``
    with ``idf = log((N - df + 0.5) / (df + 0.5))``, where ``tf`` is the
    stem's count in the sentence, ``df`` its document frequency, ``N`` the
    number of documents and ``len`` the number of stems in the sentence.
    """

    def __init__(self, args):
        """ Read the BM25 parameters from ``args`` and open the Galago index if given """
        super(BM25Score, self).__init__(args)
        self.k1 = args.bm25_k1
        self.b = args.bm25_b
        self.avgdl = args.bm25_avgdl
        # Without an index path the statistics are loaded later, in
        # compute(), from the model's 'freq_stats' file.
        self._freq_stats = GalagoIndex(args.index, 'postings.krovetz') if args.index else None

    @classmethod
    def init_parser(cls, parser, group):
        """ Add ``--index`` and the three ``--bm25-*`` options to ``group`` """
        # be warned, using the secret API
        # Only add --index when the parser reports no matching option yet.
        if not parser._get_option_tuples('--index'):
            group.add_argument('--index', metavar='PATH',
                               help='the background Galago index')

        group.add_argument('--bm25-k1', type=float, metavar='NUM',
                           help='parameter k1 (default: %(default)s)')
        group.add_argument('--bm25-b', type=float, metavar='NUM',
                           help='parameter b (default: %(default)s)')
        group.add_argument('--bm25-avgdl', type=float, metavar='NUM',
                           help='parameter avgdl (default: %(default)s)')
        group.set_defaults(bm25_k1=1.2, bm25_b=0.75, bm25_avgdl=25)

    @classmethod
    def check_parser_args(cls, parser, args):
        """ No argument validation is performed for this feature """
        pass

    def check(self, model):
        """ Assert that ``model`` contains the data that compute() reads """
        assert model.contains(['topics_stem', 'sentences_stem'])
        if not self._freq_stats:
            # No live index: the pre-generated dump must be present.
            assert model.contains(['freq_stats'])

    def compute(self, model):
        """ Return a list with one BM25 score per sentence

        Sentences are read from ``sentences_stem`` and matched to their query
        in ``topics_stem`` through the ``qid`` metadata field.  The scores are
        returned in the order the sentences are loaded.
        """
        if not self._freq_stats:
            self._freq_stats = GalagoIndexDump.load(model.get_path('freq_stats'))

        N = self._freq_stats.num_docs()

        topics_stem = model.load_topics('topics_stem')
        queries = dict((m['qid'], text.split()) for text, m in topics_stem)

        # stem -> IDF component.  It depends only on the stem, so the index
        # lookup and the log are done once per distinct query stem instead
        # of once per (sentence, query stem) pair.
        idf = {}

        result = []
        for text, m in model.load_sentences('sentences_stem'):
            stems = text.split()
            sentence_tf = collections.Counter(stems)
            sentence_len = len(stems)
            # Length normalisation is the same for every query stem of
            # this sentence.
            length_norm = self.k1 * \
                (1 - self.b + float(self.b * sentence_len) / self.avgdl)
            score = float(0)
            for query_stem in queries[m['qid']]:
                if query_stem not in idf:
                    df = self._freq_stats.df(query_stem)
                    idf[query_stem] = math.log(float(N - df + 0.5) / (df + 0.5))
                tf = sentence_tf[query_stem]
                score += idf[query_stem] * float(tf * (self.k1 + 1)) / (tf + length_norm)
            result.append(score)
        return result
示例#4
0
class LanguageModelScore(summaryrank.Feature):
    """ Query likelihood of the sentence language model using Dirichlet smoothing

    Each query stem contributes ``log((tf + mu * cf / C) / (len + mu))`` to
    the score of a sentence, where ``tf`` is the stem's count in the sentence,
    ``cf`` its collection frequency, ``C`` the collection length and ``len``
    the number of stems in the sentence.  Stems with ``cf == 0`` are ignored.
    """

    def __init__(self, args):
        """ Read ``lm_mu`` from ``args`` and open the Galago index if ``args.index`` is set """
        super(LanguageModelScore, self).__init__(args)
        self.mu = args.lm_mu
        # When no index path is given, compute() loads the statistics from
        # the model's 'freq_stats' file instead.
        self._freq_stats = GalagoIndex(args.index, 'postings.krovetz') if args.index else None

    @classmethod
    def init_parser(cls, parser, group):
        """ Add the ``--index`` and ``--lm-mu`` options to ``group`` """
        # be warned, using the secret API
        # Only add --index when the parser reports no matching option yet.
        if not parser._get_option_tuples('--index'):
            group.add_argument('--index', metavar='PATH',
                               help='the background Galago index')

        group.add_argument('--lm-mu', type=int, metavar='NUM',
                           help='mu in Dirichlet smoothing (default: %(default)s)')
        group.set_defaults(lm_mu=10)

    @classmethod
    def check_parser_args(cls, parser, args):
        """ No argument validation is performed for this feature """
        pass

    def check(self, model):
        """ Assert that ``model`` contains the data that compute() reads """
        assert model.contains(['topics_stem', 'sentences_stem'])
        if not self._freq_stats:
            # No live index: the pre-generated dump must be present.
            assert model.contains(['freq_stats'])

    def compute(self, model):
        """ Return a list with one query-likelihood score per sentence

        The ``qid`` metadata field of each sentence selects its query from
        ``topics_stem``.  Scores follow the order in which
        ``sentences_stem`` is loaded.
        """
        if not self._freq_stats:
            self._freq_stats = GalagoIndexDump.load(model.get_path('freq_stats'))

        collection_len = self._freq_stats.collection_length()

        topics_stem = model.load_topics('topics_stem')
        queries = dict((m['qid'], text.split()) for text, m in topics_stem)

        # Per-stem cache of mu * cf / collection_len (None marks cf == 0).
        # The term is sentence-independent, so each distinct query stem hits
        # the frequency statistics only once.
        background = {}

        result = []
        sentences_stem = model.load_sentences('sentences_stem')
        for text, m in sentences_stem:
            stems = text.split()
            sentence_tf = collections.Counter(stems)
            # Shared by all query stems of this sentence.
            normaliser = len(stems) + self.mu
            score = float(0)
            for query_stem in queries[m['qid']]:
                if query_stem not in background:
                    cf = self._freq_stats.cf(query_stem)
                    background[query_stem] = (
                        None if cf == 0
                        else self.mu * float(cf) / collection_len)
                mass = background[query_stem]
                if mass is None:
                    # Stem unseen in the collection: contributes nothing.
                    continue
                score += math.log(float(sentence_tf[query_stem] + mass) / normaliser)
            result.append(score)
        return result
示例#5
0
def gen_freqstats(argv):
    """ Generate frequency stats

    Parses ``argv`` (``-m DIR index_path [index_part]``), opens the index at
    ``index_path`` as an Indri index if it is a valid Indri path, otherwise
    as a Galago index, collects every whitespace-separated stem found in the
    model's ``topics_stem`` and ``sentences_stem`` data, and dumps the
    statistics for those stems to the model's ``freq_stats`` path.

    Progress messages go to stderr.  Invalid arguments or an unrecognised
    index path end the program through ``parser.error``.
    """
    parser = argparse.ArgumentParser(
        prog='gen_freqstats',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        add_help=False,
    )

    parser.add_argument('-m',
                        dest='model',
                        metavar='DIR',
                        required=True,
                        help='store the processed data in DIR')
    parser.add_argument('index_path', help='path to Indri/Galago index')
    parser.add_argument(
        'index_part',
        nargs='?',
        help='(Galago only) index part: postings.krovetz or postings.porter')
    args = parser.parse_args(argv)

    model = summaryrank.Model(args.model)

    # sys.stderr.write is used instead of the `print >>` statement, which
    # only exists in Python 2; the emitted text is the same.
    if IndriIndex.is_valid_path(args.index_path):
        index = IndriIndex(args.index_path)
        sys.stderr.write('use Indri index' + '\n')
    elif GalagoIndex.is_valid_path(args.index_path):
        index = GalagoIndex(args.index_path, args.index_part)
        sys.stderr.write('use Galago index' + '\n')
    else:
        # parser.error() exits, so `index` is always bound below.
        parser.error('must specify a valid Indri/Galago index')

    term_set = set()
    for text, _ in model.load_topics('topics_stem'):
        term_set.update(text.split())
    for text, _ in model.load_sentences('sentences_stem'):
        term_set.update(text.split())

    sys.stderr.write('found {} stems'.format(len(term_set)) + '\n')

    IndexDump.dump(model.get_path('freq_stats'), index, term_set)
示例#6
0
def gen_freqstats(argv):
    """ Generate frequency stats

    ``argv`` is parsed as ``-m DIR index_path [index_part]``.  The index is
    opened as Indri when ``index_path`` is a valid Indri path, else as Galago
    (with the optional ``index_part``) when it is a valid Galago path.  All
    whitespace-separated stems in the model's ``topics_stem`` and
    ``sentences_stem`` data are gathered and their statistics are dumped to
    the model's ``freq_stats`` path.

    Status messages are written to stderr; bad arguments or an unrecognised
    index path terminate through ``parser.error``.
    """
    parser = argparse.ArgumentParser(
        prog='gen_freqstats',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        add_help=False,
    )

    parser.add_argument('-m', dest='model', metavar='DIR', required=True,
                        help='store the processed data in DIR')
    parser.add_argument('index_path',
                        help='path to Indri/Galago index')
    parser.add_argument('index_part', nargs='?',
                        help='(Galago only) index part: postings.krovetz or postings.porter')
    args = parser.parse_args(argv)

    model = summaryrank.Model(args.model)

    # The Python 2-only `print >>sys.stderr, ...` statement is replaced by
    # sys.stderr.write, which produces the same output on Python 2 and 3.
    if IndriIndex.is_valid_path(args.index_path):
        index = IndriIndex(args.index_path)
        sys.stderr.write('use Indri index' + '\n')
    elif GalagoIndex.is_valid_path(args.index_path):
        index = GalagoIndex(args.index_path, args.index_part)
        sys.stderr.write('use Galago index' + '\n')
    else:
        # parser.error() exits, so `index` is always bound below.
        parser.error('must specify a valid Indri/Galago index')

    term_set = set()
    for text, _ in model.load_topics('topics_stem'):
        term_set.update(text.split())
    for text, _ in model.load_sentences('sentences_stem'):
        term_set.update(text.split())

    sys.stderr.write('found {} stems'.format(len(term_set)) + '\n')

    IndexDump.dump(model.get_path('freq_stats'), index, term_set)
示例#7
0
 def __init__(self, args):
     """ Store the Dirichlet ``mu`` and set up the statistics source """
     super(LanguageModelScore, self).__init__(args)
     self.mu = args.lm_mu
     # A Galago index is opened only when a path was supplied; otherwise
     # the attribute is left as None.
     if args.index:
         self._freq_stats = GalagoIndex(args.index, 'postings.krovetz')
     else:
         self._freq_stats = None
示例#8
0
 def __init__(self, args):
     """ Take k1, b and avgdl from ``args`` and set up the statistics source """
     super(BM25Score, self).__init__(args)
     self.k1 = args.bm25_k1
     self.b = args.bm25_b
     self.avgdl = args.bm25_avgdl
     # No index path means no index object: the attribute stays None.
     if not args.index:
         self._freq_stats = None
     else:
         self._freq_stats = GalagoIndex(args.index, 'postings.krovetz')
示例#9
0
 def __init__(self, args):
     """ Take ``mu`` from ``args.lm_mu`` and set up the statistics source """
     super(LanguageModelScore, self).__init__(args)
     self.mu = args.lm_mu
     # No index path means no index object: the attribute stays None.
     if not args.index:
         self._freq_stats = None
     else:
         self._freq_stats = GalagoIndex(args.index, 'postings.krovetz')