Example #1
    def infer_subtopic_labels(self, limit=None):
        # The basic idea here is to aggressively gather truth data while
        # avoiding cross contamination with other subfolders. Since our query
        # is a (content_id, subtopic_id), we can use subtopic connected
        # components to achieve this.

        # Short aliases.
        cid, subid = self.query_content_id, self.query_subtopic_id

        # For positive labels, the only thing we can do is traverse the
        # subtopic connected component.
        # Don't impose a hard limit on positive labels. (There are probably
        # very few of them.)
        logger.info('Inferring positive labels for: %r', (cid, subid))
        pos_labels = (self.label_store.expand((cid, subid))
                      + list(self.positive_subtopic_labels()))
        logger.info('Inferring negative labels for: %r', (cid, subid))
        neg_labels = self.negative_subtopic_labels()

        pos_sample = web.streaming_sample(
            pos_labels, limit, limit=hard_limit(limit))
        neg_sample = web.streaming_sample(
            neg_labels, limit, limit=hard_limit(limit))
        print('-' * 79)
        print('POSITIVES\n', '\n'.join(map(repr, pos_sample)), '\n')
        print('-' * 79)
        print('NEGATIVES\n', '\n'.join(map(repr, neg_sample)))
        print('-' * 79)
        return pos_sample + neg_sample
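
All of these examples feed candidate streams through web.streaming_sample to pull a bounded, uniformly random sample out of a possibly huge sequence. The sampler itself isn't shown; the sketch below is a minimal reservoir sampler with the signature the calls above imply, streaming_sample(seq, k, limit=None), where k is the desired sample size and limit caps how much of the stream is read. The None-handling here is an assumption, not documented library behavior.

    import itertools
    import random

    def streaming_sample(seq, k, limit=None):
        # Reservoir sampling: a single pass over `seq`, keeping at most
        # `k` items, each retained with equal probability. `limit` caps
        # how many elements of `seq` are consumed at all (islice treats
        # None as "no bound"). Assumption: k=None means keep everything.
        it = itertools.islice(iter(seq), limit)
        if k is None:
            return list(it)
        reservoir = []
        for i, item in enumerate(it):
            if i < k:
                reservoir.append(item)
            else:
                j = random.randint(0, i)
                if j < k:
                    reservoir[j] = item
        return reservoir

Read this way, the calls above say: keep at most limit labels, chosen uniformly, but stop reading the underlying stream after hard_limit(limit) items regardless.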
Example #2
    def canopy(self, limit=None):
        ids = web.streaming_sample(
            self.canopy_ids(limit_hint=hard_limit(limit)),
            limit, hard_limit(limit))
        # I don't think it ever makes sense to include the query
        # as part of the candidate set. Drop ids whose feature
        # collection is missing from the store.
        return filter(lambda id_fc: id_fc[1] is not None,
                      self.store.get_many(ids))
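
The hard_limit helper that supplies the sampler's cap isn't defined in these snippets. A plausible stand-in, assuming the cap is simply a generous multiple of the requested sample size (the factor of 10 and the None passthrough are guesses, not the library's actual behavior):

    def hard_limit(limit):
        # Hypothetical: scan at most 10x the requested sample size so
        # sampling terminates even on very large canopies, and pass
        # None through so "no limit" stays "no limit".
        return None if limit is None else 10 * limit

Capping the scan keeps canopy construction from walking an entire index when only a small candidate set is wanted.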
Example #3
    def do_tfidf(self, args):
        conn = happybase.Connection(host=args.host, port=args.port,
                                    table_prefix=args.table_prefix)
        t = conn.table('artifact')
        corpus = []
        print('Extracting random sample...')
        sample = streaming_sample(open(args.ids), args.limit)

        print('Building corpus...')
        batches = batch_iter(args.batch_size, (s.strip() for s in sample))
        pool = multiprocessing.Pool(processes=args.processes)
        for i, batch in enumerate(batches, 1):
            rows = (row for _, row in t.rows(list(batch)))
            for noun_phrases in pool.imap(unpack_noun_phrases, rows):
                corpus.append(noun_phrases)
            status('%d of %d batches done' % (i, args.limit // args.batch_size))

        print('Computing model...')
        dictionary = corpora.Dictionary(corpus)
        bows = [dictionary.doc2bow(tokens) for tokens in corpus]
        tfidf = models.TfidfModel(bows, id2word=dictionary)
        tfidf.save(args.out)
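
do_tfidf also depends on a batch_iter helper that isn't shown. Its call shape, batch_iter(size, iterable), suggests a plain chunking iterator; a sketch under that assumption:

    import itertools

    def batch_iter(size, it):
        # Hypothetical helper matching the call above: yield lists of
        # at most `size` items until the source iterator is exhausted.
        it = iter(it)
        while True:
            batch = list(itertools.islice(it, size))
            if not batch:
                return
            yield batch

Chunking the sampled ids this way lets t.rows() fetch one HBase batch per round trip while pool.imap spreads unpack_noun_phrases across worker processes.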