def infer_subtopic_labels(self, limit=None):
    # The basic idea here is to aggressively gather truth data while
    # avoiding cross contamination with other subfolders. Since our query
    # is a (content_id, subtopic_id), we can use subtopic connected
    # components to achieve this.

    # Short aliases.
    cid, subid = self.query_content_id, self.query_subtopic_id

    # For positive labels, the only thing we can do is traverse the
    # subtopic connected component.
    # Don't impose a hard limit on positive labels. (There are probably
    # very few of them.)
    logger.info('Inferring positive labels for: %r', (cid, subid))
    pos_labels = (self.label_store.expand((cid, subid))
                  + list(self.positive_subtopic_labels()))
    logger.info('Inferring negative labels for: %r', (cid, subid))
    neg_labels = self.negative_subtopic_labels()

    pos_sample = web.streaming_sample(pos_labels, limit, hard_limit(limit))
    neg_sample = web.streaming_sample(neg_labels, limit, hard_limit(limit))
    print('-' * 79)
    print('POSITIVES\n', '\n'.join(map(repr, pos_sample)), '\n')
    print('-' * 79)
    print('NEGATIVES\n', '\n'.join(map(repr, neg_sample)))
    print('-' * 79)
    return pos_sample + neg_sample
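# ``hard_limit`` is called above but not defined in this excerpt. A minimal
# sketch of the assumed contract: cap how much of the label stream is scanned
# at a constant multiple of the requested sample size, with ``None``
# propagating as "no limit". The multiplier 10 is a guess, not taken from
# the source.
def hard_limit(limit):
    return limit if limit is None else limit * 10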
def canopy(self, limit=None):
    ids = web.streaming_sample(
        self.canopy_ids(limit_hint=hard_limit(limit)),
        limit, hard_limit(limit))
    # I don't think it ever makes sense to include the query
    # as part of the candidate set.
    # Drop ids that have no stored feature collection.
    return [(cid, fc) for cid, fc in self.store.get_many(ids)
            if fc is not None]
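# ``web.streaming_sample(it, k, limit)`` is used throughout but defined
# elsewhere. A self-contained sketch of the assumed contract -- reservoir
# sampling of at most ``k`` items while consuming no more than ``limit``
# items from the stream. The body is an assumption, not the library's
# actual implementation.
import itertools
import random

def streaming_sample(it, k, limit=None):
    '''Uniformly sample at most ``k`` items from ``it``, reading at most
    ``limit`` items (all of them when ``limit`` is ``None``).'''
    it = iter(it) if limit is None else itertools.islice(it, limit)
    if k is None:
        return list(it)
    # Fill the reservoir, then replace entries with decreasing
    # probability (Algorithm R).
    sample = list(itertools.islice(it, k))
    for i, item in enumerate(it, k):
        j = random.randint(0, i)
        if j < k:
            sample[j] = item
    return sample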
def do_tfidf(self, args):
    conn = happybase.Connection(host=args.host, port=args.port,
                                table_prefix=args.table_prefix)
    t = conn.table('artifact')

    corpus = []
    print('Extracting random sample...')
    sample = streaming_sample(open(args.ids), args.limit)

    print('Building corpus...')
    batches = batch_iter(args.batch_size, (s.strip() for s in sample))
    pool = multiprocessing.Pool(processes=args.processes)
    for i, batch in enumerate(batches, 1):
        rows = (row for _, row in t.rows(list(batch)))
        for noun_phrases in pool.imap(unpack_noun_phrases, rows):
            corpus.append(noun_phrases)
        status('%d of %d batches done' % (i, args.limit // args.batch_size))

    print('Computing model...')
    dictionary = corpora.Dictionary(corpus)
    bows = [dictionary.doc2bow(tokens) for tokens in corpus]
    tfidf = models.TfidfModel(bows, id2word=dictionary)
    tfidf.save(args.out)
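# Once saved, the model can be reloaded and applied to unseen documents with
# the standard gensim API. A brief usage sketch; 'model.tfidf' is a
# hypothetical path standing in for whatever was passed as ``args.out``:
from gensim import models

tfidf = models.TfidfModel.load('model.tfidf')
# The dictionary travels with the model via ``id2word``, so new documents
# can be converted to bag-of-words form without rebuilding the corpus.
bow = tfidf.id2word.doc2bow(['large', 'hadron', 'collider'])
print(tfidf[bow])  # [(token_id, tfidf_weight), ...]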