def aggregate_file(client, file_path, categs):
    """
    Aggregate all links from a text file.

    Every line of the file at ``file_path`` is stripped of surrounding
    whitespace and paired with ``categs``; the resulting ``(url, categs)``
    pairs are streamed lazily to ``feeds.aggregate_all`` together with
    ``client`` and ``get_hbase_client``.
    """
    log.info('Loading from file: %s' % file_path)

    def read_data():
        # The context manager guarantees the handle is released once the
        # generator is exhausted or closed; the previous version never
        # closed the file. Iterating the file object directly replaces the
        # deprecated, Python-2-only ``xreadlines()`` and is just as lazy.
        with open(file_path) as source:
            for line in source:
                yield line.strip(), categs

    feeds.aggregate_all(client, read_data(), get_hbase_client)
def aggregate_opml(client, file, categs):
    """
    Aggregate all links from an OPML file.

    ``file`` is handed to ``OpmlLoader``. When the loader reports the
    document as valid, the ``xmlUrl`` of every element it iterates over is
    paired with ``categs`` and streamed to ``feeds.aggregate_all``. When it
    is not valid, an error is logged and nothing is yielded.
    """
    log.info('Loading from file: %s' % file)

    def iter_outlines():
        opml = OpmlLoader(file)
        if opml.is_valid():
            for outline in opml:
                yield outline.xmlUrl, categs
        else:
            # Invalid document: report it and produce an empty stream.
            log.error('Invalid opml file: %s' % file)

    feeds.aggregate_all(client, iter_outlines(), get_hbase_client)
def refresh_feeds(client, allowed_categs):
    """
    Refresh the feeds found in the database.

    Scans the ``'Feeds'`` table (columns ``'Meta:'``) and streams a
    ``(row key, categs)`` pair for each row to ``feeds.aggregate_all``.
    ``allowed_categs`` is parsed with ``split_csv``; when the parsed value
    is truthy, only rows whose ``'Meta:categs'`` value shares an entry with
    it (as decided by ``any_in``) are kept, otherwise every row is kept.

    NOTE(review): the original docstring said the refresh uses a pool of
    threads; that presumably happens inside ``feeds.aggregate_all`` --
    confirm there.
    """
    log.info('Starting to refresh all feeds')
    wanted = split_csv(allowed_categs)

    def iter_feeds():
        # The scanner is created lazily, on first iteration of the generator.
        for entry in db.Scanner(client, 'Feeds', ['Meta:']):
            feed_key = entry.row
            feed_categs = entry.columns['Meta:categs'].value
            # Keep the row when no filter is set or when it matches the filter.
            if not wanted or any_in(split_csv(feed_categs), wanted):
                yield feed_key, feed_categs

    feeds.aggregate_all(client, iter_feeds(), get_hbase_client)