def run(self):
    """ Generate the concord Computation class and run it via
    serve_computation """
    attrs = {
        'metadata': self._get_metadata(),
        'init': self._get_init(),
        'process_timer': self._get_process_timer(),
        'process_record': self._get_process_record(),
    }
    computation = type(self.name, (ComputationWrapper,), attrs)
    serve_computation(computation())
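For context (not from the original file): run() relies on the three-argument form of type() to build the computation class dynamically. A minimal standalone illustration, with hypothetical names:

# Hypothetical illustration of dynamic class creation via type():
# type(name, bases, attrs) builds a class whose methods come from the
# attrs dict, exactly as run() does with ComputationWrapper above.
Greeter = type('Greeter', (object,), {'greet': lambda self: 'hi'})
assert Greeter().greet() == 'hi'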
def serve_test_computation(handler):
    logger.info("About to serve computation and service")

    def tryGetEnv(key):
        try:
            return os.environ[key]
        except Exception as e:
            logger.error('Error getting os.environ[%s]' % key)
            logger.fatal(e)

    zookeeper_url = tryGetEnv('integration_test_zookeeper_url')
    test_id = tryGetEnv('integration_test_id')
    node_id = tryGetEnv('integration_test_node_id')
    handler.concord = ZookeeperContext(zookeeper_url, test_id, node_id)
    logger.info("Deferring further init: concord.computation.serve_computation")
    serve_computation(handler)
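For illustration only (only the variable names come from the helper above; everything else is a placeholder): the three environment variables must be exported before the process starts, e.g. by a test harness.

# Hypothetical test harness for serve_test_computation; the values are
# placeholders, only the env var names come from the helper above.
import os

os.environ['integration_test_zookeeper_url'] = 'localhost:2181'
os.environ['integration_test_id'] = 'test-run-1'
os.environ['integration_test_node_id'] = 'node-0'
serve_test_computation(Gatherer())  # any Computation instance works here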
        pass

    def metadata(self):
        return Metadata(name=self.name,
                        istreams=[],
                        ostreams=["nothing_going_here"])

class Gatherer(Computation):
    def process_record(self, ctx, record):
        self.concord_logger.info("{}:{}".format(record.key, record.data))

    def metadata(self):
        return Metadata(name="gatherer",
                        istreams=["nothing_going_here"],
                        ostreams=[])

    def init(self, ctx):
        pass

    def process_timer(self, ctx, key, timer):
        pass

if __name__ == "__main__":
    import sys
    if len(sys.argv) > 1:
        serve_computation(MultiInstance(sys.argv[1]))
    else:
        serve_computation(Gatherer())
import json
import sys
import unicodedata
import logging

import concord
from concord.computation import (
    Computation,
    Metadata,
    serve_computation
)

logging.basicConfig()
log = logging.getLogger('CoinbasePricePrinter')
log.setLevel(logging.DEBUG)

class CoinbasePricePrinter(Computation):
    def init(self, ctx):
        log.info("Price Printer initialized")

    def process_record(self, ctx, record):
        r = json.loads(record.data)
        price = r.get('price', 'no-price-avail')
        log.info('Price: %s', price)

    def metadata(self):
        return Metadata(name='coinbase-price-printer',
                        istreams=['btcusd'],
                        ostreams=[])

serve_computation(CoinbasePricePrinter())
    urls = []
    return urls

class MedicalDevicesUrlGenerator(Computation):
    def init(self, ctx):
        self.concord_logger.info("MedicalDevicesUrlGenerator init")
        ctx.set_timer('loop', time_millis())

    def destroy(self):
        self.concord_logger.info("MedicalDevicesUrlGenerator destroyed")

    def process_timer(self, ctx, key, time):
        urls = raw_urls()
        for url in urls:
            # Check the cache to see if we have already processed this url
            h = url_hash(url)
            if len(ctx.get_state(h)) == 0:
                url_b = bytes(url)
                ctx.set_state(h, url_b)
                ctx.produce_record("m-device-urls", h, url_b)
        delay_ms = 1000 * 60 * 10  # 10 minutes
        ctx.set_timer(key, time_millis() + delay_ms)

    def process_record(self, ctx, record):
        raise Exception('process_record not implemented')

    def metadata(self):
        return Metadata(
            name='m-devices',
            istreams=[],
            ostreams=['m-device-urls'])

serve_computation(MedicalDevicesUrlGenerator())
import json
import logging

from concord.computation import (Computation, Metadata, serve_computation)

logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

class RSSParser(Computation):
    def __init__(self):
        pass

    def init(self, ctx):
        self.count = 1
        logger.info("Source initialized")

    def process_record(self, ctx, record):
        j = json.loads(record.data)
        if j['summary'].find('free') != -1:
            logger.info("Someone is giving away free stuff! url = " + j['url'])
        self.count += 1

    def metadata(self):
        return Metadata(
            name='rssfeedparser',
            istreams=['rssfeeds'],
            ostreams=[])

logger.info("Main")
serve_computation(RSSParser())
# This deployment can't be killed using the `concord kill -a` command,
# because a timer needs to fire before the kill is processed.

    def init(self, ctx):
        self.started = False
        self.concord_logger.info("init")
        ctx.set_timer("start_cycle", int(round(time.time() * 1000)))

    def destroy(self):
        pass

    def process_timer(self, ctx, key, timer):
        self.concord_logger.info("process_timer")
        ctx.produce_record("cycle", "something", "=====start this cycle------>>>")
        if not self.started:
            self.started = True
            ctx.set_timer("start_cycle", int(round(time.time() * 1000)))

    def process_record(self, ctx, record):
        self.concord_logger.info("{}:{}:{}".format(record.key, record.data,
                                                   record.time))
        time.sleep(2)
        ctx.produce_record("cycle", "something", "444")

    def metadata(self):
        return Metadata(name='cyclical',
                        istreams=[("cycle", StreamGrouping.GROUP_BY)],
                        ostreams=["cycle"])

serve_computation(Cyclical())
        # Update the cache with the freshest metric data
        self.providerCtr[provider] = (clicks, impressions)

    # metadata takes no arguments.
    # Returns an object of type 'Metadata'. This object has three named
    # fields: name, istreams, and ostreams. The framework uses this object
    # to determine which streams this operator subscribes and/or publishes to.
    def metadata(self):
        # istreams is an array of tuples of type (string, StreamGrouping).
        # This example has multiple istreams, so this computation performs
        # a type of stream join.
        # The GROUP_BY enum instructs the framework to aggregate by key,
        # ensuring that records with the same key are always sent to the
        # same instance of this operator.
        # By grouping both streams by key we are assured that records emitted
        # on either stream for a particular provider always arrive at the
        # same instance of CtrCalculator.
        return Metadata(
            name='ctr-calculator',
            istreams=[
                ('impressions', StreamGrouping.GROUP_BY),
                ('clicks', StreamGrouping.GROUP_BY)
            ],
            ostreams=[
                'click_through_rate'
            ])

# Initializes the thrift server so the concord proxy can communicate with
# this process via thrift RPC. Makes the 'registerWithScheduler' RPC, which
# begins a series of events that place this operator into a concord topology.
serve_computation(CtrCalculator())
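Not part of the original fragment: a minimal sketch of what the key-grouped join in process_record might look like, assuming providerCtr is a plain dict keyed by provider and that record.key carries the provider id.

    # Hypothetical sketch of the join over 'impressions' and 'clicks':
    # with GROUP_BY, all records for one key reach the same instance,
    # so a local dict is enough to correlate the two streams.
    def process_record(self, ctx, record):
        clicks, impressions = self.providerCtr.get(record.key, (0, 0))
        if record.stream == 'clicks':
            clicks += 1
        elif record.stream == 'impressions':
            impressions += 1
        self.providerCtr[record.key] = (clicks, impressions)
        if impressions > 0:
            ctr = float(clicks) / impressions
            ctx.produce_record('click_through_rate', record.key, str(ctr))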
    def __init__(self):
        self.dict = {}
        self.pidx = 0  # print index

    def init(self, ctx):
        self.concord_logger.info("Counter initialized")

    def destroy(self):
        self.concord_logger.info("Source destroyed")

    def process_timer(self, ctx, key, time):
        raise Exception('process_timer not implemented')

    def process_record(self, ctx, record):
        self.pidx += 1
        if record.key in self.dict:
            self.dict[record.key] += 1
        else:
            self.dict[record.key] = 1
        if (self.pidx % 10000) == 0:
            self.concord_logger.info(self.dict)

    def metadata(self):
        return Metadata(
            name='word-counter',
            istreams=[('words', StreamGrouping.GROUP_BY)],
            ostreams=[])

serve_computation(WordCounter())
        logger.info('Dumping matches!')
        output = '\n'.join([str(x) for x in self.batch])
        logger.debug(output)
        self.batch = []

    def watcher(self, children):
        """ Callback triggered when data is posted to the zookeeper
        path '/regex' """
        logger.info('detected new node %s' % children)
        for child in children:
            if child not in self.expressions:
                fn = partial(DistGrep.new_regex, self, child)
                self.zk_client.DataWatch('/regex/%s' % child, func=fn)

    def new_regex(self, child, data, stats):
        if data is None and stats is None:
            return True
        self.expressions[child] = data
        logger.info('Child reporting %s' % child)
        logger.info('registering new regex %s' % data)

    def metadata(self):
        """ Called when the scheduler is initializing the computation for
        launch. Must return an object of type concord.Metadata """
        return Metadata(name='dist-grep', istreams=['logs'], ostreams=[])

logger.info("Inside of Main")
serve_computation(DistGrep())
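For illustration (not from the original file): DistGrep picks up new expressions from child znodes under /regex, so registering one amounts to creating such a node. A minimal sketch using kazoo, with a hypothetical ZooKeeper address, node name, and pattern:

# Hypothetical helper: push a new pattern for DistGrep to pick up.
from kazoo.client import KazooClient

zk = KazooClient(hosts='127.0.0.1:2181')
zk.start()
zk.ensure_path('/regex')
# The watcher above fires on the new child; new_regex receives its data.
zk.create('/regex/free-stuff', b'free .*')
zk.stop()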
        d = feedparser.parse(completeurl)
        for a in d['entries']:
            summary = a['summary_detail']['value']
            link = a['links'][0]['href']
            data = {}
            data['url'] = link
            data['summary'] = summary
            json_data = json.dumps(data)
            ctx.produce_record("rssfeeds", "content", json_data.encode('utf-8'))
            self.count += 1
        ctx.set_timer('loop', time_millis() + 1000)

    def process_record(self, ctx, record):
        raise Exception('process_record not implemented')

    def metadata(self):
        return Metadata(
            name='rssfeedgenerator',
            istreams=[],
            ostreams=['rssfeeds'])

logger.info("Main")
serve_computation(RSSGenerator())
            except Exception as e:
                print "Exception closing readers ", e
            raise StopIteration
        except Exception as e:
            self.bad_records_parsed = self.bad_records_parsed + 1
            print "Unhandled error in url parsing, skipping record: ", e

class MedicalDevicesParser(Computation):
    def init(self, ctx):
        pass

    def destroy(self):
        pass

    def process_timer(self, ctx, key, time):
        pass

    def process_record(self, ctx, record):
        for obj in MedicalDeviceIterator(str(record.data)):
            try:
                ctx.produce_record("m-devices-json",
                                   bytes(obj.id),
                                   bytes(obj.to_json()))
            except Exception as e:
                print "Exception producing record", e

    def metadata(self):
        return Metadata(name="m-device-parser",
                        istreams=["m-device-urls"],
                        ostreams=["m-devices-json"])

serve_computation(MedicalDevicesParser())
    def __init__(self):
        self.words = ['foo', 'bar', 'baz', 'fiz', 'buzz']

    def sample(self):
        """returns a random word"""
        import random
        return random.choice(self.words)

    def init(self, ctx):
        self.concord_logger.info("Source initialized")
        ctx.set_timer('loop', time_millis())

    def process_timer(self, ctx, key, time):
        # stream, key, value. Empty value; no val is needed
        for _ in range(0, 1024):
            ctx.produce_record("words", self.sample(), '-')
        # emit the next batch in 5 seconds
        ctx.set_timer("main_loop", time_millis() + 5000)

    def process_record(self, ctx, record):
        raise Exception('process_record not implemented')

    def metadata(self):
        return Metadata(
            name='word-source',
            istreams=[],
            ostreams=['words'])

serve_computation(WordSource())
    def init(self, ctx):
        self.concord_logger.info("Operator initialized")
        if self.prune_time > 0:
            ctx.set_timer('loop', int(time.time() * 1000))

    def process_timer(self, ctx, key, timer):
        """ Prune the cache of expired items every 'prune_time' seconds.
        Otherwise this would only happen when mutating the cache """
        self.cache.expire()
        ctx.set_timer('cleanup_loop',
                      int((time.time() + self.prune_time) * 1000))

    def process_record(self, ctx, record):
        """ With the GROUP_BY routing strategy, it is guaranteed that the
        same key will be sent to the same operator, regardless of scaling """
        if record.stream == 'bids':
            self.cache[record.key] = record.data
        elif record.stream == 'imps':
            bid = self.cache.get(record.key)
            if bid is not None:
                ctx.produce_record('winningbids', record.key, '-')

    def metadata(self):
        return Metadata(
            name='filter-winning-bids',
            istreams=[('bids', StreamGrouping.GROUP_BY),
                      ('imps', StreamGrouping.GROUP_BY)],
            ostreams=['winningbids'])

serve_computation(FilterWinningBids(5000000, 60))
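The constructor is not part of this fragment; a minimal sketch of what it might look like, assuming cachetools.TTLCache (whose expire() method matches the call in process_timer) and that the FilterWinningBids(5000000, 60) arguments above are cache size and prune interval:

# Hypothetical constructor, not part of the original fragment.
# Using the prune interval as the TTL is an assumption.
from cachetools import TTLCache
from concord.computation import Computation

class FilterWinningBids(Computation):
    def __init__(self, cache_size, prune_time):
        self.prune_time = prune_time
        self.cache = TTLCache(maxsize=cache_size, ttl=prune_time)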
        import random
        return random.choice(self.sentences)

    def init(self, ctx):
        self.concord_logger.info("Source initialized")
        ctx.set_timer('loop', time_millis())

    def destroy(self):
        self.concord_logger.info("Source destroyed")

    def process_timer(self, ctx, key, time):
        # stream, key, value. Empty value; no val is needed
        iterations = 10000
        while iterations > 0:
            iterations -= 1
            ctx.produce_record("sentences", self.sample(), '-')
        # re-arm the timer immediately to emit the next batch
        ctx.set_timer("main_loop", time_millis())

    def process_record(self, ctx, record):
        raise Exception('process_record not implemented')

    def metadata(self):
        return Metadata(
            name='sentence-generator',
            istreams=[],
            ostreams=['sentences'])

serve_computation(SentenceGenerator())
from concord.computation import (Computation, Metadata, serve_computation)
from concord.internal.thrift.ttypes import StreamGrouping

class SentenceSplitter(Computation):
    def init(self, ctx):
        self.concord_logger.info("Splitter initialized")

    def destroy(self):
        self.concord_logger.info("Source destroyed")

    def process_timer(self, ctx, key, time):
        raise Exception('process_timer not implemented')

    def process_record(self, ctx, record):
        for word in record.key.split(" "):
            ctx.produce_record('words', word, '-')

    def metadata(self):
        return Metadata(name='sentence-splitter',
                        istreams=['sentences'],
                        ostreams=['words'])

serve_computation(SentenceSplitter())
        sniff_on_connection_fail=True,
        # and also every 60 seconds
        sniffer_timeout=60
    )
    return es

class MDeviceIndexer(Computation):
    def init(self, ctx):
        self.es = get_elastic_search_connection()
        # Ignore 400 (caused by IndexAlreadyExistsException) when creating
        # the index
        self.es.indices.create(index='concord', ignore=400)

    def destroy(self):
        pass

    def process_timer(self, ctx, key, time):
        pass

    def process_record(self, ctx, record):
        try:
            res = self.es.index(index="concord",
                                doc_type="mdevice",
                                id=record.key,
                                body=record.data)
            if not res['created']:
                print "Error saving to elastic search: ", res
        except Exception as e:
            print "Couldn't index record: ", e

    def metadata(self):
        return Metadata(
            name='m-device-es',
            istreams=['m-devices-json'],
            ostreams=[])

serve_computation(MDeviceIndexer())
def new_time(ctx, offset_in_millis):
    current_time = time.time()
    current_millis = current_time * 1000
    rounded_time = int(round(current_millis)) + offset_in_millis
    ctx.set_timer("init", rounded_time)

class First(Computation):
    def init(self, ctx):
        self.concord_logger.info("Counter initialized")
        new_time(ctx, 3000)

    def destroy(self):
        self.concord_logger.info("Source destroyed")

    def process_timer(self, ctx, key, timer):
        self.concord_logger.info("process timer")
        ctx.produce_record("outstream", "hello world", "!")
        ctx.produce_record("outstream", "hello world", "!!!")
        ctx.produce_record("outstream", "hello world", "???")
        new_time(ctx, 3000)

    def process_record(self, ctx, record):
        self.concord_logger.info("process record")

    def metadata(self):
        return Metadata(name='first', istreams=[], ostreams=["outstream"])

serve_computation(First())
    def onMessage(self, payload, *args, **kwargs):
        self.factory.queue.put(payload)

    def onClose(self, wasClean, code, reason):
        log.info("websocket closed because %s", reason)
        self.factory.close_cb()

class CoinbaseSource(Computation):
    def __init__(self):
        self.queue = Queue()

    def init(self, ctx):
        ctx.set_timer('loop', time_millis() + 1000)  # start in 1 sec
        log.info("Coinbase initialized")

    def process_timer(self, ctx, key, time):
        while not self.queue.empty():
            ctx.produce_record('btcusd', 'empty', self.queue.get())
        ctx.set_timer(key, time_millis() + 1000)  # every sec

    def metadata(self):
        return Metadata(name='coinbase-indx',
                        istreams=[],
                        ostreams=['btcusd'])

def gen_coinbase_source():
    ret = CoinbaseSource()
    factory = WebSocketClientFactory("wss://ws-feed.exchange.coinbase.com")
    factory.queue = ret.queue
    factory.close_cb = reactor.stop
    factory.protocol = ExchangeProtocol
    connectWS(factory)
    # Run the twisted reactor on a background thread so serve_computation
    # can own the main thread
    Thread(target=reactor.run, args=(False,)).start()
    return ret

serve_computation(gen_coinbase_source())
u"These tools and their built-in counterparts also work well with the high-speed functions in the operator module. For example, the multiplication operator can be mapped across two vectors to form an efficient dot-product: sum(imap(operator.mul, vector1, vector2)).", ] def sample(self): """returns a random word""" import random return random.choice(self.sentences) def init(self, ctx): self.concord_logger.info("Source initialized") ctx.set_timer('loop', time_millis()) def process_timer(self, ctx, key, time): # stream, key, value. empty value, no need for val for i in range(0, 1024): ctx.produce_record("sentences", str(i), self.sample()) # emit records every 500ms ctx.set_timer("main_loop", time_millis() + 5000) def process_record(self, ctx, record): raise Exception('process_record not implemented') def metadata(self): return Metadata( name='sentence-source', istreams=[], ostreams=['sentences']) serve_computation(SentenceSource())
    Metadata,
    StreamGrouping,
    serve_computation
)

class Something(Computation):
    def init(self, ctx):
        self.concord_logger.info("Counter initialized")
        ctx.set_timer("init", int(round(time.time() * 1000)))

    def destroy(self):
        self.concord_logger.info("Source destroyed")

    def process_timer(self, ctx, key, timer):
        self.concord_logger.info("process timer")
        ctx.set_timer("process timer", int(round(time.time() * 1000)))

    def process_record(self, ctx, record):
        self.concord_logger.info("process record")

    def metadata(self):
        return Metadata(
            name='something',
            istreams=[],
            ostreams=["outputstream"])

serve_computation(Something())
import sys
import time

import concord
from concord.computation import (Computation, Metadata, StreamGrouping,
                                 serve_computation)

class Final(Computation):
    def init(self, ctx):
        self.concord_logger.info("initialized")

    def destroy(self):
        self.concord_logger.info("Source destroyed")

    def process_timer(self, ctx, key, timer):
        pass

    def process_record(self, ctx, record):
        self.concord_logger.info("process record")
        self.concord_logger.info("=====================")
        for key, val in record.__dict__.items():
            self.concord_logger.info("{}:{}".format(key, val))

    def metadata(self):
        return Metadata(name='final',
                        istreams=[("outstream", StreamGrouping.GROUP_BY)],
                        ostreams=[])

serve_computation(Final())