Exemplo n.º 1
0
 def testTopoSort(self):
     """Check that sort() returns the nodes of a ten-node DAG in the expected order."""
     # The log line now names this test (it used to say "testSort").
     logger.info("In TestFlow.testTopoSort ...")
     logger.debug("Setting up DAG ...")
     a, b, c, d, e, f, g, h, i, j = [HappyJobNode() for _ in range(10)]
     # (parent, child) edges; the insertion order is kept exactly as before
     # in case sort() breaks ties by it.
     edges = [
         (a, b), (a, c),
         (b, d), (b, e),
         (c, f), (c, g),
         (d, h),
         (g, i),
         (i, j),
         (e, j),
         (b, i),
         (a, j),
     ]
     for parent, child in edges:
         parent.addChild(child)
     logger.debug("Testing topological sort ...")
     self.assertEqual(a.sort(), [a, b, c, d, e, f, g, h, i, j])
     logger.info("DONE.")
Exemplo n.º 2
0
 def testDictize(self):
     """Round-trip a parent with two children through dictize()/dedictize() and compare fields."""
     logger.info("In TestFlow.testDictize ...")
     # NOTE(review): the nodes are built with `inputpath=` but compared through
     # `.inputpaths` below -- confirm which keyword the constructor expects.
     p1 = HappyJobNode(name="P1", inputpath=['inP1'], outputpath=['outP1'],
                       status='statusP1', job='NullJob()')
     c1 = HappyJobNode(name="C1", inputpath=['inC1'], outputpath=['outC1'],
                       status='statusC1', job='NullJob()')
     c2 = HappyJobNode(name="C2", inputpath=['inC2'], outputpath=['outC2'],
                       status='statusC2', job='NullJob()')
     p1.addChild(c1)
     p1.addChild(c2)
     # Named `serialized` rather than `dict` so the builtin is not shadowed.
     serialized = p1.dictize()
     d1 = HappyJobNode.dedictize(serialized)
     self.assertEqual(p1.name, d1.name)
     self.assertEqual(p1.inputpaths, d1.inputpaths)
     self.assertEqual(p1.outputpath, d1.outputpath)
     self.assertEqual(p1.job.__class__, d1.job.__class__)
     # Compare both children field by field, in the same order as before.
     for idx in (0, 1):
         expected = p1.children()[idx]
         actual = d1.children()[idx]
         self.assertEqual(expected.name, actual.name)
         self.assertEqual(expected.inputpaths, actual.inputpaths)
         self.assertEqual(expected.outputpath, actual.outputpath)
         self.assertEqual(expected.job.__class__, actual.job.__class__)
     logger.info("DONE.")
Exemplo n.º 3
0
 def _testJoin(self):
     """
     Manual test: filter names and types out of the crawl, inner-join them
     on guid, keep people, and aggregate guids per name into 'namelist'.
     """
     # The log line used to claim this was testGraphNames (copy/paste), which
     # made the two tests indistinguishable in the log.
     logger.info("In TestFlow.testJoin() ...")
     crawl = '/data/graph/latest/crawl'

     names_job = FilterExact(filterkey='propname',
                             filtervalues=['/type/object/name', '/common/topic/alias'],
                             keyfield='a:guid', mapfields={'value':'name'})
     names = HappyJobNode(name='get_names', job=names_job, inputpaths=[crawl])

     types_job = FilterExact(filterkey='propname',
                             filtervalues=['/type/object/type'],
                             keyfield='b:guid', mapfields={'target':'type'})
     types = HappyJobNode(name='get_types', job=types_job, inputpaths=[crawl])

     join = HappyJobNode(name='join_name_types',
                         job=InnerJoin(joinkeys=['a:guid', 'b:guid'], outkey='guid'))

     people_job = FilterExact(filterkey='type',
                              filtervalues=['/people/person'],
                              keyfield='guid', mapfields={'type':'type', 'name':'name'})
     people = HappyJobNode(name='filter_people', job=people_job)

     agg = HappyJobNode(name='invert_names',
                        job=AggregateJson(aggkey='name', aggfunc='agg.list("guid")'),
                        outputpath='namelist')

     # names and types both feed the join; the join feeds people -> agg.
     names.addChild(join)
     types.addChild(join)
     join.addChild(people)
     people.addChild(agg)
     names.run(force=True)
Exemplo n.º 4
0
 def testCreateWithParent(self):
     """Check that passing parents= at construction links parent and child both ways."""
     logger.info("In TestFlow.testCreateWithParent ...")
     parent = HappyJobNode()
     child = HappyJobNode(parents=parent)
     # The link must be visible from the child and from the parent.
     self.assertEqual(child.parents(), [parent])
     self.assertEqual(parent.children(), [child])
     logger.info("DONE.")
Exemplo n.º 5
0
 def _testHappyRun(self):
     """Manual test: run a bare IdentityJob from 'small' to 'crap', cleaning up the output."""
     # The log line used to claim this was testSingleRun (copy/paste).
     logger.info("In TestFlow.testHappyRun() ...")
     outpath = "crap"
     h = IdentityJob()
     h.inputpaths = "small"
     h.outputpath = outpath
     # Remove any stale output before the run ...
     dfs.delete(outpath)
     try:
         h.run()
     finally:
         # ... and always remove it afterwards, even when run() raises.
         dfs.delete(outpath)
Exemplo n.º 6
0
 def _testFilter(self):
     """Manual test: run a single FilterJson node over the crawl, writing 'typecount'."""
     logger.info("In TestFlow.testFilter() ...")
     type_filter = FilterJson(filterkey='propname',
                              filtervalues=['/type/object/type'],
                              returnkeys=['target', 'creator'])
     filter_node = HappyJobNode(name='filter_graph',
                                job=type_filter,
                                inputpaths=['/data/graph/latest/crawl'],
                                outputpath='typecount')
     filter_node.run(force=True)
Exemplo n.º 7
0
 def _testEmptyRun(self):
     """Manual test: rebuild a five-node DAG of NullJobs from its dict form and run it."""
     # The log line used to say "testRun", which matches no test here.
     logger.info("In TestFlow.testEmptyRun() ...")
     # Node id -> children ids + constructor kwargs.  Named `spec` rather than
     # `dict` so the builtin is not shadowed.
     spec = {1: {'children': [2, 3, 5], 'kwargs': {'job': 'NullJob()', 'name': 'P1', 'inputpaths':['small']}},
             2: {'children': [4],       'kwargs': {'job': 'NullJob()', 'name': 'C1'}},
             3: {'children': [4],       'kwargs': {'job': 'NullJob()', 'name': 'C2'}},
             5: {'children': [4],       'kwargs': {'job': 'NullJob()', 'name': 'C3'}},
             4: {'children': [],        'kwargs': {'job': 'NullJob()', 'name': 'G1', 'outputpath':'crap'}}}
     dag = HappyJobNode.dedictize(spec)
     dag.run(force=True)
Exemplo n.º 8
0
 def _testFilterLambda(self):
     """Manual test: run a FilterLambda node with two lambda-source filters, writing 'cnames'."""
     logger.info("In TestFlow.testFilterLambda() ...")
     # The filters are passed as source strings, not as callables.
     propname_filter = "lambda x: x.get('propname', None) in ['/type/object/name', '/common/topic/alias']"
     starts_with_c = "lambda y: type(y.get('value', ' '))==str and y.get('value', ' ').startswith('c')"
     lambda_job = FilterLambda(filters=[propname_filter, starts_with_c],
                               returnkeys=['value', '__keys__'])
     lambda_node = HappyJobNode(name='filter_graph_lambda',
                                job=lambda_job,
                                inputpaths=['/data/graph/latest/crawl'],
                                outputpath='cnames')
     lambda_node.run(force=True)
Exemplo n.º 9
0
 def testCreateSingle(self):
     """Test that a HappyJobNode can be created and keeps its constructor arguments."""
     logger.info("In TestFlow.testCreateSingle ...")
     node = HappyJobNode(name="name", inputpaths=['in'],
                         outputpath='out', job="NullJob()")
     # assertTrue replaces the deprecated assert_ alias; the None check uses
     # identity so a custom __ne__ on the node cannot affect it.
     self.assertTrue(node is not None)
     self.assertEqual(node.name, 'name')
     self.assertEqual(node.inputpaths, ['in'])
     self.assertEqual(node.outputpath, 'out')
     logger.info("DONE.")
Exemplo n.º 10
0
 def deleteOutFiles(self, onlytmp=True):
     """
     Deletes the output file of every node returned by self.sort().

     linkNodes() is called first, which gives every node it visits an
     outputpath (a generated "tmp.<name>" one where none was set).

     @param onlytmp: when true, only delete files whose last path component
                     starts with 'tmp.'; when false, delete every output file.
     """
     self.linkNodes()
     for node in self.sort():
         path = node.outputpath
         # linkNodes() may place generated files under a working directory
         # ("<workingDir>/tmp.<name>"), so the prefix is tested on the last
         # path component instead of the start of the whole path.
         if (not onlytmp) or path.rsplit('/', 1)[-1].startswith('tmp.'):
             logger.info("Deleting output file '%s'" % path)
             dfs.delete(path)
Exemplo n.º 11
0
 def _testDagRun(self):
     """Manual test: run a diamond of IdentityJobs (P1 -> C1, C2 -> G1) from 'small' to 'crap'."""
     logger.info("In TestFlow.testDagRun() ...")
     root = HappyJobNode(name="P1", job=IdentityJob(),inputpaths=['small'])
     left = HappyJobNode(name="C1", job=IdentityJob())
     right = HappyJobNode(name="C2", job=IdentityJob())
     sink = HappyJobNode(name="G1", job=IdentityJob(), outputpath='crap')
     # (parent, child) edges of the diamond, in the original insertion order.
     for parent, child in ((root, left), (root, right), (left, sink), (right, sink)):
         parent.addChild(child)
     root.run(force=True)
Exemplo n.º 12
0
 def testMultiParent(self):
     """Give one child two parents (one via addChild, one via addParent) and check all links."""
     logger.info("In TestFlow.testMultiParent ...")
     first_parent = HappyJobNode()
     second_parent = HappyJobNode()
     child = HappyJobNode()
     first_parent.addChild(child)
     child.addParent(second_parent)
     # Parents are expected in the order they were linked.
     self.assertEqual(child.parents(), [first_parent, second_parent])
     for parent in (first_parent, second_parent):
         self.assertEqual(parent.children(), [child])
     logger.info("DONE.")
Exemplo n.º 13
0
 def _testGraphNames(self):
     """Manual test: filter names/aliases out of the crawl and aggregate guids per value into 'namelist'."""
     logger.info("In TestFlow.testGraphNames() ...")
     name_filter = FilterExact(filterkey='propname',
                               filtervalues=['/type/object/name', '/common/topic/alias'],
                               keyfield='guid', mapfields={'value':'name'})
     filter_node = HappyJobNode(name='filter_graph_names',
                                job=name_filter,
                                inputpaths=['/data/graph/latest/crawl'])
     invert_job = AggregateJson(aggkey='value', aggfunc='agg.list("guid")')
     invert_node = HappyJobNode(name='invert_names',
                                job=invert_job,
                                outputpath='namelist')
     filter_node.addChild(invert_node)
     filter_node.run(force=True)
Exemplo n.º 14
0
 def _testCountTypes(self):
     """Manual test: filter type assertions out of the crawl and count them per target into 'typecount'."""
     logger.info("In TestFlow.testCountTypes() ...")
     type_filter = FilterExact(filterkey='propname',
                               filtervalues=['/type/object/type'],
                               returnkeys=['target'])
     filter_node = HappyJobNode(name='filter_graph',
                                job=type_filter,
                                inputpaths=['/data/graph/latest/crawl'])
     count_job = AggregateJson(aggkey='target', aggfunc='agg.count()')
     count_node = HappyJobNode(name='agg_types',
                               job=count_job,
                               outputpath='typecount')
     filter_node.addChild(count_node)
     filter_node.run(force=True)
Exemplo n.º 15
0
 def testMultiChild(self):
     """Give one parent two children and check the links from both sides."""
     logger.info("In TestFlow.testMultiChild ...")
     parent = HappyJobNode(name="P1")
     first_child = HappyJobNode(name="C1")
     second_child = HappyJobNode(name="C2")
     for child in (first_child, second_child):
         parent.addChild(child)
     self.assertEqual(first_child.parents(), [parent])
     self.assertEqual(second_child.parents(), [parent])
     # Children are expected in insertion order.
     self.assertEqual(parent.children(), [first_child, second_child])
     logger.info("DONE.")
Exemplo n.º 16
0
 def testCreateSingle(self):
     """Test that a HappyJobNode can be created and keeps its constructor arguments."""
     logger.info("In TestFlow.testCreateSingle ...")
     node = HappyJobNode(name="name", inFiles=['in'], outFiles=['out'],
                         status='status', job='job')
     # assertTrue replaces the deprecated assert_ alias; the None check uses
     # identity so a custom __ne__ on the node cannot affect it.
     self.assertTrue(node is not None)
     self.assertEqual(node.name, 'name')
     self.assertEqual(node.inFiles, ['in'])
     self.assertEqual(node.outFiles, ['out'])
     self.assertEqual(node.status, 'status')
     self.assertEqual(node.job, 'job')
     logger.info("DONE.")
Exemplo n.º 17
0
 def testDAG(self):
     """
     Build a nine-node DAG and check parent/child links, node retrieval,
     sinks/sources and the ancestor/descendant predicates.
     """
     logger.info("In TestFlow.testDAG ...")
     logger.debug("Setting up DAG ...")
     a, b, c, d, e, f, g, h, i = [HappyJobNode() for _ in range(9)]
     # (parent, child) edges, in the original insertion order; the list
     # assertions below compare against that order.
     edges = [
         (a, b), (a, c), (b, c),
         (d, f), (e, f),
         (c, g), (f, g), (h, g),
         (g, i),
     ]
     for parent, child in edges:
         parent.addChild(child)
     logger.debug("Testing parent/child relationships ...")
     self.assertEqual(a.parents(), [])
     self.assertEqual(a.children(), [b, c])
     self.assertEqual(c.parents(), [a, b])
     self.assertEqual(f.parents(), [d, e])
     self.assertEqual(g.parents(), [c, f, h])
     self.assertEqual(g.children(), [i])
     logger.debug("Testing node retrieval ...")
     # nodes() is expected to return the whole DAG regardless of the start node.
     nodes0 = set([a,b,c,d,e,f,g,h,i])
     nodes1 = a.nodes()
     nodes2 = e.nodes()
     self.assertEqual(nodes0, nodes1)
     self.assertEqual(nodes0, nodes2)
     logger.debug("Testing sinks and sources ...")
     sinks = a.sinks()
     self.assertEqual(sinks, [i])
     sources = a.sources()
     self.assertEqual(sources, [a, d, e, h])
     logger.debug("Testing isAncestorOf() and isDecendentOf() ...")
     # assertTrue / assertFalse replace the deprecated assert_ alias.
     self.assertTrue(a.isAncestorOf(b))
     self.assertTrue(a.isAncestorOf(g))
     self.assertTrue(a.isAncestorOf(i))
     self.assertFalse(a.isAncestorOf(d))
     self.assertTrue(i.isDecendentOf(g))
     self.assertTrue(i.isDecendentOf(a))
     self.assertTrue(i.isDecendentOf(b))
     self.assertTrue(i.isDecendentOf(e))
     self.assertFalse(f.isDecendentOf(a))
     logger.info("DONE.")
Exemplo n.º 18
0
 def testBidirectional(self):
     """Test that parent / child links are bidirectional, whichever side creates them."""
     logger.info("In TestFlow.testBidirectional ...")
     # Link created from the parent side.
     p1 = HappyJobNode()
     c1 = HappyJobNode()
     p1.addChild(c1)
     # assertEqual replaces the deprecated assertEquals alias and matches the
     # spelling used by the other tests.
     self.assertEqual(c1.parents(), [p1])
     self.assertEqual(p1.children(), [c1])
     # Link created from the child side.
     p2 = HappyJobNode()
     c2 = HappyJobNode()
     c2.addParent(p2)
     self.assertEqual(c2.parents(), [p2])
     self.assertEqual(p2.children(), [c2])
     logger.info("DONE.")
Exemplo n.º 19
0
 def _json_impl(agg, record):
     """
     Fold one record into the aggregate, collecting the values seen for each
     key in a list.

     @param agg: the aggregate so far (key -> list of values); any falsy value
                 starts a fresh dict.
     @param record: mapping whose items are merged into the aggregate.
     @return: the updated aggregate dict.
     """
     if not agg:
         agg = {}
     for (k, v) in record.items():
         logger.info("k: " + str(k) + ", v: " + str(v))
         many = happy.flow.isIterable(v)
         # `in` replaces dict.has_key(), which no longer exists in Python 3.
         if k in agg:
             if many:
                 agg[k].extend(v)
             else:
                 agg[k].append(v)
         elif many:
             # Copy instead of storing `v` itself: a later extend()/append()
             # on agg[k] would otherwise mutate the record's own value (and
             # fail outright for iterables without those methods).
             agg[k] = list(v)
         else:
             agg[k] = [v]
     return agg
Exemplo n.º 20
0
 def testCircle(self):
     """Check that any link which would close a cycle raises CycleException."""
     logger.info("In TestFlow.testCircle ...")
     root = HappyJobNode(name="P1")
     child_a = HappyJobNode(name="C1")
     child_b = HappyJobNode(name="C2")
     grandchild = HappyJobNode(name="G1")
     root.addChild(child_a)
     root.addChild(child_b)
     child_a.addChild(grandchild)
     # Each descendant is tried from both directions: as the parent of the
     # root (addChild) and as a new parent added to the root (addParent).
     attempts = [
         (child_a.addChild, root),
         (root.addParent, child_a),
         (child_b.addChild, root),
         (root.addParent, child_b),
         (grandchild.addChild, root),
         (root.addParent, grandchild),
     ]
     for link, other in attempts:
         self.assertRaises(CycleException, link, other)
Exemplo n.º 21
0
 def linkNodes(self, workingDir=None):
     """
     Assures that every parent/child pair have a matching file: each node's
     outputpath appears in the inputpaths of each of its children.  A node
     without an outputpath is given a generated "tmp.<name>" one.

     Repeated calls do not add duplicates: a child already listing the
     parent's output file is left untouched.

     @param workingDir: the directory to create temp files in.
     @raise FlowException: if workingDir exists but is not a directory, or
                           if a source node has no inputpaths.
     """
     if workingDir:
         logger.info("Linking nodes, using workingDir = %s" % (workingDir))
         if dfs.exists(workingDir):
             if not dfs.fileStatus(workingDir).isDir():
                 raise FlowException("%s is a file, not a directory." % (workingDir))
         else:
             logger.info("Creating working directory %s." % (workingDir))
             # NOTE(review): the mkdir call is disabled, so nothing is
             # actually created here despite the log message.
             # dfs.mkdir(workingDir)
     stack = self.sources()
     for source in stack:
         if not source.inputpaths:
             raise FlowException("Source node %s has no inputpaths defined." % source)
     # Breadth-first walk from the sources.  Nodes reachable through several
     # parents are queued more than once; track them by identity so each one
     # is processed a single time.
     visited = set()
     while stack:
         node = stack.pop(0)
         if id(node) in visited:
             continue
         visited.add(id(node))
         if node.outputpath:
             logger.trace("linkNodes(): %s has an outputpath '%s'.  Using it." % (node, node.outputpath))
             filename = node.outputpath
         else:
             filename = "tmp.%s" % (node.name)
             if workingDir:
                 filename = "%s/%s" % (workingDir, filename)
             logger.trace("linkNodes(): Created temp outfile '%s' for %s." % (filename, node))
             node.outputpath = filename
         for child in node.children():
             # Membership test on the path itself.  The previous
             # set(node.outputpath) built a set of single characters, so the
             # check never matched and every call appended the file again.
             if (not child.inputpaths) or (filename not in castList(child.inputpaths)):
                 logger.debug("linkNodes(): Linked %s and %s with file '%s'." % (node, child, filename))
                 child.inputpaths = castList(child.inputpaths) + [filename]
             stack.append(child)
         logger.debug("%s has inputs %s and outputs %s" % (node, node.inputpaths, node.outputpath))
Exemplo n.º 22
0
 def testFlow2(self):
     """
     Build the names/types join pipeline through Flow2.split() and chain(),
     log the resulting DAG and run it without forcing.
     """
     logger.info("In TestFlow.testFlow2() ...")
     flow = Flow2(inputpaths=['/data/graph/latest/crawl'], outputpath='namelist')
     names, types = flow.split()

     # Branch 1: names and aliases, keyed on a:guid.
     names_job = FilterExact(filterkey='propname',
                             filtervalues=['/type/object/name', '/common/topic/alias'],
                             keyfield='a:guid', mapfields={'value':'name'})
     names.chain(HappyJobNode(name='get_names', job=names_job))

     # Branch 2: types, keyed on b:guid.
     types_job = FilterExact(filterkey='propname',
                             filtervalues=['/type/object/type'],
                             keyfield='b:guid', mapfields={'target':'type'})
     types.chain(HappyJobNode(name='get_types', job=types_job))

     # Join the two branches; only this node is created with force=True.
     join_job = InnerJoin(joinkeys=['a:guid', 'b:guid'], outkey='guid')
     join_node = HappyJobNode(name='join_name_types', job=join_job, force=True)
     names.chain(join_node, join=types)

     people_job = FilterExact(filterkey='type',
                              filtervalues=['/people/person'],
                              keyfield='guid', mapfields={'type':'type', 'name':'name'})
     names.chain(HappyJobNode(name='filter_people', job=people_job))

     invert_job = AggregateJson(aggkey='name', aggfunc='agg.list("guid")')
     names.chain(HappyJobNode(name='invert_names', job=invert_job))

     logger.debug("DAG: \n%s\n" % names.startNode.dictize())
     names.run(force=False)
Exemplo n.º 23
0
 def _testSingleRun(self):
     """Manual test: run one IdentityJob node from 'small' to 'crap'."""
     logger.info("In TestFlow.testSingleRun() ...")
     identity = IdentityJob()
     single = HappyJobNode(name="P1", job=identity,
                           inputpaths=['small'], outputpath='crap')
     single.run(force=True)
Exemplo n.º 24
0
 def _testFlowRun(self):
     """Manual test: build two chained IdentityJob branches with Flow, join them and run."""
     logger.info("In TestFlow.testFlowRun() ...")
     head = Flow(IdentityJob(),inputpaths=['small'])
     branch = head.chain(IdentityJob())
     # Second branch off the head; its last step joins the first branch and
     # writes the final output.
     middle = head.chain(IdentityJob())
     tail = middle.chain(IdentityJob(), join=branch, outputpath='crap')
     tail.run(force=True)
Exemplo n.º 25
0
 def run(self):
     """Log that the job fired and write a fixed marker line to self.outputpath."""
     logger.info("NullJob %s fired." % self.name)
     w = dfs.write(self.outputpath)
     try:
         w.write("NullJob() output -- for testing only.")
     finally:
         # Close the writer even when the write fails, so the handle is not leaked.
         w.close()
Exemplo n.º 26
0
 def testNull(self):
     """Smoke test: a trivially true assertion shows the unittest harness runs."""
     logger.info("In TestFlow.testNull ...")
     one = 1
     self.assertEqual(one, 1)
     logger.info("DONE.")
Exemplo n.º 27
0
    def run(self, force=False, workingDir=None):
        """
        Runs the entire job chain (ie DAG) that contains this node.

        Nodes are visited in the order given by self.sort().  A node is only
        processed once it is in the "ok" list, which starts as the sources and
        grows with the children of every node that completed or was skipped;
        a node outside that list is logged and passed over.

        @param force: when true, delete every output file (not only temp
                      files) before running.  Independent of the per-node
                      `node.force` flag, which re-fires a single node.
        @param workingDir: passed to linkNodes() as the directory for temp files.
        """
        logger.debug("Calling HappyJobNode.run(), workingDir=%s" % workingDir)
        self.linkNodes(workingDir)
        if force:
            self.deleteOutFiles(onlytmp=False)
        stack = self.sort()
        logger.info("Stack order is: %s" % (", ".join([str(x._id) for x in stack])))
        ok_children = self.sources()
        for node in stack:
            if node not in ok_children:
                logger.warn("Branch terminated: node %s not in ok_children list %s." % (node, ok_children))
                continue

            putChildren = False
            pre = node.precheck()
            if node.force:
                logger.info("FORCING %s [%s --> %s] (delete %s first)" % (node, node.inputpaths, node.outputpath, node.outputpath))
                dfs.delete(node.outputpath)
                node.fire()
            elif pre == 'ready':
                logger.info("Running %s [%s --> %s]" % (node, node.inputpaths, node.outputpath))
                node.fire()
            else:
                logger.info("Skipping job %s: already done" % node)
                putChildren = True
                # Mark the node being skipped.  This used to set self.status,
                # i.e. the node run() was called on, whichever node was skipped.
                node.status = 'skip'

            post = node.postcheck()
            if post == 'done':
                logger.info("Job %s completed successfully. " % node)
                putChildren = True
            elif post == 'fail':
                logger.info("Job %s failed.  Not adding children." % node)

            if putChildren:
                if node.isSink():
                    logger.info("Job %s is a sink, no children." % node)
                else:
                    newChildren = [child for child in node.children() if child not in ok_children]
                    logger.info("Placing children %s of job %s on stack." %  (newChildren, node))
                    ok_children.extend(newChildren)
Exemplo n.º 28
0
 def fire(self, *args, **kwargs):
     """Log that this node fired; any positional or keyword arguments are ignored."""
     message = "NullNode %s fired." % self.name
     logger.info(message)