Example #1
import os
import time
from itertools import combinations

def getStatsForSSAMR():
    # CJSONProtocol, StreamSimilarityAggregationMR, FileIO, iterateUserDocuments,
    # default_experts_twitter_stream_settings, time_to_process_points,
    # hdfsUnzippedPath, and ssa_mr_stats_file come from the surrounding project.
    batchSize = 50000
    default_experts_twitter_stream_settings['ssa_threshold'] = 0.75
    for i in range(10):
        ts = time.time()
        fileName = time_to_process_points + '%s/%s' % (batchSize, i)
        iteration_file = '%s_%s' % (batchSize, i)
        print 'Generating data for', iteration_file
        # Write every pair of user documents as one CJSON-encoded line.
        with open(iteration_file, 'w') as fp:
            for doc1, doc2 in combinations(iterateUserDocuments(fileName), 2):
                fp.write(CJSONProtocol.write('x', [doc1, doc2]) + '\n')
        # Stage the input on HDFS and run the MR job over it.
        os.system('hadoop fs -put %s %s' % (iteration_file, hdfsUnzippedPath))
        StreamSimilarityAggregationMR.estimate(
            hdfsUnzippedPath + '/%s' % iteration_file,
            args='-r hadoop'.split(),
            jobconf={
                'mapred.map.tasks': 25,
                'mapred.task.timeout': 7200000,
                'mapred.reduce.tasks': 25
            })
        # Remove the HDFS copy and the local staging file.
        os.system('hadoop fs -rmr %s' %
                  (hdfsUnzippedPath + '/%s' % iteration_file))
        os.system('rm -rf %s' % iteration_file)
        # Record wall-clock time for this batch as one JSON line.
        iteration_data = {
            'iteration_time': time.time() - ts,
            'type': 'ssa_mr',
            'number_of_messages': batchSize * (i + 1),
            'batch_size': batchSize
        }
        FileIO.writeToFileAsJson(iteration_data, ssa_mr_stats_file)
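estimate itself is project code, not part of mrjob. As a rough sketch of the shape it could have, assuming the pre-0.6 mrjob runner API this Python 2 code dates from (the mapper/reducer placeholders and the jobconf plumbing are assumptions, not the project's actual implementation):

from mrjob.job import MRJob

class StreamSimilarityAggregationMR(MRJob):
    def mapper(self, _, line):
        # Placeholder: the real job would emit document-similarity pairs.
        yield 'cluster', line

    def reducer(self, key, values):
        # Placeholder: the real job would merge similar documents.
        yield key, list(values)

    @classmethod
    def estimate(cls, input_path, args=None, jobconf=None):
        # Turn the jobconf dict into mrjob's --jobconf switches, run the
        # job eagerly, and return the parsed output values.
        cli_args = list(args or [])
        for prop, value in (jobconf or {}).items():
            cli_args += ['--jobconf', '%s=%s' % (prop, value)]
        job = cls(args=cli_args + [input_path])
        with job.make_runner() as runner:
            runner.run()
            return [job.parse_output_line(line)[1]
                    for line in runner.stream_output()]

Returning a fully materialized list would also explain why the example above can call estimate purely for its side effects and discard the result.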
def getStatsForSSAMR(self):
    # self.hdfsUnzippedFile and self.getEvaluationMetrics come from the
    # enclosing evaluation class.
    print 'SSA-MR'
    ts = time.time()
    # Run the MR job and materialize the resulting document clusters.
    documentClusters = list(
        StreamSimilarityAggregationMR.estimate(
            self.hdfsUnzippedFile,
            args='-r hadoop'.split(),
            jobconf={
                'mapred.map.tasks': 25,
                'mapred.task.timeout': 7200000,
                'mapred.reduce.tasks': 25
            }))
    te = time.time()
    return self.getEvaluationMetrics(documentClusters, te - ts)
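getEvaluationMetrics is defined elsewhere in the project. A purely hypothetical shape for it, reusing the same keys as the iteration_data dict in the first example, might be:

def getEvaluationMetrics(self, documentClusters, runtime):
    # Hypothetical body; the real project computes its own metrics.
    return {
        'type': 'ssa_mr',
        'iteration_time': runtime,
        'number_of_clusters': len(documentClusters),
    }

Note the list() around estimate: if the job output is streamed lazily, materializing it before taking te ensures the measured time includes reading every cluster.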
Example #5
def test_estimate(self):
    # Use the Hadoop runner only on the host named 'spock'; fall back to
    # mrjob's local runner everywhere else.
    args = '-r hadoop' if os.uname()[1] == 'spock' else '-r local'
    self.assertEqual(
        [['1', '3', '2', '4'], ['5', '7', '6']],
        list(StreamSimilarityAggregationMR.estimate(
            test_file, args=args.split(),
            jobconf={'mapred.reduce.tasks': 2})))
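The assertion fixes both the cluster order and the element order the job happens to emit ('1', '3', '2', '4'). If that ordering ever varied across runners, an order-insensitive variant (hypothetical; not part of the original suite) could normalize both sides before comparing:

def test_estimate_order_insensitive(self):
    # Hypothetical variant: sort elements within each cluster and then the
    # clusters themselves, so the check ignores emission order.
    args = '-r hadoop' if os.uname()[1] == 'spock' else '-r local'
    observed = list(StreamSimilarityAggregationMR.estimate(
        test_file, args=args.split(), jobconf={'mapred.reduce.tasks': 2}))
    expected = [['1', '3', '2', '4'], ['5', '7', '6']]
    self.assertEqual(sorted(sorted(c) for c in expected),
                     sorted(sorted(c) for c in observed))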