def getStatsForSSAMR(): batchSize = 50000 default_experts_twitter_stream_settings['ssa_threshold'] = 0.75 for id in range(0, 10): ts = time.time() fileName = time_to_process_points + '%s/%s' % (batchSize, id) iteration_file = '%s_%s' % (batchSize, id) print 'Generating data for ', iteration_file with open(iteration_file, 'w') as fp: [ fp.write(CJSONProtocol.write('x', [doc1, doc2]) + '\n') for doc1, doc2 in combinations(iterateUserDocuments(fileName), 2) ] os.system('hadoop fs -put %s %s' % (iteration_file, hdfsUnzippedPath)) StreamSimilarityAggregationMR.estimate( hdfsUnzippedPath + '/%s' % iteration_file, args='-r hadoop'.split(), jobconf={ 'mapred.map.tasks': 25, 'mapred.task.timeout': 7200000, 'mapred.reduce.tasks': 25 }) os.system('hadoop fs -rmr %s' % (hdfsUnzippedPath + '/%s' % iteration_file)) os.system('rm -rf %s' % iteration_file) iteration_data = { 'iteration_time': time.time() - ts, 'type': 'ssa_mr', 'number_of_messages': batchSize * (id + 1), 'batch_size': batchSize } FileIO.writeToFileAsJson(iteration_data, ssa_mr_stats_file)
def getStatsForSSAMR(): batchSize = 50000 default_experts_twitter_stream_settings['ssa_threshold']=0.75 for id in range(0, 10): ts = time.time() fileName = time_to_process_points+'%s/%s'%(batchSize,id) iteration_file = '%s_%s'%(batchSize, id) print 'Generating data for ', iteration_file with open(iteration_file, 'w') as fp: [fp.write(CJSONProtocol.write('x', [doc1, doc2])+'\n') for doc1, doc2 in combinations(iterateUserDocuments(fileName),2)] os.system('hadoop fs -put %s %s'%(iteration_file, hdfsUnzippedPath)) StreamSimilarityAggregationMR.estimate(hdfsUnzippedPath+'/%s'%iteration_file, args='-r hadoop'.split(), jobconf={'mapred.map.tasks':25, 'mapred.task.timeout': 7200000, 'mapred.reduce.tasks':25}) os.system('hadoop fs -rmr %s'%(hdfsUnzippedPath+'/%s'%iteration_file)) os.system('rm -rf %s'%iteration_file) iteration_data = {'iteration_time': time.time()-ts, 'type': 'ssa_mr', 'number_of_messages': batchSize*(id+1), 'batch_size': batchSize} FileIO.writeToFileAsJson(iteration_data, ssa_mr_stats_file)
def getStatsForSSAMR(self): print "SSA-MR" ts = time.time() documentClusters = list( StreamSimilarityAggregationMR.estimate( self.hdfsUnzippedFile, args="-r hadoop".split(), jobconf={"mapred.map.tasks": 25, "mapred.task.timeout": 7200000, "mapred.reduce.tasks": 25}, ) ) te = time.time() return self.getEvaluationMetrics(documentClusters, te - ts)
def getStatsForSSAMR(self): print 'SSA-MR' ts = time.time() documentClusters = list( StreamSimilarityAggregationMR.estimate(self.hdfsUnzippedFile, args='-r hadoop'.split(), jobconf={ 'mapred.map.tasks': 25, 'mapred.task.timeout': 7200000, 'mapred.reduce.tasks': 25 })) te = time.time() return self.getEvaluationMetrics(documentClusters, te - ts)
def test_estimate(self):
    """Check that estimate() groups the test documents into the expected clusters."""
    # Use the hadoop runner only on the 'spock' host; fall back to local mode.
    if os.uname()[1] == 'spock':
        runner_args = '-r hadoop'
    else:
        runner_args = '-r local'
    observed = list(StreamSimilarityAggregationMR.estimate(
        test_file, args=runner_args.split(), jobconf={'mapred.reduce.tasks': 2}))
    self.assertEqual([['1', '3', '2', '4'], ['5', '7', '6']], observed)