import os
import time
from itertools import combinations

# Project-specific names used below (CJSONProtocol, FileIO, TweetsFile,
# StreamSimilarityAggregationMR, iterateUserDocuments, the *_twitter_stream_settings
# dicts and the path/stats-file constants) are imported from the project's own modules.


def getStatsForSSAMR():
    batchSize = 50000
    default_experts_twitter_stream_settings['ssa_threshold'] = 0.75
    for id in range(0, 10):
        ts = time.time()
        fileName = time_to_process_points + '%s/%s' % (batchSize, id)
        iteration_file = '%s_%s' % (batchSize, id)
        print 'Generating data for ', iteration_file
        # Write every unordered pair of user documents as one CJSON line.
        with open(iteration_file, 'w') as fp:
            for doc1, doc2 in combinations(iterateUserDocuments(fileName), 2):
                fp.write(CJSONProtocol.write('x', [doc1, doc2]) + '\n')
        # Push the pairs file to HDFS and run the stream-similarity MR job on it.
        os.system('hadoop fs -put %s %s' % (iteration_file, hdfsUnzippedPath))
        StreamSimilarityAggregationMR.estimate(
            hdfsUnzippedPath + '/%s' % iteration_file,
            args='-r hadoop'.split(),
            jobconf={'mapred.map.tasks': 25,
                     'mapred.task.timeout': 7200000,  # milliseconds, i.e. 2 hours
                     'mapred.reduce.tasks': 25})
        # Clean up the HDFS and local copies of the iteration input.
        os.system('hadoop fs -rmr %s' % (hdfsUnzippedPath + '/%s' % iteration_file))
        os.system('rm -rf %s' % iteration_file)
        iteration_data = {'iteration_time': time.time() - ts,
                          'type': 'ssa_mr',
                          'number_of_messages': batchSize * (id + 1),
                          'batch_size': batchSize}
        FileIO.writeToFileAsJson(iteration_data, ssa_mr_stats_file)
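# getStatsForSSAMR and the generators below assume CJSONProtocol.write(key, value)
# serializes a key/value pair into a single line, in the style of an mrjob protocol.
# The class below is a minimal sketch of that assumption for illustration only; it
# uses the stdlib json module, whereas the project's CJSONProtocol presumably wraps
# the faster cjson package, and it is not the project's actual implementation.
import json


class CJSONProtocolSketch(object):

    @staticmethod
    def write(key, value):
        # One tab-separated "key<TAB>value" line, both halves JSON-encoded.
        return '%s\t%s' % (json.dumps(key), json.dumps(value))

    @staticmethod
    def read(line):
        raw_key, raw_value = line.split('\t', 1)
        return json.loads(raw_key), json.loads(raw_value)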
def generateDocsForSSAMR():
    for length in [1000000, 1100000, 1200000]:
        tf = TweetsFile(length, **experts_twitter_stream_settings)
        iteration_file = clustering_quality_experts_ssa_mr_folder + str(length)
        print 'Generating data for ', iteration_file
        # Write every unordered pair of user documents as one CJSON line.
        with open(iteration_file, 'w') as fp:
            for doc1, doc2 in combinations(tf._iterateUserDocuments(), 2):
                fp.write(CJSONProtocol.write('x', [doc1, doc2]) + '\n')
        # Compress the pairs file and push it to HDFS.
        os.system('gzip %s' % iteration_file)
        print 'hadoop fs -put %s.gz %s' % (iteration_file, hdfsPath)
        os.system('hadoop fs -put %s.gz %s' % (iteration_file, hdfsPath))
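# For n user documents, combinations(documents, 2) yields n * (n - 1) / 2 unordered
# pairs, so the intermediate pairs file grows quadratically with the document count.
# A small stand-in illustration of that count (hypothetical helper, not part of the
# project):
def _pairwiseLineCount(documents):
    # Number of lines generateDocsForSSAMR would write for these documents.
    return sum(1 for _ in combinations(documents, 2))

# _pairwiseLineCount(['d1', 'd2', 'd3', 'd4']) == 6, i.e. 4 * 3 / 2 pairs.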
def createFileForNextIteration(data):
    with open(iteration_file, 'w') as fp:
        for k, v in data.iteritems():
            fp.write(CJSONProtocol.write(k, v) + '\n')
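# Hedged usage sketch for createFileForNextIteration: the function writes to an
# iteration_file name that is expected to be bound in the enclosing scope, emitting
# one CJSON line per key/value pair in data. The path and payload below are made up
# for illustration.
def _demoCreateFileForNextIteration():
    global iteration_file
    iteration_file = '/tmp/ssa_mr_next_iteration'  # hypothetical path
    createFileForNextIteration({'cluster_1': ['doc_a', 'doc_b'],
                                'cluster_2': ['doc_c']})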