def bic_using_mapreduce(self, iteration_bic_list, em_iters):
     """Choose the pair of GMMs to merge by scoring every pair in a BICMRJob.

     One pickled record is built per index pair ``(i, j)`` with ``i < j``.
     The record key is the index pair and the value is
     ``(gmm_i, gmm_j, concatenated_data, em_iters)``.  The job must produce
     exactly one key/value pair; its value carries the winning index pair
     and score.  The merged GMM is then recomputed locally with
     ``compute_distance_BIC`` on the winning pair.

     Args:
         iteration_bic_list: indexable sequence of ``(gmm, data)`` pairs.
         em_iters: passed through to the job and to ``compute_distance_BIC``.

     Returns:
         ``(new_gmm, (g1, g2), merged_tuple_indices, best_score)``.

     Raises:
         AssertionError: if the job output is not exactly one pair.
     """
     mr_args = ['-v', '--strict-protocols', '-r', 'local',
                '--input-protocol', 'pickle',
                '--output-protocol', 'pickle',
                '--protocol', 'pickle']

     # One serialized record per unordered pair of entries.
     num_entries = len(iteration_bic_list)
     records = []
     for first in range(num_entries):
         for second in range(first + 1, num_entries):
             gmm_a, data_a = iteration_bic_list[first]
             gmm_b, data_b = iteration_bic_list[second]
             payload = (gmm_a, gmm_b, np.concatenate((data_a, data_b)), em_iters)
             records.append(protocol.write((first, second), payload) + "\n")

     job = BICMRJob(args=mr_args).sandbox(stdin=records)
     runner = job.make_runner()
     runner.run()
     results = [job.parse_output_line(line) for line in runner.stream_output()]
     assert len(results) == 1
     winning_value = results[0][1]
     # Only the indices and the score are used from the job's answer.
     _job_gmm, _job_pair, merged_tuple_indices, best_score = winning_value

     # Rebuild the merged model locally from the winning pair.
     left, right = merged_tuple_indices
     g1, d1 = iteration_bic_list[left]
     g2, d2 = iteration_bic_list[right]
     merged_data = np.concatenate((d1, d2))
     new_gmm, _score = compute_distance_BIC(g1, g2, merged_data, em_iters)

     return new_gmm, (g1, g2), merged_tuple_indices, best_score
示例#2
0
 def internal_protocol(self):
     """Return a protocol instance matching ``self.options.internal_format``.

     'json' gives a StandardJSONProtocol, 'pickle' a PickleProtocol and
     'raw' a RawProtocol.  Any other value yields None.
     """
     chosen = self.options.internal_format
     if chosen == 'json':
         return StandardJSONProtocol()
     if chosen == 'pickle':
         return PickleProtocol()
     if chosen == 'raw':
         return RawProtocol()
     return None
示例#3
0
def diarize_all( infilenames ):
    """Run a ClusterMRJob over every name in ``infilenames``.

    Each name is serialized as a ``(name, None)`` record with the
    module-level ``protocol`` and handed to the job as one stdin line.
    The call returns after ``runner.run()`` finishes; nothing is returned.

    Args:
        infilenames: iterable of input names to process.
    """
    mr_args = ['-v', '--strict-protocols',
        '-r', 'hadoop',
        '--input-protocol', 'pickle',
        '--output-protocol','pickle',
        '--protocol','pickle'
    ]
    # Build the task list from the argument so the function does not depend
    # on a global being defined by the caller.
    task_args = [protocol.write(name, None)+"\n" for name in infilenames]
    job = ClusterMRJob(args=mr_args).sandbox(stdin=task_args)
    runner = job.make_runner()
    runner.run()
 def train_using_mapreduce(self, init_training, em_iters):
     """Run a TrainMRJob over the training entries and return its values.

     Every entry of ``init_training`` is numbered from 0 and packed as
     ``(index, entry, em_iters)``.  Each packed item becomes the key of one
     pickled stdin record whose value is None.

     Returns:
         A list with the value part of every key/value pair the job emitted,
         in the order the runner streamed them.
     """
     mr_args = ['-v', '--strict-protocols', '-r', 'hadoop',
                '--input-protocol', 'pickle',
                '--output-protocol', 'pickle',
                '--protocol', 'pickle']
     numbered = [(idx, entry, em_iters)
                 for idx, entry in enumerate(init_training)]
     task_args = [protocol.write(item, None) + "\n" for item in numbered]

     job = TrainMRJob(args=mr_args).sandbox(stdin=task_args)
     runner = job.make_runner()
     runner.run()

     results = [job.parse_output_line(line) for line in runner.stream_output()]
     return [value for _key, value in results]
 def score_using_mapreduce(self, gmm_list):
     """Run a ScoreMRJob over the given GMMs and return its values.

     Every element of ``gmm_list`` is numbered from 0 and packed as
     ``(index, gmm)``.  Each packed item becomes the key of one pickled
     stdin record whose value is None.

     Returns:
         A list with the value part of every key/value pair the job emitted,
         in the order the runner streamed them.
     """
     mr_args = ['-v', '--strict-protocols', '-r', 'hadoop',
                '--input-protocol', 'pickle',
                '--output-protocol', 'pickle',
                '--protocol', 'pickle']
     numbered = [(idx, gmm) for idx, gmm in enumerate(gmm_list)]
     task_args = [protocol.write(item, None) + "\n" for item in numbered]

     job = ScoreMRJob(args=mr_args).sandbox(stdin=task_args)
     runner = job.make_runner()
     runner.run()

     results = [job.parse_output_line(line) for line in runner.stream_output()]
     return [value for _key, value in results]
 def train_using_mapreduce(self, init_training, em_iters):
     mr_args = ['-v', '--strict-protocols', '-r', 'hadoop','--input-protocol', 'pickle','--output-protocol','pickle','--protocol','pickle']    
     input = []
     count = 0
     t = time.time()
     for pair in init_training:
         g, start, interval = pair
         #g.initialize_asp_mod()
         input.append((count, pair, em_iters))
         count = count+1
     task_args = [protocol.write(pair, None)+"\n" for pair in input]
     print "[train] preparation time:", time.time()-t
     t = time.time()
     job = TrainMRJob(args=mr_args).sandbox(stdin=task_args)
     runner = job.make_runner()
     print "[train] init mrjob:", time.time()-t        
     runner.run()
     kv_pairs = map(job.parse_output_line, runner.stream_output())
     #keys = map(lambda(k, v): k, kv_pairs)
     #print "Returned keys:", keys
     return map(lambda(k, v): v, kv_pairs)  
 def segment_using_mapreduce(self, gmm_list, map_input, em_iter):
     """Run a SegmentMRJob and fold its results back into ``gmm_list``.

     Before the job starts, ``gmm_list`` and ``em_iter`` are pickled to the
     files ``self_gmmlist`` and ``self_em_iter`` in the current working
     directory, and both files are chmod-ed with the same permission mask.
     (Presumably the job's tasks read these files; that code is not shown
     here.)

     Each element of ``map_input`` becomes the key of one pickled stdin
     record whose value is None.  Every output pair is expected to have the
     shape ``((p, data_indices), g)``.

     Args:
         gmm_list: indexable, mutable collection of GMMs.  It is updated in
             place: ``gmm_list[p]`` is replaced by ``g`` for every output pair.
         map_input: iterable of keys to feed to the job.
         em_iter: value pickled to ``self_em_iter`` for the job.

     Returns:
         ``(iter_bic_dict, iter_bic_list)`` where ``iter_bic_dict`` maps each
         ``p`` to its ``data_indices`` and ``iter_bic_list`` holds the
         ``(p, data_indices)`` keys in output order.
     """
     # Same mask as before (rwxr-xr-x if these are the stat module constants).
     shared_mode = (S_IRUSR | S_IWUSR | S_IXUSR |
                    S_IRGRP | S_IXGRP |
                    S_IROTH | S_IXOTH)
     for filename, payload in (('self_gmmlist', gmm_list),
                               ('self_em_iter', em_iter)):
         # Close the handle explicitly so the data is flushed to disk before
         # the chmod and before the job is launched.
         # NOTE(review): mode 'w' is kept so the bytes on disk do not change;
         # confirm how the reading side opens the file before moving to 'wb'.
         with open(filename, 'w') as handle:
             pickle.dump(payload, handle)
         os.chmod(filename, shared_mode)

     mr_args = ['-v', '--strict-protocols', '-r', 'hadoop',
                '--input-protocol', 'pickle',
                '--output-protocol', 'pickle',
                '--protocol', 'pickle']
     task_args = [protocol.write(i, None)+"\n" for i in map_input]
     job = SegmentMRJob(args=mr_args).sandbox(stdin=task_args)
     runner = job.make_runner()
     runner.run()

     kv_pairs = [job.parse_output_line(line) for line in runner.stream_output()]
     iter_bic_list = [key for key, _gmm in kv_pairs]
     iter_bic_dict = {}
     for (p, data_indices), g in kv_pairs:
         iter_bic_dict[p] = data_indices
         gmm_list[p] = g             # Update trained GMMs
     return iter_bic_dict, iter_bic_list
示例#8
0
 def test_bad_data(self):
     # These bytes are expected not to be decodable by PickleProtocol.
     undecodable = b'{@#$@#!^&*$%^'
     self.assertCantDecode(PickleProtocol(), undecodable)
示例#9
0
 def test_round_trip_with_trailing_tab(self):
     # Run the trailing-tab round-trip check on every sample pair, using a
     # fresh PickleProtocol for each one.
     for key, value in PICKLE_KEYS_AND_VALUES:
         proto = PickleProtocol()
         self.assertRoundTripWithTrailingTabOK(proto, key, value)
示例#10
0
 def test_round_trip(self):
     # Run the round-trip check on every sample pair, using a fresh
     # PickleProtocol for each one.
     for key, value in PICKLE_KEYS_AND_VALUES:
         proto = PickleProtocol()
         self.assertRoundTripOK(proto, key, value)
示例#11
0
 
all_meeting_names = [
'E001/HVC006045', 'E001/HVC006184', 'E001/HVC011409', 'E001/HVC022974', 'E001/HVC026971', 'E001/HVC027850', 'E001/HVC029485', 'E001/HVC031151', 'E001/HVC032158', 'E001/HVC036790', 'E001/HVC040785', 'E001/HVC042692', 'E001/HVC049437', 'E001/HVC064215', 'E001/HVC067270', 'E001/HVC068504', 'E001/HVC087057', 'E001/HVC090414', 'E001/HVC091740', 'E001/HVC095303', 'E001/HVC098807', 'E001/HVC103302', 'E001/HVC103932', 'E001/HVC105766', 'E001/HVC108320', 'E001/HVC109456', 'E001/HVC123288', 'E001/HVC134406', 'E001/HVC134634', 'E001/HVC146032', 'E001/HVC148793', 'E001/HVC152448', 'E001/HVC169936', 'E001/HVC174999', 'E001/HVC198184', 'E001/HVC201655', 'E001/HVC203988', 'E001/HVC218344', 'E001/HVC218765', 'E001/HVC240490', 'E001/HVC243157', 'E001/HVC255452', 'E001/HVC257987', 'E001/HVC259599', 'E001/HVC263724', 'E001/HVC268521', 'E001/HVC271903', 'E001/HVC283780', 'E001/HVC292292', 'E001/HVC293678', 'E001/HVC298568', 'E001/HVC309504', 'E001/HVC317804', 'E001/HVC319297', 'E001/HVC319600', 'E001/HVC320560', 'E001/HVC334254', 'E001/HVC359216', 'E001/HVC362146', 'E001/HVC364230', 'E001/HVC365963', 'E001/HVC377296', 'E001/HVC396658', 'E001/HVC397279', 'E001/HVC398674', 'E001/HVC402970', 'E001/HVC403628', 'E001/HVC417926', 'E001/HVC418407', 'E001/HVC425927', 'E001/HVC429818', 'E001/HVC434449', 'E001/HVC450989', 'E001/HVC460343', 'E001/HVC462262', 'E001/HVC462508', 'E001/HVC463620', 'E001/HVC467645', 'E001/HVC468477', 'E001/HVC473973', 'E001/HVC479361', 'E001/HVC486704', 'E001/HVC498099', 'E001/HVC499900', 'E001/HVC506183', 'E001/HVC507955', 'E001/HVC513054', 'E001/HVC515040', 'E001/HVC516366', 'E001/HVC520461', 'E001/HVC528762', 'E001/HVC528929', 'E001/HVC529613', 'E001/HVC532992', 'E001/HVC532993', 'E001/HVC539647', 'E001/HVC541506', 'E001/HVC542481', 'E001/HVC549861', 'E001/HVC553302', 'E001/HVC561733', 'E001/HVC562777', 'E001/HVC570651', 'E001/HVC573643', 'E001/HVC579741', 'E001/HVC591147', 'E001/HVC597104', 'E001/HVC602688', 'E001/HVC605240', 'E001/HVC606315', 'E001/HVC615626', 
'E001/HVC616948', 'E001/HVC620320', 'E001/HVC631691', 'E001/HVC637907', 'E001/HVC646345', 'E001/HVC652043', 'E001/HVC658517', 'E001/HVC672564', 'E001/HVC676565', 'E001/HVC680956', 'E001/HVC681821', 'E001/HVC682794', 'E001/HVC683835', 'E001/HVC687278', 'E001/HVC690639', 'E001/HVC699748', 'E001/HVC705168', 'E001/HVC707236', 'E001/HVC711253', 'E001/HVC717615', 'E001/HVC718274', 'E001/HVC719228', 'E001/HVC726373', 'E001/HVC730049', 'E001/HVC730081', 'E001/HVC733376', 'E001/HVC742499', 'E001/HVC749106', 'E001/HVC766498', 'E001/HVC776608', 'E001/HVC786536', 'E001/HVC788532', 'E001/HVC789034', 'E001/HVC792251', 'E001/HVC803719', 'E001/HVC811971', 'E001/HVC822060', 'E001/HVC823377', 'E001/HVC823657', 'E001/HVC829585', 'E001/HVC834162', 'E001/HVC836424', 'E001/HVC846668', 'E001/HVC878334', 'E001/HVC881736', 'E001/HVC883718', 'E001/HVC887091', 'E001/HVC888814', 'E001/HVC891523', 
'E002/HVC013188', 'E002/HVC021844', 'E002/HVC027840', 'E002/HVC027879', 'E002/HVC028378', 'E002/HVC043460', 'E002/HVC050106', 'E002/HVC053039', 'E002/HVC055040', 'E002/HVC056946', 'E002/HVC057164', 'E002/HVC057244', 'E002/HVC067449', 'E002/HVC070937', 'E002/HVC075150', 'E002/HVC080067', 'E002/HVC081166', 'E002/HVC084897', 'E002/HVC087496', 'E002/HVC090118', 'E002/HVC090439', 'E002/HVC092481', 'E002/HVC110256', 'E002/HVC114107', 'E002/HVC115550', 'E002/HVC117893', 'E002/HVC118292', 'E002/HVC121841', 'E002/HVC127982', 'E002/HVC131494', 'E002/HVC132114', 'E002/HVC148914', 'E002/HVC164420', 'E002/HVC168889', 'E002/HVC188853', 'E002/HVC192860', 'E002/HVC198167', 'E002/HVC200173', 'E002/HVC217227', 'E002/HVC220724', 'E002/HVC232523', 'E002/HVC237868', 'E002/HVC239317', 'E002/HVC240101', 'E002/HVC240983', 'E002/HVC244418', 'E002/HVC256062', 'E002/HVC274889', 'E002/HVC280004', 'E002/HVC284062', 'E002/HVC284420', 'E002/HVC291067', 'E002/HVC297552', 'E002/HVC302964', 'E002/HVC305398', 'E002/HVC317191', 'E002/HVC318186', 'E002/HVC319130', 'E002/HVC329241', 'E002/HVC330358', 'E002/HVC330734', 'E002/HVC331229', 'E002/HVC331444', 'E002/HVC333068', 'E002/HVC333217', 'E002/HVC337634', 'E002/HVC337743', 'E002/HVC342710', 'E002/HVC357788', 'E002/HVC358793', 'E002/HVC371186', 'E002/HVC372625', 'E002/HVC378891', 'E002/HVC381000', 'E002/HVC381541', 'E002/HVC385947', 'E002/HVC386562', 'E002/HVC397476', 'E002/HVC406511', 'E002/HVC419946', 'E002/HVC425921', 'E002/HVC426636', 'E002/HVC448233', 'E002/HVC450975', 'E002/HVC454146', 'E002/HVC461207', 'E002/HVC468846', 'E002/HVC505164', 'E002/HVC508446', 'E002/HVC518505', 'E002/HVC524155', 'E002/HVC526932', 'E002/HVC528086', 'E002/HVC533619', 'E002/HVC543259', 'E002/HVC557970', 'E002/HVC562914', 'E002/HVC563762', 'E002/HVC568424', 'E002/HVC568645', 'E002/HVC569270', 'E002/HVC572422', 'E002/HVC572908', 'E002/HVC575229', 'E002/HVC575738', 'E002/HVC577808', 'E002/HVC588610', 'E002/HVC602589', 'E002/HVC613614', 'E002/HVC617138', 'E002/HVC620742', 
'E002/HVC621080', 'E002/HVC622215', 'E002/HVC633752', 'E002/HVC639090', 'E002/HVC648237', 'E002/HVC650616', 'E002/HVC652771', 'E002/HVC669143', 'E002/HVC678577', 'E002/HVC685238', 'E002/HVC686035', 'E002/HVC686083', 'E002/HVC690403', 'E002/HVC693882', 'E002/HVC698972', 'E002/HVC699015', 'E002/HVC724149', 'E002/HVC725414', 'E002/HVC732985', 'E002/HVC738245', 'E002/HVC742544', 'E002/HVC765357', 'E002/HVC765442', 'E002/HVC768066', 'E002/HVC781040', 'E002/HVC781550', 'E002/HVC787358', 'E002/HVC788059', 'E002/HVC792707', 'E002/HVC800578', 'E002/HVC804688', 'E002/HVC805254', 'E002/HVC805769', 'E002/HVC820877', 'E002/HVC826976', 'E002/HVC827162', 'E002/HVC831492', 'E002/HVC843356', 'E002/HVC843604', 'E002/HVC848042', 'E002/HVC852977', 'E002/HVC860018', 'E002/HVC860974', 'E002/HVC862982', 'E002/HVC868918', 'E002/HVC875036', 'E002/HVC876313', 'E002/HVC885657', 'E002/HVC893844', 



'E006/HVC003127', 'E006/HVC014770', 'E006/HVC026344', 'E006/HVC028739', 'E006/HVC044384', 'E006/HVC046682', 'E006/HVC047785', 'E006/HVC048803', 'E006/HVC049355', 'E006/HVC053019', 'E006/HVC055364', 'E006/HVC057950', 'E006/HVC059513', 'E006/HVC060224', 'E006/HVC062957', 'E006/HVC065131', 'E006/HVC075202', 'E006/HVC079072', 'E006/HVC089951', 'E006/HVC094890', 'E006/HVC096167', 'E006/HVC102455', 'E006/HVC105744', 'E006/HVC108273', 'E006/HVC108839', 'E006/HVC114661', 'E006/HVC117742', 'E006/HVC135090', 'E006/HVC136327', 'E006/HVC136448', 'E006/HVC136655', 'E006/HVC139877', 'E006/HVC142274', 'E006/HVC143596', 'E006/HVC149161', 'E006/HVC154429', 'E006/HVC154997', 'E006/HVC160292', 'E006/HVC174418', 'E006/HVC176309', 'E006/HVC180590', 'E006/HVC183888', 'E006/HVC184345', 'E006/HVC186186', 'E006/HVC187429', 'E006/HVC210336', 'E006/HVC213284', 'E006/HVC219748', 'E006/HVC220136', 'E006/HVC229233', 'E006/HVC229611', 'E006/HVC237575', 'E006/HVC238240', 'E006/HVC258411', 'E006/HVC278220', 'E006/HVC279776', 'E006/HVC279987', 'E006/HVC280065', 'E006/HVC283720', 'E006/HVC284110', 'E006/HVC286248', 'E006/HVC289166', 'E006/HVC307350', 'E006/HVC325101', 'E006/HVC325801', 'E006/HVC327718', 'E006/HVC329883', 'E006/HVC330657', 'E006/HVC333739', 'E006/HVC334553', 'E006/HVC342037', 'E006/HVC342555', 'E006/HVC342574', 'E006/HVC344602', 'E006/HVC353470', 'E006/HVC355527', 'E006/HVC361500', 'E006/HVC363634', 'E006/HVC364802', 'E006/HVC384707', 'E006/HVC391819', 'E006/HVC397337', 'E006/HVC401377', 'E006/HVC405887', 'E006/HVC410673', 'E006/HVC411997', 'E006/HVC418818', 'E006/HVC422800', 'E006/HVC426052', 'E006/HVC428254', 'E006/HVC454277', 'E006/HVC456896', 'E006/HVC456955', 'E006/HVC457943', 'E006/HVC458485', 'E006/HVC459157', 'E006/HVC460515', 'E006/HVC462990', 'E006/HVC468328', 'E006/HVC474932', 'E006/HVC476364', 'E006/HVC488561', 'E006/HVC493576', 'E006/HVC497605', 'E006/HVC502408', 'E006/HVC505966', 'E006/HVC514846', 'E006/HVC515444', 'E006/HVC518852', 'E006/HVC522828', 'E006/HVC523689', 
'E006/HVC528640', 'E006/HVC528925', 'E006/HVC531593', 'E006/HVC533182', 'E006/HVC535101', 'E006/HVC553695', 'E006/HVC554261', 'E006/HVC556677', 'E006/HVC560384', 'E006/HVC568379', 'E006/HVC569472', 'E006/HVC582072', 'E006/HVC600875', 'E006/HVC602088', 'E006/HVC611698', 'E006/HVC612697', 'E006/HVC614602', 'E006/HVC618594', 'E006/HVC620168', 'E006/HVC620202', 'E006/HVC621902', 'E006/HVC622327', 'E006/HVC624578', 'E006/HVC635304', 'E006/HVC639480', 'E006/HVC646979', 'E006/HVC648989', 'E006/HVC652121', 'E006/HVC656513', 'E006/HVC662543', 'E006/HVC662906', 'E006/HVC665754', 'E006/HVC681179', 'E006/HVC687355', 'E006/HVC688682', 'E006/HVC690097', 'E006/HVC690730', 'E006/HVC698488', 'E006/HVC701531', 'E006/HVC703233', 'E006/HVC713044', 'E006/HVC735580', 'E006/HVC747595', 'E006/HVC749712', 'E006/HVC767782', 'E006/HVC772466', 'E006/HVC787057', 'E006/HVC789721', 'E006/HVC807168', 'E006/HVC807393', 'E006/HVC809384', 'E006/HVC810540', 'E006/HVC828816', 'E006/HVC829086', 'E006/HVC834685', 'E006/HVC849484', 'E006/HVC859116', 'E006/HVC861803', 'E006/HVC869061', 'E006/HVC878724', 'E006/HVC893705', 


]

def preprocess(names, data_dir='/u/drspeech/data/Aladdin/corpora/trecvid2011/events/'):
    """Return ``names`` ordered by the size of their ``.htk`` file, largest first.

    Names whose files have the same size keep their relative input order.

    Args:
        names: iterable of file names relative to ``data_dir``, without the
            ``.htk`` extension (for example ``'E001/HVC006045'``).
        data_dir: directory that holds the ``.htk`` files.  Defaults to the
            corpus location this script was written for.

    Returns:
        A new list with the same names, sorted by descending file size.

    Raises:
        OSError: if a file is missing or inaccessible (raised by
            ``os.path.getsize``).
    """
    def htk_size(name):
        return os.path.getsize(os.path.join(data_dir, name + '.htk'))

    return sorted(names, key=htk_size, reverse=True)

# Script entry point: run a ClusterMRJob over the meeting names and report
# the wall-clock time of the job.
if __name__ == '__main__':
    # Arguments for the job: the 'hadoop' runner with the pickle protocol
    # for input, output and the intermediate steps.
    mr_args = ['-v', '--strict-protocols', '-r', 'hadoop','--input-protocol', 'pickle','--output-protocol','pickle','--protocol','pickle']
    # Take at most the first 1000 names, then order them with preprocess()
    # (largest .htk file first).
    meeting_names = all_meeting_names[:1000]
    meeting_names = preprocess(meeting_names)
    print "Processing {0} input files".format(len(meeting_names))
    # One pickled (name, None) record per line of the job's stdin.
    task_args = [protocol.write(name, None)+"\n" for name in meeting_names]
    
    # The timer starts after the records are built, so the reported time
    # covers job construction and execution only.
    start = time.time()
    job = ClusterMRJob(args=mr_args).sandbox(stdin=task_args)
    runner = job.make_runner()        
    runner.run()

    print "Tasks done. Total execution time:", time.time()-start, "seconds."
示例#12
0
import time
import os.path

from cluster_map import ClusterMRJob
from mrjob.protocol import PickleProtocol as protocol

 
# List of video file names; the full list definition is omitted here for brevity.
all_meeting_names

def preprocess(names, data_dir='/u/drspeech/data/Aladdin/corpora/trecvid2011/events/'):
    """Return ``names`` ordered by the size of their ``.htk`` file, largest first.

    Names whose files have the same size keep their relative input order.

    Args:
        names: iterable of file names relative to ``data_dir``, without the
            ``.htk`` extension (for example ``'E001/HVC006045'``).
        data_dir: directory that holds the ``.htk`` files.  Defaults to the
            corpus location this script was written for.

    Returns:
        A new list with the same names, sorted by descending file size.

    Raises:
        OSError: if a file is missing or inaccessible (raised by
            ``os.path.getsize``).
    """
    def htk_size(name):
        return os.path.getsize(os.path.join(data_dir, name + '.htk'))

    return sorted(names, key=htk_size, reverse=True)

# Script entry point: run a ClusterMRJob over the meeting names and report
# the wall-clock time of the job.
if __name__ == '__main__':
    # Arguments for the job: the 'hadoop' runner with the pickle protocol
    # for input, output and the intermediate steps.
    mr_args = ['-v', '--strict-protocols', '-r', 'hadoop','--input-protocol', 'pickle','--output-protocol','pickle','--protocol','pickle']
    # Take at most the first 250 names, then order them with preprocess()
    # (largest .htk file first).
    meeting_names = all_meeting_names[:250]
    meeting_names = preprocess(meeting_names)
    print "Processing {0} input files".format(len(meeting_names))
    # One pickled (name, None) record per line of the job's stdin.
    task_args = [protocol.write(name, None)+"\n" for name in meeting_names]
    
    # The timer starts after the records are built, so the reported time
    # covers job construction and execution only.
    start = time.time()
    job = ClusterMRJob(args=mr_args).sandbox(stdin=task_args)
    runner = job.make_runner()        
    runner.run()

    print "Tasks done. Total execution time:", time.time()-start, "seconds."