Пример #1
0
def main():
    """Ask the user which operation to run and execute it.

    One line is read from standard input and dispatched as follows:

    * ``'1'`` -- build a ``feature_manager``, call ``calculate_feature``
      and write both features out.
    * ``'2'`` -- build a ``multi_worker``, call ``multi_calculate_feature``,
      write both features out, then call ``clean_tmp_file()``.
    * ``'3'`` -- after an explicit ``y`` confirmation, call
      ``__delete_all__()`` on a ``feature_manager`` and ``clean_tmp_file()``.

    Any other selection is reported to the user and nothing is done.

    ``s_para`` and ``f_para`` are not defined in this function; they are
    read from the enclosing (module) scope.
    """
    # select operations (raw_input: this file targets Python 2)
    tmp = raw_input(
        'Select 1, 2 or 3 to continue: \n'
        '[1] Generate feature 1 and 2 using single thread\n'
        '[2] Generate feature 1 and 2 using multiple thread (for large data size)\n'
        '[3] Clear all data (DANGEROUS !!!!)\n'
        'P.S. I assume the new data will be appended into the end of tweets.txt. It will not take into account changes in previous tweets in tweets.txt\n'
    )

    if tmp == '1':
        # single-threaded feature generation
        f_manager = feature_manager(s_para)
        f_manager.calculate_feature(f_para)
        # output features
        output_feature1(f_manager.word_storage)
        output_feature2(f_manager.median_arr)
    elif tmp == '2':
        # multi-threaded feature generation
        m_worker = multi_worker(s_para)
        m_worker.multi_calculate_feature(f_para)
        # output features
        output_feature1(m_worker.total_word_storage)
        output_feature2(m_worker.total_median_arr)
        # clean temporary file
        clean_tmp_file()
    elif tmp == '3':
        # destructive path: require an explicit confirmation first
        confirm = raw_input('[WARNING] It will deleted all data. Are you sure that you want to continue(y/n)??')
        if confirm.lower() == 'y':
            f_manager = feature_manager(s_para)
            f_manager.__delete_all__()
            clean_tmp_file()
            # single parenthesised argument: prints the same text on Python 2
            print('data are deleted')
    else:
        # Previously an unknown selection fell through without any feedback.
        print('Unrecognized selection %r; nothing was done' % tmp)
 def __init__(self, s_para):
     """Initialise the worker around one main ``feature_manager``.

     s_para -- settings mapping; it is passed to ``feature_manager`` and
               its ``'storage_filename'`` entry is read here.

     The ``total_*`` attributes are bound to the main manager's own
     objects by plain assignment; no copies are made.
     """
     manager = feature_manager(s_para)
     self.main_f_manager = manager

     # Queried before the storage attributes below are read, as in the
     # original statement order.
     self.last_pos = manager.get_last_pos()
     self.total_storage_name = s_para['storage_filename']

     # Starts empty; other methods of the class add entries.
     self.par_f_manager = {}

     # Expose the main manager's storage under the total_* names.
     self.total_word_storage = manager.word_storage
     self.total_median_arr = manager.median_arr
     self.total_indiv_word_arr = manager.indiv_word_arr
     self.total_unsorted_indiv_word_arr = manager.unsorted_indiv_word_arr
 def process_data_par(self):
     """Create one ``feature_manager`` per index and run them in a thread pool.

     For every ``i`` in ``range(self.num_of_file)`` a ``feature_manager``
     is created and stored in ``self.par_f_manager`` under the index
     zero-padded to ``self.suffix_length`` characters.  Its settings are:

     * ``'storage_filename'`` -- ``data_storage_<padded index>.p`` joined
       onto ``storage_dir_p``;
     * ``'worker_no'`` -- ``'worker <i>'``.

     ``self.let_worker_run`` is then mapped over the same indices using a
     ``ThreadPool`` of ``num_of_worker`` threads.  The call returns once
     every index has been processed; the values returned by
     ``let_worker_run`` are discarded.

     ``storage_dir_p`` and ``num_of_worker`` are not defined in this
     method; they are read from the enclosing (module) scope.
     """
     logging.info ('start to calculate feature in parallel')

     file_indices = range(0, self.num_of_file)
     for i in file_indices:
         padded = str(i).zfill(self.suffix_length)
         # Build a fresh settings dict for every worker.  Re-using one
         # mutated dict would leave all managers sharing the last worker's
         # values if feature_manager keeps a reference to its argument.
         worker_para = {
             'storage_filename': os.path.join(
                 storage_dir_p, 'data_storage_%s.p' % padded),
             'worker_no': 'worker %d' % i,
         }
         self.par_f_manager[padded] = feature_manager(worker_para)

     # process data in parallel
     pool = ThreadPool(num_of_worker)
     try:
         # map() blocks until all indices are done and re-raises an
         # exception raised inside a worker.
         pool.map(self.let_worker_run, file_indices)
     finally:
         # Always release the pool's threads, even when a worker failed.
         pool.close()
         pool.join()