def getBrokenlinks(url):
    broken_links = 0
    threads = []
    try:
        soup = url.getsoup()
    except WebcredError as e:
        raise WebcredError(e.message)
    except Exception:
        raise WebcredError('Url is broken')
    for link in soup.find_all('a', href=True):
        uri = link.get('href')
        # TODO should it include inner links as well?
        if not uri.startswith('http://') and not uri.startswith('https://'):
            # resolve relative links against the page URL
            uri = url.geturl() + uri
        if validators.url(uri):
            t = MyThread(Method='funcBrokenllinks', Name='brokenlinks', Url=uri)
            t.start()
            threads.append(t)
    for t in threads:
        t.join()
        # getResult() is truthy when the link turned out to be broken
        if t.getResult():
            broken_links += 1
    return broken_links
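# MyThread is not defined in these snippets. For the keyword-style call above
# (Method=..., Name=..., Url=...), a result-capturing Thread subclass along
# the following lines would fit. This is a minimal sketch inferred from the
# call sites, not the project's actual class; the globals() dispatch and the
# freemem() behaviour are assumptions.
import threading


class MyThread(threading.Thread):
    def __init__(self, Method=None, Name=None, Url=None):
        super(MyThread, self).__init__(name=Name)
        self.method = Method
        self.url = Url
        self.result = None

    def run(self):
        # look up the module-level function named by the Method string,
        # e.g. funcBrokenllinks(url), and keep its return value
        self.result = globals()[self.method](self.url)

    def getResult(self):
        # only meaningful after join()
        return self.result

    def freemem(self):
        # assumed to release the fetched payload; a no-op in this sketch
        pass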
def collect(self):
    threads = []
    for remark, url in self.urls:
        t = MyThread(self.check, args=(remark, url))
        threads.append(t)
        t.start()
    for t in threads:
        t.join()
        yield t.get_result()
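# For the positional call style used here and in the later snippets
# (MyThread(func, args=...) plus get_result()), the usual pattern is to
# subclass threading.Thread and capture the target's return value in run().
# A minimal sketch of that pattern, assuming this is roughly what MyThread
# does; it is not the project's actual implementation:
import threading


class MyThread(threading.Thread):
    def __init__(self, target, args=()):
        super(MyThread, self).__init__()
        self.target = target
        self.args = args      # kept public so callers can read t.args later
        self.result = None

    def run(self):
        # store the return value instead of discarding it as Thread.run does
        self.result = self.target(*self.args)

    def get_result(self):
        # only valid after join(); otherwise the thread may still be running
        return self.result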
def __create_workers(self):
    """
    Create new threads for the parser
    :return: Nothing
    """
    for _ in range(Properties.PARSER_MAX_THREADS):
        t = MyThread(
            HTMLServiceFactory(self.html_service_type).get_instance(), self)
        t.start()
def create_workers(self):
    """
    Create new threads for the crawler
    :return: Nothing
    """
    for _ in range(Properties.CRAWLER_MAX_THREADS):
        t = MyThread(
            UrlDAOFactory(self.db_type).get_instance(), self, self.bucket)
        self.__threads.append(t)
        t.start()
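# create_workers() hands each thread a DAO instance, the crawler itself, and
# a shared bucket. A typical run loop for such a worker pulls items from the
# shared bucket until it sees a shutdown sentinel; the sketch below is a
# guess at that shape, assuming the bucket behaves like queue.Queue
# (CrawlerWorker and dao.save are illustrative names, not the project's API):
import threading


class CrawlerWorker(threading.Thread):
    def __init__(self, dao, crawler, bucket):
        super(CrawlerWorker, self).__init__(daemon=True)
        self.dao = dao
        self.crawler = crawler
        self.bucket = bucket          # shared work queue of URLs

    def run(self):
        while True:
            url = self.bucket.get()   # blocks until a work item is available
            try:
                if url is None:       # sentinel value: shut this worker down
                    return
                self.dao.save(url)    # persist the URL via the injected DAO
            finally:
                self.bucket.task_done()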
def getImgratio(url):
    total_img_size = 0
    threads = []
    try:
        text_size = url.getsize()
    except WebcredError as e:
        return e.message
    soup = url.getsoup()
    # sum up the size of every image on the page
    for link in soup.find_all('img', src=True):
        uri = link.get('src', None)
        if not uri.startswith('http://') and not uri.startswith('https://'):
            # resolve relative image paths against the page URL
            uri = url.geturl() + uri
        if validators.url(uri):
            try:
                uri = Urlattributes(uri)
                t = MyThread(Method='funcImgratio', Name='Imgratio', Url=uri)
                t.start()
                threads.append(t)
            except WebcredError:
                # if a particular image is not accessible, skip it
                pass
    for t in threads:
        t.join()
        t.freemem()
        size = t.getResult()
        if isinstance(size, int):
            total_img_size += size
    try:
        total_size = total_img_size + text_size
        ratio = float(text_size) / total_size
    except ZeroDivisionError:
        raise WebcredError('Error in fetching images')
    return ratio
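# The ratio above is text_size / (text_size + total_img_size). A standalone
# restatement of the formula with a worked example (_img_ratio and the sizes
# are hypothetical, purely for a sanity check):
def _img_ratio(text_size, total_img_size):
    return float(text_size) / (text_size + total_img_size)

assert _img_ratio(10240, 30720) == 0.25   # 10 KB text, 30 KB images -> 0.25
# i.e. a quarter of the fetched bytes are text, so image-heavy pages score low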
def main(cross_num=5, exp_path='', sav_dir='', conf='', cfg_sec='',
         bool_vad=False):
    if not os.path.exists(exp_path):
        os.mkdir(exp_path)
    for i in range(1, 6):
        session_name = 'Session' + str(i)
        session_path = exp_path + '/' + session_name
        if not os.path.exists(session_path):
            os.mkdir(session_path)
    if not os.path.exists(sav_dir):
        os.mkdir(sav_dir)

    # extract the features of the five sessions in parallel
    thread_list = []
    lab_list = []
    for i in range(5):
        t = MyThread(
            get_emo_data,
            args=('Session' + str(i + 1), True, bool_vad, 'spectrogram', 3,
                  exp_path + '/Session' + str(i + 1) + '/', conf, cfg_sec))
        thread_list.append(t)
    for t in thread_list:
        t.start()
    for t in thread_list:
        t.join()
        lab_list.append(t.get_result())

    lab_dict = dict()
    for item in lab_list:
        lab_dict.update(item)

    # scan all spectrograms for the global minimum value and collect the sets
    # of time/frequency dimensions
    min_val = np.inf
    freq_bag = set()
    time_bag = set()
    for sess_spk in lab_dict.keys():
        # sess_spk identifies the F/M speaker of a specific session;
        # wav_file_info_dict maps wav_file_name -> [[mat1, lab1], [mat2, lab2], ...]
        wav_file_info_dict = lab_dict.get(sess_spk)
        print(sess_spk, len(list(wav_file_info_dict.keys())))
        for wav_file in wav_file_info_dict:
            info_list = wav_file_info_dict.get(wav_file)
            for mat, label, valence, arouse, domain in info_list:
                min_val = min(min_val, mat.min())
                time_bag.add(mat.shape[1])
                freq_bag.add(mat.shape[0])
    print('Freq bag:\n\t', freq_bag)
    print(max(time_bag), min(time_bag))
    bias = min_val - 1
    print(bias)

    sess_spk = set(lab_dict.keys())
    if cross_num == 5:
        # construct the dataset for five-fold cross validation: leave one
        # session out, its female speaker for dev and its male speaker for test
        for i in range(1, 6):
            cross_val = 'leave_' + str(i)
            file_path = sav_dir + '/' + cross_val
            dev_key = {'Session' + str(i) + '_F'}
            test_key = {'Session' + str(i) + '_M'}
            train_key = sess_spk - test_key - dev_key
            if not os.path.exists(file_path):
                os.mkdir(file_path)
            print(test_key)
            store(file_path=file_path, train_key=train_key, test_key=test_key,
                  data_dict=lab_dict, bias=bias, bool_vad=bool_vad)
            print()
    else:
        # construct the dataset for ten-fold cross validation: leave one
        # session/speaker key out as the test set
        sess_spk_list = list(sess_spk)
        for i in range(1, 11):
            idx = sess_spk_list[i - 1]
            cross_val = 'leave_' + str(i)
            file_path = sav_dir + '/10cross_set/' + cross_val
            test_key = {idx}
            train_key = sess_spk - test_key
            if not os.path.exists(file_path):
                os.makedirs(file_path)
            store(file_path=file_path, train_key=train_key, test_key=test_key,
                  data_dict=lab_dict)
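# Worked example of the five-fold split above for fold i=1, using the ten
# session/speaker keys the loop expects to find in lab_dict:
sess_spk = {'Session%d_%s' % (s, g) for s in range(1, 6) for g in 'FM'}
dev_key = {'Session1_F'}
test_key = {'Session1_M'}
train_key = sess_spk - test_key - dev_key
assert len(train_key) == 8   # eight speakers train, one dev, one test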
if ret_code != 0:
    raise Exception("section[{}] func[{}] error[{}]".format(
        section, 'deal_csv', ret_msg))

# dispatch one worker thread per csv section
threads = []
threads_num = 0
for section in section_list_multiple:
    if section_name is not None:
        # when a specific section is requested, skip all others
        if section != section_name:
            continue
    file_dict = config_dic.get(section, None)
    if file_dict is None:
        raise Exception("section[{}] not found".format(section))
    t = MyThread(deal_csv, args=(section, file_dict, glob_config, relation_ds))
    threads.append(t)
    t.start()
    threads_num += 1

for t in threads:
    # join is mandatory: otherwise the main thread outruns the workers
    # and their results are never collected
    t.join()
    ret_code, ret_msg, relation_ds, out_ds_list[t.args[0]] = t.get_result()
    if ret_code != 0:
        raise Exception("thread[{}] func[{}] error[{}]".format(
            t, 'deal_csv', ret_msg))

i = 0
out_ds = pd.DataFrame({'openday': [], 'detail_type': [], 'detail_cnt': [],
                       'detail_amt': []})
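# Quick demonstration of the join-then-collect pattern used above, with the
# MyThread sketched after the collect() snippet (square and the inputs are
# hypothetical). Reading t.args[0] works because that sketch keeps args public:
def square(x):
    return x * x

demo = [MyThread(square, args=(n,)) for n in range(4)]
for t in demo:
    t.start()
for t in demo:
    t.join()                          # without join() the result may be unset
    print(t.args[0], t.get_result())  # -> 0 0, 1 1, 2 4, 3 9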