def crawler_multiprocessing(worker_id, task_queue_crawler, path_dir,
                            task_queue_record_article, headers,
                            each_region, each_target):
    while True:
        task_list_get = task_queue_crawler.get()
        print('id =', worker_id, ',start_task-----', task_list_get[0], '/',
              task_list_get[1], '/', task_queue_crawler.qsize())
        shortcode = task_list_get[2]
        global each_article_url
        each_article_url = 'https://www.instagram.com/p/' + shortcode + '/'
        html_article = get_html(each_article_url, headers)
        post_time = get_data(html_article, path_dir, each_region, each_target)
        if post_time is None:
            # fetching or parsing failed: notify via LINE and stop this worker
            line_notify.lineNotifyMessage(
                msg='id =' + str(worker_id) + '\n' + str(each_article_url) +
                    '\npost_time = ' + str(post_time))
            break
        else:
            task_queue_record_article.put(shortcode)  # record the crawled shortcode
            now_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            time_sleep = random.randint(1, 3)
            print('id =', worker_id, ',done_task------', task_list_get[0], '/',
                  task_list_get[1], '/', task_queue_crawler.qsize(),
                  ',sleep', time_sleep, 'seconds', now_time, each_article_url,
                  'posted at:', post_time)
            time.sleep(time_sleep)
        task_queue_crawler.task_done()
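# crawler_multiprocessing relies on a get_html(url, headers) helper defined
# elsewhere in this project. The sketch below only illustrates a plausible
# shape for such a helper using requests; the name get_html_sketch, the
# timeout value, and the error handling are assumptions, not the project's
# actual implementation.
import requests


def get_html_sketch(url, headers, timeout=10):
    """Fetch a page's raw HTML; return None when the request fails."""
    try:
        resp = requests.get(url, headers=headers, timeout=timeout)
        resp.raise_for_status()
        return resp.text
    except requests.RequestException as e:
        print('get_html failed:', e)
        return None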
def put_task_queue_crawler(task_queue_crawler, record_list, record_article_set):
    global search_tag
    for each_record in record_list:
        if each_record not in record_article_set:
            task = [
                record_list.index(each_record) + 1,  # 1-based position in the list
                len(record_list),                    # total number of records
                each_record                          # the post shortcode
            ]
            task_queue_crawler.put(task)  # put into the task queue
            record_article_set.add(each_record)

            # LINE notification of collection progress
            data_count = len(record_article_set)
            if data_count % 20000 == 0:
                line_notify.lineNotifyMessage(
                    msg=str(search_tag) + ' collected: ' + str(data_count))
    return record_article_set
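# put_task_queue_crawler and crawler_multiprocessing are built around a shared
# task queue whose items are marked done with task_done(). A minimal sketch of
# how the two could be wired together with multiprocessing is shown below; the
# function name run_crawler_pool, the worker count, and the use of
# JoinableQueue are assumptions about the surrounding (unshown) driver code.
import multiprocessing


def run_crawler_pool(record_list, path_dir, headers, each_region, each_target,
                     num_workers=4):
    # One queue feeds shortcode tasks to the workers, the other collects the
    # shortcodes that were crawled successfully.
    task_queue_crawler = multiprocessing.JoinableQueue()
    task_queue_record_article = multiprocessing.Queue()

    record_article_set = set()
    put_task_queue_crawler(task_queue_crawler, record_list, record_article_set)

    for worker_id in range(num_workers):
        multiprocessing.Process(
            target=crawler_multiprocessing,
            args=(worker_id, task_queue_crawler, path_dir,
                  task_queue_record_article, headers, each_region, each_target),
            daemon=True).start()

    # Block until every queued task has been marked done by a worker.
    task_queue_crawler.join()
    return task_queue_record_article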
        # request.urlretrieve(img_url, path_dir_each + '/' + title + '.jpg')
        return posting_time
    except Exception as e:
        print(e)


# def collect_Exception(e, url=''):
#     path = './{0}_Exception.txt'.format(
#         os.path.basename(__file__).replace('.py', ''))
#     time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
#     save_Exception_dict = {"time": time, "Exception": str(e), "url": url}
#     js_data_Exception = json.dumps(save_Exception_dict)
#     # save as a text file
#     with open(path, 'a', encoding='utf8') as f:
#         f.write(js_data_Exception + '\n')


if __name__ == '__main__':
    # regions and targets to crawl; edit these lists manually as needed
    tag_list_region = ['台北', '新北', '基隆', '桃園', '新竹', '宜蘭']
    # tag_list_region = ['桃園', '新竹', '宜蘭']
    tag_list_target = ['景點', '美食']  # attractions, food

    for each_target in tag_list_target:
        for each_region in tag_list_region:
            print('Start crawling:', each_region + each_target)
            line_notify.lineNotifyMessage(
                msg='Start crawling: {0}{1}'.format(each_region, each_target))
            main(each_region, each_target)

    print('Complete!!!!!!!!!!')
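# Both scripts send progress messages through line_notify.lineNotifyMessage,
# a module that is not shown here. The sketch below assumes it is a thin
# wrapper around the LINE Notify HTTP API; the token handling and return
# value are assumptions, not the project's actual line_notify module.

# line_notify.py (sketch)
import requests

LINE_NOTIFY_TOKEN = 'YOUR_LINE_NOTIFY_TOKEN'  # assumed to be filled in locally


def lineNotifyMessage(msg, token=LINE_NOTIFY_TOKEN):
    """Send a plain-text notification through the LINE Notify API."""
    headers = {'Authorization': 'Bearer ' + token}
    payload = {'message': msg}
    resp = requests.post('https://notify-api.line.me/api/notify',
                         headers=headers, data=payload)
    return resp.status_code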
                str_tmp += each_word + " "
        except Exception as e:
            print(e)
            continue

        # save this sentence's segmentation result (append mode, one line per sentence)
        segSaveFile = save_path
        with open(segSaveFile, 'ab') as saveFile:
            saveFile.write(str_tmp.encode('utf-8') + '\n'.encode('utf-8'))

    cost_time = time.time() - time_start
    print('ckiptagger took', cost_time / 3600, 'hours')
    print('save_path=', save_path)


def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    ckiptagger(load_txt_path='../combin/raw_place.txt',
               save_path='./segDone_place.txt')
    ckiptagger(load_txt_path='../combin/raw_food.txt',
               save_path='./segDone_food.txt')


if __name__ == '__main__':
    main()
    print('Complete!!!!!!!!!!')
    line_notify.lineNotifyMessage(msg='combin_ig_txt Complete!!!!!!!!!!')
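# The tail of ckiptagger() shown above only writes out the already-segmented
# words. The segmentation itself presumably comes from the CKIP Tagger WS
# model; the sketch below shows how that upstream step is typically done with
# the ckiptagger package. The function name segment_file, the model directory
# './data', and the line-by-line loading are assumptions.
from ckiptagger import WS


def segment_file(load_txt_path, ws_model_dir='./data'):
    """Segment each non-empty line of a text file with the CKIP WS model."""
    ws = WS(ws_model_dir)  # load the pretrained word-segmentation model
    with open(load_txt_path, 'r', encoding='utf-8') as f:
        sentence_list = [line.strip() for line in f if line.strip()]
    # ws() takes a list of sentences and returns a list of word lists
    word_sentence_list = ws(sentence_list)
    return word_sentence_list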