def fit_polynomial3(pixel_data):
    '''Return an "image" which is a polynomial fit to the pixel data

    Fit the image to the 10-term cubic surface
    Ax + By + Cx**2 + Dy**2 + Exy + Fx**2y + Gy**3 + Hx**3 + Ixy**2 + J

    pixel_data - a two-dimensional numpy array to be fitted
    '''
    x, y = np.mgrid[0:pixel_data.shape[0], 0:pixel_data.shape[1]]
    x2 = x * x
    y2 = y * y
    xy = x * y
    x2y = x * x * y
    y3 = y * y * y
    x3 = x * x * x
    y2x = y * y * x
    o = np.ones(pixel_data.shape)
    # design matrix: one column per polynomial term, one row per pixel
    a = np.stack([x.flat, y.flat, x2.flat, y2.flat, xy.flat,
                  x2y.flat, y3.flat, x3.flat, y2x.flat, o.flat], 1)
    mean, std = pixel_data.mean(), pixel_data.std()
    # z = (pixel_data.flat - mean) / std
    z = pixel_data.flat
    coeffs, residuals, rank, s = scipy.linalg.lstsq(a, z)
    LogHelper.logText(
        '\n{:.8f}x + {:.8f}y + {:.8f}x^2 + {:.8f}y^2 + {:.8f}xy'
        ' + {:.8f}x^2y + {:.8f}y^3 + {:.8f}x^3 + {:.8f}xy^2 + {:.8f}',
        *coeffs)
    output_pixels = np.sum(
        [coeff * index for coeff, index in
         zip(coeffs, [x, y, x2, y2, xy, x2y, y3, x3, y2x, o])], 0)
    # fit the same model to a rank-mean-smoothed copy for comparison
    smooth = filters.rank.mean(pixel_data, disk(50))
    coeffs2, residuals2, rank2, s2 = scipy.linalg.lstsq(a, smooth.flat)
    LogHelper.logText(
        '\n{:.8f}x + {:.8f}y + {:.8f}x^2 + {:.8f}y^2 + {:.8f}xy'
        ' + {:.8f}x^2y + {:.8f}y^3 + {:.8f}x^3 + {:.8f}xy^2 + {:.8f}',
        *coeffs2)
    return output_pixels, mean, std
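# The fit above solves min ||Ac - z||_2, where each column of A is one
# polynomial term evaluated on the pixel grid. A minimal standalone sketch of
# the same idea on synthetic data, assuming only numpy and scipy (not this
# module's other imports); _demo_background_fit is a hypothetical name, not
# part of the pipeline:
def _demo_background_fit():
    import numpy as np
    import scipy.linalg
    x, y = np.mgrid[0:64, 0:64]
    # synthetic image: a tilted plane plus noise
    img = 0.5 * x + 0.2 * y + 100 + np.random.normal(0, 1, (64, 64))
    # design matrix for the plane z = Dx + Ey + F
    a = np.stack([x.flat, y.flat, np.ones(img.size)], 1)
    coeffs, _, _, _ = scipy.linalg.lstsq(a, np.asarray(img.flat))
    fitted = coeffs[0] * x + coeffs[1] * y + coeffs[2]
    return img - fitted  # background-subtracted image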
def calculate_masked_stats():
    plate_no = "59798"
    parsed = get_plate_files(plate_no)
    for w in ['w2']:
        files = filter(lambda f: f.wave == w[1], parsed)
        # accum = np.zeros((2160, 2160), dtype=np.uint32)
        # files = filter(lambda x: 's1' not in x and 's7' not in x, all_files)
        nof = len(files)
        for i, frame in enumerate(files[0:5], 1):
            LogHelper.logText(frame.fullpath)
            img = imread(frame.fullpath)
            t = filters.threshold_yen(img)
            b1 = img > t
            b2 = binary_erosion(b1, square(2))
            b3 = binary_dilation(b2, square(10))
            b4 = binary_closing(b3, square(3))
            imm = np.ma.masked_where(b4, img)
            mn, mx = np.percentile(imm, (1, 99))
            LogHelper.logText(
                '%3d of %d, %4d-%4d-%4d-%5d, %.0f-%.0f'
                % (i, nof, imm.min(), mn, mx, imm.max(), imm.mean(), imm.std())
            )
            im2 = imm.filled(int(imm.mean()))
            out_name = "{0}\\{5}-{1}{2}-{3}-{4}.tif".format(
                ROOT_DIR, frame.row, frame.column, frame.site,
                LogHelper.init_ts, frame.experiment)
            imsave(out_name, im2)
def my_copy_file(source_file, target):
    i_file_count = 0
    try:
        str_dir = source_file
        if os.path.isfile(str_dir):
            str_dir = os.path.dirname(str_dir)
        arr_files = get_dir_files(str_dir)
        i_file_count = len(arr_files)
        for name in arr_files:
            copy_file(name, target)
        return str(i_file_count) + ",.,"
    except Exception as e:
        LogHelper.error("CopyError1:" + e.message)
        return str(i_file_count) + ",.," + "CopyError1:" + e.message
def copy_file(source_file, target_file):
    try:
        source = getRootDir(source_file)
        desfilename = source_file.replace('/', os.sep).replace(source, target_file, 1).replace('\\\\', os.sep)
        LogHelper.debug(source_file + " copy to " + desfilename)
        if not os.path.exists(os.path.dirname(desfilename)):
            os.makedirs(os.path.dirname(desfilename))
        if not os.path.exists(desfilename):
            shutil.copy(source_file, desfilename)  # to move instead of copy, change copy to move
        return "1,.,"
    except Exception as e:
        LogHelper.error("CopyError0:" + e.message)
        return "0,.,CopyError0:" + e.message
def find_min():
    dir_name = get_plate_files("59438")
    ilum_name = os.path.join(dir_name, 'ilum.tiff')
    all_files = glob.glob1(dir_name, '*.TIF')
    files = all_files
    m = 500
    ts = "{:%Y-%m-%d-%H-%M-%S}-".format(datetime.now())
    for filename in files:
        file_path = os.path.join(dir_name, filename)
        out_name = os.path.join(dir_name, filename)
        img = imread(file_path)
        m1 = min(m, np.percentile(img, 1))
        if m != m1:
            LogHelper.logText("{0} - {1} ".format(m1, filename))
            m = m1
    LogHelper.logText("*************** {0} - {1} ***************".format(m, ts))
def download_page(self, url, outDir, assetDir, filename, css=False, javascript=False, image=False):
    page = None
    # response = self.get_http_pool(url).request('GET', url, headers=headers)
    try:
        response = self.session.get(url)
    except requests.exceptions.ConnectionError as connectionError:
        return -1, None
    except Exception as ex:
        self.logger.exception(LogHelper.getExceptionMsg(ex, "exception: download %s" % url))
        return -1, None
    if (response.status_code != 200):
        self.logger.warn("http response: %s %s" % (response.status_code, url))
    else:
        html = response.content.decode('utf-8', 'ignore')
        page = htmlparser.fromstring(html, base_url=url)
        if (css or javascript or image):
            if (css):
                self.download_related_files(page, u"//link", u"href", outDir, assetDir, baseurl=url)
            if (javascript):
                self.download_related_files(page, u"//script", u"src", outDir, assetDir, baseurl=url)
            if (image):
                self.download_related_files(page, u"//img", u"src", outDir, assetDir, baseurl=url)
            newHtml = htmlparser.tostring(page)
            self.saveToFile(os.path.join(outDir, filename), newHtml)
        else:
            self.saveToFile(os.path.join(outDir, filename), html)
    return response.status_code, page
def parseProfile(self, name, urlProfile, outputDir, donors):
    result = False
    try:
        self.logger.info('downloading profile %s...' % name)
        # filename = '%s.html' % datetime.datetime.fromtimestamp(time.time()).strftime('%Y%m%d_%H%M%S')
        status, page = self.downloader.download_page(urlProfile, outputDir, self.assetDir, profileHtmlSrcFilename, css=True, javascript=True, image=False)
        if (page == None):
            pass
        elif (status != 200):
            self.logger.warn("http response: %s %s" % (status, urlProfile))
        else:
            detailedFound = False
            progress = ""
            raised = ""
            togo = ""
            nodeDetailes = self.find_element_by_xpath(page, u'//div[@id="funding_details"]')
            # text = htmlparser.tostring(nodeDetailes, "innerHTML")
            text = nodeDetailes.text_content()
            if (text != None and text != ""):
                text = text.lower()
                list = re.findall(ur"([\d\.]+?)%\s*(.*?)\$([\d,]+)\s*(.*?)\s*?$", text)
                if (len(list) == 1 and len(list[0]) == 4):
                    values = list[0]
                    if (values[1] != None and values[3] == "raised"):
                        progress = values[0]
                        raised = values[2]
                        detailedFound = True
                    elif (values[1] != None and values[3] == "to go"):
                        progress = values[0]
                        togo = values[2]
                        detailedFound = True
                if (not detailedFound):
                    self.logger.error("invalid reg pattern for %s in profile %s" % (text, urlProfile))
            if (not detailedFound):
                self.logger.error("profile %s details not found" % name)
            else:
                timestamp = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
                if (raised != None):
                    raised = re.sub(ur"[,$]", "", raised)
                if (togo != None):
                    togo = re.sub(ur"[,$]", "", togo)
                details = [timestamp, progress, raised, togo, donors]
                self.logger.info("profile %s details: %s" % (name, str(details)))
                # self.save_profile_details(os.path.join(outputDir, '%s.txt' % (name)), details)
                self.save_profile_details(os.path.join(outputDir, 'data.csv'), details)
                result = True
    except Exception as ex:
        self.logger.exception(LogHelper.getExceptionMsg(ex, "Exception: %s: %s " % ("parseProfile", name)))
    finally:
        pass
    return result
def saveToFile(filename, data):
    file = None
    try:
        file = open(filename, mode='wb')
        file.write(data)
    except Exception as ex:
        logger.exception(LogHelper.getExceptionMsg(ex, "unable to save file: %s" % filename))
    finally:
        if (file != None):
            file.close()
def savePatientInfos(self):
    if (self.patientInfosFile):
        self.patientInfosFile.close()
        self.patientInfosFile = None
    file = None
    try:
        # open-then-close truncates the cache file; the handle is not kept
        file = open(self.patientInfosFilename, 'w').close()
    except Exception as ex:
        self.logger.exception(LogHelper.getExceptionMsg(ex, "error: can't write cache file"))
def saveToFile(self, filename, data):
    file = None
    try:
        file = open(filename, mode='wb')
        file.write(data)
    except Exception as ex:
        self.logger.exception(LogHelper.getExceptionMsg(ex, "unable to save file: %s" % (filename)))
    finally:
        if (file):
            file.close()
def get_plate_files(plate_name, cnt=-1):
    def parse_file_name(f):
        n = os.path.basename(f)
        parts = [f, n, plate_name] + NAME_PARSER.split(n)[1:6]
        if len(parts[6]) == 1:
            parts[6] = '0' + parts[6]
        frm = Frame._make(parts)
        return frm

    tree = LogHelper.time(lambda: [PathNode._make(t) for t in os.walk(ROOT_DIR + "\\data", topdown=False)])
    plate = pydash.find(tree, lambda p: p.root.endswith(plate_name))
    LogHelper.logText(plate.root)
    if plate.dirs:
        files = glob.glob(plate.root + "\\*\\*.tif")
    else:
        files = glob.glob(plate.root + "\\*.tif")
    if cnt > 0:
        files = random.sample(files, cnt)
    parsed = map(parse_file_name, files)
    return parsed
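# A hypothetical usage sketch (not in the original source): select a single
# wavelength channel from the parsed Frame tuples, the way the stats
# functions in this file do.
def _demo_wave_channel(plate_no="59798", wave='2'):
    frames = get_plate_files(plate_no)
    return [f for f in frames if f.wave == wave]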
def dilum():
    # ilum = imread(r"T:\NewImaging\w2-2018-03-15-16-02-09-smooth.tiff")
    parsed = get_plate_files("59798")
    for w in ['w2']:
        files = filter(lambda f: f.wave == w[1], parsed)
        for i, frame in enumerate(files[0:1], 1):
            img = imread(frame.fullpath)
            r1 = rescale_intensity(img, out_range=np.uint8).astype(np.uint8)
            mn = img.min()
            mx = img.max()
            mean = np.mean(img)
            std = np.std(img)
            img[img > (mean + std)] = mn
            r2 = rescale_intensity(img, in_range=(mn, mx), out_range=np.uint8).astype(np.uint8)
            s = np.stack((r1, r1, r2), 2)
            # img2 = (np.int32(img) - ilum)
            # img3 = np.clip(img2, 0, None)
            # img4 = rescale_intensity(img3, out_range=np.uint8).astype(np.uint8)
            out_name = "{0}\\{1}{2}-{3}-{4}.png".format(ROOT_DIR, frame.row, frame.column, frame.site, LogHelper.init_ts)
            imsave(out_name, s)
            LogHelper.logText("*************** {0:s} ***************".format(out_name))
def save_profile_details(self, filename, list):
    file = None
    try:
        file = open(filename, "ab")
        writer = csv.writer(file)
        writer.writerow(list)
        file.flush()
    except Exception as ex:
        self.logger.exception(LogHelper.getExceptionMsg(ex, "unable to save file: %s" % (filename)))
    finally:
        if (file != None):
            file.close()
def saveTextToFile(self, filename, data):
    file = None
    try:
        # file = codecs.open(filename, mode='w', encoding="utf-8")
        # file = codecs.open(filename, mode='wb')
        file = open(filename, mode='w')
        # file.write(u'\ufeff')  # codecs.BOM_UTF8
        file.write(data)
    except Exception as ex:
        self.logger.exception(LogHelper.getExceptionMsg(ex, "unable to save file: %s" % filename))
    finally:
        if (file):
            file.close()
def saveTextToFile(filename, data, encoding="utf-8"):
    file = None
    try:
        file = codecs.open(filename, mode='w', encoding=encoding)
        # file = codecs.open(filename, mode='wb')
        # file = open(filename, mode='w')
        # file.write(u'\ufeff')  # codecs.BOM_UTF8
        file.write(data)
    except Exception as ex:
        logger.exception(LogHelper.getExceptionMsg(ex, "unable to save file: %s" % filename))
    finally:
        if (file != None):
            file.close()
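# The several save helpers above repeat the open/try/finally pattern; a
# sketch of the equivalent context-manager form. io.open exists in both
# Python 2 and 3 and accepts an encoding argument; the function name here is
# hypothetical, not part of the original module.
def _demo_save_text(filename, data, encoding="utf-8"):
    import io
    with io.open(filename, mode='w', encoding=encoding) as f:
        f.write(data)  # the file is closed automatically, even on error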
def calculate_ilum():
    parsed = get_plate_files("59839")
    for w in ['2']:
        files = filter(lambda f: f.wave == w, parsed)[0:30]
        nof = len(files)
        # files = filter(lambda x: 's1' not in x and 's7' not in x, files)
        img0 = imread(files[0].fullpath)
        mp = np.argmax(np.bincount(img0.flat))
        s2 = mp - img0.min()
        accum = np.zeros_like(img0, dtype=np.int32)
        accum_cnt = np.ones_like(img0, dtype=np.int32)
        thresh_w = np.uint16(filters.threshold_otsu(img0))
        prt = (img0 > thresh_w).sum() * 1.0 / len(img0.flat)
        if prt > 0.2:
            thresh_w = img0.mean() + 10 * img0.std()
        LogHelper.logText('{0}'.format(thresh_w))
        # ls = 0
        for i in range(nof):
            frame = files[i]
            img = imread(frame.fullpath)
            mp = np.argmax(np.bincount(img.flat))
            s2 = mp - img0.min()
            t = mp + s2
            # LogHelper.logText('%4d-%4d-%5d (%.0f)' % (img.min(), img.mean(), img.max(), img.std()))
            img[img >= t] = 0
            accum += img
            accum_cnt += (img != 0)
            # av = (accum / accum_cnt).astype(np.uint16)
            # avs = filters.laplace(av, 31)
            # s = avs.std()
            # ds = abs(s - ls)
            LogHelper.logText('%3d of %d w%s# %s %d' % (i + 1, nof, w, frame.shortname(), t))
            # ls = s
            # if ds < LPTH:
            #     break
        stats_dir = os.path.join(ROOT_DIR, "%s-stats" % files[0].plate)
        try:
            os.mkdir(stats_dir)
        except WindowsError as e:
            assert (e.winerror == 183)  # 'Cannot create a file when that file already exists'
        filename = os.path.join(stats_dir, "%s-w%s-%%s.tif" % (LogHelper.init_ti, w))
        LogHelper.logText(filename)
        tifsave(filename % 'accum', accum)
        tifsave(filename % 'accum_cnt', accum_cnt)
        avg_u = (accum / accum_cnt).astype(np.uint16)
        tifsave(filename % 'avg_u', avg_u)
        smooth = filters.rank.mean(avg_u, disk(50))
        tifsave(filename % 'smooth', smooth)
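# A sketch of how the smoothed illumination image written by calculate_ilum
# might be applied, mirroring the commented-out subtraction in dilum() above.
# This is an assumption, not the pipeline's confirmed correction step.
def _demo_apply_ilum(img, ilum):
    import numpy as np
    corrected = np.clip(np.int32(img) - np.int32(ilum), 0, None)  # subtract, floor at zero
    return corrected.astype(np.uint16)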
def calculate_stats():
    # d = imread(r"T:\NewImaging\w2-2018-03-16-18-47-26-smooth.tiff")
    # dmn, dmd, dmx = np.percentile(d, (0.1, 50, 99.9))
    # LogHelper.logText(
    #     '%4d-%4d-%4d-%4d-%5d, %.2f-%.2f'
    #     % (d.min(), dmn, dmd, dmx, d.max(), np.mean(d), np.std(d))
    # )
    parsed = get_plate_files("59833", 30)
    for w in ['1']:
        files = filter(lambda f: f.wave == w, parsed)
        # files = filter(lambda x: 's1' not in x and 's7' not in x, files)
        nof = len(files)
        p50s = []
        mps = []
        means = []
        s2s = []
        stds = []
        for i, frame in enumerate(files, 1):
            LogHelper.logText("%3d of %d - %s" % (i, nof, frame.shortname()))
            img = imread(frame.fullpath)
            p1, p50, p99 = np.percentile(img, (1, 50, 99))
            mp = np.argmax(np.bincount(img.flat))  # type: np.uint16
            s2 = mp - img.min()
            # r1 = measure.block_reduce(img, (30, 30), func=np.std)
            # r2 = measure.block_reduce(img, (20, 20), func=np.std)
            # l = abs(filters.laplace(img, 3))
            LogHelper.logText(
                '%4d-%4d # %4d-%4d # %4d-%d-%4d #%4d-%4d'
                % (img.min(), img.max(), p1, p99, p50, mp, img.mean(), s2, img.std())
                # "%08.8f %08.8f" % (r1.max() / 30, r2.max() / 20)  # good focus s > 4
                # % (i, nof, img.min(), p1, p50, p99, img.max(), l.sum(), l.std())
            )
            p50s.append(p50)
            mps.append(mp)
            means.append(img.mean())
            s2s.append(s2)
            stds.append(img.std())
            # dm = cv2.subtract(img, d)
            # p1, p50, p99 = np.percentile(dm, (0.1, 50, 99.9))
            # LogHelper.logText(
            #     '%3d of %d, %4d-%4d-%4d-%4d-%5d, %.0f-%.0f'
            #     % (i, nof, dm.min(), p1, p50, p99, dm.max(), np.mean(dm), np.std(dm))
            # )
        LogHelper.logText(
            '%4d-%d-%4d #%4d-%4d'
            % (np.std(p50s), np.std(mps), np.std(means), np.std(s2s), np.std(stds))
        )
def loadPatientInfos(self):
    if (os.path.isfile(self.patientInfosFilename)):
        file = None
        try:
            file = open(self.patientInfosFilename, "r")
            while True:
                line = file.readline()
                if not line:
                    break
                if line == "":
                    continue
                newEntry = ast.literal_eval(line)
                if (isinstance(newEntry, dict) and "id" in newEntry):
                    id = newEntry[self.cache_key_id]
                    if (id in self.dictPatientInfos):
                        entry = self.dictPatientInfos[id]
                    else:
                        entry = {}
                        self.dictPatientInfos[id] = entry
                    for key in newEntry:
                        entry[key] = newEntry[key]
            self.logger.info("%d patient infos loaded" % len(self.dictPatientInfos))
        except Exception as ex:
            self.logger.exception(LogHelper.getExceptionMsg(ex, "Can't read cache file"))
        finally:
            if (file != None):
                file.close()
def calculate_empty_stats():
    parsed = get_plate_files("empty")
    for w in ['w1']:
        files = filter(lambda f1: f1.wave == w[1], parsed)
        nof = len(files)
        stats = []
        for i, f in enumerate(files):
            img = imread(f.fullpath)
            h = np.unique(img)
            p0_1, p99_9 = np.percentile(img, (1, 99)).astype(np.uint16)
            mn = img.min()
            mx = img.max()
            mean = img.mean()
            std = img.std()
            st = [f.row, f.column, f.site, i, nof, mn, p0_1, p99_9, mx, mean, std, mx - p99_9, len(h)]
            stats.append(st + [f])
            LogHelper.logText('%s%s%s %3d of %d, %4d-%4d-%4d-%5d, %.2f-%.2f-%3d %d' % tuple(st))
        stats.sort(key=lambda s: s[8])
        s = stats[0]
        fn = s.pop()
        LogHelper.logText('%s%s%s %3d of %d, %4d-%4d-%4d-%5d, %.2f-%.2f-%3d %d' % tuple(s))
        LogHelper.logText(fn.filename)
def stitch():
    DIM = 1080
    sDIM = 216 + 4
    DIMP = sDIM + 5
    plate_name = '59476'
    parsed = get_plate_files(plate_name)
    superdim = (DIMP * 4 * 6, DIMP * 4 * 10, 3)
    superframe = np.ones(superdim, dtype=np.uint8) * 255
    for f in parsed:
        LogHelper.logText(f.filename)
        s = int(f.site) - 1
        c = int(f.column) - 2
        r = (ord(f.row) - ord('B'))
        y = sDIM * (s % 4) + DIMP * 4 * c
        x = sDIM * (s / 4) + DIMP * 4 * r
        img = imread(f.fullpath)
        img = rescale(img, 0.2, multichannel=True, preserve_range=True)
        imgp = np.pad(img, ((0, 4), (0, 4), (0, 0)), 'constant', constant_values=128)
        LogHelper.logText('{0} read to go {1}x{2}'.format(f.filename, x, y))
        superframe[x:x + imgp.shape[0], y:y + imgp.shape[1]] = imgp
        LogHelper.logText(f.filename + ' placed')
    imsave(ROOT_DIR + '\\59476super.png', superframe)
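# Worked example of the tile arithmetic above (sDIM = 220, DIMP = 225, so each
# well is a 4x4 block of downscaled sites): site '6' in well C3 gives s = 5,
# c = 1, r = 1, hence
#   y = 220 * (5 % 4) + 225 * 4 * 1 = 220 + 900 = 1120
#   x = 220 * (5 / 4) + 225 * 4 * 1 = 220 + 900 = 1120
# i.e. the tile lands one site right and one site down inside the second
# well-row/well-column block.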
def redis_public(o_redis, msg):
    # write to redis and publish the message
    o_redis.public(msg)


print(sys.getdefaultencoding())
print('--------------------------------------read config')
# print(config_configparser.config_write())
config = config_configparser.config_read()
print(config)
log_file = (config['DEFAULT']['server action'])
b_loop = True
log = LogHelper(log_file)
# instantiate the RedisHelper class object
str_r_ip = config['redis']['ip2']
str_r_port = config['redis']['port2']
str_r_pwd = config['redis']['pwd2']
str_r_chan = config['redis']['chan1']
str_r_db = config['redis']['db']
str_r_chan2 = config['redis']['chan2']
# print the config file
lists_header = config.sections()
str_config = ""
for secs in lists_header:
    for key in config[secs]:
        str_config = str_config + " " + key + ":" + config[secs][key]
def parsePages(self):
    try:
        pageIndex = 1
        # urlNextPage = 'https://watsi.org/fund-treatments/page/129'
        urlNextPage = 'https://watsi.org/fund-treatments/'
        while True:
            self.logger.info('downloading page %d...' % pageIndex)
            currentPage = urlNextPage
            status, page = self.downloader.download_page(
                currentPage, self.htmlDir, self.assetDir, 'page%05d.html' % (pageIndex),
                css=False, javascript=False, image=False)
            if (page == None):
                self.logger.warn("error: downloading page %d" % (pageIndex))
                break
            elif (status != 200):
                self.logger.warn("http response: %s %s" % (status, currentPage))
                break
            else:
                self.logger.info('parsing page %d...' % pageIndex)
                # find next page's url
                nodes = page.xpath(u"//a[text()='Next ›']")
                urlNextPage = urlparse.urljoin(currentPage, nodes[0].attrib['href']) if (len(nodes) > 0) else None
                items = page.xpath(u"//div[@class='profiles']/ul/li")
                if (items):
                    for item in items:
                        id = item.attrib["id"]
                        node = self.find_element_by_xpath(item, u".//div/a")
                        url = self.get_attrib(node, "href", None)
                        urlProfile = urlparse.urljoin(currentPage, url) if url else None
                        node = self.find_element_by_xpath(item, u".//*[@class='info-bar']")  # info-bar may sit in a <p> or a <div>
                        title = node.text if node != None else ""
                        node = self.find_element_by_xpath(item, u".//p[@class='profile-description']")
                        description = node.text if node != None else ""
                        node = self.find_element_by_xpath(item, u".//div[@class='cont']/a/img")
                        imgSrc = self.get_attrib(node, "src", "")
                        # progress
                        node = self.find_element_by_xpath(item, u".//div[@class='meter orange nostripes']/span")
                        progressStr = self.get_attrib(node, "style", "")
                        list = re.findall(ur"[;^]*?\s*?width:\s*([,\d]*)", progressStr)
                        progress = None
                        if (len(list) == 1):
                            progress = list[0]
                        # togo, raised, donors
                        togo = None
                        raised = None
                        donors = None
                        if (title == "The Universal Fund"):
                            continue
                        else:
                            list = re.findall(ur"\$?([,\d]*)\s*(.*?)\s*\|\s*([,\d]*)\s*(.*)", title)
                            if (len(list) == 1 and len(list[0]) == 4):
                                values = list[0]
                                if (values[1] != None and values[1].lower() == "raised"):
                                    raised = values[0]
                                    donors = values[2]
                                elif (values[1] != None and values[1].lower() == "to go"):
                                    togo = values[0]
                                    donors = values[2]
                                else:
                                    self.logger.error("invalid reg pattern for %s in page %s" % (title, currentPage))
                                    continue
                            else:
                                self.logger.error("invalid reg pattern for %s in page %s" % (title, currentPage))
                                continue
                        if (raised != None):
                            raised = re.sub(ur"[,$]", "", raised)
                        if (togo != None):
                            togo = re.sub(ur"[,$]", "", togo)
                        if (donors != None):
                            donors = re.sub(ur"[,$]", "", donors)
                        # Log.i("%s %s" % (id, progress))
                        # Log.i("%s %s" % (id, urlProfile))
                        # Log.i("%s %s" % (id, title))
                        # Log.i("\t%s" % description)
                        # Log.i("\t%s" % imgSrc)
                        outputDir = os.path.join(os.path.join(self.profileDir, id[-1:]), id)
                        if (progress == '0' and not (os.path.isdir(outputDir))):
                            os.makedirs(outputDir)
                        if not os.path.exists(outputDir):
                            continue
                        if (self.getPrevProgressById(id) != '100'):
                            self.parseProfile(id, urlProfile, outputDir, donors)
                        else:
                            self.ensureProfileDownloaded(id, urlProfile, outputDir)
                        self.saveOverallEntry(id, [id, urlProfile])
                        self.cache_profile_details(id, progress, raised, togo, donors)
                    self.logger.info("%d items found" % (len(items)))
                if (not items):
                    self.logger.info("items not found")
                    break
                if (len(items) == 0):
                    self.logger.info("items length == 0")
                    break
                if (not urlNextPage):
                    self.logger.info("NextPage not found")
                    break
                pageIndex += 1
        self.savePatientInfos()
        self.logger.info('done!')
    except Exception as ex:
        self.logger.exception(LogHelper.getExceptionMsg(ex, "parsePages"))
    finally:
        pass
def MainProcess(uperList, saveRootPath, concurrency=3):
    logger = LogHelper('Bili', cmdLevel='INFO', fileLevel="DEBUG").logger
    pp = None
    try:
        # --------------------------------------------------------------
        # fetch the number of video pages for each uploader
        pp = PreProcess(logger=logger, uperList=uperList)
        pp.ScanLoclInfo(saveRootPath)
        pp.Process()
        # --------------------------------------------------------------
        # crawl the urls of the videos to download
        for uper in pp.uperList:
            logger.info(uper.UserName + " Spider Start···")
            OneSpiderRetryTimes = 0
            # retry while the planned download count exceeds what was fetched
            # from the network, or while error urls remain
            while ((uper.NeedDownloadFilmCount > len(uper.VideoInfoDic_NetFileName)
                    or len(uper.ErrorUrl_Dic) > 0) and OneSpiderRetryTimes <= 10):
                # dd = BiliSpider()  # GithubDeveloperSpider
                BiliSpider.start(logger=logger, uper=uper, saveRootPath=saveRootPath,
                                 concurrency=concurrency, middleware=middleware)
                OneSpiderRetryTimes = OneSpiderRetryTimes + 1
                logger.info("Try Spider " + uper.UserName + " " + str(OneSpiderRetryTimes) + " times.")
                RandomSleep()
            logger.info(uper.UserName + " Spider Done.")
            if OneSpiderRetryTimes > 10:
                logger.error(uper.UserName + " Spider Retry " + str(OneSpiderRetryTimes) + " times.")
                logger.error("Error Url:")
                for eUrl in uper.ErrorUrl_Dic:
                    logger.error(eUrl)
            else:
                # warn when local existing + to-be-downloaded != network total
                if len(uper.VideoInfoDic_NetFileName) != len(uper.VideoInfoDic_loaclFileName):
                    logger.warn("VideoInfoDic_NetFileName Count: " + str(len(uper.VideoInfoDic_NetFileName))
                                + " != VideoInfoDic_loaclFileName Count: " + str(len(uper.VideoInfoDic_loaclFileName)))
            uper.ErrorUrl_Dic.clear()
        logger.info("Spider All Done.")
        # --------------------------------------------------------------
        logger.info("Start Download" + "----" * 20)
        # start downloading: first sync the local and net dictionaries
        logger.info("Start Sync Dic")
        for uper in pp.uperList:
            iNeedDl = 0
            for fileName, oneVideo in zip(uper.VideoInfoDic_loaclFileName.keys(), uper.VideoInfoDic_loaclFileName.values()):
                if fileName in uper.VideoInfoDic_NetFileName:
                    uper.VideoInfoDic_NetFileName[fileName].isDownloaded = oneVideo.isDownloaded
                    if oneVideo.isDownloaded == False:
                        iNeedDl = iNeedDl + 1
            logger.info(uper.UserName + " NetFile / LocalFile -- NeedDl: "
                        + str(len(uper.VideoInfoDic_NetFileName)) + " / "
                        + str(len(uper.VideoInfoDic_loaclFileName)) + " -- " + str(iNeedDl))
        logger.info("End Sync Dic")
        for uper in pp.uperList:
            directory = os.path.join(saveRootPath, uper.UserName)
            for fileName, oneVideo in zip(uper.VideoInfoDic_NetFileName.keys(), uper.VideoInfoDic_NetFileName.values()):
                if oneVideo.isDownloaded == True:
                    continue
                DownloadRetryTimes = 0
                oneRe = False
                while oneRe is False and DownloadRetryTimes <= 10:
                    oneRe = Downloader(logger, directory, oneVideo.time, oneVideo.title, oneVideo.url).ProcessOne()
                    DownloadRetryTimes = DownloadRetryTimes + 1
                    logger.info("Try Download " + str(DownloadRetryTimes) + " times.")
                    RandomSleep()
                if DownloadRetryTimes > 10:
                    logger.error("Retry Download " + str(DownloadRetryTimes) + " times.")
                    logger.error("Error Url: " + oneVideo.url)
                # mark as downloaded
                if oneRe:
                    oneVideo.isDownloaded = True
                    uper.ThisTimeDownloadCount = uper.ThisTimeDownloadCount + 1
    except Exception as ex:
        errInfo = "Catch Exception: " + str(ex)
        logger.error(errInfo)
    finally:
        logger.info("finally" + "----" * 20)
        for uper in pp.uperList:
            logger.info("This Time Download: " + uper.UserName + " -- " + str(uper.ThisTimeDownloadCount))
        for uper in pp.uperList:
            for fileName, oneVideo in zip(uper.VideoInfoDic_NetFileName.keys(), uper.VideoInfoDic_NetFileName.values()):
                if oneVideo.isDownloaded == False:
                    logger.error('Download Fail:' + uper.UserName)
                    logger.error(oneVideo.url)
    logger.info("All Done.")
configInfo.barkurl = cf.get("BarkConfig", "barkurl")
configInfo.barkapikey = cf.get("BarkConfig", "barkapikey")
configInfo.notifyurl = cf.get("BarkConfig", "notifyurl")
configInfo.repeatTimes = int(cf.get("DownloadConfig", "repeatTimes"))
configInfo.delay = int(cf.get("DownloadConfig", "delay"))
return configInfo


if __name__ == '__main__':
    # --------------------------------------------------------------
    # read the external config
    configInfo = ReadConfigIni()
    while configInfo.repeatTimes > 0 or configInfo.repeatTimes == -1:
        logger = LogHelper('Bili', cmdLevel='INFO', fileLevel="DEBUG").logger
        try:
            logger.info('repeatTimes = ' + str(configInfo.repeatTimes))
            # --------------------------------------------------------------
            # set up what to download: each uploader's videos
            downloadlistfile = 'DownloadList.txt'
            if os.path.exists(downloadlistfile) == True:
                filmList = ReadDownloadList(downloadlistfile)
            else:
                logger.error("DownloadList.txt not found")
                raise Exception("DownloadList.txt not found")
            uperList = ReadDownloadList(downloadlistfile)
def saveProfile(self, profileName, dir, reportUrl, detailUrl, overallEntry):
    assetdir = os.path.join(dir, "files" + os.sep)
    if (not os.path.isdir(dir)):
        os.makedirs(dir)
    if (not os.path.isdir(assetdir)):
        os.makedirs(assetdir)
    status, page = self.downloader.download_page(reportUrl, dir, assetdir, '%s_origin.htm' % (profileName), css=False, javascript=False, image=False)
    # self.downloader.clear_cache()
    if (page != None):
        reporter = None
        reportContent = ""
        # headers
        items = page.xpath(u"//*[@id='maincontent']//article/header/hgroup/*")
        for item in items:
            header = StrHelper.trim(item.text_content())
            if (header != None and header.startswith(profileName)):
                header = StrHelper.trim(header[len(profileName):])
                reportContent += header + os.linesep
                break
        reportContent += os.linesep
        # content
        reg = re.compile(ur"^基金會編號.*$", re.MULTILINE)
        allsymbols = ur" ,、。.?!~$%@&#*‧;︰…‥﹐﹒˙·﹔﹕‘’“”〝〞‵′〃├─┼┴┬┤┌┐╞═╪╡│▕└┘╭╮╰╯╔╦╗╠═╬╣╓╥╖╒╤╕║╚╩╝╟╫╢╙╨╜╞╪╡╘╧╛﹣﹦≡|∣∥–︱—︳╴¯ ̄﹉﹊﹍﹎﹋﹌﹏︴﹨∕╲╱\/↑↓←→↖↗↙↘〔〕【】﹝﹞〈〉﹙﹚《》(){}﹛﹜『』「」<>≦≧﹤﹥︵︶︷︸︹︺︻︼︽︾︿﹀∩∪﹁﹂﹃﹄"
        regReporters = [
            # re.compile(ur"[。:」\s]+(.{3,4})口述.?記者(.{3,4})(?:採訪整理)?$", re.MULTILINE),
            # [\u4e00-\u9fa5] matches characters outside the English range, i.e. Chinese characters and full-width punctuation
            re.compile(allsymbols + ur"[\s]+(.{2,4})[口筆]述\s?.?\s?記者(.{2,4})(?:採訪整理)?$", re.MULTILINE),
            re.compile(ur"報導.攝影.(.{2,4})記者$", re.MULTILINE),
            re.compile(ur"報導.攝影.(.{2,4})$", re.MULTILINE),
            re.compile(ur"攝影.報導.(.{2,4})$", re.MULTILINE),
            re.compile(ur"攝影.(.{2,4})$", re.MULTILINE),
            re.compile(ur"報導.(.{2,4})$", re.MULTILINE),
            re.compile(ur"報導.(.{2,4})$", re.MULTILINE),
            re.compile(ur"記者(.{2,4})採訪整理$", re.MULTILINE),
            re.compile(ur"^【(.{2,4})╱.{2,4}報導】", re.MULTILINE),
        ]
        # preserve <br> tags as \n
        brs = page.xpath(u"//div[@class='articulum']//br")
        if (len(brs) == 0):
            brs = page.xpath(u"//div[@class='articulum trans']//br")
        for br in brs:
            br.tail = "\n" + br.tail if br.tail else "\n"
        items = page.xpath(u"//div[@class='articulum']/*")
        if (len(items) == 0):
            items = page.xpath(u"//div[@class='articulum trans']/*")
        for item in items:
            tag = item.tag.lower()
            id = self.get_attrib(item, "id", None)
            # if (tag == "figure"): continue
            # if (tag == "iframe"): break
            if (id == "bcontent" or id == "bhead" or id == "introid"):
                text = StrHelper.trim(item.text_content())
                if (text == None or text == ""):
                    continue
                if (id != "bhead"):
                    for regReporter in regReporters:
                        list = regReporter.findall(text)
                        if (len(list) == 1):
                            if (not isinstance(list[0], basestring)):
                                reporter = "/".join(list[0])
                            else:
                                reporter = list[0]
                            text = StrHelper.trim(regReporter.sub('', text))
                            break
                    if (reporter):
                        overallEntry.reporter = reporter
                    else:
                        self.logger.warn("error: parsing reporter: %s" % reportUrl)
                text = StrHelper.trim(reg.sub('', text))
                reportContent += text + os.linesep + os.linesep
        FileHelper.saveToFile(os.path.join(dir, reportFileName), reportContent)
    status, page = self.downloader.download_page(detailUrl, dir, assetdir, detailSrcFileName, css=False, javascript=False, image=False)
    if (page != None):
        items = page.xpath(u"//div[@id='charitysidebox3'][1]/div[@id='inquiry3']/table//tr")
        maxDate = None
        if (len(items) > 0):
            file = None
            try:
                file = open(os.path.join(dir, detailFileName), "wb")
                csvwriter = csv.writer(file)
                for index, item in enumerate(items):
                    if (index > 1):
                        cols = item.xpath(u".//td")
                        if (len(cols) == 4):
                            no = StrHelper.trim(cols[0].text)
                            name = StrHelper.trim(cols[1].text)
                            amount = StrHelper.trim(cols[2].text)
                            dateStr = StrHelper.trim(cols[3].text)
                            try:
                                date = datetime.datetime.strptime(dateStr, "%Y/%m/%d")
                                if (maxDate == None or date > maxDate):
                                    maxDate = date
                            except Exception as ex:
                                self.logger.warn("error date format:%s in %s" % (dateStr, detailUrl))
                            csvwriter.writerow([no, dateStr, amount, name])
                overallEntry.enddate = maxDate.strftime("%Y/%m/%d") if maxDate != None else ""
                overallEntry.doners = len(items) - 2
            except Exception as ex:
                self.logger.exception(LogHelper.getExceptionMsg(ex, "error parsing detail.html"))
            finally:
                if (file):
                    file.close()
def parsePages(self):
    try:
        pageIndex = 1
        while True:
            urlNextPage = 'http://search.appledaily.com.tw/charity/projlist/Page/%d' % pageIndex
            self.logger.info('downloading page %d...' % pageIndex)
            currentPage = urlNextPage
            status, page = self.downloader.download_page(
                currentPage, self.htmlDir, self.assetDir, 'page%05d.html' % (pageIndex),
                css=False, javascript=False, image=False)
            if (page == None):
                self.logger.warn("error: downloading page %d" % (pageIndex))
                break
            elif (status != 200):
                self.logger.warn("http response: %s %s" % (status, currentPage))
                break
            else:
                self.logger.info('parsing page %d...' % pageIndex)
                items = page.xpath(u"//tr[@class='odd']")
                row = 0
                if (items):
                    for item in items:
                        nodes = item.xpath(u".//td")
                        if (len(nodes) == 6):
                            reportUrl = None
                            detailUrl = None
                            title = None
                            row += 1
                            id = nodes[0].text
                            if (id[0] == 'A'):
                                node = nodes[1].xpath(u".//a")
                                if len(node) > 0:
                                    title = node[0].text
                                    reportUrl = urlparse.urljoin(currentPage, self.get_attrib(node[0], "href", None))
                                else:
                                    self.logger.warn("title not found")
                                date = nodes[2].text
                                status = str(nodes[3].text_content())
                                amount = nodes[4].text
                                node = nodes[5].xpath(u".//a")
                                if len(node) > 0:
                                    detailUrl = urlparse.urljoin(currentPage, self.get_attrib(node[0], "href", None))
                                else:
                                    self.logger.warn("detail not found")
                                if (title == None):
                                    self.logger.warn("title not found")
                                if (title == None or reportUrl == None or detailUrl == None):
                                    self.logger.warn("parse error!!!")
                                if (status == u"已結案"):
                                    dir = os.path.join(self.profileDir, id[-1:] + os.sep + id + os.sep)
                                    dirRm = os.path.join(self.profileDir, u"未結案" + os.sep + id[-1:] + os.sep + id + os.sep)
                                    if (self.getIsProfileSaved(dirRm)):
                                        shutil.rmtree(dirRm, ignore_errors=True)
                                    if (not self.getIsProfileSaved(dir)):
                                        # self.logger.warn("saving profile: page %d, id %s" % (pageIndex, id))
                                        overallEntry = OverallEntry()
                                        overallEntry.id = id
                                        overallEntry.title = StrHelper.trim(title)
                                        overallEntry.total = amount
                                        overallEntry.begindate = date
                                        overallEntry.reportUrl = reportUrl
                                        overallEntry.detailUrl = detailUrl
                                        self.logger.info("saving profile %s" % id)
                                        # FIXME: IOError: [Errno 2] No such file or directory: appledaily/profiles/\u672a\u7d50\u6848/
                                        dir = dir.replace(u"未結案" + os.sep, '')
                                        self.saveProfile(id, dir, reportUrl, detailUrl, overallEntry)
                                        self.saveOverallEntry(overallEntry.id, [overallEntry.id, overallEntry.begindate, overallEntry.enddate, overallEntry.total, overallEntry.doners, overallEntry.title, overallEntry.reporter, overallEntry.reportUrl, overallEntry.detailUrl])
                                        self.saveUrls(dir, reportUrl, detailUrl)
                                        # self.saveMetadata(dir, title, date, amount)
                                elif (status == u"未結案"):
                                    dir = os.path.join(self.profileDir, u"未結案" + os.sep + id[-1:] + os.sep + id + os.sep)
                                    overallEntry = OverallEntry()
                                    overallEntry.id = id
                                    overallEntry.title = StrHelper.trim(title)
                                    overallEntry.total = amount
                                    overallEntry.begindate = date
                                    overallEntry.reportUrl = reportUrl
                                    overallEntry.detailUrl = detailUrl
                                    self.logger.info("saving profile %s" % id)
                                    self.saveProfile(id, dir, reportUrl, detailUrl, overallEntry)
                                    self.saveOverallEntryPending(overallEntry.id, [overallEntry.id, overallEntry.begindate, overallEntry.enddate, overallEntry.total, overallEntry.doners, overallEntry.title, overallEntry.reporter, overallEntry.reportUrl, overallEntry.detailUrl])
                                    self.saveUrls(dir, reportUrl, detailUrl)
                                    # pass
                                else:
                                    self.logger.warn("unknown status")
                    self.logger.info("%d items found" % (row))
                    if (row == 0):
                        break
                if (not items):
                    self.logger.info("items not found")
                    break
                if (len(items) == 0):
                    self.logger.info("items length == 0")
                    break
                pageIndex += 1
        self.logger.info('done!')
    except Exception as ex:
        self.logger.exception(LogHelper.getExceptionMsg(ex, "parsePages"))
    finally:
        pass
def main():
    logger = LogHelper('ZiMuZuHelper', cmdLevel='INFO', fileLevel="DEBUG").logger
    onething = WanKeYunApi.WanKeYunApi(logger)
    bok = onething.LoginEx(user="******", passwd="1234567890")
    if bok is False:
        return
    bok = onething.GetUSBInfo()
    if bok is False:
        return
    bok = onething.RemoteDlLogin()
    if bok is False:
        return
    bok = onething.GetRemoteDlInfo()
    if bok is False:
        return
    # --------------------------------------------------------------------------------
    # bok, mediaInfo = onething.UrlResolve('ed2k://|file|%E9%BB%84%E7%9F%B3.Yellowstone.2018.S01E07.%E4%B8%AD%E8%8B%B1%E5%AD%97%E5%B9%95.WEB.720P-%E4%BA%BA%E4%BA%BA%E5%BD%B1%E8%A7%86.mp4|559753916|bdb7746c12f23558420a1bfd610e8bb5|h=xavxscmhtkwu4bl52jiqnmow6pa6ntdf|/')
    # --------------------------------------------------------------------------------
    JobList = []
    OneJob = {
        "filesize": 0,
        "name": '黄石.Yellowstone.2018.S01E07.中英字幕.WEB.720P-人人影视.mp4',
        "url": 'ed2k://|file|%E9%BB%84%E7%9F%B3.Yellowstone.2018.S01E07.%E4%B8%AD%E8%8B%B1%E5%AD%97%E5%B9%95.WEB.720P-%E4%BA%BA%E4%BA%BA%E5%BD%B1%E8%A7%86.mp4|559753916|bdb7746c12f23558420a1bfd610e8bb5|h=xavxscmhtkwu4bl52jiqnmow6pa6ntdf|/',
    }
    OneJob2 = {
        "filesize": 0,
        "name": '黄石.Yellowstone.2018.S01E08.中英字幕.WEB.720P-人人影视.mp4',
        "url": 'ed2k://|file|%E9%BB%84%E7%9F%B3.Yellowstone.2018.S01E08.%E4%B8%AD%E8%8B%B1%E5%AD%97%E5%B9%95.WEB.720P-%E4%BA%BA%E4%BA%BA%E5%BD%B1%E8%A7%86.mp4|472873520|c273bf00703b45225f2056393d6de87f|h=yq4vc2vndh2fnqdiwnhnqapwh7xcvlrw|/',
    }
    OneJob3 = {
        "filesize": 0,
        # "name": '【幻櫻字幕組】【一拳超人 第二季 ONE PUNCH MAN S2】【OVA】【02】【BIG5_MP4】【1280X720】.mp4',
        "name": '123.mp4',
        "url": "magnet:?xt=urn:btih:UK32AE3T2R3UOBAPDVZJ6W35T7DRSFGJ&dn=&tr=http%3A%2F%2F104.238.198.186%3A8000%2Fannounce&tr=udp%3A%2F%2F104.238.198.186%3A8000%2Fannounce&tr=http%3A%2F%2Ftracker.openbittorrent.com%3A80%2Fannounce&tr=udp%3A%2F%2Ftracker3.itzmx.com%3A6961%2Fannounce&tr=http%3A%2F%2Ftracker4.itzmx.com%3A2710%2Fannounce&tr=http%3A%2F%2Ftracker.publicbt.com%3A80%2Fannounce&tr=http%3A%2F%2Ftracker.prq.to%2Fannounce&tr=http%3A%2F%2Fopen.acgtracker.com%3A1096%2Fannounce&tr=https%3A%2F%2Ft-115.rhcloud.com%2Fonly_for_ylbud&tr=http%3A%2F%2Ftracker1.itzmx.com%3A8080%2Fannounce&tr=http%3A%2F%2Ftracker2.itzmx.com%3A6961%2Fannounce&tr=udp%3A%2F%2Ftracker1.itzmx.com%3A8080%2Fannounce&tr=udp%3A%2F%2Ftracker2.itzmx.com%3A6961%2Fannounce&tr=udp%3A%2F%2Ftracker3.itzmx.com%3A6961%2Fannounce&tr=udp%3A%2F%2Ftracker4.itzmx.com%3A2710%2Fannounce&tr=http%3A%2F%2Fnyaa.tracker.wf%3A7777%2Fannounce"
    }
    JobList.append(OneJob)
    JobList.append(OneJob2)
    JobList.append(OneJob3)
    # --------------------------------------------------------------------------------
    # Example of creating batch download tasks with the raw API: you fill in
    # yourself which disk to download to, usually disk 0.
    # partitionID = 0
    # rootPath = onething.user_info["usb_info"][1]['partitions'][partitionID]['path']
    # remoteLocation = rootPath + self.defaultPath
    # remoteLocation = remoteLocation.lower()
    # onething.CreateTasks(JobList, remoteLocation)
    # --------------------------------------------------------------------------------
    # When the WanKeYun box is power-cycled, finished tasks need to be restored;
    # tasks still downloading can also be paused here.
    # Query the download task list: finished downloads are included and must be filtered.
    # nowDownloadingList = onething.user_info["remote_download_list"]["tasks"]
    # for oneTask in nowDownloadingList:
    #     iprogress = int(oneTask["progress"])
    #     if iprogress == 10000:
    #         pass
    #     else:
    #         # onething.StartRemoteDl(oneTask["id"])
    #         onething.PauseRemoteDl(oneTask["id"])
    # --------------------------------------------------------------------------------
    # create the batch download tasks (extended version, with checking)
    onething.AddDownloadTasks(JobList)
    # --------------------------------------------------------------------------------
    print("Done.")