def getPageContent(self, filename, from_where='local'):
    """Return the page content addressed by ``filename``.

    Args:
        filename: Content locator. For ``'local'`` it is split on ``"::"``:
            three parts select a record through ``BinReader`` (second part
            is passed to the ``BinReader`` constructor, third part is
            converted to ``int`` and passed to ``readone_at``); a single
            part is opened as a regular file. For ``'remote'`` the value is
            handed unchanged to ``self.bin_read_client.getHtml``.
        from_where: ``'local'`` (default) or ``'remote'``.

    Returns:
        The content that was read. In local mode ``None`` is returned when
        ``filename`` splits into neither one nor three parts. In remote
        mode a ``unicode`` result is encoded to UTF-8 before being returned.

    Raises:
        Exception: if ``from_where`` is not a known source, or if the
            locally read content is empty.
    """
    if from_where not in ('local', 'remote'):
        raise Exception("unknown from_where")

    if from_where == 'remote':
        # TODO: fetch the bin file content from the remote side.
        html = self.bin_read_client.getHtml(filename)
        return html.encode('utf-8') if isinstance(html, unicode) else html

    pieces = filename.split("::")
    piece_count = len(pieces)
    if piece_count == 3:
        # readone_at yields a pair; only the second element is the content.
        _, data = BinReader(pieces[1]).readone_at(int(pieces[2]))
    elif piece_count == 1:
        with open(filename) as handle:
            data = handle.read()
    else:
        # Unrecognised locator shape: the caller gets None, not an error.
        return None

    if len(data) == 0:
        raise Exception("file name:{} , content error".format(filename))
    return data
def getPageContent(self, filename):
    """Read page content from a local locator.

    ``filename`` is split on ``"::"``. A three-part locator is resolved
    through ``BinReader``: the second part is given to the constructor and
    the third part, converted to ``int``, to ``readone_at``, whose second
    result element is the content. A locator without ``"::"`` is opened and
    read as a regular file.

    Returns:
        The content read, or ``None`` when the locator has neither one nor
        three parts.

    Raises:
        Exception: if the content that was read is empty.
    """
    segments = filename.split("::")
    if len(segments) not in (1, 3):
        # Any other shape is not handled; None is returned to the caller.
        return None

    if len(segments) == 3:
        reader = BinReader(segments[1])
        record = reader.readone_at(int(segments[2]))
        _, payload = record
    else:
        with open(filename) as source:
            payload = source.read()

    if len(payload) == 0:
        raise Exception("file name:{} , content error".format(filename))
    return payload
def get_cv_html_page(jdid):
    """Fetch the stored HTML page content for the entry identified by ``jdid``.

    The collection name is ``"page_store_"`` followed by the part of
    ``jdid`` before the first ``"://"``. The document whose ``indexUrl``
    equals ``jdid`` is looked up in the ``cv_crawler`` database of
    ``GetHtmlPage.CvClient``, and its ``pageContentPath`` is passed to
    ``BinReader.getPageContent``.

    Returns:
        Whatever ``BinReader.getPageContent`` returns, or ``None`` when the
        collection name cannot be derived from ``jdid`` (the traceback is
        printed) or when no matching document exists (a message is printed).
    """
    database_name = 'cv_crawler'
    try:
        collection_name = "page_store_%s" % jdid.split('://')[0]
    except Exception:
        # Any failure while deriving the collection name is reported and
        # turned into a None result rather than propagated.
        traceback.print_exc()
        return None

    collection = GetHtmlPage.CvClient[database_name][collection_name]
    record = collection.find_one({'indexUrl': jdid})
    if not record:
        # Parenthesised single expression: prints the same text as the
        # original print statement.
        print("cvid: %s not exists" % jdid)
        return None

    # NOTE(review): getPageContent is invoked on the BinReader class with a
    # single argument -- confirm it is callable that way (static/class level).
    return BinReader.getPageContent(record['pageContentPath'])