def savetexts(self, filepath, prepath):
    """
    Save the pre-processed texts.
    :param filepath: path to the html files
    :param prepath: save path
    :return:
    """
    self.logger.info('init pretreatment directory:"{0}"'.format(prepath))
    FileUtil.init_path(prepath)
    try:
        file_lists = os.listdir(filepath)  # all files and directories under the path, as strings
        for filename in file_lists:
            file = os.path.join(filepath, filename)
            if os.path.isfile(file):
                # 1. get the url and the text
                url, text = FileUtil.get_url_text(file)
                # 2. keyword information
                kws = PreDeal.seg(text)
                self.logger.info("Store pretreatment texts content:{0}".format(filename))
                FileUtil.writefile(url + '\t'.join(kws), os.path.join(prepath, filename))
        self.logger.info('Text pretreatment End!')
    except Exception as e:
        print(e)
def main():
    from_path = os.path.expanduser(sys.argv[1])
    filestat_file = os.path.join(os.path.dirname(__file__), '../data/filestat.lst')
    dirstat_file = os.path.join(os.path.dirname(__file__), '../data/dirstat.lst')
    json_file = os.path.join(os.path.dirname(__file__), '../data/data.json')
    #scan_file_sizes(from_path, stat_file)
    #calc_dir_sizes(filestat_file, dirstat_file)
    lines = FileUtil.read_all(dirstat_file).splitlines()
    iter_dir_sizes = FileUtil.parse_path_sizes(lines)
    root = SizeNode.build_tree(iter_dir_sizes)
    node = root.find_node(os.path.join(from_path))
    #nodes = sorted([c for c in node.name2childs.values()], key=lambda n: n.size, reverse=True)
    nodes = node.name2childs.values()
    min_size = node.size * 0.001
    #print "min_size", min_size
    #nodes = SquareTreeMap.norm_node_sizes(nodes)
    rects = SquareTreeMap.squarify_size_nodes(node.name, nodes, 0., 0., 700., 400., min_size)
    # rects = square_node(node, 0., 0., 700., 400.)
    # print 'rects', rects
    write_d3_rect_json(json_file, rects)
def expriment(self, path='', pagenum=100):
    savename = os.path.join(config.hidepath, 'res.txt')
    for dirname in os.listdir(path):
        filepath = os.path.join(path, dirname)
        if os.path.isdir(filepath):
            for f in os.listdir(filepath):
                fi = os.path.join(filepath, f)
                res = []
                keywords, num = self.info(fi=fi, pagenum=pagenum)
                unmatch = 0
                hidenum = 0
                s = '\t||'
                for i in num:
                    if i == 0:
                        unmatch += 1
                    else:
                        hidenum = hidenum + i
                    s = s + '\t' + str(i)
                res.append(fi)
                res.append(str(len(keywords)))
                res.append(str(hidenum))
                res.append(str(len(num)))
                res.append(str(unmatch))
                res_str = '\t'.join(res) + s + '\n'
                FileUtil.write_apd_file(res_str, savename)
            FileUtil.write_apd_file(dirname + ' End !\n', savename)
def init_path(self):
    savepath = os.path.join(config.hidepath, '_'.join(self.keys))
    kwpath = os.path.join(config.hidekwpath, '_'.join(self.keys))
    if not os.path.exists(savepath):
        os.makedirs(savepath)
    else:
        FileUtil.init_path(savepath)
    return savepath, kwpath
def query(self, keywords, kwpath=''):
    path = []      # documents already matched
    num = []       # number of keywords hidden in each document
    unmatch = 0    # number of unmatched keywords
    maxh = 0       # keyword count of the current joint query
    q = ''         # joint keyword query
    flag = True    # mismatch flag
    hidekey = []
    while keywords:
        kw = keywords[0]
        paper = Index.search(self.pindexp, q + ' ' + kw, limit=None)
        if paper:
            keywords.pop(0)
            hidekey.append(kw)
            q = q + ' ' + kw
            maxh += 1
        else:
            # when the joint search can go no further, fall back to similar keywords
            simikeys = WV.similarwords(kw)
            t_paper = []
            if not simikeys:
                print(".................Failed to find similar words................")
                flag = False
            else:
                for skw, similarity in simikeys:
                    sq = q + ' ' + skw
                    t_paper = Index.search(self.pindexp, sq, limit=None)
                    if t_paper:
                        hidekey.append(skw)
                        keywords.pop(0)
                        q = sq
                        maxh += 1
                        break
                if not t_paper:  # similar keywords exist, but the joint search still fails
                    flag = False  # mismatch
            if not flag:
                doc = Index.search(self.pindexp, q, limit=None)
                if not doc:
                    print("The keyword '%s' is unMatch !" % kw)
                    unmatch += 1
                    hidekey.append('0')
                    keywords.pop(0)
                    path.append(None)
                    # flag = True
                else:
                    path.append(doc)
                    num.append(maxh)
                maxh = 0
                q = ''
                flag = True
    if not keywords:
        path.append(paper)
    hide_string = ' '.join(hidekey)
    FileUtil.writefile(hide_string, kwpath)
    return path
def all_CR(self, infopath, extpath):
    CR = []
    file_list = os.listdir(infopath)
    for name in file_list:
        origin_info = FileUtil.readfile(filename=os.path.join(infopath, name))
        ext_info = FileUtil.readfile(filename=os.path.join(extpath, name))
        cr = self.CR(origin_info, ext_info)
        CR.append(cr)
    return CR
def crawl(self):
    self.download.download()
    readpath = os.path.join(config.spiderhtml, self.filename)
    savepath = os.path.join(config.spidertext, self.filename)
    FileUtil.init_path(savepath)
    for filename in os.listdir(readpath):
        file = os.path.join(readpath, filename)
        url, content = self.parse.parse(file)
        # rstrip('.html') strips trailing '.', 'h', 't', 'm', 'l' characters,
        # not the suffix; replace the extension safely instead
        filename = os.path.splitext(filename)[0] + '.txt'
        self.logger.info("Save spider url and content:{0}".format(url))
        FileUtil.writefile(url + content, os.path.join(savepath, filename))
    print('crawl web contents end!')
def search_archive_file(self, sf):
    """Search an archive (compressed) file"""
    ext = FileUtil.get_extension(sf.filename)
    if not ext:
        return
    if self.settings.debug:
        common.log('Searching {0} file {1}'.format(ext, sf))
    if ext in ('zip', 'jar', 'war', 'ear'):
        # handle zip files
        try:
            self.search_zip_file(sf)
        except zipfile.BadZipfile as e:
            if not ext == 'ear':
                common.log('BadZipfile: {0!s}: {1}'.format(e, sf))
    elif ext in ('bz2', 'tar', 'tgz', 'gz') and \
            tarfile.is_tarfile(sf.relativepath):
        # handle tar files
        try:
            self.search_tar_file(sf, ext)
        except Exception as e:
            msg = 'Exception while searching a tar file {0}: {1!s}'
            common.log(msg.format(sf, e))
    else:
        msg = 'Searching archive file type "{0}" is unsupported at this time'
        common.log(msg.format(ext))
def save_data(self):
    _resAll = FileUtil.readlines(self.resFileName)
    self._clear_data()
    _totalSize = len(_resAll)
    print('Res data storing...')
    _stepSize = 1000
    _sql = "insert into resitem(name) values"
    _sqlTemp = ''
    for _index, _value in enumerate(_resAll):
        try:
            _value = pymysql.escape_string(_value)
            if _index % _stepSize == 0 or _index == _totalSize - 1:
                if _sqlTemp:
                    _conn = self.dbHelper.connectDatabase()
                    print('storing: %d / %d' % (_index + 1, _totalSize))
                    _cur = _conn.cursor()
                    _sqlTemp = _sqlTemp + ",('%s')" % _value if _index == _totalSize - 1 else _sqlTemp
                    _cur.execute(_sqlTemp)
                    _conn.commit()
                    _cur.close()
                    _conn.close()
                _sqlTemp = _sql + "('%s')" % _value
            else:
                _sqlTemp += ",('%s')" % _value
        except Exception as error:
            self.logger.log(logging.ERROR, error)
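# A minimal alternative sketch for save_data above: instead of hand-building the
# VALUES string with escape_string, batch the rows through a parameterized
# executemany so the driver handles quoting. The `resitem(name)` table and the
# pymysql-style `self.dbHelper.connectDatabase()` are taken from the snippet
# above; everything else here is an illustration, not the original implementation.
def save_data_batched(self, batch_size=1000):
    rows = FileUtil.readlines(self.resFileName)
    self._clear_data()
    conn = self.dbHelper.connectDatabase()
    try:
        with conn.cursor() as cur:
            for start in range(0, len(rows), batch_size):
                batch = [(value,) for value in rows[start:start + batch_size]]
                # %s placeholders are filled by the driver, so no manual escaping
                cur.executemany("insert into resitem(name) values (%s)", batch)
                conn.commit()
    finally:
        conn.close()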
def kw_num(self):
    nums = []
    for file in os.listdir(self.filepath):
        filename = os.path.join(self.filepath, file)
        kws = FileUtil.readupkws(filename)
        nums.append(len(kws))
    return nums
def is_same(self, local_file_name):
    local_md5 = FileUtil.file_md5(local_file_name)
    remote_md5 = self.cloud.query_cloudfile_md5(local_file_name)
    #print "file name", local_file_name
    #print "local_md5", local_md5
    #print "remote_md5", remote_md5
    return local_md5 == remote_md5
def main():
    # Create directory if it doesn't exist
    futil = FileUtil(".opendoord")
    # Get access to the database handler
    logger = Logger.get(verbose=True)
    db = Sqlite(futil.path + "/opendoor.db", logger)
    port = Port(logger)
    pipes = Pipes(logger, port, db)
    i = 0
    logger.debug("Send commands via pipe with 10 sec delay")
    while i < 100:
        i += 1
        pipes.send_to_app("OPEN DOOR\n", i)
        logger.debug("OPEN DOOR")
        time.sleep(10)
        i += 1
        pipes.send_to_app("DOORBELL PRESSED\n", i)
        logger.debug("DOORBELL PRESSED")
        time.sleep(10)
        i += 1
        pipes.send_to_app("DOW RING WITH AUTO OPEN\n", i)
        logger.debug("DOW RING WITH AUTO OPEN")
        time.sleep(10)
def calc_dir_sizes(filestat_file, dirstat_file):
    lines = FileUtil.read_all(filestat_file).splitlines()
    dir2size = {}
    for filepath, size in FileUtil.parse_path_sizes(lines):
        dir2size[filepath] = size
        dirpath = os.path.dirname(filepath)
        while dirpath:
            try:
                dir2size[dirpath] += size
            except KeyError:
                dir2size[dirpath] = size
            dirpath = dirpath.rpartition("/")[0]
    iter_lines = FileUtil.combine_path_sizes(sorted(dir2size.items()))
    FileUtil.write_all(dirstat_file, iter_lines)
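# A quick worked example of the accumulation in calc_dir_sizes: each file's size
# is added to every ancestor directory, so "a/b/f1: 10" and "a/f2: 5" yield
# a=15 and a/b=10 alongside the file entries themselves. The paths here are
# made up purely for illustration.
def _demo_calc_dir_sizes():
    path_sizes = [("a/b/f1", 10), ("a/f2", 5)]
    dir2size = {}
    for filepath, size in path_sizes:
        dir2size[filepath] = size
        dirpath = filepath.rpartition("/")[0]
        while dirpath:
            dir2size[dirpath] = dir2size.get(dirpath, 0) + size
            dirpath = dirpath.rpartition("/")[0]
    assert dir2size == {"a/b/f1": 10, "a/f2": 5, "a/b": 10, "a": 15}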
def info(self, fi='', pagenum=100):
    info = FileUtil.readfile(fi)
    keywords = PreDeal.seg(info)
    # 1. keyword extraction
    keys = jieba.analyse.textrank(info, topK=10, withWeight=False,
                                  allowPOS=('ns', 'n', 'vn', 'v'))
    # 2. crawl related pages via the search engine
    # 2.1 fetch the links
    spider_link = SpiderLink(keys, self.root)
    spider_link.crawl(pagenum)
    # 2.2 fetch the contents
    filename = '_'.join(keys) + '.html'
    spider_to = SpiderTo(filename)
    spider_to.crawl()
    # 3. text pre-processing: deduplicate, drop stop words, segment,
    #    keep the url and the keyword set
    p = PreDeal()
    filepath = os.path.join(config.spidertext, '_'.join(keys))
    prepath = os.path.join(config.prepapath, '_'.join(keys))
    p.savetexts(filepath=filepath, prepath=prepath)
    # 4. build the index, then search for pages containing the keyword information
    # 4.1 index construction
    indexpath = os.path.join(config.indexpath, '_'.join(keys))
    idx = Index()
    idx.build(datapath=prepath, indexpath=indexpath)
    search = Search1(filename=fi, pindexp=indexpath)
    # 4.2 search and save
    info_k = keywords[:]
    num = search.retrieve(keywords=info_k)
    return keywords, num
def loadAlterSQL(self, dbConnection):
    fileList = os.listdir(ResourceLocation.AlterDatabaseSQLs.value)
    index = 0
    if len(fileList) > 0:
        print("Choose the file number:\n")
        foundSQLScript = False
        for fileName in fileList:
            index += 1
            if "sql" in fileName:
                print(str(index) + ".) " + fileName + "\n")
                foundSQLScript = True
        if foundSQLScript:
            exceptionFlag = False
            choosenFileIndex = input()
            filePaths = self.getFilePaths(fileList, "sql",
                                          ResourceLocation.AlterDatabaseSQLs.value)
            try:
                filePath = filePaths[int(choosenFileIndex) - 1]
            except Exception as e:
                er = Error("You have chosen a wrong file as an input.", traceback.format_exc())
                er.handleError()
                exceptionFlag = True
            if not exceptionFlag:
                file = FileUtil(filePath, "r")
                self.executeAndCommitToDatabase(dbConnection, file)
        else:
            print("Sorry, no SQL files exist in the folder.")
    else:
        print("Sorry, no SQL files exist in the folder.")
def init(use_base_dir=False):
    args = parser.parse_args()
    setup = ExperimentSetups.parse(args.setup)
    dirname = fileutil.base_dir(args.dest_dir, setup.name,
                                args.max_quantifier_length, args.model_size) \
        if use_base_dir \
        else fileutil.run_dir(args.dest_dir, setup.name,
                              args.max_quantifier_length, args.model_size, args.name)
    file_util = FileUtil(dirname)
    return args, setup, file_util
def _failed(self, filename):
    urls = FileUtil.readfilelist(filename)
    for i, failed_url in enumerate(urls):
        html = self.downloader.download(failed_url)
        datas = self.parser.parse(failed_url, html)
        self.logger.info("the spider system has fetched %s failed links" % str(i + 1))
        self.output.add_data(datas)
def filter_file(self, sf):
    if FileUtil.is_hidden(sf.filename) and self.settings.excludehidden:
        return False
    if sf.filetype == FileType.Archive:
        return self.settings.searcharchives and \
            self.is_archive_search_file(sf.filename)
    return not self.settings.archivesonly and \
        self.is_search_file(sf.filename)
def createSchema(self, dbConnection, schemaName):
    utility = Utility()
    self.createDataTablesSQLScript(schemaName)
    sqlRead = FileUtil(ResourceLocation.DatabaseScript.value, "r")
    utility.writeLogs(ResourceLocation.LogFileLocation.value, "",
                      LogMessage.DBDatabaseCreation.value, "a", False)
    self.executeAndCommitToDatabase(dbConnection, sqlRead)
    utility.writeLogs(ResourceLocation.LogFileLocation.value, "",
                      LogMessage.Completed.value, "a", True)
def get_url_titles(self):
    parse_list = []
    html_str = FileUtil.readfile(self.filename)
    linktr = etree.HTML(text=html_str).xpath('//tr')
    for item in linktr:
        url = item.xpath('string(./td[1])')
        title = item.xpath('string(./td[2])')
        parse_list.append(dict([('url', url), ('title', title)]))
    return parse_list
def writeLogs(self, fileLocation, message, content, access, doComplete):
    t = time.localtime()
    logCommands = FileUtil(fileLocation, access)
    logCommands.writeFileContent(
        time.strftime("%H:%M:%S", t) + " : " + message + "\n" + content + "\n")
    if doComplete:
        logCommands.writeFileContent("\n" + logmessage.Seperator.value + "\n")
def readConfig(self):
    # Read the file and config the connection variables
    file = FileUtil(self.configFileName, "r")
    dbConfiguration = file.getFileContent()
    config = dbConfiguration[int(self.configLine)]
    configArray = config.split(self.configDelimiter)
    self.hostName = configArray[0]
    self.portValue = configArray[1]
    self.dbName = configArray[2]
    self.userName = configArray[3]
    self.password = configArray[4]
    self.schemaName = configArray[5]
def getTableHeader(self, fileList):
    utility = Utility()
    filePaths = self.getFilePaths(fileList, "csv", ResourceLocation.DatabaseLocation.value)
    utility.writeLogs(ResourceLocation.LogFileLocation.value, "\n".join(filePaths),
                      LogMessage.Files.value, "a", False)
    tableHeaders = []
    for filePath in filePaths:
        fileHeader = FileUtil(filePath, "r").getFileContent()[0]
        tableHeaders.append(fileHeader)
    return tableHeaders
def main():
    global log, gpio, lirc, ping, db, gammu
    # Parse arguments, use file docstring as a parameter definition
    args = docopt.docopt(__doc__, version='0.1a')
    #print args
    # Create directory if it doesn't exist
    futil = FileUtil("/home/pi/.resq-pi")
    gammu = None
    # Create a logger
    if args["--verbose"]:
        log = Logger.get(verbose=True)
    else:
        log = Logger.get(futil.path + "/resq-pi.log", False)
    log.info("*** Start ResQ-Pi ***")
    # Be sure we have root privileges
    if os.geteuid() != 0:
        exit("You need to have root privileges. Exiting.")
    # Ctrl-C and SIGTERM handler
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)
    # Get access to the resq-pi database
    db = ResqStore(futil.path + "/resq-pi.db")
    if not db.exist():
        log.info("No database found. Will create one.")
        db.create_tables()  # if not already created
        db.reset_tables()   # and initialize
    # Initialize GPIO, Lirc, GooglePing ...
    gpio = ResqGpio()
    gpio.led(0, False)  # all LEDs off
    lirc = ResqLirc(log, gpio)
    ping = GooglePing(log, gpio)
    test = False
    if args["--alarm"]:
        test = True
    if args["--resetdb"]:
        log.info("Reset database")
        db.reset_tables()
    elif args["--resetpass"]:
        log.info("Reset password")
        db.reset_password()
    elif args["--credits"]:
        get_sms_credits()
    else:
        resqpi_endless_loop(test)
def test_02_size_tree(self):
    from fileutil import FileUtil
    from sizenode import SizeNode
    lines = """\
node_modules/promise/domains: 15908
node_modules/promise/lib: 15912
node_modules/promise/node_modules: 34383
node_modules/promise/node_modules/asap: 34383
node_modules/promise/setimmediate: 15886
node_modules/promise/src: 16314
""".splitlines()
    path_sizes = FileUtil.parse_path_sizes(lines)
    self.assertEqual(len([ps for ps in path_sizes]), 6)
    path_sizes = FileUtil.parse_path_sizes(lines)
    tree = SizeNode.build_tree(path_sizes)
    self.assertTrue(tree)
    self.assertTrue(tree.name2childs)
    self.assertEqual(tree.find_node("node_modules/promise/node_modules/asap").size, 34383)
def retrieve(self, keywords):
    savepath, kwpath = self.savepath, self.kwpath
    path = self.query(keywords, kwpath)
    for i, doc in enumerate(path):
        if not doc:
            oldname = os.path.join(config.unMatch_path, config.unMatch_name)
            newname = os.path.join(savepath, str(i) + '+' + config.unMatch_name)
            FileUtil.copyfile(oldname, newname)
        elif len(doc) > 1:
            filepath = os.path.join(savepath, str(i))
            if not os.path.exists(filepath):
                os.mkdir(filepath)
            for d in doc:
                name = d.get('title')
                oldname = d.get('path')
                newname = os.path.join(filepath, str(i) + '+' + name)
                FileUtil.copyfile(oldname, newname)
        else:
            name = doc[0].get('title')
            oldname = doc[0].get('path')
            newname = os.path.join(savepath, str(i) + '+' + name)
            FileUtil.copyfile(oldname, newname)
    return path
def is_search_dir(self, d):
    path_elems = [p for p in d.split(os.sep) if p not in FileUtil.DOT_DIRS]
    if self.settings.excludehidden:
        for p in path_elems:
            if FileUtil.is_hidden(p):
                return False
    if self.settings.in_dirpatterns and \
            not any_matches_any_pattern(path_elems, self.settings.in_dirpatterns):
        return False
    if self.settings.out_dirpatterns and \
            any_matches_any_pattern(path_elems, self.settings.out_dirpatterns):
        return False
    return True
def test_01_file_util(self):
    from fileutil import FileUtil
    test_file = __file__ + ".test_01.data"
    path_sizes = [
        ("path1", 1),
        ("path2", 2),
        ("path3", 3),
    ]
    iter_lines = FileUtil.combine_path_sizes(path_sizes)
    FileUtil.write_all(test_file, iter_lines)
    data = FileUtil.read_all(test_file)
    self.assertEqual("""
path1: 1
path2: 2
path3: 3
""".strip(), data.strip())
    path2size = dict([(path, size) for path, size in FileUtil.parse_path_sizes(data.splitlines())])
    self.assertEqual(path2size['path2'], 2)
def scpToRemoteMachine(self, remoteFilePath, localFile='', localCatalogue=''):
    if localFile and localCatalogue:
        print('SCP parameters are wrong.')
        return  # refuse to guess when both a file and a directory are given
    t = paramiko.Transport((self.scpIp, 22))
    t.connect(username=self.scpUser, password=self.scpPassword)
    sftp = paramiko.SFTPClient.from_transport(t)
    if localFile:
        scpInfo = "scp %s %s@%s:%s" % (localFile, self.scpUser, self.scpIp, remoteFilePath)
        print(scpInfo)
        basename = os.path.basename(localFile)  # file name without its path
        sftp.put(localFile, remoteFilePath + '/' + basename)
    elif localCatalogue:
        from fileutil import FileUtil
        fileutil = FileUtil()
        scpInfo = "scp %s/* %s@%s:%s" % (localCatalogue, self.scpUser, self.scpIp, remoteFilePath)
        print(scpInfo)
        files = os.listdir(localCatalogue)
        for eachFile in files:
            if fileutil.isExistFile(eachFile):
                sftp.put(os.path.join(localCatalogue, eachFile),
                         os.path.join(remoteFilePath, eachFile))
            else:
                print(eachFile + ' is not a file.')
    t.close()
def build(self, datapath, indexpath):
    self.logger.info('creating the full-text index!')
    schema = Schema(title=TEXT(stored=True),
                    path=TEXT(stored=True),
                    content=TEXT(analyzer=SpaceSeparatedTokenizer()))
    if not os.path.exists(indexpath):  # index storage path
        os.makedirs(indexpath)
    ix = create_in(indexpath, schema)  # create the index
    writer = ix.writer()
    for filename in os.listdir(datapath):
        filepath = os.path.join(datapath, filename)
        content = FileUtil.readfile(filepath)
        writer.add_document(path=filepath, title=filename, content=content)
    writer.commit()
def is_archive_search_file(self, f):
    ext = FileUtil.get_extension(f)
    if self.settings.in_archiveextensions and \
            ext not in self.settings.in_archiveextensions:
        return False
    if self.settings.out_archiveextensions and \
            ext in self.settings.out_archiveextensions:
        return False
    if self.settings.in_archivefilepatterns and \
            not matches_any_pattern(f, self.settings.in_archivefilepatterns):
        return False
    if self.settings.out_archivefilepatterns and \
            matches_any_pattern(f, self.settings.out_archivefilepatterns):
        return False
    return True
def is_search_file(self, f):
    ext = FileUtil.get_extension(f)
    if self.settings.in_extensions and \
            ext not in self.settings.in_extensions:
        return False
    if self.settings.out_extensions and \
            ext in self.settings.out_extensions:
        return False
    if self.settings.in_filepatterns and \
            not matches_any_pattern(f, self.settings.in_filepatterns):
        return False
    if self.settings.out_filepatterns and \
            matches_any_pattern(f, self.settings.out_filepatterns):
        return False
    return True
def key2loc(self, keywords, filename='.'):
    col_bits = self.col_bits
    n = pow(2, col_bits)
    _url = FileUtil.readurl(filename)
    text_kws = FileUtil.readupkws(filename)
    # get the (row, col) coordinate of every keyword
    row, col, p = self.location(keywords=keywords, text_list=text_kws)
    # number of bits needed for the binary representation of the row count
    row_bits = len(self._dec2bin(len(text_kws) // n))
    s = ''
    print('location information: ')
    for r, c in zip(row, col):
        loc = self._dec2bin(num=r, bits=row_bits) + self._dec2bin(num=c, bits=col_bits)
        print(loc)
        s = s + loc
    # col_bits acts as the key; pad its binary form to 5 bits
    num_add_col_bits = self._dec2bin(col_bits, 5)
    _res = ''
    if len(s) % 8 == 0:
        _res = '000' + num_add_col_bits + s
    else:
        num = 8 - len(s) % 8  # number of zeros to pad
        num_add = self._dec2bin(num, 3)
        _res = num_add + num_add_col_bits + s + '0' * num
    return _url, _res, p
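# A hedged decoding sketch for the bitstream that key2loc above produces: the
# first 3 bits give the zero-padding length, the next 5 bits give col_bits, and
# the rest is a sequence of fixed-width (row_bits + col_bits) coordinates.
# `loc2coords` and the way row_bits is recovered from the carrier text mirror
# key2loc but are assumptions, not code from the original project.
def loc2coords(self, bitstream, text_kws):
    pad = int(bitstream[:3], 2)        # how many zeros were appended at the end
    col_bits = int(bitstream[3:8], 2)  # the "key" stored in 5 bits
    payload = bitstream[8:len(bitstream) - pad] if pad else bitstream[8:]
    n = pow(2, col_bits)
    row_bits = len(self._dec2bin(len(text_kws) // n))  # same width the encoder used
    step = row_bits + col_bits
    coords = []
    for i in range(0, len(payload), step):
        chunk = payload[i:i + step]
        coords.append((int(chunk[:row_bits], 2), int(chunk[row_bits:], 2)))
    return coords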
def main():
    global log, port, pipes, db, test_mode
    # Be sure we have root privileges
    if os.geteuid() != 0:
        exit("You need to have root privileges. Exiting.")
    # Ctrl-C and SIGTERM handler
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)
    # Parse arguments, use file docstring as a parameter definition
    args = docopt.docopt(__doc__, version='0.1a')
    # Create directory if it doesn't exist
    futil = FileUtil(".opendoord")
    print("Path: %s, args: %s" % (futil.path, args))
    # Create a logger
    if args["--console"]:
        log = Logger.get(verbose=True)
    else:
        log = Logger.get(futil.path + "/opendoor.log", verbose=args["--verbose"])
    log.info("*** Start OpenDoor ***")
    # Get access to the database handler
    db = Sqlite(futil.path + "/opendoor.db", log)
    if not db.exist():
        log.info("No database found. Will create one.")
        db.create_tables()  # if not already created
        db.reset_tables()   # and initialize
    if args["--test"]:
        test_mode = True
    # Let's initialize the gpio's
    port = Port(log, test_mode)
    # Open the pipes
    pipes = Pipes(log, port, db)
    if args["--resetdb"]:
        db.reset_tables()
        log.info("Database has been reset.")
    else:
        log.info("Watch door events in an endless loop.")
        opendoor_endless_loop()
def is_search_file(self, sf):
    if self.settings.in_filetypes and \
            sf.filetype not in self.settings.in_filetypes:
        return False
    if self.settings.out_filetypes and \
            sf.filetype in self.settings.out_filetypes:
        return False
    ext = FileUtil.get_extension(sf.filename)
    if self.settings.in_extensions and \
            ext not in self.settings.in_extensions:
        return False
    if self.settings.out_extensions and \
            ext in self.settings.out_extensions:
        return False
    if self.settings.in_filepatterns and \
            not matches_any_pattern(sf.filename, self.settings.in_filepatterns):
        return False
    if self.settings.out_filepatterns and \
            matches_any_pattern(sf.filename, self.settings.out_filepatterns):
        return False
    return True
def is_xml_file(self, f):
    """Return true if file is of a (known) xml file type"""
    return FileUtil.get_extension(f) in self.filetypes['xml']

def is_text_file(self, f):
    """Return true if file is of a (known) text file type"""
    return FileUtil.get_extension(f) in self.filetypes['text']

def is_searchable_file(self, f):
    """Return true if file is of a (known) searchable type"""
    return FileUtil.get_extension(f) in self.filetypes['searchable']

def test_get_extension_hidden_file_no_extension(self):
    filename = '.hidden'
    self.assertEqual(FileUtil.get_extension(filename), '')

def test_is_hidden_double_dot(self):
    filename = '..'
    self.assertFalse(FileUtil.is_hidden(filename))

def test_is_hidden_not_hidden_file(self):
    filename = 'filename.txt'
    self.assertFalse(FileUtil.is_hidden(filename))
#-*- coding: UTF-8 -*-
import lxml.etree
from fileutil import FileUtil

#http://lxml.de/lxmlhtml.html
#http://www.cnblogs.com/descusr/archive/2012/06/20/2557075.html
#http://docs.python-guide.org/en/latest/scenarios/scrape/

if __name__ == '__main__':
    fileutil = FileUtil()
    content = fileutil.readLocalFile('./example.html')
    page = lxml.etree.HTML(content.decode('UTF-8'), parser=None, base_url=None)
    '''
    for image in images:
        imageDict = image.attrib
        try:
            print imageDict['href']
def test_get_extension_has_txt_extension(self):
    filename = 'filename.txt'
    self.assertEqual(FileUtil.get_extension(filename), 'txt')

def test_get_extension_no_extension(self):
    filename = 'filename'
    self.assertEqual(FileUtil.get_extension(filename), '')

def test_get_extension_hidden_txt_file(self):
    filename = '.hidden.txt'
    self.assertEqual(FileUtil.get_extension(filename), 'txt')

def test_is_dot_dir_non_dot_dir(self):
    filename = '.git'
    self.assertFalse(FileUtil.is_dot_dir(filename))
# callback for the page content
def contentWriteCallBack(self, buf):
    self.b.write(buf)

# callback for the header information
def headerWriteCallBack(self, buf):
    self.headerWrite.truncate()  # discard everything after the current position
    self.headerWrite.write(str(buf))

# download/upload progress
def progressWriteCallBack(self, downloadTotal, downloadNow, uploadTotal, uploadNow):
    if downloadTotal > 0:
        progress = 'download progress: ' + str(round((downloadNow / downloadTotal) * 100, 2)) + '%'
        print(progress + " \r")
    elif uploadTotal > 0:
        progress = 'upload progress: ' + str(round((uploadNow / uploadTotal) * 100, 2)) + '%'
        print(progress + " \r")
####################### end of callbacks ##################################################

if __name__ == '__main__':
    from fileutil import FileUtil
    fileutil = FileUtil()
    downloadutil = DownloadUtil()
    value = downloadutil.download('http://s.taobao.com/search?q=10%E5%85%83%E5%8C%85%E9%82%AE')
    if value:
        print('Login success')
        fileutil.writeIntoFile('C:/Users/guo_f/Desktop/compare/search_pycurl.html', value)
    else:
        print('error')
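# A hedged sketch of how these callbacks are typically wired up inside a
# DownloadUtil.download(): the signatures above match pycurl's WRITEFUNCTION,
# HEADERFUNCTION and PROGRESSFUNCTION options. The real download() is not shown
# in this collection, so everything below is an illustration.
import io
import pycurl

def _sketch_download(self, url):
    self.b = io.BytesIO()
    self.headerWrite = io.StringIO()
    c = pycurl.Curl()
    c.setopt(pycurl.URL, url)
    c.setopt(pycurl.WRITEFUNCTION, self.contentWriteCallBack)
    c.setopt(pycurl.HEADERFUNCTION, self.headerWriteCallBack)
    c.setopt(pycurl.NOPROGRESS, False)  # progress callbacks are off by default
    c.setopt(pycurl.PROGRESSFUNCTION, self.progressWriteCallBack)
    c.perform()
    c.close()
    return self.b.getvalue()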
def test_is_hidden_hidden_file(self):
    filename = '.filename.txt'
    self.assertTrue(FileUtil.is_hidden(filename))

def is_archive_file(self, f):
    """Return true if file is of a (known) archive file type"""
    return FileUtil.get_extension(f) in self.filetypes['archive']
class SimpleLogAnalyzer:
    def __init__(self, lgConfFile, ptrnConfFile, outFldr, chnksz):
        self.lgConfFile = lgConfFile
        self.ptrnConfFile = ptrnConfFile
        self.outFldr = outFldr
        self.chnksz = chnksz
        self.rep = Report()
        self.fu = FileUtil()

    def getLogFiles(self):
        files = []
        folders = self.fu.readAllLines(self.lgConfFile)
        for fldrName in folders:
            lindx = getLastPathIndx(fldrName)
            fldr = fldrName[0:lindx]
            ptrn = fldrName[lindx + 1:]
            files += self.fu.listDir(fldr, ptrn)
        return files

    def copy(self, rb, chunks, found):
        chnk = chunk()
        chnk.addTag(found)
        chunks.append(chnk)
        for rbel in rb.get():
            if rbel is not None:
                chnk.append(rbel)

    def processBuffer(self, buff, ptrns, offset, chnksz, chnks, rb):
        cLastIndx = len(chnks) - 1
        for line in buff:
            if offset > 0:
                chnks[cLastIndx].append(line)
                found = match(line, ptrns)
                offset -= 1
                if found is not None:
                    chnks[cLastIndx].addTag(found)
            else:
                rb.append(line)
                found = match(line, ptrns)
                if found is not None:
                    self.copy(rb, chnks, found)
                    cLastIndx += 1
                    offset = chnksz
        return offset

    def processFile(self, inFileName, ptrns, chnksz):
        bsz = chnksz * 20
        offset = 0
        chnks = []
        rb = RingBuffer(chnksz)
        buffitr = self.fu.readNextBuffer(inFileName, bsz)
        for buff in buffitr:
            offset = self.processBuffer(buff, ptrns, offset, chnksz, chnks, rb)
        return chnks

    def dumpChnks(self, chnks, outFldr, lgFile):
        i = getLastPathIndx(lgFile)
        outFile = outFldr + lgFile[i:] + ".xml"
        for cn in chnks:
            self.fu.writeContents(outFile, cn.serialize())

    def process(self):
        lgFiles = self.getLogFiles()
        ptrns = self.fu.readAllLines(self.ptrnConfFile)
        if len(lgFiles) > 0 and len(ptrns) > 0:
            for lgFile in lgFiles:
                chnks = self.processFile(lgFile, ptrns, self.chnksz)
                if len(chnks) > 0:
                    self.rep.addMatchedFile(lgFile, chnks)
                    self.dumpChnks(chnks, self.outFldr, lgFile)
                else:
                    self.rep.addUnMatchedFile(lgFile)

    def generateReport(self):
        self.rep.printReport()
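# A minimal usage sketch for SimpleLogAnalyzer: the two config paths, the output
# folder, and the chunk size below are hypothetical; readAllLines/listDir come
# from the FileUtil shown elsewhere in this collection.
if __name__ == '__main__':
    analyzer = SimpleLogAnalyzer('conf/logdirs.txt',   # one "folder/pattern" entry per line
                                 'conf/patterns.txt',  # one match pattern per line
                                 'out/',               # where the *.xml chunk dumps go
                                 10)                   # lines of context kept per chunk
    analyzer.process()         # scan every log file and collect matching chunks
    analyzer.generateReport()  # print matched/unmatched files via Report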
def is_code_file(self, f):
    """Return true if file is of a (known) code file type"""
    return FileUtil.get_extension(f) in self.filetypes['code']

def test_get_extension_missing_extension(self):
    filename = 'filename.'
    self.assertEqual(FileUtil.get_extension(filename), '')

def test_is_dot_dir_double_dot(self):
    filename = '..'
    self.assertTrue(FileUtil.is_dot_dir(filename))