def enumerate_inspect_source_code(folder,
                                  file_pattern=".*[.]((py)|(ipynb))$",
                                  neg_pattern=".*(([-]checkpoint)|(_todo)|(_temp)).*",
                                  line_patterns="from sklearn[_0-9a-zA-Z.]* import ([_a-zA-Z0-9]+);;import sklearn[.]([_a-z]+)",
                                  fullname=False):
    """
    Counts groups extracted from source files.

    We assume all selected files can be opened as text files
    encoded in :epkg:`utf-8` character set.

    @param      folder          folder to dig into
    @param      file_pattern    files to consider
    @param      neg_pattern     negative patterns for filenames
    @param      line_patterns   patterns to look into, separated by ``;;``
    @param      fullname        if True, include the subfolder while checking the regex
    @return                     iterator on dictionaries, one per regex group found
                                (keys: *group*, *name*, *line*, *patid*)
    """
    compiled = [re.compile(pattern) for pattern in line_patterns.split(';;')]
    nb_files = 0
    for name in explore_folder_iterfile(folder, pattern=file_pattern,
                                        neg_pattern=neg_pattern, fullname=fullname):
        nb_files += 1
        try:
            with open(name, "r", encoding="utf-8", errors='ignore') as f:
                for li, line in enumerate(f):
                    for pi, reg in enumerate(compiled):
                        match = reg.search(line)
                        if match is None:
                            continue
                        # one observation per captured group of the pattern
                        for g in match.groups():
                            yield {'group': g, 'name': name,
                                   'line': li, 'patid': pi}
        except UnicodeDecodeError as e:
            raise FileNotFoundError(
                "Unable to process '{0}' due to '{1}'.".format(name, e)) from e
    if nb_files == 0:
        # nothing matched: report both the raw folder content and what the
        # positive pattern alone would have selected, to ease debugging
        found = os.listdir(folder)
        founds = "\n".join(found) if found else "EMPTY"
        pos_found = list(explore_folder_iterfile(
            folder, pattern=file_pattern, fullname=fullname))
        pos_founds = "\n".join(pos_found) if pos_found else "EMPTY"
        mes = "No file found in folder '{0}' with pattern '{1}' (neg='{2}')\n--IN--\n{3}\n--IN--\n{4}"
        raise FileNotFoundError(
            mes.format(folder, file_pattern, neg_pattern, founds, pos_founds))
def test_pyensae_links(self):
    """
    Checks that every notebook relying on
    ``pyensae.datasource.download_data`` points to one of the known
    download locations; the test fails when a notebook misses them all.
    """
    fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")
    this = os.path.join(os.path.dirname(__file__),
                        '..', '..', '_doc', 'notebooks')
    checked = 0
    missed = []
    # known download locations a notebook is expected to reference
    tolook = [
        'http://files.grouplens.org/datasets/movielens/ml-1m.zip',
        'http://www.xavierdupre.fr/',
        'url=\\"http',
        '\\"Skin_NonSkin.txt\\", website=\\"https://archive.ics',
        "website='http://telechargement.insee.fr/fichiersdetail",
        'https://archive.ics.uci.edu/ml/machine-learning-databases'
    ]
    for note in explore_folder_iterfile(this, ".*[.]ipynb$",
                                        ".ipynb_checkpoints", fullname=True):
        with open(note, 'r', encoding='utf-8') as f:
            content = f.read()
        if "from pyensae.datasource import download_data" in content:
            checked += 1
            found = False
            for to in tolook:
                if to in content:
                    found = True
            if not found:
                missed.append(note)
    self.assertGreater(checked, 1)
    # FIX: was assertNotEmpty(missed), which made the test pass only when
    # notebooks DID miss every known link — the opposite of the intent
    # (the twin test in this project asserts the list is empty).
    self.assertEmpty(missed)
def test_convert_notebooks(self):
    """
    Upgrades every notebook below ``_doc/notebooks`` (also clearing
    execution numbers) and below ``_unittests``, logging modified files.
    """
    here = os.path.abspath(os.path.dirname(__file__))
    doc_dir = os.path.normpath(
        os.path.join(here, "..", "..", "_doc", "notebooks"))
    for nbf in explore_folder_iterfile(doc_dir, pattern=".*[.]ipynb"):
        if upgrade_notebook(nbf):
            fLOG("modified", nbf)
        # remove numbers
        remove_execution_number(nbf, nbf)
    ut_dir = os.path.normpath(os.path.join(here, "..", "..", "_unittests"))
    for nbf in explore_folder_iterfile(ut_dir, pattern=".*[.]ipynb"):
        if upgrade_notebook(nbf):
            fLOG("modified", nbf)
def test_pyensae_links(self):
    """
    Verifies that every notebook calling a *pyensae* download helper
    references at least one of the expected data locations.
    """
    fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")
    notebooks_dir = os.path.join(os.path.dirname(__file__),
                                 '..', '..', '_doc', 'notebooks')
    # data locations a downloading notebook is expected to mention
    expected_urls = [
        'http://files.grouplens.org/datasets/movielens/ml-1m.zip',
        'http://www.xavierdupre.fr/',
        'url=\\"http',
        '\\"Skin_NonSkin.txt\\", website=\\"https://archive.ics',
        "website='http://telechargement.insee.fr/fichiersdetail",
        'https://archive.ics.uci.edu/ml/machine-learning-databases'
    ]
    checked = 0
    missed = []
    for note in explore_folder_iterfile(notebooks_dir, ".*[.]ipynb$",
                                        ".ipynb_checkpoints", fullname=True):
        with open(note, 'r', encoding='utf-8') as f:
            content = f.read()
        uses_download = ("datasource import download_data" in content
                         or "pyensae.download_data(" in content)
        if uses_download:
            checked += 1
            if not any(url in content for url in expected_urls):
                missed.append(note)
    self.assertGreater(checked, 1)
    self.assertEmpty(missed)
def test_convert_notebooks(self):
    """
    Runs :func:`upgrade_notebook` on every notebook of the documentation
    folder and of the unit-test folder, logging the modified ones.
    """
    fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")
    here = os.path.abspath(os.path.dirname(__file__))
    # same processing for both notebook locations, in this order
    for parts in (("..", "..", "_doc", "notebooks"),
                  ("..", "..", "_unittests")):
        target = os.path.normpath(os.path.join(here, *parts))
        for nbf in explore_folder_iterfile(target, pattern=".*[.]ipynb"):
            if upgrade_notebook(nbf):
                fLOG("modified", nbf)
def test_convert_notebooks(self):
    """
    Upgrades the documentation notebooks (clearing their execution
    numbers as well) and the unit-test notebooks.
    """
    fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")
    here = os.path.abspath(os.path.dirname(__file__))
    doc_folder = os.path.normpath(
        os.path.join(here, "..", "..", "_doc", "notebooks"))
    for nbf in explore_folder_iterfile(doc_folder, pattern=".*[.]ipynb"):
        modified = upgrade_notebook(nbf)
        if modified:
            fLOG("modified", nbf)
        # remove numbers
        remove_execution_number(nbf, nbf)
    ut_folder = os.path.normpath(
        os.path.join(here, "..", "..", "_unittests"))
    for nbf in explore_folder_iterfile(ut_folder, pattern=".*[.]ipynb"):
        modified = upgrade_notebook(nbf)
        if modified:
            fLOG("modified", nbf)
def enumerate_inspect_source_code(folder,
                                  file_pattern=".*[.]((py)|(ipynb))$",
                                  neg_pattern=".*(([-]checkpoint)|(_todo)|(_temp)).*",
                                  line_patterns="from sklearn[_0-9a-zA-Z.]* import ([_a-zA-Z0-9]+);;import sklearn[.]([_a-z]+)",
                                  fullname=False):
    """
    Counts groups extracted from source files.

    We assume all selected files can be opened as text files
    encoded in :epkg:`utf-8` character set.

    @param      folder          folder to dig into
    @param      file_pattern    files to consider
    @param      neg_pattern     negative patterns for filenames
    @param      line_patterns   patterns to look into, separated by ``;;``
    @param      fullname        if True, include the subfolder while checking the regex
    @return                     iterator on dictionaries, one per regex group found
                                (keys: *group*, *name*, *line*, *patid*)
    """
    regs = [re.compile(reg) for reg in line_patterns.split(';;')]
    nb = 0
    for name in explore_folder_iterfile(folder, pattern=file_pattern,
                                        neg_pattern=neg_pattern, fullname=fullname):
        nb += 1
        try:
            with open(name, "r", encoding="utf-8", errors='ignore') as f:
                for li, line in enumerate(f):
                    for pi, reg in enumerate(regs):
                        r = reg.search(line)
                        if r:
                            # one observation per captured group
                            for g in r.groups():
                                obs = dict(group=g, name=name, line=li)
                                obs['patid'] = pi
                                yield obs
        except UnicodeDecodeError as e:
            # FIX: chain the original exception (`from e`) so the decoding
            # error is kept in the traceback instead of being discarded.
            raise FileNotFoundError(
                "Unable to process '{0}' due to '{1}'.".format(name, e)) from e
    if nb == 0:
        # nothing matched: report both the raw folder content and what the
        # positive pattern alone selects, to ease debugging
        found = os.listdir(folder)
        founds = "\n".join(found) if found else "EMPTY"
        pos_found = list(explore_folder_iterfile(
            folder, pattern=file_pattern, fullname=fullname))
        pos_founds = "\n".join(pos_found) if pos_found else "EMPTY"
        mes = "No file found in folder '{0}' with pattern '{1}' (neg='{2}')\n--IN--\n{3}\n--IN--\n{4}"
        raise FileNotFoundError(mes.format(
            folder, file_pattern, neg_pattern, founds, pos_founds))
def enumerate_group_files(self, group):
    """
    Enumerates all files in a group.

    @param      group       group name, or None to enumerate the files of
                            every group in ``self.Groups``
    @return                 iterator on files
    """
    if group is None:
        # walk every known group recursively
        for g in self.Groups:
            # idiom: `yield from` replaces the manual `for _ in ...: yield _`
            yield from self.enumerate_group_files(g)
    else:
        loc = self.get_group_location(group)
        yield from explore_folder_iterfile(loc)
def execute_python_scripts(root, df, col_names=None, url=None, eol="/", fLOG=noLOG, gen_mail=None):
    """
    Retrieves all :epkg:`python` scripts and runs them.

    @param      root        main folder
    @param      df          dataframe
    @param      col_names   dictionary for columns:
                            folder, mail, program, out, err, url,
                            cmp, url_content, key, time
    @param      url         url pattern formatted with a mail id, or None
    @param      eol         if not None, replaces end of lines by *eol*
    @param      gen_mail    generator of mails
    @param      fLOG        logging function
    @return                 dataframe
    """
    # FIX: the documented default is None but every access below uses
    # col_names.get(...), which raised AttributeError; an empty mapping
    # preserves the intended behavior (use the default column names).
    if col_names is None:
        col_names = {}

    if gen_mail is None:
        def iter_mail(mail):
            yield mail
            yield mail.lower()
        gen_mail = iter_mail

    def post_process(out, eol):
        # Normalizes a program output: strips border whitespace, removes
        # carriage returns, replaces tabs, optionally replaces newlines.
        out = out.strip("\r\t\n").rstrip().replace("\r", "").replace("\t", " ")
        if eol:
            out = out.replace("\n", eol)
        return out

    downloads = {}
    res = []
    for name, mail in zip(df[col_names.get("folder", "folder")],
                          df[col_names.get("mail", "mail")]):
        row = {col_names.get("folder", "folder"): name}
        fLOG("[execute_python_script], look into '{0}'".format(name))
        subf = os.path.join(root, name)
        col_find = col_names.get("exists", "exists")
        if not os.path.exists(subf):
            # second chance: '-' may have been stored as '.'
            subf = os.path.join(root, name.replace("-", "."))
        if not os.path.exists(subf):
            row[col_find] = False
            res.append(row)
        else:
            row[col_find] = True
            store = list(explore_folder_iterfile(subf, ".*[.]py$"))
            fLOG(" -", len(store), "programs found")
            col_out = col_names.get("out", "out")
            col_err = col_names.get("err", "err")
            col_prog = col_names.get("program", "program")
            col_time = col_names.get("time", "time")
            col_key = col_names.get("key", "key")
            col_size = col_names.get("size", "size")
            col_url = col_names.get("url", "url")
            col_ind = col_names.get("pattern_id", "pattern_id")
            if len(store) == 0:
                # no script: still emit one row per mail variant
                for mm in sorted(gen_mail(mail.strip())):
                    mailid = _get_code(mm.encode("utf-8"))
                    r = row.copy()
                    loc = url.format(mailid)
                    ind = {col_key: mm, col_ind: mailid, col_url: loc}
                    r.update(ind)
                    res.append(r)
                continue
            # test all programs
            outs = []
            for py in sorted(store):
                cmd = '"{0}" "{1}"'.format(sys.executable, py)
                # FIX: time.clock() was removed in Python 3.8;
                # perf_counter measures the same elapsed duration.
                t1 = time.perf_counter()
                try:
                    out, err = run_cmd(cmd, wait=True)
                except Exception as e:
                    out = None
                    err = str(e)
                # FIX: post_process crashed on None when run_cmd raised;
                # keep None so the failure branch below can detect it.
                out = post_process(out, eol) if out is not None else None
                t2 = time.perf_counter()
                outs.append({
                    col_out: out,
                    col_err: post_process(err, eol),
                    col_prog: os.path.split(py)[-1],
                    col_time: t2 - t1,
                    col_size: os.stat(py).st_size
                })
            if url is None:
                for o in outs:
                    r = row.copy()
                    r.update(o)
                    res.append(r)
            else:
                col_cmp = col_names.get("cmp", "cmp")
                col_in = col_names.get("sortie_dans_motif", "sortie_dans_motif")
                col_in2 = col_names.get("motif_dans_sortie", "motif_dans_sortie")
                col_dist = col_names.get("dist", "dist")
                col_content = col_names.get("content", "content")
                # NOTE(review): `out` here is the result of the LAST script in
                # the loop above — this mirrors the original control flow.
                if out is None:
                    # last program failed: emit rows without comparison
                    # FIX: gen_mail yields plain strings; the previous
                    # 'for _, mm in gen_mail(...)' tuple unpacking could
                    # not work on them.
                    for mm in gen_mail(mail.strip()):
                        mailid = _get_code(mm.encode("utf-8"))
                        ind = {col_ind: mailid}
                        for o in outs:
                            r = row.copy()
                            r.update(o)
                            r.update(ind)
                            res.append(r)
                else:
                    for mm in sorted(gen_mail(mail.strip())):
                        mailid = _get_code(mm.encode("utf-8"))
                        loc = url.format(mailid)
                        ind = {col_key: mm, col_ind: mailid, col_url: loc}
                        if loc not in downloads:
                            # cache each url so it is fetched only once
                            downloads[loc] = get_url_content_timeout(
                                loc).strip("\n\r\t ")
                        content = post_process(downloads[loc], eol)
                        ind[col_content] = content
                        for o in outs:
                            r = row.copy()
                            r.update(o)
                            r.update(ind)
                            out = r[col_out]
                            r[col_cmp] = out == content or out.strip(
                            ) == content.strip()
                            r[col_in] = out in content
                            r[col_in2] = content in out
                            # full edit distance only when lengths are
                            # comparable, otherwise the length gap is used
                            r[col_dist] = (edit_distance(out, content)[0]) if (
                                len(content) > len(out) // 2) else abs(len(content) - len(out))
                            res.append(r)
    return pandas.DataFrame(res)
from pyquickhelper.loghelper import fLOG # publish_lectures fLOG(OutputPrint=True) ######################################### # import des fonctions dont on a besoin from pyquickhelper.filehelper import synchronize_folder, explore_folder_iterfile ######################################## # récupération des répertoires compilés via un serveur jenkins fLOG("Digging into ", root) sub = os.path.join("_doc", "sphinxdoc", "build", "html", "index.html") index = [] pattern = "^index.html$" done = {} for name in explore_folder_iterfile(root, pattern): if name.endswith(sub): pack = name[:len(name) - len(sub) - 1] parent, spl = os.path.split(pack) if "_UT_" in spl: parent, spl = os.path.split(parent) if "_UT_" in spl: raise ValueError("Something is weird with: '{0}'".format(name)) index.append((spl, os.path.dirname(name))) if spl in done: raise ValueError("Duplicated package '{0}'.\n{1}".format( spl, "\n".join("{0}={1}".format(k, v) for k, v in sorted(done.items())))) fLOG("Found {0} directories".format(len(index))) for ind in index:
from pyquickhelper.loghelper import fLOG  # publish_lectures

fLOG(OutputPrint=True)

#########################################
# import the helper functions we need
from pyquickhelper.filehelper import synchronize_folder, explore_folder_iterfile

########################################
# collect the folders compiled through a jenkins server
# (`root` is defined elsewhere in this script)
fLOG("Digging into ", root)
# relative path of the generated documentation entry point
sub = os.path.join("_doc", "sphinxdoc", "build", "html", "index.html")
index = []
pattern = "^index.html$"
done = {}
for name in explore_folder_iterfile(root, pattern):
    if name.endswith(sub):
        # package folder = path minus the trailing index.html sub-path
        pack = name[:len(name) - len(sub) - 1]
        parent, spl = os.path.split(pack)
        if "_UT_" in spl:
            # skip one level for unit-test builds
            parent, spl = os.path.split(parent)
        if "_UT_" in spl:
            raise ValueError("Something is weird with: '{0}'".format(name))
        index.append((spl, os.path.dirname(name)))
        if spl in done:
            raise ValueError("Duplicated package '{0}'.\n{1}".format(
                spl, "\n".join("{0}={1}".format(k, v) for k, v in sorted(done.items()))))
        # FIX: `done` was never populated, so the duplicate-package check
        # above was dead code; record each package once it is accepted.
        done[spl] = name
fLOG("Found {0} directories".format(len(index)))
for ind in index:
    fLOG("  ", ind)
def execute_python_scripts(root, df, col_names=None, url=None, eol="/", fLOG=noLOG, gen_mail=None):
    """
    Retrieves all python scripts and runs them.

    @param      root        main folder
    @param      df          dataframe
    @param      col_names   dictionary for columns:
                            folder, mail, program, out, err, url,
                            cmp, url_content, key, time
    @param      url         url pattern formatted with a mail id, or None
    @param      eol         if not None, replaces end of lines by *eol*
    @param      gen_mail    generator of mails
    @param      fLOG        logging function
    @return                 dataframe
    """
    # FIX: col_names defaults to None but is dereferenced with .get
    # everywhere below; an empty dict keeps the default column names.
    if col_names is None:
        col_names = {}

    if gen_mail is None:
        def iter_mail(mail):
            yield mail
            yield mail.lower()
        gen_mail = iter_mail

    def post_process(out, eol):
        # Normalizes a program output before comparison.
        out = out.strip("\r\t\n").rstrip().replace(
            "\r", "").replace("\t", " ")
        if eol:
            out = out.replace("\n", eol)
        return out

    downloads = {}
    res = []
    for name, mail in zip(df[col_names.get("folder", "folder")],
                          df[col_names.get("mail", "mail")]):
        row = {col_names.get("folder", "folder"): name}
        fLOG("[execute_python_script], look into '{0}'".format(name))
        subf = os.path.join(root, name)
        col_find = col_names.get("exists", "exists")
        if not os.path.exists(subf):
            # second chance: '-' may have been stored as '.'
            subf = os.path.join(root, name.replace("-", "."))
        if not os.path.exists(subf):
            row[col_find] = False
            res.append(row)
        else:
            row[col_find] = True
            store = list(explore_folder_iterfile(subf, ".*[.]py$"))
            fLOG(" -", len(store), "programs found")
            col_out = col_names.get("out", "out")
            col_err = col_names.get("err", "err")
            col_prog = col_names.get("program", "program")
            col_time = col_names.get("time", "time")
            col_key = col_names.get("key", "key")
            col_size = col_names.get("size", "size")
            col_url = col_names.get("url", "url")
            col_ind = col_names.get("pattern_id", "pattern_id")
            if len(store) == 0:
                # no script: still emit one row per mail variant
                for mm in sorted(gen_mail(mail.strip())):
                    mailid = _get_code(mm.encode("utf-8"))
                    r = row.copy()
                    loc = url.format(mailid)
                    ind = {col_key: mm, col_ind: mailid, col_url: loc}
                    r.update(ind)
                    res.append(r)
                continue
            # test all programs
            outs = []
            for py in sorted(store):
                cmd = '"{0}" "{1}"'.format(sys.executable, py)
                # FIX: time.clock() was removed in Python 3.8.
                t1 = time.perf_counter()
                try:
                    out, err = run_cmd(cmd, wait=True)
                except Exception as e:
                    out = None
                    err = str(e)
                # FIX: post_process crashed on None when run_cmd raised.
                out = post_process(out, eol) if out is not None else None
                t2 = time.perf_counter()
                outs.append({col_out: out, col_err: post_process(err, eol),
                             col_prog: os.path.split(py)[-1],
                             col_time: t2 - t1,
                             col_size: os.stat(py).st_size})
            if url is None:
                for o in outs:
                    r = row.copy()
                    r.update(o)
                    res.append(r)
            else:
                col_cmp = col_names.get("cmp", "cmp")
                col_in = col_names.get(
                    "sortie_dans_motif", "sortie_dans_motif")
                col_in2 = col_names.get(
                    "motif_dans_sortie", "motif_dans_sortie")
                col_dist = col_names.get("dist", "dist")
                col_content = col_names.get("content", "content")
                # NOTE(review): `out` is the result of the LAST script run
                # above — this mirrors the original control flow.
                if out is None:
                    # last program failed: emit rows without comparison
                    # FIX: gen_mail yields plain strings; the previous
                    # 'for ii, mm in gen_mail(...)' unpacking could not work.
                    for mm in gen_mail(mail.strip()):
                        mailid = _get_code(mm.encode("utf-8"))
                        ind = {col_ind: mailid}
                        for o in outs:
                            r = row.copy()
                            r.update(o)
                            r.update(ind)
                            res.append(r)
                else:
                    for mm in sorted(gen_mail(mail.strip())):
                        mailid = _get_code(mm.encode("utf-8"))
                        loc = url.format(mailid)
                        ind = {col_key: mm, col_ind: mailid, col_url: loc}
                        if loc not in downloads:
                            # cache each url so it is fetched only once
                            downloads[loc] = get_url_content_timeout(
                                loc).strip("\n\r\t ")
                        content = post_process(downloads[loc], eol)
                        ind[col_content] = content
                        for o in outs:
                            r = row.copy()
                            r.update(o)
                            r.update(ind)
                            out = r[col_out]
                            r[col_cmp] = out == content or out.strip(
                            ) == content.strip()
                            r[col_in] = out in content
                            r[col_in2] = content in out
                            # full edit distance only when lengths are
                            # comparable, otherwise the length gap is used
                            r[col_dist] = (edit_distance(out, content)[0]) if (
                                len(content) > len(out) // 2) else abs(len(content) - len(out))
                            res.append(r)
    return pandas.DataFrame(res)
outs.append(out) print('[csharpy.dotnet] OUT') print(out) # Copy files. from pyquickhelper.filehelper import explore_folder_iterfile dest = os.path.join('csharpy', 'binaries', version2) if not os.path.exists(dest): os.makedirs(dest) init = os.path.join(dest, "__init__.py") if not os.path.exists(init): with open(init, 'w') as f: pass must_copy = {'DynamicCS': 0, 'CSharPyExtension': 0} copied = 0 for name in explore_folder_iterfile(folder, pattern='.*[.]((dll)|(so))$'): full = os.path.join(folder, name) if version2 in full: short_name = os.path.split(os.path.splitext(name)[0])[-1] if short_name in must_copy: must_copy[short_name] += 1 copied += 1 print("[csharpy.copy] '{0}'".format(name)) shutil.copy(name, dest) else: # print("[csharpy.skip] '{0}'".format(name)) pass min_must_copy = min(must_copy.values()) if copied == 0 or min_must_copy == 0: raise RuntimeError( "Missing binaries in '{0}' for version='{1}'".format(