def get_tokenized_training_data(self):
    """Walk the scraped training corpus, tokenize each article, and pickle the token lists."""
    tokenized_data = []
    lemmatized_data = []
    print("Started tokenized data ...")
    logging.debug(self.TRAINING_DATA_ROOT_DIRECTORY)
    top_directory = '/home/agniv/Desktop/data-science/telegraph_scraper/'
    top_directory = top_directory + self.TRAINING_DATA_ROOT_DIRECTORY
    with working_directory(top_directory):
        datewise_directories = sorted(os.listdir(top_directory))
        for datewise_directory in datewise_directories:
            datewise_directory = top_directory + datewise_directory
            #logging.debug('DATE: ' + datewise_directory)
            with working_directory(datewise_directory):
                pagewise_directories = sorted(os.listdir(datewise_directory))
                for pagewise_directory in pagewise_directories:
                    pagewise_directory = datewise_directory + '/' + pagewise_directory
                    #logging.debug('PAGE: ' + pagewise_directory)
                    with working_directory(pagewise_directory):
                        newsfiles = sorted(os.listdir(pagewise_directory))
                        for newsfile in newsfiles:
                            #logging.debug('HEADING: ' + newsfile)
                            with open(newsfile, "r") as content_file:
                                file_content = content_file.read()
                            text = re.sub(r'\W+', ' ', file_content)
                            all_words = self.tokenize(text, stop_words)
                            words = []
                            for word in all_words:
                                if len(word) < 3:
                                    # logging.debug("Small words not added: " + word)
                                    continue
                                if re.match(r'^[0-9]', word):
                                    # logging.debug("Words starting with a number not added: " + word)
                                    continue
                                words.append(word)
                            #for word in words:
                            tokenized_data.append(words)
    #logging.debug(tokenized_data)
    #logging.debug(len(tokenized_data))
    #lemmatized_data = self.lemmatize(tokenized_data, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
    #logging.debug(lemmatized_data)
    #logging.debug(len(lemmatized_data))
    with open('tokenized_data.pkl', 'wb') as f:
        pickle.dump(tokenized_data, f)
    return 'FROM TRAINING DATA'
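# Illustrative follow-up sketch (not part of the class above, and an assumption
# about how the pickle is consumed): the token lists written to
# 'tokenized_data.pkl' can be loaded back and turned into the id2word
# dictionary that get_tokenized_test_data expects, using gensim.
import pickle
from gensim import corpora

with open('tokenized_data.pkl', 'rb') as f:
    tokenized_data = pickle.load(f)

id2word = corpora.Dictionary(tokenized_data)                 # maps token -> integer id
corpus = [id2word.doc2bow(doc) for doc in tokenized_data]    # bag-of-words per article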
def repack_epub():
    # Assumes pwd is *not* unpack directory.
    msg(f'.. Packing new {epub}.')
    with ZipFile(epub, 'w') as z:
        with working_directory(temp_dir):
            for f in os.listdir('.'):
                if f in ['..', '.']:
                    continue
                zip_add(z, f, f)
def test_eglob():
    with TemporaryDirectory() as path:
        for d in ('one', 'two', 'three', 'four/five', 'six/seven/eight'):
            os.makedirs(os.path.join(path, fix_path(d)))
        for f in ('one/foo.py', 'one/foo.txt', 'two/bar.c', 'four/test.py',
                  'four/test2.py', 'four/me.txt', 'four/five/x.py',
                  'six/seven/test.py'):
            with open(os.path.join(path, fix_path(f)), 'w'):
                pass

        from grizzled.os import working_directory
        with working_directory(path):
            expected = {
                'one/foo.py', 'four/test.py', 'four/test2.py',
                'four/five/x.py', 'six/seven/test.py'
            }
            res = set(eglob('**/*.py'))
            assert res == expected
def list_recursively(dir: str,
                     *,
                     include_files: bool = True,
                     include_dirs: bool = True) -> Generator[str, None, None]:
    """
    Recursively list the contents of a directory. Yields the contents of
    the directory and all subdirectories. This method returns a generator,
    so it evaluates its recursive walk lazily. This function is just a
    simple wrapper around `os.walk`.

    Each yielded value is a partial path, relative to the original directory.

    **Parameters**

    - `dir` (`str`): Path to directory to list
    - `include_files` (`bool`): Whether or not to yield files. `True` by default.
    - `include_dirs` (`bool`): Whether or not to yield directories. `True` by default.

    **Yields**

    partial paths of all directories and/or files below the specified directory

    **Raises**

    `ValueError`: If `dir` does not exist, or if `dir` exists but is not
    a directory.
    """
    if not _os.path.isdir(dir):
        raise ValueError("{0} is not a directory.".format(dir))

    from grizzled.os import working_directory
    with working_directory(dir):
        for dirpath, dirnames, filenames in _os.walk('.'):
            if include_dirs:
                for d in dirnames:
                    yield _os.path.normpath(_os.path.join(dirpath, d))
            if include_files:
                for f in filenames:
                    yield _os.path.normpath(_os.path.join(dirpath, f))
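# Illustrative usage sketch for list_recursively (the directory path below is
# hypothetical): yield only the files under a directory, as paths relative to
# that directory.
for relative_path in list_recursively('/path/to/project', include_dirs=False):
    print(relative_path)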
def executecode(self):
    """Compile the submitted Java class, dex it, and tell the selected bot to download and run it."""
    def jsonmayfail(jsono):
        try:
            return json.loads(jsono)
        except ValueError:
            return jsono

    def subst(org):
        return org.replace("__VERSION__", str(self.version))

    def formatoutput(output):
        time = strftime(CLIENTTIMEFMT, gmtime())
        return {"output": output, "time": time}

    output = []
    with working_directory("jwork"):
        self.version += 1
        realclassname = subst(request.json["classname"])

        # Remove artifacts from the previous build, recording anything that fails.
        cleanup = []
        for f in glob("de/tudarmstadt/botnet/janus_yanai/*") + ["class.jar", "dexed.jar", "../static/dexed.jar"]:
            try:
                os.remove(f)
            except OSError as e:
                cleanup.append([e.filename, e.strerror])
        if cleanup != []:
            output.append({"cleanup": cleanup})

        try:
            f = open("de/tudarmstadt/botnet/janus_yanai/" + realclassname + ".java", 'w')
            f.write(subst(request.json["code"]))
            f.close()
        except OSError as e:
            output.append({"write code to file": str(e)})
            return formatoutput(output)

        compileanddex = OrderedDict()
        try:
            # for i in ["javac -cp ~/Desktop/android-sdk-linux_x86/platforms/android-10/android.jar de/tudarmstadt/botnet/janus_yanai/*", "jar cf class.jar de", "~/Desktop/android-sdk-linux_x86/platform-tools/dx --dex --output dexed.jar class.jar"]:
            for i in ["javac -cp ~/Downloads/android-sdk-linux/platforms/android-10/android.jar de/tudarmstadt/botnet/janus_yanai/*",
                      "jar cf class.jar de",
                      "~/Downloads/android-sdk-linux/platform-tools/dx --dex --output dexed.jar class.jar"]:
                o = six.u(check_output(i, stderr=subprocess.STDOUT, shell=True))
                if o != "":
                    compileanddex[i] = o  # record this command's output
        except subprocess.CalledProcessError as e:
            output.append({"compile and dex": compileanddex})
            output.append({"stdout": e.output})
            output.append({"error": str(e)})
            return formatoutput(output)
        if compileanddex != {}:
            output.append({"compile and dex": compileanddex})

        os.symlink(os.path.join(os.getcwd(), "dexed.jar"), "../static/dexed.jar")

        ip = self.statuses[request.json["serial"]].ip
        port = self.statuses[request.json["serial"]].port
        if ip == "127.0.0.1":
            myip = "10.0.2.2"
        else:
            myip = six.u(check_output("ifconfig")).split("\n")[1].split()[1][5:]

        st = self.sendbotcmd(ip, port,
                             "download http://" + myip + ":" +
                             str(cherrypy.config["server.socket_port"]) + "/static/dexed.jar")
        output.append({"downloading": jsonmayfail(st)})
        if st == "timed out":
            return formatoutput(output)

        st = self.sendbotcmd(ip, port,
                             "run de.tudarmstadt.botnet.janus_yanai." + realclassname)
        output.append({"running": jsonmayfail(st)})
        return formatoutput(output)
def fix_epub(epub, book_title, temp_dir):
    '''
    Make some adjustments to the generated tables of contents in the ePub,
    removing empty elements and removing items matching the book title.

    Parameters:

    epub:       The path to the epub file
    book_title: The book title
    temp_dir:   Temporary directory to use for unpacking
    '''
    from zipfile import ZipFile, ZIP_DEFLATED
    from xml.dom import minidom
    from grizzled.os import working_directory

    rm_rf(temp_dir, silent=True)

    def zip_add(zf, path, zippath):
        '''Swiped from zipfile module.'''
        if os.path.isfile(path):
            zf.write(path, zippath, ZIP_DEFLATED)
        elif os.path.isdir(path):
            if zippath:
                zf.write(path, zippath)
            for nm in os.listdir(path):
                zip_add(zf, os.path.join(path, nm), os.path.join(zippath, nm))

    def unpack_epub():
        # Assumes pwd is *not* unpack directory.
        msg(f'.. Unpacking {epub}.')
        with ZipFile(epub) as z:
            z.extractall(temp_dir)

    def repack_epub():
        # Assumes pwd is *not* unpack directory.
        msg(f'.. Packing new {epub}.')
        with ZipFile(epub, 'w') as z:
            with working_directory(temp_dir):
                for f in os.listdir('.'):
                    if f in ['..', '.']:
                        continue
                    zip_add(z, f, f)

    def strip_text_children(element):
        for child in element.childNodes:
            if type(child) == minidom.Text:
                element.removeChild(child)

    def get_text_children(element):
        text = None
        if element:
            s = ''
            for child in element.childNodes:
                if child and (type(child) == minidom.Text):
                    s += child.data.strip()
            text = s if s else None
        return text

    def fix_toc_ncx(toc):
        # Assumes pwd *is* unpack directory
        msg(f'.. Reading table of contents file "{toc}".')
        with open(toc) as f:
            toc_xml = f.read()

        msg('.. Adjusting table of contents.')
        with minidom.parse(toc) as dom:
            nav_map = dom.getElementsByTagName('navMap')
            if not nav_map:
                abort('Malformed table of contents: No <navMap>.')
            nav_map = nav_map[0]
            for p in nav_map.getElementsByTagName('navPoint'):
                text_nodes = p.getElementsByTagName('text')
                text = None
                if text_nodes:
                    text = get_text_children(text_nodes[0])
                if (not text) or (text == book_title):
                    nav_map.removeChild(p)

            # Renumber the nav points.
            for i, p in enumerate(nav_map.getElementsByTagName('navPoint')):
                num = i + 1
                p.setAttribute('id', f'navPoint-{num}')

            # Strip any text nodes from the navmap.
            strip_text_children(nav_map)

            # Write it out.
            with open(toc, 'w') as f:
                dom.writexml(f)

    def fix_nav_xhtml(toc):
        # Assumes pwd *is* unpack directory
        msg(f'.. Reading table of contents file "{toc}".')
        with open(toc) as f:
            toc_xml = f.read()

        msg('.. Adjusting table of contents.')
        with minidom.parse(toc) as dom:
            navs = dom.getElementsByTagName('nav')
            nav = None
            for n in navs:
                if not n.hasAttributes():
                    continue
                a = n.attributes.get('id')
                if not a:
                    continue
                if a.value == 'toc':
                    nav = n
                    break
            else:
                abort('Malformed table of contents: No TOC <nav>.')

            ol = nav.getElementsByTagName('ol')
            if (not ol) or (len(ol) == 0):
                abort('Malformed table of contents: No list in <nav>.')
            ol = ol[0]
            for li in ol.getElementsByTagName('li'):
                a = li.getElementsByTagName('a')
                if not a:
                    abort('Malformed table of contents: No <a> in <li>.')
                a = a[0]
                text = get_text_children(a)
                if (not text) or (text == book_title):
                    ol.removeChild(li)

            # Renumber the list items
            for i, li in enumerate(ol.getElementsByTagName('li')):
                num = i + 1
                li.setAttribute('id', f'toc-li-{num}')

            # Strip any text nodes from the ol.
            strip_text_children(ol)

            # Write it out.
            with open(toc, 'w') as f:
                dom.writexml(f)

    # Main logic
    try:
        unpack_epub()
        with ensure_dir(temp_dir):
            with working_directory(temp_dir):
                for toc, func in (('toc.ncx', fix_toc_ncx),
                                  ('nav.xhtml', fix_nav_xhtml)):
                    if not os.path.exists(toc):
                        msg(f'.. No {toc} file. Skipping it.')
                        continue
                    func(toc)
        repack_epub()
    finally:
        #rmtree(temp_dir)
        pass
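# Illustrative call sketch (the ePub name, title, and temp directory are
# hypothetical): fix_epub unpacks the archive, cleans up toc.ncx and nav.xhtml,
# and repacks the ePub in place.
fix_epub('book.epub', 'My Book Title', '/tmp/epub-fix')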
def piconfig(tag, filename):
    """Change into the `tag` subdirectory of the current directory and feed the named argument file to piconfig."""
    with working_directory(str(os.path.join(os.getcwd(), tag))):
        os.system('piconfig < ' + filename + '.txt')
def get_tokenized_test_data(self, id2word):
    """Walk the scraped test corpus, tokenize each article, and report its top LDA topic using the saved model."""
    logging.debug('STARTED GET_TOKENIZED_TEST_DATA')
    temp_file = datapath("saved-model")
    saved_lda_model = models.LdaModel.load(temp_file)
    tokenized_data = []
    #lemmatized_data = []
    print("Started tokenized data ...")
    logging.debug(self.TEST_DATA_ROOT_DIRECTORY)
    top_directory = '/home/agniv/Desktop/data-science/telegraph_scraper/'
    top_directory = top_directory + self.TEST_DATA_ROOT_DIRECTORY
    with working_directory(top_directory):
        datewise_directories = sorted(os.listdir(top_directory))
        for datewise_directory in datewise_directories:
            datewise_directory = top_directory + datewise_directory
            #logging.debug('DATE: ' + datewise_directory)
            with working_directory(datewise_directory):
                pagewise_directories = sorted(os.listdir(datewise_directory))
                for pagewise_directory in pagewise_directories:
                    pagewise_directory = datewise_directory + '/' + pagewise_directory
                    #logging.debug('PAGE: ' + pagewise_directory)
                    with working_directory(pagewise_directory):
                        newsfiles = sorted(os.listdir(pagewise_directory))
                        for newsfile in newsfiles:
                            #logging.debug('HEADING: ' + newsfile)
                            with open(newsfile, "r") as content_file:
                                file_content = content_file.read()
                            text = re.sub(r'\W+', ' ', file_content)
                            all_words = self.tokenize(text, stop_words)
                            words = []
                            for word in all_words:
                                if len(word) < 3:
                                    # logging.debug("Small words not added: " + word)
                                    continue
                                if re.match(r'^[0-9]', word):
                                    # logging.debug("Words starting with a number not added: " + word)
                                    continue
                                words.append(word)
                            #logging.debug(words)
                            #print(words)
                            bow = id2word.doc2bow(words)
                            sorted_topic_list = sorted(saved_lda_model[bow],
                                                       key=lambda x: x[1],
                                                       reverse=True)
                            top_topic = sorted_topic_list[:1]
                            (idx, value) = top_topic[0]
                            top_topic_str = str(saved_lda_model.print_topic(idx, 5))
                            top_topic_keywords = re.findall(r'"([^"]*)"', top_topic_str)
                            top_topic_probabilities = re.findall(r"\d+\.\d+", top_topic_str)
                            logging.debug('FILENAME: ' + pagewise_directory + '/' + newsfile)
                            print('FILENAME: ' + pagewise_directory + '/' + newsfile)
                            logging.debug('TOPICS: ' + str(top_topic_keywords))
                            print('TOPICS: ' + str(top_topic_keywords))
                            logging.debug('TOPIC PROBABILITIES: ' + str(top_topic_probabilities))
                            print('TOPIC PROBABILITIES: ' + str(top_topic_probabilities))
                            #for word in words:
                            #tokenized_data.append(words)
    #logging.debug(tokenized_data)
    #logging.debug(len(tokenized_data))
    #lemmatized_data = self.lemmatize(tokenized_data, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
    #logging.debug(lemmatized_data)
    #logging.debug(len(lemmatized_data))
    #with open('tokenized_data.pkl', 'wb') as f:
    #    pickle.dump(tokenized_data, f)
    return 'FROM TEST DATA'
from grizzled.os import working_directory

with working_directory('..'):
    from critter import Critter
    from food import Food
    from world import World
    from biology import BioAssumptions


class ScaryPredator(Critter):
    DESCRIPTION = "A critter that can eat other critters"
    pass
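# All of the snippets above rely on grizzled.os.working_directory to change
# into a directory for the duration of a `with` block and restore the previous
# working directory afterwards. A minimal equivalent sketch (an illustration,
# not grizzled's actual implementation) looks like this:
import os
from contextlib import contextmanager

@contextmanager
def chdir_temporarily(path):
    previous = os.getcwd()
    os.chdir(path)          # enter the target directory
    try:
        yield path
    finally:
        os.chdir(previous)  # always restore the original directory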