class TreePipeline(object):
    def open_spider(self, spider):
        self.tree = Tree()
        self.tree.create_node("root", "root")

    def process_item(self, item, spider):
        lst = item['text']
        lst = [x.strip() for x in [y.replace('...', '') for y in lst]]
        item['pagetitle'] = item['pagetitle'].replace('...', '')
        lst[-1] = item['pagetitle']
        for idx, elem in enumerate(lst):
            if idx == 0:
                previous = "root"
            else:
                previous = "|".join(lst[:idx])
            elem = "|".join(lst[:idx + 1])
            # elem = elem.replace('...', '')
            elem = elem.encode('utf-8').decode('utf-8')
            if not self.tree.contains(elem):
                print("Adding node %s" % elem)
                self.tree.create_node(elem, elem, parent=previous)
        # self.tree.show()
        return item

    def close_spider(self, spider):
        self.tree.show()
        with open(makepath('data/cats/tree.json'), 'w') as outfile:
            outfile.write(self.tree.to_json())
        self.tree.save2file(makepath('data/cats/tree.tree'))
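# 'makepath' is used by TreePipeline above but is not defined in the snippet.
# A minimal sketch of what it presumably does (the name and behavior are
# assumptions, not taken from the original project): create the parent
# directory if needed and return the path unchanged.
import os

def makepath(path):
    """Ensure the parent directory of 'path' exists, then return 'path'."""
    os.makedirs(os.path.dirname(path), exist_ok=True)
    return path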
def visualizeCtree(c_tree):
    # Build the tree bottom-up, level by level.
    tree = Tree()
    tree.create_node("root", "root")
    levels = sorted(c_tree.keys())
    for level in levels:
        for node_id, cluster in c_tree[level].items():
            node_id = "{}.{}".format(level, node_id)
            tree.create_node("{}".format(cluster["pattern"]), node_id, parent="root")
            if level == 0:
                # Leaf level: attach the raw log entries.
                for data in cluster["data"]:
                    tree.create_node("log.{}".format(data), "log.{}".format(data), parent=node_id)
            else:
                # Higher levels: re-parent the clusters of the level below.
                for data in cluster["data"]:
                    tree.move_node("{}.{}".format(level - 1, data), node_id)
    tree.show()
    tree.save2file("./tree")

## How to use
## 1. for log in logs: updateCTree, updatePatterns
## 2. get the c_tree
## 3. train on all the logs to get the max level and estimate the best level to keep
## 4. train the clusters at different levels
## Note: synchronization of the unmerged list still needs to be handled; in production,
## consider keeping the c_tree in Redis and flushing it to the database at the same time.
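# The 'c_tree' structure consumed by visualizeCtree() is not shown in the
# snippet. From the way it is indexed, it appears to be a dict keyed by level,
# then by cluster id, where each cluster holds a "pattern" label and a "data"
# list (log indices at level 0, child-cluster ids at higher levels). The values
# below are purely illustrative.
example_c_tree = {
    0: {
        0: {"pattern": "connect from <*>", "data": [0, 3]},
        1: {"pattern": "disconnect from <*>", "data": [1, 2]},
    },
    1: {
        0: {"pattern": "<*> from <*>", "data": [0, 1]},  # children are level-0 cluster ids
    },
}
visualizeCtree(example_c_tree)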
def print_prob_val(self, fname="OutTree.txt"):
    # Copy the tree and rewrite each tag as "tag - value - probability"
    # before saving it to a file.
    n_tree = Tree(tree=self.tree)
    for node in n_tree.nodes:
        node = n_tree.get_node(node)
        node.tag = "{tag} - {val} - {prob}".format(
            tag=node.tag, val=node.data[0], prob=node.data[1])
    n_tree.save2file(fname)
    self.tree = None
def show_tree_of_riad_group(riad_group, mopdb):
    from treelib import Node, Tree

    tree = Tree()

    ## Combine ORG RIAD codes and GH RIAD codes in one DF;
    ## the DF is used as input to get MFI data.
    riad_as_input = riad_group[["ORG_RIAD_CODE", "ORG_ORGUNIT_NAME"]].copy()
    tmp = riad_group[["GH_RIAD_CODE", "GH_ORGUNIT_NAME"]].copy()
    tmp.rename(columns={
        "GH_RIAD_CODE": "ORG_RIAD_CODE",
        "GH_ORGUNIT_NAME": "ORG_ORGUNIT_NAME"
    }, inplace=True)
    riad_as_input = riad_as_input.append(tmp)
    riad_as_input.drop_duplicates(inplace=True)
    riad_as_input.reset_index(drop=True, inplace=True)

    mfi_obj = mfis(riad_as_input, mopdb)
    mfi_data = mfi_obj.data

    # Root node: the group head.
    tree.create_node(
        riad_group["GH_ORGUNIT_NAME"][0],
        riad_group["GH_RIAD_CODE"][0],
        data=mfi(mfi_data[mfi_data['RIAD_CODE'] == riad_group["GH_RIAD_CODE"][0]]))

    i = 0
    for index, row in riad_group.iterrows():
        i = i + 1
        # if i == 500:
        #     tree.show(data_property="summary", line_type="ascii-em")
        #     break
        try:
            tree.create_node(
                row["ORG_ORGUNIT_NAME"],
                row["ORG_RIAD_CODE"],
                parent=row["DP_RIAD_CODE"],
                data=mfi(mfi_data[mfi_data['RIAD_CODE'] == row["ORG_RIAD_CODE"]]))
        except Exception:
            # The direct parent is not in the tree yet: add it first.
            missing_dp_id = row["DP_RIAD_CODE"]
            add_missing_node(tree, riad_group, missing_dp_id, mfi_data)

    # Truncate the output file, then render the tree to stdout and to the file.
    f = open('D:/tree.txt', "w+", encoding="utf8")
    f.write("")
    f.close()
    tree.show(data_property="summary", line_type="ascii-em")
    tree.save2file(filename='D:/tree.txt', data_property="summary", line_type="ascii-em")

    f = open('D:/tree.txt', "r", encoding="utf8")
    contents = f.readlines()
    f.close()
    return contents
def generatetree(lista):
    tree = Tree()  # create the tree
    # Add the dashboard templates directory as the root node.
    tree.create_node("calificador/dashboard/templates/dashboard", "raiz")
    for i in lista:
        # Add the file to the tree if it is not there yet.
        if tree.get_node(i[0]) is None:
            tree.create_node(i[0], i[0], parent="raiz")
    for i in lista:
        # Add the image to the tree, under its file, if it is not there yet.
        if tree.get_node(i[1]) is None:
            tree.create_node(i[1], i[1], parent=i[0])
    tree.save2file("imagenes.txt")
def build_tree(tree_dict):
    """ Build ASCII tree and upload it to S3. """
    try:
        os.chdir(tempfile.gettempdir())
        tree = Tree()
        for aws in tree_dict:
            aws_key = aws
            tree.create_node(aws, aws_key)
            for region in tree_dict.get(aws):
                region_key = aws_key + region
                tree.create_node(region, region_key, parent=aws_key)
                for service in tree_dict.get(aws).get(region):
                    service_key = region_key + service
                    tree.create_node(service, service_key, parent=region_key)
                    for resource_type in tree_dict.get(aws).get(region).get(service):
                        resource_type_key = service_key + resource_type
                        tree.create_node(resource_type, resource_type_key, parent=service_key)
                        for resource in tree_dict.get(aws).get(region).get(service).get(resource_type):
                            resource_key = resource_type_key + resource
                            tree.create_node(resource, resource_key, parent=resource_type_key)
        try:
            _, temp_file = tempfile.mkstemp()
            tree.save2file(temp_file)
            client = boto3.client('s3')
            bucket = os.environ['RESOURCETREEBUCKET']
            key = 'resource_tree_%s.txt' % datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
            client.upload_file(temp_file, bucket, key)
            logging.info("Resource tree has been built and uploaded to S3 's3://%s/%s'." % (bucket, key))
        finally:
            os.remove(temp_file)
    except Exception:
        logging.critical(str(sys.exc_info()))
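# build_tree() above walks a four-level nested mapping. Its exact contents are
# not shown in the snippet; the shape it expects appears to be
# account -> region -> service -> resource type -> iterable of resource names.
# The values below are made up purely to illustrate that shape.
example_tree_dict = {
    "aws-account-123": {
        "eu-west-1": {
            "ec2": {
                "instances": ["i-0123456789abcdef0"],
                "security_groups": ["sg-0123456789abcdef0"],
            },
            "s3": {
                "buckets": ["my-example-bucket"],
            },
        },
    },
}
# build_tree(example_tree_dict)  # requires AWS credentials and RESOURCETREEBUCKET to be set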
def generateTree(informacion):
    # print(informacion)
    tree = Tree()  # create the tree
    tree.create_node("calificador/dashboard/models", "raiz")  # create the root node
    for i in informacion.keys():  # walk the dictionary keys
        if tree.get_node(i) is None:
            tree.create_node(i, i, parent="raiz")  # add each key as a child of the root node
    for i in informacion.keys():
        lis = informacion[i]  # list of mentions for this key
        for j in lis:  # walk the list
            # If the file (index 0 of the tuple) is not in the tree yet, add it as a child node.
            if tree.get_node(j[0]) is None:
                tree.create_node(j[0], j[0], parent=i)
        for k in lis:  # walk the list again
            # If the mention (index 1 of the tuple) is not in the tree yet, add it with the file as its parent.
            if tree.get_node(k[1]) is None:
                tree.create_node(k[1], k[1], parent=k[0])
    tree.save2file("Modulos.txt")  # save the tree to a file
def build_tree(source, target_dir):
    id_to_value = dict()
    parent_to_child = defaultdict(set)
    child_to_parent = defaultdict(set)

    with open(source, "r", encoding="utf-8") as sf:
        content = json.loads(sf.read())
    datas = content.get("@graph")

    for idx, data in enumerate(datas):
        if (idx + 1) % 100 == 0:
            print(f"index: {idx}")
        index = data.get("@id")
        value = data.get("label").get("@value")
        id_to_value[index] = value
        parents = data.get("subClassOf")
        if isinstance(parents, str):
            parent_to_child[parents].add(index)
            child_to_parent[index].add(parents)
        elif isinstance(parents, list):
            for parent in parents:
                parent_to_child[parent].add(index)
                child_to_parent[index].add(parent)

    root_set = get_root(child_to_parent)
    root_list = sorted(list(root_set))
    for root in root_list:
        print(id_to_value.get(root))

    target_dir = Path(target_dir)
    os.makedirs(target_dir)
    for root in root_list:
        tree = Tree()
        find_set = set()
        tree.create_node(id_to_value.get(root), root)
        tree = traverse_tree(tree, root, parent_to_child, id_to_value, find_set)
        tree.show()
        target_file = target_dir / f"{id_to_value.get(root)}.txt"
        tree.save2file(target_file)
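# get_root() and traverse_tree() are called above but not included in the
# snippet. Minimal sketches consistent with the call sites (names and exact
# behavior are assumptions): a root is any parent that never appears as a
# child, and traverse_tree() adds children recursively, using 'find_set' to
# avoid revisiting nodes that are reachable through several parents.
def get_root(child_to_parent):
    all_parents = {p for parents in child_to_parent.values() for p in parents}
    return {p for p in all_parents if p not in child_to_parent}

def traverse_tree(tree, node, parent_to_child, id_to_value, find_set):
    find_set.add(node)
    for child in sorted(parent_to_child.get(node, ())):
        if child in find_set:
            continue
        tree.create_node(id_to_value.get(child), child, parent=node)
        traverse_tree(tree, child, parent_to_child, id_to_value, find_set)
    return tree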
def show_cluster_tree(root):
    """
    Display the cluster tree.

    :param root: root ClusterTreeNode.
    :return: None
    :raise None
    """
    tree = Tree()
    root_api_node = tree.create_node('[' + str(root.nid) + ']')

    def create_node(node, api_parent=None):
        """Internal helper that creates the treelib node for a cluster node."""
        tag = '[' + str(node.nid) + ']'
        if node.data != '':
            tag += ': ' + node.data
        tag += '(' + str(node.distance) + ')'
        api_node = tree.create_node(tag, parent=api_parent)
        _left = node.left
        _right = node.right
        if _left is None and _right is None:
            return
        if _left:
            create_node(_left, api_node)
        if _right:
            create_node(_right, api_node)

    left = root.left
    right = root.right
    if left:
        create_node(left, root_api_node)
    if right:
        create_node(right, root_api_node)

    tree.show()
    tree.save2file(WORKING_DIR + 'cluster-tree.txt')
class ParentChildEvaluate:
    """
    Class to perform intrinsic evaluation of embeddings using the hierarchical
    relation of parent/child domains:

    1) parse ParentChildTreeFile.txt from InterPro
    2) for each child of root:
           nn = ask embeddings model to give M nearest neighbors
           calculate_precision_atM(child.descendants, nn)
           calculate_recall_atN(child.descendants, nn)
    3) plot histogram of precision and recall

    # Credits: https://medium.com/@m_n_malaeb/recall-and-precision-at-k-for-recommender-systems-618483226c54
    """

    def __init__(self, data_path):
        """
        ParentChildEvaluate class init

        Parameters
        ----------
        data_path : str
            full data path

        Returns
        -------
        None
        """
        print("ParentChildEvaluate")
        self.data_path = data_path
        self.tree = Tree()

    def get_model_name(self):
        """
        Get embedding model name

        Returns
        -------
        str
            embedding model name
        """
        return ntpath.basename(self.model_file)

    def load_emb_model(self, model_file, is_model_binary):
        """
        Load embedding model

        Parameters
        ----------
        model_file : str
            model file name
        is_model_binary : bool
            model is saved in binary format (True), otherwise (False)

        Returns
        -------
        None
        """
        self.model_file = model_file
        self.emb_model = KeyedVectors.load_word2vec_format(model_file, binary=is_model_binary)

    def parse_parent_child_file(self, parent_child_file_name, out_path,
                                output_file_name, save_parsed_tree=False):
        """
        Parse the parent child file

        Parameters
        ----------
        parent_child_file_name : str
            parent child file name
        out_path : str
            output data path
        output_file_name : str
            output file name
        save_parsed_tree : bool
            after parsing save parsed tree (True), otherwise (False)

        Returns
        -------
        None
        """
        previous_num_minus_signs = 0
        last_interpro_id = None
        self.tree.create_node("INTERPRO", "INTERPRO")
        current_parent = "INTERPRO"

        with open(parent_child_file_name, 'r') as parent_child_file:
            for line in parent_child_file:
                line = line.strip()
                current_num_minus_signs = line[0:line.find("IPR")].count("--")
                double_colon_split = line.strip("--").split("::")
                interpro_id = double_colon_split[0]
                assert interpro_id[0:3] == "IPR", \
                    "AssertionError: {} \n interpro id should start with IPR and has length of 9.".format(interpro_id)
                if current_num_minus_signs == 0:
                    # assert child not in the tree
                    current_parent = "INTERPRO"
                    self.tree.create_node(interpro_id, interpro_id, parent=current_parent)
                else:
                    # check if you are still with the current parent or you need to create a new one
                    if current_num_minus_signs == previous_num_minus_signs:
                        # same level as last parent
                        self.tree.create_node(interpro_id, interpro_id, parent=current_parent)
                    elif current_num_minus_signs > previous_num_minus_signs:
                        # one level down from last parent -> create new parent
                        current_parent = last_interpro_id
                        self.tree.create_node(interpro_id, interpro_id, parent=current_parent)
                    else:
                        # one level up from last parent -> get parent of the current parent
                        if current_parent == "INTERPRO":
                            # if one level up is the root then your papa is the root
                            papa = "INTERPRO"
                        else:
                            # if one level up is not the root then get the parent of your parent (papa)
                            papa = self.tree[current_parent].bpointer
                        self.tree.create_node(interpro_id, interpro_id, parent=papa)
                        current_parent = papa
                previous_num_minus_signs = current_num_minus_signs
                last_interpro_id = interpro_id

        # quick test
        # for interpro_node in self.tree.children("IPR000549"):
        #     print(interpro_node.identifier)
        # self.tree.show()
        if save_parsed_tree:
            self.tree.save2file(filename=os.path.join(out_path, output_file_name))

    def get_nn_calculate_precision_recall_atN(self, N, plot_histograms, save_diagnostics):
        """
        Get the nearest domain vectors for each domain and calculate recall based on
        the ground truth (parsed tree)

        Parameters
        ----------
        N : int
            number of nearest domain vectors; if N == 100 then retrieve as many
            as the children of a domain in the parsed tree
        plot_histograms : bool
            plot histograms for performance metrics (True), otherwise (False)
        save_diagnostics : bool
            save diagnostic plots for domains with low recall

        Returns
        -------
        None
        """
        print("Get NN and calculate precision and recall at {}".format(N))
        recalls_n = []
        precisions_n = []
        interpros_recall0 = []
        interpros_num_children_recall0 = []
        if N == 100:
            retrieve_all_children = True
        else:
            retrieve_all_children = False

        for interpro_node in self.tree.children("INTERPRO"):
            recall_n = 0.0
            precision_n = 0.0
            all_children = self.tree.subtree(interpro_node.identifier).all_nodes()
            assert interpro_node in all_children, \
                "AssertionError: parent {} is not in the set of all children.".format(interpro_node.identifier)
            all_children.remove(interpro_node)
            if retrieve_all_children:
                N = len(all_children)
            if self.emb_model.__contains__(interpro_node.identifier):
                nearest_neighbor_ids = set([
                    nn[0] for nn in self.emb_model.most_similar(
                        positive=interpro_node.identifier, topn=N)
                ])
            else:
                print("Model does not contain this id.")
                continue
            true_positives = set([child.identifier for child in all_children]).intersection(nearest_neighbor_ids)
            assert len(all_children) > 0 and len(nearest_neighbor_ids) == N, \
                "AssertionError: For parent {} all children should be > 0 and nearest neighbors should be equal to N.".format(interpro_node.identifier)
            recall_n = len(true_positives) / len(all_children)
            precision_n = len(true_positives) / len(nearest_neighbor_ids)
            assert 0.0 <= recall_n <= 1.0 and 0.0 <= precision_n <= 1.0, \
                "AssertionError: For parent {} recall or precision is not at (0,1]".format(interpro_node.identifier)
            recalls_n.append(recall_n)
            precisions_n.append(precision_n)
            if recall_n == 0.0:
                interpros_recall0.append(interpro_node.identifier)
                interpros_num_children_recall0.append(len(all_children))

        if retrieve_all_children:  # for printing in title
            N = 100
        if plot_histograms:
            if retrieve_all_children:
                self.plot_histogram(recalls_n, "Recall", "Recall",
                                    "Number of Interpro domains", "recall")
            else:
                self.plot_histogram(recalls_n, "Recall@{}".format(N), "Recall",
                                    "Number of Interpro domains", "recall_{}".format(N))
            self.plot_histogram(precisions_n, "Precision@{}".format(N), "Precision",
                                "Number of Interpro domains", "precision_{}".format(N))
        if retrieve_all_children:
            avg_recall = sum(recalls_n) / len(recalls_n)
            print("Average recall at 100: {:.3f}".format(avg_recall))
        if save_diagnostics:
            self.save_diagnostics_recall0(interpros_recall0, interpros_num_children_recall0)

    def save_diagnostics_recall0(self, interpros_recall0, interpros_num_children_recall0):
        """
        Save diagnostics histogram for domains with recall of 0

        Parameters
        ----------
        interpros_recall0 : list of str
            interpro ids with recall 0
        interpros_num_children_recall0 : list of int
            number of children of each interpro id with recall 0, found from the parsed tree

        Returns
        -------
        None
        """
        print("Saving diagnostics for interpro domains with recall 0")
        with open(os.path.join(self.data_path, self.get_model_name() + "_interpros_recall0" + ".txt"),
                  "w") as interpros_recall0_file:
            # write file with names of interpro domains having recall 0
            interpros_recall0_file.write("\n".join(interpros_recall0))
        # plot histogram of number of children for interpro parents with recall 0
        self.plot_histogram(interpros_num_children_recall0, None,
                            "Number of Interpro domains", "Number of children", "hist")

    def plot_histogram(self, performance_N, title, xlabel, ylabel, out_suffix):
        """
        Plot histogram for a performance metric and also for the number of children

        Parameters
        ----------
        performance_N : list of float
            performance metric value per parent domain
        title : str
            histogram title (if not None)
        xlabel : str
            label x
        ylabel : str
            label y
        out_suffix : str
            histogram output file name suffix

        Returns
        -------
        None
        """
        # plot the histogram of lengths
        fig = plt.figure()
        plt.hist(performance_N, color='g', align='left', edgecolor='k', alpha=0.8)
        plt.xlabel(xlabel, fontsize=14)
        plt.ylabel(ylabel, fontsize=14)
        if title is not None:
            plt.title(title, fontsize=14)
        plt.xticks(np.arange(0, 1.1, 0.1))
        hist_name = self.get_model_name() + "_" + out_suffix + ".png"
        fig.savefig(os.path.join(self.data_path, hist_name), bbox_inches='tight', dpi=600)
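# A minimal usage sketch for ParentChildEvaluate, inferred from the method
# signatures above. The file names and paths are placeholders, not values from
# the original project.
evaluator = ParentChildEvaluate(data_path="data/")
evaluator.load_emb_model("domain_vectors.txt", is_model_binary=False)
evaluator.parse_parent_child_file("ParentChildTreeFile.txt", out_path="data/",
                                  output_file_name="parsed_tree.txt",
                                  save_parsed_tree=True)
evaluator.get_nn_calculate_precision_recall_atN(N=100, plot_histograms=True,
                                                save_diagnostics=True)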
from treelib import Node, Tree

roots = find_root_nodes()
nodes_with_gold = [
    bag_to_obj[bag].desc for bag in bag_to_obj if bag_to_obj[bag].contains_gold
]

# for root in roots:
#     if root.desc not in nodes_with_gold:
#         continue
#
#     tree = Tree()
#
#     tree.create_node(root.desc, "0")
#     root.create_node(tree, "0", "0", True)
#
#     # tree.show()
#     tree.save2file("trees.txt")

for root in roots:
    if root.desc not in nodes_with_gold:
        continue

    tree = Tree()
    tree.create_node(root.desc, root.desc)
    root.create_node(tree, root.desc, True)
    tree.save2file("trees.txt")
class GitTool(object):
    def __init__(self, parent_path, shells, build_tree=False, log=None):
        """Initialize the working directory and the command to run.

        :parameter parent_path: working directory
        :parameter shells: shell command to execute
        :parameter build_tree: whether to build the tree-style navigation
        :parameter log: log file
        """
        self._directory = parent_path
        self._unix_shell = shells
        self._log_file = log
        self._tree = None
        self._build_tree = build_tree

    def get_build_tree(self):
        return self._build_tree

    def set_build_tree(self, value):
        self._build_tree = value

    build_tree = property(get_build_tree, set_build_tree)

    def _print(self, info=''):
        if self._log_file:
            os.system("echo %s >> %s" % (info, self._log_file))
        else:
            print(info)

    def run_work(self):
        """Run the configured command against the configured directories."""
        # Create the log file if the given path does not exist yet.
        if self._log_file:
            dir_name = os.path.dirname(self._log_file)
            if not os.path.exists(dir_name):
                os.makedirs(dir_name)
            if not os.path.exists(self._log_file):
                os.mknod(self._log_file)

        def build_tree(target_path):
            """Create the root tree node.

            :param target_path: target directory
            """
            if not self._build_tree:
                return
            self._tree = Tree()
            parent_name = os.path.basename(target_path)
            self._tree.create_node(parent_name, parent_name)

        def exist_node(sub_name):
            """Return a node id that is not in the tree yet, renaming if needed.

            :param sub_name: candidate node id.
            """
            if not self._build_tree:
                return sub_name
            nid = 0
            while self._tree.contains(sub_name):
                sub_name = '_'.join((sub_name, str(nid)))
                nid += 1
            return sub_name

        def report_tree(target_path, out_file=True):
            """Output the file tree.

            :param target_path: target directory.
            :param out_file: write to a file instead of stdout.
            """
            if not self._build_tree:
                return
            if out_file:
                report_file = os.path.basename(target_path.strip(os.path.sep))
                self._tree.save2file('%s.txt' % report_file)
            else:
                self._tree.show()

        def process_target_path(target_path, target_tag=None):
            """Run the command against the given directory.

            :param target_path: target directory
            :param target_tag: tag for the tree node of this directory
            """
            # Check that the path exists.
            if not os.path.exists(target_path):
                self._print("Directory does not exist!")
                return
            parent_name = os.path.basename(target_path) if not target_tag else target_tag
            # Walk the Git repositories under this directory.
            for i in os.listdir(target_path):
                sub_path = os.path.join(target_path, i)
                sub_name = os.path.basename(sub_path)
                # sub_path is treated as a Git repository when it is a directory
                # that contains a .git directory.
                git_path = os.path.join(sub_path, ".git")
                if os.path.isdir(sub_path):
                    sub_name = exist_node(sub_name)
                    if self._build_tree:
                        self._tree.create_node(sub_name, sub_name, parent=parent_name)
                    if os.path.exists(git_path) and os.path.isdir(git_path):
                        start_info = "Starting: %(sub_dir)s %(ph)s" % {
                            'sub_dir': i,
                            'ph': "." * (80 - len(i) - 1)
                        }
                        self._print(start_info)
                        os.system(self._unix_shell % sub_path)
                        self._print()
                    else:
                        process_target_path(sub_path, sub_name)

        if isinstance(self._directory, six.string_types):
            build_tree(self._directory)
            process_target_path(self._directory)
            report_tree(self._directory)
        elif isinstance(self._directory, (tuple, list)):
            for path in self._directory:
                build_tree(path)
                process_target_path(path)
                report_tree(path)
        else:
            pass

        self._print("Ok,All work is done!\r")

    def __call__(self):
        if self._log_file:
            now_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            self._print("%s %s %s" % ("=" * 35, now_time, "=" * 35))
        self.run_work()
tree.create_node("Curlie Site", "curlie") #Create the root node, with "curlie" as ID for category in categories: #Loop through all the main categories tree.create_node(category.text, category.text, parent="curlie") #Create a node with as a name the name of the category, and as ID the name, and as parent the root node urls.append( (category.get_attribute("href"), category.text)) #Fill a list of couples of categorie's link and name for url in urls: #Loop through all the links saved previously driver.get( url[0] ) #Get the link of the category, and navigate through it using the webdriver parentId = url[1] #Get the name of the category categories = driver.find_elements_by_xpath( '//section[@class="children"]/div/div[@class="cat-item"]/a') #Get all the children categories of the current category for category in categories: #Loop through all the categories tree.create_node(category.text, parentId + category.text, parent=parentId) #Create a node with as a name the name of the category, and as ID the name of the father node appended with the category name (I explain why in the attached document), and as parent the root node urls.append((category.get_attribute("href"), parentId + category.text)) #Add to the same list the new category link and name, so the program will pass through it too tree.show() #Display the final tree tree.save2file('tree.txt') #Save the structure into a text file
class DataCrawler():
    USERNAME = "******"
    PASSWORD = "******"
    LOGIN_URL = "https://www.tgwiki.com/CookieAuth.dll?GetLogon?curl=Z2F&reason=0&formdir=9"
    URL = "https://www.tgwiki.com"
    DIRECTORY = ["RootFolder"]
    URL_suffix = ".aspx"
    EXCEPTION_MENU_ITEM = "Service Level Agreement"
    browser = None
    dataTree = None

    def __init__(self):
        # self.browser = webdriver.Chrome()
        self.array = []
        self.dataTree = Tree()
        self.dataTree.create_node("Homepage", "homepage/", data=self.URL)

    def login(self):
        self.browser.get(self.LOGIN_URL)
        username = self.browser.find_element_by_id('username')
        username.send_keys(self.USERNAME)
        password = self.browser.find_element_by_id('password')
        password.send_keys(self.PASSWORD)
        self.browser.find_element_by_id('SubmitCreds').click()
        self.browser.find_element_by_xpath('//a[@href="/department"]').click()
        self.browser.find_element_by_xpath('//a[@href="/department/citd"]').click()
        html = self.browser.page_source
        return html

    def get_HTML_From_URL(self, url):
        print("Accessing " + str(url))
        self.browser.get(url)
        html = self.browser.page_source
        return html

    def get_menu(self, soup):
        result = soup.find(class_="menu vertical menu-vertical")
        result_in_static = result.findAll("li", class_="static")
        for ele in result_in_static:
            # print("-------------------------------------------------------")
            name = ele.find(class_="menu-item-text")
            inner_ele = ele.findAll("li", class_="dynamic")
            link = None
            parentID = "homepage/"
            if (inner_ele == []):
                # print(name.get_text())  # FOR DEBUGGING
                link_tag, link = self.parseLink(ele)
                self.dataTree.create_node(name.get_text(),
                                          parentID + name.get_text().lower() + "/",
                                          data=link,
                                          parent=parentID)
                if (self.isDirectory(link)):
                    _html = self.get_HTML_From_URL(link)
                    soup = BeautifulSoup(_html, "lxml")
                    self.parseTable(soup, parentID + link_tag.get_text().lower() + "/")
            # else:  # FOR DEBUGGING
            #     print(name.get_text())  # FOR DEBUGGING
            if (name.get_text() == "Technology Update"):
                print("")
                # self.dataTree.show(idhidden=False)
            for small_ele in inner_ele:
                parentID = "homepage/" + name.get_text().lower() + "/"
                link_tag, link = self.parseLink(small_ele, _parent=parentID)
                print("CHECKING IF " + str(link) + " IS DIRECTORY...")
                # self.dataTree.show()
                # self.dataTree.create_node(small_ele.get_text(), small_ele.get_text().lower(), data=link, parent=name.get_text().lower())
                if (self.isDirectory(link)):
                    _html = self.get_HTML_From_URL(link)
                    soup = BeautifulSoup(_html, "lxml")
                    self.parseTable(soup, parentID + link_tag.get_text().lower() + "/")
            # print("-------------------------------------------------------")

    def parseLink(self, soup_result, _parent=None):
        print("parseLink")
        link_tag = soup_result.a
        link = None
        if (link_tag != None):
            link = link_tag.get('href')
            # print(link_tag.get_text())
            # print(link)
            if (link[0] == '/'):
                link = self.URL + link
            if (_parent != None):
                print("############")
                print("Tag: " + str(link_tag))
                print("Text: " + link_tag.get_text())
                print("Link: " + str(link))
                print("Parent: " + str(_parent))
                print("############")
                # self.dataTree.show(idhidden=False)
                # if(link_tag.get_text() == self.EXCEPTION_MENU_ITEM):
                #     if(self.dataTree.contains(self.EXCEPTION_MENU_ITEM.lower())):
                #         print("dfgdfgdgdfgdfgd")
                #         return link_tag, link
                try:
                    self.dataTree.create_node(str(link_tag.get_text()),
                                              _parent + str(link_tag.get_text().lower() + "/"),
                                              data=link,
                                              parent=_parent)
                except treelib.tree.DuplicatedNodeIdError:
                    print("duplicated")
                    return link_tag, link
                # if(self.isDirectory(temp)):
                #     _html = self.get_HTML_From_URL(temp)
                #     soup = BeautifulSoup(_html, "lxml")
                #     self.parseTable(soup, link.get_text().lower())
        return link_tag, link

    def parseTable(self, soup_result, _parent=None):
        print("-------------------------------------------------------")
        print("parseTable")
        try:
            table_list = soup_result.findAll("table")
            for table in table_list:
                if (table.has_attr("summary")):
                    table_body = table.find('tbody')
                    row_list = table_body.findAll('td', attrs={"class": "ms-vb-title"})
                    for x in range(0, len(row_list)):
                        link_tag, link = self.parseLink(row_list[x], _parent)
                        if (self.isDirectory(link)):
                            _html = self.get_HTML_From_URL(link)
                            soup = BeautifulSoup(_html, "lxml")
                            self.parseTable(soup, _parent + link_tag.get_text().lower() + "/")
                    break
        except AttributeError as e:
            print(e)
        print("-------------------------------------------------------")

    def isDirectory(self, link):
        if (link == None):
            return False
        isDirectory = False
        if (self.URL_suffix == link[-5:]):
            return True
        elif (link[-1:] == "/"):
            print("dfasfdasfafafsdfdf")
            return True
        for directory in self.DIRECTORY:
            if (directory in link):
                isDirectory = True
        return isDirectory

    def writeToJSONFile(self, path, fileName, data):
        filePathNameWExt = './' + path + '/' + fileName + '.json'
        with open(filePathNameWExt, 'w') as fp:
            json.dump(data, fp)

    def main(self):
        _html = self.login()
        soup = BeautifulSoup(_html, "lxml")
        self.get_menu(soup)
        self.dataTree.show()
        tree_in_dict = self.dataTree.to_json(with_data=True)
        tree_in_json = json.dumps(tree_in_dict, indent=4, sort_keys=True, ensure_ascii=False)
        self.writeToJSONFile('./', 'training', tree_in_json)
        self.dataTree.save2file('tree_diagram.json')

    # def process_node(self, node):
    #     if(node.)

    def test(self):
        file_directory = "./ITSM_training.json"
        json_data = open(file_directory).read()
        data = json.loads(json_data)
        hello = node.Tree(tree=data)
        print(hello)
class FacultyPagesFilteredSpider(scrapy.Spider):
    name = 'faculty_pages_filtered'
    allowed_domains = [
        'cmu.edu', 'cornell.edu', 'washington.edu', 'gatech.edu',
        'princeton.edu', 'utexas.edu', 'illinois.edu', 'berkeley.edu',
        'mit.edu', 'stanford.edu'
    ]
    count = 0
    record = {}
    start_urls = [
        'https://www.cmu.edu/', 'https://www.cornell.edu/',
        'https://www.washington.edu/', 'https://www.gatech.edu/',
        'https://www.princeton.edu/', 'https://www.utexas.edu/',
        'https://illinois.edu/', 'https://www.berkeley.edu/',
        'https://www.mit.edu/', 'https://www.stanford.edu/'
    ]
    exclude_words = [
        'news', 'events', 'publications', 'pub', 'gallery', 'category',
        'courses', 'students', 'references', 'reference', 'software',
        'softwares', 'tags', 'tutorials', 'workshop', 'festival', 'admissions',
        'exhibitions', 'alumni', 'lectures', 'undergraduate', 'about',
        'history', 'awards', 'ranking', 'enrollment', 'graduate', 'archive',
        'stories', 'post', 'pages', 'magazine', 'curriculum', '404', 'faqs',
        'engage', 'campaign', 'career', 'resources', 'services', 'network',
        'security', 'donate', 'giving', 'finance', 'forms', 'policies',
        'policy', 'alphabetical', 'summer', 'winter', 'spring', 'autumn',
        'fall', 'health', 'facilities', 'facility', 'wp', 'information',
        'general', 'catalog', 'guides', 'library', 'publish', 'blog',
        'collection', 'share', 'search', 'periodicals', 'bookstore', 'store',
        'product', 'organisation', 'webstore', 'funding', 'pdf'
    ]
    rules = [Rule(LinkExtractor(unique=True), callback='parse', follow=True)]
    # count_limits = {"page_count": 200, "item_count": 200}

    def __init__(self):
        self.tree = Tree()
        self.tree.create_node("root", "root")
        self.tree.create_node("unknown", "unknown", parent="root")
        self.bio_identifier = BioIdentifier(model="bio-model")
        for dom in self.allowed_domains:
            domain = dom.split('.')[0]
            if not os.path.exists('Crawled_Data'):
                os.makedirs('Crawled_Data')
            folder_name = 'Crawled_Data/' + domain.capitalize() + '_University_Files'
            self.record[domain] = 0
            if not os.path.exists(folder_name):
                os.makedirs(folder_name)

    def parse(self, response):
        matched_domain = [x for x in self.allowed_domains if x in response.url]
        if len(matched_domain) > 0:
            domain = matched_domain[0].split('.')[0]
            folder_name = 'Crawled_Data/' + domain.capitalize() + '_University_Files'
            self.record[domain] = self.record.get(domain, 0) + 1
            if self.record[domain] % 50 == 0:
                print('\n Crawled {} Bio-pages of {} University ...'.format(
                    self.record[domain], domain.capitalize()))
                self.tree.save2file(folder_name + "/00__" + str(self.record[domain]) + "_tree.txt")

            isBio = self.bio_identifier.is_bio_html_content(response.xpath('//*').get())
            if isBio:
                text = BeautifulSoup(response.xpath('//*').get(),
                                     features="html.parser").get_text()
                tokens = nltk.word_tokenize(text)
                normalized_text = ' '.join([word for word in tokens if word.isalnum()])
                normalized_text += '\n' + response.url
                hash_text = hashlib.md5(response.url.encode())
                file_name = hash_text.hexdigest()
                with open(folder_name + "/" + file_name + ".txt", "w",
                          encoding="utf-8") as file:
                    file.write(normalized_text)

            AllLinks = LinkExtractor(allow_domains=domain + '.edu',
                                     unique=True).extract_links(response)
            for n, link in enumerate(AllLinks):
                if not any([x in link.url for x in self.exclude_words]):
                    if self.tree.get_node(link.url) == None:
                        referer = response.request.headers.get('Referer', None)
                        if referer == None:
                            self.tree.create_node(link.url, link.url, parent='root')
                        else:
                            referer = referer.decode("utf-8")
                            if self.tree.contains(referer):
                                self.tree.create_node(link.url, link.url, parent=referer)
                            else:
                                self.tree.create_node(link.url, link.url, parent='unknown')
                        yield scrapy.Request(url=link.url, callback=self.parse)
# -*- coding: utf-8 -*-
from treelib import Tree

tree = Tree()
tree.create_node("Harry", "harry")  # root node
tree.create_node("Jane", "jane", parent="harry")
tree.create_node("Bill", "bill", parent="harry")
tree.create_node("Diane", "diane", parent="jane")
tree.create_node("Mary", "mary", parent="diane")
tree.create_node("Mark", "mark", parent="jane")
tree.save2file('tree.txt')
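# For reference, save2file() writes the same ASCII rendering that show() prints.
# With default settings, tree.txt for the tree above should look roughly like
# this (child ordering can differ between treelib versions):
#
# Harry
# ├── Bill
# └── Jane
#     ├── Diane
#     │   └── Mary
#     └── Mark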
## Create the family tree
tree = Tree()
tree.create_node("Harry", "harry")  # root node
tree.create_node("Jane", "jane", parent="harry")
tree.create_node("Bill", "bill", parent="harry")
tree.create_node("Diane", "diane", parent="jane")
tree.create_node("George", "george", parent="diane")
tree.create_node("Mary", "mary", parent="diane")
tree.create_node("Jill", "jill", parent="george")
tree.create_node("Mark", "mark", parent="jane")

print("#" * 4 + "Breakdown of our family")
tree.show(cmp=lambda x, y: cmp(x.tag, y.tag), key=None, reverse=True)
# tree.show(key=lambda x: x.tag, reverse=False)
tree.save2file("/home/chenxm/Desktop/tree.txt", idhidden=False)
print('\n')

print("#" * 4 + "All family members in DEPTH mode")
for node in tree.expand_tree(mode=Tree.DEPTH):
    print(tree[node].tag)
print('\n')

print("#" * 4 + "All family members without Diane sub-family")
tree.show(idhidden=False, filter=lambda x: x.identifier != 'diane')
# for node in tree.expand_tree(filter=lambda x: x.identifier != 'diane', mode=Tree.DEPTH):
#     print(tree[node].tag)
print('\n')
tax_tree = Tree()
tax_tree.create_node('root', 'root')

filea = open(sys.argv[1], 'r')
head = filea.readline()
for line in filea.readlines():
    line = line.strip()
    arr = line.split('\t')
    upper = 'root'
    for i in range(1, len(arr)):
        if arr[i] == "NA":
            continue
        if arr[i] not in tax:
            tax.append(arr[i])
            tax_tree.create_node(arr[i], arr[i], parent=upper, data=arr[i])
        upper = arr[i]

tax_tree.save2file(sys.argv[2])
# print(tax_tree.to_dict())

# def convert_dict_to_stdjson(tree_dict):
#     js = '"name":root,"children":'
#     for key in tree_dict:


def to_dict(tree, nid=None, key=None, sort=True, reverse=False, with_data=False):
    """Transform the whole tree into a dict."""
domain = tldextract.extract(url).domain
subdomain = tldextract.extract(url).subdomain
if not (tree.contains(domain)):
    tree.create_node(domain, domain, parent="ID of root node")  # add domains to the root node
if subdomain:
    tree.create_node(subdomain, subdomain + domain, parent=domain)  # add sub-domains to the domain node

file.close()
tree.show(line_type="ascii-emv")  # show the tree on stdout
tree.to_graphviz(filename="tree_graphviz")  # dump the tree as Graphviz

# dot xxx -Tps -o test.ps -Grankdir=LR
# The -Grankdir=LR option lays the tree out from left to right.
subprocess.call(["dot", "tree_graphviz", "-Tps", "-o", "output.ps", "-Grankdir=LR"])

# convert -flatten -density 150 -geometry 100% test.ps test.png
subprocess.call(["convert", "-flatten", "-density", "150", "-geometry", "100%",
                 "output.ps", "tree_graphviz.png"],
                stderr=subprocess.DEVNULL)  # convert the Graphviz output to PNG

# rm -rf tree_graphviz output.ps
subprocess.call(["rm", "-rf", "tree_graphviz", "output.ps"])  # clean up intermediate files

if os.path.exists("output.txt"):  # dump the tree as a text file
    subprocess.call(["rm", "-rf", "output.txt"])
tree.save2file('output.txt', line_type="ascii-emv")

with open('output.json', 'w') as f:  # dump the tree as JSON
    f.write(tree.to_json(with_data=True))
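# The snippet above assumes 'tree' already exists and that its root node has the
# literal identifier "ID of root node" (apparently a placeholder left by the
# author). A minimal setup consistent with that usage (the root tag is an
# assumption) would be:
tree = Tree()
tree.create_node("root", "ID of root node")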
## Create the family tree
tree = Tree()
tree.create_node("Harry", "harry")  # root node
tree.create_node("Jane", "jane", parent="harry")
tree.create_node("Bill", "bill", parent="harry")
tree.create_node("Diane", "diane", parent="jane")
tree.create_node("George", "george", parent="diane")
tree.create_node("Mary", "mary", parent="diane")
tree.create_node("Jill", "jill", parent="george")
tree.create_node("Mark", "mark", parent="jane")

print("#" * 4 + "Breakdown of our family")
tree.show(cmp=lambda x, y: cmp(x.tag, y.tag), key=None, reverse=True)
# tree.show(key=lambda x: x.tag, reverse=False)
tree.save2file("/home/chenxm/Desktop/tree.txt", idhidden=False)
print("\n")

print("#" * 4 + "All family members in ZIGZAG mode")
for node in tree.expand_tree(mode=Tree.ZIGZAG):
    print(tree[node].tag)
print("\n")

print("#" * 4 + "All family members without Diane sub-family")
tree.show(idhidden=False, filter=lambda x: x.identifier != "diane")
# for node in tree.expand_tree(filter=lambda x: x.identifier != 'diane', mode=Tree.DEPTH):
#     print(tree[node].tag)
print("\n")
def construct_tree(self, file_path, child_node_index, json_file_location):
    node_list = []
    try:
        directory = os.path.join(file_path)
        for root, dirs, files in os.walk(directory):
            for file in files:
                if str(file).endswith(".csv"):
                    f = open(directory + file, 'r')
                    csv_reader = csv.reader(f, delimiter=',')
                    row_index = 0
                    filename = os.path.basename(f.name)
                    rows = []
                    dict = {"Root": "root", filename: filename.lower()}
                    esg_tree = Tree()
                    # Root node.
                    esg_tree.create_node(
                        "Root", "root",
                        data=jsonpickle.encode(NodeParam('source', 'attr', 'desc', 'root'),
                                               unpicklable=False))
                    node = Node('root', 'Root', '', '',
                                jsonpickle.encode(NodeParam('source', 'attr', 'desc', 'root'),
                                                  unpicklable=False))
                    node_list.append(node)
                    # One node per CSV file, directly under the root.
                    esg_tree.create_node(
                        filename, filename.lower(), parent='root',
                        data=jsonpickle.encode(NodeParam('source', 'attr', 'desc', str(uuid.uuid1())),
                                               unpicklable=False))
                    node = Node(filename.lower(), filename, 'root', '',
                                jsonpickle.encode(NodeParam('source', 'attr', 'desc', str(uuid.uuid1())),
                                                  unpicklable=False))
                    node_list.append(node)

                    for row in csv_reader:
                        rows.append(row)

                    for row in rows:
                        if row_index != 0:
                            column_index = 0
                            for curr_column in row:
                                if str(curr_column) + str(row[0]) not in dict:
                                    if column_index > child_node_index:
                                        if "\n" in curr_column:
                                            # Multi-line cells: one node per line.
                                            for rowData in curr_column.splitlines():
                                                node_id_key = str(rowData) + str(row[0])
                                                dict[node_id_key] = uuid.uuid1()
                                                esg_tree.create_node(
                                                    rowData, str(dict.get(node_id_key)),
                                                    parent=str(dict.get(str(row[3]) + str(row[0]))),
                                                    data=jsonpickle.encode(
                                                        NodeParam((rows[0])[column_index], 'attr',
                                                                  str(rowData).lower(),
                                                                  str(dict.get(node_id_key))),
                                                        unpicklable=False))
                                                node = Node(
                                                    str(dict.get(node_id_key)), rowData,
                                                    str(dict.get(str(row[3]) + str(row[0]))), '',
                                                    jsonpickle.encode(
                                                        NodeParam((rows[0])[column_index], 'attr',
                                                                  str(rowData).lower(),
                                                                  str(dict.get(node_id_key))),
                                                        unpicklable=False))
                                                node_list.append(node)
                                        elif curr_column != '':
                                            node_id_key = str(curr_column) + str(row[0])
                                            dict[node_id_key] = uuid.uuid1()
                                            esg_tree.create_node(
                                                curr_column, str(dict.get(node_id_key)),
                                                parent=str(dict.get(str(row[3]) + str(row[0]))),
                                                data=jsonpickle.encode(
                                                    NodeParam((rows[0])[column_index], 'attr',
                                                              str(curr_column).lower(),
                                                              str(dict.get(node_id_key))),
                                                    unpicklable=False))
                                            node = Node(
                                                str(dict.get(node_id_key)), curr_column,
                                                str(dict.get(str(row[3]) + str(row[0]))), '',
                                                jsonpickle.encode(
                                                    NodeParam((rows[0])[column_index], 'attr',
                                                              str(curr_column).lower(),
                                                              str(dict.get(node_id_key))),
                                                    unpicklable=False))
                                            node_list.append(node)
                                    else:
                                        node_id_key = str(curr_column) + str(row[0])
                                        dict[node_id_key] = uuid.uuid1()
                                        if column_index == 0:
                                            esg_tree.create_node(
                                                curr_column, str(dict.get(node_id_key)),
                                                parent=str(dict.get(filename)),
                                                data=jsonpickle.encode(
                                                    NodeParam((rows[0])[column_index], 'attr',
                                                              str(curr_column).lower(),
                                                              str(dict.get(node_id_key))),
                                                    unpicklable=False))
                                            node = Node(
                                                str(dict.get(node_id_key)), curr_column,
                                                str(dict.get(filename)), '',
                                                jsonpickle.encode(
                                                    NodeParam((rows[0])[column_index], 'attr',
                                                              str(curr_column).lower(),
                                                              str(dict.get(node_id_key))),
                                                    unpicklable=False))
                                            node_list.append(node)
                                        else:
                                            esg_tree.create_node(
                                                curr_column, str(dict.get(node_id_key)),
                                                parent=str(dict.get(str(row[column_index - 1]) + str(row[0]))),
                                                data=jsonpickle.encode(
                                                    NodeParam((rows[0])[column_index], 'attr',
                                                              str(curr_column).lower(),
                                                              str(dict.get(node_id_key))),
                                                    unpicklable=False))
                                            node = Node(
                                                str(dict.get(node_id_key)), curr_column,
                                                str(dict.get(str(row[column_index - 1]) + str(row[0]))), '',
                                                jsonpickle.encode(
                                                    NodeParam((rows[0])[column_index], 'attr',
                                                              str(curr_column).lower(),
                                                              str(dict.get(node_id_key))),
                                                    unpicklable=False))
                                            node_list.append(node)
                                column_index += 1
                        row_index += 1

                    f.close()
                    filename = filename.replace(".csv", '')
                    with open(json_file_location + filename + ".txt", "wb") as outfile:
                        esg_tree.save2file(json_file_location + filename + ".json")
                        pickle.dump(esg_tree, outfile)
                    esgDatabase().add_data(node_list)
                    print(esg_tree.to_json(with_data=True))
        return 'success'
    except OSError:
        print("Path not found exception")
        return 'failed'
    except IOError:
        print('An error occurred trying to read the file.')
        f.close()
        return 'failed'
    except Exception as e:
        print("An error occurred while creating a tree")
        print(e)
        return 'failed'
from treelib import Node, Tree
import json

with open('jsondata.txt') as json_file:
    data = json.load(json_file)

tree = Tree()
tree.create_node(identifier='0', data='<html></html>')
# print(len(data["tag"]))

key = {"sd"}
key.clear()

for k, v in data.items():
    for i in v:
        _id = str(i['id'])
        _tag = str(i['tag'])
        parent = i['parent']
        for x in parent:
            tree.create_node(identifier=_id, parent=str(x), data=_tag)

tree.show()
# tree.show()
x = tree.to_json()
print(x)
# Note: treelib's data_property expects the name of an attribute on node.data
# (a string), not a boolean.
tree.save2file('tree.txt', data_property=True)
# print(x)
# print(key)
while node_dict:
    for key, value in node_dict.items():
        if value['parent'] in added:
            tree.create_node(key, key, parent=value['parent'])
            added.add(key)
            node_dict.pop(key)
            break
        elif value['parent'] == None:
            tree.create_node(key, key)
            added.add(key)
            node_dict.pop(key)
            break

tree_list.append(tree)

for tree in tree_list:
    tree.save2file("Processed_Skeleton_Trees.txt")

#######################################################################################################################

# Identify end nodes (leaves):
leaf_list = []
for i in range(len(tree_list)):
    tree = tree_list[i]
    leaves = tree.leaves(nid=None)
    for leaf in leaves:
        leaf = leaf.identifier
        leaf_list.append(leaf)

# Identify paths to leaves:
paths_list = []
for i in range(len(tree_list)):
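    # The original snippet is cut off at this loop. A plausible continuation
    # using treelib's paths_to_leaves() (an assumption, not the original code):
    tree = tree_list[i]
    for path in tree.paths_to_leaves():
        paths_list.append(path)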
def build_tree(self, resource_tree):
    """ Build ASCII tree and upload it to S3. """
    try:
        os.chdir(tempfile.gettempdir())
        tree = Tree()
        for aws in resource_tree:
            aws_key = aws
            tree.create_node(aws, aws_key)
            for region in resource_tree.get(aws):
                region_key = aws_key + region
                tree.create_node(region, region_key, parent=aws_key)
                for service in resource_tree.get(aws).get(region):
                    service_key = region_key + service
                    tree.create_node(service, service_key, parent=region_key)
                    for resource_type in resource_tree.get(aws).get(region).get(service):
                        resource_type_key = service_key + resource_type
                        tree.create_node(resource_type, resource_type_key, parent=service_key)
                        for resource in resource_tree.get(aws).get(region).get(service).get(resource_type):
                            resource_key = resource_type_key + resource
                            tree.create_node(resource, resource_key, parent=resource_type_key)
        try:
            _, temp_file = tempfile.mkstemp()
            try:
                tree.save2file(temp_file)
            except Exception:
                self.logging.error("Could not generate resource tree.")
                return False

            client = boto3.client("s3")
            bucket = os.environ["RESOURCETREEBUCKET"]
            key = "resource_tree_%s.txt" % datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")

            try:
                client.upload_file(temp_file, bucket, key)
            except Exception:
                self.logging.error(
                    f"Could not upload resource tree to S3 's3://{bucket}/{key}'.")
                return False

            self.logging.info(
                f"Resource tree has been built and uploaded to S3 's3://{bucket}/{key}'.")
        finally:
            os.remove(temp_file)
        return True
    except Exception:
        self.logging.error("Could not generate resource tree.")
        self.logging.error(sys.exc_info()[1])
        return False