class FeatureBuilder:
    """Derive new feature columns from a CSV by running plug-in modules.

    Every ``*.py`` file found in ``FEATURE_MODULE_DIR`` is imported as
    ``modules.<name>``. A module is applied when it passes the black/white
    list policy from ``config`` and exposes ``input_columns``,
    ``output_columns`` and ``run``. The columns it produces are appended to
    ``self.new_df``.
    """

    def __init__(self, csv_file_name):
        """Read ``csv_file_name`` into ``self.df``; ``self.new_df`` starts empty."""
        self.df = None
        self.new_df = pd.DataFrame()
        self.read_data(csv_file_name)
        self.logger = Logger(self)

    def read_data(self, csv_file_name):
        """Load the CSV file at ``csv_file_name`` into ``self.df``."""
        self.df = pd.read_csv(csv_file_name)

    def hasattributes(self, obj, attr_list, filename):
        """Return True when ``obj`` has every attribute named in ``attr_list``.

        The first missing attribute is reported through the logger (using
        ``filename`` to identify the module) and False is returned.
        """
        for attr in attr_list:
            if not hasattr(obj, attr):
                self.logger.log('warn', 'module [', filename,
                                '] doesn\'t have [', attr, '], skip.')
                return False
        return True

    def _is_module_enabled(self, module_name):
        """Apply the black/white list policy from ``config`` to one module."""
        mode = config.black_white_list_use_mode
        if mode == 'black':
            return module_name not in config.black_list
        if mode == 'white':
            return module_name in config.white_list
        if mode == 'both':
            return (module_name in config.white_list
                    and module_name not in config.black_list)
        # 'disabled' and any unrecognised mode apply no filtering.
        return True

    def build_new_features(self):
        """Run every enabled feature module and append its output columns.

        For each row of ``self.df`` the module's ``run`` (or ``run_aggr`` when
        ``config.run_level`` is 1 and the module provides both ``run_aggr``
        and ``output_columns_aggr``) is called with the row's
        ``input_columns`` values. Modules whose output cannot be labelled
        with their declared columns are skipped with a warning.
        """
        # glob order is unspecified; sorting keeps the column order of
        # new_df reproducible between runs.
        for filename in sorted(glob.glob(FEATURE_MODULE_DIR + "/*.py")):
            module_name = os.path.splitext(os.path.basename(filename))[0]
            if module_name.startswith('__'):
                continue
            if not self._is_module_enabled(module_name):
                continue

            self.logger.log('info', 'applying module [', module_name, ']')
            # __import__ on a dotted name returns the top-level package, so
            # the submodule still has to be fetched from it.
            package = __import__('modules.' + module_name)
            mod = getattr(package, module_name)
            if not self.hasattributes(
                    mod, ['input_columns', 'output_columns', 'run'], filename):
                continue

            input_columns = mod.input_columns
            output_columns = mod.output_columns
            run = mod.run
            # Compare by value: identity of int literals is an
            # implementation detail.
            if config.run_level == 1 and hasattr(mod, 'run_aggr'):
                if hasattr(mod, 'output_columns_aggr'):
                    run = mod.run_aggr
                    output_columns = mod.output_columns_aggr
                else:
                    self.logger.log(
                        'warn', 'module [', module_name,
                        '] has run_aggr() but no output_columns_aggr, downgrade to run().')

            try:
                tuple_df = self.df.apply(
                    lambda row: run(*row[input_columns]), axis=1)
                tmp_df = tuple_df.apply(pd.Series)
                tmp_df.columns = output_columns
            except ValueError as err:
                print(err)
                self.logger.log('warn', 'module [', module_name,
                                '] has incorrect columns define, skip.')
                continue
            self.new_df = pd.concat([self.new_df, tmp_df], axis=1)
class StixParseWorker:
    """Collect selected header and indicator fields of STIX packages.

    Each consumed package appends one entry to every ``stix_*_list``; each
    non-composite indicator appends one entry to every ``indicator_*_list``.
    The ``*_list_of_list`` and ``*_dict_of_list`` attributes are views that
    hold references to those same lists.

    All containers are per instance. They used to be class attributes, which
    made every worker share (and keep growing) the same lists.
    """

    def __init__(self):
        self._reset_state()
        self.logger = Logger(self)

    def _reset_state(self):
        """Create empty accumulators and wire up the aggregate views once."""
        self.stix_title_list = []
        self.stix_package_intent_list = []
        self.stix_description_list = []
        self.stix_marking_color_list = []
        self.stix_produced_time_list = []
        self.stix_indicator_id_list_list = []
        # Never filled at the moment: kill-chain extraction is not implemented.
        self.stix_kill_chain_id_list_list = []

        self.indicator_id_list = []
        self.indicator_description_list = []
        self.indicator_type_list = []

        self.kill_chain_list_of_list = []
        self.kill_chain_dict_of_list = {}

        # The views reference the accumulators, so they are built exactly
        # once. Rebuilding them per package duplicated the entries.
        self.stix_fields_list_of_list = [
            self.stix_title_list,
            self.stix_package_intent_list,
            self.stix_description_list,
            self.stix_marking_color_list,
            self.stix_produced_time_list,
            self.stix_indicator_id_list_list,
            self.stix_kill_chain_id_list_list,
        ]
        self.stix_fields_dict_of_list = {
            'title': self.stix_title_list,
            'package_intent': self.stix_package_intent_list,
            'description': self.stix_description_list,
            'marking_color': self.stix_marking_color_list,
            'produced_time': self.stix_produced_time_list,
            'indicators_ids': self.stix_indicator_id_list_list,
            'kill_chains_ids': self.stix_kill_chain_id_list_list,
        }
        self.indicator_fields_list_of_list = [
            self.indicator_id_list,
            self.indicator_description_list,
            self.indicator_type_list,
        ]
        # This dict used to stay empty, so getIndicatorFieldsDict() had
        # nothing to return.
        # NOTE(review): the key names are new; confirm they suit consumers.
        self.indicator_fields_dict_of_list = {
            'id': self.indicator_id_list,
            'description': self.indicator_description_list,
            'type': self.indicator_type_list,
        }

    def __mergeKillChainPhase(self, kill_chain):
        """Placeholder; kill-chain handling is not implemented."""
        pass

    def __processKillChainFieldsList(self, kill_chain):
        """Placeholder; kill-chain handling is not implemented."""
        pass

    def __processStixFieldsList(self, stix_package):
        """Append the header fields and indicator ids of one package."""
        assert isinstance(stix_package, STIXPackage)
        header = stix_package.stix_header
        self.stix_title_list.append(header.title)
        self.stix_package_intent_list.append(
            header.package_intents[0].__str__())
        self.stix_description_list.append(header.description.__str__())

        self.logger.log('handling type:', type(header.handling))
        self.logger.log('handling type:', type(header.handling.marking))
        self.logger.log('handling type:', type(header.handling.marking[0]))
        # Concatenate the colours of the TLP markings of the first marking
        # specification; other marking structures are ignored.
        colors = ''.join(
            ms.color
            for ms in header.handling.marking[0].marking_structures
            if isinstance(ms, TLPMarkingStructure))
        self.stix_marking_color_list.append(colors)
        self.stix_produced_time_list.append(
            header.information_source.time.produced_time.to_dict())

        indicators = stix_package.indicators
        self.logger.log('indicators type:', type(indicators))
        package_indicator_ids = []
        # A package without indicators must not crash on the debug log below.
        if indicators:
            self.logger.log('indicator[0] type:', type(indicators[0]))
            for indicator in indicators:
                self.logger.log('indicator type:', type(indicator))
                # NOTE(review): nesting reconstructed from a flattened
                # source; composite indicators are assumed to be skipped
                # entirely. Confirm.
                if not indicator.composite_indicator_expression:
                    package_indicator_ids.append(indicator.id_)
                    self.__processIndicatorFieldsList(indicator)
        self.stix_indicator_id_list_list.append(
            '|'.join(package_indicator_ids))

    def __processIndicatorFieldsList(self, indicator):
        """Append id, description and first indicator type of ``indicator``."""
        self.indicator_id_list.append(indicator.id_)
        self.indicator_description_list.append(
            indicator.description.__str__())
        self.indicator_type_list.append(
            indicator.indicator_types[0].__str__())

    def consumeStix(self, stix_package):
        """Record the fields of ``stix_package`` (a ``STIXPackage``).

        The aggregate views returned by the getters reference the
        accumulators, so they reflect the new package without being rebuilt.
        """
        self.__processStixFieldsList(stix_package)

    def getStixFieldsList(self):
        """Return the seven per-package field lists, in a fixed order."""
        return self.stix_fields_list_of_list

    def getIndicatorFieldsList(self):
        """Return the three per-indicator field lists (id, description, type)."""
        return self.indicator_fields_list_of_list

    def getStixFieldsDict(self):
        """Return the per-package field lists keyed by field name."""
        return self.stix_fields_dict_of_list

    def getIndicatorFieldsDict(self):
        """Return the per-indicator field lists keyed by field name."""
        return self.indicator_fields_dict_of_list
class GraphicWithDataWorker():
    """Walk the ``_fields`` of STIX objects into a weighted directed graph.

    Nodes are field labels and edges go from a parent field to a child field.
    ``edge_weight_dict[parent][child]`` counts how often that parent/child
    pair was visited. Unless clustering by name is enabled, every visit gets
    its own node id of the form ``<running index>|<display label>``.
    """

    def __init__(self):
        self.logger = Logger(self)
        self.is_clustering_node_by_name = False
        self.is_full_structure = False
        self.is_display_list_node_index = False
        self.is_save_avg_degree_connectivity = False
        self.clear_graph()
        self.hook_on_node = None
        self.all_avg_degree_con = {}

    def set_is_clustering_node_by_name(self):
        """Use the bare label as node id, merging equally named nodes."""
        self.is_clustering_node_by_name = True

    def set_is_full_structure(self):
        """Also descend into fields whose value is None."""
        self.is_full_structure = True

    def set_is_display_list_node_index(self):
        """Label list items ``name[<index>]`` instead of ``name[i]``."""
        self.is_display_list_node_index = True

    def set_hook_on_node(self, func_on_node):
        """Install a predicate; nodes for which it returns falsy are skipped."""
        self.hook_on_node = func_on_node

    def set_is_save_avg_degree_connectivity(self):
        self.is_save_avg_degree_connectivity = True

    def clear_graph(self):
        """Reset the graph, the edge weights and the node numbering."""
        self.G = nx.DiGraph()
        self.edge_weight_dict = {}
        self.is_first_node = True
        self.first_node_label = 'root_start_123123123123123'
        self.node_index = 0
        self.nodelabel_to_displaylabel_dict = {}

    def _new_node_label(self, displaylabel):
        """Allocate the next unique node id and remember its display label."""
        nodelabel = str(self.node_index) + '|' + displaylabel
        self.node_index += 1
        self.nodelabel_to_displaylabel_dict[nodelabel] = displaylabel
        return nodelabel

    def __getTreeID(self, node, label):
        """Return the graph node id for ``label`` (``node`` is not inspected)."""
        if self.is_clustering_node_by_name:
            return str(label)
        return self._new_node_label(str(label).split('|')[-1])

    def __getListObjTreeID(self, node, listname, i):
        """Return the node id for item ``i`` of the list field ``listname``."""
        if self.is_display_list_node_index:
            suffix = '[' + str(i) + ']'
        else:
            suffix = '[i]'
        if self.is_clustering_node_by_name:
            return listname + suffix
        return self._new_node_label(listname.split('|')[-1] + suffix)

    def __dumpObjFields(self, obj):
        return obj.__dict__['_fields']

    def dontDraw(self, label):
        """Return True for labels that must not become graph nodes.

        NOTE(review): when clustering is off, node ids carry an ``<index>|``
        prefix and therefore never equal these bare names. Confirm whether
        the filter is meant to apply in that mode.
        """
        return label in [self.first_node_label, 'id', 'valueOf_']

    def __add_edge_weight(self, fr, to):
        """Count one more visit of the edge ``fr`` -> ``to``."""
        targets = self.edge_weight_dict.setdefault(fr, {})
        targets[to] = targets.get(to, 0) + 1

    def iterField(self, cur_node, cur_label, father_id, prefix='',
                  func_on_field=None):
        """Add ``cur_node`` below ``father_id`` and recurse into its fields.

        Params:
            cur_node: object being visited
            cur_label: label of ``cur_node`` inside its parent
            father_id: graph node id of the parent
            prefix: grows by ``'--'`` per level; not used otherwise
            func_on_field: accepted for compatibility; not used
        """
        # The hook can veto a node (and thereby its whole subtree).
        if self.hook_on_node is not None and not self.hook_on_node(cur_node):
            return

        if not self.G.has_node(father_id) and not self.dontDraw(father_id):
            if father_id == self.first_node_label:
                self.logger.log('WTF')
            self.G.add_node(father_id)

        my_id = self.__getTreeID(cur_node, cur_label)
        self.__add_edge_weight(father_id, my_id)
        if self.G.has_edge(father_id, my_id):
            self.logger.log('shouldnt have output', father_id, my_id)
        elif not self.dontDraw(father_id) and not self.dontDraw(my_id):
            self.G.add_edge(father_id, my_id)

        if not hasattr(cur_node, '_fields'):
            return
        fdict = self.__dumpObjFields(cur_node)
        for f in fdict:
            value = fdict[f]
            is_listing = (
                (isinstance(cur_node, MutableSequence)
                 and isinstance(value, MutableSequence))
                or isinstance(value, TypedList)
                or isinstance(value, list))
            if is_listing:
                # List-like field: every item becomes its own child node.
                for ind, item in enumerate(value):
                    i_label = self.__getListObjTreeID(item, str(f), ind)
                    self.iterField(item, i_label, my_id, prefix + '--')
            elif self.is_full_structure or value is not None:
                self.iterField(value, str(f), my_id, prefix + '--')

    def __get_display_labels(self, G):
        """Map every node of ``G`` to its registered display label."""
        return dict(
            (node, self.nodelabel_to_displaylabel_dict[node]) for node in G)

    def __set_edge_weights(self, G):
        """Copy the visit counters onto the ``weight`` attribute of each edge."""
        for u, v, d in G.edges(data=True):
            d['weight'] = self.edge_weight_dict[u][v]

    def __get_edge_weights(self, G):
        return [G[u][v]['weight'] for u, v in G.edges()]

    def avg_degree_conn_to_console(self):
        """Log the average degree connectivity of the current graph."""
        self.logger.log('rst', 'average_degree_connectivity',
                        nx.average_degree_connectivity(self.G))

    def avg_degree_conn_save_to_dict(self, stixname):
        """Store the average degree connectivity under ``stixname``."""
        self.all_avg_degree_con[stixname] = nx.average_degree_connectivity(
            self.G)

    def get_edge_weight_dict(self):
        return self.edge_weight_dict

    def get_graph(self):
        return self.G

    def get_all_avg_degree_con(self):
        return self.all_avg_degree_con

    def draw(self,
             stix_name=0,
             is_width_as_weight=False,
             is_draw_min_spin_tree=False,
             pic_num_minspintree=100000):
        """Prepare matplotlib figures for the graph (shown by ``draw_show``).

        Params:
            stix_name: figure name part; an int ``n`` is rendered as ``#n``
            is_width_as_weight: draw edge widths from the visit counters
            is_draw_min_spin_tree: add a second figure with the minimum
                spanning tree of the undirected graph
            pic_num_minspintree: accepted for compatibility; not used
        """
        self.DiG = self.G
        if isinstance(stix_name, int):
            # str() is required: concatenating '#' with an int raised TypeError.
            stix_name = '#' + str(stix_name)

        plt.figure("Structure Tree for STIX PACKAGE [ " + stix_name + ' ]')
        mng = plt.get_current_fig_manager()
        mng.resize(*mng.window.maxsize())
        pos = graphviz_layout(self.DiG, prog='dot', args='-Grankdir=LR')

        draw_kwargs = dict(
            node_size=40, pos=pos, edge_color='y', with_labels=False)
        if is_width_as_weight:
            self.__set_edge_weights(self.DiG)
            draw_kwargs['width'] = self.__get_edge_weights(self.DiG)
        nx.draw(self.DiG, **draw_kwargs)

        if not self.is_clustering_node_by_name:
            labels = self.__get_display_labels(self.DiG)
            nx.draw_networkx_labels(
                self.DiG, pos=pos, labels=labels, font_color='b')
        else:
            nx.draw_networkx_labels(self.DiG, pos=pos, font_color='b')

        if is_draw_min_spin_tree:
            self.UnDiG = self.G.to_undirected()
            self.UnDiG = nx.minimum_spanning_tree(self.UnDiG)
            plt.figure("Minimun Spinning Tree for STIX PACKAGE [ " +
                       stix_name + ' ]')
            mng = plt.get_current_fig_manager()
            mng.resize(*mng.window.maxsize())
            pos = graphviz_layout(self.UnDiG, prog='dot', args='-Grankdir=LR')
            nx.draw(self.UnDiG, node_size=40, pos=pos, edge_color='y')
            nx.draw_networkx_labels(self.UnDiG, pos=pos, font_color='b')

    def draw_show(self):
        """Display every figure prepared by ``draw``."""
        plt.show()

    def doYourWork(self, stix_package):
        """Add ``stix_package`` (a STIXPackage or an Indicator) to the graph.

        Raises:
            TypeError: for any other type. This used to surface as an
                UnboundLocalError on ``start_label``.
        """
        if isinstance(stix_package, STIXPackage):
            start_label = 'stix_package'
        elif isinstance(stix_package, Indicator):
            start_label = 'Indicator'
        else:
            raise TypeError(
                'doYourWork() expects a STIXPackage or an Indicator, got ' +
                str(type(stix_package)))
        self.iterField(stix_package, start_label,
                       self.__getTreeID(STIXPackage, self.first_node_label),
                       '-')
class GraphicWorker:
    """Build an unweighted directed graph of the field structure of a
    STIXPackage, using the bare field label as node id."""

    def initG(self):
        """Create an empty graph, the leaf list and the logger."""
        self.G = nx.DiGraph()
        # Needed by iterField_diff_color(), which failed with AttributeError
        # because this list was never created.
        self.leafNode = []
        self.logger = Logger(self)

    def __getTreeID(self, node, label):
        """Return the node id used in the graph.

        Params:
            node: the node (not inspected)
            label: the label of the node
        """
        return str(label)

    def __dumpObjFields(self, obj):
        return obj.__dict__['_fields']

    def dontDraw(self, label):
        """Return True for labels to leave out of the graph (currently none)."""
        return False

    def iterField(self, cur_node, cur_label, father_id, prefix=''):
        """Walk each node/attribute and record the father-child edges in
        ``self.G``.

        Params:
            cur_node: current node
            cur_label: the label of the current node
            father_id: node id of the parent
            prefix: grows by ``'--'`` per level; not used otherwise
        """
        if not self.G.has_node(father_id) and not self.dontDraw(father_id):
            self.G.add_node(father_id)

        my_id = self.__getTreeID(cur_node, cur_label)
        if not self.G.has_edge(father_id, my_id) and not self.dontDraw(my_id):
            self.G.add_edge(father_id, my_id)

        # List-like node: items are visited under the list's own label.
        if isinstance(cur_node, (list, MutableSequence)):
            for item in cur_node:
                self.iterField(item, cur_label, my_id, prefix + '--')

        if hasattr(cur_node, '_fields'):
            fdict = self.__dumpObjFields(cur_node)
            for f in fdict:
                self.iterField(fdict[f], str(f), my_id, prefix + '--')

    def iterField_diff_color(self, cur_node, cur_label, father_id, prefix=''):
        """Same walk as ``iterField`` with bookkeeping in ``self.leafNode``.

        A father id is appended to ``leafNode`` when it is first added to the
        graph and removed again once an edge leaves it.

        NOTE(review): child ids are never appended, so the list appears to
        end up empty. Confirm the intended leaf semantics.
        """
        if not self.G.has_node(father_id) and not self.dontDraw(father_id):
            self.G.add_node(father_id)
            self.leafNode.append(father_id)

        my_id = self.__getTreeID(cur_node, cur_label)
        if not self.G.has_edge(father_id, my_id) and not self.dontDraw(my_id):
            self.G.add_edge(father_id, my_id)
            if father_id in self.leafNode:
                self.leafNode.remove(father_id)

        # Recurse into this method: delegating to iterField() dropped the
        # leaf bookkeeping below the first level.
        if isinstance(cur_node, (list, MutableSequence)):
            for item in cur_node:
                self.iterField_diff_color(item, cur_label, my_id,
                                          prefix + '--')

        if hasattr(cur_node, '_fields'):
            fdict = self.__dumpObjFields(cur_node)
            for f in fdict:
                self.iterField_diff_color(fdict[f], str(f), my_id,
                                          prefix + '--')

    def outputDegree(self, filename):
        """Write one ``node,degree`` line per graph node to ``filename``."""
        # dict() accepts both the plain dict of networkx 1.x and the
        # DegreeView of networkx 2.x.
        degrees = dict(self.G.degree())
        self.logger.log('info', 'output degree list to:', filename)
        with open(filename, 'wb') as f:
            f.write('NODE NAME' + ',' + 'DEGREE\n')
            for node, degree in degrees.items():
                f.write(str(node) + ',' + str(degree) + '\n')
        self.logger.log('info', 'output degree list done')

    def outputWeight(self, filename):
        """Alias of ``outputDegree``."""
        self.outputDegree(filename)

    def draw(self):
        """Show the minimum spanning tree (figure 1) and the directed
        structure graph (figure 2)."""
        self.DiG = self.G
        self.UnDiG = self.G.to_undirected()
        self.UnDiG = nx.minimum_spanning_tree(self.UnDiG)

        # Figure 1: minimum spanning tree of the undirected graph.
        plt.figure(1)
        mng = plt.get_current_fig_manager()
        mng.resize(*mng.window.maxsize())
        pos = graphviz_layout(self.UnDiG, prog='dot', args='-Grankdir=LR')
        nx.draw(self.UnDiG, node_size=40, pos=pos, edge_color='y')
        nx.draw_networkx_labels(self.UnDiG, pos=pos, font_color='b')

        # Figure 2: the full directed structure graph.
        plt.figure(2)
        mng = plt.get_current_fig_manager()
        mng.resize(*mng.window.maxsize())
        pos = graphviz_layout(self.DiG, prog='dot', args='-Grankdir=LR')
        nx.draw(self.DiG, node_size=40, pos=pos, edge_color='y')
        nx.draw_networkx_labels(self.DiG, pos=pos, font_color='b')
        plt.show()

    def doYourWork(self, stix_package):
        """Reset the graph and walk ``stix_package`` (a ``STIXPackage``)."""
        assert isinstance(stix_package, STIXPackage)
        self.initG()
        self.iterField(stix_package, 'stix_package',
                       self.__getTreeID(STIXPackage, 'root_start'), '-')
class DataParsingFactory:
    """Run the requested ``JobType`` jobs by delegating to worker classes.

    NOTE(review): this class was reconstructed from a source whose
    indentation had been lost. Block nesting inside
    ``goFindSomeoneDoThisJob`` follows the data flow of the statements and is
    flagged where it was ambiguous.
    """

    def __init__(self):
        self.logger = Logger(self)
        self.tmp_G = None
        self.tmp_dict = None
        self.jobs = []
        self.time_per_job = []

    def stix_packages_fn_iterater(self, fn_or_dir, stopafter, onlyuse=None):
        """Yield STIX file names from a directory or from one XML file.

        Also records in ``self.is_dir`` whether ``fn_or_dir`` is a directory.
        """
        self.is_dir = os.path.isdir(fn_or_dir)
        if self.is_dir:
            names = stixFileNameInDirectory(
                fn_or_dir, stopafter=stopafter, onlyuse=onlyuse)
        else:
            names = xmlFileName2EnumStixFileName(
                fn_or_dir, stopafter=stopafter)
        for name in names:
            yield name

    def node_iterator(self, stix_package, iter_start):
        """Yield the objects to analyse for one package.

        With ``iter_start == 'indicator'`` the package's indicators are
        yielded (nothing when there are none); otherwise the package itself.
        """
        if iter_start == 'indicator':
            if stix_package.indicators is None:
                return
            for indicator in stix_package.indicators:
                yield indicator
        else:
            yield stix_package

    def __getParam(self, argname, nonevalue=None, errmsg=None):
        """Return ``self.requirements[argname]`` or ``nonevalue``.

        When ``errmsg`` is given the argument is mandatory: if it is missing
        the message is logged and the process exits.
        """
        if errmsg is not None and argname not in self.requirements:
            self.logger.log('err', errmsg)
            exit(-1)
        return self.requirements.get(argname, nonevalue)

    def goFindSomeoneDoThisJob(self, *jobs, **requirements):
        """Execute ``jobs`` in order, configured by ``requirements``.

        Params:
            jobs: ``JobType`` members to run
            requirements: job options; ``xmlfilename`` is mandatory for the
                parsing and graph jobs

        Returns:
            -1 as soon as a job lacks a mandatory prerequisite, otherwise
            None. The wall-clock time of every finished job is appended to
            ``self.time_per_job``.
        """
        self.requirements = requirements
        self.jobs.append(jobs)
        get = requirements.get
        for job in jobs:
            self.logger.log('info', 'Start to work on', job)
            time_start = time.time()
            time_end = -1

            if job is JobType.ParseStixFromXmlAndPrintValuesToConsole:
                if 'xmlfilename' not in requirements:
                    self.logger.log(
                        'err', '{', job, '}',
                        'at least give me a xml file name to parse, please :)')
                    return -1
                xmlfilename = requirements['xmlfilename']
                stopAfterFinishRound = get('stopafter', -1)
                justDoThisRound = get('justdo', -1)
                worker = StixParseWorker()
                for ind, stix_fn in enumerate(
                        self.stix_packages_fn_iterater(
                            xmlfilename, stopafter=stopAfterFinishRound)):
                    # Compare by value, not identity, to select one round.
                    if justDoThisRound > -1 and ind != justDoThisRound:
                        continue
                    self.logger.log(
                        'info', 'I\'m working on stix_package #' + str(ind))
                    stix_package = stixFileName2StixPackageObj(stix_fn)
                    worker.consumeStix(stix_package)
                pprintDict(worker.getStixFieldsDict())

            if job is JobType.AnalyzeStixFromXmlAndBuildFieldTree:
                if 'xmlfilename' not in requirements:
                    self.logger.log(
                        'err', '{', job, '}',
                        'at least give me a xml file name to parse, please :)')
                    return -1
                xmlfilename = requirements['xmlfilename']
                stopAfterFinishRound = get('stopafter', -1)
                justDoThisRound = get('justdo', -1)
                worker = FieldsDocumentaryWorker()
                for ind, stix_fn in enumerate(
                        self.stix_packages_fn_iterater(
                            xmlfilename, stopafter=stopAfterFinishRound)):
                    if justDoThisRound > -1 and ind != justDoThisRound:
                        continue
                    self.logger.log(
                        'info', 'I\'m working on stix_package #' + str(ind))
                    stix_package = stixFileName2StixPackageObj(stix_fn)
                    worker.consumeStix(stix_package)
                self.fieldTree = worker.getTree()

            if job is JobType.SaveFieldTree:
                if not hasattr(self, 'fieldTree'):
                    self.logger.log(
                        'err', '{', job, '}',
                        'we don\'t even a field-tree at this point to save, we need to build it first.'
                    )
                    return -1
                fieldtreepklfn = get('fieldtreepklfn',
                                     DEFAULT_TREE_PICKLE_FILE_NAME)
                worker = FieldsDocumentaryWorker()
                worker.saveTreeToFile(self.fieldTree, fieldtreepklfn)

            if job is JobType.LoadFieldTree:
                fieldtreepklfn = get('fieldtreepklfn',
                                     DEFAULT_TREE_PICKLE_FILE_NAME)
                worker = FieldsDocumentaryWorker()
                self.fieldTree = worker.loadTreeFrFile(fieldtreepklfn)

            if job is JobType.PrintFieldTreeToConsole:
                if not hasattr(self, 'fieldTree'):
                    self.logger.log(
                        'err', '{', job, '}',
                        'we don\'t even a field-tree at this point to save, we need to build it first.'
                    )
                    return -1
                worker = FieldsDocumentaryWorker()
                worker.printTree2Console(self.fieldTree)

            if job is JobType.PrintFieldTreeToCsvFile:
                if not hasattr(self, 'fieldTree'):
                    self.logger.log(
                        'err', '{', job, '}',
                        'we don\'t even a field-tree at this point to save, we need to build it first.'
                    )
                    return -1
                if 'csvfilename' not in requirements:
                    self.logger.log(
                        'err', '{', job, '}',
                        'we need a CSV file name to save your tree')
                    return -1
                csvfilename = requirements['csvfilename']
                worker = FieldsDocumentaryWorker()
                worker.printTree2Csv(self.fieldTree, csvfilename)

            # Notice: this branch serves two job types.
            if job in [
                    JobType.AnalyzeStixFromXmlAndDrawAGraph,
                    JobType.FeedDataAndDrawWeightedGraph
            ]:
                if 'xmlfilename' not in requirements:
                    self.logger.log(
                        'err', '{', job, '}',
                        'at least give me a xml file name to parse, please :)')
                    return -1
                xmlfilename = requirements['xmlfilename']
                stopAfterFinishRound = get('stopafter', -1)
                justDoThisRound = get('justdo', -1)
                weightCsvFileName = get('csvfilename', -1)
                isdrawgraph = get('isdrawgraph', False)
                isforeachpackage = get('isforeachpackage', False)
                isdrawminspintree = get('isdrawminspintree', False)
                iswidthasweight = get('iswidthasweight', False)
                isclusteringnodebyname = get('isclusteringnodebyname', False)
                islistnodeidx = get('islistnodeidx', False)
                isfullstructure = get('isfullstructure', False)
                isuseexistedtablenames = get('isuseexistedtablenames', False)
                issavetablesforeachpackage = get('issavetablesforeachpackage',
                                                 False)
                rowsCsvFileName = get('rowscsvfilename', -1)
                pickleFileName = get('picklefilename', -1)
                # NOTE(review): the -1 default is truthy, so the average
                # degree connectivity is computed unless the caller passes a
                # falsy value. Kept as is; confirm whether False was meant.
                isavgdegreecon = get('isavgdegreecon', -1)
                hookOnNode = get('hookonnode')
                hookOnStart = get('hookonstart')
                hookOnEnd = get('hookonend')
                # Read the hook from its own key. It used to read
                # 'hookonafterpackage', installing the wrong hook or raising
                # KeyError.
                hookOnBeforePackage = get('hookonbeforepackage')
                hookOnAfterPackage = get('hookonafterpackage')

                if hookOnStart is not None:
                    hookOnStart()

                if job is JobType.AnalyzeStixFromXmlAndDrawAGraph:
                    worker = GraphicWorker()
                else:
                    worker = GraphicWithDataWorker()
                if isclusteringnodebyname:
                    worker.set_is_clustering_node_by_name()
                if isfullstructure:
                    worker.set_is_full_structure()
                if islistnodeidx:
                    worker.set_is_display_list_node_index()

                if (weightCsvFileName != -1 or rowsCsvFileName != -1
                        or pickleFileName != -1):
                    ioworker = IOWorker()
                    # Reuse the table header of a previous run when asked to.
                    if issavetablesforeachpackage and isuseexistedtablenames:
                        rowlist, collist = ioworker.get_child_parent_lists(
                            self.tmp_dict)

                if hookOnNode is not None:
                    worker.set_hook_on_node(hookOnNode)

                for ind, stix_fn in enumerate(
                        self.stix_packages_fn_iterater(
                            xmlfilename, stopafter=stopAfterFinishRound)):
                    if hookOnBeforePackage is not None:
                        hookOnBeforePackage()

                    # 'justdo' selects one package, by index or by file name.
                    if justDoThisRound != -1:
                        if justDoThisRound == -2:
                            break
                        if type(justDoThisRound) is str:
                            if os.path.split(stix_fn)[1] != justDoThisRound:
                                continue
                            # Found by name: stop on the next iteration.
                            justDoThisRound = -2
                        elif ind != justDoThisRound:
                            continue

                    self.logger.log(
                        'info', 'I\'m working on stix_package #' + str(ind))
                    stix_package = stixFileName2StixPackageObj(stix_fn)

                    for node_ind, node in enumerate(
                            self.node_iterator(stix_package, 'indicator')):
                        if isforeachpackage:
                            worker.clear_graph()
                        worker.doYourWork(node)
                        if not isforeachpackage:
                            continue

                        # NOTE(review): everything below is assumed to sit
                        # inside the per-package branch, because it uses
                        # thisStixName. Confirm against the original layout.
                        thisStixName = stix_fn.split('/')[-1]
                        self.tmp_dict = worker.get_edge_weight_dict()
                        self.tmp_G = worker.get_graph()
                        if isavgdegreecon:
                            if pickleFileName == -1:
                                worker.avg_degree_conn_to_console()
                            else:
                                worker.avg_degree_conn_save_to_dict(
                                    thisStixName)

                        if (weightCsvFileName != -1
                                and issavetablesforeachpackage):
                            twoparts = weightCsvFileName.split('%s')
                            if len(twoparts) != 2:
                                self.logger.log(
                                    'err',
                                    'please include %s (only once) inside the target csv filename, to decide where to write the corresponding stix file name'
                                )
                                exit(-1)
                            if os.path.dirname(twoparts[1]) != '':
                                self.logger.log(
                                    'warn',
                                    '%s is in a directory name not a filename, will create many directories.'
                                )
                            thisCsvFileName = (
                                twoparts[0] + thisStixName + twoparts[1])
                            target_dir = os.path.dirname(thisCsvFileName)
                            # makedirs('') raises, so a target in the current
                            # directory needs no directory creation at all.
                            if target_dir != '':
                                try:
                                    os.makedirs(target_dir)
                                except OSError as e:
                                    if e.errno != errno.EEXIST:
                                        self.logger.log(
                                            'err', 'directory create fail')
                                        raise
                            if isuseexistedtablenames:
                                ioworker.outputWeightTable(
                                    self.tmp_dict, thisCsvFileName, rowlist,
                                    collist)
                            else:
                                ioworker.outputWeightTable(
                                    self.tmp_dict, thisCsvFileName)

                        if isdrawgraph:
                            # Always build a string: the bare int index could
                            # not be concatenated with str(node_ind).
                            if self.is_dir:
                                stix_name = thisStixName
                            else:
                                stix_name = '#' + str(ind)
                            worker.draw(
                                stix_name + str(node_ind),
                                is_width_as_weight=iswidthasweight,
                                is_draw_min_spin_tree=isdrawminspintree)

                        if hookOnAfterPackage is not None:
                            hookOnAfterPackage(stixname=thisStixName,
                                               weights=self.tmp_dict)

                if (job is JobType.FeedDataAndDrawWeightedGraph
                        and not isforeachpackage):
                    self.tmp_dict = worker.get_edge_weight_dict()
                    self.tmp_G = worker.get_graph()
                    if isavgdegreecon:
                        if pickleFileName == -1:
                            worker.avg_degree_conn_to_console()
                        else:
                            worker.avg_degree_conn_save_to_dict('all_stix')
                    if isdrawgraph:
                        worker.draw("All Stix Packages",
                                    is_width_as_weight=iswidthasweight,
                                    is_draw_min_spin_tree=isdrawminspintree)
                    # NOTE(review): the save is assumed to be gated by
                    # issavetablesforeachpackage, like the creation of
                    # rowlist/collist. Confirm.
                    if (weightCsvFileName != -1
                            and issavetablesforeachpackage):
                        twoparts = weightCsvFileName.split('%s')
                        if len(twoparts) > 1:
                            self.logger.log(
                                'err',
                                'remember to remove %s in the target file name'
                            )
                            exit(-1)
                        # The old double-negated identity test rejected plain
                        # file names; reject only real directory targets.
                        if (os.path.basename(weightCsvFileName) == ''
                                or os.path.isdir(weightCsvFileName)):
                            self.logger.log(
                                'err',
                                'target file name is a directory, please change to a file name'
                            )
                            exit(-1)
                        if isuseexistedtablenames:
                            ioworker.outputWeightTable(
                                self.tmp_dict, weightCsvFileName, rowlist,
                                collist)
                        else:
                            ioworker.outputWeightTable(
                                self.tmp_dict, weightCsvFileName)

                if pickleFileName != -1:
                    ioworker.pickle_dump(worker.get_all_avg_degree_con(),
                                         pickleFileName)
                if isdrawgraph:
                    # Stop the clock before the blocking show() call.
                    time_end = time.time()
                    worker.draw_show()
                if hookOnEnd is not None:
                    hookOnEnd()

            if time_end == -1:
                time_end = time.time()
            self.time_per_job.append(time_end - time_start)
            self.logger.log('info', 'Job', job, 'has done!')
        self.logger.log('info', 'All the jobs have done! Enjoy your data!')

    def print_time_per_job(self):
        """Log the recorded duration of every finished job."""
        # self.jobs holds one tuple per goFindSomeoneDoThisJob() call, so the
        # timings themselves are the reliable thing to iterate over.
        for i, seconds in enumerate(self.time_per_job):
            self.logger.log('info', 'Time of Job #' + str(i), seconds,
                            'seconds')
class FieldsDocumentaryWorker():
    """Record which fields appear under which parent while walking objects.

    Every node is identified by ``str(type(node)) + '@@' + label``.
    ``fieldTree`` maps a parent id to the list of distinct child ids that
    were seen directly beneath it.
    """
    # stix_fields_list = []
    # indicator_fields_list = []
    # stix_values_list = []
    # indicator_values_list = []

    def __init__(self):
        self.logger = Logger(self)
        # Created per instance: as a class attribute the dict was shared by
        # (and kept growing across) every worker object.
        self.fieldTree = {}

    def consumeStix(self, stix_package):
        """Main method, takes a stix object in and parses it.

        The structure found is merged into ``self.fieldTree``.

        Params:
            stix_package: STIXPackage object

        Raises:
            AssertionError: if stix_package is not a STIXPackage.
        """
        assert isinstance(stix_package, STIXPackage)
        # Enable this if want to use the test function, ignore otherwise
        # self.iterFieldAndPrint(stix_package, '-', stix_package)
        # NOTE: the STIXPackage class itself is passed here, so the root id
        # is built from type(STIXPackage), not from the package instance.
        self.iterField(stix_package, 'stix_package',
                       self.__getTreeID(STIXPackage, 'root_start'), '-')
        # for test
        # for node in stix_package.walk():
        #     if isinstance(stix_package, )
        #         print type(node)

    def iterFieldAndPrint(self, obj, prefix, father, label=''):
        """Experimental variant of iterField() that prints while walking.

        Nothing is stored in ``self.fieldTree``; each visited node is sent
        to the logger and printed to the console with a growing prefix so
        the output looks tree-like.

        Params:
            obj: current node
            prefix: string printed in front of the node, grows by '--' per level
            father: parent node of obj
            label: field name under which obj was found ('' for list items)
        """
        # if isinstance(obj, list) or isinstance(obj, EntityList):
        if isinstance(obj, (list, MutableSequence)):
            for item in obj:
                self.iterFieldAndPrint(item, prefix + '--', obj)
        self.logger.log('rst', prefix, obj.__class__, '@@', label, '<<', father.__class__)  # str(father)
        # don't delete this, enable this to check if there is any List like Entity List
        # Single-argument print(...) gives the same output on Python 2 and 3.
        print('%s %s %s %s' % ('+' + prefix[1:], obj, '<<', father))  # str(father)
        if not hasattr(obj, '_fields'):
            return
        fdict = self.__dumpObjFields(obj)
        for f in fdict:
            # print prefix,f,'<<',fdict[f]#str(father)
            # print '+'+prefix[1:],type(f),'<<',fdict[f]#str(father)
            # if isinstance(fdict[f], Indicators):
            # if isinstance(fdict[f], EntityList):
            #     print 'here we go!'
            #     for item in fdict[f]:
            #         print type(item)
            #     print 'here we go end!'
            self.iterFieldAndPrint(fdict[f], prefix + '--', obj, f)

    def __getTreeID(self, node, label):
        """Combine the node's type and its label into the id used in the tree.

        Params:
            node: the node
            label: the label of the node
        """
        return str(type(node)) + '@@' + str(label)

    def iterField(self, cur_node, cur_label, father_id, prefix=''):
        """Walk through each node/attribute and record the father-child
        relationship in ``self.fieldTree``.

        Params:
            cur_node: current node
            cur_label: the label of current node
            father_id: the node type + the label of the parent node
            prefix: only passed down the recursion (grows by '--' per
                level); useful if the layers are to be pretty-printed,
                ignore otherwise
        """
        # init the row if this is the first time this father shows up
        children = self.fieldTree.setdefault(father_id, [])
        # combine node type and label to get the id used in the tree
        my_id = self.__getTreeID(cur_node, cur_label)
        # insert this node into its father's row, once
        if my_id not in children:
            children.append(my_id)
        # if cur_node is a list or EntityList or TypedList or else, enum them;
        # the literal '[i]' suffix makes all items of one list share a label
        if isinstance(cur_node, (list, MutableSequence)):
            for item in cur_node:
                self.iterField(item, cur_label + '[i]', my_id, prefix + '--')
        # if cur_node has listable fields, enum them
        if hasattr(cur_node, '_fields'):
            fdict = self.__dumpObjFields(cur_node)
            for f in fdict:
                # print prefix,f,'<<',fdict[f]#str(father)
                self.iterField(fdict[f], str(f), my_id, prefix + '--')

    def __dumpObjFields(self, obj):
        """Return the ``_fields`` mapping stored in the object's instance dict."""
        return obj.__dict__['_fields']

    def __formatTreeRow(self, father, children):
        """Render one tree row as 'father;child1;child2...'."""
        return str(father) + ';' + ';'.join(str(child) for child in children)

    def __toBytes(self, text):
        """Return text as bytes; byte strings are passed through untouched."""
        return text if isinstance(text, bytes) else text.encode('utf-8')

    def __isUsableFileName(self, filename):
        """True when filename is a non-empty str."""
        return bool(filename) and isinstance(filename, str)

    def printTree2Csv(self, fieldTree, csvfilename):
        """Write the tree structure to a ';'-separated file, one father per line.

        Params:
            fieldTree: dict mapping a father id to the list of its child ids
            csvfilename: path of the file to (over)write
        """
        self.logger.log('info', 'Writing tree to CSV file \'', csvfilename, '\'...')
        # The file is opened in binary mode, so every row is written as bytes.
        with open(csvfilename, 'wb') as f:
            for father in fieldTree:
                row = self.__formatTreeRow(father, fieldTree[father])
                f.write(self.__toBytes(row + '\n'))
        self.logger.log('info', 'Writing finished.')

    def printTree2Console(self, fieldTree):
        """Send the tree structure to the logger, one father per 'rst' entry.

        Params:
            fieldTree: dict mapping a father id to the list of its child ids
        """
        for father in fieldTree:
            self.logger.log('rst', self.__formatTreeRow(father, fieldTree[father]))

    def getTree(self):
        """Return the tree collected so far (the live dict, not a copy)."""
        return self.fieldTree

    def saveTreeToFile(self, fieldTree, filename):
        """Dump the field tree with saveObjToFile() and store it to file.

        Params:
            fieldTree: the tree to store
            filename: non-empty str, target file

        Returns:
            -1 when filename is unusable (an error is logged), None otherwise.
        """
        if not self.__isUsableFileName(filename):
            self.logger.log('err', 'I need a file name where the object is going be saved')
            return -1
        self.logger.log('info', 'Dumping tree object to pickle file \'', filename, '\'...')
        saveObjToFile(fieldTree, filename)
        self.logger.log('info', 'Dumping finished')

    def loadTreeFrFile(self, filename=''):
        """Load a field tree from file via loadObjFrFile().

        Params:
            filename: non-empty str, file to read

        Returns:
            -1 when filename is unusable (an error is logged), otherwise
            whatever loadObjFrFile() returns.
        """
        if not self.__isUsableFileName(filename):
            self.logger.log('err', 'I need a file name to load from')
            return -1
        self.logger.log('info', 'Loading tree object from pickle file \'', filename, '\'...')
        # NOTE(review): the log text says this is a pickle file; if so, only
        # load files from a trusted source -- confirm in loadObjFrFile().
        return loadObjFrFile(filename)
class IOWorker:
    """Writes edge-weight results to files."""

    def __init__(self):
        self.logger = Logger(self)
        # self.is_titles_wrote = False

    def _to_bytes(self, value):
        """Return value as bytes for the binary-mode output file.

        Byte strings pass through untouched, text is encoded as UTF-8 and
        anything else is converted with str() first.
        """
        if isinstance(value, bytes):
            return value
        if not hasattr(value, 'encode'):
            value = str(value)
        return value.encode('utf-8')

    def outputWeightTable(self, weights, filename, rowlist=None, collist=None):
        """Write a parent -> child weight dict to filename as a CSV table.

        Columns are parents and rows are children; a missing parent/child
        pair is written as 0.

        Params:
            weights: ex.: {'parent1': {'child1': 5, 'child2': 4},
                           'parent2': {'child2': 3, 'child3': 1}}
            filename: path of the file to (over)write
            rowlist: ex.: ['child1', 'child2'...]
            collist: ex.: ['parent1', 'parent2'...]
                rowlist and collist are only used when BOTH are given;
                otherwise both are derived from weights.

        Target table:
                    parent1  parent2
            child1  5        0
            child2  4        3
            child3  0        1

        Raises:
            AssertionError: if weights is not a dict, or rowlist/collist is
                neither a list nor None.
        """
        assert isinstance(weights, dict)
        assert isinstance(rowlist, list) or rowlist is None
        assert isinstance(collist, list) or collist is None
        self.logger.log('info', 'writing weight dict to:', filename)
        if rowlist is not None and collist is not None:
            self.logger.log('info', 'using provided row and col index.')
        else:
            self.logger.log('info', 'building row and col index by myself.')
            rowlist, collist = self.get_child_parent_lists(weights)

        target_table = [
            [weights.get(parent_name, {}).get(child_name, 0)
             for parent_name in collist]
            for child_name in rowlist
        ]
        self.logger.log('info', target_table)

        to_bytes = self._to_bytes
        # Header row starts with an empty cell above the child-name column.
        lines = [b''.join(b',' + to_bytes(parent_name) for parent_name in collist)]
        for child_name, row in zip(rowlist, target_table):
            cells = b''.join(b',' + to_bytes(cell) for cell in row)
            lines.append(to_bytes(child_name) + cells)
        # The file is opened in binary mode, so the content is written as bytes.
        with open(filename, 'wb') as f:
            f.write(b'\n'.join(lines) + b'\n')
        self.logger.log('info', 'writing weight dict is done.')
        return
        # degreelist = self.G.

    # Disabled helper, kept for reference:
    # def writeWeightTitle(self, filename, allstructure=None, weights=None):
    #     if allstructure is not None:
    #         self.logger.log('info', 'using provided col names.')
    #     else:
    #         self.logger.log('info', 'building row and col names by myself.')
    #         allstructure = weights
    #     title_row = 'Stix_Name'
    #     for p in allstructure:
    #         for c in allstructure[p]:
    #             # title_row = title_row + p+'_to_'+c + ','
    #             title_row = title_row + ',' + p+'->'+c
    #     with open(filename, 'w') as f:
    #         f.write(title_row + '\n')
    #     # self.is_titles_wrote = True

    def get_child_parent_lists(self, weight_dict):
        """Collect the row (child) and column (parent) names of a weight dict.

        Params:
            weight_dict: dict mapping each parent to a dict of its children

        Returns:
            (child_list, parent_list): parents in the dict's iteration
            order; children in order of first appearance, without
            duplicates.
        """
        parent_list = list(weight_dict)
        child_list = []
        seen = set()  # children are dict keys, hence hashable
        for p in parent_list:
            for c in weight_dict[p]:
                if c not in seen:
                    seen.add(c)
                    child_list.append(c)
        return child_list, parent_list