def __init__(self, b_add_sstream_link=False, b_add_sstream_size=False):
    """Set up parser state, per-construct sub-parsers, and fallback configs.

    :param b_add_sstream_link: when True, append an sstream href to node
        labels when rendering
    :param b_add_sstream_size: when True, query stream sizes through
        SstreamUtility (needs a valid auth ini file)
    """
    self.vars = {}
    # one dedicated parser per SCOPE construct
    self.declare = Declare()
    self.set = Set()
    self.input = Input()
    self.output = Output()
    self.module = Module()
    self.process = Process()
    self.reduce = Reduce()
    self.combine = Combine()
    self.using = Using()
    self.select = Select()
    self.scope_resolver = ScopeResolver()
    self.b_add_sstream_link = b_add_sstream_link
    self.b_add_sstream_size = b_add_sstream_size
    # fix: these two were assigned twice in the original; the redundant
    # duplicate assignments were removed (read_configs overwrites them anyway)
    self.sstream_link_prefix = ""
    self.sstream_link_suffix = ""
    self.external_params = {}
    self.target_date_str = ""
    self.default_datetime = DatetimeUtility.get_datetime()
    if b_add_sstream_size:
        self.ssu = SstreamUtility(
            "d:/workspace/dummydummy.ini")  # specify your auth file path
    # read fallback configs from ini file
    config_filepath = os.path.join(os.path.dirname(__file__), os.pardir,
                                   'config', 'config.ini')
    self.read_configs(config_filepath)
def test_func_datetime_parse(self):
    """resolve_func turns a DateTime.Parse literal into a datetime object."""
    expr = 'DateTime.Parse("2011-01-01")'
    resolved = ScopeResolver().resolve_func(expr)  # datetime instance
    self.assertEqual("2011-01-01", resolved.strftime('%Y-%m-%d'))
def test_func_datetime_parseexact(self):
    """DateTime.ParseExact with concatenated input and an explicit format."""
    expr = 'DateTime.ParseExact("2018-01-01" + " 00:00:00", "yyyy-MM-dd HH:mm:ss", System.Globalization.CultureInfo.InvariantCulture);'
    resolved = ScopeResolver().resolve_func(expr)  # datetime instance
    self.assertEqual("2018-01-01 00:00:00",
                     resolved.strftime('%Y-%m-%d %H:%M:%S'))
def test_resolve_math_abs(self):
    """Math.Abs(-0).ToString() resolves to the string '0'."""
    expr = ''' Math.Abs(-0).ToString() '''
    self.assertEqual('0', ScopeResolver().resolve_func(expr))
def test_resolve_set_rvalue(self):
    """A .Replace call on a declared stream name is applied to its value."""
    rvalue = ''' @FeatureStreamForAccount.Replace(".ss", "_fakeForSanityCheck.ss") '''
    declares = {'@FeatureStreamForAccount': 'aa.ss'}
    resolved = ScopeResolver().resolve_set_rvalue(rvalue, declares)
    self.assertEqual('aa_fakeForSanityCheck.ss', resolved)
def test_inner_string_format(self):
    """A string.Format nested inside a concatenation is fully expanded."""
    rvalue = ''' "aaa__" + string.Format("/path/to/data/prod/pipelines/ImpressionShare/Common"+"/%Y/%m/%d/DSAMerge%Y%m%d%h.ss?date={0}&hour={1}","2018-01-01",22/2*2) '''
    resolved = ScopeResolver().resolve_declare_rvalue(None, rvalue, {})
    expected = 'aaa__/path/to/data/prod/pipelines/ImpressionShare/Common/2018/01/01/DSAMerge2018010100.ss?date=2018-01-01&hour=22'
    self.assertEqual(expected, resolved)
def test_string_format_param_datetime_parse(self):
    """Declared params and DateTime.Parse arguments fill the format string."""
    rvalue = ''' string.Format("{0}/{1:yyyy/MM/dd}/NegativeKWCandidates.ss", @InputPath, DateTime.Parse(@RunDate)) '''
    declares = {'@InputPath': '/path/to', '@RunDate': '2018-01-01'}
    resolved = ScopeResolver().resolve_declare_rvalue(None, rvalue, declares)
    self.assertEqual('/path/to/2018/01/01/NegativeKWCandidates.ss', resolved)
def test_string_format_datetime_parse_to_string(self):
    """AddDays/ToString calls inside string.Format arguments are evaluated."""
    rvalue = ''' string.Format("/path_to/Daily/%Y/%m/Campaign_FiltrationFunnelDaily_%Y%m%d.ss?date={0}...{1}", @dateObj.AddDays(-6).ToString("yyyy-MM-dd"), @dateObj.ToString("yyyy-MM-dd")); '''
    resolved = ScopeResolver().resolve_declare_rvalue(None, rvalue,
                                                      self.declare_map)
    expected = '/path_to/Daily/2018/01/Campaign_FiltrationFunnelDaily_20180101.ss?date=2017-12-26...2018-01-01'
    self.assertEqual(expected, resolved)
def test_string_format_datetime_parse(self):
    """A {1:yyyy-MM-dd} placeholder formats a DateTime.Parse argument."""
    rvalue = ''' string.Format("{0}/BidEstimation/Result/%Y/%m/AuctionContext_%Y-%m-%d.ss?date={1:yyyy-MM-dd}", "path_to", DateTime.Parse("2018-08-03")); '''
    resolved = ScopeResolver().resolve_declare_rvalue(None, rvalue,
                                                      self.declare_map)
    expected = "path_to/BidEstimation/Result/2018/08/AuctionContext_2018-08-03.ss?date=2018-08-03"
    self.assertEqual(expected, resolved)
def test_str_cat_datetime_parse_range(self):
    """Concatenated AddDays/ToString pieces yield a date-range query string."""
    rvalue = ''' "/path_to/%Y/%m/KeywordsSearchCountDaily_%Y-%m-%d.ss?date=" + @dateObj.AddDays(-31).ToString("yyyy-MM-dd") + "..." + @dateObj.AddDays(-1).ToString("yyyy-MM-dd") + "&sparsestreamset=true" '''
    resolved = ScopeResolver().resolve_declare_rvalue(None, rvalue,
                                                      self.declare_map)
    expected = "/path_to/2017/12/KeywordsSearchCountDaily_2017-12-31.ss?date=2017-12-01...2017-12-31&sparsestreamset=true"
    self.assertEqual(expected, resolved)
def test_string_format_idx(self):
    """A plain {0} and a date-formatted {1:yyyy/MM/dd} index are both filled."""
    rvalue = ''' string.Format("{0}/Preparations/MPIProcessing/{1:yyyy/MM/dd}/AuctionWithKeywordAndMT.ss", @KWRawPath, @dateObj) '''
    resolved = ScopeResolver().resolve_declare_rvalue(None, rvalue,
                                                      self.declare_map)
    self.assertEqual(
        'kw_raw_path/Preparations/MPIProcessing/2018/01/01/AuctionWithKeywordAndMT.ss',
        resolved)
def test_string_format_param_str_datetime(self):
    """The same datetime argument can be reused with different formats."""
    rvalue = ''' string.Format("{0}/Flights/{1:yyyy/MM/dd}/AuctionParticipants{1:yyyyMMdd}.ss", "/path/to", @BTERunDate) '''
    declares = {'@BTERunDate': parser.parse('2018-01-01')}
    resolved = ScopeResolver().resolve_declare_rvalue(None, rvalue, declares)
    self.assertEqual(
        '/path/to/Flights/2018/01/01/AuctionParticipants20180101.ss',
        resolved)
def test_string_format_add_days_black_minus(self):
    """AddDays with whitespace around the minus sign still resolves."""
    rvalue = ''' String.Format(@"{0}RawSearchQuery/RawSearchQuery_{1:yyyy-MM-dd}.ss", @INPUT_PATH, @ObjDate.AddDays( - 1)) '''
    declares = {
        '@INPUT_PATH': '/path/to/',
        '@ObjDate': parser.parse('2018-01-01'),
    }
    resolved = ScopeResolver().resolve_declare_rvalue(None, rvalue, declares)
    self.assertEqual('/path/to/RawSearchQuery/RawSearchQuery_2017-12-31.ss',
                     resolved)
def test_string_format_item_int(self):
    """String.Format with a trailing plain-int argument."""
    rvalue = ''' String.Format("{0}/%Y/%m/%d/EligibleAuctionParticipants_%h.ss?date={1}&hour={2}", @SOVRawBasePath, @DATE_UTC, 23) '''
    declares = {
        '@SOVRawBasePath': '/path/to',
        '@DATE_UTC': '2018-01-01',
    }
    resolved = ScopeResolver().resolve_declare_rvalue(None, rvalue, declares)
    self.assertEqual(
        '/path/to/2018/01/01/EligibleAuctionParticipants_00.ss?date=2018-01-01&hour=23',
        resolved)
def test_basic_str_cat(self):
    """Concatenation resolves operands through the declare map first."""
    tokens = ['"abc"', '+', '"123"']
    resolved = ScopeResolver().resolve_basic(tokens, {'"123"': '"ABC"'})
    self.assertEqual("abcABC", resolved)
def test_func_datetime_parse_add_days(self):
    """AddDays shifts the parsed date forward."""
    expr = 'DateTime.Parse("2018-08-01").AddDays(3)'
    resolved = ScopeResolver().resolve_func(expr)
    self.assertEqual("2018-08-04", resolved.strftime('%Y-%m-%d'))
def test_func_math_abs(self):
    """Math.Abs on a negative int returns its magnitude."""
    self.assertEqual(1000, ScopeResolver().resolve_func('Math.Abs(-1000)'))
def test_basic_num(self):
    """A lone numeric token resolves to an int."""
    self.assertEqual(66, ScopeResolver().resolve_basic(["66"], {}))
class ScriptParser(object):
    """Parses a SCOPE script into a node/edge graph for visualization.

    Pipeline: strip comments/#IF/data hints, resolve external params and
    #DECLARE/#SET values, expand LOOPs, then walk ';'-separated statements
    building Node/Edge objects, and finally render via GraphUtility.
    """

    logger = logging.getLogger(__name__)

    def __init__(self, b_add_sstream_link=False, b_add_sstream_size=False):
        """Set up parser state, per-construct sub-parsers, and fallback configs.

        :param b_add_sstream_link: when True, append an sstream href to node labels
        :param b_add_sstream_size: when True, query stream sizes via SstreamUtility
        """
        self.vars = {}
        # one dedicated parser per SCOPE construct
        self.declare = Declare()
        self.set = Set()
        self.input = Input()
        self.output = Output()
        self.module = Module()
        self.process = Process()
        self.reduce = Reduce()
        self.combine = Combine()
        self.using = Using()
        self.select = Select()
        self.scope_resolver = ScopeResolver()
        self.b_add_sstream_link = b_add_sstream_link
        self.b_add_sstream_size = b_add_sstream_size
        self.sstream_link_prefix = ""
        self.sstream_link_suffix = ""
        self.external_params = {}
        # NOTE(review): prefix/suffix are assigned twice; the repeat is redundant
        self.sstream_link_prefix = ""
        self.sstream_link_suffix = ""
        self.target_date_str = ""
        self.default_datetime = DatetimeUtility.get_datetime()
        if b_add_sstream_size:
            self.ssu = SstreamUtility(
                "d:/workspace/dummydummy.ini")  # specify your auth file path
        # read fallback configs from ini file
        config_filepath = os.path.join(os.path.dirname(__file__), os.pardir,
                                       'config', 'config.ini')
        self.read_configs(config_filepath)

    def read_configs(self, filepath):
        """Load sstream link affixes and external params from an ini file."""
        config = configparser.ConfigParser()
        config.optionxform = str  # reserve case
        config.read(filepath)
        self.sstream_link_prefix = config['ScriptParser'][
            'sstream_link_prefix']
        self.sstream_link_suffix = config['ScriptParser'][
            'sstream_link_suffix']
        for key in config['ExternalParam']:
            # '#'-prefixed keys act as commented-out entries
            if key.startswith('#'):
                continue
            self.external_params[key] = config['ExternalParam'][key]
        self.target_date_str = config['ExternalParam']['TARGET_DATE']
        self.default_datetime = parser.parse(self.target_date_str)

    def remove_empty_lines(self, content):
        """Drop blank lines and trailing whitespace from `content`."""
        return "\n".join(
            [ll.rstrip() for ll in content.splitlines() if ll.strip()])

    def remove_comments(self, content):
        """Strip //-style and /* */ comments while keeping string literals."""
        # handy function from https://stackoverflow.com/questions/241327/python-snippet-to-remove-c-and-c-comments
        def replacer(match):
            s = match.group(0)
            if s.startswith('/'):
                return " "  # note: a space and not an empty string
            else:
                # matched a string literal: keep untouched
                return s

        pattern = re.compile(
            r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
            re.DOTALL | re.MULTILINE)
        return re.sub(pattern, replacer, content)

    def remove_if(self, content, keep_if_content=True):
        """Remove #IF/#ELSE/#ENDIF directives, optionally keeping the body."""
        re_if = re.compile(r'#IF.*?\n(.*?)#ENDIF', re.MULTILINE | re.DOTALL)
        if keep_if_content:
            content = re.sub(re_if, '\g<1>', content)
        else:
            content = re.sub(re_if, '', content)
        # remove inner '#ELSE'
        # which means #IF (block_1) #ELSE (block_2) #ENDIF
        # after substitution it will be {block_1} {block_2}
        content = re.sub(r'#ELSE', '', content)
        # remaining #IF, left from nested #IF
        content = re.sub(r'#IF.*?\n', '', content)
        content = re.sub(r'#ENDIF.*?\n', '', content)
        return content

    def resolve_external_params(self, content, params={}):
        """Substitute @@name@@ placeholders with values from `params`."""
        self.logger.debug('params = {}'.format(params))
        re_external_param = re.compile(r'@@(.*?)@@')

        def replace_matched(match):
            text = match.group()
            # unknown params are left as-is
            return params.get(match.group(1), text)

        content = re_external_param.sub(replace_matched, content)
        # NOTE(review): replacements below look like they were meant to be
        # r'"\2' and r'\1"' (backreferences); verify against history
        content = re.sub(r'("")([\w]+)', r'"2', content)
        content = re.sub(r'([\w]+)("")', r'1"', content)
        return content.replace('"""', '""')

    def find_latest_node(self, target_name, nodes):
        """Return the most recent node named `target_name`, or a new Node."""
        for node in nodes[::-1]:
            if node.name == target_name:
                return node
        self.logger.warning(
            'cannot find node [{}]! Probably source node.'.format(target_name))
        return Node(target_name)

    def upsert_node(self, node_map, node_name):
        """Insert a fresh Node under `node_name` if not already present."""
        if node_name not in node_map:
            self.logger.info(
                'cannot find node [{}]! Probably source node.'.format(
                    node_name))
            node_map[node_name] = Node(node_name)

    def get_target_declare_int(self, content, target_key):
        """Find `#DECLARE <target_key> int = N;` in content and return N."""
        # NOTE(review): group is '(\d)+' (single digit captured); probably
        # intended '(\d+)' — confirm before relying on multi-digit counts
        match = re.search(
            '#DECLARE[ \t]+{}[ \t]+int[ \t]+=[ \t]+(\d)+.*;'.format(
                target_key), content)
        if match:
            return int(match.group(1))

    def expand_loop(self, content):
        '''
        Assumption: end bracelet '}' of LOOP is isolated in single line
        If not, should use stack to process char by char
        Note: LOOP is officially unrecommended operation
        https://stackoverflow.microsoft.com/questions/5174/where-can-i-find-more-information-about-the-scope-keyword-loop/5175#5175
        :param content: the scope script body
        :return: expanded content
        '''
        lines = content.splitlines()
        result_lines = []
        loop_on = False        # True while inside a LOOP body
        loop_var = None        # variable substituted as @@var@@
        loop_count = 0         # iteration count (int-like or declare ref)
        loop_content = []      # buffered body lines of the current LOOP
        for line in lines:
            if loop_on:
                # string.Format has format placeholder {}
                if '}' in line.strip() and not 'string.format' in line.lower():
                    loop_on = False
                    # it can also be param in declare_map, just ignore this case now
                    if self.is_int(loop_count):
                        for i in range(int(loop_count)):
                            for content_line in loop_content:
                                result_lines.append(
                                    content_line.replace(
                                        '@@{}@@'.format(loop_var), str(i)))
                    else:
                        result_lines.extend(loop_content)
                    # anything after enclosing should be kept
                    result_lines.append(line.replace('}', ''))
                    continue
                elif line.strip() == '{':
                    continue
                loop_content.append(line)
                continue
            if 'LOOP' in line:
                var, loop_count = Loop().get_var_loop_count(line)
                self.logger.debug(
                    'found keyword LOOP, var = {}, loop_count = {}'.format(
                        var, loop_count))
                if var is not None and loop_count is not None:
                    if loop_count.startswith('@'):
                        # declare
                        loop_count = self.get_target_declare_int(
                            content, loop_count[1:].strip())
                    loop_on = True
                    loop_var = var
                    loop_content = []
                    continue
            result_lines.append(line)
        return '\n'.join(result_lines)

    def remove_data_hint(self, content):
        """Strip bracketed data-hint annotations from the script text."""
        # [ROWCOUNT=100]
        re_dh_1 = re.compile(r'\[.+?=[ ]?\d+?\]')
        content = re.sub(re_dh_1, '', content)
        # [LOWDISTINCTNESS(MatchTypeId)]
        re_dh_2 = re.compile(r'\[LOWDISTINCTNESS[ ]*\(.*\)\]')
        content = re.sub(re_dh_2, '', content)
        # [PARTITION=(PARTITIONCOUNT=2000)]
        re_dh_3 = re.compile(r'\[.+?=\(.+=.+\)\]')
        content = re.sub(re_dh_3, '', content)
        # [Privacy.xxx]
        re_dh_4 = re.compile(r'\[Privacy\..+?]')
        content = re.sub(re_dh_4, '', content)
        # [ PARTITION(BiddedKeyword) ]
        # [ STAGEBOUNDARYONOUTPUT ]
        # [ PARTITION(Query, DeviceTypeId) ]
        re_dh_5 = re.compile(r'\[[ ]*[a-zA-Z\(\), ]+[ ]*\]')
        content = re.sub(re_dh_5, '', content)
        return content

    def remove_split_reserved_char(self, content):
        """Drop quoted ';' literals so they don't break the ';' split."""
        content = re.sub("';'", '', content)
        content = re.sub('";"', '', content)
        return content

    def remove_ascii_non_target(self, content):
        """Drop non-ASCII characters from the content."""
        # such as ASCII 160, 194
        return content.encode('ascii', 'ignore').decode()

    def remove_view_template(self, content):
        """Return the body between CREATE VIEW ... AS BEGIN and END;."""
        re_view = re.compile('.*CREATE VIEW.*?AS BEGIN(.*)END;',
                             re.DOTALL | re.MULTILINE)
        match = re.match(re_view, content)
        if match:
            return match.group(1)
        return content

    def get_module_views(self, content):
        """Map each VIEW name in a .module file to its body text."""
        re_module_view = re.compile('.*?VIEW(.*?)RETURN.*?BEGIN(.*?)END VIEW',
                                    re.DOTALL | re.MULTILINE)
        occurs = re.findall(re_module_view, content)
        ret = {}
        for occur in occurs:
            view_name, body = occur
            ret[view_name.strip()] = body
        return ret

    def is_input_sstream(self, node):
        """True if the node represents an SSTREAM input (by name prefix)."""
        if node.name.startswith('SSTREAM_'):
            return True
        return False

    def is_output(self, node):
        """True if the node was tagged as an OUTPUT node."""
        if node.attr.get('type', None) == 'output':
            return True
        return False

    def is_int(self, s):
        """True if `s` converts cleanly to int."""
        try:
            int(s)
            return True
        except ValueError:
            return False

    def add_sstream_info(self, nodes, declare_map, url=False, using=False):
        """Dispatch to URL and/or USING label enrichment."""
        if url:
            self.add_sstream_info_url(nodes, declare_map)
        if using:
            self.add_sstream_info_using(nodes)

    def add_sstream_info_url(self, nodes, declare_map):
        """Append resolved stream URLs (and optionally sizes) to node labels."""
        for node in nodes:
            param = ''
            # only target SSTREAM and OUTPUT
            if self.is_input_sstream(node):
                param = node.name[node.name.index('_') + 1:]
            elif self.is_output(node):
                # skip debug files
                if 'debug' in node.name.lower():
                    continue
                param = node.name
            else:
                continue
            if not param in declare_map:
                self.logger.info(
                    'param [{}] not in declare_map, probably local reference. Ignore for now.'
                    .format(param))
                continue
            body_str = declare_map[param]
            # remove date or streamset query
            if isinstance(body_str, str) and '?' in body_str:
                self.logger.info(
                    'remove query string of [{}]'.format(body_str))
                body_str = body_str[:body_str.index('?')]
            # query string supports %Y%m%d replacement,
            # if there are %Y%m%d in url, use default datetime to replace it
            if '%Y' in body_str or '%m' in body_str or '%d' in body_str:
                body_str = self.default_datetime.strftime(body_str)
            # change node label to html format for different font size
            # ignore if already inserted href
            if not 'FONT' in node.attr['label']:
                href = '{}{}{}'.format(
                    self.sstream_link_prefix,
                    body_str.replace('"', '').replace('\n', ''),
                    self.sstream_link_suffix)
                the_label = node.attr['label']
                if self.b_add_sstream_size:
                    self.logger.info(
                        'trying to get stream size of [{}]'.format(href))
                    stream_size = ''
                    # unresolved string.Format leftovers mean a malformed URL
                    if 'tring.Format' not in href:
                        stream_size = self.ssu.get_stream_size(href)
                    else:
                        self.logger.warning(
                            'skip not well-formed url [{}]'.format(href))
                    the_label = '{} ({})'.format(the_label, stream_size)
                if self.b_add_sstream_link:
                    the_label = '<{} <BR/> <FONT POINT-SIZE="4">{}</FONT>>'.format(
                        the_label, href)
                # node.attr['href'] = href  # not work when rendered to pdf, works in jupyter
                node.attr['label'] = the_label

    def add_sstream_info_using(self, nodes):
        """Highlight nodes carrying a USING attribute with a yellow label."""
        # for highlight PROCESS/REDUCE ... USING
        for node in nodes:
            if 'using' in node.attr and 'FONT' not in node.attr['label']:
                label = '<{} <BR/> <FONT POINT-SIZE="8">-- {} --</FONT>>'.format(
                    node.attr['label'], node.attr['using'])
                node.attr['label'] = label
                node.attr['fillcolor'] = 'yellow'
                node.attr['style'] = 'filled'

    def change_node_color(self, nodes):
        """Color input-type nodes by their name prefix (SSTREAM, VIEW, ...)."""
        for node in nodes:
            self.logger.debug('node = {}'.format(node))
            if '_' not in node.name:
                continue
            if node.name == 'SCOPE_IMPLICIT':
                node.attr['type'] = 'input'
                node.attr['style'] = 'filled'
                node.attr['fillcolor'] = 'gray'
                continue
            input_type = node.name.split('_')[0]
            if input_type not in [
                    'SSTREAM', 'SSTREAM<STREAMSET>', 'EXTRACT', 'MODULE',
                    'VIEW', 'FUNC'
            ]:
                continue
            attr = node.attr
            attr.update({'type': 'input', 'style': 'filled'})
            # color scheme: https://www.graphviz.org/doc/info/colors.html#brewer
            if input_type == 'SSTREAM':
                attr['fillcolor'] = 'greenyellow'
            elif input_type == 'SSTREAM<STREAMSET>':
                attr['fillcolor'] = 'wheat'
            elif input_type == 'EXTRACT':
                attr['fillcolor'] = 'honeydew'
            elif input_type == 'MODULE':
                attr['fillcolor'] = 'sandybrown'
            elif input_type == 'VIEW':
                attr['fillcolor'] = 'lightpink'
            elif input_type == 'FUNC':
                attr['fillcolor'] = 'lightblue'
            node.attr.update(attr)

    def process_output(self, part, node_map, all_nodes, edges):
        """Parse an OUTPUT statement and connect its sources to a sink node."""
        d = self.output.parse(part)
        self.logger.debug(d)
        to_node = Node(d['path'],
                       attr={
                           'type': 'output',
                           'style': 'filled',
                           'fillcolor': 'tomato'
                       })
        source_names = d['idents']
        if not source_names:
            # no explicit source: chain from the most recent node
            from_node = node_map['last_node']
            edges.append(Edge(from_node, to_node))
        else:
            for source_name in source_names:
                from_node = node_map.get(source_name, node_map['last_node'])
                edges.append(Edge(from_node, to_node))
        all_nodes.append(to_node)

    def process_extract(self, part, node_map, all_nodes, edges):
        """Handle an EXTRACT statement via the generic input parser."""
        self.process_core(part, node_map, all_nodes, edges,
                          self.input.parse(part))

    def process_view(self, part, node_map, all_nodes, edges):
        """Handle a VIEW statement via the generic input parser."""
        self.process_core(part, node_map, all_nodes, edges,
                          self.input.parse(part))

    def process_input_sstream(self, part, node_map, all_nodes, edges):
        """Handle an SSTREAM input statement via the generic input parser."""
        self.process_core(part, node_map, all_nodes, edges,
                          self.input.parse(part))

    def process_import(self, part, node_map, all_nodes, edges):
        """Handle an IMPORT statement via the generic input parser."""
        self.process_core(part, node_map, all_nodes, edges,
                          self.input.parse(part))

    def process_input_module(self, part, node_map, all_nodes, edges):
        """Handle a module-reference statement via the generic input parser."""
        self.process_core(part, node_map, all_nodes, edges,
                          self.input.parse(part))

    def process_process(self, part, node_map, all_nodes, edges):
        """Handle a PROCESS statement."""
        self.logger.debug('process_process')
        self.process_core(part, node_map, all_nodes, edges,
                          self.process.parse(part))

    def process_reduce(self, part, node_map, all_nodes, edges):
        """Handle a REDUCE statement."""
        self.process_core(part, node_map, all_nodes, edges,
                          self.reduce.parse(part))

    def process_combine(self, part, node_map, all_nodes, edges):
        """Handle a COMBINE statement."""
        self.process_core(part, node_map, all_nodes, edges,
                          self.combine.parse(part))

    def process_select(self, part, node_map, all_nodes, edges):
        """Handle a SELECT statement."""
        self.process_core(part, node_map, all_nodes, edges,
                          self.select.parse(part))

    def connect_module_params(self, node_map, all_nodes, edges, dest_node,
                              params):
        """Add edges from already-seen param nodes into `dest_node`."""
        for param in params:
            if not param in node_map:
                # do nothing if param not appeared before
                continue
            param_node = node_map[param]
            edges.append(Edge(param_node, dest_node))

    def process_core(self, part, node_map, all_nodes, edges, d):
        """Wire parsed sources to the assigned/implicit target node.

        `d` is the dict produced by one of the statement parsers; expected
        keys: 'sources', 'assign_var', optionally 'params' and 'using'.
        """
        from_nodes = []
        to_node = None
        for source in d['sources']:
            self.upsert_node(
                node_map,
                source)  # first, check and upsert if not in node_map
            from_nodes.append(node_map[source])
            if source.startswith('MODULE_'):
                self.connect_module_params(node_map, all_nodes, edges,
                                           node_map[source],
                                           d.get('params', []))
            if '.' in source:
                # dotted name: link the main node to its member node
                main_node_name = source.split('.')[0]
                if main_node_name in node_map:
                    edges.append(
                        Edge(node_map[main_node_name], node_map[source]))
        if len(from_nodes) == 0:
            from_nodes.append(node_map['last_node'])
        if d['assign_var']:
            attr = {}
            node_name = d['assign_var']
            if node_name in node_map:
                # re-assignment: inherit existing attributes
                attr = node_map[node_name].attr
            # for those like PROCESS ... USING
            if 'using' in d:
                attr['using'] = d['using']
            new_node = Node(node_name, attr=attr)
            node_map[node_name] = new_node  # update
            to_node = new_node
        else:
            if not node_map['last_node']:
                new_node = Node("SCOPE_IMPLICIT")
                to_node = new_node
            else:
                to_node = node_map['last_node']
        for from_node in from_nodes:
            edges.append(Edge(from_node, to_node))
            all_nodes.append(from_node)
        all_nodes.append(to_node)
        node_map['last_node'] = to_node

    def process_declare(self, part, declare_map):
        """Parse a #DECLARE and store its eagerly-resolved value."""
        key, value = self.declare.parse(part)
        self.logger.debug('process_declare key [{}], value [{}]'.format(
            key, value))
        # ignore MAP for now
        # NOTE(review): the 'MAP' placeholder is immediately overwritten by
        # the assignment below — confirm whether an early return was intended
        if 'MAP' in value:
            declare_map['@' + key] = 'MAP'
        # back-to-back double quotes are from resolving external params
        # case: "@@ExtParam@@" with @@ExtParam@@ = \"some_string\"
        declare_map['@' + key] = value.replace('""', '"')
        # early resolve
        result = self.scope_resolver.resolve_declare_rvalue(
            None, declare_map['@' + key], declare_map)
        declare_map['@' + key] = result
        self.logger.info('declare [{}] as [{}]'.format(key, result))

    def process_set(self, part, declare_map):
        """Parse a #SET and store its resolved value (IF is unsupported)."""
        key, value = self.set.parse(part)
        if 'IF' in value:
            self.logger.info('for now, we do not handle IF statement.')
            return
        declare_lvalue = key
        declare_rvalue = value
        declare_map['@' + key] = self.scope_resolver.resolve_declare_rvalue(
            declare_lvalue, declare_rvalue, declare_map)
        self.logger.info('set [{}] as [{}]'.format(key, value))

    def update_module_view_data(self, final_nodes, final_edges, nodes, edges,
                                view_name):
        """Namespace node names with the view and merge into the final lists."""
        processed = set()
        for node in nodes:
            # nodes may be duplicate because we recorded the whole appearance
            if node in processed:
                continue
            node.name = '<{}>_{}'.format(view_name, node.name)
            processed.add(node)
        final_nodes.extend(nodes)
        final_edges.extend(edges)

    def parse_file(self, filepath, external_params={}, dest_filepath=None):
        """Parse a .script/.view/.module file and optionally render a graph.

        :param filepath: the script file to parse
        :param external_params: overrides merged into self.external_params;
            date-like keys with yyyy/MM/dd-style values are normalized
        :param dest_filepath: when given, write .gexf/.dot/graphviz output
        """
        self.logger.info('parse_file [{}]'.format(filepath))
        self.logger.debug('file [{}], external_params = {}'.format(
            filepath, external_params))
        # keep date key because external params from config is probably yyyy-MM-dd format
        for key in external_params:
            if 'date' in key.lower() or 'hour' in key.lower(
            ) or 'time' in key.lower():
                if 'yyyy' in external_params[key] or 'mm' in external_params[
                        key] or 'dd' in external_params[key]:
                    # value is a date *format*, not a date: normalize it and
                    # render it with the fallback TARGET_DATE when unset
                    normalized_format = ScopeResolver.to_normalized_time_format(
                        external_params[key])
                    normalized_format = normalized_format.replace('{', '') \
                        .replace('}', '') \
                        .replace('@', '') \
                        .replace('"', '')
                    self.logger.debug(
                        'external_param datetime format = {}, normalized to {}'
                        .format(external_params[key], normalized_format))
                    if key not in self.external_params:
                        self.logger.debug(
                            'use TARGET_DATE [{}] in config.ini as datatime'.
                            format(self.target_date_str))
                        default_datetime = parser.parse(self.target_date_str)
                        self.external_params[key] = '"{}"'.format(
                            default_datetime.strftime(normalized_format))
                    self.logger.debug(
                        'set self.external_params[{}] to [{}]'.format(
                            key, self.external_params[key]))
                    continue
            self.external_params[key] = external_params[key]
            self.logger.debug(
                'update external_param key [{}] to value [{}]'.format(
                    key, self.external_params[key]))
        content = FileUtility.get_file_content(filepath)
        final_nodes = []
        final_edges = []
        if filepath.endswith('.module'):
            # a module contains several views; parse each body separately
            d = self.get_module_views(content)
            for view_name in d:
                content = d[view_name]
                nodes, edges = self.parse_content(content, external_params)
                self.update_module_view_data(final_nodes, final_edges, nodes,
                                             edges, view_name)
        if filepath.endswith('.view'):
            content = self.remove_view_template(content)
            final_nodes, final_edges = self.parse_content(
                content, external_params)
        if filepath.endswith('.script'):
            final_nodes, final_edges = self.parse_content(
                content, external_params)
        if dest_filepath:
            self.to_graph(dest_filepath, final_nodes, final_edges)
        # save cosmos querying results
        if self.b_add_sstream_size:
            self.ssu.refresh_cache()

    def get_parse_type(self, part):
        '''
        Use the first occurred keyword as parsing type
        :param part: the content part for parsing
        :return: the keyword as parse type
        '''
        keywords = {
            'OUTPUT', 'REDUCE', 'SELECT', 'PROCESS', 'COMBINE', 'SSTREAM',
            'EXTRACT', 'VIEW', 'IMPORT', 'USING'
        }
        for word in part.split():
            word = word.strip()
            if word in keywords:
                return word
        return None

    def parse_content(self, content, external_params={}):
        """Run the cleanup pipeline, then dispatch each ';' statement.

        :return: (all_nodes, edges) for graph construction
        """
        content = self.remove_comments(content)
        content = self.remove_if(content)
        content = self.remove_if(content)  # for nested if
        content = self.resolve_external_params(content, self.external_params)
        content = self.expand_loop(content)
        content = self.remove_data_hint(content)
        content = self.remove_split_reserved_char(content)
        content = self.remove_ascii_non_target(content)
        parts = content.split(';')
        declare_map = {}
        node_map = {'last_node': None}
        edges = []
        all_nodes = [
        ]  # add node to networkx ourself, missing nodes in edges will be added automatically
        # and the id of auto-added nodes are not controllable
        for part in parts:
            self.logger.debug('-' * 20)
            self.logger.debug(part)
            # ignore data after C# block
            if '#CS' in part:
                self.logger.info('meet CS block, break parsing.')
                break
            if '#DECLARE' in part:
                # some files contain prefix unicode string
                self.process_declare(part, declare_map)
                continue
            elif '#SET' in part:
                self.process_set(part, declare_map)
                continue
            parse_type = self.get_parse_type(part)
            self.logger.debug('parse_type = {}'.format(parse_type))
            if parse_type == 'IMPORT':
                self.logger.info('not support IMPORT for now.')
            elif parse_type == 'OUTPUT':
                self.process_output(part, node_map, all_nodes, edges)
            elif parse_type == 'REDUCE':
                self.process_reduce(part, node_map, all_nodes, edges)
            elif parse_type == 'COMBINE':
                self.process_combine(part, node_map, all_nodes, edges)
            elif parse_type == 'SELECT':
                self.process_select(part, node_map, all_nodes, edges)
            elif parse_type == 'SSTREAM':
                self.process_input_sstream(part, node_map, all_nodes, edges)
            elif parse_type == 'EXTRACT':
                self.process_extract(part, node_map, all_nodes, edges)
            elif parse_type == 'VIEW':
                self.process_view(part, node_map, all_nodes, edges)
            elif parse_type == 'PROCESS':
                self.process_process(part, node_map, all_nodes, edges)
            elif parse_type == 'USING':
                self.logger.info('not support USING for now.')
            else:
                # fall back to treating the part as a module reference
                try:
                    self.process_input_module(part, node_map, all_nodes,
                                              edges)
                except Exception as ex:
                    self.logger.warning(ex)
                    pass
        self.logger.info(declare_map)
        self.scope_resolver.resolve_declare(declare_map)
        self.logger.info('change node color for output')
        self.change_node_color(all_nodes)
        if self.b_add_sstream_link or self.b_add_sstream_size:
            self.add_sstream_info(all_nodes, declare_map, url=True)
        # always add using info
        self.add_sstream_info(all_nodes, declare_map, using=True)
        return all_nodes, edges

    def to_graph(self, dest_filepath, nodes, edges):
        """Emit .gexf and .dot files, then render via graphviz (pdf or svg)."""
        gu = GraphUtility(nodes, edges)
        gexf_output_file = gu.to_gexf_file(dest_filepath)
        self.logger.info('output .gexf file to [{}]'.format(gexf_output_file))
        dot_output_file = gu.to_dot_file(dest_filepath)
        self.logger.info('output .dot file to [{}]'.format(dot_output_file))
        self.logger.info('render graphviz file')
        try:
            gu.dot_to_graphviz(dot_output_file, format='pdf')
        except Exception as ex:
            self.logger.warning('failed converting to pdf, try svg')
            gu.dot_to_graphviz(dot_output_file, format='svg')
def test_str_format(self):
    """resolve_str_format fills positional placeholders in order."""
    template = '/{0}/{1}-{2}'
    resolved = ScopeResolver().resolve_str_format(template,
                                                  ['AAA', 'BBB', 'CCC'], {})
    self.assertEqual("/AAA/BBB-CCC", resolved)
def parse_file(self, filepath, external_params={}, dest_filepath=None):
    """Parse a .script/.view/.module file and optionally render a graph.

    :param filepath: the script file to parse
    :param external_params: overrides merged into self.external_params;
        date-like keys carrying yyyy/MM/dd-style format values are normalized
    :param dest_filepath: when given, write .gexf/.dot/graphviz output
    """
    self.logger.info('parse_file [{}]'.format(filepath))
    self.logger.debug('file [{}], external_params = {}'.format(
        filepath, external_params))
    # keep date key because external params from config is probably yyyy-MM-dd format
    for key in external_params:
        if 'date' in key.lower() or 'hour' in key.lower(
        ) or 'time' in key.lower():
            if 'yyyy' in external_params[key] or 'mm' in external_params[
                    key] or 'dd' in external_params[key]:
                # value is a date *format*, not a date: normalize it and
                # render it with the fallback TARGET_DATE when unset
                normalized_format = ScopeResolver.to_normalized_time_format(
                    external_params[key])
                normalized_format = normalized_format.replace('{', '') \
                    .replace('}', '') \
                    .replace('@', '') \
                    .replace('"', '')
                self.logger.debug(
                    'external_param datetime format = {}, normalized to {}'
                    .format(external_params[key], normalized_format))
                if key not in self.external_params:
                    self.logger.debug(
                        'use TARGET_DATE [{}] in config.ini as datatime'.
                        format(self.target_date_str))
                    default_datetime = parser.parse(self.target_date_str)
                    self.external_params[key] = '"{}"'.format(
                        default_datetime.strftime(normalized_format))
                self.logger.debug(
                    'set self.external_params[{}] to [{}]'.format(
                        key, self.external_params[key]))
                continue
        self.external_params[key] = external_params[key]
        self.logger.debug(
            'update external_param key [{}] to value [{}]'.format(
                key, self.external_params[key]))
    content = FileUtility.get_file_content(filepath)
    final_nodes = []
    final_edges = []
    if filepath.endswith('.module'):
        # a module contains several views; parse each body separately
        d = self.get_module_views(content)
        for view_name in d:
            content = d[view_name]
            nodes, edges = self.parse_content(content, external_params)
            self.update_module_view_data(final_nodes, final_edges, nodes,
                                         edges, view_name)
    if filepath.endswith('.view'):
        content = self.remove_view_template(content)
        final_nodes, final_edges = self.parse_content(
            content, external_params)
    if filepath.endswith('.script'):
        final_nodes, final_edges = self.parse_content(
            content, external_params)
    if dest_filepath:
        self.to_graph(dest_filepath, final_nodes, final_edges)
    # save cosmos querying results
    if self.b_add_sstream_size:
        self.ssu.refresh_cache()
def test_func_int_parse(self):
    """int.Parse yields a Python int."""
    self.assertEqual(1000, ScopeResolver().resolve_func('int.Parse("1000")'))