def removeReassignedRDDs(loop, cache_candidate_set):
    """
    Drop from `cache_candidate_set` every RDD reported as reassigned in `loop`.

    The candidate names are joined with '|' into one alternation pattern and
    handed, together with the comment spans of `loop`, to findReassignedRDD.
    The set is modified in place and the same object is returned.
    """
    # One alternation pattern that covers every candidate name
    alternation = '|'.join(cache_candidate_set)
    comment_spans = opt.findCommentSpans(loop)
    reassigned = findReassignedRDD(loop, alternation, comment_spans)

    # In-place removal: the caller's set object is the one being returned
    cache_candidate_set.difference_update(reassigned)
    return cache_candidate_set
def removeCachedRDDs(cache_candidate_set, application_code, end_limit, func_spans):
    """
    Return a new set holding the candidates for which opt.isCached is False.

    `application_code`, `end_limit` and `func_spans` are forwarded unchanged
    to opt.isCached for every candidate. The input set is left untouched.
    """
    # The comment spans depend only on the code, so they are computed once
    comment_spans = opt.findCommentSpans(application_code)

    # The explicit comparison with False is intentional: only results that
    # compare equal to False keep the candidate in the output.
    return {
        candidate
        for candidate in cache_candidate_set
        if opt.isCached(candidate, comment_spans, application_code,
                        end_limit, func_spans) == False
    }
def initBeforeLoop(application_code, rdd, end_limit, func_spans, func_rdd_args):
    """
    Tell whether `rdd` is already defined before `end_limit`.

    Returns True when `rdd` equals one of the entries of `func_rdd_args`, or
    when an uncommented `val`/`var` assignment to `rdd` is found in the search
    region that opt.extractSearchRegion builds from `application_code` and
    the spans returned by opt.spansWithEndLimit(func_spans, end_limit).
    Returns False otherwise.
    """
    # An RDD received as a function argument counts as initialised
    if any(rdd_arg == rdd for rdd_arg in func_rdd_args):
        return True

    span_with_limit = opt.spansWithEndLimit(func_spans, end_limit)
    search_region = opt.extractSearchRegion(span_with_limit, application_code)
    comments_span_list = opt.findCommentSpans(search_region)

    # `rdd` is interpolated into the pattern as-is (it is not escaped)
    declarations = re.finditer(r'(val|var)\s*(%s)\s*?=' % rdd,
                               search_region, re.S | re.X | re.M)

    # Hand the comment spans to inComment, the same way getRDDsFromLoops
    # does; previously they were computed here but never used.
    return any(
        not opt.inComment(matched_obj, search_region, comments_span_list)
        for matched_obj in declarations
    )
def getRDDsFromLoops(loop, rdd_actions, rdd_functions):
    """
    Find all RDD candidates in `loop` and return their names as a set.

    Three kinds of uncommented occurrences are collected:
      * `<name>.<action>`   - the word in front of the dot,
      * `<action>(<name>)`  - the single word between the parentheses,
      * `<func>(<a0>, <a1>, ...)` for every entry of `rdd_functions`.

    `rdd_actions` is interpolated into the regular expressions as-is, so it
    is expected to be a regex fragment (presumably an alternation of action
    names - confirm against the caller).

    Each entry of `rdd_functions` is indexed as follows: [0] is the function
    name, [4] the number of arguments and [5] the zero-based positions of
    the arguments to record.
    """
    regex_flags = re.S | re.X | re.M
    comments_span_list = opt.findCommentSpans(loop)
    rdd_set = set()

    # <name>.<action>
    for matched_obj in re.finditer(r'(\w+?)\.(%s)' % rdd_actions, loop, regex_flags):
        if not opt.inComment(matched_obj, loop, comments_span_list):
            rdd_set.add(matched_obj.group(1))

    # <action>(<name>)
    for matched_obj in re.finditer(r'(%s)\(\s*(\w+?)\s*\)' % rdd_actions, loop, regex_flags):
        if not opt.inComment(matched_obj, loop, comments_span_list):
            rdd_set.add(matched_obj.group(2))

    # Functions defined in the analysed code that are not default RDD functions
    for rdd_func in rdd_functions:
        func_name = rdd_func[0]
        num_args = rdd_func[4]
        arg_pos_array = rdd_func[5]

        # One capturing group per argument, separated by commas
        arg_regex_pattern = ",".join([r"\s*(\w+?)\s*"] * num_args)

        # re.X ignores the whitespace that surrounds the pattern text
        func_arg_matched_iter = re.finditer(
            r""" {0}\s*\({1}\) """.format(func_name, arg_regex_pattern),
            loop, regex_flags)
        for matched_obj in func_arg_matched_iter:
            if not opt.inComment(matched_obj, loop, comments_span_list):
                # Group numbers are 1-based while the positions are 0-based
                for arg_pos in arg_pos_array:
                    rdd_set.add(matched_obj.group(arg_pos + 1))

    return rdd_set