def removeReassignedRDDs(loop, cache_candidate_set):
	"""
	Drops from cache_candidate_set every name that findReassignedRDD
	reports for the given loop text.

	The candidate names are OR-ed together into one regex alternation that
	is handed to findReassignedRDD along with the loop's comment spans.
	cache_candidate_set is modified in place and the same object is returned.
	"""
	# One alternation pattern covering all current candidates.
	candidate_alternation = '|'.join(cache_candidate_set)
	loop_comment_spans = opt.findCommentSpans(loop)

	reassigned_rdds = findReassignedRDD(loop, candidate_alternation, loop_comment_spans)
	cache_candidate_set.difference_update(reassigned_rdds)

	return cache_candidate_set
def removeCachedRDDs(cache_candidate_set, application_code, end_limit, func_spans):
	"""
	Builds a new set with the candidates for which opt.isCached answers
	False when queried with application_code, end_limit and func_spans.

	cache_candidate_set itself is only iterated, not modified.
	"""
	code_comment_spans = opt.findCommentSpans(application_code)
	# The explicit "== False" is kept: a None result from opt.isCached must
	# not be treated as "not cached", which a plain "not" would do.
	return {
		candidate
		for candidate in cache_candidate_set
		if opt.isCached(candidate, code_comment_spans, application_code, end_limit, func_spans) == False
	}
def initBeforeLoop(application_code, rdd, end_limit, func_spans, func_rdd_args):
	"""
	Reports whether rdd is already initialised before the loop.

	Returns True when rdd equals one of func_rdd_args, or when a "val"/"var"
	assignment to rdd is found outside comments in the search region that
	opt.extractSearchRegion builds from func_spans limited by end_limit.
	Returns False otherwise.
	"""
	# A candidate that arrives as a function argument counts as initialised.
	if any(rdd_arg == rdd for rdd_arg in func_rdd_args):
		return True

	span_with_limit = opt.spansWithEndLimit(func_spans, end_limit)
	search_region = opt.extractSearchRegion(span_with_limit, application_code)
	# Comment spans are computed on search_region because the match offsets
	# below are relative to that same text.
	comments_span_list = opt.findCommentSpans(search_region)

	# NOTE(review): rdd is interpolated unescaped and "\s*" permits zero
	# whitespace after val/var, so e.g. "value =" matches for rdd "ue" -
	# confirm whether a stricter pattern is wanted.
	matched_iter = re.finditer(r'(val|var)\s*(%s)\s*?='%rdd, search_region, re.S|re.X|re.M)

	# Hand the comment spans to inComment, as getRDDsFromLoops does; they were
	# computed here but never passed. One uncommented match is enough.
	return any(
		not opt.inComment(matched_obj, search_region, comments_span_list)
		for matched_obj in matched_iter
	)
def getRDDsFromLoops(loop, rdd_actions, rdd_functions):
	"""
	Finds all RDD candidate names in loop and returns them as a set.

	Three kinds of occurrences are collected, each only when opt.inComment
	does not place the match inside a comment:
	  * receiver form  - <name>.<action>
	  * argument form  - <action>(<name>)
	  * calls to the functions listed in rdd_functions

	rdd_actions is interpolated into the regexes as an alternation of action
	names. Each rdd_functions entry is indexed as: [0] function name,
	[4] number of arguments, [5] 0-based positions of the arguments to
	collect as RDD names.
	"""
	comments_span_list = opt.findCommentSpans(loop)
	regex_flags = re.S|re.X|re.M
	rdd_set = set()

	# Receiver form: the word directly before ".<action>".
	for matched_obj in re.finditer(r'(\w+?)\.(%s)'%rdd_actions, loop, regex_flags):
		if not opt.inComment(matched_obj, loop, comments_span_list):
			rdd_set.add(matched_obj.group(1))

	# Argument form: the single word passed to "<action>( ... )".
	for matched_obj in re.finditer(r'(%s)\(\s*(\w+?)\s*\)'%rdd_actions, loop, regex_flags):
		if not opt.inComment(matched_obj, loop, comments_span_list):
			rdd_set.add(matched_obj.group(2))

	# Capture calls to functions that are not default RDD functions.
	for rdd_func in rdd_functions:
		func_name = rdd_func[0]
		num_args = rdd_func[4]
		arg_pos_array = rdd_func[5]

		# One capture group per argument, comma separated. Raw string so the
		# regex escapes are not parsed as (invalid) string escapes.
		arg_regex_pattern = ",".join([r"\s*(\w+?)\s*"] * num_args)
		call_pattern = r"{0}\s*\({1}\)".format(func_name, arg_regex_pattern)

		for matched_obj in re.finditer(call_pattern, loop, regex_flags):
			if not opt.inComment(matched_obj, loop, comments_span_list):
				# Groups are 1-based, argument positions 0-based.
				for arg_pos in arg_pos_array:
					rdd_set.add(matched_obj.group(arg_pos+1))

	return rdd_set