def get_features(kewords_special_symbols_fn, source_fn, source_str,
                 block_comment_regex, line_comment_regex, numbers_regex,
                 strings_regex, find_words, find_regexes, types_regex,
                 keywords_for_not_general, pygments_lexer):
  """Build the feature row and the duplication reports for one source text.

  The text is taken from ``source_str``; when that is None or empty and
  ``source_fn`` is non-empty, the text is read with ``myfile.get_text``.

  Returns:
    ``(None, None)`` when the text is None or at most one character long.
    Otherwise a 6-tuple
    ``(header, values, duplications, xp_duplications,
    poor_identifier_display, magic_numbers_display)`` where ``header`` and
    ``values`` are parallel lists (column names / column values).

    NOTE: the two return shapes have different lengths, so callers must
    check for the invalid-file case before unpacking six values.
  """
  keys_dt, ssym_dt = get_keys_ssym_dicts(kewords_special_symbols_fn)

  # Only fall back to reading the file when no usable text was passed in.
  if not source_str and source_fn:
    source_str = myfile.get_text(source_fn)

  if source_str is None or len(source_str) <= 1:  # INVALID_FILE
    return (None, None)

  newline = mypatmat.NEWLINES[0]

  # Prime with a trailing newline, then count newlines before any clean-up.
  source_str = source_str + newline
  init_loc = mypatmat.count_each_symbol(source_str, [newline])[0]

  source_str = mypatmat.remove_double_newline(source_str)
  # Remove escaped symbols for simplicity.
  source_str = re.sub(r'\\.', ' ', source_str)
  # Treat chars as strings for simplicity.
  source_str = source_str.replace(SINGLE_QUOTE_DELIMITER, DOUBLE_QUOTE_DELIMITER)

  # Snapshot taken before comments are stripped; used for reconstruction below.
  theoriginal_str = source_str

  (block_comments, source_str) = mypatmat.strip_regex_but_keep_newlines(
      source_str, block_comment_regex)
  (line_comments, source_str) = mypatmat.strip_regex_but_keep_newlines(
      source_str, line_comment_regex)

  num_block_comments = len(block_comments)
  num_line_comments = len(line_comments)
  # Newlines in the newline-joined block comments plus one per line comment.
  total_comment_lines = (
      mypatmat.count_each_symbol(newline.join(block_comments), [newline])[0]
      + num_line_comments)
  # The denominator is the initial newline count (init_loc), guarded against 0.
  comment_lines_div_orig_loc = total_comment_lines / float(max(1, init_loc))

  source_w_just_spaces_and_newlines = mypatmat.replace_whitespace_with_single_space(
      source_str, except_newline=True)
  # Without newlines we can match more; the variant above still records
  # where the newlines were.
  source_w_just_spaces = source_w_just_spaces_and_newlines.replace('\n', ' ')

  # Duplication statistics on the comment-stripped, whitespace-normalized text.
  (orig_sa, orig_lcp) = make_sa_lcp.get_sa_lcp(source_w_just_spaces)
  stats_orig_lcp = mymath.get_list_describe_pandas_features(orig_lcp)
  duplications = dupfuns.get_dup_groups(
      DUP_MEAN_FACTOR_GROUP_ONLY_IF,
      stats_orig_lcp[mymath.INDEX_OF_MEAN],
      orig_sa,
      orig_lcp,
      source_w_just_spaces_and_newlines,
      pygments_lexer,
      SPECIAL_PLACEHOLDER)
  orig_dup_mean_div_count = (
      stats_orig_lcp[mymath.INDEX_OF_MEAN]
      / float(max(1, stats_orig_lcp[mymath.INDEX_OF_COUNT])))

  # Only the line-number array is needed; the newline-free text is discarded.
  (_, linenos_arr) = mypatmat.get_linenos_and_remove_newlines(
      source_w_just_spaces_and_newlines)
  orig_loc = mypatmat.count_each_symbol(source_str, [newline])[0]
  my_loc = len(set(linenos_arr))

  # X-parameterized transformation
  (_, _, tokens_str, variables, strings, digits, types) = get_data_str(
      source_str, keys_dt, ssym_dt, strings_regex, numbers_regex, types_regex)

  # Only the line numbers are used here; the reconstructed text is discarded.
  (_, linenos_for_xp_dup) = reconstruct_the_original(
      theoriginal_str, tokens_str,
      variables, VARIABLE_PLACEHOLDER,
      strings, STRING_PLACEHOLDER,
      digits, DIGITS_PLACEHOLDER,
      types, TYPES_PLACEHOLDER)

  (xp_sa, xp_lcp) = make_sa_lcp.get_sa_lcp(tokens_str)
  stats_xp_lcp = mymath.get_list_describe_pandas_features(xp_lcp)

  symbol_counts_in_string_ith_base_0 = mypatmat.get_symbol_counts_in_string_ith_base_0(tokens_str)

  counts_word = mypatmat.dirty_search_for_num_occ_of_pat_lst_in_text(
      tokens_str, find_words,
      pat_whitespace_to_left=True, pat_whitespace_to_right=False,
      pattern_lst_is_regex=False)
  counts_regex = mypatmat.dirty_search_for_num_occ_of_pat_lst_in_text(
      tokens_str, find_regexes,
      pat_whitespace_to_left=True, pat_whitespace_to_right=False,
      pattern_lst_is_regex=True)
  counts_not_general_keywords = mypatmat.dirty_search_for_num_occ_of_pat_lst_in_text(
      tokens_str, keywords_for_not_general,
      pat_whitespace_to_left=True, pat_whitespace_to_right=False,
      pattern_lst_is_regex=False)

  # The flag is 0 as soon as any "not general" keyword occurs, 1 otherwise.
  generality_flag = 0 if sum(counts_not_general_keywords) > 0 else 1

  xp_duplications = dupfuns.get_dup_groups(
      XP_MEAN_FACTOR_GROUP_ONLY_IF,
      stats_xp_lcp[mymath.INDEX_OF_MEAN],
      xp_sa,
      xp_lcp,
      tokens_str,
      pygments_lexer,
      SPECIAL_PLACEHOLDER,
      symbol_counts_in_string_ith_base_0,
      variables, VARIABLE_PLACEHOLDER,
      strings, STRING_PLACEHOLDER,
      digits, DIGITS_PLACEHOLDER,
      types, TYPES_PLACEHOLDER,
      linenos_for_xp_dup,
      ssym_dt)
  xp_dup_mean_div_count = (
      stats_xp_lcp[mymath.INDEX_OF_MEAN]
      / float(max(1, stats_xp_lcp[mymath.INDEX_OF_COUNT])))

  (total_literals_count,
   unique_literals_count,
   magic_numbers_count,
   magic_numbers_per_number,
   magic_numbers_display) = get_magic_number_info(digits)

  (identifier_avg_length,
   count_small_identifier,
   count_small_identifier_percent,
   identifier_creativity_uppercase_underscore_count,
   identifier_creativity_uppercase_underscore_percent,
   poor_identifier_display) = get_identifier_info(variables)

  # #################
  # BEGIN gather features and return them with header
  # Each header extension is immediately followed by the matching values so
  # the two lists stay aligned column by column.
  # #################

  features_header = ['language', 'filename']
  features_val = [constants.C_LANGUAGE[0], source_fn]

  features_header += mypatmat.prepend_and_or_append_str_to_strlst(
      mymath.DESCRIBE_FEATURES_HEADER, prependstr='orig_dup_')
  features_val += stats_orig_lcp

  features_header += mypatmat.prepend_and_or_append_str_to_strlst(
      mymath.DESCRIBE_FEATURES_HEADER, prependstr='xp_dup_')
  features_val += stats_xp_lcp

  features_header += ['init_loc', 'orig_loc', 'my_loc']
  features_val += [init_loc, orig_loc, my_loc]

  features_header += mypatmat.prepend_and_or_append_str_to_strlst(
      find_words, prependstr='word_count_')
  features_val += counts_word

  features_header += mypatmat.prepend_and_or_append_str_to_strlst(
      find_regexes, prependstr='regex_count_')
  features_val += counts_regex

  features_header += ['count_line_comments', 'count_block_comments',
                      'count_total_comment_lines', 'comment_lines_div_orig_loc']
  features_val += [num_line_comments, num_block_comments,
                   total_comment_lines, comment_lines_div_orig_loc]

  features_header += ['total_literals_count', 'unique_literals_count',
                      'magic_numbers_count', 'magic_numbers_per_number']
  features_val += [total_literals_count, unique_literals_count,
                   magic_numbers_count, magic_numbers_per_number]

  features_header += ['identifier_avg_length', 'count_small_identifier',
                      'count_small_identifier_percent',
                      'identifier_creativity_uppercase_underscore_count',
                      'identifier_creativity_uppercase_underscore_percent']
  features_val += [identifier_avg_length, count_small_identifier,
                   count_small_identifier_percent,
                   identifier_creativity_uppercase_underscore_count,
                   identifier_creativity_uppercase_underscore_percent]

  features_header += ['orig_dup_mean_div_count', 'xp_dup_mean_div_count']
  features_val += [orig_dup_mean_div_count, xp_dup_mean_div_count]

  features_header += ['generality_flag']
  features_val += [generality_flag]

  # #################
  # END
  # #################

  return (features_header, features_val, duplications, xp_duplications,
          poor_identifier_display, magic_numbers_display)
def get_data_str(source,keys_dt,ssym_dt,STRINGS_REGEX,DIGITS_REGEX,TYPES_REGEX):
  """Tokenize *source* into a placeholder-based token stream.

  String matches (STRINGS_REGEX) and then number matches (DIGITS_REGEX) are
  replaced by space-padded placeholders, whitespace is collapsed, and the
  symbols of ``ssym_dt`` are surrounded by spaces. The result is split on
  the space character and each non-empty piece is classified:

    * symbol (in ``ssym_dt``)   -> ``(tok, pos)``
    * keyword (in ``keys_dt``)  -> ``(tok, pos, KEYWORD_CONSTANT)``
    * digits/string/types placeholder -> ``(PLACEHOLDER, pos, k)`` where
      ``k`` counts earlier placeholders of that same kind
    * anything else (identifier) -> ``(VARIABLE_PLACEHOLDER, pos, num)``,
      and the identifier text is appended to ``variables``

  ``pos`` is the running offset in the space-normalized source.

  ``TYPES_REGEX`` is accepted but not used here; ``types_removed`` is
  always returned as an empty list.

  Returns:
    ``(source, tokens, tokens_str, variables, strings_removed,
    digits_removed, types_removed)`` where ``source`` is the normalized
    text and ``tokens_str`` is ``tokens_to_str(tokens)``.
  """
  SPACE = mypatmat.SPACES[0]

  (strings_removed, source) = mypatmat.remove_all_in_between_on_same_line2(
      source, STRINGS_REGEX, replacewith=' '+STRING_PLACEHOLDER+' ')

  types_removed = []

  (digits_removed, source) = mypatmat.remove_all_in_between_on_same_line2(
      source, DIGITS_REGEX, replacewith=' '+DIGITS_PLACEHOLDER+' ')

  source = mypatmat.replace_whitespace_with_single_space(source)
  source = mypatmat.ensure_thissym_around_syms(source, ssym_dt, SPACE)

  tokens = []
  variables = []
  pos_in_source = 0
  # NOTE(review): `num` advances for every single-character token that is
  # neither a symbol nor a keyword (placeholders included) and for every
  # multi-character identifier, so it is not an index into `variables`.
  # Confirm that the consumers of these tokens expect this numbering.
  num = 0
  digits_num = 0   # digit placeholders seen so far
  strings_num = 0  # string placeholders seen so far
  types_num = 0    # types placeholders seen so far

  for tok in source.split(SPACE):
    if tok in ssym_dt:
      # symbol token
      tokens.append((tok, pos_in_source))
    elif tok in keys_dt:
      # keyword token
      tokens.append((tok, pos_in_source, KEYWORD_CONSTANT))
    elif tok:
      if len(tok) == 1:
        # Single characters may be one of the placeholders inserted above.
        if ord(tok) == ord(DIGITS_PLACEHOLDER):
          tokens.append((DIGITS_PLACEHOLDER, pos_in_source, digits_num))
          digits_num += 1
        elif ord(tok) == ord(STRING_PLACEHOLDER):
          tokens.append((STRING_PLACEHOLDER, pos_in_source, strings_num))
          strings_num += 1
        elif ord(tok) == ord(TYPES_PLACEHOLDER):
          tokens.append((TYPES_PLACEHOLDER, pos_in_source, types_num))
          types_num += 1
        else:
          # one-character variable (identifier)
          tokens.append((VARIABLE_PLACEHOLDER, pos_in_source, num))
          variables.append(tok)
        num += 1
      elif tok != SPACE:
        # multi-character variable (identifier)
        tokens.append((VARIABLE_PLACEHOLDER, pos_in_source, num))
        num += 1
        variables.append(tok)
    pos_in_source += len(tok) + 1  # + 1 for the SPACE

  tokens_str = tokens_to_str(tokens)

  return (source, tokens, tokens_str, variables, strings_removed, digits_removed, types_removed)