Example #1
def get_keys_ssym_dicts(keys_spsym_fn):
  # Parse the keywords/special-symbols file into two lookup dicts:
  # entries before the SYM_KEYS_SEP marker line are special symbols,
  # entries after it are keywords.
  tmp = myfile.get_text(keys_spsym_fn)

  setkeys = False
  keys_dt = {}
  ssym_dt = {}
  for line in tmp.split(ENTRY_SEP):
    if line == SYM_KEYS_SEP:
      setkeys = True
      continue
    if len(line.strip()) > 0:
      if setkeys:
        keys_dt[line] = True
      else:
        ssym_dt[line] = True
  return keys_dt, ssym_dt
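
A minimal usage sketch, assuming ENTRY_SEP is a newline and SYM_KEYS_SEP is a literal marker line; the file name and all values below are hypothetical, not from the original project:

# Hypothetical 'cpp_keys.txt', assuming ENTRY_SEP == '\n'
# and SYM_KEYS_SEP == '===KEYWORDS===':
#
#   {
#   ;
#   ===KEYWORDS===
#   if
#   return
keys_dt, ssym_dt = get_keys_ssym_dicts('cpp_keys.txt')
# ssym_dt -> {'{': True, ';': True}          (entries before the marker)
# keys_dt -> {'if': True, 'return': True}    (entries after the marker)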
Example #2
def get_features(keywords_special_symbols_fn,source_fn,source_str,
                 block_comment_regex,line_comment_regex,numbers_regex,
                 strings_regex,find_words,find_regexes,types_regex,
                 keywords_for_not_general,pygments_lexer):
  keys_dt,ssym_dt = get_keys_ssym_dicts(keywords_special_symbols_fn)

  # Fall back to reading the source from disk when no string was passed in.
  if not source_str and source_fn:
    source_str = myfile.get_text(source_fn)

  if source_str is None or len(source_str)<=1: # INVALID_FILE
    return (None, None, None, None, None, None) # match the arity of the success-path return
 
  source_str = source_str + mypatmat.NEWLINES[0] # ensure the text ends with a newline so line counting is consistent
  initLOC = mypatmat.count_each_symbol(source_str,[mypatmat.NEWLINES[0]])[0]

  source_str = mypatmat.remove_double_newline(source_str)
  source_str = re.sub(r'\\.',' ',source_str) # remove escaped symbols for simplicity
  source_str = source_str.replace(SINGLE_QUOTE_DELIMITER,DOUBLE_QUOTE_DELIMITER)  # treat chars as strings for simplicity

  theoriginal_str = source_str

  (block_comments, source_str) = mypatmat.strip_regex_but_keep_newlines(source_str,block_comment_regex)
  (line_comments, source_str) = mypatmat.strip_regex_but_keep_newlines(source_str,line_comment_regex) 

  num_block_comments = len(block_comments)
  num_line_comments = len(line_comments)
  total_comment_lines = mypatmat.count_each_symbol(mypatmat.NEWLINES[0].join(block_comments),[mypatmat.NEWLINES[0]])[0] + num_line_comments
  comment_lines_div_orig_loc = total_comment_lines / float(max(1,initLOC))

  source_w_just_spaces_and_newlines = mypatmat.replace_whitespace_with_single_space(source_str,except_newline=True)
  source_w_just_spaces = source_w_just_spaces_and_newlines.replace('\n',' ') # matching can now cross line boundaries, while the _and_newlines copy still records where the newlines were

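  # Build a suffix array and LCP (longest-common-prefix) array over the
  # normalized source: long shared prefixes between suffixes point at
  # duplicated code regions.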
  (orig_sa,orig_lcp)=make_sa_lcp.get_sa_lcp(source_w_just_spaces)
  stats_orig_lcp = mymath.get_list_describe_pandas_features(orig_lcp)
  duplications = dupfuns.get_dup_groups(DUP_MEAN_FACTOR_GROUP_ONLY_IF,stats_orig_lcp[mymath.INDEX_OF_MEAN],orig_sa,orig_lcp,source_w_just_spaces_and_newlines,pygments_lexer,SPECIAL_PLACEHOLDER)
  orig_dup_mean_div_count = stats_orig_lcp[mymath.INDEX_OF_MEAN] / float(max(1,stats_orig_lcp[mymath.INDEX_OF_COUNT]))
  
  (_, linenos_arr) = mypatmat.get_linenos_and_remove_newlines(source_w_just_spaces_and_newlines)
  origLOC = mypatmat.count_each_symbol(source_str,[mypatmat.NEWLINES[0]])[0]
  myLOC = len(set(linenos_arr)) # distinct lines that still hold code

  # X-parameterized transformation: replace variables, strings, digits and
  # types with placeholders so renamed duplicates still match.
  (sourcemod,tokens,tokens_str,variables,strings,digits,types) = get_data_str(source_str,keys_dt,ssym_dt,strings_regex,numbers_regex,types_regex)

  (thereconstructed_str,linenos_for_xp_dup) = reconstruct_the_original(theoriginal_str,tokens_str,variables,VARIABLE_PLACEHOLDER,strings,STRING_PLACEHOLDER,digits,DIGITS_PLACEHOLDER,types,TYPES_PLACEHOLDER)

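  # Repeat the suffix-array/LCP duplication analysis on the placeholder
  # token string, so duplicates survive renamed identifiers and changed
  # literals.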
  (xp_sa,xp_lcp)=make_sa_lcp.get_sa_lcp(tokens_str)
  stats_xp_lcp = mymath.get_list_describe_pandas_features(xp_lcp)
  
  symbol_counts_in_string_ith_base_0 = mypatmat.get_symbol_counts_in_string_ith_base_0(tokens_str)
  
  counts_word = mypatmat.dirty_search_for_num_occ_of_pat_lst_in_text(tokens_str,find_words,pat_whitespace_to_left=True,pat_whitespace_to_right=False,pattern_lst_is_regex=False)
  counts_regex = mypatmat.dirty_search_for_num_occ_of_pat_lst_in_text(tokens_str,find_regexes,pat_whitespace_to_left=True,pat_whitespace_to_right=False,pattern_lst_is_regex=True)
  counts_not_general_keywords = mypatmat.dirty_search_for_num_occ_of_pat_lst_in_text(tokens_str,keywords_for_not_general,pat_whitespace_to_left=True,pat_whitespace_to_right=False,pattern_lst_is_regex=False)

  # Flag the file as "general" (1) unless it mentions any of the
  # keywords_for_not_general terms.
  generality_flag = 0 if sum(counts_not_general_keywords) > 0 else 1

  xp_duplications = dupfuns.get_dup_groups(XP_MEAN_FACTOR_GROUP_ONLY_IF,stats_xp_lcp[mymath.INDEX_OF_MEAN],xp_sa,xp_lcp,tokens_str,pygments_lexer,SPECIAL_PLACEHOLDER,symbol_counts_in_string_ith_base_0,variables,VARIABLE_PLACEHOLDER,strings,STRING_PLACEHOLDER,digits,DIGITS_PLACEHOLDER,types,TYPES_PLACEHOLDER,linenos_for_xp_dup,ssym_dt)
  xp_dup_mean_div_count = stats_xp_lcp[mymath.INDEX_OF_MEAN] / float(max(1,stats_xp_lcp[mymath.INDEX_OF_COUNT]))

  (total_literals_count, unique_literals_count, magic_numbers_count, magic_numbers_per_number, magic_numbers_display) = get_magic_number_info(digits)

  (identifier_avg_length, count_small_identifier, count_small_identifier_percent, identifier_creativity_uppercase_underscore_count, identifier_creativity_uppercase_underscore_percent, poor_identifier_display) = get_identifier_info(variables)

  # #################
  # BEGIN gather features and return them with header
  # #################
  
  FEATURES_HEADER = ['language','filename']
  features_val = [constants.C_LANGUAGE[0],source_fn]

  FEATURES_HEADER += mypatmat.prepend_and_or_append_str_to_strlst(mymath.DESCRIBE_FEATURES_HEADER,prependstr='orig_dup_')
  features_val += stats_orig_lcp

  FEATURES_HEADER += mypatmat.prepend_and_or_append_str_to_strlst(mymath.DESCRIBE_FEATURES_HEADER,prependstr='xp_dup_')
  features_val += stats_xp_lcp

  FEATURES_HEADER += ['init_loc','orig_loc','my_loc']
  features_val += [initLOC, origLOC, myLOC]

  FEATURES_HEADER += mypatmat.prepend_and_or_append_str_to_strlst(find_words,prependstr='word_count_')
  features_val += counts_word

  FEATURES_HEADER += mypatmat.prepend_and_or_append_str_to_strlst(find_regexes,prependstr='regex_count_')
  features_val += counts_regex

  FEATURES_HEADER += ['count_line_comments','count_block_comments','count_total_comment_lines','comment_lines_div_orig_loc']
  features_val += [ num_line_comments, num_block_comments, total_comment_lines, comment_lines_div_orig_loc ]

  FEATURES_HEADER += ['total_literals_count', 'unique_literals_count', 'magic_numbers_count', 'magic_numbers_per_number']
  features_val += [ total_literals_count, unique_literals_count, magic_numbers_count, magic_numbers_per_number ]

  FEATURES_HEADER += ['identifier_avg_length', 'count_small_identifier', 'count_small_identifier_percent', 'identifier_creativity_uppercase_underscore_count', 'identifier_creativity_uppercase_underscore_percent']
  features_val += [ identifier_avg_length, count_small_identifier, count_small_identifier_percent, identifier_creativity_uppercase_underscore_count, identifier_creativity_uppercase_underscore_percent ]

  FEATURES_HEADER += ['orig_dup_mean_div_count', 'xp_dup_mean_div_count']
  features_val += [orig_dup_mean_div_count, xp_dup_mean_div_count]

  FEATURES_HEADER += ['generality_flag']
  features_val += [generality_flag ]

  # #################
  # END
  # #################

  return (FEATURES_HEADER, features_val, duplications, xp_duplications, poor_identifier_display, magic_numbers_display)
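
A hedged invocation sketch: every regex, file name, and keyword list below is an illustrative assumption (the real values would come from the caller's configuration), and pygments_lexer would normally be a Pygments lexer instance:

# Illustrative call for a C-like language; all literals here are assumptions.
header, values, dups, xp_dups, poor_ids, magics = get_features(
  keywords_special_symbols_fn='cpp_keys.txt',
  source_fn='sample.c',
  source_str=None,                        # read the source from source_fn
  block_comment_regex=r'/\*.*?\*/',       # the project may compile this with re.DOTALL
  line_comment_regex=r'//[^\n]*',
  numbers_regex=r'\b\d+\b',
  strings_regex=r'"[^"]*"',
  find_words=['goto','malloc'],
  find_regexes=[r'\bfor\s*\('],
  types_regex=r'\b(int|char|float|double)\b',
  keywords_for_not_general=['printf'],
  pygments_lexer=None)                    # e.g. pygments.lexers.CLexer()
if header is not None:
  print(dict(zip(header, values)))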