def get_keys_ssym_dicts(keys_spsym_fn):
    """Load the keywords / special-symbols file into two membership dicts.

    The file (read via ``myfile.get_text``) is a flat list of entries
    separated by ``ENTRY_SEP``.  Entries before the ``SYM_KEYS_SEP``
    sentinel entry are special symbols; entries after it are keywords.

    Args:
        keys_spsym_fn: path of the keywords/special-symbols file.

    Returns:
        (keys_dt, ssym_dt): two dicts mapping each keyword / special
        symbol to True (used downstream as O(1) membership sets).
    """
    text = myfile.get_text(keys_spsym_fn)
    # NOTE: the original code also ran re.finditer(SYM_KEYS_SEP, text)
    # and discarded the result — dead code, removed.
    keys_dt = {}
    ssym_dt = {}
    past_separator = False  # flips once the SYM_KEYS_SEP entry is seen
    for entry in text.split(ENTRY_SEP):
        if entry == SYM_KEYS_SEP:
            past_separator = True
            continue
        # str.split never yields None, so only blank entries are skipped.
        if entry.strip():
            if past_separator:
                keys_dt[entry] = True
            else:
                ssym_dt[entry] = True
    return keys_dt, ssym_dt
def get_features(kewords_special_symbols_fn,source_fn,source_str,block_comment_regex,line_comment_regex,numbers_regex,strings_regex,find_words,find_regexes,types_regex,keywords_for_not_general,pygments_lexer):
    """Extract code-quality features from one source file.

    The source text is normalized, comments are stripped, suffix-array /
    LCP statistics are computed on both the raw text and an
    X-parameterized (identifier/string/number/type placeholder) form,
    and a flat feature vector with a matching header is assembled.

    Args:
        kewords_special_symbols_fn: path to the keywords/special-symbols
            file parsed by ``get_keys_ssym_dicts``.
        source_fn: path of the source file (used if ``source_str`` is empty,
            and recorded in the feature vector).
        source_str: source text; if None/empty it is loaded from ``source_fn``.
        block_comment_regex, line_comment_regex: regexes for comment stripping.
        numbers_regex, strings_regex, types_regex: regexes for tokenization.
        find_words, find_regexes: literal / regex patterns to count.
        keywords_for_not_general: keywords whose presence clears the
            generality flag.
        pygments_lexer: lexer passed through to duplicate-group detection.

    Returns:
        (FEATURES_HEADER, features_val, duplications, xp_duplications,
        poor_identifier_display, magic_numbers_display) on success.
        NOTE(review): on an invalid/empty file this returns a 2-tuple
        ``(None, None)`` instead of a 6-tuple — kept for backward
        compatibility, but callers that unpack six values must check
        for it first; confirm against call sites.
    """
    keys_dt,ssym_dt = get_keys_ssym_dicts(kewords_special_symbols_fn)
    # Fall back to reading the file only when no in-memory text was given.
    if (source_str is None or len(source_str)==0) and source_fn is not None and len(source_fn)>0:
        source_str = myfile.get_text(source_fn)
    if source_str is None or len(source_str)<=1: # INVALID_FILE
        return (None,None)
    # ----- normalization -----
    source_str = source_str + mypatmat.NEWLINES[0] # prime
    initLOC = mypatmat.count_each_symbol(source_str,[mypatmat.NEWLINES[0]])[0]
    source_str = mypatmat.remove_double_newline(source_str)
    source_str = re.sub(r'\\.',' ',source_str) # remove escaped symbols for simplicity
    source_str = source_str.replace(SINGLE_QUOTE_DELIMITER,DOUBLE_QUOTE_DELIMITER) # treat chars as strings for simplicity
    theoriginal_str = source_str
    # ----- comment stripping + comment-based features -----
    (block_comments, source_str) = mypatmat.strip_regex_but_keep_newlines(source_str,block_comment_regex)
    (line_comments, source_str) = mypatmat.strip_regex_but_keep_newlines(source_str,line_comment_regex)
    num_block_comments = len(block_comments)
    num_line_comments = len(line_comments)
    # Block comments contribute their newline count; each line comment is one line.
    total_comment_lines = mypatmat.count_each_symbol(mypatmat.NEWLINES[0].join(block_comments),[mypatmat.NEWLINES[0]])[0] + num_line_comments
    comment_lines_div_orig_loc = total_comment_lines / float(max(1,initLOC))
    # ----- duplication stats on the original (whitespace-collapsed) text -----
    source_w_just_spaces_and_newlines = mypatmat.replace_whitespace_with_single_space(source_str,except_newline=True)
    source_w_just_spaces = source_w_just_spaces_and_newlines.replace('\n',' ') # after this, we can match more, but we still maintain where the newlines are in
    (orig_sa,orig_lcp)=make_sa_lcp.get_sa_lcp(source_w_just_spaces)
    stats_orig_lcp = mymath.get_list_describe_pandas_features(orig_lcp)
    duplications = dupfuns.get_dup_groups(DUP_MEAN_FACTOR_GROUP_ONLY_IF,stats_orig_lcp[mymath.INDEX_OF_MEAN],orig_sa,orig_lcp,source_w_just_spaces_and_newlines,pygments_lexer,SPECIAL_PLACEHOLDER)
    # max(1, count) guards against division by zero on degenerate input.
    orig_dup_mean_div_count = stats_orig_lcp[mymath.INDEX_OF_MEAN] / float(max(1,stats_orig_lcp[mymath.INDEX_OF_COUNT]))
    (_, linenos_arr) = mypatmat.get_linenos_and_remove_newlines(source_w_just_spaces_and_newlines)
    origLOC = mypatmat.count_each_symbol(source_str,[mypatmat.NEWLINES[0]])[0]
    myLOC = len(set(linenos_arr))
    # ----- X-parameterized transformation -----
    (sourcemod,tokens,tokens_str,variables,strings,digits,types) = get_data_str(source_str,keys_dt,ssym_dt,strings_regex,numbers_regex,types_regex)
    (thereconstructed_str,linenos_for_xp_dup) = reconstruct_the_original(theoriginal_str,tokens_str,variables,VARIABLE_PLACEHOLDER,strings,STRING_PLACEHOLDER,digits,DIGITS_PLACEHOLDER,types,TYPES_PLACEHOLDER)
    (xp_sa,xp_lcp)=make_sa_lcp.get_sa_lcp(tokens_str)
    stats_xp_lcp = mymath.get_list_describe_pandas_features(xp_lcp)
    symbol_counts_in_string_ith_base_0 = mypatmat.get_symbol_counts_in_string_ith_base_0(tokens_str)
    # ----- pattern / keyword counts on the parameterized token stream -----
    counts_word = mypatmat.dirty_search_for_num_occ_of_pat_lst_in_text(tokens_str,find_words,pat_whitespace_to_left=True,pat_whitespace_to_right=False,pattern_lst_is_regex=False)
    counts_regex = mypatmat.dirty_search_for_num_occ_of_pat_lst_in_text(tokens_str,find_regexes,pat_whitespace_to_left=True,pat_whitespace_to_right=False,pattern_lst_is_regex=True)
    counts_not_general_keywords = mypatmat.dirty_search_for_num_occ_of_pat_lst_in_text(tokens_str,keywords_for_not_general,pat_whitespace_to_left=True,pat_whitespace_to_right=False,pattern_lst_is_regex=False)
    # 1 = "general" code; any non-general keyword hit clears the flag.
    generality_flag = 1
    if sum(counts_not_general_keywords) > 0:
        generality_flag = 0
    xp_duplications = dupfuns.get_dup_groups(XP_MEAN_FACTOR_GROUP_ONLY_IF,stats_xp_lcp[mymath.INDEX_OF_MEAN],xp_sa,xp_lcp,tokens_str,pygments_lexer,SPECIAL_PLACEHOLDER,symbol_counts_in_string_ith_base_0,variables,VARIABLE_PLACEHOLDER,strings,STRING_PLACEHOLDER,digits,DIGITS_PLACEHOLDER,types,TYPES_PLACEHOLDER,linenos_for_xp_dup,ssym_dt)
    xp_dup_mean_div_count = stats_xp_lcp[mymath.INDEX_OF_MEAN] / float(max(1,stats_xp_lcp[mymath.INDEX_OF_COUNT]))
    # ----- literal / identifier quality features -----
    (total_literals_count, unique_literals_count, magic_numbers_count, magic_numbers_per_number, magic_numbers_display) = get_magic_number_info(digits)
    ( identifier_avg_length, count_small_identifier, count_small_identifier_percent, identifier_creativity_uppercase_underscore_count, identifier_creativity_uppercase_underscore_percent, poor_identifier_display ) = get_identifier_info(variables)
    # #################
    # BEGIN gather features and return them with header
    # (header entries and value entries are appended in lockstep)
    # #################
    FEATURES_HEADER = ['language','filename']
    features_val = [constants.C_LANGUAGE[0],source_fn]
    FEATURES_HEADER += mypatmat.prepend_and_or_append_str_to_strlst(mymath.DESCRIBE_FEATURES_HEADER,prependstr='orig_dup_')
    features_val += stats_orig_lcp
    FEATURES_HEADER += mypatmat.prepend_and_or_append_str_to_strlst(mymath.DESCRIBE_FEATURES_HEADER,prependstr='xp_dup_')
    features_val += stats_xp_lcp
    FEATURES_HEADER += ['init_loc','orig_loc','my_loc']
    features_val += [initLOC, origLOC, myLOC]
    FEATURES_HEADER += mypatmat.prepend_and_or_append_str_to_strlst(find_words,prependstr='word_count_')
    features_val += counts_word
    FEATURES_HEADER += mypatmat.prepend_and_or_append_str_to_strlst(find_regexes,prependstr='regex_count_')
    features_val += counts_regex
    FEATURES_HEADER += ['count_line_comments','count_block_comments','count_total_comment_lines','comment_lines_div_orig_loc']
    features_val += [ num_line_comments, num_block_comments, total_comment_lines, comment_lines_div_orig_loc ]
    FEATURES_HEADER += ['total_literals_count', 'unique_literals_count', 'magic_numbers_count', 'magic_numbers_per_number']
    features_val += [ total_literals_count, unique_literals_count, magic_numbers_count, magic_numbers_per_number ]
    FEATURES_HEADER += ['identifier_avg_length', 'count_small_identifier', 'count_small_identifier_percent', 'identifier_creativity_uppercase_underscore_count', 'identifier_creativity_uppercase_underscore_percent']
    features_val += [ identifier_avg_length, count_small_identifier, count_small_identifier_percent, identifier_creativity_uppercase_underscore_count, identifier_creativity_uppercase_underscore_percent ]
    FEATURES_HEADER += ['orig_dup_mean_div_count', 'xp_dup_mean_div_count']
    features_val += [orig_dup_mean_div_count, xp_dup_mean_div_count]
    FEATURES_HEADER += ['generality_flag']
    features_val += [generality_flag ]
    # #################
    # END
    # #################
    return (FEATURES_HEADER, features_val, duplications, xp_duplications, poor_identifier_display, magic_numbers_display)