def get_features(kewords_special_symbols_fn, source_fn, source_str, block_comment_regex, line_comment_regex, numbers_regex, strings_regex, find_words, find_regexes, types_regex, keywords_for_not_general, pygments_lexer):
    """Compute code-quality features for one source file.

    The source may be passed in-memory (source_str) or loaded from
    source_fn when source_str is missing/empty.

    Returns the 6-tuple
        (FEATURES_HEADER, features_val, duplications, xp_duplications,
         poor_identifier_display, magic_numbers_display)
    on success, or the 2-tuple (None, None) when no usable source text is
    available.  NOTE(review): callers must handle both arities; kept as-is
    for backward compatibility.
    """
    keys_dt, ssym_dt = get_keys_ssym_dicts(kewords_special_symbols_fn)

    # Load from disk only when no in-memory source was supplied.
    # FIX: identity tests 'is None' / 'is not None' instead of '== None' /
    # '!= None' (PEP 8); behavior identical for str/None inputs.
    if (source_str is None or len(source_str) == 0) and source_fn is not None and len(source_fn) > 0:
        source_str = myfile.get_text(source_fn)
    if source_str is None or len(source_str) <= 1:
        # INVALID_FILE
        return (None, None)

    source_str = source_str + mypatmat.NEWLINES[0]  # prime: guarantee a trailing newline
    initLOC = mypatmat.count_each_symbol(source_str, [mypatmat.NEWLINES[0]])[0]
    source_str = mypatmat.remove_double_newline(source_str)
    source_str = re.sub(r'\\.', ' ', source_str)  # remove escaped symbols for simplicity
    # treat chars as strings for simplicity
    source_str = source_str.replace(SINGLE_QUOTE_DELIMITER, DOUBLE_QUOTE_DELIMITER)
    theoriginal_str = source_str

    # Strip comments; newlines are kept so later line-number mapping stays valid.
    (block_comments, source_str) = mypatmat.strip_regex_but_keep_newlines(source_str, block_comment_regex)
    (line_comments, source_str) = mypatmat.strip_regex_but_keep_newlines(source_str, line_comment_regex)
    num_block_comments = len(block_comments)
    num_line_comments = len(line_comments)
    total_comment_lines = mypatmat.count_each_symbol(mypatmat.NEWLINES[0].join(block_comments), [mypatmat.NEWLINES[0]])[0] + num_line_comments
    comment_lines_div_orig_loc = total_comment_lines / float(max(1, initLOC))

    # Normalize whitespace; one copy keeps newlines so positions can be mapped
    # back to line numbers, the other is fully flattened for suffix-array work.
    source_w_just_spaces_and_newlines = mypatmat.replace_whitespace_with_single_space(source_str, except_newline=True)
    # after this, we can match more, but we still maintain where the newlines are in
    source_w_just_spaces = source_w_just_spaces_and_newlines.replace('\n', ' ')

    # Duplication statistics on the raw (whitespace-normalized) text via
    # suffix array + LCP array.
    (orig_sa, orig_lcp) = make_sa_lcp.get_sa_lcp(source_w_just_spaces)
    stats_orig_lcp = mymath.get_list_describe_pandas_features(orig_lcp)
    duplications = dupfuns.get_dup_groups(DUP_MEAN_FACTOR_GROUP_ONLY_IF, stats_orig_lcp[mymath.INDEX_OF_MEAN], orig_sa, orig_lcp, source_w_just_spaces_and_newlines, pygments_lexer, SPECIAL_PLACEHOLDER)
    orig_dup_mean_div_count = stats_orig_lcp[mymath.INDEX_OF_MEAN] / float(max(1, stats_orig_lcp[mymath.INDEX_OF_COUNT]))

    # FIX: unused first element of the unpacking renamed from 'temp' to '_'.
    (_, linenos_arr) = mypatmat.get_linenos_and_remove_newlines(source_w_just_spaces_and_newlines)
    origLOC = mypatmat.count_each_symbol(source_str, [mypatmat.NEWLINES[0]])[0]
    myLOC = len(set(linenos_arr))

    # X-parameterized transformation: identifiers, strings, digits and types
    # are replaced by single-character placeholders.
    (sourcemod, tokens, tokens_str, variables, strings, digits, types) = get_data_str(source_str, keys_dt, ssym_dt, strings_regex, numbers_regex, types_regex)
    (thereconstructed_str, linenos_for_xp_dup) = reconstruct_the_original(theoriginal_str, tokens_str, variables, VARIABLE_PLACEHOLDER, strings, STRING_PLACEHOLDER, digits, DIGITS_PLACEHOLDER, types, TYPES_PLACEHOLDER)
    (xp_sa, xp_lcp) = make_sa_lcp.get_sa_lcp(tokens_str)
    stats_xp_lcp = mymath.get_list_describe_pandas_features(xp_lcp)
    symbol_counts_in_string_ith_base_0 = mypatmat.get_symbol_counts_in_string_ith_base_0(tokens_str)

    # Occurrence counts of caller-supplied words/regexes over the placeholder
    # token stream.
    counts_word = mypatmat.dirty_search_for_num_occ_of_pat_lst_in_text(tokens_str, find_words, pat_whitespace_to_left=True, pat_whitespace_to_right=False, pattern_lst_is_regex=False)
    counts_regex = mypatmat.dirty_search_for_num_occ_of_pat_lst_in_text(tokens_str, find_regexes, pat_whitespace_to_left=True, pat_whitespace_to_right=False, pattern_lst_is_regex=True)
    counts_not_general_keywords = mypatmat.dirty_search_for_num_occ_of_pat_lst_in_text(tokens_str, keywords_for_not_general, pat_whitespace_to_left=True, pat_whitespace_to_right=False, pattern_lst_is_regex=False)

    # generality_flag is 0 when any "not general" keyword occurs, else 1.
    generality_flag = 1
    if sum(counts_not_general_keywords) > 0:
        generality_flag = 0

    xp_duplications = dupfuns.get_dup_groups(XP_MEAN_FACTOR_GROUP_ONLY_IF, stats_xp_lcp[mymath.INDEX_OF_MEAN], xp_sa, xp_lcp, tokens_str, pygments_lexer, SPECIAL_PLACEHOLDER, symbol_counts_in_string_ith_base_0, variables, VARIABLE_PLACEHOLDER, strings, STRING_PLACEHOLDER, digits, DIGITS_PLACEHOLDER, types, TYPES_PLACEHOLDER, linenos_for_xp_dup, ssym_dt)
    xp_dup_mean_div_count = stats_xp_lcp[mymath.INDEX_OF_MEAN] / float(max(1, stats_xp_lcp[mymath.INDEX_OF_COUNT]))

    (total_literals_count, unique_literals_count, magic_numbers_count, magic_numbers_per_number, magic_numbers_display) = get_magic_number_info(digits)
    (identifier_avg_length, count_small_identifier, count_small_identifier_percent, identifier_creativity_uppercase_underscore_count, identifier_creativity_uppercase_underscore_percent, poor_identifier_display) = get_identifier_info(variables)

    # #################
    # BEGIN gather features and return them with header
    # (header and value lists are appended in lockstep -- keep them parallel)
    # #################
    FEATURES_HEADER = ['language', 'filename']
    features_val = [constants.C_LANGUAGE[0], source_fn]
    FEATURES_HEADER += mypatmat.prepend_and_or_append_str_to_strlst(mymath.DESCRIBE_FEATURES_HEADER, prependstr='orig_dup_')
    features_val += stats_orig_lcp
    FEATURES_HEADER += mypatmat.prepend_and_or_append_str_to_strlst(mymath.DESCRIBE_FEATURES_HEADER, prependstr='xp_dup_')
    features_val += stats_xp_lcp
    FEATURES_HEADER += ['init_loc', 'orig_loc', 'my_loc']
    features_val += [initLOC, origLOC, myLOC]
    FEATURES_HEADER += mypatmat.prepend_and_or_append_str_to_strlst(find_words, prependstr='word_count_')
    features_val += counts_word
    FEATURES_HEADER += mypatmat.prepend_and_or_append_str_to_strlst(find_regexes, prependstr='regex_count_')
    features_val += counts_regex
    FEATURES_HEADER += ['count_line_comments', 'count_block_comments', 'count_total_comment_lines', 'comment_lines_div_orig_loc']
    features_val += [num_line_comments, num_block_comments, total_comment_lines, comment_lines_div_orig_loc]
    FEATURES_HEADER += ['total_literals_count', 'unique_literals_count', 'magic_numbers_count', 'magic_numbers_per_number']
    features_val += [total_literals_count, unique_literals_count, magic_numbers_count, magic_numbers_per_number]
    FEATURES_HEADER += ['identifier_avg_length', 'count_small_identifier', 'count_small_identifier_percent', 'identifier_creativity_uppercase_underscore_count', 'identifier_creativity_uppercase_underscore_percent']
    features_val += [identifier_avg_length, count_small_identifier, count_small_identifier_percent, identifier_creativity_uppercase_underscore_count, identifier_creativity_uppercase_underscore_percent]
    FEATURES_HEADER += ['orig_dup_mean_div_count', 'xp_dup_mean_div_count']
    features_val += [orig_dup_mean_div_count, xp_dup_mean_div_count]
    FEATURES_HEADER += ['generality_flag']
    features_val += [generality_flag]
    # #################
    # END
    # #################
    return (FEATURES_HEADER, features_val, duplications, xp_duplications, poor_identifier_display, magic_numbers_display)
def get_data_str(source, keys_dt, ssym_dt, STRINGS_REGEX, DIGITS_REGEX, TYPES_REGEX):
    """Tokenize *source* into a placeholder token stream.

    String and digit literals are replaced with single-character
    placeholders (STRING_PLACEHOLDER / DIGITS_PLACEHOLDER) and collected
    so the original text can be reconstructed later; identifiers are
    replaced with VARIABLE_PLACEHOLDER and collected in `variables`.

    Returns (source, tokens, tokens_str, variables, strings_removed,
    digits_removed, types_removed).

    NOTE(review): TYPES_REGEX is accepted but never applied here --
    types_removed is always empty; TYPES_PLACEHOLDER tokens are only
    recognized if already present in the text.  Kept for interface
    stability; confirm against callers before removing.
    """
    tokens = []
    variables = []
    SPACE = mypatmat.SPACES[0]

    # Pull out literals first so their contents cannot be mistaken for code.
    (strings_removed, source) = mypatmat.remove_all_in_between_on_same_line2(source, STRINGS_REGEX, replacewith=' ' + STRING_PLACEHOLDER + ' ')
    types_removed = []
    (digits_removed, source) = mypatmat.remove_all_in_between_on_same_line2(source, DIGITS_REGEX, replacewith=' ' + DIGITS_PLACEHOLDER + ' ')

    # Normalize whitespace and pad special symbols so a single split on SPACE
    # yields one token per element.
    source = mypatmat.replace_whitespace_with_single_space(source)
    source = mypatmat.ensure_thissym_around_syms(source, ssym_dt, SPACE)
    token_lines = source.split(SPACE)

    pos_in_source = 0
    num = 0          # running index of identifier (variable) tokens
    strings_num = 0  # running index of string-literal placeholders
    digits_num = 0   # running index of digit-literal placeholders
    types_num = 0    # running index of type placeholders
    # FIX: removed dead local 'variables_num' (incremented but never read).
    for tok in token_lines:
        if tok in ssym_dt:
            # special-symbol token
            tokens.append((tok, pos_in_source))
        elif tok in keys_dt:
            # keyword token
            tokens.append((tok, pos_in_source, KEYWORD_CONSTANT))
        elif tok is not None and len(tok) > 0:
            if len(tok) == 1:
                # FIX: the digit/string counters were swapped (the DIGITS
                # branch incremented 'strings_num' and vice versa).  The
                # emitted indices were numerically unchanged -- each counter
                # only ever incremented in its own branch -- but the names
                # were misleading; they now match their meaning.
                if ord(tok) == ord(DIGITS_PLACEHOLDER):
                    tokens.append((DIGITS_PLACEHOLDER, pos_in_source, digits_num))
                    digits_num += 1
                elif ord(tok) == ord(STRING_PLACEHOLDER):
                    tokens.append((STRING_PLACEHOLDER, pos_in_source, strings_num))
                    strings_num += 1
                elif ord(tok) == ord(TYPES_PLACEHOLDER):
                    tokens.append((TYPES_PLACEHOLDER, pos_in_source, types_num))
                    types_num += 1
                else:
                    # single-character variable (identifier)
                    tokens.append((VARIABLE_PLACEHOLDER, pos_in_source, num))
                    variables.append(tok)
                    num += 1
            elif tok != SPACE:
                # multi-character variable (identifier)
                tokens.append((VARIABLE_PLACEHOLDER, pos_in_source, num))
                num += 1
                variables.append(tok)
        pos_in_source += len(tok) + 1  # + 1 for the SPACE
    tokens_str = tokens_to_str(tokens)
    return (source, tokens, tokens_str, variables, strings_removed, digits_removed, types_removed)