示例#1
0
def parse_files_using_srcml(repository_id, repository_name):
    """Convert every unparsed local file version into a srcML document.

    Walks all files of the repository, runs the `srcml` command-line tool
    on each pending local file copy, and marks the version as parsed in
    the database once the output has been written.
    """
    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()

    cursor.execute("select id from files where repository_id = %s", (repository_id, ))

    for (file_id,) in cursor.fetchall():
        source_directory = DiretoryConfig.get_parameter('file_versions_directory') + repository_name
        output_directory = DiretoryConfig.get_parameter('parsed_files_directory') + repository_name
        create_directory(output_directory)

        cursor.execute('select id, commit_hash, version_path from file_versions where file_id = %s and has_parsed_file is false order by author_date', (file_id, ))

        for version_id, commit_hash, version_path in cursor.fetchall():
            extension = version_path.split('.')[-1]
            # Same <file_id>_<version_id>_<commit_hash>.<ext> naming used by
            # the checkout step that produced the local copies.
            base_name = str(file_id) + "_" + str(version_id) + "_" + commit_hash + "." + extension

            subprocess.call(["srcml", source_directory + "/" + base_name, "-o", output_directory + "/" + base_name])

            cursor.execute("update file_versions set has_parsed_file = true where id = %s", (version_id, ))
            connection.commit()
    connection.close()
示例#2
0
def snapshot_the_repo(repository_name, tag):
    """Check out *tag* and copy the repository tree into its _tags/<tag> folder."""
    base_path = DiretoryConfig.get_parameter('repository_directory') + repository_name
    snapshot_path = base_path + "_tags/" + tag
    # Two shell steps: switch the working tree to the tag, then copy it aside.
    shell_steps = ["git checkout " + tag, "cp -r ./ ../" + repository_name + "_tags/" + tag]
    runner = subprocess.Popen(" ; ".join(shell_steps), stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, shell=True, cwd=base_path)
    runner.communicate()[1].strip().decode("utf-8")  # blocks until the copy finishes; stderr discarded
    clean_unwanted_files(snapshot_path)
示例#3
0
def process_parseable_files(repository_id, repository_name):
    """Walk the repository tree and register every parseable source file.

    A file is registered (via insert_file) when its name matches the
    configured parseable-files regex; its path is stored relative to the
    repository root.
    """
    repo_root = DiretoryConfig.get_parameter('repository_directory') + repository_name
    pattern = FileHandlerConfig.get_parameter('parseable_files_regex')
    for current_dir, _subdirs, file_names in os.walk(repo_root):
        for file_name in file_names:
            if re.match(pattern, file_name) is None:
                continue
            relative_path = os.path.join(current_dir, file_name).replace(repo_root + '/', '')
            file_id = insert_file(repository_id, file_name, relative_path)
            print(relative_path)
示例#4
0
def parse_files_using_srcml(repository_id, repository_name):
    """Run the srcml tool over all file versions still lacking a parsed file."""
    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()

    cursor.execute("select id from files where repository_id = %s",
                   (repository_id, ))
    file_rows = cursor.fetchall()

    for file_row in file_rows:
        file_id = file_row[0]
        versions_dir = DiretoryConfig.get_parameter(
            'file_versions_directory') + repository_name
        parsed_dir = DiretoryConfig.get_parameter(
            'parsed_files_directory') + repository_name
        create_directory(parsed_dir)

        cursor.execute(
            'select id, commit_hash, version_path from file_versions where file_id = %s and has_parsed_file is false order by author_date',
            (file_id, ))
        version_rows = cursor.fetchall()

        for version_id, commit_hash, version_path in version_rows:
            extension = version_path.split('.')[-1]
            # Shared file-name suffix: /<file_id>_<version_id>_<hash>.<ext>
            name_suffix = "/" + str(file_id) + "_" + str(
                version_id) + "_" + commit_hash + "." + extension

            # srcml <input> -o <output>
            subprocess.call(
                ["srcml", versions_dir + name_suffix, "-o", parsed_dir + name_suffix])

            cursor.execute(
                "update file_versions set has_parsed_file = true where id = %s",
                (version_id, ))
            connection.commit()
    connection.close()
示例#5
0
def generate_training_dataset():
    """Dump manually classified comments into the NLP training dataset file."""
    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()

    nlp_directory = DiretoryConfig.get_parameter('nlp_directory')
    dataset_path = nlp_directory + NLPHandlerConfig.get_parameter('training_dataset_name')
    wanted_types = NLPHandlerConfig.get_parameter('classification_types')

    # `in %s` expects a single tuple parameter holding the allowed types.
    cursor.execute("select classification, treated_comment_text from manually_classified_comments where classification in %s", [tuple(wanted_types)])
    write_formated_file(dataset_path, cursor.fetchall())
示例#6
0
def list_repository_tags(repository_name):
    """Return the repository's tag log lines (date + decoration), oldest first."""
    working_dir = DiretoryConfig.get_parameter(
        'repository_directory') + repository_name
    command = "git log --tags --date-order --reverse --simplify-by-decoration --pretty=%ai%d"
    stdout_bytes, _stderr_bytes = subprocess.Popen(command,
                                                   stdout=subprocess.PIPE,
                                                   stderr=subprocess.PIPE,
                                                   stdin=subprocess.PIPE,
                                                   shell=True,
                                                   cwd=working_dir).communicate()
    return stdout_bytes.strip().decode("utf-8").split('\n')
示例#7
0
def classify_comments(repository_id):
    """Classify pending comments of a repository with the Stanford classifier.

    For every file version holding unclassified comments: writes them to the
    test dataset file, runs the Java classifier over it, matches each comment
    back against the classifier output, and stores the predicted label in
    processed_comments.td_classification.
    """
    default_nlp_path = DiretoryConfig.get_parameter('nlp_directory')
    test_dataset_path = default_nlp_path + NLPHandlerConfig.get_parameter('test_dataset_name')

    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()
    cursor.execute("select distinct(file_versions_id) from processed_comments where repository_id = %s", (repository_id, ))
    file_versions = cursor.fetchall()

    for file_version in file_versions:
        before = timeit.default_timer()  # per-file-version timing, printed below
        file_versions_id = file_version[0]
        print("file version:", file_versions_id)

        cursor.execute("select 'WITHOUT_CLASSIFICATION' as classification, treated_comment_text, id from processed_comments where file_versions_id = %s and td_classification is null order by end_line", (file_versions_id, ))
        all_comments_from_file = cursor.fetchall()
        write_formated_file(test_dataset_path , all_comments_from_file)

        nlp_classifier_memory_use = NLPHandlerConfig.get_parameter('nlp_classifier_memory_use')
        command = 'java ' + nlp_classifier_memory_use + ' -jar stanford-classifier.jar -prop ./dataset.prop -1.useSplitWords -1.splitWordsRegexp "\s"'
        process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True, cwd=default_nlp_path).communicate()
        subprocess.call("rm " + test_dataset_path , shell=True)

        output = process[0].strip().decode("utf-8").split('\n')

        output_regex = NLPHandlerConfig.get_parameter('output_regex')
        comment_text_exact_regex = NLPHandlerConfig.get_parameter('comment_text_exact_regex')

        for comment in all_comments_from_file:
            treated_comment_text = comment[1]
            comment_id = comment[2]

            for line in output:
                comment_text_exact_matcher = re.match(comment_text_exact_regex, line)
                # Bug fix: output lines that do not match the regex used to
                # crash with AttributeError on .group(1); skip them instead.
                if comment_text_exact_matcher is None:
                    continue
                comment_text_from_output = comment_text_exact_matcher.group(1)

                if treated_comment_text == comment_text_from_output:
                    output_matcher = re.findall(output_regex, line)

                    # Bug fix: re.findall never returns None, so the old
                    # `is not None` guard was dead code and indexing [0]/[1]
                    # could raise IndexError. Require both matches instead.
                    if len(output_matcher) >= 2:
                        golden_anwser = output_matcher[0].replace('\'', '')  # kept for the debug print below
                        nlp_tool_classification = output_matcher[1].replace('\'', '')

                        cursor.execute("update processed_comments set td_classification = %s where id = %s " , (nlp_tool_classification, comment_id) )
                        connection.commit()
                        # print (golden_anwser , "-" , nlp_tool_classification)
                        break

        after = timeit.default_timer()
        print (after - before)
示例#8
0
def checkout_file_versions(repository_id, repository_name, master_branch):
    """Materialize a local copy of every file version not yet checked out.

    For each file of the repository: resets the working tree to the master
    branch, then for each pending version checks out its commit and copies
    the file into the file-versions directory under the name
    <file_id>_<version_id>_<commit_hash>.<ext>, finally flagging the
    version as having a local file.
    """
    repository_directory = DiretoryConfig.get_parameter('repository_directory') + repository_name

    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()

    cursor.execute("select id from files where repository_id = %s", (repository_id, ))
    files_results = cursor.fetchall()

    for file_line in files_results:
        file_id = file_line[0]

        # Start each file from the tip of master so the per-commit
        # checkouts below begin from a known state.
        checkout_to_latest_version(repository_name, master_branch)
        file_versions_directory = DiretoryConfig.get_parameter('file_versions_directory') + repository_name
        create_directory(file_versions_directory)

        cursor.execute('select id, commit_hash, version_path from file_versions where file_id = %s and has_local_file is false order by author_date', (file_id, ))
        file_vesions_result = cursor.fetchall()

        for file_versions_line in file_vesions_result:
            file_versions_id = file_versions_line[0]
            commit_hash = file_versions_line[1]
            version_path = file_versions_line[2]
            file_extension = version_path.split('.')[-1]

            git_checkout = "git checkout " + commit_hash
            # NOTE(review): the `cp` destination is "../" + an absolute-looking
            # configured directory -- presumably the configured path is relative
            # to the repositories parent dir; confirm against DiretoryConfig.
            cp_file = "cp " + version_path + " ../" + file_versions_directory +"/"+ str(file_id)+ "_" + str(file_versions_id) + "_" + commit_hash +"."+  file_extension

            print (cp_file)

            # checkout and copy run as one shell command inside the repo.
            command = git_checkout + ";" + cp_file
            process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr = subprocess.PIPE, stdin = subprocess.PIPE, shell=True, cwd=repository_directory)
            git_log_output = process.communicate()[0].strip().decode("utf-8").split('\n')

            cursor.execute("update file_versions set has_local_file = true where id = %s", (file_versions_id, ))
            connection.commit()

    connection.close()
示例#9
0
def search_deleted_files(repository_id, repository_name, master_branch):
    """Find files deleted from the repository and register their histories.

    Parses `git log --diff-filter=D --summary` line by line, accumulating
    the commit hash / author fields as they appear, and for each deleted
    parseable file not yet recorded in file_versions inserts the file and
    extracts its pre-deletion version history.
    """
    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()

    repository_directory = DiretoryConfig.get_parameter('repository_directory') + repository_name
    git_deleted_log_file_regex = FileHandlerConfig.get_parameter('git_deleted_log_file_regex')
    file_regex = FileHandlerConfig.get_parameter('parseable_files_regex')

    # Only commits that deleted files, with a per-file summary.
    command = "git log --diff-filter=D --summary"
    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr = subprocess.PIPE, stdin = subprocess.PIPE, shell=True, cwd=repository_directory)
    git_log_output = process.communicate()[0].strip().decode("utf-8").split('\n')

    # Running state: each log line fills in at most one of these; the
    # deletion path (group 5) triggers the database work using the most
    # recently seen commit hash.
    commit_hash     = ''
    author_name     = ''
    author_email    = ''
    author_date     = ''
    version_path    = ''

    for git_log_output_line in git_log_output:
            # removes non ascii characters
            stripped = (c for c in git_log_output_line if 0 < ord(c) < 127)
            stripped_line = ''.join(stripped)

            git_log_file_matcher = re.match(git_deleted_log_file_regex, stripped_line)
            if git_log_file_matcher is not None:
                if git_log_file_matcher.group(1):
                    commit_hash  = git_log_file_matcher.group(1)
                    # print (commit_hash)
                if git_log_file_matcher.group(2):
                    author_name  = git_log_file_matcher.group(2)
                    # print (author_name)
                if git_log_file_matcher.group(3):
                    author_email = git_log_file_matcher.group(3)
                    # print (author_email)
                if git_log_file_matcher.group(4):
                    author_date  = git_log_file_matcher.group(4)
                    # print (author_date)
                if git_log_file_matcher.group(5):
                    version_path = git_log_file_matcher.group(5)
                    file_regex_matcher = re.match(file_regex, version_path)
                    if file_regex_matcher is not None:
                        # print (version_path)
                        # Skip deletions already recorded for this commit.
                        cursor.execute("select count(*) from file_versions where older_version_path = %s and commit_hash = %s", (version_path, commit_hash))
                        found_in_database = cursor.fetchone()[0]
                        if found_in_database == 0:
                            print(found_in_database, version_path, commit_hash)
                            file_name = version_path.split('/')[-1]
                            file_id = insert_file(repository_id, file_name, version_path, commit_hash)
                            if file_id is not None:
                                # History up to the parent of the deleting commit.
                                execute_git_log_to_get_versions("git log "+commit_hash+"^ --follow --stat=350 --stat-graph-width=2 -- ", file_id, version_path, repository_directory)
示例#10
0
def snapshot_the_repo(repository_name, tag):
    """Snapshot the working tree at *tag* into <repository>_tags/<tag>."""
    repository_path = DiretoryConfig.get_parameter(
        'repository_directory') + repository_name
    tag_path = repository_path + "_tags/" + tag
    # Check out the tag, then copy the whole tree next to the repository.
    checkout_step = "git checkout " + tag
    copy_step = "cp -r ./ ../" + repository_name + "_tags/" + tag
    proc = subprocess.Popen(checkout_step + " ; " + copy_step,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE,
                            stdin=subprocess.PIPE,
                            shell=True,
                            cwd=repository_path)
    proc.communicate()[1].strip().decode("utf-8")  # wait for completion; stderr discarded
    clean_unwanted_files(tag_path)
示例#11
0
def extract_file_versions(repository_id, repository_name):
    """Extract the git version history for every registered file.

    NOTE(review): the lines after the for-loop (the `git clone` call and
    the `return repository_name_matcher.group(1)`) look like the tail of a
    separate clone_repository() function that was spliced into this block
    during extraction -- confirm against the original file before relying
    on this function's return value.
    """
    repository_path = DiretoryConfig.get_parameter('repository_directory') + repository_name
    git_log_file_regex = FileHandlerConfig.get_parameter('git_log_file_regex')

    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()

    cursor.execute('select id, file_path from files where repository_id = %s', (repository_id, ))
    files_results =  cursor.fetchall()
    connection.close()

    for files_results_line in files_results:

        file_id = files_results_line[0]
        file_path = files_results_line[1]

        # Full follow-history of the file; parsing happens in the helper.
        execute_git_log_to_get_versions("git log --follow --stat=350 --stat-graph-width=2 -- ", file_id, file_path, repository_path)
    # --- suspected spliced clone_repository() tail starts here ---
    command = 'git clone ' + clone_url
    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr = subprocess.PIPE, stdin = subprocess.PIPE, shell=True, cwd=repository_default_directory)
    proc_stdout = process.communicate()[1].strip().decode('utf-8')
    print (proc_stdout)
    # git prints "Cloning into 'NAME'..." on stderr; extract NAME.
    repository_name_matcher = re.search('\'(.*)\'', proc_stdout)
    return repository_name_matcher.group(1)

def get_repository_master_branch(repository_name):
    """Read .git/HEAD and return the currently checked-out branch name."""
    working_dir = repository_default_directory + repository_name
    head_bytes = subprocess.Popen('cat .git/HEAD', stdout=subprocess.PIPE, stderr = subprocess.PIPE, stdin = subprocess.PIPE, shell=True, cwd=working_dir).communicate()[0].strip()
    # HEAD looks like "ref: refs/heads/<branch>"; take the last path segment.
    branch_matcher = re.search('ref:.*\/(.*)', str(head_bytes))
    return branch_matcher.group(1).replace('\'', '')

def insert_cloned_repo_info(repository_name, master_branch):
    """Insert one repositories row for the freshly cloned repository.

    Uses the module-level ``clone_url`` for the clone_url column.
    """
    conn = PSQLConnection.get_connection()
    cur = conn.cursor()
    row = (repository_name, clone_url, master_branch)
    cur.execute("insert into repositories (name, clone_url, master_branch) values (%s, %s, %s)", row)
    conn.commit()
    conn.close()

# --- script entry: clone the repository given on the command line and
# --- record its metadata in the database.
clone_url = sys.argv[1]
repository_default_directory = DiretoryConfig.get_parameter('repository_directory')

create_repository_directory(repository_default_directory)

if has_to_clone_repository(clone_url):
    repository_name = clone_repository(clone_url)
    # Bug fix: the original assigned the result back to the name
    # `get_repository_master_branch`, shadowing the function and breaking
    # any later call to it. Store the branch in its own variable.
    master_branch = get_repository_master_branch(repository_name)
    insert_cloned_repo_info(repository_name, master_branch)
示例#13
0
def search_authors(repository_id, repository_name):
    """Attribute each technical-debt comment to its introducing/removing author.

    For every classified comment text per file: walks its versions in order,
    marks the first one as the introduction, and after the last occurrence
    determines whether/when the comment was removed -- either by a later
    file version, or by the commit that deleted the file.
    """
    before = timeit.default_timer()
    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()
    cursor.execute("select file_id, treated_comment_text from processed_comments where repository_id = %s and td_classification != 'WITHOUT_CLASSIFICATION' group by 1,2 order by 1   ", (repository_id, ))
    files = cursor.fetchall()

    for file in files:
        file_id = file[0]
        treated_comment_text = file[1]
        print("file id:", file_id)
        print("treated_comment_text:", treated_comment_text)

        # Per-comment state across its chronological versions.
        iteration_counter = 0
        has_removed_version = False
        is_introduced_version = False
        removed_version_commit_hash = ''
        introduced_version_commit_hash = ''
        introduced_version_processed_comment_id = ''

        cursor.execute("select a.id, b.author_date, b.commit_hash, b.author_name from processed_comments a, file_versions b where a.file_versions_id = b.id and a.file_id = %s and a.treated_comment_text = %s order by 1", (file_id, treated_comment_text))
        all_file_versions = cursor.fetchall()

        for file_version_line in all_file_versions:
            iteration_counter = iteration_counter + 1
            processed_comment_id = file_version_line[0]
            author_date = file_version_line[1]
            commit_hash = file_version_line[2]
            author_name = file_version_line[3]

            # First occurrence == introducing version; later rows inherit
            # the stored introduced_version_commit_hash.
            if introduced_version_commit_hash == '':
                is_introduced_version = True
                introduced_version_commit_hash = commit_hash
                introduced_version_processed_comment_id = processed_comment_id
            else:
                is_introduced_version = False

            cursor.execute("update processed_comments set introduced_version_commit_hash = %s, is_introduced_version = %s, introduced_version_author = %s, introduced_version_date = %s where id = %s", (introduced_version_commit_hash, is_introduced_version, author_name, author_date, processed_comment_id))
            connection.commit()

            # After the last occurrence, decide how the comment disappeared.
            if iteration_counter == len(all_file_versions):
                cursor.execute ("select id, commit_hash, author_name, author_date from file_versions where file_id = %s and author_date > %s order by author_date", (file_id, author_date))
                remaining_file_versions = cursor.fetchall()

                if len(remaining_file_versions) > 0:
                    # A newer version without the comment: that version removed it.
                    removed_version_commit_hash = remaining_file_versions[0][1]
                    removed_version_author = remaining_file_versions[0][2]
                    removed_version_date = remaining_file_versions[0][3]
                    has_removed_version = True

                    cursor.execute("update processed_comments set removed_version_commit_hash = %s, has_removed_version = %s, removed_version_author = %s, removed_version_date = %s where id = %s", (removed_version_commit_hash, has_removed_version, removed_version_author, removed_version_date, introduced_version_processed_comment_id))
                    connection.commit()
                else:
                    # No newer version: check whether the whole file was deleted.
                    cursor.execute("select deletion_commit_hash from files where id = %s", (file_id,))
                    file_commit_hash_result = cursor.fetchone()

                    if file_commit_hash_result[0] is not None:
                        repository_directory = DiretoryConfig.get_parameter('repository_directory') + repository_name
                        git_log_file_regex = TDAuthorsHandlerConfig.get_parameter('git_log_file_regex')

                        removed_version_commit_hash = file_commit_hash_result[0]
                        has_removed_version = True

                        # Recover author/date of the deleting commit from git.
                        git_log = "git log -1 " + removed_version_commit_hash
                        process = subprocess.Popen(git_log, stdout=subprocess.PIPE, shell=True, cwd= repository_directory)
                        proc_stdout = process.communicate()[0].strip().decode('utf-8').split('\n')

                        for proc_stdout_line in proc_stdout:
                            git_log_file_matcher =  re.match(git_log_file_regex, proc_stdout_line)
                            if git_log_file_matcher is not None:
                                if git_log_file_matcher.group(2):
                                    git_commit_author = git_log_file_matcher.group(2)
                                if git_log_file_matcher.group(4):
                                    git_commit_date = git_log_file_matcher.group(4)

                        cursor.execute("update processed_comments set removed_version_commit_hash = %s, has_removed_version = %s, removed_version_author = %s, removed_version_date = to_timestamp(%s, 'Dy Mon DD HH24:MI:SS YYYY +-####') where id = %s", (removed_version_commit_hash, has_removed_version, git_commit_author, git_commit_date, introduced_version_processed_comment_id))
                        connection.commit()

                    else:
                        # File still exists and comment persists to the end.
                        cursor.execute("update processed_comments set has_removed_version = %s where id = %s", (has_removed_version, introduced_version_processed_comment_id))
                        connection.commit()
示例#14
0
def extract_comments(repository_id, repository_name):
    """Extract comments from the parsed (srcML) versions of every file.

    For each parsed file version: loads the srcML XML, records which
    class/interface/enum/annotation declarations it contains and their
    lines, then inserts every comment element (with its line span and the
    declaration context) into the raw_comments table.
    """
    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()

    cursor.execute("select id from files where repository_id = %s ", (repository_id, ))
    files_results = cursor.fetchall()

    for file_line in files_results:
        file_id = file_line[0]
        parsed_files_directory = DiretoryConfig.get_parameter('parsed_files_directory') + repository_name
        cursor.execute('select id, commit_hash, version_path from file_versions where file_id = %s and has_parsed_file is true order by author_date', (file_id, ))
        file_vesions_result = cursor.fetchall()

        for file_versions_line in file_vesions_result:

            class_declaration_lines = []
            has_class_declaration = False
            has_interface_declaration = False
            has_enum_declaration = False
            has_annotation_declaration = False

            file_versions_id = file_versions_line[0]
            commit_hash = file_versions_line[1]
            version_path = file_versions_line[2]
            file_extension = version_path.split('.')[-1]

            parsed_file_output = parsed_files_directory +"/"+ str(file_id) + "_" + str(file_versions_id) + "_" + commit_hash +"."+ file_extension
            print(parsed_file_output)
            try:
                tree = etree.parse(parsed_file_output)
                root = tree.getroot()
            except Exception as e:
                print(e)
                # Bug fix: the original fell through after a failed parse,
                # reusing the previous iteration's `root` (or raising
                # NameError on the first one). Skip this version instead.
                continue

            # -1 offsets below: sourceline adjustments used consistently in
            # this project -- TODO confirm against the srcml output format.
            for element in root.iter("{http://www.srcML.org/srcML/src}class"):
                class_declaration_line = element.sourceline -1
                class_declaration_lines.append(str(class_declaration_line))
                has_class_declaration = True

            for element in root.iter("{http://www.srcML.org/srcML/src}interface"):
                class_declaration_line = element.sourceline -1
                class_declaration_lines.append(str(class_declaration_line))
                has_interface_declaration = True

            for element in root.iter("{http://www.srcML.org/srcML/src}enum"):
                class_declaration_line = element.sourceline -1
                class_declaration_lines.append(str(class_declaration_line))
                has_enum_declaration = True

            for element in root.iter("{http://www.srcML.org/srcML/src}annotation_defn"):
                class_declaration_line = element.sourceline -1
                class_declaration_lines.append(str(class_declaration_line))
                has_annotation_declaration = True

            for element in root.iter("{http://www.srcML.org/srcML/src}comment"):
                start_line = element.sourceline -1
                comment_text = element.text
                comment_type = element.get("type")
                comment_format = element.get("format")

                # Line comments end where they start; block comments end
                # just before the next sibling element (or at start_line
                # when there is no next sibling).
                if comment_type == 'line':
                    end_line = start_line
                else:
                    next_element = element.getnext()
                    if next_element is not None:
                        end_line = next_element.sourceline -2
                    else:
                        end_line = start_line

                cursor.execute("insert into raw_comments (repository_id,file_id, file_versions_id, commit_hash, comment_text, comment_type, comment_format, start_line, end_line, has_class_declaration, has_interface_declaration, has_enum_declaration, has_annotation_declaration, class_declaration_lines) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)", (repository_id, file_id, file_versions_id, commit_hash, comment_text, comment_type, comment_format, start_line, end_line, has_class_declaration, has_interface_declaration, has_enum_declaration, has_annotation_declaration, ','.join(class_declaration_lines)))
                connection.commit()

    connection.close()
示例#15
0
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE,
                               stdin=subprocess.PIPE,
                               shell=True,
                               cwd=repository_path)
    proc_stdout = process.communicate()[0].strip()
    master_branch_name_matcher = re.search('ref:.*\/(.*)', str(proc_stdout))
    return master_branch_name_matcher.group(1).replace('\'', '')


def insert_cloned_repo_info(repository_name, master_branch):
    """Record the cloned repository in the repositories table.

    Reads the module-level ``clone_url`` for the clone_url column.
    """
    db_connection = PSQLConnection.get_connection()
    db_cursor = db_connection.cursor()
    db_cursor.execute(
        "insert into repositories (name, clone_url, master_branch) values (%s, %s, %s)",
        (repository_name, clone_url, master_branch))
    db_connection.commit()
    db_connection.close()


# --- script entry: clone the repository named on the command line and
# --- persist its metadata.
clone_url = sys.argv[1]
repository_default_directory = DiretoryConfig.get_parameter(
    'repository_directory')

create_repository_directory(repository_default_directory)

if has_to_clone_repository(clone_url):
    repository_name = clone_repository(clone_url)
    # Bug fix: the original rebound the function name
    # `get_repository_master_branch` to its own return value, shadowing
    # the function for the rest of the run. Use a dedicated variable.
    master_branch = get_repository_master_branch(repository_name)
    insert_cloned_repo_info(repository_name, master_branch)
示例#16
0
def checkout_to_latest_version(repository_name, master_branch):
    """Check out the repository's master branch; return stdout as lines."""
    working_dir = DiretoryConfig.get_parameter('repository_directory') + repository_name
    out_bytes, _err_bytes = subprocess.Popen("git checkout " + master_branch, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, shell=True, cwd=working_dir).communicate()
    return out_bytes.strip().decode("utf-8").split('\n')
示例#17
0
def delete_training_dataset():
    """Remove the generated NLP training dataset file from disk."""
    dataset_path = DiretoryConfig.get_parameter('nlp_directory') + NLPHandlerConfig.get_parameter('training_dataset_name')
    subprocess.call("rm " + dataset_path, shell=True)
示例#18
0
def extract_comments(repository_id, repository_name):
    """Extract comments from the parsed (srcML) versions of every file.

    For each parsed file version: loads the srcML XML, records which
    class/interface/enum/annotation declarations it contains and their
    lines, then inserts every comment element (with its line span and the
    declaration context) into the raw_comments table.
    """
    connection = PSQLConnection.get_connection()
    cursor = connection.cursor()

    cursor.execute("select id from files where repository_id = %s ",
                   (repository_id, ))
    files_results = cursor.fetchall()

    for file_line in files_results:
        file_id = file_line[0]
        parsed_files_directory = DiretoryConfig.get_parameter(
            'parsed_files_directory') + repository_name
        cursor.execute(
            'select id, commit_hash, version_path from file_versions where file_id = %s and has_parsed_file is true order by author_date',
            (file_id, ))
        file_vesions_result = cursor.fetchall()

        for file_versions_line in file_vesions_result:

            class_declaration_lines = []
            has_class_declaration = False
            has_interface_declaration = False
            has_enum_declaration = False
            has_annotation_declaration = False

            file_versions_id = file_versions_line[0]
            commit_hash = file_versions_line[1]
            version_path = file_versions_line[2]
            file_extension = version_path.split('.')[-1]

            parsed_file_output = parsed_files_directory + "/" + str(
                file_id) + "_" + str(
                    file_versions_id
                ) + "_" + commit_hash + "." + file_extension
            print(parsed_file_output)
            try:
                tree = etree.parse(parsed_file_output)
                root = tree.getroot()
            except Exception as e:
                print(e)
                # Bug fix: the original fell through after a failed parse,
                # reusing the previous iteration's `root` (or raising
                # NameError on the first one). Skip this version instead.
                continue

            # -1 offsets below: sourceline adjustments used consistently in
            # this project -- TODO confirm against the srcml output format.
            for element in root.iter("{http://www.srcML.org/srcML/src}class"):
                class_declaration_line = element.sourceline - 1
                class_declaration_lines.append(str(class_declaration_line))
                has_class_declaration = True

            for element in root.iter(
                    "{http://www.srcML.org/srcML/src}interface"):
                class_declaration_line = element.sourceline - 1
                class_declaration_lines.append(str(class_declaration_line))
                has_interface_declaration = True

            for element in root.iter("{http://www.srcML.org/srcML/src}enum"):
                class_declaration_line = element.sourceline - 1
                class_declaration_lines.append(str(class_declaration_line))
                has_enum_declaration = True

            for element in root.iter(
                    "{http://www.srcML.org/srcML/src}annotation_defn"):
                class_declaration_line = element.sourceline - 1
                class_declaration_lines.append(str(class_declaration_line))
                has_annotation_declaration = True

            for element in root.iter(
                    "{http://www.srcML.org/srcML/src}comment"):
                start_line = element.sourceline - 1
                comment_text = element.text
                comment_type = element.get("type")
                comment_format = element.get("format")

                # Line comments end on their own line; block comments end
                # just before the next sibling (or at start_line if none).
                if comment_type == 'line':
                    end_line = start_line
                else:
                    next_element = element.getnext()
                    if next_element is not None:
                        end_line = next_element.sourceline - 2
                    else:
                        end_line = start_line

                cursor.execute(
                    "insert into raw_comments (repository_id,file_id, file_versions_id, commit_hash, comment_text, comment_type, comment_format, start_line, end_line, has_class_declaration, has_interface_declaration, has_enum_declaration, has_annotation_declaration, class_declaration_lines) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
                    (repository_id, file_id, file_versions_id, commit_hash,
                     comment_text, comment_type, comment_format, start_line,
                     end_line, has_class_declaration,
                     has_interface_declaration, has_enum_declaration,
                     has_annotation_declaration,
                     ','.join(class_declaration_lines)))
                connection.commit()

    connection.close()
示例#19
0
        (repository_id, name, version_date, version_order))
    connection.commit()
    connection.close()


# Matches either a "YYYY-MM-DD HH:MM:SS" timestamp or a "(tag: NAME)"
# decoration in the pretty-printed git tag log.
tags_regex = '(\d\d\d\d\-\d\d\-\d\d\s\d\d:\d\d:\d\d)|\(tag:\s([A-Za-z0-9\-\_\.+]*)\)'

# NOTE(review): `repositoryfetch_repositories` looks like a typo for a
# `repository.fetch_repositories()` helper -- confirm against the module
# that defines it.
repository_list = repositoryfetch_repositories()
for repository_entry in repository_list:
    repository_id = repository_entry[0]
    repository_name = repository_entry[1]
    repository_url = repository_entry[2]
    repository_cloned_date = repository_entry[3]

    tag_entry_list = list_repository_tags(repository_name)
    tags_directory = DiretoryConfig.get_parameter(
        'repository_directory') + repository_name + "_tags/"
    create_directory(tags_directory)

    # NOTE(review): `version_order` is never incremented in the lines
    # visible here -- this chunk may be truncated; confirm against the
    # full script (another copy increments it after each insert).
    version_order = 0
    for tag_entry in tag_entry_list:
        if re.search(tags_regex, tag_entry) is not None:
            matche_groups = re.findall(tags_regex, tag_entry)
            """It has to have match for tag and date (merge has date but not tag)"""
            if len(matche_groups) == 2:
                # First match carries the date group, second the tag group.
                tag_date = matche_groups[0][0]
                tag = matche_groups[1][1]

                create_directory(tags_directory + tag)
                snapshot_the_repo(repository_name, tag)
                insert_snapshot_version_info(repository_id, tag, tag_date,
                                             version_order)
示例#20
0
def list_repository_tags(repository_name):
    """List the repository's tag log (author date + ref decoration) oldest first."""
    repo_dir = DiretoryConfig.get_parameter('repository_directory') + repository_name
    git_cmd = "git log --tags --date-order --reverse --simplify-by-decoration --pretty=%ai%d"
    raw_output = subprocess.Popen(git_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, shell=True, cwd=repo_dir).communicate()[0]
    return raw_output.strip().decode("utf-8").split('\n')
示例#21
0
    cursor = connection.cursor()
    cursor.execute("insert into tags (repository_id, name, version_date, version_order) values (%s, %s, to_timestamp(%s, 'YYYY-MM-DD HH24:MI:SS'), %s)", (repository_id, name, version_date, version_order))
    connection.commit()
    connection.close()

# Matches either a "YYYY-MM-DD HH:MM:SS" timestamp or a "(tag: NAME)"
# decoration in the pretty-printed git tag log.
tags_regex = '(\d\d\d\d\-\d\d\-\d\d\s\d\d:\d\d:\d\d)|\(tag:\s([A-Za-z0-9\-\_\.+]*)\)'

# NOTE(review): `repositoryfetch_repositories` looks like a typo for a
# `repository.fetch_repositories()` helper -- confirm against the module
# that defines it.
repository_list = repositoryfetch_repositories()
for repository_entry in repository_list:
    repository_id   = repository_entry[0]
    repository_name = repository_entry[1]
    repository_url  = repository_entry[2]
    repository_cloned_date = repository_entry[3]

    tag_entry_list = list_repository_tags(repository_name)
    tags_directory = DiretoryConfig.get_parameter('repository_directory') + repository_name + "_tags/"
    create_directory(tags_directory)

    version_order = 0
    for tag_entry in tag_entry_list:
        if re.search(tags_regex, tag_entry) is not None:
            matche_groups = re.findall(tags_regex, tag_entry)
            """It has to have match for tag and date (merge has date but not tag)"""
            if len(matche_groups) == 2:
                # First match carries the date group, second the tag group.
                tag_date = matche_groups[0][0]
                tag = matche_groups[1][1]

                # Snapshot the tree at this tag and record its order.
                create_directory(tags_directory + tag)
                snapshot_the_repo(repository_name, tag)
                insert_snapshot_version_info(repository_id, tag, tag_date, version_order)
                version_order = version_order + 1