예제 #1
0
    def export_to_json(self, file_name):
        """Write the result as JSON, zip it and optionally upload the archive.

        The JSON produced from ``self.data.json_ready()`` is written to
        ``file_name`` and compressed into ``file_name + '.zip'``. The user is
        then asked whether the archive should be uploaded; on a successful
        upload the profile URL is printed and opened in the browser. The
        uncompressed JSON file is removed at the end.

        Args:
            file_name: Path of the intermediate JSON file; the archive is
                created next to it with a ``.zip`` suffix.
        """
        # The context manager closes the handle even when serialization or
        # the write itself raises.
        with open(file_name, 'w+') as json_file:
            json_file.write(json.dumps(self.data.json_ready(), indent=4))

        # Zip the output (compression=8 is zipfile.ZIP_DEFLATED). The `with`
        # block closes the archive, so no explicit close() is needed.
        with ZipFile(file_name + '.zip', 'w', compression=8) as archive:
            archive.write(file_name)

        print('Result has has been saved in: ' + file_name + '.zip')
        q = Questions()

        result = q.query_yes_no(
            'Do you want to upload the result to your profile automatically?')
        if result:
            response = uploadRepo(file_name + '.zip')
            if response is not None:
                reponame = self.data.repo_name
                url = 'https://profile.codersrank.io/repo?token=' + response[
                    'token'] + '&reponame=' + reponame
                print('Go to this link in the browser => ' + url)
                webbrowser.open(url)

        # Only the zipped archive is kept on disk.
        os.remove(file_name)
예제 #2
0
def main():
    """Analyze a git repository from the command line and export the result.

    Parses the CLI arguments, collects commits from every branch, asks the
    user which authors are theirs (between 1 and 50 must be selected),
    optionally attaches library usage to the commits, obfuscates the data
    unless told otherwise, and exports the result as JSON.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'directory', help='Path to the repository. Example usage: run.sh path/to/directory')
    parser.add_argument('--output', default='./repo_data.json', dest='output',
                        help='Path to the JSON file that will contain the result')
    # NOTE: the two flags below use argparse's default "store" action, so any
    # non-empty value (even the string "False") is truthy.
    parser.add_argument('--skip_obfuscation', default=False, dest='skip_obfuscation',
                        help='If true it won\'t obfuscate the sensitive data such as emails and file names. Mostly for testing purpuse')
    parser.add_argument('--parse_libraries', default=False,
                        dest='parse_libraries', help='If true, used libraries will be parsed')

    args = parser.parse_args()

    repo = git.Repo(args.directory)
    ar = AnalyzeRepo(repo)
    q = Questions()

    print('Initialization...')
    for branch in repo.branches:
        ar.create_commits_entity_from_branch(branch.name)
    ar.flag_duplicated_commits()
    ar.get_commit_stats()
    r = ar.create_repo_entity(args.directory)

    # Ask the user if we cannot find remote URL
    if r.primary_remote_url == '':
        q.ask_primary_remote_url(r)

    authors = [(c['name'], c['email']) for c in r.contributors.values()]

    MAX_LIMIT = 50
    identities = q.ask_user_identity(authors, None)
    # Keep asking until the selection size is within 1..MAX_LIMIT.
    while True:
        selected_count = len(identities['user_identity'])
        if selected_count == 0:
            identities_err = 'Please select at least one author'
        elif selected_count > MAX_LIMIT:
            # Build a real message string; the previous comma-separated form
            # produced a tuple instead of text.
            identities_err = 'You cannot select more than %d' % MAX_LIMIT
        else:
            break
        identities = q.ask_user_identity(authors, identities_err)
    r.local_usernames = identities['user_identity']

    if args.parse_libraries:
        # build authors from the selection
        al = AnalyzeLibraries(r.commits, authors, repo.working_tree_dir)
        libs = al.get_libraries()

        # combine repo stats with libs used
        for commit in r.commits:
            if commit.hash in libs:
                commit.libraries = libs[commit.hash]

    if not args.skip_obfuscation:
        r = obfuscate(r)

    er = ExportResult(r)
    er.export_to_json(args.output)
예제 #3
0
def main():
    """Parse the CLI arguments and analyze one or several repositories.

    ``directory`` may hold several paths joined with the ``|,|`` separator.
    In that case the user picks which repositories to analyze and each one
    is exported to ``./<repo_name>.json``; otherwise the single directory is
    analyzed and exported to ``--output``. Ctrl-C terminates the process.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'directory',
        help='Path to the repository. Example usage: run.sh path/to/directory')
    parser.add_argument(
        '--output',
        default='./repo_data.json',
        dest='output',
        help='Path to the JSON file that will contain the result')
    parser.add_argument(
        '--skip_obfuscation',
        default=False,
        dest='skip_obfuscation',
        action='store_true',
        help=
        'If true it won\'t obfuscate the sensitive data such as emails and file names. Mostly for testing purpuse'
    )
    # NOTE(review): default=True combined with store_true means this flag is
    # always True; kept as-is so the CLI stays backward compatible.
    parser.add_argument('--parse_libraries',
                        default=True,
                        action='store_true',
                        dest='parse_libraries',
                        help='If true, used libraries will be parsed')
    parser.add_argument(
        '--email',
        default='',
        dest='email',
        help='If set, commits from this email are preselected on authors list')
    parser.add_argument('--skip_upload',
                        default=False,
                        action='store_true',
                        dest='skip_upload',
                        help="If true, don't prompt for inmediate upload")
    try:
        args = parser.parse_args()
        folders = args.directory.split('|,|')
        if len(folders) > 1:
            q = Questions()
            repos = q.ask_which_repos(folders)
            chosen_repos = repos.get('chosen_repos')
            if not chosen_repos:
                print("No repos chosen, will exit")
                # Actually stop here: falling through used to raise KeyError
                # when the 'chosen_repos' key was missing.
                return
            for repo in chosen_repos:
                repo_name = os.path.basename(repo).replace(' ', '_')
                output = ('./%s.json' % (repo_name))
                initialize(repo, args.skip_obfuscation, output,
                           args.parse_libraries, args.email, args.skip_upload)
                print('Finished analyzing %s ' % (repo_name))

        else:
            initialize(args.directory, args.skip_obfuscation, args.output,
                       args.parse_libraries, args.email, args.skip_upload)

    except KeyboardInterrupt:
        print("Cancelled by user")
        os._exit(0)
예제 #4
0
def init_headless(directory, skip_obfuscation, output, parse_libraries, emails,
                  debug_mode, user_commits, reponame, skip, commit_size_limit,
                  file_size_limit):
    """Analyze a repository without prompting the user and export the result.

    Args:
        directory: Path of the git repository to open.
        skip_obfuscation: When truthy, the repo entity is exported without
            being passed through ``obfuscate``.
        output: Destination handed to ``export_to_json_headless``.
        parse_libraries: When truthy, library usage is attached to commits.
        emails: List merged into the repo entity's local usernames.
        debug_mode: When truthy, DEBUG records of the "main" logger are also
            written to ``extractor_debug_info.log``.
        user_commits: Forwarded to ``analyse_master_user_commits``.
        reponame: Name stored on the repo entity and shown in the final
            message.
        skip, commit_size_limit, file_size_limit: Forwarded unchanged to
            ``AnalyzeLibraries``.
    """
    # Configure the "main" logger first.
    log = logging.getLogger("main")
    if not debug_mode:
        log.setLevel(logging.WARNING)
    else:
        log.setLevel(logging.DEBUG)
        file_handler = logging.FileHandler('extractor_debug_info.log')
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(
            logging.Formatter(
                '%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
        log.addHandler(file_handler)

    repo = git.Repo(directory)
    analyzer = AnalyzeRepo(repo)
    # Not used afterwards; still constructed so that any effect of creating
    # it is preserved.
    Questions()

    print('Initialization...')
    for branch in repo.branches:
        analyzer.create_commits_entity_from_branch(branch.name)
    analyzer.flag_duplicated_commits()
    analyzer.get_commit_stats()
    print('Analysing the master branch..')
    analyzer.analyse_master_user_commits(user_commits)
    print('Creating the repo entity..')
    entity = analyzer.create_repo_entity(directory)

    # Merge the supplied emails with the detected ones, without duplicates.
    entity.local_usernames = list(set(entity.local_usernames + emails))
    print('Setting the local user_names ::', entity.local_usernames)
    entity.repo_name = reponame

    if parse_libraries:
        # Headless mode only provides plain emails.
        # TODO! Support both name -> email and email formats
        author_emails = list(entity.local_usernames)

        if author_emails:
            libs = AnalyzeLibraries(entity.commits, author_emails,
                                    repo.working_tree_dir, skip,
                                    commit_size_limit,
                                    file_size_limit).get_libraries()

            # Attach the detected libraries to the matching commits.
            for commit in entity.commits:
                if commit.hash in libs:
                    commit.libraries = libs[commit.hash]

    if not skip_obfuscation:
        entity = obfuscate(entity)

    ExportResult(entity).export_to_json_headless(output)
    print('Successfully analysed the repo ==>' + reponame)
예제 #5
0
def main():
    """Analyze a repository given on the command line and export it as JSON.

    Collects commits from every branch, asks for the primary remote URL when
    none was detected, makes the user pick between 1 and 50 identities, and
    writes the result to the ``--output`` path.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'directory',
        help='Path to the repository. Example usage: run.sh path/to/directory')
    parser.add_argument(
        '--output',
        dest='output',
        default='./repo_data.json',
        help='Path to the JSON file that will contain the result')
    parser.add_argument(
        '--skip_obfuscation',
        dest='skip_obfuscation',
        default=False,
        help='If true it won\'t obfuscate the sensitive data such as emails and file names. Mostly for testing purpuse')
    cli = parser.parse_args()

    repo = git.Repo(cli.directory)
    analyzer = AnalyzeRepo(repo, cli.skip_obfuscation)
    questions = Questions()

    print('Initialization...')
    for branch in repo.branches:
        analyzer.create_commits_entity_from_branch(branch.name)
    analyzer.flag_duplicated_commits()
    analyzer.get_commit_stats()
    entity = analyzer.create_repo_entity(cli.directory)

    # Without a detected remote URL the user has to provide one.
    if r_url_missing := (entity.primary_remote_url == ''):
        questions.ask_primary_remote_url(entity)

    MAX_LIMIT = 50
    identities = questions.ask_user_identity(entity)
    # Repeat the question until the selection size is within 1..MAX_LIMIT.
    while True:
        chosen = len(identities['user_identity'])
        if 0 < chosen <= MAX_LIMIT:
            break
        if chosen == 0:
            print('Please select at least one.')
        else:
            print('You cannot select more than', MAX_LIMIT)
        identities = questions.ask_user_identity(entity)

    entity.local_usernames = identities['user_identity']
    ExportResult(entity).export_to_json(cli.output)
예제 #6
0
def main():
    """Parse the CLI arguments and analyze one or several repositories.

    ``directory`` may hold several paths joined with the ``|,|`` separator.
    In that case the user picks which repositories to analyze and each one
    is exported to ``./<repo_name>.json``; otherwise the single directory is
    analyzed and exported to ``--output``. Ctrl-C terminates the process.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'directory', help='Path to the repository. Example usage: run.sh path/to/directory')
    parser.add_argument('--output', default='./repo_data.json', dest='output',
                        help='Path to the JSON file that will contain the result. Use .json extension otherwise it cannot be recognized.')
    parser.add_argument('--skip_obfuscation', default=False, dest='skip_obfuscation', action='store_true',
                        help='If true it won\'t obfuscate the sensitive data such as emails and file names. Mostly for testing purpose')
    # NOTE(review): default=True combined with store_true means this flag is
    # always True; kept as-is so the CLI stays backward compatible.
    parser.add_argument('--parse_libraries',  default=True, action='store_true',
                        dest='parse_libraries', help='If true, used libraries will be parsed')
    parser.add_argument('--email', default='',
                        dest='email', help='If set, commits from this email are preselected on authors list')
    parser.add_argument('--skip_upload',  default=False, action='store_true',
                        dest='skip_upload', help="If true, don't prompt for inmediate upload")
    parser.add_argument('--debug_mode', default=False, action='store_true',
                        dest='debug_mode', help="Print additional debug info into extractor_debug_info.log")
    parser.add_argument('--noskip', default=True, dest='skip', action='store_false',
                        help='Do not skip any commits in analyze_libraries. May impact running time.')
    parser.add_argument('--commit_size_limit', default=5, type=int,
                        help='If the estimated size of the changed files is bigger than this, we skip the commit')
    parser.add_argument('--file_size_limit', default=2, type=int,
                        help='The library analyzer skips files bigger than this limit')
    try:
        args = parser.parse_args()
        folders = args.directory.split('|,|')
        if len(folders) > 1:
            q = Questions()
            repos = q.ask_which_repos(folders)
            chosen_repos = repos.get('chosen_repos')
            if not chosen_repos:
                print("No repos chosen, will exit")
                # Actually stop here: falling through used to raise KeyError
                # when the 'chosen_repos' key was missing.
                return
            for repo in chosen_repos:
                repo_name = os.path.basename(repo).replace(' ', '_')
                output = ('./%s.json' % (repo_name))
                initialize(repo, args.skip_obfuscation, output, args.parse_libraries, args.email, args.skip_upload,
                           args.debug_mode, args.skip, args.commit_size_limit, args.file_size_limit)
                print('Finished analyzing %s ' % (repo_name))

        else:
            initialize(args.directory, args.skip_obfuscation, args.output,
                       args.parse_libraries, args.email, args.skip_upload, args.debug_mode, args.skip,
                       args.commit_size_limit, args.file_size_limit)

    except KeyboardInterrupt:
        print("Cancelled by user")
        os._exit(0)
예제 #7
0
    def export_to_json_interactive(self, file_name, skip_upload=False):
        """Dump the result and, unless skipped, offer to upload the archive.

        Calls ``self.dump(file_name)``, asks the user whether
        ``file_name + '.zip'`` should be uploaded (the question is skipped
        when ``skip_upload`` is set), prints and opens the profile URL after
        a successful upload, and finally removes ``file_name``.

        Args:
            file_name: Path passed to ``dump``; presumably ``dump`` also
                creates the ``.zip`` next to it -- TODO confirm.
            skip_upload: Anything that compares unequal to ``False``
                suppresses the upload question.
        """
        self.dump(file_name)

        questions = Questions()
        # The `!= False` comparison is kept verbatim: values such as None
        # also count as "skip".
        wants_upload = False if skip_upload != False else questions.query_yes_no(
            'Do you want to upload the result to your profile automatically?'
        )

        response = uploadRepo(file_name + '.zip') if wants_upload else None
        if response is not None:
            repo_name = self.data.repo_name
            token = response['token']
            url = 'https://profile.codersrank.io/repo?token=' + token
            url = url + '&reponame=' + repo_name
            print('Go to this link in the browser => ' + url)
            webbrowser.open(url)

        # The file at file_name is not kept once the flow is finished.
        os.remove(file_name)
예제 #8
0
def init_headless(directory, skip_obfuscation, output, parse_libraries, emails,
                  user_commits, reponame):
    """Analyze a repository without prompting the user and export the result.

    Args:
        directory: Path of the git repository to open.
        skip_obfuscation: When truthy, the repo entity is exported without
            being passed through ``obfuscate``.
        output: Destination handed to ``export_to_json_headless``.
        parse_libraries: When truthy, library usage is attached to commits.
        emails: List merged into the repo entity's local usernames.
        user_commits: Forwarded to ``analyse_master_user_commits``.
        reponame: Name stored on the repo entity and shown in the final
            message.
    """
    repo = git.Repo(directory)
    analyzer = AnalyzeRepo(repo)
    # Not used afterwards; still constructed so that any effect of creating
    # it is preserved.
    Questions()

    print('Initialization...')
    for branch in repo.branches:
        analyzer.create_commits_entity_from_branch(branch.name)
    analyzer.flag_duplicated_commits()
    analyzer.get_commit_stats()
    print('Analysing the master branch..')
    analyzer.analyse_master_user_commits(user_commits)
    print('Creating the repo entity..')
    entity = analyzer.create_repo_entity(directory)

    # Merge the supplied emails with the detected ones, without duplicates.
    entity.local_usernames = list(set(entity.local_usernames + emails))
    print('Setting the local user_names ::', entity.local_usernames)
    entity.repo_name = reponame

    if parse_libraries:
        # Headless mode only provides plain emails.
        # TODO! Support both name -> email and email formats
        author_emails = list(entity.local_usernames)

        if author_emails:
            libs = AnalyzeLibraries(entity.commits, author_emails,
                                    repo.working_tree_dir).get_libraries()

            # Attach the detected libraries to the matching commits.
            for commit in entity.commits:
                if commit.hash in libs:
                    commit.libraries = libs[commit.hash]

    if not skip_obfuscation:
        entity = obfuscate(entity)

    ExportResult(entity).export_to_json_headless(output)
    print('Successfully analysed the repo ==>' + reponame)
예제 #9
0
def initialize(directory, skip_obfuscation, output, parse_libraries, email,
               skip_upload, debug_mode, skip, commit_size_limit,
               file_size_limit):
    """Interactively analyze one repository and export the result.

    The repository is ignored (the function returns early) when it has no
    branches, no remotes or no contributors. Otherwise the user selects
    between 1 and 50 identities, library usage is optionally attached to the
    commits, the data is obfuscated unless ``skip_obfuscation`` is set, and
    the result is exported through ``export_to_json_interactive``.

    Args:
        directory: Path of the git repository to open.
        skip_obfuscation: When truthy, skip the ``obfuscate`` step.
        output: Path handed to the exporter.
        parse_libraries: When truthy, run ``AnalyzeLibraries``.
        email: Passed to the first identity question (per the CLI help,
            commits from this email are preselected).
        skip_upload: Forwarded to the exporter.
        debug_mode: When truthy, DEBUG records of the "main" logger are also
            written to ``extractor_debug_info.log``.
        skip, commit_size_limit, file_size_limit: Forwarded unchanged to
            ``AnalyzeLibraries``.
    """
    # Initialize logger
    logger = logging.getLogger("main")
    if debug_mode:
        logger.setLevel(logging.DEBUG)
        debug_log = 'extractor_debug_info.log'
        # This function may run once per repository in the same process;
        # attach the file handler only once so records are not duplicated.
        already_attached = any(
            isinstance(handler, logging.FileHandler)
            and handler.baseFilename.endswith(debug_log)
            for handler in logger.handlers)
        if not already_attached:
            fh = logging.FileHandler(debug_log)
            fh.setLevel(logging.DEBUG)
            formatter = logging.Formatter(
                '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            fh.setFormatter(formatter)
            logger.addHandler(fh)
    else:
        logger.setLevel(logging.WARNING)

    logger.debug("Initialized main logger.")

    repo = git.Repo(directory)
    ar = AnalyzeRepo(repo)
    q = Questions()

    print('Analyzing repo under %s ...' % (directory))

    try:
        # Stop parsing if there are no branches
        if not repo.branches:
            print('No branches detected, will ignore this repo')
            return

        for branch in repo.branches:
            ar.create_commits_entity_from_branch(branch.name)
        ar.flag_duplicated_commits()
        ar.get_commit_stats()
        r = ar.create_repo_entity(directory)

        # Stop parsing if there are no remotes
        if not r.original_remotes:
            print('No remotes detected, will ignore this repo')
            return

        # Ask the user if we cannot find remote URL
        if r.primary_remote_url == '':
            q.ask_primary_remote_url(r)

        # Stop parsing if there are no authors
        authors = [(c['name'], c['email']) for c in r.contributors.values()]
        if not authors:
            print('No authors detected, will ignore this repo')
            return

        MAX_LIMIT = 50
        identities = q.ask_user_identity(authors, None, email)
        # Keep asking until the selection size is within 1..MAX_LIMIT.
        while True:
            selected_count = len(identities['user_identity'])
            if selected_count == 0:
                identities_err = 'Please select at least one author'
            elif selected_count > MAX_LIMIT:
                # Build a real message string; the previous comma-separated
                # form produced a tuple instead of text.
                identities_err = 'You cannot select more than %d' % MAX_LIMIT
            else:
                break
            identities = q.ask_user_identity(authors, identities_err)
        r.local_usernames = identities['user_identity']

        if parse_libraries:
            # build authors from the selection
            # extract email from name -> email list; an entry without the
            # separator is used as-is instead of raising IndexError
            author_emails = [
                entry.split(' -> ', 1)[-1] for entry in r.local_usernames
            ]

            if author_emails:
                al = AnalyzeLibraries(r.commits, author_emails,
                                      repo.working_tree_dir, skip,
                                      commit_size_limit, file_size_limit)
                libs = al.get_libraries()
                # combine repo stats with libs used
                for commit in r.commits:
                    if commit.hash in libs:
                        commit.libraries = libs[commit.hash]

        if not skip_obfuscation:
            r = obfuscate(r)
        er = ExportResult(r)
        er.export_to_json_interactive(output, skip_upload)
    except KeyboardInterrupt:
        print("Cancelled by user")
        return
예제 #10
0
def init_headless(directory,
                  skip_obfuscation,
                  output,
                  parse_libraries,
                  emails,
                  debug_mode,
                  user_commits,
                  reponame,
                  skip,
                  commit_size_limit,
                  file_size_limit,
                  seed,
                  timeout_seconds=600):
    """Analyze a repository without prompting, guarded by a timeout.

    Args:
        directory: Path of the git repository to open.
        skip_obfuscation: When truthy, skip the ``obfuscate`` step.
        output: Destination handed to ``export_to_json_headless``.
        parse_libraries: When truthy (and commits exist), library usage and
            the ``emails_v2`` detection are run.
        emails: List merged into the repo entity's local usernames; at most
            50 usernames are kept.
        debug_mode: When truthy, DEBUG records of the "main" logger are also
            written to ``extractor_debug_info.log``.
        user_commits: Forwarded to ``analyse_master_user_commits``.
        reponame: Name stored on the repo entity and shown in the final
            message.
        skip, commit_size_limit, file_size_limit: Forwarded unchanged to
            ``AnalyzeLibraries``.
        seed: Forwarded to ``match_emails``.
        timeout_seconds: Delay after which the ``timeout`` callback fires.
    """
    # Initialize logger
    logger = logging.getLogger("main")
    if debug_mode:
        logger.setLevel(logging.DEBUG)
        fh = logging.FileHandler('extractor_debug_info.log')
        fh.setLevel(logging.DEBUG)
        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        fh.setFormatter(formatter)
        logger.addHandler(fh)
    else:
        logger.setLevel(logging.WARNING)

    repo = git.Repo(directory)
    ar = AnalyzeRepo(repo)
    q = Questions()
    # Watchdog timer: `timeout` is defined elsewhere and presumably
    # interrupts the main thread, which surfaces as the KeyboardInterrupt
    # handled below -- TODO confirm.
    timer = Timer(timeout_seconds, timeout)
    timer.start()
    try:
        print('Initialization...')
        for branch in repo.branches:
            ar.create_commits_entity_from_branch(branch.name)
        ar.flag_duplicated_commits()
        ar.get_commit_stats()
        print('Analysing the master branch..')
        ar.analyse_master_user_commits(user_commits)
        print('Creating the repo entity..')
        r = ar.create_repo_entity(directory)

        r.local_usernames = list(set(r.local_usernames + emails))
        MAX_EMAIL_LIMIT = 50
        if len(r.local_usernames) > MAX_EMAIL_LIMIT:
            print("Email count (" + str(len(r.local_usernames)) +
                  ") for this repo exceeds the limit of " +
                  str(MAX_EMAIL_LIMIT) + " emails.")
            r.local_usernames = r.local_usernames[0:MAX_EMAIL_LIMIT]
        print('Setting the local user_names ::', r.local_usernames)
        r.repo_name = reponame

        if parse_libraries and len(ar.commit_list) > 0:
            # build authors from the the email list provided
            # we are provided only emails in the headless mode
            # TODO! Support both name -> email and email formats
            author_emails = list(r.local_usernames)

            if author_emails:
                al = AnalyzeLibraries(r.commits, author_emails,
                                      repo.working_tree_dir, skip,
                                      commit_size_limit, file_size_limit)
                libs = al.get_libraries()

                # combine repo stats with libs used
                for commit in r.commits:
                    if commit.hash in libs:
                        commit.libraries = libs[commit.hash]

            # new email detection (best effort)
            try:
                emails_v2 = match_emails(directory, seed)
                r.emails_v2 = emails_v2["emails"]
            except Exception:
                # `except Exception` rather than a bare `except`, so the
                # KeyboardInterrupt used for the timeout is not swallowed
                # here and still reaches the outer handler.
                logger.debug("Email detection failed", exc_info=True)
                r.emails_v2 = list()

        if not skip_obfuscation:
            r = obfuscate(r)

        er = ExportResult(r)
        er.export_to_json_headless(output)
        print('Successfully analysed the repo ==>' + reponame)
    except KeyboardInterrupt:
        print("{} timeouted after {} seconds.".format(repo.working_dir,
                                                      timeout_seconds))
        print("Deleting", repo.working_dir)
        try:
            shutil.rmtree(repo.working_dir)
        except Exception as e:
            # PermissionError and NotADirectoryError are subclasses of
            # Exception, so a single clause covers the previous tuple.
            print("Error when deleting {}. Message: {}".format(
                repo.working_dir, str(e)))
    finally:
        # Always stop the watchdog, whether the run finished or failed.
        timer.cancel()