Example #1
def patch_issues(save_to_external_file):
        with open(repos_filtered_heuristic_metrics, 'r') as f:
                repos = json.load(f)
                enriched_result = list()
                counter = 1
                for p in repos:
                        print("Patching issues for repo number " + str(counter) + " --- " + p['id'])
                        counter += 1
                        if(p['source'] == "github"):
                                mined_issue_data = get_issues_github(p)
                                p['num_issues'] = mined_issue_data[0]
                                p['open_issues'] = mined_issue_data[1]
                                p['closed_issues'] = mined_issue_data[2]
                        else:
                                p['num_issues'] = "NA"
                                p['open_issues'] = "NA"
                                p['closed_issues'] = "NA"
                        enriched_result.append(p)
        if(save_to_external_file):
                c.save(repos_filtered_heuristic_metrics, enriched_result)
        else:
                csv.register_dialect('tab_separated_csv', delimiter = '\t', quoting=csv.QUOTE_ALL, skipinitialspace=True)
                to_save = list()
                for p in enriched_result:
                        to_save.append([p['id'], p['num_issues'], p['open_issues'], p['closed_issues']])
                with open("./repos_mining_data/otherData/issues_patch.csv", 'w') as f:
                        writer = csv.writer(f, dialect='tab_separated_csv')
                        for row in to_save:
                                writer.writerow(row)
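All of these snippets rely on a shared helper module c whose save function is never shown. Below is a minimal sketch of what it presumably does, assuming it simply dumps the given object as JSON; the directory creation is an extra convenience, not confirmed by the source.

import json
import os

def save(file_path, data):
    # Hypothetical stand-in for the shared c.save helper: serialize `data`
    # as JSON at `file_path`, creating parent folders if they are missing.
    os.makedirs(os.path.dirname(file_path) or '.', exist_ok=True)
    with open(file_path, 'w') as f:
        json.dump(data, f, indent=4)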
Example #2
    def synchronize_games(self):
        if config.cloudfolder is None:
            folderpicker = SyncfolderPicker()
            if not folderpicker.exec():
                return
            config.cloudfolder = folderpicker.get_syncfolder()
            config.save()
            config.load()

        remembered_resolution_strategy = None
        model = self.list_found_games.model
        for row in range(0, model.rowCount()):
            item = model.item(row)
            game = item.game
            if item.checkState() == Qt.Checked:
                if savesync.has_conflicts(game):
                    resolution_strategy = remembered_resolution_strategy
                    if resolution_strategy is None:
                        conflict_resolution_dialog = ConflictResolutionDialog(game.name)
                        if conflict_resolution_dialog.exec():
                            result = conflict_resolution_dialog.get_dialog_result()
                            resolution_strategy = result[0]
                            if result[1]:
                                remembered_resolution_strategy = resolution_strategy
                    if resolution_strategy == ConflictResolutionDialog.ResolutionMethod.OVERWRITE_LOCAL:
                        savesync.remove_local_savegame(game)
                        savesync.move_save_to_cloud(game)
                    elif resolution_strategy == ConflictResolutionDialog.ResolutionMethod.OVERWRITE_CLOUD:
                        savesync.remove_cloud_savegame(game)
                        savesync.move_save_to_cloud(game)
                else:
                    savesync.move_save_to_cloud(game)
        self.refresh_games()
Example #3
def analyze_pair(rosmap_file_path, gh_file_path, merged_file_path):

    with open(rosmap_file_path, 'r') as rosmap:
        with open(gh_file_path, 'r') as gh:  
            # we load the data
            rosmap_data = get_all_github_repos_data(json.load(rosmap))
            gh_reader = csv.DictReader(gh, delimiter='\t')

            rosmap_urls = list()
            for p in rosmap_data:
                rosmap_urls.append(get_rosmap_repo_substring(p['url']))

            results = rosmap_urls
            
            for line in gh_reader:
                try:
                    current_url = get_github_repo_substring(line['url'])
                    if(not current_url in rosmap_urls):
                        results.append(current_url)
                except (AttributeError, TypeError, IndexError):
                    print("Error for: " + str(line))

    c.save(merged_file_path, results)
    print(rosmap_file_path + " + " + gh_file_path + " = " + str(len(results)) + ". Saved in: " + merged_file_path)
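The helpers get_rosmap_repo_substring and get_github_repo_substring are not part of this example. A plausible sketch, assuming they normalize repository URLs down to a comparable owner/name key; the exact normalization rules are an assumption.

from urllib.parse import urlparse

def get_github_repo_substring(url):
    # Assumed behaviour: reduce e.g. https://github.com/owner/repo.git
    # to the comparable key "owner/repo".
    path = urlparse(url).path.strip('/')
    if path.endswith('.git'):
        path = path[:-4]
    return '/'.join(path.split('/')[:2]).lower()

def get_rosmap_repo_substring(url):
    # Assumed to apply the same normalization to the URLs stored in the ROSMAP output.
    return get_github_repo_substring(url)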
Example #4
def patch_languages(save_to_external_file):
        github_no_simul = json.load(open('./repos_mining_data/intermediateResults/6_github_no_simul.json', 'r'))
        bitbucket_no_simul = json.load(open('./repos_mining_data/intermediateResults/6_bitbucket_no_simul.json', 'r'))
        with open(repos_filtered_heuristic_metrics, 'r') as f:
                repos = json.load(f)
                enriched_result = list()
                counter = 1
                for p in repos:
                        print("Patching languages for repo number " + str(counter) + " --- " + p['id'])
                        counter += 1
                        if(p['source'] == "github"):
                                p['language'] = get_repo_data(p['id'], github_no_simul)
                        if(p['source'] == "bitbucket"):
                                p['language'] = get_repo_data(p['id'], bitbucket_no_simul)
                        if(p['source'] == "gitlab"):
                                p['language'] = "NA"
                        enriched_result.append(p)
        if(save_to_external_file):
                c.save('./repos_mining_data/otherData/repos_filtered_launch_file_metrics_languages.json', enriched_result)
        else:
                csv.register_dialect('tab_separated_csv', delimiter = '\t', quoting=csv.QUOTE_ALL, skipinitialspace=True)
                to_save = list()
                for p in enriched_result:
                        to_save.append([p['id'], p['language']])
                with open("./repos_mining_data/otherData/languages_patch.csv", 'w') as f:
                        writer = csv.writer(f, dialect='tab_separated_csv')
                        for row in to_save:
                                writer.writerow(row)
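get_repo_data is not shown either; the sketch below assumes it looks the repository up by identifier in the previously mined GitHub or Bitbucket payloads and returns its language field, with the field names inferred from how those payloads are used elsewhere in these examples.

def get_repo_data(repo_id, mined_repos):
    # Hypothetical lookup: find the mined record whose identifier matches
    # `repo_id` and return its language, falling back to "NA".
    for repo in mined_repos:
        if repo.get('full_name') == repo_id or repo.get('path_with_namespace') == repo_id:
            return repo.get('language') or "NA"
    return "NA"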
Example #5
def start_detecting():
    with open(cloned_repos, 'r') as f:
        repos_list = json.load(f)
        counter = 1
        detection_result = list()
        for p in repos_list:
            print("Detecting files for repo number " + str(counter) + " --- " +
                  p['id'])
            counter += 1
            p['xml_launch_files'] = detect_xml_launch_files(p)
            p['py_launch_files'] = detect_py_launch_files(p)
            detection_result.append(p)
    c.save(detection_result_path, detection_result)
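detect_xml_launch_files and detect_py_launch_files are defined elsewhere. As an illustration of the records they are expected to produce (apply_filtering_heuristics later reads num_nodes and num_includes from each entry), here is a rough sketch of the XML variant; it is an assumption, not the project's actual detector.

import os
import xml.etree.ElementTree as ET

def detect_xml_launch_files(repo):
    # Walk the local clone, parse every *.launch file and count its <node>
    # and <include> tags. The returned dictionaries mirror the fields read
    # later by apply_filtering_heuristics.
    found = list()
    for root, _, files in os.walk(repo['absolute_clone_path']):
        for name in files:
            if not name.endswith('.launch'):
                continue
            path = os.path.join(root, name)
            try:
                tree = ET.parse(path)
            except ET.ParseError:
                continue
            found.append({'path': path,
                          'num_nodes': len(tree.findall('.//node')),
                          'num_includes': len(tree.findall('.//include'))})
    return found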
Example #6
def crawl_data(app):
    try:
        # Tells the caller whether we just downloaded new data
        is_new_data_available = False

        # Download Google Play metadata
        app_metadata = get_gp_metadata(app)
        app_latest_version = app_metadata['version']
        app_suffix_path = app['id'] + c.SEPARATOR + app_latest_version

        # Save the metadata if it is new
        metadata_path = c.DATA_PATH + app_suffix_path + c.SEPARATOR + 'metadata.json'
        if (not os.path.exists(metadata_path)):
            is_new_data_available = True
            c.save(metadata_path, app_metadata)

        # Save the reviews
        reviews_path = c.DATA_PATH + app_suffix_path + c.SEPARATOR + 'reviews.json'
        app_reviews = get_reviews(app)
        c.save(reviews_path, app_reviews)

        # Download the APK if it is new
        apk_path = c.APKS_PATH + app_suffix_path + '.apk'
        if not os.path.exists(apk_path):
            if not download_apk(app['id'], apk_path):
                print(
                    'Error while downloading the following app, we skip it: ' +
                    app['id'])
                return False
            elif not apk_downloader.verify_apk(app['id'], apk_path,
                                               app_suffix_path):
                print('The downloaded APK is not well formed, we skip it: ' +
                      apk_path)
                return False

        app['latest_crawled_version'] = app_latest_version
        app['latest_crawl'] = int(time.time())

        # Let's inform the user about whether new data has been crawled
        if is_new_data_available:
            print('Crawled new data for: ' + app['id'] + ' - version: ' +
                  app_latest_version)
        else:
            print('Already up to date: ' + app['id'] + ' - version: ' +
                  app_latest_version)

        return is_new_data_available
    except Exception:
        print('It seems like we had some problems in fetching new data for: ' +
              app['id'] + '. So, we skip it in the analysis.')
        return False
Example #7
def get_repos_list_to_clone(file_path):
    gitlab_no_simul = json.load(open('./repos_mining_data/intermediateResults/6_gitlab_no_simul.json', 'r'))
    bitbucket_no_simul = json.load(open('./repos_mining_data/intermediateResults/6_bitbucket_no_simul.json', 'r'))
    github_no_simul = json.load(open('./repos_mining_data/intermediateResults/6_github_no_simul.json', 'r'))

    repos = list()
    for p in github_no_simul:
        repos.append({'id': p['full_name'], 'description': p['description'], 'web_url': p['html_url'], 'clone_url': p['clone_url'], 'default_branch': p['default_branch'], 'source': 'github'})
    for p in gitlab_no_simul:
        repos.append({'id': p['path_with_namespace'], 'description': p['description'], 'web_url': p['web_url'], 'clone_url': p['http_url_to_repo'], 'default_branch': p['default_branch'], 'source': 'gitlab'})
    for p in bitbucket_no_simul:
        repos.append({'id': p['full_name'], 'description': p['description'], 'web_url': p['links']['html']['href'], 'clone_url': p['links']['clone'][0]['href'], 'default_branch': p['mainbranch']['name'], 'source': 'bitbucket'})

    c.save(file_path, repos)
Example #8
def start_cloning():
    with open(repos_to_clone, 'r') as f:
        repos_list = json.load(f)
        counter = 1
        cloned_repos = list()
        for p in repos_list:
            print("Cloning repo number " + str(counter) + " --- " + p['id'])
            counter += 1
            absolute_path_to_clone = get_clone_path(p, True)
            local_path_to_clone = get_clone_path(p, False)
            clone_repo(p, absolute_path_to_clone)
            p['absolute_clone_path'] = absolute_path_to_clone
            p['local_clone_path'] = local_path_to_clone
            cloned_repos.append(p)
    c.save(cloned_repos_json, cloned_repos)
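clone_repo and get_clone_path are not included here. A minimal sketch of the cloning call, assuming it simply shells out to git with the clone_url collected by get_repos_list_to_clone.

import subprocess

def clone_repo(repo, destination_path):
    # Assumed implementation: plain `git clone` of the repository's clone_url
    # into the destination computed by get_clone_path.
    subprocess.run(['git', 'clone', repo['clone_url'], destination_path], check=False)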
Example #9
    def init_ui(self):
        layout = QGridLayout(self)

        label_found_games = QLabel("Found games", self)
        layout.addWidget(label_found_games, 0, 0)

        label_synchronized_games = QLabel("Synchronized games", self)
        layout.addWidget(label_synchronized_games, 0, 1)

        self.list_found_games = GameList(self)
        layout.addWidget(self.list_found_games, 1, 0)

        self.list_synchronized_games = GameList(self)
        layout.addWidget(self.list_synchronized_games, 1, 1)

        button_synchronize = QPushButton("Synchronize selected", self)
        button_synchronize.clicked.connect(self.synchronize_games)
        layout.addWidget(button_synchronize, 2, 0)

        button_unsynchronize = QPushButton("Unsynchronize selected", self)
        button_unsynchronize.clicked.connect(self.unsynchronize_games)
        layout.addWidget(button_unsynchronize, 2, 1)

        button_group = QWidget(self)
        button_group_layout = QVBoxLayout(button_group)

        button_update = QPushButton("Update supported games list", self)
        button_update.clicked.connect(self.update_games_list)
        button_group_layout.addWidget(button_update)

        button_change_syncfolder = QPushButton("Change sync folder", button_group)
        button_change_syncfolder.clicked.connect(self.change_sync_folder)
        button_group_layout.addWidget(button_change_syncfolder)

        layout.addWidget(button_group, 1, 3, Qt.AlignTop)

        if not config.exists():
            folderpicker = SyncfolderPicker()
            if folderpicker.exec():
                config.cloudfolder = folderpicker.get_syncfolder()
                config.save()

        if config.exists():
            config.load()
            self.refresh_games()

        self.center()
        self.show()
Example #10
def collect_data():

    apps = json.load(open(c.APPS_PATH, 'r'))

    for a in apps:
        # Crawl data from the Google Play store
        crawled_new_data = crawler.crawl_data(a)

        # if the app has a new release we did not analyze before...
        if crawled_new_data:
            # Launch the Androguard and Androwarn analyses
            androguard_androwarn_analyzer.analyze(a)
            # Analyze the servers pointed to by the URLs we found in the String analysis of Androguard
            servers_analyzer.analyze(a)

    # Finally, if everything goes well, save the updated apps.json file with the new timestamps and versions
    c.save(c.APPS_PATH, apps)
Example #11
def patch_contributors(save_to_external_file):
        with open(repos_filtered_heuristic_metrics, 'r') as f:
                repos = json.load(f)
                enriched_result = list()
                for p in repos:
                        p['num_contributors'] = get_contributors_locally(p['local_clone_path'])
                        enriched_result.append(p)
        if(save_to_external_file):
                c.save(repos_filtered_heuristic_metrics, enriched_result)
        else:
                csv.register_dialect('tab_separated_csv', delimiter = '\t', quoting=csv.QUOTE_ALL, skipinitialspace=True)
                to_save = list()
                for p in enriched_result:
                        to_save.append([p['id'], p['num_contributors']])
                with open("./repos_mining_data/otherData/locally_identified_contributors.csv", 'w') as f:
                        writer = csv.writer(f, dialect='tab_separated_csv')
                        for row in to_save:
                                writer.writerow(row)
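get_contributors_locally is not shown. A hedged sketch, assuming it counts the distinct author e-mails in the git history of the local clone; the exact definition of a contributor used by the authors is unknown.

import subprocess

def get_contributors_locally(clone_path):
    # Count distinct author e-mails in the clone's git history; return "NA"
    # if git fails (e.g. the clone is missing).
    result = subprocess.run(['git', '-C', clone_path, 'log', '--format=%ae'],
                            capture_output=True, text=True)
    if result.returncode != 0:
        return "NA"
    return len(set(line for line in result.stdout.splitlines() if line))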
Example #12
def update_apps_lists(root_path, countries):

    for e in countries:
        country = e['code']
        lang = e['lang']
        data_path = root_path + '/data_' + country
        if not os.path.exists(data_path):
            print(
                'The country with code "' + country +
                '" is new; setting up its folder and apps.json file now...'
            )
            os.mkdir(data_path)
            os.mkdir(data_path + '/apks')
            os.mkdir(data_path + '/data')
            os.mkdir(data_path + '/reports')
            c.save(data_path + '/apps.json', [])

        c.setPaths(data_path)

        url = 'https://play.google.com/store/search?q=covid&gl=' + country
        r = requests.get(url)
        search_page = BeautifulSoup(r.text, 'html.parser')
        # We look for all the links referring to the apps listed by the search
        apps = search_page.find_all(
            "a", href=re.compile(r'^/store/apps/details\?id='))
        app_ids = list()
        # We collect all app ids
        for a in apps:
            app_ids.append(a['href'].replace('/store/apps/details?id=', ''))
        # Remove duplicate ids
        app_ids = list(dict.fromkeys(app_ids))

        # Now we iterate over all apps and add the new ones to the apps.json file
        analysed_apps = json.load(open(c.APPS_PATH, 'r'))
        for a in app_ids:
            if is_new(a, analysed_apps):
                analysed_apps.append({
                    'id': a,
                    'store_country': country,
                    'store_lang': lang
                })

        c.save(c.APPS_PATH, analysed_apps)
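is_new is not listed among these examples; it presumably just checks whether an app id already appears in apps.json. A minimal sketch under that assumption.

def is_new(app_id, analysed_apps):
    # An app is "new" when no entry in apps.json already carries its id.
    return all(existing['id'] != app_id for existing in analysed_apps)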
Example #13
def collect_data(input_path, sonarqube):
    print("Sonarqube = " + str(sonarqube))
    apps = json.load(open(c.APPS_PATH, 'r'))

    for a in apps:
        # Crawl data from the Google Play store
        crawled_new_data = crawler.crawl_data(a)

        # if the app has a new release we did not analyze before...
        if crawled_new_data:
            # Launch the Androguard and Androwarn analyses
            androguard_androwarn_analyzer.analyze(a)
            # Analyze the servers pointed to by the URLs we found in the String analysis of Androguard
            servers_analyzer.analyze(a)
        # Sonarqube analysis can be passed as -S flag when invoking the program
        if sonarqube:
            SonarQube.sq_analyze(input_path)
        
    # Finally, if everything goes well, save the updated apps.json file with the new timestamps and versions
    c.save(c.APPS_PATH, apps)
Example #14
def analyze(app):

    result = {}

    # We open the APK
    apk_path = c.get_apk_path(app)
    # Here we check if the APK is actually there, otherwise we skip the analysis
    if (not os.path.exists(apk_path)):
        return
    a, d, dx = AnalyzeAPK(apk_path)

    # Get all the permissions requested by the app
    requested_permissions = a.get_permissions()

    # Get all the Android activities of the app
    activities = a.get_activities()

    # Get all String constants in the app presumably containing a URL
    urls = list()
    for u in dx.find_strings("http[s]?://."):
        urls.append(u.get_value())

    # We pack together all the partial results
    result['permissions'] = requested_permissions
    result['activities'] = activities
    result['urls'] = urls

    # We save the result into a JSON file
    app_suffix_path = app['id'] + c.SEPARATOR + app['latest_crawled_version']
    result_path = c.DATA_PATH + app_suffix_path + c.SEPARATOR + 'androguard.json'
    c.save(result_path, result)

    # Now we run also the Androwarn analysis (with no Play Store look up)
    data = perform_analysis(apk_path, a, d, dx, False)

    # We generate the JSON report with the following parameters
    # Verbosity level: 3 (advanced)
    # Report type: json
    # Output path: same pattern as all the other JSON files produced so far
    androwarn_report_path = c.DATA_PATH + app_suffix_path + c.SEPARATOR + 'androwarn.json'
    generate_report(app['id'], data, 3, 'json', androwarn_report_path)
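c.get_apk_path, used at the top of this example, is not shown; judging from how crawl_data stores the APK (c.APKS_PATH + app_suffix_path + '.apk'), it presumably rebuilds that same path inside the shared c module. A sketch under that assumption, where APKS_PATH and SEPARATOR are the module-level constants used throughout.

def get_apk_path(app):
    # Assumed to mirror the path built by crawl_data when the APK was downloaded:
    # APKS_PATH + "<app id><SEPARATOR><latest crawled version>.apk"
    app_suffix_path = app['id'] + SEPARATOR + app['latest_crawled_version']
    return APKS_PATH + app_suffix_path + '.apk'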
Example #15
 def change_sync_folder(self):
     folderpicker = SyncfolderPicker()
     if not folderpicker.exec():
         return
     new_cloudfolder = folderpicker.get_syncfolder()
     if new_cloudfolder == config.cloudfolder:
         return
     games = savesync.detect_games()
     games_in_old_folder = [game for game in games if os.path.isdir(os.path.join(config.cloudfolder, game.id))]
     if len(games_in_old_folder) > 0:
         move_dialog = MoveSyncfolderDialog()
         if not move_dialog.exec():
             return
         move = move_dialog.move
         for game in games_in_old_folder:
             if move:
                 savesync.move_game_to_other_cloud(game, new_cloudfolder)
     config.cloudfolder = new_cloudfolder
     config.save()
     config.load()
     self.refresh_games()
Example #16
def analyze(app):
    print('Analyzing the servers mentioned by: ' + app['id'])
    result = list()
    urls = get_candidate_urls(app)

    for url in urls:
        # Here is where we do the real Whois query
        try:
            domain_info = whois.query(url, force=1, slow_down=2)
            # We transform the domain object into a plain dictionary, otherwise we cannot save it into the json file
            item = domain_info.__dict__
            result.append(item)
        except:
            print(
                'Error performing the whois lookup for this server, it will be ignored: '
                + url)

    # We save the result into a JSON file
    app_suffix_path = app['id'] + c.SEPARATOR + app['latest_crawled_version']
    result_path = c.DATA_PATH + app_suffix_path + c.SEPARATOR + 'servers.json'
    c.save(result_path, result)
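get_candidate_urls is defined elsewhere. A plausible sketch: read back the 'urls' list saved by the Androguard step and reduce it to de-duplicated registrable domains, since whois.query expects a domain rather than a full URL; the two-label truncation is an assumption, and c is the same shared constants module used by the surrounding examples.

import json
from urllib.parse import urlparse

def get_candidate_urls(app):
    # Load the URLs mined by the Androguard string analysis and keep only
    # the (deduplicated) last two labels of each hostname.
    app_suffix_path = app['id'] + c.SEPARATOR + app['latest_crawled_version']
    androguard_path = c.DATA_PATH + app_suffix_path + c.SEPARATOR + 'androguard.json'
    with open(androguard_path, 'r') as f:
        urls = json.load(f)['urls']
    domains = set()
    for url in urls:
        host = urlparse(url).hostname
        if host:
            domains.add('.'.join(host.split('.')[-2:]))
    return list(domains)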
Example #17
def apply_filtering_heuristics():
    with open(detection_result_path, 'r') as f:
        repos = json.load(f)
        with_launch_file = 0
        final_filtered = 0
        collected_xml_launch_files = list()
        collected_py_launch_files = list()
        filtered_repos = list()
        for p in repos:
            # Check 1: the repo must contain at least one Launch file
            if ((len(p['xml_launch_files']) > 0)
                    or (len(p['py_launch_files']) > 0)):
                with_launch_file += 1
                total_nodes = 0
                total_includes = 0
                for el in p['xml_launch_files']:
                    total_nodes += el['num_nodes']
                    total_includes += el['num_includes']
                for el in p['py_launch_files']:
                    total_nodes += el['num_nodes']
                    total_includes += el['num_includes']
                collected_xml_launch_files.append(len(p['xml_launch_files']))
                collected_py_launch_files.append(len(p['py_launch_files']))
                if (total_nodes >= 2 or total_includes >= 1):
                    final_filtered += 1
                filtered_repos.append(p)
        c.save(filtered_heuristic, filtered_repos)
        print("Total number XML launch file: " +
              str(sum(collected_xml_launch_files)))
        print("Details: " + str(collected_xml_launch_files))
        print("Total number Python launch file: " +
              str(sum(collected_py_launch_files)))
        print("Details: " + str(collected_py_launch_files))
        print("Repos with either an XML or Python launch file: " +
              str(with_launch_file))
        print("Repos with either more than 2 nodes or 1 include statement: " +
              str(final_filtered))
Example #18
def collect_metrics_counts():
    with open(filtered_heuristic, 'r') as f:  
        repos_list = json.load(f)
        counter = 1
        enriched_result = list()
        for p in repos_list:
            print("Collecting metrics for repo number " + str(counter) + " --- " + p['id'])
            counter += 1
            metrics = count_metrics(p)
            p['num_issues'] = metrics[0]
            p['num_pull_requests'] = metrics[1]
            p['num_commits'] = metrics[2]
            p['num_branches'] = metrics[3]
            p['num_releases'] = metrics[4]
            p['num_contributors'] = metrics[5]
            # here we double-check whether there are repos with fewer than NUM_COMMITS commits
            if p['num_commits'] != "NA":
                if int(p['num_commits']) >= c.NUM_COMMITS:
                    enriched_result.append(p)
                else:
                    print("Discarded this repo because it has fewer than NUM_COMMITS commits: " + p['id'])
            else:
                enriched_result.append(p)
    c.save(repos_filtered_heuristic_metrics, enriched_result)
def start_analysis():    
    with open('./otherData/rosmap_output.json', 'r') as outputfile:  
        # we load the data
        data = json.load(outputfile)

        # in bitbucket_repos we will have the JSON representation of all the data we mined from the search API of bitbucket.org
        if(not os.path.isfile('./repos_mining_data/intermediateResults/0_all_bitbucket.json')):
            bitbucket_repos = get_all_bitbucket_repos_data(data)
            c.save('./repos_mining_data/intermediateResults/0_all_bitbucket.json', bitbucket_repos)
        else:
            bitbucket_repos = json.load(open('./repos_mining_data/intermediateResults/0_all_bitbucket.json', 'r'))
        
        # in gitlab_repos we will have the JSON representation of all the data we mined from the search API of gitlab.com 
        # Notice that out of the 46 initial gitlab repos, 16 of them are not hosted on gitlab.com; we did a manual analysis of those repos, which led to no included results
        if(not os.path.isfile('./repos_mining_data/intermediateResults/0_all_gitlab.json')):
            gitlab_repos = get_all_gitlab_repos_data(data)
            c.save('./repos_mining_data/intermediateResults/0_all_gitlab.json', gitlab_repos)
        else:
            gitlab_repos = json.load(open('./repos_mining_data/intermediateResults/0_all_gitlab.json', 'r'))

        # in github_rosmap_repos we will have the JSON representation of all the data we mined from the search of ROSMAP
        if(not os.path.isfile('./repos_mining_data/intermediateResults/0_rosmap_github.json')):
            github_rosmap_repos = get_all_github_repos_data(data)
            c.save('./repos_mining_data/intermediateResults/0_rosmap_github.json', github_rosmap_repos)

            # here we transform into a dictionary for eliminating duplicates and for easing the rest of the filtering
            commits_rosmap = to_dictionary(get_last_github_commits_api(github_rosmap_repos))
            c.save('./repos_mining_data/intermediateResults/0_rosmap_github_commits.json', commits_rosmap)
        else:
            # here we transform into a dictionary for eliminating duplicates and for easing the rest of the filtering
            github_rosmap_repos = to_dictionary(json.load(open('./repos_mining_data/intermediateResults/0_rosmap_github.json', 'r')))
            commits_rosmap = json.load(open('./repos_mining_data/intermediateResults/0_rosmap_github_commits.json', 'r'))
        
        # in github_gh_repos we will have the JSON representation of all the data we mined from the search API of the GitHub platform 
        # The starting point here is the data coming from the GHTorrent query as of this filtering step: "Filter repositories with #commits < 100"
        if(not os.path.isfile('./repos_mining_data/intermediateResults/2_ghtorrent_github.json')):
            # we load the data from the output of the GHTorrent query
            with open("./ghtorrentIntermediateResults/2_github_num_commits.txt", 'r') as gh:
                gh_reader = csv.DictReader(gh, delimiter='\t')
                data_ghtorrent = list()
                for line in gh_reader:
                    try:
                        data_ghtorrent.append(line)
                    except (AttributeError, TypeError, IndexError):
                        print("Error for: " + str(line))
            github_ghtorrent_repos = get_ghtorrent_github_repos_data(data_ghtorrent)
            c.save('./repos_mining_data/intermediateResults/2_ghtorrent_github.json', github_ghtorrent_repos)

            # here we transform into a dictionary for eliminating duplicates and for easing the rest of the filtering
            commits_ghtorrent = to_dictionary(get_last_github_commits_api(github_ghtorrent_repos))
            c.save('./repos_mining_data/intermediateResults/2_ghtorrent_github_commits.json', commits_ghtorrent)
        else:
            # here we transform into a dictionary for eliminating duplicates and for easing the rest of the filtering
            github_ghtorrent_repos = to_dictionary(json.load(open('./repos_mining_data/intermediateResults/2_ghtorrent_github.json', 'r')))
            commits_ghtorrent = json.load(open('./repos_mining_data/intermediateResults/2_ghtorrent_github_commits.json', 'r'))
        
        start_bitbucket_analysis(bitbucket_repos, data)
        start_gitlab_analysis(gitlab_repos)
        start_github_analysis(github_rosmap_repos, commits_rosmap, github_ghtorrent_repos, commits_ghtorrent, True)
def start_bitbucket_analysis(repos, rosmap_data):
    filtered_repos = list()
    print("0 - BitBucket Initial search: " + str(len(repos)))
    # Filter fork repositories
    for p in repos:
        if(not ('parent' in p)):
            filtered_repos.append(p)
    print("1 - BitBucket Filter fork repositories: " + str(len(filtered_repos)))
    c.save('./repos_mining_data/intermediateResults/1_bitbucket_forks.json', filtered_repos)
    repos = filtered_repos
    filtered_repos = list()
    # Filter repositories with #commits < NUM_COMMITS
    if(not os.path.isfile('./repos_mining_data/intermediateResults/2_bitbucket_commits.json')):
        for p in repos:
            if(get_bitbucket_commits(p['links']['clone'][0]['href']) >= c.NUM_COMMITS):
                filtered_repos.append(p)
        c.save('./repos_mining_data/intermediateResults/2_bitbucket_commits.json', filtered_repos)
        repos = filtered_repos
        print("2 - BitBucket Filter repositories with at least " + str(c.NUM_COMMITS) + " commits: " + str(len(repos)))
    else:
        repos = json.load(open('./repos_mining_data/intermediateResults/2_bitbucket_commits.json', 'r'))
        print("2 - BitBucket Filter repositories with at least " + str(c.NUM_COMMITS) + " commits: " + str(len(repos)))
    filtered_repos = list()
    # Filter repositories with at least X stars
    for p in repos:
        stars = get_rosmap_project(p['links']['html']['href'], rosmap_data)['stars']
        if(stars >= NUM_STARS):
            filtered_repos.append(p)
    print("3 - BitBucket Filter repositories with at least X stars: " + str(len(filtered_repos)))
    c.save('./repos_mining_data/intermediateResults/3_bitbucket_stars.json', filtered_repos)
    repos = filtered_repos
    filtered_repos = list()
    # Filter demo repositories
    for p in repos:
        if(p['description'] is not None):
            if(not (("demo" in p['description'].lower()) or ("course" in p['description'].lower()) or ("thesis" in p['description'].lower()) or ("exame" in p['description'].lower()))): # “demo”, “course”, "thesis", exame
                if(not (("demo" in p['full_name'].lower()) or ("course" in p['full_name'].lower()) or ("thesis" in p['full_name'].lower()) or ("exame" in p['full_name'].lower()))): # “demo”, “course”, "thesis", exame
                    filtered_repos.append(p)
        else:
            filtered_repos.append(p)
    print("4 - Bitbucket Filter DEMO repositories: " + str(len(filtered_repos)))
    c.save('./repos_mining_data/intermediateResults/4_bitbucket_no_demo.json', filtered_repos)
    repos = filtered_repos
    filtered_repos = list()
    # Filter tools repositories
    for p in repos:
        if(p['description'] is not None):
            if(not (("tool" in p['description'].lower()) or ("util" in p['description'].lower()) or ("helper" in p['description'].lower()) or ("library" in p['description'].lower()) or ("util" in p['description'].lower()) or ("plugin" in p['description'].lower()) or ("plug-in" in p['description'].lower()))): # tool, util, helper, library, plugin, plug-in
                if(not (("tool" in p['full_name'].lower()) or ("util" in p['full_name'].lower()) or ("helper" in p['full_name'].lower()) or ("library" in p['full_name'].lower()) or ("plugin" in p['full_name'].lower()) or ("plug-in" in p['full_name'].lower()))): # tool, util, helper, library, plugin, plug-in
                    filtered_repos.append(p)
        else:
            filtered_repos.append(p)
    print("5 - Bitbucket Filter TOOLS repositories: " + str(len(filtered_repos)))
    c.save('./repos_mining_data/intermediateResults/5_bitbucket_no_tools.json', filtered_repos)
    repos = filtered_repos
    filtered_repos = list()
    # Filter simulation-oriented repositories
    for p in repos:
        if(p['description'] is not None):
            if(not (("simulat" in p['description'].lower()) or ("gazebo" in p['description'].lower()))): # simulat, gazebo
                if(not (("simulat" in p['full_name'].lower()) or ("gazebo" in p['full_name'].lower()))): # simulat, gazebo
                    filtered_repos.append(p)
        else:
            filtered_repos.append(p)
    print("6 - Bitbucket Filter SIMULATION repositories: " + str(len(filtered_repos)))
    c.save('./repos_mining_data/intermediateResults/6_bitbucket_no_simul.json', filtered_repos)
    repos = filtered_repos
    filtered_repos = list()
    print("Bitbucket analysis done.")
def start_gitlab_analysis(repos):
    filtered_repos = list()
    print("0 - GitLab Initial search: " + str(len(repos)))
    # Filter fork repositories
    for p in repos:
        if ('fork' not in p['name_with_namespace']) and ('fork' not in (p['description'] or '')):
            filtered_repos.append(p)
    print("1 - GitLab Filter fork repositories: " + str(len(filtered_repos)))
    c.save('./repos_mining_data/intermediateResults/1_gitlab_forks.json', filtered_repos)
    repos = filtered_repos
    filtered_repos = list()
    # Filter repositories with #commits < NUM_COMMITS
    if(not os.path.isfile('./repos_mining_data/intermediateResults/2_gitlab_commits.json')):
        for p in repos:
            response = form_request(p['web_url'])
            try:
                if response.status == 200:
                    data = response.data
                    commits = int(re.findall(r"[\d]+</strong> Commits</a>", str(data))[0].split("<")[0])
                else:
                    print("error: " + p['web_url'])
            except:
                commits = 0
            if(commits >= c.NUM_COMMITS):
                filtered_repos.append(p)
        c.save('./repos_mining_data/intermediateResults/2_gitlab_commits.json', filtered_repos)
        repos = filtered_repos
        print("2 - GitLab Filter repositories with at least " + str(c.NUM_COMMITS) + " commits: " + str(len(filtered_repos)))
    else:
        repos = json.load(open('./repos_mining_data/intermediateResults/2_gitlab_commits.json', 'r')) # filtered_repos
        print("2 - GitLab Filter repositories with at least " + str(c.NUM_COMMITS) + " commits: " + str(len(repos)))
    filtered_repos = list()
    # Filter repositories with at least X stars
    for p in repos:
        if(p['star_count'] >= NUM_STARS):
            filtered_repos.append(p)
    print("3 - GitLab Filter repositories with at least X stars: " + str(len(filtered_repos)))
    c.save('./repos_mining_data/intermediateResults/3_gitlab_stars.json', filtered_repos)
    repos = filtered_repos
    filtered_repos = list()
    # Filter demo repositories
    for p in repos:
        if(p['description'] is not None):
            if(not (("demo" in p['description'].lower()) or ("course" in p['description'].lower()) or ("thesis" in p['description'].lower()) or ("exame" in p['description'].lower()))): # “demo”, “course”, "thesis"
                if(not (("demo" in p['path_with_namespace'].lower()) or ("course" in p['path_with_namespace'].lower()) or ("thesis" in p['path_with_namespace'].lower()) or ("exame" in p['path_with_namespace'].lower()))): # “demo”, “course”, "thesis", "exame"
                    filtered_repos.append(p)
        else:
            filtered_repos.append(p)
    print("4 - Gitlab Filter DEMO repositories: " + str(len(filtered_repos)))
    c.save('./repos_mining_data/intermediateResults/4_gitlab_no_demo.json', filtered_repos)
    repos = filtered_repos
    filtered_repos = list()
    # Filter tools repositories
    for p in repos:
        if(p['description'] is not None):
            if(not (("tool" in p['description'].lower()) or ("util" in p['description'].lower()) or ("helper" in p['description'].lower()) or ("library" in p['description'].lower()) or ("util" in p['description'].lower()) or ("plugin" in p['description'].lower()) or ("plug-in" in p['description'].lower()))): # tool, util, helper, library, plugin, plug-in
                if(not (("tool" in p['path_with_namespace'].lower()) or ("util" in p['path_with_namespace'].lower()) or ("helper" in p['path_with_namespace'].lower()) or ("library" in p['path_with_namespace'].lower()) or ("plugin" in p['path_with_namespace'].lower()) or ("plug-in" in p['path_with_namespace'].lower()))): # tool, util, helper, library, plugin, plug-in
                    filtered_repos.append(p)
        else:
            filtered_repos.append(p)    
    print("5 - Gitlab Filter TOOLS repositories: " + str(len(filtered_repos)))
    c.save('./repos_mining_data/intermediateResults/5_gitlab_no_tools.json', filtered_repos)
    repos = filtered_repos
    filtered_repos = list()
    # Filter simulation-oriented repositories
    for p in repos:
        if(p['description'] is not None):
            if(not (("simulat" in p['description'].lower()) or ("gazebo" in p['description'].lower()))): # simulat, gazebo
                if(not (("simulat" in p['path_with_namespace'].lower()) or ("gazebo" in p['path_with_namespace'].lower()))): # simulat, gazebo
                    filtered_repos.append(p)
        else:
            filtered_repos.append(p)
    print("6 - Gitlab Filter SIMULATION repositories: " + str(len(filtered_repos)))
    c.save('./repos_mining_data/intermediateResults/6_gitlab_no_simul.json', filtered_repos)
    repos = filtered_repos
    filtered_repos = list()
    print("GitLab analysis done.")
def start_github_analysis(rosmap_repos, rosmap_commits, ghtorrent_repos, ghtorrent_commits, jump_commits):
    filtered_repos = list()
    print("0 - GitHub ROSMAP Initial search: " + str(len(rosmap_repos)))
    # Filter ROSMAP fork repositories
    for key, p in rosmap_repos.items():
        if(p['fork'] == False):
            filtered_repos.append(p)
    print("1 - GitHub ROSMAP Fork repositories: " + str(len(filtered_repos)))
    c.save('./repos_mining_data/intermediateResults/1_github_rosmap_no_forks.json', filtered_repos)
    repos = filtered_repos
    filtered_repos = list()
    # Filter ROSMAP repositories with #commits < NUM_COMMITS
    # the check below is used just to save time in case the number of commits has already been fetched from the web,
    # i.e., the 2_github_rosmap_commits.json file already exists and is up to date
    if(not jump_commits):
        for p in repos:
            if(count_commits(p, rosmap_commits) >= c.NUM_COMMITS):
                filtered_repos.append(p)
        print("2 - Github ROSMAP Filter repositories with at least " + str(c.NUM_COMMITS) + " commits: " + str(len(filtered_repos)))
        c.save('./repos_mining_data/intermediateResults/2_github_rosmap_commits.json', filtered_repos)
    else:
        with open('./repos_mining_data/intermediateResults/2_github_rosmap_commits.json', 'r') as outputfile:  
            filtered_repos = json.load(outputfile)
    repos = filtered_repos
    filtered_repos = list()
    # MERGE rosmap and ghtorrent repos
    repos = union_dictionaries(to_dictionary(repos), ghtorrent_repos)
    repos = repos.values()
    commits = union_dictionaries(to_dictionary_commits(rosmap_commits), to_dictionary_commits(ghtorrent_commits))
    commits = commits.values()
    print("MERGE - Merged lists of GitHub repos coming from the rosmap and the ghtorrent searches: " + str(len(repos)))
    # Filter repositories with at least X stars
    for p in repos:
        if(p['stargazers_count'] >= NUM_STARS):
            filtered_repos.append(p)
    print("3 - GitHub Filter repositories with at least X stars: " + str(len(filtered_repos)))
    c.save('./repos_mining_data/intermediateResults/3_github_stars.json', filtered_repos)
    repos = filtered_repos
    filtered_repos = list()
    # Filter demo repositories
    discarded = list()
    for p in repos:
        if(p['description'] is not None):
            if(not (("demo" in p['description'].lower()) or ("tutorial" in p['description'].lower()) or ("course" in p['description'].lower()) or ("thesis" in p['description'].lower()) or ("exame" in p['description'].lower()))): # “demo”, "tutorial", “course”, "thesis", exame
                if(not (("demo" in p['full_name'].lower()) or ("tutorial" in p['description'].lower()) or ("course" in p['full_name'].lower()) or ("thesis" in p['full_name'].lower()) or ("exame" in p['full_name'].lower()))): # “demo”, "tutorial", “course”, "thesis", exame
                    filtered_repos.append(p)
                else:
                    discarded.append(p)
            else:
                discarded.append(p)
        else:
            filtered_repos.append(p)
    print("4 - Github Filter DEMO repositories: " + str(len(filtered_repos)))
    c.save('./repos_mining_data/intermediateResults/4_github_no_demo.json', filtered_repos)
    c.save('./repos_mining_data/intermediateResults/4_github_no_demo_discarded.json', discarded)
    repos = filtered_repos
    filtered_repos = list()
    discarded = list()
    # Filter tools repositories
    for p in repos:
        if(p['description'] is not None):
            if(not (("tool" in p['description'].lower()) or ("util" in p['description'].lower()) or ("helper" in p['description'].lower()) or ("library" in p['description'].lower()) or ("util" in p['description'].lower()) or ("plugin" in p['description'].lower()) or ("plug-in" in p['description'].lower()))): # tool, util, helper, library, plugin, plug-in
                if(not (("tool" in p['full_name'].lower()) or ("util" in p['full_name'].lower()) or ("helper" in p['full_name'].lower()) or ("library" in p['full_name'].lower()) or ("plugin" in p['full_name'].lower()) or ("plug-in" in p['full_name'].lower()))): # tool, util, helper, library, plugin, plug-in
                    filtered_repos.append(p)
                else:
                    discarded.append(p)
            else:
                discarded.append(p)
        else:
            filtered_repos.append(p)
    print("5 - GitHub Filter TOOLS repositories: " + str(len(filtered_repos)))
    c.save('./repos_mining_data/intermediateResults/5_github_no_tools.json', filtered_repos)
    c.save('./repos_mining_data/intermediateResults/5_github_no_tools_discarded.json', discarded)
    discarded = list()
    repos = filtered_repos
    filtered_repos = list()
    # Filter simulation-oriented repositories
    for p in repos:
        if(p['description'] is not None):
            if(not (("simulat" in p['description'].lower()) or ("gazebo" in p['description'].lower()))): # simulat, gazebo
                if(not (("simulat" in p['full_name'].lower()) or ("gazebo" in p['full_name'].lower()))): # simulat, gazebo
                    filtered_repos.append(p)
                else:
                    discarded.append(p)
            else:
                discarded.append(p)
        else:
            filtered_repos.append(p)
    print("6 - GitHub Filter SIMULATION repositories: " + str(len(filtered_repos)))
    c.save('./repos_mining_data/intermediateResults/6_github_no_simul.json', filtered_repos)
    c.save('./repos_mining_data/intermediateResults/6_github_no_simul_discarded.json', discarded)
    discarded = list()
    repos = filtered_repos
    filtered_repos = list()
    print("GitHub analysis done.")