示例#1
0
def eventize_and_enrich(commits, git_enrich):
    """Create events from commits and enrich them with file information.

    The commits are eventized with ``Git(commits, git_enrich).eventize(2)``.
    When that yields at least one event, the events go, in this order, through
    ``FilterRows.filter_(["filepath"], "-")``, the ``FileType`` and ``FilePath``
    enrichers (both keyed on ``filepath``) and ``ToUTF8`` on the ``owner``
    column. The size of the result is logged after every step.

    :param commits: commits to be eventized; handed to ``Git`` as is.
    :param git_enrich: handed to ``Git`` as its second argument.
    :returns: the events produced by the last step that ran. When eventizing
        yields no events, that empty result is returned without being
        filtered or enriched.
    """
    # Lazy %-style arguments: the message is only built if the record is emitted
    logging.info("New commits: %s", len(commits))

    # Create events from commits
    # TODO add tests for eventize method
    git_events = Git(commits, git_enrich)
    events_df = git_events.eventize(2)

    logging.info("New events: %s", len(events_df))

    # Nothing to filter or enrich when eventizing produced no events
    if len(events_df) > 0:
        # Filter information
        # NOTE(review): presumably drops rows whose filepath is "-" — confirm
        # against FilterRows
        data_filtered = FilterRows(events_df)
        events_df = data_filtered.filter_(["filepath"], "-")

        logging.info("New events filtered: %s", len(events_df))

        # Add filetype info
        enriched_filetype = FileType(events_df)
        events_df = enriched_filetype.enrich('filepath')

        logging.info("New Filetype events: %s", len(events_df))

        # Split filepath info
        enriched_filepath = FilePath(events_df)
        events_df = enriched_filepath.enrich('filepath')

        logging.info("New Filepath events: %s", len(events_df))

        # Deal with surrogates
        convert = ToUTF8(events_df)
        events_df = convert.enrich(["owner"])

    logging.info("Final new events: %s", len(events_df))

    return events_df
示例#2
0
    def process(self, items_block):
        """Process items to add file related information.

        Eventize items creating one new item per each file found in the commit (excluding
        files with no actions performed on them). For each event, file path, file name,
        path parts, file type and file extension are added as fields. In addition, the
        ``message`` of every event is cut to its first ``AreasOfCode.MESSAGE_MAX_SIZE``
        characters and the ``repository`` column is copied into ``origin``.

        When eventizing yields no events, filtering and enrichment are skipped.

        :param items_block: items to be processed. Expects to find ElasticSearch hits _source part only.
        :returns: ``self.ProcessResults`` with ``processed`` set to the number of
            resulting events and ``out_items`` set to the events themselves.
        """

        # Lazy %-style arguments: nothing is formatted unless DEBUG is enabled
        logger.debug("%s New commits: %s", self.__log_prefix, len(items_block))

        # Create events from commits
        git_events = Git(items_block, self._git_enrich)
        events_df = git_events.eventize(2)

        logger.debug("%s New events: %s", self.__log_prefix, len(events_df))

        if len(events_df) > 0:
            # Filter information
            data_filtered = FilterRows(events_df)
            events_df = data_filtered.filter_(["filepath"], "-")

            logger.debug("%s New events filtered: %s",
                         self.__log_prefix, len(events_df))

            # Keep only the leading MESSAGE_MAX_SIZE characters of each message
            events_df['message'] = events_df['message'].str.slice(
                stop=AreasOfCode.MESSAGE_MAX_SIZE)
            logger.debug("%s Remove message content", self.__log_prefix)

            # Add filetype info
            enriched_filetype = FileType(events_df)
            events_df = enriched_filetype.enrich('filepath')

            logger.debug("%s New Filetype events: %s",
                         self.__log_prefix, len(events_df))

            # Split filepath info
            enriched_filepath = FilePath(events_df)
            events_df = enriched_filepath.enrich('filepath')

            logger.debug("%s New Filepath events: %s",
                         self.__log_prefix, len(events_df))

            # Expose the repository under the 'origin' field as well
            events_df['origin'] = events_df['repository']

            # Deal with surrogates
            convert = ToUTF8(events_df)
            events_df = convert.enrich(["owner"])

        logger.debug("%s Final new events: %s",
                     self.__log_prefix, len(events_df))

        return self.ProcessResults(processed=len(events_df),
                                   out_items=events_df)
示例#3
0
    def process(self, items_block):
        """Process items to add file related information.

        Eventize items creating one new item per each file found in the commit (excluding
        files with no actions performed on them). For each event, file path, file name,
        path parts, file type and file extension are added as fields.

        When eventizing yields no events, filtering and enrichment are skipped.

        :param items_block: items to be processed. Expects to find ElasticSearch hits _source part only.
        :returns: ``self.ProcessResults`` with ``processed`` set to the number of
            resulting events and ``out_items`` set to the events themselves.
        """

        # Lazy %-style arguments: the message is only built if the record is emitted
        logger.info("New commits: %s", len(items_block))

        # Create events from commits
        git_events = Git(items_block, self._git_enrich)
        events_df = git_events.eventize(2)

        logger.info("New events: %s", len(events_df))

        if len(events_df) > 0:
            # Filter information
            data_filtered = FilterRows(events_df)
            events_df = data_filtered.filter_(["filepath"], "-")

            logger.info("New events filtered: %s", len(events_df))

            # Add filetype info
            enriched_filetype = FileType(events_df)
            events_df = enriched_filetype.enrich('filepath')

            logger.info("New Filetype events: %s", len(events_df))

            # Split filepath info
            enriched_filepath = FilePath(events_df)
            events_df = enriched_filepath.enrich('filepath')

            logger.info("New Filepath events: %s", len(events_df))

            # Deal with surrogates
            convert = ToUTF8(events_df)
            events_df = convert.enrich(["owner"])

        logger.info("Final new events: %s", len(events_df))

        return self.ProcessResults(processed=len(events_df),
                                   out_items=events_df)
示例#4
0
    def test_FilePath(self):
        """Check the columns produced by FilePath.enrich for several paths."""

        # Enriching an empty frame must give back an empty frame
        enriched_df = FilePath(pandas.DataFrame()).enrich('filepath')
        self.assertTrue(enriched_df.empty)

        # One entry per input path, in input order: the path itself plus the
        # values expected in each of the enriched columns
        expected_rows = [
            {
                'filepath': 'file.txt',
                'file_name': 'file.txt',
                'file_ext': 'txt',
                'file_dir_name': '/',
                'file_path_list': ['file.txt'],
            },
            {
                'filepath': '/foo/bar',
                'file_name': 'bar',
                'file_ext': '',
                'file_dir_name': '/foo/',
                'file_path_list': ['foo', 'bar'],
            },
            {
                'filepath': '/foo/bar/file.txt',
                'file_name': 'file.txt',
                'file_ext': 'txt',
                'file_dir_name': '/foo/bar/',
                'file_path_list': ['foo', 'bar', 'file.txt'],
            },
            {
                'filepath': '/foo/bar/',
                'file_name': '',
                'file_ext': '',
                'file_dir_name': '/foo/bar/',
                'file_path_list': ['foo', 'bar'],
            },
            {
                'filepath': '/foo//bar.txt',
                'file_name': 'bar.txt',
                'file_ext': 'txt',
                'file_dir_name': '/foo/',
                'file_path_list': ['foo', 'bar.txt'],
            },
            {
                'filepath': '//foo///bar.txt',
                'file_name': 'bar.txt',
                'file_ext': 'txt',
                'file_dir_name': '/foo/',
                'file_path_list': ['foo', 'bar.txt'],
            },
        ]
        checked_columns = ('filepath', 'file_name', 'file_ext',
                           'file_dir_name', 'file_path_list')

        test_df = pandas.DataFrame()
        test_df['filepath'] = [row['filepath'] for row in expected_rows]
        enriched_df = FilePath(test_df).enrich('filepath')

        # Assertions run column by column and, within a column, in input order
        for column in checked_columns:
            for position, expected in enumerate(expected_rows):
                self.assertEqual(
                    enriched_df.iloc[[position]][column].item(),
                    expected[column])
示例#5
0
    def test_FilePath(self):
        """Verify FilePath.enrich on an empty frame and on six sample paths."""

        # Empty input: the enriched frame is expected to be empty too
        blank_result = FilePath(pandas.DataFrame()).enrich('filepath')
        self.assertTrue(blank_result.empty)

        # Expected content of every checked column; position i in each list
        # belongs to the i-th input path
        column_order = ('filepath', 'file_name', 'file_ext',
                        'file_dir_name', 'file_path_list')
        expected_by_column = {
            'filepath': ['file.txt', '/foo/bar', '/foo/bar/file.txt',
                         '/foo/bar/', '/foo//bar.txt', '//foo///bar.txt'],
            'file_name': ['file.txt', 'bar', 'file.txt',
                          '', 'bar.txt', 'bar.txt'],
            'file_ext': ['txt', '', 'txt', '', 'txt', 'txt'],
            'file_dir_name': ['/', '/foo/', '/foo/bar/',
                              '/foo/bar/', '/foo/', '/foo/'],
            'file_path_list': [['file.txt'],
                               ['foo', 'bar'],
                               ['foo', 'bar', 'file.txt'],
                               ['foo', 'bar'],
                               ['foo', 'bar.txt'],
                               ['foo', 'bar.txt']],
        }

        test_df = pandas.DataFrame()
        test_df['filepath'] = list(expected_by_column['filepath'])
        enriched_df = FilePath(test_df).enrich('filepath')

        # Walk the columns in a fixed order, then the rows of each column
        for column in column_order:
            wanted_values = expected_by_column[column]
            for row_number in range(len(wanted_values)):
                actual = enriched_df.iloc[[row_number]][column].item()
                self.assertEqual(actual, wanted_values[row_number])