示例#1
0
    def predict(self, domains):
        """Classify domain names as benign or malicious.

        The domains are placed in a dataframe, converted with ``du.str2ascii``,
        and passed through the model. The predicted label of each domain is
        returned as a series ordered by the dataframe index.

        :param domains: Domains to classify.
        :type domains: cudf.Series
        :return: Predicted label for each of the given domains.
        :rtype: cudf.Series

        Examples
        --------
        >>> dd.predict(['nvidia.com', 'dgadomain'])
        0    0
        1    1
        Name: is_dga, dtype: int64
        """
        frame = cudf.DataFrame({"domain": domains})
        ascii_df = du.str2ascii(frame, frame["domain"].count())
        # Adopt the index and domain column produced by str2ascii so the
        # labels computed below line up row-for-row with the model input and
        # can be sorted back into the given input order at the end.
        frame.index = ascii_df.index
        frame["domain"] = ascii_df["domain"]
        ascii_df = ascii_df.drop("domain")
        model_input, seq_lengths = self.__create_variables(ascii_df)
        # The converted dataframe is no longer needed once the model inputs exist.
        del ascii_df
        scores = self.model(model_input, seq_lengths)
        # Element [1] of max() holds the index of the largest score along dim 1.
        labels = scores.data.max(1, keepdim=True)[1].view(-1).tolist()
        frame["is_dga"] = labels
        return frame.sort_index()["is_dga"]
示例#2
0
 def __get_partitioned_dfs(self, df, batch_size):
     """Partition one dataframe into multiple small dataframes based on a given batch size.

     The dataframe is first converted with ``du.str2ascii``; the converted
     result is then sliced positionally into consecutive chunks of at most
     ``batch_size`` rows.

     :param df: Contains domains and their types.
     :type df: cudf.DataFrame
     :param batch_size: Maximum number of records in each partitioned dataframe.
     :type batch_size: int
     :return: The list of partitioned dataframes and the value of
         ``df["domain"].count()`` taken before conversion.
     :rtype: tuple
     :raises ValueError: If ``batch_size`` is not greater than zero.
     """
     # A non-positive batch size can never advance the chunk offset past the
     # dataset length, so reject it instead of looping forever.
     if batch_size <= 0:
         raise ValueError(
             "batch_size must be a positive integer, got {!r}".format(batch_size)
         )
     dataset_len = df["domain"].count()
     df = du.str2ascii(df, dataset_len)
     # The final slice may be shorter than batch_size; iloc clamps the end.
     partitioned_dfs = [
         df.iloc[offset:offset + batch_size]
         for offset in range(0, dataset_len, batch_size)
     ]
     return partitioned_dfs, dataset_len
示例#3
0
def test_str2ascii():
    """Check that ``du.str2ascii`` yields the expected dataframe for the test input."""
    result_df = du.str2ascii(test_input_df, test_domains_len)
    frames_match = result_df.equals(expected_output_df)
    assert frames_match