示例#1
0
    def __init__(self,
                 aws_access_key_id=None,
                 aws_secret_access_key=None,
                 aws_bucket_name='cvae-insights',
                 local_data_store=False,
                 deployment_prefix='dev',
                 model_version='2019-01-03'):
        """Create an instance of GetData.

        Credentials are taken from the ``AWS_S3_ACCESS_KEY_ID`` and
        ``AWS_S3_SECRET_ACCESS_KEY`` environment variables. When a variable
        is not set, the matching argument is used (empty string if that is
        missing too).

        With ``local_data_store`` set, files are served from
        ``tests/test_data`` and no S3 connection is opened; otherwise an
        AmazonS3 object is created and connected through ``load_S3``.
        """
        # The explicit arguments used to be ignored; they now act as the
        # fallback for the environment variables.
        self.aws_access_key_id = os.environ.get(
            'AWS_S3_ACCESS_KEY_ID', aws_access_key_id or '')
        self.aws_secret_access_key = os.environ.get(
            'AWS_S3_SECRET_ACCESS_KEY', aws_secret_access_key or '')
        self.github_token = os.environ.get('GITHUB_TOKEN', '')
        self.bucket_name = aws_bucket_name
        self.deployment_prefix = deployment_prefix
        self.version_name = model_version
        if local_data_store:
            self.s3_client = LocalDataStore('tests/test_data')
        else:
            self.s3_object = AmazonS3(
                bucket_name=self.bucket_name,
                aws_access_key_id=self.aws_access_key_id,
                aws_secret_access_key=self.aws_secret_access_key)
            self.s3_client = self.load_S3()

        self.utility = Utility()
示例#2
0
 def __init__(self, data_obj, df_=None):
     """Create an instance for PreprocessData.

     ``data_obj`` is the data-access object; ``load_existing_data()`` is
     called on it here. ``df_`` is stored as ``self.df_``; when omitted,
     each instance gets its own empty DataFrame instead of sharing one
     default object between all instances.
     """
     self.get_data_obj = data_obj
     self.utility_obj = Utility()
     # NOTE(review): data_obj is passed positionally; confirm it matches the
     # first parameter of GetKeywords in this version of the code.
     self.get_keywords_obj = GetKeywords(data_obj)
     self.df_ = pd.DataFrame() if df_ is None else df_
     self.existing_data = self.get_data_obj.load_existing_data()
     self.pkg_kwd_df = self.fetch_package_keywords()
示例#3
0
 def __init__(self,
              df_=None,
              dict_=None,
              local_data_store=False):
     """Create an instance for GetKeywords.

     ``df_`` and ``dict_`` are stored as ``self.df_`` and ``self.dict_``.
     When omitted, a fresh empty DataFrame / dict is created for this
     instance, so state is never shared through a default argument.
     ``local_data_store`` is forwarded to GetData.
     """
     self.df_ = pd.DataFrame() if df_ is None else df_
     self.dict_ = {} if dict_ is None else dict_
     self.get_data = GetData(local_data_store=local_data_store)
     self.utility = Utility()
示例#4
0
 def __init__(self,
              aws_access_key_id='',
              aws_secret_access_key='',
              aws_bucket_name='cvae-insights',
              model_version='',
              num_train_per_user=5):
     """Create an instance for GetPreprocessData.

     Builds one GetData object from the given settings and shares it with
     the GetKeywords and PreprocessData helpers.
     """
     data_store_settings = {
         'aws_access_key_id': aws_access_key_id,
         'aws_secret_access_key': aws_secret_access_key,
         'aws_bucket_name': aws_bucket_name,
         'model_version': model_version,
         'num_train_per_user': num_train_per_user,
     }
     data_obj = GetData(**data_store_settings)
     self.obj_ = data_obj
     self.keyword_obj_ = GetKeywords(data_obj)
     self.preprocess_data_obj = PreprocessData(data_obj=data_obj)
     self.utils = Utility()
     # Holds num_train_per_user despite the attribute name.
     self.num_users = num_train_per_user
 def __init__(self,
              aws_access_key_id,
              aws_secret_access_key,
              num_train_per_user,
              aws_bucket_name,
              model_version):
     """Create an instance of GetData.

     Environment variables win over the credential arguments; the
     arguments are only used when the variable is not set. The S3
     connection is opened immediately through ``load_s3``.
     """
     env = os.environ
     self.aws_access_key_id = env.get('AWS_S3_ACCESS_KEY_ID',
                                      aws_access_key_id)
     self.aws_secret_access_key = env.get('AWS_S3_SECRET_ACCESS_KEY',
                                          aws_secret_access_key)
     self.github_token = env.get('GITHUB_TOKEN', '')
     self.bucket_name = aws_bucket_name
     self.version_name = model_version
     self.num_train_per_user = num_train_per_user
     connection_settings = dict(
         bucket_name=self.bucket_name,
         aws_access_key_id=self.aws_access_key_id,
         aws_secret_access_key=self.aws_secret_access_key)
     self.s3_object = AmazonS3(**connection_settings)
     self.s3_client = self.load_s3()
     self.utility = Utility()
示例#6
0
class PreprocessData:
    """Build package, keyword and dependency lookups from raw manifest data."""

    def __init__(self, data_obj, df_=None):
        """Create an instance for PreprocessData.

        ``data_obj`` must provide ``load_existing_data()`` and
        ``load_raw_data()``. ``df_`` is stored as ``self.df_``; when omitted
        every instance gets its own empty DataFrame rather than sharing one
        default object.
        """
        self.get_data_obj = data_obj
        self.utility_obj = Utility()
        # NOTE(review): data_obj is passed positionally; confirm it matches
        # the first parameter of GetKeywords in this version of the code.
        self.get_keywords_obj = GetKeywords(data_obj)
        self.df_ = pd.DataFrame() if df_ is None else df_
        self.existing_data = self.get_data_obj.load_existing_data()
        self.pkg_kwd_df = self.fetch_package_keywords()

    def add_dependencies_resolved_column(self, df_, dependencies_list):
        """Return 1 when every dependency is a known package name, else 0.

        Dependency names are lower-cased before being matched against the
        ``name`` column of ``df_``. An empty dependency list yields 0.
        """
        wanted = {dep.lower() for dep in dependencies_list}
        if not wanted:
            return 0
        known = set(df_.loc[df_['name'].isin(list(wanted)), 'name'])
        return 1 if wanted <= known else 0

    def check_resolved_dependencies(self, df_):
        """Flag each row with ``all_deps_resolved`` and return the rows flagged 0.

        The flag is computed against ``self.pkg_kwd_df``. The input frame is
        left untouched; the column is added to a copy.
        """
        flags = [
            self.add_dependencies_resolved_column(self.pkg_kwd_df, deps)
            for deps in df_['dependencies']
        ]
        flagged = df_.assign(all_deps_resolved=flags)
        # NOTE(review): rows whose dependencies are NOT all resolved (flag 0)
        # are the ones kept; confirm this is the intended filter.
        return flagged.loc[flagged['all_deps_resolved'] == 0]

    def fetch_package_keywords(self):
        """Fetch the keywords for raw data's package list.

        Combines the ``user_input_stack`` and ``bigquery_data`` manifests
        found under ``package_dict`` in the raw data.

        Raises ValueError (chained to the original error) when the keyword
        lookup fails.
        """
        raw_data = self.get_data_obj.load_raw_data()
        if raw_data is None:
            # The loader may return nothing when the manifest is absent;
            # treat that as "no manifests" instead of failing on None.get.
            raw_data = {}
        manifest_data = raw_data.get('package_dict', {})
        all_manifest = manifest_data.get('user_input_stack', []) + \
            manifest_data.get('bigquery_data', [])
        try:
            return self.get_keywords_obj.find_keywords(
                self.existing_data, all_manifest)
        except Exception as exc:
            raise ValueError("Unable to fetch keywords.") from exc

    def make_necessary_df(self, limit_manifest, limit_keywords):
        """Create two dataframes for dependencies and keywords respectively.

        Returns ``[manifest_df, filtered_pkg_kwd_df]``. Raises KeyError when
        ``self.pkg_kwd_df`` lacks the ``dependencies`` or ``keywords`` column.
        """
        columns = self.pkg_kwd_df.columns
        if 'dependencies' not in columns:
            raise KeyError("Dependency is not present")
        manifest_df = self.utility_obj.make_manifest_df(
            self.pkg_kwd_df, limit_manifest)
        if 'keywords' not in columns:
            raise KeyError("Keywords are not present")
        filtered_pkg_kwd_df = self.utility_obj.make_filtered_pkg_kwd_df(
            self.pkg_kwd_df, limit_keywords)
        return [manifest_df, filtered_pkg_kwd_df]

    def extract_unique_packages(self):
        """Return all unique packages from filtered package keyword dataframe.

        Returns ``(unique_packages, manifest_user_data)``.
        """
        filtered_pkg_kwd_df = self.make_necessary_df(5, 0)[1]
        data_with_dep_check = self.check_resolved_dependencies(
            filtered_pkg_kwd_df)
        unique_packages, manifest_user_data = \
            self.utility_obj.extract_package_manifest_lst(data_with_dep_check)
        manifest_user_data = self.utility_obj.make_user_data(
            manifest_user_data, unique_packages)
        return unique_packages, manifest_user_data

    def create_df_and_dictionaries(self):
        """Create all the necessary dataframes and dictionaries.

        Results are stored on the instance: ``unique_packages``,
        ``manifest_user_data``, ``keyword_df``, ``dependencies_df``,
        ``package_tag_map``, ``vocabulary``, ``package_dep_map`` and
        ``first_level_deps``.
        """
        self.unique_packages, self.manifest_user_data = \
            self.extract_unique_packages()
        self.keyword_df, self.dependencies_df = \
            self.utility_obj.make_kwd_dependencies_df(
                self.pkg_kwd_df, self.unique_packages)
        self.package_tag_map, self.vocabulary = \
            self.utility_obj.create_pkg_tag_map(self.keyword_df)
        self.package_dep_map, self.first_level_deps = \
            self.utility_obj.create_pkg_dep_map(self.dependencies_df)

    def create_extended_pkg_tag_map(self):
        """Create the package tag map according to all first level dependencies.

        Returns ``(extended_ptm, manifest_user_data, unique_packages)``.
        Packages whose tags cannot be built are skipped, with a warning.
        """
        import logging

        self.create_df_and_dictionaries()
        self.extended_ptm = dict()
        keywords_df_deps = self.pkg_kwd_df.loc[
            self.pkg_kwd_df['name'].isin(self.first_level_deps),
            ['name', 'keywords']]
        for name, group in keywords_df_deps.groupby("name"):
            try:
                first_keywords = set(group["keywords"].tolist()[0])
                self.extended_ptm[name] = self.utility_obj.clean_set(
                    self.package_dep_map.get(name, set()).union(
                        first_keywords))
            except Exception as exc:
                # Best effort: one bad package must not abort the whole map,
                # but it should not vanish without a trace either.
                logging.getLogger(__name__).warning(
                    "Skipping extended tags for %s: %s", name, exc)

        return self.extended_ptm, self.manifest_user_data, self.unique_packages

    def update_pkg_tag_map(self):
        """Update the existing package tag map.

        Every package's tags are extended with the extended tags of its
        dependencies, and the vocabulary grows by the same keywords.
        Returns ``(package_tag_map, vocabulary, manifest_user_data,
        unique_packages)``.
        """
        extended_ptm, manifest_user_data, unique_packages = \
            self.create_extended_pkg_tag_map()
        for package_name in self.package_tag_map:
            more_keywords = set()
            # A package without an entry in the dependency map simply
            # contributes no extra keywords.
            for dependency in self.package_dep_map.get(package_name, ()):
                more_keywords.update(extended_ptm.get(dependency, ()))
            self.package_tag_map[package_name] = \
                self.package_tag_map[package_name].union(more_keywords)
            self.vocabulary = self.vocabulary.union(more_keywords)
        return self.package_tag_map, self.vocabulary, manifest_user_data, unique_packages
class GetData:
    """Read training inputs from S3 and stage or upload the derived files."""

    def __init__(self,
                 aws_access_key_id,
                 aws_secret_access_key,
                 num_train_per_user,
                 aws_bucket_name,
                 model_version):
        """Create an instance of GetData.

        Environment variables win over the credential arguments; the
        arguments are only used when the variable is not set. The S3
        connection is opened immediately through ``load_s3``.
        """
        self.aws_access_key_id = os.environ.get('AWS_S3_ACCESS_KEY_ID', aws_access_key_id)
        self.aws_secret_access_key = os.environ.get('AWS_S3_SECRET_ACCESS_KEY',
                                                    aws_secret_access_key)
        self.github_token = os.environ.get('GITHUB_TOKEN', '')
        self.bucket_name = aws_bucket_name
        self.version_name = model_version
        self.s3_object = AmazonS3(bucket_name=self.bucket_name,
                                  aws_access_key_id=self.aws_access_key_id,
                                  aws_secret_access_key=self.aws_secret_access_key
                                  )
        self.num_train_per_user = num_train_per_user
        self.s3_client = self.load_s3()
        self.utility = Utility()

    def load_s3(self):
        """Establish the connection with S3 and return the connected object.

        Raises Exception when the connection cannot be established.
        """
        self.s3_object.connect()
        if not self.s3_object.is_connected():
            raise Exception("Unable to establish the S3 connection.")
        logger.info("S3 connection established.")
        return self.s3_object

    def load_raw_data(self):
        """Load the raw data from S3 bucket.

        Returns the parsed manifest, or None when the manifest object does
        not exist. Raises Exception (chained) when reading fails.
        """
        NPM_raw_data_path = os.path.join(self.version_name,
                                         "data/manifest.json")
        logger.info("Reading raw data from {}".format(self.version_name))
        if not self.s3_client.object_exists(NPM_raw_data_path):
            logger.error("Raw manifest {} is not present.".format(NPM_raw_data_path))
            return None
        try:
            raw_data_dict_ = self.s3_client.read_json_file(NPM_raw_data_path)
            logger.info("Size of Raw Manifest file is: {}".format(len(raw_data_dict_)))
            return raw_data_dict_
        except Exception as exc:
            raise Exception("Unable to read the raw manifest from S3.") from exc

    def load_existing_data(self):
        """Load the node registry dump from S3 bucket.

        Raises ValueError when the dump is not present and Exception
        (chained) when it cannot be read or parsed.
        """
        NPM_clean_json_data_path = os.path.join("training-utils",
                                                "node-package-details-with-url.json")
        if not self.s3_client.object_exists(NPM_clean_json_data_path):
            raise ValueError("Given Path is not present.")
        try:
            logger.info("Reading dump data from training-utils folder.")
            existing_data = self.s3_client.read_generic_file(NPM_clean_json_data_path)
            existing_df = self.utility.read_json_file(existing_data)
            logger.info("Size of Raw df with url is: {}".format(len(existing_df)))
            return existing_df
        except Exception as exc:
            raise Exception("S3 connection error") from exc

    def load_user_item_data(self):
        """Return the raw bytes of the locally staged manifest_user_data.dat."""
        NPM_manifest_user_data_path = os.path.join(TEMPORARY_PATH, "manifest_user_data.dat")
        try:
            with open(NPM_manifest_user_data_path, 'rb') as f:
                return f.read()
        except Exception as exc:
            # This is a local file read, so the message names the file
            # rather than blaming S3.
            raise Exception("Could not read the user-item data file: {}".format(
                NPM_manifest_user_data_path)) from exc

    @staticmethod
    def _as_pair_array(pairs):
        """Return ``pairs`` as an (n, 2) integer array, also when it is empty."""
        return np.asarray(pairs, dtype=np.int64).reshape(-1, 2)

    @staticmethod
    def _group_pairs(pairs, count, key_col):
        """Build ``[len, "v1 v2 ..."]`` rows for ids 0..count-1.

        Rows of ``pairs`` are selected by ``key_col``; the values come from
        the other column.
        """
        value_col = 1 - key_col
        grouped = []
        for key in range(count):
            values = pairs[pairs[:, key_col] == key, value_col]
            grouped.append([len(values), " ".join(str(x) for x in values)])
        return grouped

    def create_package_train_user_data(self):
        """Create package train user data."""
        self.package_train_user_data = self._group_pairs(
            self.pairs_train, self.num_users, 0)
        return self.package_train_user_data

    def create_package_train_item_data(self):
        """Create package train item data."""
        self.package_train_item_data = self._group_pairs(
            self.pairs_train, self.num_items, 1)
        return self.package_train_item_data

    def create_package_test_user_data(self):
        """Create package test user data."""
        self.package_test_user_data = self._group_pairs(
            self.pairs_test, self.num_users, 0)
        return self.package_test_user_data

    def create_package_test_item_data(self):
        """Create package test item data."""
        self.package_test_item_data = self._group_pairs(
            self.pairs_test, self.num_items, 1)
        return self.package_test_item_data

    def train_test_data(self):
        """Create the training testing data for PMF.

        Returns the train-user, train-item, test-user and test-item rows.
        """
        data_list = self.split_training_testing_data()
        self.pairs_train = data_list[0]
        self.pairs_test = data_list[1]
        self.num_users = data_list[2]
        self.num_items = data_list[3]
        packagedata_train_users = self.create_package_train_user_data()
        packagedata_train_items = self.create_package_train_item_data()
        packagedata_test_users = self.create_package_test_user_data()
        packagedata_test_items = self.create_package_test_item_data()
        return packagedata_train_users, packagedata_train_items, \
            packagedata_test_users, packagedata_test_items

    def split_training_testing_data(self):
        """Split data into training and testing.

        Each line of the user-item file is one user: the first token is
        skipped and the remaining tokens are item ids. Up to
        ``num_train_per_user`` randomly chosen items go to training, the
        rest to testing.

        Returns ``[pairs_train, pairs_test, num_users, num_items]`` where the
        pair arrays have shape (n, 2) even when empty.
        """
        data = self.load_user_item_data().decode("utf-8")
        pairs_train = []
        pairs_test = []
        np.random.seed(int(time.time()))
        logger.info("Splitting data into training and testing.")
        cut = self.num_train_per_user
        num_users = 0
        # splitlines() does not produce an empty record after the trailing
        # newline, so no phantom user is counted.
        for user_id, line in enumerate(data.splitlines()):
            items = np.asarray([int(x) for x in line.split()[1:]])
            shuffled = items[np.random.permutation(len(items))]
            pairs_train.extend((user_id, item) for item in shuffled[:cut])
            pairs_test.extend((user_id, item) for item in shuffled[cut:])
            num_users = user_id + 1
        pairs_train = self._as_pair_array(pairs_train)
        pairs_test = self._as_pair_array(pairs_test)
        # Either side may be empty (e.g. nobody has more items than the
        # training quota), so take the maximum over both together.
        item_ids = np.concatenate((pairs_train[:, 1], pairs_test[:, 1]))
        num_items = int(item_ids.max()) + 1 if item_ids.size else 0
        logger.info("Number of users and items are respectively {},"
                    " {}".format(num_users, num_items))
        return [pairs_train, pairs_test, num_users, num_items]

    def check_path(self, path):
        """Check the given datastore path, creating it when missing."""
        logger.info("Given path is: {}".format(path))
        if not os.path.exists(path):
            # exist_ok covers another process creating it in between.
            os.makedirs(path, exist_ok=True)
        return path

    def _write_rows(self, content, filename, datastore):
        """Write each row as ``"<first> <rest joined by spaces>"`` lines."""
        path = self.check_path(datastore)
        with open(os.path.join(path, filename), 'w') as f:
            f.writelines(
                "{} {}\n".format(row[0], " ".join(str(x) for x in row[1:]))
                for row in content)

    def save_file_temporary(self, content, filename, datastore):
        """Store data file in temporary storage."""
        self._write_rows(content, filename, datastore)
        logger.info("File has been stored successfully.")

    def save_manifest_file_temporary(self, content, filename, datastore):
        """Store manifest file in temporary storage."""
        self._write_rows(content, filename, datastore)
        logger.info("Manifest File has been stored successfully.")

    def save_numpy_matrix_temporary(self, content, filename, datastore):
        """Store numpy matrix in temporary storage (under the key ``matrix``)."""
        path = self.check_path(datastore)
        np.savez(os.path.join(path, filename), matrix=content)
        logger.info("Numpy matrix has been stored successfully.")

    def save_json_file_temporary(self, content, filename, datastore):
        """Store JSON file in temporary storage."""
        path = self.check_path(datastore)
        with open(os.path.join(path, filename), 'w') as f:
            json.dump(content, f)
        logger.info("JSON file has been stored successfully.")

    def save_on_s3(self, folder_path):
        """Store all the contents on S3.

        Folders whose path contains ``intermediate-model`` are uploaded
        under ``<version>/intermediate-model``; everything else goes under
        ``<version>``. A missing folder is logged and skipped.
        """
        if not os.path.exists(folder_path):
            logger.error("Folder path doesn't exist.")
            return
        prefix = self.version_name
        if 'intermediate-model' in folder_path:
            prefix += '/intermediate-model'
        self.s3_client.s3_upload_folder(folder_path=folder_path, prefix=prefix)
        logger.info("Folders are successfully saved on S3.")
示例#8
0
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

import json
import unittest
from training.datastore.utils import Utility
import pandas as pd

# Module-level Utility instance shared by the tests below.
utils_obj = Utility()

# Load the package-details fixture once, at import time. The path is relative,
# so the tests must be started from the directory that contains ``tests/``.
with open('tests/test_data/npm/dev/2019-01-03/data/test-node-package-details-with-url.json') as f:
    test_data_df = pd.DataFrame(json.load(f))


class TestUtility(unittest.TestCase):
    """This class tests the Utility Class."""

    def test_flatten_list(self):
        """Flattening a list of lists must not leave a list in first position."""
        nested = [[1, 2], [3, 4]]
        flattened = utils_obj.flatten_list(nested)
        first_item = flattened[0]
        assert type(first_item) != list

    def test_make_list_from_series(self):
示例#9
0
class GetKeywords:
    """Collect name, description, keywords and dependencies of npm packages.

    ``find_keywords`` tries three sources in turn: the existing package
    dump, the npm registry API and the repository topics from GitHub's
    GraphQL API.
    """

    def __init__(self,
                 df_=None,
                 dict_=None,
                 local_data_store=False):
        """Create an instance for GetKeywords.

        ``df_`` is the value returned when a package is not found in the
        dump; ``dict_`` seeds every registry result. Both default to a fresh
        empty object per instance. ``local_data_store`` is forwarded to
        GetData.
        """
        self.df_ = pd.DataFrame() if df_ is None else df_
        self.dict_ = {} if dict_ is None else dict_
        self.get_data = GetData(local_data_store=local_data_store)
        self.utility = Utility()

    def from_existing_df(self, df_, package):
        """Find the keywords from existing dump.

        Returns the first matching row (name, description, keywords,
        dependencies). When the dump is empty or does not contain the
        package, ``self.df_`` is returned instead.
        """
        if df_.empty:
            logger.error("Node Package details Dataframe is not existed.")
            return self.df_
        matches = df_.loc[
            df_['name'] == str(package),
            ['name', 'description', 'keywords', 'dependencies']]
        if matches.empty:
            # Unknown package: hand back the same "nothing found" value as
            # for an empty dump instead of failing on iloc[0].
            return self.df_
        return matches.iloc[0]

    def from_npm_registry(self, package):
        """Find the keywords from NPM registry(through api).

        Always returns a new dict holding ``name``, ``description``,
        ``keywords`` and ``dependencies``; the values stay empty when the
        registry cannot be queried.
        """
        # Work on a copy so one package's data can never leak into the
        # result of the next call (or of another instance).
        data_dict = dict(self.dict_)
        data_dict.update(name='', description='', keywords=[], dependencies=[])
        api_url = "https://registry.npmjs.org/" + str(package)
        try:
            api_data = requests.get(api_url).text
            json_data = json.loads(api_data)
            data_dict['name'] = json_data.get('name', '')
            data_dict['description'] = json_data.get('description', '')
            data_dict['keywords'] = json_data.get('keywords', [])
            data_dict['dependencies'] = self.get_dependencies(json_data)
        except Exception:
            logger.error("Can't fetch the keywords from NPM Registry")
        return data_dict

    def get_version(self, api_data):
        """Give the latest version for the package ('' when unavailable)."""
        if not api_data:
            logger.error("API Data is not available.")
            return ''
        try:
            return api_data['dist-tags']['latest']
        except Exception:
            logger.info("Unable to fetch latest version from API data.")
            return ''

    def get_dependencies(self, api_data):
        """Give the dependencies for latest version of package.

        Always returns a list; it is empty when the version or its
        dependencies are missing from ``api_data``.
        """
        version = self.get_version(api_data)
        logger.info("Latest_version is: {}".format(version))
        try:
            versions_dict = api_data.get('versions') or {}
            latest_version_data = versions_dict.get(version) or {}
            latest_dependencies = latest_version_data.get('dependencies') or {}
            return list(latest_dependencies.keys())
        except (AttributeError, TypeError):
            return []

    def clean_response(self, response_json):
        """Clean the api response json and return the list of topic names."""
        topic_nodes = response_json['data']['organization']['repository'][
            'repositoryTopics']['nodes']
        return [dict(node.get('topic')).get('name') for node in topic_nodes]

    def from_github(self, package, url_df, api_url, api_token):
        """Find the keywords from the Github Graph QL.

        Returns the repository topic names, or an empty list when the
        request or the response parsing fails.
        """
        url_ = self.utility.get_url(url_df, package)
        logger.info("Repo URL is {}".format(url_))
        query_params = self.utility.get_query_params(url_)
        logger.info("Query Parameters are: {}, {}".format(
            query_params[0], query_params[1]))
        # Both query parameters are interpolated: the first as the
        # organization login, the second as the repository name.
        query = (
            '{organization(login: "%s"){name url repository(name: "%s")'
            '{name url description repositoryTopics(first: 10)'
            '{nodes{topic {name}}}}}}'
        ) % (str(query_params[0]), str(query_params[1]))
        payload = {'query': query}
        headers = {'Authorization': 'token %s' % api_token}
        try:
            response = requests.post(url=api_url, json=payload, headers=headers)
            return list(self.clean_response(response.json()))
        except Exception:
            logger.error("Github tokens are not present.")
            return []

    def find_keywords(self, df_, list_):
        """Find the keywords for given list of list of raw data.

        Returns a DataFrame with the columns name, description, keywords
        and dependencies, one row per package of the flattened input.
        """
        columns = ['name', 'description', 'keywords', 'dependencies']
        package_lst = self.utility.flatten_list(list_)
        out_lst = []
        for package in package_lst:
            pkg_kwd_lst = self.utility.make_list_from_series(
                self.from_existing_df(df_, package))
            if not pkg_kwd_lst or not isinstance(pkg_kwd_lst[2], list):
                logger.info("Finding from the NPM repository.")
                pkg_kwd_dict = self.from_npm_registry(package)
                # Pick the values by key so position 2 is always keywords.
                pkg_kwd_lst = [pkg_kwd_dict.get(col) for col in columns]
                if not pkg_kwd_lst[2]:
                    logger.info("Trying to fetch from Github")
                    api_url = 'https://api.github.com/graphql'
                    api_token = self.get_data.github_token
                    pkg_kwd_lst[2] = self.from_github(package, df_, api_url,
                                                      api_token)
            out_lst.append(pkg_kwd_lst)
        return pd.DataFrame(out_lst, columns=columns)
示例#10
0
 def __init__(self, df_=None, dict_=None):
     """Create an instance for GetKeywords.

     ``df_`` and ``dict_`` are stored as ``self.df_`` and ``self.dict_``.
     When omitted, a fresh empty DataFrame / dict is created for this
     instance, so state is never shared through a default argument.
     """
     self.df_ = pd.DataFrame() if df_ is None else df_
     self.dict_ = {} if dict_ is None else dict_
     self.get_data = GetData()
     self.utility = Utility()
示例#11
0
class GetData:
    """Read the raw manifest and the package dump from S3 or a local store."""

    def __init__(self,
                 aws_access_key_id=None,
                 aws_secret_access_key=None,
                 aws_bucket_name='cvae-insights',
                 local_data_store=False,
                 deployment_prefix='dev',
                 model_version='2019-01-03'):
        """Create an instance of GetData.

        Credentials are taken from the ``AWS_S3_ACCESS_KEY_ID`` and
        ``AWS_S3_SECRET_ACCESS_KEY`` environment variables. When a variable
        is not set, the matching argument is used (empty string if that is
        missing too).

        With ``local_data_store`` set, files are served from
        ``tests/test_data`` and no S3 connection is opened; otherwise an
        AmazonS3 object is created and connected through ``load_S3``.
        """
        # The explicit arguments used to be ignored; they now act as the
        # fallback for the environment variables.
        self.aws_access_key_id = os.environ.get(
            'AWS_S3_ACCESS_KEY_ID', aws_access_key_id or '')
        self.aws_secret_access_key = os.environ.get(
            'AWS_S3_SECRET_ACCESS_KEY', aws_secret_access_key or '')
        self.github_token = os.environ.get('GITHUB_TOKEN', '')
        self.bucket_name = aws_bucket_name
        self.deployment_prefix = deployment_prefix
        self.version_name = model_version
        if local_data_store:
            self.s3_client = LocalDataStore('tests/test_data')
        else:
            self.s3_object = AmazonS3(
                bucket_name=self.bucket_name,
                aws_access_key_id=self.aws_access_key_id,
                aws_secret_access_key=self.aws_secret_access_key)
            self.s3_client = self.load_S3()

        self.utility = Utility()

    def load_S3(self):
        """Establish the connection with S3 and return the connected object.

        Raises Exception when the connection cannot be established.
        """
        self.s3_object.connect()
        if not self.s3_object.is_connected():
            raise Exception("Unable to establish the S3 connection.")
        logger.info("S3 connection established.")
        return self.s3_object

    def load_raw_data(self):
        """Load the raw data from S3 bucket.

        Raises Exception (chained to the original error) when the manifest
        cannot be read.
        """
        NPM_raw_data_path = os.path.join(self.version_name,
                                         "data/manifest.json")
        try:
            raw_data_dict_ = self.s3_client.read_json_file(NPM_raw_data_path)
            logger.info("Size of Raw Manifest file is: {}".format(
                len(raw_data_dict_)))
            return raw_data_dict_
        except Exception as exc:
            raise Exception("Unable to read the raw manifest.") from exc

    def load_existing_data(self):
        """Load the node registry dump from S3 bucket.

        Raises Exception (chained to the original error) when the dump
        cannot be read or parsed.
        """
        NPM_clean_json_data_path = os.path.join(
            self.version_name, "data/node-package-details-with-url.json")
        try:
            # Nothing has been checked at this point, so only announce the
            # read instead of claiming that the path exists.
            logger.info("Reading node package details from {}".format(
                NPM_clean_json_data_path))
            existing_data = self.s3_client.read_generic_file(
                NPM_clean_json_data_path)
            existing_df = self.utility.read_json_file(existing_data)
            logger.info("Size of Raw df with url is: {}".format(
                len(existing_df)))
            return existing_df
        except Exception as exc:
            raise Exception("S3 connection error") from exc
示例#12
0
class GetPreprocessData:
    """Turn the raw data into model input files in temporary storage."""

    def __init__(self,
                 aws_access_key_id='',
                 aws_secret_access_key='',
                 aws_bucket_name='cvae-insights',
                 model_version='',
                 num_train_per_user=5):
        """Create an instance for GetPreprocessData.

        Builds one GetData object from the given settings and shares it with
        the GetKeywords and PreprocessData helpers.
        """
        data_store_settings = {
            'aws_access_key_id': aws_access_key_id,
            'aws_secret_access_key': aws_secret_access_key,
            'aws_bucket_name': aws_bucket_name,
            'model_version': model_version,
            'num_train_per_user': num_train_per_user,
        }
        data_obj = GetData(**data_store_settings)
        self.obj_ = data_obj
        self.keyword_obj_ = GetKeywords(data_obj)
        self.preprocess_data_obj = PreprocessData(data_obj=data_obj)
        self.utils = Utility()
        # Holds num_train_per_user despite the attribute name; it is used in
        # the output file names below.
        self.num_users = num_train_per_user

    def preprocess_data(self):
        """Preprocess the data and save every artefact into temporary storage.

        Writes the manifest user data, the package/index maps, the package
        tag map, the four train/test user/item files and the content matrix.
        """
        store = self.obj_
        tag_map, vocabulary, manifest_user_data, unique_packages = \
            self.preprocess_data_obj.update_pkg_tag_map()
        # Tag collections become lists so they can be dumped as JSON.
        package_tag_map = {name: list(tags) for name, tags in tag_map.items()}
        # NOTE(review): this file goes to TEMPORARY_DATA_PATH while the maps
        # below go to TEMPORARY_PATH; confirm both locations are intended.
        store.save_manifest_file_temporary(manifest_user_data,
                                           'manifest_user_data.dat',
                                           TEMPORARY_DATA_PATH)
        package_id_map = self.utils.create_package_map(unique_packages)
        id_package_map = dict(enumerate(unique_packages))
        user_train_data, item_train_data, user_test_data, item_test_data = \
            store.train_test_data()
        content_matrix = self.utils.create_content_matrix(
            package_tag_map, unique_packages, vocabulary)

        json_outputs = (
            (package_id_map, 'package_to_index_map.json'),
            (id_package_map, 'index_to_package_map.json'),
            (package_tag_map, 'package_tag_map.json'),
        )
        for payload, json_name in json_outputs:
            store.save_json_file_temporary(payload, json_name, TEMPORARY_PATH)

        count_label = str(self.num_users)
        data_outputs = (
            (user_train_data, "packagedata-train-" + count_label + "-users.dat"),
            (user_test_data, "packagedata-test-" + count_label + "-users.dat"),
            (item_train_data, "packagedata-train-" + count_label + "-items.dat"),
            (item_test_data, "packagedata-test-" + count_label + "-items.dat"),
        )
        for rows, data_name in data_outputs:
            store.save_file_temporary(rows, data_name, TEMPORARY_DATA_PATH)

        store.save_numpy_matrix_temporary(content_matrix,
                                          'content_matrix.npz',
                                          TEMPORARY_DATA_PATH)
        logger.info("All items are saved successfully in temporary location.")