Пример #1
0
    def train(cls):
        """Train ``cls`` with the shared package classification data.

        The data is created only while
        ``MachineLearning.PKGS_CLASSIFICATIONS`` is still ``None``; otherwise
        the cached value is reused. In both cases it is handed to
        ``cls.run_train``.
        """
        classifications = MachineLearning.PKGS_CLASSIFICATIONS

        if classifications is None:
            # Nothing cached yet: build the data for the three labels and
            # keep it on the base class for later calls.
            classifications = MachineLearningData().create_data(
                ['RU', 'U', 'NU'])
            MachineLearning.PKGS_CLASSIFICATIONS = classifications

        cls.run_train(classifications)
Пример #2
0
    def train(cls):
        """Run ``cls.run_train`` on the cached package classifications.

        When ``MachineLearning.PKGS_CLASSIFICATIONS`` is ``None`` the data is
        first created through ``MachineLearningData.create_data`` and stored
        on ``MachineLearning``.
        """
        cached = MachineLearning.PKGS_CLASSIFICATIONS

        if cached is None:
            data_builder = MachineLearningData()
            cached = data_builder.create_data(['RU', 'U', 'NU'])
            # Store the result so that subsequent calls skip the creation.
            MachineLearning.PKGS_CLASSIFICATIONS = cached

        cls.run_train(cached)
class PkgClassificationTests(unittest.TestCase):
    """Tests for the feature helpers of ``MachineLearningData``.

    Two of the tests open the xapian index at
    ``/var/lib/apt-xapian-index/index``, so they need that index on the
    machine running them.
    """

    def setUp(self):
        """Create a fresh ``MachineLearningData`` and ``AptCache``."""
        self.ml_data = MachineLearningData()
        self.cache = AptCache()

    def test_get_pkg_debtags(self):
        """Every expected debtag of ``vim`` is returned for the package."""
        vim_debtags = ['devel::editor', 'implemented-in::c',
                       'interface::commandline', 'interface::text-mode',
                       'role::program', 'scope::application',
                       'uitoolkit::ncurses', 'use::editing',
                       'works-with::text', 'works-with::unicode']

        axi_path = "/var/lib/apt-xapian-index/index"
        axi = xapian.Database(axi_path)

        vim_debtags_result = self.ml_data.get_pkg_debtags(axi, 'vim')

        for debtag in vim_debtags:
            # assertIn names the missing debtag when the check fails.
            self.assertIn(debtag, vim_debtags_result)

    @patch('apprecommender.ml.data.MachineLearningData.get_pkg_description')
    def test_get_pkg_terms(self, mock_description):
        """The expected terms are extracted from a fixed description.

        ``get_pkg_description`` is patched, so the terms do not depend on
        the description of the installed package.
        """
        mock_description.return_value = 'Vim is an text editor written in C'
        vim_terms = [u'vim', u'text', u'editor']
        vim_terms_result = self.ml_data.get_pkg_terms(self.cache, 'vim')

        for term in vim_terms:
            self.assertIn(term, vim_terms_result)

    def test_create_row_table_list(self):
        """A label gets 1 when it is among the package elements, else 0."""
        labels_name = ['devel::editor', 'implemented-in::c', 'complet',
                       'contain', 'syntax', 'unix', 'version']
        pkg_elements = ['implemented-in::c', 'complet']

        row_list_to_assert = [0, 1, 1, 0, 0, 0, 0]
        row_list = self.ml_data.create_row_table_list(labels_name,
                                                      pkg_elements)

        self.assertEqual(row_list_to_assert, row_list)

    @patch('apprecommender.ml.data.MachineLearningData.get_pkg_description')
    def test_get_pkg_classification(self, mock_description):
        """The classification table has the expected row for ``vim``.

        NOTE(review): the expected row looks like the debtag flags, then the
        term flags, then the package label -- confirm against
        ``get_pkgs_table_classification``.
        """
        mock_description.return_value = 'vim is an text editor written in c'
        axi_path = "/var/lib/apt-xapian-index/index"
        axi = xapian.Database(axi_path)
        pkgs = {'vim': 'EX'}
        debtags_name = ['devel::editor', 'implemented-in::c',
                        'devel::interpreter', 'devel::lang:python']
        terms_name = ['vim', 'editor', 'python']

        assert_pkgs_classification = {'vim': [1, 1, 0, 0, 1, 1, 0, 'EX']}

        pkgs_classification = self.ml_data.get_pkgs_table_classification(
            axi, pkgs, self.cache, debtags_name, terms_name)

        self.assertEqual(assert_pkgs_classification, pkgs_classification)
Пример #4
0
 def __init__(self, content, profile_size, suggestion_size=200):
     """Initialise the strategy and open its package data sources.

     ``content`` and ``profile_size`` are forwarded to
     ``ContentBased.__init__`` and also stored on the instance;
     ``suggestion_size`` defaults to 200.
     """
     ContentBased.__init__(self, content, profile_size)

     # Plain configuration values.
     self.description = 'Machine-learning'
     self.content = content
     self.profile_size = profile_size
     self.suggestion_size = suggestion_size

     # Package metadata sources.
     self.cache = apt.Cache()
     self.ml_data = MachineLearningData()
     self.axi = xapian.Database(XAPIAN_DATABASE_PATH)
Пример #5
0
    def train(cls):
        """Create the classification data and train ``cls`` with it.

        The data is always rebuilt and stored in
        ``MachineLearning.PKGS_CLASSIFICATIONS``. Training happens only when
        the data has at least 10 entries.

        Raises:
            MachineLearningTrainError: the data has fewer than 10 entries.
            IOError: propagated unchanged from the data creation or the
                training step.
        """
        data_builder = MachineLearningData()
        classifications = data_builder.create_data(['RU', 'U', 'NU'])
        MachineLearning.PKGS_CLASSIFICATIONS = classifications

        if len(classifications) < 10:
            raise MachineLearningTrainError()

        cls.run_train(classifications)
Пример #6
0
def ml_cross_validation(folder_path, ml_strategy_str):
    """Run a cross validation for one strategy and save its report.

    The report is the string form of the cross-validation object. It is
    written to
    ``cross_validation_result_<strategy>_<rounds>_<partition>_<time>.txt``
    inside ``folder_path``, which is created when it does not exist.

    Args:
        folder_path: Directory that receives the result file.
        ml_strategy_str: Strategy identifier handed to ``get_pkg_data`` and
            ``get_strategy``; it is also part of the result file name.

    Returns:
        The object returned by ``get_strategy``, after its ``run`` method
        has been called.
    """
    # NOTE: this raises the level of the root logger, so it affects the
    # whole process and not only this function.
    logging.getLogger('').setLevel(logging.CRITICAL)

    if not os.path.exists(folder_path):
        # makedirs also creates missing parent directories.
        os.makedirs(folder_path)

    partition_size = 0.8
    rounds = 5
    metrics_list = [SimpleAccuracy(), Precision(), Recall(), FPR(), F_score(1)]
    labels = ['RU', 'U', 'NU']

    ml_data = MachineLearningData()
    pkg_data = get_pkg_data(ml_strategy_str, ml_data, labels)
    # Named differently from the function so the local does not shadow it.
    cross_validation = get_strategy(ml_strategy_str, pkg_data,
                                    partition_size, rounds, metrics_list,
                                    labels)

    # The timestamp is taken before the run starts.
    result_file_name = 'cross_validation_result_{}_{}_{}_{}.txt'.format(
        ml_strategy_str, rounds, partition_size,
        dt.datetime.now().strftime('%Y%m%d%H%M'))

    cross_validation.run(None)

    # Joining keeps the file inside folder_path even when the caller passes
    # the folder without a trailing separator.
    result_file_path = os.path.join(folder_path, result_file_name)
    with open(result_file_path, 'w') as result:
        result.write(str(cross_validation))

    return cross_validation
Пример #7
0
 def __init__(self, content, profile_size, suggestion_size=200):
     """Initialise the strategy and open its package data sources.

     ``content`` and ``profile_size`` are forwarded to
     ``ContentBased.__init__`` and also stored on the instance;
     ``suggestion_size`` defaults to 200.
     """
     ContentBased.__init__(self, content, profile_size)

     # Plain configuration values.
     self.description = 'Machine-learning'
     self.content = content
     self.profile_size = profile_size
     self.suggestion_size = suggestion_size

     # Package metadata sources.
     self.cache = AptCache()
     self.ml_data = MachineLearningData()
     self.axi = xapian.Database(XAPIAN_DATABASE_PATH)
Пример #8
0
    def train_model(self, pkgs_list, axi, save_files=True):
        """Fit the vectorizer and a ``GaussianNB`` classifier on packages.

        Args:
            pkgs_list: Packages handed to ``prepare_data`` and, when saving,
                to ``save_pkgs_features``.
            axi: Index object handed to ``prepare_data``.
            save_files: When true, the used terms, the used debtags and the
                package features are saved.

        Returns:
            ``BagOfWords.CREATED_MODEL``.
        """
        apt_cache = Cache()
        data_helper = MachineLearningData()

        descriptions, labels = self.prepare_data(
            pkgs_list, axi, apt_cache, data_helper)
        feature_matrix = self.vectorizer.fit_transform(descriptions).toarray()

        feature_names = self.vectorizer.get_feature_names()
        terms, debtags = self.get_used_terms_and_debtags(feature_names)

        self.classifier = GaussianNB()
        self.classifier.fit(feature_matrix, labels)

        classification_path = BagOfWords.BAG_OF_WORDS_PKGS_CLASSIFICATION

        if save_files:
            self.save_features(terms, BagOfWords.BAG_OF_WORDS_TERMS)
            self.save_features(debtags, BagOfWords.BAG_OF_WORDS_DEBTAGS)
            self.save_pkgs_features(classification_path, pkgs_list,
                                    feature_matrix, labels)

        return BagOfWords.CREATED_MODEL
Пример #9
0
class MachineLearning(ContentBased):
    """Base class for machine-learning recommendation strategies.

    The shared flow lives in ``run``. Subclasses supply the model through
    the abstract hooks: ``get_ml_strategy``, ``prepare_pkg_data``,
    ``get_pkg_classification``, ``get_terms_path``, ``get_debtags_path``
    and ``run_train``.
    """

    __metaclass__ = ABCMeta

    # Data produced by the most recent ``train`` call; ``None`` until then.
    PKGS_CLASSIFICATIONS = None

    def __init__(self, content, profile_size, suggestion_size=200):
        """Store the configuration and open the package data sources.

        Args:
            content: Forwarded to ``ContentBased.__init__`` and stored.
            profile_size: Forwarded to ``ContentBased.__init__`` and stored.
            suggestion_size: Number of content-based suggestions requested
                before classification (default 200).
        """
        ContentBased.__init__(self, content, profile_size)
        self.content = content
        self.description = 'Machine-learning'
        self.profile_size = profile_size
        self.suggestion_size = suggestion_size
        self.cache = AptCache()
        self.ml_data = MachineLearningData()
        self.axi = xapian.Database(XAPIAN_DATABASE_PATH)

    def display_recommended_terms(self, terms_name, debtags_name, item_score,
                                  rec_size):
        """Print the matching terms and debtags of the best-scored packages.

        Args:
            terms_name: Terms to look for in each package's terms.
            debtags_name: Debtags to look for in each package's debtags.
            item_score: Mapping of package name to score.
            rec_size: Number of highest-scored packages to print.
        """
        ranked = sorted(item_score.items(), key=operator.itemgetter(1))
        # Keep the ``rec_size`` highest scores, then print lowest first.
        top_pkgs = [pkg for pkg, _ in reversed(ranked)][0:rec_size]
        top_pkgs.reverse()

        for pkg in top_pkgs:
            pkg_terms = self.ml_data.get_pkg_terms(self.cache, pkg)
            pkg_debtags = self.ml_data.get_pkg_debtags(self.axi, pkg)

            terms_match = [term for term in pkg_terms if term in terms_name]
            debtags_match = [debtag for debtag in pkg_debtags
                             if debtag in debtags_name]

            # Single-argument print calls produce the same output whether
            # print is a statement or a function.
            print("\n\n=")
            print("{0}".format(pkg))
            print("debtags:")
            print(debtags_match)
            print("-")
            print("terms:")
            print(terms_match)
            print("=")

    def get_item_score(self, pkgs_score, pkgs_classifications):
        """Combine each package's label offset with its own score.

        The offset is 0 for ``'RU'``, 1000 for ``'U'`` and 2000 for ``'NU'``.

        Args:
            pkgs_score: Mapping of package name to score.
            pkgs_classifications: Mapping of package name to its label.

        Returns:
            Mapping of package name to ``offset + pkgs_score[pkg]``.

        Raises:
            ValueError: a label is not one of ``'RU'``, ``'U'``, ``'NU'``.
            KeyError: a classified package has no entry in ``pkgs_score``.
        """
        order = ['RU', 'U', 'NU']
        order_values = [0, 1000, 2000]
        item_score = {}

        for pkg, classification in pkgs_classifications.items():
            offset = order_values[order.index(classification)]
            item_score[pkg] = offset + pkgs_score[pkg]

        return item_score

    def get_pkgs_and_scores(self, rec, user):
        """Collect the content-based suggestions and derive their scores.

        The suggestion is converted to text. Its first line is skipped and
        each remaining line is expected to look like ``<number>: <package>``.
        Lines that do not match are ignored.

        Args:
            rec: Recommender providing ``items_repository`` and
                ``valid_tags``.
            user: User whose content profile is used.

        Returns:
            A ``(pkgs, pkgs_score)`` tuple: the package names in listing
            order, and a mapping of package name to
            ``self.suggestion_size - <number>``.
        """
        profile = user.content_profile(rec.items_repository, self.content,
                                       self.suggestion_size, rec.valid_tags)

        content_based = self.get_sugestion_from_profile(
            rec, user, profile, self.suggestion_size, because=False)
        pkgs, pkgs_score = [], {}

        for pkg_line in str(content_based).splitlines()[1:]:
            match = re.search(r'\d+:\s([\w-]+)', pkg_line)

            # re.search returns None when nothing matches. Testing the match
            # object itself is what makes the skip reachable.
            if match is None:
                continue

            pkg = match.group(1)
            pkg_score = int(pkg_line.split(':')[0].strip())

            pkgs.append(pkg)
            pkgs_score[pkg] = self.suggestion_size - pkg_score

        return pkgs, pkgs_score

    def get_pkgs_classifications(self, pkgs, terms_name, debtags_name):
        """Classify every package that is present in the apt cache.

        Args:
            pkgs: Package names to classify.
            terms_name: Terms forwarded to ``prepare_pkg_data``.
            debtags_name: Debtags forwarded to ``prepare_pkg_data``.

        Returns:
            Mapping of package name to the value returned by
            ``get_pkg_classification``; packages missing from ``self.cache``
            are left out.
        """
        ml_strategy = self.get_ml_strategy()
        kwargs = {
            'terms_name': terms_name,
            'debtags_name': debtags_name,
            'ml_strategy': ml_strategy,
        }
        pkgs_classifications = {}

        for pkg in pkgs:
            if pkg not in self.cache:
                continue

            attribute_vector = self.prepare_pkg_data(pkg, **kwargs)
            pkgs_classifications[pkg] = self.get_pkg_classification(
                ml_strategy, attribute_vector)

        return pkgs_classifications

    def load_terms_and_debtags(self):
        """Load the pickled terms and debtags.

        Returns:
            A ``(terms_name, debtags_name)`` tuple read from the files at
            ``get_terms_path()`` and ``get_debtags_path()``.
        """
        terms_path = self.get_terms_path()
        debtags_path = self.get_debtags_path()

        # NOTE(review): pickle.load runs arbitrary code from the file, so
        # both files must come from a trusted source.
        with open(terms_path, 'rb') as terms:
            terms_name = pickle.load(terms)
        with open(debtags_path, 'rb') as debtags:
            debtags_name = pickle.load(debtags)

        return terms_name, debtags_name

    @staticmethod
    def train(cls):
        """Create the classification data and train ``cls`` with it.

        This is a static method that takes the class explicitly, so callers
        pass the strategy class as the argument. The data is stored in
        ``MachineLearning.PKGS_CLASSIFICATIONS`` before its size is checked.

        Raises:
            MachineLearningTrainError: the data has fewer than 10 entries.
            IOError: propagated unchanged from the data creation or the
                training step.
        """
        ml_data = MachineLearningData()
        classifications = ml_data.create_data(['RU', 'U', 'NU'])
        MachineLearning.PKGS_CLASSIFICATIONS = classifications

        if len(classifications) < 10:
            raise MachineLearningTrainError()

        cls.run_train(classifications)

    @abstractmethod
    def get_debtags_path(self):
        """Return the path of the pickled debtags file."""
        raise NotImplementedError("Method not implemented.")

    @abstractmethod
    def get_ml_strategy(self):
        """Return the strategy object used to classify packages."""
        raise NotImplementedError("Method not implemented.")

    @abstractmethod
    def get_pkg_classification(self, ml_strategy, attribute_vector):
        """Return the label of one package from its attribute vector."""
        raise NotImplementedError("Method not implemented.")

    @abstractmethod
    def get_terms_path(self):
        """Return the path of the pickled terms file."""
        raise NotImplementedError("Method not implemented.")

    @abstractmethod
    def prepare_pkg_data(self, pkg, **kwargs):
        """Return the attribute vector of ``pkg``.

        ``kwargs`` carries ``terms_name``, ``debtags_name`` and
        ``ml_strategy``.
        """
        raise NotImplementedError("Method not implemented.")

    @abstractmethod
    def run_train(cls, pkgs_classifications):
        """Train the concrete model with ``pkgs_classifications``.

        ``train`` calls this as ``cls.run_train(data)`` on the class itself.
        NOTE(review): implementations therefore presumably need to be class
        or static methods -- confirm in the subclasses.
        """
        raise NotImplementedError("Method not implemented.")

    def run(self, rec, user, rec_size):
        """Produce the recommendation for ``user``.

        Args:
            rec: Recommender handed to ``get_pkgs_and_scores``.
            user: User to recommend for.
            rec_size: Limit passed to the recommendation result.

        Returns:
            A ``recommender.RecommendationResult`` built from the combined
            scores. The user's package profile is attached only when
            ``Config().because`` is true.
        """
        terms_name, debtags_name = self.load_terms_and_debtags()

        pkgs, pkgs_score = self.get_pkgs_and_scores(rec, user)

        pkgs_classifications = self.get_pkgs_classifications(
            pkgs, terms_name, debtags_name)

        item_score = self.get_item_score(pkgs_score, pkgs_classifications)

        user_profile = user.pkg_profile if Config().because else None

        return recommender.RecommendationResult(
            item_score, limit=rec_size, user_profile=user_profile)
Пример #10
0
class PkgClassificationTests(unittest.TestCase):
    """Tests for the feature helpers of ``MachineLearningData``.

    The tests use the local ``apt.Cache``. Two of them also open the xapian
    index at ``/var/lib/apt-xapian-index/index``, so the expected values
    depend on the package data of the machine running them.
    """

    def setUp(self):
        """Create a fresh ``MachineLearningData`` and ``apt.Cache``."""
        self.ml_data = MachineLearningData()
        self.cache = apt.Cache()

    def test_get_pkg_debtags(self):
        """Every expected debtag of ``vim`` is returned for the package."""
        vim_debtags = [
            'devel::editor', 'implemented-in::c', 'interface::commandline',
            'interface::text-mode', 'role::program', 'scope::application',
            'uitoolkit::ncurses', 'use::editing', 'works-with::text',
            'works-with::unicode'
        ]

        axi_path = "/var/lib/apt-xapian-index/index"
        axi = xapian.Database(axi_path)

        vim_debtags_result = self.ml_data.get_pkg_debtags(axi, 'vim')

        for debtag in vim_debtags:
            # assertIn names the missing debtag when the check fails.
            self.assertIn(debtag, vim_debtags_result)

    def test_get_pkg_terms(self):
        """Every expected term is among the terms extracted for ``vim``."""
        vim_terms = [
            u'almost', u'compat', u'version', u'editor', u'new', u'featur',
            u'ad', u'multi', u'level', u'undo', u'syntax', u'highlight',
            u'command', u'line', u'histori', u'help', u'filenam', u'complet',
            u'block', u'oper', u'fold', u'support', u'etc', u'packag',
            u'contain', u'version', u'vim', u'compil', u'rather', u'standard',
            u'set', u'featur', u'packag', u'provid', u'version', u'packag',
            u'need', u'less'
        ]
        vim_terms_result = self.ml_data.get_pkg_terms(self.cache, 'vim')

        for term in vim_terms:
            self.assertIn(term, vim_terms_result)

    def test_create_row_table_list(self):
        """A label gets 1 when it is among the package elements, else 0."""
        labels_name = [
            'devel::editor', 'implemented-in::c', 'complet', 'contain',
            'syntax', 'unix', 'version'
        ]
        pkg_elements = ['implemented-in::c', 'complet']

        row_list_to_assert = [0, 1, 1, 0, 0, 0, 0]
        row_list = self.ml_data.create_row_table_list(labels_name,
                                                      pkg_elements)

        self.assertEqual(row_list_to_assert, row_list)

    def test_get_pkg_classification(self):
        """The classification table has the expected row for ``vim``.

        NOTE(review): the expected row looks like the debtag flags, then the
        term flags, then the package label -- confirm against
        ``get_pkgs_table_classification``.
        """
        axi_path = "/var/lib/apt-xapian-index/index"
        axi = xapian.Database(axi_path)
        pkgs = {'vim': 'EX'}
        debtags_name = [
            'devel::editor', 'implemented-in::c', 'devel::interpreter',
            'devel::lang:python'
        ]
        terms_name = ['contain', 'syntax', 'python']

        assert_pkgs_classification = {'vim': [1, 1, 0, 0, 1, 1, 0, 'EX']}

        pkgs_classification = self.ml_data.get_pkgs_table_classification(
            axi, pkgs, self.cache, debtags_name, terms_name)

        self.assertEqual(assert_pkgs_classification, pkgs_classification)
Пример #11
0
 def setUp(self):
     """Create the fixtures stored on the test case.

     ``self.ml_data`` is a new ``MachineLearningData`` and ``self.cache``
     is a new ``apt.Cache``.
     """
     self.ml_data = MachineLearningData()
     self.cache = apt.Cache()
 def setUp(self):
     """Create the fixtures stored on the test case.

     ``self.ml_data`` is a new ``MachineLearningData`` and ``self.cache``
     is a new ``AptCache``.
     """
     self.ml_data = MachineLearningData()
     self.cache = AptCache()
class PkgClassificationTests(unittest.TestCase):
    """Tests for the feature helpers of ``MachineLearningData``.

    The tests use the local ``apt.Cache``. Two of them also open the xapian
    index at ``/var/lib/apt-xapian-index/index``, so the expected values
    depend on the package data of the machine running them.
    """

    def setUp(self):
        """Create a fresh ``MachineLearningData`` and ``apt.Cache``."""
        self.ml_data = MachineLearningData()
        self.cache = apt.Cache()

    def test_get_pkg_debtags(self):
        """Every expected debtag of ``vim`` is returned for the package."""
        vim_debtags = ['devel::editor', 'implemented-in::c',
                       'interface::commandline', 'interface::text-mode',
                       'role::program', 'scope::application',
                       'uitoolkit::ncurses', 'use::editing',
                       'works-with::text', 'works-with::unicode']

        axi_path = "/var/lib/apt-xapian-index/index"
        axi = xapian.Database(axi_path)

        vim_debtags_result = self.ml_data.get_pkg_debtags(axi, 'vim')

        for debtag in vim_debtags:
            # assertIn names the missing debtag when the check fails.
            self.assertIn(debtag, vim_debtags_result)

    def test_get_pkg_terms(self):
        """Every expected term is among the terms extracted for ``vim``."""
        vim_terms = [u'vim', u'compat', u'version', u'unix', u'editor', u'vi',
                     u'new', u'featur', u'ad', u'multi', u'level', u'undo',
                     u'syntax', u'highlight', u'command', u'line', u'histori',
                     u'line', u'help', u'filenam', u'complet', u'block',
                     u'oper', u'fold', u'unicod', u'support', u'packag',
                     u'contain', u'version', u'vim', u'compil', u'standard',
                     u'set', u'featur', u'packag', u'doe', u'provid', u'gui',
                     u'version', u'vim', u'vim', u'packag', u'need']
        vim_terms_result = self.ml_data.get_pkg_terms(self.cache, 'vim')

        # No debug print of the result is needed: a failing assertIn already
        # reports the missing term together with the extracted terms.
        for term in vim_terms:
            self.assertIn(term, vim_terms_result)

    def test_create_row_table_list(self):
        """A label gets 1 when it is among the package elements, else 0."""
        labels_name = ['devel::editor', 'implemented-in::c', 'complet',
                       'contain', 'syntax', 'unix', 'version']
        pkg_elements = ['implemented-in::c', 'complet']

        row_list_to_assert = [0, 1, 1, 0, 0, 0, 0]
        row_list = self.ml_data.create_row_table_list(labels_name,
                                                      pkg_elements)

        self.assertEqual(row_list_to_assert, row_list)

    def test_get_pkg_classification(self):
        """The classification table has the expected row for ``vim``.

        NOTE(review): the expected row looks like the debtag flags, then the
        term flags, then the package label -- confirm against
        ``get_pkgs_table_classification``.
        """
        axi_path = "/var/lib/apt-xapian-index/index"
        axi = xapian.Database(axi_path)
        pkgs = {'vim': 'EX'}
        debtags_name = ['devel::editor', 'implemented-in::c',
                        'devel::interpreter', 'devel::lang:python']
        terms_name = ['contain', 'syntax', 'python']

        assert_pkgs_classification = {'vim': [1, 1, 0, 0, 1, 1, 0, 'EX']}

        pkgs_classification = self.ml_data.get_pkgs_table_classification(
            axi, pkgs, self.cache, debtags_name, terms_name)

        self.assertEqual(assert_pkgs_classification, pkgs_classification)
Пример #14
0
class MachineLearning(ContentBased):
    """Base class for machine-learning recommendation strategies.

    The shared flow lives in ``run``. Subclasses supply the model through
    the abstract hooks: ``get_ml_strategy``, ``prepare_pkg_data``,
    ``get_pkg_classification``, ``get_terms_path``, ``get_debtags_path``
    and ``run_train``.
    """

    __metaclass__ = ABCMeta

    # Classification data shared by all strategies. ``train`` creates it on
    # first use and reuses it afterwards.
    PKGS_CLASSIFICATIONS = None

    def __init__(self, content, profile_size, suggestion_size=200):
        """Store the configuration and open the package data sources.

        Args:
            content: Forwarded to ``ContentBased.__init__`` and stored.
            profile_size: Forwarded to ``ContentBased.__init__`` and stored.
            suggestion_size: Number of content-based suggestions requested
                before classification (default 200).
        """
        ContentBased.__init__(self, content, profile_size)
        self.content = content
        self.description = 'Machine-learning'
        self.profile_size = profile_size
        self.suggestion_size = suggestion_size
        self.cache = apt.Cache()
        self.ml_data = MachineLearningData()
        self.axi = xapian.Database(XAPIAN_DATABASE_PATH)

    def display_recommended_terms(self, terms_name, debtags_name, item_score,
                                  rec_size):
        """Print the matching terms and debtags of the best-scored packages.

        Args:
            terms_name: Terms to look for in each package's terms.
            debtags_name: Debtags to look for in each package's debtags.
            item_score: Mapping of package name to score.
            rec_size: Number of highest-scored packages to print.
        """
        ranked = sorted(item_score.items(), key=operator.itemgetter(1))
        # Keep the ``rec_size`` highest scores, then print lowest first.
        top_pkgs = [pkg for pkg, _ in reversed(ranked)][0:rec_size]
        top_pkgs.reverse()

        for pkg in top_pkgs:
            pkg_terms = self.ml_data.get_pkg_terms(self.cache, pkg)
            pkg_debtags = self.ml_data.get_pkg_debtags(self.axi, pkg)

            terms_match = [term for term in pkg_terms if term in terms_name]
            debtags_match = [debtag for debtag in pkg_debtags
                             if debtag in debtags_name]

            # Single-argument print calls produce the same output whether
            # print is a statement or a function.
            print("\n\n=")
            print("{0}".format(pkg))
            print("debtags:")
            print(debtags_match)
            print("-")
            print("terms:")
            print(terms_match)
            print("=")

    def get_item_score(self, pkgs_score, pkgs_classifications):
        """Combine each package's label offset with its own score.

        The offset is 0 for ``'RU'``, 1000 for ``'U'`` and 2000 for ``'NU'``.

        Args:
            pkgs_score: Mapping of package name to score.
            pkgs_classifications: Mapping of package name to its label.

        Returns:
            Mapping of package name to ``offset + pkgs_score[pkg]``.

        Raises:
            ValueError: a label is not one of ``'RU'``, ``'U'``, ``'NU'``.
            KeyError: a classified package has no entry in ``pkgs_score``.
        """
        order = ['RU', 'U', 'NU']
        order_values = [0, 1000, 2000]
        item_score = {}

        for pkg, classification in pkgs_classifications.items():
            offset = order_values[order.index(classification)]
            item_score[pkg] = offset + pkgs_score[pkg]

        return item_score

    def get_pkgs_and_scores(self, rec, user):
        """Collect the content-based suggestions and derive their scores.

        The suggestion is converted to text and its first line is skipped.
        Each remaining line is split on ``':'``: the first field is parsed
        as an integer, and the second field, minus its first character, is
        the package name. Lines without a ``':'`` are ignored.

        Args:
            rec: Recommender providing ``items_repository`` and
                ``valid_tags``.
            user: User whose content profile is used.

        Returns:
            A ``(pkgs, pkgs_score)`` tuple: the package names in listing
            order, and a mapping of package name to
            ``self.suggestion_size - <number>``.
        """
        profile = user.content_profile(rec.items_repository, self.content,
                                       self.suggestion_size, rec.valid_tags)

        content_based = self.get_sugestion_from_profile(rec, user,
                                                        profile,
                                                        self.suggestion_size)
        pkgs, pkgs_score = [], {}
        for pkg_line in str(content_based).splitlines()[1:]:
            fields = pkg_line.split(':')

            # A line without a separator (e.g. a blank line) has no package
            # field, and indexing it would raise IndexError.
            if len(fields) < 2:
                continue

            pkg = fields[1][1:]
            pkg_score = int(fields[0].strip())

            pkgs.append(pkg)
            pkgs_score[pkg] = self.suggestion_size - pkg_score

        return pkgs, pkgs_score

    def get_pkgs_classifications(self, pkgs, terms_name, debtags_name):
        """Classify every package that is present in the apt cache.

        Args:
            pkgs: Package names to classify.
            terms_name: Terms forwarded to ``prepare_pkg_data``.
            debtags_name: Debtags forwarded to ``prepare_pkg_data``.

        Returns:
            Mapping of package name to the value returned by
            ``get_pkg_classification``; packages missing from ``self.cache``
            are left out.
        """
        ml_strategy = self.get_ml_strategy()
        kwargs = {
            'terms_name': terms_name,
            'debtags_name': debtags_name,
            'ml_strategy': ml_strategy,
        }
        pkgs_classifications = {}

        for pkg in pkgs:
            if pkg not in self.cache:
                continue

            attribute_vector = self.prepare_pkg_data(pkg, **kwargs)
            pkgs_classifications[pkg] = self.get_pkg_classification(
                ml_strategy, attribute_vector)

        return pkgs_classifications

    def load_terms_and_debtags(self):
        """Load the pickled terms and debtags.

        Returns:
            A ``(terms_name, debtags_name)`` tuple read from the files at
            ``get_terms_path()`` and ``get_debtags_path()``.
        """
        terms_path = self.get_terms_path()
        debtags_path = self.get_debtags_path()

        # NOTE(review): pickle.load runs arbitrary code from the file, so
        # both files must come from a trusted source.
        with open(terms_path, 'rb') as terms:
            terms_name = pickle.load(terms)
        with open(debtags_path, 'rb') as debtags:
            debtags_name = pickle.load(debtags)

        return terms_name, debtags_name

    @staticmethod
    def train(cls):
        """Train ``cls`` with the shared classification data.

        This is a static method that takes the class explicitly, so callers
        pass the strategy class as the argument. The data is created only
        while ``MachineLearning.PKGS_CLASSIFICATIONS`` is ``None`` and is
        reused afterwards.
        """
        if MachineLearning.PKGS_CLASSIFICATIONS is None:
            ml_data = MachineLearningData()
            labels = ['RU', 'U', 'NU']
            MachineLearning.PKGS_CLASSIFICATIONS = ml_data.create_data(labels)

        cls.run_train(MachineLearning.PKGS_CLASSIFICATIONS)

    @abstractmethod
    def get_debtags_path(self):
        """Return the path of the pickled debtags file."""
        raise NotImplementedError("Method not implemented.")

    @abstractmethod
    def get_ml_strategy(self):
        """Return the strategy object used to classify packages."""
        raise NotImplementedError("Method not implemented.")

    @abstractmethod
    def get_pkg_classification(self, ml_strategy, attribute_vector):
        """Return the label of one package from its attribute vector."""
        raise NotImplementedError("Method not implemented.")

    @abstractmethod
    def get_terms_path(self):
        """Return the path of the pickled terms file."""
        raise NotImplementedError("Method not implemented.")

    @abstractmethod
    def prepare_pkg_data(self, pkg, **kwargs):
        """Return the attribute vector of ``pkg``.

        ``kwargs`` carries ``terms_name``, ``debtags_name`` and
        ``ml_strategy``.
        """
        raise NotImplementedError("Method not implemented.")

    @abstractmethod
    def run_train(cls, pkgs_classifications):
        """Train the concrete model with ``pkgs_classifications``.

        ``train`` calls this as ``cls.run_train(data)`` on the class itself.
        NOTE(review): implementations therefore presumably need to be class
        or static methods -- confirm in the subclasses.
        """
        raise NotImplementedError("Method not implemented.")

    def run(self, rec, user, rec_size):
        """Produce the recommendation for ``user``.

        Args:
            rec: Recommender handed to ``get_pkgs_and_scores``.
            user: User to recommend for.
            rec_size: Limit passed to the recommendation result.

        Returns:
            A ``recommender.RecommendationResult`` built from the combined
            scores.
        """
        terms_name, debtags_name = self.load_terms_and_debtags()

        pkgs, pkgs_score = self.get_pkgs_and_scores(rec, user)

        pkgs_classifications = self.get_pkgs_classifications(
            pkgs, terms_name, debtags_name)

        item_score = self.get_item_score(pkgs_score, pkgs_classifications)

        return recommender.RecommendationResult(item_score, limit=rec_size)