예제 #1
0
class TestManager(unittest.TestCase):

    def setUp(self):
        self.cleanup = True
        create_database("mining.db")
        self.manager = Manager("mining.db")

    def tearDown(self):
        if self.cleanup:
            self.manager.session.close_all()
            delete_database("mining.db")

    def test_bulk_insert(self):
        arr_d_val = [
                     {'text':'1'},
                     {'text':'2'},
                     {'text':'3'},
                     {'text':'4'},
                     {'text':'5'},
                     {'text':'6'},
                     {'text':'7'}
                    ]
        for dict_val in arr_d_val:
            self.manager.bulk_insert_simple(
                dict_val=dict_val,
                instance=Word,
                list_search=['text'])
예제 #2
0
 def __init__(self, json_path, text_dir, db_url):
     self.json_path = json_path
     self.text_dir = text_dir
     self.db_url = db_url
     if not isdir(self.text_dir):
         mkdir(self.text_dir)
     self.extractor = Extractor(text_dir)
     self.manager = Manager(db_url)
예제 #3
0
    def sync(self, key, data, *a, **ka):
        m = Manager('c:\\qobuzdb\\qobuz.sqlite3')
        m.connect('c:\\qobuzdb\\qobuz.sqlite3')
#        m.create_new_database()
        try:
            self.parse(m, data['data'])
        except Exception as e:
            print "Exception %s" % (repr(e))
            raise e
        m.close()
        return False
예제 #4
0
 def __init__(self, json_path, text_dir, db_url):
     self.json_path = json_path
     self.text_dir = text_dir
     self.db_url = db_url
     if not isdir(self.text_dir):
         mkdir(self.text_dir)
     self.extractor = Extractor(text_dir)
     self.manager = Manager(db_url)
예제 #5
0
 def __init__(self, text_dir, db_url, book_url, should_download=False):
     """
     ``text_dir`` is the directory where a copy of text should be put.
     ``db_url`` should be the url to a database that already exists.
     ``should_download`` indicates whether or not ``book_url`` is a local
     path or a url in the internet.
     """
     self.text_dir = text_dir
     self.db_url = db_url
     self.book_url = book_url
     self.should_download = should_download
     self.manager = Manager(db_url)
     self.extractor = Extractor(text_dir)
예제 #6
0
 def sync(self, key, data, *a, **ka):
     m = Manager('c:\\qobuzdb\\qobuz.sqlite3')
     m.connect('c:\\qobuzdb\\qobuz.sqlite3')
     #        m.create_new_database()
     try:
         self.parse(m, data['data'])
     except Exception as e:
         print "Exception %s" % (repr(e))
         raise e
     m.close()
     return False
예제 #7
0
class Trainer(object):
    def __init__(self, json_path, text_dir, db_url):
        self.json_path = json_path
        self.text_dir = text_dir
        self.db_url = db_url
        if not isdir(self.text_dir):
            mkdir(self.text_dir)
        self.extractor = Extractor(text_dir)
        self.manager = Manager(db_url)

    def json(self):
        if not hasattr(self, "_json"):
            _json = []
            texts = {}
            with open(self.json_path, "r") as f:
                texts = json.load(f)
            for text in texts:
                author = text["Author"]
                title = text["Title"]
                period = text["Period"]
                url = text["URL"]
                _json.append((author, title, period, url))
        return _json

    def get_books(self):
        """
        Downloads the book if it's not in the texts directory.
        """
        files = [f for f in listdir(self.text_dir)]
        for author, title, period, url in self.json():
            filename = format_filename(author, title)
            if not filename in files:
                logger.debug("Getting %s" % filename)
                book = self.extractor.download_book(url, False, author, title,
                                                    period)
            else:
                logger.debug("%s already downloaded" % filename)

    def train(self):
        logger.debug("      STARTING get_books")
        self.get_books()
        logger.debug("      STARTING populate")
        self.populate()
        logger.debug("      STARTING categories")
        self.categories()
        logger.debug("      STARTING conditional_probability")
        self.conditional_probability()
        self.manager.session.close_all()

    def populate(self):
        output = []
        for author, title, period, url in self.json():
            # TODO clean the next line
            words = self.extractor.read_text(format_filename(author, title))
            if len(words) == 0:
                continue
            total_words = reduce(operator.add, words.values())
            #insert period
            dic_period = {'name': period}
            list_search = ['name']
            period_obj = self.manager.get_or_insert(dict_val=dic_period,
                                                    instance=models.Period,
                                                    list_search=list_search)
            #insert book
            # logger.debug(words)
            logger.debug("Total Words: %s", total_words)
            dic_book = {
                'name': title,
                'author': author,
                'period': period_obj,
                'total_words': total_words,
                'sentence_total': 0
            }
            list_search = ['name', 'author', 'period']
            book_obj = self.manager.get_or_insert(dict_val=dic_book,
                                                  instance=models.Book,
                                                  list_search=list_search)
            #Words
            filename = format_filename(author, title)

            if len(words) == 0:
                continue

            logger.debug("Period id : %s %s" %
                         (period_obj.id, period_obj.name))
            logger.debug("Book id : %s %s %s" %
                         (book_obj.id, book_obj.name, book_obj.author))
            self.manager.insert_words(words, book_obj, total_words)

    def categories(self):
        words_all = self.manager.get({}, Word, [], True)
        total = len(words_all)
        logger.debug("  categories Words %s" % total)
        for word_obj in words_all:
            self.calculate_categories(word_obj=word_obj)
            total -= 1
            if total % 500 == 0:
                logger.debug("Progressing Word -- Category... %s" % total)
        self.manager.session.commit()

    def calculate_categories(self, word_obj=None):
        if not word_obj:
            return False
        max_rate, min_rate = self.manager.get_max_min_rate(word_obj)
        self.manager.construct_categories(min_rate, max_rate, word_obj)

    def period_probability(self, period, log=False):
        """
        # libros de esa epoca
        ---
        # total de libros
        """
        books_period = self.manager.session.query(Book).filter_by(
            period=period).count()
        if log:
            logger.debug("      books_period = %f " % (books_period))
        return books_period

    def word_category_period_probability(self,
                                         word,
                                         category,
                                         period,
                                         log=False):
        """
        cuenta cuantos (libros de esa epoca) tienen esa palabra en esa categoria
        ---
        numero de libros de esa epoca
        """
        num_books__word_cat = 0
        books_period = self.manager.session.query(Book).filter_by(
            period=period).all()
        for book in books_period:
            #el libro contiene la palabra
            book_word = self.manager.session.query(WordCount).filter_by(
                book=book, word=word).all()
            word_category = self.manager.session.query(WordCategory).filter_by(
                category=category, word=word).one()

            #if len(book_word)==0, no relation then prob 0
            if len(book_word) > 0 and word_category:
                if book_word[0].rate >= word_category.min_range and book_word[
                        0].rate < word_category.max_range:
                    num_books__word_cat += 1
        if log:
            logger.debug("      num_books__word_cat= %f" %
                         (num_books__word_cat))

        return num_books__word_cat

    def probability(self, word, category, period, log=False):
        """
        probabilidad esa palabra en esa categoria en esa epoca
        ---
        probabilidad de esa epoca = # libros de esa epoca / cantidad de libros
        """
        word_category_period_probability = self.word_category_period_probability(
            word, category, period, log=log)
        period_probability = self.period_probability(period, log=log)
        if log:
            logger.debug(
                "  word cat period prob = %f / period prob = %f = %f" %
                (word_category_period_probability, period_probability,
                 word_category_period_probability / period_probability))
        return word_category_period_probability / period_probability

    def conditional_probability(self):
        """
        """
        self.manager.session.query(WordConditionalProbability).delete()
        bulk = []
        words_all = self.manager.session.query(Word).all()
        periods = self.manager.session.query(Period).all()
        categories = self.manager.session.query(Category).all()
        for period in periods:
            logger.debug(period.name)
            for category in categories:
                logger.debug(category.description)
                total = len(words_all)
                for word in words_all:
                    #word rate?
                    prob = self.probability(word=word,
                                            category=category,
                                            period=period)
                    if prob > 1:
                        logger.debug("word %s category %s  period %s prob %s" %
                                     (word.text, category.description,
                                      period.name, prob))
                        self.probability(word=word,
                                         category=category,
                                         period=period,
                                         log=True)
                    word_cond_prob = WordConditionalProbability(
                        word=word,
                        category=category,
                        period=period,
                        probability=prob)
                    bulk.append(word_cond_prob)
                    total -= 1
                    if total % 500 == 0:
                        logger.debug("left ... %s words" % total)
        self.manager.session.add_all(bulk)
        self.manager.session.commit()
        self.complete_probability()

    def complete_probability(self):
        bulk = []
        list_cat = ['med', 'high', 'high_high']
        cats_ids = self.manager.session.query(Category).filter(
            Category.description.in_(list_cat)).all()
        low = self.manager.session.query(Category).filter(
            Category.description == 'low').one()

        words_all = self.manager.session.query(Word).all()
        periods = self.manager.session.query(Period).all()
        for period in periods:
            total = len(words_all)
            for word in words_all:
                sum_3cat = self.manager.session.query(
                    func.sum(WordConditionalProbability.probability)).filter(
                        and_(
                            WordConditionalProbability.id_category.in_(
                                c.id for c in cats_ids),
                            WordConditionalProbability.id_word == word.id,
                            WordConditionalProbability.id_period ==
                            period.id)).all()[0][0]
                cat_low = self.manager.session.query(
                    WordConditionalProbability).filter(
                        and_(WordConditionalProbability.id_category == low.id,
                             WordConditionalProbability.id_word == word.id,
                             WordConditionalProbability.id_period ==
                             period.id)).all()
                cat_low[0].probability = 1 - sum_3cat
                # print "word_id %s period %d sum %s" %(word.id,period.id,sum_3cat)
                total -= 1
                if total % 500 == 0:
                    logger.debug("left ... %s words" % total)
        self.manager.session.commit()
예제 #8
0
class Trainer(object):

    def __init__(self, json_path, text_dir, db_url):
        self.json_path = json_path
        self.text_dir = text_dir
        self.db_url = db_url
        if not isdir(self.text_dir):
            mkdir(self.text_dir)
        self.extractor = Extractor(text_dir)
        self.manager = Manager(db_url)

    def json(self):
        if not hasattr(self, "_json"):
            _json = []
            texts = {}
            with open(self.json_path, "r") as f:
                texts = json.load(f)
            for text in texts:
                author = text["Author"]
                title = text["Title"]
                period = text["Period"]
                url = text["URL"]
                _json.append((author, title, period, url))
        return _json

    def get_books(self):
        """
        Downloads the book if it's not in the texts directory.
        """
        files = [f for f in listdir(self.text_dir)]
        for author, title, period, url in self.json():
            filename = format_filename(author, title)
            if not filename in files:
                logger.debug("Getting %s" % filename)
                book = self.extractor.download_book(url, False, author, title, period)
            else:
                logger.debug("%s already downloaded" % filename)

    def train(self):
        logger.debug("      STARTING get_books")
        self.get_books()
        logger.debug("      STARTING populate")
        self.populate()
        logger.debug("      STARTING categories")
        self.categories()
        logger.debug("      STARTING conditional_probability")
        self.conditional_probability()
        self.manager.session.close_all()    

    def populate(self):
        output = []
        for author, title, period, url in self.json():
            # TODO clean the next line
            words = self.extractor.read_text(format_filename(author, title))
            if len(words) == 0:
                continue
            total_words = reduce(operator.add, words.values())
            #insert period
            dic_period = {'name':period}
            list_search = ['name']
            period_obj = self.manager.get_or_insert(dict_val=dic_period,
                instance=models.Period, list_search=list_search)
            #insert book
            # logger.debug(words)
            logger.debug("Total Words: %s", total_words)
            dic_book = {'name':title,
                'author':author,
                'period':period_obj,
                'total_words':total_words,
                'sentence_total':0}
            list_search = ['name','author','period']
            book_obj = self.manager.get_or_insert(dict_val=dic_book,
                instance=models.Book,list_search=list_search)
            #Words
            filename = format_filename(author, title)
            
            if len(words) == 0:
                continue

            logger.debug("Period id : %s %s" % (period_obj.id,period_obj.name))
            logger.debug("Book id : %s %s %s" % (book_obj.id,book_obj.name,book_obj.author))
            self.manager.insert_words(words,book_obj,total_words)

    def categories(self):
        words_all = self.manager.get({},Word,[],True)
        total = len(words_all)
        logger.debug("  categories Words %s" % total)
        for word_obj in words_all:
            self.calculate_categories(word_obj=word_obj)
            total -= 1
            if total % 500 ==0:
                logger.debug("Progressing Word -- Category... %s" % total)
        self.manager.session.commit()

    def calculate_categories(self, word_obj=None):
        if not word_obj:
            return False
        max_rate, min_rate = self.manager.get_max_min_rate(word_obj)
        self.manager.construct_categories(min_rate,max_rate, word_obj)


    def period_probability(self, period, log=False):
        """
        # libros de esa epoca
        ---
        # total de libros
        """
        books_period = self.manager.session.query(Book).filter_by(period=period).count()
        if log:
            logger.debug("      books_period = %f " % (books_period))
        return books_period


    def word_category_period_probability(self, word, category, period, log=False):
        """
        cuenta cuantos (libros de esa epoca) tienen esa palabra en esa categoria
        ---
        numero de libros de esa epoca
        """
        num_books__word_cat = 0
        books_period = self.manager.session.query(Book).filter_by(period=period).all()
        for book in books_period:
            #el libro contiene la palabra
            book_word = self.manager.session.query(WordCount).filter_by(
                book=book,word=word).all()
            word_category = self.manager.session.query(WordCategory).filter_by(
                category=category,word=word).one()
            
            #if len(book_word)==0, no relation then prob 0 
            if len(book_word) > 0 and word_category:
                if book_word[0].rate >= word_category.min_range and book_word[0].rate < word_category.max_range:
                    num_books__word_cat += 1
        if log:
            logger.debug("      num_books__word_cat= %f" % (num_books__word_cat))

        return num_books__word_cat

    def probability(self, word, category, period, log=False):
        """
        probabilidad esa palabra en esa categoria en esa epoca
        ---
        probabilidad de esa epoca = # libros de esa epoca / cantidad de libros
        """
        word_category_period_probability = self.word_category_period_probability(word, category, period, log=log)
        period_probability = self.period_probability(period, log=log)
        if log:
            logger.debug("  word cat period prob = %f / period prob = %f = %f" % (word_category_period_probability,period_probability,word_category_period_probability/period_probability))
        return word_category_period_probability/period_probability


    def conditional_probability(self):
        """
        """
        self.manager.session.query(WordConditionalProbability).delete()
        bulk = []
        words_all = self.manager.session.query(Word).all()
        periods = self.manager.session.query(Period).all()
        categories = self.manager.session.query(Category).all()
        for period in periods:
            logger.debug(period.name)
            for category in categories:
                logger.debug(category.description)
                total = len(words_all)
                for word in words_all:
                    #word rate?
                    prob = self.probability(
                        word=word,
                        category=category,
                        period=period)
                    if prob > 1:
                        logger.debug("word %s category %s  period %s prob %s" % (word.text,category.description, period.name, prob))
                        self.probability(word=word,category=category,period=period, log=True)
                    word_cond_prob = WordConditionalProbability(
                        word=word,
                        category=category,
                        period=period,
                        probability=prob)
                    bulk.append(word_cond_prob)
                    total -= 1
                    if total % 500 == 0:
                        logger.debug("left ... %s words" % total)
        self.manager.session.add_all(bulk)
        self.manager.session.commit()
        self.complete_probability()

    def complete_probability(self):
        bulk = []
        list_cat = ['med','high','high_high']
        cats_ids = self.manager.session.query(Category).filter(Category.description.in_(list_cat)).all()
        low = self.manager.session.query(Category).filter(Category.description=='low').one()

        words_all = self.manager.session.query(Word).all()
        periods = self.manager.session.query(Period).all()
        for period in periods:
            total = len(words_all)
            for word in words_all:
                sum_3cat = self.manager.session.query(
                    func.sum(WordConditionalProbability.probability)).filter(
                        and_(WordConditionalProbability.id_category.in_(c.id for c in cats_ids),
                            WordConditionalProbability.id_word == word.id,
                            WordConditionalProbability.id_period == period.id)
                    ).all()[0][0]
                cat_low = self.manager.session.query(WordConditionalProbability).filter(
                        and_(WordConditionalProbability.id_category == low.id,
                            WordConditionalProbability.id_word == word.id,
                            WordConditionalProbability.id_period == period.id)
                    ).all()
                cat_low[0].probability = 1 - sum_3cat
                # print "word_id %s period %d sum %s" %(word.id,period.id,sum_3cat)
                total -= 1
                if total % 500 == 0:
                    logger.debug("left ... %s words" % total)
        self.manager.session.commit()
예제 #9
0
 def setUp(self):
     self.cleanup = True
     create_database("mining.db")
     self.manager = Manager("mining.db")