Exemplo n.º 1
0
def extract_shelfmarks(args):
    """Rebuild the ShelfMark rows from the shelf-marks listed on each Book.

    All existing ShelfMark rows are detached from their books and deleted.
    A ShelfMark is then created (or re-used) for every title found in
    ``book.attrs['shelfmarks']`` and linked to the book. Finally the prefix
    common to all shelf-mark titles is stripped from every title.

    :param args: Command-line arguments; only ``args.configuration`` is read
    """
    settings = get_appsettings(args.configuration)
    setup_logging(args.configuration)
    logger = logging.getLogger('explorethestacks')
    logger.info('Extracting shelf-marks')
    DBSession.configure(bind=engine_from_config(settings, 'sqlalchemy.'))
    dbsession = DBSession()

    # Empty the book collections first, then bulk-delete the shelf-marks.
    with transaction.manager:
        for existing in dbsession.query(ShelfMark):
            existing.books = []
    with transaction.manager:
        dbsession.query(ShelfMark).delete()

    processed = 0
    with transaction.manager:
        for processed, book in enumerate(dbsession.query(Book), start=1):
            for title in book.attrs['shelfmarks']:
                found = dbsession.query(ShelfMark).filter(
                    ShelfMark.title == title).first()
                shelfmark = found or ShelfMark(title=title)
                dbsession.add(shelfmark)
                shelfmark.books.append(book)
            if processed % 10000 == 0:
                dbsession.flush()
                logger.debug('%i books processed' % (processed))
    logger.debug('%i books processed' % (processed))

    # NOTE(review): with a single shelf-mark, or when one title is a prefix
    # of all others, the common prefix is a whole title and that title
    # becomes empty - confirm this is acceptable.
    titles = [mark.title for mark in dbsession.query(ShelfMark)]
    prefix_len = len(os.path.commonprefix(titles))
    with transaction.manager:
        for mark in dbsession.query(ShelfMark):
            mark.title = mark.title[prefix_len:]
    logger.info('Shelf-marks extracted')
Exemplo n.º 2
0
def main(global_config, **settings):
    """Build and return the Pyramid WSGI application.

    :param global_config: Accepted as the first positional argument but not
                          used by this function
    :param settings: Application settings; the ``sqlalchemy.``-prefixed
                     entries configure the database engine
    :return: The WSGI application created by the configurator
    """
    DBSession.configure(bind=engine_from_config(settings, 'sqlalchemy.'))
    configurator = Configurator(settings=settings)
    configurator.include('kajiki.integration.pyramid')
    configurator.add_static_view('static', 'static', cache_max_age=3600)
    views.init(configurator, settings)
    configurator.scan()
    return configurator.make_wsgi_app()
Exemplo n.º 3
0
def init_database(args):
    """Create all tables known to ``Base.metadata``.

    When ``args.drop_existing`` is truthy, all tables are dropped first.

    :param args: Command-line arguments; ``configuration`` and
                 ``drop_existing`` are read
    """
    settings = get_appsettings(args.configuration)
    setup_logging(args.configuration)
    log = logging.getLogger('explorethestacks')
    log.info('Initialising the database')
    db_engine = engine_from_config(settings, 'sqlalchemy.')
    DBSession.configure(bind=db_engine)
    metadata = Base.metadata
    if args.drop_existing:
        metadata.drop_all(db_engine)
    metadata.create_all(db_engine)
    log.info('Database initialised')
Exemplo n.º 4
0
def filter_books(args):
    """Delete every Book that has no illustrations.

    The transaction is committed after every 10000 books looked at, and the
    number of deleted books is logged at the end.

    :param args: Command-line arguments; only ``args.configuration`` is read
    """
    settings = get_appsettings(args.configuration)
    setup_logging(args.configuration)
    logger = logging.getLogger('explorethestacks')
    logger.info('Filtering books')
    DBSession.configure(bind=engine_from_config(settings, 'sqlalchemy.'))
    dbsession = DBSession()
    removed = 0
    with transaction.manager:
        for seen, book in enumerate(dbsession.query(Book), start=1):
            # Presumably re-attaches the book to the session after one of
            # the intermediate commits below - TODO confirm.
            dbsession.add(book)
            if not book.illustrations:
                dbsession.delete(book)
                removed += 1
            if seen % 10000 == 0:
                logger.debug('%i books processed' % (seen))
                transaction.commit()
    logger.info('%i books filtered' % (removed))
Exemplo n.º 5
0
def load_books(args):
    """Load books from a JSON file into the database.

    The file at ``args.source`` is expected to contain a JSON sequence of
    objects, each with an ``identifier`` key. One Book is created per object,
    with the whole object stored as its ``attrs``. The transaction is
    committed after every 10000 books.

    :param args: Command-line arguments; ``configuration`` and ``source``
                 are read
    """
    settings = get_appsettings(args.configuration)
    setup_logging(args.configuration)
    logger = logging.getLogger('explorethestacks')
    logger.info('Loading books')
    engine = engine_from_config(settings, 'sqlalchemy.')
    DBSession.configure(bind=engine)
    dbsession = DBSession()
    # Decode explicitly as UTF-8 (the standard JSON interchange encoding)
    # instead of the platform's locale-dependent default, and close the file
    # as soon as it is parsed so it is not held open during the database load.
    with open(args.source, encoding='utf-8') as f:
        books = json.load(f)
    count = 0
    with transaction.manager:
        for count, book_data in enumerate(books, start=1):
            dbsession.add(
                Book(book_identifier=book_data['identifier'],
                     attrs=book_data))
            if count % 10000 == 0:
                transaction.commit()
                logger.debug('%i books loaded', count)
    logger.info('%i books loaded', count)
Exemplo n.º 6
0
def index_data(args):
    """Index books and shelves into Elasticsearch.

    Every Book that has at least one shelf-mark is indexed into
    ``explore-the-stacks-book`` with its ``attrs`` plus the ids of the
    shelves it is on. Every Shelf is indexed into
    ``explore-the-stacks-shelf`` with its start/end labels, parent id and
    the joined output of ``recursive_text``.

    :param args: Command-line arguments; only ``args.configuration`` is read
    """
    settings = get_appsettings(args.configuration)
    setup_logging(args.configuration)
    logger = logging.getLogger('explorethestacks')
    logger.info('Indexing data')
    DBSession.configure(bind=engine_from_config(settings, 'sqlalchemy.'))
    dbsession = DBSession()
    # The hosts setting is a comma-separated list.
    hosts = [name.strip()
             for name in settings['elasticsearch.hosts'].split(',')]
    es = Elasticsearch(hosts=hosts)

    indexed = 0
    for book in dbsession.query(Book):
        if not book.shelf_marks:
            continue
        document = {'shelf_id_': [mark.shelf.id for mark in book.shelf_marks]}
        # Keys in the book's attrs take precedence over 'shelf_id_'.
        document.update(book.attrs)
        es.index(index='explore-the-stacks-book',
                 doc_type='book',
                 body=document,
                 id=book.id)
        indexed += 1
        if indexed % 1000 == 0:
            logger.debug('%i books indexed' % (indexed))
    logger.info('%i books indexed' % (indexed))

    indexed = 0
    for indexed, shelf in enumerate(dbsession.query(Shelf), start=1):
        shelf_document = {
            'start': shelf.start,
            'end': shelf.end,
            'shelf_id_': shelf.parent_id,
            'text': ' '.join(recursive_text(shelf)),
        }
        es.index(index='explore-the-stacks-shelf',
                 doc_type='shelf',
                 body=shelf_document,
                 id=shelf.id)
        if indexed % 10 == 0:
            logger.debug('%i shelves indexed' % (indexed))
    logger.info('%i shelves indexed' % (indexed))
Exemplo n.º 7
0
def load_illustrations(args):
    """Load illustrations from tab-separated files into the database.

    Walks the directory tree below ``args.source`` and reads every ``.tsv``
    file found. Each row whose ``book_identifier`` matches an existing Book
    becomes an Illustration attached to that book; rows without a matching
    book are skipped. Each file is processed in its own transaction.

    :param args: Command-line arguments; ``configuration`` and ``source``
                 are read
    """
    settings = get_appsettings(args.configuration)
    setup_logging(args.configuration)
    logger = logging.getLogger('explorethestacks')
    logger.info('Loading illustrations')
    engine = engine_from_config(settings, 'sqlalchemy.')
    DBSession.configure(bind=engine)
    dbsession = DBSession()
    int_fields = ('date', 'page', 'volume', 'image_idx')
    count = 0
    for path, _, filenames in os.walk(args.source):
        for filename in filenames:
            if not filename.endswith('.tsv'):
                continue
            with transaction.manager:
                # The csv module requires files to be opened with newline=''
                # so that newlines embedded in quoted fields are preserved.
                with open(os.path.join(path, filename), newline='') as f:
                    reader = DictReader(f, dialect='excel-tab')
                    for line in reader:
                        db_book = dbsession.query(Book).filter(
                            Book.book_identifier ==
                            line['book_identifier']).first()
                        if not db_book:
                            continue
                        for field in int_fields:
                            # Best effort: values that are not integers are
                            # kept unchanged. TypeError covers short rows,
                            # where DictReader fills missing columns with
                            # None.
                            try:
                                line[field] = int(line[field])
                            except (TypeError, ValueError):
                                pass
                        illustration = Illustration(
                            flickr_id=line['flickr_id'], attrs=line)
                        db_book.illustrations.append(illustration)
                        dbsession.add(illustration)
                        count = count + 1
                        if count % 10000 == 0:
                            logger.debug('%i illustrations loaded', count)
    logger.info('%i illustrations loaded', count)
Exemplo n.º 8
0
def create_keywords(args):
    """Generate keywords for every Shelf from a TF-IDF model of book titles.

    A gensim dictionary and corpus are built from the tokenised titles of all
    books and written to ``corpus.dict`` and ``corpus.mm`` (paths relative to
    the current working directory). A TF-IDF model is then built from the
    book titles, and for each Shelf the up to ten highest-weighted words of
    its text are stored, capitalised and comma-separated, in
    ``shelf.keywords``.

    :param args: Command-line arguments; only ``args.configuration`` is read
    """
    import spacy
    from gensim import corpora, models

    # A set gives constant-time membership tests; the stopword check runs
    # once for every token of every title and every word of every shelf.
    stopwords = {'etc', 'new'}
    for language in ['english', 'german', 'french', 'italian', 'spanish']:
        stopwords.update(
            resource_string('ets',
                            'data/%s' % language).decode('utf-8').split('\n'))
    nlp = spacy.load('en')

    def title_words(book):
        """Return the tokens of the book's titles that are longer than one
        character and are not stopwords."""
        # Assumes book.attrs['title'] is a sequence of title strings - TODO
        # confirm against the loaded data.
        return [
            token.orth_ for title in book.attrs['title']
            for token in nlp(title)
            if token.orth_.lower() not in stopwords and len(token.orth_) > 1
        ]

    class BookCorpus(object):
        """Iterable yielding one bag-of-words per book in the query."""

        def __init__(self, query, dictionary):
            self.query = query
            self.dictionary = dictionary

        def __iter__(self):
            for book in self.query:
                yield self.dictionary.doc2bow(title_words(book))

    settings = get_appsettings(args.configuration)
    setup_logging(args.configuration)
    logger = logging.getLogger('explorethestacks')
    logger.info('Creating keywords')
    engine = engine_from_config(settings, 'sqlalchemy.')
    DBSession.configure(bind=engine)
    dbsession = DBSession()
    logger.info('Creating dictionary')
    dictionary = corpora.dictionary.Dictionary()
    for book in dbsession.query(Book):
        dictionary.doc2bow(title_words(book), allow_update=True)
    # Drop very rare and very common tokens using gensim's default
    # thresholds, without limiting the dictionary size.
    dictionary.filter_extremes(keep_n=None)
    dictionary.compactify()
    dictionary.save('corpus.dict')
    logger.info('Creating corpus')
    corpora.MmCorpus.serialize('corpus.mm',
                               BookCorpus(dbsession.query(Book), dictionary))
    dictionary = corpora.dictionary.Dictionary.load('corpus.dict')
    model = models.tfidfmodel.TfidfModel(
        BookCorpus(dbsession.query(Book), dictionary))
    logger.info('Processing shelves')
    with transaction.manager:
        for shelf in dbsession.query(Shelf):
            text = ' '.join(recursive_text(shelf))
            for sep in [
                    '.', ',', ';', ':', '-', '_', '?', '!', '(', ')', '[', ']',
                    '{', '}'
            ]:
                text = text.replace(sep, ' ')
            # NOTE(review): shelf words are lower-cased here, but the
            # dictionary tokens above keep their original case, so only
            # tokens that appear in lower case in titles can match - confirm
            # whether the dictionary tokens should be lower-cased as well.
            doc = [
                w.lower() for w in text.split(' ')
                if w.lower() not in stopwords and len(w) > 1
            ]
            topics = model[dictionary.doc2bow(doc)]
            # Highest TF-IDF weight first; keep the top ten words.
            topics.sort(key=lambda k: k[1], reverse=True)
            keywords = [dictionary[t[0]] for t in topics[0:10]]
            shelf.keywords = ', '.join(
                [k[0].upper() + k[1:] for k in keywords])
    logger.info('Keywords created')
Exemplo n.º 9
0
def create_shelves(args):
    """Group shelf-marks into shelves and build the shelf hierarchy.

    Steps, each in its own transaction:

    1. Delete all existing Shelf rows.
    2. Walk the shelf-marks ordered by title and put them on numbered
       shelves, starting a new shelf whenever adding the next shelf-mark
       would take the current shelf over 200 books.
    3. While more than 50 shelves have no parent, group the parentless
       shelves under new parent shelves.
    4. Create a root shelf and make it the parent of the remaining
       parentless shelves.
    5. Set ``start``/``end`` on every shelf from its first/last child or
       shelf-mark, and label the root 'Explore the Stacks'.

    :param args: Command-line arguments; only ``args.configuration`` is read
    """
    settings = get_appsettings(args.configuration)
    setup_logging(args.configuration)
    logger = logging.getLogger('explorethestacks')
    logger.info('Creating shelves')
    DBSession.configure(bind=engine_from_config(settings, 'sqlalchemy.'))
    dbsession = DBSession()

    def parentless_shelves():
        """Return a fresh query for the shelves that have no parent."""
        # SQLAlchemy needs "== None" here (not "is None").
        return dbsession.query(Shelf).filter(
            Shelf.parent_id == None)  # noqa: E711

    with transaction.manager:
        dbsession.query(Shelf).delete()

    current_shelf = None
    books_on_shelf = 0
    order = 0
    processed = 0
    with transaction.manager:
        ordered_marks = dbsession.query(ShelfMark).order_by(ShelfMark.title)
        for processed, shelf_mark in enumerate(ordered_marks, start=1):
            # Start a new shelf for the very first shelf-mark, or when this
            # shelf-mark would push the current shelf past 200 books.
            if (not current_shelf
                    or books_on_shelf + len(shelf_mark.books) > 200):
                order += 1
                current_shelf = Shelf(order=order)
                dbsession.add(current_shelf)
                books_on_shelf = 0
            current_shelf.shelf_marks.append(shelf_mark)
            books_on_shelf += len(shelf_mark.books)
            if processed % 10000 == 0:
                dbsession.flush()
                logger.debug('%s shelfmarks processed' % (processed))
    logger.debug('%s shelfmarks processed' % (processed))

    logger.debug('Creating shelf hierarchy')
    with transaction.manager:
        while parentless_shelves().count() > 50:
            order = 0
            parent_shelf = None
            child_count = 0
            for orphan in parentless_shelves().order_by(Shelf.order):
                # NOTE(review): the "> 50" test lets a parent collect 51
                # children before a new one is started - confirm intended.
                if not parent_shelf or child_count > 50:
                    order += 1
                    parent_shelf = Shelf(order=order)
                    dbsession.add(parent_shelf)
                    child_count = 0
                orphan.parent = parent_shelf
                child_count += 1

    with transaction.manager:
        root_shelf = Shelf()
        dbsession.add(root_shelf)
        for orphan in parentless_shelves():
            if orphan != root_shelf:
                orphan.parent = root_shelf

    logger.debug('Creating shelf titles')

    def assign_titles(node):
        """Recursively set start/end on ``node`` from its children or, when
        it has none, from its shelf-marks."""
        children = node.children
        if children:
            for child in children:
                assign_titles(child)
            node.start = children[0].start
            node.end = children[-1].end
        elif node.shelf_marks:
            marks = node.shelf_marks
            node.start = marks[0].title
            node.end = marks[-1].title

    with transaction.manager:
        root_shelf = parentless_shelves().first()
        assign_titles(root_shelf)
        root_shelf.start = 'Explore the Stacks'
        root_shelf.end = 'Explore the Stacks'
    logger.info('Shelves created')