예제 #1
0
def process_staged_urls():
    """Process every staged URL into Catalog and Document records.

    For each ``UrlStage`` row the matching ``Place`` (by OCD division id)
    and ``Event`` (by OCD division id, record date and name) are looked up.
    Rows with no matching place or event are reported and skipped instead
    of aborting the whole run.

    If a ``Catalog`` entry with the same ``url_hash`` already exists, only a
    new document reference pointing at it is saved. Otherwise the file is
    fetched through ``Media(url_record).gather()``; on a truthy result a new
    ``Catalog`` entry is saved with that result as its location, followed by
    the document reference. A falsy download result is reported and the row
    is skipped.

    Progress is reported with ``print``. The function returns ``None``.
    """

    engine = db_connect()
    create_tables(engine)
    Session = sessionmaker(bind=engine)
    session = Session()

    # NOTE: copying staged events (copy_event_from_stage over EventStage)
    # used to run here and is currently disabled.

    try:
        for url_record in session.query(UrlStage).all():
            place_record = session.query(Place). \
                filter(Place.ocd_division_id == url_record.ocd_division_id) \
                .first()
            event_record = session.query(Event). \
                filter(Event.ocd_division_id == url_record.ocd_division_id,
                       Event.record_date == url_record.event_date,
                       Event.name == url_record.event).first()

            # .first() yields None when nothing matches; without this guard
            # a single unmatched row would raise AttributeError below and
            # stop processing of every remaining staged URL.
            if place_record is None or event_record is None:
                print(f'Skipping {url_record.url}: '
                      'no matching place or event record')
                continue

            print(f'place id: {place_record.id}\n event_id:{event_record.id}')

            catalog_entry = session.query(Catalog). \
                filter(Catalog.url_hash == url_record.url_hash).first()

            # Document already exists in catalog: just add a reference.
            if catalog_entry:
                catalog_id = catalog_entry.id
                print(f'catalog_id---------{catalog_id}')
                document = map_document(
                    url_record, place_record.id, event_record.id, catalog_id)
                save_record(document)
                print("existing in catalog adding reference to document")
                continue

            print("Does not exist")

            # Download the document.
            result = Media(url_record).gather()
            if not result:
                # Previously a failed download was dropped without a trace.
                print(f'Download failed for {url_record.url}; skipping')
                continue

            # Add to doc catalog, using the download result as the location.
            catalog = Catalog(
                url=url_record.url,
                url_hash=url_record.url_hash,
                location=result,
                filename=f'{url_record.url_hash}.pdf'
                )
            # save_record's return value is used as the new record's id
            # (presumably the primary key -- confirm against save_record).
            catalog_id = save_record(catalog)

            # Add document reference.
            document = map_document(
                url_record, place_record.id, event_record.id, catalog_id)
            doc_id = save_record(document)

            print(f'Added {url_record.url_hash} doc_id: {doc_id}')
    finally:
        # Always release the session's connection, even if a row fails.
        session.close()