示例#1
0
def _fields(annis_server):
	"""
	Collect the document metadata annotations of every text into a search index.

	For each Text that has a corpus ID, the document metadata URL is built from
	annis_server and queried for ('name', 'value') annotation pairs. Texts
	without a corpus ID are skipped with a warning.

	:param annis_server: object providing url_document_metadata(corpus_name, title)
	:return: list of dicts of the form
		{'name': <annotation name>,
		 'values': [{'value': <annotation value>, 'texts': [<text id>, ...]}, ...]}
		Names, and values within a name, appear in the order first encountered.
	"""
	from texts.models import Text

	all_texts = Text.objects.all()
	logger.info("Fetching metadata annotations for %d texts", len(all_texts))

	corpus_names_by_id = {c.id: c.annis_corpus_name for c in Corpus.objects.all()}

	search_fields = []  # List of dict of 'name', 'values'

	# Indexes into search_fields, so each annotation is placed with a dict
	# lookup instead of rescanning the lists built so far.
	# NOTE(review): assumes names and values are hashable (strings) - confirm
	# against get_selected_annotation_fields.
	values_by_name = {}  # name -> that field's 'values' list
	texts_by_name_value = {}  # (name, value) -> that value's 'texts' list

	for text in all_texts:
		if not text.corpus_id:
			logger.warning('No corpus ID for text %d %s', text.id, text.title)
			continue

		corpus_name = corpus_names_by_id.get(text.corpus_id)
		meta_query_url = annis_server.url_document_metadata(corpus_name, text.title)
		logger.info(text.title)

		for name, value in get_selected_annotation_fields(meta_query_url, ('name', 'value')):
			text_ids = texts_by_name_value.get((name, value))
			if text_ids is not None:
				# Known name/value pair: just record one more text for it.
				text_ids.append(text.id)
				continue

			text_ids = texts_by_name_value[(name, value)] = [text.id]

			values_list = values_by_name.get(name)
			if values_list is None:
				# First time this annotation name is seen: open a new field.
				values_list = values_by_name[name] = []
				search_fields.append({
					'name': name,
					'values': values_list
				})

			values_list.append({
				'value': value,
				'texts': text_ids
			})

	return search_fields
示例#2
0
文件: ingest.py 项目: adampmoore/cts
def fetch_texts(ingest_id):
    """
    Ingest the documents of every corpus attached to the given ingest.

    For each corpus of the ingest, the document names are queried from the
    first configured ANNIS server. Any existing Text with the same title is
    deleted and a fresh Text is saved, after which its metadata and
    visualizations are collected. The ingest's num_texts_ingested and
    num_corpora_ingested counters are saved as work progresses.

    Ingestion is aborted (with an error logged) when VisServerRefusingConn is
    raised. The virtual framebuffer is stopped however the ingestion ends.

    :param ingest_id: ID of the ingest to process
    :return: False when no ANNIS server is configured, otherwise None
    """
    from texts.models import Corpus, Text
    from annis.models import AnnisServer

    # Define HTML Formats and the ANNIS server to query
    annis_server = AnnisServer.objects.all()[:1]

    if annis_server:
        annis_server = annis_server[0]
        # Normalised in memory only; the server record is not saved here.
        if not annis_server.base_domain.endswith("/"):
            annis_server.base_domain += "/"
    else:
        logger.error("No ANNIS server found")
        return False

    ingest = _retry_getting_ingest(ingest_id)
    if not ingest:
        logger.error('Ingest with ID %d not found in database' % ingest_id)
        return

    logger.info("Starting virtual framebuffer")
    vdisplay = Xvfb()
    try:
        vdisplay.start()
    except Exception as e:
        # Best effort: ingestion is still attempted without the framebuffer.
        logger.error('Unable to start Xvfb: %s' % e)

    try:
        ingesting_corpora = Corpus.objects.filter(
            id__in=(ingest.corpora.values_list('id', flat=True)))

        for corpus in ingesting_corpora:
            corpus_name = corpus.annis_corpus_name
            logger.info('Importing corpus %s', corpus.title)
            doc_names_url = annis_server.url_corpus_docname(corpus_name)
            doc_titles = [
                fields[0] for fields in get_selected_annotation_fields(
                    doc_names_url, ('name', ))
            ]
            logger.info('%d documents found for corpus %s: %s',
                        len(doc_titles), corpus_name, ', '.join(doc_titles))

            for title in doc_titles:
                logger.info('Importing %s', title)

                # Replace any previously ingested text with the same title.
                Text.objects.filter(title=title).delete()

                text = Text()
                text.title = title
                text.slug = str(slugify(title))
                text.corpus = corpus
                text.ingest = ingest
                text.save()

                doc_meta_url = annis_server.url_document_metadata(
                    corpus_name, text.title)
                metadata.collect_text_meta(doc_meta_url, text)
                vis.collect(corpus, text, annis_server)

                ingest.num_texts_ingested += 1
                ingest.save()

            ingest.num_corpora_ingested += 1
            ingest.save()
    except VisServerRefusingConn:
        logger.error(
            'Aborting ingestion because visualization server repeatedly refused connections'
        )
    finally:
        # Stop the framebuffer even when an unexpected exception propagates,
        # so the Xvfb process is not left behind.
        vdisplay.stop()

    logger.info('Finished')
示例#3
0
def fetch_texts( ingest_id ):
	"""
	For the corpora attached to the given ingest (or all corpora in the database
	when the ingest lists none), query the document names and ingest the
	metadata and html visualizations for all document names.

	Any existing Text with the same title as a document is deleted and replaced.
	Ingestion is aborted (with an error logged) when VisServerRefusingConn is
	raised. The browser and the virtual framebuffer are shut down however the
	ingestion ends; afterwards search.process is run for the ANNIS server.

	:param ingest_id: ID of the ingest to process
	:return: False when no ANNIS server is configured, otherwise None
	"""

	from texts.models import Corpus, Text
	from annis.models import AnnisServer

	# Define HTML Formats and the ANNIS server to query
	annis_server = AnnisServer.objects.all()[:1]

	if annis_server:
		annis_server = annis_server[0]
		# Normalised in memory only; the server record is not saved here.
		if not annis_server.base_domain.endswith("/"):
			annis_server.base_domain += "/"
	else:
		logger.error("No ANNIS server found")
		return False

	ingest = _retry_getting_ingest(ingest_id)
	if not ingest:
		logger.error('Ingest with ID %d not found in database' % ingest_id)
		return

	corpora_ids = ingest.corpora.values_list('id', flat=True)

	logger.info("Starting virtual framebuffer")
	vdisplay = Xvfb()
	try:
		vdisplay.start()
	except Exception as e:
		# Best effort: the browser start below is still attempted.
		logger.error('Unable to start Xvfb: %s' % e)
	logger.info("Starting browser")
	try:
		driver = webdriver.Chrome(os.environ.get('CHROMEDRIVER', '/usr/lib/chromium-browser/chromedriver'))
	except Exception as e:
		logger.error('Unable to start browser: %s' % e)
		vdisplay.stop()
		return
	logger.info(driver)

	try:
		# An ingest without explicit corpora means "ingest every corpus".
		if corpora_ids:
			corpora = Corpus.objects.filter(id__in=(corpora_ids))
		else:
			corpora = Corpus.objects.all()

		for corpus in corpora:
			corpus_name = corpus.annis_corpus_name
			logger.info('Importing corpus %s', corpus.title)
			metadata.collect_corpus_meta(annis_server.url_corpus_metadata(corpus_name), corpus)

			for title, in get_selected_annotation_fields(annis_server.url_corpus_docname(corpus_name), ('name',)):
				slug = str(slugify(title))

				logger.info('Importing %s', title)

				# Replace any previously ingested text with the same title.
				Text.objects.filter(title=title).delete()

				text = Text()
				text.title = title
				text.slug = slug
				# Todo why save here, and again just below?
				# NOTE(review): presumably the collectors below need a saved text
				# (primary key) to attach records to - confirm before merging the saves.
				text.save()

				metadata.collect_text_meta(annis_server.url_document_metadata(corpus_name, text.title), text)
				vis.collect(corpus, text, annis_server, driver)

				text.corpus = corpus
				text.ingest = ingest
				text.save()
	except VisServerRefusingConn:
		logger.error('Aborting ingestion because visualization server repeatedly refused connections')
	finally:
		# Release the browser and the framebuffer even when an unexpected
		# exception propagates; the framebuffer is stopped even if quitting
		# the browser itself fails.
		try:
			driver.quit()
		finally:
			vdisplay.stop()

	search.process(annis_server)

	logger.info('Finished')