def _fields(annis_server):
    """Collect the metadata annotations of every text into a search-field list.

    For each ``Text`` that has a corpus, the document metadata URL is built
    via ``annis_server.url_document_metadata`` and the ``(name, value)``
    pairs returned by ``get_selected_annotation_fields`` are grouped.

    Args:
        annis_server: object providing ``url_document_metadata(corpus_name,
            title)``.

    Returns:
        A list of dicts, one per distinct annotation name, in first-seen
        order::

            {'name': name,
             'values': [{'value': value, 'texts': [text_id, ...]}, ...]}

        Texts without a corpus ID are skipped with a warning.
    """
    from texts.models import Corpus, Text

    search_fields = []  # List of dict of 'name', 'values'
    # Indexes into the structures stored in search_fields, so each annotation
    # is placed with a dict lookup instead of rescanning the lists.
    # NOTE(review): assumes names and values are hashable (e.g. strings) --
    # confirm against get_selected_annotation_fields.
    values_by_name = {}
    entry_by_name_value = {}

    all_texts = Text.objects.all()
    logger.info("Fetching metadata annotations for %d texts", len(all_texts))
    corpus_names_by_id = {
        c.id: c.annis_corpus_name for c in Corpus.objects.all()
    }

    for text in all_texts:
        if not text.corpus_id:
            logger.warning('No corpus ID for text %d %s', text.id, text.title)
            continue

        corpus_name = corpus_names_by_id.get(text.corpus_id)
        meta_query_url = annis_server.url_document_metadata(
            corpus_name, text.title)
        logger.info(text.title)

        for name, value in get_selected_annotation_fields(
                meta_query_url, ('name', 'value')):
            values_list = values_by_name.get(name)
            if values_list is None:
                # First time this annotation name is seen.
                values_list = []
                values_by_name[name] = values_list
                search_fields.append({'name': name, 'values': values_list})

            entry = entry_by_name_value.get((name, value))
            if entry is None:
                # First time this value is seen for this name.
                entry = {'value': value, 'texts': []}
                entry_by_name_value[(name, value)] = entry
                values_list.append(entry)

            entry['texts'].append(text.id)

    return search_fields
def fetch_texts(ingest_id):
    """Ingest the documents of every corpus attached to the given ingest.

    For each corpus of the ingest, the document names are queried from the
    ANNIS server. Any existing ``Text`` with the same title is deleted and a
    fresh one is created, then its metadata and visualizations are collected.
    The ingest's ``num_texts_ingested`` / ``num_corpora_ingested`` counters
    are saved as work progresses.

    Args:
        ingest_id: primary key of the ingest to process (formatted with
            ``%d`` in the error message).

    Returns:
        ``False`` when no ANNIS server is configured; ``None`` in every other
        case (ingest not found, ingestion aborted, or completed).
    """
    from texts.models import Corpus, Text
    from annis.models import AnnisServer

    # Pick the ANNIS server to query (the first one in the database).
    servers = AnnisServer.objects.all()[:1]
    if not servers:
        logger.error("No ANNIS server found")
        return False
    annis_server = servers[0]
    if not annis_server.base_domain.endswith("/"):
        annis_server.base_domain += "/"

    ingest = _retry_getting_ingest(ingest_id)
    if not ingest:
        logger.error('Ingest with ID %d not found in database', ingest_id)
        return

    logger.info("Starting virtual framebuffer")
    vdisplay = Xvfb()
    try:
        vdisplay.start()
    except Exception as e:
        # Best effort: a failure is logged and ingestion is still attempted.
        logger.error('Unable to start Xvfb: %s', e)

    try:
        ingesting_corpora = Corpus.objects.filter(
            id__in=(ingest.corpora.values_list('id', flat=True)))

        for corpus in ingesting_corpora:
            corpus_name = corpus.annis_corpus_name
            logger.info('Importing corpus ' + corpus.title)

            doc_names_url = annis_server.url_corpus_docname(corpus_name)
            doc_titles = [
                fields[0] for fields in get_selected_annotation_fields(
                    doc_names_url, ('name', ))
            ]
            logger.info('%d documents found for corpus %s: %s',
                        len(doc_titles), corpus_name, ', '.join(doc_titles))

            for title in doc_titles:
                logger.info('Importing ' + title)

                # Replace any previously ingested text with the same title.
                Text.objects.filter(title=title).delete()

                text = Text()
                text.title = title
                text.slug = str(slugify(title))
                text.corpus = corpus
                text.ingest = ingest
                text.save()

                doc_meta_url = annis_server.url_document_metadata(
                    corpus_name, text.title)
                metadata.collect_text_meta(doc_meta_url, text)
                vis.collect(corpus, text, annis_server)

                # Saved after every text, so progress is stored per text.
                ingest.num_texts_ingested += 1
                ingest.save()

            ingest.num_corpora_ingested += 1
            ingest.save()
    except VisServerRefusingConn:
        logger.error(
            'Aborting ingestion because visualization server repeatedly refused connections'
        )
    finally:
        # Always stop the framebuffer, including when an unexpected
        # exception propagates, so the Xvfb process is not leaked.
        vdisplay.stop()

    logger.info('Finished')
def fetch_texts(ingest_id):
    """Ingest document metadata and HTML visualizations for an ingest.

    Processes the corpora attached to the ingest, or every corpus in the
    database when the ingest has none attached. For each corpus the corpus
    metadata is collected and the document names are queried from the ANNIS
    server. For each document, any existing ``Text`` with the same title is
    deleted and a new one is created, then its metadata and visualizations
    are collected using a Chrome webdriver running in a virtual framebuffer.
    Afterwards ``search.process`` is run, including when ingestion was
    aborted by ``VisServerRefusingConn``.

    Args:
        ingest_id: primary key of the ingest to process (formatted with
            ``%d`` in the error message).

    Returns:
        ``False`` when no ANNIS server is configured; ``None`` in every other
        case (ingest not found, browser failed to start, ingestion aborted,
        or completed).
    """
    from texts.models import Corpus, Text
    from annis.models import AnnisServer

    # Pick the ANNIS server to query (the first one in the database).
    servers = AnnisServer.objects.all()[:1]
    if not servers:
        logger.error("No ANNIS server found")
        return False
    annis_server = servers[0]
    if not annis_server.base_domain.endswith("/"):
        annis_server.base_domain += "/"

    ingest = _retry_getting_ingest(ingest_id)
    if not ingest:
        logger.error('Ingest with ID %d not found in database', ingest_id)
        return

    corpora_ids = ingest.corpora.values_list('id', flat=True)

    logger.info("Starting virtual framebuffer")
    vdisplay = Xvfb()
    try:
        vdisplay.start()
    except Exception as e:
        # Best effort: a failure is logged and the browser start is still
        # attempted.
        logger.error('Unable to start Xvfb: %s', e)

    try:
        logger.info("Starting browser")
        try:
            driver = webdriver.Chrome(os.environ.get(
                'CHROMEDRIVER', '/usr/lib/chromium-browser/chromedriver'))
        except Exception as e:
            logger.error('Unable to start browser: %s', e)
            return  # the outer ``finally`` stops the framebuffer
        logger.info(driver)

        try:
            # An ingest with no corpora attached means "ingest everything".
            corpora = (Corpus.objects.filter(id__in=(corpora_ids))
                       if corpora_ids else Corpus.objects.all())

            for corpus in corpora:
                corpus_name = corpus.annis_corpus_name
                logger.info('Importing corpus ' + corpus.title)
                metadata.collect_corpus_meta(
                    annis_server.url_corpus_metadata(corpus_name), corpus)

                doc_names_url = annis_server.url_corpus_docname(corpus_name)
                for (title,) in get_selected_annotation_fields(
                        doc_names_url, ('name',)):
                    slug = str(slugify(title))
                    logger.info('Importing ' + title)

                    # Replace any previously ingested text with this title.
                    Text.objects.filter(title=title).delete()

                    text = Text()
                    text.title = title
                    text.slug = slug
                    # Todo why save here, and again just below?
                    # NOTE(review): presumably the text needs a primary key
                    # before the collectors attach related rows -- confirm.
                    text.save()

                    metadata.collect_text_meta(
                        annis_server.url_document_metadata(
                            corpus_name, text.title),
                        text)
                    vis.collect(corpus, text, annis_server, driver)

                    text.corpus = corpus
                    text.ingest = ingest
                    text.save()
        except VisServerRefusingConn:
            logger.error('Aborting ingestion because visualization server repeatedly refused connections')
        finally:
            # Always close the browser, including when an unexpected
            # exception propagates, so the process is not leaked.
            driver.quit()
    finally:
        # Runs even if driver.quit() raises, so Xvfb is not leaked either.
        vdisplay.stop()

    search.process(annis_server)
    logger.info('Finished')