class PreprocessScript(ActionForm):
    form_class = PreprocessForm

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.progress_monitor = NullMonitor()

    def _articlefield_as_kwargs(self, article_field: ArticleField):
        return {k: getattr(article_field, k)
                for k in ("label", "suggested_destination", "values",
                          "possible_types", "suggested_type")}

    def run(self):
        from amcat.scripts.article_upload.upload_plugins import get_upload_plugin

        self.progress_monitor.update(0, "Preprocessing files")
        plugin_name = self.form.cleaned_data['script']
        plugin = get_upload_plugin(plugin_name)
        upload = self.form.cleaned_data['upload']
        upload.encoding_override(self.form.cleaned_data['encoding'])

        if plugin.script_cls.has_preprocess():
            # Exhaust _get_files so the preprocessing runs and reports progress on the submonitor
            filesmonitor = self.progress_monitor.submonitor(100, weight=80)
            for _ in plugin.script_cls._get_files(upload, monitor=filesmonitor):
                pass
        else:
            self.progress_monitor.update(80)

        self.progress_monitor.update(0, "Collecting fields")
        fields = [self._articlefield_as_kwargs(field)
                  for field in plugin.script_cls.get_fields(upload)]
        self.progress_monitor.update(20, "Done")
        return fields

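# Usage sketch (not from the source): run() returns one kwargs-dict per detected field,
# as built by _articlefield_as_kwargs, i.e. with the keys label, suggested_destination,
# values, possible_types and suggested_type. `script` is assumed to be a bound,
# validated PreprocessScript instance.
def example_preprocess(script: PreprocessScript):
    for field in script.run():
        print(field["label"], "->", field["suggested_destination"])
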
def remove_articles(self, articles, remove_from_index=True, monitor=NullMonitor()):
    """
    Remove articles from this articleset. Also removes CodedArticles (from codingjobs)
    and updates the index if `remove_from_index` is True.

    @param articles: articles to be removed
    @type articles: iterable of integers or Article objects
    @param remove_from_index: notify elasticsearch of changes
    @type remove_from_index: bool
    """
    monitor = monitor.submonitor(4)
    to_remove = {(art if type(art) is int else art.id) for art in articles}

    monitor.update(message="Deleting articles from database")
    ArticleSetArticle.objects.filter(articleset=self, article__in=articles).delete()

    monitor.update(message="Deleting coded articles from database")
    CodedArticle.objects.filter(codingjob__articleset=self, article__in=articles).delete()

    if remove_from_index:
        monitor.update(message="Deleting from index")
        amcates.ES().remove_from_set(self.id, to_remove)
    else:
        monitor.update()

    monitor.update(message="Deleting from cache")
    self._reset_property_cache()

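# Usage sketch (not from the source): `aset` is assumed to be an ArticleSet instance and
# the ids are illustrative. Ids and Article objects may be mixed; the index and the
# property cache are updated as part of the call.
def example_remove_articles(aset, monitor=NullMonitor()):
    aset.remove_articles([1001, 1002], remove_from_index=True, monitor=monitor)
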
def add_articles(self, article_ids, add_to_index=True, monitor=NullMonitor()):
    """
    Add the given articles to this articleset. The implementation consists of three parts:

    1. Adding ArticleSetArticle objects
    2. Adding CodedArticle objects
    3. Updating the index

    @param article_ids: articles to be added
    @type article_ids: iterable of integers or Article objects
    @param add_to_index: notify elasticsearch of changes
    @type add_to_index: bool
    """
    monitor = monitor.submonitor(total=4)
    article_ids = {(art if type(art) is int else art.id) for art in article_ids}

    # Only use articles that exist
    to_add = article_ids - self.get_article_ids()
    to_add = list(Article.exists(to_add))

    monitor.update(message="Adding {n} articles to {aset}..".format(n=len(to_add), aset=self))
    ArticleSetArticle.objects.bulk_create(
        [ArticleSetArticle(articleset=self, article_id=artid) for artid in to_add],
        batch_size=100,
    )

    monitor.update(message="{n} articleset articles added to database, adding to codingjobs..".format(n=len(to_add)))
    cjarts = [CodedArticle(codingjob=c, article_id=a)
              for c, a in itertools.product(self.codingjob_set.all(), to_add)]
    CodedArticle.objects.bulk_create(cjarts)

    if add_to_index:
        monitor.update(message="{n} articles added to codingjobs, adding to index".format(n=len(cjarts)))
        es = ES()
        es.add_to_set(self.id, to_add, monitor=monitor)
        es.refresh()  # We need to flush, or setting the cache will fail
    else:
        monitor.update(2)

    # Add to property cache
    properties = ES().get_used_properties(article_ids=to_add)
    self._add_to_property_cache(properties)

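# Usage sketch (not from the source): `aset` is assumed to be an ArticleSet instance.
# Pass add_to_index=False to skip the elasticsearch update (as create_articles does for
# freshly inserted ids, which it indexes separately); the ids used here are illustrative.
def example_add_articles(aset, monitor=NullMonitor()):
    aset.add_articles({2001, 2002, 2003}, add_to_index=True, monitor=monitor)
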
def __init__(self, form=None, file=None, **kargs):
    if form is None:
        form = self.form_class(data=kargs, files={"file": file})
    super().__init__(form)
    self.progress_monitor = NullMonitor()
    self.options = self.form.cleaned_data
    self.project = self.form.cleaned_data['project']
    self.errors = []

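# Usage sketch (not from the source): when no bound form is passed, the keyword arguments
# become the form data and `file` the uploaded file. The field names (project,
# articleset_name) and the SimpleUploadedFile content are assumptions for illustration.
def example_upload_script(upload_script_cls, project_id):
    from django.core.files.uploadedfile import SimpleUploadedFile
    f = SimpleUploadedFile("articles.csv", b"title,date\nExample,2016-01-01\n")
    return upload_script_cls(file=f, project=project_id, articleset_name="uploaded set")
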
def __init__(self, options=None, monitor=NullMonitor(), **kargs):
    """Default __init__ validates and stores the options form"""
    if self.options_form is None:
        self.options = self.options_raw = None
    else:
        self.bound_form = self._bind_form(options, **kargs)
        self._validate_form()
        self.options = self.bound_form.cleaned_data
    self.progress_monitor = monitor

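# Usage sketch (not from the source): a Script subclass is typically instantiated with its
# option values as keyword arguments, which __init__ binds to options_form and validates
# before storing the cleaned data on self.options. The field names (project, articlesets)
# are assumptions, and the subclass is assumed to implement run().
def example_run_script(script_cls, project_id, monitor=NullMonitor()):
    script = script_cls(project=project_id, articlesets=[42], monitor=monitor)
    return script.run()  # script.options already holds the validated form data
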
def add_to_set(self, setid, article_ids, monitor=NullMonitor()):
    """Add the given articles to the given set. This is done in batches, so there is
    no limit on the length of article_ids (which can be a generator)."""
    if not article_ids:
        return
    batches = list(splitlist(article_ids, itemsperbatch=1000))
    nbatches = len(batches)
    for i, batch in enumerate(batches):
        monitor.update(40 / nbatches, "Added batch {iplus}/{nbatches}".format(iplus=i + 1, nbatches=nbatches))
        self.bulk_update(batch, UPDATE_SCRIPT_ADD_TO_SET, params={'set': setid})

def add_to_set(self, setid, article_ids, monitor=NullMonitor()):
    """Add the given articles to the given set. This is done in batches, so there is
    no limit on the length of article_ids (which can be a generator)."""
    if not article_ids:
        if monitor:
            monitor.update()
        return

    batches = list(splitlist(article_ids, itemsperbatch=1000))
    monitor = monitor.submonitor(total=len(batches))
    nbatches = len(batches)
    for i, batch in enumerate(batches):
        monitor.update(message="Adding batch {iplus}/{nbatches}..".format(iplus=i + 1, nbatches=nbatches))
        self.bulk_update(batch, UPDATE_SCRIPT_ADD_TO_SET, params={'set': setid})

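# Usage sketch (not from the source): adding a (possibly generator-backed) stream of
# article ids to a set in the index. `es` is assumed to be an amcates.ES() instance and
# the set id is illustrative; batching into 1000-id bulk_update calls happens inside.
def example_index_add_to_set(es, article_ids, monitor=NullMonitor()):
    es.add_to_set(setid=123, article_ids=article_ids, monitor=monitor)
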
def add_articles(self, articles, add_to_index=True, monitor=NullMonitor()):
    """
    Add the given articles to this article set. The implementation consists of three parts:

    1. Adding ArticleSetArticle objects
    2. Adding CodedArticle objects
    3. Updating the index

    @param articles: articles to be added
    @type articles: iterable of integers or Article objects
    @param add_to_index: notify elasticsearch of changes
    @type add_to_index: bool
    """
    articles = {(art if type(art) is int else art.id) for art in articles}
    to_add = articles - self.get_article_ids()
    to_add = Article.objects.filter(pk__in=to_add).values_list("pk", flat=True)

    monitor.update(10, "{n} articles need to be added".format(n=len(to_add)))
    ArticleSetArticle.objects.bulk_create(
        [ArticleSetArticle(articleset=self, article_id=artid) for artid in to_add]
    )

    monitor.update(20, "{n} articles added to articlesets, adding to codingjobs".format(n=len(to_add)))
    CodedArticle.objects.bulk_create(
        [CodedArticle(codingjob=c, article_id=a)
         for c, a in itertools.product(self.codingjob_set.all(), to_add)]
    )

    monitor.update(30, "{n} articles added to codingjobs, adding to index".format(n=len(to_add)))
    if add_to_index:
        amcates.ES().add_to_set(self.id, to_add, monitor=monitor)

def bulk_insert(self, dicts, batch_size=1000, monitor=NullMonitor()):
    """
    Bulk insert the given articles in batches of batch_size
    """
    batches = list(toolkit.splitlist(dicts, itemsperbatch=batch_size)) if batch_size else [dicts]
    monitor = monitor.submonitor(total=len(batches))
    nbatches = len(batches)
    for i, batch in enumerate(batches):
        monitor.update(1, "Adding batch {iplus}/{nbatches}".format(iplus=i + 1, nbatches=nbatches))
        props, articles = set(), {}
        for d in batch:
            props |= (set(d.keys()) - ALL_FIELDS)
            articles[d["id"]] = serialize(d)
        self.check_properties(props)
        body = get_bulk_body(articles)
        resp = self.es.bulk(body=body, index=self.index, doc_type=settings.ES_ARTICLE_DOCTYPE)
        if resp["errors"]:
            raise ElasticSearchError(resp)

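# Usage sketch (not from the source): bulk_insert expects article dicts that already carry
# an "id"; in this codebase they are normally produced by get_article_dict() inside
# create_articles. The literal field names and values below are assumptions.
def example_bulk_insert(es, monitor=NullMonitor()):
    docs = [{"id": 1, "title": "Example article", "date": "2016-01-01"},
            {"id": 2, "title": "Another article", "date": "2016-01-02"}]
    es.bulk_insert(docs, batch_size=1000, monitor=monitor)
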
@classmethod
def create_articles(cls, articles, articleset=None, articlesets=None, deduplicate=True, monitor=NullMonitor()):
    """
    Add the given articles to the database, the index, and the given set(s).

    Duplicates are detected and have ._duplicate and .id set (and are added to sets).

    @param articles: a collection of objects with the necessary properties (.title etc)
    @param articleset(s): articleset object(s), specify either or none
    """
    monitor = monitor.submonitor(total=6)
    if articlesets is None:
        articlesets = [articleset] if articleset else []

    # Check for ids
    for a in articles:
        if a.id is not None:
            raise ValueError("Specifying explicit article ID in save not allowed")

    # Compute hashes, mark all articles as non-duplicates
    for a in articles:
        a.compute_hash()
        a._duplicate = None

    # Determine which articles are dupes of each other, *then* query the database
    # to check if the database has any articles we just got.
    if deduplicate:
        hashes = collections.defaultdict(list)  # type: Dict[bytes, List[Article]]
        for a in articles:
            if a.hash in hashes:
                a._duplicate = hashes[a.hash][0]
            else:
                hashes[a.hash].append(a)

        # Check database for duplicates
        monitor.update(message="Checking _duplicates based on hash..")
        if hashes:
            results = Article.objects.filter(hash__in=hashes.keys()).only("hash")
            for orig in results:
                dupes = hashes[orig.hash]
                for dupe in dupes:
                    dupe._duplicate = orig
                    dupe.id = orig.id
    else:
        monitor.update()

    # Save all non-duplicates
    to_insert = [a for a in articles if not a._duplicate]
    monitor.update(message="Inserting {} articles into database..".format(len(to_insert)))
    if to_insert:
        result = bulk_insert_returning_ids(to_insert)
        for a, inserted in zip(to_insert, result):
            a.id = inserted.id
        dicts = [a.get_article_dict(sets=[aset.id for aset in articlesets]) for a in to_insert]
        amcates.ES().bulk_insert(dicts, batch_size=100, monitor=monitor)
    else:
        monitor.update()

    # At this point we can still have internal duplicates. Give them an ID as well.
    for article in articles:
        if article.id is None and article._duplicate is not None:
            article.id = article._duplicate.id

    if not articlesets:
        monitor.update(3)
        return articles

    # Add new articles and _duplicates to articlesets
    monitor.update(message="Adding articles to articleset..")
    new_ids = {a.id for a in to_insert}
    dupes = {a._duplicate.id for a in articles if a._duplicate} - new_ids
    for aset in articlesets:
        if new_ids:
            aset.add_articles(new_ids, add_to_index=False, monitor=monitor)
        else:
            monitor.update()

        if dupes:
            aset.add_articles(dupes, add_to_index=True, monitor=monitor)
        else:
            monitor.update()

    # Add to articleset property caches
    properties = set()
    for article in articles:
        properties.update(article.properties.keys())

    for articleset in articlesets:
        articleset._add_to_property_cache(properties)

    return articles

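# Usage sketch (not from the source): create unsaved Article objects and add them to a set
# in one call. Hash-based deduplication means duplicates come back with .id and ._duplicate
# pointing at the existing row. The Article field values used here are assumptions.
def example_create_articles(aset):
    import datetime
    articles = [Article(project=aset.project, title="Example", text="...",
                        date=datetime.datetime(2016, 1, 1))]
    return Article.create_articles(articles, articleset=aset, deduplicate=True)
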
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.progress_monitor = NullMonitor()

def _get_rows(jobs, include_sentences=False, include_multiple=True, include_uncoded_articles=False,
              include_uncoded_sentences=False, progress_monitor=NullMonitor()):
    """
    @param jobs: output rows for these jobs. Make sure this is a QuerySet object with
                 .prefetch_related("codings__values")
    @param include_sentences: include sentence level codings (if False, row.sentence and
                              .sentence_coding are always None)
    @param include_multiple: include multiple codedarticles per article
    @param include_uncoded_articles: include articles without corresponding codings
    @param include_uncoded_sentences: also yield rows for sentences without codings
    """
    art_filter = Q(coded_articles__codingjob__in=jobs)
    if include_uncoded_articles:
        art_filter |= Q(articlesets_set__codingjob_set__in=jobs)

    job_articles = {a.id: a for a in Article.objects.filter(art_filter)}
    job_sentences = {s.id: s for s in Sentence.objects.filter(article__id__in=job_articles.keys())}

    # Mapping of article -> sentences
    article_sentences = collections.defaultdict(set)
    for sentence_id, sentence in job_sentences.items():
        article_sentences[sentence.article_id].add(sentence_id)

    # Articles that have been seen in a codingjob already (so we can skip duplicate codings
    # on the same article)
    seen_articles = set()

    for i, job in enumerate(jobs):
        # Get all codings in dicts for later lookup
        coded_articles = set()
        article_codings = {}  # {ca.id: coding}
        sentence_codings = collections.defaultdict(lambda: collections.defaultdict(list))  # {ca.id: {sentence_id: [codings]}}

        for ca in job.coded_articles.order_by('id').prefetch_related("codings__values"):
            coded_articles.add(ca)
            for c in ca.codings.all():
                if c.sentence_id is None:
                    if ca.id not in article_codings:
                        # HACK, take first entry of duplicate article codings (#79)
                        article_codings[ca.id] = c
                else:
                    sentence_codings[ca.id][c.sentence_id].append(c)

        # Output the rows for this job
        for ca in coded_articles:
            a = job_articles[ca.article_id]
            if a in seen_articles and not include_multiple:
                continue

            article_coding = article_codings.get(ca.id)
            sentence_ids = sentence_codings[ca.id]

            if include_sentences and sentence_ids:
                seen_articles.add(a)
                for sid in sentence_ids:
                    s = job_sentences[sid]
                    for sentence_coding in sentence_codings[ca.id][sid]:
                        yield CodingRow(job, ca, a, s, article_coding, sentence_coding)

                if include_uncoded_sentences:
                    non_coded_sentences = article_sentences[ca.article_id] - set(sentence_ids)
                    for sentence in map(job_sentences.get, non_coded_sentences):
                        yield CodingRow(job, ca, a, sentence, article_coding, None)
            elif article_coding:
                seen_articles.add(a)
                yield CodingRow(job, ca, a, None, article_coding, None)

        if include_uncoded_articles:
            for article in set(job_articles.values()) - seen_articles:
                yield CodingRow(job, job.get_coded_article(article), article, None, None, None)

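# Usage sketch (not from the source): _get_rows yields 6-tuples of
# (job, coded_article, article, sentence, article_coding, sentence_coding). The CodingJob
# queryset and prefetch follow the docstring above; the filter values are assumptions.
def example_rows(job_ids):
    jobs = CodingJob.objects.filter(pk__in=job_ids).prefetch_related("codings__values")
    for job, ca, article, sentence, article_coding, sentence_coding in _get_rows(
            jobs, include_sentences=True, include_uncoded_sentences=True):
        yield article.id, (sentence.id if sentence else None)
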