Example #1
File: upload.py Project: amcat/amcat
class PreprocessScript(ActionForm):
    form_class = PreprocessForm

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.progress_monitor = NullMonitor()

    def _articlefield_as_kwargs(self, article_field: ArticleField):
        return {k: getattr(article_field, k)
                for k in ("label", "suggested_destination", "values", "possible_types", "suggested_type")}

    def run(self):
        from amcat.scripts.article_upload.upload_plugins import get_upload_plugin
        self.progress_monitor.update(0, "Preprocessing files")
        plugin_name = self.form.cleaned_data['script']
        plugin = get_upload_plugin(plugin_name)
        upload = self.form.cleaned_data['upload']
        upload.encoding_override(self.form.cleaned_data['encoding'])
        if plugin.script_cls.has_preprocess():
            filesmonitor = self.progress_monitor.submonitor(100, weight=80)
            for _ in plugin.script_cls._get_files(upload, monitor=filesmonitor):
                pass
        else:
            self.progress_monitor.update(80)
        self.progress_monitor.update(0, "Collecting fields")
        fields = [self._articlefield_as_kwargs(field) for field in
                  plugin.script_cls.get_fields(upload)]

        self.progress_monitor.update(20, "Done")
        return fields
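
All of these examples share the same progress-monitor protocol: update(units, message) advances the monitor, and submonitor(total, weight) derives a child monitor whose progress maps onto a slice of the parent. The sketch below is inferred from these call sites only; amcat's actual ProgressMonitor/NullMonitor implementation may differ.

# Minimal sketch of the monitor protocol these examples assume.
# Illustrative only; amcat's real classes may differ.
class NullMonitor:
    def __init__(self, total=100):
        self.total = total
        self.done = 0

    def update(self, units=1, message=None):
        # Advance progress; a real monitor would also log/emit `message`.
        self.done += units

    def submonitor(self, total=100, weight=1):
        # A real implementation returns a child whose `total` units map onto
        # `weight` units of this parent; the null version just returns a
        # fresh no-op monitor.
        return NullMonitor(total=total)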
Example #2
    def remove_articles(self,
                        articles,
                        remove_from_index=True,
                        monitor=NullMonitor()):
        """
        Remove articles from this articleset. Also removes the corresponding CodedArticles
        (from codingjobs) and updates the index if `remove_from_index` is True.

        @param articles: articles to be removed
        @type articles: iterable of Article objects or article ids (int)

        @param remove_from_index: notify elasticsearch of changes
        @type remove_from_index: bool
        """
        monitor = monitor.submonitor(4)
        to_remove = {(art if type(art) is int else art.id) for art in articles}

        monitor.update(message="Deleting articles from database")
        # Use the normalized id set so a one-shot iterable isn't consumed twice
        ArticleSetArticle.objects.filter(articleset=self,
                                         article__in=to_remove).delete()

        monitor.update(message="Deleting coded articles from database")
        CodedArticle.objects.filter(codingjob__articleset=self,
                                    article__in=to_remove).delete()

        if remove_from_index:
            monitor.update(message="Deleting from index")
            amcates.ES().remove_from_set(self.id, to_remove)
        else:
            monitor.update()

        monitor.update(message="Deleting from cache")
        self._reset_property_cache()
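
One detail worth noting: monitor=NullMonitor() is a mutable default argument, evaluated once at definition time, so every call that omits the monitor shares the same instance. That is harmless for a no-op monitor, but the defensive Python idiom uses a None default. A minimal sketch, not amcat's code:

# Safer variant of the signature above (a sketch):
def remove_articles(self, articles, remove_from_index=True, monitor=None):
    # Falling back to a fresh NullMonitor per call avoids sharing state.
    monitor = (monitor or NullMonitor()).submonitor(4)
    ...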
Example #3
    def add_articles(self,
                     article_ids,
                     add_to_index=True,
                     monitor=NullMonitor()):
        """
        Add the given articles to this articleset. The implementation consists of three parts:

          1. Adding ArticleSetArticle objects
          2. Adding CodedArticle objects
          3. Updating index

        @param article_ids: articles to be added
        @type article_ids: iterable of Article objects or article ids (int)

        @param add_to_index: notify elasticsearch of changes
        @type add_to_index: bool
        """
        monitor = monitor.submonitor(total=4)

        article_ids = {(art if type(art) is int else art.id)
                       for art in article_ids}

        # Only use articles that exist
        to_add = article_ids - self.get_article_ids()
        to_add = list(Article.exists(to_add))

        monitor.update(message="Adding {n} articles to {aset}..".format(
            n=len(to_add), aset=self))
        ArticleSetArticle.objects.bulk_create(
            [
                ArticleSetArticle(articleset=self, article_id=artid)
                for artid in to_add
            ],
            batch_size=100,
        )

        monitor.update(
            message=
            "{n} articleset articles added to database, adding to codingjobs.."
            .format(n=len(to_add)))
        cjarts = [
            CodedArticle(codingjob=c, article_id=a)
            for c, a in itertools.product(self.codingjob_set.all(), to_add)
        ]
        CodedArticle.objects.bulk_create(cjarts)

        if add_to_index:
            monitor.update(
                message="{n} articles added to codingjobs, adding to index".
                format(n=len(cjarts)))
            es = ES()
            es.add_to_set(self.id, to_add, monitor=monitor)
            es.refresh()  # We need to flush, or setting cache will fail
        else:
            monitor.update(2)

        # Add to property cache
        properties = ES().get_used_properties(article_ids=to_add)
        self._add_to_property_cache(properties)
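
Article.exists in step 1 narrows the candidate ids to those actually present in the database. Example #8 below performs the same check inline, which suggests an implementation roughly like this sketch (the real classmethod may differ):

# Plausible implementation of Article.exists, inferred from the inline
# query in Example #8; the real method may differ.
@classmethod
def exists(cls, article_ids):
    # Keep only the ids that exist in the database.
    return cls.objects.filter(pk__in=article_ids).values_list("pk", flat=True)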
Example #4
 def __init__(self, form=None, file=None, **kargs):
     if form is None:
         form = self.form_class(data=kargs, files={"file": file})
     super().__init__(form)
     self.progress_monitor = NullMonitor()
     self.options = self.form.cleaned_data
     self.project = self.form.cleaned_data['project']
     self.errors = []
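
This constructor lets callers pass a raw file rather than a pre-built form, wrapping it into the form's files dict itself. A hypothetical call site, assuming form_class declares a FileField named "file" and a project field (names inferred from this snippet, not confirmed against amcat's actual forms):

# Hypothetical usage sketch; SomeUploadScript stands in for a concrete
# upload script subclass.
from django.core.files.uploadedfile import SimpleUploadedFile

f = SimpleUploadedFile("articles.csv", b"title,text\n...")
script = SomeUploadScript(file=f, project=1)  # 1 = an existing project id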
Example #5
 def __init__(self, options=None, monitor=NullMonitor(), **kargs):
     """Default __init__ validates and stores the options form"""
     if self.options_form is None:
         self.options = self.options_raw = None
     else:
         self.bound_form = self._bind_form(options, **kargs)
         self._validate_form()
         self.options = self.bound_form.cleaned_data
     self.progress_monitor = monitor
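
_bind_form and _validate_form are not shown here; since cleaned_data is read immediately afterwards, _validate_form presumably raises on an invalid form. A minimal sketch of that assumed behaviour, not amcat's actual code:

# Assumed behaviour of _validate_form: fail loudly on an invalid form so
# that cleaned_data is safe to read afterwards.
def _validate_form(self):
    if not self.bound_form.is_valid():
        raise ValueError("Invalid form: {}".format(self.bound_form.errors))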
Example #6
 def add_to_set(self, setid, article_ids, monitor=NullMonitor()):
     """Add the given articles to the given set. This is done in batches, so there
     is no limit on the length of article_ids (which can be a generator)."""
     if not article_ids: return
     batches = list(splitlist(article_ids, itemsperbatch=1000))
     nbatches = len(batches)
     for i, batch in enumerate(batches):
         monitor.update(40/nbatches, "Adding batch {iplus}/{nbatches}".format(iplus=i+1, **locals()))
         # Bug fix: update the current batch, not the full article_ids list
         # (compare the corrected version in Example #7)
         self.bulk_update(batch, UPDATE_SCRIPT_ADD_TO_SET, params={'set': setid})
Example #7
File: amcates.py Project: isususi/amcat
    def add_to_set(self, setid, article_ids, monitor=NullMonitor()):
        """Add the given articles to the given set. This is done in batches, so there
        is no limit on the length of article_ids (which can be a generator)."""

        if not article_ids:
            if monitor:
                monitor.update()
            return

        batches = list(splitlist(article_ids, itemsperbatch=1000))
        monitor = monitor.submonitor(total=len(batches))

        nbatches = len(batches)
        for i, batch in enumerate(batches):
            monitor.update(message="Adding batch {iplus}/{nbatches}..".format(
                iplus=i + 1, nbatches=nbatches))
            self.bulk_update(batch,
                             UPDATE_SCRIPT_ADD_TO_SET,
                             params={'set': setid})
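
Both versions of add_to_set lean on splitlist to chunk an iterable of unbounded length (including generators) into fixed-size batches. A self-contained sketch of such a helper; amcat's toolkit.splitlist may differ in details:

import itertools

def splitlist(iterable, itemsperbatch=100):
    """Yield successive lists of at most `itemsperbatch` items.

    Works on generators too, since it never calls len(). A sketch of what
    toolkit.splitlist plausibly does; the real helper may differ.
    """
    it = iter(iterable)
    while True:
        batch = list(itertools.islice(it, itemsperbatch))
        if not batch:
            return
        yield batch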
Example #8
    def add_articles(self, articles, add_to_index=True, monitor=NullMonitor()):
        """
        Add the given articles to this article set. The implementation consists of three parts:

          1. Adding ArticleSetArticle objects
          2. Adding CodedArticle objects
          3. Updating index

        @param articles: articles to be added
        @type articles: iterable of Article objects or article ids (int)

        @param add_to_index: notify elasticsearch of changes
        @type add_to_index: bool
        """
        articles = {(art if type(art) is int else art.id) for art in articles}
        to_add = articles - self.get_article_ids()
        to_add = Article.objects.filter(pk__in=to_add).values_list("pk",
                                                                   flat=True)

        monitor.update(10,
                       "{n} articles need to be added".format(n=len(to_add)))
        ArticleSetArticle.objects.bulk_create([
            ArticleSetArticle(articleset=self, article_id=artid)
            for artid in to_add
        ])

        monitor.update(
            20,
            "{n} articles added to articlesets, adding to codingjobs".format(
                n=len(to_add)))
        CodedArticle.objects.bulk_create([
            CodedArticle(codingjob=c, article_id=a)
            for c, a in itertools.product(self.codingjob_set.all(), to_add)
        ])

        monitor.update(
            30, "{n} articles added to codingjobs, adding to index".format(
                n=len(to_add)))
        if add_to_index:
            amcates.ES().add_to_set(self.id, to_add, monitor=monitor)
Example #9
File: amcates.py Project: isususi/amcat
 def bulk_insert(self, dicts, batch_size=1000, monitor=NullMonitor()):
     """
     Bulk insert the given articles in batches of batch_size
     """
     batches = list(toolkit.splitlist(
         dicts, itemsperbatch=batch_size)) if batch_size else [dicts]
     monitor = monitor.submonitor(total=len(batches))
     nbatches = len(batches)
     for i, batch in enumerate(batches):
         monitor.update(
             1, "Adding batch {iplus}/{nbatches}".format(iplus=i + 1,
                                                         **locals()))
         props, articles = set(), {}
         for d in batch:
             props |= (set(d.keys()) - ALL_FIELDS)
             articles[d["id"]] = serialize(d)
         self.check_properties(props)
         body = get_bulk_body(articles)
         resp = self.es.bulk(body=body,
                             index=self.index,
                             doc_type=settings.ES_ARTICLE_DOCTYPE)
         if resp["errors"]:
             raise ElasticSearchError(resp)
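
get_bulk_body turns the {id: serialized_article} mapping into Elasticsearch's newline-delimited bulk format: one action line plus one document line per article. A sketch under that assumption; the real helper may differ, e.g. in how it sets index metadata:

import json

def get_bulk_body(articles):
    """Build a newline-delimited ES bulk body from {id: serialized_doc}.

    Based on the standard Elasticsearch bulk format; amcat's actual
    get_bulk_body may differ.
    """
    lines = []
    for article_id, doc in articles.items():
        # Action line, then the document itself (already-serialized strings
        # are passed through as-is).
        lines.append(json.dumps({"index": {"_id": article_id}}))
        lines.append(doc if isinstance(doc, str) else json.dumps(doc))
    return "\n".join(lines) + "\n"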
Example #10
File: article.py Project: isususi/amcat
    def create_articles(cls,
                        articles,
                        articleset=None,
                        articlesets=None,
                        deduplicate=True,
                        monitor=NullMonitor()):
        """
        Add the given articles to the database, the index, and the given set

        Duplicates are detected and have ._duplicate and .id set (and are added to sets)

        @param articles: a collection of objects with the necessary properties (.title etc)
        @param articleset(s): articleset object(s), specify either or none
        """
        monitor = monitor.submonitor(total=6)
        if articlesets is None:
            articlesets = [articleset] if articleset else []

        # Check for ids
        for a in articles:
            if a.id is not None:
                raise ValueError(
                    "Specifying explicit article ID in save not allowed")

        # Compute hashes, mark all articles as non-duplicates
        for a in articles:
            a.compute_hash()
            a._duplicate = None

        # Determine which articles are dupes of each other, *then* query the database
        # to check if the database has any articles we just got.
        if deduplicate:
            hashes = collections.defaultdict(
                list)  # type: Dict[bytes, List[Article]]

            for a in articles:
                if a.hash in hashes:
                    a._duplicate = hashes[a.hash][0]
                else:
                    hashes[a.hash].append(a)

            # Check database for duplicates
            monitor.update(message="Checking _duplicates based on hash..")
            if hashes:
                results = Article.objects.filter(
                    hash__in=hashes.keys()).only("hash")
                for orig in results:
                    dupes = hashes[orig.hash]
                    for dupe in dupes:
                        dupe._duplicate = orig
                        dupe.id = orig.id
        else:
            monitor.update()

        # Save all non-duplicates
        to_insert = [a for a in articles if not a._duplicate]
        monitor.update(message="Inserting {} articles into database..".format(
            len(to_insert)))
        if to_insert:
            result = bulk_insert_returning_ids(to_insert)
            for a, inserted in zip(to_insert, result):
                a.id = inserted.id
            dicts = [
                a.get_article_dict(sets=[aset.id for aset in articlesets])
                for a in to_insert
            ]
            amcates.ES().bulk_insert(dicts, batch_size=100, monitor=monitor)
        else:
            monitor.update()

        # At this point we can still have internal duplicates. Give them an ID as well.
        for article in articles:
            if article.id is None and article._duplicate is not None:
                article.id = article._duplicate.id

        if not articlesets:
            monitor.update(3)
            return articles

        # add new articles and _duplicates to articlesets
        monitor.update(message="Adding articles to articleset..")
        new_ids = {a.id for a in to_insert}
        dupes = {a._duplicate.id for a in articles if a._duplicate} - new_ids
        for aset in articlesets:
            if new_ids:
                aset.add_articles(new_ids, add_to_index=False, monitor=monitor)
            else:
                monitor.update()

            if dupes:
                aset.add_articles(dupes, add_to_index=True, monitor=monitor)
            else:
                monitor.update()

        # Add to articleset caches
        properties = set()
        for article in articles:
            properties.update(article.properties.keys())

        for articleset in articlesets:
            articleset._add_to_property_cache(properties)

        return articles
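
The deduplication above hinges on Article.compute_hash producing a stable digest of article content, stored as bytes (consistent with the Dict[bytes, List[Article]] type comment). A hypothetical implementation; the hashed fields and algorithm are assumptions, not amcat's actual code:

import hashlib
import json

def compute_hash(self):
    # Hypothetical: hash a canonical JSON serialization of the fields that
    # define article identity. Which fields amcat actually hashes is an
    # assumption here.
    payload = json.dumps(
        {"title": self.title, "date": str(self.date), "text": self.text},
        sort_keys=True,
    ).encode("utf-8")
    self.hash = hashlib.sha224(payload).digest()
    return self.hash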
Example #11
File: upload.py Project: amcat/amcat
 def __init__(self, *args, **kwargs):
     super().__init__(*args, **kwargs)
     self.progress_monitor = NullMonitor()

def _get_rows(jobs, include_sentences=False, include_multiple=True, include_uncoded_articles=False, include_uncoded_sentences=False,
              progress_monitor=NullMonitor()):
    """
    @param jobs: output rows for these jobs. Make sure this is a QuerySet object with .prefetch_related("codings__values")
    @param include_sentences: include sentence level codings (if False, row.sentence and .sentence_coding are always None)
    @param include_multiple: include multiple codedarticles per article
    @param include_uncoded_articles: include articles without corresponding codings
    @param include_uncoded_sentences: also yield rows for sentences without codings (sentence_coding will be None)
    @param progress_monitor: progress monitor (not used in this snippet)
    """
    art_filter = Q(coded_articles__codingjob__in=jobs)
    if include_uncoded_articles:
        art_filter |= Q(articlesets_set__codingjob_set__in=jobs)

    job_articles = {a.id: a for a in Article.objects.filter(art_filter)}
    job_sentences = {s.id: s for s in Sentence.objects.filter(article__id__in=job_articles.keys())}

    # Mapping of article -> sentences
    article_sentences = collections.defaultdict(set)
    for sentence_id, sentence in job_sentences.items():
        article_sentences[sentence.article_id].add(sentence_id)

    # Articles that have been seen in a codingjob already (so we can skip duplicate codings on the same article)
    seen_articles = set()

    for i, job in enumerate(jobs):
        # Get all codings in dicts for later lookup
        coded_articles = set()

        # {ca: coding}
        article_codings = {}

        # {ca: {sentence_id : [codings]}}
        sentence_codings = collections.defaultdict(lambda: collections.defaultdict(list))

        for ca in job.coded_articles.order_by('id').prefetch_related("codings__values"):
            coded_articles.add(ca)
            for c in ca.codings.all():
                if c.sentence_id is None:
                    if ca.id not in article_codings:  # HACK, take first entry of duplicate article codings (#79)
                        article_codings[ca.id] = c
                else:
                    sentence_codings[ca.id][c.sentence_id].append(c)

        # output the rows for this job
        for ca in coded_articles:
            a = job_articles[ca.article_id]
            if a in seen_articles and not include_multiple:
                continue

            article_coding = article_codings.get(ca.id)
            sentence_ids = sentence_codings[ca.id]

            if include_sentences and sentence_ids:
                seen_articles.add(a)
                for sid in sentence_ids:
                    s = job_sentences[sid]
                    for sentence_coding in sentence_codings[ca.id][sid]:
                        yield CodingRow(job, ca, a, s, article_coding, sentence_coding)

                if include_uncoded_sentences:
                    non_coded_sentences = article_sentences[ca.article_id] - set(sentence_ids)
                    for sentence in map(job_sentences.get, non_coded_sentences):
                        yield CodingRow(job, ca, a, sentence, article_coding, None)

            elif article_coding:
                seen_articles.add(a)
                yield CodingRow(job, ca, a, None, article_coding, None)

    if include_uncoded_articles:
        for article in set(job_articles.values()) - seen_articles:
            yield CodingRow(job, job.get_coded_article(article), article, None, None, None)
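
Each yield passes six positional values, so CodingRow is evidently a six-field record. A matching definition and consumer loop follow; the field names are inferred from the call sites and may not match the real definition:

from collections import namedtuple

# Field names inferred from the yield calls above; amcat's actual
# definition may name them differently.
CodingRow = namedtuple(
    "CodingRow",
    ["job", "coded_article", "article", "sentence",
     "article_coding", "sentence_coding"])

# Typical consumption: article-level rows have sentence=None.
# `jobs` should be a CodingJob queryset, per the _get_rows docstring.
for row in _get_rows(jobs, include_sentences=False):
    print(row.job.id, row.article.id, row.article_coding)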