Example #1
    def test_get_ids(self):
        tree = ArticleTree(
            Article(id=3), [
                ArticleTree(Article(id=5), []), ArticleTree(Article(id=6), [
                    ArticleTree(Article(id=7), [])
                ])
            ]
        )

        self.assertEqual({3, 5, 6, 7}, set(tree.get_ids()))
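A minimal sketch of the ArticleTree interface this test exercises (an illustrative assumption, not the project's actual implementation): each node wraps an Article together with its child trees, and get_ids() walks the whole tree.

    from dataclasses import dataclass, field
    from typing import Iterator, List

    @dataclass
    class ArticleTree:
        article: "Article"
        children: List["ArticleTree"] = field(default_factory=list)

        def get_ids(self) -> Iterator[int]:
            # Yield this node's article id, then recurse into the children.
            yield self.article.id
            for child in self.children:
                yield from child.get_ids()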
Example #2
    def scrape_unit(self, url):
        reader_url = "about:reader?url={}".format(url)
        doc = self.get_html(reader_url, wait_for="div.content p")

        for tag in REMOVE_TAGS:
            for element in doc.cssselect(tag):
                element.getparent().remove(element)

        article = doc.cssselect("div.content")[0]
        article_html = lxml.html.tostring(article).decode()

        title = doc.cssselect("h1.reader-title")[0].text_content().strip()
        text = html2text(article_html)

        if self.__class__.get_date is not GenericScraper.get_date:
            # Get the contents of the original page, with Firefox reader mode closed
            self.wait(".reader-toolbar .close-button").click()
            time.sleep(0.3)
            doc_html = self.wait("html").get_attribute("outerHTML")
            doc = lxml.html.fromstring(doc_html, base_url=url)

            try:
                date = self.get_date(doc)
            except NotImplementedError:
                date = self.now
            except Exception as e:
                log.warning("get_date() failed for {} with: {}".format(url, e))
                date = self.now
        else:
            date = self.now

        article = Article(date=date, title=title, text=text, url=url)

        return article
Example #3
    def scrape_unit(self, date_and_article_url):
        date, article_url = date_and_article_url
        log.info("Fetching {}".format(article_url))
        article_doc = self.session.get_html(article_url)

        article_el = article_doc.cssselect("#content > article")

        if not article_el:
            log.error("Could not find article on {article_url}".format(**locals()))
            return None

        title = article_el[0].cssselect("h1")[0].text
        text = html2text(article_el[0].cssselect("p"))
        text = text.strip() or "."

        try:
            footer = article_el[0].cssselect("footer")[0]
        except IndexError as e:
            # Contains <embed> tag which is not closed gracefully :-(
            log.exception(e)
            return None

        author = footer.text.rsplit("|", 1)[0].strip()
        timestamp = parse_date(article_el[0].cssselect("footer > time")[0].get("datetime"))
        if not title:
            return None

        children = self._get_comments(title, article_url, article_doc)

        article = Article(date=timestamp, title=title, text=text)
        article.set_property("author", author)
        article.set_property("url", article_url)
        article.set_property("medium", "GeenStijl")

        return ArticleTree(article, [ArticleTree(c, []) for c in children])
Example #4
    def test_elastic_hash(self):
        """Can we reproduce a hash from elastic data alone?"""
        article = Article(**{
            "date": datetime.date(2015, 1, 1),
            "section": "\u6f22\u5b57",
            "pagenr": 1928390,
            "headline": "Headline hier.",
            "byline": "byline..",
            "length": 1928,
            "metastring": "Even more strange characters.. \x0C ..",
            "url": "https://example.com",
            "externalid": None,
            "author": None,
            "addressee": "Hmm",
            "text": "Contains invalid char \x08 woo",
            "medium": create_test_medium(name="abc."),
            "project": create_test_project()
        })

        article.save()

        es = ES()
        es.add_articles([article.id])
        hash = get_article_dict(article)["hash"]
        es.flush()

        es_articles = es.query_all(filters={"ids": [article.id]}, fields=HASH_FIELDS + ["hash"])
        es_article = list(es_articles)[0]

        self.assertEqual(article.id, es_article.id)
        self.assertEqual(hash, es_article.hash)
        self.assertEqual(_get_hash(es_article.to_dict()), hash)
Example #5
    def create(self, validated_data):
        children = validated_data.pop("children")
        article = Article(**validated_data)

        if article.length is None:
            article.length = word_len(article.text)

        return (article, map(self.create, children))
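Because create() returns the children as a lazy map object, the nested structure is only built when it is consumed. A small sketch of how a caller might force it into plain nested lists (materialize is a hypothetical helper, not part of the serializer above):

    def materialize(pair):
        # Recursively unpack (article, children) pairs into nested lists.
        article, children = pair
        return article, [materialize(child) for child in children]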
Example #6
    def _parse_comment(self, comment, base_title, base_url):
        text = html2text(comment.cssselect("p"))
        article_id = comment.get("id")
        title = "{base_title}#{article_id}".format(**locals())
        url = "{base_url}#{article_id}".format(**locals())
        author, timestamp = _parse_comment_footer(comment.cssselect("footer")[0].text_content())

        article = Article(date=timestamp, title=title, text=text.strip() or ".", url=url)
        article.set_property("author", author.strip())
        article.set_property("medium", "GeenStijl Comments")
        return article
Example #7
    def parse_file(self, file):
        for doc in split_file(file):
            data = dict(parse_doc(doc))

            art = {}
            for field, setting in self.options['field_map'].items():
                value, typ = setting['value'], setting['type']
                val = data.get(value) if typ == 'field' else value
                if val:
                    art[field] = val
            yield Article(**art)
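The mapping above is driven by the field_map option: entries of type 'field' take their value from the parsed document, while any other type is written as a literal. A hypothetical configuration (field and key names are purely illustrative):

    options = {
        "field_map": {
            # Copied from the parsed document under the given key.
            "title": {"type": "field", "value": "headline"},
            "date": {"type": "field", "value": "publication_date"},
            # Any other type is treated as a constant value.
            "medium": {"type": "literal", "value": "Some Newspaper"},
        }
    }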
Example #8
    def test_deduplication(self):
        """Does deduplication work as it is supposed to?"""

        # create dummy articles to have something in the db 
        [amcattest.create_test_article() for i in range(10)]
        amcates.ES().refresh()
        
        art = dict(project=amcattest.create_test_project(),
                   title="deduptest", text="test", date='2001-01-01')

        a1 = amcattest.create_test_article(**art)
        amcates.ES().refresh()
        self.assertEqual(_q(title='deduptest'), {a1.id})

        # duplicate articles should not be added
        a2 = amcattest.create_test_article(**art)
        amcates.ES().refresh()
        self.assertEqual(a2.id, a1.id)
        self.assertTrue(a2._duplicate)
        self.assertEqual(_q(title='deduptest'), {a1.id})

        # however, if an articleset is given the 'existing' article
        # should be added to that set
        s1 = amcattest.create_test_set()
        a3 = amcattest.create_test_article(articleset=s1, **art)
        amcates.ES().refresh()
        self.assertEqual(a3.id, a1.id)
        self.assertEqual(_q(title='deduptest'), {a1.id})
        self.assertEqual(set(s1.get_article_ids()), {a1.id})
        self.assertEqual(_q(sets=s1.id), {a1.id})

        # if an existing hash is set, it should be correct
        art2 = dict(hash=b'hash', **art)
        self.assertRaises(ValueError, amcattest.create_test_article, **art2)

        #TODO! Check duplicates within new articles
        art['title'] = "internaldupe"
        a1, a2 = (Article(**art), Article(**art))
        Article.create_articles([a1, a2], articleset=s1)
        self.assertEqual(a1.id, a2.id)
        self.assertEqual(len(_q(title='internaldupe')), 1)
Example #9
    def create_article(self, art_dict, project):
        art_dict = {
            k: v
            for k, v in art_dict.items() if k in COPY_ARTICLE_FIELDS
        }
        art_dict["project"] = project
        if 'headline' in art_dict and 'title' not in art_dict:
            art_dict['title'] = art_dict.pop('headline')

        art_dict = dict(self._map_es_type(k, v) for k, v in art_dict.items())
        art = Article(**art_dict)
        return art
Example #10
 def scrape_unit(self, unit):
     date = iso8601.iso8601.parse_date(unit["datum"], default_timezone=None)
     hostname = urlparse(unit["url"]).hostname
     publisher = ".".join(hostname.split(".")[-2:])
     title = unit["titel"].strip() or "[No title]"
     article = Article(title=title,
                       text=unit["bericht tekst"],
                       url=unit["url"],
                       date=date)
     article.set_property("author", unit["auteur"])
     article.set_property("publisher", publisher)
     return article
Example #11
def copy_article(article: Article):
    new = Article(
        project_id=article.project_id,
        date=article.date,
        title=article.title,
        url=article.url,
        #text=article.text <-- purposely omit text!
        #hash=article.hash <-- purposely omit hash!
        parent_hash=article.parent_hash)

    new.properties.update(article.properties)

    return new
Example #12
    def scrape_unit_meta(self, article_element):
        CONTEXT['unit'] = article_element

        article_html = article_element.get_attribute("outerHTML")
        # print(f"this is the html: {article_html}")
        article_doc = lxml.html.fromstring(article_html, base_url=SEARCH_URL)
        CONTEXT['doc'] = article_doc

        def get_byline_prop(prop):
            for meta_element in article_doc.cssselect(f".nd-article__{prop}"):
                prop_value = meta_element.text_content().strip()
                if prop_value:
                    return prop_value
            else:
                raise ValueError("Article {} has no property '{}'.".format(title, prop))

        text_url = article_doc.cssselect("a.nd-article__headline-text")[0].get("href")
        url = "newsdesk://{}".format(get_newsdesk_article_id(text_url))
        title = article_doc.cssselect("a.nd-article__headline-text")[0].text_content().strip()
        print(title)
        publisher = get_byline_prop("source")
        date_text = article_doc.cssselect(".nd-article__date")[0].get("title")
        date = date_text.split("Publicatiedatum:")
        pub_date = date[-1]
        pub_date = dutch_strptime(pub_date.strip()[:-1], "%d %b %Y %H:%M")
        load_date = date[1]
        load_date = dutch_strptime(load_date.strip(), "%d %b %Y %H:%M")

        article = Article(url=url, title=title, date=pub_date)
        article.set_property("publisher", publisher)
        article.set_property("text_url", text_url)

        # Crashes AmCAT API:
        #article.set_property("pubdate_date", pub_date)

        try:
            author = get_byline_prop("author")
            article.set_property("author", author)
        except ValueError:
            pass
        try:
            article.set_property("wordcount_int", int(get_byline_prop("word-count").split()[0].replace(",", "")))
        except ValueError:
            logging.warning("could not find word count")
        try:
            article.set_property("country", get_byline_prop("source_country"))
        except ValueError:
            pass
        return NewsdeskUnit(article_element, article)
Example #13
    def set_up(self):
        self.aset = amcattest.create_test_set()
        self.asets = ArticleSet.objects.filter(id__in=[self.aset.id])
        self.project = self.aset.project

        self.a1 = Article(
            title="Man leeft nog steeds in de gloria",
            text="Gezongen vloek op verjaardag maakt leven van man tot een vrolijke hel.",
            date=datetime.datetime(2017, 1, 2, 23, 22, 11),
            author="Rudolf Julius",
            publisher="De Speld",
            project=self.project,
            exists="Once",
            page_int=5,
            section_int=10,
            tags_tag={"gloria", "vloek"},
            html="Man <i>leeft</i> nog steeds in de gloria"
        )

        self.a2 = Article(
            title="VVD trots op opkomende zon",
            text="Kabinetsbeleid om geen parasol over Nederland te zetten betaalt zich uit",
            date=datetime.datetime(2016, 12, 14, 15, 13, 12),
            author="Thomas Hogeling",
            publisher="De Speld",
            project=self.project,
            page_int=5,
            section_int=11,
            tags_tag={"vvd", "nederland", "speld"}
        )

        Article.create_articles([self.a1, self.a2], articleset=self.aset)

        amcates.ES().refresh()

        self.qs = ESQuerySet(self.asets)
Example #14
    def scrape_unit(self, entry):
        article = Article()
        try:
            section, text = self.get_article_section_text(entry["link"])
            print(section, text)
        except IndexError:
            return None

        article.set_property("nuid", entry["id"])
        article.set_property("title", entry["title"])
        article.set_property("date", self.parse_date(str(entry["published"])))
        article.set_property("url", entry["link"])
        article.set_property("section", section)
        article.set_property("text", text)
        return article
Example #15
 def _scrape_unit(self, row):
     row = {k:v.decode("utf-8") for k,v in row.iteritems()}
     query = row.pop('zoekopdracht')
     self.queries.add(query)
     medium = Medium.get_or_create(row.pop('type bron'))
     date = row.pop('datum')
     date = datetime.datetime.strptime(date, "%Y-%m-%d %H:%M")
     headline = row.pop('titel')
     pagenr = row.pop('bereik') or None
     text = row.pop('bericht tekst')
     url = row.pop('url')
     author=row.pop('auteur')
     metastring = json.dumps(row)
     
     a = Article(headline=headline, pagenr=pagenr,
                 text=text, date=date,
                 medium=medium, url=url,
                 author=author, metastring=metastring)
     yield a 
Example #16
    def test_elastic_hash(self):
        """Can we reproduce a hash from elastic data alone?"""
        article = Article(
            **{
                "date": datetime.date(2015, 1, 1),
                "title": "\u6f22\u5b57",
                "text": "Even more strange characters.. \x0C and \x08 woo?",
                "url": "https://example.com",
                "project": create_test_project()
            })

        hash = get_article_dict(article)['hash']
        Article.create_articles([article],
                                articleset=amcattest.create_test_set())
        ES().refresh()
        es_articles = ES().query_all(filters={"ids": [article.id]},
                                     fields=["hash"])
        es_article = list(es_articles)[0]

        self.assertEqual(article.id, es_article.id)
        self.assertEqual(hash, es_article.hash)
        self.assertEqual(hash, article.hash)
Example #17
    def _scrape_unit(self, document):
        article = Article()
        metadata = list(META)

        # We select all 'div' elements directly under '.article'
        divs = document.cssselect("* > div")

        # Check for author field. If present: remove from metadata
        # fields list
        try:
            author_field = document.cssselect(".author")[0]
        except IndexError:
            pass
        else:
            # Strip a leading "Von " ("By ") prefix; str.lstrip("Von") would also
            # clip names that happen to start with V, o, or n.
            author = author_field.text_content().strip()
            article.author = author[4:].strip() if author.startswith("Von ") else author
            divs.remove(author_field)

        # Strip everything before headline
        headline_field = document.cssselect("b.deHeadline")[0].getparent()
        divs = divs[divs.index(headline_field):]

        # Parse metadata. Loop through each 'div' within an article, along with
        # its field name according to META (thus based on its position)
        for field_name, element in zip(metadata, divs):
            if field_name is None:
                continue

            processor = PROCESSORS.get(field_name, lambda x: x)
            text_content = element.text_content().strip()
            setattr(article, field_name, processor(text_content))

        # Fetch the text, which consists of all paragraph elements
        paragraphs = [p.text_content() for p in document.cssselect("p")]
        article.text = ("\n\n".join(paragraphs)).strip()

        # We must return an iterable, so we return a one-tuple
        return (article, )
Example #18
    def get_articles(self, fn, media):
        csv.field_size_limit(sys.maxsize)

        def _int(x):
            return int(x) if x else None

        def hash2binary(hash):
            if hash:
                if not isinstance(hash, str):
                    raise TypeError("Hash should be str, not {}".format(
                        type(hash)))
                return "\\x" + hash

        r = csv.reader(open(fn))
        header = next(r)
        index = {col: i for (i, col) in enumerate(header)}
        AID = index['article_id']
        if self.maxid:
            logging.info(
                "*** max(id) set by user: {self.maxid}".format(**locals()))
            max_id, self.n_rows = self.maxid, self.maxid
        else:
            logging.info("*** Scan input CSV to determine #rows and max(id)")
            for row in r:
                max_id = max(max_id, int(row[AID]))
                self.n_rows += 1
                if not self.n_rows % 10000000:
                    logging.info(
                        ".. scanned {self.n_rows} rows".format(**locals()))

        logging.info(
            "{self.n_rows} rows, max ID {max_id}, allocating memory for hashes"
            .format(**locals()))

        hashes = ctypes.create_string_buffer(max_id * 28)
        NULL_HASH = b'\x00' * 28
        orphans = "N/A"
        passno = 1

        if self._continue:
            logging.info(
                "Continuing from previous migration, getting state from DB")
            c = conn().cursor('migration-continue')
            c.itersize = 10000  # how much records to buffer on a client
            c.execute("SELECT article_id, hash FROM articles")
            i = 0
            while True:
                rows = c.fetchmany(10000)
                if not rows:
                    break
                i += len(rows)
                if not i % 1000000:
                    logging.info("Retrieved {i} rows...")
                for (aid, hash) in rows:
                    offset = (aid - 1) * 28
                    hashes[offset:offset + 28] = hash
            self.n_rows -= i
            logging.info(
                "Continuing migration, {i} articles retrieved, {self.n_rows} to go"
                .format(**locals()))

        while orphans:
            logging.info(
                "*** Pass {passno}, #orphans {orphans}".format(**locals()))
            passno += 1
            orphans = 0

            r = csv.reader(open(fn))
            next(r)  # skip header

            for row in r:
                aid = int(row[AID])

                offset = (aid - 1) * 28
                stored_hash = hashes[offset:offset + 28]
                if stored_hash != NULL_HASH:
                    continue

                parent_id = _int(row[index['parent_article_id']])
                if (parent_id == aid) or (parent_id in SKIP_PARENTS):
                    parent_id = None
                if parent_id:
                    poffset = (parent_id - 1) * 28
                    parent_hash = hashes[poffset:poffset + 28]
                    if parent_hash == NULL_HASH:
                        orphans += 1
                        continue
                    parent_hash = binascii.hexlify(parent_hash).decode("ascii")
                else:
                    parent_hash = None

                date = row[index['date']]
                date = date.split("+")[0]
                date = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')

                a = Article(project_id=row[index['project_id']],
                            date=date,
                            title=row[index['headline']],
                            url=row[index['url']] or None,
                            text=row[index['text']],
                            parent_hash=parent_hash)

                a.properties = {
                    v: row[index[v]]
                    for v in PROP_FIELDS if row[index[v]]
                }
                a.properties['medium'] = media[int(row[index['medium_id']])]
                a.properties['uuid'] = str(a.properties['uuid'])
                props = json.dumps(a.properties)

                hash = amcates.get_article_dict(a)['hash']
                hashes[offset:offset + 28] = binascii.unhexlify(hash)

                yield (a.project_id, aid, a.date, a.title, a.url, a.text,
                       hash2binary(hash), hash2binary(a.parent_hash), props)
Example #19
 def _scrape_unit(self, row):
     self.queries.add(row[self.lang.query])
     art = self.map_article(row)
     a = Article(**art)
     return a
Example #20
    def scrape_unit(self, article_info: ArticleTuple):
        date, page_num, url = article_info

        try:
            text_url = strip_query(self.session.get_redirected_url(url))
        except RedirectError as e:
            if e.status_code == 404:
                return None
            raise

        try:
            text_doc = self.session.get_html(text_url)
        except HTTPError as e:
            if e.response.status_code == 404:
                logging.warning(f"{url} returned 404 skipping")
                return None
            else:
                raise

        for image in text_doc.cssselect(".image"):
            image.getparent().remove(image)

        date = datetime.datetime(date.year, date.month, date.day)
        try:
            title = text_doc.cssselect("article > h1")[0].text.strip()
        except Exception:
            return None

        text = html2text(text_doc.cssselect("main > article > .body"))
        if not text.strip():
            return None

        article = Article(title=title, date=date, text=text, url=url)

        if text_doc.cssselect("article > header.themed"):
            # New headers style
            author = text_doc.cssselect("article > header .author")[0].text
            section = text_doc.cssselect("article > header .title")[0].text
            article.set_property("author", author)
        else:
            # Old header style
            section = text_doc.cssselect("article > header > .title")
            section = section[0].text if section else "NOSECTION"
            author_a = text_doc.cssselect("article .author a")
            if author_a:
                author = author_a[0].text.strip()
                article.set_property("author", author)
                if author == section:
                    section = "Opinie"

        download = text_doc.cssselect('form[name="download"]')
        if download:
            pdf_url = download[0].get("action")
            article.set_property("pdf_url", pdf_url)

        article.set_property("text_url", text_url)
        article.set_property("image_url", text_url + "?view=img")

        if section:
            article.set_property("section", section.strip())

        return article
Example #21
 def _scrape_unit(self, unit):
     tweets = ["list", "from", "api"]
     for tweet in tweets:
         yield Article(text=tweet, headline=tweet, date='2010-01-01')
Example #22
 def scrape_unit(self, unit: TelegraafUnit):
     return Article(title=unit.title,
                    url=unit.url,
                    text=unit.text,
                    date=unit.date,
                    pagerange=unit.page_range)
Example #23
    def scrape_unit(self, unit: NRCUnit):
        m = re.match(r"https://www.nrc.nl/nieuws/(\d{4})/(\d{2})/(\d{2})/",
                     unit.url)
        if not m:
            logging.warning(f"Invalid URL: {unit.url}")
            return None
        year = int(m.group(1))
        month = int(m.group(2))
        day = int(m.group(3))
        online_date = datetime(year, month, day)
        try:
            html = self.session.get_content(unit.url)
        except HTTPError as e:
            if e.response.status_code == 404:
                logging.warning(f"No article found for {unit.url}")
                return  # some articles don't exist, i.e. cartoons without text
            raise

        doc = lxml.html.fromstring(html, base_url=unit.url)
        intro = doc.cssselect("div.intro")
        if not intro:
            logging.debug(f"Invalid intro: {unit.url}")
            intro = ""
        else:
            intro2 = intro[0].text_content()
        headline = doc.cssselect(".article-header-container h1")
        if not headline:
            headline2 = "-"
            logging.warning(f"No headline {unit.url}")
        else:
            headline2 = headline[0].text_content()
            if not headline2:
                headline2 = "-"
                logging.warning(f"Empty headline {unit.url}")
        author = doc.cssselect("ul.article__byline__text.unstyled a")
        if not author:
            logging.debug(f"Invalid author: {unit.url}")
            author2 = ""
        else:
            author2 = author[0].text_content()
        text = doc.cssselect("div.article__content")
        if not text:
            text = doc.cssselect("div.article__header-and-content")
        text2 = text[0].text_content()
        text2 = re.sub(r"\s*\n\s*", "\n\n", text2).strip()
        text2 = re.sub(r"[ \t]+", " ", text2).strip()
        if intro:
            text3 = f"{intro2},{text2}"
        else:
            text3 = f"{text2}"

        article = dict(date=unit.date,
                       online_date=online_date,
                       title=headline2,
                       text=text3,
                       url=unit.url,
                       pdf_url=unit.pdf,
                       page_tag=unit.pages,
                       section_tag=unit.sections,
                       raw_html=html,
                       author=author2)
        if unit.image is not None:
            article["image_url"] = unit.image
        return Article(**article)
Example #24
 def scrape_unit(self, unit: EPagesUnit):
     return Article(title=unit.title,
                    url=unit.url,
                    text=unit.text,
                    pagenr_int=unit.page,
                    date=unit.date)
Example #25
def json_to_article(article: Dict[str, Any], project: Project) -> Article:
    article = Article(project=project, **article)
    article.compute_hash()
    return article
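A hedged usage sketch: the incoming dict is expected to carry Article constructor fields; the payload keys and the project variable below are illustrative only.

    payload = {"title": "Some headline", "date": "2015-01-01", "text": "Body text"}
    article = json_to_article(payload, project=some_project)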
Example #26
 def test_default(self):
     self.assertEqual(PropertyMapping, type(Article().properties))
Example #27
    def parse_document(self, paragraphs):
        metadata, text = parse_page(paragraphs)
        metadata["medium"] = Medium.get_or_create(metadata["medium"])

        return Article(text=text, **metadata)
Example #28
    def test_highlight_fragments(self):
        self.set_up()

        articleset = amcattest.create_test_set()
        project = articleset.project

        text = """
        The Alderman Proctor's Drinking Fountain (grid reference ST566738) is a historic building
        on Clifton Down, Bristol, England.

        The city of Bristol began supplying municipal drinking water in 1858. To inform the public
        about the new water supply, Robert Lang made a proposal though the Bristol Times that public
        drinking fountains be constructed. Lang began the "Fountain Fund" in January 1859 with a
        donation of one hundred pounds. By 1906, there were more than 40 public drinking fountains
        throughout the city.

        In 1872, Alderman Thomas Proctor commissioned the firm of George and Henry Godwin to build
        the fountain to commemorate the 1861 presentation of <i>Clifton Down</i> to the City of
        Bristol by the Society of Merchant Venturers.

        **Commemorative plaque**

        The three-sided fountain is done in Gothic Revival style. The main portion is of limestone
        with pink marble columns and white marble surround. The commemorative plaque is of black
        lettering on white marble; the plaque reads, "Erected by Alderman Thomas Proctor, of Bristol
        to record the liberal gift of certain rights on Clifton Down made to the citizens by the
        Society of Merchant Venturers under the provision of the Clifton and Drudham Downs Acts
        of Parliament, 1861, whereby the enjoyment of these Downs is preserved to the citizens of
        Bristol for ever." The fountain bears the coat of arms for the city of Bristol, the Society
        of Merchant Venturers and that of Alderman Thomas Proctor.

        The fountain was originally situated at the head of Bridge Valley Road. It became a sight
        impediment to modern auto traffic in the later 20th century. The fountain was moved to the
        other side of the road, closer to the Mansion House in 1987. After the move, it underwent
        restoration and was re-dedicated on 1 May 1988. It has been designated by English Heritage
        as a grade II listed building since 1977.
        """

        paragraphs = [" ".join(s.strip() for s in p.strip().split("\n")) for p in text.split("\n\n")]

        long_article = Article(
            title="Alderman Proctor's Drinking Fountain",
            text="\n\n".join(paragraphs).strip(),
            date=datetime.datetime(2017, 1, 18, 13, 29, 11),
            url="https://en.wikipedia.org/wiki/Alderman_Proctor%27s_Drinking_Fountain",
            publisher="Wikipedia",
            project=project
        )

        Article.create_articles([long_article], articleset)
        amcates.ES().refresh()

        qs = ESQuerySet(ArticleSet.objects.filter(id=articleset.id))
        fragments = qs.highlight_fragments('"Clifton Down"', ("text", "title"), fragment_size=50)

        self.assertEqual(1, len(qs))
        self.assertEqual(1, len(fragments))

        fragments = next(iter(fragments.values()))
        text_fragments = set(fragments["text"])
        title_fragments = fragments["title"]

        self.assertEqual(1, len(title_fragments))
        self.assertNotIn("<mark>", title_fragments[0])
        self.assertEqual(3, len(text_fragments))
        self.assertEqual(text_fragments, {
             ' presentation of &lt;i&gt;<mark>Clifton</mark> <mark>Down</mark>&lt;/i&gt; to the City of Bristol',
             ' <mark>Clifton</mark> <mark>Down</mark>, Bristol, England.\n\nThe city of Bristol',
             ' the liberal gift of certain rights on <mark>Clifton</mark> <mark>Down</mark> made'
        })