Пример #1
0
    def addMemento(self, urim):
        """Request the memento at `urim` and its raw counterpart via the session.

        `urim` is requested through `self.session`. The raw URI-M is then
        derived with `otmt.generate_raw_urim` -- from the final response URL
        when the response carries a redirect history, otherwise from `urim`
        itself -- and requested as well. On success `urim` is appended to
        `self.urimlist`.

        The request exceptions listed in the `except` clause are not
        propagated; they are recorded through `self.addMementoError` as
        `repr()` of the exception.
        """
        try:
            response = self.session.get(urim)

            # A non-empty history means we were redirected, so the raw URI-M
            # has to be built from the URL we actually landed on.
            landing_urim = response.url if response.history else urim

            self.session.get(otmt.generate_raw_urim(landing_urim))
            self.urimlist.append(urim)
        except (ConnectionError, TooManyRedirects, RequestException) as e:
            self.addMementoError(urim, repr(e))
    def generate_raw_urim_archiveorg_happy_path(self):
        """A web.archive.org URI-M gets `id_` appended to its timestamp."""

        urim = "https://web.archive.org/web/20070207050545/http://www.cnn.com:80/"

        raw_urim = "https://web.archive.org/web/20070207050545id_/http://www.cnn.com:80/"

        # assertEqual is used instead of the deprecated assertEquals alias,
        # which was removed from unittest in Python 3.12.
        self.assertEqual(raw_urim, generate_raw_urim(urim))
    def generate_raw_urim_archiveit_happy_path(self):
        """A wayback.archive-it.org URI-M gets `id_` appended to its timestamp."""

        urim = "http://wayback.archive-it.org/1068/20110317183254/http://www.amnistia.org.mx/"

        raw_urim = "http://wayback.archive-it.org/1068/20110317183254id_/http://www.amnistia.org.mx/"

        # assertEqual is used instead of the deprecated assertEquals alias,
        # which was removed from unittest in Python 3.12.
        self.assertEqual(raw_urim, generate_raw_urim(urim))
Пример #4
0
def get_newspaper_publication_date(urim, cache_storage):
    """Return the publication date of the memento at `urim` as a string.

    The value is first looked up under the "newspaper publication date" key
    of the `derivedvalues` document for `urim` in the database reached via
    `cache_storage`. On a cache miss the raw memento is downloaded, handed to
    `Article` for parsing, and its `publish_date` is formatted as
    "%a, %d %b %Y %H:%M:%S GMT". When `Article` finds no publish date, the
    `memento-datetime` response header is used instead. The result is written
    back to the cache and returned.

    Raises whatever `raise_for_status()` raises for an unsuccessful raw
    memento response, and `KeyError` if the fallback `memento-datetime`
    header is absent.
    """

    import otmt

    dbconn = MongoClient(cache_storage)
    session = get_web_session(cache_storage)
    db = dbconn.get_default_database()

    try:
        return db.derivedvalues.find_one({"urim":
                                          urim})["newspaper publication date"]
    except (KeyError, TypeError):
        # KeyError: a document exists but lacks this key.
        # TypeError: find_one returned None, i.e. no document for this urim.
        raw_urim = otmt.generate_raw_urim(urim)

        r = session.get(raw_urim)
        r.raise_for_status()

        article = Article(urim)
        article.download(r.text)
        article.parse()
        article.nlp()
        publication_date = article.publish_date

        if publication_date is None:
            publication_date = r.headers['memento-datetime']
        else:
            publication_date = publication_date.strftime(
                "%a, %d %b %Y %H:%M:%S GMT")

        # upsert=True so the value is also cached when no document exists yet
        # for this urim; without it the update silently matches nothing and
        # the date is recomputed on every call.
        db.derivedvalues.update(
            {"urim": urim}, {"$set": {
                "newspaper publication date": str(publication_date)
            }},
            upsert=True)

        return publication_date
Пример #5
0
    def getMementoContent(self, urim):
        """Return the body text of the raw memento corresponding to `urim`.

        The raw URI-M is derived with `otmt.generate_raw_urim` and requested
        through `self.session`; the `text` of that response is returned.

        The documented contract for this method is that
        `CollectionModelNoSuchMementoException` is thrown when no data was
        stored for `urim` via `addMemento`, and that
        `CollectionModelMementoErrorException` is thrown when data was stored
        for `urim` via `addMementoError`.

        NOTE(review): neither exception is raised by the code in this method
        itself -- confirm whether another layer enforces that contract.
        """
        response = self.session.get(otmt.generate_raw_urim(urim))
        return response.text
Пример #6
0
def get_raw_simhash(urim, cache_storage):
    """Return the Simhash of the raw memento for `urim`, as a string.

    The value cached under "raw simhash" in the `derivedvalues` document for
    `urim` is returned when present. Otherwise the memento is requested, the
    raw URI-M is derived (from the final URL if the request was redirected),
    the raw memento is downloaded, and `Simhash` is computed over its text.
    The stringified value is upserted into the cache and returned.

    Raises whatever `raise_for_status()` raises for an unsuccessful raw
    memento response, and a plain `Exception` when the raw memento's
    content-type does not contain 'text/html'.
    """

    import otmt

    dbconn = MongoClient(cache_storage)
    session = get_web_session(cache_storage)
    db = dbconn.get_default_database()

    # 1. if the raw simhash of urim is in the cache, return it
    try:
        return db.derivedvalues.find_one({"urim": urim})["raw simhash"]
    except (KeyError, TypeError):
        # cache miss: either no document for urim or no simhash stored in it

        first_response = session.get(urim)

        # a non-empty history means a redirect happened, so build the raw
        # URI-M from the URL that was finally reached
        landing_urim = first_response.url if first_response.history else urim

        raw_response = session.get(otmt.generate_raw_urim(landing_urim))
        raw_response.raise_for_status()

        content_type = raw_response.headers['content-type']

        if 'text/html' not in content_type:
            raise Exception(
                "Hypercane currently only operates with HTML resources, refusing to compute Simhash on {}"
                .format(urim))

        fingerprint = str(Simhash(raw_response.text).value)

        db.derivedvalues.update(
            {"urim": urim},
            {"$set": {"raw simhash": fingerprint}},
            upsert=True)

        return fingerprint
Пример #7
0
def synthesize_warc(urim, session, output_directory):
    """Write a gzipped WARC file holding the raw memento of `urim`.

    The output file is named "<md5 of urim>-<current timestamp>.warc.gz" and
    placed in `output_directory`. It contains a single 'response' record
    whose target URI is the memento's original resource (taken from the
    `original` link relation), whose WARC-Date is the memento's
    Memento-Datetime, whose HTTP headers are those of the response for
    `urim`, and whose payload is the body of the raw memento.

    Nothing is written, and a warning is logged, when:
    * a WARC whose name starts with the same hash already exists,
    * the response carries no `original` link relation, or
    * the response carries no Memento-Datetime header.

    Args:
        urim: URI-M of the memento to store.
        session: object with a requests-style `get` used for all downloads.
        output_directory: directory that receives the WARC file.

    Returns:
        None.

    Raises:
        Whatever `raise_for_status()` raises for an unsuccessful response to
        `urim`; `ValueError` if Memento-Datetime does not match
        "%a, %d %b %Y %H:%M:%S GMT".
    """

    import otmt
    import glob
    from warcio.warcwriter import WARCWriter
    from warcio.statusandheaders import StatusAndHeaders
    from hashlib import md5
    from datetime import datetime

    urlhash = md5(urim.encode('utf8')).hexdigest()

    if glob.glob('{}/{}*.warc.gz'.format(output_directory, urlhash)):
        module_logger.warning("Detected existing WARC for URI-M, skipping {}".format(urim))
        return

    resp = session.get(urim, stream=True)
    resp.raise_for_status()

    headers_list = resp.raw.headers.items()

    warc_target_uri = None

    # we have to implement this construct in case the archive combines original with other relations
    for link in resp.links:

        if 'original' in link:
            warc_target_uri = resp.links[link]['url']

    if warc_target_uri is None:
        module_logger.warning("could not find this memento's original resource, skipping {}".format(urim))
        return

    try:
        mdt = resp.headers['Memento-Datetime']

    except KeyError:
        module_logger.warning("could not find this memento's memento-datetime, skipping {}".format(urim))
        return

    # convert the HTTP-style Memento-Datetime into the WARC-Date format once
    warc_date = datetime.strptime(
        mdt, "%a, %d %b %Y %H:%M:%S GMT"
    ).strftime('%Y-%m-%dT%H:%M:%SZ')

    module_logger.debug("mdt formatted by strptime and converted by strftime: {}".format(
        warc_date
    ))

    # The raw memento is only requested once we know a record will be
    # written, so no streamed response is opened and then abandoned on the
    # skip paths above.
    # we use response.url instead of urim to (hopefully) avoid raw redirects
    raw_urim = otmt.generate_raw_urim(resp.url)

    raw_response = session.get(raw_urim, stream=True)

    # headers come from the response for urim, the payload from the raw memento
    http_headers = StatusAndHeaders('200 OK',
        headers_list, protocol='HTTP/1.0')

    warc_headers_dict = {'WARC-Date': warc_date}

    with open("{}/{}-{}.warc.gz".format(output_directory, urlhash, datetime.now().strftime('%Y%m%d%H%M%S')), 'wb') as output:
        writer = WARCWriter(output, gzip=True)

        record = writer.create_warc_record(
            warc_target_uri, 'response',
            payload=raw_response.raw,
            http_headers=http_headers,
            warc_headers_dict=warc_headers_dict
            )

        writer.write_record(record)
Пример #8
0
    def addManyMementos(self, urims):
        """Request many mementos, and then their raw versions, concurrently.

        Duplicates are removed from `urims`, and every remaining URI-M is
        submitted through a `FuturesSession` wrapping `self.session`. As each
        request finishes:

        * its raw URI-M is derived (from the final URL if the request was
          redirected) and queued for the second phase;
        * the URI-M is appended to `self.urimlist` when the response has a
          `memento-datetime` header, otherwise an error is recorded with
          `self.addMementoError`;
        * any exception raised while obtaining the result is recorded with
          `self.addMementoError` as `repr()` of the exception.

        The de-duplicated raw URI-Ms are then requested and handled the same
        way, with errors recorded against the raw URI-M.

        Both phases poll randomly chosen pending futures until none remain.
        """

        module_logger.info("started with {} URI-Ms for processing...".format(len(urims)))

        # protect the function from duplicates in the urims list
        urims = list(set(urims))

        module_logger.info("found duplicates, now using {} URI-Ms for processing...".format(len(urims)))

        futuressession = FuturesSession(session=self.session)

        # NOTE(review): the retry adapter is mounted on the FuturesSession
        # wrapper; confirm it also applies to requests that are routed
        # through self.session.
        retry = Retry(
            total=10,
            read=10,
            connect=10,
            backoff_factor=0.3,
            status_forcelist=(500, 502, 504)
        )
        adapter = HTTPAdapter(max_retries=retry)
        futuressession.mount('http://', adapter)
        futuressession.mount('https://', adapter)

        futures = {}
        raw_futures = {}

        working_urim_list = []

        raw_urims = []

        for uri in urims:
            working_urim_list.append(uri)
            futures[uri] = futuressession.get(uri)

        working_starting_size = len(working_urim_list)

        # phase 1: poll randomly chosen pending URI-Ms until all are handled
        while len(working_urim_list) > 0:

            uri = random.choice(working_urim_list)

            if not futures[uri].done():
                continue

            module_logger.debug("URI-M {} is done, processing...".format(uri))

            if len(working_urim_list) % 100 == 0:
                module_logger.info("{}/{} mementos left to process".format(len(working_urim_list), working_starting_size))

            try:
                r = futures[uri].result()

                # after a redirect, build the raw URI-M from the final URL
                if len(r.history) == 0:
                    raw_urim = otmt.generate_raw_urim(uri)
                else:
                    raw_urim = otmt.generate_raw_urim(r.url)

                raw_urims.append(raw_urim)

                if 'memento-datetime' not in r.headers:
                    self.addMementoError(uri, "URI-M {} does not produce a memento".format(uri))
                else:
                    # the content should be cached by the session
                    # we just need to keep track of the URI-Ms for this run
                    self.urimlist.append(uri)

            except Exception as e:
                self.addMementoError(uri, repr(e))

            working_urim_list.remove(uri)
            del futures[uri]

        module_logger.info("done adding {} mementos, now adding corresponding {} raw mementos...".format( len(urims), len(raw_urims) ))

        working_raw_urim_list = []

        for raw_urim in list(set(raw_urims)):

            working_raw_urim_list.append(raw_urim)
            raw_futures[raw_urim] = futuressession.get(raw_urim)

        working_rawurims_starting_size = len(working_raw_urim_list)

        # phase 2: same polling scheme for the raw URI-Ms
        while len(working_raw_urim_list) > 0:

            raw_urim = random.choice(working_raw_urim_list)

            module_logger.debug("fetching results for raw URI-M {}".format(raw_urim))
            module_logger.debug("raw mementos working list size: {}".format(len(working_raw_urim_list)))
            module_logger.debug("raw mementos futures keys size: {}".format(len(raw_futures)))

            if not raw_futures[raw_urim].done():
                continue

            module_logger.debug("raw URI-M {} is done, processing...".format(raw_urim))

            if len(working_raw_urim_list) % 100 == 0:
                module_logger.info("{}/{} raw mementos left to process".format(len(working_raw_urim_list), working_rawurims_starting_size))

            try:
                r = raw_futures[raw_urim].result()

                if 'memento-datetime' not in r.headers:
                    # record the error against the raw URI-M being processed,
                    # not the stale `uri` left over from the first phase
                    self.addMementoError(raw_urim, "raw URI-M {} does not produce a memento".format(raw_urim))
                else:
                    # the content should be cached by the session
                    # we just need to keep track of the raw URI-Ms for this run
                    self.urimlist.append(raw_urim)

            except Exception as e:
                self.addMementoError(raw_urim, repr(e))

            working_raw_urim_list.remove(raw_urim)
            del raw_futures[raw_urim]

            # NOTE(review): this pause runs once per processed raw memento,
            # after its request has already completed -- confirm it is
            # intended rather than a leftover from debugging.
            time.sleep(1)
    def generate_raw_urim_archiveorg_raw_already(self):
        """A web.archive.org URI-M that already has `id_` is returned unchanged."""

        raw_urim = "https://web.archive.org/web/20070207050545id_/http://www.cnn.com:80/"

        # assertEqual is used instead of the deprecated assertEquals alias,
        # which was removed from unittest in Python 3.12.
        self.assertEqual(raw_urim, generate_raw_urim(raw_urim))
    def generate_raw_urim_archiveit_raw_already(self):
        """A wayback.archive-it.org URI-M that already has `id_` is returned unchanged."""

        raw_urim = "http://wayback.archive-it.org/1068/20110317183254id_/http://www.amnistia.org.mx/"

        # assertEqual is used instead of the deprecated assertEquals alias,
        # which was removed from unittest in Python 3.12.
        self.assertEqual(raw_urim, generate_raw_urim(raw_urim))
Пример #11
0
def get_boilerplate_free_content(urim,
                                 cache_storage="",
                                 dbconn=None,
                                 session=None):
    """Return the boilerplate-free text of the memento at `urim` as UTF-8 bytes.

    The value cached under "boilerplate free content" in the `derivedvalues`
    document for `urim` is returned when present. Otherwise the memento is
    requested, its raw URI-M is derived (from the final URL if the request
    was redirected), the raw memento is downloaded, and boilerpy3's
    `ArticleExtractor` is run on its text. The extracted content is upserted
    into the cache and returned.

    Args:
        urim: URI-M of the memento to process.
        cache_storage: passed to `MongoClient` and `get_web_session` when
            `dbconn` / `session` are not supplied.
        dbconn: optional existing database client.
        session: optional existing web session.

    Returns:
        The extracted content encoded as UTF-8 bytes. Empty bytes are
        returned when the raw memento's content-type does not contain
        'text/html', or when extraction or caching raises.

    Raises:
        Whatever `raise_for_status()` raises for an unsuccessful raw memento
        response.
    """

    import otmt
    from boilerpy3 import extractors

    if dbconn is None:
        dbconn = MongoClient(cache_storage)

    if session is None:
        session = get_web_session(cache_storage)

    db = dbconn.get_default_database()

    # 1. if boilerplate free content in cache, return it
    try:
        bpfree = db.derivedvalues.find_one({"urim":
                                            urim})["boilerplate free content"]
        cached_content = bytes(bpfree, "utf8")

        # logged only after the lookup succeeded, so a cache miss is no
        # longer reported as a cache hit
        module_logger.info(
            "returing boilerplate free content from cache for {}".format(urim))

        return cached_content
    except (KeyError, TypeError):
        # cache miss: no document for urim, no content stored in it, or the
        # stored value could not be encoded

        module_logger.info(
            "generating boilerplate free content for {}".format(urim))

        r = session.get(urim)

        # after a redirect, build the raw URI-M from the final URL
        if len(r.history) == 0:
            raw_urim = otmt.generate_raw_urim(urim)
        else:
            raw_urim = otmt.generate_raw_urim(r.url)

        r2 = session.get(raw_urim)
        r2.raise_for_status()

        module_logger.info("content-type is {}".format(
            r2.headers['content-type']))

        if 'text/html' not in r2.headers['content-type']:
            module_logger.warning(
                "we can only remove boilerplate from HTML, returning zero bytes"
            )
            return bytes()

        module_logger.debug(
            "attempting to extract boilerplate free content from {}".format(
                urim))

        extractor = extractors.ArticleExtractor()

        try:
            bpfree = extractor.get_content(r2.text)

            module_logger.info(
                "storing boilerplate free content in cache {}".format(urim))

            db.derivedvalues.update(
                {"urim": urim}, {"$set": {
                    "boilerplate free content": bpfree
                }},
                upsert=True)

        except Exception:
            # best effort: a failure in extraction or caching is logged and
            # recorded in the error store rather than propagated
            module_logger.exception(
                "failed to extract boilerplate from {}, setting value to empty string"
                .format(urim))
            hypercane.errors.errorstore.add(urim, traceback.format_exc())
            return bytes()

        return bytes(bpfree, "utf8")