# Imports assumed by the snippets below (Python 2 codebase; textify is
# presumed to come from the project's html2text helper used further down):
import os
from hashlib import md5
from html2text import textify


def process_page(page, wename, corpus,
                 extractors=["Article", "ArticleSentences", "Default", "Canola"]):
    urlmd5 = md5(page["url"]).hexdigest()
    # page bodies are stored zip-compressed in Mongo
    body = page["body"].decode('zip')
    encoding = page.get("encoding", "")
    try:
        body = body.decode(encoding)
    except Exception:
        # fall back to lossy UTF-8 when the declared encoding fails
        body = body.decode("UTF8", "replace")
        encoding = "UTF8-replace"
    # dump the raw HTML
    html = os.path.join("outputs", corpus, wename, "html", urlmd5)
    with open(html, "w") as f:
        f.write(body.encode("utf-8"))
    # dump the default text extraction
    text = os.path.join("outputs", corpus, wename, "text", urlmd5)
    with open(text, "w") as f:
        f.write(textify(body, encoding=encoding).encode("utf-8"))
    # dump one text file per boilerplate-removal extractor
    for method in extractors:
        cleantext = os.path.join("outputs", corpus, wename,
                                 "text%s" % method, urlmd5)
        with open(cleantext, "w") as f:
            f.write(textify(body, extractor="%sExtractor" % method,
                            encoding=encoding).encode("utf-8"))
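
# process_page() assumes the per-corpus output tree already exists. A minimal
# setup sketch (hypothetical helper, not part of the original code), following
# the same "outputs/<corpus>/<wename>/<kind>" layout as above:
def prepare_output_dirs(corpus, wename,
                        extractors=["Article", "ArticleSentences", "Default", "Canola"]):
    for kind in ["html", "text"] + ["text%s" % m for m in extractors]:
        path = os.path.join("outputs", corpus, wename, kind)
        if not os.path.isdir(path):
            os.makedirs(path)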

def process_pages_matching_keyword(hyphe_core, mongo_pages_coll, corpus, keyword,
                                   content_types=["text/plain", "text/html"]):
    query = {
        "status": 200,
        "content_type": {"$in": content_types},
        "body": {"$exists": True}
    }
    print "TOTAL valid pages:", mongo_pages_coll.count(query)
    headers = ["url", "webentity_id", "webentity_name"]
    # one CSV per representation of the page content
    files = {}
    for typ in ["html", "text", "canola"]:
        files[typ] = open("%s-%s-%s.csv" % (corpus, keyword, typ), "w")
        print >> files[typ], ",".join([k.encode("utf-8") for k in headers + [typ]])
    match = 0
    total = 0
    for page in mongo_pages_coll.find(query):
        total += 1
        if not total % 100:
            print match, "/", total
        body = page["body"].decode('zip')
        # cheap pre-filter on the raw bytes before decoding
        if keyword not in body:
            continue
        match += 1
        encoding = page.get("encoding", "")
        try:
            body = body.decode(encoding)
        except Exception:
            body = body.decode("UTF8", "replace")
            encoding = "UTF8-replace"
        we = hyphe_core.store.get_webentity_for_url_as_lru(page["lru"], corpus)
        try:
            assert we["code"] == "success"
            page["webentity_id"] = we["result"]["id"]
            page["webentity_name"] = we["result"]["name"]
        except Exception:
            print "WARNING! Could not resolve WebEntity for url %s" % page["url"]
        page["html"] = body
        page["text"] = textify(body, encoding=encoding)
        page["canola"] = textify(body, extractor="CanolaExtractor", encoding=encoding)
        for typ in ["html", "text", "canola"]:
            # only keep rows where the keyword survives extraction
            if keyword not in page[typ]:
                continue
            print >> files[typ], ",".join([format_for_csv(page.get(k, ""))
                                           for k in headers + [typ]])
    for typ in ["html", "text", "canola"]:
        files[typ].close()
    print 'FOUND %s pages matching "%s"' % (match, keyword)
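
# process_pages_matching_keyword() relies on a format_for_csv() helper that is
# not shown in these excerpts. A minimal sketch of what it presumably does
# (quote fields and encode to UTF-8 so rows stay valid CSV); the real
# implementation may differ:
def format_for_csv(value):
    if isinstance(value, unicode):
        value = value.encode("utf-8")
    return '"%s"' % str(value).replace('"', '""').replace("\n", " ")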

def process_page(page, we, corpus,
                 extractors=["Article", "ArticleSentences", "Default", "Canola"],
                 write_as_csv=False):
    body = page["body"].decode('zip')
    encoding = page.get("encoding", "")
    try:
        body = body.decode(encoding)
    except Exception:
        body = body.decode("UTF8", "replace")
        encoding = "UTF8-replace"
    result = {
        "url": page["url"],
        "webentity_id": we["id"],
        "webentity_name": we["name"],
        "html": body,
        "text": textify(body, encoding=encoding)
    }
    for method in extractors:
        result[method] = textify(body, extractor="%sExtractor" % method,
                                 encoding=encoding)
    if not write_as_csv:
        urlmd5 = md5(page["url"]).hexdigest()
        html = os.path.join("outputs", corpus, we["id"], "html", urlmd5)
        with open(html, "w") as f:
            f.write(result["html"].encode("utf-8"))
        text = os.path.join("outputs", corpus, we["id"], "text", urlmd5)
        with open(text, "w") as f:
            f.write(result["text"].encode("utf-8"))
        for method in extractors:
            cleantext = os.path.join("outputs", corpus, we["id"],
                                     "text%s" % method, urlmd5)
            with open(cleantext, "w") as f:
                f.write(result[method].encode("utf-8"))
    return result
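
# Example use of the write_as_csv variant above. A sketch only: dump_pages_csv
# and its arguments are illustrative, not part of the original code; field
# names follow the result dict built by process_page().
import csv

def dump_pages_csv(pages, we, corpus, outfile):
    fields = ["url", "webentity_id", "webentity_name", "text", "Canola"]
    with open(outfile, "w") as f:
        writer = csv.writer(f)
        writer.writerow(fields)
        for page in pages:
            result = process_page(page, we, corpus, write_as_csv=True)
            row = []
            for k in fields:
                v = result.get(k, "")
                if isinstance(v, unicode):
                    v = v.encode("utf-8")
                row.append(v)
            writer.writerow(row)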

def index_webentity(web_entity_pile, web_entity_done_pile, conf, mainlog):
    processlog = TimeElapsedLogging.create_log(str(os.getpid()),
                                               filename="logs/by_pid/%s.log" % os.getpid())
    processlog.info("starting infinite loop")
    corpus = conf['hyphe-core']['corpus_id']
    solr = sunburnt.SolrInterface("http://%s:%s/solr/%s" % (conf["solr"]['host'],
                                                            conf["solr"]['port'],
                                                            get_solr_instance_name(conf["solr"]['path'])))
    hyphe_core = jsonrpclib.Server('http://%s:%s' % (conf["hyphe-core"]["host"],
                                                     conf["hyphe-core"]["port"]), version=1)
    db = pymongo.MongoClient(conf['mongo']['host'], conf['mongo']['port'])
    collname = "%s.pages" % conf['hyphe-core']['corpus_id']
    coll = db[conf["mongo"]["db"]][collname]
    while True:
        we = web_entity_pile.get()
        # logging in proc log
        processlog.info("%s: starting processing" % we["name"])
        # setting LOG
        web_entity_name_safe = re.sub(r"[\W]", "", we['name'])
        web_entity_log_id = "%s_%s" % (web_entity_name_safe, we["id"])
        logfilename = "logs/by_web_entity/%s.log" % web_entity_log_id[:80]
        errors_solr_document_filename = "logs/errors_solr_document/%s.json" % web_entity_log_id[:80]
        welog = TimeElapsedLogging.create_log(we["id"], filename=logfilename)
        # getting web pages URLs
        welog.log(logging.INFO, "retrieving pages of web entity %s" % we["name"])
        #mainlog.info("DEBUG %s" % we["id"])
        try:
            web_pages = hyphe_core.store.get_webentity_pages(we["id"], True, corpus)
        except:
            # one retry on transient JSON-RPC failures
            web_pages = hyphe_core.store.get_webentity_pages(we["id"], True, corpus)
        if web_pages['code'] == 'fail':
            mainlog.info(web_pages['message'])
        welog.log(logging.INFO, "retrieved %s pages of web entity %s" % (len(web_pages["result"]), we["name"]))
        we["web_pages"] = web_pages["result"]
        processlog.info("%s: got %s webpages" % (we["name"], len(we["web_pages"])))
        # getting mongo HTML web pages
        urls = [page["url"] for page in we["web_pages"]]  # if page["http_status"] != 0
        nb_urls = len(urls)
        last_id = ""
        pages_mongo = []
        nb_pages_mongo = 0
        nb_pages_indexed = 0
        i = 0
        url_slice_len = 1000
        welog.info("retrieving + indexing HTML pages from mongo to solr for web entity %s" % we["name"])
        while i < len(urls):
            urls_slice = urls[i:i + url_slice_len]
            pages_mongo_slice = list(coll.find({
                "url": {"$in": urls_slice},
                "status": 200,
                "content_type": {"$in": accepted_content_types},
                "body": {"$exists": True}
            }, projection=["_id", "encoding", "url", "lru", "depth", "body"]))
            #mainlog.info(str(len(pages_mongo_slice)))
            # local counters
            nb_slice_mongo = len(pages_mongo_slice)
            nb_slice_indexed = 0
            welog.info("%s %s: got %s pages in slice %s %s" % (we["name"], we["id"], nb_slice_mongo, i, len(urls_slice)))
            error_solr_doc = []
            for page_mongo in pages_mongo_slice:
                body = page_mongo["body"].decode('zip')
                try:
                    body = body.decode(page_mongo.get("encoding", ""))
                    encoding = page_mongo.get("encoding", "")
                except Exception:
                    body = body.decode("UTF8", "replace")
                    encoding = "UTF8-replace"
                solr_document = {
                    "id": page_mongo["_id"],
                    "web_entity": we["name"],
                    "web_entity_id": we["id"],
                    "web_entity_status": we["status"],
                    "corpus": conf['hyphe-core']['corpus_id'],
                    "encoding": encoding,
                    "original_encoding": page_mongo.get("encoding", ""),
                    "url": page_mongo["url"],
                    "lru": page_mongo["lru"],
                    "depth": page_mongo["depth"],
                    #"html": body,
                    "text": html2text.textify(body, extractor="raw", encoding=encoding)
                    #"textCanola": html2text.textify(body, extractor="CanolaExtractor", encoding=encoding)
                }
                try:
                    solr.add(solr_document)
                    nb_slice_indexed += 1
                except Exception as e:
                    #mainlog.info("ERROR %s: %s %s" % (type(e), e, solr_document))
                    #welog.debug("Exception with document: %s %s %s" % (solr_document["id"], solr_document["url"], solr_document["encoding"]))
                    error_solr_doc.append({
                        "text": solr_document["text"],
                        "body": body,
                        "error": "%s: %s" % (type(e), e),
                        "url": solr_document["url"],
                        "encoding": solr_document["encoding"],
                        "original_encoding": solr_document["original_encoding"]
                    })
                    #import traceback
                    #traceback.print_exc()
            if len(error_solr_doc) > 0:
                with open(errors_solr_document_filename, "a") as errors_solr_document_json_file:
                    json.dump(error_solr_doc, errors_solr_document_json_file, indent=4)
            del error_solr_doc
            # log
            welog.info("%s %s: indexed %s pages" % (we["name"], we["id"], nb_slice_indexed))
            #processlog.info("indexed %s html pages for %s" % (nb_slice_indexed, we["name"]))
            # global counters
            nb_pages_mongo += nb_slice_mongo
            nb_pages_indexed += nb_slice_indexed
            i = i + url_slice_len
        del we["web_pages"]
        del web_pages
        del urls
        welog.log(logging.INFO, "'%s' indexed (%s of %s web pages)" % (we["name"], nb_pages_indexed, nb_pages_mongo))
        try:
            solr.commit()
        except Exception as e:
            mainlog.info("ERROR %s: %s" % (type(e), e))
            mainlog.info("Retrying...")
            try:
                solr.commit()
            except Exception as e:
                # relying on autocommit
                mainlog.info("STILL BROKEN, giving up on %s %s" % (we['id'], we['name']))
        #welog.info("inserts to solr committed")
        processlog.info("%s: indexed %s of %s HTML pages" % (we["name"], nb_pages_indexed, nb_pages_mongo))
        # adding we id to done list
        web_entity_done_pile.put(we["id"])
        del we
        web_entity_pile.task_done()

def index_webentity(web_entity_pile, web_entity_done_pile, conf, mainlog):
    processlog = TimeElapsedLogging.create_log(str(os.getpid()),
                                               filename="logs/by_pid/%s.log" % os.getpid())
    processlog.info("starting infinite loop")
    corpus = conf['hyphe-core']['corpus_id']
    solr = sunburnt.SolrInterface("http://%s:%s/solr/%s" % (conf["solr"]['host'],
                                                            conf["solr"]['port'],
                                                            get_solr_instance_name(conf["solr"]['path'])))
    hyphe_core = jsonrpclib.Server('http://%s:%s' % (conf["hyphe-core"]["host"],
                                                     conf["hyphe-core"]["port"]), version=1)
    db = pymongo.MongoClient(conf['mongo']['host'], conf['mongo']['port'])
    collname = "%s.pages" % conf['hyphe-core']['corpus_id']
    coll = db[conf["mongo"]["db"]][collname]
    while True:
        we = web_entity_pile.get()
        # logging in proc log
        processlog.info("%s: starting processing" % we["name"])
        # setting LOG
        web_entity_name_safe = re.sub(r"[\W]", "", we['name'])
        web_entity_log_id = "%s_%s" % (web_entity_name_safe, we["id"])
        logfilename = "logs/by_web_entity/%s.log" % web_entity_log_id
        errors_solr_document_filename = "logs/errors_solr_document/%s.json" % web_entity_log_id
        welog = TimeElapsedLogging.create_log(we["id"], filename=logfilename)
        # getting web pages URLs
        welog.log(logging.INFO, "retrieving pages of web entity %s" % we["name"])
        #mainlog.info("DEBUG %s" % we["id"])
        web_pages = hyphe_core.store.get_webentity_pages(we["id"], True, corpus)
        if web_pages['code'] == 'fail':
            mainlog.info(web_pages['message'])
        welog.log(logging.INFO, "retrieved %s pages of web entity %s" % (len(web_pages["result"]), we["name"]))
        we["web_pages"] = web_pages["result"]
        processlog.info("%s: got %s webpages" % (we["name"], len(we["web_pages"])))
        # getting mongo HTML web pages
        urls = [page["url"] for page in we["web_pages"]]  # if page["http_status"] != 0
        nb_urls = len(urls)
        last_id = ""
        pages_mongo = []
        nb_pages_mongo = 0
        nb_pages_indexed = 0
        i = 0
        url_slice_len = 1000
        welog.info("retrieving + indexing HTML pages from mongo to solr for web entity %s" % we["name"])
        while i < len(urls):
            urls_slice = urls[i:i + url_slice_len]
            pages_mongo_slice = list(coll.find({
                "url": {"$in": urls_slice},
                "status": 200,
                "content_type": {"$in": accepted_content_types},
                "body": {"$exists": True}
            }, projection=["_id", "encoding", "url", "lru", "depth", "body"]))
            #mainlog.info(str(len(pages_mongo_slice)))
            # local counters
            nb_slice_mongo = len(pages_mongo_slice)
            nb_slice_indexed = 0
            welog.info("%s %s: got %s pages in slice %s %s" % (we["name"], we["id"], nb_slice_mongo, i, len(urls_slice)))
            error_solr_doc = []
            for page_mongo in pages_mongo_slice:
                body = page_mongo["body"].decode('zip')
                try:
                    body = body.decode(page_mongo.get("encoding", ""))
                    encoding = page_mongo.get("encoding", "")
                except Exception:
                    body = body.decode("UTF8", "replace")
                    encoding = "UTF8-replace"
                solr_document = {
                    "id": page_mongo["_id"],
                    "web_entity": we["name"],
                    "web_entity_id": we["id"],
                    "web_entity_status": we["status"],
                    "corpus": conf['hyphe-core']['corpus_id'],
                    "encoding": encoding,
                    "original_encoding": page_mongo.get("encoding", ""),
                    "url": page_mongo["url"],
                    "lru": page_mongo["lru"],
                    "depth": page_mongo["depth"],
                    #"html": body,
                    # NB: encoding passed by keyword; positionally it would land
                    # in the extractor slot (cf. the variant above)
                    "text": html2text.textify(body, encoding=encoding)
                }
                try:
                    solr.add(solr_document)
                    nb_slice_indexed += 1
                except Exception as e:
                    #mainlog.info("ERROR %s: %s %s" % (type(e), e, solr_document))
                    #welog.debug("Exception with document: %s %s %s" % (solr_document["id"], solr_document["url"], solr_document["encoding"]))
                    error_solr_doc.append({
                        "error": "%s: %s" % (type(e), e),
                        "url": solr_document["url"],
                        "encoding": solr_document["encoding"],
                        "original_encoding": solr_document["original_encoding"]
                    })
                    #import traceback
                    #traceback.print_exc()
            if len(error_solr_doc) > 0:
                with open(errors_solr_document_filename, "a") as errors_solr_document_json_file:
                    json.dump(error_solr_doc, errors_solr_document_json_file, indent=4)
            del error_solr_doc
            # log
            welog.info("%s %s: indexed %s pages" % (we["name"], we["id"], nb_slice_indexed))
            #processlog.info("indexed %s html pages for %s" % (nb_slice_indexed, we["name"]))
            # global counters
            nb_pages_mongo += nb_slice_mongo
            nb_pages_indexed += nb_slice_indexed
            i = i + url_slice_len
        del we["web_pages"]
        del web_pages
        del urls
        welog.log(logging.INFO, "'%s' indexed (%s of %s web pages)" % (we["name"], nb_pages_indexed, nb_pages_mongo))
        try:
            solr.commit()
        except Exception as e:
            mainlog.info("ERROR %s: %s" % (type(e), e))
            mainlog.info("Retrying...")
            try:
                solr.commit()
            except Exception as e:
                # relying on autocommit
                mainlog.info("STILL BROKEN, giving up on %s %s" % (we['id'], we['name']))
        #welog.info("inserts to solr committed")
        processlog.info("%s: indexed %s of %s HTML pages" % (we["name"], nb_pages_indexed, nb_pages_mongo))
        # adding we id to done list
        web_entity_done_pile.put(we["id"])
        del we
        web_entity_pile.task_done()

def index_webentity(web_entity_pile, web_entity_done_pile, hyphe_core, coll, solr):
    processlog = TimeElapsedLogging.create_log(str(os.getpid()),
                                               filename="logs/by_pid/%s.log" % os.getpid())
    processlog.info("starting infinite loop")
    #hyphe_core = jsonrpclib.Server(hyphe_core_url)
    while True:
        we = web_entity_pile.get()
        # logging in proc log
        processlog.info("%s: starting processing" % we["name"])
        # setting LOG
        web_entity_name_safe = re.sub(r"[\W]", "", we['name'])
        web_entity_log_id = "%s_%s" % (web_entity_name_safe, we["id"])
        logfilename = "logs/by_web_entity/%s.log" % web_entity_log_id
        errors_solr_document_filename = "logs/errors_solr_document/%s.json" % web_entity_log_id
        welog = TimeElapsedLogging.create_log(we["id"], filename=logfilename)
        # getting web pages URLs
        welog.log(logging.INFO, "retrieving pages of web entity %s" % we["name"])
        web_pages = hyphe_core.store.get_webentity_pages(we["id"])
        welog.log(logging.INFO, "retrieved %s pages of web entity %s" % (len(web_pages["result"]), we["name"]))
        we["web_pages"] = web_pages["result"]
        processlog.info("%s: got %s webpages" % (we["name"], len(we["web_pages"])))
        # getting mongo HTML web pages
        urls = [page["url"] for page in we["web_pages"]]  # if page["http_status"] != 0
        nb_urls = len(urls)
        last_id = ""
        pages_mongo = []
        nb_pages_mongo = 0
        nb_pages_indexed = 0
        i = 0
        url_slice_len = 1000
        welog.info("retrieving + indexing HTML pages from mongo to solr for web entity %s" % we["name"])
        while i < len(urls):
            urls_slice = urls[i:i + url_slice_len]
            # NB: fields= is the pymongo 2.x spelling of what later versions call projection=
            pages_mongo_slice = coll.find({
                "url": {"$in": urls_slice},
                "content_type": {"$in": accepted_content_types},
                "body": {"$exists": True}
            }, fields=["_id", "encoding", "url", "lru", "depth", "body"])
            # local counters
            nb_slice_mongo = pages_mongo_slice.count()
            nb_slice_indexed = 0
            welog.info("%s %s: got %s pages in slice %s %s" % (we["name"], we["id"], nb_slice_mongo, i, len(urls_slice)))
            error_solr_doc = []
            for page_mongo in pages_mongo_slice:
                body = page_mongo["body"].decode('zip')
                try:
                    body = body.decode(page_mongo.get("encoding", ""))
                    encoding = page_mongo.get("encoding", "")
                except Exception:
                    body = body.decode("UTF8", "replace")
                    encoding = "UTF8-replace"
                solr_document = {
                    "id": page_mongo["_id"],
                    "web_entity": we["name"],
                    "web_entity_id": we["id"],
                    "web_entity_status": we["status"],
                    "corpus": "hyphe",
                    "encoding": encoding,
                    "original_encoding": page_mongo.get("encoding", ""),
                    "url": page_mongo["url"],
                    "lru": page_mongo["lru"],
                    "depth": page_mongo["depth"],
                    "html": body,
                    "text": html2text.textify(body)
                }
                try:
                    solr.add(solr_document)
                    nb_slice_indexed += 1
                except Exception as e:
                    #welog.debug("Exception with document: %s %s %s" % (solr_document["id"], solr_document["url"], solr_document["encoding"]))
                    error_solr_doc.append({
                        "error": "%s: %s" % (type(e), e),
                        "url": solr_document["url"],
                        "encoding": solr_document["encoding"],
                        "original_encoding": solr_document["original_encoding"]
                    })
            if len(error_solr_doc) > 0:
                with open(errors_solr_document_filename, "a") as errors_solr_document_json_file:
                    json.dump(error_solr_doc, errors_solr_document_json_file, indent=4)
            del error_solr_doc
            # log
            welog.info("%s %s: indexed %s pages" % (we["name"], we["id"], nb_slice_indexed))
            #processlog.info("indexed %s html pages for %s" % (nb_slice_indexed, we["name"]))
            # global counters
            nb_pages_mongo += nb_slice_mongo
            nb_pages_indexed += nb_slice_indexed
            i = i + url_slice_len
        del we["web_pages"]
        del web_pages
        del urls
        welog.log(logging.INFO, "'%s' indexed (%s of %s web pages)" % (we["name"], nb_pages_indexed, nb_pages_mongo))
        #solr.commit()  # relying on autocommit
        #welog.info("inserts to solr committed")
        processlog.info("%s: indexed %s of %s HTML pages" % (we["name"], nb_pages_indexed, nb_pages_mongo))
        # adding we id to done list
        web_entity_done_pile.put(we["id"])
        del we
        web_entity_pile.task_done()
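
# accepted_content_types is a module-level constant used by every
# index_webentity variant but not defined in these excerpts. Presumably it
# mirrors the content_types default of process_pages_matching_keyword:
accepted_content_types = ["text/plain", "text/html"]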

def index_webentity(web_entity_pile, web_entity_done_pile, hyphe_core, coll, solr, corpus, tags):
    processlog = TimeElapsedLogging.create_log(str(os.getpid()),
                                               filename="logs/by_pid/%s.log" % os.getpid())
    processlog.info("starting infinite loop")
    while True:
        we = web_entity_pile.get()
        # get the tags of the web entity
        tagsWE = tags[we["id"]]
        # logging in proc log
        processlog.info("%s: starting processing" % we["name"])
        # setting LOG
        web_entity_name_safe = re.sub(r"[\W]", "", we['name'])
        web_entity_log_id = "%s_%s" % (web_entity_name_safe, we["id"])
        logfilename = "logs/by_web_entity/%s.log" % web_entity_log_id
        errors_solr_document_filename = "logs/errors_solr_document/%s.json" % web_entity_log_id
        welog = TimeElapsedLogging.create_log(we["id"], filename=logfilename)
        # getting web pages URLs
        welog.log(logging.INFO, "retrieving pages of web entity %s" % we["name"])
        # NB: mainlog is assumed to be a module-level logger in this variant
        mainlog.info("DEBUG %s" % we["id"])
        web_pages = hyphe_core.store.get_webentity_pages(we["id"], True, corpus)
        if web_pages['code'] == 'fail':
            mainlog.info(web_pages['message'])
        welog.log(logging.INFO, "retrieved %s pages of web entity %s" % (len(web_pages["result"]), we["name"]))
        we["web_pages"] = web_pages["result"]
        processlog.info("%s: got %s webpages" % (we["name"], len(we["web_pages"])))
        # getting mongo HTML web pages
        urls = [page["url"] for page in we["web_pages"]]  # if page["http_status"] != 0
        nb_urls = len(urls)
        last_id = ""
        pages_mongo = []
        nb_pages_mongo = 0
        nb_pages_indexed = 0
        i = 0
        url_slice_len = 1000
        welog.info("retrieving + indexing HTML pages from mongo to solr for web entity %s" % we["name"])
        while i < len(urls):
            urls_slice = urls[i:i + url_slice_len]
            pages_mongo_slice = coll.find({
                "url": {"$in": urls_slice},
                "status": 200,
                "content_type": {"$in": accepted_content_types},
                "body": {"$exists": True}
            }, fields=["_id", "encoding", "url", "lru", "depth", "body"])
            #mainlog.info(str(len(list(pages_mongo_slice))))
            # local counters
            nb_slice_mongo = pages_mongo_slice.count()
            nb_slice_indexed = 0
            welog.info("%s %s: got %s pages in slice %s %s" % (we["name"], we["id"], nb_slice_mongo, i, len(urls_slice)))
            error_solr_doc = []
            for page_mongo in pages_mongo_slice:
                body = page_mongo["body"].decode('zip')
                try:
                    body = body.decode(page_mongo.get("encoding", ""))
                    encoding = page_mongo.get("encoding", "")
                except Exception:
                    body = body.decode("UTF8", "replace")
                    encoding = "UTF8-replace"
                solr_document = {
                    "id": page_mongo["_id"],
                    "web_entity": we["name"],
                    "web_entity_id": we["id"],
                    "web_entity_status": we["status"],
                    # was conf['hyphe-core']['corpus_id'], but conf is not in scope here
                    "corpus": corpus,
                    "encoding": encoding,
                    "original_encoding": page_mongo.get("encoding", ""),
                    "url": page_mongo["url"],
                    "lru": page_mongo["lru"],
                    "depth": page_mongo["depth"],
                    "html": body,
                    "text": html2text.textify(body),
                    "actors_type": tagsWE["ACTORS_TYPE"],
                    "country": tagsWE["COUNTRY"],
                    "anthropogenic": tagsWE["ANTHROPOGENIC_CLIMATE_CHANGE"],
                    "mitigation_adaptation": tagsWE["MITIGATION_ADAPTATION"],
                    "industrial_delegation": tagsWE["INDUSTRIAL_DELEGATION"],
                    "thematic_delegation": tagsWE["THEMATIC_DELEGATION"],
                    "language": tagsWE["LANGUAGE"]
                }
                try:
                    solr.add(solr_document)
                    nb_slice_indexed += 1
                except Exception as e:
                    #mainlog.info("ERROR %s: %s %s" % (type(e), e, solr_document))
                    #welog.debug("Exception with document: %s %s %s" % (solr_document["id"], solr_document["url"], solr_document["encoding"]))
                    error_solr_doc.append({
                        "text": solr_document["text"],
                        "body": solr_document["html"],
                        "error": "%s: %s" % (type(e), e),
                        "url": solr_document["url"],
                        "encoding": solr_document["encoding"],
                        "original_encoding": solr_document["original_encoding"]
                    })
                    #import traceback
                    #traceback.print_exc()
            if len(error_solr_doc) > 0:
                with open(errors_solr_document_filename, "a") as errors_solr_document_json_file:
                    json.dump(error_solr_doc, errors_solr_document_json_file, indent=4)
            del error_solr_doc
            # log
            welog.info("%s %s: indexed %s pages" % (we["name"], we["id"], nb_slice_indexed))
            #processlog.info("indexed %s html pages for %s" % (nb_slice_indexed, we["name"]))
            # global counters
            nb_pages_mongo += nb_slice_mongo
            nb_pages_indexed += nb_slice_indexed
            i = i + url_slice_len
        del we["web_pages"]
        del web_pages
        del urls
        welog.log(logging.INFO, "'%s' indexed (%s of %s web pages)" % (we["name"], nb_pages_indexed, nb_pages_mongo))
        #solr.commit()  # relying on autocommit
        #welog.info("inserts to solr committed")
        processlog.info("%s: indexed %s of %s HTML pages" % (we["name"], nb_pages_indexed, nb_pages_mongo))
        # adding we id to done list
        web_entity_done_pile.put(we["id"])
        del we
        web_entity_pile.task_done()
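
# The tags argument of this last variant is expected to map each web entity id
# to its corpus annotations. An illustrative shape only (keys taken from the
# solr_document above; id and values are made up):
example_tags = {
    "WE_ID_1": {
        "ACTORS_TYPE": "NGO",
        "COUNTRY": "France",
        "ANTHROPOGENIC_CLIMATE_CHANGE": "yes",
        "MITIGATION_ADAPTATION": "mitigation",
        "INDUSTRIAL_DELEGATION": "no",
        "THEMATIC_DELEGATION": "no",
        "LANGUAGE": "fr",
    }
}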