# Imports assumed by the snippets below (Python 2 codebase; textify is
# presumed to come from the project's html2text helper used further down):
import os
from hashlib import md5
from html2text import textify


def process_page(page, wename, corpus,
                 extractors=["Article", "ArticleSentences", "Default", "Canola"]):
    urlmd5 = md5(page["url"]).hexdigest()
    # page bodies are stored zip-compressed in Mongo
    body = page["body"].decode('zip')
    encoding = page.get("encoding", "")
    try:
        body = body.decode(encoding)
    except Exception:
        # fall back to lossy UTF-8 when the declared encoding fails
        body = body.decode("UTF8", "replace")
        encoding = "UTF8-replace"
    # dump the raw HTML
    html = os.path.join("outputs", corpus, wename, "html", urlmd5)
    with open(html, "w") as f:
        f.write(body.encode("utf-8"))
    # dump the default text extraction
    text = os.path.join("outputs", corpus, wename, "text", urlmd5)
    with open(text, "w") as f:
        f.write(textify(body, encoding=encoding).encode("utf-8"))
    # dump one text file per boilerplate-removal extractor
    for method in extractors:
        cleantext = os.path.join("outputs", corpus, wename,
                                 "text%s" % method, urlmd5)
        with open(cleantext, "w") as f:
            f.write(textify(body, extractor="%sExtractor" % method,
                            encoding=encoding).encode("utf-8"))
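
# process_page() assumes the per-corpus output tree already exists. A minimal
# setup sketch (hypothetical helper, not part of the original code), following
# the same "outputs/<corpus>/<wename>/<kind>" layout as above:
def prepare_output_dirs(corpus, wename,
                        extractors=["Article", "ArticleSentences", "Default", "Canola"]):
    for kind in ["html", "text"] + ["text%s" % m for m in extractors]:
        path = os.path.join("outputs", corpus, wename, kind)
        if not os.path.isdir(path):
            os.makedirs(path)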

def process_pages_matching_keyword(hyphe_core, mongo_pages_coll, corpus, keyword,
                                   content_types=["text/plain", "text/html"]):
    query = {
        "status": 200,
        "content_type": {"$in": content_types},
        "body": {"$exists": True}
    }
    print "TOTAL valid pages:", mongo_pages_coll.count(query)
    headers = ["url", "webentity_id", "webentity_name"]
    # one CSV per representation of the page content
    files = {}
    for typ in ["html", "text", "canola"]:
        files[typ] = open("%s-%s-%s.csv" % (corpus, keyword, typ), "w")
        print >> files[typ], ",".join([k.encode("utf-8") for k in headers + [typ]])
    match = 0
    total = 0
    for page in mongo_pages_coll.find(query):
        total += 1
        if not total % 100:
            print match, "/", total
        body = page["body"].decode('zip')
        # cheap pre-filter on the raw bytes before decoding
        if keyword not in body:
            continue
        match += 1
        encoding = page.get("encoding", "")
        try:
            body = body.decode(encoding)
        except Exception:
            body = body.decode("UTF8", "replace")
            encoding = "UTF8-replace"
        we = hyphe_core.store.get_webentity_for_url_as_lru(page["lru"], corpus)
        try:
            assert we["code"] == "success"
            page["webentity_id"] = we["result"]["id"]
            page["webentity_name"] = we["result"]["name"]
        except Exception:
            print "WARNING! Could not resolve WebEntity for url %s" % page["url"]
        page["html"] = body
        page["text"] = textify(body, encoding=encoding)
        page["canola"] = textify(body, extractor="CanolaExtractor", encoding=encoding)
        for typ in ["html", "text", "canola"]:
            # only keep rows where the keyword survives extraction
            if keyword not in page[typ]:
                continue
            print >> files[typ], ",".join([format_for_csv(page.get(k, ""))
                                           for k in headers + [typ]])
    for typ in ["html", "text", "canola"]:
        files[typ].close()
    print 'FOUND %s pages matching "%s"' % (match, keyword)
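
# process_pages_matching_keyword() relies on a format_for_csv() helper that is
# not shown in these excerpts. A minimal sketch of what it presumably does
# (quote fields and encode to UTF-8 so rows stay valid CSV); the real
# implementation may differ:
def format_for_csv(value):
    if isinstance(value, unicode):
        value = value.encode("utf-8")
    return '"%s"' % str(value).replace('"', '""').replace("\n", " ")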

def process_page(page, we, corpus,
                 extractors=["Article", "ArticleSentences", "Default", "Canola"],
                 write_as_csv=False):
    body = page["body"].decode('zip')
    encoding = page.get("encoding", "")
    try:
        body = body.decode(encoding)
    except Exception:
        body = body.decode("UTF8", "replace")
        encoding = "UTF8-replace"
    result = {
        "url": page["url"],
        "webentity_id": we["id"],
        "webentity_name": we["name"],
        "html": body,
        "text": textify(body, encoding=encoding)
    }
    for method in extractors:
        result[method] = textify(body, extractor="%sExtractor" % method,
                                 encoding=encoding)
    if not write_as_csv:
        urlmd5 = md5(page["url"]).hexdigest()
        html = os.path.join("outputs", corpus, we["id"], "html", urlmd5)
        with open(html, "w") as f:
            f.write(result["html"].encode("utf-8"))
        text = os.path.join("outputs", corpus, we["id"], "text", urlmd5)
        with open(text, "w") as f:
            f.write(result["text"].encode("utf-8"))
        for method in extractors:
            cleantext = os.path.join("outputs", corpus, we["id"],
                                     "text%s" % method, urlmd5)
            with open(cleantext, "w") as f:
                f.write(result[method].encode("utf-8"))
    return result
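
# Example use of the write_as_csv variant above. A sketch only: dump_pages_csv
# and its arguments are illustrative, not part of the original code; field
# names follow the result dict built by process_page().
import csv

def dump_pages_csv(pages, we, corpus, outfile):
    fields = ["url", "webentity_id", "webentity_name", "text", "Canola"]
    with open(outfile, "w") as f:
        writer = csv.writer(f)
        writer.writerow(fields)
        for page in pages:
            result = process_page(page, we, corpus, write_as_csv=True)
            row = []
            for k in fields:
                v = result.get(k, "")
                if isinstance(v, unicode):
                    v = v.encode("utf-8")
                row.append(v)
            writer.writerow(row)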

def index_webentity(web_entity_pile, web_entity_done_pile, conf, mainlog):
    processlog = TimeElapsedLogging.create_log(str(os.getpid()),
                                               filename="logs/by_pid/%s.log" % os.getpid())
    processlog.info("starting infinite loop")
    corpus = conf['hyphe-core']['corpus_id']
    solr = sunburnt.SolrInterface("http://%s:%s/solr/%s" % (conf["solr"]['host'],
                                                            conf["solr"]['port'],
                                                            get_solr_instance_name(conf["solr"]['path'])))
    hyphe_core = jsonrpclib.Server('http://%s:%s' % (conf["hyphe-core"]["host"],
                                                     conf["hyphe-core"]["port"]), version=1)
    db = pymongo.MongoClient(conf['mongo']['host'], conf['mongo']['port'])
    collname = "%s.pages" % conf['hyphe-core']['corpus_id']
    coll = db[conf["mongo"]["db"]][collname]
    while True:
        we = web_entity_pile.get()
        # logging in proc log
        processlog.info("%s: starting processing" % we["name"])
        # setting LOG
        web_entity_name_safe = re.sub(r"[\W]", "", we['name'])
        web_entity_log_id = "%s_%s" % (web_entity_name_safe, we["id"])
        logfilename = "logs/by_web_entity/%s.log" % web_entity_log_id[:80]
        errors_solr_document_filename = "logs/errors_solr_document/%s.json" % web_entity_log_id[:80]
        welog = TimeElapsedLogging.create_log(we["id"], filename=logfilename)
        # getting web pages URLs
        welog.log(logging.INFO, "retrieving pages of web entity %s" % we["name"])
        #mainlog.info("DEBUG %s" % we["id"])
        try:
            web_pages = hyphe_core.store.get_webentity_pages(we["id"], True, corpus)
        except:
            # one retry on transient JSON-RPC failures
            web_pages = hyphe_core.store.get_webentity_pages(we["id"], True, corpus)
        if web_pages['code'] == 'fail':
            mainlog.info(web_pages['message'])
        welog.log(logging.INFO, "retrieved %s pages of web entity %s" % (len(web_pages["result"]), we["name"]))
        we["web_pages"] = web_pages["result"]
        processlog.info("%s: got %s webpages" % (we["name"], len(we["web_pages"])))
        # getting mongo HTML web pages
        urls = [page["url"] for page in we["web_pages"]]  # if page["http_status"] != 0
        nb_urls = len(urls)
        last_id = ""
        pages_mongo = []
        nb_pages_mongo = 0
        nb_pages_indexed = 0
        i = 0
        url_slice_len = 1000
        welog.info("retrieving + indexing HTML pages from mongo to solr for web entity %s" % we["name"])
        while i < len(urls):
            urls_slice = urls[i:i + url_slice_len]
            pages_mongo_slice = list(coll.find({
                "url": {"$in": urls_slice},
                "status": 200,
                "content_type": {"$in": accepted_content_types},
                "body": {"$exists": True}
            }, projection=["_id", "encoding", "url", "lru", "depth", "body"]))
            #mainlog.info(str(len(pages_mongo_slice)))
            # local counters
            nb_slice_mongo = len(pages_mongo_slice)
            nb_slice_indexed = 0
            welog.info("%s %s: got %s pages in slice %s %s" % (we["name"], we["id"], nb_slice_mongo, i, len(urls_slice)))
            error_solr_doc = []
            for page_mongo in pages_mongo_slice:
                body = page_mongo["body"].decode('zip')
                try:
                    body = body.decode(page_mongo.get("encoding", ""))
                    encoding = page_mongo.get("encoding", "")
                except Exception:
                    body = body.decode("UTF8", "replace")
                    encoding = "UTF8-replace"
                solr_document = {
                    "id": page_mongo["_id"],
                    "web_entity": we["name"],
                    "web_entity_id": we["id"],
                    "web_entity_status": we["status"],
                    "corpus": conf['hyphe-core']['corpus_id'],
                    "encoding": encoding,
                    "original_encoding": page_mongo.get("encoding", ""),
                    "url": page_mongo["url"],
                    "lru": page_mongo["lru"],
                    "depth": page_mongo["depth"],
                    #"html": body,
                    "text": html2text.textify(body, extractor="raw", encoding=encoding)
                    #"textCanola": html2text.textify(body, extractor="CanolaExtractor", encoding=encoding)
                }
                try:
                    solr.add(solr_document)
                    nb_slice_indexed += 1
                except Exception as e:
                    #mainlog.info("ERROR %s: %s %s" % (type(e), e, solr_document))
                    #welog.debug("Exception with document: %s %s %s" % (solr_document["id"], solr_document["url"], solr_document["encoding"]))
                    error_solr_doc.append({
                        "text": solr_document["text"],
                        "body": body,
                        "error": "%s: %s" % (type(e), e),
                        "url": solr_document["url"],
                        "encoding": solr_document["encoding"],
                        "original_encoding": solr_document["original_encoding"]
                    })
                    #import traceback
                    #traceback.print_exc()
            if len(error_solr_doc) > 0:
                with open(errors_solr_document_filename, "a") as errors_solr_document_json_file:
                    json.dump(error_solr_doc, errors_solr_document_json_file, indent=4)
            del error_solr_doc
            # log
            welog.info("%s %s: indexed %s pages" % (we["name"], we["id"], nb_slice_indexed))
            #processlog.info("indexed %s html pages for %s" % (nb_slice_indexed, we["name"]))
            # global counters
            nb_pages_mongo += nb_slice_mongo
            nb_pages_indexed += nb_slice_indexed
            i = i + url_slice_len
        del we["web_pages"]
        del web_pages
        del urls
        welog.log(logging.INFO, "'%s' indexed (%s of %s web pages)" % (we["name"], nb_pages_indexed, nb_pages_mongo))
        try:
            solr.commit()
        except Exception as e:
            mainlog.info("ERROR %s: %s" % (type(e), e))
            mainlog.info("Retrying...")
            try:
                solr.commit()
            except Exception as e:
                # relying on autocommit
                mainlog.info("STILL BROKEN, giving up on %s %s" % (we['id'], we['name']))
        #welog.info("inserts to solr committed")
        processlog.info("%s: indexed %s of %s HTML pages" % (we["name"], nb_pages_indexed, nb_pages_mongo))
        # adding we id to done list
        web_entity_done_pile.put(we["id"])
        del we
        web_entity_pile.task_done()

def index_webentity(web_entity_pile, web_entity_done_pile, conf, mainlog):
    processlog = TimeElapsedLogging.create_log(str(os.getpid()),
                                               filename="logs/by_pid/%s.log" % os.getpid())
    processlog.info("starting infinite loop")
    corpus = conf['hyphe-core']['corpus_id']
    solr = sunburnt.SolrInterface("http://%s:%s/solr/%s" % (conf["solr"]['host'],
                                                            conf["solr"]['port'],
                                                            get_solr_instance_name(conf["solr"]['path'])))
    hyphe_core = jsonrpclib.Server('http://%s:%s' % (conf["hyphe-core"]["host"],
                                                     conf["hyphe-core"]["port"]), version=1)
    db = pymongo.MongoClient(conf['mongo']['host'], conf['mongo']['port'])
    collname = "%s.pages" % conf['hyphe-core']['corpus_id']
    coll = db[conf["mongo"]["db"]][collname]
    while True:
        we = web_entity_pile.get()
        # logging in proc log
        processlog.info("%s: starting processing" % we["name"])
        # setting LOG
        web_entity_name_safe = re.sub(r"[\W]", "", we['name'])
        web_entity_log_id = "%s_%s" % (web_entity_name_safe, we["id"])
        logfilename = "logs/by_web_entity/%s.log" % web_entity_log_id
        errors_solr_document_filename = "logs/errors_solr_document/%s.json" % web_entity_log_id
        welog = TimeElapsedLogging.create_log(we["id"], filename=logfilename)
        # getting web pages URLs
        welog.log(logging.INFO, "retrieving pages of web entity %s" % we["name"])
        #mainlog.info("DEBUG %s" % we["id"])
        web_pages = hyphe_core.store.get_webentity_pages(we["id"], True, corpus)
        if web_pages['code'] == 'fail':
            mainlog.info(web_pages['message'])
        welog.log(logging.INFO, "retrieved %s pages of web entity %s" % (len(web_pages["result"]), we["name"]))
        we["web_pages"] = web_pages["result"]
        processlog.info("%s: got %s webpages" % (we["name"], len(we["web_pages"])))
        # getting mongo HTML web pages
        urls = [page["url"] for page in we["web_pages"]]  # if page["http_status"] != 0
        nb_urls = len(urls)
        last_id = ""
        pages_mongo = []
        nb_pages_mongo = 0
        nb_pages_indexed = 0
        i = 0
        url_slice_len = 1000
        welog.info("retrieving + indexing HTML pages from mongo to solr for web entity %s" % we["name"])
        while i < len(urls):
            urls_slice = urls[i:i + url_slice_len]
            pages_mongo_slice = list(coll.find({
                "url": {"$in": urls_slice},
                "status": 200,
                "content_type": {"$in": accepted_content_types},
                "body": {"$exists": True}
            }, projection=["_id", "encoding", "url", "lru", "depth", "body"]))
            #mainlog.info(str(len(pages_mongo_slice)))
            # local counters
            nb_slice_mongo = len(pages_mongo_slice)
            nb_slice_indexed = 0
            welog.info("%s %s: got %s pages in slice %s %s" % (we["name"], we["id"], nb_slice_mongo, i, len(urls_slice)))
            error_solr_doc = []
            for page_mongo in pages_mongo_slice:
                body = page_mongo["body"].decode('zip')
                try:
                    body = body.decode(page_mongo.get("encoding", ""))
                    encoding = page_mongo.get("encoding", "")
                except Exception:
                    body = body.decode("UTF8", "replace")
                    encoding = "UTF8-replace"
                solr_document = {
                    "id": page_mongo["_id"],
                    "web_entity": we["name"],
                    "web_entity_id": we["id"],
                    "web_entity_status": we["status"],
                    "corpus": conf['hyphe-core']['corpus_id'],
                    "encoding": encoding,
                    "original_encoding": page_mongo.get("encoding", ""),
                    "url": page_mongo["url"],
                    "lru": page_mongo["lru"],
                    "depth": page_mongo["depth"],
                    #"html": body,
                    # NB: encoding passed by keyword; positionally it would land
                    # in the extractor slot (cf. the variant above)
                    "text": html2text.textify(body, encoding=encoding)
                }
                try:
                    solr.add(solr_document)
                    nb_slice_indexed += 1
                except Exception as e:
                    #mainlog.info("ERROR %s: %s %s" % (type(e), e, solr_document))
                    #welog.debug("Exception with document: %s %s %s" % (solr_document["id"], solr_document["url"], solr_document["encoding"]))
                    error_solr_doc.append({
                        "error": "%s: %s" % (type(e), e),
                        "url": solr_document["url"],
                        "encoding": solr_document["encoding"],
                        "original_encoding": solr_document["original_encoding"]
                    })
                    #import traceback
                    #traceback.print_exc()
            if len(error_solr_doc) > 0:
                with open(errors_solr_document_filename, "a") as errors_solr_document_json_file:
                    json.dump(error_solr_doc, errors_solr_document_json_file, indent=4)
            del error_solr_doc
            # log
            welog.info("%s %s: indexed %s pages" % (we["name"], we["id"], nb_slice_indexed))
            #processlog.info("indexed %s html pages for %s" % (nb_slice_indexed, we["name"]))
            # global counters
            nb_pages_mongo += nb_slice_mongo
            nb_pages_indexed += nb_slice_indexed
            i = i + url_slice_len
        del we["web_pages"]
        del web_pages
        del urls
        welog.log(logging.INFO, "'%s' indexed (%s of %s web pages)" % (we["name"], nb_pages_indexed, nb_pages_mongo))
        try:
            solr.commit()
        except Exception as e:
            mainlog.info("ERROR %s: %s" % (type(e), e))
            mainlog.info("Retrying...")
            try:
                solr.commit()
            except Exception as e:
                # relying on autocommit
                mainlog.info("STILL BROKEN, giving up on %s %s" % (we['id'], we['name']))
        #welog.info("inserts to solr committed")
        processlog.info("%s: indexed %s of %s HTML pages" % (we["name"], nb_pages_indexed, nb_pages_mongo))
        # adding we id to done list
        web_entity_done_pile.put(we["id"])
        del we
        web_entity_pile.task_done()

def index_webentity(web_entity_pile, web_entity_done_pile, hyphe_core, coll, solr):
    processlog = TimeElapsedLogging.create_log(str(os.getpid()),
                                               filename="logs/by_pid/%s.log" % os.getpid())
    processlog.info("starting infinite loop")
    #hyphe_core = jsonrpclib.Server(hyphe_core_url)
    while True:
        we = web_entity_pile.get()
        # logging in proc log
        processlog.info("%s: starting processing" % we["name"])
        # setting LOG
        web_entity_name_safe = re.sub(r"[\W]", "", we['name'])
        web_entity_log_id = "%s_%s" % (web_entity_name_safe, we["id"])
        logfilename = "logs/by_web_entity/%s.log" % web_entity_log_id
        errors_solr_document_filename = "logs/errors_solr_document/%s.json" % web_entity_log_id
        welog = TimeElapsedLogging.create_log(we["id"], filename=logfilename)
        # getting web pages URLs
        welog.log(logging.INFO, "retrieving pages of web entity %s" % we["name"])
        web_pages = hyphe_core.store.get_webentity_pages(we["id"])
        welog.log(logging.INFO, "retrieved %s pages of web entity %s" % (len(web_pages["result"]), we["name"]))
        we["web_pages"] = web_pages["result"]
        processlog.info("%s: got %s webpages" % (we["name"], len(we["web_pages"])))
        # getting mongo HTML web pages
        urls = [page["url"] for page in we["web_pages"]]  # if page["http_status"] != 0
        nb_urls = len(urls)
        last_id = ""
        pages_mongo = []
        nb_pages_mongo = 0
        nb_pages_indexed = 0
        i = 0
        url_slice_len = 1000
        welog.info("retrieving + indexing HTML pages from mongo to solr for web entity %s" % we["name"])
        while i < len(urls):
            urls_slice = urls[i:i + url_slice_len]
            # NB: fields= is the pymongo 2.x spelling of what later versions call projection=
            pages_mongo_slice = coll.find({
                "url": {"$in": urls_slice},
                "content_type": {"$in": accepted_content_types},
                "body": {"$exists": True}
            }, fields=["_id", "encoding", "url", "lru", "depth", "body"])
            # local counters
            nb_slice_mongo = pages_mongo_slice.count()
            nb_slice_indexed = 0
            welog.info("%s %s: got %s pages in slice %s %s" % (we["name"], we["id"], nb_slice_mongo, i, len(urls_slice)))
            error_solr_doc = []
            for page_mongo in pages_mongo_slice:
                body = page_mongo["body"].decode('zip')
                try:
                    body = body.decode(page_mongo.get("encoding", ""))
                    encoding = page_mongo.get("encoding", "")
                except Exception:
                    body = body.decode("UTF8", "replace")
                    encoding = "UTF8-replace"
                solr_document = {
                    "id": page_mongo["_id"],
                    "web_entity": we["name"],
                    "web_entity_id": we["id"],
                    "web_entity_status": we["status"],
                    "corpus": "hyphe",
                    "encoding": encoding,
                    "original_encoding": page_mongo.get("encoding", ""),
                    "url": page_mongo["url"],
                    "lru": page_mongo["lru"],
                    "depth": page_mongo["depth"],
                    "html": body,
                    "text": html2text.textify(body)
                }
                try:
                    solr.add(solr_document)
                    nb_slice_indexed += 1
                except Exception as e:
                    #welog.debug("Exception with document: %s %s %s" % (solr_document["id"], solr_document["url"], solr_document["encoding"]))
                    error_solr_doc.append({
                        "error": "%s: %s" % (type(e), e),
                        "url": solr_document["url"],
                        "encoding": solr_document["encoding"],
                        "original_encoding": solr_document["original_encoding"]
                    })
            if len(error_solr_doc) > 0:
                with open(errors_solr_document_filename, "a") as errors_solr_document_json_file:
                    json.dump(error_solr_doc, errors_solr_document_json_file, indent=4)
            del error_solr_doc
            # log
            welog.info("%s %s: indexed %s pages" % (we["name"], we["id"], nb_slice_indexed))
            #processlog.info("indexed %s html pages for %s" % (nb_slice_indexed, we["name"]))
            # global counters
            nb_pages_mongo += nb_slice_mongo
            nb_pages_indexed += nb_slice_indexed
            i = i + url_slice_len
        del we["web_pages"]
        del web_pages
        del urls
        welog.log(logging.INFO, "'%s' indexed (%s of %s web pages)" % (we["name"], nb_pages_indexed, nb_pages_mongo))
        #solr.commit()  # relying on autocommit
        #welog.info("inserts to solr committed")
        processlog.info("%s: indexed %s of %s HTML pages" % (we["name"], nb_pages_indexed, nb_pages_mongo))
        # adding we id to done list
        web_entity_done_pile.put(we["id"])
        del we
        web_entity_pile.task_done()
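
# accepted_content_types is a module-level constant used by every
# index_webentity variant but not defined in these excerpts. Presumably it
# mirrors the content_types default of process_pages_matching_keyword:
accepted_content_types = ["text/plain", "text/html"]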

def index_webentity(web_entity_pile, web_entity_done_pile, hyphe_core, coll, solr, corpus, tags):
    processlog = TimeElapsedLogging.create_log(str(os.getpid()),
                                               filename="logs/by_pid/%s.log" % os.getpid())
    processlog.info("starting infinite loop")
    while True:
        we = web_entity_pile.get()
        # get the tags of the web entity
        tagsWE = tags[we["id"]]
        # logging in proc log
        processlog.info("%s: starting processing" % we["name"])
        # setting LOG
        web_entity_name_safe = re.sub(r"[\W]", "", we['name'])
        web_entity_log_id = "%s_%s" % (web_entity_name_safe, we["id"])
        logfilename = "logs/by_web_entity/%s.log" % web_entity_log_id
        errors_solr_document_filename = "logs/errors_solr_document/%s.json" % web_entity_log_id
        welog = TimeElapsedLogging.create_log(we["id"], filename=logfilename)
        # getting web pages URLs
        welog.log(logging.INFO, "retrieving pages of web entity %s" % we["name"])
        # NB: mainlog is assumed to be a module-level logger in this variant
        mainlog.info("DEBUG %s" % we["id"])
        web_pages = hyphe_core.store.get_webentity_pages(we["id"], True, corpus)
        if web_pages['code'] == 'fail':
            mainlog.info(web_pages['message'])
        welog.log(logging.INFO, "retrieved %s pages of web entity %s" % (len(web_pages["result"]), we["name"]))
        we["web_pages"] = web_pages["result"]
        processlog.info("%s: got %s webpages" % (we["name"], len(we["web_pages"])))
        # getting mongo HTML web pages
        urls = [page["url"] for page in we["web_pages"]]  # if page["http_status"] != 0
        nb_urls = len(urls)
        last_id = ""
        pages_mongo = []
        nb_pages_mongo = 0
        nb_pages_indexed = 0
        i = 0
        url_slice_len = 1000
        welog.info("retrieving + indexing HTML pages from mongo to solr for web entity %s" % we["name"])
        while i < len(urls):
            urls_slice = urls[i:i + url_slice_len]
            pages_mongo_slice = coll.find({
                "url": {"$in": urls_slice},
                "status": 200,
                "content_type": {"$in": accepted_content_types},
                "body": {"$exists": True}
            }, fields=["_id", "encoding", "url", "lru", "depth", "body"])
            #mainlog.info(str(len(list(pages_mongo_slice))))
            # local counters
            nb_slice_mongo = pages_mongo_slice.count()
            nb_slice_indexed = 0
            welog.info("%s %s: got %s pages in slice %s %s" % (we["name"], we["id"], nb_slice_mongo, i, len(urls_slice)))
            error_solr_doc = []
            for page_mongo in pages_mongo_slice:
                body = page_mongo["body"].decode('zip')
                try:
                    body = body.decode(page_mongo.get("encoding", ""))
                    encoding = page_mongo.get("encoding", "")
                except Exception:
                    body = body.decode("UTF8", "replace")
                    encoding = "UTF8-replace"
                solr_document = {
                    "id": page_mongo["_id"],
                    "web_entity": we["name"],
                    "web_entity_id": we["id"],
                    "web_entity_status": we["status"],
                    # was conf['hyphe-core']['corpus_id'], but conf is not in scope here
                    "corpus": corpus,
                    "encoding": encoding,
                    "original_encoding": page_mongo.get("encoding", ""),
                    "url": page_mongo["url"],
                    "lru": page_mongo["lru"],
                    "depth": page_mongo["depth"],
                    "html": body,
                    "text": html2text.textify(body),
                    "actors_type": tagsWE["ACTORS_TYPE"],
                    "country": tagsWE["COUNTRY"],
                    "anthropogenic": tagsWE["ANTHROPOGENIC_CLIMATE_CHANGE"],
                    "mitigation_adaptation": tagsWE["MITIGATION_ADAPTATION"],
                    "industrial_delegation": tagsWE["INDUSTRIAL_DELEGATION"],
                    "thematic_delegation": tagsWE["THEMATIC_DELEGATION"],
                    "language": tagsWE["LANGUAGE"]
                }
                try:
                    solr.add(solr_document)
                    nb_slice_indexed += 1
                except Exception as e:
                    #mainlog.info("ERROR %s: %s %s" % (type(e), e, solr_document))
                    #welog.debug("Exception with document: %s %s %s" % (solr_document["id"], solr_document["url"], solr_document["encoding"]))
                    error_solr_doc.append({
                        "text": solr_document["text"],
                        "body": solr_document["html"],
                        "error": "%s: %s" % (type(e), e),
                        "url": solr_document["url"],
                        "encoding": solr_document["encoding"],
                        "original_encoding": solr_document["original_encoding"]
                    })
                    #import traceback
                    #traceback.print_exc()
            if len(error_solr_doc) > 0:
                with open(errors_solr_document_filename, "a") as errors_solr_document_json_file:
                    json.dump(error_solr_doc, errors_solr_document_json_file, indent=4)
            del error_solr_doc
            # log
            welog.info("%s %s: indexed %s pages" % (we["name"], we["id"], nb_slice_indexed))
            #processlog.info("indexed %s html pages for %s" % (nb_slice_indexed, we["name"]))
            # global counters
            nb_pages_mongo += nb_slice_mongo
            nb_pages_indexed += nb_slice_indexed
            i = i + url_slice_len
        del we["web_pages"]
        del web_pages
        del urls
        welog.log(logging.INFO, "'%s' indexed (%s of %s web pages)" % (we["name"], nb_pages_indexed, nb_pages_mongo))
        #solr.commit()  # relying on autocommit
        #welog.info("inserts to solr committed")
        processlog.info("%s: indexed %s of %s HTML pages" % (we["name"], nb_pages_indexed, nb_pages_mongo))
        # adding we id to done list
        web_entity_done_pile.put(we["id"])
        del we
        web_entity_pile.task_done()
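
# The tags argument of this last variant is expected to map each web entity id
# to its corpus annotations. An illustrative shape only (keys taken from the
# solr_document above; id and values are made up):
example_tags = {
    "WE_ID_1": {
        "ACTORS_TYPE": "NGO",
        "COUNTRY": "France",
        "ANTHROPOGENIC_CLIMATE_CHANGE": "yes",
        "MITIGATION_ADAPTATION": "mitigation",
        "INDUSTRIAL_DELEGATION": "no",
        "THEMATIC_DELEGATION": "no",
        "LANGUAGE": "fr",
    }
}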