def readJson(jsonurl, readedPage, od, _encoding):
    '''#if httpResponse is a filepath
    jsonfile = (httpResponse.read()).decode('utf-8')
    '''
    #if httpResponse is already read into a string
    try:
        if _encoding is not None:
            if "utf" in _encoding.lower():
                _encoding = _encoding.upper()
            try:
                jsonfile = (readedPage.decode(_encoding)).strip()
            except:
                try:
                    jsonfile = (readedPage.decode(sys.stdout.encoding)).strip()
                except:
                    jsonfile = (readedPage.decode('latin-1')).strip()
                    pass
        else:
            try:
                jsonfile = (readedPage.decode(sys.stdout.encoding)).strip()
            except:
                jsonfile = (readedPage.decode('latin-1')).strip()
                pass
        dictnry = json.loads(jsonfile)
        readDictValues(jsonurl, dictnry, set(), od)
    except:
        comm.printException(comm.pathToSaveParsingErrors,
                            "read_json.py " + str(_encoding) + " " + jsonurl)
        pass
def readPlainText(htmlurl, plaintext, ontologyData, _encoding):
    try:
        try:
            punc = (plaintext.decode(_encoding)).strip()
        except:
            try:
                punc = (plaintext.decode(sys.stdout.encoding)).strip()
            except:
                try:
                    punc = (plaintext.decode('UTF-8')).strip()
                except:
                    try:
                        punc = (plaintext.decode('latin-1')).strip()
                    except:
                        try:
                            punc = (plaintext.decode('ISO-8859-1')).strip()
                        except:
                            try:
                                punc = (plaintext.decode()).strip()
                            except:
                                punc = plaintext
                                pass
        sentences = comm.replaceToPunkts(punc)
        for sentence in sentences:
            getEntities.getEntities(htmlurl, sentence, ontologyData)
    except:
        comm.printException(comm.pathToSaveParsingErrors,
                            "read_plaintext.py " + str(_encoding) + " " + htmlurl)
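# Note (sketch): the nested try/except ladder in readPlainText can also be written
# as a loop over candidate encodings. The helper below is illustrative only; the
# name decode_with_fallback and the candidate order are assumptions, not project code.
def decode_with_fallback(raw, preferred=None):
    """Try a list of encodings in order and return the first successful decode.

    Mirrors the cascade above: preferred encoding first, then the stdout
    encoding, UTF-8, latin-1/ISO-8859-1; if everything fails, return the raw
    bytes unchanged, as the original code does.
    """
    import sys
    candidates = [preferred, sys.stdout.encoding, 'UTF-8', 'latin-1', 'ISO-8859-1']
    for enc in candidates:
        if not enc:
            continue
        try:
            return raw.decode(enc).strip()
        except (UnicodeDecodeError, LookupError):
            continue
    return raw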
def sendFileToParser(contentType, baseUrl, redirectedTo, od, _encoding,
                     localFilename, pageread):
    contentType = contentType.lower()
    #Excel and PDF documents can be read only after downloading them locally
    if ("excel" in contentType) or ("pdf" in contentType):
        try:
            dirToSaveDownloads = comm.downloadsDir + baseUrl
            if not os.path.isdir(dirToSaveDownloads):
                os.makedirs(dirToSaveDownloads)
            fileparser.spreadURLsByContentType(
                redirectedTo, None, contentType, od, _encoding,
                filePath=(dirToSaveDownloads + "/" + localFilename))
        except:
            comm.printException(
                comm.pathToSaveProgrammingErrors,
                "create_dir_for_downloads_and_send_file_to_parser_" + str(baseUrl))
            pass
    else:
        try:
            fileparser.spreadURLsByContentType(redirectedTo, pageread,
                                               contentType, od, _encoding)
        except:
            comm.printException(comm.pathToSaveProgrammingErrors,
                                "send_file_to_parser")
            pass
def readExcel(filePath, url, ontologyData):
    try:
        urr(url, filePath)
        try:
            workbook = xlrd.open_workbook(filePath)
            worksheets = workbook.sheet_names()
            for worksheet_name in worksheets:
                worksheet = workbook.sheet_by_name(worksheet_name)
                num_rows = worksheet.nrows - 1
                num_cells = worksheet.ncols - 1
                curr_row = -1
                while curr_row < num_rows:
                    curr_row += 1
                    #row = worksheet.row(curr_row)
                    #print ('Row:', curr_row)
                    curr_cell = -1
                    while curr_cell < num_cells:
                        curr_cell += 1
                        # Cell Types: 0=Empty, 1=Text, 2=Number, 3=Date, 4=Boolean, 5=Error, 6=Blank
                        cell_type = worksheet.cell_type(curr_row, curr_cell)
                        cell_value = worksheet.cell_value(curr_row, curr_cell)
                        if cell_type == 1:
                            sentences = comm.replaceToPunkts(cell_value)
                            for sentence in sentences:
                                getEntities.getEntities(url, sentence, ontologyData)
        except:
            comm.printException(comm.pathToSaveParsingErrors, "read_excel.py " + url)
            pass
    except:
        comm.printException(comm.pathToSaveParsingErrors, "read_excel.py " + url)
        pass
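# Sketch: the while-loop indexing above (with its -1 offsets) visits rows 0..nrows-1;
# the same traversal can be written directly over xlrd's row/column counts. The helper
# name extract_text_cells is illustrative, not part of the project.
def extract_text_cells(filePath):
    """Yield the value of every text cell in every sheet of an Excel file."""
    import xlrd
    workbook = xlrd.open_workbook(filePath)
    for sheet in workbook.sheets():
        for row in range(sheet.nrows):
            for col in range(sheet.ncols):
                # xlrd.XL_CELL_TEXT == 1, the same check as cell_type == 1 above
                if sheet.cell_type(row, col) == xlrd.XL_CELL_TEXT:
                    yield sheet.cell_value(row, col)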
def insertValuesToDict(dictnry, localFilename, page_redirected, page_info,
                       page_sha224, page_status):
    try:
        dictnry[localFilename] = dict()
        dictnry[localFilename][page_sha224] = page_info
        dictnry[localFilename][page_sha224]["localFilename"] = localFilename
        dictnry[localFilename][page_sha224]["file_url"] = page_redirected
        dictnry[localFilename][page_sha224]["sha224"] = page_sha224
        dictnry[localFilename][page_sha224]["status"] = page_status
        dictnry[localFilename][page_sha224]["timeDir"] = comm.timeDir
        return dictnry
    except:
        comm.printException(comm.pathToSaveJsonErrors, "insertValuesToDict")
        pass
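# Usage sketch (all values below are made up; the real ones come from the HTTP response):
#
#   meta = insertValuesToDict(
#       {"base_url": "example.com"},      # dictnry, pre-filled with the host
#       "url-sha-hash",                   # localFilename: hash of the doc URL
#       "http://example.com/page",        # page_redirected: final URL after redirects
#       {"content-type": "text/html"},    # page_info: response headers
#       "content-sha224",                 # page_sha224: SHA-224 of the page content
#       200)                              # page_status: HTTP status code
#
#   meta["url-sha-hash"]["content-sha224"] then holds the headers plus the added
#   localFilename, file_url, sha224, status and timeDir fields.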
def readPdf(filePath, url, od):
    urldownl(url, filePath)
    pdf = PdfFileReader(open(filePath, "rb"))
    pdf.strict = True
    try:
        for page in pdf.pages:
            text = page.extractText()
            sentences = comm.replaceToPunkts(text)
            for sentence in sentences:
                getEntities.getEntities(url, sentence, od)
    except:
        comm.printException(comm.pathToSaveParsingErrors, "read_pdf.py " + url)
        pass
def addTriples(self, chunkedList, addLemmas=True):
    try:
        newDataExists = False
        g = self.getPerRdfGraph()
        g_new = Graph()
        #bind the specific namespace prefixes
        self.bindNamespaces(g)
        for andmed in chunkedList:
            for webpage in andmed:
                gName = andmed[webpage]["gName"]
                fName = andmed[webpage]["fName"]
                name = andmed[webpage]["name"]
                lemmaList = andmed[webpage]["lemmaSet"]
                #print (lemmaList)
                try:
                    #make the triples
                    newPerson = URIRef(self.perStr + name.replace(">", "")
                                       .replace("<", "").replace("|", "")
                                       .replace(" ", "_").lower())
                    newGivenName = Literal(gName)
                    newFamilyName = Literal(fName)
                    newPerName = Literal(name)
                    newWebpage = URIRef(webpage)
                    #add the triples
                    #check whether the graph already contains this person
                    if (newPerson, RDF.type, URIRef(self.person)) not in g:
                        newDataExists = True
                        g_new.add((newPerson, RDF.type, URIRef(self.person)))
                        if newGivenName != Literal(""):
                            g_new.add((newPerson, self.givenName, newGivenName))
                        if newFamilyName != Literal(""):
                            g_new.add((newPerson, self.familyName, newFamilyName))
                        g_new.add((newPerson, self.perName, newPerName))
                    #check whether the graph already contains this mention
                    if (newPerson, self.mentionedAtSite, newWebpage) not in g:
                        newDataExists = True
                        g_new.add((newPerson, self.mentionedAtSite, newWebpage))
                    #add the lemmas as well
                    if addLemmas:
                        for newLemma in lemmaList:
                            #check whether the graph already contains this lemma
                            if (newPerson, self.lemma, Literal(newLemma)) not in g:
                                newDataExists = True
                                g_new.add((newPerson, self.lemma, Literal(newLemma)))
                except:
                    comm.printException(comm.initRdfErrorsFilePath, "build_per_graph")
                    pass
        #print(str(newDataExists))
        #write the RDF into a file
        if newDataExists:
            try:
                gg = g + g_new
                gg.serialize(self.perRdf, format='pretty-xml', encoding='utf-8')
            except:
                comm.printException(comm.initRdfErrorsFilePath,
                                    "RDF People Manager serialization error: ")
                pass
    except:
        comm.printException(comm.initRdfErrorsFilePath,
                            "RDF People Manager (addTriples) error: ")
        pass
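# Sketch: the containment check before each add() is what keeps the serialized file
# free of duplicate statements. A self-contained illustration of that pattern with
# plain rdflib; the namespace, names and output file are illustrative, not the
# project's ontology.
def _rdfDuplicateCheckSketch():
    from rdflib import Graph, Literal, Namespace, URIRef
    from rdflib.namespace import RDF
    EX = Namespace("http://example.org/")        # illustrative namespace
    g = Graph()                                  # the already-loaded graph
    g_new = Graph()                              # only the statements to append
    person = URIRef(EX + "mari_maasikas")
    if (person, RDF.type, EX.Person) not in g:   # add only if not already known
        g_new.add((person, RDF.type, EX.Person))
        g_new.add((person, EX.name, Literal("Mari Maasikas")))
    if len(g_new):                               # serialize only when something is new
        (g + g_new).serialize("people_example.rdf", format='pretty-xml', encoding='utf-8')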
def readHtmlPage(htmlurl, readedPage, ontologyData, _encoding):
    try:
        sentences = set()
        root = parse(htmlurl).getroot()
        if root is not None:
            for element in root.iter("head"):
                element.drop_tree()
            for element in root.iter("script"):
                element.drop_tree()
            for element in root.iter("style"):
                element.drop_tree()
            for element in root.iter("noscript"):
                element.drop_tree()
            for element in root.iter("input"):
                element.drop_tree()
            for element in root.iter("form"):
                element.drop_tree()
            for element in root.iter("title"):
                element.drop_tree()
            for element in root.iter("img"):
                element.drop_tree()
            for element in root.iter("body"):
                try:
                    sentences.add(element.text_content())
                except:
                    pass
        if len(sentences) > 0:
            lsent = list(sentences)
            for lau in lsent:
                if lau != "":
                    laused = comm.replaceToPunkts(lau)
                    for s6ne in laused:
                        getEntities.getEntities(htmlurl, s6ne.strip(), ontologyData)
    except:
        comm.printException(comm.pathToSaveParsingErrors,
                            "read_html.py " + str(_encoding) + " " + htmlurl)
        pass
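# Note: readHtmlPage re-downloads the page by passing the URL to lxml's parse() and
# never uses the readedPage argument it receives. If reusing the already-fetched
# content were preferred, a sketch with lxml.html.document_fromstring could look like
# the helper below (an alternative idea, not the project's current behaviour).
def html_to_text(readedPage):
    """Parse already-downloaded HTML and return its visible text content."""
    import lxml.html
    root = lxml.html.document_fromstring(readedPage)
    # drop the same non-content elements that readHtmlPage removes
    for tag in ("head", "script", "style", "noscript", "input", "form", "title", "img"):
        for element in root.iter(tag):
            element.drop_tree()
    return " ".join(body.text_content() for body in root.iter("body"))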
def addTriples(self, chunkedList, addLemmas=True):
    try:
        newDataExists = False
        g = self.getLocRdfGraph()
        g_new = Graph()
        #bind the specific namespace prefixes
        self.bindNamespaces(g)
        for andmed in chunkedList:
            for webpage in andmed:
                for objName in andmed[webpage]:
                    lemmaList = andmed[webpage][objName]
                    #print (lemmaList)
                    try:
                        #make the triples
                        newLocation = URIRef(self.locStr + objName.replace(">", "")
                                             .replace("<", "").replace("|", "")
                                             .replace(" ", "_").lower())
                        newLocationName = Literal(objName)
                        newWebpage = URIRef(webpage)
                        #add the triples
                        #check whether the graph already contains this location
                        if (newLocation, RDF.type, URIRef(self.location)) not in g:
                            newDataExists = True
                            g_new.add((newLocation, RDF.type, URIRef(self.location)))
                            g_new.add((newLocation, self.locationName, newLocationName))
                            #g_new.add((newLocation, od.mentionedAtSite, newWebpage))
                        #check whether the graph already contains this mention
                        if (newLocation, self.mentionedAtSite, newWebpage) not in g:
                            newDataExists = True
                            g_new.add((newLocation, self.mentionedAtSite, newWebpage))
                        #add the lemmas as well
                        if addLemmas:
                            for newLemma in lemmaList:
                                #check whether the graph already contains this lemma
                                if (newLocation, self.lemma, Literal(newLemma)) not in g:
                                    newDataExists = True
                                    g_new.add((newLocation, self.lemma, Literal(newLemma)))
                    except:
                        comm.printException(comm.initRdfErrorsFilePath, "build_loc_graph")
                        pass
        #write the RDF into a file
        if newDataExists:
            try:
                gg = g + g_new
                gg.serialize(self.locRdf, format='pretty-xml', encoding='utf-8')
            except:
                comm.printException(comm.initRdfErrorsFilePath,
                                    "RDF Location Manager serialization error: ")
                pass
    except:
        comm.printException(comm.initRdfErrorsFilePath,
                            "RDF Location Manager (addTriples) error: ")
        pass
def saveMetadata(url, od):
    #save the result of trying to open the page into the variable 'canOpen'
    canOpen = True
    try:
        #try to open the document at the URL
        redirectedTo = requests.get(url).url
    except ConnectionError:
        #it was not possible to open this web document
        canOpen = False
        #save the exception (error of getting the web document)
        errStr = (url + " Cannot_open_web-source \n")
        comm.printException(comm.pathToConnectionErrors, errStr)
        #continue without terminating the program
        pass
    except:
        #it was not possible to open this web document
        canOpen = False
        #save the exception (error of getting the web document)
        errStr = (url + " Cannot_open_web-source \n")
        comm.printException(comm.pathToSaveJsonErrors, errStr)
        #continue without terminating the program
        pass
    #continue only if 'canOpen' is still true
    if canOpen is True:
        #continue only if the url is valid
        isValidUrl = valide.isNeededUrl(redirectedTo)
        if isValidUrl:
            #print("can open: " + str(canOpen))
            if not os.path.isdir(comm.jsonsDir):
                os.makedirs(comm.jsonsDir)
            try:
                #from here on, use only the URL one was redirected to, if at all
                page = requests.get(redirectedTo)
                statusCode = page.status_code
                #textual content of the doc
                pageread = page.text
                #get the doc's metadata
                pageInfo = dict(page.headers)
                #generate a filename for local storage:
                #it will be the hash of the doc URL
                localFilename = comm.getUrlSHA(redirectedTo)
                #important metadata: content type
                contentType = page.headers['content-type']
                isValid_contentType = valide.isValideType(contentType)
                #base_url denotes the host name;
                #all documents from the same host are saved into the same json-file,
                #so base_url also becomes the json-file name (pathToSaveMetadata)
                baseUrl = None
                if isValid_contentType:
                    baseUrl = (urlparse(redirectedTo)).netloc
                if baseUrl is not None:
                    #generate a hash of the doc's content.
                    #this hash is used later for detecting whether the doc's content has changed.
                    #this change-detection happens in two cases:
                    #1. the monthly update
                    #2. the current method, when it appears that this URL has been processed earlier
                    sha224_ = (hashlib.sha224(pageread.encode('utf-8')).hexdigest())
                    #important data for the parsers: encoding
                    _encoding = page.encoding
                    #_encoding = comm.getDocumentEncoding(contentType, pageread)
                    #print("-----------------------------------------------------")
                    #exclude doc types where it is not possible to find textual content: e.g. images, videos
                    isDesiredType = comm.isDesiredContent(contentType, od)
                    #continue only with desired types
                    if isDesiredType:
                        #jsonsDir is actually a so-called 'bucket' name in Google Cloud Storage
                        jsonsDir = comm.jsonsDir
                        print(jsonsDir)
                        #jsonsFile becomes a so-called 'object' inside a bucket;
                        #the object's name is the URL's host name and the extension is '.json'
                        jsonsFile = baseUrl + ".json"
                        #build a dictionary with the address of the object holding this metadata
                        jsonsPath = dict()
                        jsonsPath["object"] = jsonsFile  #'hostname.json'
                        jsonsPath["bucket"] = jsonsDir  #e.g. 'datadownload_json'
                        pathToSaveMetadata_ = json.dumps(jsonsPath)
                        #save the metadata into a dictionary structure
                        infoDict_tmp = dict()
                        infoDict_tmp["base_url"] = baseUrl
                        infoDict = insertValuesToDict(infoDict_tmp, localFilename,
                                                      redirectedTo, pageInfo,
                                                      sha224_, statusCode)
                        #convert the dictionary into a json-string
                        jsondata = json.dumps(infoDict, indent=4)
                        #dict for sending the collected data to 'updateObj.py'
                        insertJson = dict()
                        insertJson["jsondata"] = jsondata
                        insertJson["localFilename"] = localFilename
                        insertJson["redirectedTo"] = redirectedTo
                        insertJson["pageInfo"] = pageInfo
                        insertJson["sha224_"] = sha224_
                        insertJson["statusCode"] = statusCode
                        insertJson["timeDir"] = comm.timeDir
                        insertJson["address"] = pathToSaveMetadata_
                        #the variable 'someNewData' stores whether the doc at this url
                        #has been processed earlier:
                        #1. if yes and the doc's content has changed, 'someNewData' becomes True; otherwise it stays False
                        #2. if no, 'someNewData' becomes True
                        someNewData = False
                        #string for saving a unique error message
                        errr = ""
                        try:
                            #convert the dictionary into a json-string to pass as an argument to 'updateObj.py'
                            jd = json.dumps(insertJson)
                            #get info back about whether there is some new data;
                            #'p' returns the boolean value of 'someNewData'
                            #communication with google-python-api-client is done using the older version, python2.7
                            p = subprocess.Popen(["python2", "storage/updateObj.py", jd],
                                                 stdout=subprocess.PIPE)
                            out, err = p.communicate()
                            #note: out.decode() returns a string, so any non-empty output is treated as new data below
                            someNewData = out.decode()
                            errr = str(err).lower()
                            print("\nsomeNewData " + str(someNewData))
                            print("\nerrr " + str(errr))
                        except:
                            errstr = errr if ((errr != "") and (errr != "none")) else "storage-updateObj.py-ERROR"
                            comm.printException(comm.pathToSaveJsonErrors, errstr)
                            pass
                        #continue with parsing the doc only when new data was detected
                        if someNewData:
                            sendFileToParser(contentType, baseUrl, redirectedTo, od,
                                             _encoding, localFilename, pageread)
            #record errors
            except urr.HTTPError as e:
                errStr = (redirectedTo + " HTTPError " + str(e.code) + " " + str(e.reason) + " \n")
                comm.printException(comm.pathToSaveJsonErrors, errStr)
                pass
            except urr.URLError as e:
                errStr = (redirectedTo + " URLError " + str(e.reason) + " \n")
                comm.printException(comm.pathToSaveJsonErrors, errStr)
                pass
            except IOError as e:
                errStr = (redirectedTo + " " + str("I/O_error({0}):_{1}".format(e.errno, e.strerror)) + "\n")
                comm.printException(comm.pathToSaveJsonErrors, errStr)
                pass
            except ValueError:
                errStr = (redirectedTo + " ValueError_Could_not_convert_data_to_an_integer.\n")
                comm.printException(comm.pathToSaveJsonErrors, errStr)
                pass
            except TypeError:
                errStr = (redirectedTo + " TypeError\n")
                comm.printException(comm.pathToSaveJsonErrors, errStr)
                pass
            except:
                errStr = (redirectedTo + " Unexpected_error:_" + (str(sys.exc_info()[0])) + "\n")
                comm.printException(comm.pathToSaveJsonErrors, errStr)
                pass
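# Note (sketch): because out.decode() is a string, the 'if someNewData:' check above is
# true for any non-empty output, including the text "False". A stricter handshake with
# the python2 helper could look like the function below; it assumes updateObj.py prints
# 'True' or 'False', which this section does not show, so treat it as an assumption.
def detect_new_data(json_argument):
    """Call the python2 storage helper and interpret its stdout as a boolean."""
    import subprocess
    p = subprocess.Popen(["python2", "storage/updateObj.py", json_argument],
                         stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = p.communicate()
    # explicit comparison instead of relying on the truthiness of a non-empty string
    return out.decode().strip().lower() == "true", err.decode()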
def spreadURLsByContentType(url, httpResponse, tyyp, od, _encoding, filePath=None):
    doctext = httpResponse
    '''#parse excel file'''
    if "excel" in tyyp:
        try:
            '''#parse web page excel'''
            read_eksel.readExcel(filePath, url, od)
        except:
            comm.printException(comm.pathToSaveParsingErrors, "fileparser_excel")
            pass
    elif "pdf" in tyyp:
        try:
            '''#parse pdf'''
            read_pdf.readPdf(filePath, url, od)
        except:
            comm.printException(comm.pathToSaveParsingErrors, "fileparser_pdf")
            pass
    elif "xml" in tyyp:
        try:
            '''#parse web page xml'''
            doctext = detectEncoding(_encoding, httpResponse)
            read_xml.readXml(url, doctext, od)
        except:
            comm.printException(comm.pathToSaveParsingErrors, "fileparser_xml")
            pass
    elif "html" in tyyp:
        try:
            '''#parse web page html/txt'''
            doctext = detectEncoding(_encoding, httpResponse)
            read_html.readHtmlPage(url, doctext, od, _encoding)
        except:
            comm.printException(comm.pathToSaveParsingErrors, "fileparser_html")
            pass
    elif "json" in tyyp:
        try:
            '''#parse json app/json'''
            doctext = detectEncoding(_encoding, httpResponse)
            read_json.readJson(url, doctext, od, _encoding)
        except:
            comm.printException(comm.pathToSaveParsingErrors, "fileparser_json")
            pass
    elif ("plain" in tyyp) or ("text" in tyyp):
        try:
            doctext = detectEncoding(_encoding, httpResponse)
            '''#assume the incoming content is plain text and try to parse the text lines'''
            read_plaintext.readPlainText(url, doctext, od, _encoding)
        except:
            comm.printException(comm.pathToSaveParsingErrors, "fileparser_plainText")
            pass
    else:
        comm.printException(comm.pathToSaveParsingErrors,
                            "The_parser_for_the_type_" + tyyp + "_is_not_implemented\n")
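# Usage sketch of the dispatcher (the URL, content type and encoding are illustrative;
# `od` stands for the shared ontology-data object used throughout):
#
#   spreadURLsByContentType("http://example.com/page",   # url
#                           pageread,                    # httpResponse: already-downloaded content
#                           "text/html; charset=utf-8",  # tyyp: Content-Type header
#                           od,                          # shared ontology data
#                           "utf-8")                     # _encoding reported by requests
#
# Excel and PDF are dispatched by filePath instead, with httpResponse=None.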
def getEntities(url, text, ontologyData,
                orgWords=['kogu', 'selts', 'ansambel', 'keskus', 'ühendus', 'ühing',
                          'mtü', 'oü', 'as', 'klubi', 'asutus', 'keskus', 'fond', 'cup'],
                locWords=['vabarii', 'maakond']):
    #print("GETENTITIES ", url)
    #printIncomingText(text)
    if "^" not in text:  #invalid for an RDFLib URI
        ntwl = list()
        ner_tagged = None
        try:
            ner_tagged = tagger(analyzer(tokenizer(text)))
        except:
            comm.printException(comm.initRdfErrorsFilePath,
                                "\n" + url + "\n_getEntities.py-def_getEntities_:_ner_tagged " + str(text))
            pass
        if ner_tagged is not None:
            try:
                ntwl = ner_tagged.named_entities
            except:
                comm.printException(comm.initRdfErrorsFilePath,
                                    "\n" + url + "\n_getEntities.py-def_getEntities_:_ntwl" + str(len(ntwl)) + " " + str(text))
                pass
            try:
                if len(ntwl) > 0:
                    andmed = dict()
                    #get the label for each entity
                    for i in ntwl:
                        label = i.label
                        freqLemma = comm.replaceWith(' ', i.lemma)  #replace some chars with a space
                        frlower = freqLemma.lower()
                        #correct some NER labels
                        for ow in orgWords:
                            if ow.lower() in frlower:
                                label = "ORG"
                        for lw in locWords:
                            if lw.lower() in frlower:
                                label = "LOC"
                        #process the values by label
                        if label == "PER":
                            entitySet = set()
                            if freqLemma != "":
                                name = freqLemma.title()
                                names = name.split(' ')
                                gName = ""
                                fName = ""
                                try:
                                    if len(names) > 1:
                                        if len(names) > 2:  #more than one given name, assuming two of them
                                            gName = names[0] + " " + names[1]
                                            fName = names[2]  #family name
                                        elif len(names) == 2:
                                            gName = names[0]  #one given name
                                            fName = names[1]  #family name
                                except:
                                    comm.printException(comm.initRdfErrorsFilePath,
                                                        "\n" + url + "\n_getEntities.py-def_getEntities_gname-fname")
                                    pass
                                entitySet.add(freqLemma)
                                #added here to avoid duplicate values; removed again below
                                entitySet.add(name)
                                entitySet.add(gName)
                                entitySet.add(fName)
                                wConcat = comm.replaceWith('', (' '.join(w.text for w in i.words)))
                                entitySet.add(wConcat)
                                lemmalist = list()
                                for w in i.words:
                                    lemmalist.append(w.lemmas)
                                produkt = itertools.product(*lemmalist)
                                for j in produkt:
                                    entitySet.add(" ".join(
                                        str(u) for u in list(j)
                                        if (u.lower() != name.lower()) and (u != "") and (u.title() in names)))
                                #now remove the duplicate values
                                if name in entitySet:
                                    entitySet.remove(name)
                                if gName in entitySet:
                                    entitySet.remove(gName)
                                if fName in entitySet:
                                    entitySet.remove(fName)
                                if "" in entitySet:
                                    entitySet.remove("")
                                andmed = {url: {"gName": gName, "fName": fName,
                                                "name": name, "lemmaSet": entitySet}}
                                if not ontologyData.sharedList_per._callmethod('__contains__', (andmed,)):
                                    ontologyData.sharedList_per._callmethod('append', (andmed,))
                                if ontologyData.sharedList_per._callmethod('__len__') > comm.chunksize:
                                    try:
                                        chunkedList = ontologyData.sharedList_per[:]  #makes a copy, not a reference
                                        del ontologyData.sharedList_per[:]
                                        perManager = init_rdf.PeopleManager(ontologyData)
                                        perManager.addTriples(chunkedList)
                                    except:
                                        comm.printException(comm.initRdfErrorsFilePath,
                                                            "\n" + url + "\n_get_PER_entities")
                                        pass
                        else:
                            objName = freqLemma.title()
                            entitySet = set()
                            entitySet.add(freqLemma)
                            wConcat = comm.replaceWith('', (' '.join(w.text for w in i.words)))
                            entitySet.add(wConcat)
                            lemmalist = list()
                            for w in i.words:
                                lemmalist.append(w.lemmas)
                            produkt = itertools.product(*lemmalist)
                            for j in produkt:
                                entitySet.add(" ".join(
                                    str(u) for u in list(j)
                                    if (u.lower() != objName.lower()) and (u != "")))
                            if "" in entitySet:
                                entitySet.remove("")
                            andmed = {url: {objName: entitySet}}
                            if label == "ORG":
                                if not ontologyData.sharedList_org._callmethod('__contains__', (andmed,)):
                                    ontologyData.sharedList_org._callmethod('append', (andmed,))
                            elif label == "LOC":
                                if not ontologyData.sharedList_loc._callmethod('__contains__', (andmed,)):
                                    ontologyData.sharedList_loc._callmethod('append', (andmed,))
                            if ontologyData.sharedList_org._callmethod('__len__') > comm.chunksize:
                                try:
                                    chunkedList = ontologyData.sharedList_org[:]  #makes a copy, not a reference
                                    del ontologyData.sharedList_org[:]
                                    #tests
                                    #jf = open("tEst.txt", 'a', encoding='utf-8')
                                    #jf.write(str(len(chunkedList)) + "\n")
                                    #jf.close()
                                    orgManager = init_rdf.OrganizationManager(ontologyData)
                                    orgManager.addTriples(chunkedList)
                                except:
                                    comm.printException(comm.initRdfErrorsFilePath,
                                                        "\n" + url + "\n_get_ORG_entities")
                                    pass
                            if ontologyData.sharedList_loc._callmethod('__len__') > comm.chunksize:
                                try:
                                    chunkedList = ontologyData.sharedList_loc[:]  #makes a copy, not a reference
                                    del ontologyData.sharedList_loc[:]
                                    locManager = init_rdf.LocationManager(ontologyData)
                                    locManager.addTriples(chunkedList)
                                except:
                                    comm.printException(comm.initRdfErrorsFilePath,
                                                        "\n" + url + "\n_get_LOC_entities")
                                    pass
            except:
                comm.printException(comm.initRdfErrorsFilePath, "\n" + url + "\n_getEntities.py")
                pass
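# Sketch: the shared lists above are multiprocessing manager proxies, which is why
# items go through _callmethod. A small self-contained illustration of the same
# chunk-and-flush pattern; the helper name and the flush callable are stand-ins for
# the *Manager.addTriples calls, not project code.
from multiprocessing import Manager

CHUNKSIZE = 25                           # plays the role of comm.chunksize

def append_and_maybe_flush(shared_list, item, flush):
    """Append to a manager-backed list and flush it once it grows past CHUNKSIZE."""
    if item not in shared_list:          # same duplicate check as _callmethod('__contains__', ...)
        shared_list.append(item)
    if len(shared_list) > CHUNKSIZE:
        chunk = shared_list[:]           # copy the proxy's contents
        del shared_list[:]               # empty the shared list in place
        flush(chunk)                     # e.g. PeopleManager(ontologyData).addTriples(chunk)

if __name__ == "__main__":
    manager = Manager()
    shared = manager.list()
    append_and_maybe_flush(shared, {"example": 1}, flush=print)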
'''
data0 = None
data = None
#
try:
    data = json.loads((json.loads(sys.argv[1]))["data"])
    data0 = json.loads(sys.argv[1])
    #data = json.loads((data0)["data"])
    comm.chunksize = int(json.loads((data0)["chunksize"]))
    #jf = open("/var/www/html/ch.txt", 'a')
    #jf.write(str(datetime.datetime.now()) + "\nCHUNKSIZE: " + str(comm.chunksize) + "\n\n")
    #jf.close()
except:
    comm.printException(comm.pathToSaveProgrammingErrors, "load_DATA_in_connector")
    pass

if data is not None:
    jobs = list(data.values())
    nrOfJobs = len(jobs)
    pool = Pool(processes=os.cpu_count())
    pool.map(sendUrl, jobs)
    pool.close()
    pool.join()
    #FINALLY, add triples from the lists that are left over.
    #In the file 'getEntities', when chunking the shared lists, RDF creation starts
    #only once a list's size exceeds chunksize (e.g. 25 items), but items still in a
    #list that never reached chunksize have not been turned into triples yet.
    #print("PER")
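# Sketch of the final flush referred to in the comment above, following the same
# manager pattern used in getEntities. The names ontologyData, init_rdf and the
# shared lists are assumed to be available in the connector's scope; the exact
# calls in the project may differ.
def flushLeftoverTriples(ontologyData):
    """Turn whatever is still in the shared lists (below chunksize) into RDF."""
    for shared_list, ManagerCls in (
            (ontologyData.sharedList_per, init_rdf.PeopleManager),
            (ontologyData.sharedList_org, init_rdf.OrganizationManager),
            (ontologyData.sharedList_loc, init_rdf.LocationManager)):
        leftovers = shared_list[:]       # copy out whatever is left
        del shared_list[:]
        if leftovers:
            ManagerCls(ontologyData).addTriples(leftovers)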