def readExcel(filePath, url, ontologyData):
    """Download an Excel workbook from *url* to *filePath*, then run entity
    extraction on every text cell of every worksheet.

    Args:
        filePath: local path the downloaded workbook is saved to.
        url: source URL of the workbook; also used as the document id passed
            to the entity extractor and to the error logger.
        ontologyData: ontology context forwarded to getEntities.

    Errors are logged best-effort via comm.printException and never raised.
    """
    try:
        # Fetch the remote file to disk before opening it with xlrd.
        urr(url, filePath)
        workbook = xlrd.open_workbook(filePath)
        for worksheet_name in workbook.sheet_names():
            worksheet = workbook.sheet_by_name(worksheet_name)
            for curr_row in range(worksheet.nrows):
                for curr_cell in range(worksheet.ncols):
                    # Cell types: 0=Empty, 1=Text, 2=Number, 3=Date,
                    # 4=Boolean, 5=Error, 6=Blank — only text cells matter.
                    if worksheet.cell_type(curr_row, curr_cell) == 1:
                        cell_value = worksheet.cell_value(curr_row, curr_cell)
                        for sentence in comm.replaceToPunkts(cell_value):
                            getEntities.getEntities(url, sentence, ontologyData)
    except Exception:
        # Original had two nested bare `except:` blocks logging the identical
        # message; one targeted handler preserves that best-effort behavior.
        comm.printException(comm.pathToSaveParsingErrors, "read_excel.py " + url)
def readExcel(filePath, url, ontologyData):
    """Download an Excel workbook from *url* to *filePath*, then run entity
    extraction on every text cell of every worksheet.

    NOTE(review): this is a byte-level duplicate of the readExcel defined
    immediately above; the later definition wins at import time — consider
    deleting one of them.

    Args:
        filePath: local path the downloaded workbook is saved to.
        url: source URL of the workbook; also used as the document id passed
            to the entity extractor and to the error logger.
        ontologyData: ontology context forwarded to getEntities.
    """
    try:
        # Fetch the remote file to disk before opening it with xlrd.
        urr(url, filePath)
        workbook = xlrd.open_workbook(filePath)
        for worksheet_name in workbook.sheet_names():
            worksheet = workbook.sheet_by_name(worksheet_name)
            for curr_row in range(worksheet.nrows):
                for curr_cell in range(worksheet.ncols):
                    # Cell types: 0=Empty, 1=Text, 2=Number, 3=Date,
                    # 4=Boolean, 5=Error, 6=Blank — only text cells matter.
                    if worksheet.cell_type(curr_row, curr_cell) == 1:
                        cell_value = worksheet.cell_value(curr_row, curr_cell)
                        for sentence in comm.replaceToPunkts(cell_value):
                            getEntities.getEntities(url, sentence, ontologyData)
    except Exception:
        # Best-effort parsing: log and move on to the next document.
        comm.printException(comm.pathToSaveParsingErrors, "read_excel.py " + url)
def readPlainText(htmlurl, plaintext, ontologyData):
    """Split *plaintext* into sentences and run entity extraction on each.

    Args:
        htmlurl: source URL used as the document id and in error logs.
        plaintext: raw text content already read from the document.
        ontologyData: ontology context forwarded to getEntities.
    """
    try:
        sentences = comm.replaceToPunkts(plaintext.strip())
        for sentence in sentences:
            getEntities.getEntities(htmlurl, sentence, ontologyData)
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are no longer swallowed; failures are still logged best-effort.
        comm.printException(comm.pathToSaveParsingErrors, "read_plaintext.py " + htmlurl)
def readPdf(url, readdedpdf, od):
    """Extract the text of an in-memory PDF page by page and run entity
    extraction on each sentence.

    Args:
        url: source URL, used as the document id and in error logs.
        readdedpdf: raw PDF bytes already downloaded from *url*.
        od: ontology context forwarded to getEntities.
    """
    # BUG FIX: the original called PdfFileReader(b, "rb") — the second
    # positional argument of PdfFileReader is `strict`, not a file mode,
    # so "rb" was a truthy junk value immediately overwritten. Pass
    # strict=False directly to tolerate malformed PDFs.
    pdfFile = PdfFileReader(BytesIO(readdedpdf), strict=False)
    try:
        for i in range(pdfFile.numPages):
            text = pdfFile.getPage(i).extractText()
            for sentence in comm.replaceToPunkts(text):
                getEntities.getEntities(url, sentence, od)
    except Exception:
        # Best-effort: log the failure and skip the rest of this document.
        comm.printException(comm.pathToSaveParsingErrors, "read_pdf.py " + url)
def readPdf(url, readdedpdf, od):
    """Extract the text of an in-memory PDF page by page and run entity
    extraction on each sentence.

    NOTE(review): this is a duplicate of the readPdf defined above; the
    later definition wins at import time — consider deleting one of them.

    Args:
        url: source URL, used as the document id and in error logs.
        readdedpdf: raw PDF bytes already downloaded from *url*.
        od: ontology context forwarded to getEntities.
    """
    # BUG FIX: PdfFileReader's second positional argument is `strict`, not a
    # file mode; the original's "rb" was a truthy junk value immediately
    # overwritten by `pdfFile.strict = False`. Pass strict=False directly.
    pdfFile = PdfFileReader(BytesIO(readdedpdf), strict=False)
    try:
        for i in range(pdfFile.numPages):
            text = pdfFile.getPage(i).extractText()
            for sentence in comm.replaceToPunkts(text):
                getEntities.getEntities(url, sentence, od)
    except Exception:
        # Best-effort: log the failure and skip the rest of this document.
        comm.printException(comm.pathToSaveParsingErrors, "read_pdf.py " + url)
def readHtmlPage(htmlurl, readedPage, ontologyData):
    """Parse an HTML page, strip boilerplate tags, and run entity extraction
    over the body text.

    NOTE(review): *readedPage* is accepted but never used — the page is
    re-fetched via parse(htmlurl). Presumably the already-downloaded content
    was meant to be parsed instead; confirm against callers before changing.

    Args:
        htmlurl: URL of the page; parsed directly and used as document id.
        readedPage: previously downloaded page content (currently unused).
        ontologyData: ontology context forwarded to getEntities.
    """
    # Tags whose content is markup/boilerplate rather than prose.
    drop_tags = ("head", "script", "style", "noscript",
                 "input", "form", "title", "img")
    try:
        sentences = set()
        root = parse(htmlurl).getroot()
        # A None root means the HTML was too malformed to yield a tree.
        if root is not None:
            # The original repeated this loop once per tag; a single loop
            # over the tag tuple preserves the exact drop order.
            for tag in drop_tags:
                for element in root.iter(tag):
                    element.drop_tree()
            for element in root.iter("body"):
                try:
                    sentences.add(element.text_content())
                except Exception:
                    pass  # skip bodies whose text cannot be extracted
            for text_chunk in sentences:
                if text_chunk != "":
                    for sentence in comm.replaceToPunkts(text_chunk):
                        getEntities.getEntities(htmlurl, sentence.strip(), ontologyData)
    except Exception:
        # Best-effort parsing: log and continue with the next page.
        comm.printException(comm.pathToSaveParsingErrors, "read_html.py " + htmlurl)
def readXml(xmlurl, pathToFile, ontologyData):
    """Parse an XML document given as a string and run entity extraction on
    the text content of every element.

    Args:
        xmlurl: source URL, used as the document id and in error logs.
        pathToFile: the XML content itself (already read), despite the name.
        ontologyData: ontology context forwarded to getEntities.
    """
    try:
        root = ET.fromstring(pathToFile)
        if root is not None:
            for node in root.iter():
                if node.text is not None:
                    stripped = node.text.strip()
                    # BUG FIX: the original combined conditions with bitwise
                    # '&' (non-short-circuiting) and tested `stripped is not
                    # None`, which is always true for a str — a plain length
                    # test is the intended check.
                    if len(stripped) > 2:
                        for sentence in comm.replaceToPunkts(stripped):
                            getEntities.getEntities(xmlurl, sentence, ontologyData)
    except Exception:
        # Best-effort parsing: log and continue with the next document.
        comm.printException(comm.pathToSaveParsingErrors, "read_xml.py " + xmlurl)
def readXml(xmlurl, pathToFile, ontologyData):
    """Parse an XML document given as a string and run entity extraction on
    the text content of every element.

    NOTE(review): this is a duplicate of the readXml defined above; the
    later definition wins at import time — consider deleting one of them.

    Args:
        xmlurl: source URL, used as the document id and in error logs.
        pathToFile: the XML content itself (already read), despite the name.
        ontologyData: ontology context forwarded to getEntities.
    """
    try:
        root = ET.fromstring(pathToFile)
        if root is not None:
            for node in root.iter():
                if node.text is not None:
                    stripped = node.text.strip()
                    # BUG FIX: replaced the bitwise '&' between boolean
                    # conditions and the always-true `is not None` check on a
                    # str with a plain length test.
                    if len(stripped) > 2:
                        for sentence in comm.replaceToPunkts(stripped):
                            getEntities.getEntities(xmlurl, sentence, ontologyData)
    except Exception:
        # Best-effort parsing: log and continue with the next document.
        comm.printException(comm.pathToSaveParsingErrors, "read_xml.py " + xmlurl)
def startToGetEntities(jsonurl, lause, ontologyData):
    """Split *lause* into sentences and feed each one to the entity extractor,
    tagging every call with *jsonurl* as the document id."""
    for piece in comm.replaceToPunkts(lause):
        getEntities.getEntities(jsonurl, piece, ontologyData)