Пример #1
0
def readExcel(filePath, url, ontologyData):
    """Pass the sentences of every text cell of a spreadsheet to entity extraction.

    ``urr(url, filePath)`` is called first (presumably it downloads ``url``
    to ``filePath`` -- confirm against the import), then the workbook at
    ``filePath`` is opened with ``xlrd`` and every cell of every sheet is
    visited. Only text cells are processed: their value is split by
    ``comm.replaceToPunkts`` and each resulting sentence is handed to
    ``getEntities.getEntities`` together with ``url`` and ``ontologyData``.

    Any failure (fetching, opening or processing) is reported through
    ``comm.printException`` and is not propagated to the caller.

    Args:
        filePath: Local path the workbook is read from.
        url: Source URL; forwarded to ``urr``, to entity extraction and
            included in the error report.
        ontologyData: Passed through unchanged to ``getEntities.getEntities``.
    """
    try:
        urr(url, filePath)
        workbook = xlrd.open_workbook(filePath)
        for worksheet_name in workbook.sheet_names():
            worksheet = workbook.sheet_by_name(worksheet_name)
            for row_idx in range(worksheet.nrows):
                for col_idx in range(worksheet.ncols):
                    # Empty, numeric, date, boolean, error and blank cells
                    # carry no sentences, so only text cells are considered.
                    if worksheet.cell_type(row_idx, col_idx) != xlrd.XL_CELL_TEXT:
                        continue
                    cell_value = worksheet.cell_value(row_idx, col_idx)
                    for sentence in comm.replaceToPunkts(cell_value):
                        getEntities.getEntities(url, sentence, ontologyData)
    except Exception:
        # Best-effort: one broken document must not stop the caller, but
        # KeyboardInterrupt/SystemExit are deliberately left to propagate.
        comm.printException(comm.pathToSaveParsingErrors, "read_excel.py " + url)
Пример #2
0
def readExcel(filePath, url, ontologyData):
    """Run entity extraction over the text cells of an Excel workbook.

    The function first calls ``urr(url, filePath)`` (presumably fetching the
    document at ``url`` into ``filePath`` -- confirm against the import) and
    then opens ``filePath`` with ``xlrd``. Each sheet is walked row by row and
    column by column; for every text cell the value is split with
    ``comm.replaceToPunkts`` and each sentence is passed to
    ``getEntities.getEntities(url, sentence, ontologyData)``.

    Errors raised at any stage are reported via ``comm.printException`` and
    swallowed, so the function always returns ``None`` normally.

    Args:
        filePath: Local path of the workbook to open.
        url: Source URL of the workbook; also used in the error report.
        ontologyData: Forwarded untouched to ``getEntities.getEntities``.
    """
    try:
        urr(url, filePath)
        workbook = xlrd.open_workbook(filePath)
        for sheet_name in workbook.sheet_names():
            sheet = workbook.sheet_by_name(sheet_name)
            row_count = sheet.nrows
            col_count = sheet.ncols
            for row in range(row_count):
                for col in range(col_count):
                    # Only text cells can contain sentences; every other
                    # xlrd cell type is skipped.
                    if sheet.cell_type(row, col) == xlrd.XL_CELL_TEXT:
                        sentences = comm.replaceToPunkts(sheet.cell_value(row, col))
                        for sentence in sentences:
                            getEntities.getEntities(url, sentence, ontologyData)
    except Exception:
        # A single handler replaces the two identical nested ones. Catching
        # Exception (not a bare except) lets Ctrl-C and SystemExit through.
        comm.printException(comm.pathToSaveParsingErrors, "read_excel.py " + url)
Пример #3
0
def readPlainText(htmlurl, plaintext, ontologyData):
    """Pass the sentences of a plain-text document to entity extraction.

    ``plaintext`` is stripped of surrounding whitespace and split with
    ``comm.replaceToPunkts``; each resulting sentence is handed to
    ``getEntities.getEntities`` together with ``htmlurl`` and
    ``ontologyData``. Failures are reported through ``comm.printException``
    and are not propagated.

    Args:
        htmlurl: URL the text came from; forwarded to entity extraction and
            included in the error report.
        plaintext: The document text (must support ``.strip()``).
        ontologyData: Passed through unchanged to ``getEntities.getEntities``.
    """
    try:
        for sentence in comm.replaceToPunkts(plaintext.strip()):
            getEntities.getEntities(htmlurl, sentence, ontologyData)
    except Exception:
        # Best-effort processing; KeyboardInterrupt/SystemExit still propagate.
        comm.printException(comm.pathToSaveParsingErrors, "read_plaintext.py " + htmlurl)
Пример #4
0
def readPlainText(htmlurl, plaintext, ontologyData):
    """Run entity extraction over each sentence of a plain-text document.

    The text is stripped, split by ``comm.replaceToPunkts`` and every
    sentence is passed to ``getEntities.getEntities(htmlurl, sentence,
    ontologyData)``. Any error is reported via ``comm.printException`` and
    swallowed.

    Args:
        htmlurl: Source URL of the text; also used in the error report.
        plaintext: Text to process (must support ``.strip()``).
        ontologyData: Forwarded untouched to ``getEntities.getEntities``.
    """
    try:
        stripped_text = plaintext.strip()
        sentences = comm.replaceToPunkts(stripped_text)
        for sentence in sentences:
            getEntities.getEntities(htmlurl, sentence, ontologyData)
    except Exception:
        # Catch Exception rather than everything so that Ctrl-C and
        # SystemExit are not silently absorbed.
        comm.printException(comm.pathToSaveParsingErrors,
                            "read_plaintext.py " + htmlurl)
Пример #5
0
def readPdf(url, readdedpdf, od):
    """Pass the sentences of every page of an in-memory PDF to entity extraction.

    The raw bytes are wrapped in a ``BytesIO`` and parsed with
    ``PdfFileReader`` in non-strict mode. The text of each page is extracted,
    split with ``comm.replaceToPunkts`` and each sentence is handed to
    ``getEntities.getEntities`` together with ``url`` and ``od``.

    Any failure -- including a PDF that cannot be parsed at all -- is
    reported through ``comm.printException`` and is not propagated.

    Args:
        url: URL the PDF came from; forwarded to entity extraction and
            included in the error report.
        readdedpdf: The PDF content as bytes.
        od: Ontology data, passed through unchanged to
            ``getEntities.getEntities``.
    """
    try:
        # The reader is built inside the try block so that a malformed PDF is
        # logged like every other parsing failure instead of escaping. The
        # second positional parameter of PdfFileReader is ``strict`` (not a
        # file mode), so non-strict parsing is requested explicitly here.
        pdfFile = PdfFileReader(BytesIO(readdedpdf), strict=False)
        for page_number in range(pdfFile.numPages):
            text = pdfFile.getPage(page_number).extractText()
            for sentence in comm.replaceToPunkts(text):
                getEntities.getEntities(url, sentence, od)
    except Exception:
        # Best-effort processing; KeyboardInterrupt/SystemExit still propagate.
        comm.printException(comm.pathToSaveParsingErrors, "read_pdf.py " + url)
Пример #6
0
def readPdf(url, readdedpdf, od):
    """Run entity extraction over the text of each page of a PDF.

    ``readdedpdf`` (the already downloaded PDF bytes) is read through a
    ``BytesIO`` buffer by ``PdfFileReader`` with strict parsing disabled.
    For every page the extracted text is split by ``comm.replaceToPunkts``
    and each sentence is passed to ``getEntities.getEntities(url, sentence,
    od)``.

    Errors, including failures while opening the PDF, are reported via
    ``comm.printException`` and swallowed.

    Args:
        url: Source URL of the PDF; also used in the error report.
        readdedpdf: PDF content as bytes.
        od: Ontology data forwarded untouched to ``getEntities.getEntities``.
    """
    try:
        buffer = BytesIO(readdedpdf)
        # ``strict`` is PdfFileReader's second parameter; passing it by
        # keyword makes the whole parse lenient. Constructing the reader
        # inside the try block makes unreadable PDFs get logged instead of
        # raising to the caller.
        pdfFile = PdfFileReader(buffer, strict=False)
        page_count = pdfFile.numPages
        for i in range(page_count):
            pageObject = pdfFile.getPage(i)
            sentences = comm.replaceToPunkts(pageObject.extractText())
            for sentence in sentences:
                getEntities.getEntities(url, sentence, od)
    except Exception:
        # Catch Exception rather than everything so that Ctrl-C and
        # SystemExit are not silently absorbed.
        comm.printException(comm.pathToSaveParsingErrors, "read_pdf.py " + url)
Пример #7
0
def readHtmlPage(htmlurl, readedPage, ontologyData):
    """Pass the visible text of an HTML page to entity extraction.

    The document is obtained with ``parse(htmlurl)``. Elements that carry no
    readable content (head, script, style, noscript, input, form, title,
    img) are dropped from the tree, the text content of each ``body``
    element is collected, split with ``comm.replaceToPunkts`` and every
    stripped sentence is handed to ``getEntities.getEntities`` together with
    ``htmlurl`` and ``ontologyData``.

    Failures are reported through ``comm.printException`` and are not
    propagated.

    Args:
        htmlurl: URL of the page; it is what gets parsed, and it is also
            forwarded to entity extraction and included in the error report.
        readedPage: Not used by this function; kept for interface
            compatibility. NOTE(review): the name suggests already-fetched
            page content, yet the page is parsed from ``htmlurl`` -- confirm
            this is intended.
        ontologyData: Passed through unchanged to ``getEntities.getEntities``.
    """
    removable_tags = ("head", "script", "style", "noscript",
                      "input", "form", "title", "img")
    try:
        root = parse(htmlurl).getroot()
        # If the root is None, the HTML is incorrectly formed; nothing to do.
        if root is None:
            return

        for tag in removable_tags:
            # Collect the matches before dropping anything: removing
            # elements from a tree while iterating over that same tree can
            # cause later matches to be skipped.
            for element in list(root.iter(tag)):
                element.drop_tree()

        # A set is used so identical body texts are processed only once.
        body_texts = set()
        for body in root.iter("body"):
            try:
                body_texts.add(body.text_content())
            except Exception:
                # Keep going with the remaining bodies, but leave a trace
                # instead of discarding the error silently.
                comm.printException(comm.pathToSaveParsingErrors, "read_html.py " + htmlurl)

        for text in body_texts:
            if not text:
                continue
            for sentence in comm.replaceToPunkts(text):
                getEntities.getEntities(htmlurl, sentence.strip(), ontologyData)
    except Exception:
        # Best-effort processing; KeyboardInterrupt/SystemExit still propagate.
        comm.printException(comm.pathToSaveParsingErrors, "read_html.py " + htmlurl)
Пример #8
0
def readXml(xmlurl, pathToFile, ontologyData):
    """Pass the text of every element of an XML document to entity extraction.

    Despite its name, ``pathToFile`` is handed to ``ET.fromstring`` and must
    therefore hold the XML document itself (already read), not a path.
    To parse from a path instead one would use ``ET.parse(path).getroot()``;
    see https://docs.python.org/3.4/library/xml.etree.elementtree.html#parsing-xml

    For each element whose stripped text is longer than two characters, the
    text is split with ``comm.replaceToPunkts`` and every sentence is handed
    to ``getEntities.getEntities`` together with ``xmlurl`` and
    ``ontologyData``. Failures are reported through ``comm.printException``
    and are not propagated.

    Args:
        xmlurl: URL the XML came from; forwarded to entity extraction and
            included in the error report.
        pathToFile: The XML document content.
        ontologyData: Passed through unchanged to ``getEntities.getEntities``.
    """
    try:
        # NOTE(review): if ET is xml.etree.ElementTree and the XML comes from
        # untrusted URLs, entity-expansion attacks are possible -- consider a
        # hardened parser.
        root = ET.fromstring(pathToFile)
        if root is None:
            return
        for data in root.iter():
            if data.text is None:
                continue
            stripped = data.text.strip()
            # Very short fragments (two characters or fewer) are ignored.
            if len(stripped) > 2:
                for sentence in comm.replaceToPunkts(stripped):
                    getEntities.getEntities(xmlurl, sentence, ontologyData)
    except Exception:
        # Best-effort processing; KeyboardInterrupt/SystemExit still propagate.
        comm.printException(comm.pathToSaveParsingErrors, "read_xml.py " + xmlurl)
Пример #9
0
def readXml(xmlurl, pathToFile, ontologyData):
    """Run entity extraction over the element texts of an XML document.

    ``pathToFile`` is parsed with ``ET.fromstring``, so it has to contain the
    XML text itself rather than a file path (a path would be parsed with
    ``ET.parse(path).getroot()``; see
    https://docs.python.org/3.4/library/xml.etree.elementtree.html#parsing-xml).

    Every element of the tree is visited. When an element has text and the
    stripped text is longer than two characters, it is split by
    ``comm.replaceToPunkts`` and each sentence is passed to
    ``getEntities.getEntities(xmlurl, sentence, ontologyData)``. Errors are
    reported via ``comm.printException`` and swallowed.

    Args:
        xmlurl: Source URL of the XML; also used in the error report.
        pathToFile: XML document content.
        ontologyData: Forwarded untouched to ``getEntities.getEntities``.
    """
    try:
        # NOTE(review): parsing XML fetched from arbitrary URLs with the
        # standard parser may be exposed to entity-expansion attacks --
        # confirm whether a hardened parser is needed.
        root = ET.fromstring(pathToFile)
        if root is not None:
            for data in root.iter():
                raw_text = data.text
                if raw_text is None:
                    continue
                stripped = raw_text.strip()
                # str.strip() never returns None, so only the length matters.
                if len(stripped) <= 2:
                    continue
                sentences = comm.replaceToPunkts(stripped)
                for sentence in sentences:
                    getEntities.getEntities(xmlurl, sentence, ontologyData)
    except Exception:
        # Catch Exception rather than everything so that Ctrl-C and
        # SystemExit are not silently absorbed.
        comm.printException(comm.pathToSaveParsingErrors, "read_xml.py " + xmlurl)
Пример #10
0
def startToGetEntities(jsonurl, lause, ontologyData):
    """Split ``lause`` into sentences and run entity extraction on each one.

    Args:
        jsonurl: URL forwarded to ``getEntities.getEntities`` with every
            sentence.
        lause: Text to split with ``comm.replaceToPunkts``.
        ontologyData: Passed through unchanged to ``getEntities.getEntities``.
    """
    for piece in comm.replaceToPunkts(lause):
        getEntities.getEntities(jsonurl, piece, ontologyData)