Example #1
def readExcel(filePath, url, ontologyData):
    try:
        urr(url, filePath)
        try:
            workbook = xlrd.open_workbook(filePath)
            worksheets = workbook.sheet_names()
            for worksheet_name in worksheets:
                worksheet = workbook.sheet_by_name(worksheet_name)
                num_rows = worksheet.nrows - 1
                num_cells = worksheet.ncols - 1
                curr_row = -1
                while curr_row < num_rows:
                    curr_row += 1
                    #row = worksheet.row(curr_row)
                    #print ('Row:', curr_row)
                    curr_cell = -1
                    while curr_cell < num_cells:
                        curr_cell += 1
                        # Cell Types: 0=Empty, 1=Text, 2=Number, 3=Date, 4=Boolean, 5=Error, 6=Blank
                        cell_type = worksheet.cell_type(curr_row, curr_cell)
                        cell_value = worksheet.cell_value(curr_row, curr_cell)
                        if (cell_type == 1):
                            sentences = comm.replaceToPunkts(cell_value)
                            for sentence in sentences:
                                getEntities.getEntities(url, sentence, ontologyData)
        
        except:
            comm.printException(comm.pathToSaveParsingErrors, "read_excel.py " + url)
            pass
    except:
        comm.printException(comm.pathToSaveParsingErrors, "read_excel.py " + url)
        pass
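The cell-scanning loop above can be written with plain for loops over xlrd's row and column counts instead of the -1/while bookkeeping. A minimal sketch, assuming the sentence splitter and entity extractor are passed in as callables (in the original they are comm.replaceToPunkts and getEntities.getEntities):

import xlrd

def scan_text_cells(file_path, url, ontology_data, split_sentences, handle_sentence):
    # Walk every worksheet and feed each text cell's sentences to the entity extractor.
    workbook = xlrd.open_workbook(file_path)
    for sheet_name in workbook.sheet_names():
        sheet = workbook.sheet_by_name(sheet_name)
        for row in range(sheet.nrows):
            for col in range(sheet.ncols):
                # xlrd.XL_CELL_TEXT == 1, the same cell type checked above
                if sheet.cell_type(row, col) == xlrd.XL_CELL_TEXT:
                    for sentence in split_sentences(sheet.cell_value(row, col)):
                        handle_sentence(url, sentence, ontology_data)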
Example #2
def readPlainText(htmlurl, plaintext, ontologyData, _encoding):
    try:
        try:
            punc = (plaintext.decode(_encoding)).strip()
        except:
            try:
                punc = (plaintext.decode(sys.stdout.encoding)).strip()
            except:
                try:
                    punc = (plaintext.decode('UTF-8')).strip()
                except:
                    try:
                        punc = (plaintext.decode('latin-1')).strip()
                    except:
                        try:
                            punc = (plaintext.decode('ISO-8859-1')).strip()
                        except:
                            try:
                                punc = (plaintext.decode()).strip()
                            except:
                                punc = plaintext
                                pass

        sentences = comm.replaceToPunkts(punc)
        for sentence in sentences:
            getEntities.getEntities(htmlurl, sentence, ontologyData)
    except:
        comm.printException(comm.pathToSaveParsingErrors,
                            "read_plaintext.py " + _encoding + " " + htmlurl)
Example #3
def readPlainText(htmlurl, plaintext, ontologyData):
    try:
        punc = (plaintext).strip() 
        sentences = comm.replaceToPunkts(punc)
        for sentence in sentences:
            getEntities.getEntities(htmlurl, sentence, ontologyData)
    except:
        comm.printException(comm.pathToSaveParsingErrors, "read_plaintext.py " + htmlurl)
Example #4
def readPdf(filePath, url, od):
    urldownl(url, filePath)
    pdf = PdfFileReader(open(filePath, "rb"))
    pdf.strict = True

    try:
        for page in pdf.pages:
            text = (page.extractText())
            sentences = comm.replaceToPunkts(text)
            for sentence in sentences:
                getEntities.getEntities(url, sentence, od)
    except:
        comm.printException(comm.pathToSaveParsingErrors, "read_pdf.py " + url)
        pass
Example #5
def readPdf(url, readdedpdf, od):
    b = BytesIO(readdedpdf)
    pdfFile = PdfFileReader(b, "rb")
    pdfFile.strict = False
    #pdfFile = PdfFileReader("pdf-sample.pdf", "rb")
    
    #print(pdfFile)
    try:
        for i in range(pdfFile.numPages):
            #print(i)
            pageObject = pdfFile.getPage(i)#ContentStream(pdfFile.getPage(i)["/Contents"])
            text = (pageObject.extractText())
            sentences = comm.replaceToPunkts(text)
            for sentence in sentences:
                getEntities.getEntities(url, sentence, od)
    except:
        comm.printException(comm.pathToSaveParsingErrors, "read_pdf.py " + url)
        pass
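The two readPdf variants above use the legacy PyPDF2 names (PdfFileReader, numPages, getPage, extractText). A minimal sketch of the same per-page loop against the maintained pypdf package, with the sentence splitter and entity extractor passed in as callables rather than the project's comm and getEntities modules:

from io import BytesIO
from pypdf import PdfReader  # successor to the PdfFileReader API used above

def extract_pdf_entities(url, pdf_bytes, split_sentences, handle_sentence):
    # Same loop as readPdf: one extract_text() call per page, then sentence-level processing.
    reader = PdfReader(BytesIO(pdf_bytes))
    for page in reader.pages:
        text = page.extract_text() or ""  # guard against pages with no extractable text
        for sentence in split_sentences(text):
            handle_sentence(url, sentence)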
Example #6
 def test_getEntities(self):
     tweet = unicode('I wanna be a professional killer Like John Wick yeah')
     xEntities = {}
     actual_value = getEntities.getEntities(TestUM.parser, tweet, xEntities)
     expected_value = {
         'I': 'PRP',
         'Wick': 'NNP',
         'John': 'NNP',
         'killer': 'NN'
     }
     self.assertEqual(actual_value, expected_value)
Example #7
def readHtmlPage(htmlurl, readedPage, ontologyData, _encoding):
    try:
        sentences = set()
        root = parse(htmlurl).getroot()
        if (root is not None):
            for element in root.iter("head"):
                element.drop_tree()
            for element in root.iter("script"):
                element.drop_tree()
            for element in root.iter("style"):
                element.drop_tree()
            for element in root.iter("noscript"):
                element.drop_tree()
            for element in root.iter("input"):
                element.drop_tree()
            for element in root.iter("form"):
                element.drop_tree()
            for element in root.iter("title"):
                element.drop_tree()
            for element in root.iter("img"):
                element.drop_tree()

            for element in root.iter("body"):
                try:
                    sentences.add(element.text_content())
                except:
                    pass
            if (len(sentences) > 0):
                lsent = list(sentences)
                for lau in lsent:
                    if (lau != ""):
                        laused = comm.replaceToPunkts(lau)
                        for s6ne in laused:
                            getEntities.getEntities(htmlurl, s6ne.strip(),
                                                    ontologyData)
    except:
        comm.printException(comm.pathToSaveParsingErrors,
                            "read_html.py " + _encoding + " " + htmlurl)
        pass
Example #8
def readHtmlPage(htmlurl, readedPage, ontologyData):
    try:
        sentences = set()
        root = parse(htmlurl).getroot()
        #if the root is null, the html is incorrectly formed
        if (root is not None):
            for element in root.iter("head"):
                element.drop_tree()
            for element in root.iter("script"):
                element.drop_tree()
            for element in root.iter("style"):
                element.drop_tree()
            for element in root.iter("noscript"):
                element.drop_tree()
            for element in root.iter("input"):
                element.drop_tree()
            for element in root.iter("form"):
                element.drop_tree()
            for element in root.iter("title"):
                element.drop_tree()
            for element in root.iter("img"):
                element.drop_tree()
            
            for element in root.iter("body"):
                try:
                    sentences.add(element.text_content())
                except:
                    pass
            if(len(sentences) > 0): 
                lsent = list(sentences)
                for lau in lsent:
                    if(lau != ""):
                        laused = comm.replaceToPunkts(lau)
                        for s6ne in laused:
                            getEntities.getEntities(htmlurl, s6ne.strip(), ontologyData)
    except:
        comm.printException(comm.pathToSaveParsingErrors, "read_html.py " + htmlurl)
        pass
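Both readHtmlPage variants drop the same list of non-content tags one explicit loop at a time. A minimal sketch that keeps the tag list in one place; the lxml calls used (parse, iter, drop_tree, text_content) are the same ones as above:

from lxml.html import parse

# Tags whose content should never reach the entity extractor.
DROPPED_TAGS = ("head", "script", "style", "noscript", "input", "form", "title", "img")

def extract_body_text(htmlurl):
    # Return the text of each <body> element after stripping non-content markup.
    root = parse(htmlurl).getroot()
    if root is None:  # malformed HTML: no tree could be built
        return []
    for tag in DROPPED_TAGS:
        # materialize the iterator first, because drop_tree() mutates the tree
        for element in list(root.iter(tag)):
            element.drop_tree()
    return [body.text_content() for body in root.iter("body")]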
Example #9
def readXml(xmlurl, pathToFile, ontologyData):

    # https://docs.python.org/3.4/library/functions.html#setattr
    """
    #https://docs.python.org/3.4/library/xml.etree.elementtree.html?highlight=elementtree#parsing-xml
    if httpResponse is path to xml file:
      tree = ET.parse(httpResponse)
      root = tree.getroot()
    
    Or, directly from a string (when the XML content has already been read):
    """
    try:
        root = ET.fromstring(pathToFile)
        if root is not None:
            for data in root.iter():
                if data.text is not None:
                    stripped = data.text.strip()
                    if (stripped is not None) & (len(stripped) > 2):
                        sentences = comm.replaceToPunkts(stripped)
                        for sentence in sentences:
                            getEntities.getEntities(xmlurl, sentence, ontologyData)
    except:
        comm.printException(comm.pathToSaveParsingErrors, "read_xml.py " + xmlurl)
        pass
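Note that despite its name, the pathToFile argument holds the XML content itself: ET.fromstring parses a string, while ET.parse (mentioned in the docstring) takes a path or file object. A minimal sketch covering both entry points with the same text filter as readXml:

import xml.etree.ElementTree as ET

def iter_xml_text(xml_source, from_file=False):
    # Yield the stripped text of every element longer than two characters.
    root = ET.parse(xml_source).getroot() if from_file else ET.fromstring(xml_source)
    for node in root.iter():
        if node.text:
            stripped = node.text.strip()
            if len(stripped) > 2:  # same length filter as readXml
                yield stripped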
Example #10
 def test_getEntities2(self):
     tweet = unicode(
         'OpenMinted will create a readable summary of licenses and a harmonized vocabulary for text miners - Penny Labropoulou'
     )
     xEntities = {}
     actual_value = getEntities.getEntities(TestUM.parser, tweet, xEntities)
     expected_value = {
         'OpenMinted': 'NNP',
         'vocabulary': 'NN',
         'text': 'NN',
         'Penny': 'NNP',
         'summary': 'NN',
         'Labropoulou': 'NNP'
     }
     self.assertEqual(actual_value, expected_value)
Example #11
def startToGetEntities(jsonurl, lause, ontologyData):
    sentences = comm.replaceToPunkts(lause)
    for sentence in sentences:
        getEntities.getEntities(jsonurl, sentence, ontologyData)
Example #12
 sql = "select story_id, title, body from peepbuzz.stories where story_id > " + str(start_id) + " order by story_id asc"
 primoMapper = {'Y' : '1', '2' : '2', '3' : '3', '4' : '4', 'N' : '10'}
 pendingQ = mysql_tools.mysqlQuery(sql, link)
 while True:
     pending = pendingQ.fetch_row(1,1)
     if pending == ():
         break
 
     if pending[0]['title'] != '' and pending[0]['body'] != '' and pending[0]['title'] != None and pending[0]['body'] != None:
         print pending[0]['title']
         infoModule.info.entityList = {}
         #limit body to 5000 chars
         if len(pending[0]['body']) > 5000:
             pending[0]['body'] = pending[0]['body'][:5000]
         try:
             ents = getEntities.getEntities(pending[0]['body'], pending[0]['title'], True, True)
         except:
             #generic catch-all error for the many things that could go wrong with getEntities
             print "getEntities failed"
         else:
             try:
                 entities = json.loads(ents)
             except ValueError:
                 print "Value Error"
                 pass
             else:
                 for entity in entities:
                     #print entity['id']
                     #print entity['primo']
                     #print primoMapper[entity['primo']]
                     sql = "insert into peepbuzz.story_entities set story_id=" + str(pending[0]['story_id']) + ", entity_id=" + str(entity['id']) + ", primo=" + str(primoMapper[entity['primo']])
Example #13
 def test_getEntities1(self):
     tweet = unicode('James Bond is extraordinary character')
     xEntities = {}
     actual_value = getEntities.getEntities(TestUM.parser, tweet, xEntities)
     expected_value = {'James': 'NNP', 'character': 'NN', 'Bond': 'NNP'}
     self.assertEqual(actual_value, expected_value)