def readExcel(filePath, url, ontologyData):
    """Download an Excel workbook from *url* to *filePath*, then run entity
    extraction over every text cell of every worksheet.

    Failures are logged via comm.printException and otherwise suppressed so
    one broken document never aborts a larger crawl.
    """
    try:
        # Fetch the remote workbook to the local path first; a download
        # failure is logged and the function returns without parsing.
        urr(url, filePath)
    except Exception:
        comm.printException(comm.pathToSaveParsingErrors, "read_excel.py " + url)
        return
    try:
        workbook = xlrd.open_workbook(filePath)
        for worksheet_name in workbook.sheet_names():
            worksheet = workbook.sheet_by_name(worksheet_name)
            for row in range(worksheet.nrows):
                for col in range(worksheet.ncols):
                    # xlrd cell types: 0=Empty, 1=Text, 2=Number, 3=Date,
                    # 4=Boolean, 5=Error, 6=Blank -- only text cells carry
                    # sentences worth parsing.
                    if worksheet.cell_type(row, col) == 1:
                        cell_value = worksheet.cell_value(row, col)
                        for sentence in comm.replaceToPunkts(cell_value):
                            getEntities.getEntities(url, sentence, ontologyData)
    except Exception:
        comm.printException(comm.pathToSaveParsingErrors, "read_excel.py " + url)
def readExcel(filePath, url, ontologyData):
    """Fetch the Excel file at *url* into *filePath* and feed every text
    cell of every sheet through sentence splitting and entity extraction.

    All failures are logged to comm.pathToSaveParsingErrors and swallowed,
    so a single bad workbook cannot stop a batch run.
    """
    try:
        urr(url, filePath)  # download the workbook to the local path
        workbook = xlrd.open_workbook(filePath)
        for sheet_name in workbook.sheet_names():
            sheet = workbook.sheet_by_name(sheet_name)
            for r in range(sheet.nrows):
                for c in range(sheet.ncols):
                    # xlrd cell types: 0=Empty, 1=Text, 2=Number, 3=Date,
                    # 4=Boolean, 5=Error, 6=Blank; only text cells matter.
                    if sheet.cell_type(r, c) == 1:
                        for sentence in comm.replaceToPunkts(sheet.cell_value(r, c)):
                            getEntities.getEntities(url, sentence, ontologyData)
    except Exception:
        # Narrowed from a bare `except:` so Ctrl-C / SystemExit still work.
        comm.printException(comm.pathToSaveParsingErrors, "read_excel.py " + url)
def readPlainText(htmlurl, plaintext, ontologyData, _encoding):
    """Decode *plaintext* using a cascade of candidate encodings, split it
    into sentences, and run entity extraction on each sentence.

    The original nested six bare try/except levels; this tries each codec
    in order and keeps the raw value if nothing decodes.
    """
    try:
        punc = plaintext  # fallback: use the raw value undecoded
        # Try the declared encoding first, then progressively more generic
        # fallbacks; a falsy codec means "use the platform default decode".
        for codec in (_encoding, sys.stdout.encoding, 'UTF-8',
                      'latin-1', 'ISO-8859-1', None):
            try:
                decoded = plaintext.decode(codec) if codec else plaintext.decode()
                punc = decoded.strip()
                break
            except Exception:
                continue
        for sentence in comm.replaceToPunkts(punc):
            getEntities.getEntities(htmlurl, sentence, ontologyData)
    except Exception:
        comm.printException(comm.pathToSaveParsingErrors,
                            "read_plaintext.py " + _encoding + " " + htmlurl)
def readPlainText(htmlurl, plaintext, ontologyData):
    """Strip *plaintext*, split it into sentences, and run entity
    extraction on each sentence against *ontologyData*.

    Any failure is logged to the parsing-errors file and suppressed.
    """
    try:
        punc = plaintext.strip()
        for sentence in comm.replaceToPunkts(punc):
            getEntities.getEntities(htmlurl, sentence, ontologyData)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt still works.
        comm.printException(comm.pathToSaveParsingErrors,
                            "read_plaintext.py " + htmlurl)
def readPdf(filePath, url, od):
    """Download the PDF at *url* to *filePath* and run entity extraction
    over the text of every page.

    Page-level failures are logged and suppressed.
    """
    urldownl(url, filePath)
    try:
        # `with` guarantees the file handle is closed -- the original
        # leaked the handle returned by open().
        with open(filePath, "rb") as fh:
            pdf = PdfFileReader(fh)
            pdf.strict = True
            for page in pdf.pages:
                text = page.extractText()
                for sentence in comm.replaceToPunkts(text):
                    getEntities.getEntities(url, sentence, od)
    except Exception:
        comm.printException(comm.pathToSaveParsingErrors, "read_pdf.py " + url)
def readPdf(url, readdedpdf, od):
    """Parse an already-downloaded PDF byte string *readdedpdf* and run
    entity extraction over the text of every page.

    Failures are logged to the parsing-errors file and suppressed.
    """
    # Wrap the raw bytes in a file-like object. strict=False tolerates
    # slightly malformed PDFs. (The original passed "rb" as PdfFileReader's
    # second positional argument -- which is `strict`, not a mode string --
    # and then overrode it with `pdfFile.strict = False`.)
    pdfFile = PdfFileReader(BytesIO(readdedpdf), strict=False)
    try:
        for i in range(pdfFile.numPages):
            text = pdfFile.getPage(i).extractText()
            for sentence in comm.replaceToPunkts(text):
                getEntities.getEntities(url, sentence, od)
    except Exception:
        comm.printException(comm.pathToSaveParsingErrors, "read_pdf.py " + url)
def readPdf(url, readdedpdf, od):
    """Extract entities from every page of a PDF given as raw bytes.

    *readdedpdf* is the PDF content already fetched from *url*; *od* is the
    ontology data handed through to getEntities. Errors are logged and
    suppressed so one bad PDF cannot stop a batch run.
    """
    stream = BytesIO(readdedpdf)
    # strict=False: tolerate malformed PDFs. The original passed "rb" as the
    # second positional arg (the `strict` flag) and then overrode it.
    reader = PdfFileReader(stream, strict=False)
    try:
        for page_index in range(reader.numPages):
            page_text = reader.getPage(page_index).extractText()
            for sentence in comm.replaceToPunkts(page_text):
                getEntities.getEntities(url, sentence, od)
    except Exception:
        comm.printException(comm.pathToSaveParsingErrors, "read_pdf.py " + url)
def test_getEntities(self):
    """POS-tag a sample tweet and verify the extracted entity map."""
    tweet = unicode('I wanna be a professional killer Like John Wick yeah')
    expected = {
        'I': 'PRP',
        'John': 'NNP',
        'Wick': 'NNP',
        'killer': 'NN',
    }
    actual = getEntities.getEntities(TestUM.parser, tweet, {})
    self.assertEqual(actual, expected)
def readHtmlPage(htmlurl, readedPage, ontologyData, _encoding):
    """Parse the HTML page at *htmlurl*, strip boilerplate elements, and
    run entity extraction over the body text.

    NOTE(review): the page is re-parsed from the URL; `readedPage` is not
    used by this implementation -- kept for interface compatibility.
    """
    try:
        sentences = set()
        root = parse(htmlurl).getroot()
        # If the root is None the HTML is too malformed to process.
        if root is not None:
            # Drop non-content elements before reading the body text
            # (the original repeated this loop once per tag name).
            for tag in ("head", "script", "style", "noscript",
                        "input", "form", "title", "img"):
                for element in root.iter(tag):
                    element.drop_tree()
            for element in root.iter("body"):
                try:
                    sentences.add(element.text_content())
                except Exception:
                    pass  # best-effort: skip bodies whose text can't be read
            for chunk in sentences:
                if chunk != "":
                    for sentence in comm.replaceToPunkts(chunk):
                        getEntities.getEntities(htmlurl, sentence.strip(),
                                                ontologyData)
    except Exception:
        comm.printException(comm.pathToSaveParsingErrors,
                            "read_html.py " + _encoding + " " + htmlurl)
def readHtmlPage(htmlurl, readedPage, ontologyData):
    """Strip boilerplate from the HTML page at *htmlurl* and extract
    entities from the remaining body text.

    NOTE(review): parses the live URL; `readedPage` is unused here but kept
    so existing callers keep working.
    """
    try:
        body_texts = set()
        root = parse(htmlurl).getroot()
        # A None root means the HTML is incorrectly formed -- nothing to do.
        if root is not None:
            # One data-driven pass replaces eight copy-pasted drop loops.
            for tag in ("head", "script", "style", "noscript",
                        "input", "form", "title", "img"):
                for element in root.iter(tag):
                    element.drop_tree()
            for element in root.iter("body"):
                try:
                    body_texts.add(element.text_content())
                except Exception:
                    pass  # best-effort: ignore unreadable body elements
            for text in body_texts:
                if text != "":
                    for sentence in comm.replaceToPunkts(text):
                        getEntities.getEntities(htmlurl, sentence.strip(),
                                                ontologyData)
    except Exception:
        comm.printException(comm.pathToSaveParsingErrors,
                            "read_html.py " + htmlurl)
def readXml(xmlurl, pathToFile, ontologyData):
    """Parse an XML document and run entity extraction over the text of
    every element longer than two characters.

    NOTE(review): despite its name, `pathToFile` holds the XML *content*
    (it is fed to ET.fromstring), not a filesystem path.
    """
    try:
        root = ET.fromstring(pathToFile)
        if root is not None:
            for node in root.iter():
                if node.text is not None:
                    stripped = node.text.strip()
                    # Original used bitwise `&` plus a redundant
                    # `is not None` check -- str.strip() never returns None,
                    # so only the length threshold matters.
                    if len(stripped) > 2:
                        for sentence in comm.replaceToPunkts(stripped):
                            getEntities.getEntities(xmlurl, sentence,
                                                    ontologyData)
    except Exception:
        comm.printException(comm.pathToSaveParsingErrors,
                            "read_xml.py " + xmlurl)
def readXml(xmlurl, pathToFile, ontologyData):
    """Extract entities from the text of every XML element.

    `pathToFile` is the XML content as a string (parsed with
    ET.fromstring), not a path. Failures are logged and suppressed.
    """
    try:
        root = ET.fromstring(pathToFile)
        if root is not None:
            for data in root.iter():
                if data.text is not None:
                    stripped = data.text.strip()
                    # Fixed: the original combined checks with bitwise `&`;
                    # `strip()` can't return None, so test only the length.
                    if len(stripped) > 2:
                        sentences = comm.replaceToPunkts(stripped)
                        for sentence in sentences:
                            getEntities.getEntities(xmlurl, sentence,
                                                    ontologyData)
    except Exception:
        comm.printException(comm.pathToSaveParsingErrors,
                            "read_xml.py " + xmlurl)
def test_getEntities2(self):
    """Verify entity extraction on a longer multi-proper-noun tweet."""
    tweet = unicode(
        'OpenMinted will create a readable summary of licenses and a harmonized vocabulary for text miners - Penny Labropoulou'
    )
    expected = {
        'OpenMinted': 'NNP',
        'Penny': 'NNP',
        'Labropoulou': 'NNP',
        'summary': 'NN',
        'vocabulary': 'NN',
        'text': 'NN',
    }
    self.assertEqual(getEntities.getEntities(TestUM.parser, tweet, {}),
                     expected)
while True: pending = pendingQ.fetch_row(1, 1) if pending == (): break if pending[0]['title'] != '' and pending[0][ 'body'] != '' and pending[0]['title'] != None and pending[ 0]['body'] != None: print pending[0]['title'] infoModule.info.entityList = {} #limit body to 5000 chars if len(pending[0]['body']) > 5000: pending[0]['body'] = pending[0]['body'][:5000] try: ents = getEntities.getEntities(pending[0]['body'], pending[0]['title'], True, True) except: #generic catch-all error for the many things that could go wrong with getEntities print "getEntities failed" else: try: entities = json.loads(ents) except ValueError: print "Value Error" pass else: for entity in entities: #print entity['id'] #print entity['primo'] #print primoMapper[entity['primo']]
def startToGetEntities(jsonurl, lause, ontologyData):
    """Split the text *lause* into sentences and run entity extraction on
    each one, tagging results with *jsonurl*."""
    for sentence in comm.replaceToPunkts(lause):
        getEntities.getEntities(jsonurl, sentence, ontologyData)
def test_getEntities2(self):
    """Entity extraction should pick out proper nouns and common nouns
    from a longer tweet."""
    text = unicode('OpenMinted will create a readable summary of licenses and a harmonized vocabulary for text miners - Penny Labropoulou')
    result = getEntities.getEntities(TestUM.parser, text, {})
    self.assertEqual(result, {
        'OpenMinted': 'NNP',
        'Penny': 'NNP',
        'Labropoulou': 'NNP',
        'vocabulary': 'NN',
        'summary': 'NN',
        'text': 'NN',
    })
def test_getEntities1(self):
    """A short sentence should yield two proper nouns and one noun."""
    text = unicode('James Bond is extraordinary character')
    result = getEntities.getEntities(TestUM.parser, text, {})
    self.assertEqual(result,
                     {'James': 'NNP', 'Bond': 'NNP', 'character': 'NN'})
def test_getEntities(self):
    """Entity extraction on a tweet: pronoun, noun, and proper nouns."""
    text = unicode('I wanna be a professional killer Like John Wick yeah')
    result = getEntities.getEntities(TestUM.parser, text, {})
    self.assertEqual(
        result,
        {'I': 'PRP', 'John': 'NNP', 'Wick': 'NNP', 'killer': 'NN'})
# Scan peepbuzz.stories above start_id and insert one story_entities row per
# entity found in each story body.  (Python 2 script: print statements.)
sql = "select story_id, title, body from peepbuzz.stories where story_id > " + str(start_id) + " order by story_id asc"
# Maps getEntities "primo" codes to the numeric primo values stored in the DB.
primoMapper = {'Y' : '1', '2' : '2', '3' : '3', '4' : '4', 'N' : '10'}
pendingQ = mysql_tools.mysqlQuery(sql, link)
while True:
    # fetch_row(1, 1): one row at a time, returned as a dict keyed by column.
    pending = pendingQ.fetch_row(1,1)
    if pending == ():
        # Empty tuple means the result set is exhausted.
        break
    # Only process stories with a non-empty, non-NULL title AND body.
    if pending[0]['title'] != '' and pending[0]['body'] != '' and pending[0]['title'] != None and pending[0]['body'] != None:
        print pending[0]['title']
        # Reset the shared entity list before each story.
        infoModule.info.entityList = {}
        #limit body to 5000 chars
        if len(pending[0]['body']) > 5000:
            pending[0]['body'] = pending[0]['body'][:5000]
        try:
            ents = getEntities.getEntities(pending[0]['body'], pending[0]['title'], True, True)
        except:
            #generic catch-all error for the many things that could go wrong with getEntities
            print "getEntities failed"
        else:
            # getEntities returns a JSON string; decode it to a list of dicts.
            try:
                entities = json.loads(ents)
            except ValueError:
                print "Value Error"
                pass
            else:
                for entity in entities:
                    #print entity['id']
                    #print entity['primo']
                    #print primoMapper[entity['primo']]
                    # NOTE(review): the INSERT string is built here but is not
                    # executed within this visible span -- presumably run
                    # further down; confirm against the full script.  Values
                    # are concatenated, not parameterized (SQL-injection risk
                    # if entity data is untrusted).
                    sql = "insert into peepbuzz.story_entities set story_id=" + str(pending[0]['story_id']) + ", entity_id=" + str(entity['id']) + ", primo=" + str(primoMapper[entity['primo']])
def test_getEntities1(self):
    """Check POS-tagged entities extracted from a simple sentence."""
    sample = unicode('James Bond is extraordinary character')
    expected = {'James': 'NNP', 'Bond': 'NNP', 'character': 'NN'}
    self.assertEqual(getEntities.getEntities(TestUM.parser, sample, {}),
                     expected)