def get_names(contents):
    """Extract person names (and employee IDs where captured) from the
    declaration text using a battery of regex patterns.

    Every candidate pattern is currently commented out pending individual
    validation, so this presently returns an empty list.
    """
    declaration = contents['declaration']  # Names only occur in the declaration
    # Each pattern below has worked on some documents but not all.
    # Uncomment and test them one at a time.
    patterns = [
        #':\s(.*)\s([\u06F0-\u06F90-9]{10})',
        #': (.*) \,',
        #'[\u06F0-\u06F90-9]{10} (.*) :',
        #namehelper.SALUTATIONS[0] + ' (.*)' + namehelper.NATIONAL_BANK_NUMBER + ' ([\u06F0-\u06F90-9]{10})',
        #'([\u06F0-\u06F90-9]{10}) ' + namehelper.NATIONAL_BANK_NUMBER + ' (.*) ' + namehelper.SALUTATIONS[0],
        #'([\u06F0-\u06F90-9]{10}) به شماره ملی'
    ]
    extracted = []
    for pattern in patterns:
        for match in re.findall(pattern, declaration):
            # A single capture means the pattern could not grab an ID.
            employee_id = None if len(match) == 1 else translator.convert(match[1])
            extracted.append({
                'name': translator.convert(match[0]),
                'employee_id': employee_id
            })
    return extracted
def parse_dates(html, contents):
    """Pull the newspaper, registration and best-guess meeting dates.

    The newspaper and registration dates live in fixed HTML elements; any
    other slash-separated dates found in the declaration are collected as
    `random_dates`, the first of which is treated as the meeting date.
    """
    soup = BeautifulSoup(html, 'html.parser')
    newspaper_date = translator.convert(
        soup.find(id='cphMain_lblNewsPaperDate').contents[0])
    registration_date = translator.convert(
        soup.find(id='cphMain_lblNewsDate').contents[0])
    # Match yyyy/mm/dd and dd/mm/yyyy in Persian or Western digits.
    patterns = (
        '[\u06F0-\u06F90-9]{4}/[\u06F0-\u06F90-9]{2}/[\u06F0-\u06F90-9]{2}',
        '[\u06F0-\u06F90-9]{2}/[\u06F0-\u06F90-9]{2}/[\u06F0-\u06F90-9]{4}'
    )
    random_dates = []
    for pattern in patterns:
        random_dates.extend(re.findall(pattern, contents['declaration']))
    meeting_date = translator.convert(random_dates[0]) if random_dates else None
    return {
        'newspaper_date': newspaper_date,
        'registration_date': registration_date,
        'meeting_date': meeting_date,
        'random_dates': random_dates
    }
def parse_name_sandwhich(contents):
    """Find names by 'sandwiching': a name always occurs between two known
    words/phrases, so scanning a short window after every preceding phrase
    and cutting at the first following phrase should recover every name.

    Args:
        contents - Generalized text content object as in parser.py
    Returns:
        list of dictionaries with keys `name` and `id`
    """
    text = contents['declaration']  # Names only appear in the main body
    window_size = 30  # Number of characters searched after each lead phrase
    id_pattern = '[\u06F0-\u06F90-9]{10}'
    results = []
    for lead in PRECEDING_SET:
        for hit in substring_indexes(lead, text):
            window_start = hit + 1  # Step past the matched position
            window = text[window_start:window_start + window_size]
            # IDs may trail slightly past the name window, so search wider.
            candidate_ids = re.findall(
                id_pattern, text[window_start:window_start + window_size + 10])
            employee_id = None
            if candidate_ids:
                if len(candidate_ids) > 1:
                    print('ERROR: multiple ids found in sandwhich method')
                    print(candidate_ids)
                # The first ID in the string is probably the right one
                employee_id = candidate_ids[0]
            matches = []
            for trail in PROCEEDING_SET:
                cut = window.find(trail)  # First instance of the following phrase
                if cut == -1:
                    continue
                # The window begins exactly where the name span begins.
                matches.append({
                    'name': translator.convert(window[:cut]),
                    'employee_id': translator.convert(employee_id)
                })
            if len(matches) > 1:
                print(
                    'Multiple names where found for the same beggining. Ensure clean function works.'
                )
            results += matches
    return results
def double_tap_names(contents):
    """Locate names by first finding any national ID in the text, then
    'double tapping': checking the surrounding characters for known marker
    phrases that confirm a person's name and show where to cut it from.
    """
    text = contents['declaration']
    # An ID plus 30 leading / 20 trailing characters of context.
    chunk_pattern = '.{30}[\u06F0-\u06F90-9]{10}.{20}'
    id_pattern = '[\u06F0-\u06F90-9]{10}'
    people = []
    # Every candidate ID chunk (might include business IDs, etc.)
    for chunk in re.findall(chunk_pattern, text):
        chunk_ids = re.findall(id_pattern, chunk)
        if len(chunk_ids) > 1:
            print('Unfortunately we found multiple IDs in the string.')
        # Double tap: look for a confirming marker phrase.
        confirmed = False
        name = None
        if namehelper.NATIONAL_BANK_NUMBER in chunk:
            # This bank number has only ever appeared next to a person's ID.
            confirmed = True
            marker = chunk.find(namehelper.NATIONAL_BANK_NUMBER)
            name = chunk[:marker]  # Liberal cut, but never loses part of the name
        elif namehelper.SALUTATIONS[0] in chunk:
            # Seeing a salutation (Mr.) also confirms a name.
            confirmed = True
            marker = chunk.find(namehelper.SALUTATIONS[0])
            name = chunk[marker:]  # Also liberal; can be refined later
        if not name:
            # In the future, named entity recognition would go here.
            print('No name grabbed')
        if confirmed:
            people.append({
                'name': translator.convert(name),  # Clean it up as much as we can
                'employee_id': translator.convert(chunk_ids[0])
            })  # ID is changed to English numerals
    return people
def parse(fileName):
    """Parse one saved registry HTML file into a structured record.

    Args:
        fileName: path to a saved '.html' document, e.g. 'files/123456.html'
    Returns:
        dict with document/company IDs, dates, extracted names and raw
        text, or None when the file is malformed.
    """
    certaintyScore = 100  # Decreased by percentages if signs of uncertainty show

    # The document ID is the file name with its '.html' extension removed.
    file_chunks = fileName.split('/')
    last_chunk = file_chunks[-1]
    document_id = last_chunk[:-5]  # Strip the trailing '.html'
    print('Document ID:', document_id)

    # Grab file contents; the with-block guarantees the handle is closed.
    with open(fileName) as htmlFile:
        html = htmlFile.read()

    # Extract all pertinent sections
    soup = BeautifulSoup(html, 'html.parser')

    # The declaration excludes the document-information section; this is
    # where the names occur. find() returns None when the element is
    # missing, and .contents[0] can fail on an empty element.
    try:
        declaration = soup.find(class_='Jus').contents[0]
    except (AttributeError, IndexError):
        print('Malformed file')
        return

    # The national ID is sometimes stored in the title element.
    try:
        title = soup.find(id='cphMain_lblNewsTitle').contents[0]
    except (AttributeError, IndexError):
        title = ''

    # These are all the extracted text chunks functions will have available to parse
    contents = {'title': title, 'declaration': declaration}

    # Company ID retrieval: prefer the title, fall back to the declaration.
    companyId = nationalid.parse_id(contents['title'])
    if DEBUG:
        print('National ID (title):', companyId)
    if companyId is None:
        certaintyScore *= 0.8  # Proof of concept, not a tuned penalty
        # The declaration might reference another corporation's ID, hence
        # the certainty penalty above.
        companyId = nationalid.parse_id(contents['declaration'])
        if DEBUG:
            print('National ID (declaration):', companyId)

    # Various date retrieval
    try:
        dates = dateextract.parse_dates(html, contents)
    except Exception:
        print('Malformed file. Returning early')
        return

    # Name retrieval via every available strategy, then de-duplication.
    names = get_names(contents)
    names += double_tap_names(contents)
    names += namehelper.parse_name_sandwhich(contents)
    cleaned_names = clean(names)

    # Convert the Persian (Jalali) registration date to a Gregorian timestamp.
    persian_date = dates['registration_date']
    date_data = persian_date.split('/')
    document_date = jdatetime.datetime(int(date_data[0]), int(date_data[1]),
                                       int(date_data[2])).togregorian()
    document_timestamp = time.mktime(document_date.timetuple())
    print(document_timestamp)

    return {
        'document_id': document_id,
        'document_date': document_timestamp,
        'company_id': translator.convert(companyId),
        's3_path': fileName[6:],  # Drop the local directory prefix
        'names': cleaned_names,
        'dates': dates,
        'raw_title': contents['title'],
        'raw_body': contents['declaration'],
        'certainty_score': certaintyScore,
        'parser_version': 1
    }
def processRequest(self):
    """Interpret a request, relay to further processing and prepare response headers.

    Reads the conversion state off `self` (content, page, source_format,
    target_format, html), sets the response Content-Type, runs the
    translation and stores the result — or an HTML error message — in
    `self.response_string`.
    """
    global debug
    # NOTE(review): `debug` is module-global; one request on the dev host
    # enables it for every later request in this process — confirm intended.
    if "rdf-translator-dev" in self.request.url:
        debug = True
    if self.html == True:
        # Pygmentized HTML output overrides the per-format MIME types below.
        self.do_pygmentize = True
        self.response.headers['Content-Type'] = "text/html"
    else:
        # Map the requested serialization to its response MIME type.
        if self.target_format == "pretty-xml" or self.target_format == "xml":
            self.response.headers['Content-Type'] = "application/rdf+xml"
        elif self.target_format == "n3":
            self.response.headers['Content-Type'] = "text/n3"
        elif self.target_format == "turtle":
            self.response.headers['Content-Type'] = "text/turtle"
        elif self.target_format == "nquads":
            self.response.headers['Content-Type'] = "text/x-nquads"
        elif self.target_format == "nt":
            self.response.headers['Content-Type'] = "text/plain"
        elif self.target_format == "trix":
            self.response.headers['Content-Type'] = "application/xml"
        elif self.target_format == "rdf-json" or self.target_format == "rdf-json-pretty":
            self.response.headers['Content-Type'] = "application/json"
        elif self.target_format == "json-ld":
            self.response.headers['Content-Type'] = "application/ld+json"
        elif self.target_format == "rdfa" or self.target_format == "microdata":
            self.response.headers['Content-Type'] = "text/html"
        else:
            self.response.headers['Content-Type'] = "text/plain"
    # Auto-detect the source format when it is missing or set to "detect".
    if not self.source_format or self.source_format == "detect":
        if self.content:
            source = create_input_source(data=self.content, format=self.source_format)
            self.source_format = source.content_type
        elif self.page:
            source = create_input_source(location=self.page, format=self.source_format)
            self.source_format = source.content_type
        if self.source_format == "text/html":
            self.source_format = "rdfa" # microdata is fallback
    try:
        # Placeholder kept if conversion fails before a result is assigned.
        self.response_string = "<p style='color: red; font-weight: bold; padding-top: 12px'>Translation failed</p>"
        if self.content:
            self.response_string = translator.convert(self.content, do_pygmentize=self.do_pygmentize, file_format="string", source_format=self.source_format, target_format=self.target_format)
            # An empty result for rdfa may really be microdata — retry once.
            if self.response_string.strip() == "" and self.source_format == "rdfa": # fix microdata test
                self.response_string = translator.convert(self.content, do_pygmentize=self.do_pygmentize, file_format="string", source_format="microdata", target_format=self.target_format)
        elif self.page:
            self.response_string = translator.convert(self.page, do_pygmentize=self.do_pygmentize, file_format="file", source_format=self.source_format, target_format=self.target_format)
            # Same microdata retry for page-based input.
            if self.response_string.strip() == "" and self.source_format == "rdfa": # fix microdata test
                self.response_string = translator.convert(self.page, do_pygmentize=self.do_pygmentize, file_format="file", source_format="microdata", target_format=self.target_format)
        if self.response_string.strip() == "":
            raise Exception("empty result returned")
    except Exception, e:
        # Python 2 except syntax. `e` is rebound below to the HTML-wrapped
        # message (full traceback when debug is enabled).
        self.response.set_status(500)
        if debug:
            tb = traceback.format_exc()
            e = "<pre style=\"color: red\">"+tb+"</pre>"
        else:
            e = "<pre style=\"color: red\">"+str(e)+"</pre>"
        error_message = "No error message available"
        if str(e).strip() != "":
            error_message = "Error message:<br>%s" % str(e)
        self.response_string = "<p style='color: red; font-weight: bold; padding-top: 12px'>Could not convert from %s to %s for provided resource...<br><br>%s</p>" % (self.source_format, self.target_format, error_message)
def processRequest(self): """Interpret a request, relay to further processing and prepare response headers.""" global debug if "rdf-translator-dev" in self.request.url: debug = True if self.html == True: self.do_pygmentize = True self.response.headers['Content-Type'] = "text/html" else: if self.target_format == "pretty-xml" or self.target_format == "xml": self.response.headers['Content-Type'] = "application/rdf+xml" elif self.target_format == "n3": self.response.headers['Content-Type'] = "text/n3" elif self.target_format == "turtle": self.response.headers['Content-Type'] = "text/turtle" elif self.target_format == "nquads": self.response.headers['Content-Type'] = "text/x-nquads" elif self.target_format == "nt": self.response.headers['Content-Type'] = "text/plain" elif self.target_format == "trix": self.response.headers['Content-Type'] = "application/xml" elif self.target_format == "rdf-json" or self.target_format == "rdf-json-pretty": self.response.headers['Content-Type'] = "application/json" elif self.target_format == "json-ld": self.response.headers['Content-Type'] = "application/ld+json" elif self.target_format == "rdfa" or self.target_format == "microdata": self.response.headers['Content-Type'] = "text/html" else: self.response.headers['Content-Type'] = "text/plain" if not self.source_format or self.source_format == "detect": if self.content: source = create_input_source(data=self.content, format=self.source_format) self.source_format = source.content_type elif self.page: source = create_input_source(location=self.page, format=self.source_format) self.source_format = source.content_type if self.source_format == "text/html": self.source_format = "rdfa" # microdata is fallback try: self.response_string = "<p style='color: red; font-weight: bold; padding-top: 12px'>Translation failed</p>" if self.content: self.response_string = translator.convert( self.content, do_pygmentize=self.do_pygmentize, file_format="string", source_format=self.source_format, 
target_format=self.target_format) if self.response_string.strip( ) == "" and self.source_format == "rdfa": # fix microdata test self.response_string = translator.convert( self.content, do_pygmentize=self.do_pygmentize, file_format="string", source_format="microdata", target_format=self.target_format) elif self.page: self.response_string = translator.convert( self.page, do_pygmentize=self.do_pygmentize, file_format="file", source_format=self.source_format, target_format=self.target_format) if self.response_string.strip( ) == "" and self.source_format == "rdfa": # fix microdata test self.response_string = translator.convert( self.page, do_pygmentize=self.do_pygmentize, file_format="file", source_format="microdata", target_format=self.target_format) if self.response_string.strip() == "": raise Exception("empty result returned") except Exception, e: self.response.set_status(500) if debug: tb = traceback.format_exc() e = "<pre style=\"color: red\">" + tb + "</pre>" else: e = "<pre style=\"color: red\">" + str(e) + "</pre>" error_message = "No error message available" if str(e).strip() != "": error_message = "Error message:<br>%s" % str(e) self.response_string = "<p style='color: red; font-weight: bold; padding-top: 12px'>Could not convert from %s to %s for provided resource...<br><br>%s</p>" % ( self.source_format, self.target_format, error_message)