def get_raw_chars(page_text, library_entry, unicode_chars):
    """Collect the raw characters found next to a data flag on a page.

    The flag is located with ``jc.get_index_and_match`` and a window of at
    most ``raw_char_collect`` characters is sliced out of ``page_text`` on the
    side named by ``direction``. The window is clipped to the page and
    stripped of surrounding whitespace. Every character of the result that is
    a key of ``unicode_chars`` is then swapped for its replacement.

    NOTE(review): a second definition of ``get_raw_chars`` appears later in
    this file and shadows this one at import time.

    Args:
        page_text: Text of the page to search.
        library_entry: Mapping providing ``data_flag``, ``data_flag_inst``,
            ``direction`` (``RIGHT`` or ``LEFT``) and ``raw_char_collect``.
            The first two are handed to ``jc.get_index_and_match`` unchanged
            (presumably a pattern and the occurrence to find -- confirm
            against ``jc``).
        unicode_chars: Mapping from a problem character to the text that
            should replace it.

    Returns:
        dict: ``{'raw_text': <cleaned text>, 'match': <lookup's 'match'>}``.
        When the window cannot be computed, ``raw_text`` is built from the
        lookup's ``'index'`` value instead.

    Raises:
        ValueError: If ``direction`` is neither ``RIGHT`` nor ``LEFT``.
    """
    data_flag = library_entry['data_flag']
    flag_inst = library_entry['data_flag_inst']
    direction = library_entry['direction']
    num_chars = library_entry['raw_char_collect']

    # Fail with a clear message instead of an unbound-variable error later.
    if direction not in (RIGHT, LEFT):
        raise ValueError("unsupported direction: %r" % (direction,))

    index_and_match = jc.get_index_and_match(page_text, data_flag, flag_inst)
    flag_index = index_and_match['index']

    try:
        if direction == RIGHT:
            # The window starts right after the matched flag text.
            start_index = flag_index + len(index_and_match['match'])
            end_index = min(start_index + num_chars, len(page_text))
        else:
            # The window stops one character short of the flag position.
            # Clamp at 0 so a flag at the very start of the page yields an
            # empty window instead of a negative index wrapping to the end.
            end_index = max(flag_index - 1, 0)
            start_index = max(end_index - num_chars, 0)
        raw_text = page_text[start_index:end_index].strip()
    except Exception:
        # Best effort: the arithmetic fails when the lookup did not produce a
        # numeric index. NOTE(review): presumably 'index' then holds a marker
        # string such as "NO MATCHES" -- confirm against jc. That value is
        # passed through as the raw text.
        raw_text = flag_index

    # Replace problem characters so later output of the raw text does not hit
    # encoding errors; characters without a replacement are kept as they are.
    raw_text = "".join(unicode_chars.get(char, char) for char in raw_text)

    return {'raw_text': raw_text, 'match': index_and_match['match']}
def get_raw_chars(page_text, library_entry, unicode_chars):
    """Return the raw text surrounding a data flag, with problem chars replaced.

    ``jc.get_index_and_match`` locates the flag in ``page_text``. Depending on
    ``library_entry['direction']`` up to ``library_entry['raw_char_collect']``
    characters are taken to the right of the matched flag text, or to the
    left of the flag (ending one character before the flag position). The
    slice never leaves the page and is whitespace-stripped.

    NOTE(review): an earlier definition of ``get_raw_chars`` exists in this
    file; this later one is the definition that is in effect after import.

    Args:
        page_text: Page text to search in.
        library_entry: Mapping with the keys ``data_flag``,
            ``data_flag_inst``, ``direction`` and ``raw_char_collect``.
        unicode_chars: Mapping of single characters to their replacement
            text.

    Returns:
        dict: ``'raw_text'`` holds the cleaned text and ``'match'`` holds the
        ``'match'`` value reported by the lookup. If the slice positions
        cannot be computed, the lookup's ``'index'`` value is used as the raw
        text.

    Raises:
        ValueError: If the direction is neither ``RIGHT`` nor ``LEFT``.
    """
    flag = library_entry['data_flag']
    instance = library_entry['data_flag_inst']
    direction = library_entry['direction']
    window = library_entry['raw_char_collect']

    if direction == RIGHT:
        found = jc.get_index_and_match(page_text, flag, instance)
        try:
            # Begin immediately after the matched flag text.
            first = found['index'] + len(found['match'])
            last = min(first + window, len(page_text))
            raw_text = page_text[first:last].strip()
        except Exception:
            # Best effort: no usable numeric index, so hand back whatever the
            # lookup reported. NOTE(review): presumably a marker string such
            # as "NO MATCHES" -- confirm against jc.
            raw_text = found['index']
    elif direction == LEFT:
        found = jc.get_index_and_match(page_text, flag, instance)
        try:
            # Never let the end go negative: a flag at position 0 has nothing
            # to its left, and a negative end would wrap around the page.
            last = max(found['index'] - 1, 0)
            first = max(last - window, 0)
            raw_text = page_text[first:last].strip()
        except Exception:
            raw_text = found['index']
    else:
        raise ValueError("unsupported direction: %r" % (direction,))

    # Swap characters known to cause encoding trouble for their replacements;
    # every other character is copied through untouched.
    cleaned = [
        unicode_chars[char] if char in unicode_chars else char
        for char in raw_text
    ]

    return {'raw_text': "".join(cleaned), 'match': found['match']}
def get_ref_text(raw_text_string, library_entry):
    """Refine one piece of raw text into the final data string.

    The library entry selects how the wanted text is cut out of
    ``raw_text_string``:

    * ``collection_method == 'bounds'``: regular expressions locate the left
      and right edges of the data. A bound given as the literal string
      ``"null"`` means the start (left) or the end (right) of the raw text.
      Both bound searches run against the whole raw text.
    * ``collection_method == 'data'``: ``data_regex`` matches the data itself.

    The slice is then passed through ``jc.character_selection`` and
    ``jc.character_transform`` unless the matching entry value is ``"none"``.

    NOTE(review): a second definition of ``get_ref_text`` appears later in
    this file and shadows this one at import time.

    Args:
        raw_text_string: Raw text to refine, or one of the marker strings
            ``"NO MATCHES"`` / ``"INSTANCE NOT FOUND"``.
        library_entry: Mapping providing ``collection_method``,
            ``left_bound_regex``, ``right_bound_regex``, ``data_regex``,
            ``character_list`` and ``character_trans``.

    Returns:
        The refined text, ``"No Raw Text"`` when the raw text is one of the
        marker strings, or ``"Bad Raw Text"`` when no slice can be taken
        (unknown collection method or unusable slice positions).
    """
    collection_method = library_entry['collection_method']
    left_bound_regex = library_entry['left_bound_regex']
    right_bound_regex = library_entry['right_bound_regex']
    data_regex = library_entry['data_regex']
    character_list = library_entry['character_list']
    character_trans = library_entry['character_trans']

    # Nothing was collected upstream, so there is nothing to refine.
    if raw_text_string in ("NO MATCHES", "INSTANCE NOT FOUND"):
        return "No Raw Text"

    if collection_method == 'bounds':
        if left_bound_regex == "null":
            start_index = 0
        else:
            left = jc.get_index_and_match(raw_text_string, left_bound_regex, 1)
            try:
                # The data starts right after the left-bound match.
                start_index = left['index'] + len(left['match'])
            except Exception:
                # The left bound gave no usable position; keep the raw index
                # value and let the slice below decide whether it is usable.
                start_index = left['index']

        if right_bound_regex == "null":
            end_index = len(raw_text_string)
        else:
            right = jc.get_index_and_match(raw_text_string, right_bound_regex, 1)
            # No guard needed here: an unusable value is caught by the slice.
            end_index = right['index']
    elif collection_method == 'data':
        data = jc.get_index_and_match(raw_text_string, data_regex, 1)
        start_index = data['index']
        try:
            end_index = len(data['match']) + start_index
        except Exception:
            end_index = len(raw_text_string)
    else:
        # Unknown method: there are no slice positions to work with.
        return "Bad Raw Text"

    try:
        ref_text = raw_text_string[start_index:end_index]
    except Exception:
        # Non-numeric slice positions (e.g. a failed bound lookup).
        return "Bad Raw Text"

    # Text that equals one of the status strings is returned untouched.
    if ref_text in ("Bad Raw Text", "No Raw Text"):
        return ref_text

    if character_list != "none":
        ref_text = jc.character_selection(ref_text, character_list)
    if character_trans != "none":
        ref_text = jc.character_transform(ref_text, character_trans)
    return ref_text
def get_ref_text(raw_text_string, library_entry):
    """Cut the wanted data out of a raw text string.

    How the data is located depends on ``library_entry['collection_method']``:
    ``'bounds'`` finds the text between a left-bound and a right-bound regular
    expression (the literal string ``"null"`` stands for the start or the end
    of the raw text), while ``'data'`` matches the data directly with
    ``data_regex``. Unless the respective entry value is ``"none"``, the
    result is post-processed with ``jc.character_selection`` and
    ``jc.character_transform``.

    NOTE(review): an earlier definition of ``get_ref_text`` exists in this
    file; this later one is the definition that is in effect after import.

    Args:
        raw_text_string: The raw text, or ``"NO MATCHES"`` /
            ``"INSTANCE NOT FOUND"`` when nothing was collected.
        library_entry: Mapping with the keys ``collection_method``,
            ``left_bound_regex``, ``right_bound_regex``, ``data_regex``,
            ``character_list`` and ``character_trans``.

    Returns:
        The refined text; ``"No Raw Text"`` for the two marker inputs;
        ``"Bad Raw Text"`` if the collection method is unknown or the slice
        positions are unusable.
    """
    method = library_entry['collection_method']
    left_regex = library_entry['left_bound_regex']
    right_regex = library_entry['right_bound_regex']
    data_regex = library_entry['data_regex']
    keep_chars = library_entry['character_list']
    transform = library_entry['character_trans']

    if raw_text_string == "NO MATCHES" or raw_text_string == "INSTANCE NOT FOUND":
        return "No Raw Text"

    # Without a known method no slice positions can be determined.
    if method not in ('bounds', 'data'):
        return "Bad Raw Text"

    if method == 'data':
        hit = jc.get_index_and_match(raw_text_string, data_regex, 1)
        begin = hit['index']
        try:
            stop = len(hit['match']) + begin
        except Exception:
            # No usable match length: take everything up to the end.
            stop = len(raw_text_string)
    else:
        begin = 0
        if left_regex != "null":
            hit = jc.get_index_and_match(raw_text_string, left_regex, 1)
            try:
                # Skip past the left-bound match itself.
                begin = hit['index'] + len(hit['match'])
            except Exception:
                # Keep the raw index value; the slice below rejects it if it
                # is not a usable position.
                begin = hit['index']

        if right_regex == "null":
            stop = len(raw_text_string)
        else:
            # The right bound is searched in the whole raw text; an unusable
            # result is rejected by the slice below.
            stop = jc.get_index_and_match(raw_text_string, right_regex, 1)['index']

    try:
        refined = raw_text_string[begin:stop]
    except Exception:
        return "Bad Raw Text"

    # A slice that happens to equal a status string is passed through as is.
    if refined == "Bad Raw Text" or refined == "No Raw Text":
        return refined

    if keep_chars != "none":
        refined = jc.character_selection(refined, keep_chars)
    if transform != "none":
        refined = jc.character_transform(refined, transform)
    return refined