def get_raw_chars(page_text, library_entry):
    """Collect the raw characters that sit next to a flag in ``page_text``.

    The flag ``library_entry['data_flag']`` is located with
    ``useful.get_index_and_match`` (``library_entry['data_flag_inst']`` is
    passed straight through to that helper). Up to
    ``library_entry['raw_char_collect']`` characters are then taken either

    * to the RIGHT of the flag, starting just after the matched text, or
    * to the LEFT of the flag, ending one character before the flag index,

    stripped of surrounding whitespace, and every character that is a key of
    ``unicode_chars`` is replaced by its mapped value.

    Returns a dict ``{'raw_text': ..., 'match': ...}`` where ``'match'`` is
    the helper's ``'match'`` value, unchanged.

    Raises ``ValueError`` when ``library_entry['direction']`` is neither
    ``RIGHT`` nor ``LEFT``.
    """
    ## NOTE(review): this file defines get_raw_chars a second time further
    ## down; the later definition replaces this one at import time.
    data_flag = library_entry['data_flag']
    flag_inst = library_entry['data_flag_inst']
    direction = library_entry['direction']
    num_chars = library_entry['raw_char_collect']

    ## Fail loudly on an unknown direction instead of hitting an unbound
    ## local further down.
    if direction != RIGHT and direction != LEFT:
        raise ValueError(
            "direction must be RIGHT or LEFT, got %r" % (direction,))

    index_and_match = useful.get_index_and_match(page_text, data_flag,
                                                 flag_inst)
    match_length = len(index_and_match['match'])
    flag_index = index_and_match['index']

    try:
        if direction == RIGHT:
            ## Start right after the last character of the flag and never
            ## run past the end of the page.
            start_index = flag_index + match_length
            end_index = min(start_index + num_chars, len(page_text))
        else:
            ## Stop one character before the flag. Clamp both bounds at 0:
            ## a negative end index would be read by the slice as "count
            ## from the end" and return almost the whole page.
            end_index = max(flag_index - 1, 0)
            start_index = max(end_index - num_chars, 0)
        raw_text = page_text[start_index:end_index].strip()
    except TypeError:
        ## The index was not usable for arithmetic -- presumably a sentinel
        ## string from the helper when the flag is missing (TODO confirm
        ## against useful.get_index_and_match). Pass it on as the raw text.
        raw_text = flag_index

    ## Swap every character that has an entry in unicode_chars; keep the rest.
    translated = "".join(unicode_chars.get(char, char) for char in raw_text)
    return {'raw_text': translated, 'match': index_and_match['match']}
def get_ref_text(raw_text_string, library_entry):
    """Cut one piece of data out of a raw text snippet.

    ``library_entry['collection_method']`` selects how the slice is found:

    * ``'bounds'`` -- the slice runs from just after the first match of
      ``left_bound_regex`` (or from 0 when that is ``"null"``) up to the
      index of the first match of ``right_bound_regex`` (or to the end of
      the snippet when that is ``"null"``).
    * ``'data'`` -- the slice is the first match of ``data_regex``.

    Matches are located with ``useful.get_index_and_match``. Unless they are
    ``"none"``, ``character_list`` and ``character_trans`` are then applied
    through ``useful.character_selection`` and
    ``useful.character_transform``.

    Returns the extracted text, ``"No Raw Text"`` when ``raw_text_string``
    is one of the sentinels ``"NO MATCHES"`` / ``"INSTANCE NOT FOUND"``, or
    ``"Bad Raw Text"`` when no usable slice could be built (unknown
    collection method or non-integer slice bounds).
    """
    ## NOTE(review): this file defines get_ref_text a second time further
    ## down; the later definition replaces this one at import time.
    collection_method = library_entry['collection_method']
    left_bound_regex = library_entry['left_bound_regex']
    right_bound_regex = library_entry['right_bound_regex']
    data_regex = library_entry['data_regex']
    character_list = library_entry['character_list']
    character_trans = library_entry['character_trans']

    if raw_text_string in ("NO MATCHES", "INSTANCE NOT FOUND"):
        return "No Raw Text"

    if collection_method == 'bounds':
        if left_bound_regex == "null":
            start_index = 0
        else:
            index_and_match = useful.get_index_and_match(
                raw_text_string, left_bound_regex, 1)
            start_index = index_and_match['index']
            try:
                ## Begin right after the left bound itself.
                start_index = start_index + len(index_and_match['match'])
            except TypeError:
                ## Index/match not usable for arithmetic; keep the bare
                ## index and let the slice below decide.
                pass
        if right_bound_regex == "null":
            ## No right bound: take everything to the end of the snippet.
            end_index = len(raw_text_string)
        else:
            index_and_match = useful.get_index_and_match(
                raw_text_string, right_bound_regex, 1)
            end_index = index_and_match['index']
    elif collection_method == 'data':
        index_and_match = useful.get_index_and_match(
            raw_text_string, data_regex, 1)
        start_index = index_and_match['index']
        try:
            end_index = len(index_and_match['match']) + start_index
        except TypeError:
            end_index = len(raw_text_string)
    else:
        ## Unknown collection method: there are no bounds to slice with.
        return "Bad Raw Text"

    try:
        ref_text = raw_text_string[start_index:end_index]
    except TypeError:
        ## A bound was not an integer -- presumably a "not found" sentinel
        ## from the helper (TODO confirm against useful.get_index_and_match).
        return "Bad Raw Text"

    ## Text that happens to equal a status string is returned untouched.
    if ref_text in ("Bad Raw Text", "No Raw Text"):
        return ref_text

    if character_list != "none":
        ref_text = useful.character_selection(ref_text, character_list)
    if character_trans != "none":
        ref_text = useful.character_transform(ref_text, character_trans)
    return ref_text
def get_raw_chars(page_text, library_entry):
    """Return the raw characters found beside a flag in ``page_text``.

    ``useful.get_index_and_match`` locates ``library_entry['data_flag']``
    (``library_entry['data_flag_inst']`` is handed to the helper as-is).
    Depending on ``library_entry['direction']`` the function then takes up
    to ``library_entry['raw_char_collect']`` characters

    * RIGHT: beginning immediately after the matched flag text, or
    * LEFT: ending one character before the flag index,

    strips surrounding whitespace and replaces each character that is a key
    of ``unicode_chars`` with the value stored for it.

    Returns ``{'raw_text': ..., 'match': ...}``; ``'match'`` is whatever the
    helper reported under ``'match'``.

    Raises ``ValueError`` if the direction is neither ``RIGHT`` nor ``LEFT``.
    """
    ## NOTE(review): an earlier definition of get_raw_chars exists above in
    ## this file; this later one is the definition that takes effect.
    data_flag = library_entry['data_flag']
    flag_inst = library_entry['data_flag_inst']
    direction = library_entry['direction']
    num_chars = library_entry['raw_char_collect']

    ## Reject an unknown direction up front rather than failing later on a
    ## variable that was never assigned.
    if direction != RIGHT and direction != LEFT:
        raise ValueError(
            "direction must be RIGHT or LEFT, got %r" % (direction,))

    index_and_match = useful.get_index_and_match(page_text, data_flag,
                                                 flag_inst)
    match_length = len(index_and_match['match'])
    flag_index = index_and_match['index']

    try:
        if direction == RIGHT:
            ## Window opens right after the flag and is capped at the end
            ## of the page.
            start_index = flag_index + match_length
            end_index = min(start_index + num_chars, len(page_text))
        else:
            ## Window closes one character before the flag. Both bounds are
            ## clamped at 0 because a negative end index would make the
            ## slice count from the end of the page.
            end_index = max(flag_index - 1, 0)
            start_index = max(end_index - num_chars, 0)
        raw_text = page_text[start_index:end_index].strip()
    except TypeError:
        ## Arithmetic on the index failed -- presumably the helper returned
        ## a sentinel string because the flag was not found (TODO confirm).
        ## Forward that value as the raw text.
        raw_text = flag_index

    ## Replace mapped characters, leave all others as they are.
    translated = "".join(unicode_chars.get(char, char) for char in raw_text)
    return {'raw_text': translated, 'match': index_and_match['match']}
def get_ref_text(raw_text_string, library_entry):
    """Extract a single piece of data from a raw text snippet.

    The strategy comes from ``library_entry['collection_method']``:

    * ``'bounds'`` -- take the text between the first match of
      ``left_bound_regex`` (exclusive; start at 0 when it is ``"null"``)
      and the index of the first match of ``right_bound_regex`` (end of the
      snippet when it is ``"null"``).
    * ``'data'`` -- take the first match of ``data_regex``.

    ``useful.get_index_and_match`` does the searching. Afterwards
    ``useful.character_selection`` and ``useful.character_transform`` are
    applied when ``character_list`` / ``character_trans`` are not ``"none"``.

    Returns the extracted text, ``"No Raw Text"`` when ``raw_text_string``
    is the sentinel ``"NO MATCHES"`` or ``"INSTANCE NOT FOUND"``, or
    ``"Bad Raw Text"`` when the slice bounds are unusable or the collection
    method is unknown.
    """
    ## NOTE(review): an earlier definition of get_ref_text exists above in
    ## this file; this later one is the definition that takes effect.
    collection_method = library_entry['collection_method']
    left_bound_regex = library_entry['left_bound_regex']
    right_bound_regex = library_entry['right_bound_regex']
    data_regex = library_entry['data_regex']
    character_list = library_entry['character_list']
    character_trans = library_entry['character_trans']

    if raw_text_string in ("NO MATCHES", "INSTANCE NOT FOUND"):
        return "No Raw Text"

    if collection_method == 'bounds':
        if left_bound_regex == "null":
            start_index = 0
        else:
            left_hit = useful.get_index_and_match(
                raw_text_string, left_bound_regex, 1)
            start_index = left_hit['index']
            try:
                ## Skip over the left bound so it is not part of the data.
                start_index = start_index + len(left_hit['match'])
            except TypeError:
                ## Could not add the match length; fall back to the bare
                ## index and let the slice below validate it.
                pass
        if right_bound_regex == "null":
            ## Open-ended on the right: run to the end of the snippet.
            end_index = len(raw_text_string)
        else:
            right_hit = useful.get_index_and_match(
                raw_text_string, right_bound_regex, 1)
            end_index = right_hit['index']
    elif collection_method == 'data':
        data_hit = useful.get_index_and_match(raw_text_string, data_regex, 1)
        start_index = data_hit['index']
        try:
            end_index = len(data_hit['match']) + start_index
        except TypeError:
            end_index = len(raw_text_string)
    else:
        ## No recognised method means no bounds were computed.
        return "Bad Raw Text"

    try:
        ref_text = raw_text_string[start_index:end_index]
    except TypeError:
        ## Non-integer bound -- presumably a "not found" sentinel returned
        ## by the helper (TODO confirm against useful.get_index_and_match).
        return "Bad Raw Text"

    ## A slice that equals one of the status strings skips post-processing.
    if ref_text in ("Bad Raw Text", "No Raw Text"):
        return ref_text

    if character_list != "none":
        ref_text = useful.character_selection(ref_text, character_list)
    if character_trans != "none":
        ref_text = useful.character_transform(ref_text, character_trans)
    return ref_text