Python get_index_and_match示例，useful.get_index_and_match Python示例

示例#1

0

显示文件

文件： extraction2_0.py 项目： mikpim01/PDF2EXCEL

def get_raw_chars(page_text, library_entry):
    """Collect the characters that sit next to a flag inside ``page_text``.

    The flag is located with ``useful.get_index_and_match(page_text,
    data_flag, flag_inst)``; the character window is then taken relative to
    the returned ``'index'``:

    * ``RIGHT`` - the window starts right after the matched flag text and
      spans up to ``raw_char_collect`` characters, clipped to the end of
      ``page_text``.
    * ``LEFT`` - the window ends one character before the flag's index and
      reaches back up to ``raw_char_collect`` characters, clipped to the
      start of ``page_text``.

    The window is stripped of surrounding whitespace, and every character
    that is a key of ``unicode_chars`` is replaced by its mapped value.

    Args:
        page_text: Text to search and slice.
        library_entry: Mapping providing ``'data_flag'``,
            ``'data_flag_inst'``, ``'direction'`` and ``'raw_char_collect'``.

    Returns:
        A dict with ``'raw_text'`` (the collected text) and ``'match'``
        (the ``'match'`` value reported by the flag lookup).

    Raises:
        ValueError: If ``library_entry['direction']`` is neither ``RIGHT``
            nor ``LEFT``.
    """
    data_flag = library_entry['data_flag']
    flag_inst = library_entry['data_flag_inst']
    direction = library_entry['direction']
    num_chars = library_entry['raw_char_collect']

    ## An unknown direction used to surface as a NameError on an unbound
    ## local further down; report it up front instead.
    if direction != RIGHT and direction != LEFT:
        raise ValueError(
            "direction must be RIGHT or LEFT, got %r" % (direction,))

    index_and_match = useful.get_index_and_match(page_text, data_flag,
                                                 flag_inst)
    flag_index = index_and_match['index']

    try:
        if direction == RIGHT:
            ## Start right after the last char of the flag
            start_index = flag_index + len(index_and_match['match'])
            end_index = min(start_index + num_chars, len(page_text))
        else:
            ## Stop one char short of the flag. Clamp at 0: a flag at
            ## index 0 would otherwise give end_index == -1 and the slice
            ## would return almost the whole page.
            end_index = max(flag_index - 1, 0)
            start_index = max(end_index - num_chars, 0)
        raw_text = page_text[start_index:end_index].strip()
    except TypeError:
        ## The index arithmetic failed. Presumably the lookup reported a
        ## non-numeric 'index' (an error marker) - confirm against
        ## useful.get_index_and_match. Pass that value through as the text.
        raw_text = flag_index

    ## Swap every character that has an entry in unicode_chars
    raw_text = "".join(unicode_chars.get(char, char) for char in raw_text)

    return {'raw_text': raw_text, 'match': index_and_match['match']}

示例#2

0

显示文件

文件： extraction2_0.py 项目： wantsomechocolate/PDF2EXCEL

def get_ref_text(raw_text_string,library_entry):
    """Cut one piece of data out of a raw character window.

    ``library_entry['collection_method']`` selects how the slice is found:

    * ``'bounds'`` - the slice runs from the end of the first
      ``left_bound_regex`` match to the start of the first
      ``right_bound_regex`` match. A regex given as the string ``"null"``
      means "start of the text" / "end of the text" respectively.
    * ``'data'`` - the slice is the first ``data_regex`` match itself.

    Unless ``character_list`` / ``character_trans`` are the string
    ``"none"``, the slice is then passed through
    ``useful.character_selection`` and ``useful.character_transform``.

    Args:
        raw_text_string: Raw text to extract from, or one of the markers
            ``"NO MATCHES"`` / ``"INSTANCE NOT FOUND"``.
        library_entry: Mapping providing ``'collection_method'``,
            ``'left_bound_regex'``, ``'right_bound_regex'``,
            ``'data_regex'``, ``'character_list'`` and ``'character_trans'``.

    Returns:
        The extracted text; ``"No Raw Text"`` when ``raw_text_string`` is
        one of the two markers; ``"Bad Raw Text"`` when the collection
        method is unknown or the computed bounds cannot be used to slice.
    """
    ## The idea is to take the raw chars and the instructions and get the data
    ## The raw text function gets one piece of info at a time it looks like.
    ## So this should do the same.
    collection_method=library_entry['collection_method']
    left_bound_regex=library_entry['left_bound_regex']
    right_bound_regex=library_entry['right_bound_regex']
    data_regex=library_entry['data_regex']
    character_list=library_entry['character_list']
    character_trans=library_entry['character_trans']

    if raw_text_string in ("NO MATCHES","INSTANCE NOT FOUND"):
        return "No Raw Text"

    if collection_method=='bounds':
        if left_bound_regex=="null":
            start_index=0
        else:
            index_and_match=useful.get_index_and_match(raw_text_string,left_bound_regex,1)
            start_index=index_and_match['index']
            try:
                ## Start just past the left bound
                start_index=start_index+len(index_and_match['match'])
            except TypeError:
                ## Keep the reported index untouched; if it is not usable
                ## as a slice bound the slice below reports "Bad Raw Text"
                pass
        if right_bound_regex=="null":
            ## This used to read len(raw_text), a name that does not exist
            ## in this function, so the "null" right bound always crashed
            end_index=len(raw_text_string)
        else:
            index_and_match=useful.get_index_and_match(raw_text_string,right_bound_regex,1)
            end_index=index_and_match['index']
    elif collection_method=='data':
        index_and_match=useful.get_index_and_match(raw_text_string,data_regex,1)
        start_index=index_and_match['index']
        try:
            end_index=len(index_and_match['match'])+start_index
        except TypeError:
            end_index=len(raw_text_string)
    else:
        ## Unknown method: no bounds can be computed. The result is the same
        ## as before, but no longer depends on a bare except swallowing an
        ## unbound-variable error.
        return "Bad Raw Text"

    try:
        ref_text=raw_text_string[start_index:end_index]
    except TypeError:
        ## Raised when a bound is not a valid slice index
        return "Bad Raw Text"

    ## Text that is literally one of the status strings is handed back as is
    if ref_text in ("Bad Raw Text","No Raw Text"):
        return ref_text

    if character_list != "none":
        ref_text=useful.character_selection(ref_text,character_list)
    if character_trans != "none":
        ref_text=useful.character_transform(ref_text,character_trans)

    return ref_text

示例#3

0

显示文件

文件： extraction2_0.py 项目： wantsomechocolate/PDF2EXCEL

def get_raw_chars(page_text,library_entry):
    """Grab the raw characters to the right or left of a flag in ``page_text``.

    ``useful.get_index_and_match(page_text, data_flag, flag_inst)`` locates
    the flag. Depending on ``library_entry['direction']``:

    * ``RIGHT`` - take up to ``raw_char_collect`` characters beginning
      right after the matched flag text, never reading past the end of
      ``page_text``.
    * ``LEFT`` - take up to ``raw_char_collect`` characters ending one
      character before the flag's index, never reading before the start
      of ``page_text``.

    The collected text is stripped, then each character found as a key in
    ``unicode_chars`` is replaced by the value stored for it.

    Args:
        page_text: Text to search and slice.
        library_entry: Mapping providing ``'data_flag'``,
            ``'data_flag_inst'``, ``'direction'`` and ``'raw_char_collect'``.

    Returns:
        A dict ``{'raw_text': ..., 'match': ...}`` where ``'match'`` is the
        ``'match'`` value returned by the flag lookup.

    Raises:
        ValueError: If the direction is neither ``RIGHT`` nor ``LEFT``.
    """
    data_flag=library_entry['data_flag']
    flag_inst=library_entry['data_flag_inst']
    direction=library_entry['direction']
    num_chars=library_entry['raw_char_collect']

    ## Reject an unsupported direction explicitly; it previously ended in a
    ## NameError because no branch had assigned raw_text.
    if direction!=RIGHT and direction!=LEFT:
        raise ValueError("direction must be RIGHT or LEFT, got %r" % (direction,))

    index_and_match=useful.get_index_and_match(page_text,data_flag,flag_inst)
    flag_index=index_and_match['index']

    try:
        if direction==RIGHT:
            ## Going right: start index is right after the last char of the flag
            start_index=flag_index+len(index_and_match['match'])
            end_index=min(start_index+num_chars,len(page_text))
        else:
            ## Going left: subtract to get the character range. Both ends
            ## are clamped at 0 so a flag at the very start of the page
            ## gives an empty window instead of a negative end index that
            ## would slice nearly the whole page.
            end_index=max(flag_index-1,0)
            start_index=max(end_index-num_chars,0)
        raw_text=page_text[start_index:end_index].strip()
    except TypeError:
        ## Arithmetic on the lookup result failed. Presumably 'index' holds
        ## a non-numeric error marker in that case - verify against
        ## useful.get_index_and_match. Use it as the raw text.
        raw_text=flag_index

    ## Replace every character that has an entry in unicode_chars
    raw_text="".join(unicode_chars.get(char,char) for char in raw_text)

    return {'raw_text':raw_text,'match':index_and_match['match']}

示例#4

0

显示文件

文件： extraction2_0.py 项目： mikpim01/PDF2EXCEL

def get_ref_text(raw_text_string, library_entry):
    """Extract the referenced data from a window of raw characters.

    The extraction strategy comes from ``library_entry['collection_method']``:

    * ``'bounds'`` - keep what lies between the first match of
      ``left_bound_regex`` (exclusive) and the first match of
      ``right_bound_regex`` (exclusive). The string ``"null"`` for either
      regex stands for the corresponding edge of ``raw_text_string``.
    * ``'data'`` - keep the first match of ``data_regex``.

    The kept text is filtered with ``useful.character_selection`` when
    ``character_list`` is not ``"none"`` and rewritten with
    ``useful.character_transform`` when ``character_trans`` is not
    ``"none"``.

    Args:
        raw_text_string: Raw text to extract from, or one of the markers
            ``"NO MATCHES"`` / ``"INSTANCE NOT FOUND"``.
        library_entry: Mapping providing ``'collection_method'``,
            ``'left_bound_regex'``, ``'right_bound_regex'``,
            ``'data_regex'``, ``'character_list'`` and ``'character_trans'``.

    Returns:
        The extracted text, ``"No Raw Text"`` if ``raw_text_string`` is one
        of the two markers, or ``"Bad Raw Text"`` if the collection method
        is not recognised or the bounds cannot be used as slice indices.
    """
    ## The idea is to take the raw chars and the instructions and get the data
    ## The raw text function gets one piece of info at a time it looks like.
    ## So this should do the same.
    collection_method = library_entry['collection_method']
    left_bound_regex = library_entry['left_bound_regex']
    right_bound_regex = library_entry['right_bound_regex']
    data_regex = library_entry['data_regex']
    character_list = library_entry['character_list']
    character_trans = library_entry['character_trans']

    if raw_text_string in ("NO MATCHES", "INSTANCE NOT FOUND"):
        return "No Raw Text"

    if collection_method == 'bounds':
        if left_bound_regex == "null":
            start_index = 0
        else:
            index_and_match = useful.get_index_and_match(
                raw_text_string, left_bound_regex, 1)
            start_index = index_and_match['index']
            try:
                ## Skip over the left bound itself
                start_index = start_index + len(index_and_match['match'])
            except TypeError:
                ## Leave the reported index as it is; an unusable value is
                ## turned into "Bad Raw Text" by the slice below
                pass
        if right_bound_regex == "null":
            ## Was len(raw_text): that name is not defined here, so a
            ## "null" right bound raised NameError instead of meaning
            ## "up to the end of the text"
            end_index = len(raw_text_string)
        else:
            index_and_match = useful.get_index_and_match(
                raw_text_string, right_bound_regex, 1)
            end_index = index_and_match['index']
    elif collection_method == 'data':
        index_and_match = useful.get_index_and_match(raw_text_string,
                                                     data_regex, 1)
        start_index = index_and_match['index']
        try:
            end_index = len(index_and_match['match']) + start_index
        except TypeError:
            end_index = len(raw_text_string)
    else:
        ## No bounds exist for an unrecognised method. The outcome is
        ## unchanged, but it is now stated rather than produced by a bare
        ## except hiding an unbound-variable error.
        return "Bad Raw Text"

    try:
        ref_text = raw_text_string[start_index:end_index]
    except TypeError:
        ## A bound was not a valid slice index
        return "Bad Raw Text"

    ## A slice that literally equals a status string is returned untouched
    if ref_text in ("Bad Raw Text", "No Raw Text"):
        return ref_text

    if character_list != "none":
        ref_text = useful.character_selection(ref_text, character_list)
    if character_trans != "none":
        ref_text = useful.character_transform(ref_text, character_trans)

    return ref_text