예제 #1
0
def get_raw_chars(page_text, library_entry, unicode_chars):
    """Collect the raw characters next to a data flag in ``page_text``.

    The flag is located with ``jc.get_index_and_match``.  Depending on
    ``library_entry['direction']`` a window of
    ``library_entry['raw_char_collect']`` characters is taken after the
    matched flag (``RIGHT``) or before it (``LEFT``).  The window is clamped
    to the bounds of ``page_text`` and stripped of surrounding whitespace.
    Finally, every character of the result that is a key of
    ``unicode_chars`` is replaced by the mapped value.

    Args:
        page_text: Text to search and slice.
        library_entry: Mapping providing ``'data_flag'``,
            ``'data_flag_inst'``, ``'direction'`` and ``'raw_char_collect'``.
        unicode_chars: Mapping of single characters to their replacements.

    Returns:
        A dict with two keys: ``'raw_text'`` (the collected, cleaned text)
        and ``'match'`` (the ``'match'`` value reported by the helper).

    Raises:
        ValueError: If the direction is neither ``RIGHT`` nor ``LEFT``.
            (Previously this surfaced as an unbound-local error.)
    """
    data_flag = library_entry['data_flag']
    flag_inst = library_entry['data_flag_inst']
    direction = library_entry['direction']
    num_chars = library_entry['raw_char_collect']

    if direction != RIGHT and direction != LEFT:
        raise ValueError("unsupported direction: %r" % (direction,))

    index_and_match = jc.get_index_and_match(page_text, data_flag, flag_inst)
    flag_index = index_and_match['index']

    try:
        if direction == RIGHT:
            # Window starts right after the matched flag text.
            start_index = flag_index + len(index_and_match['match'])
            end_index = min(start_index + num_chars, len(page_text))
        else:
            # Window ends one position before the flag, so the character
            # immediately preceding the flag is not collected (kept from the
            # original; presumably skips a separator -- TODO confirm).
            # Clamp at 0: a flag at index 0 would otherwise give -1, which as
            # a slice bound means "everything but the last character".
            end_index = max(flag_index - 1, 0)
            start_index = max(end_index - num_chars, 0)
        raw_text = page_text[start_index:end_index].strip()
    except Exception:
        # Best-effort fallback kept from the original: if the window cannot
        # be computed (e.g. the helper reported a non-numeric 'index' --
        # presumably a not-found marker, TODO confirm), pass that 'index'
        # value through as the raw text.  Unlike a bare ``except`` this no
        # longer swallows KeyboardInterrupt / SystemExit.
        raw_text = flag_index

    # Replace problem characters with their plainer equivalents; characters
    # without an entry in ``unicode_chars`` are kept unchanged.  The former
    # debug prints were dropped: one of them referenced an undefined name and
    # raised NameError on the first replaced character.
    raw_text = "".join([unicode_chars.get(char, char) for char in raw_text])

    return {'raw_text': raw_text, 'match': index_and_match['match']}
def get_raw_chars(page_text, library_entry, unicode_chars):
    """Collect the raw characters next to a data flag in ``page_text``.

    The flag is located with ``jc.get_index_and_match``.  Depending on
    ``library_entry['direction']`` a window of
    ``library_entry['raw_char_collect']`` characters is taken after the
    matched flag (``RIGHT``) or before it (``LEFT``).  The window is clamped
    to the bounds of ``page_text`` and stripped of surrounding whitespace.
    Finally, every character of the result that is a key of
    ``unicode_chars`` is replaced by the mapped value.

    Args:
        page_text: Text to search and slice.
        library_entry: Mapping providing ``'data_flag'``,
            ``'data_flag_inst'``, ``'direction'`` and ``'raw_char_collect'``.
        unicode_chars: Mapping of single characters to their replacements.

    Returns:
        A dict with two keys: ``'raw_text'`` (the collected, cleaned text)
        and ``'match'`` (the ``'match'`` value reported by the helper).

    Raises:
        ValueError: If the direction is neither ``RIGHT`` nor ``LEFT``.
            (Previously this surfaced as an unbound-local error.)
    """
    data_flag = library_entry['data_flag']
    flag_inst = library_entry['data_flag_inst']
    direction = library_entry['direction']
    num_chars = library_entry['raw_char_collect']

    if direction != RIGHT and direction != LEFT:
        raise ValueError("unsupported direction: %r" % (direction,))

    index_and_match = jc.get_index_and_match(page_text, data_flag, flag_inst)
    flag_index = index_and_match['index']

    try:
        if direction == RIGHT:
            # Window starts right after the matched flag text.
            start_index = flag_index + len(index_and_match['match'])
            end_index = min(start_index + num_chars, len(page_text))
        else:
            # Window ends one position before the flag, so the character
            # immediately preceding the flag is not collected (kept from the
            # original; presumably skips a separator -- TODO confirm).
            # Clamp at 0: a flag at index 0 would otherwise give -1, which as
            # a slice bound means "everything but the last character".
            end_index = max(flag_index - 1, 0)
            start_index = max(end_index - num_chars, 0)
        raw_text = page_text[start_index:end_index].strip()
    except Exception:
        # Best-effort fallback kept from the original: if the window cannot
        # be computed (e.g. the helper reported a non-numeric 'index' --
        # presumably a not-found marker, TODO confirm), pass that 'index'
        # value through as the raw text.  Unlike a bare ``except`` this no
        # longer swallows KeyboardInterrupt / SystemExit.
        raw_text = flag_index

    # Replace problem characters with their plainer equivalents; characters
    # without an entry in ``unicode_chars`` are kept unchanged.  The former
    # debug prints were dropped: one of them referenced an undefined name and
    # raised NameError on the first replaced character.
    raw_text = "".join([unicode_chars.get(char, char) for char in raw_text])

    return {'raw_text': raw_text, 'match': index_and_match['match']}
예제 #3
0
def get_ref_text(raw_text_string, library_entry):
    """Cut the refined text out of a raw character window.

    ``library_entry['collection_method']`` selects how the slice is found:

    * ``'bounds'`` -- the text between a left and a right bound.  Each bound
      is located with ``jc.get_index_and_match`` unless its regex is the
      literal string ``"null"``, in which case the start / end of
      ``raw_text_string`` is used.
    * ``'data'`` -- the first match of ``library_entry['data_regex']``.

    The slice is then passed through ``jc.character_selection`` and
    ``jc.character_transform`` unless the corresponding library value is
    the literal string ``"none"``.

    Args:
        raw_text_string: Raw text to refine, or one of the markers
            ``"NO MATCHES"`` / ``"INSTANCE NOT FOUND"``.
        library_entry: Mapping providing ``'collection_method'``,
            ``'left_bound_regex'``, ``'right_bound_regex'``,
            ``'data_regex'``, ``'character_list'`` and ``'character_trans'``.

    Returns:
        The refined text; ``"No Raw Text"`` when ``raw_text_string`` is one
        of the two markers; ``"Bad Raw Text"`` when the collection method is
        unknown or the computed bounds cannot be used to slice.
    """
    # All keys are read up front so a missing key fails immediately.
    collection_method = library_entry['collection_method']
    left_bound_regex = library_entry['left_bound_regex']
    right_bound_regex = library_entry['right_bound_regex']
    data_regex = library_entry['data_regex']
    character_list = library_entry['character_list']
    character_trans = library_entry['character_trans']

    if raw_text_string in ("NO MATCHES", "INSTANCE NOT FOUND"):
        return "No Raw Text"

    if collection_method == 'bounds':
        if left_bound_regex == "null":
            start_index = 0
        else:
            index_and_match = jc.get_index_and_match(raw_text_string,
                                                     left_bound_regex, 1)
            try:
                # Start just after the text matched by the left bound.
                start_index = (index_and_match['index']
                               + len(index_and_match['match']))
            except Exception:
                # The left bound may not have been found; keep whatever the
                # helper reported and let the slice below decide.
                start_index = index_and_match['index']

        if right_bound_regex == "null":
            end_index = len(raw_text_string)
        else:
            # No fallback needed here: if this value is not a usable slice
            # bound, the slice below fails and yields "Bad Raw Text".
            index_and_match = jc.get_index_and_match(raw_text_string,
                                                     right_bound_regex, 1)
            end_index = index_and_match['index']
    elif collection_method == 'data':
        index_and_match = jc.get_index_and_match(raw_text_string, data_regex,
                                                 1)
        start_index = index_and_match['index']
        try:
            end_index = len(index_and_match['match']) + start_index
        except Exception:
            end_index = len(raw_text_string)
    else:
        # Previously this case only worked because a bare ``except`` caught
        # the unbound-local error on ``start_index``; make it explicit.
        return "Bad Raw Text"

    try:
        ref_text = raw_text_string[start_index:end_index]
    except Exception:
        # Broad on purpose (best effort, as before), but unlike a bare
        # ``except`` it no longer swallows KeyboardInterrupt / SystemExit.
        return "Bad Raw Text"

    # Text that is literally one of the marker strings is returned as-is,
    # without character selection / transformation (original behaviour).
    if ref_text in ("Bad Raw Text", "No Raw Text"):
        return ref_text

    if character_list != "none":
        ref_text = jc.character_selection(ref_text, character_list)
    if character_trans != "none":
        ref_text = jc.character_transform(ref_text, character_trans)

    return ref_text
def get_ref_text(raw_text_string, library_entry):
    """Cut the refined text out of a raw character window.

    ``library_entry['collection_method']`` selects how the slice is found:

    * ``'bounds'`` -- the text between a left and a right bound.  Each bound
      is located with ``jc.get_index_and_match`` unless its regex is the
      literal string ``"null"``, in which case the start / end of
      ``raw_text_string`` is used.
    * ``'data'`` -- the first match of ``library_entry['data_regex']``.

    The slice is then passed through ``jc.character_selection`` and
    ``jc.character_transform`` unless the corresponding library value is
    the literal string ``"none"``.

    Args:
        raw_text_string: Raw text to refine, or one of the markers
            ``"NO MATCHES"`` / ``"INSTANCE NOT FOUND"``.
        library_entry: Mapping providing ``'collection_method'``,
            ``'left_bound_regex'``, ``'right_bound_regex'``,
            ``'data_regex'``, ``'character_list'`` and ``'character_trans'``.

    Returns:
        The refined text; ``"No Raw Text"`` when ``raw_text_string`` is one
        of the two markers; ``"Bad Raw Text"`` when the collection method is
        unknown or the computed bounds cannot be used to slice.
    """
    # All keys are read up front so a missing key fails immediately.
    collection_method = library_entry['collection_method']
    left_bound_regex = library_entry['left_bound_regex']
    right_bound_regex = library_entry['right_bound_regex']
    data_regex = library_entry['data_regex']
    character_list = library_entry['character_list']
    character_trans = library_entry['character_trans']

    if raw_text_string in ("NO MATCHES", "INSTANCE NOT FOUND"):
        return "No Raw Text"

    if collection_method == 'bounds':
        if left_bound_regex == "null":
            start_index = 0
        else:
            index_and_match = jc.get_index_and_match(raw_text_string,
                                                     left_bound_regex, 1)
            try:
                # Start just after the text matched by the left bound.
                start_index = (index_and_match['index']
                               + len(index_and_match['match']))
            except Exception:
                # The left bound may not have been found; keep whatever the
                # helper reported and let the slice below decide.
                start_index = index_and_match['index']

        if right_bound_regex == "null":
            end_index = len(raw_text_string)
        else:
            # No fallback needed here: if this value is not a usable slice
            # bound, the slice below fails and yields "Bad Raw Text".
            index_and_match = jc.get_index_and_match(raw_text_string,
                                                     right_bound_regex, 1)
            end_index = index_and_match['index']
    elif collection_method == 'data':
        index_and_match = jc.get_index_and_match(raw_text_string, data_regex,
                                                 1)
        start_index = index_and_match['index']
        try:
            end_index = len(index_and_match['match']) + start_index
        except Exception:
            end_index = len(raw_text_string)
    else:
        # Previously this case only worked because a bare ``except`` caught
        # the unbound-local error on ``start_index``; make it explicit.
        return "Bad Raw Text"

    try:
        ref_text = raw_text_string[start_index:end_index]
    except Exception:
        # Broad on purpose (best effort, as before), but unlike a bare
        # ``except`` it no longer swallows KeyboardInterrupt / SystemExit.
        return "Bad Raw Text"

    # Text that is literally one of the marker strings is returned as-is,
    # without character selection / transformation (original behaviour).
    if ref_text in ("Bad Raw Text", "No Raw Text"):
        return ref_text

    if character_list != "none":
        ref_text = jc.character_selection(ref_text, character_list)
    if character_trans != "none":
        ref_text = jc.character_transform(ref_text, character_trans)

    return ref_text