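# NOTE: the classes below are Python 2 code and rely on names defined elsewhere in
# their repository: PDF2CSV, KeywordsExtractor, MIN_COL_COUNT, MIN_TITLE_CHARS,
# TEMP_PDF_FILE, FORMAT_DICT, HEADER_STUBS, SECOND_HEADER_FIELD and SKIP_FILENAMES.
# The following is a minimal sketch of the imports they would need; the
# project-local module paths in the commented lines are assumptions, not the
# repository's actual layout.
import csv
import glob
import logging
import os
import re

from pyPdf import PdfFileReader, PdfFileWriter  # assumed; PyPDF2's legacy API exposes the same names

# Hypothetical local imports -- adjust to the real module layout:
# from pdf2csv import PDF2CSV, MIN_COL_COUNT, MIN_TITLE_CHARS, TEMP_PDF_FILE
# from keywords_extractor import KeywordsExtractor
# from settings import FORMAT_DICT, HEADER_STUBS, SECOND_HEADER_FIELD, SKIP_FILENAMES

logger = logging.getLogger(__name__)
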
class CombinedBudgetCSVGenerator(PDF2CSV):
    def __init__(self):
        super(CombinedBudgetCSVGenerator, self).__init__()
        self.keywords_extractor = KeywordsExtractor()
        self.input_file = None
        self.output_dir = None
        self.currency_handle = '(` crore)'
        self.separator = " || "
        self.continued_symbol = "(CONTD"

    def generate_combined_budget_csv(self, input_file, output_dir):
        self.input_file = input_file
        self.output_dir = output_dir
        self.generate_csv_file(input_file,
                               input_file.split(".pdf")[0] + ".csv",
                               is_header=False,
                               check_page_rotation=True)

    def modify_table_data(self, table):
        pagewise_table = self.split_pages(table)
        pagewise_table = self.clean_pagewise_table(pagewise_table)
        pagewise_table, pagewise_keywords = self.create_page_to_file_map(
            pagewise_table)
        self.generate_child_csv_files(pagewise_table, pagewise_keywords)
        return None

    def split_pages(self, table):
        pagewise_table = {}
        temp_list = []
        page_num = 1
        for row in table:
            if row[0] == self.page_break.replace('"', ''):
                if temp_list and len(temp_list[0]) > MIN_COL_COUNT:
                    pagewise_table[page_num] = temp_list
                    temp_list = []
                page_num += 1
            elif len(row) > MIN_COL_COUNT:
                temp_list.append(row)
        if temp_list and len(temp_list[0]) > MIN_COL_COUNT:
            pagewise_table[page_num] = temp_list
        return pagewise_table

    def clean_pagewise_table(self, pagewise_table):
        for page_num in pagewise_table:
            page_table = pagewise_table[page_num]
            for row_index in range(len(page_table)):
                for col_index in range(len(page_table[row_index])):
                    page_table[row_index][col_index] = page_table[row_index][
                        col_index].strip()
            if "crore" in "".join(page_table[0]):
                page_table.pop(0)
            while page_table[0][0].strip() == page_table[0][1].strip() == "":
                for row in page_table:
                    row[0] = (row[0] + " " + row[1]).strip()
                    row.pop(1)
            while True:
                empty_row_indices = []
                for row_index in range(len(page_table)):
                    if row_index in empty_row_indices:
                        continue
                    if page_table[row_index][0] == page_table[row_index][
                            1] == "":
                        for col_index in range(len(page_table[row_index])):
                            page_table[row_index - 1][col_index] = page_table[
                                row_index - 1][col_index] + " " + page_table[
                                    row_index][col_index]
                        empty_row_indices.append(row_index)
                    elif not "".join(page_table[row_index][1:]).strip():
                        if re.match(r"[0-9A-Z]\.|[a-z]+\)|[0-9]",
                                    page_table[row_index]
                                    [0]) and row_index < len(page_table) - 1:
                            for col_index in range(len(page_table[row_index])):
                                page_table[row_index][col_index] = page_table[
                                    row_index][col_index] + " " + page_table[
                                        row_index + 1][col_index]
                            empty_row_indices.append(row_index + 1)
                        else:
                            page_table[row_index - 1][0] = page_table[
                                row_index -
                                1][0] + " " + page_table[row_index][0]
                            empty_row_indices.append(row_index)
                num = 0
                if not empty_row_indices:
                    break
                for row_count in empty_row_indices:
                    page_table.pop(row_count - num)
                    num += 1
            pagewise_table[page_num] = page_table
        return pagewise_table

    def get_rotated_pdf_keywords(self, input_pdf_filepath, page_num):
        input_pdf_obj = PdfFileReader(open(input_pdf_filepath, 'rb'))
        temp_pdf_obj = PdfFileWriter()
        temp_pdf_obj.addPage(
            input_pdf_obj.getPage(page_num).rotateClockwise(90))
        output_stream = file(TEMP_PDF_FILE, "wb")
        temp_pdf_obj.write(output_stream)
        output_stream.close()
        return self.keywords_extractor.get_bold_text_phrases(
            TEMP_PDF_FILE,
            is_other_starting_phrases=True,
            single_word=True,
            page_num=1,
            lower_case=False)

    def create_page_to_file_map(self, pagewise_table):
        pagewise_keywords = {}
        for page_num in pagewise_table:
            keyword_list = self.keywords_extractor.get_bold_text_phrases(
                self.input_file,
                is_other_starting_phrases=True,
                single_word=True,
                page_num=page_num,
                lower_case=False)
            if not keyword_list or len(keyword_list[0]) < MIN_TITLE_CHARS:
                keyword_list = self.get_rotated_pdf_keywords(
                    self.input_file, page_num - 1)
            pagewise_keywords[page_num] = keyword_list
        for page_num in pagewise_keywords:
            currency_handle_found = False
            for keyword_index in range(len(pagewise_keywords[page_num])):
                if pagewise_keywords[page_num][
                        keyword_index] == self.currency_handle:
                    currency_handle_found = True
                    pagewise_keywords[page_num] = self.separator.join(
                        pagewise_keywords[page_num][0:keyword_index])
                    break
            if not currency_handle_found:
                pagewise_keywords[page_num] = pagewise_keywords[page_num][0]
            if self.continued_symbol in pagewise_keywords[page_num]:
                if not page_num - 1 in pagewise_table:
                    continue
                if not self.separator in pagewise_keywords[page_num]:
                    pagewise_keywords[page_num] = pagewise_keywords[page_num -
                                                                    1]
                    pagewise_table[page_num].pop(0)
                    pagewise_table[page_num] = pagewise_table[
                        page_num - 1] + pagewise_table[page_num]
                    pagewise_table.pop(page_num - 1)
                else:
                    pagewise_keywords[page_num] = pagewise_keywords[
                        page_num - 1].split(
                            self.separator
                        )[0] + self.separator + pagewise_keywords[
                            page_num].split(self.separator)[-1]
        return pagewise_table, pagewise_keywords

    def generate_child_csv_files(self, pagewise_table, pagewise_keywords):
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)
        for page_num in pagewise_table:
            file_name = re.sub(r'^TABLE ', '',
                               pagewise_keywords[page_num]).strip()
            file_name = file_name.replace("/", '|')
            file_name = file_name.replace(self.separator, '-')
            out_csv_file = open(self.output_dir + "/" + file_name + ".csv",
                                "wb")
            csv_writer = csv.writer(out_csv_file, delimiter=',')
            for row in pagewise_table[page_num]:
                csv_writer.writerow(row)
            out_csv_file.close()
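
# A minimal usage sketch for CombinedBudgetCSVGenerator, assuming a combined
# budget PDF on disk; the file and directory names below are hypothetical.
# generate_combined_budget_csv() writes <input>.csv next to the PDF and one
# child CSV per extracted table into the output directory.
combined_generator = CombinedBudgetCSVGenerator()
combined_generator.generate_combined_budget_csv("combined_budget.pdf",
                                                "combined_budget_csvs")
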
class KarnatakaBudgetCSVGenerator(PDF2CSV):
    def __init__(self):
        super(KarnatakaBudgetCSVGenerator, self).__init__()
        self.keywords_extractor = KeywordsExtractor()
        self.min_col_count = 8
        self.max_col_count = 10
        self.currency_slug = "(Rs. in Lakhs)"
        self.empty_char_regex =  r'(\xe2|\xc3|\x82|\xa2|\x80)' 
        self.parent_scheme_regex = r"([A-Z]+\.|\([a-z]+\)|\d{4,}|^[MDCLXVI]+ |^Total)"  
        self.voted_charged_column = True

    def generate_karnataka_budget_csv(self, input_file, output_dir):
        '''Main entry point: stores the input file and output directory and runs the PDF-to-CSV conversion.
        '''
        self.input_file = input_file
        self.output_dir = output_dir
        self.generate_csv_file(input_file, input_file.split(".pdf")[0] + ".csv", is_header=False, identify_columns=True)

    def modify_table_data(self, table):
        '''Modifying output of PDF to CSV to clean, wrangle and generate multiple CSV files
        '''
        pagewise_table = self.split_pages(table)
        pagewise_table = self.extract_head_codes(pagewise_table)
        pagewise_table = self.clean_pagewise_table(pagewise_table)
        for page_num in pagewise_table:
            unwanted_row_indices = []
            page_table = pagewise_table[page_num]
            header_found = False
            for row_index in range(len(page_table)):
                self.correct_column_count(row_index, page_table)
                unwanted_header_row_indices = self.clean_header_values(row_index, page_table)
                if unwanted_header_row_indices:
                    unwanted_row_indices += unwanted_header_row_indices
                    header_found = True
                elif header_found and self.voted_charged_column:
                    page_table[row_index].insert(2, "")
                self.correct_combined_values(row_index, page_table)
            unwanted_row_indices += self.merge_splitted_rows(page_table)
            self.delete_unwanted_rows(unwanted_row_indices, page_table)
        pagewise_headers = self.generate_page_headers_map(pagewise_table)
        pagewise_table = self.extract_budget_codes(pagewise_table)
        self.generate_pagewise_csv_files(pagewise_table, pagewise_headers)
   
    def split_pages(self, table):
        '''Splitting main table into pagewise tables
        '''
        pagewise_table = {}
        temp_list = []
        page_num = 1
        for row in table:
            if row[0] == self.page_break.replace('"',''):
                if temp_list and len(temp_list[0]) > self.min_col_count:
                    pagewise_table[page_num] = temp_list
                    temp_list = []
                page_num += 1
            elif len(row) > self.min_col_count:
                temp_list.append(row)
        if temp_list and len(temp_list[0]) > self.min_col_count:
            pagewise_table[page_num] = temp_list
        return pagewise_table 

    def extract_head_codes(self, pagewise_table):
        '''Extracting Head codes from scheme descriptions, inheriting classes can customize it
        '''
        return pagewise_table
    
    def extract_budget_codes(self, pagewise_table):
        '''Extracting Budget codes from scheme descriptions, inheriting classes can customize it
        '''
        return pagewise_table

    def clean_pagewise_table(self, pagewise_table):
        '''Cleansing pagewise tables to remove Kannada chars (Windows-1252 encoded)
        '''
        for page_num in pagewise_table:
            page_table = pagewise_table[page_num]
            unwanted_row_indices = {}
            for row_index in range(len(page_table)):
                for col_index in range(len(page_table[row_index])):
                    val = page_table[row_index][col_index]
                    val = re.sub(r'(\xe2|\x80|vjU)', '', val).replace('\x90', '-') 
                    if '\\x' in val.encode('string-escape'):
                        if " " in val:
                            val = re.sub(r"\s{2,}", " ", val)
                            val_list = val.split(" ") 
                            clear_index = 0
                            for val_index in range(len(val_list)):
                                if not '\\x' in val_list[val_index].encode('string-escape') and re.findall(r"[a-zA-Z0-9\.\(\)\&\-\+]{1,}", val_list[val_index]):
                                    if clear_index == 0:
                                        clear_index = val_index
                                else:
                                    clear_index = 0
                            if clear_index > 0:
                                val =  " ".join(val.split(" ")[clear_index:])
                            else:
                                val = ""
                        else:
                            val = ""
                    page_table[row_index][col_index] = val.strip() 
                if not "".join(page_table[row_index]).strip():
                    unwanted_row_indices[row_index] = True
            self.delete_unwanted_rows(unwanted_row_indices.keys(), page_table)
        return pagewise_table

    def correct_column_count(self,row_index, page_table):
        '''Inserting extra columns wherever required
        '''
        while len(page_table[row_index]) < self.max_col_count:
            page_table[row_index].insert(0, "")
    
    def correct_combined_values(self, row_index, page_table):
        '''Correcting Grand Total and Voted/Charged values which got merged in original doc
        '''
        if page_table[row_index][1] == "GRAND TOTAL (PLAN + NON-PLAN)":
            col_index = 2
            while col_index < len(page_table[row_index]):
                if not "." in page_table[row_index][col_index]:
                    page_table[row_index][col_index+1] = page_table[row_index][col_index] + page_table[row_index][col_index+1]
                    page_table[row_index][col_index] = "P+NP ="
                col_index += 2
        voted_charged_match = re.findall(r"(\s){,1}(Voted|Charged)$", page_table[row_index][1]) 
        if voted_charged_match:
            voted_charged_match = "".join(map(list, voted_charged_match)[0])
            page_table[row_index][1] = page_table[row_index][1].split(voted_charged_match)[0]
            page_table[row_index][2] = voted_charged_match.strip()

    def clean_header_values(self, row_index, page_table):
        '''Cleaning and generating correct header values and unwanted row indices
        '''
        unwanted_row_indices = []
        if page_table[row_index][2] == "Plan":
            page_table[row_index][0] = "Budget Code"
            header_1_val = ""
            for index in range(row_index+1):
                header_1_val += " " + page_table[index][1]
                if index != row_index:
                    unwanted_row_indices.append(index)
            page_table[row_index][1] = header_1_val.strip()
            year_index = 2
            for col_index in range(2, len(page_table[row_index])):
                if col_index%2 == 0 and col_index != year_index:
                    year_index += 2
                if not " " in page_table[0][year_index+1]:
                    page_table[0][year_index+1] = " " + page_table[0][year_index+1] 
                page_table[row_index][col_index] = page_table[0][year_index] + page_table[0][year_index+1] + " " + page_table[row_index][col_index] + " " + self.currency_slug
            page_table[row_index].insert(2, 'Voted/Charged')
        elif page_table[row_index][2] == "2":
            unwanted_row_indices.append(row_index)
        return unwanted_row_indices

    def merge_splitted_rows(self, page_table):
        '''Merging splitted rows into one
        '''
        unwanted_row_indices = {}
        for row_index in range(5, len(page_table)):
            if re.match(self.parent_scheme_regex, page_table[row_index][1]) or page_table[row_index][0]:
                continue
            elif not "".join(page_table[row_index][2:]):
                parent_row_index = row_index
                while not (re.match(self.parent_scheme_regex, page_table[parent_row_index][1]) or page_table[parent_row_index][0]):
                    parent_row_index -= 1 
                    if parent_row_index in unwanted_row_indices:
                        continue
                    if page_table[row_index][1].strip(): 
                        page_table[parent_row_index][1] += ' ' + page_table[row_index][1].strip()
                    unwanted_row_indices[row_index] = True
        return unwanted_row_indices.keys()
    
    def delete_unwanted_rows(self, unwanted_row_indices, page_table):
        '''Deleting unwanted row indices from page tables
        '''
        unwanted_row_indices.sort()
        num = 0
        for row_index in unwanted_row_indices:
            page_table.pop(row_index-num)
            num += 1

    def generate_page_headers_map(self, pagewise_table): 
        '''Generating pagewise headers for tables
        '''
        pagewise_keywords = {}
        page_headers_map = {}
        for page_num in pagewise_table:
            keyword_list = self.keywords_extractor.get_bold_text_phrases(self.input_file, keyword_xpath="//text()", is_other_starting_phrases=True, single_word=True, page_num=page_num, lower_case=False)
            page_header = []
            for keyword in keyword_list:
                keyword = re.sub(self.empty_char_regex, '', keyword).replace('\x90', '-')
                if not '\\x' in keyword.encode('string-escape') and not "<!--" in keyword:
                    if " ".join(self.currency_slug.split(" ")[1:]) in keyword or "in Lakhs" in keyword:
                        break
                    keyword = keyword.decode('unicode_escape').encode('ascii','ignore').strip()
                    keyword = re.sub(r"\s{2,}", " ", keyword)
                    page_header.append(keyword.strip())
            page_headers_map[page_num] = "|".join(page_header)
        return page_headers_map

    def write_page_table(self, file_name, file_table):
        '''Creating new file and writing file table in it
        '''
        file_name = file_name.replace("/", "|")
        file_name = file_name.split("|")[-1].strip()
        out_csv_file = open(self.output_dir + "/" + file_name + ".csv", "wb")
        csv_writer = csv.writer(out_csv_file, delimiter=',')
        for row in file_table:
            csv_writer.writerow(row)
        out_csv_file.close() 
    
    def generate_pagewise_csv_files(self, pagewise_table, pagewise_headers):
        '''Generating pagewise CSV files 
        '''
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)
        file_name = ""
        file_table = []
        for page_num in pagewise_table:
            if file_name and file_name != pagewise_headers[page_num]:
                self.write_page_table(file_name, file_table)
                file_table = pagewise_table[page_num]
                file_name = pagewise_headers[page_num]
            else:
                if not file_name:
                    file_table += pagewise_table[page_num]
                elif len(pagewise_table[page_num]) <= 1:
                    continue
                else:
                    if re.match(self.parent_scheme_regex, pagewise_table[page_num][1][1]) or pagewise_table[page_num][1][0]:
                        file_table += pagewise_table[page_num][1:]
                    elif not "".join(pagewise_table[page_num][1][2:]):
                        file_table[-1][1] += " " + pagewise_table[page_num][1][1]
                        file_table += pagewise_table[page_num][2:]
                file_name = pagewise_headers[page_num]
        if file_table:
            self.write_page_table(file_name, file_table)
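
# A minimal usage sketch for KarnatakaBudgetCSVGenerator; the input PDF and the
# output directory are hypothetical. One CSV is written per distinct page header
# detected in the document.
karnataka_generator = KarnatakaBudgetCSVGenerator()
karnataka_generator.generate_karnataka_budget_csv("karnataka_budget_volume.pdf",
                                                  "karnataka_budget_csvs")
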
class ExpenditureBudgetCSVGenerator(PDF2CSV):
    def __init__(self):
        super(ExpenditureBudgetCSVGenerator, self).__init__()
        self.header_rows_indices = [0,1,2]
        self.header_rows_cap = 5
        self.header_format = None
        self.header_padding_required = False
        self.keywords_extractor = KeywordsExtractor()        
        self.bold_keywords = []

    def check_missing_vertical(self, lines):
        found_missing_vertical = False
        min_vertical = None
        for line in lines:
            for x1,y1,x2,y2 in line:
                if x1 == x2:
                    if not min_vertical or x1 < min_vertical:
                        min_vertical = x1
        if min_vertical:
            for line in lines:
                for x1,y1,x2,y2 in line:
                    if y1 == y2 and min_vertical/2 < x1 < min_vertical:
                        found_missing_vertical = x1
        return found_missing_vertical
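
    # check_missing_vertical() unpacks `lines` as an iterable whose items each
    # yield (x1, y1, x2, y2) segments, which matches the shape returned by
    # cv2.HoughLinesP -- that correspondence is an assumption, not something the
    # class documents. A hedged sketch of how it might be fed:
    #
    #   import cv2
    #   import numpy as np
    #   page_image = cv2.imread("page.png", cv2.IMREAD_GRAYSCALE)  # hypothetical page render
    #   edges = cv2.Canny(page_image, 50, 150)
    #   lines = cv2.HoughLinesP(edges, 1, np.pi / 180, 100,
    #                           minLineLength=100, maxLineGap=5)
    #   missing_x = ExpenditureBudgetCSVGenerator().check_missing_vertical(lines)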
    
    def modify_table_data(self, table):
        table = self.correct_upper_header_rows(table)    
        table = self.merge_splitted_coloumns(table)
        if not table:
            return table
        table = self.split_merged_coloumns(table)
        table = self.fix_second_header(table)
        table = self.remove_dulicate_headers(table)
        empty_row_indices = []
        for row_num in range(len(table)-1):
            if "".join(table[row_num]).strip() == "Major" and table[row_num+1][1].strip() == "Head":
                table[row_num+1][1] = "Major " + table[row_num+1][1] 
                table[row_num][1] = ""
                empty_row_indices.append(row_num)
        for row_count in empty_row_indices:
            table.pop(row_count)
        table = self.merge_splitted_rows(table)
        table = self.ensure_table_intergrity(table)
        table = self.add_index_coloumn(table)
        table = self.add_new_headers(table)
        return table

    def correct_upper_header_rows(self, table):
        for num in range(0,5):
            if table[num][0].strip() != "":
                table[num] = [""] + table[num]
                self.header_padding_required = True
        return table

    def merge_splitted_coloumns(self, table):
        pagewise_table_list = []
        temp_list = []
        for row in table:
            if row[0] == self.page_break.replace('"',''):
                if temp_list and len(temp_list[0]) > MIN_COL_COUNT:
                    pagewise_table_list.append(temp_list)
                    temp_list = []
            elif len(row) > MIN_COL_COUNT:
                row[0] = row[0].strip()
                row[1] = row[1].strip()
                if re.search(r'^[\d\.]+$', row[0]) and row[1]:
                    row[0] += " " + row[1] 
                    row[1] = ""  
                temp_list.append(row)
        if temp_list and len(temp_list[0]) > MIN_COL_COUNT:
            pagewise_table_list.append(temp_list)
        table = [] 
        for page_table in pagewise_table_list:
            #page_table = self.delete_empty_coloumns(page_table)
            for row_index in range(len(page_table)):
                while page_table[row_index][0].strip() == page_table[row_index][1].strip() == "":
                    page_table[row_index][0] += " " + page_table[row_index][1]
                    page_table[row_index].pop(1)
            header_row = page_table[1]
            merge_upper_bound = 0
            for col_index in range(1, len(header_row)):
                header_stub = header_row[col_index].strip().lower() 
                if re.search(r"^(major|head of dev*)$", header_stub):
                    merge_upper_bound = col_index
                    break
            for row in page_table:
                if merge_upper_bound:
                    num = 0
                    row[0] = row[0].strip()
                    for col_index in range(1, merge_upper_bound):
                        row[0] += " " + row[col_index-num]
                        row.pop(col_index-num)
                        num += 1
                table.append(row)
        return table
    
    def split_merged_coloumns(self, table):
        for row in table:
            for col_index in range(2,len(row)):
                if re.search(r'^(\.{3}\s(\.{3}|[0-9]|\-[0-9]))', row[col_index].strip()) and not row[col_index-1].strip():
                    row[col_index-1] = "..."
                    row[col_index] = re.sub(r'^\.{3}\s', '', row[col_index]).strip()
        new_col_indices = []
        for row_index in range(len(table)):
            row = table[row_index]
            new_col_values_map = {}
            for col_index in range(0, len(row)):
                row[col_index] = row[col_index].strip()
                multi_col_match = re.search(r'^((\.{3}|(\-)*[0-9]+(\.[0-9]+){,1}|Plan|Non-Plan|Total)(\s)*)+$', row[col_index].strip())
                if multi_col_match:
                    multi_col_match_str = multi_col_match.group(0)
                    if " " not in multi_col_match_str.strip():
                        continue
                    col_values = multi_col_match_str.split(" ")
                    row[col_index] = col_values[0]
                    index_correction = len(new_col_values_map)
                    for col_count in range(1,len(col_values)):
                        if col_index+col_count < len(row) and not row[col_index+col_count].strip():
                            row[col_index+col_count] = col_values[col_count]
                        else:
                            insert_pointer = col_index+col_count+index_correction 
                            new_col_values_map[insert_pointer] = col_values[col_count] 
                            if not insert_pointer in new_col_indices and row_index <= self.header_rows_cap:
                                new_col_indices.append(insert_pointer)
            for index in sorted(new_col_values_map):
                row.insert(index, new_col_values_map[index])
        table = self.correct_major_head_values(table)
        col_shifted = False
        for row_index in range(len(table)):
            table[row_index][0] = table[row_index][0].strip()  
            table[row_index][1] = table[row_index][1].strip()  
            if (re.match(r'head of', table[row_index][0].lower()) and re.match(r'budget', table[row_index][1].strip().lower())):
                continue
            if (re.match(r'head of', table[row_index][1].lower()) and re.match(r'budget', table[row_index][2].strip().lower())):
                continue
            if len(table[row_index]) > len(FORMAT_DICT[self.header_format]["SECOND_HEADER_ROW"]) or not " " in " ".join(table[row_index]).strip():
                while not table[row_index][0] and "".join(table[row_index][1:]).strip():
                    table[row_index].pop(0)
                    col_shifted = True
        if col_shifted:
            table = self.correct_major_head_values(table)
        for col_index in new_col_indices:
            table[0].insert(col_index, " ")
        return table
       
    def correct_major_head_values(self, table):
        for row in table:
            if row[0].strip() == "Grand Total":
                if row[1].strip():
                    row.insert(1, "")
                continue
            major_code_match = re.findall(r'\s[0-9]{4,}$|^[0-9]{4,}$|^Head of Dev|Head of$| Dev$|\s{,1}Total$|^Net$', row[0].strip())
            if major_code_match:
                major_code = major_code_match[-1]
                scheme_name = row[0].split(major_code)[0]
                if len(row) < len(FORMAT_DICT[self.header_format]["SECOND_HEADER_ROW"]):
                    row[0] = scheme_name.strip()
                    if not row[1].strip():
                        row[1] = major_code
                    else:
                        row.insert(1, major_code)
                elif not row[1].strip():
                    row[0] = scheme_name.strip()
                    row[1] = major_code
        for row_index in range(2, len(table)):
            major_head_cell_val = table[row_index][1].strip() 
            if not major_head_cell_val or (re.match(r'head of', major_head_cell_val.lower()) and re.match(r'budget', table[row_index][2].strip().lower())):
                continue
            if re.search(r'(^[0-9]{1,}\.[0-9]{1,})|\.{3}', major_head_cell_val):
                table[row_index].insert(1, '')
            elif re.search(r'\D{2,}(\s\D{2,})+', major_head_cell_val):
                major_code_match = re.search(r'\d{4,}', major_head_cell_val)
                if major_code_match:
                    major_code = major_code_match.group(0)
                    table[row_index][0] = table[row_index][0].strip() + " " + major_head_cell_val.split(major_code)[0].strip()
                    table[row_index][1] = major_code
                else:
                    table[row_index][0] = major_head_cell_val
                    table[row_index][1] = ""
        return table
    
    def remove_dulicate_headers(self, table):
        empty_row_indices = []
        header_rows = []
        seconary_header_stub = "".join(FORMAT_DICT[self.header_format]["SECOND_HEADER_ROW"]).replace(" ", "")
        for row_num in self.header_rows_indices:
            header_rows.append("".join(table[row_num]).replace(" ", ""))
        for row_num in range(self.header_rows_indices[-1]+1, len(table)):
            row_stub = "".join(table[row_num]).replace(" ", "")
            if row_stub in header_rows or re.search(r"^(%s)" % "|".join(HEADER_STUBS), row_stub):
                empty_row_indices.append(row_num)
            if row_stub == seconary_header_stub and not seconary_header_stub in header_rows:
                header_rows.append(seconary_header_stub)
            elif "IEBR" in row_stub:
                empty_row_indices.append(row_num)
        num = 0
        for row_count in empty_row_indices:
            table.pop(row_count-num)
            num += 1
        return table

    def merge_splitted_rows(self, table):
        empty_row_indices = []
        previous_data_slug = ""
        for row_index in range(1,len(table)):
            table[row_index][0] = table[row_index][0].strip()
            if not table[row_index][0] or re.search(r'^(Total|[A-C]\.)|\:$', table[row_index][0]) or (re.sub(r'\s{2,}', ' ', table[row_index][0].lower()) in self.bold_keywords): 
                continue
            elif re.search(r'^[0-9]', table[row_index][0]) and not "".join(table[row_index][1:]).strip():
                index_col_val = re.search(r'^(\d{1,2}(\.)*)+(\s){0,1}', table[row_index][0]).group(0).strip()
                scheme_col_val = re.sub(r'\s{2,}', ' ', table[row_index][0].lower().replace(index_col_val, "").strip())
                if scheme_col_val in self.bold_keywords:
                    continue
                previous_data_slug += table[row_index][0] + " "
                empty_row_indices.append(row_index)
                continue 
            elif table[row_index][0].strip() and not "".join(table[row_index][1:]).strip() and table[row_index-1][0].strip():
                parent_row_index = row_index
                while parent_row_index > 1:
                    parent_row_index -= 1
                    if "".join(table[parent_row_index][1:]).strip():
                        break
                table[parent_row_index][0] = table[parent_row_index][0].strip() + " " + table[row_index][0].strip()
                empty_row_indices.append(row_index)
            elif previous_data_slug:
                table[row_index][0] =  previous_data_slug + table[row_index][0]
                previous_data_slug = ""
        num = 0
        for row_count in empty_row_indices:
            table.pop(row_count-num)
            num += 1
        return table
    
    def add_index_coloumn(self, table):
        for row_index in range(len(table)):
            index_col_val = ""
            table[row_index][0] = table[row_index][0].strip()
            if re.search(r'^(\d{1,2}(\.)*)+(\s){0,1}\D+', table[row_index][0]):
                index_col_val = re.search(r'^(\d{1,2}(\.)*)+(\s){0,1}', table[row_index][0]).group(0).strip()
                table[row_index][0] = table[row_index][0].split(index_col_val)[-1].strip()
            table[row_index] = [index_col_val] + table[row_index]
        return table
    
    def add_new_headers(self, table):
        year_list = []
        header_size = len(table[0])
        for col_index in range(header_size):
            year_data_match = re.search(r'[0-9]{4}\-[0-9]{4}', table[0][col_index])
            if year_data_match:
                year = year_data_match.group(0)
                year_list.append(year)
        table[0] = FORMAT_DICT[self.header_format]["FIRST_HEADER_ROW"][:]
        num = 3
        for year in year_list:
            for val in range(0,3):
                table[0][num] += " " + year
                num += 1
        while len(table[0]) > len(table[1]):
            table[1] = [""] + table[1]
        table = self.merge_up_rows(0, table)
        return table

    def fix_second_header(self, table):
        merge_up_required = False
        header_row_index = None
        for row_index in range(len(table)):
            row = table[row_index]
            if self.header_padding_required:
                col_value = row[1].lower().strip()
            else:
                col_value = row[2].lower().strip()
            if re.match(r'%s' % SECOND_HEADER_FIELD, col_value):
                header_row_index = row_index
                table[header_row_index] = FORMAT_DICT[self.header_format]["SECOND_HEADER_ROW"][:]
            elif "Head of" in "".join(row).strip():
                header_row_index = row_index
                if table[header_row_index][0].strip(): 
                    table[header_row_index+1][0] = (table[header_row_index][0] + " " + table[header_row_index + 1][0]).strip() 
                table[header_row_index] = FORMAT_DICT[self.header_format]["SECOND_HEADER_ROW"][:]
                merge_up_required = True
                break
        if merge_up_required:
            if table[header_row_index+1][0]:
                table[header_row_index+1] = [table[header_row_index+1][0]]
            else:
                table.pop(header_row_index+1)
        return table

    def merge_up_rows(self, row_index, table):
        for col_index in range(len(table[row_index])): 
            table[row_index][col_index] = (table[row_index][col_index].strip() + " " + table[row_index+1][col_index].strip()).strip()
        table.pop(row_index+1)
        return table

    def generate_expenditure_budgets_csv(self, doc_dir, header_format, page_header, identify_columns):
        self.header_format = header_format
        year_data_match = re.search(r'[0-9]{4}\-[0-9]{2,}', doc_dir) 
        if year_data_match:
            year = year_data_match.group(0) 
        for file_name in glob.glob("%s*.pdf" % doc_dir):
            department_name = os.path.basename(file_name).lower().split(".pdf")[0].decode('utf-8')
            if not department_name in SKIP_FILENAMES:
                logger.info("Processing PDF document for department: %s" % department_name)
                try:
                    self.bold_keywords = self.keywords_extractor.get_bold_text_phrases(file_name, is_other_starting_phrases=True, single_word=True)
                    logger.info("BOLD Keywords: %s" % str(self.bold_keywords))
                    self.generate_csv_file(file_name, file_name.split(".pdf")[0] + ".csv", is_header=page_header, identify_columns=identify_columns, temp_file_postfix=year)
                except Exception, error_message:
                    logger.error("Unable to extract CSV for department: %s, error_message: %s" % (department_name, error_message), exc_info = True)
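
# A minimal usage sketch for ExpenditureBudgetCSVGenerator. The directory name,
# the FORMAT_DICT key and the flag values below are hypothetical; the directory
# string is used as a glob prefix (so it should end with a path separator) and
# is expected to contain a year span such as "2016-17".
expenditure_generator = ExpenditureBudgetCSVGenerator()
expenditure_generator.generate_expenditure_budgets_csv(
    doc_dir="expenditure_budget_2016-17/",
    header_format="DEFAULT",   # hypothetical FORMAT_DICT key
    page_header=False,         # forwarded as is_header to generate_csv_file
    identify_columns=True)
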
class KarnatakaBudgetCSVGenerator(PDF2CSV):
    def __init__(self):
        super(KarnatakaBudgetCSVGenerator, self).__init__()
        self.keywords_extractor = KeywordsExtractor()
        self.min_col_count = 8
        self.max_col_count = 10
        self.currency_slug = "(Rs. in Lakhs)"
        self.empty_char_regex = r'(\xe2|\xc3|\x82|\xa2|\x80)'
        self.parent_scheme_regex = r"([A-Z]+\.|\([a-z]+\)|\d{4,}|^[MDCLXVI]+ |^Total)"
        self.voted_charged_column = True

    def generate_karnataka_budget_csv(self, input_file, output_dir):
        '''
        Main entry point: stores the input file and output directory and runs the PDF-to-CSV conversion.
        '''
        self.input_file = input_file
        self.output_dir = output_dir
        self.generate_csv_file(input_file,
                               input_file.split(".pdf")[0] + ".csv",
                               is_header=True,
                               identify_columns=True)

    def modify_table_data(self, table):
        '''
        Modifying output of PDF to CSV to clean, wrangle and generate multiple CSV files
        '''
        pagewise_table = self.split_pages(table)
        pagewise_table = self.extract_head_codes(pagewise_table)
        pagewise_table = self.clean_pagewise_table(pagewise_table)
        for page_num in pagewise_table:
            unwanted_row_indices = []
            page_table = pagewise_table[page_num]
            header_found = False
            for row_index in range(len(page_table)):
                self.correct_column_count(row_index, page_table)
                unwanted_header_row_indices = self.clean_header_values(
                    row_index, page_table)
                if unwanted_header_row_indices:
                    unwanted_row_indices += unwanted_header_row_indices
                    header_found = True
                elif header_found and self.voted_charged_column:
                    page_table[row_index].insert(2, "")
                self.correct_combined_values(row_index, page_table)
            unwanted_row_indices += self.merge_splitted_rows(page_table)
            self.delete_unwanted_rows(unwanted_row_indices, page_table)
        pagewise_headers = self.generate_page_headers_map(pagewise_table)
        pagewise_table = self.extract_budget_codes(pagewise_table)
        self.generate_pagewise_csv_files(pagewise_table, pagewise_headers)

    def split_pages(self, table):
        '''
        Splitting main table into pagewise tables
        '''
        pagewise_table = {}
        temp_list = []
        page_num = 1
        for row in table:
            if row[0] == self.page_break.replace('"', ''):
                if temp_list and len(temp_list[0]) > self.min_col_count:
                    pagewise_table[page_num] = temp_list
                    temp_list = []
                page_num += 1
            elif len(row) > self.min_col_count:
                temp_list.append(row)
        if temp_list and len(temp_list[0]) > self.min_col_count:
            pagewise_table[page_num] = temp_list
        return pagewise_table

    def extract_head_codes(self, pagewise_table):
        '''
        Extracting Head codes from scheme descriptions, inheriting classes can customize it
        '''
        return pagewise_table

    def extract_budget_codes(self, pagewise_table):
        '''
        Extracting Budget codes from scheme descriptions, inheriting classes can customize it
        '''
        return pagewise_table

    def clean_pagewise_table(self, pagewise_table):
        '''
        Cleansing pagewise tables to remove Kannada chars (Windows-1252 encoded)
        '''
        for page_num in pagewise_table:
            page_table = pagewise_table[page_num]
            unwanted_row_indices = {}
            for row_index in range(len(page_table)):
                for col_index in range(len(page_table[row_index])):
                    val = page_table[row_index][col_index]
                    val = re.sub(r'(\xe2|\x80|vjU)', '',
                                 val).replace('\x90', '-')
                    if '\\x' in val.encode('string-escape'):
                        if " " in val:
                            val = re.sub(r"\s{2,}", " ", val)
                            val_list = val.split(" ")
                            clear_index = 0
                            for val_index in range(len(val_list)):
                                if not '\\x' in val_list[val_index].encode(
                                        'string-escape') and re.findall(
                                            r"[a-zA-Z0-9\.\(\)\&\-\+]{1,}",
                                            val_list[val_index]):
                                    if clear_index == 0:
                                        clear_index = val_index
                                else:
                                    clear_index = 0
                            if clear_index > 0:
                                val = " ".join(val.split(" ")[clear_index:])
                            else:
                                val = ""
                        else:
                            val = ""
                    page_table[row_index][col_index] = val.strip()
                if not "".join(page_table[row_index]).strip():
                    unwanted_row_indices[row_index] = True
            self.delete_unwanted_rows(unwanted_row_indices.keys(), page_table)
        return pagewise_table

    def correct_column_count(self, row_index, page_table):
        '''Inserting extra columns wherever required
        '''
        while len(page_table[row_index]) < self.max_col_count:
            page_table[row_index].insert(0, "")

    def correct_combined_values(self, row_index, page_table):
        '''Correcting Grand Total and Voted/Charged values which got merged in original doc
        '''
        if page_table[row_index][1] == "GRAND TOTAL (PLAN + NON-PLAN)":
            col_index = 2
            while col_index < len(page_table[row_index]):
                if not "." in page_table[row_index][col_index]:
                    page_table[row_index][
                        col_index + 1] = page_table[row_index][
                            col_index] + page_table[row_index][col_index + 1]
                    page_table[row_index][col_index] = "P+NP ="
                col_index += 2
        voted_charged_match = re.findall(r"(\s){,1}(Voted|Charged)$",
                                         page_table[row_index][1])
        if voted_charged_match:
            voted_charged_match = "".join(map(list, voted_charged_match)[0])
            page_table[row_index][1] = page_table[row_index][1].split(
                voted_charged_match)[0]
            page_table[row_index][2] = voted_charged_match.strip()

    def clean_header_values(self, row_index, page_table):
        '''Cleaning and generating correct header values and unwanted row indices
        '''
        unwanted_row_indices = []
        if page_table[row_index][2] == "Plan":
            page_table[row_index][0] = "Budget Code"
            header_1_val = ""
            for index in range(row_index + 1):
                header_1_val += " " + page_table[index][1]
                if index != row_index:
                    unwanted_row_indices.append(index)
            page_table[row_index][1] = header_1_val.strip()
            year_index = 2
            for col_index in range(2, len(page_table[row_index])):
                if col_index % 2 == 0 and col_index != year_index:
                    year_index += 2
                if not " " in page_table[0][year_index + 1]:
                    page_table[0][year_index +
                                  1] = " " + page_table[0][year_index + 1]
                page_table[row_index][
                    col_index] = page_table[0][year_index] + page_table[0][
                        year_index + 1] + " " + page_table[row_index][
                            col_index] + " " + self.currency_slug
            page_table[row_index].insert(2, 'Voted/Charged')
        elif page_table[row_index][2] == "2":
            unwanted_row_indices.append(row_index)
        return unwanted_row_indices

    def merge_splitted_rows(self, page_table):
        '''Merging splitted rows into one
        '''
        unwanted_row_indices = {}
        for row_index in range(5, len(page_table)):
            if re.match(self.parent_scheme_regex,
                        page_table[row_index][1]) or page_table[row_index][0]:
                continue
            elif not "".join(page_table[row_index][2:]):
                parent_row_index = row_index
                while not (re.match(self.parent_scheme_regex,
                                    page_table[parent_row_index][1])
                           or page_table[parent_row_index][0]):
                    parent_row_index -= 1
                    if parent_row_index in unwanted_row_indices:
                        continue
                    if page_table[row_index][1].strip():
                        if len(page_table) < abs(parent_row_index):
                            page_table[parent_row_index][
                                1] += ' ' + page_table[row_index][1].strip()
                    unwanted_row_indices[row_index] = True
        return unwanted_row_indices.keys()

    def delete_unwanted_rows(self, unwanted_row_indices, page_table):
        '''Deleting unwanted row indices from page tables
        '''
        unwanted_row_indices.sort()
        num = 0
        for row_index in unwanted_row_indices:
            page_table.pop(row_index - num)
            num += 1

    def generate_page_headers_map(self, pagewise_table):
        '''Generating pagewise headers for tables
        '''
        page_headers_map = {}
        for page_num in pagewise_table:
            keyword_list = self.keywords_extractor.get_bold_text_phrases(
                self.input_file,
                keyword_xpath="//text()",
                is_other_starting_phrases=True,
                single_word=True,
                page_num=page_num,
                lower_case=False)
            page_header = []
            for keyword_index in range(len(keyword_list)):
                keyword = keyword_list[keyword_index]
                keyword = re.sub(self.empty_char_regex, '',
                                 keyword).replace('\x90', '-')
                if not '\\x' in keyword.encode(
                        'string-escape') and not "<!--" in keyword:
                    if " ".join(self.currency_slug.split(" ")
                                [1:]) in keyword or "in Lakhs" in keyword:
                        break
                    keyword = keyword.decode('unicode_escape').encode(
                        'ascii', 'ignore').strip()
                    keyword = re.sub(r"\s{2,}", " ", keyword)
                    if "VOLUME" in keyword and keyword_index > 0:
                        keyword = keyword + keyword_list[keyword_index -
                                                         1].split(":")[-1]
                    page_header.append(keyword.strip())
            page_headers_map[page_num] = "|".join(page_header[:3])
        return page_headers_map

    def write_page_table(self, file_name, file_table):
        '''Creating new file and writing file table in it
        '''
        file_name = file_name.split("|")[2].strip() + "|" + file_name.split(
            "|")[1].strip()
        file_name = file_name.replace("/", "|")
        out_csv_file = open(self.output_dir + "/" + file_name + ".csv", "wb")
        csv_writer = csv.writer(out_csv_file, delimiter=',')
        for row in file_table:
            csv_writer.writerow(row)
        out_csv_file.close()

    def generate_pagewise_csv_files(self, pagewise_table, pagewise_headers):
        '''Generating pagewise CSV files
        '''
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)
        file_name = ""
        file_table = []
        for page_num in pagewise_table:
            if file_name and file_name != pagewise_headers[page_num]:
                self.write_page_table(file_name, file_table)
                file_table = pagewise_table[page_num]
                file_name = pagewise_headers[page_num]
            else:
                if not file_name:
                    file_table += pagewise_table[page_num]
                elif len(pagewise_table[page_num]) <= 1:
                    continue
                else:
                    if re.match(self.parent_scheme_regex,
                                pagewise_table[page_num][1]
                                [1]) or pagewise_table[page_num][1][0]:
                        file_table += pagewise_table[page_num][1:]
                    elif not "".join(pagewise_table[page_num][1][2:]):
                        file_table[-1][
                            1] += " " + pagewise_table[page_num][1][1]
                        file_table += pagewise_table[page_num][2:]
                file_name = pagewise_headers[page_num]
        if file_table:
            self.write_page_table(file_name, file_table)