def __init__(self):
    """Initialise state used while stitching combined-budget PDF pages."""
    super(CombinedBudgetCSVGenerator, self).__init__()
    # Markers consulted while assembling per-page titles.
    self.continued_symbol = "(CONTD"
    self.separator = " || "
    self.currency_handle = '(` crore)'
    # Filled in by the generate_* entry point before extraction starts.
    self.output_dir = None
    self.input_file = None
    self.keywords_extractor = KeywordsExtractor()
def __init__(self):
    """Initialise header bookkeeping for expenditure-budget extraction."""
    super(ExpenditureBudgetCSVGenerator, self).__init__()
    # Bold phrases harvested per PDF; used to recognise scheme names.
    self.bold_keywords = []
    self.keywords_extractor = KeywordsExtractor()
    # Header layout state, refined while the table is being wrangled.
    self.header_padding_required = False
    self.header_format = None
    self.header_rows_cap = 5
    self.header_rows_indices = [0, 1, 2]
def __init__(self):
    """Initialise column limits and cleanup patterns for Karnataka budgets."""
    super(KarnatakaBudgetCSVGenerator, self).__init__()
    # Rows outside these column counts are treated as noise.
    self.max_col_count = 10
    self.min_col_count = 8
    # Patterns used during cell cleanup and row merging.
    self.parent_scheme_regex = r"([A-Z]+\.|\([a-z]+\)|\d{4,}|^[MDCLXVI]+ |^Total)"
    self.empty_char_regex = r'(\xe2|\xc3|\x82|\xa2|\x80)'
    self.currency_slug = "(Rs. in Lakhs)"
    self.voted_charged_column = True
    self.keywords_extractor = KeywordsExtractor()
class CombinedBudgetCSVGenerator(PDF2CSV):
    """Splits a combined (Union) budget PDF into one CSV per bold page title.

    Workflow: PDF2CSV produces a flat table with page-break sentinel rows;
    this class splits it per page, cleans/merges broken rows, maps each page
    to a title extracted from its bold keywords, and writes one CSV per title.
    """

    def __init__(self):
        super(CombinedBudgetCSVGenerator, self).__init__()
        self.keywords_extractor = KeywordsExtractor()
        self.input_file = None      # set by generate_combined_budget_csv()
        self.output_dir = None      # set by generate_combined_budget_csv()
        self.currency_handle = '(` crore)'  # bold keyword ending a page title
        self.separator = " || "             # joins multi-part titles
        self.continued_symbol = "(CONTD"    # page continues the previous table

    def generate_combined_budget_csv(self, input_file, output_dir):
        """Entry point: record paths and run the base-class extraction."""
        self.input_file = input_file
        self.output_dir = output_dir
        self.generate_csv_file(input_file,
                               input_file.split(".pdf")[0] + ".csv",
                               is_header=False,
                               check_page_rotation=True)

    def modify_table_data(self, table):
        """PDF2CSV hook: wrangle the raw table and emit child CSVs.

        Returns None so the base class does not also write a combined CSV.
        """
        pagewise_table = self.split_pages(table)
        pagewise_table = self.clean_pagewise_table(pagewise_table)
        pagewise_table, pagewise_keywords = self.create_page_to_file_map(
            pagewise_table)
        self.generate_child_csv_files(pagewise_table, pagewise_keywords)
        return None

    def split_pages(self, table):
        """Split the flat table into {page_num: rows} using page-break rows.

        Rows with too few columns (<= MIN_COL_COUNT) are dropped as noise.
        """
        pagewise_table = {}
        temp_list = []
        page_num = 1
        for row in table:
            if row[0] == self.page_break.replace('"', ''):
                if temp_list and len(temp_list[0]) > MIN_COL_COUNT:
                    pagewise_table[page_num] = temp_list
                temp_list = []
                page_num += 1
            elif len(row) > MIN_COL_COUNT:
                temp_list.append(row)
        # Flush the final page (no trailing page-break sentinel).
        if temp_list and len(temp_list[0]) > MIN_COL_COUNT:
            pagewise_table[page_num] = temp_list
        return pagewise_table

    def clean_pagewise_table(self, pagewise_table):
        """Strip cells, drop the currency row, and merge rows split by layout.

        All merging is done in place; the loop repeats until no row with an
        empty leading pair remains.
        """
        for page_num in pagewise_table:
            page_table = pagewise_table[page_num]
            for row_index in range(len(page_table)):
                for col_index in range(len(page_table[row_index])):
                    page_table[row_index][col_index] = page_table[row_index][
                        col_index].strip()
            # First row holding the "crore" currency note is not data.
            if "crore" in "".join(page_table[0]):
                page_table.pop(0)
            # Collapse leading columns that are empty across the header row.
            while page_table[0][0].strip() == page_table[0][1].strip() == "":
                for row in page_table:
                    row[0] = (row[0] + " " + row[1]).strip()
                    row.pop(1)
            # Repeatedly fold continuation rows into their neighbours.
            while True:
                empty_row_indices = []
                for row_index in range(len(page_table)):
                    if row_index in empty_row_indices:
                        continue
                    if page_table[row_index][0] == page_table[row_index][
                            1] == "":
                        # Whole-row continuation: append into the row above.
                        for col_index in range(len(page_table[row_index])):
                            page_table[row_index - 1][col_index] = page_table[
                                row_index - 1][col_index] + " " + page_table[
                                row_index][col_index]
                        empty_row_indices.append(row_index)
                    elif not "".join(page_table[row_index][1:]).strip():
                        # Only the label cell is populated.
                        if re.match(r"[0-9A-Z]\.|[a-z]+\)|[0-9]",
                                    page_table[row_index][0]) and \
                                row_index < len(page_table) - 1:
                            # Looks like a scheme label: pull the next row up.
                            for col_index in range(len(page_table[row_index])):
                                page_table[row_index][col_index] = page_table[
                                    row_index][col_index] + " " + page_table[
                                    row_index + 1][col_index]
                            empty_row_indices.append(row_index + 1)
                        else:
                            # Trailing label fragment: append to previous row.
                            page_table[row_index - 1][0] = page_table[
                                row_index - 1][0] + " " + page_table[
                                row_index][0]
                            empty_row_indices.append(row_index)
                num = 0
                if not empty_row_indices:
                    break
                # Pop with a running offset since indices shift after each pop.
                for row_count in empty_row_indices:
                    page_table.pop(row_count - num)
                    num += 1
            pagewise_table[page_num] = page_table
        return pagewise_table

    def get_rotated_pdf_keywords(self, input_pdf_filepath, page_num):
        """Rotate one page 90° into a temp PDF and extract its bold phrases.

        Used when the page is landscape and the normal extraction finds no
        usable title. `page_num` here is 0-based (PyPDF2 page index).
        """
        input_pdf_obj = PdfFileReader(open(input_pdf_filepath, 'rb'))
        temp_pdf_obj = PdfFileWriter()
        temp_pdf_obj.addPage(
            input_pdf_obj.getPage(page_num).rotateClockwise(90))
        # NOTE: `file()` is Python 2 only.
        output_stream = file(TEMP_PDF_FILE, "wb")
        temp_pdf_obj.write(output_stream)
        output_stream.close()
        return self.keywords_extractor.get_bold_text_phrases(
            TEMP_PDF_FILE, is_other_starting_phrases=True, single_word=True,
            page_num=1, lower_case=False)

    def create_page_to_file_map(self, pagewise_table):
        """Derive a title per page and merge "(CONTD" pages into their parent.

        Returns the (possibly shrunk) pagewise table and a page->title map.
        """
        pagewise_keywords = {}
        for page_num in pagewise_table:
            keyword_list = self.keywords_extractor.get_bold_text_phrases(
                self.input_file, is_other_starting_phrases=True,
                single_word=True, page_num=page_num, lower_case=False)
            # Fall back to rotated extraction for landscape pages.
            if not keyword_list or len(keyword_list[0]) < MIN_TITLE_CHARS:
                keyword_list = self.get_rotated_pdf_keywords(
                    self.input_file, page_num - 1)
            pagewise_keywords[page_num] = keyword_list
        for page_num in pagewise_keywords:
            currency_handle_found = False
            # Title is everything before the currency handle keyword.
            for keyword_index in range(len(pagewise_keywords[page_num])):
                if pagewise_keywords[page_num][
                        keyword_index] == self.currency_handle:
                    currency_handle_found = True
                    pagewise_keywords[page_num] = self.separator.join(
                        pagewise_keywords[page_num][0:keyword_index])
                    break
            if not currency_handle_found:
                pagewise_keywords[page_num] = pagewise_keywords[page_num][0]
            if self.continued_symbol in pagewise_keywords[page_num]:
                if not page_num - 1 in pagewise_table:
                    continue
                if not self.separator in pagewise_keywords[page_num]:
                    # Plain continuation: reuse previous title, drop the
                    # repeated header row, and concatenate the tables.
                    pagewise_keywords[page_num] = pagewise_keywords[page_num - 1]
                    pagewise_table[page_num].pop(0)
                    pagewise_table[page_num] = pagewise_table[
                        page_num - 1] + pagewise_table[page_num]
                    pagewise_table.pop(page_num - 1)
                else:
                    # Multi-part title: keep previous prefix, current suffix.
                    pagewise_keywords[page_num] = pagewise_keywords[
                        page_num - 1].split(
                        self.separator
                    )[0] + self.separator + pagewise_keywords[
                        page_num].split(self.separator)[-1]
        return pagewise_table, pagewise_keywords

    def generate_child_csv_files(self, pagewise_table, pagewise_keywords):
        """Write one CSV per page, named after its sanitised title."""
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)
        for page_num in pagewise_table:
            file_name = re.sub(r'^TABLE ', '',
                               pagewise_keywords[page_num]).strip()
            file_name = file_name.replace("/", '|')  # keep path-safe
            file_name = file_name.replace(self.separator, '-')
            out_csv_file = open(self.output_dir + "/" + file_name + ".csv",
                                "wb")
            csv_writer = csv.writer(out_csv_file, delimiter=',')
            for row in pagewise_table[page_num]:
                csv_writer.writerow(row)
            out_csv_file.close()
class KarnatakaBudgetCSVGenerator(PDF2CSV):
    """Cleans Karnataka budget PDFs (Windows-1252 Kannada noise) into CSVs.

    Pages are split, stripped of non-ASCII residue, reheaded, merged across
    page boundaries by bold-title equality, and written one CSV per title.
    """

    def __init__(self):
        super(KarnatakaBudgetCSVGenerator, self).__init__()
        self.keywords_extractor = KeywordsExtractor()
        self.min_col_count = 8    # rows narrower than this are noise
        self.max_col_count = 10   # rows are left-padded up to this width
        self.currency_slug = "(Rs. in Lakhs)"
        # Bytes left behind by mis-decoded Kannada glyphs.
        self.empty_char_regex = r'(\xe2|\xc3|\x82|\xa2|\x80)'
        # Labels that start a parent scheme row (anchors for row merging).
        self.parent_scheme_regex = r"([A-Z]+\.|\([a-z]+\)|\d{4,}|^[MDCLXVI]+ |^Total)"
        self.voted_charged_column = True

    def generate_karnataka_budget_csv(self, input_file, output_dir):
        '''Main call comes here setting global variable and calling PDF to CSV
        '''
        self.input_file = input_file
        self.output_dir = output_dir
        self.generate_csv_file(input_file,
                               input_file.split(".pdf")[0] + ".csv",
                               is_header=False, identify_columns=True)

    def modify_table_data(self, table):
        '''Modifying output of PDF to CSV to clean, wrangle and generate
        multiple CSV files
        '''
        pagewise_table = self.split_pages(table)
        pagewise_table = self.extract_head_codes(pagewise_table)
        pagewise_table = self.clean_pagewise_table(pagewise_table)
        for page_num in pagewise_table:
            unwanted_row_indices = []
            page_table = pagewise_table[page_num]
            header_found = False
            for row_index in range(len(page_table)):
                self.correct_column_count(row_index, page_table)
                unwanted_header_row_indices = self.clean_header_values(
                    row_index, page_table)
                if unwanted_header_row_indices:
                    unwanted_row_indices += unwanted_header_row_indices
                    header_found = True
                elif header_found and self.voted_charged_column:
                    # Data rows get a placeholder Voted/Charged cell so they
                    # align with the widened header.
                    page_table[row_index].insert(2, "")
                self.correct_combined_values(row_index, page_table)
            unwanted_row_indices += self.merge_splitted_rows(page_table)
            self.delete_unwanted_rows(unwanted_row_indices, page_table)
        pagewise_headers = self.generate_page_headers_map(pagewise_table)
        pagewise_table = self.extract_budget_codes(pagewise_table)
        self.generate_pagewise_csv_files(pagewise_table, pagewise_headers)

    def split_pages(self, table):
        '''Splitting main table into pagewise tables
        '''
        pagewise_table = {}
        temp_list = []
        page_num = 1
        for row in table:
            if row[0] == self.page_break.replace('"', ''):
                if temp_list and len(temp_list[0]) > self.min_col_count:
                    pagewise_table[page_num] = temp_list
                temp_list = []
                page_num += 1
            elif len(row) > self.min_col_count:
                temp_list.append(row)
        # Flush the trailing page (no final page-break row).
        if temp_list and len(temp_list[0]) > self.min_col_count:
            pagewise_table[page_num] = temp_list
        return pagewise_table

    def extract_head_codes(self, pagewise_table):
        '''Extracting Head codes from scheme descriptions, inheriting classes
        can customize it
        '''
        return pagewise_table

    def extract_budget_codes(self, pagewise_table):
        '''Extracting Budget codes from scheme descriptions, inheriting
        classes can customize it
        '''
        return pagewise_table

    def clean_pagewise_table(self, pagewise_table):
        '''Cleansing pagewise tables to remove Kannada chars(Windows-1252
        encoded)
        '''
        for page_num in pagewise_table:
            page_table = pagewise_table[page_num]
            unwanted_row_indices = {}
            for row_index in range(len(page_table)):
                for col_index in range(len(page_table[row_index])):
                    val = page_table[row_index][col_index]
                    val = re.sub(r'(\xe2|\x80|vjU)', '', val).replace('\x90',
                                                                      '-')
                    # 'string-escape' reveals remaining non-ASCII bytes as
                    # literal backslash escapes (Python 2 only).
                    if '\\x' in val.encode('string-escape'):
                        if " " in val:
                            val = re.sub(r"\s{2,}", " ", val)
                            val_list = val.split(" ")
                            # Find the first word index from which all words
                            # are clean ASCII; keep only that suffix.
                            clear_index = 0
                            for val_index in range(len(val_list)):
                                if not '\\x' in val_list[val_index].encode(
                                        'string-escape') and re.findall(
                                        r"[a-zA-Z0-9\.\(\)\&\-\+]{1,}",
                                        val_list[val_index]):
                                    if clear_index == 0:
                                        clear_index = val_index
                                else:
                                    clear_index = 0
                            if clear_index > 0:
                                val = " ".join(val.split(" ")[clear_index:])
                            else:
                                val = ""
                        else:
                            val = ""
                    page_table[row_index][col_index] = val.strip()
                if not "".join(page_table[row_index]).strip():
                    unwanted_row_indices[row_index] = True
            self.delete_unwanted_rows(unwanted_row_indices.keys(), page_table)
        return pagewise_table

    def correct_column_count(self, row_index, page_table):
        '''Inserting extra columns wherever required
        '''
        # Left-pad short rows so every row has max_col_count cells.
        while len(page_table[row_index]) < self.max_col_count:
            page_table[row_index].insert(0, "")

    def correct_combined_values(self, row_index, page_table):
        '''Correcting Grand Total and Voted/Charged values which got merged
        in original doc
        '''
        if page_table[row_index][1] == "GRAND TOTAL (PLAN + NON-PLAN)":
            col_index = 2
            while col_index < len(page_table[row_index]):
                # A cell without a decimal point is a fused "P+NP =" label.
                if not "." in page_table[row_index][col_index]:
                    page_table[row_index][col_index+1] = page_table[row_index][
                        col_index] + page_table[row_index][col_index+1]
                    page_table[row_index][col_index] = "P+NP ="
                col_index += 2
        voted_charged_match = re.findall(r"(\s){,1}(Voted|Charged)$",
                                         page_table[row_index][1])
        if voted_charged_match:
            # findall returns [(space, word)]; rejoin into the matched text.
            voted_charged_match = "".join(map(list, voted_charged_match)[0])
            page_table[row_index][1] = page_table[row_index][1].split(
                voted_charged_match)[0]
            page_table[row_index][2] = voted_charged_match.strip()

    def clean_header_values(self, row_index, page_table):
        '''CLeaning and generating correct header values and unwanted row
        indices
        '''
        unwanted_row_indices = []
        if page_table[row_index][2] == "Plan":
            # This is the header row; fold the title rows above into col 1.
            page_table[row_index][0] = "Budget Code"
            header_1_val = ""
            for index in range(row_index+1):
                header_1_val += " " + page_table[index][1]
                if index != row_index:
                    unwanted_row_indices.append(index)
            page_table[row_index][1] = header_1_val.strip()
            year_index = 2
            for col_index in range(2, len(page_table[row_index])):
                # Every second column starts a new year group in row 0.
                if col_index % 2 == 0 and col_index != year_index:
                    year_index += 2
                    if not " " in page_table[0][year_index+1]:
                        page_table[0][year_index+1] = " " + page_table[0][
                            year_index+1]
                page_table[row_index][col_index] = page_table[0][
                    year_index] + page_table[0][
                    year_index+1] + " " + page_table[row_index][
                    col_index] + " " + self.currency_slug
            page_table[row_index].insert(2, 'Voted/Charged')
        elif page_table[row_index][2] == "2":
            # Column-numbering row (1 2 3 ...) is noise.
            unwanted_row_indices.append(row_index)
        return unwanted_row_indices

    def merge_splitted_rows(self, page_table):
        '''Merging splitted rows into one
        '''
        unwanted_row_indices = {}
        for row_index in range(5, len(page_table)):
            if re.match(self.parent_scheme_regex,
                        page_table[row_index][1]) or page_table[row_index][0]:
                continue
            elif not "".join(page_table[row_index][2:]):
                # Description-only fragment: walk up to its parent scheme row.
                parent_row_index = row_index
                while not (re.match(self.parent_scheme_regex,
                                    page_table[parent_row_index][1]) or
                           page_table[parent_row_index][0]):
                    parent_row_index -= 1
                if parent_row_index in unwanted_row_indices:
                    continue
                if page_table[row_index][1].strip():
                    page_table[parent_row_index][1] += ' ' + page_table[
                        row_index][1].strip()
                unwanted_row_indices[row_index] = True
        return unwanted_row_indices.keys()

    def delete_unwanted_rows(self, unwanted_row_indices, page_table):
        '''Deleting unwanted row indices from page tables
        '''
        unwanted_row_indices.sort()
        num = 0
        # Offset each pop by the number already removed.
        for row_index in unwanted_row_indices:
            page_table.pop(row_index-num)
            num += 1

    def generate_page_headers_map(self, pagewise_table):
        '''Generating pagewise headers for tables
        '''
        pagewise_keywords = {}  # NOTE(review): never used in this version
        page_headers_map = {}
        for page_num in pagewise_table:
            keyword_list = self.keywords_extractor.get_bold_text_phrases(
                self.input_file, keyword_xpath="//text()",
                is_other_starting_phrases=True, single_word=True,
                page_num=page_num, lower_case=False)
            page_header = []
            for keyword in keyword_list:
                keyword = re.sub(self.empty_char_regex, '',
                                 keyword).replace('\x90', '-')
                if not '\\x' in keyword.encode(
                        'string-escape') and not "<!--" in keyword:
                    # Stop at the currency note: title text ends here.
                    if " ".join(self.currency_slug.split(" ")[1:]) in keyword \
                            or "in Lakhs" in keyword:
                        break
                    keyword = keyword.decode('unicode_escape').encode(
                        'ascii', 'ignore').strip()
                    keyword = re.sub(r"\s{2,}", " ", keyword)
                    page_header.append(keyword.strip())
            page_headers_map[page_num] = "|".join(page_header)
        return page_headers_map

    def write_page_table(self, file_name, file_table):
        '''Creating new file and writing file table in it
        '''
        file_name = file_name.replace("/", "|")
        # Use only the last title segment as the file name.
        file_name = file_name.split("|")[-1].strip()
        out_csv_file = open(self.output_dir + "/" + file_name + ".csv", "wb")
        csv_writer = csv.writer(out_csv_file, delimiter=',')
        for row in file_table:
            csv_writer.writerow(row)
        out_csv_file.close()

    def generate_pagewise_csv_files(self, pagewise_table, pagewise_headers):
        '''Generating pagewise CSV files
        '''
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)
        file_name = ""
        file_table = []
        # Consecutive pages with the same header belong to one file.
        for page_num in pagewise_table:
            if file_name and file_name != pagewise_headers[page_num]:
                # Header changed: flush the accumulated file, start a new one.
                self.write_page_table(file_name, file_table)
                file_table = pagewise_table[page_num]
                file_name = pagewise_headers[page_num]
            else:
                if not file_name:
                    file_table += pagewise_table[page_num]
                elif len(pagewise_table[page_num]) <= 1:
                    continue
                else:
                    # Skip the repeated header row; row 1 may itself be a
                    # continuation fragment of the previous page's last row.
                    if re.match(self.parent_scheme_regex,
                                pagewise_table[page_num][1][1]) or \
                            pagewise_table[page_num][1][0]:
                        file_table += pagewise_table[page_num][1:]
                    elif not "".join(pagewise_table[page_num][1][2:]):
                        file_table[-1][1] += " " + pagewise_table[
                            page_num][1][1]
                        file_table += pagewise_table[page_num][2:]
                file_name = pagewise_headers[page_num]
        if file_table:
            self.write_page_table(file_name, file_table)
class ExpenditureBudgetCSVGenerator(PDF2CSV):
    """Wrangles Union expenditure-budget PDFs into normalised CSVs.

    The pipeline re-pads headers, merges/splits columns fused by the PDF
    layout, removes duplicated headers, merges split rows, and finally
    rewrites the header rows from FORMAT_DICT[self.header_format].
    """

    def __init__(self):
        super(ExpenditureBudgetCSVGenerator, self).__init__()
        self.header_rows_indices = [0,1,2]  # rows treated as the header block
        self.header_rows_cap = 5            # rows at/below this may add cols
        self.header_format = None           # key into FORMAT_DICT, set later
        self.header_padding_required = False
        self.keywords_extractor = KeywordsExtractor()
        self.bold_keywords = []             # bold phrases from current PDF

    def check_missing_vertical(self, lines):
        """Detect a horizontal rule hinting at a missing vertical separator.

        Returns the x coordinate of such a line, or False. `lines` is an
        iterable of (x1, y1, x2, y2) segment groups.
        """
        found_missing_vertical = False
        min_vertical = None
        # Leftmost vertical line in the drawing.
        for line in lines:
            for x1,y1,x2,y2 in line:
                if x1 == x2:
                    if not min_vertical or x1 < min_vertical:
                        min_vertical = x1
        if min_vertical:
            # A horizontal segment starting left of that vertical suggests a
            # column border the vector data lost.
            for line in lines:
                for x1,y1,x2,y2 in line:
                    if y1 == y2 and min_vertical/2 < x1 < min_vertical:
                        found_missing_vertical = x1
        return found_missing_vertical

    def modify_table_data(self, table):
        """PDF2CSV hook: run the full column/row wrangling pipeline."""
        table = self.correct_upper_header_rows(table)
        table = self.merge_splitted_coloumns(table)
        if not table:
            return table
        table = self.split_merged_coloumns(table)
        table = self.fix_second_header(table)
        table = self.remove_dulicate_headers(table)
        empty_row_indices = []
        # Fold a dangling "Major" row into the "Head" cell below it.
        for row_num in range(len(table)-1):
            if "".join(table[row_num]).strip() == "Major" and \
                    table[row_num+1][1].strip() == "Head":
                table[row_num+1][1] = "Major " + table[row_num+1][1]
                # NOTE(review): '==' below is a no-op comparison (likely meant
                # '='), but harmless since the row is popped just after.
                table[row_num][1] == ""
                empty_row_indices.append(row_num)
        # NOTE(review): pops here lack the running offset used elsewhere;
        # only safe if at most one "Major" row matches — confirm.
        for row_count in empty_row_indices:
            table.pop(row_count)
        table = self.merge_splitted_rows(table)
        table = self.ensure_table_intergrity(table)
        table = self.add_index_coloumn(table)
        table = self.add_new_headers(table)
        return table

    def correct_upper_header_rows(self, table):
        """Left-pad the top five rows whose first cell is non-empty."""
        for num in range(0,5):
            if table[num][0].strip() != "":
                table[num] = [""] + table[num]
                self.header_padding_required = True
        return table

    def merge_splitted_coloumns(self, table):
        """Merge leading columns the layout split off the scheme-name column."""
        pagewise_table_list = []
        temp_list = []
        # Split into per-page chunks; also fuse a numeric first cell with the
        # text cell next to it (index + name split across columns).
        for row in table:
            if row[0] == self.page_break.replace('"',''):
                if temp_list and len(temp_list[0]) > MIN_COL_COUNT:
                    pagewise_table_list.append(temp_list)
                temp_list = []
            elif len(row) > MIN_COL_COUNT:
                row[0] = row[0].strip()
                row[1] = row[1].strip()
                if re.search(r'^[\d\.]+$', row[0]) and row[1]:
                    row[0] += " " + row[1]
                    row[1] = ""
                temp_list.append(row)
        if temp_list and len(temp_list[0]) > MIN_COL_COUNT:
            pagewise_table_list.append(temp_list)
        table = []
        for page_table in pagewise_table_list:
            #page_table = self.delete_empty_coloumns(page_table)
            # Collapse fully-empty leading column pairs row by row.
            for row_index in range(len(page_table)):
                while page_table[row_index][0].strip() == \
                        page_table[row_index][1].strip() == "":
                    page_table[row_index][0] += " " + page_table[row_index][1]
                    page_table[row_index].pop(1)
            # Everything left of the "Major"/"Head of Dev" header column is
            # part of the scheme name; merge those columns into col 0.
            header_row = page_table[1]
            merge_upper_bound = 0
            for col_index in range(1, len(header_row)):
                header_stub = header_row[col_index].strip().lower()
                if re.search(r"^(major|head of dev*)$", header_stub):
                    merge_upper_bound = col_index
                    break
            for row in page_table:
                if merge_upper_bound:
                    num = 0
                    row[0] = row[0].strip()
                    for col_index in range(1, merge_upper_bound):
                        row[0] += " " + row[col_index-num]
                        row.pop(col_index-num)
                        num += 1
                table.append(row)
        return table

    def split_merged_coloumns(self, table):
        """Split cells holding several space-separated values into columns."""
        # First pass: a cell starting "... <value>" spills into the empty
        # cell to its left.
        for row in table:
            for col_index in range(2,len(row)):
                if re.search(r'^(\.{3}\s(\.{3}|[0-9]|\-[0-9]))',
                             row[col_index].strip()) and \
                        not row[col_index-1].strip():
                    row[col_index-1] = "..."
                    row[col_index] = re.sub(r'^\.{3}\s', '',
                                            row[col_index]).strip()
        new_col_indices = []
        for row_index in range(len(table)):
            row = table[row_index]
            new_col_values_map = {}
            for col_index in range(0, len(row)):
                row[col_index] = row[col_index].strip()
                # A cell made entirely of ..., numbers, or Plan/Non-Plan/Total
                # tokens separated by spaces is really several cells.
                multi_col_match = re.search(
                    r'^((\.{3}|(\-)*[0-9]+(\.[0-9]+){,1}|Plan|Non-Plan|Total)(\s)*)+$',
                    row[col_index].strip())
                if multi_col_match:
                    multi_col_match_str = multi_col_match.group(0)
                    if " " not in multi_col_match_str.strip():
                        continue
                    col_values = multi_col_match_str.split(" ")
                    row[col_index] = col_values[0]
                    index_correction = len(new_col_values_map)
                    for col_count in range(1,len(col_values)):
                        if col_index+col_count < len(row) and \
                                not row[col_index+col_count].strip():
                            # Overflow into an existing empty neighbour cell.
                            row[col_index+col_count] = col_values[col_count]
                        else:
                            # No room: schedule a brand-new column insert.
                            insert_pointer = col_index+col_count+index_correction
                            new_col_values_map[insert_pointer] = col_values[
                                col_count]
                            if not insert_pointer in new_col_indices and \
                                    row_index <= self.header_rows_cap:
                                new_col_indices.append(insert_pointer)
            for index in sorted(new_col_values_map):
                row.insert(index, new_col_values_map[index])
        table = self.correct_major_head_values(table)
        col_shifted = False
        # Drop spurious leading empty cells on over-wide rows.
        for row_index in range(len(table)):
            table[row_index][0] = table[row_index][0].strip()
            table[row_index][1] = table[row_index][1].strip()
            if (re.match(r'head of', table[row_index][0].lower()) and
                    re.match(r'budget', table[row_index][1].strip().lower())):
                continue
            if (re.match(r'head of', table[row_index][1].lower()) and
                    re.match(r'budget', table[row_index][2].strip().lower())):
                continue
            if len(table[row_index]) > len(FORMAT_DICT[self.header_format][
                    "SECOND_HEADER_ROW"]) or \
                    not " " in " ".join(table[row_index]).strip():
                while not table[row_index][0] and \
                        "".join(table[row_index][1:]).strip():
                    table[row_index].pop(0)
                    col_shifted = True
        if col_shifted:
            table = self.correct_major_head_values(table)
        # Keep row 0 aligned with the columns added above.
        for col_index in new_col_indices:
            table[0].insert(col_index, " ")
        return table

    def correct_major_head_values(self, table):
        """Move major-head codes fused into the scheme name to column 1."""
        for row in table:
            if row[0].strip() == "Grand Total":
                if row[1].strip():
                    row.insert(1, "")
                continue
            major_code_match = re.findall(
                r'\s[0-9]{4,}$|^[0-9]{4,}$|^Head of Dev|Head of$| Dev$|\s{,1}Total$|^Net$',
                row[0].strip())
            if major_code_match:
                major_code = major_code_match[-1]
                scheme_name = row[0].split(major_code)[0]
                if len(row) < len(FORMAT_DICT[self.header_format][
                        "SECOND_HEADER_ROW"]):
                    row[0] = scheme_name.strip()
                    if not row[1].strip():
                        row[1] = major_code
                    else:
                        row.insert(1, major_code)
                elif not row[1].strip():
                    row[0] = scheme_name.strip()
                    row[1] = major_code
        # Second pass: column 1 should hold only a numeric major-head code.
        for row_index in range(2, len(table)):
            major_head_cell_val = table[row_index][1].strip()
            if not major_head_cell_val or (
                    re.match(r'head of', major_head_cell_val.lower()) and
                    re.match(r'budget',
                             table[row_index][2].strip().lower())):
                continue
            if re.search(r'(^[0-9]{1,}\.[0-9]{1,})|\.{3}',
                         major_head_cell_val):
                # Column 1 holds a data value, not a code: shift right.
                table[row_index].insert(1, '')
            elif re.search(r'\D{2,}(\s\D{2,})+', major_head_cell_val):
                major_code_match = re.search(r'\d{4,}', major_head_cell_val)
                if major_code_match:
                    major_code = major_code_match.group(0)
                    # NOTE(review): '+=' plus re-adding row[0].strip() doubles
                    # the existing scheme name — looks unintended; confirm.
                    table[row_index][0] += table[row_index][0].strip() + \
                        " " + major_head_cell_val.split(major_code)[0].strip()
                    table[row_index][1] = major_code
                else:
                    table[row_index][0] = major_head_cell_val
                    table[row_index][1] = ""
        return table

    def remove_dulicate_headers(self, table):
        """Drop header rows repeated on later pages (and IEBR note rows)."""
        empty_row_indices = []
        header_rows = []
        seconary_header_stub = "".join(FORMAT_DICT[self.header_format][
            "SECOND_HEADER_ROW"]).replace(" ", "")
        for row_num in self.header_rows_indices:
            header_rows.append("".join(table[row_num]).replace(" ", ""))
        for row_num in range(self.header_rows_indices[-1]+1, len(table)):
            row_stub = "".join(table[row_num]).replace(" ", "")
            if row_stub in header_rows or re.search(
                    r"^(%s)" % "|".join(HEADER_STUBS), row_stub):
                empty_row_indices.append(row_num)
                # Once the canonical second header is seen, treat later
                # copies of it as duplicates too.
                if row_stub == seconary_header_stub and \
                        not seconary_header_stub in header_rows:
                    header_rows.append(seconary_header_stub)
            elif "IEBR" in row_stub:
                empty_row_indices.append(row_num)
        num = 0
        for row_count in empty_row_indices:
            table.pop(row_count-num)
            num += 1
        return table

    def merge_splitted_rows(self, table):
        """Re-join scheme names split across rows by the PDF layout."""
        empty_row_indices = []
        previous_data_slug = ""
        for row_index in range(1,len(table)):
            table[row_index][0] = table[row_index][0].strip()
            if not table[row_index][0] or re.search(
                    r'^(Total|[A-C]\.)|\:$', table[row_index][0]) or (
                    re.sub(r'\s{2,}', ' ', table[row_index][0].lower())
                    in self.bold_keywords):
                continue
            elif re.search(r'^[0-9]', table[row_index][0]) and \
                    not "".join(table[row_index][1:]).strip():
                # Numbered label with no data: candidate prefix for the next
                # data row, unless it is itself a bold (section) heading.
                index_col_val = re.search(r'^(\d{1,2}(\.)*)+(\s){0,1}',
                                          table[row_index][0]).group(0).strip()
                scheme_col_val = re.sub(
                    r'\s{2,}', ' ',
                    table[row_index][0].lower().replace(index_col_val,
                                                        "").strip())
                if scheme_col_val in self.bold_keywords:
                    continue
                previous_data_slug += table[row_index][0] + " "
                empty_row_indices.append(row_index)
                continue
            elif table[row_index][0].strip() and \
                    not "".join(table[row_index][1:]).strip() and \
                    table[row_index-1][0].strip():
                # Trailing name fragment: append to the nearest row above
                # that actually carries data.
                parent_row_index = row_index
                while parent_row_index > 1:
                    parent_row_index -= 1
                    if "".join(table[parent_row_index][1:]).strip():
                        break
                table[parent_row_index][0] = table[parent_row_index][
                    0].strip() + " " + table[row_index][0].strip()
                empty_row_indices.append(row_index)
            elif previous_data_slug:
                table[row_index][0] = previous_data_slug + table[row_index][0]
                previous_data_slug = ""
        num = 0
        for row_count in empty_row_indices:
            table.pop(row_count-num)
            num += 1
        return table

    def add_index_coloumn(self, table):
        """Split a leading numeric index off column 0 into a new column."""
        for row_index in range(len(table)):
            index_col_val = ""
            table[row_index][0] = table[row_index][0].strip()
            if re.search(r'^(\d{1,2}(\.)*)+(\s){0,1}\D+',
                         table[row_index][0]):
                index_col_val = re.search(r'^(\d{1,2}(\.)*)+(\s){0,1}',
                                          table[row_index][0]).group(0).strip()
                table[row_index][0] = table[row_index][0].split(
                    index_col_val)[-1].strip()
            table[row_index] = [index_col_val] + table[row_index]
        return table

    def add_new_headers(self, table):
        """Replace row 0 with the canonical header, stamped with the years."""
        year_list = []
        header_size = len(table[0])
        for col_index in range(header_size):
            year_data_match = re.search(r'[0-9]{4}\-[0-9]{4}',
                                        table[0][col_index])
            if year_data_match:
                year = year_data_match.group(0)
                year_list.append(year)
        table[0] = FORMAT_DICT[self.header_format]["FIRST_HEADER_ROW"][:]
        # Each year spans three columns (Plan / Non-Plan / Total).
        num = 3
        for year in year_list:
            for val in range(0,3):
                table[0][num] += " " + year
                num += 1
        while len(table[0]) > len(table[1]):
            table[1] = [""] + table[1]
        table = self.merge_up_rows(0, table)
        return table

    def fix_second_header(self, table):
        """Normalise the second header row to the canonical FORMAT_DICT row."""
        merge_up_required = False
        header_row_index = None
        for row_index in range(len(table)):
            row = table[row_index]
            # Padding shifts the header field one column to the right.
            if self.header_padding_required:
                col_value = row[1].lower().strip()
            else:
                col_value = row[2].lower().strip()
            if re.match(r'%s' % SECOND_HEADER_FIELD, col_value):
                header_row_index = row_index
                table[header_row_index] = FORMAT_DICT[self.header_format][
                    "SECOND_HEADER_ROW"][:]
            elif "Head of" in "".join(row).strip():
                header_row_index = row_index
                if table[header_row_index][0].strip():
                    table[header_row_index+1][0] = (
                        table[header_row_index][0] + " " +
                        table[header_row_index + 1][0]).strip()
                table[header_row_index] = FORMAT_DICT[self.header_format][
                    "SECOND_HEADER_ROW"][:]
                merge_up_required = True
                break
        if merge_up_required:
            # The row under the header is either a name fragment to keep
            # alone or an empty row to drop.
            if table[header_row_index+1][0]:
                table[header_row_index+1] = [table[header_row_index+1][0]]
            else:
                table.pop(header_row_index+1)
        return table

    def merge_up_rows(self, row_index, table):
        """Fold row_index+1 into row_index cell-by-cell and delete it."""
        for col_index in range(len(table[row_index])):
            table[row_index][col_index] = (
                table[row_index][col_index].strip() + " " +
                table[row_index+1][col_index].strip()).strip()
        table.pop(row_index+1)
        return table

    def generate_expenditure_budgets_csv(self, doc_dir, header_format,
                                         page_header, identify_columns):
        """Entry point: convert every non-skipped PDF under doc_dir to CSV.

        Failures are logged per department and do not stop the batch.
        """
        self.header_format = header_format
        # NOTE(review): `year` is only bound when doc_dir matches; a
        # non-matching dir would raise NameError below — confirm inputs.
        year_data_match = re.search(r'[0-9]{4}\-[0-9]{2,}', doc_dir)
        if year_data_match:
            year = year_data_match.group(0)
        for file_name in glob.glob("%s*.pdf" % doc_dir):
            department_name = os.path.basename(file_name).lower().split(
                ".pdf")[0].decode('utf-8')
            if not department_name in SKIP_FILENAMES:
                logger.info("Processing PDF document for department: %s"
                            % department_name)
                try:
                    self.bold_keywords = \
                        self.keywords_extractor.get_bold_text_phrases(
                            file_name, is_other_starting_phrases=True,
                            single_word=True)
                    logger.info("BOLD Keywords: %s" % str(self.bold_keywords))
                    self.generate_csv_file(
                        file_name, file_name.split(".pdf")[0] + ".csv",
                        is_header=page_header,
                        identify_columns=identify_columns,
                        temp_file_postfix=year)
                except Exception, error_message:
                    logger.error(
                        "Unable to extract CSV for department: %s, error_message: %s"
                        % (department_name, error_message), exc_info = True)
class KarnatakaBudgetCSVGenerator(PDF2CSV):
    """Karnataka budget PDF-to-CSV cleaner (second, divergent copy).

    Differs from the earlier copy: extraction runs with is_header=True,
    page titles include a VOLUME suffix and are capped at three segments,
    and output file names are built from title segments 2 and 1.
    """

    def __init__(self):
        super(KarnatakaBudgetCSVGenerator, self).__init__()
        self.keywords_extractor = KeywordsExtractor()
        self.min_col_count = 8    # rows narrower than this are noise
        self.max_col_count = 10   # rows are left-padded up to this width
        self.currency_slug = "(Rs. in Lakhs)"
        # Bytes left behind by mis-decoded Kannada glyphs.
        self.empty_char_regex = r'(\xe2|\xc3|\x82|\xa2|\x80)'
        # Labels that start a parent scheme row (anchors for row merging).
        self.parent_scheme_regex = r"([A-Z]+\.|\([a-z]+\)|\d{4,}|^[MDCLXVI]+ |^Total)"
        self.voted_charged_column = True

    def generate_karnataka_budget_csv(self, input_file, output_dir):
        '''
        Main call comes here setting global variable and calling PDF to CSV
        '''
        self.input_file = input_file
        self.output_dir = output_dir
        self.generate_csv_file(input_file,
                               input_file.split(".pdf")[0] + ".csv",
                               is_header=True, identify_columns=True)

    def modify_table_data(self, table):
        '''
        Modifying output of PDF to CSV to clean, wrangle and generate
        multiple CSV files
        '''
        pagewise_table = self.split_pages(table)
        pagewise_table = self.extract_head_codes(pagewise_table)
        pagewise_table = self.clean_pagewise_table(pagewise_table)
        for page_num in pagewise_table:
            unwanted_row_indices = []
            page_table = pagewise_table[page_num]
            header_found = False
            for row_index in range(len(page_table)):
                self.correct_column_count(row_index, page_table)
                unwanted_header_row_indices = self.clean_header_values(
                    row_index, page_table)
                if unwanted_header_row_indices:
                    unwanted_row_indices += unwanted_header_row_indices
                    header_found = True
                elif header_found and self.voted_charged_column:
                    # Align data rows with the widened Voted/Charged header.
                    page_table[row_index].insert(2, "")
                self.correct_combined_values(row_index, page_table)
            unwanted_row_indices += self.merge_splitted_rows(page_table)
            self.delete_unwanted_rows(unwanted_row_indices, page_table)
        pagewise_headers = self.generate_page_headers_map(pagewise_table)
        pagewise_table = self.extract_budget_codes(pagewise_table)
        self.generate_pagewise_csv_files(pagewise_table, pagewise_headers)

    def split_pages(self, table):
        '''
        Splitting main table into pagewise tables
        '''
        pagewise_table = {}
        temp_list = []
        page_num = 1
        for row in table:
            if row[0] == self.page_break.replace('"', ''):
                if temp_list and len(temp_list[0]) > self.min_col_count:
                    pagewise_table[page_num] = temp_list
                temp_list = []
                page_num += 1
            elif len(row) > self.min_col_count:
                temp_list.append(row)
        # Flush the trailing page (no final page-break row).
        if temp_list and len(temp_list[0]) > self.min_col_count:
            pagewise_table[page_num] = temp_list
        return pagewise_table

    def extract_head_codes(self, pagewise_table):
        '''
        Extracting Head codes from scheme descriptions, inheriting classes
        can customize it
        '''
        return pagewise_table

    def extract_budget_codes(self, pagewise_table):
        '''
        Extracting Budget codes from scheme descriptions, inheriting classes
        can customize it
        '''
        return pagewise_table

    def clean_pagewise_table(self, pagewise_table):
        '''
        Cleansing pagewise tables to remove Kannada chars(Windows-1252
        encoded)
        '''
        for page_num in pagewise_table:
            page_table = pagewise_table[page_num]
            unwanted_row_indices = {}
            for row_index in range(len(page_table)):
                for col_index in range(len(page_table[row_index])):
                    val = page_table[row_index][col_index]
                    val = re.sub(r'(\xe2|\x80|vjU)', '', val).replace('\x90',
                                                                      '-')
                    # 'string-escape' exposes non-ASCII bytes as literal
                    # backslash escapes (Python 2 only).
                    if '\\x' in val.encode('string-escape'):
                        if " " in val:
                            val = re.sub(r"\s{2,}", " ", val)
                            val_list = val.split(" ")
                            # Keep only the trailing run of clean ASCII words.
                            clear_index = 0
                            for val_index in range(len(val_list)):
                                if not '\\x' in val_list[val_index].encode(
                                        'string-escape') and re.findall(
                                        r"[a-zA-Z0-9\.\(\)\&\-\+]{1,}",
                                        val_list[val_index]):
                                    if clear_index == 0:
                                        clear_index = val_index
                                else:
                                    clear_index = 0
                            if clear_index > 0:
                                val = " ".join(val.split(" ")[clear_index:])
                            else:
                                val = ""
                        else:
                            val = ""
                    page_table[row_index][col_index] = val.strip()
                if not "".join(page_table[row_index]).strip():
                    unwanted_row_indices[row_index] = True
            self.delete_unwanted_rows(unwanted_row_indices.keys(), page_table)
        return pagewise_table

    def correct_column_count(self, row_index, page_table):
        '''Inserting extra columns wherever required
        '''
        # Left-pad short rows so every row has max_col_count cells.
        while len(page_table[row_index]) < self.max_col_count:
            page_table[row_index].insert(0, "")

    def correct_combined_values(self, row_index, page_table):
        '''Correcting Grand Total and Voted/Charged values which got merged
        in original doc
        '''
        if page_table[row_index][1] == "GRAND TOTAL (PLAN + NON-PLAN)":
            col_index = 2
            while col_index < len(page_table[row_index]):
                # A cell without a decimal point is a fused "P+NP =" label.
                if not "." in page_table[row_index][col_index]:
                    page_table[row_index][
                        col_index + 1] = page_table[row_index][
                        col_index] + page_table[row_index][col_index + 1]
                    page_table[row_index][col_index] = "P+NP ="
                col_index += 2
        voted_charged_match = re.findall(r"(\s){,1}(Voted|Charged)$",
                                         page_table[row_index][1])
        if voted_charged_match:
            # findall yields [(space, word)]; rejoin into the matched text.
            voted_charged_match = "".join(map(list, voted_charged_match)[0])
            page_table[row_index][1] = page_table[row_index][1].split(
                voted_charged_match)[0]
            page_table[row_index][2] = voted_charged_match.strip()

    def clean_header_values(self, row_index, page_table):
        '''CLeaning and generating correct header values and unwanted row
        indices
        '''
        unwanted_row_indices = []
        if page_table[row_index][2] == "Plan":
            # This is the header row; fold the title rows above into col 1.
            page_table[row_index][0] = "Budget Code"
            header_1_val = ""
            for index in range(row_index + 1):
                header_1_val += " " + page_table[index][1]
                if index != row_index:
                    unwanted_row_indices.append(index)
            page_table[row_index][1] = header_1_val.strip()
            year_index = 2
            for col_index in range(2, len(page_table[row_index])):
                # Every second column starts a new year group in row 0.
                if col_index % 2 == 0 and col_index != year_index:
                    year_index += 2
                    if not " " in page_table[0][year_index + 1]:
                        page_table[0][year_index +
                                      1] = " " + page_table[0][year_index + 1]
                page_table[row_index][
                    col_index] = page_table[0][year_index] + page_table[0][
                    year_index + 1] + " " + page_table[row_index][
                    col_index] + " " + self.currency_slug
            page_table[row_index].insert(2, 'Voted/Charged')
        elif page_table[row_index][2] == "2":
            # Column-numbering row (1 2 3 ...) is noise.
            unwanted_row_indices.append(row_index)
        return unwanted_row_indices

    def merge_splitted_rows(self, page_table):
        '''Merging splitted rows into one
        '''
        unwanted_row_indices = {}
        for row_index in range(5, len(page_table)):
            if re.match(self.parent_scheme_regex,
                        page_table[row_index][1]) or page_table[row_index][0]:
                continue
            elif not "".join(page_table[row_index][2:]):
                # Description-only fragment: walk up to its parent scheme row.
                parent_row_index = row_index
                while not (re.match(self.parent_scheme_regex,
                                    page_table[parent_row_index][1]) or
                           page_table[parent_row_index][0]):
                    parent_row_index -= 1
                if parent_row_index in unwanted_row_indices:
                    continue
                if page_table[row_index][1].strip():
                    # NOTE(review): this guard looks always-false for a valid
                    # 0-based parent index (len > index >= 0), so the text
                    # merge below never fires — confirm intent; the earlier
                    # copy of this class has no such guard.
                    if len(page_table) < abs(parent_row_index):
                        page_table[parent_row_index][
                            1] += ' ' + page_table[row_index][1].strip()
                unwanted_row_indices[row_index] = True
        return unwanted_row_indices.keys()

    def delete_unwanted_rows(self, unwanted_row_indices, page_table):
        '''Deleting unwanted row indices from page tables
        '''
        unwanted_row_indices.sort()
        num = 0
        # Offset each pop by the number already removed.
        for row_index in unwanted_row_indices:
            page_table.pop(row_index - num)
            num += 1

    def generate_page_headers_map(self, pagewise_table):
        '''Generating pagewise headers for tables
        '''
        page_headers_map = {}
        for page_num in pagewise_table:
            keyword_list = self.keywords_extractor.get_bold_text_phrases(
                self.input_file, keyword_xpath="//text()",
                is_other_starting_phrases=True, single_word=True,
                page_num=page_num, lower_case=False)
            page_header = []
            for keyword_index in range(len(keyword_list)):
                keyword = keyword_list[keyword_index]
                keyword = re.sub(self.empty_char_regex, '',
                                 keyword).replace('\x90', '-')
                if not '\\x' in keyword.encode(
                        'string-escape') and not "<!--" in keyword:
                    # Stop at the currency note: title text ends here.
                    if " ".join(self.currency_slug.split(" ")
                                [1:]) in keyword or "in Lakhs" in keyword:
                        break
                    keyword = keyword.decode('unicode_escape').encode(
                        'ascii', 'ignore').strip()
                    keyword = re.sub(r"\s{2,}", " ", keyword)
                    # VOLUME lines carry their number in the previous keyword.
                    if "VOLUME" in keyword and keyword_index > 0:
                        keyword = keyword + keyword_list[keyword_index -
                                                         1].split(":")[-1]
                    page_header.append(keyword.strip())
            # Titles are capped at three segments.
            page_headers_map[page_num] = "|".join(page_header[:3])
        return page_headers_map

    def write_page_table(self, file_name, file_table):
        '''Creating new file and writing file table in it
        '''
        # File name = third title segment + second (dept + volume).
        file_name = file_name.split("|")[2].strip() + "|" + file_name.split(
            "|")[1].strip()
        file_name = file_name.replace("/", "|")  # keep path-safe
        out_csv_file = open(self.output_dir + "/" + file_name + ".csv", "wb")
        csv_writer = csv.writer(out_csv_file, delimiter=',')
        for row in file_table:
            csv_writer.writerow(row)
        out_csv_file.close()

    def generate_pagewise_csv_files(self, pagewise_table, pagewise_headers):
        '''Generating pagewise CSV files
        '''
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)
        file_name = ""
        file_table = []
        # Consecutive pages with the same header belong to one file.
        for page_num in pagewise_table:
            if file_name and file_name != pagewise_headers[page_num]:
                # Header changed: flush the accumulated file, start a new one.
                self.write_page_table(file_name, file_table)
                file_table = pagewise_table[page_num]
                file_name = pagewise_headers[page_num]
            else:
                if not file_name:
                    file_table += pagewise_table[page_num]
                elif len(pagewise_table[page_num]) <= 1:
                    continue
                else:
                    # Skip the repeated header row; row 1 may itself be a
                    # continuation fragment of the previous page's last row.
                    if re.match(self.parent_scheme_regex,
                                pagewise_table[page_num][1]
                                [1]) or pagewise_table[page_num][1][0]:
                        file_table += pagewise_table[page_num][1:]
                    elif not "".join(pagewise_table[page_num][1][2:]):
                        file_table[-1][
                            1] += " " + pagewise_table[page_num][1][1]
                        file_table += pagewise_table[page_num][2:]
                file_name = pagewise_headers[page_num]
        if file_table:
            self.write_page_table(file_name, file_table)
class CombinedBudgetCSVGenerator(PDF2CSV):
    '''Converts a combined budget PDF into one CSV file per budget table.

    PDF2CSV extracts the whole document as one flat table; this class splits
    it back into pages, cleans each page, maps pages to output files using
    the bold page titles, and writes one CSV per title.
    '''

    def __init__(self):
        super(CombinedBudgetCSVGenerator, self).__init__()
        self.keywords_extractor = KeywordsExtractor()
        self.input_file = None    # set in generate_combined_budget_csv()
        self.output_dir = None    # set in generate_combined_budget_csv()
        # Phrase that terminates a page title in the bold-keywords list.
        self.currency_handle = '(` crore)'
        # Joins title fragments into a single header string.
        self.separator = " || "
        # Titles containing this mark continue the previous page's table.
        self.continued_symbol = "(CONTD"

    def generate_combined_budget_csv(self, input_file, output_dir):
        '''Entry point: split *input_file* (a .pdf path) into per-table CSV
        files under *output_dir*.
        '''
        self.input_file = input_file
        self.output_dir = output_dir
        self.generate_csv_file(input_file,
                               input_file.split(".pdf")[0] + ".csv",
                               is_header=False, check_page_rotation=True)

    def modify_table_data(self, table):
        '''PDF2CSV hook: writes the child CSV files itself and returns None
        so the base class does not emit a combined file.
        '''
        pagewise_table = self.split_pages(table)
        pagewise_table = self.clean_pagewise_table(pagewise_table)
        pagewise_table, pagewise_keywords = self.create_page_to_file_map(
            pagewise_table)
        self.generate_child_csv_files(pagewise_table, pagewise_keywords)
        return None

    def split_pages(self, table):
        '''Split the flat row list into {page_num: rows} on page-break rows,
        keeping only pages whose rows have more than MIN_COL_COUNT columns.
        '''
        pagewise_table = {}
        temp_list = []
        page_num = 1
        for row in table:
            if row[0] == self.page_break.replace('"', ''):
                if temp_list and len(temp_list[0]) > MIN_COL_COUNT:
                    pagewise_table[page_num] = temp_list
                    temp_list = []
                page_num += 1
            elif len(row) > MIN_COL_COUNT:
                temp_list.append(row)
        # The last page has no trailing page break; flush it explicitly.
        if temp_list and len(temp_list[0]) > MIN_COL_COUNT:
            pagewise_table[page_num] = temp_list
        return pagewise_table

    def clean_pagewise_table(self, pagewise_table):
        '''Normalise every page table in place: strip cells, drop the
        "crore" banner row, merge artefact leading columns and fold rows
        that were split across several physical lines back together.
        '''
        for page_num in pagewise_table:
            page_table = pagewise_table[page_num]
            # Strip whitespace from every cell.
            for row_index in range(len(page_table)):
                for col_index in range(len(page_table[row_index])):
                    page_table[row_index][col_index] = page_table[row_index][
                        col_index].strip()
            # Drop the currency banner row if present.
            if "crore" in "".join(page_table[0]):
                page_table.pop(0)
            # While the first two header cells are both empty, column 0 is
            # an extraction artefact: merge columns 0 and 1 in every row.
            while page_table[0][0].strip() == page_table[0][1].strip() == "":
                for row in page_table:
                    row[0] = (row[0] + " " + row[1]).strip()
                    row.pop(1)
            # Repeatedly fold continuation rows into their neighbours until
            # a full pass produces no change.
            while True:
                empty_row_indices = []
                for row_index in range(len(page_table)):
                    if row_index in empty_row_indices:
                        continue
                    if page_table[row_index][0] == page_table[row_index][1] == "":
                        # No serial number and no description: append every
                        # cell to the row above.
                        for col_index in range(len(page_table[row_index])):
                            page_table[row_index - 1][col_index] = page_table[
                                row_index - 1][col_index] + " " + page_table[
                                row_index][col_index]
                        empty_row_indices.append(row_index)
                    elif not "".join(page_table[row_index][1:]).strip():
                        # Only column 0 is populated.
                        if re.match(r"[0-9A-Z]\.|[a-z]+\)|[0-9]",
                                    page_table[row_index][0]) and \
                                row_index < len(page_table) - 1:
                            # Looks like a list marker: pull the next row up.
                            for col_index in range(len(page_table[row_index])):
                                page_table[row_index][col_index] = page_table[
                                    row_index][col_index] + " " + page_table[
                                    row_index + 1][col_index]
                            empty_row_indices.append(row_index + 1)
                        else:
                            # Stray text: append it to the row above.
                            page_table[row_index - 1][0] = page_table[
                                row_index - 1][0] + " " + page_table[
                                row_index][0]
                            empty_row_indices.append(row_index)
                if not empty_row_indices:
                    break
                num = 0
                for row_count in empty_row_indices:
                    page_table.pop(row_count - num)
                    num += 1
            pagewise_table[page_num] = page_table
        return pagewise_table

    def get_rotated_pdf_keywords(self, input_pdf_filepath, page_num):
        '''Rotate page *page_num* (0-indexed) of the PDF upright into
        TEMP_PDF_FILE and return the bold phrases found on it.
        '''
        # Keep the source stream open until the writer has finished: PyPDF2
        # reads page content lazily during write(). Previously the stream
        # was never closed (leak), and the temp file used the Python-2-only
        # file() builtin; open() behaves identically and is portable.
        with open(input_pdf_filepath, 'rb') as input_pdf_stream:
            input_pdf_obj = PdfFileReader(input_pdf_stream)
            temp_pdf_obj = PdfFileWriter()
            temp_pdf_obj.addPage(
                input_pdf_obj.getPage(page_num).rotateClockwise(90))
            with open(TEMP_PDF_FILE, "wb") as output_stream:
                temp_pdf_obj.write(output_stream)
        return self.keywords_extractor.get_bold_text_phrases(
            TEMP_PDF_FILE, is_other_starting_phrases=True, single_word=True,
            page_num=1, lower_case=False)

    def create_page_to_file_map(self, pagewise_table):
        '''Derive an output-file title for every page and merge pages whose
        tables continue across page boundaries ("(CONTD" in the title).

        Returns the (possibly merged) pagewise_table and a
        {page_num: title} map.
        '''
        pagewise_keywords = {}
        for page_num in pagewise_table:
            keyword_list = self.keywords_extractor.get_bold_text_phrases(
                self.input_file, is_other_starting_phrases=True,
                single_word=True, page_num=page_num, lower_case=False)
            if not keyword_list or len(keyword_list[0]) < MIN_TITLE_CHARS:
                # No usable title: the page is probably rotated; retry after
                # rotating it upright (PyPDF2 pages are 0-indexed).
                keyword_list = self.get_rotated_pdf_keywords(self.input_file,
                                                             page_num - 1)
            pagewise_keywords[page_num] = keyword_list
        # Ascending order is required: merging a continued page relies on
        # page_num - 1 having been processed already. sorted() makes the
        # previously implicit (CPython 2 small-int dict) order explicit.
        for page_num in sorted(pagewise_keywords):
            keyword_list = pagewise_keywords[page_num]
            currency_handle_found = False
            for keyword_index in range(len(keyword_list)):
                if keyword_list[keyword_index] == self.currency_handle:
                    currency_handle_found = True
                    # The title is everything before the currency handle.
                    pagewise_keywords[page_num] = self.separator.join(
                        keyword_list[0:keyword_index])
                    break
            if not currency_handle_found:
                pagewise_keywords[page_num] = keyword_list[0]
            if self.continued_symbol in pagewise_keywords[page_num]:
                if not page_num - 1 in pagewise_table:
                    continue
                if not self.separator in pagewise_keywords[page_num]:
                    # Whole page continues the previous table: inherit its
                    # title, drop the repeated header row, merge the rows.
                    pagewise_keywords[page_num] = pagewise_keywords[
                        page_num - 1]
                    pagewise_table[page_num].pop(0)
                    pagewise_table[page_num] = pagewise_table[
                        page_num - 1] + pagewise_table[page_num]
                    pagewise_table.pop(page_num - 1)
                else:
                    # Only the leading fragment continues: reuse the previous
                    # page's leading fragment, keep this page's trailing one.
                    pagewise_keywords[page_num] = pagewise_keywords[
                        page_num - 1].split(self.separator)[
                        0] + self.separator + pagewise_keywords[
                        page_num].split(self.separator)[-1]
        return pagewise_table, pagewise_keywords

    def generate_child_csv_files(self, pagewise_table, pagewise_keywords):
        '''Write one CSV file per (merged) page under self.output_dir,
        named after the page title.
        '''
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)
        for page_num in pagewise_table:
            file_name = re.sub(r'^TABLE ', '',
                               pagewise_keywords[page_num]).strip()
            # Keep the name a single path component.
            file_name = file_name.replace("/", '|')
            file_name = file_name.replace(self.separator, '-')
            # "wb" + csv.writer is the Python 2 csv idiom; the context
            # manager guarantees the file is closed even on write errors.
            with open(self.output_dir + "/" + file_name + ".csv",
                      "wb") as out_csv_file:
                csv_writer = csv.writer(out_csv_file, delimiter=',')
                for row in pagewise_table[page_num]:
                    csv_writer.writerow(row)