def __extract_textboxes(document):
    """Extract textboxes from document.

    It is kept here instead of document since ArthurDocument does not need to
    know the concept of a textbox. In ArthurDocument a textbox is just a set of
    features that happen to have the same textbox_id. Different document types
    may have different configurations and approaches to this.

    Args:
        document(ArthurDocument): ArthurDocument instance textboxes will be
            extracted from.

    Returns:
        list: List of textboxes i.e. grouped features from document.
    """
    features = document.get_features()
    page_feature_id = ArthurDocument.get_feature_id('page')
    textbox_feature_id = ArthurDocument.get_feature_id('textbox_id')
    page_textbox_pairs = features[:, [page_feature_id, textbox_feature_id]]
    unique_page_textbox_pairs = unique_rows(page_textbox_pairs)

    textboxes = []
    for page, textbox_id in unique_page_textbox_pairs:
        textbox = features[np.where(
            (features[:, page_feature_id] == page) *
            (features[:, textbox_feature_id] == textbox_id)
        )]
        textboxes.append(textbox)
    return textboxes
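# __extract_textboxes above relies on a unique_rows helper that is assumed to be
# defined elsewhere in this module. If it is missing, a minimal sketch could look
# like the function below; the name _unique_rows_sketch is hypothetical and kept
# distinct so it does not clash with any existing definition.
def _unique_rows_sketch(a):
    """Return the unique rows of a 2D array (sketch, not the original helper)."""
    import numpy as np
    # np.unique with axis=0 deduplicates whole rows (requires NumPy >= 1.13).
    return np.unique(a, axis=0)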
def process_batch(zipfile, corpus_dir, batch, total, counter=0):
    # Note: `overwrite` and `stdout` are referenced as free variables and are
    # expected to be available from the enclosing scope.
    for docname in batch:
        counter += 1
        if not os.path.exists(corpus_dir):
            os.makedirs(corpus_dir)
        filename = os.path.join(corpus_dir, docname + '.txt')
        if os.path.isfile(filename) and not overwrite:
            if stdout is not None:
                stdout.write("%s already exists (%i/%i)\n" % (docname, counter, total))
        else:
            content = zipfile.read(docname)
            if stdout is not None:
                stdout.write("processing %s (%i/%i)\n" % (docname, counter, total))
            document = ArthurDocument(content, name=docname)
            textboxes = __extract_textboxes(document)
            texts = []
            for idx, textbox in enumerate(textboxes):
                remove = __find_duplicates(textbox)
                ctextbox = np.delete(textbox, remove, axis=0)
                texts.append(document.get_text(ctextbox))

            if len(texts) > 0:
                if not os.path.isdir(corpus_dir):
                    os.mkdir(corpus_dir)
                with open(filename, 'w') as fout:
                    # Write each textbox's text on its own line.
                    for text in texts:
                        fout.write(text + '\n')
            else:
                if stdout is not None:
                    stdout.write(" empty text! moving on...\n")
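# Hypothetical usage sketch (not part of the original module): one way to drive
# process_batch over a zip archive of documents. The archive path, batch size,
# and the assumption that `overwrite` and `stdout` are defined in the enclosing
# scope are all illustrative assumptions.
def _demo_process_zip(zip_path, corpus_dir, batch_size=50):
    import zipfile as zipfile_module
    with zipfile_module.ZipFile(zip_path) as zf:
        names = zf.namelist()
        total = len(names)
        for start in range(0, total, batch_size):
            batch = names[start:start + batch_size]
            # counter=start keeps the "(i/total)" progress numbering continuous
            # across batches.
            process_batch(zf, corpus_dir, batch, total, counter=start)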
def __find_duplicates(features):
    """Finds duplicates of a set of features.

    Example of usage:

    >>> pdf_path = os.path.join(base_path, 'test', 'test.pdf')
    >>> f = open(pdf_path, 'rb')
    >>> document = ArthurDocument(f.read(), doctype='pdf')
    >>> textboxes = __extract_textboxes(document)
    >>> print(document.get_text(textboxes[11]))
    Property TypeProperty Type Property TypeProperty Type Single Family
    >>> remove_indexes = __find_duplicates(textboxes[11])
    >>> cfeatures = np.delete(textboxes[11], remove_indexes, axis=0)
    >>> print(document.get_text(cfeatures))
    Property Type Single Family

    Args:
        features(np.array): List of features to find duplicates of.

    Returns:
        list: Indexes of features to remove (near-duplicate and image elements).
    """
    fxid = ArthurDocument.get_feature_id('x')
    fyid = ArthurDocument.get_feature_id('y')
    positions = features[:, [fxid, fyid]]
    tree = cKDTree(positions)

    # Removes duplicate elements that are close together.
    radius = 0.4
    neighbors = tree.query_ball_point(positions, radius)
    neighbors = np.unique(neighbors)
    # This returns a numpy array like:
    # [[0, 13, 26, 39] [1, 14, 27, 40] [5, 31, 44, 18] [11, 24, 37, 50]
    #  [16, 29, 42, 3] [17, 30, 43, 4] [21, 8, 34, 47] [22, 35, 48, 9]
    #  [32, 45, 19, 6] [36, 23, 10, 49] [38, 12, 25, 51] [41, 28, 2, 15]
    #  [46, 33, 7, 20] [52] [53] [54] [55] [56] [57] [58] [59] [60] [61] [62]
    #  [63] [64]]
    #
    # from which we then remove duplicates, e.g. indexes 13, 26, 39, 14, 27, etc.
    removed = []
    for n in neighbors:
        removed.extend(np.sort(n)[1:])

    # Removes image elements.
    removed.extend(np.where(
        features[:, ArthurDocument.get_feature_id('img_width')] != -1
    )[0].tolist())
    return removed
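# Illustration only: the near-duplicate removal above hinges on
# cKDTree.query_ball_point returning, for every query point, the indices of all
# points within `radius`. The toy coordinates below are made up.
def _demo_neighbor_groups():
    import numpy as np
    from scipy.spatial import cKDTree
    positions = np.array([[0.0, 0.0], [0.1, 0.0], [5.0, 5.0]])
    tree = cKDTree(positions)
    groups = tree.query_ball_point(positions, 0.4)
    # Each entry lists the indices within radius 0.4 of that point: points 0 and
    # 1 are mutual neighbors, so __find_duplicates would keep the lowest index
    # and mark the other for removal; point 2 only neighbors itself.
    return groups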
def extract_expressions(self, document, features=None):
    """Returns expressions from given features and multi-word expressions.

    In addition to passing a document into this method, MWEs or Multi-Word
    Expressions can be supplied via the clusterer's ``mwes`` constructor
    argument to treat some multi-word phrases as single expressions. Example
    value: ``['property type', 'single family']``. With that list, both
    "property type" and "single family" will each be treated as a single
    expression.

    >>> from document import ArthurDocument
    >>> pdf_path = base_path + '/test/test.pdf'
    >>> with open(pdf_path, 'rb') as f:
    ...     document = ArthurDocument(f.read())
    >>> features = document.get_features()[730:816,:]
    >>> print(document.get_text(features)) # doctest:+ELLIPSIS
    VICTORIA'S CROWN JEWEL OF WATERFRONT ESTATES. Nestled on a quiet cove in the exclusive

    A multi-word expression should be detected:

    >>> clusterer = DumbClusterer(mwes=['crown jewel', 'waterfront estates'])
    >>> expressions = clusterer.extract_expressions(document, features)
    >>> print(expressions[2]['text'])
    CROWN JEWEL

    x position should equal x of "C" from "CROWN JEWEL":

    >>> expressions[2]['x'] == features[11, ArthurDocument.get_feature_id('x')]
    True

    and width should equal the width of "CROWN JEWEL":

    >>> expr_width = expressions[2]['x1'] - expressions[2]['x']
    >>> ftr_width = features[21, ArthurDocument.get_feature_id('x1')] - features[11, ArthurDocument.get_feature_id('x')]
    >>> expr_width == ftr_width
    True

    Args:
        document(ArthurDocument): Document to extract data fields from.
        features(list): List of features containing data fields to extract. If
            not given, use all document features.

    Returns:
        np.array: An array of data_fields.
    """
    mwes = self.mwes
    if features is None:
        features = document.get_features()
    text = document.get_text(features)
    for idx, mwe in enumerate(mwes):
        if isinstance(mwe, str):
            mwes[idx] = word_tokenize(mwe.lower())
        elif hasattr(mwe, '__iter__'):
            mwes[idx] = [x.lower() for x in mwe]
    tokenizer = MWETokenizer(mwes, separator=' ')
    tokenized = tokenizer.tokenize(word_tokenize(text.lower()))

    expressions = []
    pos = 0
    for token in tokenized:
        # token could be "deez nutz" while the text contains multiple spaces,
        # e.g. "deez  nutz", so we need to split the token and find the
        # positions of its first and last characters separately.
        words = token.split()
        start_pos = text.lower().index(words[0], pos)
        for word in words:
            ipos = text.lower().index(word, pos)
            end_pos = ipos + len(word)
            pos = end_pos
        min_x = 0
        max_x = 0
        min_y = 0
        max_y = 0
        page = 0
        if len(features[start_pos:end_pos, :]) > 0:
            min_x = np.amin(features[start_pos:end_pos, :], axis=0)[ArthurDocument.get_feature_id('x')]
            max_x = np.amax(features[start_pos:end_pos, :], axis=0)[ArthurDocument.get_feature_id('x1')]
            min_y = np.amin(features[start_pos:end_pos, :], axis=0)[ArthurDocument.get_feature_id('y')]
            max_y = np.amax(features[start_pos:end_pos, :], axis=0)[ArthurDocument.get_feature_id('y1')]
            page = features[start_pos, ArthurDocument.get_feature_id('page')]

        expressions.append({
            'text': text[start_pos:end_pos],
            'x': min_x,
            'x1': max_x,
            'y': min_y,
            'y1': max_y,
            'page': page,
        })
    return expressions
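# Illustration only: how nltk's MWETokenizer, as used above, merges configured
# multi-word expressions into single tokens. The sample sentence and MWE list
# are assumptions for demonstration.
def _demo_mwe_tokenizer():
    from nltk.tokenize import MWETokenizer, word_tokenize
    tokenizer = MWETokenizer([['crown', 'jewel']], separator=' ')
    tokens = tokenizer.tokenize(word_tokenize("the crown jewel of waterfront estates"))
    # tokens now contain 'crown jewel' as one element, which is why
    # extract_expressions splits each token back into words when locating its
    # character span in the original text.
    return tokens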