def __extract_textboxes(document):
    """Extract textboxes from document.

    It is kept here instead of document since ArthurDocument does not need to
    know the concept of a textbox. In ArthurDocument a textbox is just a set of
    features that happen to have the same textbox_id. Different document types
    may have different configurations and approaches to this.

    Args:
        document(ArthurDocument): ArthurDocument instance textboxes will be
            extracted from.

    Returns:
        list: List of textboxes i.e. grouped features from document.
    """
    features = document.get_features()
    page_feature_id = ArthurDocument.get_feature_id('page')
    textbox_feature_id = ArthurDocument.get_feature_id('textbox_id')
    page_textbox_pairs = features[:, [page_feature_id, textbox_feature_id]]
    unique_page_textbox_pairs = unique_rows(page_textbox_pairs)

    textboxes = []
    for page, textbox_id in unique_page_textbox_pairs:
        textbox = features[np.where(
            (features[:, page_feature_id] == page) *
            (features[:, textbox_feature_id] == textbox_id)
        )]
        textboxes.append(textbox)
    return textboxes
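# __extract_textboxes above relies on a unique_rows helper that is assumed to be
# defined elsewhere in this module. If it is missing, a minimal sketch could look
# like the function below; the name _unique_rows_sketch is hypothetical and kept
# distinct so it does not clash with any existing definition.
def _unique_rows_sketch(a):
    """Return the unique rows of a 2D array (sketch, not the original helper)."""
    import numpy as np
    # np.unique with axis=0 deduplicates whole rows (requires NumPy >= 1.13).
    return np.unique(a, axis=0)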
def process_batch(zipfile, corpus_dir, batch, total, counter=0):
    # Note: `overwrite` and `stdout` are referenced as free variables and are
    # expected to be available from the enclosing scope.
    for docname in batch:
        counter += 1
        if not os.path.exists(corpus_dir):
            os.makedirs(corpus_dir)
        filename = os.path.join(corpus_dir, docname + '.txt')
        if os.path.isfile(filename) and not overwrite:
            if stdout is not None:
                stdout.write("%s already exists (%i/%i)\n" % (docname, counter, total))
        else:
            content = zipfile.read(docname)
            if stdout is not None:
                stdout.write("processing %s (%i/%i)\n" % (docname, counter, total))
            document = ArthurDocument(content, name=docname)
            textboxes = __extract_textboxes(document)
            texts = []
            for idx, textbox in enumerate(textboxes):
                remove = __find_duplicates(textbox)
                ctextbox = np.delete(textbox, remove, axis=0)
                texts.append(document.get_text(ctextbox))

            if len(texts) > 0:
                if not os.path.isdir(corpus_dir):
                    os.mkdir(corpus_dir)
                with open(filename, 'w') as fout:
                    # Write each textbox's text on its own line.
                    for text in texts:
                        fout.write(text + '\n')
            else:
                if stdout is not None:
                    stdout.write(" empty text! moving on...\n")
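# Hypothetical usage sketch (not part of the original module): one way to drive
# process_batch over a zip archive of documents. The archive path, batch size,
# and the assumption that `overwrite` and `stdout` are defined in the enclosing
# scope are all illustrative assumptions.
def _demo_process_zip(zip_path, corpus_dir, batch_size=50):
    import zipfile as zipfile_module
    with zipfile_module.ZipFile(zip_path) as zf:
        names = zf.namelist()
        total = len(names)
        for start in range(0, total, batch_size):
            batch = names[start:start + batch_size]
            # counter=start keeps the "(i/total)" progress numbering continuous
            # across batches.
            process_batch(zf, corpus_dir, batch, total, counter=start)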
def __find_duplicates(features):
    """Finds duplicates of a set of features.

    Example of usage:

    >>> pdf_path = os.path.join(base_path, 'test', 'test.pdf')
    >>> f = open(pdf_path, 'rb')
    >>> document = ArthurDocument(f.read(), doctype='pdf')
    >>> textboxes = __extract_textboxes(document)
    >>> print(document.get_text(textboxes[11]))
    Property TypeProperty Type Property TypeProperty Type Single Family
    >>> remove_indexes = __find_duplicates(textboxes[11])
    >>> cfeatures = np.delete(textboxes[11], remove_indexes, axis=0)
    >>> print(document.get_text(cfeatures))
    Property Type Single Family

    Args:
        features(np.array): List of features to find duplicates of.

    Returns:
        list: Indexes of features to remove (near-duplicate and image elements).
    """
    fxid = ArthurDocument.get_feature_id('x')
    fyid = ArthurDocument.get_feature_id('y')
    positions = features[:, [fxid, fyid]]
    tree = cKDTree(positions)

    # Removes duplicate elements that are close together.
    radius = 0.4
    neighbors = tree.query_ball_point(positions, radius)
    neighbors = np.unique(neighbors)
    # This returns a numpy array like:
    # [[0, 13, 26, 39] [1, 14, 27, 40] [5, 31, 44, 18] [11, 24, 37, 50]
    #  [16, 29, 42, 3] [17, 30, 43, 4] [21, 8, 34, 47] [22, 35, 48, 9]
    #  [32, 45, 19, 6] [36, 23, 10, 49] [38, 12, 25, 51] [41, 28, 2, 15]
    #  [46, 33, 7, 20] [52] [53] [54] [55] [56] [57] [58] [59] [60] [61] [62]
    #  [63] [64]]
    #
    # from which we then remove duplicates, e.g. indexes 13, 26, 39, 14, 27, etc.
    removed = []
    for n in neighbors:
        removed.extend(np.sort(n)[1:])

    # Removes image elements.
    removed.extend(np.where(
        features[:, ArthurDocument.get_feature_id('img_width')] != -1
    )[0].tolist())
    return removed
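# Illustration only: the near-duplicate removal above hinges on
# cKDTree.query_ball_point returning, for every query point, the indices of all
# points within `radius`. The toy coordinates below are made up.
def _demo_neighbor_groups():
    import numpy as np
    from scipy.spatial import cKDTree
    positions = np.array([[0.0, 0.0], [0.1, 0.0], [5.0, 5.0]])
    tree = cKDTree(positions)
    groups = tree.query_ball_point(positions, 0.4)
    # Each entry lists the indices within radius 0.4 of that point: points 0 and
    # 1 are mutual neighbors, so __find_duplicates would keep the lowest index
    # and mark the other for removal; point 2 only neighbors itself.
    return groups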
def extract_expressions(self, document, features=None):
    """Returns expressions from given features and multi-word expressions.

    In addition to passing a document into this method, MWEs or Multi-Word
    Expressions can be supplied via the clusterer's ``mwes`` constructor
    argument to treat some multi-word phrases as single expressions. Example
    value: ``['property type', 'single family']``. With that list, both
    "property type" and "single family" will each be treated as a single
    expression.

    >>> from document import ArthurDocument
    >>> pdf_path = base_path + '/test/test.pdf'
    >>> with open(pdf_path, 'rb') as f:
    ...     document = ArthurDocument(f.read())
    >>> features = document.get_features()[730:816,:]
    >>> print(document.get_text(features)) # doctest:+ELLIPSIS
    VICTORIA'S CROWN JEWEL OF WATERFRONT ESTATES. Nestled on a quiet cove in the exclusive

    A multi-word expression should be detected:

    >>> clusterer = DumbClusterer(mwes=['crown jewel', 'waterfront estates'])
    >>> expressions = clusterer.extract_expressions(document, features)
    >>> print(expressions[2]['text'])
    CROWN JEWEL

    x position should equal x of "C" from "CROWN JEWEL":

    >>> expressions[2]['x'] == features[11, ArthurDocument.get_feature_id('x')]
    True

    and width should equal the width of "CROWN JEWEL":

    >>> expr_width = expressions[2]['x1'] - expressions[2]['x']
    >>> ftr_width = features[21, ArthurDocument.get_feature_id('x1')] - features[11, ArthurDocument.get_feature_id('x')]
    >>> expr_width == ftr_width
    True

    Args:
        document(ArthurDocument): Document to extract data fields from.
        features(list): List of features containing data fields to extract. If
            not given, use all document features.

    Returns:
        np.array: An array of data_fields.
    """
    mwes = self.mwes
    if features is None:
        features = document.get_features()
    text = document.get_text(features)
    for idx, mwe in enumerate(mwes):
        if isinstance(mwe, str):
            mwes[idx] = word_tokenize(mwe.lower())
        elif hasattr(mwe, '__iter__'):
            mwes[idx] = [x.lower() for x in mwe]
    tokenizer = MWETokenizer(mwes, separator=' ')
    tokenized = tokenizer.tokenize(word_tokenize(text.lower()))

    expressions = []
    pos = 0
    for token in tokenized:
        # token could be "deez nutz" while the text contains multiple spaces,
        # e.g. "deez  nutz", so we need to split the token and find the
        # positions of its first and last characters separately.
        words = token.split()
        start_pos = text.lower().index(words[0], pos)
        for word in words:
            ipos = text.lower().index(word, pos)
            end_pos = ipos + len(word)
            pos = end_pos
        min_x = 0
        max_x = 0
        min_y = 0
        max_y = 0
        page = 0
        if len(features[start_pos:end_pos, :]) > 0:
            min_x = np.amin(features[start_pos:end_pos, :], axis=0)[ArthurDocument.get_feature_id('x')]
            max_x = np.amax(features[start_pos:end_pos, :], axis=0)[ArthurDocument.get_feature_id('x1')]
            min_y = np.amin(features[start_pos:end_pos, :], axis=0)[ArthurDocument.get_feature_id('y')]
            max_y = np.amax(features[start_pos:end_pos, :], axis=0)[ArthurDocument.get_feature_id('y1')]
            page = features[start_pos, ArthurDocument.get_feature_id('page')]

        expressions.append({
            'text': text[start_pos:end_pos],
            'x': min_x,
            'x1': max_x,
            'y': min_y,
            'y1': max_y,
            'page': page,
        })
    return expressions
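# Illustration only: how nltk's MWETokenizer, as used above, merges configured
# multi-word expressions into single tokens. The sample sentence and MWE list
# are assumptions for demonstration.
def _demo_mwe_tokenizer():
    from nltk.tokenize import MWETokenizer, word_tokenize
    tokenizer = MWETokenizer([['crown', 'jewel']], separator=' ')
    tokens = tokenizer.tokenize(word_tokenize("the crown jewel of waterfront estates"))
    # tokens now contain 'crown jewel' as one element, which is why
    # extract_expressions splits each token back into words when locating its
    # character span in the original text.
    return tokens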