Python HierarchyParser примеры использования

Язык программирования: Python

Пространство имен/Пакет: pdfstructure.hierarchy.parser

Класс/Тип: HierarchyParser

Примеров на hotexamples.com: 8

Python HierarchyParser - 8 примеров найдено. Это лучшие примеры Python кода для pdfstructure.hierarchy.parser.HierarchyParser, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

HierarchyParser(8)

parse_pdf(4)

Основные методы

HierarchyParser (8)

parse_pdf (4)

Пример #1

Показать файл

Файл: test_hierarchy.py Проект: trungthanhnguyen0502/pdfstructure

class TestHierarchy(TestCase):
    doc_with_columns = str(Path("resources/IE00BM67HT60-ATB-FS-DE-2020-2-28.pdf").absolute())
    straight_forward_doc = str(Path("resources/interview_cheatsheet.pdf").absolute())
    nested_doc_bold_title = str(Path("resources/5648.pdf").absolute())

    same_style_doc = str(Path("resources/SameStyleOnly.pdf").absolute())
    same_size_bold_header = str(Path("resources/SameSize_BoldTitle.pdf").absolute())
    same_size_enum_header = str(Path("resources/SameSize_EnumeratedTitle.pdf").absolute())

    parser = HierarchyParser()

    def test_no_hierarchy_detected(self):
        pdf = self.parser.parse_pdf(FileSource(self.same_style_doc))
        self.assertEqual(4, len(pdf.elements[0].children))

        self.assertIsInstance(pdf.elements[0], DanglingTextSection)

    def test_hierarchy_bold_title(self):
        pdf = self.parser.parse_pdf(FileSource(self.same_size_bold_header))
        self.assertEqual(2, len(pdf.elements))
        self.assertEqual("Lorem Ipsum.", pdf.elements[0].heading.text)
        self.assertEqual("Appendix", pdf.elements[1].heading.text)

    def test_hierarchy_pdf_parser(self):
        path = self.straight_forward_doc
        source = FileSource(path)
        pdf = self.parser.parse_pdf(source)
        self.assertEqual(9, len(pdf.elements))
        self.assertEqual("Data Structure Basics", pdf.elements[5].heading.text)
        self.assertEqual("Basic Types of Algorithms", pdf.elements[8].heading.text)
        self.assertEqual(4, pdf.elements[8].heading.page)

    def test_grouping(self):
        test_doc = self.nested_doc_bold_title
        doc = self.parser.parse_pdf(FileSource(test_doc))
        self.assertEqual(1, doc.elements.__len__())
        self.assertEqual(13, doc.elements[0].children.__len__())
        self.assertEqual("Outdoorpädagogik", doc.elements[0].heading.text)
        self.assertEqual("„Fange den Stock“", doc.elements[0].children[0].heading.text)

    def test_grouping_bold_key_and_size(self):
        doc = self.parser.parse_pdf(FileSource(self.straight_forward_doc))
        self.assertEqual(len(doc.elements), 9)

    def test_grouping_bold_columns(self):
        doc = self.parser.parse_pdf(FileSource(self.doc_with_columns))
        self.assertEqual("Xtrackers MSCI World Information Technology UCITS ETF 1C", doc.elements[1].heading.text)

Пример #2

Показать файл

Файл: test_custom_use_cases.py Проект: trungthanhnguyen0502/pdfstructure

class TestExamples(TestCase):
    parser = HierarchyParser()

    def test_count_paragraph_words(self):
        test_file = str(Path("resources/lorem.pdf"))
        document = self.parser.parse_pdf(FileSource(file_path=test_file))
        assert_token_order = [50, 100, 150]
        for level, title, content in traverse_inorder_sections_with_content(
                document):
            prefix = txtPrinter.get_title_prefix(level)
            tokens = content.split()
            self.assertEqual(assert_token_order.pop(0), len(tokens))
            print("{}{};\twords: {}".format(prefix, title, len(tokens)))

    def test_load_book(self):
        book_path = Path("resources/interview_cheatsheet.pdf")
        document = self.parser.parse_pdf(FileSource(file_path=str(book_path)))

        for level, title, content in traverse_inorder_sections_with_content(
                document):
            prefix = txtPrinter.get_title_prefix(level)
            print("{}{};\twords: {}".format(prefix, title,
                                            len(content.split())))

Пример #3

Показать файл

 def skip_test_pdfstructure(self):
     path = TestHierarchy.straight_forward_doc
     parser = HierarchyParser()
     source = FileSource(path)
     pdf = parser.parse_pdf(source)
     print(PrettyStringPrinter().print(pdf))

Пример #4

Показать файл

    def setUpClass(cls) -> None:
        parser = HierarchyParser()
        cls.test_doc_same_style = parser.parse_pdf(
            FileSource(cls.same_style_doc))

        cls.test_doc = parser.parse_pdf(FileSource(cls.straight_forward_doc))

Пример #5

Показать файл

def app():
    def text_on_page(dict_var, id_json, list_res, page):
        if type(dict_var) is dict:
            for k, v in dict_var.items():
                if k == id_json and v == page:
                    if v > page: return list_res
                    list_res.append(dict_var["text"])
                elif isinstance(v, dict):
                    text_on_page(v, id_json, list_res, page)
                elif isinstance(v, list):
                    for item in v:
                        text_on_page(item, id_json, list_res, page)
        return list_res

    def get_page(data, page):
        lines = []
        for chunk in data["elements"]:
            lines.extend(text_on_page(chunk, "page", [], page))
        return lines

    def get_histogram(docs, top=20):
        tokens = []
        for s in docs.values():
            tokens += s.split()
        uniques, counts = np.unique(tokens, return_counts=True)
        sorted_inds = np.argsort(counts)
        uniques_sorted = uniques[sorted_inds[-top:]][::-1]
        counts_sorted = counts[sorted_inds[-top:]][::-1]
        return (uniques_sorted, counts_sorted)

    file = st.file_uploader("test", type="pdf", key=2)
    start = 1
    max_val = 1000
    end = 25
    slider_val = st.slider('Page range:',
                           min_value=start,
                           max_value=max_val,
                           value=(1, end),
                           step=1)

    #probably need to put '@st.cache(suppress_st_warning=True)' above a function where the 'with open ...' code below is the function.

    if file is not None:
        file_details = {
            "FileName": file.name,
            "FileType": file.type,
            "FileSize": str(file.size / 1000000) + 'mb'
        }
        data_load_state = st.text('Loading data... Thank you for waiting 😊')

        st.write(file_details)
        parser = HierarchyParser()
        source = FileSource(file, page_numbers=list(range(start, end)))
        document = parser.parse_pdf(source)
        printer = JsonFilePrinter()
        file_path = pathlib.Path('pdf.json')
        printer.print(document, file_path=str(file_path.absolute()))
        with open('pdf.json') as file:
            data = json.load(file)
        pages = {i: ' '.join(get_page(data, i)) for i in range(end)}

        doc_size = 0.25
        (formatted_docs,
         paragraph_page_idx) = preprocessing1.get_formatted_docs(
             pages, doc_size)
        preprocessed_docs = preprocessing1.get_preprocessed_docs(
            formatted_docs)
        data_load_state.text("Done!")
        st.subheader('First page in the selected range')
        st.write({"page 1": pages[0]})
        st.subheader('Page range word distribution')
        # (uniques, counts) = get_histogram(preprocessed_docs)
        # fig = px.bar(x = uniques, y = counts)
        # st.plotly_chart(fig)
        (uniques, counts) = get_histogram(preprocessed_docs)
        fig, ax = plt.subplots(figsize=(10, 10))
        ax.bar(uniques, counts)
        plt.setp(ax.get_xticklabels(), rotation='vertical')
        st.pyplot(fig)

        tfidf_vectorizer = cosine1.get_tfidf_vectorizer()
        tfidf_matrix = tfidf_vectorizer.fit_transform(
            list(preprocessed_docs.values())).toarray()
        query = st.text_input("Search:")
        if query:
            q = cosine1.get_query_vector(query, tfidf_vectorizer)
            cos_sims = cosine1.get_cosine_sim(q, tfidf_matrix)
            (rankings, scores) = cosine1.get_rankings(cos_sims)

            idx = rankings[0]
            score = scores[0]
            page_num = paragraph_page_idx[idx] + 1
            doc = formatted_docs[idx]
            if score > 0.0:
                st.subheader("Similarity: " + str(score))
                st.write({"page " + str(page_num): str(doc)})

            else:
                st.subheader("No matches found.")

    st.subheader('made with ❤️ by:')
    st.markdown(
        '[Vince Bartle](https://bartle.io) (vb344) | [Dubem Ogwulumba](https://www.linkedin.com/in/dubem-ogwulumba/) (dao52) | [Erik Ossner](https://erikossner.com/) (eco9) | [Qiyu Yang](https://github.com/qiyuyang16/) (qy35) | [Youhan Yuan](https://github.com/nukenukenukelol) (yy435)'
    )

Пример #6

Показать файл

Файл: v3_app.py Проект: qiyuyang16/CS4300_microGoogle

def app():
    global email_logged_in
    choice = st.sidebar.selectbox("Menu", ["Login", "Sign Up"])
    if email_logged_in == "":
        if choice == "Login":
            email = st.sidebar.text_input("Email")
            password = st.sidebar.text_input("Password", type='password')
            if st.sidebar.button("Login"):
                # Match from fire base
                check_email = db.collection("users").where(
                    u'email', u'==', email).stream()
                user_dict = dict()
                for user in check_email:
                    user_dict = user.to_dict()
                    break
                if len(user_dict) > 0:
                    salt = user_dict['salt']
                    key = user_dict['key']
                    new_key = hashlib.pbkdf2_hmac('sha256',
                                                  password.encode('utf-8'),
                                                  salt, 100000)
                    if key == new_key:
                        st.sidebar.success("Logged in as {}".format(email))
                        email_logged_in = email
                        if st.sidebar.button("Logout"):
                            email_logged_in = ""
                            st.sidebar.success(
                                "Logged out! Use the sidebar to sign back in")

                    else:
                        st.sidebar.warning("Incorrect Password!")
                else:
                    st.sidebar.warning("No account with that email exists")
        else:
            new_email = st.sidebar.text_input("New Email")
            new_pass = st.sidebar.text_input("New Password", type='password')
            new_pass_2 = st.sidebar.text_input("Verify Password",
                                               type='password')
            if st.sidebar.button("Sign Up"):
                check_email = db.collection("users").where(
                    u'email', u'==', new_email).stream()
                good_email = True
                for e in check_email:
                    st.sidebar.warning(
                        "An account exists with this email already!")
                    good_email = False
                    break
                if new_pass == new_pass_2 and good_email:
                    st.sidebar.success(
                        "Successfully created account! Login from the sidebar")
                    #Write to firebase
                    salt = os.urandom(32)  # A new salt for this user
                    key = hashlib.pbkdf2_hmac('sha256',
                                              new_pass.encode('utf-8'), salt,
                                              100000)
                    login_ref = db.collection("users").document()
                    login_ref.set({
                        "email": new_email,
                        "salt": salt,
                        "key": key
                    })
                elif good_email:
                    st.sidebar.warning("Passwords do not match!")
    else:
        st.sidebar.warning("You are already logged in!")
        if st.sidebar.button("Logout"):
            email_logged_in = ""
            st.sidebar.success("Logged out! Use the sidebar to sign back in")

    def text_on_page(dict_var, id_json, list_res, page):
        if type(dict_var) is dict:
            for k, v in dict_var.items():
                if k == id_json and v == page:
                    if v > page: return list_res
                    list_res.append(dict_var["text"])
                elif isinstance(v, dict):
                    text_on_page(v, id_json, list_res, page)
                elif isinstance(v, list):
                    for item in v:
                        text_on_page(item, id_json, list_res, page)
        return list_res

    def get_page(data, page):
        lines = []
        for chunk in data["elements"]:
            lines.extend(text_on_page(chunk, "page", [], page))
        return lines

    def get_histogram(docs, top=20):
        tokens = []
        for s in docs.values():
            tokens += s.split()
        uniques, counts = np.unique(tokens, return_counts=True)
        sorted_inds = np.argsort(counts)
        uniques_sorted = uniques[sorted_inds[-top:]][::-1]
        counts_sorted = counts[sorted_inds[-top:]][::-1]
        return (uniques_sorted, counts_sorted)

    # def st_display_pdf(pdf_file):
    #     base64_pdf = base64.b64encode(pdf_file.read()).decode('utf-8')
    #     st.write(base64_pdf)
    #     pdf_display = f'<embed src="data:application/pdf;base64,{base64_pdf}" width="700" height="1000" type="application/pdf">'
    #     st.markdown(pdf_display, unsafe_allow_html=True)

    # Read from fire base if logged in
    counter_queries = 1
    if not email_logged_in == "":

        queries_collection_user = db.collection("queries")
        user_queries = queries_collection_user.where(
            u'email', u'==', email_logged_in).order_by(
                u'timeStamp',
                direction=firestore.Query.DESCENDING).limit(5).stream()
        with st.beta_expander("Your Most Recent Queries:"):
            for doc in user_queries:
                doc_dict = doc.to_dict()
                st.markdown("<strong>Query " + str(counter_queries) +
                            "</strong>: \n",
                            unsafe_allow_html=True)
                st.markdown("<u>Query</u>: " + doc_dict["query"] + "\n",
                            unsafe_allow_html=True)
                st.markdown("<u>Top Match</u>: " + doc_dict["topMatch"] + "\n",
                            unsafe_allow_html=True)
                if doc_dict["upvote"] < 0:
                    st.markdown(
                        "<small>So far " + str(abs(doc_dict["upvote"])) +
                        "people don't think it's a good match.</small>",
                        unsafe_allow_html=True)
                else:
                    st.markdown("<small>So far " + str(doc_dict["upvote"]) +
                                " people think it's a good match.</small>",
                                unsafe_allow_html=True)
                st.markdown("<hr>", unsafe_allow_html=True)
                counter_queries += 1
            if counter_queries == 1:
                st.write("No queries...yet!")
    file = st.file_uploader("Upload:", type="pdf", key=2)
    file_length = 100
    if file is not None:
        with pdfplumber.open(file) as raw:
            file_length = len(raw.pages)
    st.write(
        '🙊 If the app runs slowly, please consider reducing the page range!')
    st.write(
        '🌴 Also consider collapsing cells as you go for ease of navigating features.'
    )
    if file_length > 20 and file_length < 100:
        slider_val = st.slider('Page range:',
                               min_value=1,
                               max_value=file_length,
                               value=(1, int(file_length * .1)),
                               step=1)
    if file_length >= 100:
        slider_val = st.slider('Page range:',
                               min_value=1,
                               max_value=file_length,
                               value=(1, 25),
                               step=1)
    if file_length <= 20:
        slider_val = st.slider('Page range:',
                               min_value=1,
                               max_value=file_length,
                               value=(1, file_length),
                               step=1)

    if slider_val[1] - slider_val[0] > 50:
        st.write('Range greater than 50 pages, ‼️ this may run slowly.')
        st.subheader('')
    if file is not None:
        file_details = {
            "FileName": file.name,
            "FileType": file.type,
            "FileSize": str(file.size / 1000000) + 'mb'
        }
        data_load_state = st.text('Loading data... Thank you for waiting 😊. ')

        st.write(file_details)
        parser = HierarchyParser()
        source = FileSource(file,
                            page_numbers=list(
                                range(slider_val[0] - 1, slider_val[1])))

        @st.cache(suppress_st_warning=True)
        def fetch_pages(source):
            document = parser.parse_pdf(source)
            printer = JsonFilePrinter()
            file_path = pathlib.Path('pdf.json')
            printer.print(document, file_path=str(file_path.absolute()))

            with open('pdf.json') as json_file:
                data = json.load(json_file)
            json_file.close()
            pages = {i + 1: get_page(data, i) for i in range(0, slider_val[1])}

            return pages, file_path

        pages, _ = fetch_pages(source)
        paragraphs = [i for j in [i[1] for i in pages.items()] for i in j]
        windowed_paragraphs = [
            i for i in list(enumerate(paragraphs))
            if count_words(i[1]) > word_window
        ]
        text_blob = ' '.join([i[1] for i in windowed_paragraphs])
        word_list = text_blob.split()

        (formatted_docs,
         paragraph_page_idx) = preprocessing3.get_formatted_docs(pages)
        preprocessed_docs = preprocessing3.get_preprocessed_docs(
            formatted_docs)
        data_load_state.text(
            "Done! 🎉 If you receive an error messages from the server it will likely not impede app functionality."
        )
        with st.beta_expander('View word distribution.', expanded=True):
            radio_1 = st.radio("Select 1-word or 2-word distribution.",
                               ("1-word", "2-word", "1-word dispersion"))
            if radio_1 == "1-word":
                (uniques, counts) = get_histogram(preprocessed_docs)
                fig = px.bar(x=uniques, y=counts)
                st.plotly_chart(fig)
            if radio_1 == "2-word":
                bigrams = zip(word_list, word_list[1:])
                counts = Counter(bigrams)
                uniques = [
                    str(i)
                    for i in list(np.vstack(counts.most_common())[:, 0][:20])
                ]
                counts = list(np.vstack(counts.most_common())[:, 1][:20])
                fig = px.bar(x=uniques, y=counts)
                st.plotly_chart(fig)
            if radio_1 == "1-word dispersion":
                dispersion = st.text_input("Insert words separated by spaces.")
                dispersion_query = (''.join(dispersion)).split()
                wt_words = text_blob.split()
                if dispersion_query:
                    points = [(x, y) for x in range(len(wt_words))
                              for y in range(len(dispersion_query))
                              if wt_words[x] == dispersion_query[y]]
                    if points:
                        x, y = zip(*points)
                    else:
                        x = y = ()
                    import plotly.graph_objects as go
                    fig = go.Figure(data=go.Scatter(
                        x=x,
                        y=y,
                        mode="markers",
                    ))
                    if len(y) == 0:
                        st.write('No results! 😔 Please try other words.')
                    else:
                        fig.update_layout(
                            title="Where these words land in the document.",
                            yaxis=dict(tickmode='array',
                                       tickvals=list(range(max(y) + 1)),
                                       ticktext=dispersion_query))
                    st.plotly_chart(fig)
                # fig.show()

        st.subheader('First paragraphs on page ' + str(slider_val[0]) + ":")
        if len(pages[slider_val[0]]) >= 3:
            for i in range(3):
                st.markdown("<u>Paragraph " + str(i + 1) + "</u>: " +
                            pages[slider_val[0]][i],
                            unsafe_allow_html=True)
        else:
            st.markdown("Page " + str(slider_val[0]) + " is empty.")

        tfidf_vectorizer = cosine3.get_tfidf_vectorizer()
        tfidf_matrix = tfidf_vectorizer.fit_transform(
            list(preprocessed_docs.values())).toarray()
        (_, num_terms) = tfidf_matrix.shape
        query1 = st.text_input("Cosine-SVD Search")
        if query1:
            q = cosine3.get_query_vector(query1, tfidf_vectorizer)
            if num_terms > 1000:
                (doc_mat, weight_mat, term_mat) = cosine3.get_svd(tfidf_matrix)
                cos_sims = cosine3.get_cosine_sim_svd(q, doc_mat, weight_mat,
                                                      term_mat)
            else:
                cos_sims = cosine3.get_cosine_sim(q, tfidf_matrix)
            (rankings, scores) = cosine3.get_rankings(cos_sims)

            ranking_lengths = []
            for i in range(len(rankings)):
                idx = rankings[i]
                score = scores[i]
                page_num = paragraph_page_idx[idx]
                doc = formatted_docs[idx]
                curr_len = count_words(doc)
                ranking_lengths.append(curr_len)
            global word_window
            word_window = st.slider("Minimum word count",
                                    min_value=1,
                                    max_value=max(ranking_lengths),
                                    value=10)
            for i in range(len(rankings)):
                # there's probably a more efficient way to do this but these are at most 10 loops so sufficient for now.
                idx = rankings[i]
                score = scores[i]
                page_num = paragraph_page_idx[idx]
                doc = formatted_docs[idx]
                curr_len = count_words(doc)
                if curr_len >= word_window:
                    # st.write(curr_len,word_window)
                    break

            if score > 0.0:
                st.subheader("Similarity: " + str(round(score, 4)) +
                             ", Ranking: " + str(i + 1))
                st.markdown("<u>Match</u>: " + str(doc),
                            unsafe_allow_html=True)
                st.markdown("<u>Page Number</u>: " + str(page_num),
                            unsafe_allow_html=True)

                #columns used to layout the button to ask user to upload the result to db
                uploadCols = st.beta_columns(4)
                #columns used to write thank you message if user click upload
                thankyouCols = st.beta_columns(4)
                if uploadCols[-1].button(
                        "Submit Your Search Result for our Study!"):
                    thankyouCols[-1].write(
                        "Thank you! We can't get better without your support😃")
                    #write match and query to the db
                    doc_ref = db.collection("queries").document()
                    doc_ref.set({
                        "id": doc_ref.get().id,
                        "query": query1,
                        "topMatch": str(doc),
                        "timeStamp": firestore.SERVER_TIMESTAMP,
                        "upvote": 0,
                        "queryType": "Cosine",
                        "email": email_logged_in
                    })
                    cosMultiSub = True

                #columns used to layout explanation of upload button
                explainCols = st.beta_columns(4)
                explainCols[-1].markdown(
                    "<i><small>By clicking the submit button you agree with our <a href=\
                    'https://theuniversityfaculty.cornell.edu/dean/academic-integrity/'>terms of service</a></small></i>",
                    unsafe_allow_html=True)

            else:
                st.subheader("No matches found.")

        with st.beta_expander('Compare with Verbatim Search:'):
            query3 = st.text_input("Search:")
            verbatim_search = lambda query: [
                msg for msg in windowed_paragraphs if query in msg[1]
            ]
            if query3:
                v_result = verbatim_search(query3)
                if len(v_result) == 0:
                    st.write("No matches found.")
                else:
                    st.write("Matches found. 🎉")
                    v_slider = st.slider("View at most this many matches:", 1,
                                         100, 3)
                    display_result = v_result[:v_slider]
                    counter = 0
                    for pageNum, text in display_result:
                        counter += 1
                        st.subheader("Result " + str(counter) + ":")
                        st.markdown("&nbsp")
                        st.markdown("<u>Match</u>: " + str(text),
                                    unsafe_allow_html=True)
                        st.markdown("<u>Page Number</u>: " + str(pageNum),
                                    unsafe_allow_html=True)
                        st.markdown("&nbsp")
                        #columns used to layout the button to ask user to upload the result to db
                        uploadCols = st.beta_columns(4)
                        #columns used to write thank you message if user click upload
                        thankyouCols = st.beta_columns(4)
                        if uploadCols[-1].button(
                                "Submit Your Search Result for our Study",
                                key="Verbatim" + str(counter)):
                            thankyouCols[-1].write(
                                "Thank you! We can't get better without your support😃"
                            )
                            #write match and query to the db
                            doc_ref = db.collection("queries").document()
                            doc_ref.set({
                                "id": doc_ref.get().id,
                                "query": query1,
                                "topMatch": str(doc),
                                "timeStamp": firestore.SERVER_TIMESTAMP,
                                "upvote": 0,
                                "queryType": "Verbatim",
                                "email": email_logged_in
                            })

        with st.beta_expander('Explore Paragraph Similarities.'):
            st.write(
                'Browse and zoom into the similarity heatmap. Generally moderate matches, with a score between .4 and .6 that lie further from the diagonal are the most informative. Request the matching paragraphs below.'
            )
            sim_mat = tfidf_matrix @ tfidf_matrix.T
            fig1 = px.imshow(sim_mat)
            st.plotly_chart(fig1)

            # windowed_paragraphs = [i for i in list(enumerate(paragraphs)) if count_words(i[1])>word_window]

            number_query1 = st.number_input("Select 1st paragraph", 0,
                                            len(paragraphs))
            number_query2 = st.number_input("Select 2nd paragraph", 0,
                                            len(paragraphs))

            st.write(paragraphs[number_query1])
            st.write(paragraphs[number_query2])

        # st.subheader('Paragraph similarity heatmap')

    queries_collection_ref = db.collection("queries")
    query = queries_collection_ref.order_by(
        u'timeStamp', direction=firestore.Query.DESCENDING).limit(5)
    counter = 0

    #helper function to write upvote onto the page
    def writeUpvote(voteCount):
        if voteCount < 0:
            st.markdown("<small>So far " + str(abs(voteCount)) +
                        " people don't think it's a good match.</small>",
                        unsafe_allow_html=True)
        else:
            st.markdown("<small>So far " + str(voteCount) +
                        " people think it's a good match.</small>",
                        unsafe_allow_html=True)

    #helper function to update upvote given doc id and queries collection ref. Return the new upvote
    def updateVotes(queries_collection_ref, id, inc):
        doc_ref = queries_collection_ref.document(id)
        latestUpvote = doc_ref.get().to_dict()["upvote"]
        if inc:
            latestUpvote += 1
        else:
            latestUpvote -= 1

        doc_ref.update({"upvote": latestUpvote})
        return latestUpvote

    with st.beta_expander("Recent Queries We Processed..."):
        for doc in query.stream():
            counter += 1
            doc_dict = doc.to_dict()
            st.markdown("<strong>Query " + str(counter) + "</strong>: \n",
                        unsafe_allow_html=True)
            st.markdown("<u>Query</u>: " + doc_dict["query"] + "\n",
                        unsafe_allow_html=True)
            st.markdown("<u>Top Match</u>: " + doc_dict["topMatch"] + "\n",
                        unsafe_allow_html=True)
            st.markdown("<u>Search Method</u>: " + doc_dict["queryType"] +
                        "\n",
                        unsafe_allow_html=True)

            st.markdown("&nbsp")

            st.markdown(
                "<i><small>Do you think this is a good match?</small></i>",
                unsafe_allow_html=True)
            cols = st.beta_columns(12)
            likeButton = cols[0].button("👍", key="YesButton" + str(counter))
            dislikeButton = cols[1].button("👎", key="NoButton" + str(counter))
            newUpvote = doc_dict["upvote"]
            if likeButton:
                newUpvote = updateVotes(queries_collection_ref, doc_dict["id"],
                                        True)
                writeUpvote(newUpvote)

            elif dislikeButton:
                newUpvote = newUpvote = updateVotes(queries_collection_ref,
                                                    doc_dict["id"], False)
                writeUpvote(newUpvote)
            else:
                writeUpvote(newUpvote)

            if counter != 5:
                st.markdown("<hr>", unsafe_allow_html=True)
    # if file is not None:
    #     # st.write(file_path)
    #     st_display_pdf(file)

    st.subheader('made with ❤️ by:')
    st.markdown(
        '[Vince Bartle](https://bartle.io) (vb344) | [Dubem Ogwulumba](https://www.linkedin.com/in/dubem-ogwulumba/) (dao52) | [Erik Ossner](https://erikossner.com/) (eco9) | [Qiyu Yang](https://github.com/qiyuyang16/) (qy35) | [Youhan Yuan](https://github.com/nukenukenukelol) (yy435)'
    )

Пример #7

Показать файл

 def setUpClass(cls) -> None:
     parser = HierarchyParser()
     cls.testDocument = parser.parse_pdf(
         FileSource(cls.straight_forward_doc))

Пример #8

Показать файл

def app():
    def text_on_page(dict_var, id_json, list_res, page):
        if type(dict_var) is dict:
            for k, v in dict_var.items():
                if k == id_json and v == page:
                    if v > page: return list_res
                    list_res.append(dict_var["text"])
                elif isinstance(v, dict):
                    text_on_page(v, id_json, list_res, page)
                elif isinstance(v, list):
                    for item in v:
                        text_on_page(item, id_json, list_res, page)
        return list_res

    def get_page(data, page):
        lines = []
        for chunk in data["elements"]:
            lines.extend(text_on_page(chunk, "page", [], page))
        return lines

    def get_histogram(docs, top=20):
        tokens = []
        for s in docs.values():
            tokens += s.split()
        uniques, counts = np.unique(tokens, return_counts=True)
        sorted_inds = np.argsort(counts)
        uniques_sorted = uniques[sorted_inds[-top:]][::-1]
        counts_sorted = counts[sorted_inds[-top:]][::-1]
        return (uniques_sorted, counts_sorted)

    file = st.file_uploader("test", type="pdf", key=2)
    start = 1
    max_val = 1000
    end = 5
    slider_val = st.slider('Page range:',
                           min_value=start,
                           max_value=max_val,
                           value=(1, end),
                           step=1)

    if file is not None:
        file_details = {
            "FileName": file.name,
            "FileType": file.type,
            "FileSize": str(file.size / 1000000) + 'mb'
        }
        data_load_state = st.text('Loading data... Thank you for waiting 😊')

        parser = HierarchyParser()
        source = FileSource(file, page_numbers=list(range(start - 1, end)))

        @st.cache(suppress_st_warning=True)
        def fetch_doc(source):
            return parser.parse_pdf(source)

        document = fetch_doc(source)
        printer = JsonFilePrinter()
        file_path = pathlib.Path('pdf.json')
        printer.print(document, file_path=str(file_path.absolute()))

        with open('pdf.json') as json_file:
            data = json.load(json_file)
        json_file.close()
        pages = {
            i: get_page(data, i)
            for i in range(slider_val[0], slider_val[1])
        }

        (formatted_docs,
         paragraph_page_idx) = preprocessing2.get_formatted_docs(
             pages, max_paragraphs=5)
        preprocessed_docs = preprocessing2.get_preprocessed_docs(
            formatted_docs)
        data_load_state.text("Done!")
        st.write(file_details)
        with st.beta_expander("PDF Extraction details"):
            st.subheader('First paragraphs on page ' + str(slider_val[0]))
            if len(pages[slider_val[0]]) >= 5:
                for i in range(5):
                    st.markdown("<u>¶ " + str(i + 1) + "</u>: " +
                                pages[slider_val[0]][i],
                                unsafe_allow_html=True)
            else:
                for i in range(len(pages[slider_val[0]])):
                    st.markdown("<u>¶ " + str(i + 1) + "</u>: " +
                                pages[slider_val[0]][i],
                                unsafe_allow_html=True)

            st.subheader('PDF word distribution')
            (uniques, counts) = get_histogram(preprocessed_docs)
            fig = px.bar(x=uniques, y=counts)
            fig.update_xaxes(title_text='words')
            fig.update_yaxes(title_text='occurances')
            st.plotly_chart(fig)

            st.subheader('Paragraph similarity heatmap')

        tfidf_vectorizer = cosine2.get_tfidf_vectorizer()
        tfidf_matrix = tfidf_vectorizer.fit_transform(
            list(preprocessed_docs.values())).toarray()
        query1 = st.text_input("Cosine-SVD Search")
        if query1:
            q = cosine2.get_query_vector(query1, tfidf_vectorizer)
            cos_sims = cosine2.get_cosine_sim(q, tfidf_matrix)
            (rankings, scores) = cosine2.get_rankings(cos_sims)

            idx = rankings[0]
            score = scores[0]
            page_num = paragraph_page_idx[idx] + 1
            doc = formatted_docs[idx]
            if score > 0.0:
                st.subheader("Similarity: " + str(score))
                st.markdown("<u>Match</u>: " + str(doc),
                            unsafe_allow_html=True)
                st.markdown("<u>Page Number</u>: " + str(page_num),
                            unsafe_allow_html=True)

                #write match and query to the db
                doc_ref = db.collection("queries").document()
                doc_ref.set({
                    "query": query1,
                    "topMatch": str(doc),
                    "timeStamp": firestore.SERVER_TIMESTAMP,
                    "upvote": 0
                })

            else:
                st.subheader("No matches found.")
        st.write("Following methods are under construction 😊 Stay tuned!")
        query2 = st.text_input("Synonymized Query Search")
        query3 = st.text_input("Verbatim Search")

    st.subheader("Recent search results:")
    q_ref = db.collection("queries").order_by(
        u'timeStamp', direction=firestore.Query.DESCENDING)
    counter = 0
    yesButtons = []
    noButtons = []
    for doc in q_ref.stream():
        counter += 1
        doc_dict = doc.to_dict()

        st.markdown("<strong>Query " + str(counter) + "</strong>: \n",
                    unsafe_allow_html=True)
        st.markdown("<u>Query</u>: " + doc_dict["query"] + "\n",
                    unsafe_allow_html=True)
        st.markdown("<u>Top Match</u>: " + doc_dict["topMatch"] + "\n",
                    unsafe_allow_html=True)
        st.markdown("&nbsp")
        if doc_dict["upvote"] < 0:
            st.markdown("<small>So far " + str(abs(doc_dict["upvote"])) +
                        "people don't think it's a good match.</small>",
                        unsafe_allow_html=True)
        else:
            st.markdown("<small>So far " + str(doc_dict["upvote"]) +
                        " people think it's a good match.</small>",
                        unsafe_allow_html=True)

        st.markdown("<i><small>Do you think this is a good match?</small></i>",
                    unsafe_allow_html=True)
        yesButtons.append(st.button("👍", key="YesButton" + str(counter)))
        noButtons.append(st.button("👎", key="NoButton" + str(counter)))

        st.markdown("<hr>", unsafe_allow_html=True)

        if counter == 5:
            break

    st.subheader('made with ❤️ by:')
    st.markdown(
        '[Vince Bartle](https://bartle.io) (vb344) | [Dubem Ogwulumba](https://www.linkedin.com/in/dubem-ogwulumba/) (dao52) | [Erik Ossner](https://erikossner.com/) (eco9) | [Qiyu Yang](https://github.com/qiyuyang16/) (qy35) | [Youhan Yuan](https://github.com/nukenukenukelol) (yy435)'
    )