Example #1
    def test_reading_order_paper_format(self):
        pdf = self.parser.parse_pdf(FileSource(self.paper))
        self.assertEqual(1, len(pdf.elements))
        self.assertEqual("5 Experiments: Passage Retrieval",
                         pdf.elements[0].children[-2].heading_text)
        self.assertEqual("6 Experiments: Question Answering",
                         pdf.elements[0].children[-1].heading_text)
Example #2
    def test_grouping(self):
        test_doc = self.nested_doc_bold_title
        doc = self.parser.parse_pdf(FileSource(test_doc))
        self.assertEqual(1, len(doc.elements))
        self.assertEqual(13, len(doc.elements[0].children))
        self.assertEqual("Outdoorpädagogik", doc.elements[0].heading.text)
        self.assertEqual("„Fange den Stock“",
                         doc.elements[0].children[0].heading.text)
Example #3
    def test_hierarchy_pdf_parser(self):
        path = self.straight_forward_doc
        source = FileSource(path)
        pdf = self.parser.parse_pdf(source)
        self.assertEqual(9, len(pdf.elements))
        self.assertEqual("Data Structure Basics", pdf.elements[5].heading.text)
        self.assertEqual("Basic Types of Algorithms",
                         pdf.elements[8].heading.text)
        self.assertEqual(4, pdf.elements[8].heading.page)
Example #4
    def test_load_book(self):
        book_path = Path("resources/interview_cheatsheet.pdf")
        document = self.parser.parse_pdf(FileSource(file_path=str(book_path)))

        for level, title, content in traverse_inorder_sections_with_content(
                document):
            prefix = txtPrinter.get_title_prefix(level)
            print("{}{};\twords: {}".format(prefix, title,
                                            len(content.split())))
Example #5
    def test_count_paragraph_words(self):
        test_file = str(Path("resources/lorem.pdf"))
        document = self.parser.parse_pdf(FileSource(file_path=test_file))
        expected_word_counts = [50, 100, 150]
        for level, title, content in traverse_inorder_sections_with_content(
                document):
            prefix = txtPrinter.get_title_prefix(level)
            tokens = content.split()
            self.assertEqual(expected_word_counts.pop(0), len(tokens))
            print("{}{};\twords: {}".format(prefix, title, len(tokens)))
Example #6
    def skip_test_grouping_bold_columns(self):
        doc = self.parser.parse_pdf(FileSource(self.doc_with_columns))
        self.assertEqual(
            "Xtrackers MSCI World Information Technology UCITS ETF 1C",
            doc.elements[1].heading.text)
Example #7
    def test_grouping_bold_key_and_size(self):
        doc = self.parser.parse_pdf(FileSource(self.straight_forward_doc))
        self.assertEqual(9, len(doc.elements))
Example #8
    def test_hierarchy_bold_title(self):
        pdf = self.parser.parse_pdf(FileSource(self.same_size_bold_header))
        self.assertEqual(2, len(pdf.elements))
        self.assertEqual("Lorem Ipsum.", pdf.elements[0].heading.text)
        self.assertEqual("Appendix", pdf.elements[1].heading.text)
Example #9
    def test_no_hierarchy_detected(self):
        pdf = self.parser.parse_pdf(FileSource(self.same_style_doc))
        self.assertEqual(4, len(pdf.elements[0].children))

        self.assertIsInstance(pdf.elements[0], DanglingTextSection)
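DanglingTextSection appears to mark content for which no heading hierarchy could be detected, so such elements presumably carry no usable heading. A hedged sketch of guarding against them when walking a parsed document (assumes only DanglingTextSection elements lack a heading):

def print_headings(pdf):
    for element in pdf.elements:
        # Skip sections the parser could not attach to a detected heading.
        if isinstance(element, DanglingTextSection):
            continue
        print(element.heading.text)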
Example #10
    def skip_test_pdfstructure(self):
        path = TestHierarchy.straight_forward_doc
        parser = HierarchyParser()
        source = FileSource(path)
        pdf = parser.parse_pdf(source)
        print(PrettyStringPrinter().print(pdf))
Example #11
    @classmethod
    def setUpClass(cls) -> None:
        parser = HierarchyParser()
        cls.test_doc_same_style = parser.parse_pdf(
            FileSource(cls.same_style_doc))

        cls.test_doc = parser.parse_pdf(FileSource(cls.straight_forward_doc))
Example #12
def app():
    def text_on_page(dict_var, id_json, list_res, page):
        # Recursively collect every "text" value whose sibling `id_json`
        # key (here: "page") equals the requested page number.
        if isinstance(dict_var, dict):
            for k, v in dict_var.items():
                if k == id_json and v == page:
                    list_res.append(dict_var["text"])
                elif isinstance(v, dict):
                    text_on_page(v, id_json, list_res, page)
                elif isinstance(v, list):
                    for item in v:
                        text_on_page(item, id_json, list_res, page)
        return list_res

    def get_page(data, page):
        lines = []
        for chunk in data["elements"]:
            lines.extend(text_on_page(chunk, "page", [], page))
        return lines

    def get_histogram(docs, top=20):
        tokens = []
        for s in docs.values():
            tokens += s.split()
        uniques, counts = np.unique(tokens, return_counts=True)
        sorted_inds = np.argsort(counts)
        uniques_sorted = uniques[sorted_inds[-top:]][::-1]
        counts_sorted = counts[sorted_inds[-top:]][::-1]
        return (uniques_sorted, counts_sorted)

    file = st.file_uploader("test", type="pdf", key=2)
    start = 1
    max_val = 1000
    end = 25
    slider_val = st.slider('Page range:',
                           min_value=start,
                           max_value=max_val,
                           value=(1, end),
                           step=1)

    # Probably needs '@st.cache(suppress_st_warning=True)' on a function
    # wrapping the 'with open ...' code below, to avoid re-parsing on reruns.

    if file is not None:
        file_details = {
            "FileName": file.name,
            "FileType": file.type,
            "FileSize": str(file.size / 1000000) + 'mb'
        }
        data_load_state = st.text('Loading data... Thank you for waiting 😊')

        st.write(file_details)
        parser = HierarchyParser()
        # 0-based page indices, matching the other examples.
        source = FileSource(file, page_numbers=list(range(start - 1, end)))
        document = parser.parse_pdf(source)
        printer = JsonFilePrinter()
        file_path = pathlib.Path('pdf.json')
        printer.print(document, file_path=str(file_path.absolute()))
        with open('pdf.json') as json_file:  # don't shadow the uploaded `file`
            data = json.load(json_file)
        pages = {i: ' '.join(get_page(data, i)) for i in range(end)}

        doc_size = 0.25
        (formatted_docs,
         paragraph_page_idx) = preprocessing1.get_formatted_docs(
             pages, doc_size)
        preprocessed_docs = preprocessing1.get_preprocessed_docs(
            formatted_docs)
        data_load_state.text("Done!")
        st.subheader('First page in the selected range')
        st.write({"page 1": pages[0]})
        st.subheader('Page range word distribution')
        (uniques, counts) = get_histogram(preprocessed_docs)
        fig, ax = plt.subplots(figsize=(10, 10))
        ax.bar(uniques, counts)
        plt.setp(ax.get_xticklabels(), rotation='vertical')
        st.pyplot(fig)

        tfidf_vectorizer = cosine1.get_tfidf_vectorizer()
        tfidf_matrix = tfidf_vectorizer.fit_transform(
            list(preprocessed_docs.values())).toarray()
        query = st.text_input("Search:")
        if query:
            q = cosine1.get_query_vector(query, tfidf_vectorizer)
            cos_sims = cosine1.get_cosine_sim(q, tfidf_matrix)
            (rankings, scores) = cosine1.get_rankings(cos_sims)

            idx = rankings[0]
            score = scores[0]
            page_num = paragraph_page_idx[idx] + 1
            doc = formatted_docs[idx]
            if score > 0.0:
                st.subheader("Similarity: " + str(score))
                st.write({"page " + str(page_num): str(doc)})

            else:
                st.subheader("No matches found.")

    st.subheader('made with ❤️ by:')
    st.markdown(
        '[Vince Bartle](https://bartle.io) (vb344) | [Dubem Ogwulumba](https://www.linkedin.com/in/dubem-ogwulumba/) (dao52) | [Erik Ossner](https://erikossner.com/) (eco9) | [Qiyu Yang](https://github.com/qiyuyang16/) (qy35) | [Youhan Yuan](https://github.com/nukenukenukelol) (yy435)'
    )
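The cosine1 module used above is not shown. A sketch of what its helpers presumably do, written with NumPy and scikit-learn; the function names mirror the calls above, but the bodies are assumptions, not the project's actual implementation:

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer


def get_tfidf_vectorizer():
    return TfidfVectorizer()


def get_query_vector(query, tfidf_vectorizer):
    # Reuse the fitted vocabulary so the query shares dimensions with the docs.
    return tfidf_vectorizer.transform([query]).toarray()[0]


def get_cosine_sim(q, tfidf_matrix):
    # Cosine similarity of the query against every document row,
    # guarding against all-zero rows and out-of-vocabulary queries.
    norms = np.linalg.norm(tfidf_matrix, axis=1) * (np.linalg.norm(q) or 1.0)
    norms[norms == 0] = 1.0
    return (tfidf_matrix @ q) / norms


def get_rankings(cos_sims):
    # Document indices sorted by descending similarity, plus the scores.
    rankings = np.argsort(cos_sims)[::-1]
    return rankings, cos_sims[rankings]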
Example #13
def app():
    global email_logged_in, word_window
    choice = st.sidebar.selectbox("Menu", ["Login", "Sign Up"])
    if email_logged_in == "":
        if choice == "Login":
            email = st.sidebar.text_input("Email")
            password = st.sidebar.text_input("Password", type='password')
            if st.sidebar.button("Login"):
                # Match from fire base
                check_email = db.collection("users").where(
                    u'email', u'==', email).stream()
                user_dict = dict()
                for user in check_email:
                    user_dict = user.to_dict()
                    break
                if len(user_dict) > 0:
                    salt = user_dict['salt']
                    key = user_dict['key']
                    new_key = hashlib.pbkdf2_hmac('sha256',
                                                  password.encode('utf-8'),
                                                  salt, 100000)
                    if key == new_key:
                        st.sidebar.success("Logged in as {}".format(email))
                        email_logged_in = email
                        if st.sidebar.button("Logout"):
                            email_logged_in = ""
                            st.sidebar.success(
                                "Logged out! Use the sidebar to sign back in")

                    else:
                        st.sidebar.warning("Incorrect Password!")
                else:
                    st.sidebar.warning("No account with that email exists")
        else:
            new_email = st.sidebar.text_input("New Email")
            new_pass = st.sidebar.text_input("New Password", type='password')
            new_pass_2 = st.sidebar.text_input("Verify Password",
                                               type='password')
            if st.sidebar.button("Sign Up"):
                check_email = db.collection("users").where(
                    u'email', u'==', new_email).stream()
                good_email = True
                for e in check_email:
                    st.sidebar.warning(
                        "An account exists with this email already!")
                    good_email = False
                    break
                if new_pass == new_pass_2 and good_email:
                    st.sidebar.success(
                        "Successfully created account! Login from the sidebar")
                    #Write to firebase
                    salt = os.urandom(32)  # A new salt for this user
                    key = hashlib.pbkdf2_hmac('sha256',
                                              new_pass.encode('utf-8'), salt,
                                              100000)
                    login_ref = db.collection("users").document()
                    login_ref.set({
                        "email": new_email,
                        "salt": salt,
                        "key": key
                    })
                elif good_email:
                    st.sidebar.warning("Passwords do not match!")
    else:
        st.sidebar.warning("You are already logged in!")
        if st.sidebar.button("Logout"):
            email_logged_in = ""
            st.sidebar.success("Logged out! Use the sidebar to sign back in")

    def text_on_page(dict_var, id_json, list_res, page):
        # Recursively collect every "text" value whose sibling `id_json`
        # key (here: "page") equals the requested page number.
        if isinstance(dict_var, dict):
            for k, v in dict_var.items():
                if k == id_json and v == page:
                    list_res.append(dict_var["text"])
                elif isinstance(v, dict):
                    text_on_page(v, id_json, list_res, page)
                elif isinstance(v, list):
                    for item in v:
                        text_on_page(item, id_json, list_res, page)
        return list_res

    def get_page(data, page):
        lines = []
        for chunk in data["elements"]:
            lines.extend(text_on_page(chunk, "page", [], page))
        return lines

    def get_histogram(docs, top=20):
        tokens = []
        for s in docs.values():
            tokens += s.split()
        uniques, counts = np.unique(tokens, return_counts=True)
        sorted_inds = np.argsort(counts)
        uniques_sorted = uniques[sorted_inds[-top:]][::-1]
        counts_sorted = counts[sorted_inds[-top:]][::-1]
        return (uniques_sorted, counts_sorted)

    # Read from fire base if logged in
    counter_queries = 1
    if not email_logged_in == "":

        queries_collection_user = db.collection("queries")
        user_queries = queries_collection_user.where(
            u'email', u'==', email_logged_in).order_by(
                u'timeStamp',
                direction=firestore.Query.DESCENDING).limit(5).stream()
        with st.beta_expander("Your Most Recent Queries:"):
            for doc in user_queries:
                doc_dict = doc.to_dict()
                st.markdown("<strong>Query " + str(counter_queries) +
                            "</strong>: \n",
                            unsafe_allow_html=True)
                st.markdown("<u>Query</u>: " + doc_dict["query"] + "\n",
                            unsafe_allow_html=True)
                st.markdown("<u>Top Match</u>: " + doc_dict["topMatch"] + "\n",
                            unsafe_allow_html=True)
                if doc_dict["upvote"] < 0:
                    st.markdown(
                        "<small>So far " + str(abs(doc_dict["upvote"])) +
                        "people don't think it's a good match.</small>",
                        unsafe_allow_html=True)
                else:
                    st.markdown("<small>So far " + str(doc_dict["upvote"]) +
                                " people think it's a good match.</small>",
                                unsafe_allow_html=True)
                st.markdown("<hr>", unsafe_allow_html=True)
                counter_queries += 1
            if counter_queries == 1:
                st.write("No queries...yet!")
    file = st.file_uploader("Upload:", type="pdf", key=2)
    file_length = 100
    if file is not None:
        with pdfplumber.open(file) as raw:
            file_length = len(raw.pages)
    st.write(
        '🙊 If the app runs slowly, please consider reducing the page range!')
    st.write(
        '🌴 Also consider collapsing cells as you go for ease of navigating features.'
    )
    if file_length > 20 and file_length < 100:
        slider_val = st.slider('Page range:',
                               min_value=1,
                               max_value=file_length,
                               value=(1, int(file_length * .1)),
                               step=1)
    if file_length >= 100:
        slider_val = st.slider('Page range:',
                               min_value=1,
                               max_value=file_length,
                               value=(1, 25),
                               step=1)
    if file_length <= 20:
        slider_val = st.slider('Page range:',
                               min_value=1,
                               max_value=file_length,
                               value=(1, file_length),
                               step=1)

    if slider_val[1] - slider_val[0] > 50:
        st.write('Range greater than 50 pages, ‼️ this may run slowly.')
        st.subheader('')
    if file is not None:
        file_details = {
            "FileName": file.name,
            "FileType": file.type,
            "FileSize": str(file.size / 1000000) + 'mb'
        }
        data_load_state = st.text('Loading data... Thank you for waiting 😊. ')

        st.write(file_details)
        parser = HierarchyParser()
        source = FileSource(file,
                            page_numbers=list(
                                range(slider_val[0] - 1, slider_val[1])))

        @st.cache(suppress_st_warning=True)
        def fetch_pages(source):
            document = parser.parse_pdf(source)
            printer = JsonFilePrinter()
            file_path = pathlib.Path('pdf.json')
            printer.print(document, file_path=str(file_path.absolute()))

            with open('pdf.json') as json_file:
                data = json.load(json_file)
            pages = {i + 1: get_page(data, i) for i in range(0, slider_val[1])}

            return pages, file_path

        pages, _ = fetch_pages(source)
        paragraphs = [p for page_paragraphs in pages.values()
                      for p in page_paragraphs]
        windowed_paragraphs = [
            i for i in list(enumerate(paragraphs))
            if count_words(i[1]) > word_window
        ]
        text_blob = ' '.join([i[1] for i in windowed_paragraphs])
        word_list = text_blob.split()

        (formatted_docs,
         paragraph_page_idx) = preprocessing3.get_formatted_docs(pages)
        preprocessed_docs = preprocessing3.get_preprocessed_docs(
            formatted_docs)
        data_load_state.text(
            "Done! 🎉 If you receive an error message from the server, it will likely not impede app functionality."
        )
        with st.beta_expander('View word distribution.', expanded=True):
            radio_1 = st.radio("Select 1-word or 2-word distribution.",
                               ("1-word", "2-word", "1-word dispersion"))
            if radio_1 == "1-word":
                (uniques, counts) = get_histogram(preprocessed_docs)
                fig = px.bar(x=uniques, y=counts)
                st.plotly_chart(fig)
            if radio_1 == "2-word":
                bigrams = zip(word_list, word_list[1:])
                counts = Counter(bigrams)
                uniques = [
                    str(i)
                    for i in list(np.vstack(counts.most_common())[:, 0][:20])
                ]
                counts = list(np.vstack(counts.most_common())[:, 1][:20])
                fig = px.bar(x=uniques, y=counts)
                st.plotly_chart(fig)
            if radio_1 == "1-word dispersion":
                dispersion = st.text_input("Insert words separated by spaces.")
                dispersion_query = dispersion.split()
                wt_words = text_blob.split()
                if dispersion_query:
                    points = [(x, y) for x in range(len(wt_words))
                              for y in range(len(dispersion_query))
                              if wt_words[x] == dispersion_query[y]]
                    if points:
                        x, y = zip(*points)
                    else:
                        x = y = ()
                    import plotly.graph_objects as go
                    fig = go.Figure(data=go.Scatter(
                        x=x,
                        y=y,
                        mode="markers",
                    ))
                    if len(y) == 0:
                        st.write('No results! 😔 Please try other words.')
                    else:
                        fig.update_layout(
                            title="Where these words land in the document.",
                            yaxis=dict(tickmode='array',
                                       tickvals=list(range(max(y) + 1)),
                                       ticktext=dispersion_query))
                        st.plotly_chart(fig)

        st.subheader('First paragraphs on page ' + str(slider_val[0]) + ":")
        if len(pages[slider_val[0]]) >= 3:
            for i in range(3):
                st.markdown("<u>Paragraph " + str(i + 1) + "</u>: " +
                            pages[slider_val[0]][i],
                            unsafe_allow_html=True)
        else:
            st.markdown("Page " + str(slider_val[0]) + " is empty.")

        tfidf_vectorizer = cosine3.get_tfidf_vectorizer()
        tfidf_matrix = tfidf_vectorizer.fit_transform(
            list(preprocessed_docs.values())).toarray()
        (_, num_terms) = tfidf_matrix.shape
        query1 = st.text_input("Cosine-SVD Search")
        if query1:
            q = cosine3.get_query_vector(query1, tfidf_vectorizer)
            if num_terms > 1000:
                (doc_mat, weight_mat, term_mat) = cosine3.get_svd(tfidf_matrix)
                cos_sims = cosine3.get_cosine_sim_svd(q, doc_mat, weight_mat,
                                                      term_mat)
            else:
                cos_sims = cosine3.get_cosine_sim(q, tfidf_matrix)
            (rankings, scores) = cosine3.get_rankings(cos_sims)

            ranking_lengths = []
            for i in range(len(rankings)):
                idx = rankings[i]
                score = scores[i]
                page_num = paragraph_page_idx[idx]
                doc = formatted_docs[idx]
                curr_len = count_words(doc)
                ranking_lengths.append(curr_len)
            word_window = st.slider("Minimum word count",
                                    min_value=1,
                                    max_value=max(ranking_lengths),
                                    value=10)
            for i in range(len(rankings)):
                # there's probably a more efficient way to do this but these are at most 10 loops so sufficient for now.
                idx = rankings[i]
                score = scores[i]
                page_num = paragraph_page_idx[idx]
                doc = formatted_docs[idx]
                curr_len = count_words(doc)
                if curr_len >= word_window:
                    # st.write(curr_len,word_window)
                    break

            if score > 0.0:
                st.subheader("Similarity: " + str(round(score, 4)) +
                             ", Ranking: " + str(i + 1))
                st.markdown("<u>Match</u>: " + str(doc),
                            unsafe_allow_html=True)
                st.markdown("<u>Page Number</u>: " + str(page_num),
                            unsafe_allow_html=True)

                #columns used to layout the button to ask user to upload the result to db
                uploadCols = st.beta_columns(4)
                #columns used to write thank you message if user click upload
                thankyouCols = st.beta_columns(4)
                if uploadCols[-1].button(
                        "Submit Your Search Result for our Study!"):
                    thankyouCols[-1].write(
                        "Thank you! We can't get better without your support😃")
                    #write match and query to the db
                    doc_ref = db.collection("queries").document()
                    doc_ref.set({
                        "id": doc_ref.get().id,
                        "query": query1,
                        "topMatch": str(doc),
                        "timeStamp": firestore.SERVER_TIMESTAMP,
                        "upvote": 0,
                        "queryType": "Cosine",
                        "email": email_logged_in
                    })
                    cosMultiSub = True

                #columns used to layout explanation of upload button
                explainCols = st.beta_columns(4)
                explainCols[-1].markdown(
                    "<i><small>By clicking the submit button you agree with our <a href=\
                    'https://theuniversityfaculty.cornell.edu/dean/academic-integrity/'>terms of service</a></small></i>",
                    unsafe_allow_html=True)

            else:
                st.subheader("No matches found.")

        with st.beta_expander('Compare with Verbatim Search:'):
            query3 = st.text_input("Search:")
            verbatim_search = lambda query: [
                msg for msg in windowed_paragraphs if query in msg[1]
            ]
            if query3:
                v_result = verbatim_search(query3)
                if len(v_result) == 0:
                    st.write("No matches found.")
                else:
                    st.write("Matches found. 🎉")
                    v_slider = st.slider("View at most this many matches:", 1,
                                         100, 3)
                    display_result = v_result[:v_slider]
                    counter = 0
                    for pageNum, text in display_result:
                        counter += 1
                        st.subheader("Result " + str(counter) + ":")
                        st.markdown("&nbsp")
                        st.markdown("<u>Match</u>: " + str(text),
                                    unsafe_allow_html=True)
                        st.markdown("<u>Page Number</u>: " + str(pageNum),
                                    unsafe_allow_html=True)
                        st.markdown("&nbsp")
                        #columns used to layout the button to ask user to upload the result to db
                        uploadCols = st.beta_columns(4)
                        #columns used to write thank you message if user click upload
                        thankyouCols = st.beta_columns(4)
                        if uploadCols[-1].button(
                                "Submit Your Search Result for our Study",
                                key="Verbatim" + str(counter)):
                            thankyouCols[-1].write(
                                "Thank you! We can't get better without your support😃"
                            )
                            #write match and query to the db
                            doc_ref = db.collection("queries").document()
                            doc_ref.set({
                                "id": doc_ref.id,
                                "query": query3,
                                "topMatch": str(text),
                                "timeStamp": firestore.SERVER_TIMESTAMP,
                                "upvote": 0,
                                "queryType": "Verbatim",
                                "email": email_logged_in
                            })

        with st.beta_expander('Explore Paragraph Similarities.'):
            st.write(
                'Browse and zoom into the similarity heatmap. Generally moderate matches, with a score between .4 and .6 that lie further from the diagonal are the most informative. Request the matching paragraphs below.'
            )
            sim_mat = tfidf_matrix @ tfidf_matrix.T
            fig1 = px.imshow(sim_mat)
            st.plotly_chart(fig1)

            number_query1 = st.number_input("Select 1st paragraph", 0,
                                            len(paragraphs) - 1)
            number_query2 = st.number_input("Select 2nd paragraph", 0,
                                            len(paragraphs) - 1)

            st.write(paragraphs[number_query1])
            st.write(paragraphs[number_query2])

    queries_collection_ref = db.collection("queries")
    query = queries_collection_ref.order_by(
        u'timeStamp', direction=firestore.Query.DESCENDING).limit(5)
    counter = 0

    #helper function to write upvote onto the page
    def writeUpvote(voteCount):
        if voteCount < 0:
            st.markdown("<small>So far " + str(abs(voteCount)) +
                        " people don't think it's a good match.</small>",
                        unsafe_allow_html=True)
        else:
            st.markdown("<small>So far " + str(voteCount) +
                        " people think it's a good match.</small>",
                        unsafe_allow_html=True)

    #helper function to update upvote given doc id and queries collection ref. Return the new upvote
    def updateVotes(queries_collection_ref, id, inc):
        doc_ref = queries_collection_ref.document(id)
        latestUpvote = doc_ref.get().to_dict()["upvote"]
        if inc:
            latestUpvote += 1
        else:
            latestUpvote -= 1

        doc_ref.update({"upvote": latestUpvote})
        return latestUpvote

    with st.beta_expander("Recent Queries We Processed..."):
        for doc in query.stream():
            counter += 1
            doc_dict = doc.to_dict()
            st.markdown("<strong>Query " + str(counter) + "</strong>: \n",
                        unsafe_allow_html=True)
            st.markdown("<u>Query</u>: " + doc_dict["query"] + "\n",
                        unsafe_allow_html=True)
            st.markdown("<u>Top Match</u>: " + doc_dict["topMatch"] + "\n",
                        unsafe_allow_html=True)
            st.markdown("<u>Search Method</u>: " + doc_dict["queryType"] +
                        "\n",
                        unsafe_allow_html=True)

            st.markdown("&nbsp")

            st.markdown(
                "<i><small>Do you think this is a good match?</small></i>",
                unsafe_allow_html=True)
            cols = st.beta_columns(12)
            likeButton = cols[0].button("👍", key="YesButton" + str(counter))
            dislikeButton = cols[1].button("👎", key="NoButton" + str(counter))
            newUpvote = doc_dict["upvote"]
            if likeButton:
                newUpvote = updateVotes(queries_collection_ref, doc_dict["id"],
                                        True)
                writeUpvote(newUpvote)

            elif dislikeButton:
                newUpvote = updateVotes(queries_collection_ref,
                                        doc_dict["id"], False)
                writeUpvote(newUpvote)
            else:
                writeUpvote(newUpvote)

            if counter != 5:
                st.markdown("<hr>", unsafe_allow_html=True)

    st.subheader('made with ❤️ by:')
    st.markdown(
        '[Vince Bartle](https://bartle.io) (vb344) | [Dubem Ogwulumba](https://www.linkedin.com/in/dubem-ogwulumba/) (dao52) | [Erik Ossner](https://erikossner.com/) (eco9) | [Qiyu Yang](https://github.com/qiyuyang16/) (qy35) | [Youhan Yuan](https://github.com/nukenukenukelol) (yy435)'
    )
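The sign-up and login branches above share one scheme: a per-user random salt plus PBKDF2-HMAC-SHA256 with 100000 iterations. Pulled out as standalone helpers for clarity; this is a sketch, and hmac.compare_digest is a swap for the plain == comparison above that also avoids timing side channels:

import hashlib
import hmac
import os


def make_credentials(password: str):
    # A new random salt for each user, as in the sign-up branch.
    salt = os.urandom(32)
    key = hashlib.pbkdf2_hmac('sha256', password.encode('utf-8'), salt, 100000)
    return salt, key


def verify_password(password: str, salt: bytes, key: bytes) -> bool:
    # Recompute the key with the stored salt, as in the login branch.
    new_key = hashlib.pbkdf2_hmac('sha256', password.encode('utf-8'), salt,
                                  100000)
    return hmac.compare_digest(key, new_key)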
Example #14
    @classmethod
    def setUpClass(cls) -> None:
        parser = HierarchyParser()
        cls.testDocument = parser.parse_pdf(
            FileSource(cls.straight_forward_doc))
Example #15
def app():
    def text_on_page(dict_var, id_json, list_res, page):
        # Recursively collect every "text" value whose sibling `id_json`
        # key (here: "page") equals the requested page number.
        if isinstance(dict_var, dict):
            for k, v in dict_var.items():
                if k == id_json and v == page:
                    list_res.append(dict_var["text"])
                elif isinstance(v, dict):
                    text_on_page(v, id_json, list_res, page)
                elif isinstance(v, list):
                    for item in v:
                        text_on_page(item, id_json, list_res, page)
        return list_res

    def get_page(data, page):
        lines = []
        for chunk in data["elements"]:
            lines.extend(text_on_page(chunk, "page", [], page))
        return lines

    def get_histogram(docs, top=20):
        tokens = []
        for s in docs.values():
            tokens += s.split()
        uniques, counts = np.unique(tokens, return_counts=True)
        sorted_inds = np.argsort(counts)
        uniques_sorted = uniques[sorted_inds[-top:]][::-1]
        counts_sorted = counts[sorted_inds[-top:]][::-1]
        return (uniques_sorted, counts_sorted)

    file = st.file_uploader("test", type="pdf", key=2)
    start = 1
    max_val = 1000
    end = 5
    slider_val = st.slider('Page range:',
                           min_value=start,
                           max_value=max_val,
                           value=(1, end),
                           step=1)

    if file is not None:
        file_details = {
            "FileName": file.name,
            "FileType": file.type,
            "FileSize": str(file.size / 1000000) + 'mb'
        }
        data_load_state = st.text('Loading data... Thank you for waiting 😊')

        parser = HierarchyParser()
        source = FileSource(file, page_numbers=list(range(start - 1, end)))

        @st.cache(suppress_st_warning=True)
        def fetch_doc(source):
            return parser.parse_pdf(source)

        document = fetch_doc(source)
        printer = JsonFilePrinter()
        file_path = pathlib.Path('pdf.json')
        printer.print(document, file_path=str(file_path.absolute()))

        with open('pdf.json') as json_file:
            data = json.load(json_file)
        pages = {
            i: get_page(data, i)
            for i in range(slider_val[0], slider_val[1])
        }

        (formatted_docs,
         paragraph_page_idx) = preprocessing2.get_formatted_docs(
             pages, max_paragraphs=5)
        preprocessed_docs = preprocessing2.get_preprocessed_docs(
            formatted_docs)
        data_load_state.text("Done!")
        st.write(file_details)
        with st.beta_expander("PDF Extraction details"):
            st.subheader('First paragraphs on page ' + str(slider_val[0]))
            # Show up to the first five paragraphs on the page.
            for i in range(min(5, len(pages[slider_val[0]]))):
                st.markdown("<u>¶ " + str(i + 1) + "</u>: " +
                            pages[slider_val[0]][i],
                            unsafe_allow_html=True)

            st.subheader('PDF word distribution')
            (uniques, counts) = get_histogram(preprocessed_docs)
            fig = px.bar(x=uniques, y=counts)
            fig.update_xaxes(title_text='words')
            fig.update_yaxes(title_text='occurrences')
            st.plotly_chart(fig)

            st.subheader('Paragraph similarity heatmap')

        tfidf_vectorizer = cosine2.get_tfidf_vectorizer()
        tfidf_matrix = tfidf_vectorizer.fit_transform(
            list(preprocessed_docs.values())).toarray()
        query1 = st.text_input("Cosine-SVD Search")
        if query1:
            q = cosine2.get_query_vector(query1, tfidf_vectorizer)
            cos_sims = cosine2.get_cosine_sim(q, tfidf_matrix)
            (rankings, scores) = cosine2.get_rankings(cos_sims)

            idx = rankings[0]
            score = scores[0]
            page_num = paragraph_page_idx[idx] + 1
            doc = formatted_docs[idx]
            if score > 0.0:
                st.subheader("Similarity: " + str(score))
                st.markdown("<u>Match</u>: " + str(doc),
                            unsafe_allow_html=True)
                st.markdown("<u>Page Number</u>: " + str(page_num),
                            unsafe_allow_html=True)

                #write match and query to the db
                doc_ref = db.collection("queries").document()
                doc_ref.set({
                    "query": query1,
                    "topMatch": str(doc),
                    "timeStamp": firestore.SERVER_TIMESTAMP,
                    "upvote": 0
                })

            else:
                st.subheader("No matches found.")
        st.write("Following methods are under construction 😊 Stay tuned!")
        query2 = st.text_input("Synonymized Query Search")
        query3 = st.text_input("Verbatim Search")

    st.subheader("Recent search results:")
    q_ref = db.collection("queries").order_by(
        u'timeStamp', direction=firestore.Query.DESCENDING)
    counter = 0
    yesButtons = []
    noButtons = []
    for doc in q_ref.stream():
        counter += 1
        doc_dict = doc.to_dict()

        st.markdown("<strong>Query " + str(counter) + "</strong>: \n",
                    unsafe_allow_html=True)
        st.markdown("<u>Query</u>: " + doc_dict["query"] + "\n",
                    unsafe_allow_html=True)
        st.markdown("<u>Top Match</u>: " + doc_dict["topMatch"] + "\n",
                    unsafe_allow_html=True)
        st.markdown("&nbsp")
        if doc_dict["upvote"] < 0:
            st.markdown("<small>So far " + str(abs(doc_dict["upvote"])) +
                        "people don't think it's a good match.</small>",
                        unsafe_allow_html=True)
        else:
            st.markdown("<small>So far " + str(doc_dict["upvote"]) +
                        " people think it's a good match.</small>",
                        unsafe_allow_html=True)

        st.markdown("<i><small>Do you think this is a good match?</small></i>",
                    unsafe_allow_html=True)
        yesButtons.append(st.button("👍", key="YesButton" + str(counter)))
        noButtons.append(st.button("👎", key="NoButton" + str(counter)))

        st.markdown("<hr>", unsafe_allow_html=True)

        if counter == 5:
            break

    st.subheader('made with ❤️ by:')
    st.markdown(
        '[Vince Bartle](https://bartle.io) (vb344) | [Dubem Ogwulumba](https://www.linkedin.com/in/dubem-ogwulumba/) (dao52) | [Erik Ossner](https://erikossner.com/) (eco9) | [Qiyu Yang](https://github.com/qiyuyang16/) (qy35) | [Youhan Yuan](https://github.com/nukenukenukelol) (yy435)'
    )
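All three apps share the recursive text_on_page / get_page traversal. It can be exercised on a tiny hand-written document; the JSON shape below is a simplified assumption about the parser's serialized output, not its exact schema:

sample = {
    "elements": [{
        "heading": {"page": 0, "text": "Intro"},
        "children": [{"page": 0, "text": "First paragraph."},
                     {"page": 1, "text": "Second paragraph."}]
    }]
}

print(get_page(sample, 0))  # ['Intro', 'First paragraph.']
print(get_page(sample, 1))  # ['Second paragraph.']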