class TestHierarchy(TestCase):
    """Integration tests for HierarchyParser against a set of fixture PDFs.

    Each fixture exercises one heading-detection case: multi-column layout,
    plain size-based hierarchy, nested bold titles, a single uniform style,
    and same-size documents whose headings differ only by bold/enumeration.
    """

    # Fixture documents (absolute paths so the parser can open them from
    # any working directory).
    doc_with_columns = str(Path("resources/IE00BM67HT60-ATB-FS-DE-2020-2-28.pdf").absolute())
    straight_forward_doc = str(Path("resources/interview_cheatsheet.pdf").absolute())
    nested_doc_bold_title = str(Path("resources/5648.pdf").absolute())
    same_style_doc = str(Path("resources/SameStyleOnly.pdf").absolute())
    same_size_bold_header = str(Path("resources/SameSize_BoldTitle.pdf").absolute())
    same_size_enum_header = str(Path("resources/SameSize_EnumeratedTitle.pdf").absolute())

    # Shared parser instance; HierarchyParser is presumably stateless across
    # parse_pdf calls — TODO confirm.
    parser = HierarchyParser()

    def test_no_hierarchy_detected(self):
        """A document with a single uniform style yields one dangling section."""
        pdf = self.parser.parse_pdf(FileSource(self.same_style_doc))
        self.assertEqual(4, len(pdf.elements[0].children))
        self.assertIsInstance(pdf.elements[0], DanglingTextSection)

    def test_hierarchy_bold_title(self):
        """Bold lines in a same-size document are promoted to headings."""
        pdf = self.parser.parse_pdf(FileSource(self.same_size_bold_header))
        self.assertEqual(2, len(pdf.elements))
        self.assertEqual("Lorem Ipsum.", pdf.elements[0].heading.text)
        self.assertEqual("Appendix", pdf.elements[1].heading.text)

    def test_hierarchy_pdf_parser(self):
        """Size-based hierarchy: section count, titles, and page numbers."""
        path = self.straight_forward_doc
        source = FileSource(path)
        pdf = self.parser.parse_pdf(source)
        self.assertEqual(9, len(pdf.elements))
        self.assertEqual("Data Structure Basics", pdf.elements[5].heading.text)
        self.assertEqual("Basic Types of Algorithms", pdf.elements[8].heading.text)
        self.assertEqual(4, pdf.elements[8].heading.page)

    def test_grouping(self):
        """Nested bold titles group into one root with thirteen children."""
        test_doc = self.nested_doc_bold_title
        doc = self.parser.parse_pdf(FileSource(test_doc))
        # len(...) replaces the previous direct __len__() dunder calls.
        self.assertEqual(1, len(doc.elements))
        self.assertEqual(13, len(doc.elements[0].children))
        self.assertEqual("Outdoorpädagogik", doc.elements[0].heading.text)
        self.assertEqual("„Fange den Stock“", doc.elements[0].children[0].heading.text)

    def test_grouping_bold_key_and_size(self):
        """Bold markers combined with font size produce nine sections."""
        doc = self.parser.parse_pdf(FileSource(self.straight_forward_doc))
        # (expected, actual) argument order, consistent with every other
        # assertion in this class.
        self.assertEqual(9, len(doc.elements))

    def test_grouping_bold_columns(self):
        """Multi-column layout still yields the correct second heading."""
        doc = self.parser.parse_pdf(FileSource(self.doc_with_columns))
        self.assertEqual("Xtrackers MSCI World Information Technology UCITS ETF 1C", doc.elements[1].heading.text)
class TestExamples(TestCase):
    """Example-style tests that traverse a parsed document section by section."""

    parser = HierarchyParser()

    def test_count_paragraph_words(self):
        """Every section of lorem.pdf carries its expected word count."""
        lorem_path = str(Path("resources/lorem.pdf"))
        document = self.parser.parse_pdf(FileSource(file_path=lorem_path))
        # Expected counts, consumed front-to-back as sections are visited.
        expected_counts = [50, 100, 150]
        for level, title, content in traverse_inorder_sections_with_content(
                document):
            heading_prefix = txtPrinter.get_title_prefix(level)
            word_count = len(content.split())
            self.assertEqual(expected_counts.pop(0), word_count)
            print("{}{};\twords: {}".format(heading_prefix, title, word_count))

    def test_load_book(self):
        """Walking the cheatsheet document prints every section heading."""
        book_path = Path("resources/interview_cheatsheet.pdf")
        document = self.parser.parse_pdf(FileSource(file_path=str(book_path)))
        for level, title, content in traverse_inorder_sections_with_content(
                document):
            heading_prefix = txtPrinter.get_title_prefix(level)
            print("{}{};\twords: {}".format(heading_prefix, title,
                                            len(content.split())))
def skip_test_pdfstructure(self):
    """Smoke-print the parsed structure of the cheatsheet PDF.

    Prefixed with ``skip_`` so the test runner does not collect it.
    """
    source = FileSource(TestHierarchy.straight_forward_doc)
    document = HierarchyParser().parse_pdf(source)
    print(PrettyStringPrinter().print(document))
@classmethod
def setUpClass(cls) -> None:
    """Parse the fixture documents once, shared by every test in the class.

    NOTE(review): unittest only passes ``cls`` when ``setUpClass`` is a
    classmethod; the decorator was not visible in the (line-mangled)
    original — confirm it was not already applied upstream.
    """
    parser = HierarchyParser()
    cls.test_doc_same_style = parser.parse_pdf(
        FileSource(cls.same_style_doc))
    cls.test_doc = parser.parse_pdf(FileSource(cls.straight_forward_doc))
# Streamlit page (v1): upload a PDF, parse it with HierarchyParser, dump the
# document to pdf.json via JsonFilePrinter, collect per-page text with the
# recursive text_on_page/get_page helpers, plot a word-frequency histogram
# with matplotlib (np.unique + argsort, top 20), and run a TF-IDF cosine
# search over preprocessing1/cosine1.
# NOTE(review): the original physical line breaks were lost — the break
# between the two lines below falls INSIDE the 'Loading data...' string
# literal — so the code is preserved verbatim; it cannot be reformatted
# safely without recovering the original layout first.
def app(): def text_on_page(dict_var, id_json, list_res, page): if type(dict_var) is dict: for k, v in dict_var.items(): if k == id_json and v == page: if v > page: return list_res list_res.append(dict_var["text"]) elif isinstance(v, dict): text_on_page(v, id_json, list_res, page) elif isinstance(v, list): for item in v: text_on_page(item, id_json, list_res, page) return list_res def get_page(data, page): lines = [] for chunk in data["elements"]: lines.extend(text_on_page(chunk, "page", [], page)) return lines def get_histogram(docs, top=20): tokens = [] for s in docs.values(): tokens += s.split() uniques, counts = np.unique(tokens, return_counts=True) sorted_inds = np.argsort(counts) uniques_sorted = uniques[sorted_inds[-top:]][::-1] counts_sorted = counts[sorted_inds[-top:]][::-1] return (uniques_sorted, counts_sorted) file = st.file_uploader("test", type="pdf", key=2) start = 1 max_val = 1000 end = 25 slider_val = st.slider('Page range:', min_value=start, max_value=max_val, value=(1, end), step=1) #probably need to put '@st.cache(suppress_st_warning=True)' above a function where the 'with open ...' code below is the function. if file is not None: file_details = { "FileName": file.name, "FileType": file.type, "FileSize": str(file.size / 1000000) + 'mb' } data_load_state = st.text('Loading data... 
Thank you for waiting 😊') st.write(file_details) parser = HierarchyParser() source = FileSource(file, page_numbers=list(range(start, end))) document = parser.parse_pdf(source) printer = JsonFilePrinter() file_path = pathlib.Path('pdf.json') printer.print(document, file_path=str(file_path.absolute())) with open('pdf.json') as file: data = json.load(file) pages = {i: ' '.join(get_page(data, i)) for i in range(end)} doc_size = 0.25 (formatted_docs, paragraph_page_idx) = preprocessing1.get_formatted_docs( pages, doc_size) preprocessed_docs = preprocessing1.get_preprocessed_docs( formatted_docs) data_load_state.text("Done!") st.subheader('First page in the selected range') st.write({"page 1": pages[0]}) st.subheader('Page range word distribution') # (uniques, counts) = get_histogram(preprocessed_docs) # fig = px.bar(x = uniques, y = counts) # st.plotly_chart(fig) (uniques, counts) = get_histogram(preprocessed_docs) fig, ax = plt.subplots(figsize=(10, 10)) ax.bar(uniques, counts) plt.setp(ax.get_xticklabels(), rotation='vertical') st.pyplot(fig) tfidf_vectorizer = cosine1.get_tfidf_vectorizer() tfidf_matrix = tfidf_vectorizer.fit_transform( list(preprocessed_docs.values())).toarray() query = st.text_input("Search:") if query: q = cosine1.get_query_vector(query, tfidf_vectorizer) cos_sims = cosine1.get_cosine_sim(q, tfidf_matrix) (rankings, scores) = cosine1.get_rankings(cos_sims) idx = rankings[0] score = scores[0] page_num = paragraph_page_idx[idx] + 1 doc = formatted_docs[idx] if score > 0.0: st.subheader("Similarity: " + str(score)) st.write({"page " + str(page_num): str(doc)}) else: st.subheader("No matches found.") st.subheader('made with ❤️ by:') st.markdown( '[Vince Bartle](https://bartle.io) (vb344) | [Dubem Ogwulumba](https://www.linkedin.com/in/dubem-ogwulumba/) (dao52) | [Erik Ossner](https://erikossner.com/) (eco9) | [Qiyu Yang](https://github.com/qiyuyang16/) (qy35) | [Youhan Yuan](https://github.com/nukenukenukelol) (yy435)' )
# Streamlit page with Firebase-backed accounts plus PDF search. Visible
# responsibilities, in order: sidebar login/sign-up against the Firestore
# "users" collection (passwords hashed with hashlib.pbkdf2_hmac('sha256',
# ..., salt, 100000) using a per-user os.urandom(32) salt); a per-user feed
# of the five most recent queries with up/down-vote counts ("queries"
# collection, ordered by timeStamp); PDF upload with a page-range slider
# sized from pdfplumber's page count; HierarchyParser extraction to pdf.json
# cached via @st.cache; 1-word / 2-word distribution and word-dispersion
# plots (plotly); cosine search with an SVD fallback when the vocabulary
# exceeds 1000 terms (cosine3/preprocessing3), with results submittable to
# Firestore; a verbatim substring search; a paragraph-similarity heatmap
# (tfidf_matrix @ tfidf_matrix.T); and a global recent-queries feed with
# vote buttons handled by the writeUpvote/updateVotes helpers.
# NOTE(review): the original line breaks were lost — several fall inside
# string literals and comments (e.g. 'Logged out! / Use the sidebar...') —
# so the code below is preserved verbatim; reformatting requires the
# original layout to be recovered first.
def app(): global email_logged_in choice = st.sidebar.selectbox("Menu", ["Login", "Sign Up"]) if email_logged_in == "": if choice == "Login": email = st.sidebar.text_input("Email") password = st.sidebar.text_input("Password", type='password') if st.sidebar.button("Login"): # Match from fire base check_email = db.collection("users").where( u'email', u'==', email).stream() user_dict = dict() for user in check_email: user_dict = user.to_dict() break if len(user_dict) > 0: salt = user_dict['salt'] key = user_dict['key'] new_key = hashlib.pbkdf2_hmac('sha256', password.encode('utf-8'), salt, 100000) if key == new_key: st.sidebar.success("Logged in as {}".format(email)) email_logged_in = email if st.sidebar.button("Logout"): email_logged_in = "" st.sidebar.success( "Logged out! Use the sidebar to sign back in") else: st.sidebar.warning("Incorrect Password!") else: st.sidebar.warning("No account with that email exists") else: new_email = st.sidebar.text_input("New Email") new_pass = st.sidebar.text_input("New Password", type='password') new_pass_2 = st.sidebar.text_input("Verify Password", type='password') if st.sidebar.button("Sign Up"): check_email = db.collection("users").where( u'email', u'==', new_email).stream() good_email = True for e in check_email: st.sidebar.warning( "An account exists with this email already!") good_email = False break if new_pass == new_pass_2 and good_email: st.sidebar.success( "Successfully created account! Login from the sidebar") #Write to firebase salt = os.urandom(32) # A new salt for this user key = hashlib.pbkdf2_hmac('sha256', new_pass.encode('utf-8'), salt, 100000) login_ref = db.collection("users").document() login_ref.set({ "email": new_email, "salt": salt, "key": key }) elif good_email: st.sidebar.warning("Passwords do not match!") else: st.sidebar.warning("You are already logged in!") if st.sidebar.button("Logout"): email_logged_in = "" st.sidebar.success("Logged out! 
Use the sidebar to sign back in") def text_on_page(dict_var, id_json, list_res, page): if type(dict_var) is dict: for k, v in dict_var.items(): if k == id_json and v == page: if v > page: return list_res list_res.append(dict_var["text"]) elif isinstance(v, dict): text_on_page(v, id_json, list_res, page) elif isinstance(v, list): for item in v: text_on_page(item, id_json, list_res, page) return list_res def get_page(data, page): lines = [] for chunk in data["elements"]: lines.extend(text_on_page(chunk, "page", [], page)) return lines def get_histogram(docs, top=20): tokens = [] for s in docs.values(): tokens += s.split() uniques, counts = np.unique(tokens, return_counts=True) sorted_inds = np.argsort(counts) uniques_sorted = uniques[sorted_inds[-top:]][::-1] counts_sorted = counts[sorted_inds[-top:]][::-1] return (uniques_sorted, counts_sorted) # def st_display_pdf(pdf_file): # base64_pdf = base64.b64encode(pdf_file.read()).decode('utf-8') # st.write(base64_pdf) # pdf_display = f'<embed src="data:application/pdf;base64,{base64_pdf}" width="700" height="1000" type="application/pdf">' # st.markdown(pdf_display, unsafe_allow_html=True) # Read from fire base if logged in counter_queries = 1 if not email_logged_in == "": queries_collection_user = db.collection("queries") user_queries = queries_collection_user.where( u'email', u'==', email_logged_in).order_by( u'timeStamp', direction=firestore.Query.DESCENDING).limit(5).stream() with st.beta_expander("Your Most Recent Queries:"): for doc in user_queries: doc_dict = doc.to_dict() st.markdown("<strong>Query " + str(counter_queries) + "</strong>: \n", unsafe_allow_html=True) st.markdown("<u>Query</u>: " + doc_dict["query"] + "\n", unsafe_allow_html=True) st.markdown("<u>Top Match</u>: " + doc_dict["topMatch"] + "\n", unsafe_allow_html=True) if doc_dict["upvote"] < 0: st.markdown( "<small>So far " + str(abs(doc_dict["upvote"])) + "people don't think it's a good match.</small>", unsafe_allow_html=True) else: 
st.markdown("<small>So far " + str(doc_dict["upvote"]) + " people think it's a good match.</small>", unsafe_allow_html=True) st.markdown("<hr>", unsafe_allow_html=True) counter_queries += 1 if counter_queries == 1: st.write("No queries...yet!") file = st.file_uploader("Upload:", type="pdf", key=2) file_length = 100 if file is not None: with pdfplumber.open(file) as raw: file_length = len(raw.pages) st.write( '🙊 If the app runs slowly, please consider reducing the page range!') st.write( '🌴 Also consider collapsing cells as you go for ease of navigating features.' ) if file_length > 20 and file_length < 100: slider_val = st.slider('Page range:', min_value=1, max_value=file_length, value=(1, int(file_length * .1)), step=1) if file_length >= 100: slider_val = st.slider('Page range:', min_value=1, max_value=file_length, value=(1, 25), step=1) if file_length <= 20: slider_val = st.slider('Page range:', min_value=1, max_value=file_length, value=(1, file_length), step=1) if slider_val[1] - slider_val[0] > 50: st.write('Range greater than 50 pages, ‼️ this may run slowly.') st.subheader('') if file is not None: file_details = { "FileName": file.name, "FileType": file.type, "FileSize": str(file.size / 1000000) + 'mb' } data_load_state = st.text('Loading data... Thank you for waiting 😊. 
') st.write(file_details) parser = HierarchyParser() source = FileSource(file, page_numbers=list( range(slider_val[0] - 1, slider_val[1]))) @st.cache(suppress_st_warning=True) def fetch_pages(source): document = parser.parse_pdf(source) printer = JsonFilePrinter() file_path = pathlib.Path('pdf.json') printer.print(document, file_path=str(file_path.absolute())) with open('pdf.json') as json_file: data = json.load(json_file) json_file.close() pages = {i + 1: get_page(data, i) for i in range(0, slider_val[1])} return pages, file_path pages, _ = fetch_pages(source) paragraphs = [i for j in [i[1] for i in pages.items()] for i in j] windowed_paragraphs = [ i for i in list(enumerate(paragraphs)) if count_words(i[1]) > word_window ] text_blob = ' '.join([i[1] for i in windowed_paragraphs]) word_list = text_blob.split() (formatted_docs, paragraph_page_idx) = preprocessing3.get_formatted_docs(pages) preprocessed_docs = preprocessing3.get_preprocessed_docs( formatted_docs) data_load_state.text( "Done! 🎉 If you receive an error messages from the server it will likely not impede app functionality." 
) with st.beta_expander('View word distribution.', expanded=True): radio_1 = st.radio("Select 1-word or 2-word distribution.", ("1-word", "2-word", "1-word dispersion")) if radio_1 == "1-word": (uniques, counts) = get_histogram(preprocessed_docs) fig = px.bar(x=uniques, y=counts) st.plotly_chart(fig) if radio_1 == "2-word": bigrams = zip(word_list, word_list[1:]) counts = Counter(bigrams) uniques = [ str(i) for i in list(np.vstack(counts.most_common())[:, 0][:20]) ] counts = list(np.vstack(counts.most_common())[:, 1][:20]) fig = px.bar(x=uniques, y=counts) st.plotly_chart(fig) if radio_1 == "1-word dispersion": dispersion = st.text_input("Insert words separated by spaces.") dispersion_query = (''.join(dispersion)).split() wt_words = text_blob.split() if dispersion_query: points = [(x, y) for x in range(len(wt_words)) for y in range(len(dispersion_query)) if wt_words[x] == dispersion_query[y]] if points: x, y = zip(*points) else: x = y = () import plotly.graph_objects as go fig = go.Figure(data=go.Scatter( x=x, y=y, mode="markers", )) if len(y) == 0: st.write('No results! 
😔 Please try other words.') else: fig.update_layout( title="Where these words land in the document.", yaxis=dict(tickmode='array', tickvals=list(range(max(y) + 1)), ticktext=dispersion_query)) st.plotly_chart(fig) # fig.show() st.subheader('First paragraphs on page ' + str(slider_val[0]) + ":") if len(pages[slider_val[0]]) >= 3: for i in range(3): st.markdown("<u>Paragraph " + str(i + 1) + "</u>: " + pages[slider_val[0]][i], unsafe_allow_html=True) else: st.markdown("Page " + str(slider_val[0]) + " is empty.") tfidf_vectorizer = cosine3.get_tfidf_vectorizer() tfidf_matrix = tfidf_vectorizer.fit_transform( list(preprocessed_docs.values())).toarray() (_, num_terms) = tfidf_matrix.shape query1 = st.text_input("Cosine-SVD Search") if query1: q = cosine3.get_query_vector(query1, tfidf_vectorizer) if num_terms > 1000: (doc_mat, weight_mat, term_mat) = cosine3.get_svd(tfidf_matrix) cos_sims = cosine3.get_cosine_sim_svd(q, doc_mat, weight_mat, term_mat) else: cos_sims = cosine3.get_cosine_sim(q, tfidf_matrix) (rankings, scores) = cosine3.get_rankings(cos_sims) ranking_lengths = [] for i in range(len(rankings)): idx = rankings[i] score = scores[i] page_num = paragraph_page_idx[idx] doc = formatted_docs[idx] curr_len = count_words(doc) ranking_lengths.append(curr_len) global word_window word_window = st.slider("Minimum word count", min_value=1, max_value=max(ranking_lengths), value=10) for i in range(len(rankings)): # there's probably a more efficient way to do this but these are at most 10 loops so sufficient for now. 
idx = rankings[i] score = scores[i] page_num = paragraph_page_idx[idx] doc = formatted_docs[idx] curr_len = count_words(doc) if curr_len >= word_window: # st.write(curr_len,word_window) break if score > 0.0: st.subheader("Similarity: " + str(round(score, 4)) + ", Ranking: " + str(i + 1)) st.markdown("<u>Match</u>: " + str(doc), unsafe_allow_html=True) st.markdown("<u>Page Number</u>: " + str(page_num), unsafe_allow_html=True) #columns used to layout the button to ask user to upload the result to db uploadCols = st.beta_columns(4) #columns used to write thank you message if user click upload thankyouCols = st.beta_columns(4) if uploadCols[-1].button( "Submit Your Search Result for our Study!"): thankyouCols[-1].write( "Thank you! We can't get better without your support😃") #write match and query to the db doc_ref = db.collection("queries").document() doc_ref.set({ "id": doc_ref.get().id, "query": query1, "topMatch": str(doc), "timeStamp": firestore.SERVER_TIMESTAMP, "upvote": 0, "queryType": "Cosine", "email": email_logged_in }) cosMultiSub = True #columns used to layout explanation of upload button explainCols = st.beta_columns(4) explainCols[-1].markdown( "<i><small>By clicking the submit button you agree with our <a href=\ 'https://theuniversityfaculty.cornell.edu/dean/academic-integrity/'>terms of service</a></small></i>", unsafe_allow_html=True) else: st.subheader("No matches found.") with st.beta_expander('Compare with Verbatim Search:'): query3 = st.text_input("Search:") verbatim_search = lambda query: [ msg for msg in windowed_paragraphs if query in msg[1] ] if query3: v_result = verbatim_search(query3) if len(v_result) == 0: st.write("No matches found.") else: st.write("Matches found. 
🎉") v_slider = st.slider("View at most this many matches:", 1, 100, 3) display_result = v_result[:v_slider] counter = 0 for pageNum, text in display_result: counter += 1 st.subheader("Result " + str(counter) + ":") st.markdown(" ") st.markdown("<u>Match</u>: " + str(text), unsafe_allow_html=True) st.markdown("<u>Page Number</u>: " + str(pageNum), unsafe_allow_html=True) st.markdown(" ") #columns used to layout the button to ask user to upload the result to db uploadCols = st.beta_columns(4) #columns used to write thank you message if user click upload thankyouCols = st.beta_columns(4) if uploadCols[-1].button( "Submit Your Search Result for our Study", key="Verbatim" + str(counter)): thankyouCols[-1].write( "Thank you! We can't get better without your support😃" ) #write match and query to the db doc_ref = db.collection("queries").document() doc_ref.set({ "id": doc_ref.get().id, "query": query1, "topMatch": str(doc), "timeStamp": firestore.SERVER_TIMESTAMP, "upvote": 0, "queryType": "Verbatim", "email": email_logged_in }) with st.beta_expander('Explore Paragraph Similarities.'): st.write( 'Browse and zoom into the similarity heatmap. Generally moderate matches, with a score between .4 and .6 that lie further from the diagonal are the most informative. Request the matching paragraphs below.' 
) sim_mat = tfidf_matrix @ tfidf_matrix.T fig1 = px.imshow(sim_mat) st.plotly_chart(fig1) # windowed_paragraphs = [i for i in list(enumerate(paragraphs)) if count_words(i[1])>word_window] number_query1 = st.number_input("Select 1st paragraph", 0, len(paragraphs)) number_query2 = st.number_input("Select 2nd paragraph", 0, len(paragraphs)) st.write(paragraphs[number_query1]) st.write(paragraphs[number_query2]) # st.subheader('Paragraph similarity heatmap') queries_collection_ref = db.collection("queries") query = queries_collection_ref.order_by( u'timeStamp', direction=firestore.Query.DESCENDING).limit(5) counter = 0 #helper function to write upvote onto the page def writeUpvote(voteCount): if voteCount < 0: st.markdown("<small>So far " + str(abs(voteCount)) + " people don't think it's a good match.</small>", unsafe_allow_html=True) else: st.markdown("<small>So far " + str(voteCount) + " people think it's a good match.</small>", unsafe_allow_html=True) #helper function to update upvote given doc id and queries collection ref. 
Return the new upvote def updateVotes(queries_collection_ref, id, inc): doc_ref = queries_collection_ref.document(id) latestUpvote = doc_ref.get().to_dict()["upvote"] if inc: latestUpvote += 1 else: latestUpvote -= 1 doc_ref.update({"upvote": latestUpvote}) return latestUpvote with st.beta_expander("Recent Queries We Processed..."): for doc in query.stream(): counter += 1 doc_dict = doc.to_dict() st.markdown("<strong>Query " + str(counter) + "</strong>: \n", unsafe_allow_html=True) st.markdown("<u>Query</u>: " + doc_dict["query"] + "\n", unsafe_allow_html=True) st.markdown("<u>Top Match</u>: " + doc_dict["topMatch"] + "\n", unsafe_allow_html=True) st.markdown("<u>Search Method</u>: " + doc_dict["queryType"] + "\n", unsafe_allow_html=True) st.markdown(" ") st.markdown( "<i><small>Do you think this is a good match?</small></i>", unsafe_allow_html=True) cols = st.beta_columns(12) likeButton = cols[0].button("👍", key="YesButton" + str(counter)) dislikeButton = cols[1].button("👎", key="NoButton" + str(counter)) newUpvote = doc_dict["upvote"] if likeButton: newUpvote = updateVotes(queries_collection_ref, doc_dict["id"], True) writeUpvote(newUpvote) elif dislikeButton: newUpvote = newUpvote = updateVotes(queries_collection_ref, doc_dict["id"], False) writeUpvote(newUpvote) else: writeUpvote(newUpvote) if counter != 5: st.markdown("<hr>", unsafe_allow_html=True) # if file is not None: # # st.write(file_path) # st_display_pdf(file) st.subheader('made with ❤️ by:') st.markdown( '[Vince Bartle](https://bartle.io) (vb344) | [Dubem Ogwulumba](https://www.linkedin.com/in/dubem-ogwulumba/) (dao52) | [Erik Ossner](https://erikossner.com/) (eco9) | [Qiyu Yang](https://github.com/qiyuyang16/) (qy35) | [Youhan Yuan](https://github.com/nukenukenukelol) (yy435)' )
@classmethod
def setUpClass(cls) -> None:
    """Parse the fixture document once, shared by every test in the class.

    NOTE(review): unittest only passes ``cls`` when ``setUpClass`` is a
    classmethod; the decorator was not visible in the (line-mangled)
    original — confirm it was not already applied upstream.
    """
    parser = HierarchyParser()
    cls.testDocument = parser.parse_pdf(
        FileSource(cls.straight_forward_doc))
# Streamlit page (v2): upload a PDF, parse it with HierarchyParser (parse
# cached via @st.cache), dump the document to pdf.json via JsonFilePrinter,
# collect per-page text with the recursive text_on_page/get_page helpers,
# show a plotly word-frequency bar chart (np.unique + argsort, top 20), run
# a cosine search over preprocessing2/cosine2, write each match to the
# Firestore "queries" collection, and render the five most recent queries
# with thumbs-up/down buttons.
# NOTE(review): the original physical line breaks were lost — the break
# between the first two lines below falls INSIDE the 'Loading data...'
# string literal — so the code is preserved verbatim; it cannot be
# reformatted safely without recovering the original layout first.
def app(): def text_on_page(dict_var, id_json, list_res, page): if type(dict_var) is dict: for k, v in dict_var.items(): if k == id_json and v == page: if v > page: return list_res list_res.append(dict_var["text"]) elif isinstance(v, dict): text_on_page(v, id_json, list_res, page) elif isinstance(v, list): for item in v: text_on_page(item, id_json, list_res, page) return list_res def get_page(data, page): lines = [] for chunk in data["elements"]: lines.extend(text_on_page(chunk, "page", [], page)) return lines def get_histogram(docs, top=20): tokens = [] for s in docs.values(): tokens += s.split() uniques, counts = np.unique(tokens, return_counts=True) sorted_inds = np.argsort(counts) uniques_sorted = uniques[sorted_inds[-top:]][::-1] counts_sorted = counts[sorted_inds[-top:]][::-1] return (uniques_sorted, counts_sorted) file = st.file_uploader("test", type="pdf", key=2) start = 1 max_val = 1000 end = 5 slider_val = st.slider('Page range:', min_value=start, max_value=max_val, value=(1, end), step=1) if file is not None: file_details = { "FileName": file.name, "FileType": file.type, "FileSize": str(file.size / 1000000) + 'mb' } data_load_state = st.text('Loading data... 
Thank you for waiting 😊') parser = HierarchyParser() source = FileSource(file, page_numbers=list(range(start - 1, end))) @st.cache(suppress_st_warning=True) def fetch_doc(source): return parser.parse_pdf(source) document = fetch_doc(source) printer = JsonFilePrinter() file_path = pathlib.Path('pdf.json') printer.print(document, file_path=str(file_path.absolute())) with open('pdf.json') as json_file: data = json.load(json_file) json_file.close() pages = { i: get_page(data, i) for i in range(slider_val[0], slider_val[1]) } (formatted_docs, paragraph_page_idx) = preprocessing2.get_formatted_docs( pages, max_paragraphs=5) preprocessed_docs = preprocessing2.get_preprocessed_docs( formatted_docs) data_load_state.text("Done!") st.write(file_details) with st.beta_expander("PDF Extraction details"): st.subheader('First paragraphs on page ' + str(slider_val[0])) if len(pages[slider_val[0]]) >= 5: for i in range(5): st.markdown("<u>¶ " + str(i + 1) + "</u>: " + pages[slider_val[0]][i], unsafe_allow_html=True) else: for i in range(len(pages[slider_val[0]])): st.markdown("<u>¶ " + str(i + 1) + "</u>: " + pages[slider_val[0]][i], unsafe_allow_html=True) st.subheader('PDF word distribution') (uniques, counts) = get_histogram(preprocessed_docs) fig = px.bar(x=uniques, y=counts) fig.update_xaxes(title_text='words') fig.update_yaxes(title_text='occurances') st.plotly_chart(fig) st.subheader('Paragraph similarity heatmap') tfidf_vectorizer = cosine2.get_tfidf_vectorizer() tfidf_matrix = tfidf_vectorizer.fit_transform( list(preprocessed_docs.values())).toarray() query1 = st.text_input("Cosine-SVD Search") if query1: q = cosine2.get_query_vector(query1, tfidf_vectorizer) cos_sims = cosine2.get_cosine_sim(q, tfidf_matrix) (rankings, scores) = cosine2.get_rankings(cos_sims) idx = rankings[0] score = scores[0] page_num = paragraph_page_idx[idx] + 1 doc = formatted_docs[idx] if score > 0.0: st.subheader("Similarity: " + str(score)) st.markdown("<u>Match</u>: " + str(doc), 
unsafe_allow_html=True) st.markdown("<u>Page Number</u>: " + str(page_num), unsafe_allow_html=True) #write match and query to the db doc_ref = db.collection("queries").document() doc_ref.set({ "query": query1, "topMatch": str(doc), "timeStamp": firestore.SERVER_TIMESTAMP, "upvote": 0 }) else: st.subheader("No matches found.") st.write("Following methods are under construction 😊 Stay tuned!") query2 = st.text_input("Synonymized Query Search") query3 = st.text_input("Verbatim Search") st.subheader("Recent search results:") q_ref = db.collection("queries").order_by( u'timeStamp', direction=firestore.Query.DESCENDING) counter = 0 yesButtons = [] noButtons = [] for doc in q_ref.stream(): counter += 1 doc_dict = doc.to_dict() st.markdown("<strong>Query " + str(counter) + "</strong>: \n", unsafe_allow_html=True) st.markdown("<u>Query</u>: " + doc_dict["query"] + "\n", unsafe_allow_html=True) st.markdown("<u>Top Match</u>: " + doc_dict["topMatch"] + "\n", unsafe_allow_html=True) st.markdown(" ") if doc_dict["upvote"] < 0: st.markdown("<small>So far " + str(abs(doc_dict["upvote"])) + "people don't think it's a good match.</small>", unsafe_allow_html=True) else: st.markdown("<small>So far " + str(doc_dict["upvote"]) + " people think it's a good match.</small>", unsafe_allow_html=True) st.markdown("<i><small>Do you think this is a good match?</small></i>", unsafe_allow_html=True) yesButtons.append(st.button("👍", key="YesButton" + str(counter))) noButtons.append(st.button("👎", key="NoButton" + str(counter))) st.markdown("<hr>", unsafe_allow_html=True) if counter == 5: break st.subheader('made with ❤️ by:') st.markdown( '[Vince Bartle](https://bartle.io) (vb344) | [Dubem Ogwulumba](https://www.linkedin.com/in/dubem-ogwulumba/) (dao52) | [Erik Ossner](https://erikossner.com/) (eco9) | [Qiyu Yang](https://github.com/qiyuyang16/) (qy35) | [Youhan Yuan](https://github.com/nukenukenukelol) (yy435)' )