def setup_class(cls):
    """Reset the test indices and populate them with two faculty fixtures.

    Each fixture is a Faculty document plus a matching Keywords document.
    The sleep() calls give elasticsearch time to refresh its indices.
    """
    Keywords.search().delete()
    Faculty.search().delete()
    sleep(3)

    fixtures = [
        (379, "William.Allison", "Allison, William.",
         ["zebrafish", "evolutionary processes", "marine"]),
        (356, "Vincent.Bouchard", "Bouchard, Vincent",
         ["string theory", "number theory", "mathematics"]),
    ]
    for fid, name, full_name, keyword_list in fixtures:
        Faculty(meta={"id": fid}, name=name, full_name=full_name,
                faculty_id=fid, email="*****@*****.**").save()
        Keywords(faculty_id=fid, datasource="test", approach_id=0,
                 keywords=keyword_list).save()
    sleep(3)
def get(self):
    """HTTP GET that enables boolean query processing and search.

    Parses the ``query`` URL parameter into a postfix boolean query,
    runs it against the keywords index, and returns the distinct faculty
    members owning the matching keyword sets.

    :returns: dict with ``"data"``: list of serialized Faculty records.
    :raises: 400 via abort() when the ``query`` parameter is missing.
    """
    query = request.args.get('query')
    if query is None:
        abort(400)

    q_parser = parser.QueryParser()
    q_builder = builder.QueryBuilder()
    pf_query = q_parser.parse_query(query)
    elastic_query = q_builder.build(pf_query)

    response = Keywords.search().query(elastic_query).execute()
    # Several Keywords documents may belong to one faculty member;
    # a set comprehension keeps each faculty_id once.
    faculty_with_keywords = {keywords.faculty_id for keywords in response}

    schema = FacultySchema()
    results = [
        schema.dump(Faculty.safe_get(faculty_id))
        for faculty_id in faculty_with_keywords
    ]
    return {"data": results}
def test_simple_search(self):
    """Test the results of queries on actual data."""
    cases = [
        ("zebrafish", 1),
        ("zebrafish OR mathematics", 2),
        ("zebrafish AND mathematics", 0),
    ]
    for query, expected_count in cases:
        elastic_query = self.build_query(query)
        results = Keywords.search().query(elastic_query).execute()
        assert len(results) == expected_count
def add_name_search_results(faculty_with_keywords, pf_query):
    """Insert results of pf_query on the faculty index into faculty_with_keywords.

    Runs the same postfix query against faculty full names. Any faculty
    member matched by name who is not already present in
    faculty_with_keywords is added, together with their entire keyword set.
    The dictionary is mutated in place.

    :param faculty_with_keywords: Dictionary of faculty ids to keywords.
    :param pf_query: Postfix query created by the Query Builder.
    """
    # Rebuild the query targeting the full_name field instead of keywords.
    name_query = builder.QueryBuilder().build(pf_query,
                                              search_field="full_name")
    for faculty in Faculty.search().query(name_query).execute():
        if faculty.faculty_id in faculty_with_keywords:
            # The keyword search already produced this faculty member.
            continue
        keyword_hits = Keywords.search() \
            .query('match', faculty_id=faculty.faculty_id).execute()
        faculty_with_keywords[faculty.faculty_id] = keyword_hits
def get(self):
    """HTTP GET that enables boolean query processing and search.

    Query params:
        query: boolean keyword query (required; 400 when missing or
            unparsable).
        department: optional comma-separated department names used to
            filter the results.

    :returns: dict with ``"data"``: search results built by
        SearchAPI.create_results.
    """
    query = request.args.get('query')
    dept = request.args.get('department')
    if query is None:
        abort(400)

    # Take dept string and turn it into an easy to compare set.
    try:
        if dept is not None:
            dept_filter = {x.strip() for x in dept.split(',')}
        else:
            dept_filter = set()
    except Exception:
        # Was a bare except; Exception keeps the 400 behavior without
        # swallowing SystemExit/KeyboardInterrupt.
        abort(400)

    q_parser = parser.QueryParser()
    q_builder = builder.QueryBuilder()
    try:
        pf_query = q_parser.parse_query(query)
    except parser.QueryException:
        abort(400)

    keywords_elastic_query = q_builder.build(pf_query)
    response = Keywords.search().query(keywords_elastic_query).execute()
    faculty_with_keywords = SearchAPI.get_faculty_with_keywords(response)
    # Also match faculty by name, not just by their keywords.
    SearchAPI.add_name_search_results(faculty_with_keywords, pf_query)
    return {
        "data": SearchAPI.create_results(faculty_with_keywords, dept_filter)
    }
def get(self):
    """HTTP GET that enables boolean query processing and batch."""
    # An empty query() matches all Keywords documents.
    hits = Keywords.search().query().execute()
    schema = KeywordSchema()
    return {"data": [schema.dump(hit) for hit in hits]}
def get(self):
    """HTTP GET for the keyword list resource.

    Returns a list of keyword records from elasticsearch.

    :param page: URL Parameter for the page to fetch. Default - 0.
    :param results: URL Parameter for the number of results to return
        per page. Default - 20.
    :param id: URL Parameter to filter the results based on a faculty id.
    :param source: URL Parameter to filter the results based on the
        keyword source.
    :param approach: URL Parameter to filter results based on the
        approach_id.
    :return: dict with pagination info and serialized keyword data.
    """
    # Local is named faculty_id to avoid shadowing the builtin id();
    # the URL parameter itself is still "id".
    faculty_id = request.args.get("id", type=int)
    source = request.args.get("source", type=str)
    approach = request.args.get("approach", type=int)

    search = Keywords.search()
    search = apply_filters(search,
                           faculty_id=faculty_id,
                           datasource=source,
                           approach_id=approach)
    query, pagination_info = paginate_query(request, search)
    response = query.execute()

    schema = KeywordSchema()
    results = [schema.dump(keyword) for keyword in response]
    return {"pagination": pagination_info, "data": results}
def run(self, data):
    """Updates a Keyword object information in Elasticsearch, based on the
    generator results.

    For each incoming keyword object, an existing Keywords document matching
    (faculty_id, datasource, approach_id) is reused when present; otherwise
    a new document is created with those identifying fields.

    :param data: list of keyword objects
    :return: returns True.
    """
    for key_object in data:
        matches = Keywords.search() \
            .query('match', faculty_id=key_object.faculty_id) \
            .query('match', datasource=key_object.datasource) \
            .query('match', approach_id=key_object.approach_id) \
            .execute()
        try:
            keywords = matches[0]
        except IndexError:
            # No existing document for this combination: create one.
            keywords = Keywords()
            keywords.faculty_id = key_object.faculty_id
            keywords.datasource = key_object.datasource
            keywords.approach_id = key_object.approach_id
        keywords.keywords = key_object.keywords
        keywords.save()
    return True
def get(self):
    """HTTP GET that runs a boolean query and returns the results as a file.

    Query params:
        query: boolean keyword query (required).
        approach: integer approach_id used to filter keyword sets (required).

    :returns: a text-file attachment containing the JSON-encoded results.
    :raises: 400 via abort() when either parameter is missing, the
        approach is not an integer, or the query cannot be parsed.
    """
    query = request.args.get('query')
    approach = request.args.get('approach')
    # Validate BEFORE converting: the original called int(approach) first,
    # so a missing approach raised TypeError and surfaced as a 500 instead
    # of the intended 400.
    if query is None or approach is None:
        abort(400)
    try:
        approach = int(approach)
    except ValueError:
        abort(400)

    q_parser = parser.QueryParser()
    q_builder = builder.QueryBuilder()
    try:
        pf_query = q_parser.parse_query(query)
    except parser.QueryException:
        abort(400)

    keywords_elastic_query = q_builder.build(pf_query)
    response = Keywords.search().query(keywords_elastic_query).execute()
    faculty_with_keywords = SearchAPI.get_faculty_with_keywords(response)

    # Keep only keyword sets produced by the requested approach, then drop
    # faculty members left with no keywords at all.
    empty_profs = []
    for faculty_id, keywords in faculty_with_keywords.items():
        filtered_keywords = [
            keyword_obj for keyword_obj in keywords
            if keyword_obj.approach_id == approach
        ]
        faculty_with_keywords[faculty_id] = filtered_keywords
        if not filtered_keywords:
            empty_profs.append(faculty_id)
    for faculty_id in empty_profs:
        del faculty_with_keywords[faculty_id]

    results = SearchAPI.create_results(faculty_with_keywords, dept_filter=[])
    str_io = BytesIO()
    str_io.write(json.dumps(results, indent=4).encode())
    str_io.seek(0)
    return send_file(str_io,
                     as_attachment=True,
                     attachment_filename="batch_results.txt")
"""Updates a Keyword object information in Elasticsearch, based on the generator results. :param data: list of keyword objects :return: returns True. """ for key_object in data: key_search = Keywords.search().query('match', faculty_id=key_object.faculty_id) \ .query('match' , datasource = key_object.datasource) \ .query('match', approach_id = key_object.approach_id) \ .execute() try: keywords = key_search[0] except IndexError: keywords = Keywords() keywords.faculty_id = key_object.faculty_id keywords.datasource = key_object.datasource keywords.approach_id = key_object.approach_id keywords.keywords = key_object.keywords keywords.save() return True if __name__ == "__main__": from elasticsearch_dsl import connections connections.create_connection() Faculty.init() Keywords.init()
def run(self, data):
    """Performs a scraping of a faculty members ResearchId page.

    Deletes the member's previous ResearchId documents and approach-4
    keywords, then re-scrapes the page and saves fresh Document and
    Keywords records.

    :param data: a Faculty object, or a faculty name string to look up.
    :return: last faculty member handled.
    :raises WorkflowException: when a name lookup returns more than one
        faculty member.
    """
    faculty = data
    if isinstance(faculty, str):
        search_results = Faculty.search().query('match', name=faculty).execute()
        if len(search_results) > 1:
            # Shouldn't happen, but could.
            raise WorkflowException(
                "Professor id is ambiguous during search ... More than 1 result"
            )
        faculty = search_results[0]
    faculty_name = faculty.name

    # Drop stale output from previous scrapes before re-scraping.
    Document.search().query('match', faculty_id=faculty.faculty_id) \
        .query("match", source="ResearchId") \
        .delete()
    Keywords.search().query('match', faculty_id=faculty.faculty_id) \
        .query("match", approach_id="4") \
        .delete()

    print("Running researchid scrape on {}. Research id {}.".format(
        faculty_name, faculty.research_id))

    if faculty.research_id is not None:
        scraper = ScraperFactory.create_scraper(faculty.research_id,
                                                ScraperType.RESEARCHID)
        try:
            scrapps = scraper.get_scrapps()
        except ScraperException:
            return faculty

        # First scrapp carries the profile metadata; the rest are titles.
        keywords_and_description = scrapps[0]
        titles = scrapps[1:]

        doc = Document()
        doc.faculty_id = faculty.faculty_id
        doc.source = "ResearchId"
        keywords = Keywords()
        keywords.faculty_id = faculty.faculty_id
        keywords.datasource = "user_keywords"
        keywords.approach_id = "4"
        # Best-effort extraction: a missing field is logged, not fatal.
        try:
            doc.text = keywords_and_description.meta_data["description"]
        except KeyError:
            # Was a bare except; only a missing key is expected here.
            print("No description")
            doc.text = ""
        try:
            doc.user_keywords = keywords_and_description.meta_data["keywords"]
            keywords.keywords = keywords_and_description.meta_data["keywords"]
        except KeyError:
            # Was a bare except; only a missing key is expected here.
            print("No keywords")
        doc.date = datetime.now()
        doc.save()
        keywords.save()

        for scrapp in titles:
            doc = Document()
            doc.faculty_id = faculty.faculty_id
            # Plain RESEARCHID scrapps carry a title; others carry
            # abstract text in their metadata.
            if scrapp.data_source == ScraperType.RESEARCHID:
                doc.source = "ResearchId"
                doc.text = scrapp.title
            else:
                doc.source = "ResearchIdAbstract"
                doc.text = scrapp.meta_data["text"]
            doc.date = datetime.now()
            doc.save()
    return faculty
def teardown_class(cls):
    """Remove the fixture faculty and their keywords from elasticsearch."""
    fixture_ids = (379, 356)
    for fid in fixture_ids:
        Faculty.get(id=fid).delete()
    for fid in fixture_ids:
        Keywords.search().query('match', faculty_id=fid).delete()