def run(self, data):
    """Performs a scraping of each faculty member's ResearchId page.

    :param data: an iterable of faculty objects
    :return: the last faculty member handled
    """
    no_text_count = 0
    for faculty in data:
        faculty_name = faculty.name
        search_results = Faculty.search().query(
            'match', name=faculty_name).execute()
        if len(search_results) > 1:
            # Shouldn't happen, but could.
            raise WorkflowException(
                "Professor id is ambiguous during search ... More than 1 result")

        # Delete documents previously scraped from ResearchId for this faculty
        # member so re-running the workflow does not create duplicates.
        search_dup = Document.search().query(
            'match', faculty_id=faculty.faculty_id).query(
            "match", source="ResearchId")
        search_dup.delete()

        faculty = search_results[0]
        if faculty.research_id is not None:
            scraper = ScraperFactory.create_scraper(
                faculty.research_id, ScraperType.RESEARCHID)
            scrapps = scraper.get_scrapps()
            # The first scrapp holds the profile keywords and description;
            # the remaining scrapps are publication titles.
            keywords_and_description = scrapps[0]
            titles = scrapps[1:]

            doc = Document()
            doc.faculty_id = faculty.faculty_id
            doc.source = "ResearchId"
            try:
                doc.text = keywords_and_description.meta_data["description"]
            except KeyError:
                print("No description")
                doc.text = ""
            try:
                doc.user_keywords = keywords_and_description.meta_data["keywords"]
            except KeyError:
                print("No keywords")
            doc.save()

            # Save each publication title as its own document.
            for scrapp in titles:
                doc = Document()
                doc.source = "ResearchId"
                doc.faculty_id = faculty.faculty_id
                doc.text = scrapp.title
                doc.save()
        else:
            no_text_count += 1
            print("NO TEXT COUNT = ", no_text_count)
    return faculty
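
# A minimal, standalone sketch of the duplicate-removal step above. It assumes
# the Faculty/Document models are elasticsearch-dsl documents (the query/save
# API used in run() suggests this); the index name "document" and the
# localhost connection below are illustrative assumptions, not taken from the
# original code.
from elasticsearch_dsl import Search, connections

connections.create_connection(hosts=["localhost"])  # assumed default cluster

def delete_previous_researchid_docs(faculty_id, index="document"):
    """Delete-by-query: drop documents already scraped from ResearchId."""
    return (
        Search(index=index)
        .query("match", faculty_id=faculty_id)   # same faculty member
        .query("match", source="ResearchId")     # only ResearchId scrapes
        .delete()
    )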
def run(self, data):
    """Performs a scraping of a faculty member's ResearchId page.

    :param data: a faculty object, or a faculty name to look up
    :return: the faculty member handled
    """
    faculty = data
    if isinstance(faculty, str):
        # A name was passed in; resolve it to a Faculty record first.
        search_results = Faculty.search().query('match', name=faculty).execute()
        if len(search_results) > 1:
            # Shouldn't happen, but could.
            raise WorkflowException(
                "Professor id is ambiguous during search ... More than 1 result")
        faculty = search_results[0]
    faculty_name = faculty.name

    # Delete documents previously scraped from ResearchId for this faculty
    # member so re-running the workflow does not create duplicates.
    Document.search().query('match', faculty_id=faculty.faculty_id) \
        .query("match", source="ResearchId") \
        .delete()

    print("Running researchid scrape on {}. Research id {}.".format(
        faculty_name, faculty.research_id))

    if faculty.research_id is not None:
        scraper = ScraperFactory.create_scraper(faculty.research_id,
                                                ScraperType.RESEARCHID)
        scrapps = scraper.get_scrapps()
        # The first scrapp holds the profile keywords and description;
        # the remaining scrapps are publication titles.
        keywords_and_description = scrapps[0]
        titles = scrapps[1:]

        doc = Document()
        doc.faculty_id = faculty.faculty_id
        doc.source = "ResearchId"
        try:
            doc.text = keywords_and_description.meta_data["description"]
        except KeyError:
            print("No description")
            doc.text = ""
        try:
            doc.user_keywords = keywords_and_description.meta_data["keywords"]
        except KeyError:
            print("No keywords")
        doc.save()

        # Save each publication title as its own document.
        for scrapp in titles:
            doc = Document()
            doc.source = "ResearchId"
            doc.faculty_id = faculty.faculty_id
            doc.text = scrapp.title
            doc.save()

    return faculty
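
# A minimal usage sketch for the single-faculty variant above. The workflow
# class name "ResearchIdWorkflow" and its no-argument constructor are
# hypothetical; only the run(data) signature, which accepts either a faculty
# object or a faculty name string, comes from the code above.
if __name__ == "__main__":
    workflow = ResearchIdWorkflow()          # hypothetical class name
    last_handled = workflow.run("Jane Doe")  # name is resolved to a Faculty record inside run()
    print("Scraped ResearchId page for", last_handled.name)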