def run(self, data):
    """Write generator results into the Keywords index in Elasticsearch.

    For each item in ``data`` the stored ``Keywords`` entry matching its
    faculty id, datasource and approach id is searched for. The first hit is
    reused; when there is no hit a new ``Keywords`` entry is created and given
    those three identifiers. The entry's keyword list is then replaced with
    the item's keywords and the entry is saved.

    :param data: list of keyword objects
    :return: returns True.
    """
    for generated in data:
        # Build the search one filter at a time; each .query() call returns
        # the narrowed search that the next call is applied to.
        lookup = Keywords.search()
        lookup = lookup.query('match', faculty_id=generated.faculty_id)
        lookup = lookup.query('match', datasource=generated.datasource)
        lookup = lookup.query('match', approach_id=generated.approach_id)
        hits = lookup.execute()

        try:
            record = hits[0]
        except IndexError:
            # No stored entry for this combination yet: start a new one.
            record = Keywords()
            record.faculty_id = generated.faculty_id
            record.datasource = generated.datasource
            record.approach_id = generated.approach_id

        record.keywords = generated.keywords
        record.save()

    return True
def run(self, data):
    """Scrape a faculty member's ResearchId page and store the results.

    Stored ``Document`` entries with source ``"ResearchId"`` and ``Keywords``
    entries with approach id ``"4"`` for the faculty member are deleted
    first. If the member has a research id, the page is scraped: the first
    scrapp supplies the description and user keywords (saved as one
    ``Document`` and one ``Keywords`` entry), and every remaining scrapp is
    saved as its own ``Document``.

    :param data: a faculty object, or a faculty name (``str``) that is
        looked up in Elasticsearch
    :return: last faculty member handled
    :raises WorkflowException: if a name lookup matches no faculty member,
        or more than one
    """
    faculty = data
    if isinstance(faculty, str):
        search_results = Faculty.search().query('match', name=faculty).execute()
        if len(search_results) > 1:
            # Shouldn't happen, but could.
            raise WorkflowException(
                "Professor id is ambiguous during search ... More than 1 result"
            )
        if len(search_results) == 0:
            # Report a missing faculty member the same way as an ambiguous
            # one instead of failing with a bare IndexError below.
            raise WorkflowException(
                "Professor id not found during search ... 0 results"
            )
        faculty = search_results[0]

    faculty_name = faculty.name

    # The old data is removed up front, so it is gone even when the member
    # has no research id or the scrape below fails.
    # NOTE(review): only source "ResearchId" is deleted here, while the loop
    # at the end also saves "ResearchIdAbstract" documents - confirm those
    # are cleaned up elsewhere, otherwise they may accumulate across runs.
    Document.search().query('match', faculty_id=faculty.faculty_id) \
        .query("match", source="ResearchId") \
        .delete()

    Keywords.search().query('match', faculty_id=faculty.faculty_id) \
        .query("match", approach_id="4") \
        .delete()

    print("Running researchid scrape on {}. Research id {}.".format(
        faculty_name, faculty.research_id))

    if faculty.research_id is None:
        return faculty

    scraper = ScraperFactory.create_scraper(faculty.research_id,
                                            ScraperType.RESEARCHID)
    try:
        scrapps = scraper.get_scrapps()
    except ScraperException:
        return faculty

    if not scrapps:
        # Nothing came back from the scraper; there is no first scrapp to
        # read the description and keywords from.
        print("No scrapps returned for {}.".format(faculty_name))
        return faculty

    keywords_and_description = scrapps[0]
    titles = scrapps[1:]

    doc = Document()
    doc.faculty_id = faculty.faculty_id
    doc.source = "ResearchId"

    keywords = Keywords()
    keywords.faculty_id = faculty.faculty_id
    keywords.datasource = "user_keywords"
    keywords.approach_id = "4"

    # Only lookup failures count as "missing"; anything else (including
    # KeyboardInterrupt / SystemExit) is allowed to propagate.
    try:
        doc.text = keywords_and_description.meta_data["description"]
    except (AttributeError, KeyError, TypeError):
        print("No description")
        doc.text = ""

    try:
        user_keywords = keywords_and_description.meta_data["keywords"]
    except (AttributeError, KeyError, TypeError):
        print("No keywords")
    else:
        doc.user_keywords = user_keywords
        keywords.keywords = user_keywords

    doc.date = datetime.now()
    doc.save()
    keywords.save()

    for scrapp in titles:
        doc = Document()
        doc.faculty_id = faculty.faculty_id
        if scrapp.data_source == ScraperType.RESEARCHID:
            doc.source = "ResearchId"
            doc.text = scrapp.title
        else:
            doc.source = "ResearchIdAbstract"
            doc.text = scrapp.meta_data["text"]
        doc.date = datetime.now()
        doc.save()

    return faculty