Exemplo n.º 1
0
    def run(self, data):
        """Store generated keywords in Elasticsearch, one entry per input object.

        For every item in ``data`` the existing ``Keywords`` entry matching its
        ``faculty_id``, ``datasource`` and ``approach_id`` is looked up. The
        first hit is reused; if there is none, a new entry carrying those three
        identifying fields is created. In both cases the entry's ``keywords``
        are overwritten with the item's ``keywords`` and the entry is saved.

        :param data: iterable of keyword objects
        :return: always ``True``
        """
        for item in data:
            # Build the lookup step by step: one match clause per identifying field.
            lookup = Keywords.search()
            lookup = lookup.query('match', faculty_id=item.faculty_id)
            lookup = lookup.query('match', datasource=item.datasource)
            lookup = lookup.query('match', approach_id=item.approach_id)
            hits = lookup.execute()

            try:
                entry = hits[0]
            except IndexError:
                # Nothing stored yet for this combination: start a fresh entry.
                entry = Keywords()
                entry.faculty_id = item.faculty_id
                entry.datasource = item.datasource
                entry.approach_id = item.approach_id

            entry.keywords = item.keywords
            entry.save()

        return True
Exemplo n.º 2
0
    def run(self, data):
        """Scrape a faculty member's ResearchId page and store the results.

        Stored ``Document`` entries matching source "ResearchId" and stored
        ``Keywords`` entries matching approach id "4" for the faculty member
        are deleted first. If the faculty member has a research id, the page is
        scraped and saved as:

        * one ``Document`` (source "ResearchId") holding the description and
          the user keywords taken from the first scrapp,
        * one ``Keywords`` entry (datasource "user_keywords", approach id "4"),
        * one ``Document`` per remaining scrapp.

        :param data: a faculty object, or a faculty name (``str``) that is
            resolved with a ``match`` query on ``name``.
        :return: the faculty member that was handled. It is returned early,
            with nothing new saved, when there is no research id, when the
            scraper raises ``ScraperException``, or when it yields no scrapps.
        :raises WorkflowException: if a name lookup matches no faculty member
            or more than one.
        """
        faculty = data
        if isinstance(faculty, str):
            search_results = Faculty.search().query('match',
                                                    name=faculty).execute()
            if len(search_results) > 1:
                # Shouldn't happen, but could.
                raise WorkflowException(
                    "Professor id is ambiguous during search ... More than 1 result"
                )
            if len(search_results) == 0:
                # Fail with a clear message instead of an IndexError below.
                raise WorkflowException(
                    "No faculty member found with name {}".format(faculty))
            faculty = search_results[0]

        faculty_name = faculty.name

        # Drop what a previous run stored, so the results are replaced rather
        # than accumulated.
        # NOTE(review): documents saved below with source "ResearchIdAbstract"
        # may not be matched by this delete - confirm against the index mapping.
        Document.search().query('match', faculty_id=faculty.faculty_id) \
            .query("match", source="ResearchId") \
            .delete()

        Keywords.search().query('match', faculty_id=faculty.faculty_id) \
            .query("match", approach_id="4") \
            .delete()

        print("Running researchid scrape on {}. Research id {}.".format(
            faculty_name, faculty.research_id))

        if faculty.research_id is None:
            return faculty

        scraper = ScraperFactory.create_scraper(faculty.research_id,
                                                ScraperType.RESEARCHID)
        try:
            scrapps = scraper.get_scrapps()
        except ScraperException:
            return faculty

        if not scrapps:
            # Nothing was scraped; treat it like a failed scrape.
            return faculty

        # The first scrapp carries the profile metadata, the rest are titles
        # or abstracts.
        keywords_and_description = scrapps[0]
        titles = scrapps[1:]

        doc = Document()
        doc.faculty_id = faculty.faculty_id
        doc.source = "ResearchId"

        keywords = Keywords()
        keywords.faculty_id = faculty.faculty_id
        keywords.datasource = "user_keywords"
        keywords.approach_id = "4"

        # Description and keywords are optional: only lookup failures are
        # tolerated, anything else (e.g. KeyboardInterrupt) propagates.
        try:
            doc.text = keywords_and_description.meta_data["description"]
        except (AttributeError, KeyError, TypeError):
            print("No description")
            doc.text = ""

        try:
            user_keywords = keywords_and_description.meta_data["keywords"]
        except (AttributeError, KeyError, TypeError):
            print("No keywords")
        else:
            doc.user_keywords = user_keywords
            keywords.keywords = user_keywords

        doc.date = datetime.now()
        doc.save()
        keywords.save()

        for scrapp in titles:
            doc = Document()
            doc.faculty_id = faculty.faculty_id
            if scrapp.data_source == ScraperType.RESEARCHID:
                doc.source = "ResearchId"
                doc.text = scrapp.title
            else:
                doc.source = "ResearchIdAbstract"
                doc.text = scrapp.meta_data["text"]

            doc.date = datetime.now()
            doc.save()

        return faculty