Пример #1
0
    def setup_class(cls):
        """Reset the test indices and seed two faculty members with keywords."""
        # Clear out any leftover documents, then give elastic time to settle.
        Keywords.search().delete()
        Faculty.search().delete()
        sleep(3)

        fixtures = [
            (379, "William.Allison", "Allison, William.",
             ["zebrafish", "evolutionary processes", "marine"]),
            (356, "Vincent.Bouchard", "Bouchard, Vincent",
             ["string theory", "number theory", "mathematics"]),
        ]
        for fid, name, full_name, words in fixtures:
            member = Faculty(meta={"id": fid},
                             name=name,
                             full_name=full_name,
                             faculty_id=fid,
                             email="*****@*****.**")
            member.save()
            Keywords(faculty_id=fid,
                     datasource="test",
                     approach_id=0,
                     keywords=words).save()

        # Give elasticsearch time to index the new documents.
        sleep(3)
Пример #2
0
 def test_create__success(self):
     """A ResearchId scrape of a valid link should produce a result."""
     link = 'http://www.researcherid.com/rid/A-2612-2014'
     rid = ResearchIdPageScrape()
     obj = Faculty(name="Test.Prof", faculty_id=110, email="*****@*****.**")
     obj.researcherid = link
     res = rid.run(obj)
     print(res)
     # `is not None` is the correct identity check; `!= None` relies on __eq__.
     assert res is not None
Пример #3
0
 def test_create__success(self):
     """A GoogleScholar scrape of a valid link should produce a result."""
     link = 'https://scholar.google.ca/citations?user=KffJRdgAAAAJ&hl=en&oi=sra'
     ga = GoogleScholarPageScrape()
     obj = Faculty(name="Test.Prof", faculty_id=110, email="*****@*****.**")
     obj.google_scholar = link
     res = ga.run(obj)
     print(res)
     # `is not None` is the correct identity check; `!= None` relies on __eq__.
     assert res is not None
Пример #4
0
 def test_create__success(self):
     """End-to-end ResearchId scrape: requirement check, then run."""
     obj = Faculty(name="Test.Prof", faculty_id=110, email="*****@*****.**")
     obj.research_id = 'http://www.researcherid.com/rid/C-6729-2008'
     scraper = ResearchIdPageScrape()
     # The faculty record carries a research id, so the task applies.
     assert scraper.is_requirement_satisfied(obj) is True
     result = scraper.run(obj)
     print(result)
     assert result is not None
Пример #5
0
    def run(self, data):
        """Performs a scraping of each faculty member's ResearchId page.
        :param data: iterable of faculty objects
        :return: last faculty member handled, or None when data is empty
        :raises WorkflowException: if a name matches more than one faculty record.
        """

        faculty = None  # guards against UnboundLocalError when data is empty
        no_text_count = 0
        for faculty in data:
            faculty_name = faculty.name

            search_results = Faculty.search().query(
                'match', name=faculty_name).execute()
            if len(search_results) > 1:
                # Shouldn't happen, but could.
                raise WorkflowException(
                    "Professor id is ambiguous during search ... More than 1 result"
                )

            # Remove any previously-scraped ResearchId documents for this
            # faculty member so the scrape below does not create duplicates.
            search_dup = Document.search().query(
                'match',
                faculty_id=faculty.faculty_id).query("match",
                                                     source="ResearchId")
            search_dup.delete()
            faculty = search_results[0]
            if faculty.research_id is not None:

                scraper = ScraperFactory.create_scraper(
                    faculty.research_id, ScraperType.RESEARCHID)
                scrapps = scraper.get_scrapps()

                # First scrapp carries the profile metadata; the rest are titles.
                keywords_and_description = scrapps[0]
                titles = scrapps[1:]

                doc = Document()
                doc.faculty_id = faculty.faculty_id
                doc.source = "ResearchId"
                try:
                    doc.text = keywords_and_description.meta_data[
                        "description"]
                except KeyError:
                    # Narrowed from a bare except: only a missing key is expected.
                    print("No description")
                    doc.text = ""
                try:
                    doc.user_keywords = keywords_and_description.meta_data[
                        "keywords"]
                except KeyError:
                    print("No keywords")
                doc.save()

                for scrapp in titles:
                    doc = Document()
                    doc.source = "ResearchId"
                    doc.faculty_id = faculty.faculty_id
                    doc.text = scrapp.title
                    doc.save()

            else:
                no_text_count += 1
        print("NO TEXT COUNT = ", no_text_count)
        return faculty
Пример #6
0
    def run(self, data):
        """Performs a scraping of a faculty member's GoogleScholar page.
        :param data: a Faculty instance, or a faculty name as a string
        :return: the faculty member that was processed
        :raises WorkflowException: if a name matches more than one faculty record.
        """

        faculty = data
        if isinstance(faculty, str):
            # BUG FIX: this query previously referenced `faculty_name`, which
            # is not assigned until after this branch, raising NameError
            # whenever a string was passed in. Search by the given name.
            search_results = Faculty.search().query(
                'match', name=faculty).execute()
            if len(search_results) > 1:
                # Shouldn't happen, but could.
                raise WorkflowException(
                    "Professor id is ambiguous during search ... More than 1 result"
                )
            faculty = search_results[0]
        faculty_name = faculty.name

        # Remove previously-scraped GoogleScholar documents to avoid duplicates.
        Document.search().query('match', faculty_id=faculty.faculty_id) \
            .query("match", source="GoogleScholar") \
            .delete()

        if faculty.google_scholar is not None and "http" in faculty.google_scholar:
            scraper = ScraperFactory.create_scraper(faculty.google_scholar,
                                                    ScraperType.GOOGLESCHOLAR)
            scrapps = scraper.get_scrapps()
            for scrapp in scrapps:
                doc = Document()
                doc.source = "GoogleScholar"
                doc.faculty_id = faculty.faculty_id
                doc.text = scrapp.title
                doc.save()

        return faculty
Пример #7
0
    def create_grant(json_data, write=True):
        """Creates a grant Document from a JSON representation.

        :param dict json_data: Dictionary representation of the JSON data.
        :param bool write: Boolean switch that will enable writing to elastic.
        :raises DataIngestionException: when the payload fails schema validation.
        """
        schema = GrantSchema()

        try:
            grant = schema.load(json_data)
        except ValidationError as err:
            # Chain the original validation error so the cause is preserved.
            raise DataIngestionException(
                "Missing one of the required fields of the schema. {}".format(
                    err.messages)) from err

        # Need to find a faculty with matching name so we can build a new document
        search_results = Faculty.search().query(
            'match', full_name=grant["faculty_name"]).execute()
        if len(search_results) < 1:
            # No matching faculty member; nothing to ingest.
            return
        faculty = search_results[0]

        # TODO: There is no spot for titles in the document...
        grant_doc = Document(faculty_id=faculty.faculty_id,
                             source=grant["source"],
                             text=grant["text"])

        if write:
            grant_doc.save()
Пример #8
0
    def add_name_search_results(faculty_with_keywords, pf_query):
        """ Inserts results of pf_query on faculty index into faculty_with_keywords.

        If a faculty member is returned from the query but does not currently
        exist in faculty_with_keywords, the faculty member plus their entire
        keyword set is inserted into the dictionary.
        :param faculty_with_keywords: Dictionary of faculty id's to keywords.
        :param pf_query: Postfix query created by the Query Builder.
        :returns: faculty_with_keywords also containing faculty whose names match the
            query.
        """
        # Add functionality of searching names in query.
        q_builder = builder.QueryBuilder()
        name_elastic_query = q_builder.build(pf_query, search_field="full_name")
        names_response = Faculty.search().query(name_elastic_query).execute()

        for faculty in names_response:
            # We already have the faculty who was searched in the results.
            if faculty.faculty_id in faculty_with_keywords:
                continue

            faculty_keywords = Keywords.search()\
                .query('match', faculty_id=faculty.faculty_id).execute()

            faculty_with_keywords[faculty.faculty_id] = faculty_keywords

        # BUG FIX: the docstring promises the mapping is returned, but the
        # function previously returned None. It is also mutated in place.
        return faculty_with_keywords
Пример #9
0
    def run(self, data):
        """Performs a scraping of each faculty member's GoogleScholar page.
        :param data: iterable of faculty objects
        :return: last faculty member handled, or None when data is empty
        :raises WorkflowException: if a name matches more than one faculty record.
        """

        faculty = None  # guards against UnboundLocalError when data is empty
        no_text_count = 0
        for faculty in data:
            faculty_name = faculty.name

            search_results = Faculty.search().query('match', name=faculty_name).execute()
            if len(search_results) > 1:
                # Shouldn't happen, but could.
                raise WorkflowException("Professor id is ambiguous during search ... More than 1 result")

            # Remove previously-scraped GoogleScholar documents to avoid duplicates.
            search_dup = Document.search().query('match', faculty_id=faculty.faculty_id).query("match", source="GoogleScholar")
            search_dup.delete()

            faculty = search_results[0]
            if faculty.google_scholar is not None and "http" in faculty.google_scholar:
                scraper = ScraperFactory.create_scraper(faculty.google_scholar, ScraperType.GOOGLESCHOLAR)
                scrapps = scraper.get_scrapps()
                for scrapp in scrapps:
                    doc = Document()
                    doc.source = "GoogleScholar"
                    doc.faculty_id = faculty.faculty_id
                    doc.text = scrapp.title
                    doc.save()
            else:
                no_text_count += 1
        print("NO TEXT COUNT = ", no_text_count)
        return faculty
Пример #10
0
    def get(self):
        """HTTP Get that enables boolean query processing and search."""
        query = request.args.get('query')
        if query is None:
            abort(400)

        # Parse the raw boolean query into postfix form, then build the
        # corresponding elasticsearch query object.
        q_parser = parser.QueryParser()
        q_builder = builder.QueryBuilder()
        pf_query = q_parser.parse_query(query)
        elastic_query = q_builder.build(pf_query)

        # Search the keyword index and collect the distinct owning faculty ids.
        keyword_hits = Keywords.search().query(elastic_query).execute()
        faculty_ids = {hit.faculty_id for hit in keyword_hits}

        schema = FacultySchema()
        results = [schema.dump(Faculty.safe_get(fid)) for fid in faculty_ids]

        return {"data": results}
Пример #11
0
    def run(self, data):
        """Performs a scraping of a faculty member's ResearchId page.
        :param data: a Faculty instance, or a faculty name as a string
        :return: the faculty member that was processed
        :raises WorkflowException: if a name matches more than one faculty record.
        """

        faculty = data
        if isinstance(faculty, str):
            search_results = Faculty.search().query('match',
                                                    name=faculty).execute()
            if len(search_results) > 1:
                # Shouldn't happen, but could.
                raise WorkflowException(
                    "Professor id is ambiguous during search ... More than 1 result"
                )
            faculty = search_results[0]

        faculty_name = faculty.name

        # Remove previously-scraped ResearchId documents to avoid duplicates.
        Document.search().query('match', faculty_id=faculty.faculty_id) \
            .query("match", source="ResearchId") \
            .delete()

        print("Running researchid scrape on {}. Research id {}.".format(
            faculty_name, faculty.research_id))

        if faculty.research_id is not None:

            scraper = ScraperFactory.create_scraper(faculty.research_id,
                                                    ScraperType.RESEARCHID)
            scrapps = scraper.get_scrapps()

            # First scrapp carries the profile metadata; the rest are titles.
            keywords_and_description = scrapps[0]
            titles = scrapps[1:]

            doc = Document()
            doc.faculty_id = faculty.faculty_id
            doc.source = "ResearchId"
            try:
                doc.text = keywords_and_description.meta_data["description"]
            except KeyError:
                # Narrowed from a bare except: only a missing key is expected.
                print("No description")
                doc.text = ""
            try:
                doc.user_keywords = keywords_and_description.meta_data[
                    "keywords"]
            except KeyError:
                print("No keywords")
            doc.save()

            for scrapp in titles:
                doc = Document()
                doc.source = "ResearchId"
                doc.faculty_id = faculty.faculty_id
                doc.text = scrapp.title
                doc.save()

        return faculty
Пример #12
0
 def run(self, data):
     """ Searches through all results in elastic search
     :param data: str or Faculty instance.
     :return: all faculty
     """
     # scan() streams every document in the index; materialise it as a list.
     return list(Faculty.search().scan())
Пример #13
0
    def run(self, data):
        """Updates a Faculty members information in Elasticsearch, based on the result of a scrape.

        :param data: list of tuples of form <str, Scrapp>
        :return: The updated instance of a Faculty model.
        :raises WorkflowException: if the name matches more than one faculty record.
        """

        faculty_name = data[0]
        scrapp = data[1]

        search_results = Faculty.search().query('match',
                                                name=faculty_name).execute()
        if len(search_results) > 1:
            # Shouldn't happen, but could.
            raise WorkflowException(
                "Faculty name is ambiguous during search... More than 1 result"
            )

        faculty = search_results[0]

        # Remove stale profile documents before re-adding the fresh scrape.
        # NOTE(review): this delete targets source="Profile" (capital P) while
        # the document below is saved with source="profile". A `match` query on
        # an analyzed field is case-insensitive, but confirm the mapping of
        # `source` actually analyzes it — otherwise this delete is a no-op.
        Document.search().query('match', faculty_id=faculty.faculty_id) \
            .query("match", source="Profile") \
            .delete()

        # Copy any external-profile links the scrape discovered onto the model.
        if "orcid_link" in scrapp.meta_data:
            faculty.orc_id = scrapp.meta_data["orcid_link"]

        if "researchid_link" in scrapp.meta_data:
            faculty.research_id = scrapp.meta_data["researchid_link"]

        if "googlescholar_link" in scrapp.meta_data:
            faculty.google_scholar = scrapp.meta_data["googlescholar_link"]

        if "text" in scrapp.meta_data:
            # Reuse the existing profile document when one exists; otherwise
            # build a new one for this faculty member.
            doc_search = Document.search().query('match', faculty_id=faculty.faculty_id) \
                .query('match', source = "profile") \
                .execute()
            try:
                doc = doc_search[0]
            except IndexError:
                doc = Document()
                doc.faculty_id = faculty.faculty_id
                doc.source = "profile"

            doc.text = scrapp.meta_data["text"]
            doc.date = datetime.now()
            doc.save()

        faculty.save()

        return faculty
Пример #14
0
    def test_create(self):
        """A Faculty model should retain attributes assigned after construction."""
        member = Faculty()
        member.department = "cs"
        member.name = "name"
        member.email = "*****@*****.**"
        member.faculty_id = 1

        assert member.department == "cs"
Пример #15
0
    def get(self, faculty_id):
        """ HTTP Get for the faculty resource.

        Currently returns an HTML page, but should instead return the Faculty object as JSON.

        :param faculty_id: The id as is in elasticsearch. This id is defined by the forum data dump.
        :return:HTTP 404 if the given ID does not exist.
                HTTP 200 if the id exists and the GET operation succeeds.
        """
        faculty = Faculty.safe_get(faculty_id)
        if faculty is None:
            abort(404)

        page = render_template("faculty.html", faculty=faculty)
        return make_response(page, 200, {'content-type': 'text/html'})
Пример #16
0
    def get(self):
        """HTTP Get for the faculty list resource.

        Returns a list of faculty members from elasticsearch.
        :param page: URL Parameter for the page to fetch. Default - 0.
        :param results: URL Parameter for the number of results to return per page. Default - 20.
        :return: dict with pagination info and the serialized page of faculty.
        """
        # paginate_query slices the search according to the request's URL params.
        paged_query, pagination_info = paginate_query(request, Faculty.search())
        hits = paged_query.execute()

        serializer = FacultySchema()
        data = [serializer.dump(hit) for hit in hits]

        return {"pagination": pagination_info, "data": data}
Пример #17
0
    def post(self):
        """HTTP Post that launches a named workflow for a faculty member."""
        to_run = request.args.get('run')
        faculty = request.args.get('faculty')

        if not to_run or not faculty:
            abort(400)

        try:
            task_list = TASKLIST[to_run]
        except KeyError:
            # Narrowed from a bare except: only an unknown workflow name is
            # expected here; any other failure should surface as a 500.
            abort(400)

        workflow = Workflow(task_list, Faculty.safe_get(faculty))
        # Run asynchronously so the HTTP request returns immediately.
        run_workflow.apply_async((workflow,), countdown=1)

        return 200
Пример #18
0
    def get(self):
        """HTTP Get that enables boolean query processing and search."""
        raw_query = request.args.get('query')
        if raw_query is None:
            abort(400)

        # Parse to postfix notation, then build the elasticsearch query object.
        postfix = parser.QueryParser().parse_query(raw_query)
        es_query = builder.QueryBuilder().build(postfix)

        hits = Faculty.search().query(es_query).execute()
        serializer = FacultySchema()

        return {"data": [serializer.dump(hit) for hit in hits]}
Пример #19
0
    def get(self):
        """HTTP Get for the faculty list resource.

        Returns a list of faculty members from elasticsearch.
        :param page: URL Parameter for the page to fetch. Default - 0.
        :param results: URL Parameter for the number of results to return per page. Default - 20.
        :return: dict with "pagination" metadata and the "data" page of faculty.
        """
        page = request.args.get("page", default=0, type=int)
        per_page = request.args.get("results", default=20, type=int)

        # Get the slice of data to retrieve
        first = page * per_page
        last = first + per_page

        search = Faculty.search()
        count = search.count()
        query = search[first:last]
        response = query.execute()

        schema = FacultySchema()
        results = [schema.dump(faculty) for faculty in response]

        # Plain boolean expressions instead of `True if ... else False`;
        # renamed `next` so the builtin is not shadowed.
        has_previous = page > 0
        has_next = last < count
        previous_page = page - 1 if has_previous else None
        next_page = page + 1 if has_next else None

        return {
            "pagination": {
                "has_previous": has_previous,
                "has_next": has_next,
                "previous_page": previous_page,
                "current_page": page,
                "next_page": next_page,
            },

            "data": results
        }
Пример #20
0
    def create_results(faculty_with_keywords, dept_filter):
        """ Creates the json representation of a faculty member, including all keywords.

        :param faculty_with_keywords: A dictionary of id's to lists of keywords.
            The keywords are inserted into the faculty object before being dumped to json.
        :param dept_filter: List of string departments to be included in the results. If a
            professor does not belong one of the departments, they are not included.
            All professors are included if the filter is empty.
        :returns: List of JSON objects, each representing a faculty member and keywords.
        """
        schema = FacultySchema()
        results = []
        for faculty_id, keywords in faculty_with_keywords.items():
            faculty = Faculty.safe_get(faculty_id)
            if faculty is None:
                continue
            # An empty filter admits every department.
            if dept_filter and faculty.department not in dept_filter:
                continue

            faculty.generated_keywords = keywords
            results.append(schema.dump(faculty))

        return results
Пример #21
0
        if isinstance(data, Faculty):
            return FacultyNames.validate_name(data.name)

    def run(self, data):
        """Performs a scraping of a faculty members directory page.
        :param data: str or Faculty instance.
        :return: tuple of the faculty name and Scrapp produced by scraping the faculty directory page.
        """
        print("Running {} on {}".format(self.task_name, data))

        # Accept either a raw name or a Faculty model.
        faculty_name = data if isinstance(data, str) else data.name

        directory_url = URLs.build_faculty_url(faculty_name)
        profile_scraper = ScraperFactory.create_scraper(directory_url,
                                                        ScraperType.PROFILE)
        first_scrapp = profile_scraper.get_scrapps()[0]

        return (data, first_scrapp)


if __name__ == "__main__":
    # Manual entry point: open a default elasticsearch connection and ensure
    # the Faculty index/mapping exists before any ad-hoc runs.
    from elasticsearch_dsl import connections
    connections.create_connection()
    Faculty.init()
Пример #22
0
            search_dup = Document.search().query('match', faculty_id=faculty.faculty_id).query("match", source="GoogleScholar")
            search_dup.delete()

            faculty = search_results[0]
            if faculty.google_scholar is not None and "http" in faculty.google_scholar:
                scraper = ScraperFactory.create_scraper(faculty.google_scholar, ScraperType.GOOGLESCHOLAR)
                scrapps = scraper.get_scrapps()
                for scrapp in scrapps:
                    doc = Document()
                    doc.source = "GoogleScholar"
                    doc.faculty_id = faculty.faculty_id
                    doc.text = scrapp.title
                    doc.save()
            else:
                no_text_count += 1
        print("NO TEXT COUNT = ", no_text_count)
        return faculty


if __name__ == "__main__":
    from elasticsearch_dsl import connections
    # Default connection to the local elasticsearch cluster; ensure both
    # index mappings exist before scraping.
    connections.create_connection()
    Faculty.init()
    Document.init()

    # Scrape the GoogleScholar page of every faculty member in the index.
    search = Faculty.search()
    allFaculty = [faculty for faculty in search.scan()]
    task = GoogleScholarPageScrape()
    task.run(allFaculty)
Пример #23
0
    def teardown_class(cls):
        """Remove the seeded faculty fixtures and their keyword documents."""
        for fixture_id in (379, 356):
            Faculty.get(id=fixture_id).delete()

        for fixture_id in (379, 356):
            Keywords.search().query('match', faculty_id=fixture_id).delete()