Example #1
    def update_cache_profiles(self, unified_name_to_profiles: {}):
        """
    Given a unified_profile_name to profiles map, merges the profiles and creates the FK references
    :param unified_name_to_profiles:
    :param unified_name_to_real_name:
    :return:
    """

        sql = self._update_cache_profiles[0]

        # Fire the sql script in a transaction
        with self._engine.begin() as conn:
            log.debug("Updating cache profiles")
            for _, profile_list in unified_name_to_profiles.items():
                # flatten the profile list down to one profile
                reference_profile = None
                """:type : Profile"""

                for profile in profile_list:
                    if reference_profile is None or len(profile.display_name) > len(reference_profile.display_name):
                        reference_profile = profile

                # if we found at least one reference_profile (which we should)
                # add the corresponding sql insert string to the cache_profile_strings array
                if reference_profile is not None:
                    u, r = unify_profile_name(reference_profile.first_name, reference_profile.last_name)
                    b64u = generate_id(u)
                    log.info("inserting %s, %s" % (b64u, sanitize_text(r)))
                    conn.execute(sql, (b64u, sanitize_text(r)))

        log.info("Cache profiles have been updated")
Example #2
    def get_profiles_by_profile_ids_or_field_ids(self, profile_ids: [int], field_ids: [int]):
        """
        Given a list of profile ids and field ids, queries all profiles that belong to the research field
        OR are associated with the profile_ids.
        :param profile_ids:
        :param field_ids:
        :return:
        """
        profile_ids_string = ""
        field_ids_string = ""
        if len(profile_ids) > 0:
            profile_ids_string = "(%s)" % (",".join(map(lambda x: "'%s'" % x, profile_ids)))
        else:
            profile_ids_string = "(NULL)"

        if len(field_ids) > 0:
            field_ids_string = "(%s)" % (",".join(map(lambda x: "'%s'" % x, field_ids)))
        else:
            field_ids_string = "(NULL)"

        query = self._query_profiles_by_profile_ids_or_field_ids[0]
        query = re.sub(':profile_ids', profile_ids_string, query)
        query = re.sub(':field_ids', field_ids_string, query)

        log.info("Querying profiles by profile_ids and field_ids\n"
                 "\t| profile_ids: {profile_ids}\n"
                 "\t| field_ids: {field_ids}\n".format(
            profile_ids=profile_ids_string,
            field_ids=field_ids_string
        ))

        # Fire the sql script in a transaction
        with self._engine.begin() as conn:
            return conn.execute(query).fetchall()
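Both branches above render an ID list as a quoted SQL IN-clause literal, with "(NULL)" as the fallback so that IN (NULL) matches no rows when the list is empty. A hypothetical helper (an illustration, not part of the project) that captures the repeated pattern:

    def to_sql_id_list(ids) -> str:
        # Render ids as a quoted SQL IN-list; "(NULL)" matches nothing when empty.
        if not ids:
            return "(NULL)"
        return "(%s)" % ",".join("'%s'" % i for i in ids)

    # to_sql_id_list([3, 7]) -> "('3','7')"    to_sql_id_list([]) -> "(NULL)"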
Example #3
    def crawl_profiles(self):
        """
        Given a populated members array, this function crawls the profiles linked to the ids as well as the publications
        :return:
        """
        log.debug("Adding members to worker queues")
        for member in self._members:
            self._profile_queue.put(member.profile_id)
            self._profile_documents_queue.put(member.profile_id)

        # Create profile crawlers
        log.debug("Spawning profile workers")
        for i in range(number_profile_workers):
            t = Thread(target=self.profile_worker)
            t.daemon = False
            t.start()

        # Create document crawlers
        log.debug("Spawning document crawlers")
        for i in range(number_document_workers):
            t = Thread(target=self.document_worker)
            t.daemon = False
            t.start()

        # Wait for both queues to complete
        self._profile_queue.join()
        self._profile_documents_queue.join()
        log.info("Profiles and associated documents have been fetched")
Example #4
    def execute(
        self,
        profiles,
        documents,
        unified_name_to_profiles,
        unified_document_title_to_documents,
        unified_field_title_to_field,
        unified_field_title_to_documents,
        unified_name_to_authored_documents,
        unified_name_to_participated_documents,
    ):
        """
    Given the required crawl data updates the whole cache
    :return:
    """

        log.info("Crawl data update has been started")
        self.update_cache_profiles(unified_name_to_profiles)
        self.update_cache_documents(unified_document_title_to_documents)
        self.update_profiles(profiles)
        self.update_documents(documents)
        self.update_cache_fields(unified_field_title_to_field)
        self.link_profiles_to_documents(
            unified_name_to_profiles, unified_name_to_authored_documents, unified_name_to_participated_documents
        )
        self.link_fields_to_documents(unified_field_title_to_documents)
        self.post_update()
        log.info("Crawl data has been updated")
Example #5
    def update_cache_documents(self, unified_document_title_to_documents: {}):
        """
    Given a unified_document_title to documents map, merges the documents and creates the FK references
    :param unified_document_title_to_documents:
    :return:
    """

        sql = self._update_cache_documents[0]

        # Fire the sql script in a transaction
        with self._engine.begin() as conn:
            log.debug("Updating cache documents")
            for _, doc_list in unified_document_title_to_documents.items():
                # flatten the document list down to one document
                reference_doc = None
                """:type : Document"""

                for doc in doc_list:
                    if reference_doc is None or doc.core_last_modified > reference_doc.core_last_modified:
                        reference_doc = doc

                # if we found at least one reference_doc (which we should),
                # add the corresponding sql insert string to the cache_document_strings array
                if reference_doc is not None:
                    u, r = unify_document_title(reference_doc.core_title)
                    b64u = generate_id(u)
                    conn.execute(sql, (b64u, sanitize_text(r)))

        log.info("Cache documents have been updated")
Example #6
    def get_profiles(self):
        log.info('The route GET /profiles/ has been triggered')

        # Default parameters
        profile_ids = ''
        field_ids = ''
        slim = False

        # Set passed query parameters if existing
        if 'profile-ids' in request.args:
            profile_ids = request.args['profile-ids'].split(',')
            log.debug('Query parameter "profile-ids" = %s' % profile_ids)
        if 'field-ids' in request.args:
            field_ids = request.args['field-ids'].split(',')
            log.debug('Query parameter "field_ids" = %s' % field_ids)
        if 'slim' in request.args:
            slim = bool(request.args['slim'])
            log.debug('Query parameter "slim" = %s' % slim)

        # Trigger the respective methods
        profiles = []
        if slim:
            profiles = self._data_controller.api_data.get_profiles_slim()
        else:
            profiles = self._data_controller.api_data.get_profiles_by_profile_ids_or_field_ids(
                profile_ids=profile_ids, field_ids=field_ids)

        # Pattern for cms pages
        page_pattern = self._cache_config.profile_page_pattern

        # Serialize profiles
        response = []
        for profile in profiles:
            profile_dict = dict(profile)

            # names
            name = None
            first_name = None
            last_name = None

            # Get names
            if 'first_name' in profile_dict and 'last_name' in profile_dict:
                first_name = profile_dict['first_name']
                last_name = profile_dict['last_name']
            elif 'name' in profile_dict:
                name = profile_dict['name']
                name_parts = [s.lower() for i, s in enumerate(name.split())]
                first_name = name_parts[0]
                last_name = name_parts[1]

            # If the names are available create the page link
            if first_name is not None and last_name is not None:
                page = page_pattern
                page = re.sub(':firstname', first_name, page)
                page = re.sub(':lastname', last_name, page)
                profile_dict["page"] = page

            response.append(profile_dict)
        return json.dumps(response, cls=DefaultEncoder)
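Note that bool(request.args['slim']) above is True for any non-empty string, including "false" and "0", because bool() only checks for emptiness. If stricter parsing is wanted, a small helper along these lines could be used (a suggestion, not the project's code):

    def parse_bool_arg(raw: str) -> bool:
        # Hypothetical helper: only a small whitelist of values counts as True.
        return raw.strip().lower() in ("1", "true", "yes")

    # slim = parse_bool_arg(request.args['slim'])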
Example #7
 def assert_schema(self):
     if self.is_initialized():
         log.info("Schema is already initialized")
     else:
         log.warning("The current schema is incomplete. Starting migration.")
         # TODO: Backup && Restore as soon as the database has state
         self.drop_all()
         self.run_schema()
Example #8
 def post_update(self):
     """
 Executes all linking steps that are required for the queries
 :return:
 """
     with self._engine.begin() as conn:
         for stmt in self._post_update:
             conn.execute(stmt)
     log.info("Cleanup statements have been executed")
Example #9
    def process_profile_documents(self):
        """
        Iterates over the profile documents, finds research fields, finds duplicates, finds author profiles
        :return:
        """
        for profile_unified in self._unified_name_to_profiles:
            found_docs = []

            profiles = self._unified_name_to_profiles[profile_unified]
            if len(profiles) == 0:
                log.warning("There were no profiles for the unified name %s" %
                            profile_unified)
                continue

            # For each profile linked to that unified name, add the found documents to the list
            for profile in profiles:
                x = self._profile_docs[profile.identifier]
                log.debug(
                    "Used {len_x} documents from id {mendeley_id} for unified name {name}"
                    .format(len_x=len(x),
                            mendeley_id=profile.identifier,
                            name=unify_profile_name(profile.first_name,
                                                    profile.last_name)))
                found_docs += x

            # Process these documents
            for doc in found_docs:
                # Add doc to all docs
                self._documents.append(doc)

                # Create unified document title
                doc_unified, doc_real = unify_document_title(doc.core_title)

                # Add document to docs
                if doc_unified in self._unified_document_title_to_documents:
                    existing_docs = self._unified_document_title_to_documents[
                        doc_unified]
                    existing_docs.append(doc)
                else:
                    self._unified_document_title_to_documents[doc_unified] = [
                        doc
                    ]

                # Append the doc title to the authored_docs of that unified profile name
                authored_docs = self._unified_name_to_authored_documents[
                    profile_unified]
                authored_docs.add(doc_unified)

                # Process core_authors field of the doc to find participants
                for author in doc.core_authors:
                    self.analyze_author(doc_unified, author)

                # Analyze the tags fields of the doc to find research fields
                for tag in doc.tags:
                    self.analyze_field_tag(doc_unified, tag)
        log.info("Profile documents have been analyzed")
Example #10
 def crawl_group_members(self):
     """
     Fetches members of the pre-configured research group
     :return:
     """
     self._members = self._crawler.get_group_members(self._research_group)
     log.debug(
         "{num} group members have been fetched for group_id {group_id}".
         format(num=len(self._members), group_id=self._research_group))
     log.info("Group members have been fetched")
Example #11
 def execute(self):
     """
     Process all input
     :return:
     """
     self.reset()
     self.process_profiles()
     self.process_profile_documents()
     self.process_group_documents()
     log.info("Analysis has been executed")
Example #12
    def get_last_update(self):
        """
        Returns the last entry of the update_log
        :return:
        """
        query = self._query_last_update[0]
        log.info("Querying last update")

        with self._engine.begin() as conn:
            return conn.execute(query).fetchall()
Example #13
 def execute(self):
     """
     Process all input
     :return:
     """
     self.reset()
     self.process_profiles()
     self.process_profile_documents()
     self.process_group_documents()
     log.info("Analysis has been executed")
Example #14
    def update_profiles(self, profiles: [Profile]):
        """
    Given a profile list, this method replaces the profiles in the database with new ones
    :param docs:
    :return:
    """

        def insert_profile(conn: Connection, insert: str, p: Profile):
            u, _ = unify_profile_name(p.first_name, p.last_name)
            b64u = generate_id(u)
            conn.execute(
                insert,
                (
                    sanitize_text(p.identifier),
                    b64u,
                    sanitize_text(p.first_name),
                    sanitize_text(p.last_name),
                    sanitize_text(p.display_name),
                    sanitize_text(p.link),
                ),
            )

        # If there's nothing to insert, abort
        if len(profiles) == 0:
            return None

        delete = self._replace_profiles[0]
        insert = self._replace_profiles[1]
        temp = self._replace_profiles[2]
        temp_insert = self._replace_profiles[3]
        update = self._replace_profiles[4]
        temp_drop = self._replace_profiles[5]

        # Fire the sql script in a transaction
        with self._engine.begin() as conn:
            log.debug("Deleting existing profiles")
            conn.execute(delete)

            log.debug("Inserting new profiles")
            for profile in profiles:
                insert_profile(conn, insert, profile)

            log.debug("Creating temporary table")
            conn.execute(temp)

            log.debug("Spooling data into temporary table")
            conn.execute(temp_insert)

            log.debug("Creating profile links")
            conn.execute(update)

            log.debug("Dropping temporary table")
            conn.execute(temp_drop)

        log.info("Profiles have been updated")
Example #15
    def get_profiles_slim(self):
        """
        Query slim profiles for fast UI auto completion
        :return:
        """
        query = self._query_profiles_slim[0]

        log.info("Querying slim profiles")

        return self._engine.execute(query).fetchall()
Example #16
    def get_fields(self):
        log.info('The route GET /fields/ has been triggered')

        fields = self._data_controller.api_data.get_fields()

        # Serialize fields
        response = []
        for field in fields:
            field_dict = dict(field.items())
            response.append(field_dict)
        return json.dumps(response, cls=DefaultEncoder)
Example #17
    def __init__(self, app_id: str, app_secret: str):
        self._app_id = app_id
        self._app_secret = app_secret
        self._initialized = False
        self._mendeley = Mendeley(app_id, app_secret)
        self._session = None
        """:type : MendeleySession """

        log.info(
            "Intialized SDKCrawler with app_id: {app_id} and app_secret: {app_secret}"
            .format(app_id=app_id, app_secret=app_secret))
Example #18
File: fields.py Project: ankoh/mc-server
    def get_fields(self):
        log.info('The route GET /fields/ has been triggered')

        fields = self._data_controller.api_data.get_fields()

        # Serialize fields
        response = []
        for field in fields:
            field_dict = dict(field.items())
            response.append(field_dict)
        return json.dumps(response, cls=DefaultEncoder)
Example #19
    def get_entities(self):
        """
        Returns the number of elements in each table
        :return:
        """
        query = self._query_entities[0]
        log.info("Querying entity numbers")

        # Fire the sql script in a transaction
        with self._engine.begin() as conn:
            return conn.execute(query).fetchall()
Example #20
 def crawl_group_members(self):
     """
     Fetches members of the pre-configured research group
     :return:
     """
     self._members = self._crawler.get_group_members(self._research_group)
     log.debug("{num} group members have been fetched for group_id {group_id}".format(
         num=len(self._members),
         group_id=self._research_group
     ))
     log.info("Group members have been fetched")
Example #21
 def crawl_group_documents(self):
     """
     Fetches the publications that are associated with the pre-configured group
     :return:
     """
     self._group_documents = self._crawler.get_documents_by_group_id(self._research_group)
     log.debug("{num} documents have been fetched for group_id {group_id}".format(
         num=len(self._group_documents),
         group_id=self._research_group
     ))
     log.info("Group documents have been fetched")
Example #22
 def crawl_group_documents(self):
     """
     Fetches the publications that are associated with the pre-configured group
     :return:
     """
     self._group_documents = self._crawler.get_documents_by_group_id(
         self._research_group)
     log.debug(
         "{num} documents have been fetched for group_id {group_id}".format(
             num=len(self._group_documents), group_id=self._research_group))
     log.info("Group documents have been fetched")
Example #23
    def get_system_entities(self):
        log.info('The route GET /cache/entities has been triggered')
        json_result = dict()

        entities = self._data_controller.api_data.get_entities()

        # Serialize fields
        response = []
        for entity in entities:
            columns = dict(entity.items())
            response.append(columns)
        return json.dumps(response, cls=DefaultEncoder)
Example #24
File: cache.py Project: ankoh/mc-server
    def get_system_entities(self):
        log.info("The route GET /cache/entities has been triggered")
        json_result = dict()

        entities = self._data_controller.api_data.get_entities()

        # Serialize fields
        response = []
        for entity in entities:
            columns = dict(entity.items())
            response.append(columns)
        return json.dumps(response, cls=DefaultEncoder)
Example #25
 def prepare(self, profiles, profile_docs, group_docs):
     """
     Prepare the AnalysisController with data
     :param profiles:
     :param profile_docs
     :param group_docs:
     :return:
     """
     self._profiles = profiles
     self._profile_docs = profile_docs
     self._group_docs = group_docs
     log.info("Analysis has been prepared")
Example #26
 def prepare(self, profiles, profile_docs, group_docs):
     """
     Prepare the AnalysisController with data
     :param profiles:
     :param profile_docs
     :param group_docs:
     :return:
     """
     self._profiles = profiles
     self._profile_docs = profile_docs
     self._group_docs = group_docs
     log.info("Analysis has been prepared")
Example #27
    def get_fields(self):
        """
        Queries all research fields
        :return:
        """
        query = self._query_fields[0]

        log.info("Querying fields")

        # Fire the sql script in a transaction
        with self._engine.begin() as conn:
            return conn.execute(query).fetchall()
Example #28
    def post_update(self):
        log.info('The route POST /cache/update has been triggered')

        # Get remote IP
        remote = get_remote_ip()

        # Trigger the pipeline
        report = self._pipeline_controller.execute(remote)

        # Dump report
        report_dict = dict(report.__dict__)
        json_report = json.dumps(report_dict, cls=DefaultEncoder)
        return json_report
Example #29
    def __init__(self, app_id: str, app_secret: str):
        self._app_id = app_id
        self._app_secret = app_secret
        self._initialized = False
        self._mendeley = Mendeley(app_id, app_secret)
        self._session = None
        """:type : MendeleySession """


        log.info("Intialized SDKCrawler with app_id: {app_id} and app_secret: {app_secret}".format(
            app_id=app_id,
            app_secret=app_secret
        ))
Example #30
File: cache.py Project: ankoh/mc-server
    def post_update(self):
        log.info("The route POST /cache/update has been triggered")

        # Get remote IP
        remote = get_remote_ip()

        # Trigger the pipeline
        report = self._pipeline_controller.execute(remote)

        # Dump report
        report_dict = dict(report.__dict__)
        json_report = json.dumps(report_dict, cls=DefaultEncoder)
        return json_report
Example #31
    def process_profile_documents(self):
        """
        Iterates over the profile documents, finds research fields, finds duplicates, finds author profiles
        :return:
        """
        for profile_unified in self._unified_name_to_profiles:
            found_docs = []

            profiles = self._unified_name_to_profiles[profile_unified]
            if len(profiles) == 0:
                log.warning("There were no profiles for the unified name %s" % profile_unified)
                continue

            # For each profile linked to that unified name, add the found documents to the list
            for profile in profiles:
                x = self._profile_docs[profile.identifier]
                log.debug("Used {len_x} documents from id {mendeley_id} for unified name {name}".format(
                    len_x=len(x),
                    mendeley_id=profile.identifier,
                    name=unify_profile_name(profile.first_name, profile.last_name)
                ))
                found_docs += x

            # Process these documents
            for doc in found_docs:
                # Add doc to all docs
                self._documents.append(doc)

                # Create unified document title
                doc_unified, doc_real = unify_document_title(doc.core_title)

                # Add document to docs
                if doc_unified in self._unified_document_title_to_documents:
                    existing_docs = self._unified_document_title_to_documents[doc_unified]
                    existing_docs.append(doc)
                else:
                    self._unified_document_title_to_documents[doc_unified] = [doc]

                # Append the doc title to the authored_docs of that unified profile name
                authored_docs = self._unified_name_to_authored_documents[profile_unified]
                authored_docs.add(doc_unified)

                # Process core_authors field of the doc to find participants
                for author in doc.core_authors:
                    self.analyze_author(doc_unified, author)

                # Analyze the tags fields of the doc to find research fields
                for tag in doc.tags:
                    self.analyze_field_tag(doc_unified, tag)
        log.info("Profile documents have been analyzed")
Example #32
    def execute(self):
        """
        Sequentially triggers the crawls for members, group_publications and profiles
        :return:
        """

        log.info("Crawler has been started")
        self.reset()
        self._crawler.prepare()
        self.crawl_group_members()
        self.crawl_group_documents()
        self.crawl_profiles()
        self._crawler.destroy()
        self._succeeded = True
        log.info("Crawler has been executed")
Example #33
    def execute(self):
        """
        Sequentially triggers the crawls for members, group_publications and profiles
        :return:
        """

        log.info("Crawler has been started")
        self.reset()
        self._crawler.prepare()
        self.crawl_group_members()
        self.crawl_group_documents()
        self.crawl_profiles()
        self._crawler.destroy()
        self._succeeded = True
        log.info("Crawler has been executed")
Example #34
    def run_schema(self):
        """
        Runs the schema initialization and returns whether it was successful
        """
        schema = []
        if self._config.engine == "sqlite":
            schema = read_sql_statements('sql', 'schema', 'sqlite.sql')
        elif self._config.engine == "mysql":
            schema = read_sql_statements('sql', 'schema', 'mysql.sql')

        with self._engine.begin() as conn:
            for cmd in schema:
                conn.execute(cmd)

        log.info("Schema has been initialized")
Example #35
    def get_system_status(self):
        log.info('The route GET /cache/status has been triggered')

        api_online = remote_is_online("api.mendeley.com", 443)

        json_result = dict()
        json_result["serverVersion"] = self._config.version
        json_result["mendeleyStatus"] = "Online" if api_online else "Offline"
        json_result["lastUpdate"] = "never"

        # Fetch last entry in update_log
        last_update_log = self._data_controller.api_data.get_last_update()
        if len(last_update_log) > 0:
            json_result["lastUpdate"] = last_update_log[0]["dt"]

        return json.dumps(json_result, cls=DefaultEncoder)
Example #36
File: cache.py Project: ankoh/mc-server
    def get_system_status(self):
        log.info("The route GET /cache/status has been triggered")

        api_online = remote_is_online("api.mendeley.com", 443)

        json_result = dict()
        json_result["serverVersion"] = self._config.version
        json_result["mendeleyStatus"] = "Online" if api_online else "Offline"
        json_result["lastUpdate"] = "never"

        # Fetch last entry in update_log
        last_update_log = self._data_controller.api_data.get_last_update()
        if len(last_update_log) > 0:
            json_result["lastUpdate"] = last_update_log[0]["dt"]

        return json.dumps(json_result, cls=DefaultEncoder)
Example #37
    def update_cache_fields(self, unified_field_title_to_field: {}):
        """
    Given a unified_field_title to field map, updates the fields
    :param unified_field_title_to_field:
    :return:
    """

        sql = self._update_cache_fields[0]

        # Fire the sql script in a transaction
        with self._engine.begin() as conn:
            log.debug("Updating cache fields")
            for _, field in unified_field_title_to_field.items():
                b64u = generate_id(field.unified_title)
                conn.execute(sql, (b64u, sanitize_text(field.title)))

        log.info("Cache fields have been updated")
Example #38
 def log_update(self, report, remote_addr: str):
     if remote_addr is None:
         remote_addr = "localhost"
     insert = self._log_update[0]
     with self._engine.begin() as conn:
         conn.execute(
             insert,
             (
                 remote_addr,
                 report.profiles,
                 report.documents,
                 report.unified_profiles,
                 report.unified_documents,
                 report.fields,
                 report.field_links,
             ),
         )
     log.info("Update has been logged for address '%s'" % remote_addr)
Example #39
    def process_group_documents(self):
        """
        Iterates over the group documents, finds research fields, finds duplicates, finds author profiles
        :return:
        """
        for doc in self._group_docs:
            # Add doc to all docs
            self._documents.append(doc)

            # Create unified document title
            doc_unified, doc_real = unify_document_title(doc.core_title)

            # Add document to docs
            if doc_unified in self._unified_document_title_to_documents:
                existing_docs = self._unified_document_title_to_documents[
                    doc_unified]
                existing_docs.append(doc)
            else:
                self._unified_document_title_to_documents[doc_unified] = [doc]

            # Try to find the main owner of the document through the document profile_id
            # If not existent do nothing
            # (we can't do much only with the profile_id.
            # We could post-fetch the unknown profiles but that is more involved)
            profile_id = doc.core_profile_id
            if profile_id in self._profiles:
                profile = self._profiles[profile_id]
                unified_name, real_name = unify_profile_name(
                    profile.first_name, profile.last_name)
                if unified_name in self._unified_name_to_authored_documents:
                    authored_documents = self._unified_name_to_authored_documents[
                        unified_name]
                    authored_documents.add(doc_unified)

            # Process core_authors field of the doc to find participants
            for author in doc.core_authors:
                self.analyze_author(doc_unified, author)

            # Analyze the tags field of the doc to find research fields
            for tag in doc.tags:
                self.analyze_field_tag(doc_unified, tag)
        log.info("Group documents have been analyzed")
Example #40
    def link_fields_to_documents(self, unified_field_title_to_documents: {}):
        """
    Given a unified_field_title to documents map, creates the N:M relations in the database
    :param unified_field_title_to_documents:
    :return:
    """

        # Get the different statements in the sql file
        delete = self._link_fields_to_documents[0]
        insert = self._link_fields_to_documents[1]

        # Fire the sql scripts in a transaction
        with self._engine.begin() as conn:
            log.debug("Deleting previous field -> document links")
            conn.execute(delete)
            log.debug("Inserting new field -> document links")
            for unified_field_title, doc_list in unified_field_title_to_documents.items():
                for doc_unified in doc_list:
                    conn.execute(insert, (generate_id(doc_unified), generate_id(unified_field_title)))

        log.info("Field -> document links have been updated")
Example #41
    def process_profiles(self):
        """
        Iterates over the profiles and finds duplicates
        :return:
        """
        for profile in self._profiles:
            unified, real = unify_profile_name(profile.first_name, profile.last_name)

            # Check if the name is already stored in the profiles
            # Then store the additional profile
            existing_profiles = []
            if unified in self._unified_name_to_profiles:
                existing_profiles = self._unified_name_to_profiles[unified]
            existing_profiles.append(profile)
            self._unified_name_to_profiles[unified] = existing_profiles

            # Store empty entries in documents maps for that profile
            # (then we don't need to check the key every time)
            self._unified_name_to_authored_documents[unified] = set()
            self._unified_name_to_participated_documents[unified] = set()
        log.info("Profiles have been analyzed")
Example #42
    def link_profiles_to_documents(
        self,
        unified_name_to_profiles: {},
        unified_name_to_authored_documents: {},
        unified_name_to_participated_documents: {},
    ):
        """
    Given a unified_profile_name to authored_documents and participated_documents map(s), creates the N:M relations
    in the database
    :param unified_name_to_authored_documents:
    :param unified_name_to_participated_documents:
    :return:
    """

        # Get the different statements in the sql file
        delete = self._link_profiles_to_documents[0]
        insert = self._link_profiles_to_documents[1]

        # Fire the sql scripts in a transaction
        with self._engine.begin() as conn:
            log.debug("Deleting previous profile -> document links")
            conn.execute(delete)

            log.debug("Inserting new profile -> document links")

            for unified_name, doc_list in unified_name_to_authored_documents.items():
                # TODO: if author unknown, ignore for now (Foreign key constraints broken otherwise)
                if unified_name not in unified_name_to_profiles:
                    continue
                for doc_unified in doc_list:
                    conn.execute(insert, (generate_id(unified_name), generate_id(doc_unified)))

            for unified_name, doc_list in unified_name_to_participated_documents.items():
                # TODO: if author unknown, ignore for now (Foreign key constraints broken otherwise)
                if unified_name not in unified_name_to_profiles:
                    continue
                for doc_unified in doc_list:
                    conn.execute(insert, (generate_id(unified_name), generate_id(doc_unified)))

        log.info("Profile -> document links have been updated")
Example #43
    def process_group_documents(self):
        """
        Iterates over the group documents, finds research fields, finds duplicates, finds author profiles
        :return:
        """
        for doc in self._group_docs:
            # Add doc to all docs
            self._documents.append(doc)

            # Create unified document title
            doc_unified, doc_real = unify_document_title(doc.core_title)

            # Add document to docs
            if doc_unified in self._unified_document_title_to_documents:
                existing_docs = self._unified_document_title_to_documents[doc_unified]
                existing_docs.append(doc)
            else:
                self._unified_document_title_to_documents[doc_unified] = [doc]

            # Try to find the main owner of the document through the document profile_id
            # If not existent do nothing
            # (we can't do much only with the profile_id.
            # We could post-fetch the unknown profiles but that is more involved)
            profile_id = doc.core_profile_id
            if profile_id in self._profiles:
                profile = self._profiles[profile_id]
                unified_name, real_name = unify_profile_name(profile.first_name, profile.last_name)
                if unified_name in self._unified_name_to_authored_documents:
                    authored_documents = self._unified_name_to_authored_documents[unified_name]
                    authored_documents.add(doc_unified)

            # Process core_authors field of the doc to find participants
            for author in doc.core_authors:
                self.analyze_author(doc_unified, author)

            # Analyze the tags field of the doc to find research fields
            for tag in doc.tags:
                self.analyze_field_tag(doc_unified, tag)
        log.info("Group documents have been analyzed")
Example #44
    def process_profiles(self):
        """
        Iterates over the profiles and finds duplicates
        :return:
        """
        for profile in self._profiles:
            unified, real = unify_profile_name(profile.first_name,
                                               profile.last_name)

            # Check if the name is already stored in the profiles
            # Then store the additional profile
            existing_profiles = []
            if unified in self._unified_name_to_profiles:
                existing_profiles = self._unified_name_to_profiles[unified]
            existing_profiles.append(profile)
            self._unified_name_to_profiles[unified] = existing_profiles

            # Store empty entries in documents maps for that profile
            # (then we don't need to check the key every time)
            self._unified_name_to_authored_documents[unified] = set()
            self._unified_name_to_participated_documents[unified] = set()
        log.info("Profiles have been analyzed")
Example #45
def create_engine(config: DatabaseConfiguration) -> Engine:
    path = ""
    log_path = path

    if not config.path:
        path = "sqlite://"
        log_path = path
    else:
        path = "sqlite:///{path}".format(
            path=config.path
        )
        log_path = path

    log.info("Creating engine '{engine}' with path {path}".format(
        engine="sqlite",
        path=log_path
    ))

    # create engine
    # Pool recycle:
    # http://stackoverflow.com/questions/26891971/mysql-connection-not-available-when-use-sqlalchemymysql-and-flask
    return sqlalchemy.create_engine(path, encoding="utf-8", pool_recycle=3600)
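A short usage sketch for the factory above. The direct construction of DatabaseConfiguration is assumed for illustration; in the project the configuration object comes from the loaded ServiceConfiguration:

    # Hypothetical usage of create_engine(); attribute assignment is an assumption.
    config = DatabaseConfiguration()
    config.engine = "sqlite"
    config.path = ""                      # empty path -> in-memory "sqlite://" engine
    engine = create_engine(config)

    with engine.begin() as conn:          # transactional connection, as in the snippets above
        conn.execute("SELECT 1")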
Example #46
    def drop_all(self):
        drops = read_sql_statements('sql', 'schema', 'drop_all.sql')

        foreign_key_off = ""
        foreign_key_on = ""

        if self._config.engine == "mysql":
            foreign_key_off = "SET FOREIGN_KEY_CHECKS = 0"
            foreign_key_on = "SET FOREIGN_KEY_CHECKS = 1"
        elif self._config.engine == "sqlite":
            foreign_key_off = "PRAGMA foreign_keys = OFF"
            foreign_key_on = "PRAGMA foreign_keys = ON"

        with self._engine.begin() as conn:
            log.info(foreign_key_off)
            conn.execute(foreign_key_off)
            for drop in drops:
                log.info(drop)
                conn.execute(drop)
            log.info(foreign_key_on)
            conn.execute(foreign_key_on)

        log.info("Database has been dropped")
Example #47
    def get_documents_by_profile_ids_and_field_ids(self,
                                                   profile_ids: [int], field_ids: [int],
                                                   order_attr: str="year", order_dir: str="desc",
                                                   limit: int=0, offset: int=0, only_count: bool=False):
        """
        Given profile ids and field ids, queries all documents that belong to the research field
        AND are associated with these profiles
        :return:
        """

        profile_ids_string = ""
        field_ids_string = ""
        query_limit = 20
        query_offset = 0
        query_order_attr = "pub_year"
        query_order_dir = "ASC"
        if len(profile_ids) > 0:
            profile_ids_string = "(%s)" % (",".join(map(lambda x: "'%s'" % x, profile_ids)))
        else:
            profile_ids_string = "(NULL)"

        if len(field_ids) > 0:
            field_ids_string = "(%s)" % (",".join(map(lambda x: "'%s'" % x, field_ids)))
        else:
            field_ids_string = "(NULL)"

        # Check order attribute parameter
        if order_attr == "year":
            query_order_attr = "d.pub_year"
        elif order_attr == "title":
            query_order_attr = "d.title"
        elif order_attr == "source":
            query_order_attr = "d.source"

        # Check order direction
        if order_dir == "desc":
            query_order_dir = "DESC"
        elif order_dir == "asc":
            query_order_dir = "ASC"

        # Check limit parameter
        if limit > 0:
            query_limit = limit

        # Check offset parameter
        if offset > 0:
            query_offset = offset

        # If no profile_ids and field_ids have been passed, we need to return everything
        # and use the query without the AND ... IN () clauses
        query = ""
        if len(profile_ids) > 0 and len(field_ids) > 0:
            query = self._query_documents_by_profile_ids_and_field_ids[0]
            query = re.sub(':profile_ids', profile_ids_string, query)
            query = re.sub(':field_ids', field_ids_string, query)
        elif len(profile_ids) > 0 and len(field_ids) == 0:
            query = self._query_documents_by_profile_ids[0]
            query = re.sub(':profile_ids', profile_ids_string, query)
        elif len(profile_ids) == 0 and len(field_ids) > 0:
            query = self._query_documents_by_field_ids[0]
            query = re.sub(':field_ids', field_ids_string, query)
        else:
            query = self._query_all_documents[0]

        if only_count:
            select = "SELECT COUNT(DISTINCT cd.id) AS cnt FROM"
            query = re.sub(query_head, select, query)
            query = re.sub('ORDER BY :order_by', '', query)
            query = re.sub('LIMIT :query_limit', '',  query)
        else:
            select = str(
                "DISTINCT "
                "cd.id             AS id,"
                "d.mendeley_id     AS mendeley_id,"
                "d.title           AS title,"
                "d.doc_type        AS doc_type,"
                "d.last_modified   AS last_modified,"
                "d.abstract        AS abstract,"
                "d.source          AS source,"
                "d.pub_year        AS pub_year,"
                "d.authors         AS authors,"
                "d.keywords        AS keywords,"
                "d.tags            AS tags,"
                "d.derived_bibtex  AS derived_bibtex")
            query = re.sub(':select_attributes', select, query)

            # Substitute order_by and query_limit as well
            query = re.sub(':order_by', '{order_attr} {order_dir}'.format(
                order_attr=query_order_attr,
                order_dir=query_order_dir
            ), query)
            query = re.sub(':query_limit', '{offset},{limit}'.format(
                offset=query_offset,
                limit=query_limit
            ), query)

        log.info("Querying documents by profile_ids and field_ids\n"
                 "\t| profile_ids: {profile_ids}\n"
                 "\t| field_ids: {field_ids}\n"
                 "\t| order_attr: {order_attr}\n"
                 "\t| order_dir: {order_dir}\n"
                 "\t| offset: {offset}\n"
                 "\t| limit: {limit}\n"
                 "\t| only_count: {only_count}".format(
            profile_ids=profile_ids_string,
            field_ids=field_ids_string,
            order_attr=query_order_attr,
            order_dir=query_order_dir,
            offset=query_offset,
            limit=query_limit,
            only_count=only_count
        ))
        log.debug("Query: {query}".format(query=query))

        # Fire the sql script in a transaction
        with self._engine.begin() as conn:
            return conn.execute(query).fetchall()
Example #48
    def get_documents(self):
        log.info('The route GET /documents/ has been triggered')

        # Default parameters
        profile_ids = ''
        field_ids = ''
        limit = 0
        offset = 0
        order_dir = ""
        order_attr = ""
        only_count = False

        # Set passed query parameters if existing
        if 'profile-ids' in request.args:
            profile_ids = request.args['profile-ids'].split(',')
            log.debug('Query parameter "profile-ids" = %s' % profile_ids)
        if 'field-ids' in request.args:
            field_ids = request.args['field-ids'].split(',')
            log.debug('Query parameter "field-ids" = %s' % field_ids)
        if 'limit' in request.args:
            limit = int(request.args['limit'])
            log.debug('Query parameter "limit" = %s' % limit)
        if 'offset' in request.args:
            offset = int(request.args['offset'])
            log.debug('Query parameter "offset" = %s' % offset)
        if 'order-dir' in request.args:
            order_dir = request.args['order-dir']
            log.debug('Query parameter "order-dir" = %s' % order_dir)
        if 'order-attr' in request.args:
            order_attr = request.args['order-attr']
            log.debug('Query parameter "order-attr" = %s' % order_attr)
        if 'only-count' in request.args:
            only_count = bool(request.args['only-count'])
            log.debug('Query parameter "only-count" = %s' % only_count)

        # Trigger the respective methods
        data = self._data_controller.api_data.get_documents_by_profile_ids_and_field_ids(
            profile_ids=profile_ids,
            field_ids=field_ids,
            order_attr=order_attr,
            order_dir=order_dir,
            offset=offset,
            limit=limit,
            only_count=only_count)

        if only_count:
            # Extract count
            response = []
            for document in data:
                document_dict = dict(document.items())
                response.append(document_dict)

            if len(response) > 0:
                return json.dumps(response[0], cls=DefaultEncoder)
            else:
                return json.dumps({"cnt": 0}, cls=DefaultEncoder)
        else:
            # Serialize documents
            response = []
            for document in data:
                document_dict = dict(document.items())
                response.append(document_dict)
            return json.dumps(response, cls=DefaultEncoder)
Example #49
    def __init__(self, *args, **kwargs):
        super(MendeleyCache, self).__init__(*args, **kwargs)

        # Read configuration
        self.configuration = ServiceConfiguration()
        self.configuration.load()
        log.info("Configuration has been loaded")

        # Create service controllers
        self.data_controller = DataController(self.configuration.database)
        self.data_controller.assert_schema()
        log.info("Schema has been checked")

        # Create crawler based on configuration
        self.crawler = None
        """:type : AbstractCrawler"""
        if not self.configuration.uses_mendeley:
            log.info("Pipeline uses FileCrawler")
            self.crawler = FileCrawler()
        else:
            from mendeleycache.crawler.sdk_crawler import SDKCrawler

            log.info(
                "Pipeline uses SDKCrawler".format(
                    app_id=self.configuration.crawler.app_id, app_secret=self.configuration.crawler.app_secret
                )
            )
            self.crawler = SDKCrawler(
                app_id=self.configuration.crawler.app_id, app_secret=self.configuration.crawler.app_secret
            )

        # Create the pipeline
        self.crawl_controller = CrawlController(self.crawler, self.configuration.crawler.research_group)
        self.analysis_controller = AnalysisController()
        self.pipeline_controller = PipelineController(
            data_controller=self.data_controller,
            crawl_controller=self.crawl_controller,
            analysis_controller=self.analysis_controller,
        )
        log.info("Pipeline has been initialized")

        # Create the routing controllers
        self.fields_controller = FieldsController(self, self.data_controller)
        self.profiles_controller = ProfilesController(self, self.data_controller, self.configuration.cache)
        self.publications_controller = DocumentsController(self, self.data_controller)
        self.cache_controller = CacheController(
            self, self.data_controller, self.pipeline_controller, self.configuration
        )
        self.root_controller = RootController(self, self.data_controller, self.configuration)

        # Register the routes
        self.register_routes()
        log.info("Routes have been registered")
        log.info("MendeleyCache has been initialized")
Example #50
File: root.py Project: ankoh/mc-server
    def get_root(self):
        log.info('The route GET / has been triggered')

        return "Welcome to the Mendeley Cache"
Example #51
    def get_profiles(self):
        log.info('The route GET /profiles/ has been triggered')

        # Default parameters
        profile_ids = ''
        field_ids = ''
        slim = False

        # Set passed query parameters if existing
        if 'profile-ids' in request.args:
            profile_ids = request.args['profile-ids'].split(',')
            log.debug('Query parameter "profile-ids" = %s' % profile_ids)
        if 'field-ids' in request.args:
            field_ids = request.args['field-ids'].split(',')
            log.debug('Query parameter "field_ids" = %s' % field_ids)
        if 'slim' in request.args:
            slim = bool(request.args['slim'])
            log.debug('Query parameter "slim" = %s' % slim)

        # Trigger the respective methods
        profiles = []
        if slim:
            profiles = self._data_controller.api_data.get_profiles_slim()
        else:
            profiles = self._data_controller.api_data.get_profiles_by_profile_ids_or_field_ids(
                profile_ids=profile_ids,
                field_ids=field_ids
            )

        # Pattern for cms pages
        page_pattern = self._cache_config.profile_page_pattern

        # Serialize profiles
        response = []
        for profile in profiles:
            profile_dict = dict(profile)

            # names
            name = None
            first_name = None
            last_name = None


            # Get names
            if 'first_name' in profile_dict and 'last_name' in profile_dict:
                first_name = profile_dict['first_name']
                last_name = profile_dict['last_name']
            elif 'name' in profile_dict:
                name = profile_dict['name']
                name_parts = [s.lower() for i, s in enumerate(name.split())]
                first_name = name_parts[0]
                last_name = name_parts[1]

            # If the names are available create the page link
            if first_name is not None and last_name is not None:
                page = page_pattern
                page = re.sub(':firstname', first_name, page)
                page = re.sub(':lastname', last_name, page)
                profile_dict["page"] = page

            response.append(profile_dict)
        return json.dumps(response, cls=DefaultEncoder)
Example #52
    def get_root(self):
        log.info('The route GET / has been triggered')

        return "Welcome to the Mendeley Cache"
Example #53
    def __init__(self, *args, **kwargs):
        super(MendeleyCache, self).__init__(*args, **kwargs)

        # Read configuration
        self.configuration = ServiceConfiguration()
        self.configuration.load()
        log.info("Configuration has been loaded")

        # Create service controllers
        self.data_controller = DataController(self.configuration.database)
        self.data_controller.assert_schema()
        log.info("Schema has been checked")

        # Create crawler based on configuration
        self.crawler = None
        """:type : AbstractCrawler"""
        if not self.configuration.uses_mendeley:
            log.info("Pipeline uses FileCrawler")
            self.crawler = FileCrawler()
        else:
            from mendeleycache.crawler.sdk_crawler import SDKCrawler
            log.info("Pipeline uses SDKCrawler".format(
                app_id=self.configuration.crawler.app_id,
                app_secret=self.configuration.crawler.app_secret))
            self.crawler = SDKCrawler(
                app_id=self.configuration.crawler.app_id,
                app_secret=self.configuration.crawler.app_secret)

        # Create the pipeline
        self.crawl_controller = CrawlController(
            self.crawler, self.configuration.crawler.research_group)
        self.analysis_controller = AnalysisController()
        self.pipeline_controller = PipelineController(
            data_controller=self.data_controller,
            crawl_controller=self.crawl_controller,
            analysis_controller=self.analysis_controller)
        log.info("Pipeline has been initialized")

        # Create the routing controllers
        self.fields_controller = FieldsController(self, self.data_controller)
        self.profiles_controller = ProfilesController(self,
                                                      self.data_controller,
                                                      self.configuration.cache)
        self.publications_controller = DocumentsController(
            self, self.data_controller)
        self.cache_controller = CacheController(self, self.data_controller,
                                                self.pipeline_controller,
                                                self.configuration)
        self.root_controller = RootController(self, self.data_controller,
                                              self.configuration)

        # Register the routes
        self.register_routes()
        log.info("Routes have been registered")
        log.info("MendeleyCache has been initialized")
Example #54
from mendeleycache.logging import log
from mendeleycache.utils.files import get_relative_path
from mendeleycache.test.test_pipeline import sample_pipeline
from mendeleycache.test.routes.test_api import sample_api

import unittest
from unittest import TestLoader

import logging
import sys
import os
import json
from pprint import PrettyPrinter

if len(sys.argv) >= 2:
    log.info("Welcome to the MendeleyCache runner")

    command = sys.argv[1]

    # Test runner
    if command == "tests":
        log.info("Disabling non-critical logs for better unittest output")

        # Disable logging for tests
        logging.disable(logging.CRITICAL)

        # Get project root path
        project_root = get_relative_path(".")

        # Prepare
        loader = TestLoader()
Example #55
    def execute(self, addr: str = "localhost"):
        """
        Execute a single run of the pipeline
        This is later scheduled to run e.g. once per day
        :return:
        """

        # Run the crawler
        self._crawl_controller.execute()

        # Crawl results
        profiles = self._crawl_controller.profiles
        profile_docs = self._crawl_controller.profile_documents
        group_docs = self._crawl_controller.group_documents

        # Then pipe the data to the analysis controller
        self._analysis_controller.prepare(profiles, profile_docs, group_docs)
        self._analysis_controller.execute()

        # Analysis results
        documents = self._analysis_controller.documents
        unified_name_to_profiles = self._analysis_controller.unified_name_to_profiles
        unified_document_title_to_documents = self._analysis_controller.unified_document_title_to_documents
        unified_field_title_to_field = self._analysis_controller.unified_field_title_to_field
        unified_field_title_to_documents = self._analysis_controller.unified_field_title_to_documents
        unified_name_to_authored_documents = self._analysis_controller.unified_name_to_authored_documents
        unified_name_to_participated_documents = self._analysis_controller.unified_name_to_participated_documents

        # Then store all the data with the data controller
        self._data_controller.crawl_data.execute(
            profiles=profiles,
            documents=documents,
            unified_name_to_profiles=unified_name_to_profiles,
            unified_document_title_to_documents=
            unified_document_title_to_documents,
            unified_field_title_to_field=unified_field_title_to_field,
            unified_field_title_to_documents=unified_field_title_to_documents,
            unified_name_to_authored_documents=
            unified_name_to_authored_documents,
            unified_name_to_participated_documents=
            unified_name_to_participated_documents)

        # Count field associations
        field_links = 0
        for title, docs in unified_field_title_to_documents.items():
            field_links += len(docs)

        # Generate report
        report = PipelineReport(
            profiles=len(profiles),
            documents=len(documents),
            unified_profiles=len(unified_name_to_profiles),
            unified_documents=len(unified_document_title_to_documents),
            fields=len(unified_field_title_to_field),
            field_links=field_links)

        # Log update in update_log
        self._data_controller.crawl_data.log_update(report=report,
                                                    remote_addr=addr)

        # Log report
        log.info("Pipeline has been executed\n"
                 "\t| found {profiles} profiles\n"
                 "\t| found {documents} documents\n"
                 "\t| found {unified_profiles} unified profile names\n"
                 "\t| found {unified_documents} unified document titles\n"
                 "\t| found {fields} research fields\n"
                 "\t| found {field_links} field links\n".format(
                     profiles=report.profiles,
                     documents=report.documents,
                     unified_profiles=report.unified_profiles,
                     unified_documents=report.unified_documents,
                     fields=report.fields,
                     field_links=report.field_links))

        # Return report
        return report
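Example #28 serializes report.__dict__ directly, so PipelineReport only needs to be a plain attribute container for the counters passed above. A minimal sketch of such a class (the real one lives in the project and may differ):

    class PipelineReport:
        # Sketch only: carries the per-run counters that the pipeline reports.
        def __init__(self, profiles: int, documents: int, unified_profiles: int,
                     unified_documents: int, fields: int, field_links: int):
            self.profiles = profiles
            self.documents = documents
            self.unified_profiles = unified_profiles
            self.unified_documents = unified_documents
            self.fields = fields
            self.field_links = field_links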