예제 #1
0
파일: triples.py 프로젝트: ctsit/m3c-tools
 def find_person(name: str) -> int:
     """Resolve a "First Last" string to a single person, or 0.

     ``name`` is split at its first space; everything after that space is
     treated as the last name. The first matching entry from
     ``db.get_person`` is returned only when all returned matches are the
     same value. In every other situation -- no space in ``name``, zero or
     several distinct matches, or any exception raised along the way -- the
     result is 0.
     """
     try:
         given, family = name.split(' ', 1)
         candidates = list(db.get_person(sup_cur, given, family))
         # Duplicates of one value still count as a single match.
         if len(set(candidates)) != 1:
             return 0
         return candidates[0]
     except Exception:
         # Best-effort lookup: any failure is reported as "not found".
         return 0
예제 #2
0
def add_developers(sup_cur: db.Cursor) -> None:
    """Make sure the authors of tools-related publications exist as people.

    The PMIDs come from ``tools.MetabolomicsToolsWiki.pmids()``. Only those
    for which ``db.get_pubmed_publications`` returns an entry are processed.
    For every author of such a publication:

    * authors lacking a forename or a surname are reported and skipped;
    * a name matching several people is reported and skipped;
    * a name matching exactly one person is reported as found;
    * an unknown name is inserted through ``db.add_person`` (with two empty
      trailing arguments).

    Non-blank affiliations of the found/added authors are printed only;
    this function does not store them.

    Args:
        sup_cur: Cursor handed to the ``db`` helpers.
    """
    pmids = set(tools.MetabolomicsToolsWiki.pmids())
    total = len(pmids)
    if total == 0:
        return

    publications = db.get_pubmed_publications(sup_cur, pmids)
    pmids = pmids.intersection(publications.keys())
    print(f"Found {len(pmids)} of {total} tools-related publications in the "
          "Supplemental database.")
    if not pmids:
        return

    for pmid in pmids:
        for author in parse_author_list(publications[pmid]):
            forename = author.findtext("ForeName", "").strip()
            lastname = author.findtext("LastName", "").strip()

            if not forename:
                print(f"PMID {pmid}: missing forename of author {lastname}")
                continue

            if not lastname:
                print(f"PMID {pmid}: missing surname of author {forename}")
                continue

            matches = list(db.get_person(sup_cur, forename, lastname))
            if len(matches) > 1:
                # Ambiguous name: do not guess which person is meant.
                print(f"PMID {pmid}: WARNING! Found {len(matches)} people "
                      f"named {forename} {lastname}: {matches}")
                continue

            if matches:
                pid = matches[0]
                print(f"PMID {pmid}: found {forename} {lastname}: {pid}")
            else:
                pid = db.add_person(sup_cur, forename, lastname, "", "")
                if not pid:
                    print(f"PMID {pmid}: WARNING failed to add person: "
                          f"{forename} {lastname}")
                    continue
                print(f"PMID {pmid}: added {forename} {lastname}: {pid}")

            for element in author.findall(".//Affiliation"):
                affiliation = (element.text or "").strip()
                if not affiliation:
                    continue
                print(f"PMID {pmid}: affiliation for {forename} {lastname}"
                      f": {affiliation}")
예제 #3
0
 def test_case_insensitive_name_matching(self):
     # An all-lower-case "james bond" is expected to resolve to exactly one
     # person, id 7 (the fixture data lives outside this method).
     found = db.get_person(self.conn.cursor(), "james", "bond", False)
     self.assertListEqual([7], list(found))
예제 #4
0
def get_studies(mwb_cur: db.Cursor, sup_cur: db.Cursor,
                people: Dict[int, Person], orgs: Dict[int, Organization],
                embargoed: List[str]) -> Dict[str, Study]:
    """Build a ``Study`` for each qualifying Workbench study row.

    Rows of ``study`` joined to ``study_status_prod`` with ``status = 1`` are
    read through ``mwb_cur``. Embargoed studies and studies rejected by
    ``is_valid_study`` are skipped. For the rest, the semicolon-separated
    institute / department / laboratory names are resolved against the
    ``organizations`` table (via ``sup_cur``) and the semicolon-separated
    people names through ``db.get_person``.

    Positions are paired up: the i-th institute, department and laboratory
    belong together. When the institute or department list is shorter than
    the longest list, its first entry is reused for parent matching only
    (nothing extra is appended to the study).

    Args:
        mwb_cur: Cursor used for the study query.
        sup_cur: Cursor used for the organization queries and person lookup.
        people: People indexed by the ids ``db.get_person`` yields.
        orgs: Organizations indexed by ``organizations.id``.
        embargoed: Study ids that must be left out.

    Returns:
        The collected studies keyed by study id.

    Raises:
        SystemExit: With status 1 (after printing details) when an institute
            or a person cannot be resolved.
    """

    def select_orgs(org_name: str, org_type: str) -> None:
        # Leaves the matching (id, parent_id) rows pending on sup_cur.
        sup_cur.execute(
            """
            SELECT id, parent_id
            FROM organizations
            WHERE name=%s AND type=%s
            AND withheld = FALSE
        """, (org_name, org_type))

    def pending_parents() -> Dict[int, int]:
        # Drain sup_cur into an {organization id: parent id} mapping.
        return {org_row[0]: org_row[1] for org_row in sup_cur}

    def split_or_empty(raw) -> List[str]:
        # Values without .split() (e.g. None) count as "nothing listed".
        try:
            return [part.strip() for part in raw.split(';')]
        except AttributeError:
            return []

    def abort_unknown_org(study_id: str, org_name: str) -> None:
        print("Error: Organization does not exist.")
        print("Organization for study " + study_id)
        print("Organization name: " + org_name)
        # Non-zero status: a bare sys.exit() would signal success.
        sys.exit(1)

    print("Gathering Workbench Studies")
    studies: Dict[str, Study] = {}
    # Build the lookup set once instead of scanning the list per study.
    embargoed_ids = set(embargoed)
    mwb_cur.execute("""\
        SELECT study.study_id, study.study_title,
               COALESCE(study.study_type, ''),
               COALESCE(study.study_summary, ''), study.submit_date,
               study.project_id, study.last_name, study.first_name,
               study.institute, study.department, study.laboratory
        FROM study, study_status_prod
        WHERE study.study_id = study_status_prod.study_id
          AND study_status_prod.status = 1""")

    for row in mwb_cur:
        submit_date = f"{row[4]}T00:00:00" if row[4] else ""

        study = Study(study_id=row[0].replace('\n', ''),
                      study_title=row[1].replace('\n', '').replace('"', '\\"'),
                      study_type=row[2].replace('\n', ''),
                      summary=row[3].replace('\n', '').replace('"', '\\"'),
                      submit_date=submit_date,
                      project_id=row[5].replace('\n', ''))

        # Exclude embargoed studies.
        if study.study_id in embargoed_ids:
            print(f"Skipping embargoed study: {study.study_id}")
            continue

        # Skip invalid studies
        if not is_valid_study(study):
            continue

        last_names: str = row[6]
        first_names: str = row[7]
        institutes = row[8]
        departments = row[9]
        labs = row[10]

        institute_list = [inst.strip() for inst in institutes.split(';')]
        department_list = split_or_empty(departments)
        lab_list = split_or_empty(labs)
        max_range = max(len(institute_list), len(department_list),
                        len(lab_list))

        # Reset per study so a department resolved for an earlier study can
        # never be used to match this study's laboratories. Within a study
        # the last resolved department deliberately carries over between
        # positions, as before.
        department_id = None

        for i in range(max_range):
            # If there are not enough institutes, default to first
            try:
                select_orgs(institute_list[i], 'institute')
                try:
                    # fetchone() returns None when nothing matched, which
                    # makes the subscript raise TypeError.
                    inst_id = sup_cur.fetchone()[0]
                    study.institutes.append(orgs[inst_id].org_id)
                except TypeError:
                    abort_unknown_org(study.study_id, institute_list[i])
            except IndexError:
                select_orgs(institute_list[0], 'institute')
                inst_id = sup_cur.fetchone()[0]

            # If there are not enough departments, default to first
            if departments:
                try:
                    select_orgs(department_list[i], 'department')
                    try:
                        # Several departments may share a name; keep those
                        # whose parent is the current institute.
                        for dept_id, parent in pending_parents().items():
                            if inst_id == parent:
                                department_id = dept_id
                                study.departments.append(orgs[dept_id].org_id)
                    except TypeError:
                        abort_unknown_org(study.study_id, department_list[i])
                except IndexError:
                    select_orgs(department_list[0], 'department')
                    for dept_id, parent in pending_parents().items():
                        if inst_id == parent:
                            department_id = dept_id

            if labs:
                try:
                    select_orgs(lab_list[i], 'laboratory')
                    try:
                        for lab_id, parent in pending_parents().items():
                            # Without a resolved department there is no
                            # parent to match, so the lab is left out.
                            if (department_id is not None
                                    and department_id == parent):
                                study.labs.append(orgs[lab_id].org_id)
                    except TypeError:
                        abort_unknown_org(study.study_id, lab_list[i])
                except IndexError:
                    pass

        last_name_list = [ln.strip() for ln in last_names.split(';')]
        first_name_list = [fn.strip() for fn in first_names.split(';')]

        # NOTE(review): raises IndexError when fewer first names than last
        # names are listed; unchanged from the previous behaviour.
        for i, last_name in enumerate(last_name_list):
            first_name = first_name_list[i]

            ids = list(db.get_person(sup_cur, first_name, last_name))
            try:
                person_id = ids[0]
                study.runner.append(people[person_id].person_id)
            except (IndexError, KeyError, TypeError):
                print("Error: Person does not exist.")
                print("Runner for study " + study.study_id)
                print("Last name: " + last_name + '.')
                print("First name: " + first_name + '.')
                sys.exit(1)

        studies[study.study_id] = study
    return studies
예제 #5
0
def get_projects(mwb_cur: db.Cursor, sup_cur: db.Cursor,
                 people: Dict[int, Person],
                 orgs: Dict[int, Organization]) -> Mapping[str, Project]:
    """Build a ``Project`` for every row of the Workbench ``project`` table.

    Rows are read through ``mwb_cur``. The semicolon-separated institute /
    department / laboratory names of each project are resolved against the
    ``organizations`` table (via ``sup_cur``) and the semicolon-separated
    people names through ``db.get_person``.

    Positions are paired up: the i-th institute, department and laboratory
    belong together. When the institute or department list is shorter than
    the longest list, its first entry is reused for parent matching only
    (nothing extra is appended to the project).

    Args:
        mwb_cur: Cursor used for the project query.
        sup_cur: Cursor used for the organization queries and person lookup.
        people: People indexed by the ids ``db.get_person`` yields.
        orgs: Organizations indexed by ``organizations.id`` (the code reads
            ``orgs[id].org_id``).

    Returns:
        The collected projects keyed by project id.

    Raises:
        SystemExit: With status 1 (after printing details) when an institute
            or a person cannot be resolved.
    """

    def select_orgs(org_name: str, org_type: str) -> None:
        # Leaves the matching (id, parent_id) rows pending on sup_cur.
        sup_cur.execute(
            """
            SELECT id, parent_id
            FROM organizations
            WHERE name=%s AND type=%s
            AND withheld = FALSE
        """, (org_name, org_type))

    def pending_parents() -> Dict[int, int]:
        # Drain sup_cur into an {organization id: parent id} mapping.
        return {org_row[0]: org_row[1] for org_row in sup_cur}

    def split_or_empty(raw):
        # Values without .split() (e.g. None) count as "nothing listed".
        try:
            return [part.strip() for part in raw.split(';')]
        except AttributeError:
            return []

    def abort_unknown_org(project_id: str, org_name: str) -> None:
        print("Error: Organization does not exist.")
        print("Organization for project " + project_id)
        print("Organization name: " + org_name)
        # Non-zero status: a bare sys.exit() would signal success.
        sys.exit(1)

    print("Gathering Workbench Projects")
    projects = {}
    mwb_cur.execute("""\
        SELECT project_id, project_title, COALESCE(project_type, ''),
               COALESCE(project_summary, ''), COALESCE(doi, ''),
               COALESCE(funding_source, ''),
               last_name, first_name, institute, department, laboratory
          FROM project
    """)
    for row in mwb_cur:
        project = Project(project_id=row[0].replace('\n', ''),
                          project_title=row[1].replace('\n',
                                                       '').replace('"', '\\"'),
                          project_type=row[2].replace('\n', ''),
                          summary=row[3].replace('\n', '').replace('"', '\\"'),
                          doi=row[4].replace('\n', ''),
                          funding_source=row[5].replace('\n', ''))

        last_names: str = row[6]
        first_names: str = row[7]
        institutes = row[8]
        departments = row[9]
        labs = row[10]

        institute_list = [inst.strip() for inst in institutes.split(';')]
        department_list = split_or_empty(departments)
        lab_list = split_or_empty(labs)
        max_range = max(len(institute_list), len(department_list),
                        len(lab_list))

        # Reset per project so a department resolved for an earlier project
        # can never be used to match this project's laboratories. Within a
        # project the last resolved department deliberately carries over
        # between positions, as before.
        department_id = None

        for i in range(max_range):
            # If there are not enough institutes, default to first
            try:
                select_orgs(institute_list[i], 'institute')
                try:
                    # fetchone() returns None when nothing matched, which
                    # makes the subscript raise TypeError.
                    inst_id = sup_cur.fetchone()[0]
                    project.institutes.append(orgs[inst_id].org_id)
                except TypeError:
                    abort_unknown_org(project.project_id, institute_list[i])
            except IndexError:
                select_orgs(institute_list[0], 'institute')
                inst_id = sup_cur.fetchone()[0]

            # If there are not enough departments, default to first
            if departments:
                try:
                    select_orgs(department_list[i], 'department')
                    try:
                        # Several departments may share a name; keep those
                        # whose parent is the current institute.
                        for dept_id, parent in pending_parents().items():
                            if inst_id == parent:
                                department_id = dept_id
                                org_id = orgs[dept_id].org_id
                                project.departments.append(org_id)
                    except TypeError:
                        abort_unknown_org(project.project_id,
                                          department_list[i])
                except IndexError:
                    select_orgs(department_list[0], 'department')
                    for dept_id, parent in pending_parents().items():
                        if inst_id == parent:
                            department_id = dept_id

            if labs:
                try:
                    select_orgs(lab_list[i], 'laboratory')
                    try:
                        for lab_id, parent in pending_parents().items():
                            # Without a resolved department there is no
                            # parent to match, so the lab is left out.
                            if (department_id is not None
                                    and department_id == parent):
                                project.labs.append(orgs[lab_id].org_id)
                    except TypeError:
                        abort_unknown_org(project.project_id, lab_list[i])
                except IndexError:
                    pass

        last_name_list = [ln.strip() for ln in last_names.split(';')]
        first_name_list = [fn.strip() for fn in first_names.split(';')]

        # NOTE(review): raises IndexError when fewer first names than last
        # names are listed; unchanged from the previous behaviour.
        for i, last_name in enumerate(last_name_list):
            first_name = first_name_list[i]
            ids = list(db.get_person(sup_cur, first_name, last_name))
            try:
                person_id = ids[0]
                project.pi.append(people[person_id].person_id)
            except (IndexError, KeyError, TypeError):
                print("Error: Person does not exist.")
                print("PI for project " + project.project_id)
                print("Last name: " + last_name)
                print("First name: " + first_name)
                sys.exit(1)
        projects[project.project_id] = project
    return projects