Пример #1
0
    def handle(self, *args, **options):
        """Parse a pedigree file and add/update its individuals and families in a project.

        Expected options:
            validate_only (bool): if True, parse and validate only; don't write to the db.
            project_id (str): GUID of an existing Project.
            pedigree_file (str): path of the pedigree table to parse.

        Raises:
            CommandError: if the project id is invalid, the pedigree file can't be
                opened, or the pedigree table fails to parse.
        """
        # parse and validate args
        validate_only = options["validate_only"]
        project_guid = options["project_id"]
        pedigree_file_path = options["pedigree_file"]

        # look up project id and validate other args
        try:
            project = Project.objects.get(guid=project_guid)
        except ObjectDoesNotExist:
            raise CommandError("Invalid project id: %(project_guid)s" %
                               locals())

        if pedigree_file_path and not os.path.isfile(pedigree_file_path):
            # BUG FIX: message previously referenced %(pedigree_file)s, which is not
            # a local name, so formatting with locals() raised a KeyError instead of
            # the intended CommandError
            raise CommandError("Can't open pedigree file: %(pedigree_file_path)s" %
                               locals())

        # parse the pedigree file if specified
        input_stream = file_iter(pedigree_file_path)
        json_records, errors, warnings = parse_pedigree_table(
            pedigree_file_path, input_stream)

        if errors:
            for message in errors:
                logger.error(message)
            raise CommandError("Unable to parse %(pedigree_file_path)s" %
                               locals())

        if warnings:
            for message in warnings:
                # Logger.warn is deprecated; warning is the supported spelling
                logger.warning(message)

        if not validate_only:
            add_or_update_individuals_and_families(project, json_records)
Пример #2
0
 def process_records(json_records, filename='ped_file'):
     pedigree_records, errors, ped_warnings = parse_pedigree_table(json_records, filename, user=request.user, project=project)
     if errors:
         raise ErrorsWarningsException(errors, ped_warnings)
     nonlocal warnings
     warnings += ped_warnings
     return pedigree_records
Пример #3
0
def add_individuals_from_pedigree_file(project, pedigree_file_path, validate_only=False):
    """Parse a pedigree file and add/update its individuals and families in `project`.

    Args:
        project: the Project the individuals belong to.
        pedigree_file_path (str): path of the pedigree table to parse.
        validate_only (bool): if True, parse and validate only; don't write to the db.

    Raises:
        CommandError: if the pedigree file can't be opened or fails to parse.
    """
    if pedigree_file_path and not os.path.isfile(pedigree_file_path):
        # BUG FIX: message previously referenced %(pedigree_file)s, which is not a
        # local name, so formatting with locals() raised a KeyError instead of the
        # intended CommandError
        raise CommandError("Can't open pedigree file: %(pedigree_file_path)s" % locals())

    # parse the pedigree file if specified
    input_stream = file_iter(pedigree_file_path)
    json_records, errors, warnings = parse_pedigree_table(pedigree_file_path, input_stream)

    if errors:
        for message in errors:
            logger.error(message)
        raise CommandError("Unable to parse %(pedigree_file_path)s" % locals())

    if warnings:
        for message in warnings:
            # Logger.warn is deprecated; warning is the supported spelling
            logger.warning(message)

    if not validate_only:
        add_or_update_individuals_and_families(project, json_records)
Пример #4
0
    def test_parse_datstat_pedigree_table(self):
        """Parse two DATSTAT survey rows into pedigree records.

        The input is a survey header row plus two responses. Each response is
        expected to expand into a trio — mother (_1), father (_2), proband (_3)
        — in an 'RGP_<FAMILY_ID>' family, with the proband record carrying a
        markdown-formatted 'familyNotes' summary assembled from the answers.
        Row 1 has most questions answered; row 2 leaves most blank/unknown.
        """
        records, errors, warnings = parse_pedigree_table(
            [[
                'DATSTAT_ALTPID', 'FAMILY_ID', 'DDP_CREATED',
                'DDP_LASTUPDATED', 'RELATIONSHIP', 'RELATIONSHIP_SPECIFY',
                'PATIENT_WEBSITE', 'DESCRIPTION', 'CLINICAL_DIAGNOSES',
                'CLINICAL_DIAGNOSES_SPECIFY', 'GENETIC_DIAGNOSES',
                'GENETIC_DIAGNOSES_SPECIFY', 'FIND_OUT.DOCTOR',
                'FIND_OUT_DOCTOR_DETAILS', 'PATIENT_AGE', 'CONDITION_AGE',
                'PATIENT_DECEASED', 'DECEASED_AGE', 'DECEASED_CAUSE',
                'DECEASED_STORED_SAMPLE', 'PATIENT_SEX', 'RACE_LIST',
                'PTETHNICITY', 'DOCTOR_TYPES_LIST', 'DOCTOR_TYPES_SPECIFY',
                'TESTS.NONE', 'TESTS.NOT_SURE', 'TESTS.KARYOTYPE',
                'TESTS.SINGLE_GENE_TESTING', 'TESTS.GENE_PANEL_TESTING',
                'TESTS.MITOCHON_GENOME_SEQUENCING', 'TESTS.MICROARRAY',
                'TESTS_MICROARRAY_YEAR', 'TESTS_MICROARRAY_LAB',
                'TESTS_MICROARRAY_RELATIVE_LIST',
                'TESTS_MICROARRAY_RELATIVE_SPEC', 'TESTS.WEXOME_SEQUENCING',
                'TESTS_WEXOME_SEQUENCING_YEAR', 'TESTS_WEXOME_SEQUENCING_LAB',
                'TESTS_WEXOME_SEQUENCING_REL_LI',
                'TESTS_WEXOME_SEQUENCING_REL_SP', 'TESTS.WGENOME_SEQUENCING',
                'TESTS_WGENOME_SEQUENCING_YEAR',
                'TESTS_WGENOME_SEQUENCING_LAB',
                'TESTS_WGENOME_SEQUENCING_REL_L',
                'TESTS_WGENOME_SEQUENCING_REL_S', 'TESTS.OTHER',
                'TEST_OTHER_SPECIFY', 'BIOPSY.NONE', 'BIOPSY', 'BIOPSY.OTHER',
                'BIOPSY_OTHER_SPECIFY', 'OTHER_GENETIC_STUDIES',
                'OTHER_GENETIC_STUDIES_SPECIFY', 'EXPECTING_GENETIC_RESULTS',
                'SAME_CONDITION_MOM', 'CONDITION_AGE_MOM',
                'ABLE_TO_PARTICIPATE_MOM', 'DECEASED_MOM', 'STORED_DNA_MOM',
                'SAME_CONDITION_DAD', 'CONDITION_AGE_DAD',
                'ABLE_TO_PARTICIPATE_DAD', 'DECEASED_DAD', 'STORED_DNA_DAD',
                'NO_SIBLINGS', 'SIBLING_LIST', 'NO_CHILDREN', 'CHILD_LIST',
                'NO_RELATIVE_AFFECTED', 'RELATIVE_LIST', 'FAMILY_INFO'
            ],
             [
                 '1518231365', '123', '2019-07-31T03:54:21UTC',
                 '2019-08-01T14:12:40UTC', '6', 'Grandchild',
                 'wwww.myblog.com',
                 'I have a really debilitating probably genetic condition. I\xe2ve seen many specialists.',
                 '1', 'SMA\xe2s', '1', 'Dwarfism\xe2', '1', 'Dr John Smith',
                 '34', '21', '1', '33', 'heart attack', '2', '1',
                 '["White","Asian","Pacific"]', '2',
                 '["ClinGen","Neurologist","Cardiologist","Other"]',
                 'Pediatrician', '0', '0', '0', '1', '1', '0', '0', '', '', '',
                 '', '1', '2018', 'UDN\xe2s lab',
                 '["Parent","AuntUncle","NieceNephew","Other"]', 'Grandmother',
                 '1', '', '', '', 'Grandmother', '1', 'Blood work', '0',
                 'MUSCLE,SKIN,OTHER: Muscle Biopsy, Skin Biopsy, Other Tissue Biopsy',
                 '1', 'Bone\xe2s', '1', 'Undiagnosed Diseases Network', '2',
                 '1', '19', '1', '', '', '2', '', '', '1', '2', '0',
                 '[{"sex":"Female","age":"21","races":["White"],"ethnicity":"NonHispanic","sameCondition":"Yes","ageOnsetCondition":null,"ableToParticipate":"No","siblingId":"d18b9f4b-0995-45e9-9b00-e710d0004a3f"},{"sex":"","age":"17","races":["White"],"ethnicity":"NonHispanic","sameCondition":"","ageOnsetCondition":null,"ableToParticipate":"Yes","siblingId":"3ddc9015-3c2c-484c-b1de-502ba9ffc1e4"}]',
                 '1', '', '0',
                 '[{"sex":"Male","age":"44","races":["White"],"ethnicity":"NonHispanic","sameCondition":"No","ageOnsetCondition":null,"ableToParticipate":null,"siblingId":"bb87c69f-6c52-48b4-8854-e639d998abe7"}]',
                 'patient\xe2s uncle (dads brother) died from Fahrs disease at 70'
             ],
             [
                 'b392fd78b440', '987', '2019-08-06T14:30:44UTC',
                 '2019-08-06T15:18:48UTC', '8', 'Grandchild', '', '', '3',
                 'SMA', '2', 'Dwarfism', '0', 'Dr John Smith', '47', '2', '0',
                 '33', 'heart attack', '2', '3', '["White"]', '3', '[]',
                 'Pediatrician', '0', '1', '0', '1', '1', '0', '0', '', '', '',
                 '', '1', '2018', 'UDN',
                 '["Parent","AuntUncle","NieceNephew","Other"]', 'Grandmother',
                 '1', '', '', '', 'Grandmother', '1', 'Blood work', '1',
                 'NONE: This individual hasn\'t had a biopsy', '1', 'Bone',
                 '0', 'Undiagnosed Diseases Network', '2', '3', '19', '2', '3',
                 '', '', '', '', '', '1', '1',
                 '[{"sex":"Female","age":"21","races":["White"],"ethnicity":"NonHispanic","sameCondition":"Yes","ageOnsetCondition":null,"ableToParticipate":"No","siblingId":"d18b9f4b-0995-45e9-9b00-e710d0004a3f"},{"sex":"","age":"17","races":["White"],"ethnicity":"NonHispanic","sameCondition":"","ageOnsetCondition":null,"ableToParticipate":"Yes","siblingId":"3ddc9015-3c2c-484c-b1de-502ba9ffc1e4"}]',
                 '0: No',
                 '[{"sex":"Male","age":"12","races":["White"],"ethnicity":"NonHispanic","sameCondition":"No","ageOnsetCondition":null,"ableToParticipate":"Unsure","siblingId":"bb87c69f-6c52-48b4-8854-e639d998abe7"}]',
                 '1', '', ''
             ]], FILENAME)

        # Expected markdown family notes for the first survey row (fully answered;
        # mojibake \xe2 characters in the input are stripped from the output)
        note_1 = """#### Clinical Information
      __Patient is my:__ Grandchild (male)
      __Current Age:__ Patient is deceased, age 33, due to heart attack, sample not available
      __Age of Onset:__ 21
      __Race/Ethnicity:__ White, Asian, Pacific; Not Hispanic
      __Case Description:__ I have a really debilitating probably genetic condition. Ive seen many specialists.
      __Clinical Diagnoses:__ Yes; SMAs
      __Genetic Diagnoses:__ Yes; Dwarfism
      __Website/Blog:__ Yes
      __Additional Information:__ patients uncle (dads brother) died from Fahrs disease at 70
#### Prior Testing
      __Referring Physician:__ Dr John Smith
      __Doctors Seen:__ Clinical geneticist, Neurologist, Cardiologist, Other: Pediatrician
      __Previous Testing:__ Yes;
          Single gene testing
          Gene panel testing
          Whole exome sequencing. Year: 2018, Lab: UDNs lab, Relatives: Parent, Aunt or Uncle, Niece or Nephew, Other: Grandmother
          Whole genome sequencing. Year: unspecified, Lab: unspecified, Relatives: not specified
          Other tests: Blood work
      __Biopsies Available:__ Muscle Biopsy, Skin Biopsy, Other Tissue Biopsy: Bones
      __Other Research Studies:__ Yes, Name of studies: Undiagnosed Diseases Network, Expecting results: No
#### Family Information
      __Mother:__ affected, onset age 19, available
      __Father:__ unaffected, unavailable, deceased, sample not available
      __Siblings:__ 
          Sister, age 21, affected, unavailable
          Sibling (unspecified sex), age 17, unspecified affected status, available
      __Children:__ None
      __Relatives:__ 
          Male, age 44, affected, unspecified availability"""

        # Expected notes for the second row, which leaves most questions blank/unknown
        note_2 = """#### Clinical Information
      __Patient is my:__ Adult Child (unspecified sex) - unable to provide consent
      __Current Age:__ 47
      __Age of Onset:__ 2
      __Race/Ethnicity:__ White; Unknown
      __Case Description:__ 
      __Clinical Diagnoses:__ Unknown/Unsure
      __Genetic Diagnoses:__ No
      __Website/Blog:__ No
      __Additional Information:__ None specified
#### Prior Testing
      __Referring Physician:__ None
      __Doctors Seen:__ 
      __Previous Testing:__ Not sure
      __Biopsies Available:__ None
      __Other Research Studies:__ No
#### Family Information
      __Mother:__ unknown affected status, unavailable, unknown deceased status
      __Father:__ unknown affected status, unavailable, unspecified deceased status
      __Siblings:__ None
      __Children:__ 
          Son, age 12, unaffected, unspecified availability"""
        # Each survey row yields mother (_1), father (_2) and proband (_3) records;
        # only the proband carries sex/affected data and the familyNotes summary
        self.assertListEqual(records, [
            {
                'familyId': 'RGP_123',
                'individualId': 'RGP_123_1',
                'sex': 'F',
                'affected': 'N'
            },
            {
                'familyId': 'RGP_123',
                'individualId': 'RGP_123_2',
                'sex': 'M',
                'affected': 'N'
            },
            {
                'familyId': 'RGP_123',
                'individualId': 'RGP_123_3',
                'sex': 'M',
                'affected': 'A',
                'maternalId': 'RGP_123_1',
                'paternalId': 'RGP_123_2',
                'familyNotes': note_1
            },
            {
                'familyId': 'RGP_987',
                'individualId': 'RGP_987_1',
                'sex': 'F',
                'affected': 'N'
            },
            {
                'familyId': 'RGP_987',
                'individualId': 'RGP_987_2',
                'sex': 'M',
                'affected': 'N'
            },
            {
                'familyId': 'RGP_987',
                'individualId': 'RGP_987_3',
                'sex': 'U',
                'affected': 'A',
                'maternalId': 'RGP_987_1',
                'paternalId': 'RGP_987_2',
                'familyNotes': note_2
            },
        ])
        self.assertListEqual(errors, [])
        self.assertListEqual(warnings, [])
Пример #5
0
    def test_parse_pedigree_table(self):
        """Exercise parse_pedigree_table error, warning, and success paths.

        Covers: column-count mismatch, missing family/individual ids, invalid
        sex / affected-status / proband-relationship values, cross-record
        consistency errors, and a successful parse with '#'-prefixed headers,
        leading comment rows, and previous-id / coded-phenotype columns.
        """
        # Data row with fewer columns than the header -> parse error, no records
        records, errors, warnings = parse_pedigree_table(
            [['family_id', 'individual_id', 'sex', 'affected'],
             ['fam1', 'ind1', 'male']], FILENAME)
        self.assertListEqual(records, [])
        self.assertListEqual(errors, [
            'Error while parsing file: {}. Row 1 contains 3 columns: fam1, ind1, male, while header contains 4: family_id, individual_id, sex, affected'
            .format(FILENAME)
        ])
        self.assertListEqual(warnings, [])
        # Missing family id -> conversion error (only the first message line is
        # stable, so just the prefix is compared)
        records, errors, warnings = parse_pedigree_table([[
            'family_id', 'individual_id', 'sex', 'affected', 'father', 'mother'
        ], ['', '', 'male', 'u', '.', 'ind2']], FILENAME)
        self.assertListEqual(records, [])
        self.assertEqual(len(errors), 1)
        self.assertEqual(
            errors[0].split('\n')[0],
            "Error while converting {} rows to json: Family Id not specified in row #1:"
            .format(FILENAME))
        self.assertListEqual(warnings, [])

        # Missing individual id -> conversion error
        records, errors, warnings = parse_pedigree_table([[
            'family_id', 'individual_id', 'sex', 'affected', 'father', 'mother'
        ], ['fam1', '', 'male', 'u', '.', 'ind2']], FILENAME)
        self.assertListEqual(records, [])
        self.assertEqual(len(errors), 1)
        self.assertEqual(
            errors[0].split('\n')[0],
            "Error while converting {} rows to json: Individual Id not specified in row #1:"
            .format(FILENAME))
        self.assertListEqual(warnings, [])

        # Invalid sex value
        records, errors, warnings = parse_pedigree_table([[
            'family_id', 'individual_id', 'sex', 'affected', 'father', 'mother'
        ], ['fam1', 'ind1', 'boy', 'u', '.', 'ind2']], FILENAME)
        self.assertListEqual(records, [])
        self.assertListEqual(errors, [
            "Error while converting {} rows to json: Invalid value 'boy' for sex in row #1"
            .format(FILENAME)
        ])
        self.assertListEqual(warnings, [])

        # Invalid affected-status value
        records, errors, warnings = parse_pedigree_table([[
            'family_id', 'individual_id', 'sex', 'affected', 'father', 'mother'
        ], ['fam1', 'ind1', 'male', 'no', '.', 'ind2']], FILENAME)
        self.assertListEqual(records, [])
        self.assertListEqual(errors, [
            "Error while converting {} rows to json: Invalid value 'no' for affected status in row #1"
            .format(FILENAME)
        ])

        # Invalid proband-relationship value ('mom' is not an accepted term)
        records, errors, warnings = parse_pedigree_table([[
            'family_id', 'individual_id', 'sex', 'affected', 'father',
            'mother', 'proband_relation'
        ], ['fam1', 'ind1', 'male', 'aff.', 'ind3', 'ind2', 'mom']], FILENAME)
        self.assertListEqual(records, [])
        self.assertListEqual(errors, [
            'Error while converting {} rows to json: Invalid value "mom" for proband relationship in row #1'
            .format(FILENAME)
        ])

        # Individually valid rows that are mutually inconsistent: records are
        # still returned, but cross-record checks report errors (sex/relationship
        # mismatch, cross-family parent) and a warning for the unlisted father
        records, errors, warnings = parse_pedigree_table([[
            'family_id', 'individual_id', 'sex', 'affected', 'father',
            'mother', 'proband_relation'
        ], ['fam1', 'ind1', 'male', 'aff.', 'ind3', 'ind2', 'mother'
            ], ['fam2', 'ind2', 'male', 'unknown', '.', '', '']], FILENAME)
        self.assertListEqual(records, [
            {
                'familyId': 'fam1',
                'individualId': 'ind1',
                'sex': 'M',
                'affected': 'A',
                'paternalId': 'ind3',
                'maternalId': 'ind2',
                'probandRelationship': 'M'
            },
            {
                'familyId': 'fam2',
                'individualId': 'ind2',
                'sex': 'M',
                'affected': 'U',
                'paternalId': '',
                'maternalId': '',
                'probandRelationship': ''
            },
        ])
        self.assertListEqual(errors, [
            'Invalid proband relationship "Mother" for ind1 with given gender Male',
            'ind2 is recorded as Male and also as the mother of ind1',
            'ind2 is recorded as the mother of ind1 but they have different family ids: fam2 and fam1',
        ])
        self.assertListEqual(warnings, [
            "ind3 is the father of ind1 but doesn't have a separate record in the table"
        ])

        # Successful parse: leading comment rows are skipped, '#'-prefixed header
        # names are accepted, and optional columns (previous id, notes, coded
        # phenotype, proband relation) are carried through; 'other_data' is dropped
        records, errors, warnings = parse_pedigree_table(
            [['A pedigree file'], ['# Some comments'],
             [
                 '#family_id', '#individual_id', 'previous_individual_id',
                 'notes_for_import', 'other_data', 'sex', 'affected', 'father',
                 'mother', 'phenotype: coded', 'proband_relation'
             ],
             [
                 'fam1', 'ind1', 'ind1_old_id', 'some notes',
                 'some more notes', 'male', 'aff.', '.', 'ind2', 'HPO:12345',
                 ''
             ],
             [
                 'fam1', 'ind2', '', ' ', '', 'female', 'u', '.', '',
                 'HPO:56789', 'mother'
             ]], FILENAME)
        self.assertListEqual(records, [
            {
                'familyId': 'fam1',
                'individualId': 'ind1',
                'sex': 'M',
                'affected': 'A',
                'paternalId': '',
                'maternalId': 'ind2',
                'notes': 'some notes',
                'codedPhenotype': 'HPO:12345',
                'probandRelationship': '',
                'previousIndividualId': 'ind1_old_id'
            },
            {
                'familyId': 'fam1',
                'individualId': 'ind2',
                'sex': 'F',
                'affected': 'N',
                'paternalId': '',
                'maternalId': '',
                'notes': '',
                'codedPhenotype': 'HPO:56789',
                'probandRelationship': 'M',
                'previousIndividualId': ''
            },
        ])
        self.assertListEqual(errors, [])
        self.assertListEqual(warnings, [])
Пример #6
0
    def test_parse_sample_manifest(self, mock_email):
        """Parse a 3-header-row merged sample-manifest/pedigree spreadsheet.

        Covers: header-mismatch errors on rows 2 and 3, a successful parse with
        a user and project, and the notification email (body, html alternative,
        and the two xlsx attachments: the GP-format sample manifest and a copy
        of the original upload).

        Args:
            mock_email: mocked email-constructor — presumably injected by a
                @mock.patch decorator outside this view; confirm against the
                test class.
        """
        # The three expected header rows of the merged manifest format
        header_1 = [
            'Do not modify - Broad use', '', '',
            'Please fill in columns D - O', '', '', '', '', '', '', '', '', '',
            '', ''
        ]
        header_2 = [
            'Kit ID', 'Well', 'Sample ID', 'Family ID', 'Alias', 'Alias',
            'Paternal Sample ID', 'Maternal Sample ID', 'Gender',
            'Affected Status', 'Volume', 'Concentration', 'Notes',
            'Coded Phenotype', 'Data Use Restrictions'
        ]
        header_3 = [
            '', 'Position', '', '', 'Collaborator Participant ID',
            'Collaborator Sample ID', '', '', '', '', 'ul', 'ng/ul', '', '',
            'indicate study/protocol number'
        ]

        # Second header row with columns missing -> expected-vs-actual diff error
        records, errors, warnings = parse_pedigree_table([
            header_1,
            [
                'Kit ID', 'Well', 'Sample ID', 'Family ID', 'Alias',
                'Maternal Sample ID', 'Gender', 'Affected Status', 'Volume',
                'Concentration', 'Notes', 'Coded Phenotype',
                'Data Use Restrictions'
            ],
            header_3,
        ], FILENAME)
        self.assertListEqual(errors, [
            'Error while parsing file: {}. Expected vs. actual header columns: | Sample ID| Family ID| Alias|-Alias|-Paternal Sample ID| Maternal Sample ID| Gender| Affected Status'
            .format(FILENAME)
        ])
        self.assertListEqual(warnings, [])
        self.assertListEqual(records, [])

        # Third header row with a column missing -> same style of diff error
        records, errors, warnings = parse_pedigree_table([
            header_1, header_2,
            [
                '', 'Position', '', '', 'Collaborator Sample ID', '', '', '',
                '', 'ul', 'ng/ul', '', '', 'indicate study/protocol number'
            ]
        ], FILENAME)
        self.assertListEqual(errors, [
            'Error while parsing file: {}. Expected vs. actual header columns: |-Collaborator Participant ID| Collaborator Sample ID|+'
            .format(FILENAME)
        ])
        self.assertListEqual(warnings, [])
        self.assertListEqual(records, [])

        # A well-formed manifest: two data rows (an unaffected father, an
        # affected daughter referencing both parents)
        original_data = [
            header_1, header_2, header_3,
            [
                'SK-3QVD', 'A02', 'SM-IRW6C', 'PED073', 'SCO_PED073B_GA0339',
                'SCO_PED073B_GA0339_1', '', '', 'male', 'unaffected', '20',
                '94.8', 'probably dad', '', '1234'
            ],
            [
                'SK-3QVD', 'A03', 'SM-IRW69', 'PED073', 'SCO_PED073C_GA0340',
                'SCO_PED073C_GA0340_1', 'SCO_PED073B_GA0339_1',
                'SCO_PED073A_GA0338_1', 'female', 'affected', '20', '98', '',
                'Perinatal death', ''
            ]
        ]

        records, errors, warnings = parse_pedigree_table(
            original_data,
            FILENAME,
            user=User.objects.get(id=10),
            project=Project.objects.get(id=1))
        self.assertListEqual(records, [{
            'affected': 'N',
            'maternalId': '',
            'notes': 'probably dad',
            'individualId': 'SCO_PED073B_GA0339_1',
            'sex': 'M',
            'familyId': 'PED073',
            'paternalId': '',
            'codedPhenotype': ''
        }, {
            'affected': 'A',
            'maternalId': 'SCO_PED073A_GA0338_1',
            'notes': '',
            'individualId': 'SCO_PED073C_GA0340_1',
            'sex': 'F',
            'familyId': 'PED073',
            'paternalId': 'SCO_PED073B_GA0339_1',
            'codedPhenotype': 'Perinatal death'
        }])
        self.assertListEqual(warnings, [
            "SCO_PED073A_GA0338_1 is the mother of SCO_PED073C_GA0340_1 but doesn't have a separate record in the table"
        ])
        self.assertListEqual(errors, [])

        # Verify the notification email: subject from the kit id, two xlsx
        # attachments, plain-text body, and html alternative
        mock_email.assert_called_with(
            subject='SK-3QVD Merged Sample Pedigree File',
            body=mock.ANY,
            to=['*****@*****.**'],
            attachments=[
                ('SK-3QVD.xlsx', mock.ANY,
                 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
                 ),
                ('test.xlsx', mock.ANY,
                 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
                 ),
            ])
        self.assertEqual(
            mock_email.call_args.kwargs['body'],
            """User [email protected] just uploaded pedigree info to 1kg project n\xe5me with uni\xe7\xf8de.This email has 2 attached files:
    
    SK-3QVD.xlsx is the sample manifest file in a format that can be sent to GP.
    
    test.csv is the original merged pedigree-sample-manifest file that the user uploaded.
    """)
        mock_email.return_value.attach_alternative.assert_called_with(
            """User [email protected] just uploaded pedigree info to 1kg project n\xe5me with uni\xe7\xf8de.<br />This email has 2 attached files:<br />
    <br />
    <b>SK-3QVD.xlsx</b> is the sample manifest file in a format that can be sent to GP.<br />
    <br />
    <b>test.csv</b> is the original merged pedigree-sample-manifest file that the user uploaded.<br />
    """, 'text/html')
        mock_email.return_value.send.assert_called()

        # Test sent sample manifest is correct
        sample_wb = load_workbook(
            BytesIO(mock_email.call_args.kwargs['attachments'][0][1]))
        sample_ws = sample_wb.active
        # NOTE(review): this assigns the sheet title rather than asserting it —
        # looks unintended; confirm whether an assertEqual was meant here
        sample_ws.title = 'Sample Info'
        self.assertListEqual(
            [[cell.value or '' for cell in row] for row in sample_ws],
            [[
                'Well', 'Sample ID', 'Alias', 'Alias', 'Gender', 'Volume',
                'Concentration'
            ],
             [
                 'Position', '', 'Collaborator Participant ID',
                 'Collaborator Sample ID', '', 'ul', 'ng/ul'
             ],
             [
                 'A02', 'SM-IRW6C', 'SCO_PED073B_GA0339',
                 'SCO_PED073B_GA0339_1', 'male', '20', '94.8'
             ],
             [
                 'A03', 'SM-IRW69', 'SCO_PED073C_GA0340',
                 'SCO_PED073C_GA0340_1', 'female', '20', '98'
             ]])

        # Test original file copy is correct
        original_wb = load_workbook(
            BytesIO(mock_email.call_args.kwargs['attachments'][1][1]))
        original_ws = original_wb.active
        self.assertListEqual([[cell.value or '' for cell in row]
                              for row in original_ws], original_data)
Пример #7
0
def create_project_from_workspace(request, namespace, name):
    """Create a seqr project backed by an AnVIL workspace.

    :param request: Django request object
    :param namespace: The namespace (or the billing account) of the workspace
    :param name: The name of the workspace. It also be used as the project name
    :return the projectsByGuid with the new project json
    """
    # Validate that the current user has logged in through google and has sufficient permissions
    workspace_meta = check_workspace_perm(
        request.user, CAN_EDIT, namespace, name, can_share=True, meta_fields=['workspace.bucketName'])

    # Refuse to create a duplicate project for the same workspace
    existing_projects = Project.objects.filter(workspace_namespace=namespace, workspace_name=name)
    if existing_projects:
        error = 'Project "{}" for workspace "{}/{}" exists.'.format(existing_projects.first().name, namespace, name)
        return create_json_response({'error': error}, status=400, reason=error)

    # Validate all the user inputs from the post body
    request_json = json.loads(request.body)

    required_fields = ['genomeVersion', 'uploadedFileId', 'dataPath']
    missing_fields = [field for field in required_fields if not request_json.get(field)]
    if missing_fields:
        error = 'Field(s) "{}" are required'.format(', '.join(missing_fields))
        return create_json_response({'error': error}, status=400, reason=error)

    if not request_json.get('agreeSeqrAccess'):
        error = 'Must agree to grant seqr access to the data in the associated workspace.'
        return create_json_response({'error': error}, status=400, reason=error)

    # Add the seqr service account to the corresponding AnVIL workspace, and
    # wait for the grant to propagate when an account was actually added
    if add_service_account(request.user, namespace, name):
        _wait_for_service_account_access(request.user, namespace, name)

    # Validate the data path
    bucket = workspace_meta['workspace']['bucketName'].rstrip('/')
    data_path = 'gs://{bucket}/{path}'.format(bucket=bucket, path=request_json['dataPath'].lstrip('/'))
    if not does_file_exist(data_path):
        error = 'Data file or path {} is not found.'.format(request_json['dataPath'])
        return create_json_response({'error': error}, status=400, reason=error)

    # Parse families/individuals in the uploaded pedigree file; warnings are
    # treated as fatal for this endpoint
    uploaded_records = load_uploaded_file(request_json['uploadedFileId'])
    pedigree_records, errors, ped_warnings = parse_pedigree_table(
        uploaded_records, 'uploaded pedigree file', user=request.user)
    errors += ped_warnings
    if errors:
        return create_json_response({'errors': errors}, status=400)

    # Create a new Project in seqr
    project = create_model_from_json(Project, {
        'name': name,
        'genome_version': request_json['genomeVersion'],
        'description': request_json.get('description', ''),
        'workspace_namespace': namespace,
        'workspace_name': name,
    }, user=request.user)

    # add families and individuals according to the uploaded individual records
    _, updated_individuals = add_or_update_individuals_and_families(
        project, individual_records=pedigree_records, user=request.user)

    # Send an email to all seqr data managers; a failed email must not fail the request
    try:
        _send_load_data_email(project, updated_individuals, data_path, request.user)
    except Exception as ee:
        message = 'Exception while sending email to user {}. {}'.format(request.user, str(ee))
        logger.error(message)

    return create_json_response({'projectGuid': project.guid})
Пример #8
0
    def handle(self, *args, **options):
        """Load a VCF-backed elasticsearch variant-call dataset for a project.

        Steps: parse/validate options, optionally import a pedigree file,
        validate the VCF and collect its sample ids (optionally remapped via a
        2-column tab-delimited file), match them to Sample records, optionally
        export a pedigree template for unmatched ids, then get/create the
        Dataset record and link the matched samples to it.

        Raises:
            CommandError: invalid project id, unreadable pedigree file, or
                unparsable pedigree table.
            ValueError: missing or malformed sample-id remap file.
        """
        analysis_type = Dataset.ANALYSIS_TYPE_VARIANT_CALLS

        # parse and validate args
        sample_type = options["sample_type"]
        genome_version = options["genome_version"]
        validate_only = options["validate_only"]
        remap_sample_ids = options["remap_sample_ids"]
        max_edit_distance = options["max_edit_distance_for_id_match"]
        pedigree_file_path = options["pedigree_file"]
        export_pedigree_file_template = options["export_pedigree_file_template"]
        project_guid = options["project_id"]
        vcf_path = options["vcf_path"]
        elasticsearch_index = options["elasticsearch_index"]
        is_loaded = options["is_loaded"]

        # look up project id and validate other args
        try:
            project = Project.objects.get(guid=project_guid)
        except ObjectDoesNotExist:
            raise CommandError("Invalid project id: %(project_guid)s" % locals())

        #if project.genome_version != genome_version:
        #    raise CommandError("Genome version %s doesn't match the project's genome version which is %s" % (genome_version, project.genome_version))

        if pedigree_file_path and not os.path.isfile(pedigree_file_path):
            raise CommandError("Can't open pedigree file: %(pedigree_file_path)s" % locals())

        # parse the pedigree file if specified
        if pedigree_file_path:

            input_stream = file_iter(pedigree_file_path)
            json_records, errors, warnings = parse_pedigree_table(pedigree_file_path, input_stream)

            if errors:
                for message in errors:
                    logger.error(message)
                raise CommandError("Unable to parse %(pedigree_file_path)s" % locals())

            if warnings:
                for message in warnings:
                    # Logger.warn is deprecated; warning is the supported spelling
                    logger.warning(message)

            if not validate_only:
                add_or_update_individuals_and_families(project, json_records)

        # validate VCF and get sample ids
        vcf_sample_ids = _validate_vcf(vcf_path, sample_type=sample_type, genome_version=genome_version)

        if remap_sample_ids:
            if not does_file_exist(remap_sample_ids):
                raise ValueError("File not found: " + remap_sample_ids)

            # build old-id -> new-id mapping from the 2-column tab-delimited file
            id_mapping = {}
            for line in file_iter(remap_sample_ids):
                fields = line.strip().split("\t")
                if len(fields) != 2:
                    raise ValueError("Must contain 2 columns: " + str(fields))
                id_mapping[fields[0]] = fields[1]

            remapped_vcf_sample_ids = []
            for sample_id in vcf_sample_ids:
                if sample_id in id_mapping:
                    remapped_vcf_sample_ids.append(id_mapping[sample_id])
                    print("Remapped %s to %s" % (sample_id, id_mapping[sample_id]))
                else:
                    remapped_vcf_sample_ids.append(sample_id)
                    print("No sample id mapping for %s" % sample_id)

            vcf_sample_ids = remapped_vcf_sample_ids

        vcf_sample_ids_to_sample_records = match_sample_ids_to_sample_records(
            project,
            sample_ids=vcf_sample_ids,
            sample_type=sample_type,
            max_edit_distance=max_edit_distance,
            create_sample_records=not validate_only,
        )

        # optionally export a pedigree template for unmatched sample ids, then exit
        if export_pedigree_file_template:
            with open(export_pedigree_file_template, "w") as out_f:
                out_f.write("#%s\n" % ("\t".join(['family_id', 'individual_id', 'paternal_id', 'maternal_id', 'sex', 'affected_status'],)))
                for vcf_sample_id in vcf_sample_ids:
                    if vcf_sample_id in vcf_sample_ids_to_sample_records:
                        continue

                    # unmatched samples get a stub row with family == individual id
                    family_id = individual_id = vcf_sample_id
                    out_f.write("%s\n" % ("\t".join([family_id, individual_id, '', '', '', ''],)))
            logger.info("Wrote out %(export_pedigree_file_template)s. Exiting..." % locals())
            return

        if not vcf_sample_ids_to_sample_records:
            # (removed an unused all_project_sample_id_count query that
            # materialized every Sample in the project without being logged)
            all_vcf_sample_id_count = len(vcf_sample_ids)
            logger.info("None of the individuals or samples in the project matched the %(all_vcf_sample_id_count)s sample id(s) in the VCF" % locals())
            return

        # retrieve or create Dataset record and link it to sample(s)
        dataset = get_or_create_elasticsearch_dataset(
            project=project,
            analysis_type=analysis_type,
            genome_version=genome_version,
            source_file_path=vcf_path,
            elasticsearch_index=elasticsearch_index,
            is_loaded=is_loaded,
        )

        if is_loaded and not dataset.loaded_date:
            dataset.loaded_date = timezone.now()
            dataset.save()

        link_dataset_to_sample_records(dataset, vcf_sample_ids_to_sample_records.values())

        # check if all VCF samples loaded already
        vcf_sample_ids = set(vcf_sample_ids_to_sample_records.keys())
        existing_sample_ids = {s.sample_id for s in dataset.samples.all()}
        if dataset.is_loaded and not vcf_sample_ids - existing_sample_ids:
            logger.info("All %s samples in this VCF have already been loaded" % len(vcf_sample_ids))
            return
        elif not dataset.is_loaded:
            logger.info("Dataset not loaded. %s Loading..." % (is_loaded,))
        elif vcf_sample_ids - existing_sample_ids:
            logger.info("Dataset is loaded but these samples aren't included in the dataset: %s" % (vcf_sample_ids - existing_sample_ids, ))

        logger.info("done")
Пример #9
0
    def handle(self, *args, **options):
        """Entry point for the management command.

        Validates args, optionally parses/applies a pedigree file, matches the
        VCF's sample ids against the project's sample records, and then creates
        a Dataset record linked to those samples and kicks off variant loading.

        Raises:
            CommandError: if the project id, genome version, or pedigree file
                path is invalid, or if the pedigree file fails to parse.
        """
        analysis_type = Dataset.ANALYSIS_TYPE_VARIANT_CALLS

        # parse and validate args
        sample_type = options["sample_type"]
        genome_version = options["genome_version"]
        validate_only = options["validate_only"]
        max_edit_distance = options["max_edit_distance_for_id_match"]
        pedigree_file_path = options["pedigree_file"]
        export_pedigree_file_template = options["export_pedigree_file_template"]
        project_guid = options["project_id"]
        vcf_path = options["vcf_path"]
        dataset_id = options["dataset_id"]

        # look up project id and validate other args
        try:
            project = Project.objects.get(guid=project_guid)
        except ObjectDoesNotExist:
            raise CommandError("Invalid project id: %(project_guid)s" % locals())

        if project.genome_version != genome_version:
            raise CommandError("Genome version %s doesn't match the project's genome version which is %s" % (genome_version, project.genome_version))

        if pedigree_file_path and not os.path.isfile(pedigree_file_path):
            raise CommandError("Can't open pedigree file: %(pedigree_file_path)s" % locals())

        # parse the pedigree file if specified
        if pedigree_file_path:

            input_stream = file_iter(pedigree_file_path)
            json_records, errors, warnings = parse_pedigree_table(pedigree_file_path, input_stream)

            if errors:
                for message in errors:
                    logger.error(message)
                raise CommandError("Unable to parse %(pedigree_file_path)s" % locals())

            if warnings:
                for message in warnings:
                    # logger.warn is deprecated in favor of logger.warning
                    logger.warning(message)

            if not validate_only:
                add_or_update_individuals_and_families(project, json_records)

        # validate VCF and get sample ids
        vcf_sample_ids = _validate_vcf(vcf_path, sample_type=sample_type, genome_version=genome_version)

        # match VCF sample ids to Individual/Sample records (fuzzy match up to
        # max_edit_distance); only create new Sample records when not validating
        vcf_sample_ids_to_sample_records = match_sample_ids_to_sample_records(
            project,
            sample_ids=vcf_sample_ids,
            sample_type=sample_type,
            max_edit_distance=max_edit_distance,
            create_records_for_new_sample_ids=not validate_only,
        )

        # optionally write out a pedigree-file template for any unmatched sample ids
        if export_pedigree_file_template:
            with open(export_pedigree_file_template, "w") as out_f:
                out_f.write("#%s\n" % ("\t".join(['family_id', 'individual_id', 'paternal_id', 'maternal_id', 'sex', 'affected_status'],)))
                for vcf_sample_id in vcf_sample_ids:
                    if vcf_sample_id in vcf_sample_ids_to_sample_records:
                        continue

                    # use the sample id as both family and individual id in the template
                    family_id = individual_id = vcf_sample_id
                    out_f.write("%s\n" % ("\t".join([family_id, individual_id, '', '', '', ''],)))
            logger.info("Wrote out %(export_pedigree_file_template)s. Exiting..." % locals())
            return

        if len(vcf_sample_ids_to_sample_records) == 0:
            all_vcf_sample_id_count = len(vcf_sample_ids)
            all_project_sample_id_count = len(Sample.objects.filter(individual__family__project=project, sample_type=sample_type))
            logger.info(("No matches found between the %(all_vcf_sample_id_count)s sample id(s) in the VCF and "
                "the %(all_project_sample_id_count)s %(sample_type)s sample id(s) in %(project_guid)s") % locals())
            return

        if validate_only:
            return

        # retrieve or create Dataset record and link it to sample(s)
        dataset = get_or_create_dataset(
            analysis_type=analysis_type,
            source_file_path=vcf_path,
            project=project,
            dataset_id=dataset_id,
        )

        link_dataset_to_sample_records(dataset, vcf_sample_ids_to_sample_records.values())

        # check if all VCF samples loaded already
        vcf_sample_ids = set(vcf_sample_ids_to_sample_records.keys())
        existing_sample_ids = set([s.sample_id for s in dataset.samples.all()])
        if dataset.is_loaded and len(vcf_sample_ids - existing_sample_ids) == 0:
            logger.info("All %s samples in this VCF have already been loaded" % len(vcf_sample_ids))
            return

        # load the VCF
        _load_variants(dataset)

        logger.info("done")
Пример #10
0
 def process_records(json_records, filename='ped_file'):
     """Convert raw pedigree rows to json records, failing fast on parse errors.

     Raises:
         ErrorsWarningsException: if parse_pedigree_table reported any errors.
     """
     parsed_records, parse_errors, parse_warnings = parse_pedigree_table(
         json_records, filename, user=request.user, project=project)
     if parse_errors:
         raise ErrorsWarningsException(parse_errors, parse_warnings)
     return parsed_records
Пример #11
0
 def parse_file(filename, stream):
     """Parse a pedigree table from *stream*, raising on any parse errors.

     Raises:
         ErrorsWarningsException: if parse_pedigree_table reported any errors.
     """
     parsed_records, parse_errors, parse_warnings = parse_pedigree_table(
         filename, stream, user=request.user, project=project)
     if parse_errors:
         raise ErrorsWarningsException(parse_errors, parse_warnings)
     return parsed_records
Пример #12
0
def receive_individuals_table_handler(request, project_guid):
    """Handler for the initial upload of an Excel or .tsv table of individuals. This handler
    parses the records, but doesn't save them in the database. Instead, it saves them to
    a temporary file and sends a 'token' representing this file back to the client. If/when the
    client then wants to 'apply' this table, it can send the token to the
    save_individuals_table(..) handler to actually save the data in the database.

    Args:
        request (object): Django request object
        project_guid (string): project GUID
    Returns:
        JsonResponse: either {'errors': ..., 'warnings': ...} on parse failure, or
            {'token': ..., 'errors': [], 'warnings': ..., 'info': ...} on success
    """

    project = _get_project_and_check_permissions(project_guid, request.user)

    if len(request.FILES) != 1:
        return create_json_response({
            'errors': ["Received %s files instead of 1" % len(request.FILES)]
        })

    # parse file; dict views aren't indexable on python 3, so use next(iter(...))
    stream = next(iter(request.FILES.values()))
    filename = stream.name  # public UploadedFile attribute (was private ._name)

    json_records, errors, warnings = parse_pedigree_table(filename, stream)

    if errors:
        return create_json_response({'errors': errors, 'warnings': warnings})

    # save json to a temporary file keyed by a hash of the parsed records;
    # md5 requires bytes, and "wt" is needed because json.dump writes str
    token = hashlib.md5(str(json_records).encode('utf-8')).hexdigest()
    serialized_file_path = _compute_serialized_file_path(token)
    with gzip.open(serialized_file_path, "wt") as f:
        json.dump(json_records, f)

    # send back some stats
    num_families = len(set(r['familyId'] for r in json_records))
    num_individuals = len(set(r['individualId'] for r in json_records))
    num_families_to_create = len([
        family_id for family_id in set(r['familyId'] for r in json_records)
        if not Family.objects.filter(family_id=family_id, project=project)
    ])
    num_individuals_to_create = len(
        set(r['individualId'] for r in json_records
            if not Individual.objects.filter(individual_id=r['individualId'],
                                             family__family_id=r['familyId'],
                                             family__project=project)))
    info = [
        "%(num_families)s families, %(num_individuals)s individuals parsed from %(filename)s"
        % locals(),
        "%d new families, %d new individuals will be added to the project" %
        (num_families_to_create, num_individuals_to_create),
        "%d existing individuals will be updated" %
        (num_individuals - num_individuals_to_create),
    ]

    return create_json_response({
        'token': token,
        'errors': errors,
        'warnings': warnings,
        'info': info,
    })
Пример #13
0
    def test_parse_pedigree_table(self):
        """Exercise parse_pedigree_table across malformed and valid inputs,
        checking the (records, errors, warnings) triple it returns."""
        # row with fewer columns than the header -> column-count error, no records
        records, errors, warnings = parse_pedigree_table(
            [['family_id', 'individual_id', 'sex', 'affected'],
             ['fam1', 'ind1', 'male']], FILENAME)
        self.assertListEqual(records, [])
        self.assertListEqual(errors, [
            'Error while parsing file: {}. Row 1 contains 3 columns: fam1, ind1, male, while header contains 4: family_id, individual_id, sex, affected'
            .format(FILENAME)
        ])
        self.assertListEqual(warnings, [])
        # missing family id -> json-conversion error
        records, errors, warnings = parse_pedigree_table([[
            'family_id', 'individual_id', 'sex', 'affected', 'father', 'mother'
        ], ['', '', 'male', 'u', '.', 'ind2']], FILENAME)
        self.assertListEqual(records, [])
        self.assertListEqual(errors, [
            "Error while converting {} rows to json: Family Id not specified in row #1:\n{{'affected': 'u', 'maternalId': 'ind2', 'individualId': '', 'sex': 'male', 'familyId': '', 'paternalId': ''}}"
            .format(FILENAME)
        ])
        self.assertListEqual(warnings, [])

        # missing individual id -> json-conversion error
        records, errors, warnings = parse_pedigree_table([[
            'family_id', 'individual_id', 'sex', 'affected', 'father', 'mother'
        ], ['fam1', '', 'male', 'u', '.', 'ind2']], FILENAME)
        self.assertListEqual(records, [])
        self.assertListEqual(errors, [
            "Error while converting {} rows to json: Individual Id not specified in row #1:\n{{'affected': 'u', 'maternalId': 'ind2', 'individualId': '', 'sex': 'male', 'familyId': 'fam1', 'paternalId': ''}}"
            .format(FILENAME)
        ])
        self.assertListEqual(warnings, [])

        # unrecognized sex value -> validation error
        records, errors, warnings = parse_pedigree_table([[
            'family_id', 'individual_id', 'sex', 'affected', 'father', 'mother'
        ], ['fam1', 'ind1', 'boy', 'u', '.', 'ind2']], FILENAME)
        self.assertListEqual(records, [])
        self.assertListEqual(errors, [
            "Error while converting {} rows to json: Invalid value 'boy' for sex in row #1"
            .format(FILENAME)
        ])
        self.assertListEqual(warnings, [])

        # unrecognized affected-status value -> validation error
        records, errors, warnings = parse_pedigree_table([[
            'family_id', 'individual_id', 'sex', 'affected', 'father', 'mother'
        ], ['fam1', 'ind1', 'male', 'no', '.', 'ind2']], FILENAME)
        self.assertListEqual(records, [])
        self.assertListEqual(errors, [
            "Error while converting {} rows to json: Invalid value 'no' for affected status in row #1"
            .format(FILENAME)
        ])

        # cross-record consistency checks: records still parse, but a male
        # listed as a mother and a cross-family parent produce errors, and a
        # referenced-but-absent father produces a warning
        records, errors, warnings = parse_pedigree_table([[
            'family_id', 'individual_id', 'sex', 'affected', 'father', 'mother'
        ], ['fam1', 'ind1', 'male', 'aff.', 'ind3', 'ind2'
            ], ['fam2', 'ind2', 'male', 'u', '.', '']], FILENAME)
        self.assertListEqual(records, [
            {
                'familyId': 'fam1',
                'individualId': 'ind1',
                'sex': 'M',
                'affected': 'A',
                'paternalId': 'ind3',
                'maternalId': 'ind2'
            },
            {
                'familyId': 'fam2',
                'individualId': 'ind2',
                'sex': 'M',
                'affected': 'N',
                'paternalId': '',
                'maternalId': ''
            },
        ])
        self.assertListEqual(errors, [
            'ind2 is recorded as Male and also as the mother of ind1',
            'ind2 is recorded as the mother of ind1 but they have different family ids: fam2 and fam1',
        ])
        self.assertListEqual(warnings, [
            "ind3 is the father of ind1 but doesn't have a separate record in the table"
        ])

        # fully valid input with optional notes and coded-phenotype columns;
        # extra unrecognized columns ('other_data') are ignored
        records, errors, warnings = parse_pedigree_table(
            [[
                'family_id', 'individual_id', 'notes_for_import', 'other_data',
                'sex', 'affected', 'father', 'mother', 'phenotype: coded'
            ],
             [
                 'fam1', 'ind1', 'some notes', 'some more notes', 'male',
                 'aff.', '.', 'ind2', 'HPO:12345'
             ], ['fam1', 'ind2', ' ', '', 'female', 'u', '.', '', 'HPO:56789']
             ], FILENAME)
        self.assertListEqual(records, [
            {
                'familyId': 'fam1',
                'individualId': 'ind1',
                'sex': 'M',
                'affected': 'A',
                'paternalId': '',
                'maternalId': 'ind2',
                'notes': 'some notes',
                'codedPhenotype': 'HPO:12345'
            },
            {
                'familyId': 'fam1',
                'individualId': 'ind2',
                'sex': 'F',
                'affected': 'N',
                'paternalId': '',
                'maternalId': '',
                'notes': '',
                'codedPhenotype': 'HPO:56789'
            },
        ])
        self.assertListEqual(errors, [])
        self.assertListEqual(warnings, [])