Exemplo n.º 1
0
def clinvar_doc_feeder(input_file, hg19: bool):
    """
    Yield clinvar documents extracted from a ClinVarFullRelease_*.xml.gz file.

    The xml is cut into `<ClinVarSet>...</ClinVarSet>` chunks by `rec_handler`. Each chunk is parsed into a
    `PublicSetType` object (defined in the dynamically imported `clinvarlib`), and every document that
    `_map_public_set_to_json` produces for that object is yielded in turn.

    :param input_file: handed to `rec_handler` as the xml source
    :param hg19: forwarded unchanged to `_map_public_set_to_json`
    :raises: whatever `clinvarlib.parseString` raises; the offending chunk is logged at debug level first
    """
    # Layout of a ClinVarFullRelease_*.xml.gz file:
    #
    #     <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
    #     <ReleaseSet Dated="2021-06-26" ...>
    #
    #         <ClinVarSet ID="38756179">
    #           ...
    #         </ClinVarSet>
    #
    #         <ClinVarSet ID="38756180">
    #           ...
    #         </ClinVarSet>
    #
    #         ...
    #
    #     </ReleaseSet>
    #
    # Only the `<ClinVarSet>` chunks are wanted, so the 2 leading lines and the 1 trailing line have to go.
    # `skip=2` takes care of the leading lines. `rec_handler` cannot drop the trailing one and emits
    # "\n</ReleaseSet>...</ClinVarSet>" as its final chunk instead, so any chunk that starts with
    # "\n</ReleaseSet>" is filtered out below (in practice that is only the final chunk).
    release_tail = '\n</ReleaseSet>'
    set_chunks = (
        chunk
        for chunk in rec_handler(input_file,
                                 block_end='</ClinVarSet>\n',
                                 skip=2,
                                 include_block_end=True)
        if not chunk.startswith(release_tail)
    )

    for set_chunk in set_chunks:
        # One `<ClinVarSet>` chunk -> one `clinvarlib.PublicSetType` object
        try:
            parsed_set = clinvarlib.parseString(set_chunk, silence=1)
        except:
            # Record the chunk that failed to parse, then let the error propagate untouched
            logging.debug(set_chunk)
            raise

        # One `clinvarlib.PublicSetType` object -> one or more json documents
        for document in _map_public_set_to_json(parsed_set, hg19):
            yield document
def rcv_feeder(input_file, hg19):
    """
    Yield the mapped records produced from every `<ClinVarSet>` chunk of a clinvar xml file.

    Chunks come from `rec_handler`; each is parsed with `clinvar.parseString` and the parsed result is
    expanded by `_map_line_to_json`, whose items are yielded one at a time.

    :param input_file: handed to `rec_handler` as the xml source
    :param hg19: forwarded unchanged to `_map_line_to_json`
    :raises: whatever `clinvar.parseString` raises; the offending chunk is logged at debug level first
    """
    # The first two lines of the clinvar xml hold no useful information, hence skip=2
    xml_records = rec_handler(input_file,
                              block_end='</ClinVarSet>\n',
                              skip=2,
                              include_block_end=True)
    for xml_record in xml_records:
        # A chunk opening with the closing `</ReleaseSet>` tag is not a record; ignore it
        is_release_tail = xml_record.startswith('\n</ReleaseSet>')
        if is_release_tail:
            continue

        try:
            parsed_record = clinvar.parseString(xml_record, silence=1)
        except:
            # Record the chunk that failed to parse, then let the error propagate untouched
            logging.debug(xml_record)
            raise
        else:
            for mapped_record in _map_line_to_json(parsed_record, hg19):
                yield mapped_record