Exemplo n.º 1
0
Arquivo: io.py Projeto: orenlivne/ober
def read_npz(in_file):
    '''Load a Problem from an NPZ archive.

    in_file is passed straight to np.load, so it may be a file name or an open
    file descriptor. Returns a Problem assembled from the stored pedigree,
    genotype and haplotype arrays plus the 'error', 'info', 'frames' and 'lam'
    entries.'''
    archive = np.load(in_file)

    # Rebuild the pedigree graph from its stored nodes and edges
    pedigree_graph = nx.DiGraph()
    pedigree_graph.add_nodes_from(archive['pedigree_nodes'])
    pedigree_graph.add_edges_from(archive['pedigree_graph'][0])
    pedigree_fields = ('sample_id', 'sex', 'phenotype', 'node_type', 'sample_index')
    pedigree_kwargs = dict((name, archive['pedigree_' + name]) for name in pedigree_fields)
    pedigree = Pedigree(pedigree_graph,
                        num_genotyped=archive['pedigree_num_genotyped'][0],
                        **pedigree_kwargs)

    genotype = GenotypeFactory.new_instance('genotype',
                                            archive['genotype_data'],
                                            archive['genotype_snp'])
    # The haplotype is created with a placeholder QC value; the stored QC is attached below
    haplotype = GenotypeFactory.new_instance('haplotype',
                                             archive['haplotype_data'],
                                             archive['haplotype_snp'],
                                             qc=MISSING)
    error = archive['error']
    haplotype.qc = archive['haplotype_qc']
    info = archive['info'][0]

    # 'frames' holds a mapping in its first element; an empty mapping means "no frames"
    stored_frames = archive['frames'][0]
    if stored_frames:
        frames = Frames((key, item)
                        for key, value in stored_frames.iteritems()
                        for item in value[0])
    else:
        frames = None
    lam = archive['lam']

    # Optional fields: copied onto the objects only when the archive contains them
    stored_names = archive.files
    optional_fields = ((genotype, 'map', 'genotype_map'),
                       (haplotype, 'poo_phase', 'haplotype_poo_phase'),
                       (haplotype, 'hap_type', 'haplotype_hap_type'))
    for target, attribute, key in optional_fields:
        if key in stored_names:
            setattr(target, attribute, archive[key])

    return Problem(pedigree, genotype, haplotype=haplotype, error=error,
                   info=info, frames=frames, lam=lam)
Exemplo n.º 2
0
 def test_create_from_mock_data(self):
     '''Build a genotype set from four mock SNP records and a (4, 1, 2) data array, then
     check its size, SNP count and segment_intersect() results.'''
     # Mock SNP metadata records: (chrom, snp, dist_cm, base_pair)
     snp_records = [(0, 'rs1', 0., 12), (0, 'rs2', 0., 34),
                    (0, 'rs3', 0., 56), (0, 'rs4', 0., 78)]
     snp_dtype = {
         'names': ('chrom', 'snp', 'dist_cm', 'base_pair'),
         'formats': ('i2', 'S12', 'i8', 'i8')
     }
     snp = np.array(snp_records, dtype=snp_dtype)
     sample_id = [126251, 111161]
     data = np.array([[[1, 2]], [[2, 2]], [[1, 2]], [[1, 1]]])

     g = GenotypeFactory.new_instance('genotype', data, snp, sample_id)
     itu.assert_size_equals(g, 4, 1)
     assert_equal(4, g.num_snps, 'Incorrect number of SNPS')

     message = 'Wrong interval intersection'
     # This first check passes the computed value as the first argument
     assert_equal(g.segment_intersect([0, 40]), [0, 2], message)
     # Remaining checks: (expected result, query segment)
     cases = (
         ([0, 2], [10, 40]),
         ([0, 3], [10, 60]),
         ([1, 3], [20, 60]),
         ([0, 4], [0, 100]),
         ([1, 4], [20, 100]),
     )
     for expected, segment in cases:
         assert_equal(expected, g.segment_intersect(segment), message)
Exemplo n.º 3
0
def read_tabix(file_name, genotyped_id_file=None):
    '''Read a Haplotype object from an ITABIX CGI-imputed file.
    Line format: tab-delimited
    7849538    chr11    1909005    1909006    snp    T    C    dbsnp.107:rs3817198    <genotypes>

    file_name: path of the file to load.
    genotyped_id_file: file passed to read_sample_id() to obtain the sample IDs. When
        omitted (None), defaults to $OBER_DATA/hutt/hutt.3chipoverlap.clean.fam; the
        environment variable is looked up at call time, so merely importing this module
        no longer requires OBER_DATA to be set (KeyError is raised here instead if it is
        missing and no file is given).

    Returns the object created by GenotypeFactory.new_instance('haplotype', ...).
    '''
    if genotyped_id_file is None:
        genotyped_id_file = os.environ['OBER_DATA'] + '/hutt/hutt.3chipoverlap.clean.fam'

    # Load entire file into memory. It must fit, if we are to load it into a Genotype object.
    # ndmin=2 keeps a one-line file as a single row; without it loadtxt returns a 1-D array
    # and the per-line loops below would iterate over fields instead of lines.
    d = np.loadtxt(file_name, str, ndmin=2)

    # Read SNP metadata into a record array
    snp_dtype = [
        ('chrom', np.uint8),  # Chromosome # containing the SNP
        ('name', np.chararray),  # SNP name (e.g., 'rs...')
        ('dist_cm', float),  # Genetic position [CENTI-Morgans!!]; np.float was only an alias of float
        ('base_pair', np.uint)  # Base pair position on chromosome
    ]
    # Column 1 is 'chr<N>' (prefix stripped), column 7 the SNP name, column 3 the base-pair
    # position; the genetic distance is not in the file and is stored as 0
    snp = np.array([(int(line[1][3:]), line[7], 0, int(line[3]))
                    for line in d],
                   dtype=snp_dtype)
    # Each genotype entry (columns 8 onwards) is a string: character 0 is the integer
    # haplotype type, characters 1 and 2 are the two allele letters
    data = np.array([[(CGI_LETTER_TO_ALLELE[x[1]], CGI_LETTER_TO_ALLELE[x[2]])
                      for x in line[8:]] for line in d])
    hap_type = np.array([[int(x[0]) for x in line[8:]] for line in d])
    sample_id = read_sample_id(genotyped_id_file)
    # Construct object
    return GenotypeFactory.new_instance('haplotype',
                                        data,
                                        snp,
                                        sample_id,
                                        hap_type=hap_type)
Exemplo n.º 4
0
    def read(self, clazz, **kwargs):
        """Load genotype data from PLINK TPED/TFAM files.

        If prefix is specified, will use prefix.tfam, prefix.tped
        input file names, unless tfam and/or tped are specified (with or without the
        prefix argument), in which case they override the prefix-based names.

        Keyword arguments:
            prefix: common prefix of the .tped and .tfam file names.
            tped: TPED file name (overrides prefix).
            tfam: TFAM file name (overrides prefix); only consulted when load_ids is true.
            load_ids: if true (default), read sample IDs from the second column of the TFAM file.
            lazy_load: if true, pass the TPED file name on instead of a loaded data array
                (default False).

        Returns the object created by GenotypeFactory.new_instance(clazz, ...).
        Raises ValueError if the TPED file name (or, when loading IDs, the TFAM file name)
        cannot be determined.
        """

        # Read input arguments
        prefix = kwargs.get("prefix", None)
        load_ids = kwargs.get("load_ids", True)
        tped = kwargs.get("tped", None if prefix is None else (prefix + ".tped"))
        if tped is None:
            raise ValueError("Must specify plink file prefix and/or tped file name")
        if load_ids:
            tfam = kwargs.get("tfam", None if prefix is None else (prefix + ".tfam"))
            if tfam is None:
                raise ValueError("If loading IDs, must specify plink file prefix and/or tfam file name")
        # lazily-load data or not fetch all of it
        lazy_load = kwargs.get("lazy_load", False)

        # Read TPED file in two sweeps.
        # See http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml#map

        # Read the first line in the file to determine the number of samples
        with open(tped, "r") as f:
            reader = csv.reader(f, delimiter=" ", skipinitialspace=True)
            # Built-in next() works on Python 2.6+ and 3, unlike the reader.next() method
            line = next(reader)
            if line[-1] == "":
                line = line[:-1]  # Trim last item in field list of this line if it is blank
            num_items = len(line)

        # Read SNP metadata into a record array
        snp_dtype = [
            ("chrom", np.uint8),  # Chromosome # containing the SNP
            ("name", np.chararray),  # SNP name (e.g., 'rs...')
            ("dist_cm", float),  # Genetic position [CENTI-Morgans!!]; np.float was only an alias of float
            ("base_pair", np.uint),  # Base pair position on chromosome
        ]
        snp = np.loadtxt(tped, usecols=range(4), dtype=snp_dtype)
        # Fix the special case of a single row, where loadtxt is buggy
        if snp.size == 1:
            snp = np.array([tuple(snp[key] for key, _ in snp_dtype)], dtype=snp_dtype)

        # Read Genotype data
        if lazy_load:
            # Only pass pointer to file, to be read into a data structure that supports lazy loading
            data = tped
        else:
            # Read Genotype data into array
            data = np.genfromtxt(tped, usecols=range(4, num_items), dtype=np.byte)
            # Pair up consecutive allele columns. Floor division keeps the shape integral
            # even under true division (Python 3 / from __future__ import division).
            if np.size(snp) == 1:
                data = data.reshape([1, data.shape[0] // 2, 2])
            else:
                data = data.reshape([data.shape[0], data.shape[1] // 2, 2])

        # Load TFAM data, use only study IDs (np.int was only an alias of int)
        sample_id = np.genfromtxt(tfam, dtype=int)[:, 1] if load_ids else None

        # Construct object
        return GenotypeFactory.new_instance(clazz, data, snp, sample_id, lazy_load=lazy_load)
Exemplo n.º 5
0
def iplex_to_genotype(path, t, suffix='_iPlex_HUTTERITE.txt'):
    '''Build a Genotype object from iPlex data, laid out with the same SNP and sample
    ordering as the ImputationSet object t.

    Side effects: prints progress information, and rewrites the entry of t.genotype.map
    for any SNP whose two alleles are found to be swapped. The returned genotype shares
    t.genotype.map.'''
    # iPlex data as a dictionary of dictionaries, keyed first by SNP
    iplex = __read_iplex(path, suffix)

    # Match SNPs by name. This is a slow (m x n) operation where m=#iplex SNPs,
    # n=#imputation_set SNPs, but m and n are only in the tens to hundreds.
    iplex_snps = iplex.keys()
    print(iplex_snps)
    names_in_t = t.snp['name']
    # For each iPlex SNP, position of the first SNP name in t that contains it
    index = [
        np.where([iplex_name in candidate for candidate in names_in_t])[0][0]
        for iplex_name in iplex_snps
    ]

    # Match genotype IDs of each iPlex SNP and the imputation set
    genotyped_ids = t.pedigree.genotyped_sample_id()
    data = np.zeros_like(t.imputed_data)
    print(t.genotype.map)
    num_snps = len(iplex_snps)
    print('num_snps %s' % (num_snps,))
    for i, (snp, snp_index) in enumerate(zip(iplex_snps, index)):
        per_sample = iplex[snp]
        values = np.array(per_sample.values())

        letter = t.genotype.map[snp_index]
        print('#%2d iPlex SNP %-14s index in t %3d %s/%s' % (
            i, snp, snp_index, letter[0], letter[1]))
        # If the first iPlex letter is not one of t's letters, switch to the reverse strand
        if values[0][0] not in letter:
            print('Reversing strand letters of iPlex data (Dakota labeling uses standard dbSNP letters)')
            letter = ''.join(REVERSE_STRAND[x] for x in letter)

        allele = __allele_dict(letter)
        iplex_id = np.array(per_sample.keys())
        # Keep only the iPlex samples that are genotyped in t
        in_t = np.in1d(iplex_id, genotyped_ids)
        sample_index = [t.pedigree.node_of[x] for x in iplex_id[in_t]]

        recoded_genotypes = np.array([[allele[pair[0]], allele[pair[1]]]
                                      for pair in values[in_t]])
        f1, f2 = im.gt.allele_frequencies(recoded_genotypes)
        if f2 > 0.7:  # Minor allele has a much larger frequency than the major allele, swap them
            print('Major-minor allele CGI letter swap detected, fixing (f1=%.2f, f2=%.2f)' % (
                f1, f2))
            im.gt.swap_alleles(recoded_genotypes)
            t.genotype.map[snp_index] = letter[1] + letter[0]

        # Insert data into the appropriate row of the target genotype data array
        data[snp_index, sample_index, :] = recoded_genotypes

    g = GenotypeFactory.new_instance('genotype', data, t.snp,
                                     t.genotype.sample_id)
    g.map = t.genotype.map
    return g
Exemplo n.º 6
0
Arquivo: io.py Projeto: orenlivne/ober
def read_npz(in_file):
    '''Read a Problem object back from an NPZ file.

    in_file is forwarded to np.load and may be a file name or an open file
    descriptor. The pedigree, genotype and haplotype are rebuilt from their
    stored arrays; three optional entries are restored only if present.'''
    npz = np.load(in_file)

    # Pedigree: graph structure first, then the per-node arrays
    g_ped = nx.DiGraph()
    g_ped.add_nodes_from(npz['pedigree_nodes'])
    g_ped.add_edges_from(npz['pedigree_graph'][0])
    pedigree = Pedigree(
        g_ped,
        sample_id=npz['pedigree_sample_id'],
        sex=npz['pedigree_sex'],
        phenotype=npz['pedigree_phenotype'],
        node_type=npz['pedigree_node_type'],
        sample_index=npz['pedigree_sample_index'],
        num_genotyped=npz['pedigree_num_genotyped'][0])

    # Genotype and haplotype; the haplotype QC array is attached after construction
    genotype = GenotypeFactory.new_instance(
        'genotype', npz['genotype_data'], npz['genotype_snp'])
    haplotype = GenotypeFactory.new_instance(
        'haplotype', npz['haplotype_data'], npz['haplotype_snp'], qc=MISSING)
    error = npz['error']
    haplotype.qc = npz['haplotype_qc']
    info = npz['info'][0]

    # Frames are stored as a mapping; an empty one is returned as None
    frames = None
    frame_map = npz['frames'][0]
    if frame_map:
        pairs = ((key, member)
                 for key, value in frame_map.iteritems()
                 for member in value[0])
        frames = Frames(pairs)
    lam = npz['lam']

    # Optional fields
    available = npz.files
    if 'genotype_map' in available:
        genotype.map = npz['genotype_map']
    if 'haplotype_poo_phase' in available:
        haplotype.poo_phase = npz['haplotype_poo_phase']
    if 'haplotype_hap_type' in available:
        haplotype.hap_type = npz['haplotype_hap_type']

    return Problem(pedigree,
                   genotype,
                   haplotype=haplotype,
                   error=error,
                   info=info,
                   frames=frames,
                   lam=lam)
Exemplo n.º 7
0
def iplex_to_genotype(path, t, suffix='_iPlex_HUTTERITE.txt'):
    '''Convert iPlex data into a Genotype object whose SNPs and samples are ordered
    like those of the ImputationSet object t.

    Progress is printed along the way. When a SNP's alleles are detected as swapped,
    its entry in t.genotype.map is overwritten; the returned object uses t.genotype.map
    as its map.'''
    # Read iPlex data into a dictionary of dictionaries
    iplex = __read_iplex(path, suffix)

    # Match SNPs by name: for every iPlex SNP, scan all SNP names of t (m x n work, but
    # both counts are only in the tens to hundreds) and take the first name containing it
    iplex_snps = iplex.keys()
    print(iplex_snps)
    snp_metadata = t.snp['name']
    index = []
    for iplex_snp in iplex_snps:
        contains = [iplex_snp in name for name in snp_metadata]
        index.append(np.where(contains)[0][0])

    # Match genotype IDs of each iPlex SNP and the imputation set
    s = t.pedigree.genotyped_sample_id()
    data = np.zeros_like(t.imputed_data)
    print(t.genotype.map)
    num_snps = len(iplex_snps)
    print('num_snps %s' % (num_snps,))
    for i, snp in enumerate(iplex_snps):
        snp_index = index[i]
        snp_data = iplex[snp]
        values = np.array(snp_data.values())

        letter = t.genotype.map[snp_index]
        summary = (i, snp, snp_index, letter[0], letter[1])
        print('#%2d iPlex SNP %-14s index in t %3d %s/%s' % summary)
        # Reverse strand if the first iPlex letter is not among t's letters for this SNP
        needs_reversal = values[0][0] not in letter
        if needs_reversal:
            print('Reversing strand letters of iPlex data (Dakota labeling uses standard dbSNP letters)')
            letter = ''.join(REVERSE_STRAND[x] for x in letter)

        allele = __allele_dict(letter)
        iplex_id = np.array(snp_data.keys())
        samples_in_t = np.in1d(iplex_id, s)
        matched_ids = iplex_id[samples_in_t]
        sample_index = [t.pedigree.node_of[x] for x in matched_ids]

        matched_values = values[samples_in_t]
        recoded_genotypes = np.array([[allele[v[0]], allele[v[1]]] for v in matched_values])
        f1, f2 = im.gt.allele_frequencies(recoded_genotypes)
        if f2 > 0.7:  # Minor allele has a much larger frequency than the major allele, swap them
            print('Major-minor allele CGI letter swap detected, fixing (f1=%.2f, f2=%.2f)' % (f1, f2))
            im.gt.swap_alleles(recoded_genotypes)
            t.genotype.map[snp_index] = letter[1] + letter[0]

        # Insert data into the appropriate row of the target genotype data array
        data[snp_index, sample_index, :] = recoded_genotypes

    g = GenotypeFactory.new_instance('genotype', data, t.snp, t.genotype.sample_id)
    g.map = t.genotype.map
    return g
Exemplo n.º 8
0
def read(input_type, clazz, **kwargs):
    """Read a Genotype object of class clazz ('genotype'/'haplotype'/'problem')
    from file of the format 'input_type'.

    Supported formats: input_type='plink' (PLINK format); 'npz' (our NPZ format).

    For 'npz', the file is taken from the 'file' keyword argument. For 'plink', all
    keyword arguments are forwarded to the PLINK reader.

    Raises ValueError for any other input_type.
    """
    if input_type == "npz":
        data = np.load(kwargs.get("file"))
        g = GenotypeFactory.new_instance(clazz, data["data"], data["snp"], sample_id=data["sample_id"])
        # If there exists a genetic map, load it. If not, don't. For backward-compatibility with older
        # Genotype npz files that didn't have the map yet
        if "map" in data.files:
            g.map = data["map"]
        if "poo_phase" in data.files:
            g.poo_phase = data["poo_phase"]
        return g
    elif input_type == "plink":
        return _plink_reader.read(clazz, **kwargs)
    else:
        # Report the offending argument; the builtin `input` was formatted here by mistake
        raise ValueError("Unsupported genotype input type %s" % (input_type,))
Exemplo n.º 9
0
 def test_create_from_mock_data(self):
     '''Create a genotype set from mock SNP metadata and a (4, 1, 2) data array; verify its
     size, its number of SNPs and several segment_intersect() queries.'''
     # Mock SNP records, one per SNP
     names = ('chrom', 'snp', 'dist_cm', 'base_pair')
     formats = ('i2', 'S12', 'i8', 'i8')
     rows = [(0, 'rs%d' % (k + 1), 0., bp) for k, bp in enumerate((12, 34, 56, 78))]
     snp = np.array(rows, dtype={'names': names, 'formats': formats})
     sample_id = [126251, 111161]
     data = np.array([[[1, 2]], [[2, 2]], [[1, 2]], [[1, 1]]])
     g = GenotypeFactory.new_instance('genotype', data, snp, sample_id)

     itu.assert_size_equals(g, 4, 1)
     assert_equal(4, g.num_snps, 'Incorrect number of SNPS')

     # The first query is checked with the computed value as the first argument
     assert_equal(g.segment_intersect([0, 40]), [0, 2], 'Wrong interval intersection')
     # All other queries: query segment -> expected result
     expectations = [
         ([10, 40], [0, 2]),
         ([10, 60], [0, 3]),
         ([20, 60], [1, 3]),
         ([0, 100], [0, 4]),
         ([20, 100], [1, 4]),
     ]
     for query, expected in expectations:
         assert_equal(expected, g.segment_intersect(query), 'Wrong interval intersection')
Exemplo n.º 10
0
def read(input_type, clazz, **kwargs):
    '''Read a Genotype object of class clazz ('genotype'/'haplotype'/'problem')
    from file of the format 'input_type'.

    Supported formats: input_type='plink' (PLINK format); 'npz' (our NPZ format).

    With 'npz' the file is given by the 'file' keyword argument; with 'plink' every
    keyword argument is passed on to the PLINK reader.

    Raises ValueError if input_type is neither of the two.'''
    if input_type == 'npz':
        data = np.load(kwargs.get('file'))
        g = GenotypeFactory.new_instance(clazz,
                                         data['data'],
                                         data['snp'],
                                         sample_id=data['sample_id'])
        # If there exists a genetic map, load it. If not, don't. For backward-compatibility with older
        # Genotype npz files that didn't have the map yet
        if 'map' in data.files:
            g.map = data['map']
        if 'poo_phase' in data.files:
            g.poo_phase = data['poo_phase']
        return g
    elif input_type == 'plink':
        return _plink_reader.read(clazz, **kwargs)
    else:
        # Format the argument itself; the original formatted the builtin `input` function
        raise ValueError('Unsupported genotype input type %s' % (input_type, ))
Exemplo n.º 11
0
def read_tabix(file_name, genotyped_id_file=None):
    """Read a Haplotype object from an ITABIX CGI-imputed file.
    Line format: tab-delimited
    7849538    chr11    1909005    1909006    snp    T    C    dbsnp.107:rs3817198    <genotypes>

    file_name: path of the file to load.
    genotyped_id_file: file handed to read_sample_id() to get the sample IDs. If omitted
        (None), $OBER_DATA/hutt/hutt.3chipoverlap.clean.fam is used. The environment
        variable is read when the function is called rather than when the module is
        imported, so a missing OBER_DATA only raises KeyError if this default is needed.

    Returns the object created by GenotypeFactory.new_instance("haplotype", ...).
    """
    if genotyped_id_file is None:
        genotyped_id_file = os.environ["OBER_DATA"] + "/hutt/hutt.3chipoverlap.clean.fam"

    # Load entire file into memory. It must fit, if we are to load it into a Genotype object.
    # ndmin=2 guarantees one row per line even for a one-line file, which loadtxt would
    # otherwise return as a 1-D array of fields.
    d = np.loadtxt(file_name, str, ndmin=2)

    # Read SNP metadata into a record array
    snp_dtype = [
        ("chrom", np.uint8),  # Chromosome # containing the SNP
        ("name", np.chararray),  # SNP name (e.g., 'rs...')
        ("dist_cm", float),  # Genetic position [CENTI-Morgans!!]; np.float was only an alias of float
        ("base_pair", np.uint),  # Base pair position on chromosome
    ]
    # Chromosome number comes from column 1 with its 3-character prefix removed; the
    # genetic distance is not available in this format and is stored as 0
    snp = np.array([(int(line[1][3:]), line[7], 0, int(line[3])) for line in d], dtype=snp_dtype)
    # Genotype entries start at column 8: character 0 is the haplotype type, characters 1-2 the allele letters
    data = np.array([[(CGI_LETTER_TO_ALLELE[x[1]], CGI_LETTER_TO_ALLELE[x[2]]) for x in line[8:]] for line in d])
    hap_type = np.array([[int(x[0]) for x in line[8:]] for line in d])
    sample_id = read_sample_id(genotyped_id_file)
    # Construct object
    return GenotypeFactory.new_instance("haplotype", data, snp, sample_id, hap_type=hap_type)
Exemplo n.º 12
0
    def read(self, clazz, **kwargs):
        '''Load genotype data from PLINK TPED/TFAM files.

        If prefix is specified, will use prefix.tfam, prefix.tped
        input file names, unless tfam and/or tped are specified (with or without the
        prefix argument), in which case they override the prefix-based names.

        Keyword arguments:
            prefix: common prefix of the .tped and .tfam file names.
            tped: TPED file name (overrides prefix).
            tfam: TFAM file name (overrides prefix); only consulted when load_ids is true.
            load_ids: if true (default), read sample IDs from the second column of the TFAM file.
            lazy_load: if true, hand the TPED file name to the factory instead of a loaded
                data array (default False).

        Returns the object created by GenotypeFactory.new_instance(clazz, ...).
        Raises ValueError if the TPED file name (or, when loading IDs, the TFAM file name)
        cannot be determined.'''

        # Read input arguments
        prefix = kwargs.get('prefix', None)
        load_ids = kwargs.get('load_ids', True)
        tped = kwargs.get('tped', None if prefix is None else
                          (prefix + '.tped'))
        if tped is None:
            raise ValueError(
                'Must specify plink file prefix and/or tped file name')
        if load_ids:
            tfam = kwargs.get('tfam', None if prefix is None else
                              (prefix + '.tfam'))
            if tfam is None:
                raise ValueError(
                    'If loading IDs, must specify plink file prefix and/or tfam file name'
                )
        # lazily-load data or not fetch all of it
        lazy_load = kwargs.get('lazy_load', False)

        # Read TPED file in two sweeps.
        # See http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml#map

        # Read the first line in the file to determine the number of samples
        with open(tped, 'r') as f:
            reader = csv.reader(f, delimiter=' ', skipinitialspace=True)
            # Built-in next() is available on Python 2.6+ and 3; reader.next() is Python 2 only
            line = next(reader)
            if line[-1] == '':
                # Trim last item in field list of this line if it is blank
                line = line[:-1]
            num_items = len(line)

        # Read SNP metadata into a record array
        snp_dtype = [
            ('chrom', np.uint8),  # Chromosome # containing the SNP
            ('name', np.chararray),  # SNP name (e.g., 'rs...')
            ('dist_cm', float),  # Genetic position [CENTI-Morgans!!]; np.float was only an alias of float
            ('base_pair', np.uint)  # Base pair position on chromosome
        ]
        snp = np.loadtxt(tped, usecols=range(4), dtype=snp_dtype)
        # Fix the special case of a single row, where loadtxt is buggy
        if snp.size == 1:
            snp = np.array([tuple(snp[key] for key, _ in snp_dtype)],
                           dtype=snp_dtype)

        # Read Genotype data
        if lazy_load:
            # Only pass pointer to file, to be read into a data structure that supports lazy loading
            data = tped
        else:
            # Read Genotype data into array
            data = np.genfromtxt(tped,
                                 usecols=range(4, num_items),
                                 dtype=np.byte)
            # Group consecutive allele columns in pairs. Floor division keeps the target
            # shape integral under true division (Python 3 / __future__ division).
            if np.size(snp) == 1:
                data = data.reshape([1, data.shape[0] // 2, 2])
            else:
                data = data.reshape([data.shape[0], data.shape[1] // 2, 2])

        # Load TFAM data, use only study IDs (np.int was only an alias of int)
        sample_id = np.genfromtxt(tfam, dtype=int)[:, 1] if load_ids else None

        # Construct object
        return GenotypeFactory.new_instance(clazz,
                                            data,
                                            snp,
                                            sample_id,
                                            lazy_load=lazy_load)