示例#1
0
def read_beagle_markerfile(filename, label=None):
    '''
    Reads marker locations from a BEAGLE format marker file

    Every line of the file is parsed as a BeagleMarkerRecord and added,
    in file order, to a single new ChromosomeTemplate.

    Arguments
    -----
    filename: The file to be read
    label: An optional label to give the chromosome, since the BEAGLE
        format does not require it

    Returns: a ChromosomeTemplate object

    Raises: FileFormatError if a marker has a negative position, or if
        marker positions are not strictly increasing through the file
        (a repeated position is also rejected)
    '''
    with open(filename) as f:
        chrom = ChromosomeTemplate(label=label)

        # Start below any valid position so the first marker always passes
        # the ordering check
        last_pos = -1
        for line in f:
            rec = BeagleMarkerRecord(line)

            if rec.pos < 0:
                raise FileFormatError(
                    'Bad position for genotype: {}'.format(rec.pos))
            elif rec.pos <= last_pos:
                raise FileFormatError('Markers in file out of order')

            # Only the physical (bp) position is passed; cm is left as None
            chrom.add_genotype(None, cm=None, label=rec.label, bp=rec.pos,
                               reference=rec.reference, alternates=rec.alternates)
            last_pos = rec.pos

    return chrom
示例#2
0
 def direct_to_disk(filename, pop, seed_size=500,
                    min_length=1, size_unit='mb',
                    min_density=100, maxmiss=0.25,
                    onlywithin=False, njobs=1):
     '''
     Runs _perform_sgs on a population and writes every segment to a file

     The output file is opened (and truncated) before the analysis is
     started. Each segment of each result is written on its own line,
     using the string returned by the segment's to_germline method.

     Arguments
     -----
     filename: The file to write segments to
     pop: The population handed to _perform_sgs
     seed_size, min_length, size_unit, min_density, maxmiss, onlywithin,
         njobs: Passed through unchanged, by keyword, to _perform_sgs

     Returns: Nothing
     '''
     sgs_options = dict(seed_size=seed_size,
                        min_length=min_length, size_unit=size_unit,
                        min_density=min_density, maxmiss=maxmiss,
                        onlywithin=onlywithin, njobs=njobs)

     with open(filename, 'w') as outf:
         for result in _perform_sgs(pop, **sgs_options):
             outf.writelines(segment.to_germline() + '\n'
                             for segment in result.segments)
示例#3
0
def write_sgs(data, filename):
    '''
    Writes the segments in data to a GERMLINE-style tab-delimited file

    GERMLINE files are text files with the format:

        0) Family ID 1
        1) Individual ID 1
        2) Family ID 2
        3) Individual ID 2
        4) Chromosome
        5) Segment start (bp/cM)
        6) Segment end (bp/cM)
        7) Segment start (SNP)
        8) Segment end (SNP)
        9) Total SNPs in segment
        10) Genetic length of segment
        11) Units for genetic length (cM or MB)
        12) Mismatching SNPs in segment
        13) 1 if Individual 1 is homozygous in match; 0 otherwise
        14) 1 if Individual 2 is homozygous in match; 0 otherwise

    The length column is always the segment's physical size in megabases
    with the unit 'MB', and the last three columns are written as the
    placeholder 'X'.

    Arguments
    -----
    data: An object whose segments attribute is iterated for output
    filename: The file to write to

    Returns: Nothing
    '''
    # Extra info GERMLINE gives you (like mismatch rate) is not available
    placeholders = ['X', 'X', 'X']

    with open(filename, 'w') as out:
        for seg in data.segments:
            fields = []
            fields.extend(seg.ind1.full_label)
            fields.extend(seg.ind2.full_label)
            fields.append(seg.chromosome.label)
            fields.extend(seg.physical_location)
            fields.extend(seg.marker_labels)
            fields.append(seg.nmark)
            # Megabases, not basepairs
            fields.append(seg.physical_size / 1e6)
            fields.append('MB')
            fields.extend(placeholders)

            out.write('\t'.join(map(str, fields)) + '\n')
示例#4
0
def read_kinship(filename):
    '''
    Reads a KinInbCoef formatted file of kinship and inbreeding coefficients

    Each non-blank line must hold exactly four whitespace-separated
    fields: family, first individual, second individual and coefficient.
    Blank lines are skipped. When both individuals on a line are the
    same, the key is a one-element frozenset.

    Arguments
    ------
    filename: the filename to be read

    Returns: a dictionary in the format
    {frozenset({(fam, ind_a), (fam, ind_b)}): kinship/inbreeding}

    Raises: ValueError if a non-blank line does not have four fields or
        its coefficient cannot be converted to a float
    '''
    kindict = {}
    with open(filename) as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. trailing empty lines)
                continue
            fam, ida, idb, phi = fields
            kindict[frozenset({(fam, ida), (fam, idb)})] = float(phi)
    return kindict
示例#5
0
 def direct_to_disk(filename,
                    pop,
                    seed_size=500,
                    min_length=1,
                    size_unit='mb',
                    min_density=100,
                    maxmiss=0.25,
                    onlywithin=False,
                    njobs=1):
     '''
     Performs an SGS analysis and streams the segments to a file

     The file is opened for writing first, then _perform_sgs is called
     with the population and the remaining keyword options. Every
     segment found in every result is written as one line, produced by
     the segment's to_germline method.

     Arguments
     -----
     filename: Destination file (overwritten)
     pop: The population given to _perform_sgs
     seed_size, min_length, size_unit, min_density, maxmiss, onlywithin,
         njobs: Forwarded as-is to _perform_sgs

     Returns: Nothing
     '''
     with open(filename, 'w') as outf:
         analysis = _perform_sgs(
             pop,
             seed_size=seed_size,
             min_length=min_length,
             size_unit=size_unit,
             min_density=min_density,
             maxmiss=maxmiss,
             onlywithin=onlywithin,
             njobs=njobs)

         # Flatten results -> segments lazily, preserving result order
         all_segments = (seg for res in analysis for seg in res.segments)
         for seg in all_segments:
             outf.write(seg.to_germline() + '\n')
示例#6
0
def read_beagle_genotypefile(filename, pop, missingcode='0'):
    '''
    Reads BEAGLE formatted genotype files

    The file is read in two passes. The first pass handles the leading
    records: an 'I' record creates the individuals (one per pair of
    columns) and phenotype records fill in each individual's phenotypes.
    It stops at the first record that is neither. The second pass rewinds
    the file and collects every line starting with 'M' as a row of
    genotypes, then hands each individual its column of allele pairs.

    Phenotypes from 'A' records are stored as booleans (True when the
    value is '2'). Other phenotypes are stored as floats when they
    parse as one, and as the original string otherwise.

    Arguments
    ------
    filename: Filename of BEAGLE genotype file
    pop: the population the new Individual objects are created with
    missingcode: The value that indicates a missing genotype

    Returns: Nothing
    '''
    with open(filename) as handle:
        # Pass 1: individuals and phenotypes
        for raw in handle:
            record = BeagleGenotypeRecord(raw)

            if record.identifier == 'I':
                # Labels are repeated once per allele column; keep one each
                inds = [Individual(pop, name) for name in record.data[::2]]
                continue

            if not record.is_phenotype_record:
                # We've reached the genotypes, and we're skipping out
                break

            is_affection = record.identifier == 'A'
            for ind, value in zip(inds, record.data[::2]):
                if is_affection:
                    value = value == '2'
                else:
                    try:
                        value = float(value)
                    except ValueError:
                        # Non-numeric phenotypes are kept as strings
                        pass
                ind.phenotypes[record.label] = value

        # Pass 2: genotypes, starting again from the top of the file
        handle.seek(0)
        marker_rows = []
        for raw in handle:
            if raw.startswith('M'):
                allele_pairs = grouper(BeagleGenotypeRecord(raw).data, 2)
                marker_rows.append(list(allele_pairs))

        # Transpose so each entry holds one individual's pairs by marker
        per_individual = zip(*marker_rows)
        for ind, alleles in zip(inds, per_individual):
            ind.genotypes = genotypes_from_sequential_alleles(
                ind.chromosomes, alleles, missingcode=missingcode)
示例#7
0
def read_germline(filename):
    '''
    Reads a GERMLINE formatted SGS filename into an SGSAnalysis object

    GERMLINE files are text files with the format:

        0) Family ID 1
        1) Individual ID 1
        2) Family ID 2
        3) Individual ID 2
        4) Chromosome
        5) Segment start (bp/cM)
        6) Segment end (bp/cM)
        7) Segment start (SNP)
        8) Segment end (SNP)
        9) Total SNPs in segment
        10) Length of segment
        11) Units for genetic length (cM or MB)
        12) Mismatching SNPs in segment
        13) 1 if Individual 1 is homozygous in match; 0 otherwise
        14) 1 if Individual 2 is homozygous in match; 0 otherwise

    This function only uses 0-6.

    Each line is parsed as a GermlineRecord. One SGS object is created
    per pair of individuals the first time the pair is seen, and one
    Segment is appended to it for every line.

    Arguments
    ------
    filename: the filename to be read

    Returns: an SGSAnalysis object keyed by the record's pair
    '''
    analysis = SGSAnalysis()
    with open(filename) as f:
        for line in f:
            rec = GermlineRecord(line)

            if rec.pair not in analysis:
                analysis[rec.pair] = SGS(rec.ind1, rec.ind2)

            # Only base-pair locations are stored on the segment. When the
            # record's locations are not in bp, the segment gets no location.
            # NOTE(review): genetic (cM) locations are dropped here; confirm
            # whether Segment can accept them before wiring them through.
            phys_loc = rec.location if rec.bp_locations else None
            seg = Segment(rec.ind1, rec.ind2, rec.chromosome, None, None,
                          physical_location=phys_loc)

            analysis[rec.pair].append(seg)
    return analysis
示例#8
0
def read_vcf(filename, require_pass=False, freq_info=None, info_filters=None):
    '''
    Reads a VCF file and returns a Population object with the
    individuals represented in the file

    Arguments
    ------
    filename: The VCF file to be read
    require_pass: If true, records whose filter_passed attribute is false
        are skipped
    freq_info: The INFO key holding the allele frequency. When the value
        is comma-separated only the first entry is used. Markers without
        the key (or when freq_info is None) get a frequency of 0
    info_filters: An optional iterable of callables. Each is called with
        the VCFRecord, and the record is kept only if all return a true
        value

    Returns: a Population object

    Raises: ValueError if an entry in info_filters is not callable, or if
        the file has no '#' column header line
    '''
    if not info_filters:
        info_filters = []

    for info_filter in info_filters:
        if not callable(info_filter):
            raise ValueError('Filter not callable')

    with open(filename) as f:
        pop = Population()

        # Skip the '##' meta lines and stop at the '#' column header, whose
        # fields from the tenth onward are the individual IDs
        inds = None
        for line in f:
            if line.startswith('##'):
                continue

            elif line.startswith('#'):
                ind_ids = line.strip().split()[9:]
                inds = [Individual(pop, ind_id) for ind_id in ind_ids]
                for ind in inds:
                    pop.register_individual(ind)

                break

        if inds is None:
            raise ValueError('No header line found in VCF file')

        last_chrom = None
        chromobj = None
        genotypes = []

        # The rest of the file is variant records
        for line in f:
            record = VCFRecord(line)

            if not all(info_filter(record) for info_filter in info_filters):
                continue

            if require_pass and not record.filter_passed:
                continue

            if record.chrom != last_chrom:
                # A new chromosome label closes out the previous template
                if chromobj is not None:
                    chromobj.finalize()
                    pop.add_chromosome(chromobj)
                chromobj = ChromosomeTemplate(label=record.chrom)

            if freq_info is not None and freq_info in record.info:
                freq = record.info[freq_info]
                if ',' in freq:
                    freq = freq.split(',')[0]
                freq = float(freq)
            else:
                freq = 0

            genorow = record.genotypes()
            genotypes.append(genorow)

            chromobj.add_genotype(bp=record.pos,
                                  label=record.label,
                                  frequency=freq)

            last_chrom = record.chrom

        # chromobj stays None when no record was kept, so there is nothing
        # to finalize for an empty or fully filtered file
        if chromobj is not None:
            chromobj.finalize()
            pop.add_chromosome(chromobj)

    for ind in inds:
        # Initialize new genotypes
        ind._init_genotypes(sparse=True)

    # Now actually sift through markers and assign them to individuals.
    # Rows were collected in file order, so row k belongs to the k-th
    # (chromosome index, marker index) pair in chromosome order.
    final_indices = []
    for chromidx, chromobj in enumerate(pop.chromosomes):
        indices = zip([chromidx]*chromobj.nmark(), range(chromobj.nmark()))
        final_indices.extend(indices)

    raw_indices = range(len(genotypes))

    for raw, final in zip(raw_indices, final_indices):
        chromidx, markidx = final
        row = genotypes[raw]
        assign_genorow(row, inds, chromidx, markidx)

        # Kill the row so we don't end up with the whole dataset in memory twice
        genotypes[raw] = None

    return pop