Example #1
def do_work(fd_vcf, csv):
    # Load the VCF meta/header lines and the sample groups.
    vcf = Vcf(fd_vcf)
    vcf.load_meta_header()
    grps = Group(csv)
    # sys.stderr.write("# Of groups loaded: %s\n" % grps.num())
    # The thresholds come straight from the command line.
    min_threshold = float(sys.argv[2])
    min_num_all = float(sys.argv[3])
    ComputeSnps(vcf, grps, min_threshold, min_num_all).run()
Example #2
def doHeader(fd):
    vcf = Vcf(fd)
    vcf.load_meta_header()
    # check_info() reports whether the INFO id is already declared in the header.
    ot_already_there = vcf.check_info('OT')
    if ot_already_there:
        error("This vcf seems to have an OT INFO already. Bailing out.")
    vcf.add_info('OT', '0', 'Flag', 'The site is on target.')
    # print "Checking if INFO id=OT is there: " + str(vcf.check_info('OT'))
    print vcf.get_meta()
    return vcf
Example #3
def doHeader(fd):
  vcf = Vcf(fd)
  vcf.load_meta_header()
  # check_info() reports whether the INFO id is already declared in the header.
  rdp_already_there = vcf.check_info('RDP')
  if rdp_already_there:
    error("This vcf seems to have an RDP INFO already. Bailing out.")
  vcf.add_info('RDP', '1', 'Integer', 'Raw read coverage at locus.')
  # print "Checking if INFO id=RDP is there: " + str(vcf.check_info('RDP'))
  print vcf.get_meta()
  return vcf
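The four arguments to add_info in the two examples above map onto the standard fields of a VCF INFO header line (ID, Number, Type, Description). Assuming the Vcf class writes spec-compliant meta lines, the calls above would correspond to header entries like:

##INFO=<ID=OT,Number=0,Type=Flag,Description="The site is on target.">
##INFO=<ID=RDP,Number=1,Type=Integer,Description="Raw read coverage at locus.">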
Example #4
def do_work(fd_vcf, tsv_pheno, tsv_haplo):
  # Prepare the site-by-group matrix of genotype classes and plot it as a heatmap.
  vcf = Vcf(fd_vcf)
  vcf.load_meta_header()
  grps_pheno, grps_haplo = Group(tsv_pheno), Group(tsv_haplo)
  matrix, a_sites, a_groups = prepare(vcf, grps_pheno, grps_haplo)

  print matrix.shape
  print a_groups
  #values   = np.random.randn(100,100) * 10
  #a_sites  = ['1_100', '2_200', '3_300']
  #a_groups = ['grp1', 'grp1', 'grp2']
  cb_labels = ['HETE', 'HOMO_VAR', 'OTHER', 'NO_COVERAGE', 'HOMO_REF']
  drdplots.Heatmap(matrix, cb_labels, a_sites, a_groups).plot()
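drdplots.Heatmap is project-specific code. As an illustration of the idea only (an assumed stand-in written with matplotlib, not the project's API), a categorical heatmap with the same colorbar labels could be drawn like this, assuming the matrix holds integer genotype codes 0..4:

import numpy as np
import matplotlib.pyplot as plt

def plot_genotype_heatmap(matrix, cb_labels, a_sites, a_groups):
    # One integer code per cell; codes index into cb_labels.
    fig, ax = plt.subplots()
    im = ax.imshow(matrix, aspect='auto', interpolation='nearest',
                   vmin=0, vmax=len(cb_labels) - 1)
    cbar = fig.colorbar(im, ticks=range(len(cb_labels)))
    cbar.ax.set_yticklabels(cb_labels)
    ax.set_yticks(range(len(a_sites)))
    ax.set_yticklabels(a_sites)
    ax.set_xticks(range(len(a_groups)))
    ax.set_xticklabels(a_groups, rotation=90)
    plt.show()

# Toy call shaped like the commented-out test values above.
plot_genotype_heatmap(np.random.randint(0, 5, (3, 3)),
                      ['HETE', 'HOMO_VAR', 'OTHER', 'NO_COVERAGE', 'HOMO_REF'],
                      ['1_100', '2_200', '3_300'],
                      ['grp1', 'grp1', 'grp2'])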
Example #5
def do_work(fd_vcf, min_num_samples):
    vcf = Vcf(fd_vcf)
    vcf.load_meta_header()
    report(process_snps(vcf, min_num_samples))
Example #6
class SnpFreq(object):
    GENOME_SIZE = {'wgs': 3000000000, 'wes': 34000000}
    MIN_QUAL = 20

    def __init__(self, fd_vcf, exp_type, options):
        self.fd_vcf = fd_vcf
        self.exp_type = exp_type
        self.drop = options.drop
        self.list_s_snps = options.list_s_snps
        self.__validate_type()
        self.coordinates_in_file = options.coordinates_in_file
        self.__load_vcf()
        if options.coor_fn:
            self.coor_fn = options.coor_fn
            self.__load_species_snp_coordinates()

    def __load_species_snp_coordinates(self):
        fd = drdcommon.xopen(self.coor_fn)
        d = {}
        self.d_species_coor = d
        n = 0
        for l in fd:
            n += 1
            chrm, coor = l.split()
            if chrm not in d:
                d[chrm] = {}
            d[chrm][int(coor)] = 1
        fd.close()
        logging.info("# of coordinates loaded: %d" % n)
        logging.info("current memory usage in %dkb" % drdcommon.memory_usage())

    def __load_vcf(self):
        self.vcf = Vcf(self.fd_vcf)
        self.vcf.load_meta_header()

        if self.drop and (not self.coordinates_in_file
                          and self.vcf.num_of_samples < 2):
            drdcommon.error(
                "I need a population level vcf in order to drop species snps.")

    def __validate_type(self):
        if self.exp_type not in ('wgs', 'wes', 'null'):
            raise_it('Invalid experiment type. Valid types: wgs or wes')

    def __list_species_snps(self):
        for l in self.vcf.each_snp():
            snp = VcfSnp(l)
            if snp.is_a_substitution() and \
               snp.has_high_quality(self.MIN_QUAL) and \
               snp.species_snp():
                print(snp.coordinate(' '))

    def __is_a_species_snp(self, snp):
        if self.coordinates_in_file:
            ch, co = snp.coordinate(' ').split()
            return (ch in self.d_species_coor
                    and int(co) in self.d_species_coor[ch])
        else:
            return snp.species_snp()

    def __calculate_snp_freq(self):
        """
    Compute the snp frequency (# of snps per kbp)
    Drop snps that are indels, have low quality
    If wes, also drop non coding regions
    If drop is True, we have to drop species snps
    """
        num_snps = 0
        total = 0
        for l in self.vcf.each_snp():
            snp = VcfSnp(l)
            total += 1
            if snp.is_a_substitution() and snp.has_high_quality(self.MIN_QUAL):
                if self.exp_type == 'wgs':
                    if not self.drop or not self.__is_a_species_snp(snp):
                        num_snps += 1
                if self.exp_type == 'wes' and snp.in_coding_region():
                    if not self.drop or not self.__is_a_species_snp(snp):
                        num_snps += 1

        logging.info("Total/counted: %d/%d" % (total, num_snps))
        return (float(num_snps) / self.GENOME_SIZE[self.exp_type]) * 1000

    def run(self):
        if self.list_s_snps:
            self.__list_species_snps()
        else:
            return self.__calculate_snp_freq()
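SnpFreq is driven by an options object carrying drop, list_s_snps, coordinates_in_file and coor_fn. A minimal, hypothetical driver (the CLI wiring below is an assumption for illustration, not part of the original module) could look like:

import argparse

def main():
    p = argparse.ArgumentParser(description='SNP frequency (snps per kbp) from a vcf.')
    p.add_argument('vcf', help='path to the input vcf')
    p.add_argument('exp_type', choices=['wgs', 'wes', 'null'])
    p.add_argument('--drop', action='store_true', help='drop species snps')
    p.add_argument('--list_s_snps', action='store_true', help='only list species snps')
    p.add_argument('--coordinates_in_file', action='store_true')
    p.add_argument('--coor_fn', default=None, help='file with species snp coordinates')
    options = p.parse_args()

    # drdcommon.xopen is the project helper used above; assumed to open a path.
    fd_vcf = drdcommon.xopen(options.vcf)
    freq = SnpFreq(fd_vcf, options.exp_type, options).run()
    if freq is not None:
        print("snps per kbp: %s" % freq)

if __name__ == '__main__':
    main()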