示例#1
0
                            institute=institute)

    # If a variant belongs to any gene lists we check which ones
    mongo_variant['gene_lists'] = variant['info_dict'].get(
        config_object['VCF']['GeneLists']['vcf_info_key'], None)

    ################# Add the rank score and variant rank #################
    # Get the rank score as specified in the config file.
    # This is central for displaying variants in scout.

    mongo_variant['rank_score'] = float(
        variant.get('rank_scores', {}).get(case_name, 0.0))

    ################# Add gt calls #################
    gt_calls = []
    for individual_id, display_name in iteritems(individuals):
        # This function returns an ODM GTCall object with the
        # relevant information for an individual:
        gt_calls.append(
            get_genotype(variant, config_object, individual_id, display_name))
    mongo_variant['samples'] = gt_calls

    ################# Add the compound information #################

    mongo_variant['compounds'] = get_compounds(variant, case, variant_type)

    ################# Add the inheritance patterns #################

    mongo_variant['genetic_models'] = variant.get('genetic_models',
                                                  {}).get(case_name, [])
示例#2
0
def load_mongo_db(scout_configs,
                  vcf_configs=None,
                  family_type='cmms',
                  mongo_db='variantDatabase',
                  variant_type='clinical',
                  username=None,
                  password=None,
                  port=27017,
                  host='localhost',
                  rank_score_threshold=0,
                  variant_number_threshold=5000):
    """Populate a mongo database with information from ped and variant files.

    The function connects to the database, stores any collaborating
    institutes that are not already present, updates the case, deletes the
    variants previously stored for the case and variant type, and then
    inserts variants read from the vcf file.

    Args:
        scout_configs: Mapping read for the keys 'load_vcf' (vcf path) and
            'ped' (ped path); it is also handed to ``get_case``.
        vcf_configs: Passed to ``ConfigParser`` to build the config object.
        family_type: Passed to ``get_case``.
        mongo_db: Database name passed to ``connect``.
        variant_type: Selects which old variants are deleted and is passed
            to ``update_case`` and ``get_mongo_variant``.
        username: Passed to ``connect``.
        password: Passed to ``connect``.
        port: Passed to ``connect``.
        host: Passed to ``connect``.
        rank_score_threshold: Loading stops at the first variant whose rank
            score for this case is not greater than this value.
        variant_number_threshold: Maximum number of variants to insert.
    """
    # get root path of the Flask app
    # project_root = '/'.join(app.root_path.split('/')[0:-1])

    logger = logging.getLogger(__name__)
    # For testing only
    if __name__ == '__main__':
        logger = logging.getLogger("scout.ext.backend.load_mongo")

    ####### Locate the input files and connect to the database #######
    vcf_file = scout_configs['load_vcf']
    logger.info(
        "Found a vcf for loading variants into scout: {0}".format(vcf_file))

    logger.info("Connecting to {0}".format(mongo_db))
    connect(mongo_db,
            host=host,
            port=port,
            username=username,
            password=password)

    # NOTE(review): the handle is not read again in this function body;
    # the call is kept because it may have side effects.
    variant_database = get_db()

    ped_file = scout_configs['ped']
    logger.info("Found a ped file: {0}".format(ped_file))

    ######## Parse the config file to check for keys ########
    logger.info("Parsing config file")
    config_object = ConfigParser(vcf_configs)

    ######## Get the case from the ped file ########
    logger.info("Get the case from ped file")
    case = get_case(scout_configs, family_type)
    logger.info('Case found in {0}: {1}'.format(ped_file, case.display_name))

    ######## Add missing institutes to the mongo db ########
    for institute_name in case['collaborators']:
        if not institute_name:
            continue
        institute = get_institute(institute_name)
        logger.info("Institute found: {0}".format(institute))
        try:
            Institute.objects.get(internal_id=institute.internal_id)
        except DoesNotExist:
            institute.save()
            logger.info(
                "Adding new institute {0} to database".format(institute))
        else:
            logger.info("Institute {0} already in database".format(institute))

    logger.info("Updating case in database")
    update_case(case, variant_type, logger)

    ######## Get the variants and add them to the mongo db ########
    logger.info("Setting up a variant parser")
    variant_parser = VCFParser(infile=vcf_file,
                               split_variants=True,
                               skip_info_check=True)
    nr_of_variants = 0

    logger.info("Deleting old variants for case {0}".format(case.case_id))
    Variant.objects(case_id=case.case_id, variant_type=variant_type).delete()
    logger.debug("Variants deleted")

    # NOTE(review): this timestamp is not read again in this function body.
    start_inserting_variants = datetime.now()

    # Individuals declared in the ped file: individual id -> display name
    ped_individuals = {
        individual.individual_id: individual.display_name
        for individual in case.individuals
    }

    # Keep only the ped individuals that also exist in the vcf file, again
    # as a dictionary with individual ids as keys and display names as values
    individuals = {}
    logger.info("Checking which individuals in ped file exists in vcf")
    for individual_id, display_name in iteritems(ped_individuals):
        logger.debug("Checking individual {0}".format(individual_id))
        if individual_id in variant_parser.individuals:
            logger.debug("Individual {0} found".format(individual_id))
            individuals[individual_id] = display_name
        else:
            logger.warning("Individual {0} is present in ped file but"
                           " not in vcf".format(individual_id))

    logger.info('Start parsing variants')

    for variant in variant_parser:
        # Lazy logging arguments: this runs once per variant, so the message
        # should only be formatted when debug output is actually enabled.
        logger.debug("Parsing variant %s", variant['variant_id'])

        # Stop at the first variant that does not exceed the rank score
        # threshold (presumably the vcf is sorted on rank score - confirm).
        rank_score = float(variant['rank_scores'][case.display_name])
        if not rank_score > rank_score_threshold:
            logger.info("Lower rank score threshold reaced after {0}"
                        " variants".format(nr_of_variants))
            break

        # nr_of_variants counts the variants already inserted, so '>=' caps
        # the load at exactly variant_number_threshold variants ('>' let one
        # extra variant through).
        if nr_of_variants >= variant_number_threshold:
            logger.info("Variant number threshold reached. ({0})".format(
                variant_number_threshold))
            break

        nr_of_variants += 1
        mongo_variant = get_mongo_variant(variant, variant_type, individuals,
                                          case, config_object, nr_of_variants)
        mongo_variant.save()

        # Progress report for every 1000 inserted variants
        if nr_of_variants % 1000 == 0:
            logger.info('{0} variants parsed'.format(nr_of_variants))
示例#3
0
def load_mongo_db(scout_configs, vcf_configs=None, family_type='cmms',
                  mongo_db='variantDatabase', variant_type='clinical',
                  username=None, password=None, port=27017, host='localhost',
                  rank_score_threshold=0, variant_number_threshold=5000):
  """Populate a mongo database with information from ped and variant files.

  The function connects to the database, stores any collaborating
  institutes that are not already present, updates the case, deletes the
  variants previously stored for the case and variant type, inserts
  variants read from the vcf file and finally updates the indexes.

  Args:
    scout_configs: Mapping read for the keys 'load_vcf' (vcf path) and
      'ped' (ped path); it is also handed to ``get_case``.
    vcf_configs: Passed to ``ConfigParser`` to build the config object.
    family_type: Passed to ``get_case``.
    mongo_db: Database name passed to ``connect``.
    variant_type: Selects which old variants are deleted and is passed
      to ``update_case`` and ``get_mongo_variant``.
    username: Passed to ``connect``.
    password: Passed to ``connect``.
    port: Passed to ``connect``.
    host: Passed to ``connect``.
    rank_score_threshold: Loading stops at the first variant whose rank
      score for this case is not greater than this value.
    variant_number_threshold: Maximum number of variants to insert.

  Returns:
    None.
  """
  # get root path of the Flask app
  # project_root = '/'.join(app.root_path.split('/')[0:-1])

  logger = logging.getLogger(__name__)
  # For testing only
  if __name__ == '__main__':
    logger = logging.getLogger("scout.ext.backend.load_mongo")

  ####### Locate the input files and connect to the database #######
  vcf_file = scout_configs['load_vcf']
  logger.info("Found a vcf for loading variants into scout: {0}".format(
    vcf_file
  ))

  logger.info("Connecting to {0}".format(mongo_db))
  connect(mongo_db, host=host, port=port, username=username,
          password=password)

  # Database handle, used for the index update at the end
  variant_database = get_db()

  ped_file = scout_configs['ped']
  logger.info("Found a ped file: {0}".format(ped_file))

  ######## Parse the config file to check for keys ########
  logger.info("Parsing config file")
  config_object = ConfigParser(vcf_configs)

  ######## Get the case from the ped file ########
  logger.info("Get the case from ped file")
  case = get_case(scout_configs, family_type)
  logger.info('Case found in {0}: {1}'.format(ped_file, case.display_name))

  ######## Add missing institutes to the mongo db ########
  for institute_name in case['collaborators']:
    if not institute_name:
      continue
    institute = get_institute(institute_name)
    logger.info("Institute found: {0}".format(institute))
    try:
      Institute.objects.get(internal_id=institute.internal_id)
    except DoesNotExist:
      institute.save()
      logger.info("Adding new institute {0} to database".format(institute))
    else:
      logger.info("Institute {0} already in database".format(institute))

  logger.info("Updating case in database")
  update_case(case, variant_type, logger)

  ######## Get the variants and add them to the mongo db ########
  logger.info("Setting up a variant parser")
  variant_parser = VCFParser(infile=vcf_file, split_variants=True,
                             skip_info_check=True)
  nr_of_variants = 0

  logger.info("Deleting old variants for case {0}".format(case.case_id))
  Variant.objects(case_id=case.case_id, variant_type=variant_type).delete()
  logger.debug("Variants deleted")

  start_inserting_variants = datetime.now()

  # Individuals declared in the ped file: individual id -> display name
  ped_individuals = {individual.individual_id: individual.display_name
                     for individual in case.individuals}

  # Keep only the ped individuals that also exist in the vcf file, again
  # as a dictionary with individual ids as keys and display names as values
  individuals = {}
  logger.info("Checking which individuals in ped file exists in vcf")
  for individual_id, display_name in iteritems(ped_individuals):
    logger.debug("Checking individual {0}".format(individual_id))
    if individual_id in variant_parser.individuals:
      logger.debug("Individual {0} found".format(individual_id))
      individuals[individual_id] = display_name
    else:
      logger.warning("Individual {0} is present in ped file but"
                     " not in vcf".format(individual_id))

  logger.info('Start parsing variants')

  for variant in variant_parser:
    # Lazy logging arguments: this runs once per variant, so the message
    # should only be formatted when debug output is actually enabled.
    logger.debug("Parsing variant %s", variant['variant_id'])

    # Stop at the first variant that does not exceed the rank score
    # threshold (presumably the vcf is sorted on rank score - confirm).
    rank_score = float(variant['rank_scores'][case.display_name])
    if not rank_score > rank_score_threshold:
      logger.info("Lower rank score threshold reaced after {0}"
                  " variants".format(nr_of_variants))
      break

    # nr_of_variants counts the variants already inserted, so '>=' caps
    # the load at exactly variant_number_threshold variants ('>' let one
    # extra variant through).
    if nr_of_variants >= variant_number_threshold:
      logger.info("Variant number threshold reached. ({0})".format(
        variant_number_threshold))
      break

    nr_of_variants += 1
    mongo_variant = get_mongo_variant(variant, variant_type, individuals,
                                      case, config_object, nr_of_variants)
    mongo_variant.save()

    # Progress report for every 1000 inserted variants
    if nr_of_variants % 1000 == 0:
      logger.info('{0} variants parsed'.format(nr_of_variants))

  logger.info("Parsing variants done")
  logger.info("{0} variants inserted".format(nr_of_variants))
  logger.info("Time to insert variants: {0}".format(
    datetime.now() - start_inserting_variants
  ))

  logger.info("Updating indexes")
  ensure_indexes(variant_database, logger)