Пример #1
0
def VariantUploader(tsv, submit_to_filesystem=False, batch_size=10000):
    """Upload the variants referenced by a submission TSV into MongoDB.

    Every record produced by ``DataGatherer(tsv).data_iterator()`` is queued
    on an unordered bulk operation of the collection returned by
    ``connect_to_mongo()``. The queue is executed each time ``batch_size``
    records have accumulated, and once more for any remainder. A progress
    line is printed before each execution.

    Args:
        tsv: Path of the submission file handed to ``DataGatherer`` (and to
            ``SubmissionFile`` when ``submit_to_filesystem`` is set).
        submit_to_filesystem: When true, the submission is re-written with
            ``SubmissionFile.to_csv`` under the records directory, each
            record's ``data_filename`` is registered with the
            ``SomaticFileSystem`` rooted at the data directory, and the
            stored ``data_filename`` is rewritten through
            ``change_data_filename``.
        batch_size: Number of records queued before the bulk operation is
            executed. Must be at least 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    data_root = '/dsde/working/somaticDB/master/data'
    records_root = '/dsde/working/somaticDB/master/records'
    # mutect1 MAF field name -> field name used in the stored document.
    maf_renames = (('contig', 'chromosome'),
                   ('position', 'start'),
                   ('ref_allele', 'ref'),
                   ('alt_allele', 'alt'))
    progress = "variants uploaded: %d (%.2f seconds since start of upload)."

    gather = DataGatherer(tsv)
    variants = connect_to_mongo()

    filesystem = None
    if submit_to_filesystem:
        filesystem = SomaticFileSystem(data_root)

        submission = SubmissionFile(tsv)
        submission.change_file_dir()
        submission.to_csv(os.path.join(records_root, os.path.basename(tsv)))

    bulk = variants.initialize_unordered_bulk_op()
    pending = 0    # records queued on the current bulk operation
    uploaded = 0   # records queued since the start of the upload
    start_time = time.time()

    for variant_dict in gather.data_iterator():
        uploaded += 1
        pending += 1

        # presumably merge_dicts returns a new dict, so the renames below do
        # not touch the gatherer's record -- TODO confirm.
        mongo_submission = merge_dicts(variant_dict, {})

        # If the user is uploading a mutect1 maf, rename its fields.
        for maf_name, stored_name in maf_renames:
            if maf_name in mongo_submission:
                mongo_submission[stored_name] = mongo_submission.pop(maf_name)

        # NOTE(review): the result of this lookup is not used. The call is
        # kept because it may fail on records that lack one of the
        # identifying fields; confirm against get_entries_from_dict and drop
        # it if it does not.
        get_entries_from_dict(mongo_submission,
                              keys=[
                                  'chromosome', 'start', 'ref',
                                  'alt', 'project', 'dataset',
                                  'evidence_type'
                              ],
                              return_type=dict)

        # Compare against None so a filesystem object that happens to be
        # falsy (e.g. an empty container) is still populated.
        if filesystem is not None:
            project = mongo_submission['project']
            dataset = mongo_submission['dataset']
            filesystem.add_project(project)
            filesystem[project].add_dataset(dataset)

            filesystem[project][dataset].add_file(
                mongo_submission['data_filename'])

            mongo_submission['data_filename'] = change_data_filename(
                "%s/%s/%s/" % (data_root, project, dataset),
                mongo_submission['data_filename'])

        bulk.insert(mongo_submission)

        if pending == batch_size:
            print(progress % (uploaded, time.time() - start_time))
            pending = 0
            bulk.execute()
            bulk = variants.initialize_unordered_bulk_op()

    # Only execute the trailing batch when something is queued on it.
    if pending > 0:
        print(progress % (uploaded, time.time() - start_time))
        bulk.execute()
Пример #2
0
def VariantUploader(tsv, submit_to_filesystem=False, batch_size=10000):
    """Upload the variants referenced by a submission TSV into MongoDB.

    Every record produced by ``DataGatherer(tsv).data_iterator()`` is queued
    on an unordered bulk operation of the collection returned by
    ``connect_to_mongo()``. The queue is executed each time ``batch_size``
    records have accumulated, and once more for any remainder. A progress
    line is printed before each execution.

    Args:
        tsv: Path of the submission file handed to ``DataGatherer`` (and to
            ``SubmissionFile`` when ``submit_to_filesystem`` is set).
        submit_to_filesystem: When true, the submission is re-written with
            ``SubmissionFile.to_csv`` under the records directory, each
            record's ``data_filename`` is registered with the
            ``SomaticFileSystem`` rooted at the data directory, and the
            stored ``data_filename`` is rewritten through
            ``change_data_filename``.
        batch_size: Number of records queued before the bulk operation is
            executed. Must be at least 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    data_root = "/dsde/working/somaticDB/master/data"
    records_root = "/dsde/working/somaticDB/master/records"
    # mutect1 MAF field name -> field name used in the stored document.
    maf_renames = (
        ("contig", "chromosome"),
        ("position", "start"),
        ("ref_allele", "ref"),
        ("alt_allele", "alt"),
    )
    progress = "variants uploaded: %d (%.2f seconds since start of upload)."

    gather = DataGatherer(tsv)
    variants = connect_to_mongo()

    filesystem = None
    if submit_to_filesystem:
        filesystem = SomaticFileSystem(data_root)

        submission = SubmissionFile(tsv)
        submission.change_file_dir()
        submission.to_csv(os.path.join(records_root, os.path.basename(tsv)))

    bulk = variants.initialize_unordered_bulk_op()
    pending = 0  # records queued on the current bulk operation
    uploaded = 0  # records queued since the start of the upload
    start_time = time.time()

    for variant_dict in gather.data_iterator():
        uploaded += 1
        pending += 1

        # presumably merge_dicts returns a new dict, so the renames below do
        # not touch the gatherer's record -- TODO confirm.
        mongo_submission = merge_dicts(variant_dict, {})

        # If the user is uploading a mutect1 maf, rename its fields.
        for maf_name, stored_name in maf_renames:
            if maf_name in mongo_submission:
                mongo_submission[stored_name] = mongo_submission.pop(maf_name)

        # NOTE(review): the result of this lookup is not used. The call is
        # kept because it may fail on records that lack one of the
        # identifying fields; confirm against get_entries_from_dict and drop
        # it if it does not.
        get_entries_from_dict(
            mongo_submission,
            keys=["chromosome", "start", "ref", "alt", "project", "dataset", "evidence_type"],
            return_type=dict,
        )

        # Compare against None so a filesystem object that happens to be
        # falsy (e.g. an empty container) is still populated.
        if filesystem is not None:
            project = mongo_submission["project"]
            dataset = mongo_submission["dataset"]
            filesystem.add_project(project)
            filesystem[project].add_dataset(dataset)

            filesystem[project][dataset].add_file(mongo_submission["data_filename"])

            mongo_submission["data_filename"] = change_data_filename(
                "%s/%s/%s/" % (data_root, project, dataset), mongo_submission["data_filename"]
            )

        bulk.insert(mongo_submission)

        if pending == batch_size:
            print(progress % (uploaded, time.time() - start_time))
            pending = 0
            bulk.execute()
            bulk = variants.initialize_unordered_bulk_op()

    # Only execute the trailing batch when something is queued on it.
    if pending > 0:
        print(progress % (uploaded, time.time() - start_time))
        bulk.execute()
Пример #3
0
def _ratio_or_nan(numerator, denominator):
    """Return ``numerator / denominator``, or NaN when the denominator is 0."""
    if denominator == 0:
        return np.nan
    return numerator / denominator


def _write_variant_rows(writer, sample_information, variants, feature):
    """Write one row per variant of ``sample_information`` to ``writer``."""
    for variant in variants:
        writer.writerow({'project': sample_information[0],
                         'dataset': sample_information[1],
                         'sample': sample_information[2],
                         'chromosome': variant[0],
                         'start': variant[1],
                         'ref': variant[2],
                         'alt': variant[3],
                         'variant_type': feature})


def VariantAssessor(query, tsv, output_file, outdir=""):
    """Compare a caller's variants against database truth and write reports.

    Records matching ``query`` are read from the collection returned by
    ``connect_to_mongo()``; those whose ``evidence_type`` contains ``'TP'``
    form the known-true set of their (project, dataset, sample). The
    caller's variants come from ``DataGatherer(tsv)``. For each of ``'snp'``
    and ``'indel'`` three files are written into ``outdir``
    (``<feature>.false_negatives.tsv``, ``<feature>.false_positives.tsv``,
    ``<feature>.true_positives.tsv``), and a per-sample summary table is
    written to ``output_file``.

    Samples whose ``evidence_type`` in ``tsv`` is ``'NN'`` only get a
    false-positive count (every found variant) and a precision of 0; all
    other samples get true/false positive and false negative counts, tpr,
    precision and ``dream_accuracy`` (the mean of tpr and precision).
    Ratios with a zero denominator are reported as NaN.

    Args:
        query: Query passed through ``query_processor``; the result is
            parsed with ``ast.literal_eval`` and given to
            ``collection.find``.
        tsv: Tab-separated submission file with at least the columns
            ``project``, ``dataset``, ``sample`` and ``evidence_type``.
        output_file: Destination of the tab-separated summary table.
        outdir: Directory receiving the per-feature variant files.
    """
    sample_keys = ['project', 'dataset', 'sample']
    variant_keys = ['chromosome', 'start', 'ref', 'alt']
    variant_columns = sample_keys + variant_keys + ['variant_type']
    summary_columns = ['project', 'dataset', 'sample', 'false_positives',
                       'true_positives', 'false_negatives', 'tpr',
                       'precision', 'evidence_type', 'dream_accuracy',
                       'variant_type']
    categories = ('false_negatives', 'false_positives', 'true_positives')

    collection = connect_to_mongo()

    caller_output = pd.read_csv(tsv, sep='\t')

    known_true = {'snp': defaultdict(set), 'indel': defaultdict(set)}
    found_variants = {'snp': defaultdict(set), 'indel': defaultdict(set)}

    query = query_processor(query)

    logging.getLogger(__name__).info("Querying database for variants.")

    # Collect the known-true variants returned by the query.
    for record in collection.find(ast.literal_eval(query)):
        sample_information = tuple(map(str, get_entries_from_dict(
            record, keys=sample_keys, return_type=tuple)))
        variant = tuple(map(str, get_entries_from_dict(
            record, keys=variant_keys, return_type=tuple)))

        evidence_type = record['evidence_type']

        if is_snp(record):
            feature = 'snp'
        elif is_indel(record):
            feature = 'indel'
        else:
            continue

        if 'TP' in evidence_type:
            known_true[feature][sample_information].add(variant)

    # Samples flagged 'NN' are assessed as normal-normal; every other
    # evidence type falls back to the default ('CM') assessment.
    normal_normal = set()
    for _, row in caller_output.iterrows():
        if row['evidence_type'] == 'NN':
            normal_normal.add((row['project'], row['dataset'], row['sample']))

    gather = DataGatherer(tsv)

    logging.getLogger(__name__).info("Collection of variants from user submitted files.")

    # Data from file (algorithm being tested).
    # NOTE(review): database keys are converted to str above, but the keys
    # built here and from caller_output below are used as-is; they only
    # match if those sources already yield strings -- confirm.
    for variant_dict in gather.data_iterator():
        sample_information = get_entries_from_dict(
            variant_dict, keys=sample_keys, return_type=tuple)
        variant = get_entries_from_dict(
            variant_dict, keys=variant_keys, return_type=tuple)

        print("%s %s" % (sample_information, variant))

        if is_snp(variant_dict):
            found_variants['snp'][sample_information].add(variant)
        elif is_indel(variant_dict):
            found_variants['indel'][sample_information].add(variant)

    caller_samples = caller_output[sample_keys].values.tolist()

    data = []

    for feature in ['snp', 'indel']:
        handles = []
        try:
            writers = {}
            for category in categories:
                handle = open(
                    os.path.join(outdir, feature + "." + category + ".tsv"),
                    'w')
                handles.append(handle)
                writers[category] = csv.DictWriter(handle,
                                                   delimiter='\t',
                                                   fieldnames=variant_columns)
                writers[category].writeheader()

            for sample_information in map(tuple, caller_samples):
                if sample_information in normal_normal:
                    assessment_type = 'NN'
                else:
                    assessment_type = 'CM'

                found = found_variants[feature][sample_information]
                truth = known_true[feature][sample_information]
                true_positives = found.intersection(truth)
                false_positives = found.difference(truth)
                false_negatives = truth.difference(found)

                # 'dream_accuracy' is always present so the summary table
                # has the column even when no sample is assessed as 'CM'.
                row_dict = {'project': sample_information[0],
                            'dataset': sample_information[1],
                            'sample': sample_information[2],
                            'false_positives': 0,
                            'true_positives': 0,
                            'false_negatives': 0,
                            'tpr': np.nan,
                            'precision': np.nan,
                            'evidence_type': assessment_type,
                            'dream_accuracy': np.nan,
                            'variant_type': feature}

                if assessment_type == 'NN':
                    # Every call on a normal-normal sample is counted as a
                    # false positive.
                    row_dict['false_positives'] = len(found)
                    row_dict['precision'] = 0
                else:
                    TP = float(len(true_positives))
                    FN = float(len(false_negatives))
                    FP = float(len(false_positives))

                    print("%s %s %s" % (TP, FN, FP))

                    row_dict['true_positives'] = TP
                    row_dict['false_negatives'] = FN
                    row_dict['false_positives'] = FP
                    row_dict['tpr'] = _ratio_or_nan(TP, TP + FN)
                    row_dict['precision'] = _ratio_or_nan(TP, TP + FP)
                    row_dict['dream_accuracy'] = (
                        row_dict['tpr'] + row_dict['precision']) / 2.0

                    print("%s %s %s" % (row_dict['tpr'],
                                        row_dict['precision'],
                                        row_dict['dream_accuracy']))

                data.append(row_dict)

                _write_variant_rows(writers['true_positives'],
                                    sample_information, true_positives,
                                    feature)
                _write_variant_rows(writers['false_positives'],
                                    sample_information, false_positives,
                                    feature)
                _write_variant_rows(writers['false_negatives'],
                                    sample_information, false_negatives,
                                    feature)
        finally:
            # Close the per-feature files so their contents are flushed.
            for handle in handles:
                handle.close()

    pd.DataFrame(data, columns=summary_columns).to_csv(
        output_file, sep='\t', index=False, na_rep='nan')
Пример #4
0
def _ratio_or_nan(numerator, denominator):
    """Return ``numerator / denominator``, or NaN when the denominator is 0."""
    if denominator == 0:
        return np.nan
    return numerator / denominator


def _write_variant_rows(writer, sample_information, variants, feature):
    """Write one row per variant of ``sample_information`` to ``writer``."""
    for variant in variants:
        writer.writerow({
            'project': sample_information[0],
            'dataset': sample_information[1],
            'sample': sample_information[2],
            'chromosome': variant[0],
            'start': variant[1],
            'ref': variant[2],
            'alt': variant[3],
            'variant_type': feature
        })


def VariantAssessor(query, tsv, output_file, outdir=""):
    """Compare a caller's variants against database truth and write reports.

    Records matching ``query`` are read from the collection returned by
    ``connect_to_mongo()``; those whose ``evidence_type`` contains ``'TP'``
    form the known-true set of their (project, dataset, sample). The
    caller's variants come from ``DataGatherer(tsv)``. For each of ``'snp'``
    and ``'indel'`` three files are written into ``outdir``
    (``<feature>.false_negatives.tsv``, ``<feature>.false_positives.tsv``,
    ``<feature>.true_positives.tsv``), and a per-sample summary table is
    written to ``output_file``.

    Samples whose ``evidence_type`` in ``tsv`` is ``'NN'`` only get a
    false-positive count (every found variant) and a precision of 0; all
    other samples get true/false positive and false negative counts, tpr,
    precision and ``dream_accuracy`` (the mean of tpr and precision).
    Ratios with a zero denominator are reported as NaN.

    Args:
        query: Query passed through ``query_processor``; the result is
            parsed with ``ast.literal_eval`` and given to
            ``collection.find``.
        tsv: Tab-separated submission file with at least the columns
            ``project``, ``dataset``, ``sample`` and ``evidence_type``.
        output_file: Destination of the tab-separated summary table.
        outdir: Directory receiving the per-feature variant files.
    """
    sample_keys = ['project', 'dataset', 'sample']
    variant_keys = ['chromosome', 'start', 'ref', 'alt']
    variant_columns = sample_keys + variant_keys + ['variant_type']
    summary_columns = [
        'project', 'dataset', 'sample', 'false_positives', 'true_positives',
        'false_negatives', 'tpr', 'precision', 'evidence_type',
        'dream_accuracy', 'variant_type'
    ]
    categories = ('false_negatives', 'false_positives', 'true_positives')

    collection = connect_to_mongo()

    caller_output = pd.read_csv(tsv, sep='\t')

    known_true = {'snp': defaultdict(set), 'indel': defaultdict(set)}
    found_variants = {'snp': defaultdict(set), 'indel': defaultdict(set)}

    query = query_processor(query)

    logging.getLogger(__name__).info("Querying database for variants.")

    # Collect the known-true variants returned by the query.
    for record in collection.find(ast.literal_eval(query)):
        sample_information = tuple(
            map(str,
                get_entries_from_dict(
                    record, keys=sample_keys, return_type=tuple)))
        variant = tuple(
            map(str,
                get_entries_from_dict(
                    record, keys=variant_keys, return_type=tuple)))

        evidence_type = record['evidence_type']

        if is_snp(record):
            feature = 'snp'
        elif is_indel(record):
            feature = 'indel'
        else:
            continue

        if 'TP' in evidence_type:
            known_true[feature][sample_information].add(variant)

    # Samples flagged 'NN' are assessed as normal-normal; every other
    # evidence type falls back to the default ('CM') assessment.
    normal_normal = set()
    for _, row in caller_output.iterrows():
        if row['evidence_type'] == 'NN':
            normal_normal.add((row['project'], row['dataset'], row['sample']))

    gather = DataGatherer(tsv)

    logging.getLogger(__name__).info(
        "Collection of variants from user submitted files.")

    # Data from file (algorithm being tested).
    # NOTE(review): database keys are converted to str above, but the keys
    # built here and from caller_output below are used as-is; they only
    # match if those sources already yield strings -- confirm.
    for variant_dict in gather.data_iterator():
        sample_information = get_entries_from_dict(
            variant_dict, keys=sample_keys, return_type=tuple)
        variant = get_entries_from_dict(
            variant_dict, keys=variant_keys, return_type=tuple)

        print("%s %s" % (sample_information, variant))

        if is_snp(variant_dict):
            found_variants['snp'][sample_information].add(variant)
        elif is_indel(variant_dict):
            found_variants['indel'][sample_information].add(variant)

    caller_samples = caller_output[sample_keys].values.tolist()

    data = []

    for feature in ['snp', 'indel']:
        handles = []
        try:
            writers = {}
            for category in categories:
                handle = open(
                    os.path.join(outdir, feature + "." + category + ".tsv"),
                    'w')
                handles.append(handle)
                writers[category] = csv.DictWriter(
                    handle, delimiter='\t', fieldnames=variant_columns)
                writers[category].writeheader()

            for sample_information in map(tuple, caller_samples):
                if sample_information in normal_normal:
                    assessment_type = 'NN'
                else:
                    assessment_type = 'CM'

                found = found_variants[feature][sample_information]
                truth = known_true[feature][sample_information]
                true_positives = found.intersection(truth)
                false_positives = found.difference(truth)
                false_negatives = truth.difference(found)

                # 'dream_accuracy' is always present so the summary table
                # has the column even when no sample is assessed as 'CM'.
                row_dict = {
                    'project': sample_information[0],
                    'dataset': sample_information[1],
                    'sample': sample_information[2],
                    'false_positives': 0,
                    'true_positives': 0,
                    'false_negatives': 0,
                    'tpr': np.nan,
                    'precision': np.nan,
                    'evidence_type': assessment_type,
                    'dream_accuracy': np.nan,
                    'variant_type': feature
                }

                if assessment_type == 'NN':
                    # Every call on a normal-normal sample is counted as a
                    # false positive.
                    row_dict['false_positives'] = len(found)
                    row_dict['precision'] = 0
                else:
                    TP = float(len(true_positives))
                    FN = float(len(false_negatives))
                    FP = float(len(false_positives))

                    print("%s %s %s" % (TP, FN, FP))

                    row_dict['true_positives'] = TP
                    row_dict['false_negatives'] = FN
                    row_dict['false_positives'] = FP
                    row_dict['tpr'] = _ratio_or_nan(TP, TP + FN)
                    row_dict['precision'] = _ratio_or_nan(TP, TP + FP)
                    row_dict['dream_accuracy'] = (
                        row_dict['tpr'] + row_dict['precision']) / 2.0

                    print("%s %s %s" % (row_dict['tpr'],
                                        row_dict['precision'],
                                        row_dict['dream_accuracy']))

                data.append(row_dict)

                _write_variant_rows(writers['true_positives'],
                                    sample_information, true_positives,
                                    feature)
                _write_variant_rows(writers['false_positives'],
                                    sample_information, false_positives,
                                    feature)
                _write_variant_rows(writers['false_negatives'],
                                    sample_information, false_negatives,
                                    feature)
        finally:
            # Close the per-feature files so their contents are flushed.
            for handle in handles:
                handle.close()

    pd.DataFrame(data, columns=summary_columns).to_csv(output_file,
                                                       sep='\t',
                                                       index=False,
                                                       na_rep='nan')