Example #1
    def data_iterator(self,
                      keys=('tumor_bam', 'normal_bam', 'data_filename',
                            'project', 'dataset', 'sample', 'evidence_type')):
        file = open(self.filename, 'rU')
        reader = csv.DictReader(file, delimiter='\t')

        logging.getLogger(__name__).info(
            "Gathering variants from table of files: " + self.filename)

        for file_data in reader:

            # skip rows that carry a non-empty FILTER entry
            if 'FILTER' in file_data and len(file_data['FILTER']) > 0:
                continue

            meta_data_dict = get_entries_from_dict(file_data,
                                                   keys=keys,
                                                   return_type=dict)

            logging.getLogger(__name__).info(
                "Gathering variants from individual file: " +
                meta_data_dict['data_filename'])

            D = DatabaseParser(meta_data_dict['data_filename'])
            self.current_file = meta_data_dict['data_filename']

            n = 0

            self.new_file = True

            for variant_dict in D.get_variants():
                yield merge_dicts(variant_dict, meta_data_dict)
                if self.new_file: self.new_file = False

        self.current_file = None
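
Note: the snippets on this page call two helpers that are defined elsewhere in the module. A minimal sketch of what get_entries_from_dict and merge_dicts could look like, inferred from their call sites (subsetting a dict by keys with a selectable return type, and merging dicts); these are hypothetical reconstructions, not the original source:

# Hypothetical reconstructions, inferred from how the snippets use them.
def get_entries_from_dict(data, keys=(), return_type=dict):
    # subset `data` to `keys`, returning a dict or a tuple of values
    if return_type == dict:
        return dict((k, data[k]) for k in keys)
    return tuple(data[k] for k in keys)


def merge_dicts(*dicts):
    # merge left to right; later keys win
    merged = {}
    for d in dicts:
        merged.update(d)
    return merged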
Example #2
    def data_iterator(self, keys=('tumor_bam', 'normal_bam',
                                  'data_filename',
                                  'project',
                                  'dataset',
                                  'sample',
                                  'evidence_type')):
        file = open(self.filename, 'rU')
        reader = csv.DictReader(file, delimiter='\t')

        logging.getLogger(__name__).info(
            "Gathering variants from table of files: " + self.filename)

        for file_data in reader:

            # skip rows that carry a non-empty FILTER entry
            if 'FILTER' in file_data and len(file_data['FILTER']) > 0:
                continue

            meta_data_dict = get_entries_from_dict(file_data,
                                                   keys=keys,
                                                   return_type=dict)

            logging.getLogger(__name__).info(
                "Gathering variants from individual file: " +
                meta_data_dict['data_filename'])

            D = DatabaseParser(meta_data_dict['data_filename'])
            self.current_file = meta_data_dict['data_filename']

            n = 0

            self.new_file = True

            for variant_dict in D.get_variants():
                yield merge_dicts(variant_dict, meta_data_dict)
                if self.new_file: self.new_file = False

        self.current_file = None
Example #3
def survey(filename):

    logging.getLogger(__name__).info("Beginning survey.")

    query = "{'project' : { '$exists' : 'true' } }"

    tally = defaultdict(int)

    collector = connect_to_mongo()

    # collect query information
    n = 0
    for record in collector.find(ast.literal_eval(query)):

        sample_information = get_entries_from_dict(
            record,
            keys=['project', 'dataset', 'sample', 'evidence_type'],
            return_type=dict)

        feature = None

        if is_snp(record):
            feature = 'snp'
        if is_indel(record):
            feature = 'indel'

        if feature is None:
            print record
            continue

        if sample_information['evidence_type'] == 'TP':
            n += 1
            project = sample_information['project']
            dataset = sample_information['dataset']
            sample = sample_information['sample']

            tally[(project, dataset, sample, feature)] += 1
            tally[(project, dataset, '', feature)] += 1
            tally[(project, '', '', feature)] += 1
            tally[('', '', '', feature)] += 1

            if not (n % 10000):
                logging.getLogger(__name__).info("Variants seen: " + str(n))

    fp = csv.DictWriter(
        open(filename, 'w'),
        fieldnames=['project', 'dataset', 'sample', 'feature', 'count'],
        delimiter='\t')

    fp.writeheader()

    for item in tally:
        fp.writerow({
            'project': item[0],
            'dataset': item[1],
            'sample': item[2],
            'feature': item[3],
            'count': tally[item]
        })
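
The four tally updates roll each true-positive variant up through progressively coarser keys, so a single pass yields per-sample, per-dataset, per-project, and global counts. A toy run of the same pattern:

from collections import defaultdict

tally = defaultdict(int)
for project, dataset, sample, feature in [('P1', 'D1', 'S1', 'snp'),
                                          ('P1', 'D1', 'S2', 'snp')]:
    tally[(project, dataset, sample, feature)] += 1  # per-sample count
    tally[(project, dataset, '', feature)] += 1      # dataset subtotal
    tally[(project, '', '', feature)] += 1           # project subtotal
    tally[('', '', '', feature)] += 1                # global total

assert tally[('P1', 'D1', '', 'snp')] == 2
assert tally[('', '', '', 'snp')] == 2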
Example #4
def survey(filename):

    logging.getLogger(__name__).info("Beginning survey.")

    query = "{'project' : { '$exists' : 'true' } }"

    tally = defaultdict(int)

    collector = connect_to_mongo()

    # collect query information
    n = 0
    for record in collector.find(ast.literal_eval(query)):

        sample_information = get_entries_from_dict(record,
                                                   keys=['project',
                                                         'dataset',
                                                         'sample',
                                                         'evidence_type'],
                                                   return_type=dict)

        feature = None

        if is_snp(record):
            feature = 'snp'
        if is_indel(record):
            feature = 'indel'

        if feature is None:
            print record
            continue


        if sample_information['evidence_type'] == 'TP':
            n += 1
            project = sample_information['project']
            dataset = sample_information['dataset']
            sample = sample_information['sample']

            tally[(project, dataset, sample, feature)] += 1
            tally[(project, dataset, '', feature)] += 1
            tally[(project, '', '', feature)] += 1
            tally[('', '', '', feature)] += 1

            if not (n % 10000):
                logging.getLogger(__name__).info("Variants seen: " + str(n))


    fp = csv.DictWriter(open(filename, 'w'),
                        fieldnames=['project', 'dataset', 'sample',
                                    'feature', 'count'],
                        delimiter='\t')

    fp.writeheader()

    for item in tally:
        fp.writerow({'project': item[0],
                     'dataset': item[1],
                     'sample': item[2],
                     'feature': item[3],
                     'count': tally[item]})
Example #5
def variant_extract(query, output_filename, max_number_of_records):
    query = query_processor(query)

    output = []

    collection = connect_to_mongo()

    for record in collection.find(ast.literal_eval(query)):

        sample_information = get_entries_from_dict(record,
                                                   keys=[
                                                       'chromosome', 'start',
                                                       'ref', 'alt', 'project',
                                                       'dataset', 'sample',
                                                       'evidence_type'
                                                   ],
                                                   return_type=dict)

        for sample in sample_information:
            sample_information[sample] = str(sample_information[sample])

        output.append(sample_information)

    output = pd.DataFrame(output)

    if max_number_of_records is not None:
        if len(output) > max_number_of_records:
            output = output[:max_number_of_records]

    if output_filename == "<stdout>":
        print "project, dataset, sample, evidence_type, chromosome, start, ref, alt"
        for k, row in output.iterrows():
            row = [
                row['project'], row['dataset'], row['sample'],
                row['evidence_type'], row['chromosome'], row['start'],
                row['ref'], row['alt']
            ]
            row = ",".join(row)
            print row
    else:
        output.to_csv(output_filename,
                      index=False,
                      columns=[
                          "project", "dataset", "sample", "evidence_type",
                          "chromosome", "start", "ref", "alt"
                      ],
                      sep='\t')
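
variant_extract receives query as the string form of a Python dict, and ast.literal_eval converts it into the dict passed to collection.find. Unlike eval, literal_eval only accepts Python literals; note that it parses Python syntax, not JSON:

import ast

query = "{'project': {'$exists': 'true'}}"
assert ast.literal_eval(query) == {'project': {'$exists': 'true'}}

# JSON-style booleans fail, since `true` is not a Python literal:
# ast.literal_eval('{"a": true}') raises ValueError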
Example #6
def variant_extract(query, output_filename, max_number_of_records):
    query = query_processor(query)

    output = []

    collection = connect_to_mongo()

    for record in collection.find(ast.literal_eval(query)):

        sample_information = get_entries_from_dict(record,
                                                   keys=['chromosome',
                                                         'start',
                                                         'ref',
                                                         'alt',
                                                         'project',
                                                         'dataset',
                                                         'sample',
                                                         'evidence_type'],
                                                   return_type=dict)


        for sample in sample_information:
            sample_information[sample] = str(sample_information[sample])

        output.append(sample_information)


    output = pd.DataFrame(output)

    if max_number_of_records is not None:
        if len(output) > max_number_of_records: output = output[:max_number_of_records]


    if output_filename == "<stdout>":
        print "project, dataset, sample, evidence_type, chromosome, start, ref, alt"
        for k, row in output.iterrows():
            row = [row['project'], row['dataset'], row['sample'],
                   row['evidence_type'], row['chromosome'], row['start'],
                   row['ref'], row['alt']]
            row = ",".join(row)
            print row
    else:
        output.to_csv(output_filename, index=False, columns=["project",
                                                           "dataset",
                                                           "sample",
                                                           "evidence_type",
                                                           "chromosome",
                                                           "start",
                                                           "ref",
                                                           "alt"], sep='\t')
Example #7
def VariantUploader(tsv, submit_to_filesystem=False):

    gather = DataGatherer(tsv)

    variants = connect_to_mongo()

    if submit_to_filesystem:
        filesystem = SomaticFileSystem('/dsde/working/somaticDB/master/data')

        S = SubmissionFile(tsv)
        S.change_file_dir()
        S.to_csv(
            os.path.join('/dsde/working/somaticDB/master/records',
                         os.path.basename(tsv)))
    else:
        filesystem = None

    bulk_count = 0
    bulk = variants.initialize_unordered_bulk_op()

    start_time = time.time()
    n = 0

    for variant_dict in gather.data_iterator():
        n += 1

        bulk_count += 1

        additional_data_dict = {}

        mongo_submission = merge_dicts(variant_dict, additional_data_dict)

        # if the user is uploading a mutect1 maf ... rename fields
        if 'contig' in mongo_submission:
            mongo_submission['chromosome'] = mongo_submission.pop('contig')
        if 'position' in mongo_submission:
            mongo_submission['start'] = mongo_submission.pop('position')
        if 'ref_allele' in mongo_submission:
            mongo_submission['ref'] = mongo_submission.pop('ref_allele')
        if 'alt_allele' in mongo_submission:
            mongo_submission['alt'] = mongo_submission.pop('alt_allele')

        unique_data = get_entries_from_dict(mongo_submission,
                                            keys=[
                                                'chromosome', 'start', 'ref',
                                                'alt', 'project', 'dataset',
                                                'evidence_type'
                                            ],
                                            return_type=dict)

        if filesystem:
            project = mongo_submission['project']
            dataset = mongo_submission['dataset']
            filesystem.add_project(project)
            filesystem[project].add_dataset(dataset)

            filesystem[project][dataset].add_file(
                mongo_submission['data_filename'])

            mongo_submission['data_filename']=\
                change_data_filename("/dsde/working/somaticDB/master/data/%s/%s/"%(project,dataset),
                                     mongo_submission['data_filename'])

        bulk.insert(mongo_submission)

        if bulk_count == 10000:
            print "variants uploaded: %d (%.2f seconds since start of upload)." % (
                n, time.time() - start_time)
            bulk_count = 0
            bulk.execute()
            bulk = variants.initialize_unordered_bulk_op()

    if bulk_count > 0:
        print "variants uploaded: %d (%.2f seconds since start of upload)." % (
            n, time.time() - start_time)
        bulk.execute()
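
VariantUploader batches inserts through PyMongo's legacy bulk API (initialize_unordered_bulk_op / bulk.execute), flushing every 10,000 documents; that API was deprecated in PyMongo 3.5 and removed in 4.0. A sketch of the same flush-every-N pattern against the current API, assuming a collection with insert_many:

def batched_insert(collection, documents, batch_size=10000):
    # accumulate documents and flush in unordered batches, like the bulk op
    batch = []
    for doc in documents:
        batch.append(doc)
        if len(batch) == batch_size:
            collection.insert_many(batch, ordered=False)
            batch = []
    if batch:
        collection.insert_many(batch, ordered=False)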
Example #8
def VariantAssessor(query, tsv, output_file, outdir=""):

    collection = connect_to_mongo()

    caller_output = pd.read_csv(tsv,sep='\t')

    known_true     = {'snp': defaultdict(set),'indel': defaultdict(set)}
    known_false    = {'snp': defaultdict(set),'indel': defaultdict(set)}
    found_variants = {'snp': defaultdict(set),'indel': defaultdict(set)}

    false_positive = {'snp': defaultdict(set),'indel': defaultdict(set)}

    query = query_processor(query)

    logging.getLogger(__name__).info("Querying database for variants.")

    # collect query information
    for record in collection.find(ast.literal_eval(query)):

        sample_information = get_entries_from_dict(record, keys=['project','dataset','sample'],return_type=tuple)
        variant = get_entries_from_dict(record, keys=['chromosome','start','ref','alt'],return_type=tuple)

        sample_information = tuple(map(str, sample_information))
        variant = tuple(map(str, variant))

        evidence_type = record['evidence_type']

        if is_snp(record):
            if 'TP' in evidence_type:
                known_true['snp'][sample_information].add(variant)

            if 'FP' in evidence_type:
                known_false['snp'][sample_information].add(variant)

        elif is_indel(record):
            if 'TP' in evidence_type:
                known_true['indel'][sample_information].add(variant)

            if 'FP' in evidence_type:
                known_false['indel'][sample_information].add(variant)

    normal_normal = set([])
    cm = set([])

    #index the type of assessment to be done for each datatype.
    for k,row in caller_output.iterrows():
        sample_information = (row['project'],row['dataset'],row['sample'])
        if row['evidence_type'] == 'NN':
            normal_normal.add(sample_information)
        elif row['evidence_type'] == 'CM':
            cm.add(sample_information)
        else:
            cm.add(sample_information) #by default, ROC-like curves are used.

    gather = DataGatherer(tsv)


    logging.getLogger(__name__).info("Collection of variants from user submitted files.")

    found_feature_data = defaultdict(dict)

    #data from file (algorithm being tested)
    for variant_dict in gather.data_iterator():
        sample_information = get_entries_from_dict(variant_dict, keys=['project','dataset','sample'],return_type=tuple)
        variant = get_entries_from_dict(variant_dict, keys=['chromosome','start','ref','alt'],return_type=tuple)

        print sample_information, variant

        #found_feature_data[sample_information][variant] = get_entries_from_dict(variant_dict, keys=['ECNT','HCNT','NLOD','TLOD'],return_type=dict)

        if is_snp(variant_dict):
            found_variants['snp'][sample_information].add(variant)

        elif is_indel(variant_dict):
            found_variants['indel'][sample_information].add(variant)

    #print found_feature_data.keys()

    caller_samples = caller_output[['project','dataset','sample']].values.tolist()

    data = []

    for feature in ['snp','indel']:

        filename = {}
        fp_fn = {}
        fp_fp = {}
        all_dict = {}
        fp_tp = {}

        filename[feature] = feature+".false_negatives.tsv"
        fp_fn[feature] = csv.DictWriter(open( os.path.join(outdir,filename[feature])  ,'w'),
                                        delimiter='\t',
                                        fieldnames=['project','dataset','sample','chromosome','start','ref','alt','variant_type'])


        #fieldnames=['project','dataset','sample','chromosome','start','ref','alt','ECNT','HCNT','NLOD','TLOD','variant_type']
        fieldnames=['project','dataset','sample','chromosome','start','ref','alt','variant_type']
        filename[feature] = feature+".false_positives.tsv"
        fp_fp[feature] = csv.DictWriter(open( os.path.join(outdir,filename[feature]) ,'w'),
                                        delimiter='\t',
                                        fieldnames=fieldnames)


        #fieldnames=['project','dataset','sample','chromosome','start','ref','alt','ECNT','HCNT','NLOD','TLOD','variant_type']
        fieldnames=['project','dataset','sample','chromosome','start','ref','alt','variant_type']
        filename[feature] = feature+".true_positives.tsv"
        fp_tp[feature] = csv.DictWriter(open( os.path.join(outdir,filename[feature]) ,'w'),
                                        delimiter='\t',
                                        fieldnames=fieldnames)


        fp_fn[feature].writeheader()
        fp_fp[feature].writeheader()
        fp_tp[feature].writeheader()


        for eval_type in ['CM','NN']:

            all_dict[eval_type] = {'project': 'all',
                                   'dataset': 'all',
                                   'sample' : 'all',
                                   'false_positives': 0,
                                   'true_positives':  0,
                                   'false_negatives': 0,
                                   'tpr': np.nan,
                                   'fpr': np.nan,
                                   'precision': np.nan,
                                   'evidence_type': eval_type,
                                   'variant_type': feature }


        for sample_information in map(tuple,caller_samples):

            if sample_information in normal_normal:
                assessment_type = 'NN'
            elif sample_information in cm:
                assessment_type = 'CM'
            else:
                assessment_type = 'CM'



            row_dict = {'project': sample_information[0],
                        'dataset': sample_information[1],
                        'sample' : sample_information[2],
                        'false_positives': 0,
                        'true_positives': 0,
                        'false_negatives': 0,
                        'tpr': np.nan,
                        'fpr': np.nan,
                        'precision': np.nan,
                        'evidence_type': assessment_type,
                        'variant_type': feature}

            if assessment_type == 'NN':

                # in a normal-normal comparison every call is a false positive
                FP = len(found_variants[feature][sample_information])
                row_dict['false_positives'] = FP
                row_dict['precision'] = 0

            if assessment_type == 'CM':

                TP = np.float(len(found_variants[feature][sample_information].intersection(known_true[feature][sample_information])))
                FN = np.float(len(known_true[feature][sample_information].difference(found_variants[feature][sample_information])))
                FP = np.float(len(found_variants[feature][sample_information].difference(known_true[feature][sample_information])))

                print TP, FN, FP

                try:
                    row_dict['tpr'] = TP / (TP + FN)
                except ZeroDivisionError:
                    row_dict['tpr'] = np.nan


                row_dict['true_positives']  = TP
                row_dict['false_negatives'] = FN
                row_dict['false_positives'] =  FP

                all_dict['CM']['true_positives']  += TP
                all_dict['CM']['false_negatives'] += FN
                all_dict['CM']['false_positives'] +=  FP

                try:
                    row_dict['precision'] = TP / (TP + FP)
                except ZeroDivisionError:
                    row_dict['precision'] = np.nan

                row_dict['dream_accuracy'] = (row_dict['tpr'] + row_dict['precision'])/2.0

                print row_dict['tpr'], row_dict['precision'], row_dict['dream_accuracy']

                row_dict['variant_type'] = feature

            data.append(row_dict)



            true_positives  = list(found_variants[feature][sample_information].intersection(known_true[feature][sample_information]))
            false_positives = list(found_variants[feature][sample_information].difference(known_true[feature][sample_information]))
            false_negatives = list(known_true[feature][sample_information].difference(found_variants[feature][sample_information]))

            for variant in true_positives:

                fp_tp[feature].writerow({'project': sample_information[0],
                                         'dataset':sample_information[1],
                                         'sample':sample_information[2],
                                         'chromosome':variant[0],
                                         'start':variant[1],
                                         'ref':variant[2],
                                         'alt':variant[3],
                                         #'ECNT':found_feature_data[sample_information][variant]['ECNT'],
                                         #'HCNT':found_feature_data[sample_information][variant]['HCNT'],
                                         #'NLOD':found_feature_data[sample_information][variant]['NLOD'],
                                         #'TLOD':found_feature_data[sample_information][variant]['TLOD'],
                                         'variant_type':feature})

            for variant in false_positives:

                fp_fp[feature].writerow({'project': sample_information[0],
                                         'dataset':sample_information[1],
                                         'sample':sample_information[2],
                                         'chromosome':variant[0],
                                         'start':variant[1],
                                         'ref':variant[2],
                                         'alt':variant[3],
                                         #'ECNT':found_feature_data[sample_information][variant]['ECNT'],
                                         #'HCNT':found_feature_data[sample_information][variant]['HCNT'],
                                         #'NLOD':found_feature_data[sample_information][variant]['NLOD'],
                                         #'TLOD':found_feature_data[sample_information][variant]['TLOD'],
                                         'variant_type':feature})

            for variant in false_negatives:

                fp_fn[feature].writerow({'project': sample_information[0],
                                          'dataset':sample_information[1],
                                          'sample':sample_information[2],
                                          'chromosome':variant[0],
                                          'start':variant[1],
                                          'ref':variant[2],
                                          'alt':variant[3],
                                          'variant_type':feature})



        try:
            all_dict['CM']['tpr'] = all_dict['CM']['true_positives'] / (
                all_dict['CM']['true_positives'] + all_dict['CM']['false_negatives'])
        except ZeroDivisionError:
            all_dict['CM']['tpr'] = np.nan

        try:
            all_dict['CM']['precision'] = all_dict['CM']['true_positives'] / (
                all_dict['CM']['true_positives'] + all_dict['CM']['false_positives'])
        except ZeroDivisionError:
            all_dict['CM']['precision'] = np.nan

        # compute the DREAM-style accuracy only after both tpr and precision are set
        all_dict['CM']['dream_accuracy'] = (all_dict['CM']['tpr'] +
                                            all_dict['CM']['precision']) / 2.0

        all_dict['CM']['variant_type'] = feature

        #data.append(all_dict['CM'])
        #data.append(all_dict['NN'])

    fieldnames=['project','dataset','sample' ,'false_positives','true_positives','false_negatives','tpr','precision','evidence_type','dream_accuracy','variant_type']

    pd.DataFrame(data).to_csv(output_file, sep='\t',index=False,columns=fieldnames,na_rep='nan')
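
The assessment reduces to set arithmetic on (chromosome, start, ref, alt) tuples: the intersection with the known truth gives true positives, the two set differences give false positives and false negatives, and tpr/precision/dream_accuracy follow. A worked toy example of the same computation:

found = {('1', '100', 'A', 'T'), ('1', '200', 'C', 'G'), ('2', '50', 'G', 'A')}
truth = {('1', '100', 'A', 'T'), ('1', '300', 'T', 'C')}

TP = float(len(found & truth))  # 1.0
FP = float(len(found - truth))  # 2.0
FN = float(len(truth - found))  # 1.0

tpr = TP / (TP + FN)                      # 0.5
precision = TP / (TP + FP)                # 0.333...
dream_accuracy = (tpr + precision) / 2.0  # 0.416...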
Example #9
def BamAggregator(query, normal_bam_list_name, tumor_bam_list_name,
                  interval_list_name, metadata_list_name, folder):

    collection = connect_to_mongo()

    query = query_processor(query)

    interval_list = defaultdict(set)

    metadata_list = {}

    query = query.strip('"')
    query = ast.literal_eval(query)

    print query
    print type(query)
    print "query dictionary:", query.items()

    print "directory:" + os.getcwd()

    doesrecordloop = False

    for record in collection.find(query):

        if not doesrecordloop:
            print "Contains at least one record."
            doesrecordloop = True

        if 'tumor_bam' not in record:
            print record
            continue

        #print record['tumor_bam']

        record['tumor_bam'] = picard_version_to_current(record['tumor_bam'])
        record['normal_bam'] = picard_version_to_current(record['normal_bam'])

        #print record['tumor_bam']
        #print

        tumor_bam = record['tumor_bam']
        normal_bam = record['normal_bam']

        interval = "%s:%s-%s" % (record['chromosome'], record['start'],
                                 record['end'])

        interval_list[(tumor_bam, normal_bam)].add(interval)

        field_names = [
            'tumor_bam', 'normal_bam', 'data_filename', 'project', 'dataset',
            'sample'
        ]
        metadata_list[(tumor_bam,
                       normal_bam)] = get_entries_from_dict(record,
                                                            keys=field_names,
                                                            return_type=dict)
        metadata_list[(tumor_bam, normal_bam)]['evidence_type'] = '.'
        metadata_list[(tumor_bam, normal_bam)]['author'] = '.'

    print 'OPENING FILES HERE.'
    print 'tumor_bam_file: ' + tumor_bam_list_name
    print 'normal_bam_file: ' + normal_bam_list_name
    print 'interval file: ' + interval_list_name

    tumor_bam_file = open(tumor_bam_list_name, 'w')
    normal_bam_file = open(normal_bam_list_name, 'w')
    interval_file = open(interval_list_name, 'w')

    location = os.path.dirname(tumor_bam_list_name)

    fname = os.path.join(location, 'test.txt')

    print "test:" + fname

    f = open(fname, 'w')
    f.close()

    for filename in os.listdir(location):
        if filename.endswith("list"): print filename

    fieldnames = [
        'tumor_bam', 'normal_bam', 'data_filename', 'project', 'dataset',
        'sample', 'evidence_type', 'author'
    ]
    metadata_file = csv.DictWriter(open(metadata_list_name, 'w'),
                                   fieldnames=fieldnames,
                                   delimiter='\t')
    metadata_file.writeheader()

    current_dir = os.getcwd()

    for pair in interval_list:
        tumor_bam, normal_bam = pair
        tumor_bam_file.write(tumor_bam + '\n')
        normal_bam_file.write(normal_bam + '\n')

        metadata_file.writerow(metadata_list[(tumor_bam, normal_bam)])

        # random 40-character hex tag (generated but not used below)
        sample =\
            "".join([random.choice('abcdef0123456789') for k in range(40)])

        intervals_dir = folder

        current_filename = ".".join([
            "intervals",
            os.path.splitext(os.path.basename(tumor_bam))[0],
            os.path.splitext(os.path.basename(normal_bam))[0], "list"
        ])
        current_filename = os.path.join(intervals_dir, current_filename)

        if not os.path.exists(intervals_dir): os.mkdir(intervals_dir)

        print "made this folder:", intervals_dir, os.path.exists(intervals_dir)

        current_interval_file = open(current_filename, 'w')

        sorted_intervals = sorted(
            list(interval_list[pair]),
            key=lambda x: int(x.split(':')[1].split('-')[0]))

        # key on the chromosome only: the stable sort preserves the numeric
        # start order established by the pass above
        sorted_intervals = sorted(sorted_intervals, key=lambda x: x.split(':')[0])

        for interval in sorted_intervals:
            current_interval_file.write(interval + "\n")

        current_interval_file.close()

        interval_file.write(current_filename + '\n')

    for thing in os.listdir(intervals_dir):
        print "file in dir:", thing

    tumor_bam_file.close()
    normal_bam_file.close()
    interval_file.close()
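
BamAggregator orders each interval list with two chained sorts: the first pass sorts by numeric start position, and the second (keyed on the chromosome alone, as fixed above) relies on Python's stable sort to keep that positional order within each chromosome. Note the chromosome key is a string, so '10' sorts before '2'. A small demonstration:

intervals = ['1:300-310', '2:5-10', '1:20-25', '10:100-105']

by_start = sorted(intervals, key=lambda x: int(x.split(':')[1].split('-')[0]))
by_chrom = sorted(by_start, key=lambda x: x.split(':')[0])  # stable within chromosome

assert by_chrom == ['1:20-25', '1:300-310', '10:100-105', '2:5-10']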
Example #10
def VariantUploader(tsv, submit_to_filesystem=False):

    gather = DataGatherer(tsv)

    variants = connect_to_mongo()

    if submit_to_filesystem:
        filesystem = SomaticFileSystem("/dsde/working/somaticDB/master/data")

        S = SubmissionFile(tsv)
        S.change_file_dir()
        S.to_csv(os.path.join("/dsde/working/somaticDB/master/records", os.path.basename(tsv)))
    else:
        filesystem = None

    bulk_count = 0
    bulk = variants.initialize_unordered_bulk_op()

    start_time = time.time()
    n = 0

    for variant_dict in gather.data_iterator():
        n += 1

        bulk_count += 1

        additional_data_dict = {}

        mongo_submission = merge_dicts(variant_dict, additional_data_dict)

        # if the user is uploading a mutect1 maf ... rename fields
        if mongo_submission.has_key("contig"):
            mongo_submission["chromosome"] = mongo_submission.pop("contig")
        if mongo_submission.has_key("position"):
            mongo_submission["start"] = mongo_submission.pop("position")
        if mongo_submission.has_key("ref_allele"):
            mongo_submission["ref"] = mongo_submission.pop("ref_allele")
        if mongo_submission.has_key("alt_allele"):
            mongo_submission["alt"] = mongo_submission.pop("alt_allele")

        unique_data = get_entries_from_dict(
            mongo_submission,
            keys=["chromosome", "start", "ref", "alt", "project", "dataset", "evidence_type"],
            return_type=dict,
        )

        if filesystem:
            project = mongo_submission["project"]
            dataset = mongo_submission["dataset"]
            filesystem.add_project(project)
            filesystem[project].add_dataset(dataset)

            filesystem[project][dataset].add_file(mongo_submission["data_filename"])

            mongo_submission["data_filename"] = change_data_filename(
                "/dsde/working/somaticDB/master/data/%s/%s/" % (project, dataset), mongo_submission["data_filename"]
            )

        bulk.insert(mongo_submission)

        if bulk_count == 10000:
            print "variants uploaded: %d (%.2f seconds since start of upload)." % (n, time.time() - start_time)
            bulk_count = 0
            bulk.execute()
            bulk = variants.initialize_unordered_bulk_op()

    if bulk_count > 0:
        print "variants uploaded: %d (%.2f seconds since start of upload)." % (n, time.time() - start_time)
        bulk.execute()
Example #11
def VariantAssessor(query, tsv, output_file, outdir=""):

    collection = connect_to_mongo()

    caller_output = pd.read_csv(tsv, sep='\t')

    known_true = {'snp': defaultdict(set), 'indel': defaultdict(set)}
    known_false = {'snp': defaultdict(set), 'indel': defaultdict(set)}
    found_variants = {'snp': defaultdict(set), 'indel': defaultdict(set)}

    false_positive = {'snp': defaultdict(set), 'indel': defaultdict(set)}

    query = query_processor(query)

    logging.getLogger(__name__).info("Querying database for variants.")

    # collect query information
    for record in collection.find(ast.literal_eval(query)):

        sample_information = get_entries_from_dict(
            record, keys=['project', 'dataset', 'sample'], return_type=tuple)
        variant = get_entries_from_dict(
            record,
            keys=['chromosome', 'start', 'ref', 'alt'],
            return_type=tuple)

        sample_information = tuple(map(str, sample_information))
        variant = tuple(map(str, variant))

        evidence_type = record['evidence_type']

        if is_snp(record):
            if 'TP' in evidence_type:
                known_true['snp'][sample_information].add(variant)

            if 'FP' in evidence_type:
                known_false['snp'][sample_information].add(variant)

        elif is_indel(record):
            if 'TP' in evidence_type:
                known_true['indel'][sample_information].add(variant)

            if 'FP' in evidence_type:
                known_false['indel'][sample_information].add(variant)

    normal_normal = set([])
    cm = set([])

    #index the type of assessment to be done for each datatype.
    for k, row in caller_output.iterrows():
        sample_information = (row['project'], row['dataset'], row['sample'])
        if row['evidence_type'] == 'NN':
            normal_normal.add(sample_information)
        elif row['evidence_type'] == 'CM':
            cm.add(sample_information)
        else:
            cm.add(sample_information)  #by default, ROC-like curves are used.

    gather = DataGatherer(tsv)

    logging.getLogger(__name__).info(
        "Collecting variants from user-submitted files.")

    found_feature_data = defaultdict(dict)

    #data from file (algorithm being tested)
    for variant_dict in gather.data_iterator():
        sample_information = get_entries_from_dict(
            variant_dict,
            keys=['project', 'dataset', 'sample'],
            return_type=tuple)
        variant = get_entries_from_dict(
            variant_dict,
            keys=['chromosome', 'start', 'ref', 'alt'],
            return_type=tuple)

        print sample_information, variant

        #found_feature_data[sample_information][variant] = get_entries_from_dict(variant_dict, keys=['ECNT','HCNT','NLOD','TLOD'],return_type=dict)

        if is_snp(variant_dict):
            found_variants['snp'][sample_information].add(variant)

        elif is_indel(variant_dict):
            found_variants['indel'][sample_information].add(variant)

    #print found_feature_data.keys()

    caller_samples = caller_output[['project', 'dataset',
                                    'sample']].values.tolist()

    data = []

    for feature in ['snp', 'indel']:

        filename = {}
        fp_fn = {}
        fp_fp = {}
        all_dict = {}
        fp_tp = {}

        filename[feature] = feature + ".false_negatives.tsv"
        fp_fn[feature] = csv.DictWriter(
            open(os.path.join(outdir, filename[feature]), 'w'),
            delimiter='\t',
            fieldnames=[
                'project', 'dataset', 'sample', 'chromosome', 'start', 'ref',
                'alt', 'variant_type'
            ])

        #fieldnames=['project','dataset','sample','chromosome','start','ref','alt','ECNT','HCNT','NLOD','TLOD','variant_type']
        fieldnames = [
            'project', 'dataset', 'sample', 'chromosome', 'start', 'ref',
            'alt', 'variant_type'
        ]
        filename[feature] = feature + ".false_positives.tsv"
        fp_fp[feature] = csv.DictWriter(open(
            os.path.join(outdir, filename[feature]), 'w'),
                                        delimiter='\t',
                                        fieldnames=fieldnames)

        #fieldnames=['project','dataset','sample','chromosome','start','ref','alt','ECNT','HCNT','NLOD','TLOD','variant_type']
        fieldnames = [
            'project', 'dataset', 'sample', 'chromosome', 'start', 'ref',
            'alt', 'variant_type'
        ]
        filename[feature] = feature + ".true_positives.tsv"
        fp_tp[feature] = csv.DictWriter(open(
            os.path.join(outdir, filename[feature]), 'w'),
                                        delimiter='\t',
                                        fieldnames=fieldnames)

        fp_fn[feature].writeheader()
        fp_fp[feature].writeheader()
        fp_tp[feature].writeheader()

        for eval_type in ['CM', 'NN']:

            all_dict[eval_type] = {
                'project': 'all',
                'dataset': 'all',
                'sample': 'all',
                'false_positives': 0,
                'true_positives': 0,
                'false_negatives': 0,
                'tpr': np.nan,
                'fpr': np.nan,
                'precision': np.nan,
                'evidence_type': eval_type,
                'variant_type': feature
            }

        for sample_information in map(tuple, caller_samples):

            if sample_information in normal_normal:
                assessment_type = 'NN'
            elif sample_information in cm:
                assessment_type = 'CM'
            else:
                assessment_type = 'CM'

            row_dict = {
                'project': sample_information[0],
                'dataset': sample_information[1],
                'sample': sample_information[2],
                'false_positives': 0,
                'true_positives': 0,
                'false_negatives': 0,
                'tpr': np.nan,
                'fpr': np.nan,
                'precision': np.nan,
                'evidence_type': assessment_type,
                'variant_type': feature
            }

            if assessment_type == 'NN':

                # in a normal-normal comparison every call is a false positive
                FP = len(found_variants[feature][sample_information])
                row_dict['false_positives'] = FP
                row_dict['precision'] = 0

            if assessment_type == 'CM':

                TP = np.float(
                    len(found_variants[feature]
                        [sample_information].intersection(
                            known_true[feature][sample_information])))
                FN = np.float(
                    len(known_true[feature][sample_information].difference(
                        found_variants[feature][sample_information])))
                FP = np.float(
                    len(found_variants[feature][sample_information].difference(
                        known_true[feature][sample_information])))

                print TP, FN, FP

                try:
                    row_dict['tpr'] = TP / (TP + FN)
                except ZeroDivisionError:
                    row_dict['tpr'] = np.nan

                row_dict['true_positives'] = TP
                row_dict['false_negatives'] = FN
                row_dict['false_positives'] = FP

                all_dict['CM']['true_positives'] += TP
                all_dict['CM']['false_negatives'] += FN
                all_dict['CM']['false_positives'] += FP

                try:
                    row_dict['precision'] = TP / (TP + FP)
                except ZeroDivisionError:
                    row_dict['precision'] = np.nan

                row_dict['dream_accuracy'] = (row_dict['tpr'] +
                                              row_dict['precision']) / 2.0

                print row_dict['tpr'], row_dict['precision'], row_dict[
                    'dream_accuracy']

                row_dict['variant_type'] = feature

            data.append(row_dict)

            true_positives = list(
                found_variants[feature][sample_information].intersection(
                    known_true[feature][sample_information]))
            false_positives = list(
                found_variants[feature][sample_information].difference(
                    known_true[feature][sample_information]))
            false_negatives = list(
                known_true[feature][sample_information].difference(
                    found_variants[feature][sample_information]))

            for variant in true_positives:

                fp_tp[feature].writerow({
                    'project': sample_information[0],
                    'dataset': sample_information[1],
                    'sample': sample_information[2],
                    'chromosome': variant[0],
                    'start': variant[1],
                    'ref': variant[2],
                    'alt': variant[3],
                    #'ECNT':found_feature_data[sample_information][variant]['ECNT'],
                    #'HCNT':found_feature_data[sample_information][variant]['HCNT'],
                    #'NLOD':found_feature_data[sample_information][variant]['NLOD'],
                    #'TLOD':found_feature_data[sample_information][variant]['TLOD'],
                    'variant_type': feature
                })

            for variant in false_positives:

                fp_fp[feature].writerow({
                    'project': sample_information[0],
                    'dataset': sample_information[1],
                    'sample': sample_information[2],
                    'chromosome': variant[0],
                    'start': variant[1],
                    'ref': variant[2],
                    'alt': variant[3],
                    #'ECNT':found_feature_data[sample_information][variant]['ECNT'],
                    #'HCNT':found_feature_data[sample_information][variant]['HCNT'],
                    #'NLOD':found_feature_data[sample_information][variant]['NLOD'],
                    #'TLOD':found_feature_data[sample_information][variant]['TLOD'],
                    'variant_type': feature
                })

            for variant in false_negatives:

                fp_fn[feature].writerow({
                    'project': sample_information[0],
                    'dataset': sample_information[1],
                    'sample': sample_information[2],
                    'chromosome': variant[0],
                    'start': variant[1],
                    'ref': variant[2],
                    'alt': variant[3],
                    'variant_type': feature
                })

        try:
            all_dict['CM']['tpr'] = all_dict['CM']['true_positives'] / (
                all_dict['CM']['true_positives'] +
                all_dict['CM']['false_negatives'])
        except ZeroDivisionError:
            all_dict['CM']['tpr'] = np.nan

        try:
            all_dict['CM']['precision'] = all_dict['CM']['true_positives'] / (
                all_dict['CM']['true_positives'] +
                all_dict['CM']['false_positives'])
        except ZeroDivisionError:
            all_dict['CM']['precision'] = np.nan

        # compute the DREAM-style accuracy only after both tpr and precision are set
        all_dict['CM']['dream_accuracy'] = (all_dict['CM']['tpr'] +
                                            all_dict['CM']['precision']) / 2.0

        all_dict['CM']['variant_type'] = feature

        #data.append(all_dict['CM'])
        #data.append(all_dict['NN'])

    fieldnames = [
        'project', 'dataset', 'sample', 'false_positives', 'true_positives',
        'false_negatives', 'tpr', 'precision', 'evidence_type',
        'dream_accuracy', 'variant_type'
    ]

    pd.DataFrame(data).to_csv(output_file,
                              sep='\t',
                              index=False,
                              columns=fieldnames,
                              na_rep='nan')
Example #12
def BamAggregator(query, normal_bam_list_name, tumor_bam_list_name, interval_list_name, metadata_list_name, folder):

    collection = connect_to_mongo()

    query = query_processor(query)

    interval_list = defaultdict(set)

    metadata_list = {}

    query = query.strip('"')
    query = ast.literal_eval(query)

    print query
    print type(query)
    print "query dictionary:", query.items()

    print "directory:" + os.getcwd()

    doesrecordloop = False

    for record in collection.find(query):

        if not doesrecordloop:
            print "Contains at least one record."
            doesrecordloop = True

        if not record.has_key("tumor_bam"):
            print record
            continue

        # print record['tumor_bam']

        record["tumor_bam"] = picard_version_to_current(record["tumor_bam"])
        record["normal_bam"] = picard_version_to_current(record["normal_bam"])

        # print record['tumor_bam']
        # print

        tumor_bam = record["tumor_bam"]
        normal_bam = record["normal_bam"]

        interval = "%s:%s-%s" % (record["chromosome"], record["start"], record["end"])

        interval_list[(tumor_bam, normal_bam)].add(interval)

        field_names = ["tumor_bam", "normal_bam", "data_filename", "project", "dataset", "sample"]
        metadata_list[(tumor_bam, normal_bam)] = get_entries_from_dict(record, keys=field_names, return_type=dict)
        metadata_list[(tumor_bam, normal_bam)]["evidence_type"] = "."
        metadata_list[(tumor_bam, normal_bam)]["author"] = "."

    print "OPENNING FILES HERE."
    print "tumor_bam_file: " + tumor_bam_list_name
    print "normal_bam_file: " + normal_bam_list_name
    print "interval file:" + interval_list_name

    tumor_bam_file = open(tumor_bam_list_name, "w")
    normal_bam_file = open(normal_bam_list_name, "w")
    interval_file = open(interval_list_name, "w")

    location = os.path.dirname(tumor_bam_list_name)

    fname = os.path.join(location, "test.txt")

    print "test:" + fname

    f = open(fname, "w")
    f.close()

    for filename in os.listdir(location):
        if filename.endswith("list"):
            print filename

    fieldnames = ["tumor_bam", "normal_bam", "data_filename", "project", "dataset", "sample", "evidence_type", "author"]
    metadata_file = csv.DictWriter(open(metadata_list_name, "w"), fieldnames=fieldnames, delimiter="\t")
    metadata_file.writeheader()

    current_dir = os.getcwd()

    for pair in interval_list:
        tumor_bam, normal_bam = pair
        tumor_bam_file.write(tumor_bam + "\n")
        normal_bam_file.write(normal_bam + "\n")

        metadata_file.writerow(metadata_list[(tumor_bam, normal_bam)])

        sample = "".join([random.choice("abcdef0123456789") for k in range(40)])

        intervals_dir = folder

        current_filename = ".".join(
            [
                "intervals",
                os.path.splitext(os.path.basename(tumor_bam))[0],
                os.path.splitext(os.path.basename(normal_bam))[0],
                "list",
            ]
        )
        current_filename = os.path.join(intervals_dir, current_filename)

        if not os.path.exists(intervals_dir):
            os.mkdir(intervals_dir)

        print "made this folder:", intervals_dir, os.path.exists(intervals_dir)

        current_interval_file = open(current_filename, "w")

        sorted_intervals = sorted(list(interval_list[pair]), key=lambda x: int(x.split(":")[1].split("-")[0]))

        # key on the chromosome only: the stable sort preserves the numeric
        # start order established by the pass above
        sorted_intervals = sorted(sorted_intervals, key=lambda x: x.split(":")[0])

        for interval in sorted_intervals:
            current_interval_file.write(interval + "\n")

        current_interval_file.close()

        interval_file.write(current_filename + "\n")

    for thing in os.listdir(intervals_dir):
        print "file in dir:", thing

    tumor_bam_file.close()
    normal_bam_file.close()
    interval_file.close()