Example #1
    def data_iterator(self,
                      keys=('tumor_bam', 'normal_bam', 'data_filename',
                            'project', 'dataset', 'sample', 'evidence_type')):
        table_file = open(self.filename)  # avoid shadowing the builtin `file`
        reader = csv.DictReader(table_file, delimiter='\t')

        logging.getLogger(__name__).info(
            "Gathering variants from table of files: %s", self.filename)

        for file_data in reader:

            # Skip rows that carry a non-empty FILTER value.
            if file_data.get('FILTER'):
                continue

            meta_data_dict = get_entries_from_dict(file_data,
                                                   keys=keys,
                                                   return_type=dict)

            logging.getLogger(__name__).info(
                "Gathering variants from individual file: %s",
                meta_data_dict['data_filename'])

            D = DatabaseParser(meta_data_dict['data_filename'])
            self.current_file = meta_data_dict['data_filename']

            self.new_file = True

            # Attach the per-file metadata to every variant the file yields.
            for variant_dict in D.get_variants():
                yield merge_dicts(variant_dict, meta_data_dict)
                self.new_file = False

        self.current_file = None
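The iterator relies on two helpers that the snippet does not define. Below is a minimal sketch of plausible implementations, assuming `get_entries_from_dict` subsets a dict to the given keys and `merge_dicts` overlays dicts left to right; these reconstructions are assumptions, not the project's actual code.

# Hypothetical reconstructions of the helpers used above; the real
# somaticDB implementations may differ.


def get_entries_from_dict(data, keys=(), return_type=dict):
    """Subset `data` to `keys`, packaged as `return_type`."""
    return return_type((key, data[key]) for key in keys if key in data)


def merge_dicts(*dicts):
    """Combine dicts left to right; later dicts win on key collisions."""
    merged = {}
    for d in dicts:
        merged.update(d)
    return merged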
Example #2
def VariantUploader(tsv, submit_to_filesystem=False):

    gather = DataGatherer(tsv)

    variants = connect_to_mongo()

    if submit_to_filesystem:
        filesystem = SomaticFileSystem('/dsde/working/somaticDB/master/data')

        S = SubmissionFile(tsv)
        S.change_file_dir()
        S.to_csv(
            os.path.join('/dsde/working/somaticDB/master/records',
                         os.path.basename(tsv)))
    else:
        filesystem = None

    bulk_count = 0
    bulk = variants.initialize_unordered_bulk_op()

    start_time = time.time()
    n = 0

    for variant_dict in gather.data_iterator():
        n += 1
        bulk_count += 1

        additional_data_dict = {}

        mongo_submission = merge_dicts(variant_dict, additional_data_dict)

        # If the user is uploading a MuTect1 MAF, rename fields to the
        # canonical column names used in the database.
        if 'contig' in mongo_submission:
            mongo_submission['chromosome'] = mongo_submission.pop('contig')
        if 'position' in mongo_submission:
            mongo_submission['start'] = mongo_submission.pop('position')
        if 'ref_allele' in mongo_submission:
            mongo_submission['ref'] = mongo_submission.pop('ref_allele')
        if 'alt_allele' in mongo_submission:
            mongo_submission['alt'] = mongo_submission.pop('alt_allele')

        # Key fields that uniquely identify this variant record.
        unique_data = get_entries_from_dict(mongo_submission,
                                            keys=['chromosome', 'start', 'ref',
                                                  'alt', 'project', 'dataset',
                                                  'evidence_type'],
                                            return_type=dict)

        if filesystem:
            project = mongo_submission['project']
            dataset = mongo_submission['dataset']
            filesystem.add_project(project)
            filesystem[project].add_dataset(dataset)

            filesystem[project][dataset].add_file(
                mongo_submission['data_filename'])

            mongo_submission['data_filename'] = change_data_filename(
                "/dsde/working/somaticDB/master/data/%s/%s/" % (project, dataset),
                mongo_submission['data_filename'])

        bulk.insert(mongo_submission)

        # Flush the bulk operation every 10,000 inserts.
        if bulk_count == 10000:
            print("variants uploaded: %d (%.2f seconds since start of upload)."
                  % (n, time.time() - start_time))
            bulk_count = 0
            bulk.execute()
            bulk = variants.initialize_unordered_bulk_op()

    # Flush any remaining queued inserts.
    if bulk_count > 0:
        print("variants uploaded: %d (%.2f seconds since start of upload)."
              % (n, time.time() - start_time))
        bulk.execute()
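The `initialize_unordered_bulk_op()` / `execute()` bulk API used above comes from PyMongo 2.x/3.x and was removed in PyMongo 4. On a current driver, the same batched, unordered upload could be sketched as follows, assuming `variants` is a pymongo collection obtained the same way as from `connect_to_mongo()` and `gather` is the `DataGatherer` from above:

# Sketch under the assumptions above; batching mirrors the
# 10,000-record flushes in VariantUploader.
batch = []
for variant_dict in gather.data_iterator():
    batch.append(variant_dict)
    if len(batch) == 10000:
        variants.insert_many(batch, ordered=False)  # unordered bulk insert
        batch = []

if batch:  # flush the final partial batch
    variants.insert_many(batch, ordered=False)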
Example #3
    def get_variants(self, dataset_type=None):

        logging.getLogger(__name__).info("Opening file: %s", self.filename)

        # Infer the dataset type from the file extension when not given.
        if dataset_type is None:
            if self.filename.endswith((".table", ".table.gz")):
                dataset_type = "VCF_TABLE"

            if self.filename.endswith((".vcf", ".vcf.gz")):
                dataset_type = "VCF"

            if self.filename.endswith((".maf", ".maf.gz")):
                dataset_type = "MAF"

        logging.getLogger(__name__).info("Using vcf module: %s", vcf.__file__)

        logging.getLogger(__name__).info("File is of type: %s", dataset_type)

        if dataset_type not in ['VCF', 'MAF', 'VCF_TABLE']:
            raise Exception("Bad file format: %s" % self.filename)

        if dataset_type == 'VCF':
            self.file = vcf.Reader(self.file)
        else:
            # Skip header comments, then parse the remainder as a TSV.
            self.file = filter(lambda line: not line.startswith('#'),
                               self.file)
            self.file = csv.DictReader(self.file, delimiter='\t')

        for record in self.file:
            if dataset_type == 'VCF':
                # Filters for callers: keep only unfiltered / PASS records.
                if record.FILTER and record.FILTER not in ('.', 'PASS'):
                    continue

                # Filters for the DREAM challenge: skip structural variants.
                if 'SVTYPE' in record.INFO:
                    continue

                for k, alt in enumerate(record.ALT):  # split multiallelic sites
                    chrom = record.CHROM
                    start = record.POS
                    ref = record.REF
                    alt = str(alt)

                    start, end, ref, alt = adjustIndelFormat(start, ref, alt)

                    core_data = {"chromosome": chrom, "start": start,
                                 "end": end, "ref": ref, "alt": alt,
                                 "FILTER": record.FILTER}

                    # Per-allele INFO fields take the k-th entry; everything
                    # else is copied through unchanged.
                    for key in record.INFO:
                        if key in ['MLEAC', 'MLEAF', 'AC', 'AF']:
                            core_data[key] = record.INFO[key][k]
                        else:
                            core_data[key] = record.INFO[key]

                    core_data = merge_dicts(core_data, record.INFO)

                    core_data = stringify_dict(core_data)

                    yield core_data

            if dataset_type == 'VCF_TABLE':
                # Filters for callers: keep only unfiltered / PASS rows.
                if 'FILTER' in record and record['FILTER'] not in ('.', 'PASS'):
                    continue

                # Filters for the DREAM challenge.
                if record.get('SVTYPE') in ('IGN', 'MSK'):
                    continue

                ALT = record['ALT'].split(",")

                for k, alt in enumerate(ALT):
                    chrom = record['CHROM']
                    start = record['POS']
                    ref = record['REF']

                    start, end, ref, alt = adjustIndelFormat(start, ref, alt)

                    core_data = {"chromosome": chrom, "start": start,
                                 "end": end, "ref": ref, "alt": alt}

                    # Per-allele columns arrive as comma-separated strings.
                    mleac = record['MLEAC'].split(',')
                    mleaf = record['MLEAF'].split(',')
                    ac = record['AC'].split(',')
                    af = record['AF'].split(',')

                    for key in record:
                        # Already captured in core_data above.
                        if key in ['CHROM', 'POS', 'REF', 'ALT']:
                            continue

                        if key == 'MLEAC':
                            core_data['MLEAC'] = mleac[k]
                        elif key == 'MLEAF':
                            core_data['MLEAF'] = mleaf[k]
                        elif key == 'AC':
                            core_data['AC'] = ac[k]
                        elif key == 'AF':
                            core_data['AF'] = af[k]
                        else:
                            core_data[key] = record[key]

                            # Take the k-th entry of any other per-allele list.
                            if "," in core_data[key]:
                                core_data[key] = core_data[key].split(',')[k]

                    core_data = stringify_dict(core_data)

                    yield core_data

            if dataset_type == "MAF":

                # Normalize MuTect1-style column names to MAF conventions.
                if 'contig' in record:
                    record['Chromosome'] = record.pop('contig')
                if 'position' in record:
                    record['Start_position'] = record.pop('position')
                if 'ref_allele' in record:
                    record['Reference_Allele'] = record.pop('ref_allele')
                if 'alt_allele' in record:
                    record['Tumor_Seq_Allele2'] = record.pop('alt_allele')
                if 'End_position' not in record:
                    record['End_position'] = str(
                        int(record['Start_position'])
                        + len(record['Reference_Allele']) - 1)

                chrom = record['Chromosome']
                start = record['Start_position']
                end = record['End_position']
                ref = record['Reference_Allele']
                alt = record['Tumor_Seq_Allele2']

                # Keep only calls judged KEEP when a judgement column exists.
                if record.get('judgement', 'KEEP') != 'KEEP':
                    continue

                core_data = {"chromosome": chrom, "start": start,
                             "end": end, "ref": ref, "alt": alt}

                for key in record:
                    if key not in ['Chromosome', 'Start_position',
                                   'End_position', 'Reference_Allele',
                                   'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2']:
                        core_data[key] = record[key]

                core_data = stringify_dict(core_data)

                yield core_data
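A short usage sketch tying the pieces together, assuming `DatabaseParser` takes a filename and opens it as `self.file` (Example #1 constructs it exactly this way); the filename here is hypothetical:

# Hypothetical usage; DatabaseParser comes from the surrounding project.
parser = DatabaseParser("example_calls.vcf")  # extension selects dataset_type="VCF"

for variant in parser.get_variants():
    # After stringify_dict(), every record is a flat dict of strings.
    print(variant["chromosome"], variant["start"], variant["ref"], variant["alt"])

`stringify_dict` is also not shown in these snippets; a plausible reconstruction (an assumption, not the project's code):

def stringify_dict(data):
    """Coerce keys and values to str so records serialize uniformly."""
    return {str(k): str(v) for k, v in data.items()}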