Exemplo n.º 1
0
 def __init__(s, path, software):
     """Open the archive at ``path`` and load its metadata, reference, data and clips.

     Args:
         s: The instance being initialised (this method names it ``s``
             instead of ``self``).
         path: Location of the archive file. Its base name without the
             extension is the default ``name`` metadata entry.
         software: Value stored under the ``software`` metadata key.
     """
     # The archive counts as new when no file exists at ``path`` yet.
     is_new = not os.path.isfile(path)
     s.archive = archive.Archive(path)
     # Default metadata; entries stored in the archive override these below.
     s.metadata = {
         "createdOn": time.time(),
         "createdBy": getpass.getuser(),
         "name": os.path.basename(os.path.splitext(path)[0]),
         "software": software
     }
     with timer.Timer("Loading"):
         with s.archive:
             # Metadata: stored values layered over the defaults.
             stored_meta = decode(s.archive.get("metadata.json", "{}"))
             s.metadata = Dict(dict(s.metadata, **stored_meta))
             # Reference file
             stored_ref = decode(s.archive.get("reference.json", "{}"))
             s.ref = Dict(reference.Reference(stored_ref))
             # Storage
             s.data = Dict(decode(s.archive.get("data.json", "{}")))
             if is_new:
                 s.metadata.diff = True
                 s.data.diff = True
                 s.ref.diff = True

             # Map every archive key to its "/"-separated components.
             parts_by_key = dict(
                 (key, key.split("/")) for key in s.archive.keys())
             clip_ids = set(parts[1] for parts in parts_by_key.values()
                            if parts[0] == "clips")

             s.clips = Dict({})
             for clip_id in clip_ids:
                 clip = Clip(clip_id)
                 clip_meta = decode(
                     s.archive.get("clips/%s/metadata.json" % clip_id, "{}"))
                 clip.metadata = Dict(dict(clip.metadata, **clip_meta))
                 clip.data = decode(
                     s.archive.get("clips/%s/data.json" % clip_id, "{}"))
                 # Keys shaped "clips/<id>/thumbs/...", cached in sorted order.
                 thumb_keys = [
                     key for key, parts in parts_by_key.items()
                     if parts[0] == "clips" and parts[1] == clip_id
                     and parts[2] == "thumbs"
                 ]
                 thumb_keys.sort()
                 for thumb_key in thumb_keys:
                     clip.thumbs.append(s.cache(thumb_key))
                 s.clips[clip_id] = clip
             # NOTE(review): bare attribute read with no assignment; possibly
             # a truncated ``s.clips.diff = ...`` -- confirm against the
             # full source.
             s.clips.diff
Exemplo n.º 2
0
    def run(self):
        """Filter internally primed reads out of the input SAM file.

        Reads the file named by ``self.infilename`` line by line. Header
        lines are skipped, and every read is tested with ``IMPriming``
        against the reference FASTA. Reads flagged as internally primed are
        dropped; the others are collected and written out through
        ``myio.IMFilterOutput``.

        Raises:
            Exception: If a line is neither a SAM header nor a SAM read.
        """
        # Having reference an independent class leaves enough room for
        # accommodating future expansion.
        refasta = reference.Reference(str(self.reference)).show_fasta()

        # The input path may contain several "/" components, so only the
        # base name (``.name``) is handed to the output helper.
        myresult = myio.IMFilterOutput(self.infilename.name)

        self._lgr.info('referencing the genome fasta provided...')

        counter_all, counter_imp = 0, 0
        # The context manager closes the input even if a malformed line
        # makes the loop raise.
        with open(str(self.infilename), 'r') as opened_input:
            for line in opened_input:
                samline = myclasses.LineUp(line)
                if samline._identity == 'SAMheader':
                    # The header is excluded from the output SAM
                    continue
                if samline._identity != 'SAMread':
                    raise Exception(
                        'Unexpected format of lines appear in SAM input')

                counter_all += 1
                samread = samline.parse_line()
                start = samread.pat_locator()
                p = IMPriming(refasta, samread._strand, samread._chroms,
                              start, self.window_size, self.deny_number)
                if p._imp:
                    counter_imp += 1
                    continue
                myresult.add2content(samread.build_line())

        # An input without any read would otherwise divide by zero here.
        if counter_all:
            removed_fraction = float(counter_imp) / float(counter_all)
        else:
            removed_fraction = 0.0

        self._lgr.info("creating filtered SAM file excluding headers ...")
        self._lgr.info("through {0} reads".format(counter_all))
        self._lgr.info(
            "\t\t{0} ({1}) were removed due to internal priming.".format(
                counter_imp, removed_fraction))
        myresult.open2write(myresult.content)
        self._lgr.info("Done!")
Exemplo n.º 3
0
    def filterSpectrumList(self,
                           refFilePath,
                           magmin=-1,
                           magmax=50,
                           zmin=-1,
                           zmax=50,
                           sfrmin=-1,
                           sfrmax=1e6,
                           outputPath=""):
        """Write the entries selected by a reference filter to a new list file.

        The reference at ``refFilePath`` is asked (``filterIdList``) which
        positions of ``self.fvect`` pass the magnitude / redshift / sfr
        limits. When ``outputPath`` is empty, the selected entries are
        written to a ``.spectrumlist`` file placed next to ``self.path``,
        with the limits encoded in the file name.

        Args:
            refFilePath: Path handed to ``reference.Reference``.
            magmin, magmax: Limits forwarded as ``magmin`` / ``magmax``.
            zmin, zmax: Limits forwarded as ``zmin`` / ``zmax``.
            sfrmin, sfrmax: Limits forwarded as ``sfrmin`` / ``sfrmax``.
            outputPath: NOTE(review): only the empty-string case is handled
                in this block; a non-empty value writes nothing here --
                confirm against the full source.
        """
        ref = reference.Reference(refFilePath)
        indexes = ref.filterIdList(self.fvect,
                                   magmin=magmin,
                                   magmax=magmax,
                                   zmin=zmin,
                                   zmax=zmax,
                                   sfrmin=sfrmin,
                                   sfrmax=sfrmax)
        print("spllist filtering: indexes found n = {}".format(len(indexes)))

        if outputPath == "":
            dirPath = os.path.split(self.path)[0]
            nameNoExt = os.path.splitext(self.name)[0]
            outputFileFullPath = os.path.join(
                dirPath, "{}_z{}-{}_mag{}-{}_sfr{}-{}.spectrumlist".format(
                    nameNoExt, zmin, zmax, magmin, magmax, sfrmin, sfrmax))

            # ``with`` guarantees the list file is flushed and closed; the
            # handle used to be left open.
            with open(outputFileFullPath, 'w') as f:
                for k, spectrum_id in enumerate(self.fvect):
                    if k not in indexes:
                        continue
                    # The extra columns are written only when both side
                    # vectors line up one-to-one with ``fvect``.
                    if len(self.errfvect) == len(self.fvect) and len(
                            self.procidvect) == len(self.fvect):
                        f.write("{}\t{}\t{}\n".format(spectrum_id,
                                                      self.errfvect[k],
                                                      self.procidvect[k]))
                    else:
                        f.write("{}\n".format(spectrum_id))
Exemplo n.º 4
0
else:
    dataY, N = readRefSeqData(options.dataY, (dataYtype == dataXtype))
    infoY = readRefSeqInfo(options.dataY)
# Report dataset Y using the values read just above (infoY, N).  The leading
# '\r' moves the cursor back to the start of the line, presumably to overwrite
# an earlier progress message.
sys.stdout.write('\rDataset Y [' + options.dataY + ']: ' + infoY + ', ' +
                 str(N) + ' transcripts')
print '\n'

# Read configuration file; it yields the GRCh37 and GRCh38 reference paths.
# NOTE(review): `dir` is not defined in this fragment -- confirm it is a
# directory variable set earlier and not the `dir` builtin.
refpath37, refpath38 = readConfigFile(dir)

# Initialize GRCh37 reference genome.  A path that is configured but missing
# on disk aborts the script; no configured path leaves the reference as None.
if refpath37 is not None:
    if not os.path.isfile(refpath37):
        print '\nError: GRCh37 reference genome file (' + refpath37 + ') cannot be found.\n'
        quit()
    ref_GRCh37 = reference.Reference(refpath37)
else:
    ref_GRCh37 = None

# Initialize GRCh38 reference genome (same rules as for GRCh37 above).
if refpath38 is not None:
    if not os.path.isfile(refpath38):
        print '\nError: GRCh38 reference genome file (' + refpath38 + ') cannot be found.\n'
        quit()
    ref_GRCh38 = reference.Reference(refpath38)
else:
    ref_GRCh38 = None

# Check if required reference genome files are specified: a dataset built on
# GRCh37 needs the GRCh37 reference.
# NOTE(review): only the error message is visible here; whether the script
# then exits cannot be told from this fragment.
if ref_GRCh37 is None and 'GRCh37' in [dataX_build, dataY_build]:
    print '\nError: GRCh37 reference genome file needs to be specified in configuration file.\n'
Exemplo n.º 5
0
def run(options):
    """Write CART transcript outputs for the ENSTs listed in ``options.input``.

    Each non-empty, non-comment input line is split on whitespace. Column 0
    minus its first five characters is used as the HGNC ID and column 1 as
    the ENST ID. ENSTs missing from the Ensembl database are collected and
    reported in the summary. Every other transcript receives a CART ID and
    is written to the transcript database, GFF2/GFF3, GenePred and FASTA
    outputs, and optionally to GenBank and Annovar outputs.

    Args:
        options: Parsed options. This function reads ``series``, ``ref``,
            ``ensembl``, ``prev_cava_db``, ``prev_ref``, ``input``,
            ``output``, ``gbk`` and ``annovar``.

    Raises:
        SystemExit: If ``options.series`` is not a 7-character code starting
            with ``CART37`` or ``CART38``.
    """
    if not (options.series.startswith(('CART37', 'CART38'))
            and len(options.series) == 7):
        print('\nSeries code incorrect!\n')
        # sys.exit raises the same SystemExit as quit() without relying on
        # the interactive helper installed by the site module.
        sys.exit()

    print('\n==== ENSTWriter {} '.format(__version__) + '=' * 78)

    # Initialize reference sequence reader
    ref = reference.Reference(options.ref)

    # Initialize transcript database writer
    tdb_writer = helper.initialize_transcript_db_writer(options)

    # Read Ensembl database
    ensembl_db = transcripts.read_ensembl_db(options.ensembl)

    # Read previous CAVA db output and reference genome if required.
    # One flag drives every later decision, so an empty value can no longer
    # leave ``prev_cava_db`` as None while the numbering code still tries to
    # use it.
    use_prev_db = bool(options.prev_cava_db)
    if use_prev_db:
        prev_ref = reference.Reference(options.prev_ref)
        prev_cava_db = helper.read_prev_cava_db(options.prev_cava_db, prev_ref)
    else:
        prev_cava_db = None

    # Initialize output files
    (out_genepred, out_fasta, out_genepred_annovar, out_fasta_annovar,
     gbk_dir) = helper.initialize_output_files(options)

    # Initialize progress info
    sys.stdout.write('\nProcessing {} CARTs read from {} ... '.format(
        helper.number_of_input_carts(options.input), options.input))
    sys.stdout.flush()

    # Initialize CART numbering: continue after the previous release when
    # there is one, otherwise start counting from 10000.
    if use_prev_db:
        cartidx = helper.get_last_cartidx(options.prev_cava_db)
    else:
        cartidx = 10000

    # Iterate through input records
    missing_list = []
    gff2_lines = {}
    gff3_lines = {}
    # The context manager closes the input file even if a record fails.
    with open(options.input) as input_file:
        for line in input_file:
            line = line.strip()
            if line == '' or line.startswith('#'):
                continue
            cols = line.split()
            # Drop the first five characters of column 0 (presumably an
            # "HGNC:" prefix, which the missing-list message adds back).
            hgnc_id = cols[0][5:]
            enst = cols[1]

            # Add ENST to missing list if not found in Ensembl database
            if enst not in ensembl_db:
                missing_list.append('{} (HGNC:{})'.format(enst, hgnc_id))
                continue

            # Retrieve data about ENST
            transcript = ensembl_db[enst]

            # Calculating CART ID: reuse the previous number only when the
            # gene was in the previous db with identical content.
            reused_idx = None
            if use_prev_db:
                content = (transcript.strand, len(transcript.exons),
                           helper.read_mrna_sequence(transcript, ref))
                if (hgnc_id in prev_cava_db
                        and content == prev_cava_db[hgnc_id]['content']):
                    reused_idx = prev_cava_db[hgnc_id]['cartidx']

            if reused_idx is None:
                cartidx += 1
                cart_id = '{}{}'.format(options.series, cartidx)
            else:
                cart_id = '{}{}'.format(options.series, reused_idx)

            # Add CART ID and HGNC ID to transcript
            transcript.id = cart_id
            transcript.hgnc_id = hgnc_id

            # Add transcript to database writer
            tdb_writer.add(transcript)

            # Create content of gff2 and gff3 files
            gff2_lines = helper.create_gff2_lines(transcript, gff2_lines)
            gff3_lines = helper.create_gff3_lines(transcript, gff3_lines)

            # Write to gp file
            helper.output_genepred(transcript, out_genepred)

            # Write to gbk output
            if options.gbk:
                helper.output_gbk(transcript, ref, gbk_dir)

            # Write to fasta file
            helper.output_fasta(transcript, out_fasta, ref)

            # Write annovar files
            if options.annovar:
                helper.output_genepred(transcript, out_genepred_annovar)
                helper.output_fasta_annovar(transcript, out_fasta_annovar, ref)

    # Create bgzipped, Tabix-index GFF2 and GFF3 output
    helper.output_gff2(gff2_lines, options.output + '.gff2')
    helper.output_gff3(gff3_lines, options.output + '.gff3')

    # Finalize outputs
    helper.finalize_outputs(options, tdb_writer, out_fasta, out_genepred,
                            out_genepred_annovar, out_fasta_annovar, gbk_dir)

    # Print out summary info
    helper.print_summary_info(options, missing_list)

    print('\n' + '=' * 100 + '\n')
Exemplo n.º 6
0
def run(options):
    """Compare the GRCh37 and GRCh38 ENSTs chosen for each input gene.

    For every gene present in both ENST lists, the two transcripts are
    looked up in the Ensembl databases. If either is missing, a line with
    an ``NF37`` / ``NF38`` flag is written. Otherwise ``helper.compare``
    compares them and reports whether they are identical and CDS-identical.
    A summary is printed at the end.

    Args:
        options: Parsed options. This function reads ``dataX``, ``dataY``,
            ``input``, ``ref37``, ``ref38``, ``enstsx``, ``enstsy`` and
            ``output``.

    Raises:
        SystemExit: If a dataset txt file or a reference genome file cannot
            be found.
    """
    # Checks if Ensembl TXT files exist if required.  The txt path is the
    # dataset path with its last three characters replaced by '.txt'.
    # sys.exit raises the same SystemExit as quit() without relying on the
    # interactive helper installed by the site module.
    if not os.path.isfile(options.dataX[:-3] + '.txt'):
        print('Error: Dataset X txt file (' + options.dataX[:-3] + '.txt) cannot be found.\n')
        sys.exit()
    if not os.path.isfile(options.dataY[:-3] + '.txt'):
        print('Error: Dataset Y txt file (' + options.dataY[:-3] + '.txt) cannot be found.\n')
        sys.exit()

    input_genes = helper.read_input_genes(options.input)

    # Read transcript database X
    sys.stdout.write('GRCh37 Ensembl db [' + options.dataX + ']: READING...')
    sys.stdout.flush()
    dataX, N = helper.readEnsemblData(options.dataX)

    sys.stdout.write('\rGRCh37 Ensembl db [' + options.dataX + ']: ' + str(N) + ' transcripts')
    print('')

    # Read transcript database Y
    sys.stdout.write('GRCh38 Ensembl db [' + options.dataY + ']: READING...')
    sys.stdout.flush()
    dataY, N = helper.readEnsemblData(options.dataY)

    sys.stdout.write('\rGRCh38 Ensembl db [' + options.dataY + ']: ' + str(N) + ' transcripts')
    print('\n')

    # Initialize GRCh37 reference genome
    if not os.path.isfile(options.ref37):
        print('\nError: GRCh37 reference genome file (' + options.ref37 + ') cannot be found.\n')
        sys.exit()
    ref_GRCh37 = reference.Reference(options.ref37)

    # Initialize GRCh38 reference genome
    if not os.path.isfile(options.ref38):
        print('\nError: GRCh38 reference genome file (' + options.ref38 + ') cannot be found.\n')
        sys.exit()
    ref_GRCh38 = reference.Reference(options.ref38)

    ensts_37 = helper.read_enst_file(options.enstsx)
    ensts_38 = helper.read_enst_file(options.enstsy)

    count_gene_in_both = 0
    n_identical = 0
    n_cds_identical = 0

    # Initialize output file; ``with`` closes it even if a comparison raises.
    with open(options.output, 'w') as out:
        out.write('\t'.join(['#GENE', 'ENST_37', 'ENST_38', 'DIFFERENCE']) + '\n')

        # Iterate through the input list of genes.  ``i`` is the 1-based
        # position of the gene; the counter used to start one too high, so
        # progress ran from 2 to N+1.
        for i, g in enumerate(input_genes, 1):

            if g not in ensts_37 or g not in ensts_38:
                continue

            count_gene_in_both += 1

            enst1 = ensts_37[g]
            enst2 = ensts_38[g]

            flags = []
            comparewith = []

            sys.stdout.write('\rAnalysing gene ' + str(i) + '/' + str(len(input_genes)))
            sys.stdout.flush()

            # Test membership on the dict itself rather than on .keys(),
            # which builds a list and scans it on Python 2.
            if enst1 not in dataX:
                flags.append('NF37')
            else:
                transcript = dataX[enst1]

            if enst2 not in dataY:
                flags.append('NF38')
            else:
                comparewith = [dataY[enst2]]

            # A transcript missing from either database: record the flags
            # and skip the comparison.
            if len(flags) > 0:
                out.write('\t'.join(['HGNC:' + g, enst1, enst2, ';'.join(flags)]) + '\n')
                continue

            identical, cds_identical = helper.compare(
                g, transcript, comparewith, ref_GRCh37, ref_GRCh38, options, out)
            if identical:
                n_identical += 1
            if cds_identical:
                n_cds_identical += 1

        print(' - Done.')

    # Goodbye message
    print('\nSummary:')
    print('- {} of the {} genes are on both ENSTs lists'.format(count_gene_in_both, len(input_genes)))
    print('- ' + str(n_identical) + ' genes have identical ENSTs')
    print('- ' + str(n_cds_identical) + ' genes have CDS-identical ENSTs')
    print('\nOutput written to file: ' + options.output)
Exemplo n.º 7
0
    def __init__(self, confpath, binpath, rootoutputpath, dividecount,
                 opt_bracketing, bracketing_templatesRootPath, refpath):
        """Prepare the processing helper and its working directories.

        ``self.ready`` becomes True only when every step succeeds. On an
        incompatible option combination or a failed ``loadConfig`` an error
        is printed and the constructor returns early with ``ready`` False.

        Args:
            confpath: Stored as ``self.configPath``.
            binpath: Stored as ``self.binPath``.
            rootoutputpath: Base output directory; ``cluster_logs`` is
                created inside it.
            dividecount: When greater than 0, the spectrum list is split
                into that many subsets. Must be 1 when ``refpath`` is set.
            opt_bracketing: Bracketing option (choices: "", "method").
            bracketing_templatesRootPath: Stored unchanged.
            refpath: Reference path; an empty string disables
                ``enableProcessAtZ``.
        """
        self.logTagStr = "processHelper"
        self.ready = False
        self.configPath = confpath
        self.binPath = binpath
        self.baseoutputpath = rootoutputpath

        logs_dir = os.path.join(self.baseoutputpath, "cluster_logs")
        self.logsPath = logs_dir
        if not os.path.exists(logs_dir):
            os.mkdir(logs_dir)

        # Processing "at z" is switched on by the presence of a reference
        # path.
        self.refPath = refpath
        self.enableProcessAtZ = 0 if self.refPath == "" else 1
        if self.enableProcessAtZ:
            if dividecount != 1:
                print(
                    "ERROR: incompatible options : enableProcessAtZ and dividecount!=1. Aborting."
                )
                return
            self.refcatalog = reference.Reference(referencepath=self.refPath,
                                                  rtype="simple")

        self.proc_date = time.strftime("%Y%m%d")

        self.opt_bracketing = opt_bracketing  # choices = "", "method"
        self.bracketing_templatesRootPath = bracketing_templatesRootPath

        if not self.loadConfig():
            print("ERROR: load config failed.")
            return

        # Prepare the working dir (relative to the current directory).
        self.work_process_dir = os.path.abspath("process-work")
        if not os.path.exists(self.work_process_dir):
            os.mkdir(self.work_process_dir)

        if dividecount > 0:
            subsets_dir = os.path.join(
                self.work_process_dir,
                "spectrumlist_subs_{}".format(dividecount))
            if not os.path.exists(subsets_dir):
                os.mkdir(subsets_dir)
            print('INFO: splitting using full path: {}'.format(subsets_dir))
            full_list = spectrumlist.Spectrumlist(self.config_spclistPath)
            self.subspclists = full_list.splitIntoSubsets(
                int(dividecount), subsets_dir)
            self.subrecombine_info = {}
            # Subset results go to their own sub-directory of the base
            # output.
            self.subsetsRelPath = "output_subsets"
            self.baseoutputpath = os.path.join(self.baseoutputpath,
                                               self.subsetsRelPath)
            if not os.path.exists(self.baseoutputpath):
                os.mkdir(self.baseoutputpath)
        else:
            self.subspclists = []

        self.ready = True
Exemplo n.º 8
0
def run(options):
    """Write template transcript outputs for the genes in ``options.selected_nms``.

    For each gene the ENST is chosen in this order: the selected ENST for
    its HGNC ID, the canonical ENST for its symbol, then the longest
    Ensembl transcript for its symbol. Selected ENSTs get a CART ID as
    template ID; the others keep their ENST ID. Genes with no usable ENST,
    or whose output fails, are recorded in the exclusion file.

    Args:
        options: Parsed options. This function reads ``series``,
            ``selected_ensts``, ``canonical``, ``ref``, ``ensembl``,
            ``prev_cava_db``, ``prev_ref``, ``selected_nms``, ``output``,
            ``gbk`` and ``annovar``.

    Raises:
        SystemExit: If ``options.series`` is not a 7-character code starting
            with ``CART37`` or ``CART38``.
    """
    if not (options.series.startswith(('CART37', 'CART38'))
            and len(options.series) == 7):
        print('\nSeries code incorrect!\n')
        # sys.exit raises the same SystemExit as quit() without relying on
        # the interactive helper installed by the site module.
        sys.exit()

    # Selected ENSTs, looked up by HGNC ID below
    selected_ensts = helper.read_selected_ensts(options.selected_ensts)

    # Canonical ENSTs, looked up by gene symbol below
    canonical_ensts = helper.read_canonical_ensts(options.canonical)

    # Initialize reference sequence reader
    ref = reference.Reference(options.ref)

    # Initialize transcript database writer
    tdb_writer = helper.initialize_transcript_db_writer(options)

    # Read Ensembl database
    ensembl_db = transcripts.read_ensembl_db(options.ensembl)
    ensembl_by_symbol = transcripts.read_ensembl_db_by_symbol(options.ensembl)

    # Read previous CAVA db output and reference genome if required.
    # One flag drives every later decision, so an empty value can no longer
    # leave ``prev_cava_db`` as None while the numbering code still tries to
    # use it.
    use_prev_db = bool(options.prev_cava_db)
    if use_prev_db:
        prev_ref = reference.Reference(options.prev_ref)
        prev_cava_db = helper.read_prev_cava_db(options.prev_cava_db, prev_ref)
    else:
        prev_cava_db = None

    # Initialize output files
    (out_genepred, out_fasta, out_genepred_annovar, out_fasta_annovar,
     gbk_dir, out_id, out_excl) = helper.initialize_output_files(options)

    # Initialize progress info
    sys.stdout.write('Processing {} genes ... '.format(
        helper.number_of_genes(options.selected_nms)))
    sys.stdout.flush()

    # Initialize CART numbering: continue after the previous release when
    # there is one, otherwise start counting from 10000.
    if use_prev_db:
        cartidx = helper.get_last_cartidx(options.prev_cava_db)
    else:
        cartidx = 10000

    # Iterate through input records
    count_excluded = 0
    count_selected = 0
    count_canonical_or_longest = 0
    gff2_lines = {}
    gff3_lines = {}
    # The context manager closes the input file even if a record fails.
    with open(options.selected_nms) as input_file:
        for line in input_file:
            line = line.strip()
            if line == '' or line.startswith('#'):
                continue
            cols = line.split()

            symbol = cols[0]
            hgnc_id = cols[1]
            assoc_nm = cols[-1]

            # Pick the ENST: selected > canonical > longest transcript.
            if hgnc_id in selected_ensts and selected_ensts[hgnc_id] != '.':
                enst = selected_ensts[hgnc_id]
                selected = True
            elif (symbol in canonical_ensts
                  and canonical_ensts[symbol] in ensembl_db):
                enst = canonical_ensts[symbol]
                selected = False
            elif symbol in ensembl_by_symbol:
                enst = transcripts.find_longest_transcript(
                    ensembl_by_symbol[symbol]).id
                selected = False
            else:
                out_excl.write('{}\t{}\t{}\n'.format(
                    hgnc_id, symbol, 'no_selection_or_canonical_or_longest'))
                count_excluded += 1
                continue

            # Add to the list of excluded genes if ENST not found in Ensembl
            # database
            if enst not in ensembl_db:
                out_excl.write('{}\t{}\t{}\n'.format(hgnc_id, symbol,
                                                     'not_in_ensembl_db'))
                count_excluded += 1
                continue

            try:

                # Retrieve data about ENST
                transcript = ensembl_db[enst]

                if selected:

                    # Calculating CART ID: reuse the previous number only
                    # when the gene was in the previous db with identical
                    # content.
                    reused_idx = None
                    if use_prev_db:
                        content = (transcript.strand, len(transcript.exons),
                                   helper.read_mrna_sequence(transcript, ref))
                        if (hgnc_id in prev_cava_db and content ==
                                prev_cava_db[hgnc_id]['content']):
                            reused_idx = prev_cava_db[hgnc_id]['cartidx']

                    if reused_idx is None:
                        cartidx += 1
                        cart_id = '{}{}'.format(options.series, cartidx)
                    else:
                        cart_id = '{}{}'.format(options.series, reused_idx)

                    template_id = cart_id

                else:
                    template_id = enst

                # Add template ID and HGNC ID to transcript
                transcript.id = template_id
                transcript.hgnc_id = hgnc_id

                transcript.assoc_nm = assoc_nm
                transcript.assoc_enst = enst

                # Write IDs to file
                helper.output_ids(out_id, hgnc_id, template_id)

                # Add transcript to database writer
                tdb_writer.add(transcript)

                # Create content of gff2 and gff3 files
                gff2_lines = helper.create_gff2_lines(transcript, gff2_lines)
                gff3_lines = helper.create_gff3_lines(transcript, gff3_lines)

                # Write to gp file
                helper.output_genepred(transcript, out_genepred)

                # Write to gbk output
                if options.gbk:
                    helper.output_gbk(transcript, ref, gbk_dir)

                # Write to fasta file
                helper.output_fasta(transcript, out_fasta, ref)

                # Write annovar files
                if options.annovar:
                    helper.output_genepred(transcript, out_genepred_annovar)
                    helper.output_fasta_annovar(transcript, out_fasta_annovar,
                                                ref)

                if selected:
                    count_selected += 1
                else:
                    count_canonical_or_longest += 1

            except Exception:
                # Best effort per gene: record the failure and carry on.
                # Catching Exception rather than using a bare except lets
                # KeyboardInterrupt and SystemExit still stop the run.
                out_excl.write('{}\t{}\t{}\n'.format(hgnc_id, symbol,
                                                     'output_error'))
                count_excluded += 1

    # Create bgzipped, Tabix-index GFF2 and GFF3 outputs
    helper.output_gff2(gff2_lines, options.output + '.gff2')
    helper.output_gff3(gff3_lines, options.output + '.gff3')

    # Finalize outputs
    helper.finalize_outputs(options, tdb_writer, out_fasta, out_genepred,
                            out_genepred_annovar, out_fasta_annovar, gbk_dir,
                            out_id, out_excl)

    # Print out summary info
    helper.print_summary_info(options, count_selected,
                              count_canonical_or_longest, count_excluded)
 def _open_reference(self):
     """Open the reference genome and return what ``show_fasta()`` gives back.

     The reference is built from the string form of ``self._refgenome``,
     and a success message is logged once the FASTA has been obtained.
     """
     fasta_handle = reference.Reference(
         self._refgenome.__str__()).show_fasta()
     self._lgr.info("reference genome opened successfully.")
     # Callers are expected to call .fetch() on the returned object.
     return fasta_handle
    def run(self):
        """Count cached BED reads at each reference coordinate and write tables.

        The first line of the reference masterlist, with the sample name
        appended as an extra column, becomes the header of the output, so
        the header of the reference must be clean. Every following
        masterlist line yields one output row carrying the read count
        returned by ``utils.increment_reads_at``. Coordinates still left in
        the cache afterwards are written to a second table of unidentified
        ApA coordinates.
        """
        ref = reference.Reference(self._reference)
        masterlist = ref.show_masterlist()

        # The masterlist is closed in ``finally`` so that a failure while
        # matching or writing cannot leave it open.
        try:
            header = masterlist.readline().rstrip(
            ) + '\t' + self._sample_name + '\n'
            self._lgr.info("header of the output: %s", header)

            # self._infilename points to a Path() object
            myresult = myio.PARcounterOutput(self._infilename.name)
            myresult.add2content(header)

            # 1. cache the info from input BED file
            self._cached_dict = self.cache_BED_input()
            self._lgr.info("input BED cached.")

            # 2. increment on the reference
            hits = 0
            self._lgr.info("start matching with the given reference list...")
            for line in masterlist:
                ls_line = line.rstrip().split('\t')
                coords = ls_line[const.COORDS_coli]
                # self._cached_dict is mutated within
                # utils.increment_reads_at() so that previously unidentified
                # ApA coords can be written out afterwards.
                incrementals, hitted, self._cached_dict = \
                    utils.increment_reads_at(coords, self._window_size,
                                             self._cached_dict)
                if hitted:
                    hits += 1
                    if hits % const.FIVE_HUNDRED_HITS == 0:
                        print("{0} hits".format(hits))

                new_line = '\t'.join([
                    ls_line[const.GENES_coli],
                    ls_line[const.IDS_coli],
                    ls_line[const.TRANSCRIPTS_coli],
                    ls_line[const.TYPES_coli],
                    ls_line[const.COORDS_coli],
                    str(incrementals),
                ]) + '\n'
                myresult.add2content(new_line)
            self._lgr.info("%s hits on the reference list were found!",
                           str(hits))

            # write output
            myresult.open2write(myresult.content)
        finally:
            ref.close_masterlist()

        self._lgr.info("PARcounter table generated.")

        # The following outputs unidentified ApA coords and their read
        # counts.
        if len(self._cached_dict) > 0:
            # self._infilename points to a Path() object
            sideresult = myio.UnIdentifiedAPAsOutput(self._infilename.name)

            sideheader = const.UID_HEADER + self._sample_name + '\n'
            sideresult.add2content(sideheader)
            for coords in self._cached_dict:
                # Behind each "coords" key in self._cached_dict is a list of
                # BedRead objects, so its length is the number of hits on
                # that coords.
                ls_bedreads = self._cached_dict[coords]
                uid_apa_line = coords + '\t' + str(len(ls_bedreads)) + '\n'
                sideresult.add2content(uid_apa_line)
            # One line was added per key, so the count is the dict size.
            counter_uidapa = len(self._cached_dict)
            self._lgr.info(
                "start to output %s un-identified potential ApAs from %s ...",
                str(counter_uidapa), self._infilename.name)
            sideresult.open2write(sideresult.content)
            self._lgr.info("Done!")
        else:
            self._lgr.info("There is no un-identified potential ApAs left.")