def getProblems(data, txn=None):
    """Return the problems for the region described by ``data`` as records.

    ``data`` is indexed with the keys ``'ref'``, ``'start'`` and ``'end'``.
    When it has no ``'genome'`` key, one is looked up with ``getGenome`` and
    stored back into ``data`` (the caller's mapping is modified).

    Stored problems are queried first through ``db.Problems.getInBounds``.
    When that yields ``None``, the genome's ``problems.bed`` file is read
    (and generated first if it is missing), stored via ``db.Problems.put``,
    and filtered row by row with ``db.checkInBounds``.

    Returns the result of ``to_dict('records')`` on the selected rows.

    Raises ``Exception`` when ``Hubs.generateProblems`` does not return the
    expected ``problems.bed`` path.
    """
    if 'genome' not in data:
        data['genome'] = getGenome(data, txn=txn)

    genome = data['genome']
    store = db.Problems(genome)

    stored = store.getInBounds(data['ref'], data['start'], data['end'])
    if stored is not None:
        return stored.to_dict('records')

    # Nothing stored for this region: fall back to the BED file on disk.
    bedPath = os.path.join(cfg.jbrowsePath, cfg.dataPath,
                           'genomes', genome, 'problems.bed')

    if not os.path.exists(bedPath):
        # NOTE(review): the full file path is passed as the second argument;
        # confirm that is what Hubs.generateProblems expects.
        generated = Hubs.generateProblems(genome, bedPath)
        if generated != bedPath:
            raise Exception

    frame = pd.read_csv(bedPath, sep='\t', header=None)
    frame.columns = problemColumns
    store.put(frame, txn=txn)

    bounds = (data['ref'], data['start'], data['end'])
    inBounds = frame.apply(db.checkInBounds, axis=1, args=bounds)
    return frame[inBounds].to_dict('records')
def generateProblems(genome, path, txn=None):
    """Generate the problems table for ``genome``, store it and write a BED file.

    Downloads ``chromInfo.txt.gz`` and ``gap.txt.gz`` from
    ``<cfg.geneUrl><genome>/database/``, outer-joins them on ``chrom`` and
    builds problems per chromosome: ``createProblems`` for chromosomes that
    have gap rows, ``createNanProblems`` for chromosomes without any.

    Args:
        genome: Genome name; used in the download URL, the output directory
            and as the ``db.Problems`` key.
        path: Base directory; files are written to ``<path>/genomes/<genome>``.
        txn: Optional transaction handle forwarded to ``db.Problems.put``.

    Returns:
        The path of ``problems.bed``. If problems are already stored for the
        genome the path is returned without regenerating anything. ``None``
        is returned when the output directory cannot be created.
    """
    genesUrl = "%s%s/database/" % (cfg.geneUrl, genome)
    genomePath = os.path.join(path, 'genomes', genome)
    outputFile = os.path.join(genomePath, 'problems.bed')

    # Already stored for this genome: nothing to regenerate.
    if db.Problems.has_key(genome):
        return outputFile

    # exist_ok avoids failing when the directory already exists or is created
    # concurrently between an existence check and the makedirs call.
    try:
        os.makedirs(genomePath, exist_ok=True)
    except OSError:
        return None

    downloaded = {}
    for name in ('chromInfo', 'gap'):
        outputPath = os.path.join(genomePath, name + '.txt')
        fileUrl = genesUrl + name + '.txt.gz'
        downloaded[name] = downloadAndUnpackFile(fileUrl, outputPath)

    # Only the first two columns of chromInfo and columns 1-3 of gap are used.
    chromInfo = pd.read_csv(downloaded['chromInfo'], sep='\t', header=None).iloc[:, 0:2]
    chromInfo.columns = ['chrom', 'bases']

    gap = pd.read_csv(downloaded['gap'], sep='\t', header=None).iloc[:, 1:4]
    gap.columns = ['chrom', 'gapStart', 'gapEnd']

    # Outer join keeps chromosomes without gaps; their gap columns are NaN.
    join = gap.merge(chromInfo, on='chrom', how='outer')

    # A row describes a gap only when both coordinates are present. The
    # previous `['gapStart' and 'gapEnd']` indexed just 'gapEnd', because
    # `and` between two non-empty strings evaluates to the second one.
    hasGap = join[['gapStart', 'gapEnd']].notnull().all(axis=1)

    nanOutput = join[~hasGap].groupby(['chrom']).apply(createNanProblems)
    nonNanOutput = join[hasGap].groupby(['chrom']).apply(createProblems)

    output = pd.concat([nonNanOutput, nanOutput], sort=False)

    # Removes all entries with an _, not needed because these are genome "Fixes"
    output = output[~output['chrom'].str.contains('_')]

    output['chromStart'] = output['chromStart'].astype(int)
    output['chromEnd'] = output['chromEnd'].astype(int)

    db.Problems(genome).put(output, txn=txn)
    output.to_csv(outputFile, sep='\t', index=False, header=False)

    return outputFile
def getProblemsForChrom(genome, chrom, txn=None):
    """Return a copy of the stored problems of ``genome`` located on ``chrom``.

    The problems are fetched with ``db.Problems(genome).get(txn=txn)`` and
    filtered to the rows whose ``'chrom'`` value equals ``chrom``. A copy is
    returned, so the fetched object is left untouched by later edits.
    """
    allProblems = db.Problems(genome).get(txn=txn)
    onChrom = allProblems['chrom'] == chrom
    return allProblems[onChrom].copy()