def testCreateGPTsvDatasource(self):
    """Check that building a genomic-position TSV datasource produces a file.

    Calls ``GenericTsvDatasourceCreator._createDatabase`` on the trimmed
    ORegAnno fixture with ``out`` as the destination directory, then asserts
    that the returned file name exists underneath that directory.
    """
    dsFile = "testdata/small_genome_position_tsv_ds/oreganno_trim.hg19.txt"
    destDir = "out"
    datasourceBuilder = GenericTsvDatasourceCreator()
    datasourceFilename = datasourceBuilder._createDatabase(destDir=destDir, ds_file=dsFile)
    # The returned name is treated as relative to destDir. os.path.join
    # replaces string.join(), which only exists in Python 2.
    datasourceFilename = os.path.join(destDir, datasourceFilename)
    self.assertTrue(os.path.exists(datasourceFilename), "No data source file was generated.")
def testCreateGPTsvConfigFile(self):
    """Write a gp_tsv config file and verify its ``general`` section.

    The config is created with ``GenericTsvDatasourceCreator._createConfigFile``
    and read back through ``ConfigUtils.createConfigParser``. Every expected
    option must be present, and must hold the value that was passed in.
    """
    configFilename = "out/ccle_by_gp.config"
    datasourceFilename = "ccle_results_by_pos.hg19.import.txt"
    dataSourceType = "gp_tsv"
    dataSourceName = "CCLE_By_GP"
    dataSourceVersion = "09292010"
    genomicPositionColumnNames = "chr,start,end"

    creator = GenericTsvDatasourceCreator()
    index_cols = DatasourceInstallUtils.getIndexCols("gp_tsv", genomicPositionColumnNames)
    creator._createConfigFile(
        configFilename=configFilename,
        baseDSFile=datasourceFilename,
        ds_name=dataSourceName,
        ds_type=dataSourceType,
        ds_version=dataSourceVersion,
        indexCols=index_cols,
    )

    parser = ConfigUtils.createConfigParser(configFilename)
    self.assertTrue(parser.has_section("general"), "general section is missing.")

    # (option name, expected value) pairs, in the order they are checked.
    expectations = (
        ("type", dataSourceType),
        ("src_file", datasourceFilename),
        ("title", dataSourceName),
        ("version", dataSourceVersion),
        ("genomic_position_cols", genomicPositionColumnNames),
    )

    # Presence of every option is checked before any value is compared.
    for option, _ in expectations:
        self.assertTrue(
            parser.has_option("general", option),
            "%s option is missing in general section." % option,
        )

    for option, wanted in expectations:
        found = parser.get("general", option)
        self.assertEqual(
            found,
            wanted,
            "Expected data source %s is %s but was %s." % (option, wanted, found),
        )
def getGeneTsvConfigFile(self):
    """Write a gene_tsv config file and verify its ``general`` section.

    The config is created with ``GenericTsvDatasourceCreator._createConfigFile``
    and read back through ``ConfigUtils.createConfigParser``. Every expected
    option must be present, and must hold the value that was passed in.

    NOTE(review): the name does not start with ``test``, so a default unittest
    loader will not collect this method -- confirm whether that is intended.
    """
    configFilename = "out/simple_uniprot.config"
    datasourceFilename = "simple_uniprot.out.2011_09.tsv"
    dataSourceType = "gene_tsv"
    dataSourceName = "UniProt"
    dataSourceVersion = "2011_09"
    geneColumnName = "gene"

    creator = GenericTsvDatasourceCreator()
    index_cols = DatasourceInstallUtils.getIndexCols("gene_tsv", geneColumnName)
    creator._createConfigFile(
        configFilename=configFilename,
        baseDSFile=datasourceFilename,
        ds_name=dataSourceName,
        ds_type=dataSourceType,
        ds_version=dataSourceVersion,
        indexCols=index_cols,
    )

    parser = ConfigUtils.createConfigParser(configFilename)
    self.assertTrue(parser.has_section("general"), "general section is missing.")

    # (option name, expected value) pairs, in the order they are checked.
    expectations = (
        ("type", dataSourceType),
        ("src_file", datasourceFilename),
        ("title", dataSourceName),
        ("version", dataSourceVersion),
        ("gene_col", geneColumnName),
    )

    # Presence of every option is checked before any value is compared.
    for option, _ in expectations:
        self.assertTrue(
            parser.has_option("general", option),
            "%s option is missing in general section." % option,
        )

    for option, wanted in expectations:
        found = parser.get("general", option)
        self.assertEqual(
            found,
            wanted,
            "Expected data source %s is %s but was %s." % (option, wanted, found),
        )
def testCreateGPTsvConfigFile(self):
    """Create a genomic-position TSV config and check what was written.

    Verifies that the ``general`` section exists, that it has the type,
    src_file, title, version and genomic_position_cols options, and that each
    option equals the value handed to ``_createConfigFile``.
    """
    section = "general"
    configFilename = "out/ccle_by_gp.config"
    datasourceFilename = "ccle_results_by_pos.hg19.import.txt"
    dataSourceType = "gp_tsv"
    dataSourceName = "CCLE_By_GP"
    dataSourceVersion = "09292010"
    genomicPositionColumnNames = "chr,start,end"

    builder = GenericTsvDatasourceCreator()
    builder._createConfigFile(
        configFilename=configFilename,
        baseDSFile=datasourceFilename,
        ds_name=dataSourceName,
        ds_type=dataSourceType,
        ds_version=dataSourceVersion,
        indexCols=DatasourceInstallUtils.getIndexCols("gp_tsv", genomicPositionColumnNames),
    )
    cfg = ConfigUtils.createConfigParser(configFilename)

    self.assertTrue(cfg.has_section(section), "general section is missing.")

    # Option names in check order, with the value each one must hold.
    option_names = ["type", "src_file", "title", "version", "genomic_position_cols"]
    expected_values = [
        dataSourceType,
        datasourceFilename,
        dataSourceName,
        dataSourceVersion,
        genomicPositionColumnNames,
    ]

    # First pass: every option has to exist.
    for opt in option_names:
        missing_msg = "%s option is missing in general section." % opt
        self.assertTrue(cfg.has_option(section, opt), missing_msg)

    # Second pass: every option has to carry the value that was passed in.
    for opt, expected in zip(option_names, expected_values):
        actual = cfg.get(section, opt)
        mismatch_msg = "Expected data source %s is %s but was %s." % (opt, expected, actual)
        self.assertEqual(actual, expected, mismatch_msg)
def getGeneTsvConfigFile(self):
    """Create a gene TSV config and check what was written.

    Verifies that the ``general`` section exists, that it has the type,
    src_file, title, version and gene_col options, and that each option equals
    the value handed to ``_createConfigFile``.

    NOTE(review): the name does not start with ``test``, so a default unittest
    loader will not collect this method -- confirm whether that is intended.
    """
    section = "general"
    configFilename = "out/simple_uniprot.config"
    datasourceFilename = "simple_uniprot.out.2011_09.tsv"
    dataSourceType = "gene_tsv"
    dataSourceName = "UniProt"
    dataSourceVersion = "2011_09"
    geneColumnName = "gene"

    builder = GenericTsvDatasourceCreator()
    builder._createConfigFile(
        configFilename=configFilename,
        baseDSFile=datasourceFilename,
        ds_name=dataSourceName,
        ds_type=dataSourceType,
        ds_version=dataSourceVersion,
        indexCols=DatasourceInstallUtils.getIndexCols("gene_tsv", geneColumnName),
    )
    cfg = ConfigUtils.createConfigParser(configFilename)

    self.assertTrue(cfg.has_section(section), "general section is missing.")

    # Option names in check order, with the value each one must hold.
    option_names = ["type", "src_file", "title", "version", "gene_col"]
    expected_values = [
        dataSourceType,
        datasourceFilename,
        dataSourceName,
        dataSourceVersion,
        geneColumnName,
    ]

    # First pass: every option has to exist.
    for opt in option_names:
        missing_msg = "%s option is missing in general section." % opt
        self.assertTrue(cfg.has_option(section, opt), missing_msg)

    # Second pass: every option has to carry the value that was passed in.
    for opt, expected in zip(option_names, expected_values):
        actual = cfg.get(section, opt)
        mismatch_msg = "Expected data source %s is %s but was %s." % (opt, expected, actual)
        self.assertEqual(actual, expected, mismatch_msg)
def main():
    """Build a transcript datasource in a temp dir and copy it to the output dir.

    Steps, all driven by the parsed command-line options:

    1. Create a temporary directory holding a ``<genome_build>/`` build dir.
    2. Write ``<name>.config`` there: a base config is generated as
       ``<name>.config.tmp`` by ``GenericTsvDatasourceCreator`` and then
       re-written with an added ``transcript_filter`` option.
    3. Build the indices from the GTF and FASTA files via
       ``GenomeBuildFactory.construct_ensembl_indices``.
    4. Create the datasource md5 file and copy the whole temp tree to
       ``output_dir``.

    Any ``Exception`` raised during the build is logged, not re-raised. The
    temporary directory is always removed, including on interruption.
    """
    import traceback

    setup_logging()
    args = parseOptions()
    logger = logging.getLogger(__name__)

    gtf_files = args.gtf_files.split(",")
    fasta_files = args.fasta_files.split(",")
    output_dir = args.output_dir
    genome_build = args.genome_build
    name = args.name
    ver = args.version
    tx_filter = args.filter
    protein_map_file = args.protein_map_file

    # create temp dir
    tmpDir = tempfile.mkdtemp(prefix="onco_ensembl_ds_")
    try:
        logger.info("Creating tmp dir (" + tmpDir + ") ....")
        # The trailing slash is kept because this string is logged and passed
        # on to the md5 helper exactly as before.
        ds_build_dir = tmpDir + "/" + genome_build + "/"
        os.mkdir(ds_build_dir)

        if "gencode" not in args.gtf_files.lower() and tx_filter == "basic":
            logger.warning("basic filter requested for (apparently) a non-gencode set of GTFs. If this is an ENSEMBL run (not GENCODE), please specify dummy, using --filter.")

        logger.info("Creating config file...")
        config_filename = os.path.join(ds_build_dir, name + ".config")
        tmp_config_filename = config_filename + ".tmp"
        logger.info("config file being written to: " + os.path.abspath(config_filename))

        base_gtf_name = os.path.basename(gtf_files[0])
        config_file_creator = GenericTsvDatasourceCreator()
        idx_cols = DatasourceInstallUtils.indexCols("dummy_option", "dummy_values")
        config_file_creator._createConfigFile(
            configFilename=tmp_config_filename,
            baseDSFile=base_gtf_name,
            ds_type="ensembl",
            ds_version=ver,
            ds_name=name,
            indexCols=idx_cols,
        )

        # Append the tx_filter to the generated config.
        # NOTE(review): the intermediate .tmp config is left in the build dir
        # and therefore copied to output_dir, as before -- confirm intended.
        config_parser = SafeConfigParser()
        with open(tmp_config_filename, 'r') as fp:
            config_parser.readfp(fp)
        config_parser.set("general", "transcript_filter", tx_filter)

        # Write updated config file
        with open(config_filename, 'w') as fp:
            config_parser.write(fp)

        logger.info("Starting index construction (temp location: " + ds_build_dir + ") ...")
        factory = GenomeBuildFactory()
        factory.construct_ensembl_indices(
            gtf_files,
            fasta_files,
            os.path.join(ds_build_dir, base_gtf_name),
            protein_id_mapping_file=protein_map_file,
        )

        logger.info("Creating datasource md5...")
        DatasourceInstallUtils.create_datasource_md5_file(ds_build_dir)

        logger.info("Copying created datasource from temp directory to final location (" + output_dir + ")...")
        shutil.copytree(symlinks=True, src=tmpDir, dst=output_dir)
    except Exception as e:
        # Top-level boundary: report the failure with its traceback, then fall
        # through to the cleanup below.
        logger.fatal(repr(e) + " " + traceback.format_exc())
        logger.info("If you are getting an error such as: KeyError: 'ENST00000474204.1', then you may be out of disk space in /tmp/.")
    finally:
        # Remove the tempdir, even on KeyboardInterrupt/SystemExit.
        logger.info("Done...")
        logger.info("Removing ..." + tmpDir + '/')
        shutil.rmtree(tmpDir)