Пример #1
0
  def get_genome_assembly_files(self, species: dict, grch37=False) -> list:
    """
    Download the soft-masked top-level genome assembly FASTA file for a species.

    :param species: dict describing the species; the 'name', 'assembly' and 'release' keys are read.
      Note: when grch37 is true, species['assembly'] is overwritten in place with 'GRCh37'.
    :param grch37: if true, fetch the GRCh37 assembly from the 'grch37' branch of the FTP site.
    :return: list holding the value returned by download_file; empty when the species info is
      incomplete (KeyError) or when download_file returned None.
    """
    files = []
    try:
      if grch37:
        # In-place update kept for backward compatibility: callers may read it back from the dict.
        species['assembly'] = 'GRCh37'
      name = species['name']
      file_name = '{}.{}.dna_sm.toplevel.fa.gz'.format(name[0].upper() + name[1:], species['assembly'])

      # Only the URL layout differs between the default and the GRCh37 branch.
      if grch37:
        url_template = '{}/grch37/release-{}/fasta/{}/dna/{}'
      else:
        url_template = '{}/release-{}/fasta/{}/dna/{}'
      base_url = self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_KEY_ENSEMBL_FTP][
        self.CONFIG_KEY_BASE_URL]
      file_url = url_template.format(base_url, species['release'], name, file_name)

      downloaded_file = download_file(file_url, self.get_local_path_root_ensembl_repo() + '/' + file_name,
                                      self.get_logger())
      # A None result means nothing was downloaded; do not report it as a file.
      if downloaded_file is not None:
        files.append(downloaded_file)
    except KeyError:
      self.get_logger().debug("No valid info is available species: %s", species)

    return files
Пример #2
0
  def get_pep_files(self, species: dict, grch37=False) -> list:
    """
    Download the peptide FASTA file ('pep.all.fa.gz') for a species.

    :param species: dict describing the species; the 'name', 'assembly' and 'release' keys are read.
      Note: when grch37 is true, species['assembly'] is overwritten in place with 'GRCh37'.
    :param grch37: if true, fetch the GRCh37 file from the 'grch37' branch of the FTP site.
    :return: list holding the value returned by download_file; empty when the species info is
      incomplete (KeyError) or when download_file returned None.
    """
    files = []
    try:
      # TODO: Would be better to check by API the assembly version
      if grch37:
        # In-place update kept for backward compatibility: callers may read it back from the dict.
        species['assembly'] = 'GRCh37'
      name = species['name']
      file_name = '{}.{}.pep.all.fa.gz'.format(name[0].upper() + name[1:], species['assembly'])

      # Only the URL layout differs between the default and the GRCh37 branch.
      if grch37:
        url_template = '{}/grch37/release-{}/fasta/{}/pep/{}'
      else:
        url_template = '{}/release-{}/fasta/{}/pep/{}'
      base_url = self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_KEY_ENSEMBL_FTP][
        self.CONFIG_KEY_BASE_URL]
      file_url = url_template.format(base_url, species['release'], name, file_name)

      downloaded_file = download_file(file_url, self.get_local_path_root_ensembl_repo() + '/' + file_name,
                                      self.get_logger())
      # A None result means nothing was downloaded; do not report it as a file.
      if downloaded_file is not None:
        files.append(downloaded_file)
    except KeyError:
      self.get_logger().debug("No valid info is available species: %s", species)

    return files
Пример #3
0
    def get_gtf_files(self, species: dict) -> list:
        """
        Download the GTF annotation file for a species.

        The file name is built from the species name (first letter upper-cased),
        assembly and release.

        :param species: dict describing the species; the 'name', 'assembly' and 'release' keys are read.
        :return: list holding the value returned by download_file; empty when the species info is
          incomplete (KeyError) or when download_file returned None.
        """
        files = []
        try:
            name = species['name']
            file_name = '{}.{}.{}.gtf.gz'.format(
                name[0].upper() + name[1:],
                species['assembly'],
                species['release'],
            )
            base_url = self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][
                self.CONFIG_KEY_ENSEMBL_FTP][self.CONFIG_KEY_BASE_URL]
            file_url = '{}/release-{}/gtf/{}/{}'.format(
                base_url, species['release'], name, file_name)
            downloaded_file = download_file(
                file_url,
                self.get_local_path_root_ensembl_repo() + '/' + file_name,
                self.get_logger())
            # A None result means nothing was downloaded; do not report it as a file.
            if downloaded_file is not None:
                files.append(downloaded_file)
        except KeyError:
            # The species must go through a %s placeholder: passing it as an extra
            # argument without one makes the logging call fail at format time.
            self.get_logger().debug("No valid info is available species: %s",
                                    species)

        return files
Пример #4
0
 def download_one_study(self, download_study):
     """
     Download the '<study>.tar.gz' archive of a single study into the local cBioPortal repo folder.

     :param download_study: study identifier used to build the archive name and URL.
     :return: the value returned by download_file; None is logged as "hasn't been downloaded".
     """
     archive_name = '{}.tar.gz'.format(download_study)
     downloader_config = self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER]
     archive_url = '{}/{}'.format(downloader_config[self.CONFIG_KEY_CBIOPORTAL_DOWNLOAD_URL], archive_name)
     target_path = self.get_local_path_root_cbioportal_repo() + '/' + archive_name

     downloaded = download_file(archive_url, target_path, self.get_logger())

     # Report the outcome at debug level; the result is returned to the caller either way.
     if downloaded is None:
         template = "The following study '{}' hasn't been downloaded. "
     else:
         template = "The following study '{}' has been downloaded. "
     self.get_logger().debug(template.format(download_study))
     return downloaded
Пример #5
0
  def get_vcf_files(self, species: dict) -> list:
    """
    Download the variation VCF file(s) ('*_incl_consequences*.vcf.gz') for a species.

    The single combined file is tried first. If it cannot be downloaded and the species is
    'homo_sapiens', the per-chromosome files are tried instead: chr1 first and, only if chr1
    was downloaded, chr2-22, X, Y and MT. The per-chromosome files are returned as separate
    entries; they are not merged here.

    :param species: dict describing the species; the 'name' and 'release' keys are read.
    :return: list of the non-None values returned by download_file; empty when the species
      info is incomplete (KeyError) or nothing was downloaded.
    """
    files = []
    try:
      name = species['name']
      folder_url = '{}/release-{}/variation/vcf/{}/'.format(
        self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_KEY_ENSEMBL_FTP][
          self.CONFIG_KEY_BASE_URL], species['release'], name)

      def fetch(file_name):
        # Download one file from the species VCF folder into the local Ensembl repo folder.
        return download_file(folder_url + file_name, self.get_local_path_root_ensembl_repo() + '/' + file_name,
                             self.get_logger())

      combined_file = fetch('{}_incl_consequences.vcf.gz'.format(name))
      if combined_file is not None:
        files.append(combined_file)

      elif name == 'homo_sapiens':
        # For humans the variants are stored per chromosome, so each chromosome file is fetched.
        chromosomes = [str(number) for number in range(1, 23)] + ['X', 'Y', 'MT']
        first_file = fetch('{}_incl_consequences-chr{}.vcf.gz'.format(name, chromosomes[0]))
        if first_file is not None:
          # chr1 is available, so try all the others
          files.append(first_file)
          for chromosome in chromosomes[1:]:
            downloaded_file = fetch('{}_incl_consequences-chr{}.vcf.gz'.format(name, chromosome))
            # A None result means nothing was downloaded; do not report it as a file.
            if downloaded_file is not None:
              files.append(downloaded_file)

    except KeyError:
      # The species must go through a %s placeholder: passing it as an extra
      # argument without one makes the logging call fail at format time.
      self.get_logger().debug("No valid info is available species: %s", species)

    return files
Пример #6
0
    def get_ncrna_files(self, species: dict) -> list:
        """
        Download the ncRNA FASTA file ('ncrna.fa.gz') for a species.

        :param species: dict describing the species; the 'name', 'assembly' and 'release' keys are read.
        :return: list holding the value returned by download_file; empty when the species info is
          incomplete (KeyError) or when download_file returned None.
        """
        files = []
        try:
            name = species['name']
            file_name = '{}.{}.ncrna.fa.gz'.format(
                name[0].upper() + name[1:],
                species['assembly'])
            base_url = self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][
                self.CONFIG_KEY_ENSEMBL_FTP][self.CONFIG_KEY_BASE_URL]
            file_url = '{}/release-{}/fasta/{}/ncrna/{}'.format(
                base_url, species['release'], name, file_name)
            downloaded_file = download_file(
                file_url,
                self.get_local_path_root_ensembl_repo() + '/' + file_name,
                self.get_logger())
            # A None result means nothing was downloaded; do not report it as a file.
            if downloaded_file is not None:
                files.append(downloaded_file)
        except KeyError:
            self.get_logger().debug("No valid info is available species: %s",
                                    species)

        return files