Примеры использования ConfigHandler в Python

Язык программирования: Python

Пространство имен/Пакет: handlers.refseq.confighandler

Класс/Тип: ConfigHandler

Примеров на hotexamples.com: 7

Python ConfigHandler — найдено 7 примеров. Это лучшие примеры кода на Python для handlers.refseq.confighandler.ConfigHandler, взятые из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

ConfigHandler(7)

get(1)

Основные методы

ConfigHandler (7)

get (1)

Пример #1

Показать файл

Файл: databasehandler.py Проект: Ensembl/tark-refseq-loader

    def load_release_set(self, assembly_id, session_id, data_release_set=None):
        """Insert a ``release_set`` row and return what ``insert_data`` reports.

        When ``data_release_set`` is omitted, the record is assembled from the
        default configuration section, today's date and the supplied ids.  A
        ``release_checksum`` entry, computed over the record's current values,
        is then added to the mapping before it is handed to ``insert_data`` as
        the statement parameters.  A mapping supplied by the caller is
        therefore modified in place.
        """
        if data_release_set is None:
            release_date = datetime.now().date()
            defaults = ConfigHandler().getInstance().get_section_config()
            data_release_set = collections.OrderedDict([
                ("shortname", defaults["shortname"]),
                ("description", defaults["description"]),
                ("assembly_id", str(assembly_id)),
                ("release_date", str(release_date)),
                ("session_id", str(session_id)),
                ("source_id", defaults["source"]),
            ])

        # The checksum is taken over the values in their current order, before
        # the checksum entry itself is appended.
        field_values = list(data_release_set.values())
        data_release_set["release_checksum"] = ChecksumHandler.checksum_list(
            field_values)

        # On a duplicate key the statement sets release_id to
        # LAST_INSERT_ID(release_id) -- presumably so the id of the existing
        # row is what gets reported back; confirm against insert_data.
        statement = (
            "INSERT INTO release_set (shortname, description, assembly_id, release_date, session_id, \
                                release_checksum, source_id) VALUES \
                                (%(shortname)s,  %(description)s, %(assembly_id)s, %(release_date)s,  %(session_id)s, \
                                X%(release_checksum)s, %(source_id)s)\
                                ON DUPLICATE KEY UPDATE release_id=LAST_INSERT_ID(release_id)"
        )

        return self.insert_data(statement, data_release_set)

Пример #2

Показать файл

Файл: databasehandler.py Проект: Ensembl/tark-refseq-loader

    def __init__(self, db_config=None, mypool_name="mypool", pool_size=32):
        """Create a MySQL connection pool and keep one connection from it.

        Args:
            db_config: Mapping providing the ``user``, ``pass``, ``port``,
                ``host`` and ``database`` entries.  When ``None``, the
                ``DATABASE`` section of the configuration is used.
            mypool_name: Name given to the connection pool.
            pool_size: Size requested for the pool.  Defaults to 32, the value
                that used to be hard-coded here.

        The connection obtained from the pool is stored as ``self.db_con``.
        """
        if db_config is None:
            db_config = ConfigHandler().getInstance().get_section_config(
                section_name="DATABASE")

        # Lazy %-formatting: a missing "database" entry is logged as None
        # instead of raising TypeError from ``str + None`` concatenation.
        logger.info("loading in to  %s", db_config.get("database"))

        # The configuration calls the password "pass"; the connector expects
        # the keyword "password".
        mydbconfig = {
            "user": db_config.get("user"),
            "password": db_config.get("pass"),
            "port": db_config.get("port"),
            "host": db_config.get("host"),
            "database": db_config.get("database")
        }

        connection_pool = mysql.connector.pooling.MySQLConnectionPool(
            pool_name=mypool_name,
            pool_size=pool_size,
            pool_reset_session=True,
            **mydbconfig)
        print(connection_pool)
        connection_obj = connection_pool.get_connection()

        self.db_con = connection_obj

Пример #3

Показать файл

Файл: databasehandler.py Проект: Ensembl/tark-refseq-loader

    def populate_parent_tables(self, init_table_list=None):
        """Populate the parent tables and return the ids that were created.

        Args:
            init_table_list: Names of the parent tables to populate.  When
                ``None``, all of ``session``, ``genome``, ``assembly``,
                ``assembly_alias`` and ``release_source`` are populated.

        Returns:
            dict: Ids keyed by ``session_id``, ``genome_id``, ``assembly_id``,
            ``assembly_alias_id``, ``release_source_ensembl`` and
            ``release_source_refseq`` (only for the tables that were
            populated), plus ``release_id`` for the release set, which is
            always loaded.
        """
        if init_table_list is None:
            init_table_list = [
                "session", "genome", "assembly", "assembly_alias",
                "release_source"
            ]

        # Ids of skipped tables stay None and are passed on as the string
        # "None" by the str() calls below.
        session_id = None
        genome_id = None
        assembly_id = None
        # Bound up front so the release-set section below never hits an
        # unbound local when "release_source" is not requested.
        release_source_refseq = None

        parent_ids = {}
        if "session" in init_table_list:
            session_id = self.start_session("Refseq Client " +
                                            str(time.time()))
            parent_ids['session_id'] = session_id
            print(".........Popultating SESSION table.........\n")

        if "genome" in init_table_list:
            genome_data = {
                "name": "homo_sapiens",
                "tax_id": str(9606),
                "session_id": str(session_id)
            }
            genome_id = self.load_genome(genome_data)
            parent_ids['genome_id'] = genome_id
            print(".........Popultating GENOME table.........\n")

        if "assembly" in init_table_list:
            assembly_data = {
                "genome_id": str(genome_id),
                "assembly_name": "GRCh38",
                "session_id": str(session_id)
            }
            assembly_id = self.load_assembly(assembly_data)
            parent_ids['assembly_id'] = assembly_id
            logger.info(".........Popultating ASSEMBLY table.........\n")

        if "assembly_alias" in init_table_list:
            assembly_alias_data = {
                "alias": "GCA_000001405.25",
                "genome_id": str(genome_id),
                "assembly_id": str(assembly_id),
                "session_id": str(session_id)
            }
            assembly_alias_id = self.load_assembly_alias(assembly_alias_data)
            parent_ids['assembly_alias_id'] = assembly_alias_id
            logger.info(".........Popultating ASSEMBLY ALIAS table.........\n")

        if "release_source" in init_table_list:
            release_source = {
                "shortname": "Ensembl",
                "description": "Ensembl data imports from Human Core DBs"
            }
            release_source_ensembl = self.load_release_source(release_source)
            parent_ids['release_source_ensembl'] = release_source_ensembl
            logger.info(".........Popultating RELEASE SOURCE table.........\n")

            release_source = {
                "shortname": "RefSeq",
                "description": "RefSeq data imports from RefSeq GFF"
            }
            release_source_refseq = self.load_release_source(release_source)
            parent_ids['release_source_refseq'] = release_source_refseq
            logger.info(".........Popultating REFSEQ table.........\n")

        # load data_release_set
        today = datetime.now().date()
        default_config = ConfigHandler().getInstance().get_section_config()

        if "release_source" in init_table_list:
            source_id = str(release_source_refseq)
        else:
            # No RefSeq source row was loaded in this call: use the configured
            # source, the same value load_release_set uses when it builds the
            # record itself.
            source_id = default_config["source"]

        data_release_set = collections.OrderedDict()
        data_release_set["shortname"] = default_config["shortname"]
        data_release_set["description"] = default_config["description"]
        data_release_set["assembly_id"] = str(assembly_id)
        data_release_set["release_date"] = str(today)
        data_release_set["session_id"] = str(session_id)
        data_release_set["source_id"] = source_id
        release_set_id = self.load_release_set(assembly_id, session_id,
                                               data_release_set)
        parent_ids['release_id'] = release_set_id

        return parent_ids

Пример #4

Показать файл

class AnnotationHandler(object):
    """Build plain-dict annotations for genes, transcripts, exons and translations.

    Every method is a classmethod.  The dictionaries carry location fields,
    the configured assembly id/name and checksums computed by
    ``ChecksumHandler``; ``session_id`` is always set to ``None`` here.
    """

    # Both values are read from the default configuration section when the
    # class body is executed.
    ASSEMBLY_ID = ConfigHandler().getInstance().get_section_config(
    )["assembly_id"]
    ASSEMBLY_NAME = ConfigHandler().getInstance().get_section_config(
    )["assembly_name"]

    @classmethod
    def get_annotated_gene(cls, chrom, gene_feature):
        """Return the annotation dict for a gene feature.

        ``stable_id`` is the ``GeneID`` entry of the ``Dbxref`` qualifier (or
        ``None``), ``stable_id_version`` is fixed to 1, and ``hgnc_id`` is the
        ``HGNC:HGNC`` entry prefixed with ``"HGNC:"`` (or ``None``).
        """
        gene = {}
        # NOTE(review): unlike transcripts, the start is not shifted by one
        # base here -- confirm this is intended.
        gene['loc_start'] = str(gene_feature.location.start)
        gene['loc_end'] = str(gene_feature.location.end)
        gene['loc_strand'] = str(gene_feature.location.strand)
        gene['loc_region'] = str(chrom)
        gene['stable_id'] = cls.parse_qualifiers(gene_feature.qualifiers,
                                                 "Dbxref", "GeneID")
        gene['stable_id_version'] = 1
        gene['assembly_id'] = cls.ASSEMBLY_ID
        gene['assembly_name'] = cls.ASSEMBLY_NAME
        # make it none for the moment, otherwise you will get integrity exception
        hgnc_id = cls.parse_qualifiers(gene_feature.qualifiers, "Dbxref",
                                       "HGNC:HGNC")
        if hgnc_id is not None:
            hgnc_id = "HGNC:" + hgnc_id
        gene['hgnc_id'] = hgnc_id

        gene['session_id'] = None
        gene['loc_checksum'] = ChecksumHandler.get_location_checksum(gene)
        gene['gene_checksum'] = ChecksumHandler.get_gene_checksum(gene)
        return gene

    @classmethod
    def get_annotated_transcript(cls, sequence_handler, chrom, mRNA_feature):
        """Return the annotation dict for a transcript feature.

        The first ``transcript_id`` qualifier must have the form
        ``<stable_id>.<version>``; any other number of dots makes the tuple
        unpacking raise ``ValueError``.  ``transcript_checksum`` and
        ``exon_set_checksum`` are left as ``None`` for the caller to fill in.
        """
        transcript = {}
        # Note we have shifted one base here
        transcript['assembly_id'] = cls.ASSEMBLY_ID
        transcript['assembly_name'] = cls.ASSEMBLY_NAME
        transcript['loc_start'] = str(mRNA_feature.location.start + 1)
        transcript['loc_end'] = str(mRNA_feature.location.end)
        transcript['loc_strand'] = str(mRNA_feature.location.strand)
        transcript['loc_region'] = str(chrom)
        stable_id = mRNA_feature.qualifiers['transcript_id'][0]
        (transcript_stable_id,
         transcript_stable_id_version) = stable_id.split(".")
        transcript['stable_id'] = transcript_stable_id
        transcript['stable_id_version'] = transcript_stable_id_version
        transcript['session_id'] = None
        transcript['transcript_checksum'] = None
        transcript['exon_set_checksum'] = None
        transcript['loc_checksum'] = ChecksumHandler.get_location_checksum(
            transcript)
        transcript['sequence'] = sequence_handler.get_sequence_by_id(
            mRNA_feature.qualifiers['transcript_id'][0])
        transcript['seq_checksum'] = ChecksumHandler.get_seq_checksum(
            transcript, 'sequence')
        return transcript

    @classmethod
    def get_annotated_exons(cls, sequence_handler, seq_region,
                            transcript_identifier, refseq_exon_list):
        """Return annotated exon dicts for a transcript.

        Sequences are fetched using the coordinates produced by
        ``ExonUtils.compute_exon_coordinates`` on a copy of
        ``refseq_exon_list``.  Returns ``None`` when the number of fetched
        sequences differs from the number of input exons.
        """
        refseq_exon_list_relative_coordinates = ExonUtils.compute_exon_coordinates(
            refseq_exon_list.copy())
        # Ref: BioSeqFeature
        # Note that the start and end location numbering follow Python's scheme,
        # thus a GenBank entry of 123..150 (one based counting) becomes a location
        # of [122:150] (zero based counting).
        exon_sequences = [
            str(sequence_handler.get_seq_record_by_id_location(
                transcript_identifier, exon['exon_start'], exon['exon_end'],
                int(exon['exon_strand'])))
            for exon in refseq_exon_list_relative_coordinates
        ]

        # zip() below would silently truncate on a length mismatch, so bail out.
        if len(refseq_exon_list) != len(exon_sequences):
            return None

        return [
            cls.get_annotated_exon(seq_region, exon_feature, exon_sequence)
            for exon_feature, exon_sequence in zip(refseq_exon_list,
                                                   exon_sequences)
        ]

    @classmethod
    def get_annotated_exon(cls, seq_region, exon_feature, exon_sequence):
        """Return the annotation dict for one exon and its sequence.

        ``exon_feature`` must provide ``exon_start``, ``exon_end``,
        ``exon_strand``, ``exon_order``, ``exon_stable_id`` and
        ``exon_stable_id_version``.
        """
        exon = {}
        exon['assembly_id'] = cls.ASSEMBLY_ID
        exon['assembly_name'] = cls.ASSEMBLY_NAME
        exon['loc_start'] = exon_feature["exon_start"]
        exon['loc_end'] = exon_feature["exon_end"]
        exon['loc_strand'] = exon_feature["exon_strand"]
        exon['loc_region'] = str(seq_region)
        exon['loc_checksum'] = ChecksumHandler.get_location_checksum(exon)
        exon['exon_order'] = exon_feature["exon_order"]
        exon['stable_id'] = exon_feature["exon_stable_id"]
        exon['stable_id_version'] = exon_feature["exon_stable_id_version"]
        exon['session_id'] = None
        exon['exon_seq'] = exon_sequence
        exon['seq_checksum'] = ChecksumHandler.get_seq_checksum(
            exon, 'exon_seq')
        exon['exon_checksum'] = ChecksumHandler.get_exon_checksum(exon)

        return exon

    @classmethod
    def get_annotated_cds(cls, protein_sequence_handler, seq_region,
                          protein_id, cds_list):
        """Return the translation annotation dict for a list of CDS dicts.

        The ``protein_id`` argument is not used: the protein id and the strand
        are both taken from the first entry of ``cds_list``.  The protein id
        must have the form ``<stable_id>.<version>``.
        """
        cds_strand = cds_list[0]['cds_strand']
        # Overrides the argument, as the original implementation did.
        protein_id = cds_list[0]['protein_id']
        (stable_id, stable_id_version) = protein_id.split(".")

        (translation_start,
         translation_end) = cls.get_translation_loc(cds_list)
        translation = {}
        translation['assembly_id'] = cls.ASSEMBLY_ID
        translation['assembly_name'] = cls.ASSEMBLY_NAME
        translation['stable_id'] = stable_id
        translation['stable_id_version'] = stable_id_version
        translation['loc_start'] = translation_start
        translation['loc_end'] = translation_end
        translation['loc_strand'] = cds_strand
        translation['loc_region'] = seq_region
        translation[
            'translation_seq'] = protein_sequence_handler.get_fasta_seq_by_id(
                protein_id)
        translation['seq_checksum'] = ChecksumHandler.get_seq_checksum(
            translation, 'translation_seq')
        translation['session_id'] = None
        translation['loc_checksum'] = ChecksumHandler.get_location_checksum(
            translation)

        translation[
            'translation_checksum'] = ChecksumHandler.get_translation_checksum(
                translation)

        return translation

    @classmethod
    def get_translation_loc(cls, cds_list):
        """Return ``(start, end)`` of the translation spanned by ``cds_list``.

        The strand is read from the first CDS.  On strand ``'1'`` the start
        comes from the CDS with ``cds_order == 1`` and the end from the one
        with ``cds_order == len(cds_list)``; on strand ``'-1'`` the two are
        swapped.  ``(0, 0)`` is returned when the list is empty, the strand is
        anything else, or no CDS has the required order.
        """
        if not cds_list:
            return (0, 0)

        strand = cds_list[0]['cds_strand']
        last_order = len(cds_list)
        if strand == '1':
            start_order, end_order = 1, last_order
        elif strand == '-1':
            start_order, end_order = last_order, 1
        else:
            # Previously this fell through with cds_start/cds_end unbound and
            # raised UnboundLocalError (e.g. for a strand of 'None' or '0').
            return (0, 0)

        cds_start = [
            cds['cds_start'] for cds in cds_list
            if cds['cds_order'] == start_order
        ]
        cds_end = [
            cds['cds_end'] for cds in cds_list
            if cds['cds_order'] == end_order
        ]

        if cds_start and cds_end:
            return (cds_start[0], cds_end[0])
        return (0, 0)

    @classmethod
    def parse_qualifiers(cls, qualifiers, key_qualifier, attr=None):
        """Return the value following ``attr:`` in ``qualifiers[key_qualifier]``.

        Each entry under ``key_qualifier`` is matched from its start, ignoring
        case, against ``attr`` followed by a colon; ``attr`` is inserted into
        the regular expression unescaped.  The first non-empty remainder is
        returned as a string.  ``None`` is returned when the key is absent,
        ``attr`` is ``None`` or nothing matches.
        """
        if key_qualifier not in qualifiers or attr is None:
            return None

        # Built once; it does not change between entries.
        my_regex = attr + ":" + "(.*)"
        for cur_qualifier in qualifiers[key_qualifier]:
            matchObj = re.match(my_regex, cur_qualifier,
                                re.M | re.I)  # @IgnorePep8
            if matchObj and matchObj.group(1):
                return str(matchObj.group(1))
        return None

    @classmethod
    def get_seq_region_from_refseq_accession(cls, refseq_accession):
        """Map an ``NC_<digits>.<digits>`` accession to a sequence region.

        The digits after ``NC_`` are parsed as an integer: 23 maps to ``"X"``,
        24 to ``"Y"`` and any other value is returned as an ``int``.  ``None``
        is returned when the accession does not match or the number is 0.
        """
        matchObj = re.match(r'NC_(\d+)\.\d+', refseq_accession,
                            re.M | re.I)  # @IgnorePep8

        if matchObj and matchObj.group(1):
            chrom = int(matchObj.group(1))
            if chrom == 23:
                return "X"
            elif chrom == 24:
                return "Y"
            else:
                return chrom
        return None

    @classmethod
    def add_feature_sequence(cls,
                             fasta_handler,
                             feature_locations,
                             feature_id,
                             feature_type='exon'):
        """Attach a ``<feature_type>_seq`` entry to every feature dict.

        The sequence is fetched with ``fasta_handler.get_fasta_seq_by_id``
        using the feature's ``<feature_type>_start`` and ``<feature_type>_end``
        values.  The input dicts are updated in place and also returned in a
        new list.
        """
        features_with_seq = []
        for feature in feature_locations:
            feature_seq = fasta_handler.get_fasta_seq_by_id(
                feature_id, feature[feature_type + '_start'],
                feature[feature_type + '_end'])
            feature[feature_type + '_seq'] = feature_seq
            features_with_seq.append(feature)
        return features_with_seq

Пример #5

Показать файл

Файл: parse_gff_file.py Проект: bethflint/tark-refseq-loader

    def run(self):
        """Parse the GFF records for this task and store the annotated genes.

        Every non-``region`` feature of every parsed record is annotated as a
        gene together with its transcripts.  Unless ``self.dryrun`` is set,
        genes that have a ``stable_id`` are saved through ``FeatureHandler``.
        After all records were processed, a status file containing ``Done`` is
        written to ``<download_dir>/status_logs`` and its path is stored in
        ``self.status_file``.

        The database connection and the GFF file are released even when
        parsing or saving raises; the status file is not written in that case.
        """
        mydb_config = ConfigHandler().getInstance().get_section_config(section_name="DATABASE")
        dbh = DatabaseHandler(db_config=mydb_config,
                              mypool_name="mypool_" + str(self.seq_region))
        dbc = dbh.get_connection()

        try:
            sequence_handler = FastaHandler(self.downloaded_files['fasta'])

            print("Loading protein.....")
            print(self.downloaded_files['protein'])
            protein_sequence_handler = FastaHandler(self.downloaded_files['protein'])

            print("Working on Seq region limit " + str(self.seq_region))

            with open(self.downloaded_files['gff']) as gff_handle:
                # Chromosome seq level
                for rec in GFF.parse(gff_handle, limit_info=self.limits, target_lines=1000):
                    for gene_feature in rec.features:
                        # skip regions
                        if gene_feature.type == "region":
                            continue

                        annotated_gene = AnnotationHandler.get_annotated_gene(self.seq_region, gene_feature)

                        # gene level
                        annotated_transcripts = []
                        for mRNA_feature in gene_feature.sub_features:
                            annotated_transcript = self._annotate_transcript(
                                mRNA_feature, sequence_handler, protein_sequence_handler)
                            if annotated_transcript is not None:
                                annotated_transcripts.append(annotated_transcript)

                        annotated_gene['transcripts'] = annotated_transcripts
                        feature_object_to_save = {"gene": annotated_gene}

                        if not self.dryrun and annotated_gene['stable_id'] is not None:
                            print("About to load gene => " + str(annotated_gene['stable_id']))
                            feature_handler = FeatureHandler(parent_ids=self.parent_ids, dbc=dbc)
                            feature_handler.save_features_to_database(feature_object_to_save)
        finally:
            dbc.close()

        print("About to write to the status file")
        status_dir = self.download_dir + '/' + 'status_logs'
        os.makedirs(status_dir, exist_ok=True)
        self.status_file = status_dir + '/' + 'status_file_chr' + str(self.seq_region)
        with open(self.status_file, "w") as status_handle:
            status_handle.write("Done")

    def _annotate_transcript(self, mRNA_feature, sequence_handler, protein_sequence_handler):
        """Return the annotated transcript for one gene sub-feature, or None if it has no transcript_id."""
        if 'transcript_id' not in mRNA_feature.qualifiers:
            return None
        transcript_id = mRNA_feature.qualifiers['transcript_id'][0]

        refseq_exon_list = []
        refseq_exon_order = 1

        refseq_cds_list = []
        refseq_cds_order = 1
        for mRNA_sub_feature in mRNA_feature.sub_features:
            # Both checks are substring tests on the feature type and are
            # independent of each other.
            if 'exon' in mRNA_sub_feature.type:
                refseq_exon_dict = {}
                refseq_exon_dict['exon_stable_id'] = str(mRNA_sub_feature.id)
                refseq_exon_dict['exon_stable_id_version'] = 1  # dummy version
                refseq_exon_dict['exon_order'] = refseq_exon_order
                # note that we are shifting one base here
                refseq_exon_dict['exon_start'] = str(mRNA_sub_feature.location.start + 1)
                refseq_exon_dict['exon_end'] = str(mRNA_sub_feature.location.end)
                refseq_exon_dict['exon_strand'] = str(mRNA_sub_feature.location.strand)
                refseq_exon_list.append(refseq_exon_dict)
                refseq_exon_order += 1

            if 'CDS' in mRNA_sub_feature.type:
                refseq_cds_dict = {}
                refseq_cds_dict['cds_order'] = refseq_cds_order
                # note that we are shifting one base here
                refseq_cds_dict['cds_start'] = str(mRNA_sub_feature.location.start + 1)
                refseq_cds_dict['cds_end'] = str(mRNA_sub_feature.location.end)
                refseq_cds_dict['cds_strand'] = str(mRNA_sub_feature.location.strand)
                refseq_cds_dict['cds_id'] = str(mRNA_sub_feature.id)
                refseq_cds_dict['protein_id'] = str(mRNA_sub_feature.qualifiers['protein_id'][0])  # @IgnorePep8
                refseq_cds_list.append(refseq_cds_dict)
                refseq_cds_order += 1

        annotated_transcript = AnnotationHandler.get_annotated_transcript(
            sequence_handler, self.seq_region, mRNA_feature)

        # add sequence and other annotations
        # NOTE(review): when there are no exons at all, the 'exons' key is
        # never set (unlike 'translation' below) -- confirm consumers cope.
        if len(refseq_exon_list) > 0:
            annotated_exons = AnnotationHandler.get_annotated_exons(
                sequence_handler, self.seq_region, transcript_id, refseq_exon_list)

            if annotated_exons is not None and len(annotated_exons) > 0:
                exon_set_checksum = ChecksumHandler.get_exon_set_checksum(annotated_exons)
                annotated_transcript['exon_set_checksum'] = exon_set_checksum
                annotated_transcript['exons'] = annotated_exons
            else:
                annotated_transcript['exons'] = []

        if len(refseq_cds_list) > 0:
            protein_id = refseq_cds_list[0]['protein_id']
            annotated_transcript['translation'] = AnnotationHandler.get_annotated_cds(
                protein_sequence_handler, self.seq_region, protein_id, refseq_cds_list)
        else:
            annotated_transcript['translation'] = []

        # Computed last, once exons and translation are attached.
        annotated_transcript['transcript_checksum'] = ChecksumHandler.get_transcript_checksum(annotated_transcript)  # @IgnorePep8
        return annotated_transcript

Пример #6

Показать файл

Файл: parse_gff_file.py Проект: bethflint/tark-refseq-loader

    def requires(self):
        """Yield one ``ParseRecord`` task per selected chromosome accession.

        File paths are built from the ``gff_file``, ``fasta_file`` and
        ``protein_file`` entries of the ``DEFAULT`` configuration section with
        their last extension removed.  Unless ``self.dryrun`` is set, the
        parent tables are populated first and the resulting ids are handed to
        every task.  When ``self.limit_chr`` is set (a single region or a
        comma-separated list), only accessions whose sequence region is listed
        are yielded.
        """
        mydefault_config = ConfigHandler().getInstance().get_section_config(section_name="DEFAULT")

        # Only the stem is kept; the last extension is dropped -- presumably
        # because the parser works on the unzipped files; confirm.
        downloaded_files = {
            file_kind: self.download_dir + "/" + os.path.splitext(mydefault_config[file_kind + '_file'])[0]
            for file_kind in ('gff', 'fasta', 'protein')
        }

        # load the parent tables
        parent_ids = None

        if not self.dryrun:
            mydb_config = ConfigHandler().getInstance().get_section_config(section_name="DATABASE")
            dbh = DatabaseHandler(db_config=mydb_config,
                                  mypool_name="mypool_parentids")
            print(dbh)
            # NOTE(review): this connection is not closed here -- confirm
            # whether FeatureHandler takes ownership of it.
            feature_handler = FeatureHandler(dbc=dbh.get_connection())
            parent_ids = feature_handler.populate_parent_tables()

        print(downloaded_files['gff'])

        # The accessions are hard-coded to save time.  They could instead be
        # discovered by examining the file:
        #         with open(downloaded_files['gff']) as gff_handle_examiner:
        #             possible_limits = examiner.available_limits(gff_handle_examiner)
        #             chromosomes = sorted(possible_limits["gff_id"].keys())
        chromosomes = [
            ('NC_000001.11',),
            ('NC_000002.12',),
            ('NC_000003.12',),
            ('NC_000004.12',),
            ('NC_000005.10',),
            ('NC_000006.12',),
            ('NC_000007.14',),
            ('NC_000008.11',),
            ('NC_000009.12',),
            ('NC_000010.11',),
            ('NC_000011.10',),
            ('NC_000012.12',),
            ('NC_000013.11',),
            ('NC_000014.9',),
            ('NC_000015.10',),
            ('NC_000016.10',),
            ('NC_000017.11',),
            ('NC_000018.10',),
            ('NC_000019.10',),
            ('NC_000020.11',),
            ('NC_000021.9',),
            ('NC_000022.11',),
            ('NC_000023.11',),
            ('NC_000024.10',),
            ('NC_012920.1',),
        ]

        # The region filter does not depend on the chromosome, so build it
        # once.  split(',') also covers the single-region case.
        filter_regions = None
        if self.limit_chr is not None:
            filter_regions = self.limit_chr.split(',')

        for chrom_tuple in chromosomes:
            chrom = chrom_tuple[0]
            if not chrom.startswith("NC_"):
                continue
            print(chrom_tuple)

            seq_region = self.get_seq_region_from_refseq_accession(chrom)

            # Restrict only for filter_region
            if filter_regions is not None and str(seq_region) not in filter_regions:
                print(" Skipping " + str(seq_region))
                continue

            yield ParseRecord(
                download_dir=self.download_dir,
                downloaded_files=downloaded_files,
                seq_region=str(seq_region),
                parent_ids=parent_ids,
                # A fresh dict per task: reusing one mutable dict would let a
                # later iteration overwrite the "gff_id" of earlier tasks.
                limits={"gff_id": chrom_tuple},
                dryrun=self.dryrun
            )

Пример #7

Показать файл

Файл: download_refseq_files.py Проект: Ensembl/tark-refseq-loader

class DownloadRefSeqSourceFiles(luigi.WrapperTask):
    """
    Wrapper task that requires a download task and an unzip task for each
    configured RefSeq source file (gff, fasta and protein).
    """

    download_dir = luigi.Parameter()
    task_namespace = 'DownloadRefSeqSourceFiles'

    # Release metadata, read from the default configuration section and
    # echoed when the class body is executed.
    assembly_id = ConfigHandler().getInstance().get_section_config()["assembly_id"]
    assembly_name = ConfigHandler().getInstance().get_section_config()["assembly_name"]
    source = ConfigHandler().getInstance().get_section_config()["source"]
    shortname = ConfigHandler().getInstance().get_section_config()["shortname"]
    description = ConfigHandler().getInstance().get_section_config()["description"]
    print("Assembly ID " + str(assembly_id))
    print("Assembly Name " + str(assembly_name))
    print("source name " + str(source))
    print("shortname " + str(shortname))
    print("description  " + str(description))

    # Download location and file names, also taken from the configuration.
    ftp_root = ConfigHandler().getInstance().get_section_config()["ftp_root"]
    gff_file = ConfigHandler().getInstance().get_section_config()["gff_file"]
    fasta_file = ConfigHandler().getInstance().get_section_config()["fasta_file"]
    protein_file = ConfigHandler().getInstance().get_section_config()["protein_file"]

    print("ftp_root {}".format(ftp_root))
    print("gff_file {}".format(gff_file))
    print("fasta_file {}".format(fasta_file))
    print("protein_file {}".format(protein_file))

    # Example values that used to be hard-coded here:
    #     ftp_root = 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.38_GRCh38.p12/'
    #     gff_file = 'GCF_000001405.38_GRCh38.p12_genomic.gff.gz'
    #     fasta_file = 'GCF_000001405.38_GRCh38.p12_rna.fna.gz'
    #     protein_file = 'GCF_000001405.38_GRCh38.p12_protein.faa.gz'

    files_to_download = [gff_file, fasta_file, protein_file]

    def complete(self):
        """Return True when every file exists both zipped and unzipped.

        The zipped path is ``download_dir/<file>``; the unzipped path is
        ``download_dir/<basename of file without its last extension>``.
        """
        presence = []
        for name in self.files_to_download:
            zipped_path = self.download_dir + '/' + name
            stem = os.path.splitext(os.path.basename(name))[0]
            unzipped_path = self.download_dir + '/' + stem
            presence.append(
                os.path.exists(zipped_path) and os.path.exists(unzipped_path))
        return all(presence)

    def requires(self):
        """Yield a download task followed by an unzip task for each file."""
        for name in self.files_to_download:
            task_kwargs = dict(
                download_dir=self.download_dir,
                file_to_download=name,
                ftp_root=self.ftp_root,
            )
            yield DownloadRefSeqSourceFile(**task_kwargs)
            yield UnzipRefSeqFile(**task_kwargs)