def retrieve(self):
    """Fetch every chromosome, merge the sequences into one FASTA and index it.

    Steps, in order:

    1. Ensure ``self.reference_dir`` and its ``chromosomes`` sub-directory
       exist; a warning is printed when the reference directory is already
       present.
    2. Call ``self._dl_wrapper`` once per entry of ``self.CHROMOSOMES``
       through ``Utilities.single_core_queue``.
    3. Pass the results to ``self._parse_gzip_fna``, flatten the parsed
       records into ``self.parsed_records`` and write them to
       ``hg19.fasta`` inside the reference directory
       (``self.nfasta_file``).
    4. Store the value returned by ``self.describer.get_index_guide`` for
       that FASTA file in ``self.index_dir``.
    """
    if os.path.exists(self.reference_dir):
        warning_template = "Warning! The reference path exists: '{}'"
        print(warning_template.format(self.reference_dir))
    os.makedirs(self.reference_dir, exist_ok=True)

    download_dir = os.path.join(self.reference_dir, "chromosomes")
    os.makedirs(download_dir, exist_ok=True)

    # The single-core queue is deliberate: UCSC returns HTTP 530 when
    # attempting to download in multi-thread.
    download_jobs = []
    for chromosome_name in self.CHROMOSOMES:
        download_jobs.append({
            "chromosome": chromosome_name,
            "chromosomes_dir": download_dir,
        })
    archives = Utilities.single_core_queue(self._dl_wrapper, download_jobs)

    # Process sequence
    records_per_archive = Utilities.single_core_queue(
        self._parse_gzip_fna, archives)
    self.parsed_records = Utilities.flatten_2d_array(records_per_archive)
    self.nfasta_file = os.path.join(self.reference_dir, "hg19.fasta")
    SeqIO.write(self.parsed_records, self.nfasta_file, "fasta")

    # Process annotation
    self.index_dir = self.describer.get_index_guide(self.nfasta_file)
def generate_keywords_dict(keywords: list, split_words: bool = False):
    """Build a dictionary whose keys are the unique, cleaned keywords.

    Non-string items are ignored; strings are stripped of surrounding
    whitespace and dropped when empty. With ``split_words`` enabled each
    remaining keyword is additionally split on single spaces and the
    pieces are flattened with ``Utilities.flatten_2d_array``.

    :param keywords: iterable of candidate keywords (non-strings allowed).
    :param split_words: split every keyword on ``" "`` before deduplicating.
    :return: dict mapping each unique non-empty keyword, in sorted order,
        to an empty tuple.
    """
    cleaned = []
    for candidate in keywords:
        if not isinstance(candidate, str):
            continue
        trimmed = candidate.strip()
        if len(trimmed) > 0:
            cleaned.append(trimmed)

    if split_words:
        pieces_per_keyword = [phrase.split(" ") for phrase in cleaned]
        cleaned = Utilities.flatten_2d_array(pieces_per_keyword)

    # Splitting on a single space can leave empty or whitespace-padded
    # pieces, so strip and filter once more before building the result.
    ordered = sorted([token.strip() for token in set(cleaned)])
    result = {}
    for token in ordered:
        if len(token) > 0:
            result[token] = ()
    return result