def retrieve(self):
     """Build the hg19 reference under ``self.reference_dir``.

     Steps, in order:
       1. Warn on stdout if the reference directory already exists, then
          make sure it and its "chromosomes" sub-directory exist.
       2. Hand every entry of ``self.CHROMOSOMES`` to ``self._dl_wrapper``
          one at a time through ``Utilities.single_core_queue``.
       3. Run ``self._parse_gzip_fna`` over the results, flatten them and
          store them in ``self.parsed_records``.
       4. Write the records as FASTA to "hg19.fasta" inside the reference
          directory (path stored in ``self.nfasta_file``).
       5. Store the result of ``self.describer.get_index_guide`` for that
          file in ``self.index_dir``.
     """
     warning_template = "Warning! The reference path exists: '{}'"
     if os.path.exists(self.reference_dir):
         print(warning_template.format(self.reference_dir))
     os.makedirs(self.reference_dir, exist_ok=True)

     chromosomes_dir = os.path.join(self.reference_dir, "chromosomes")
     os.makedirs(chromosomes_dir, exist_ok=True)

     # One keyword-argument dict per chromosome for the download wrapper.
     download_jobs = []
     for chromosome in self.CHROMOSOMES:
         download_jobs.append({
             "chromosome": chromosome,
             "chromosomes_dir": chromosomes_dir,
         })
     # Kept sequential on purpose: UCSC returns HTTP 530 when attempting
     # to download in multi-thread.
     compressed_chromosomes = Utilities.single_core_queue(
         self._dl_wrapper, download_jobs)

     # Process sequence
     records_per_archive = Utilities.single_core_queue(
         self._parse_gzip_fna, compressed_chromosomes)
     self.parsed_records = Utilities.flatten_2d_array(records_per_archive)
     self.nfasta_file = os.path.join(self.reference_dir, "hg19.fasta")
     SeqIO.write(self.parsed_records, self.nfasta_file, "fasta")

     # Process annotation
     self.index_dir = self.describer.get_index_guide(self.nfasta_file)
Exemplo n.º 2
0
 def generate_keywords_dict(keywords: list, split_words: bool = False):
     """Turn a list of keywords into a dict keyed by the cleaned keywords.

     Non-string items are ignored; the remaining strings are stripped and
     empty ones are dropped. When ``split_words`` is true every keyword is
     additionally split on single spaces and the pieces are flattened with
     ``Utilities.flatten_2d_array``.

     :param keywords: iterable of candidate keywords (any item types)
     :param split_words: split each keyword on " " into separate entries
     :return: dict whose keys are the unique, stripped, non-empty keywords
              in sorted order, each mapped to an empty tuple
     """
     cleaned = []
     for item in keywords:
         if not isinstance(item, str):
             continue
         text = item.strip()
         if text:
             cleaned.append(text)

     if split_words:
         cleaned = Utilities.flatten_2d_array(
             [text.split(" ") for text in cleaned])

     result = {}
     # Strip again: pieces produced by splitting may carry other whitespace.
     for word in sorted(token.strip() for token in set(cleaned)):
         if word:
             result[word] = ()
     return result