def handle(
    self,
    file: str,
    organism: str,
    doi: str = None,
    cpu: int = 1,
    verbosity: int = 1,
    **options
):
    """Load a tabix-indexed VCF file.

    Args:
        file: path to the bgzipped VCF file; a tabix index (.tbi or
            .csi) must sit next to it.
        organism: organism the features belong to.
        doi: optional DOI to associate with the loaded features.
        cpu: number of worker threads.
        verbosity: print progress messages when > 0.

    Raises:
        CommandError: if the file or its index is missing/invalid, or
            if any record fails to load.
    """
    # retrieve only the file name
    filename = os.path.basename(file)
    if verbosity > 0:
        self.stdout.write("Processing file: {}".format(filename))

    try:
        FileValidator().validate(file)
    except ImportingError as e:
        raise CommandError(e)

    # Look for a tabix index next to the file: .tbi first, then .csi.
    try:
        index_file = "{}.tbi".format(file)
        FileValidator().validate(index_file)
    except ImportingError:
        try:
            index_file = "{}.csi".format(file)
            FileValidator().validate(index_file)
        except ImportingError:
            raise CommandError("No index found (.tbi/.csi)")

    try:
        feature_file = FeatureLoader(
            filename=filename, source="VCF_SOURCE", doi=doi
        )
    except ImportingError as e:
        raise CommandError(e)

    chunk_size = cpu * 2

    def wait_tasks(pending: list) -> None:
        # Drain pending futures, surfacing loader errors as CommandError.
        for task in as_completed(pending):
            try:
                task.result()
            except ImportingError as e:
                raise CommandError(e)
        pending.clear()

    tasks = list()
    # Context manager guarantees worker threads are released even if a
    # record fails (the original bare pool.shutdown() could be skipped).
    with ThreadPoolExecutor(max_workers=cpu) as pool:
        # Fix: pass the path directly instead of opening the file only to
        # read back its name; close the TabixFile when done.
        tbx = pysam.TabixFile(filename=file, index=index_file)
        try:
            for row in tqdm(
                tbx.fetch(parser=pysam.asVCF()), total=get_num_lines(file)
            ):
                tasks.append(
                    pool.submit(
                        feature_file.store_tabix_VCF_feature, row, organism
                    )
                )
                # Bound memory: wait whenever a batch of futures piles up.
                if len(tasks) >= chunk_size:
                    wait_tasks(tasks)
            # Drain whatever is left after the final partial batch.
            wait_tasks(tasks)
        finally:
            tbx.close()

    if verbosity > 0:
        self.stdout.write(self.style.SUCCESS("Done with {}".format(filename)))
def handle(
    self,
    file: str,
    cpu: int = 1,
    soterm: str = "mRNA",
    verbosity: int = 0,
    **options
):
    """Load a pairwise coexpression correlation file.

    Each line must hold three whitespace-separated fields: two feature
    names and a correlation value.

    Args:
        file: path to the pairwise correlation file.
        cpu: number of worker threads.
        soterm: sequence-ontology term of the paired features.
        verbosity: print progress messages when > 0.

    Raises:
        CommandError: if the file is missing/invalid or a line fails
            validation or loading.
    """
    filename = os.path.basename(file)
    if verbosity > 0:
        self.stdout.write("Processing file: {}".format(filename))

    try:
        FileValidator().validate(file)
    except ImportingError as e:
        raise CommandError(e)

    cvterm_corel = Cvterm.objects.get(
        name="correlated with", cv__name="relationship"
    ).cvterm_id

    # feature source is not needed here
    source = "null"
    featureloader = FeatureLoader(source=source, filename=filename)

    size = get_num_lines(file)
    # every cpu should be able to handle 5 tasks
    chunk = cpu * 5

    def wait_tasks(pending: list) -> None:
        # Drain pending futures; a truthy result is an error to re-raise.
        # (Fix: call task.result() once instead of twice.)
        for task in as_completed(pending):
            result = task.result()
            if result:
                raise result
        pending.clear()

    # Fix: open() raises OSError, never ImportingError, so the original
    # try/except could not catch a missing file; also ensure the handle
    # is closed via the context manager below.
    try:
        pairs = open(file, "r")
    except OSError as e:
        raise CommandError(e)

    with pairs, ThreadPoolExecutor(max_workers=cpu) as pool:
        tasks = list()
        for line in tqdm(pairs, total=size):
            nfields = 3
            fields = re.split(r"\s+", line.rstrip())
            try:
                FieldsValidator().validate(nfields, fields)
            except ImportingError as e:
                raise CommandError(e)
            # get corrected PCC value (last item from fields list);
            # the +0.7 undoes an offset applied by the upstream tool —
            # NOTE(review): confirm against the producer's documentation.
            value = float(fields.pop()) + 0.7
            tasks.append(
                pool.submit(
                    featureloader.store_feature_pairs,
                    pair=fields,
                    soterm=soterm,
                    term=cvterm_corel,
                    value=value,
                )
            )
            # Bound memory: wait whenever a batch of futures piles up.
            if len(tasks) >= chunk:
                wait_tasks(tasks)
        # Drain whatever is left after the final partial batch.
        wait_tasks(tasks)

    if verbosity > 0:
        self.stdout.write(self.style.SUCCESS("Done with {}".format(filename)))
def handle(
    self,
    file: str,
    organism: str,
    doi: str = None,
    ignore: str = None,
    qtl: bool = False,
    cpu: int = 1,
    verbosity: int = 1,
    **options
):
    """Load a tabix-indexed GFF3 file.

    Args:
        file: path to the bgzipped GFF3 file; a tabix index (.tbi or
            .csi) must sit next to it.
        organism: organism the features belong to.
        doi: optional DOI to associate with the loaded features.
        ignore: feature types to skip while loading.
        qtl: flag forwarded to the feature loader for QTL handling.
        cpu: number of worker threads.
        verbosity: print progress messages when > 0.

    Raises:
        CommandError: if the file or its index is missing/invalid, or
            if any feature or relationship fails to load.
    """
    # retrieve only the file name
    filename = os.path.basename(file)
    if verbosity > 0:
        self.stdout.write("Processing file: {}".format(filename))

    try:
        FileValidator().validate(file)
    except ImportingError as e:
        raise CommandError(e)

    # Look for a tabix index next to the file: .tbi first, then .csi.
    try:
        index_file = "{}.tbi".format(file)
        FileValidator().validate(index_file)
    except ImportingError:
        try:
            index_file = "{}.csi".format(file)
            FileValidator().validate(index_file)
        except ImportingError:
            raise CommandError("No index found (.tbi/.csi)")

    try:
        feature_file = FeatureLoader(
            filename=filename, source="GFF_SOURCE", doi=doi
        )
    except ImportingError as e:
        raise CommandError(e)

    chunk_size = cpu * 2

    def wait_tasks(pending: list) -> None:
        # Drain pending futures, surfacing loader errors as CommandError.
        for task in as_completed(pending):
            try:
                task.result()
            except ImportingError as e:
                raise CommandError(e)
        pending.clear()

    tasks = list()
    # Context manager guarantees worker threads are released even if a
    # record fails (the original bare pool.shutdown() could be skipped).
    with ThreadPoolExecutor(max_workers=cpu) as pool:
        # Fix: pass the path directly instead of opening the file only to
        # read back its name; close the TabixFile when done.
        tbx = pysam.TabixFile(filename=file, index=index_file)
        try:
            for row in tqdm(
                tbx.fetch(parser=pysam.asGTF()), total=get_num_lines(file)
            ):
                # NOTE(review): when `ignore` is a plain string, `in` is a
                # substring test (e.g. "gene" matches "pseudogene") —
                # confirm the intended container type with the caller.
                if ignore is not None and row.feature in ignore:
                    continue
                tasks.append(
                    pool.submit(
                        feature_file.store_tabix_GFF_feature, row, organism, qtl
                    )
                )
                # Bound memory: wait whenever a batch of futures piles up.
                if len(tasks) >= chunk_size:
                    wait_tasks(tasks)
            # Drain whatever is left after the final partial batch.
            wait_tasks(tasks)
        finally:
            tbx.close()

    if verbosity > 0:
        self.stdout.write("Loading relationships")

    # Second pass: store the parent/child relationships collected above.
    with ThreadPoolExecutor(max_workers=cpu) as pool:
        tasks = [
            pool.submit(
                feature_file.store_relationship,
                organism,
                item["subject_id"],
                item["object_id"],
            )
            for item in feature_file.relationships
        ]
        for task in tqdm(as_completed(tasks), total=len(tasks)):
            try:
                task.result()
            except ImportingError as e:
                raise CommandError(e)

    if feature_file.ignored_attrs is not None:
        self.stdout.write(
            self.style.WARNING(
                "Ignored attrs: {}".format(feature_file.ignored_attrs)
            )
        )

    if verbosity > 0:
        self.stdout.write(self.style.SUCCESS("Done with {}".format(filename)))
def handle(
    self,
    file: str,
    organism: str,
    soterm: str = "mRNA",
    cpu: int = 1,
    verbosity: int = 0,
    **options
):
    """Load coexpression cluster groups.

    Each line is one cluster: a "name:"-prefixed identifier followed by
    the whitespace-separated member features.

    Args:
        file: path to the cluster file.
        organism: organism the features belong to.
        soterm: sequence-ontology term of the clustered features.
        cpu: number of worker threads.
        verbosity: print progress messages when > 0.

    Raises:
        CommandError: if the organism or file is invalid, or if any
            cluster fails validation or loading.
    """
    filename = os.path.basename(file)
    if verbosity > 0:
        self.stdout.write("Processing file: {}".format(filename))

    try:
        organism = retrieve_organism(organism)
    except IntegrityError as e:
        # Fix: surface the failure as CommandError (consistent with the
        # other loaders) instead of letting ImportingError escape handle().
        raise CommandError(e)

    try:
        FileValidator().validate(file)
    except ImportingError as e:
        raise CommandError(e)

    cv, created = Cv.objects.get_or_create(name="feature_property")
    coexp_db, created = Db.objects.get_or_create(name="LSTRAP_SOURCE")
    coexp_dbxref, created = Dbxref.objects.get_or_create(
        accession="LSTRAP_SOURCE", db=coexp_db
    )
    cvterm_cluster, created = Cvterm.objects.get_or_create(
        name="coexpression group",
        cv=cv,
        dbxref=coexp_dbxref,
        is_obsolete=0,
        is_relationshiptype=0,
    )

    # feature source is not needed here
    source = "null"
    featureloader = FeatureLoader(source=source, filename=filename)

    # Fix: open() raises OSError, never ImportingError, so the original
    # try/except could not catch a missing file; also ensure the handle
    # is closed via the context manager below.
    try:
        clusters = open(file, "r")
    except OSError as e:
        raise CommandError(e)

    tasks = list()
    with clusters, ThreadPoolExecutor(max_workers=cpu) as pool:
        # each line is a coexpression cluster group
        for line in tqdm(clusters, total=get_num_lines(file)):
            fields = re.split(r"\s+", line.strip())
            nfields = len(fields)
            try:
                FieldsValidator().validate(nfields, fields)
            except ImportingError as e:
                raise CommandError(e)
            # Fix: match the anchored pattern once instead of running
            # re.search and re.match with the same regex.
            group_field = re.match(r"^(\w+)\:", fields[0])
            if group_field is None:
                raise CommandError("Cluster identification has problems.")
            name = group_field.group(1)
            # remove cluster name before loading
            fields.pop(0)
            tasks.append(
                pool.submit(
                    featureloader.store_feature_groups,
                    group=fields,
                    soterm=soterm,
                    term=cvterm_cluster.cvterm_id,
                    value=name,
                )
            )
        if verbosity > 0:
            self.stdout.write("Loading")
        # A truthy result is an error to re-raise.
        # (Fix: call task.result() once instead of twice.)
        for task in tqdm(as_completed(tasks), total=len(tasks)):
            result = task.result()
            if result:
                raise result

    if verbosity > 0:
        self.stdout.write(self.style.SUCCESS("Done with {}".format(filename)))