def handle(self, file: str, verbosity: int = 1, **options):
    """Execute the main function."""
    try:
        FileValidator().validate(file)
    except ImportingError as e:
        raise CommandError(e)

    # Load the ontology file
    with open(file) as obo_file:
        G = obonet.read_obo(obo_file)

    if verbosity > 0:
        self.stdout.write("Preprocessing")

    cv_name = "relationship"

    # Initializing ontology
    ontology = OntologyLoader(cv_name)

    # Load typedefs as Dbxrefs and Cvterm
    if verbosity > 0:
        self.stdout.write("Loading typedefs")

    for data in tqdm(
        G.graph["typedefs"], disable=False if verbosity > 0 else True
    ):
        ontology.store_type_def(data)

    if verbosity > 0:
        self.stdout.write(self.style.SUCCESS("Done"))
def handle(self, file: str, verbosity: int = 1, cpu: int = 1, **options):
    """Execute the main function."""
    if verbosity > 0:
        self.stdout.write("Preprocessing")

    try:
        FileValidator().validate(file)
    except ImportingError as e:
        raise CommandError(e)

    pool = ThreadPoolExecutor(max_workers=cpu)
    tasks = list()

    # Load the publication file
    with open(file) as tab_file:
        for line in tab_file:
            organism, doi = line.strip().split("\t")
            tasks.append(
                pool.submit(
                    OrganismLoader().store_organism_publication, organism, doi
                )
            )

    if verbosity > 0:
        self.stdout.write("Loading organism publications")

    for task in tqdm(as_completed(tasks), total=len(tasks)):
        try:
            task.result()
        except ImportingError as e:
            raise CommandError(e)
    pool.shutdown()

    if verbosity > 0:
        self.stdout.write(self.style.SUCCESS("Done"))
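# Usage sketch for the publication loader above. The input format is inferred
# from the parsing code: one organism name and one DOI per line, tab-separated.
# The command name "load_organism_publication" and the file path are
# illustrative assumptions, not confirmed by this file.
#
#   Oryza sativa<TAB>10.1234/example.doi
#   Arabidopsis thaliana<TAB>10.5678/example.doi
#
from django.core.management import call_command

call_command(
    "load_organism_publication",  # hypothetical command name
    file="/tmp/organism_doi.tsv",
    cpu=2,
    verbosity=1,
)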
def handle(
    self, file: str, soterm: str, verbosity: int = 1, cpu: int = 1, **options
):
    """Execute the main function."""
    try:
        FileValidator().validate(file)
    except ImportingError as e:
        raise CommandError(e)

    # retrieve only the file name
    filename = os.path.basename(file)
    try:
        sequence_file = SequenceLoader(filename=filename)
    except ImportingError as e:
        raise CommandError(e)

    if verbosity > 0:
        self.stdout.write("Processing file: {}".format(filename))

    fasta_sequences = SeqIO.parse(open(file), "fasta")

    pool = ThreadPoolExecutor(max_workers=cpu)
    tasks = list()
    for fasta in fasta_sequences:
        tasks.append(
            pool.submit(sequence_file.add_sequence_to_feature, fasta, soterm)
        )

    if verbosity > 0:
        self.stdout.write("Loading")
    for task in tqdm(as_completed(tasks), total=len(tasks)):
        if task.result():
            raise task.result()
    pool.shutdown()

    if verbosity > 0:
        self.stdout.write(self.style.SUCCESS("Done with {}".format(filename)))
def handle(
    self,
    file: str,
    cvterm: str,
    soterm: str,
    doi: str = None,
    verbosity: int = 1,
    cpu: int = 1,
    **options
):
    """Execute the main function."""
    if verbosity > 0:
        self.stdout.write("Preprocessing")

    try:
        FileValidator().validate(file)
    except ImportingError as e:
        raise CommandError(e)

    # retrieve only the file name
    filename = os.path.basename(file)

    try:
        feature_file = FeatureLoader(filename=filename, source="GFF_source")
    except ImportingError as e:
        raise CommandError(e)

    pool = ThreadPoolExecutor(max_workers=cpu)
    tasks = list()

    # Load the annotation file
    with open(file) as tab_file:
        for line in tab_file:
            if line.startswith("#"):
                continue
            feature, annotation = line.strip().split("\t")
            tasks.append(
                pool.submit(
                    feature_file.store_feature_annotation,
                    feature,
                    soterm,
                    cvterm,
                    annotation,
                    doi,
                )
            )

    if verbosity > 0:
        self.stdout.write("Loading feature annotations")
    for task in tqdm(as_completed(tasks), total=len(tasks)):
        try:
            task.result()
        except ImportingError as e:
            raise CommandError(e)
    pool.shutdown()

    if verbosity > 0:
        self.stdout.write(self.style.SUCCESS("Done"))
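# Input sketch for the annotation loader above, inferred from its parsing code:
# lines starting with "#" are skipped, data lines hold a feature accession and
# an annotation value separated by a single tab. The command name, file path
# and option values below are illustrative assumptions.
#
#   # feature<TAB>annotation
#   AT1G01010.1<TAB>hypothetical protein kinase
#
from django.core.management import call_command

call_command(
    "load_feature_annotation",  # hypothetical command name
    file="/tmp/annotation.tsv",
    cvterm="description",  # assumed cvterm name
    soterm="mRNA",
    cpu=2,
)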
def handle(
    self, file: str, format: str, cpu: int = 1, verbosity: int = 1, **options
):
    """Execute the main function."""
    try:
        FileValidator().validate(file)
    except ImportingError as e:
        raise CommandError(e)

    if format == "blast-xml":
        source = "BLAST_source"
    elif format == "interproscan-xml":
        source = "InterproScan_source"
    else:
        raise CommandError(
            "Format allowed options are blast-xml or "
            "interproscan-xml only, not {}".format(format)
        )

    # retrieve only the file name
    filename = os.path.basename(file)
    try:
        feature_file = FeatureLoader(filename=filename, source=source)
    except ImportingError as e:
        raise CommandError(e)

    if verbosity > 0:
        self.stdout.write("Processing file: {}".format(filename))

    try:
        records = SearchIO.parse(file, format)
    except ValueError as e:
        raise CommandError(e)

    pool = ThreadPoolExecutor(max_workers=cpu)
    tasks = list()
    for record in records:
        for hit in record.hits:
            tasks.append(
                pool.submit(feature_file.store_bio_searchio_hit, hit, record.target)
            )

    if verbosity > 0:
        self.stdout.write("Loading")
    for task in tqdm(as_completed(tasks), total=len(tasks)):
        try:
            task.result()
        except ImportingError as e:
            raise CommandError(e)
    pool.shutdown()

    if len(feature_file.ignored_goterms) > 0:
        self.stdout.write(
            self.style.WARNING(
                "Ignored GO terms: {}".format(feature_file.ignored_goterms)
            )
        )

    if verbosity > 0:
        self.stdout.write(self.style.SUCCESS("Done with {}".format(filename)))
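# A minimal Bio.SearchIO sketch of the objects the loader above iterates over;
# the file path is illustrative. For "blast-xml", each QueryResult carries the
# search database name in record.target and its matches in record.hits.
from Bio import SearchIO

for record in SearchIO.parse("/tmp/blast_output.xml", "blast-xml"):
    print(record.id, record.target, len(record.hits))
    for hit in record.hits:
        print("  hit:", hit.id)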
def handle(
    self,
    file: str,
    organism: str,
    soterm: str,
    nosequence: bool = False,
    cpu: int = 1,
    description: str = None,
    url: str = None,
    doi: str = None,
    verbosity: int = 1,
    **options
) -> None:
    """Execute the main function."""
    if verbosity > 0:
        self.stdout.write("Preprocessing")

    try:
        FileValidator().validate(file)
    except ImportingError as e:
        raise CommandError(e)

    # retrieve only the file name
    filename = os.path.basename(file)
    try:
        sequence_file = SequenceLoader(
            filename=filename, description=description, url=url, doi=doi
        )
    except ImportingError as e:
        raise CommandError(e)

    fasta_sequences = SeqIO.parse(open(file), "fasta")

    pool = ThreadPoolExecutor(max_workers=cpu)
    tasks = list()
    for fasta in fasta_sequences:
        tasks.append(
            pool.submit(
                sequence_file.store_biopython_seq_record,
                fasta,
                soterm,
                organism,
                nosequence,
            )
        )

    if verbosity > 0:
        self.stdout.write("Loading")
    for task in tqdm(as_completed(tasks), total=len(tasks)):
        if task.result():
            raise task.result()
    pool.shutdown()

    if verbosity > 0:
        self.stdout.write(self.style.SUCCESS("Done"))
def handle(self, file: str, verbosity: int = 1, **options):
    """Execute the main function."""
    try:
        FileValidator().validate(file)
    except ImportingError as e:
        raise CommandError(e)

    # Load the ontology file
    with open(file) as obo_file:
        G = obonet.read_obo(obo_file)

    if verbosity > 0:
        self.stdout.write("Preprocessing")

    cv_name = G.graph["default-namespace"][0]
    cv_definition = G.graph["data-version"]

    # Initializing ontology
    ontology = OntologyLoader(cv_name, cv_definition)

    if verbosity > 0:
        self.stdout.write("Loading typedefs")

    # Load typedefs as Dbxrefs and Cvterm
    for typedef in tqdm(
        G.graph["typedefs"], disable=False if verbosity > 0 else True
    ):
        ontology.store_type_def(typedef)

    if verbosity > 0:
        self.stdout.write("Loading terms")

    for n, data in tqdm(
        G.nodes(data=True), disable=False if verbosity > 0 else True
    ):
        ontology.store_term(n, data)

    if verbosity > 0:
        self.stdout.write("Loading relationships")

    for u, v, type in tqdm(
        G.edges(keys=True), disable=False if verbosity > 0 else True
    ):
        ontology.store_relationship(u, v, type)

    if verbosity > 0:
        self.stdout.write(self.style.SUCCESS("Done"))
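# A minimal obonet sketch of the graph attributes the loader above relies on;
# the OBO file path is illustrative. obonet.read_obo returns a networkx
# MultiDiGraph whose graph dict carries "default-namespace", "data-version"
# and "typedefs", while terms are nodes and relationships are keyed edges.
import obonet

G = obonet.read_obo("/tmp/so.obo")
print(G.graph["default-namespace"][0], G.graph["data-version"])
print(len(G.graph["typedefs"]), "typedefs")
for n, data in list(G.nodes(data=True))[:3]:
    print(n, data.get("name"))
for u, v, edge_type in list(G.edges(keys=True))[:3]:
    print(u, edge_type, v)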
def handle(self, file: str, verbosity: int = 1, cpu: int = 1, **options):
    """Execute the main function."""
    if verbosity > 0:
        self.stdout.write("Preprocessing")

    try:
        FileValidator().validate(file)
    except ImportingError as e:
        raise CommandError(e)

    bib_database = None
    try:
        bib_database = bibtexparser.load(open(file))
    except ValueError as e:
        raise CommandError(e)

    bibtex = PublicationLoader()

    pool = ThreadPoolExecutor(max_workers=cpu)
    tasks = list()
    for entry in bib_database.entries:
        # create model object for each entry
        if entry["ENTRYTYPE"]:
            tasks.append(pool.submit(bibtex.store_bibtex_entry, entry))

    if verbosity > 0:
        self.stdout.write("Loading")
    for task in tqdm(
        as_completed(tasks),
        total=len(tasks),
        disable=False if verbosity > 0 else True,
    ):
        try:
            task.result()
        except ImportingError as e:
            raise CommandError(e)
    pool.shutdown()

    if verbosity > 0:
        self.stdout.write(self.style.SUCCESS("Done"))
def handle(
    self,
    file: str,
    organism: str,
    doi: str = None,
    ignore: str = None,
    qtl: bool = False,
    cpu: int = 1,
    verbosity: int = 1,
    **options
):
    """Execute the main function."""
    # retrieve only the file name
    filename = os.path.basename(file)
    if verbosity > 0:
        self.stdout.write("Processing file: {}".format(filename))

    try:
        FileValidator().validate(file)
    except ImportingError as e:
        raise CommandError(e)

    try:
        index_file = "{}.tbi".format(file)
        FileValidator().validate(index_file)
    except ImportingError:
        try:
            index_file = "{}.csi".format(file)
            FileValidator().validate(index_file)
        except ImportingError:
            raise CommandError("No index found (.tbi/.csi)")

    try:
        feature_file = FeatureLoader(filename=filename, source="GFF_SOURCE", doi=doi)
    except ImportingError as e:
        raise CommandError(e)

    pool = ThreadPoolExecutor(max_workers=cpu)
    tasks = list()
    chunk_size = cpu * 2

    # Load the GFF3 file
    with open(file) as tbx_file:
        tbx = pysam.TabixFile(filename=tbx_file.name, index=index_file)
        for row in tqdm(
            tbx.fetch(parser=pysam.asGTF()), total=get_num_lines(file)
        ):
            if ignore is not None and row.feature in ignore:
                continue
            tasks.append(
                pool.submit(
                    feature_file.store_tabix_GFF_feature, row, organism, qtl
                )
            )
            # wait for the current chunk before submitting more rows
            if len(tasks) >= chunk_size:
                for task in as_completed(tasks):
                    try:
                        task.result()
                    except ImportingError as e:
                        raise CommandError(e)
                tasks.clear()
        else:
            # flush the remaining tasks once the loop finishes
            for task in as_completed(tasks):
                try:
                    task.result()
                except ImportingError as e:
                    raise CommandError(e)
            tasks.clear()
    pool.shutdown()

    if verbosity > 0:
        self.stdout.write("Loading relationships")

    pool = ThreadPoolExecutor(max_workers=cpu)
    tasks = list()
    for item in feature_file.relationships:
        tasks.append(
            pool.submit(
                feature_file.store_relationship,
                organism,
                item["subject_id"],
                item["object_id"],
            )
        )
    for task in tqdm(as_completed(tasks), total=len(tasks)):
        try:
            task.result()
        except ImportingError as e:
            raise CommandError(e)
    pool.shutdown()

    if feature_file.ignored_attrs is not None:
        self.stdout.write(
            self.style.WARNING(
                "Ignored attrs: {}".format(feature_file.ignored_attrs)
            )
        )

    if verbosity > 0:
        self.stdout.write(self.style.SUCCESS("Done with {}".format(filename)))
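# The loader above requires a bgzip-compressed GFF3 with a tabix index (.tbi
# or .csi) next to it. A minimal sketch for producing the index with pysam,
# assuming the GFF3 is already sorted by sequence and start coordinate; the
# path is illustrative.
import pysam

# bgzips the file if needed and writes the .tbi index alongside it
indexed = pysam.tabix_index("/tmp/sorted.gff3", preset="gff", force=True)
print(indexed)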
def handle(self, file: str, cpu: int = 1, verbosity: int = 1, **options):
    """Execute the main function."""
    try:
        FileValidator().validate(file)
    except ImportingError as e:
        raise CommandError(e)

    # Load the ontology file
    with open(file) as obo_file:
        G = read_obo(obo_file)

    cv_definition = G.graph["data-version"]

    if verbosity > 0:
        self.stdout.write("Preprocessing")

    # Instantiating Ontology in order to have access to secondary cv, db,
    # cvterm, and dbxref, even though the main cv will not be used.
    # There will be an ontology for each namespace, plus one called
    # gene_ontology for storing type_defs
    try:
        ontology = OntologyLoader("biological_process", cv_definition)
        ontology = OntologyLoader("molecular_function", cv_definition)
        ontology = OntologyLoader("cellular_component", cv_definition)
        ontology = OntologyLoader("external", cv_definition)
        ontology = OntologyLoader("gene_ontology", cv_definition)
    except ImportingError as e:
        raise CommandError(e)

    # Load typedefs as Dbxrefs and Cvterm
    if verbosity > 0:
        self.stdout.write("Loading typedefs ({} threads)".format(cpu))

    pool = ThreadPoolExecutor(max_workers=cpu)
    tasks = list()
    for typedef in G.graph["typedefs"]:
        tasks.append(pool.submit(ontology.store_type_def, typedef))
    for task in tqdm(as_completed(tasks), total=len(tasks)):
        if task.result():
            raise task.result()

    # Load the cvterms
    if verbosity > 0:
        self.stdout.write("Loading terms ({} threads)".format(cpu))

    lock = Lock()
    tasks = list()
    for n, data in G.nodes(data=True):
        tasks.append(pool.submit(ontology.store_term, n, data, lock))
    for task in tqdm(as_completed(tasks), total=len(tasks)):
        if task.result():
            raise task.result()

    # Load the relationship between cvterms
    if verbosity > 0:
        self.stdout.write("Loading relationships ({} threads)".format(cpu))

    tasks = list()
    for u, v, type in G.edges(keys=True):
        tasks.append(pool.submit(ontology.store_relationship, u, v, type))
    for task in tqdm(as_completed(tasks), total=len(tasks)):
        if task.result():
            raise task.result()
    pool.shutdown()

    if verbosity > 0:
        self.stdout.write(self.style.SUCCESS("Done"))
def handle(
    self,
    file: str,
    organism: str,
    doi: str = None,
    cpu: int = 1,
    verbosity: int = 1,
    **options
):
    """Execute the main function."""
    # retrieve only the file name
    filename = os.path.basename(file)
    if verbosity > 0:
        self.stdout.write("Processing file: {}".format(filename))

    try:
        FileValidator().validate(file)
    except ImportingError as e:
        raise CommandError(e)

    try:
        index_file = "{}.tbi".format(file)
        FileValidator().validate(index_file)
    except ImportingError:
        try:
            index_file = "{}.csi".format(file)
            FileValidator().validate(index_file)
        except ImportingError:
            raise CommandError("No index found (.tbi/.csi)")

    try:
        feature_file = FeatureLoader(
            filename=filename, source="VCF_SOURCE", doi=doi
        )
    except ImportingError as e:
        raise CommandError(e)

    pool = ThreadPoolExecutor(max_workers=cpu)
    tasks = list()
    chunk_size = cpu * 2

    # Load the VCF file
    with open(file) as tbx_file:
        tbx = pysam.TabixFile(filename=tbx_file.name, index=index_file)
        for row in tqdm(
            tbx.fetch(parser=pysam.asVCF()), total=get_num_lines(file)
        ):
            tasks.append(
                pool.submit(feature_file.store_tabix_VCF_feature, row, organism)
            )
            # wait for the current chunk before submitting more rows
            if len(tasks) >= chunk_size:
                for task in as_completed(tasks):
                    try:
                        task.result()
                    except ImportingError as e:
                        raise CommandError(e)
                tasks.clear()
        else:
            # flush the remaining tasks once the loop finishes
            for task in as_completed(tasks):
                try:
                    task.result()
                except ImportingError as e:
                    raise CommandError(e)
            tasks.clear()
    pool.shutdown()

    if verbosity > 0:
        self.stdout.write(self.style.SUCCESS("Done with {}".format(filename)))
def handle(self, file: str, cpu: int = 1, verbosity: int = 0, **options):
    """Execute the main function."""
    try:
        FileValidator().validate(file)
    except ImportingError as e:
        raise CommandError(e)

    # retrieve only the file name
    filename = os.path.basename(file)
    if verbosity > 0:
        self.stdout.write("Processing file: {}".format(filename))

    try:
        groups = open(file, "r")
    except ImportingError as e:
        raise CommandError(e)

    pool = ThreadPoolExecutor(max_workers=cpu)
    tasks = list()

    cv, created = Cv.objects.get_or_create(name="feature_property")
    ortho_db, created = Db.objects.get_or_create(name="ORTHOMCL_SOURCE")
    ortho_dbxref, created = Dbxref.objects.get_or_create(
        accession="ORTHOMCL_SOURCE", db=ortho_db
    )
    cvterm_cluster, created = Cvterm.objects.get_or_create(
        name="orthologous group",
        cv=cv,
        dbxref=ortho_dbxref,
        is_obsolete=0,
        is_relationshiptype=0,
    )

    # hardcoded as orthomcl uses protein input
    soterm = "polypeptide"
    source = "null"
    featureloader = FeatureLoader(source=source, filename=filename)

    # each line is an orthologous group
    for line in groups:
        members = []
        name = ""
        fields = re.split(r"\s+", line.strip())
        if re.search(r"^(\w+)\:", fields[0]):
            group_field = re.match(r"^(\w+)\:", fields[0])
            name = group_field.group(1)
            fields.pop(0)
            for field in fields:
                if re.search(r"^(\w+)\|(\S+)", field):
                    member_field = re.match(r"^(\w+)\|(\S+)", field)
                    ident = member_field.group(2)
                    members.append(ident)
        else:
            raise CommandError("Cluster has no identification, check.")
        # only orthologous groups with 2 or more members allowed
        if len(members) > 1:
            tasks.append(
                pool.submit(
                    featureloader.store_feature_groups,
                    soterm=soterm,
                    group=members,
                    term=cvterm_cluster.cvterm_id,
                    value=name,
                )
            )

    if verbosity > 0:
        self.stdout.write("Loading")
    for task in tqdm(as_completed(tasks), total=len(tasks)):
        if task.result():
            raise task.result()
    pool.shutdown()

    if verbosity > 0:
        self.stdout.write(self.style.SUCCESS("Done with {}".format(filename)))
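# Input sketch for the orthologous-group loader above, inferred from its own
# regular expressions: each line starts with a group name followed by ":",
# then whitespace-separated members written as "<organism>|<identifier>".
# The sample line is illustrative.
import re

line = "ORTHOMCL1: Osat|LOC_Os01g01010 Atha|AT1G01010 Atha|AT1G01020"
fields = re.split(r"\s+", line.strip())
name = re.match(r"^(\w+)\:", fields[0]).group(1)
members = [re.match(r"^(\w+)\|(\S+)", f).group(2) for f in fields[1:]]
print(name, members)  # ORTHOMCL1 ['LOC_Os01g01010', 'AT1G01010', 'AT1G01020']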
def test_validate_file(self):
    """Tests - validate file."""
    # test file does not exist
    file_path = "/tmp/machado.test.file"
    v = FileValidator()
    with self.assertRaisesMessage(
        ImportingError, "{} does not exist".format(file_path)
    ):
        v.validate(file_path=file_path)

    # test wrong file type
    file_path = "/tmp/machado.test.dir"
    os.mkdir(file_path)
    v = FileValidator()
    with self.assertRaisesMessage(
        ImportingError, "{} is not a file".format(file_path)
    ):
        v.validate(file_path=file_path)
    os.rmdir(file_path)

    # test file not readable
    file_path = "/tmp/machado.test.file"
    os.mknod(file_path, mode=0o200)
    v = FileValidator()
    with self.assertRaisesMessage(
        ImportingError, "{} is not readable".format(file_path)
    ):
        v.validate(file_path=file_path)
    os.remove(file_path)
def handle(
    self,
    file: str,
    organism: str,
    program: str,
    programversion: str,
    name: str = None,
    description: str = None,
    algorithm: str = None,
    assaydb: str = "SRA",
    timeexecuted: str = None,
    norm: int = 1,
    cpu: int = 1,
    verbosity: int = 0,
    **options
):
    """Execute the main function."""
    filename = os.path.basename(file)
    if verbosity > 0:
        self.stdout.write("Processing file: {}".format(filename))

    try:
        FileValidator().validate(file)
    except ImportingError as e:
        raise CommandError(e)

    # start reading file
    try:
        rnaseq_data = open(file, "r")
    except ImportingError as e:
        raise CommandError(e)

    header = 1
    analysis_list = list()
    # instantiate Loader
    analysis_file = AnalysisLoader()

    pool = ThreadPoolExecutor(max_workers=cpu)
    tasks = list()
    for line in rnaseq_data:
        fields = re.split("\t", line.rstrip())
        nfields = len(fields)
        # validate fields within line
        try:
            FieldsValidator().validate(nfields, fields)
        except ImportingError as e:
            raise CommandError(e)

        # read header and instantiate analysis object for each assay
        # e.g. SRR12345.
        if header:
            # first element is the string "gene" - need to be removed
            fields.pop(0)
            for i in range(len(fields)):
                # parse field to get SRA ID. e.g.: SRR5167848.htseq
                # try to remove ".htseq" part of string
                string = re.match(r"(\w+)\.(\w+)", fields[i])
                try:
                    assay = string.group(1)
                except AttributeError as e:
                    raise CommandError(e)
                # store analysis
                try:
                    analysis = analysis_file.store_analysis(
                        program=program,
                        sourcename=fields[i],
                        programversion=programversion,
                        timeexecuted=timeexecuted,
                        algorithm=algorithm,
                        name=assay,
                        description=description,
                        filename=filename,
                    )
                except ImportingError as e:
                    raise CommandError(e)
                # store quantification
                try:
                    analysis_file.store_quantification(
                        analysis=analysis, assayacc=assay
                    )
                except ImportingError as e:
                    raise CommandError(e)
                # finally, store each analysis in a list.
                analysis_list.insert(i, analysis)
            header = 0
        else:
            # first element is the feature acc. e.g.: AT2G44195.1.TAIR10
            feature_name = fields.pop(0)
            for i in range(len(fields)):
                if norm:
                    normscore = fields[i]
                    rawscore = None
                else:
                    normscore = None
                    rawscore = fields[i]
                # store analysis feature for each value
                tasks.append(
                    pool.submit(
                        analysis_file.store_analysisfeature,
                        analysis_list[i],
                        feature_name,
                        organism,
                        rawscore,
                        normscore,
                    )
                )

    if verbosity > 0:
        self.stdout.write("Loading")
    for task in tqdm(as_completed(tasks), total=len(tasks)):
        try:
            task.result()
        except ImportingError as e:
            raise CommandError(e)
    pool.shutdown()

    if verbosity > 0:
        self.stdout.write(self.style.SUCCESS("Done with {}".format(filename)))
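# Input sketch for the count-matrix loader above, inferred from its parsing
# code: a tab-separated matrix whose header starts with "gene" followed by one
# "<assay>.<suffix>" column per assay (e.g. SRR5167848.htseq), and whose data
# rows start with a feature accession followed by one value per assay. The
# values below are illustrative.
#
#   gene<TAB>SRR5167848.htseq<TAB>SRR5167849.htseq
#   AT2G44195.1.TAIR10<TAB>12.5<TAB>10.1
#
import re

header_field = "SRR5167848.htseq"
assay = re.match(r"(\w+)\.(\w+)", header_field).group(1)
print(assay)  # SRR5167848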
def handle(
    self,
    file: str,
    format: str,
    so_query: str,
    so_subject: str,
    organism_query: str,
    organism_subject: str,
    program: str,
    programversion: str,
    name: str = None,
    description: str = None,
    algorithm: str = None,
    cpu: int = 1,
    verbosity: int = 1,
    **options
):
    """Execute the main function."""
    filename = os.path.basename(file)
    if organism_query == "multispecies multispecies":
        raise CommandError("Query's organism cannot be multispecies")
    if format not in VALID_FORMAT:
        raise CommandError(
            "The format is not valid. Please choose: {}".format(VALID_FORMAT)
        )
    try:
        FileValidator().validate(file)
    except ImportingError as e:
        raise CommandError(e)

    try:
        similarity_file = SimilarityLoader(
            filename=filename,
            so_query=so_query,
            so_subject=so_subject,
            org_query=organism_query,
            org_subject=organism_subject,
            algorithm=algorithm,
            name=name,
            description=description,
            program=program,
            programversion=programversion,
            input_format=format,
        )
    except ImportingError as e:
        raise CommandError(e)

    try:
        similarity_records = SearchIO.parse(file, format)
    except ValueError as e:
        raise CommandError(e)

    pool = ThreadPoolExecutor(max_workers=cpu)
    tasks = list()

    if verbosity > 0:
        self.stdout.write("Processing file: {}".format(filename))

    for record in similarity_records:
        if len(record.hsps) > 0:
            tasks.append(
                pool.submit(
                    similarity_file.store_bio_searchio_query_result, record
                )
            )

    if verbosity > 0:
        self.stdout.write("Loading")
    for task in tqdm(as_completed(tasks), total=len(tasks)):
        try:
            task.result()
        except ImportingError as e:
            raise CommandError(e)
    pool.shutdown()

    if verbosity > 0:
        self.stdout.write(self.style.SUCCESS("Done with {}".format(filename)))
def handle(
    self,
    file: str,
    biomaterialdb: str,
    assaydb: str,
    cpu: int = 1,
    verbosity: int = 0,
    **options
):
    """Execute the main function."""
    filename = os.path.basename(file)
    nfields = 8
    if verbosity > 0:
        self.stdout.write("Processing file: {}".format(filename))

    # instantiate project, biomaterial and assay
    try:
        project_file = ProjectLoader()
        biomaterial_file = BiomaterialLoader()
        assay_file = AssayLoader()
        treatment_file = TreatmentLoader()
    except ImportingError as e:
        raise CommandError(e)

    try:
        FileValidator().validate(file)
    except ImportingError as e:
        raise CommandError(e)

    try:
        rnaseq_data = open(file, "r")
    except ImportingError as e:
        raise CommandError(e)

    # each line is an RNA-seq experiment
    # e.g.:
    # Oryza sativa,GSE112368,GSM3068810,SRR6902930,heat leaf,Heat stress,Leaf,Jul-20-2018
    for line in rnaseq_data:
        fields = re.split(",", line.strip())
        organism_name = fields[0]
        try:
            FieldsValidator().validate(nfields, fields)
        except ImportingError as e:
            raise CommandError(e)

        # get organism - mandatory
        try:
            organism = retrieve_organism(organism=organism_name)
        except ObjectDoesNotExist as e:
            raise ImportingError(e)

        # store project
        try:
            # e.g.: "GSExxx" from GEO
            project_model = project_file.store_project(
                name=fields[1], filename=filename
            )
        except ObjectDoesNotExist as e:
            raise ImportingError(e)

        # store biomaterial (sample)
        try:
            # e.g.: "GSMxxxx" from GEO
            biomaterial_model = biomaterial_file.store_biomaterial(
                db=biomaterialdb,
                acc=fields[2],
                organism=organism,
                name=fields[2],
                filename=filename,
                description=fields[6],
            )
        except ImportingError as e:
            raise CommandError(e)

        # store treatment
        try:
            # e.g.: "Heat"
            treatment_model = treatment_file.store_treatment(
                name=fields[5], biomaterial=biomaterial_model
            )
        except ImportingError as e:
            raise CommandError(e)

        try:
            biomaterial_file.store_biomaterial_treatment(
                biomaterial=biomaterial_model, treatment=treatment_model
            )
        except ImportingError as e:
            raise CommandError(e)

        # store assay (experiment)
        try:
            # e.g.: "SRRxxxx" from GEO
            assay_model = assay_file.store_assay(
                db=assaydb,
                acc=fields[3],
                assaydate=fields[7],
                name=fields[3],
                filename=filename,
                description=fields[4],
            )
            assay_file.store_assay_project(
                assay=assay_model, project=project_model
            )
            assay_file.store_assay_biomaterial(
                assay=assay_model, biomaterial=biomaterial_model
            )
        except ImportingError as e:
            raise CommandError(e)

    if verbosity > 0:
        self.stdout.write(self.style.SUCCESS("Done with {}".format(filename)))
def handle(
    self, file: str, name: str, verbosity: int = 1, cpu: int = 1, **options
):
    """Execute the main function."""
    if verbosity > 0:
        self.stdout.write("Preprocessing")

    try:
        FileValidator().validate(file)
    except ImportingError as e:
        raise CommandError(e)

    try:
        organism_db = OrganismLoader(organism_db=name)
    except ImportingError as e:
        raise CommandError(e)

    file_names = open(file)

    pool = ThreadPoolExecutor(max_workers=cpu)
    tasks = list()

    current_id = None
    taxid, scname = "", ""
    synonyms, common_names = [], []
    for line in file_names:
        columns = re.split(r"\s\|\s", line)
        if current_id is not None and current_id != columns[0]:
            # store if new record
            tasks.append(
                pool.submit(
                    organism_db.store_organism_record,
                    taxid,
                    scname,
                    synonyms,
                    common_names,
                )
            )
            taxid, scname = "", ""
            synonyms, common_names = [], []

        current_id = columns[0]

        # get data while current_id remains unchanged
        if columns[3] == "scientific name":
            taxid = columns[0]
            if columns[2] == "" or columns[1] == columns[2]:
                scname = columns[1]
            else:
                scname = "{} {}".format(columns[1], columns[2])
        elif columns[3] == "synonym":
            synonyms.append(columns[1])
        elif columns[3] == "common name":
            common_names.append(columns[1])
    else:
        # insert the last record once the loop finishes
        tasks.append(
            pool.submit(
                organism_db.store_organism_record,
                taxid,
                scname,
                synonyms,
                common_names,
            )
        )

    if verbosity > 0:
        self.stdout.write("Loading names file")
    for task in tqdm(as_completed(tasks), total=len(tasks)):
        if task.result():
            raise task.result()
    pool.shutdown()

    if verbosity > 0:
        self.stdout.write(self.style.SUCCESS("Done"))
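# Input sketch for the organism-names loader above: NCBI taxonomy "names.dmp"
# style rows, with columns separated by "<TAB>|<TAB>" (matched by the loader's
# r"\s\|\s" split): tax_id, name, unique name, name class. The sample line is
# illustrative.
import re

line = "3702\t|\tArabidopsis thaliana\t|\t\t|\tscientific name\t|\n"
columns = re.split(r"\s\|\s", line)
print(columns[0], columns[1], columns[3])  # 3702 Arabidopsis thaliana scientific name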
def handle(
    self, file: str, cpu: int = 1, soterm: str = "mRNA", verbosity: int = 0, **options
):
    """Execute the main function."""
    filename = os.path.basename(file)
    if verbosity > 0:
        self.stdout.write("Processing file: {}".format(filename))

    try:
        FileValidator().validate(file)
    except ImportingError as e:
        raise CommandError(e)

    try:
        pairs = open(file, "r")
    except ImportingError as e:
        raise CommandError(e)

    cvterm_corel = Cvterm.objects.get(
        name="correlated with", cv__name="relationship"
    ).cvterm_id
    # feature source is not needed here
    source = "null"
    featureloader = FeatureLoader(source=source, filename=filename)
    size = get_num_lines(file)
    # every cpu should be able to handle 5 tasks
    chunk = cpu * 5
    with ThreadPoolExecutor(max_workers=cpu) as pool:
        tasks = list()
        for line in tqdm(pairs, total=size):
            nfields = 3
            fields = re.split(r"\s+", line.rstrip())
            try:
                FieldsValidator().validate(nfields, fields)
            except ImportingError as e:
                raise CommandError(e)
            # get corrected PCC value (last item from fields list)
            value = float(fields.pop()) + 0.7
            tasks.append(
                pool.submit(
                    featureloader.store_feature_pairs,
                    pair=fields,
                    soterm=soterm,
                    term=cvterm_corel,
                    value=value,
                )
            )
            # wait for the current chunk before submitting more pairs
            if len(tasks) >= chunk:
                for task in as_completed(tasks):
                    if task.result():
                        raise task.result()
                tasks.clear()
        else:
            # flush the remaining tasks once the loop finishes
            for task in as_completed(tasks):
                if task.result():
                    raise task.result()
            tasks.clear()
        pool.shutdown()

    if verbosity > 0:
        self.stdout.write(self.style.SUCCESS("Done with {}".format(filename)))
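# Input sketch for the pair loader above, inferred from its parsing code:
# three whitespace-separated fields per line (two feature accessions and a
# Pearson correlation coefficient); the loader adds 0.7 back to the value it
# stores. The sample line is illustrative.
import re

line = "AT1G01010.1 AT1G01020.1 0.18"
fields = re.split(r"\s+", line.rstrip())
value = float(fields.pop()) + 0.7
print(fields, value)  # ['AT1G01010.1', 'AT1G01020.1'] ~0.88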
def handle(
    self,
    file: str,
    name: str,
    organismdb: str,
    verbosity: int = 1,
    cpu: int = 1,
    **options
):
    """Execute the main function."""
    if verbosity > 0:
        self.stdout.write("Preprocessing")

    try:
        FileValidator().validate(file)
    except ImportingError as e:
        raise CommandError(e)

    try:
        phylotree = PhylotreeLoader(phylotree_name=name, organism_db=organismdb)
    except ImportingError as e:
        raise CommandError(e)

    file_nodes = open(file)

    self.nodes: Dict[int, Dict[str, Any]] = dict()
    self.ctr = 0
    for line in file_nodes:
        columns = re.split(r"\s\|\s", line)
        tax_id = int(columns[0])
        parent_id = int(columns[1])
        level = columns[2]

        if self.nodes.get(tax_id) is None:
            self.nodes[tax_id] = {
                "parent_id": parent_id,
                "level": level,
                "children": [],
            }
        else:
            self.nodes[tax_id]["parent_id"] = parent_id
            self.nodes[tax_id]["level"] = level

        if self.nodes.get(parent_id) is None:
            self.nodes[parent_id] = {
                "parent_id": None,
                "level": None,
                "children": [tax_id],
            }
        else:
            self.nodes[parent_id]["children"].append(tax_id)

    self.walktree(node_id=1)

    if verbosity > 0:
        self.stdout.write("Loading")

    pool = ThreadPoolExecutor(max_workers=cpu)
    tasks = list()
    # By setting the parent_id to None it's possible to load the
    # nodes randomly and using threads.
    try:
        for key, data in self.nodes.items():
            tasks.append(
                pool.submit(
                    phylotree.store_phylonode_record,
                    tax_id=key,
                    parent_id=None,
                    level=data["level"],
                    left_idx=data["left_idx"],
                    right_idx=data["right_idx"],
                )
            )
        for task in tqdm(as_completed(tasks), total=len(tasks)):
            if task.result():
                tax_id, phylonode = task.result()
                self.nodes[tax_id]["phylonode_id"] = phylonode.phylonode_id
    except KeyError as e:
        raise CommandError(
            "Could not calculate {}. Make sure it is possible to walk "
            "the entire tree structure.".format(e)
        )

    if verbosity > 0:
        self.stdout.write("Loading nodes relationships")

    tasks = list()
    # Load the nodes relationship info
    for key, data in self.nodes.items():
        if data.get("parent_id") is None:
            continue
        tasks.append(
            pool.submit(
                phylotree.update_parent_phylonode_id,
                data["phylonode_id"],
                data["parent_id"],
            )
        )
    for task in tqdm(as_completed(tasks), total=len(tasks)):
        if task.result():
            raise task.result()
    pool.shutdown()

    if verbosity > 0:
        self.stdout.write(self.style.SUCCESS("Done"))
def handle(
    self,
    file: str,
    organism: str,
    soterm: str = "mRNA",
    cpu: int = 1,
    verbosity: int = 0,
    **options
):
    """Execute the main function."""
    filename = os.path.basename(file)
    if verbosity > 0:
        self.stdout.write("Processing file: {}".format(filename))

    try:
        organism = retrieve_organism(organism)
    except IntegrityError as e:
        raise ImportingError(e)

    try:
        FileValidator().validate(file)
    except ImportingError as e:
        raise CommandError(e)

    try:
        clusters = open(file, "r")
    except ImportingError as e:
        raise CommandError(e)

    tasks = list()
    cv, created = Cv.objects.get_or_create(name="feature_property")
    coexp_db, created = Db.objects.get_or_create(name="LSTRAP_SOURCE")
    coexp_dbxref, created = Dbxref.objects.get_or_create(
        accession="LSTRAP_SOURCE", db=coexp_db
    )
    cvterm_cluster, created = Cvterm.objects.get_or_create(
        name="coexpression group",
        cv=cv,
        dbxref=coexp_dbxref,
        is_obsolete=0,
        is_relationshiptype=0,
    )
    # feature source is not needed here
    source = "null"
    featureloader = FeatureLoader(source=source, filename=filename)
    pool = ThreadPoolExecutor(max_workers=cpu)

    # each line is a coexpression cluster group
    for line in tqdm(clusters, total=get_num_lines(file)):
        name = ""
        fields = re.split(r"\s+", line.strip())
        nfields = len(fields)
        try:
            FieldsValidator().validate(nfields, fields)
        except ImportingError as e:
            raise CommandError(e)

        if re.search(r"^(\w+)\:", fields[0]):
            group_field = re.match(r"^(\w+)\:", fields[0])
            name = group_field.group(1)
        else:
            raise CommandError("Cluster identification has problems.")
        # remove cluster name before loading
        fields.pop(0)
        # get cvterm for correlation
        tasks.append(
            pool.submit(
                featureloader.store_feature_groups,
                group=fields,
                soterm=soterm,
                term=cvterm_cluster.cvterm_id,
                value=name,
            )
        )

    if verbosity > 0:
        self.stdout.write("Loading")
    for task in tqdm(as_completed(tasks), total=len(tasks)):
        if task.result():
            raise task.result()
    pool.shutdown()

    if verbosity > 0:
        self.stdout.write(self.style.SUCCESS("Done with {}".format(filename)))