Python FileValidator 예제들, machado.loaders.common.FileValidator Python 예제들

예제 #1

0

파일 보기

파일: load_relations_ontology.py 프로젝트: lmb-embrapa/machado

    def handle(self, file: str, verbosity: int = 1, **options):
        """Load the typedefs of an OBO file into the "relationship" cv.

        The file is validated, parsed with obonet, and every entry of the
        graph's "typedefs" list is handed to an OntologyLoader. Progress
        messages and the progress bar are shown only when ``verbosity`` is
        greater than zero.

        Raises:
            CommandError: if the file validation reports an ImportingError.
        """
        verbose = verbosity > 0

        try:
            FileValidator().validate(file)
        except ImportingError as error:
            raise CommandError(error)

        # Parse the ontology file into a graph.
        with open(file) as obo_handle:
            graph = obonet.read_obo(obo_handle)

        if verbose:
            self.stdout.write("Preprocessing")

        # Loader bound to the cv that receives the typedefs.
        loader = OntologyLoader("relationship")

        # Store the typedefs (as Dbxrefs and Cvterm).
        if verbose:
            self.stdout.write("Loading typedefs")

        typedefs = graph.graph["typedefs"]
        for typedef in tqdm(typedefs, disable=not verbose):
            loader.store_type_def(typedef)

        if verbose:
            self.stdout.write(self.style.SUCCESS("Done"))

예제 #2

0

파일 보기

파일: load_organism_publication.py 프로젝트: lmb-embrapa/machado

    def handle(self, file: str, verbosity: int = 1, cpu: int = 1, **options):
        """Load organism/publication pairs from a tab-separated file.

        Each line must hold exactly two tab-separated values, an organism
        and a DOI. One storing task per line is submitted to a thread pool
        of ``cpu`` workers, and the results are then collected.

        Raises:
            CommandError: if the file validation or any storing task
                reports an ImportingError.
        """
        verbose = verbosity > 0
        if verbose:
            self.stdout.write("Preprocessing")

        try:
            FileValidator().validate(file)
        except ImportingError as error:
            raise CommandError(error)

        executor = ThreadPoolExecutor(max_workers=cpu)
        futures = []

        # Queue one task per line of the publication file.
        with open(file) as tab_handle:
            for row in tab_handle:
                organism, doi = row.strip().split("\t")
                store = OrganismLoader().store_organism_publication
                futures.append(executor.submit(store, organism, doi))

        if verbose:
            self.stdout.write("Loading organism publications")

        # Collect the results so that worker failures surface here.
        for future in tqdm(as_completed(futures), total=len(futures)):
            try:
                future.result()
            except ImportingError as error:
                raise CommandError(error)
        executor.shutdown()

        if verbose:
            self.stdout.write(self.style.SUCCESS("Done"))

예제 #3

0

파일 보기

    def handle(
        self, file: str, soterm: str, verbosity: int = 1, cpu: int = 1, **options
    ):
        """Attach the sequences of a FASTA file to features.

        Every record parsed from ``file`` is submitted, together with
        ``soterm``, to ``SequenceLoader.add_sequence_to_feature`` on a thread
        pool of ``cpu`` workers.

        Raises:
            CommandError: if the file validation or the SequenceLoader
                construction reports an ImportingError.
            Exception: whatever truthy value a task returns is raised as-is.
        """
        try:
            FileValidator().validate(file)
        except ImportingError as e:
            raise CommandError(e)

        # retrieve only the file name
        filename = os.path.basename(file)
        try:
            sequence_file = SequenceLoader(filename=filename)
        except ImportingError as e:
            raise CommandError(e)

        if verbosity > 0:
            self.stdout.write("Processing file: {}".format(filename))

        pool = ThreadPoolExecutor(max_workers=cpu)
        tasks = list()
        # The parser reads lazily, so the handle has to stay open while the
        # records are consumed; the context manager closes it afterwards.
        with open(file) as fasta_handle:
            for fasta in SeqIO.parse(fasta_handle, "fasta"):
                tasks.append(
                    pool.submit(sequence_file.add_sequence_to_feature, fasta, soterm)
                )
        if verbosity > 0:
            self.stdout.write("Loading")
        for task in tqdm(as_completed(tasks), total=len(tasks)):
            # A truthy task result is treated as an error object to raise.
            outcome = task.result()
            if outcome:
                raise outcome
        pool.shutdown()

        if verbosity > 0:
            self.stdout.write(self.style.SUCCESS("Done with {}".format(filename)))

예제 #4

0

파일 보기

    def handle(self,
               file: str,
               cvterm: str,
               soterm: str,
               doi: str = None,
               verbosity: int = 1,
               cpu: int = 1,
               **options):
        """Load feature annotations from a tab-separated file.

        Lines starting with "#" are skipped. Every other line must hold
        exactly two tab-separated values, a feature and its annotation,
        which are stored through ``FeatureLoader.store_feature_annotation``
        on a thread pool of ``cpu`` workers.

        Raises:
            CommandError: if the file validation, the FeatureLoader
                construction or any storing task reports an ImportingError.
        """
        verbose = verbosity > 0
        if verbose:
            self.stdout.write("Preprocessing")

        try:
            FileValidator().validate(file)
        except ImportingError as error:
            raise CommandError(error)

        # Only the base name of the path is handed to the loader.
        filename = os.path.basename(file)

        try:
            feature_file = FeatureLoader(filename=filename, source="GFF_source")
        except ImportingError as error:
            raise CommandError(error)

        executor = ThreadPoolExecutor(max_workers=cpu)
        futures = []

        # Queue one task per non-comment line of the annotation file.
        with open(file) as tab_handle:
            for row in tab_handle:
                if not row.startswith("#"):
                    feature, annotation = row.strip().split("\t")
                    future = executor.submit(
                        feature_file.store_feature_annotation,
                        feature,
                        soterm,
                        cvterm,
                        annotation,
                        doi,
                    )
                    futures.append(future)

        if verbose:
            self.stdout.write("Loading feature annotations")

        # Collect the results so that worker failures surface here.
        for future in tqdm(as_completed(futures), total=len(futures)):
            try:
                future.result()
            except ImportingError as error:
                raise CommandError(error)
        executor.shutdown()

        if verbose:
            self.stdout.write(self.style.SUCCESS("Done"))

예제 #5

0

파일 보기

    def handle(self,
               file: str,
               format: str,
               cpu: int = 1,
               verbosity: int = 1,
               **options):
        """Load the hits of a blast-xml or interproscan-xml result file.

        Every hit of every record parsed by ``SearchIO.parse`` is stored
        through ``FeatureLoader.store_bio_searchio_hit`` on a thread pool of
        ``cpu`` workers. GO terms the loader ignored are reported at the end.

        Raises:
            CommandError: if the file validation, the FeatureLoader
                construction or a storing task reports an ImportingError, if
                ``format`` is not one of the two supported values, or if a
                ValueError is raised while the records are being read.
        """
        try:
            FileValidator().validate(file)
        except ImportingError as e:
            raise CommandError(e)
        if format == "blast-xml":
            source = "BLAST_source"
        elif format == "interproscan-xml":
            source = "InterproScan_source"
        else:
            raise CommandError("Format allowed options are blast-xml or "
                               "interproscan-xml only, not {}".format(format))

        # retrieve only the file name
        filename = os.path.basename(file)
        try:
            feature_file = FeatureLoader(filename=filename, source=source)
        except ImportingError as e:
            raise CommandError(e)

        if verbosity > 0:
            self.stdout.write("Processing file: {}".format(filename))

        pool = ThreadPoolExecutor(max_workers=cpu)
        tasks = list()
        # The record loop stays inside the try so that a ValueError raised
        # while records are being consumed is reported as a CommandError too.
        # The error is raised: the previous code returned the exception
        # object, which let the command carry on as if nothing had failed.
        try:
            for record in SearchIO.parse(file, format):
                for hit in record.hits:
                    tasks.append(
                        pool.submit(feature_file.store_bio_searchio_hit, hit,
                                    record.target))
        except ValueError as e:
            raise CommandError(e)

        if verbosity > 0:
            self.stdout.write("Loading")
        for task in tqdm(as_completed(tasks), total=len(tasks)):
            try:
                task.result()
            except ImportingError as e:
                raise CommandError(e)
        pool.shutdown()

        if len(feature_file.ignored_goterms) > 0:
            self.stdout.write(
                self.style.WARNING("Ignored GO terms: {}".format(
                    feature_file.ignored_goterms)))
        if verbosity > 0:
            self.stdout.write(
                self.style.SUCCESS("Done with {}".format(filename)))

예제 #6

0

파일 보기

    def handle(self,
               file: str,
               organism: str,
               soterm: str,
               nosequence: bool = False,
               cpu: int = 1,
               description: str = None,
               url: str = None,
               doi: str = None,
               verbosity: int = 1,
               **options) -> None:
        """Store the records of a FASTA file.

        Every record parsed from ``file`` is submitted, together with
        ``soterm``, ``organism`` and ``nosequence``, to
        ``SequenceLoader.store_biopython_seq_record`` on a thread pool of
        ``cpu`` workers.

        Raises:
            CommandError: if the file validation or the SequenceLoader
                construction reports an ImportingError.
            Exception: whatever truthy value a task returns is raised as-is.
        """
        if verbosity > 0:
            self.stdout.write("Preprocessing")

        try:
            FileValidator().validate(file)
        except ImportingError as e:
            raise CommandError(e)

        # retrieve only the file name
        filename = os.path.basename(file)
        try:
            sequence_file = SequenceLoader(filename=filename,
                                           description=description,
                                           url=url,
                                           doi=doi)
        except ImportingError as e:
            raise CommandError(e)

        pool = ThreadPoolExecutor(max_workers=cpu)
        tasks = list()
        # The parser reads lazily, so the handle has to stay open while the
        # records are consumed; the context manager closes it afterwards.
        with open(file) as fasta_handle:
            for fasta in SeqIO.parse(fasta_handle, "fasta"):
                tasks.append(
                    pool.submit(
                        sequence_file.store_biopython_seq_record,
                        fasta,
                        soterm,
                        organism,
                        nosequence,
                    ))
        if verbosity > 0:
            self.stdout.write("Loading")
        for task in tqdm(as_completed(tasks), total=len(tasks)):
            # A truthy task result is treated as an error object to raise.
            outcome = task.result()
            if outcome:
                raise outcome
        pool.shutdown()

        if verbosity > 0:
            self.stdout.write(self.style.SUCCESS("Done"))

예제 #7

0

파일 보기

    def handle(self, file: str, verbosity: int = 1, **options):
        """Load the typedefs, terms and relationships of an OBO file.

        The cv name is the first entry of the graph's "default-namespace"
        value and the cv definition is its "data-version" value. Progress
        messages and progress bars are shown only when ``verbosity`` is
        greater than zero.

        Raises:
            CommandError: if the file validation reports an ImportingError.
        """
        verbose = verbosity > 0
        hide_progress = not verbose

        try:
            FileValidator().validate(file)
        except ImportingError as error:
            raise CommandError(error)

        # Parse the ontology file into a graph.
        with open(file) as obo_handle:
            graph = obonet.read_obo(obo_handle)

        if verbose:
            self.stdout.write("Preprocessing")

        cv_name = graph.graph["default-namespace"][0]
        cv_definition = graph.graph["data-version"]

        # Loader bound to the cv described by the file header.
        loader = OntologyLoader(cv_name, cv_definition)

        if verbose:
            self.stdout.write("Loading typedefs")

        # Store the typedefs (as Dbxrefs and Cvterm).
        for typedef in tqdm(graph.graph["typedefs"], disable=hide_progress):
            loader.store_type_def(typedef)

        if verbose:
            self.stdout.write("Loading terms")

        for node, attrs in tqdm(graph.nodes(data=True), disable=hide_progress):
            loader.store_term(node, attrs)

        if verbose:
            self.stdout.write("Loading relationships")

        for source, target, key in tqdm(
            graph.edges(keys=True), disable=hide_progress
        ):
            loader.store_relationship(source, target, key)

        if verbose:
            self.stdout.write(self.style.SUCCESS("Done"))

예제 #8

0

파일 보기

파일: load_publication.py 프로젝트: gledisonteixeira/machado

    def handle(self, file=str, verbosity: int = 1, cpu: int = 1, **options):
        """Load the entries of a BibTeX file as publications.

        Every entry with a truthy "ENTRYTYPE" value is stored through
        ``PublicationLoader.store_bibtex_entry`` on a thread pool of ``cpu``
        workers.

        NOTE(review): the signature declares ``file=str`` (a default value)
        where a ``file: str`` annotation was probably intended; it is left
        unchanged here to keep the interface intact.

        Raises:
            CommandError: if the file validation or a storing task reports
                an ImportingError, or if parsing the file raises ValueError.
        """
        if verbosity > 0:
            self.stdout.write("Preprocessing")

        try:
            FileValidator().validate(file)
        except ImportingError as e:
            raise CommandError(e)

        # Parse inside a context manager so the handle is always closed, and
        # raise the error: the previous code returned the exception object.
        try:
            with open(file) as bib_handle:
                bib_database = bibtexparser.load(bib_handle)
        except ValueError as e:
            raise CommandError(e)

        bibtex = PublicationLoader()

        pool = ThreadPoolExecutor(max_workers=cpu)
        tasks = list()
        for entry in bib_database.entries:
            # create model object for each entry
            if entry["ENTRYTYPE"]:
                tasks.append(pool.submit(bibtex.store_bibtex_entry, entry))
        if verbosity > 0:
            self.stdout.write("Loading")
        for task in tqdm(
            as_completed(tasks),
            total=len(tasks),
            disable=False if verbosity > 0 else True,
        ):
            try:
                task.result()
            except ImportingError as e:
                raise CommandError(e)
        pool.shutdown()

        if verbosity > 0:
            self.stdout.write(self.style.SUCCESS("Done"))

예제 #9

0

파일 보기

    def handle(self,
               file: str,
               organism: str,
               doi: str = None,
               ignore: str = None,
               qtl: bool = False,
               cpu: int = 1,
               verbosity: int = 1,
               **options):
        """Load the features of a tabix-indexed file, then their relationships.

        A ".tbi" index next to ``file`` is preferred, with ".csi" as the
        fallback. Rows whose ``feature`` value is contained in ``ignore`` are
        skipped. Storing tasks are submitted to a thread pool and waited for
        in chunks of ``cpu * 2``. Once all rows are stored, the relationships
        collected by the loader are stored with a second pool.

        Raises:
            CommandError: if the file validation, the FeatureLoader
                construction or a storing task reports an ImportingError, or
                if neither index file validates.
        """
        verbose = verbosity > 0

        # Only the base name of the path is handed to the loader.
        filename = os.path.basename(file)
        if verbose:
            self.stdout.write("Processing file: {}".format(filename))

        try:
            FileValidator().validate(file)
        except ImportingError as error:
            raise CommandError(error)

        # Pick the first index file that validates.
        index_file = None
        for candidate in ("{}.tbi".format(file), "{}.csi".format(file)):
            try:
                FileValidator().validate(candidate)
            except ImportingError:
                continue
            index_file = candidate
            break
        if index_file is None:
            raise CommandError("No index found (.tbi/.csi)")

        try:
            feature_file = FeatureLoader(
                filename=filename, source="GFF_SOURCE", doi=doi
            )
        except ImportingError as error:
            raise CommandError(error)

        def drain(pending):
            """Wait for every pending task, then empty the list."""
            for finished in as_completed(pending):
                try:
                    finished.result()
                except ImportingError as error:
                    raise CommandError(error)
            pending.clear()

        executor = ThreadPoolExecutor(max_workers=cpu)
        pending = []
        chunk_size = cpu * 2

        # Read the rows through the tabix index and store them chunk by chunk.
        with open(file) as tbx_file:
            tbx = pysam.TabixFile(filename=tbx_file.name, index=index_file)
            rows = tqdm(tbx.fetch(parser=pysam.asGTF()),
                        total=get_num_lines(file))
            for row in rows:
                if ignore is not None and row.feature in ignore:
                    continue
                pending.append(
                    executor.submit(
                        feature_file.store_tabix_GFF_feature, row, organism, qtl
                    )
                )
                if len(pending) >= chunk_size:
                    drain(pending)
            # Whatever is left after the last full chunk.
            drain(pending)

        executor.shutdown()

        if verbose:
            self.stdout.write("Loading relationships")

        executor = ThreadPoolExecutor(max_workers=cpu)
        futures = [
            executor.submit(
                feature_file.store_relationship,
                organism,
                item["subject_id"],
                item["object_id"],
            )
            for item in feature_file.relationships
        ]

        for future in tqdm(as_completed(futures), total=len(futures)):
            try:
                future.result()
            except ImportingError as error:
                raise CommandError(error)
        executor.shutdown()

        if feature_file.ignored_attrs is not None:
            warning = "Ignored attrs: {}".format(feature_file.ignored_attrs)
            self.stdout.write(self.style.WARNING(warning))

        if verbose:
            self.stdout.write(
                self.style.SUCCESS("Done with {}".format(filename)))

예제 #10

0

파일 보기

파일: load_gene_ontology.py 프로젝트: lmb-embrapa/machado

    def handle(self, file: str, cpu: int = 1, verbosity: int = 1, **options):
        """Load the typedefs, terms and relationships of a gene ontology file.

        An OntologyLoader is instantiated for each of five cv names; the
        last one ("gene_ontology") is the loader used for all storing calls.
        The three loading phases share one thread pool of ``cpu`` workers and
        each phase is waited for before the next one starts.

        Raises:
            CommandError: if the file validation or an OntologyLoader
                construction reports an ImportingError.
            Exception: whatever truthy value a task returns is raised as-is.
        """
        verbose = verbosity > 0

        def wait_for(futures):
            """Wait for every task and raise any truthy result."""
            for future in tqdm(as_completed(futures), total=len(futures)):
                if future.result():
                    raise (future.result())

        try:
            FileValidator().validate(file)
        except ImportingError as error:
            raise CommandError(error)

        # Parse the ontology file into a graph.
        with open(file) as obo_handle:
            graph = read_obo(obo_handle)

        cv_definition = graph.graph["data-version"]

        if verbose:
            self.stdout.write("Preprocessing")

        # Instantiate a loader per namespace so the secondary cv, db, cvterm
        # and dbxref are available, even though the main cv is not used.
        # "gene_ontology" comes last because it stores the type_defs and is
        # the loader kept for the rest of the method.
        namespaces = (
            "biological_process",
            "molecular_function",
            "cellular_component",
            "external",
            "gene_ontology",
        )
        try:
            for namespace in namespaces:
                ontology = OntologyLoader(namespace, cv_definition)
        except ImportingError as error:
            raise CommandError(error)

        # Store the typedefs (as Dbxrefs and Cvterm).
        if verbose:
            self.stdout.write("Loading typedefs ({} threads)".format(cpu))

        executor = ThreadPoolExecutor(max_workers=cpu)
        wait_for([
            executor.submit(ontology.store_type_def, typedef)
            for typedef in graph.graph["typedefs"]
        ])

        # Store the cvterms.
        if verbose:
            self.stdout.write("Loading terms ({} threads)".format(cpu))

        lock = Lock()
        wait_for([
            executor.submit(ontology.store_term, node, attrs, lock)
            for node, attrs in graph.nodes(data=True)
        ])

        # Store the relationships between cvterms.
        if verbose:
            self.stdout.write("Loading relationships ({} threads)".format(cpu))

        wait_for([
            executor.submit(ontology.store_relationship, source, target, key)
            for source, target, key in graph.edges(keys=True)
        ])
        executor.shutdown()

        if verbose:
            self.stdout.write(self.style.SUCCESS("Done"))

예제 #11

0

파일 보기

    def handle(
        self,
        file: str,
        organism: str,
        doi: str = None,
        cpu: int = 1,
        verbosity: int = 1,
        **options
    ):
        """Load the rows of a tabix-indexed VCF file as features.

        A ".tbi" index next to ``file`` is preferred, with ".csi" as the
        fallback. Storing tasks are submitted to a thread pool and waited
        for in chunks of ``cpu * 2``.

        Raises:
            CommandError: if the file validation, the FeatureLoader
                construction or a storing task reports an ImportingError, or
                if neither index file validates.
        """
        verbose = verbosity > 0

        # Only the base name of the path is handed to the loader.
        filename = os.path.basename(file)
        if verbose:
            self.stdout.write("Processing file: {}".format(filename))

        try:
            FileValidator().validate(file)
        except ImportingError as error:
            raise CommandError(error)

        # Pick the first index file that validates.
        index_file = None
        for candidate in ("{}.tbi".format(file), "{}.csi".format(file)):
            try:
                FileValidator().validate(candidate)
            except ImportingError:
                continue
            index_file = candidate
            break
        if index_file is None:
            raise CommandError("No index found (.tbi/.csi)")

        try:
            feature_file = FeatureLoader(
                filename=filename, source="VCF_SOURCE", doi=doi
            )
        except ImportingError as error:
            raise CommandError(error)

        def drain(pending):
            """Wait for every pending task, then empty the list."""
            for finished in as_completed(pending):
                try:
                    finished.result()
                except ImportingError as error:
                    raise CommandError(error)
            pending.clear()

        executor = ThreadPoolExecutor(max_workers=cpu)
        pending = []
        chunk_size = cpu * 2

        # Read the VCF rows through the tabix index, chunk by chunk.
        with open(file) as tbx_file:
            tbx = pysam.TabixFile(filename=tbx_file.name, index=index_file)
            rows = tqdm(tbx.fetch(parser=pysam.asVCF()), total=get_num_lines(file))
            for row in rows:
                pending.append(
                    executor.submit(
                        feature_file.store_tabix_VCF_feature, row, organism
                    )
                )
                if len(pending) >= chunk_size:
                    drain(pending)
            # Whatever is left after the last full chunk.
            drain(pending)

        executor.shutdown()

        if verbose:
            self.stdout.write(self.style.SUCCESS("Done with {}".format(filename)))

예제 #12

0

파일 보기

파일: load_orthomcl.py 프로젝트: lmb-embrapa/machado

    def handle(self, file: str, cpu: int = 1, verbosity: int = 0, **options):
        """Load orthologous groups from a whitespace-separated cluster file.

        Each line is one group: the first field must start with
        "<name>:" and every following field shaped like "<prefix>|<id>"
        contributes "<id>" as a member. Groups with at least two members are
        stored through ``FeatureLoader.store_feature_groups`` on a thread
        pool of ``cpu`` workers.

        Raises:
            CommandError: if the file validation reports an ImportingError
                or a line has no group identification.
            Exception: whatever truthy value a task returns is raised as-is.
        """
        try:
            FileValidator().validate(file)
        except ImportingError as e:
            raise CommandError(e)
        # retrieve only the file name
        filename = os.path.basename(file)
        if verbosity > 0:
            self.stdout.write("Processing file: {}".format(filename))

        pool = ThreadPoolExecutor(max_workers=cpu)
        tasks = list()
        cv, _ = Cv.objects.get_or_create(name="feature_property")
        ortho_db, _ = Db.objects.get_or_create(name="ORTHOMCL_SOURCE")
        ortho_dbxref, _ = Dbxref.objects.get_or_create(
            accession="ORTHOMCL_SOURCE", db=ortho_db)
        cvterm_cluster, _ = Cvterm.objects.get_or_create(
            name="orthologous group",
            cv=cv,
            dbxref=ortho_dbxref,
            is_obsolete=0,
            is_relationshiptype=0,
        )

        # hardcoded as orthomcl uses protein input
        soterm = "polypeptide"

        source = "null"
        featureloader = FeatureLoader(source=source, filename=filename)

        # The context manager closes the file even when a malformed line
        # aborts the loop; the handle used to be left open.
        with open(file, "r") as groups:
            # each line is an orthologous group
            for line in groups:
                fields = re.split(r"\s+", line.strip())
                group_field = re.match(r"^(\w+)\:", fields[0])
                if group_field is None:
                    raise CommandError("Cluster has no identification, check.")
                name = group_field.group(1)

                members = []
                for field in fields[1:]:
                    member_field = re.match(r"^(\w+)\|(\S+)", field)
                    if member_field:
                        members.append(member_field.group(2))

                # only orthologous groups with 2 or more members allowed
                if len(members) > 1:
                    tasks.append(
                        pool.submit(
                            featureloader.store_feature_groups,
                            soterm=soterm,
                            group=members,
                            term=cvterm_cluster.cvterm_id,
                            value=name,
                        ))
        if verbosity > 0:
            self.stdout.write("Loading")
        for task in tqdm(as_completed(tasks), total=len(tasks)):
            # A truthy task result is treated as an error object to raise.
            outcome = task.result()
            if outcome:
                raise outcome
        pool.shutdown()
        if verbosity > 0:
            self.stdout.write(
                self.style.SUCCESS("Done with {}".format(filename)))

예제 #13

0

파일 보기

파일: test_loaders_common.py 프로젝트: lmb-embrapa/machado

    def test_validate_file(self):
        """Tests - validate file.

        Covers three failures of ``FileValidator.validate``: a missing path,
        a path that is a directory, and a file created with mode 0o200.
        The directory and the file are removed in ``finally`` blocks so a
        failing assertion does not leave them behind for the next run.
        """
        # test file not exists
        file_path = "/tmp/machado.test.file"
        v = FileValidator()
        with self.assertRaisesMessage(ImportingError,
                                      "{} does not exist".format(file_path)):
            v.validate(file_path=file_path)

        # test wrong file type
        dir_path = "/tmp/machado.test.dir"
        os.mkdir(dir_path)
        try:
            v = FileValidator()
            with self.assertRaisesMessage(ImportingError,
                                          "{} is not a file".format(dir_path)):
                v.validate(file_path=dir_path)
        finally:
            os.rmdir(dir_path)

        # test file not readable
        os.mknod(file_path, mode=0o200)
        try:
            v = FileValidator()
            with self.assertRaisesMessage(
                    ImportingError, "{} is not readable".format(file_path)):
                v.validate(file_path=file_path)
        finally:
            os.remove(file_path)

예제 #14

0

파일 보기

파일: load_rnaseq_data.py 프로젝트: lmb-embrapa/machado

    def handle(self,
               file: str,
               organism: str,
               program: str,
               programversion: str,
               name: str = None,
               description: str = None,
               algorithm: str = None,
               assaydb: str = "SRA",
               timeexecuted: str = None,
               norm: int = 1,
               cpu: int = 1,
               verbosity: int = 0,
               **options):
        """Load a tab-separated RNA-seq expression table.

        The first line is the header: its first field is dropped and each
        remaining field must look like "<assay>.<suffix>"; an analysis and a
        quantification are stored for each of them. Every following line
        starts with a feature name followed by one value per header column.
        Each value is stored as the normalized score when ``norm`` is truthy
        and as the raw score otherwise, on a thread pool of ``cpu`` workers.

        Raises:
            CommandError: if the file or field validation, or any storing
                call, reports an ImportingError, or if a header field does
                not match the "<assay>.<suffix>" shape.
        """
        # retrieve only the file name
        filename = os.path.basename(file)
        if verbosity > 0:
            self.stdout.write("Processing file: {}".format(filename))
        try:
            FileValidator().validate(file)
        except ImportingError as e:
            raise CommandError(e)

        # one analysis per header column, in column order
        analysis_list = list()
        # instantiate Loader
        analysis_file = AnalysisLoader()
        pool = ThreadPoolExecutor(max_workers=cpu)
        tasks = list()
        is_header = True

        # The context manager closes the file even when a line aborts the
        # loop; the handle used to be left open.
        with open(file, "r") as rnaseq_data:
            for line in rnaseq_data:
                fields = re.split("\t", line.rstrip())
                # validate fields within line
                try:
                    FieldsValidator().validate(len(fields), fields)
                except ImportingError as e:
                    raise CommandError(e)

                if is_header:
                    # read header and instantiate an analysis object for
                    # each assay, e.g. SRR12345.
                    # first element is the string "gene" - need to be removed
                    fields.pop(0)
                    for sourcename in fields:
                        # parse field to get SRA ID. e.g.: SRR5167848.htseq
                        # keeps the part before the ".htseq" suffix
                        matched = re.match(r"(\w+)\.(\w+)", sourcename)
                        if matched is None:
                            # Previously this surfaced as an AttributeError
                            # on None that the except clause did not cover.
                            raise CommandError(
                                "Unable to parse the assay name from header "
                                "field: {}".format(sourcename))
                        assay = matched.group(1)
                        # store analysis
                        try:
                            analysis = analysis_file.store_analysis(
                                program=program,
                                sourcename=sourcename,
                                programversion=programversion,
                                timeexecuted=timeexecuted,
                                algorithm=algorithm,
                                name=assay,
                                description=description,
                                filename=filename,
                            )
                        except ImportingError as e:
                            raise CommandError(e)
                        # store quantification
                        try:
                            analysis_file.store_quantification(
                                analysis=analysis, assayacc=assay)
                        except ImportingError as e:
                            raise CommandError(e)
                        # finally, store each analysis in a list.
                        analysis_list.append(analysis)
                    is_header = False
                else:
                    # first element is the feature acc.
                    # "e.g.: AT2G44195.1.TAIR10"
                    feature_name = fields.pop(0)
                    for i, score in enumerate(fields):
                        if norm:
                            normscore = score
                            rawscore = None
                        else:
                            normscore = None
                            rawscore = score
                        # store analysis feature for each value
                        tasks.append(
                            pool.submit(
                                analysis_file.store_analysisfeature,
                                analysis_list[i],
                                feature_name,
                                organism,
                                rawscore,
                                normscore,
                            ))
        if verbosity > 0:
            self.stdout.write("Loading")
        for task in tqdm(as_completed(tasks), total=len(tasks)):
            try:
                task.result()
            except ImportingError as e:
                raise CommandError(e)
        pool.shutdown()
        if verbosity > 0:
            self.stdout.write(
                self.style.SUCCESS("Done with {}".format(filename)))

예제 #15

0

파일 보기

파일: load_similarity.py 프로젝트: gledisonteixeira/machado

    def handle(
        self,
        file: str,
        format: str,
        so_query: str,
        so_subject: str,
        organism_query: str,
        organism_subject: str,
        program: str,
        programversion: str,
        name: str = None,
        description: str = None,
        algorithm: str = None,
        cpu: int = 1,
        verbosity: int = 1,
        **options
    ):
        """Load the query results of a similarity search file.

        Every record parsed by ``SearchIO.parse`` that has at least one HSP
        is stored through ``SimilarityLoader.store_bio_searchio_query_result``
        on a thread pool of ``cpu`` workers.

        Raises:
            CommandError: if the query organism is multispecies, if
                ``format`` is not in VALID_FORMAT, if the file validation,
                the SimilarityLoader construction or a storing task reports
                an ImportingError, or if a ValueError is raised while the
                records are being read.
        """
        # retrieve only the file name
        filename = os.path.basename(file)
        # The literal used to read "mutispecies multispecies", a typo that
        # kept this guard from ever matching the multispecies organism.
        if organism_query == "multispecies multispecies":
            raise CommandError("Query's organism cannot be multispecies")

        if format not in VALID_FORMAT:
            raise CommandError(
                "The format is not valid. Please choose: " "{}".format(VALID_FORMAT)
            )
        try:
            FileValidator().validate(file)
        except ImportingError as e:
            raise CommandError(e)

        try:
            similarity_file = SimilarityLoader(
                filename=filename,
                so_query=so_query,
                so_subject=so_subject,
                org_query=organism_query,
                org_subject=organism_subject,
                algorithm=algorithm,
                name=name,
                description=description,
                program=program,
                programversion=programversion,
                input_format=format,
            )
        except ImportingError as e:
            raise CommandError(e)

        pool = ThreadPoolExecutor(max_workers=cpu)
        tasks = list()
        if verbosity > 0:
            self.stdout.write("Processing file: {}".format(filename))
        # The record loop stays inside the try so that a ValueError raised
        # while records are being consumed is reported as a CommandError too.
        # The error is raised: the previous code returned the exception
        # object, which let the command carry on as if nothing had failed.
        try:
            for record in SearchIO.parse(file, format):
                if len(record.hsps) > 0:
                    tasks.append(
                        pool.submit(
                            similarity_file.store_bio_searchio_query_result, record
                        )
                    )
        except ValueError as e:
            raise CommandError(e)
        if verbosity > 0:
            self.stdout.write("Loading")
        for task in tqdm(as_completed(tasks), total=len(tasks)):
            try:
                task.result()
            except ImportingError as e:
                raise CommandError(e)
        pool.shutdown()
        if verbosity > 0:
            self.stdout.write(self.style.SUCCESS("Done with {}".format(filename)))

예제 #16

0

파일 보기

    def handle(self,
               file: str,
               biomaterialdb: str,
               assaydb: str,
               cpu: int = 1,
               verbosity: int = 0,
               **options):
        """Load RNA-seq experiment metadata from a comma-separated file.

        Each line is one RNA-seq experiment with eight fields, e.g.:
        Oryza sativa,GSE112368,GSM3068810,SRR6902930,heat leaf,Heat stress,Leaf,Jul-20-2018

        Field usage (by position):
            0: organism name, 1: project name, 2: biomaterial accession/name,
            3: assay accession/name, 4: assay description, 5: treatment name,
            6: biomaterial description, 7: assay date.

        Args:
            file: path of the file to load.
            biomaterialdb: passed as ``db`` when storing each biomaterial.
            assaydb: passed as ``db`` when storing each assay.
            cpu: accepted for interface compatibility; not used by this method.
            verbosity: progress messages are written when greater than 0.
            **options: remaining command options (ignored).

        Raises:
            CommandError: when the file or a line fails validation, or a
                loader raises ImportingError.
            ImportingError: when the organism lookup or project storage
                raises ObjectDoesNotExist.
        """
        # retrieve only the file name
        filename = os.path.basename(file)
        nfields = 8
        if verbosity > 0:
            self.stdout.write("Processing file: {}".format(filename))

        # instantiate project, biomaterial, assay and treatment loaders
        try:
            project_file = ProjectLoader()
            biomaterial_file = BiomaterialLoader()
            assay_file = AssayLoader()
            treatment_file = TreatmentLoader()
        except ImportingError as e:
            raise CommandError(e) from e

        try:
            FileValidator().validate(file)
        except ImportingError as e:
            raise CommandError(e) from e

        # The context manager guarantees the handle is closed even when a
        # line fails validation or a loader raises half-way through.
        with open(file, "r") as rnaseq_data:
            for line in rnaseq_data:
                fields = line.strip().split(",")
                try:
                    FieldsValidator().validate(nfields, fields)
                except ImportingError as e:
                    raise CommandError(e) from e

                try:
                    # get organism - mandatory
                    organism = retrieve_organism(organism=fields[0])
                    # store project, e.g: "GSExxx" from GEO
                    project_model = project_file.store_project(
                        name=fields[1], filename=filename)
                except ObjectDoesNotExist as e:
                    raise ImportingError(e) from e

                try:
                    # store biomaterial (sample), e.g: "GSMxxxx" from GEO
                    biomaterial_model = biomaterial_file.store_biomaterial(
                        db=biomaterialdb,
                        acc=fields[2],
                        organism=organism,
                        name=fields[2],
                        filename=filename,
                        description=fields[6],
                    )
                    # store treatment, e.g. "Heat"
                    treatment_model = treatment_file.store_treatment(
                        name=fields[5], biomaterial=biomaterial_model)
                    biomaterial_file.store_biomaterial_treatment(
                        biomaterial=biomaterial_model,
                        treatment=treatment_model)

                    # store assay (experiment), e.g. "SRRxxxx" from GEO
                    assay_model = assay_file.store_assay(
                        db=assaydb,
                        acc=fields[3],
                        assaydate=fields[7],
                        name=fields[3],
                        filename=filename,
                        description=fields[4],
                    )
                    assay_file.store_assay_project(assay=assay_model,
                                                   project=project_model)
                    assay_file.store_assay_biomaterial(
                        assay=assay_model, biomaterial=biomaterial_model)
                except ImportingError as e:
                    raise CommandError(e) from e

        if verbosity > 0:
            self.stdout.write(
                self.style.SUCCESS("Done with {}".format(filename)))

예제 #17

0

파일 보기

파일: load_organism.py 프로젝트: lmb-embrapa/machado

    def handle(self,
               file: str,
               name: str,
               verbosity: int = 1,
               cpu: int = 1,
               **options):
        """Load organism records from a names file.

        Lines are split on ``" | "``-style separators (regex ``\\s\\|\\s``).
        Column 0 is the record id, column 1 a name, column 2 an optional
        qualifier appended to the scientific name, and column 3 the kind of
        name ("scientific name", "synonym" or "common name").  Consecutive
        lines sharing the same id are accumulated into one record, so the
        file is expected to be grouped by id.
        NOTE(review): the layout looks like NCBI taxonomy ``names.dmp`` --
        confirm against the command's documentation.

        Args:
            file: path of the names file.
            name: passed to ``OrganismLoader`` as ``organism_db``.
            verbosity: progress messages are written when greater than 0.
            cpu: number of worker threads used to store the records.
            **options: remaining command options (ignored).

        Raises:
            CommandError: when the file fails validation or the loader
                cannot be created.
            Any truthy value returned by a storing task is raised as-is.
        """
        if verbosity > 0:
            self.stdout.write("Preprocessing")

        try:
            FileValidator().validate(file)
        except ImportingError as e:
            raise CommandError(e) from e

        try:
            organism_db = OrganismLoader(organism_db=name)
        except ImportingError as e:
            raise CommandError(e) from e

        # Both context managers guarantee clean-up (closed file, joined
        # worker threads) even when parsing or a task fails.
        with ThreadPoolExecutor(max_workers=cpu) as pool:
            tasks = []
            current_id = None
            taxid, scname = "", ""
            synonyms, common_names = [], []

            with open(file) as file_names:
                for line in file_names:
                    columns = re.split(r"\s\|\s", line)
                    record_id, kind = columns[0], columns[3]

                    if current_id is not None and current_id != record_id:
                        # the id changed: store the record collected so far
                        tasks.append(
                            pool.submit(
                                organism_db.store_organism_record,
                                taxid,
                                scname,
                                synonyms,
                                common_names,
                            ))
                        taxid, scname = "", ""
                        synonyms, common_names = [], []

                    current_id = record_id

                    # get data while current_id remains unchanged
                    if kind == "scientific name":
                        taxid = record_id
                        if columns[2] == "" or columns[1] == columns[2]:
                            scname = columns[1]
                        else:
                            scname = "{} {}".format(columns[1], columns[2])
                    elif kind == "synonym":
                        synonyms.append(columns[1])
                    elif kind == "common name":
                        common_names.append(columns[1])

            # insert the last record (nothing inside the loop flushes it)
            tasks.append(
                pool.submit(
                    organism_db.store_organism_record,
                    taxid,
                    scname,
                    synonyms,
                    common_names,
                ))

            if verbosity > 0:
                self.stdout.write("Loading names file")
            for task in tqdm(as_completed(tasks), total=len(tasks)):
                result = task.result()
                if result:
                    raise result

        if verbosity > 0:
            self.stdout.write(self.style.SUCCESS("Done"))

예제 #18

0

파일 보기

파일: load_coexpression_pairs.py 프로젝트: lmb-embrapa/machado

    def handle(
        self,
        file: str,
        cpu: int = 1,
        soterm: str = "mRNA",
        verbosity: int = 0,
        **options
    ):
        """Load pairwise co-expression values from a whitespace-separated file.

        Every line must hold three fields: two feature identifiers followed
        by a numeric value.  The pair is stored through
        ``FeatureLoader.store_feature_pairs`` under the "correlated with"
        term of the "relationship" cv.

        Args:
            file: path of the pairs file.
            cpu: number of worker threads; at most ``cpu * 5`` tasks are
                kept in flight before they are awaited.
            soterm: passed through to ``store_feature_pairs``.
            verbosity: progress messages are written when greater than 0.
            **options: remaining command options (ignored).

        Raises:
            CommandError: when the file or a line fails validation.
            Any truthy value returned by a storing task is raised as-is.
        """
        # retrieve only the file name
        filename = os.path.basename(file)
        if verbosity > 0:
            self.stdout.write("Processing file: {}".format(filename))

        try:
            FileValidator().validate(file)
        except ImportingError as e:
            raise CommandError(e) from e

        cvterm_corel = Cvterm.objects.get(
            name="correlated with", cv__name="relationship"
        ).cvterm_id
        # feature source is not needed here
        source = "null"
        featureloader = FeatureLoader(source=source, filename=filename)
        size = get_num_lines(file)
        # every cpu should be able to handle 5 tasks
        chunk = cpu * 5
        nfields = 3

        def drain(pending):
            """Await every pending task, raise a truthy result, then reset."""
            for task in as_completed(pending):
                result = task.result()
                if result:
                    raise result
            pending.clear()

        # The file is closed and the pool shut down by the context managers,
        # including when a line is rejected or a task fails.
        with open(file, "r") as pairs, \
                ThreadPoolExecutor(max_workers=cpu) as pool:
            tasks = []
            for line in tqdm(pairs, total=size):
                fields = re.split(r"\s+", line.rstrip())
                try:
                    FieldsValidator().validate(nfields, fields)
                except ImportingError as e:
                    raise CommandError(e) from e
                # get corrected PCC value (last item from fields list)
                # NOTE(review): the 0.7 offset presumably undoes a shift
                # applied by the tool that produced the file -- confirm.
                value = float(fields.pop()) + 0.7
                tasks.append(
                    pool.submit(
                        featureloader.store_feature_pairs,
                        pair=fields,
                        soterm=soterm,
                        term=cvterm_corel,
                        value=value,
                    )
                )
                if len(tasks) >= chunk:
                    drain(tasks)
            # await whatever is left from the last, partially filled chunk
            drain(tasks)

        if verbosity > 0:
            self.stdout.write(self.style.SUCCESS("Done with {}".format(filename)))

예제 #19

0

파일 보기

파일: load_phylotree.py 프로젝트: lmb-embrapa/machado

    def handle(self,
               file: str,
               name: str,
               organismdb: str,
               verbosity: int = 1,
               cpu: int = 1,
               **options):
        """Load a phylogenetic tree from a nodes file.

        Lines are split on ``" | "``-style separators (regex ``\\s\\|\\s``);
        column 0 is the node id, column 1 its parent id and column 2 the
        level.  The nodes are collected in ``self.nodes``, indexed by
        ``self.walktree`` starting at node 1, stored without parent
        information, and finally linked to their parents in a second pass.
        NOTE(review): the layout looks like NCBI taxonomy ``nodes.dmp`` and
        ``walktree`` presumably fills ``left_idx``/``right_idx`` using
        ``self.ctr`` -- confirm against that method.

        Args:
            file: path of the nodes file.
            name: passed to ``PhylotreeLoader`` as ``phylotree_name``.
            organismdb: passed to ``PhylotreeLoader`` as ``organism_db``.
            verbosity: progress messages are written when greater than 0.
            cpu: number of worker threads used to store the nodes.
            **options: remaining command options (ignored).

        Raises:
            CommandError: when the file fails validation, the loader cannot
                be created, or a KeyError occurs while storing the nodes
                (e.g. a node without ``left_idx``/``right_idx``).
            Any truthy value returned by a relationship task is raised as-is.
        """
        if verbosity > 0:
            self.stdout.write("Preprocessing")

        try:
            FileValidator().validate(file)
        except ImportingError as e:
            raise CommandError(e) from e

        try:
            phylotree = PhylotreeLoader(phylotree_name=name,
                                        organism_db=organismdb)
        except ImportingError as e:
            raise CommandError(e) from e

        self.nodes: Dict[int, Dict[str, Any]] = dict()
        self.ctr = 0
        # The context manager closes the file even if a line cannot be parsed.
        with open(file) as file_nodes:
            for line in file_nodes:
                columns = re.split(r"\s\|\s", line)
                tax_id = int(columns[0])
                parent_id = int(columns[1])
                level = columns[2]

                node = self.nodes.get(tax_id)
                if node is None:
                    self.nodes[tax_id] = {
                        "parent_id": parent_id,
                        "level": level,
                        "children": [],
                    }
                else:
                    # placeholder created earlier as somebody's parent
                    node["parent_id"] = parent_id
                    node["level"] = level

                # Looked up after the insertion above so that a node that is
                # its own parent is found instead of being overwritten.
                parent = self.nodes.get(parent_id)
                if parent is None:
                    self.nodes[parent_id] = {
                        "parent_id": None,
                        "level": None,
                        "children": [tax_id],
                    }
                else:
                    parent["children"].append(tax_id)

        self.walktree(node_id=1)

        if verbosity > 0:
            self.stdout.write("Loading")

        # The pool is shut down by the context manager on every exit path.
        with ThreadPoolExecutor(max_workers=cpu) as pool:
            tasks = []
            # By setting the parent_id to None it's possible to load the
            # nodes randomly and using threads.
            try:
                for key, data in self.nodes.items():
                    tasks.append(
                        pool.submit(
                            phylotree.store_phylonode_record,
                            tax_id=key,
                            parent_id=None,
                            level=data["level"],
                            left_idx=data["left_idx"],
                            right_idx=data["right_idx"],
                        ))
                for task in tqdm(as_completed(tasks), total=len(tasks)):
                    result = task.result()
                    if result:
                        tax_id, phylonode = result
                        self.nodes[tax_id]["phylonode_id"] = (
                            phylonode.phylonode_id)
            except KeyError as e:
                raise CommandError("Could not calculate {}. Make it sure it is "
                                   "possible to walk the entire tree "
                                   "structure.".format(e)) from e

            if verbosity > 0:
                self.stdout.write("Loading nodes relationships")
            tasks = []
            # Load the nodes relationship info
            for data in self.nodes.values():
                if data.get("parent_id") is None:
                    continue
                tasks.append(
                    pool.submit(
                        phylotree.update_parent_phylonode_id,
                        data["phylonode_id"],
                        data["parent_id"],
                    ))
            for task in tqdm(as_completed(tasks), total=len(tasks)):
                result = task.result()
                if result:
                    raise result

        if verbosity > 0:
            self.stdout.write(self.style.SUCCESS("Done"))

예제 #20

0

파일 보기

파일: load_coexpression_clusters.py 프로젝트: lmb-embrapa/machado

    def handle(self,
               file: str,
               organism: str,
               soterm: str = "mRNA",
               cpu: int = 1,
               verbosity: int = 0,
               **options):
        """Load co-expression cluster groups from a whitespace-separated file.

        Each line is one cluster: the first field must start with
        ``<name>:`` (word characters followed by a colon) and the remaining
        fields are the cluster members.  Every cluster is stored through
        ``FeatureLoader.store_feature_groups`` under the
        "coexpression group" cvterm, with the cluster name as value.

        Args:
            file: path of the clusters file.
            organism: organism identifier; it is only resolved through
                ``retrieve_organism`` and not used afterwards.
            soterm: passed through to ``store_feature_groups``.
            cpu: number of worker threads used to store the clusters.
            verbosity: progress messages are written when greater than 0.
            **options: remaining command options (ignored).

        Raises:
            ImportingError: when resolving the organism raises
                IntegrityError.
            CommandError: when the file or a line fails validation, or a
                cluster name cannot be extracted from the first field.
            Any truthy value returned by a storing task is raised as-is.
        """
        # retrieve only the file name
        filename = os.path.basename(file)
        if verbosity > 0:
            self.stdout.write("Processing file: {}".format(filename))

        try:
            organism = retrieve_organism(organism)
        except IntegrityError as e:
            raise ImportingError(e) from e
        try:
            FileValidator().validate(file)
        except ImportingError as e:
            raise CommandError(e) from e

        cv, _ = Cv.objects.get_or_create(name="feature_property")
        coexp_db, _ = Db.objects.get_or_create(name="LSTRAP_SOURCE")
        coexp_dbxref, _ = Dbxref.objects.get_or_create(
            accession="LSTRAP_SOURCE", db=coexp_db)
        cvterm_cluster, _ = Cvterm.objects.get_or_create(
            name="coexpression group",
            cv=cv,
            dbxref=coexp_dbxref,
            is_obsolete=0,
            is_relationshiptype=0,
        )
        # feature source is not needed here
        source = "null"
        featureloader = FeatureLoader(source=source, filename=filename)

        # The file is closed and the pool shut down by the context managers,
        # including when a line is rejected or a task fails.
        with open(file, "r") as clusters, \
                ThreadPoolExecutor(max_workers=cpu) as pool:
            tasks = []
            # each line is a coexpression cluster group
            for line in tqdm(clusters, total=get_num_lines(file)):
                fields = re.split(r"\s+", line.strip())
                nfields = len(fields)
                try:
                    FieldsValidator().validate(nfields, fields)
                except ImportingError as e:
                    raise CommandError(e) from e

                # A single anchored match both checks the format and
                # captures the cluster name.
                group_field = re.match(r"^(\w+)\:", fields[0])
                if group_field is None:
                    raise CommandError("Cluster identification has problems.")
                name = group_field.group(1)

                # remove cluster name before loading
                members = fields[1:]
                tasks.append(
                    pool.submit(
                        featureloader.store_feature_groups,
                        group=members,
                        soterm=soterm,
                        term=cvterm_cluster.cvterm_id,
                        value=name,
                    ))

            if verbosity > 0:
                self.stdout.write("Loading")
            for task in tqdm(as_completed(tasks), total=len(tasks)):
                result = task.result()
                if result:
                    raise result

        if verbosity > 0:
            self.stdout.write(
                self.style.SUCCESS("Done with {}".format(filename)))