示例#1
0
    def handle(self,
               file: str,
               cvterm: str,
               soterm: str,
               doi: str = None,
               verbosity: int = 1,
               cpu: int = 1,
               **options):
        """Load feature annotations from a tab-separated file.

        Every line that does not start with ``#`` must hold exactly two
        tab-separated fields: the feature identifier and its annotation.
        Each pair is handed to ``FeatureLoader.store_feature_annotation``
        on a thread pool, together with ``soterm``, ``cvterm`` and ``doi``.
        Blank lines are skipped.

        Args:
            file: Path of the annotation file.
            cvterm: Forwarded to ``store_feature_annotation``.
            soterm: Forwarded to ``store_feature_annotation``.
            doi: Optional DOI forwarded to ``store_feature_annotation``.
            verbosity: Progress messages are written when greater than 0.
            cpu: Number of worker threads.
            **options: Remaining command options (unused here).

        Raises:
            CommandError: If the file fails validation, the loader cannot
                be created, a line does not have two fields, or storing an
                annotation fails with an ``ImportingError``.
        """
        if verbosity > 0:
            self.stdout.write("Preprocessing")

        try:
            FileValidator().validate(file)
        except ImportingError as e:
            raise CommandError(e) from e

        # retrieve only the file name
        filename = os.path.basename(file)

        try:
            feature_file = FeatureLoader(filename=filename,
                                         source="GFF_source")
        except ImportingError as e:
            raise CommandError(e) from e

        # The context manager shuts the pool down on every exit path,
        # including the error paths below.
        with ThreadPoolExecutor(max_workers=cpu) as pool:
            tasks = list()

            # Load the annotation file
            with open(file) as tab_file:
                for line_number, line in enumerate(tab_file, start=1):
                    if line.startswith("#"):
                        continue
                    stripped = line.strip()
                    if not stripped:
                        # tolerate empty lines (e.g. a trailing newline)
                        continue
                    fields = stripped.split("\t")
                    if len(fields) != 2:
                        raise CommandError(
                            "Line {} of {}: expected 2 tab-separated "
                            "fields, found {}".format(
                                line_number, filename, len(fields)))
                    feature, annotation = fields
                    tasks.append(
                        pool.submit(
                            feature_file.store_feature_annotation,
                            feature,
                            soterm,
                            cvterm,
                            annotation,
                            doi,
                        ))

            if verbosity > 0:
                self.stdout.write("Loading feature annotations")
            for task in tqdm(as_completed(tasks), total=len(tasks)):
                try:
                    task.result()
                except ImportingError as e:
                    raise CommandError(e) from e

        if verbosity > 0:
            self.stdout.write(self.style.SUCCESS("Done"))
示例#2
0
    def handle(self,
               file: str,
               format: str,
               cpu: int = 1,
               verbosity: int = 1,
               **options):
        """Load similarity search hits from a BLAST or InterProScan XML file.

        The file is parsed with ``SearchIO.parse`` and every hit of every
        record is passed, with the record's ``target``, to
        ``FeatureLoader.store_bio_searchio_hit`` on a thread pool.

        Args:
            file: Path of the result file.
            format: Either ``"blast-xml"`` or ``"interproscan-xml"``.
            cpu: Number of worker threads.
            verbosity: Progress messages are written when greater than 0.
            **options: Remaining command options (unused here).

        Raises:
            CommandError: If the file fails validation, ``format`` is not
                one of the two allowed values, the loader cannot be
                created, ``SearchIO.parse`` raises ``ValueError``, or
                storing a hit fails with an ``ImportingError``.
        """
        try:
            FileValidator().validate(file)
        except ImportingError as e:
            raise CommandError(e) from e
        if format == "blast-xml":
            source = "BLAST_source"
        elif format == "interproscan-xml":
            source = "InterproScan_source"
        else:
            raise CommandError("Format allowed options are blast-xml or "
                               "interproscan-xml only, not {}".format(format))

        # retrieve only the file name
        filename = os.path.basename(file)
        try:
            feature_file = FeatureLoader(filename=filename, source=source)
        except ImportingError as e:
            raise CommandError(e) from e

        if verbosity > 0:
            self.stdout.write("Processing file: {}".format(filename))
        try:
            records = SearchIO.parse(file, format)
        except ValueError as e:
            # This used to *return* the exception, which silently ended
            # the command as if it had succeeded.
            # NOTE(review): SearchIO.parse may be lazy, in which case parse
            # errors surface while iterating below - confirm.
            raise CommandError(e) from e

        # The context manager shuts the pool down on every exit path.
        with ThreadPoolExecutor(max_workers=cpu) as pool:
            tasks = [
                pool.submit(feature_file.store_bio_searchio_hit, hit,
                            record.target)
                for record in records
                for hit in record.hits
            ]
            if verbosity > 0:
                self.stdout.write("Loading")
            for task in tqdm(as_completed(tasks), total=len(tasks)):
                try:
                    task.result()
                except ImportingError as e:
                    raise CommandError(e) from e

        # reported regardless of verbosity
        if len(feature_file.ignored_goterms) > 0:
            self.stdout.write(
                self.style.WARNING("Ignored GO terms: {}".format(
                    feature_file.ignored_goterms)))
        if verbosity > 0:
            self.stdout.write(
                self.style.SUCCESS("Done with {}".format(filename)))
示例#3
0
    def test_get_attributes(self):
        """Tests - get_attributes exposes 'ID=1;name=feat1' as id and name."""
        plain_term = dict(is_obsolete=0, is_relationshiptype=0)

        # SO terms the loader is given in every FeatureLoader test
        so_db = Db.objects.create(name="SO")
        polypeptide_dbxref = Dbxref.objects.create(accession="12345",
                                                   db=so_db)
        sequence_cv = Cv.objects.create(name="sequence")
        Cvterm.objects.create(name="polypeptide",
                              cv=sequence_cv,
                              dbxref=polypeptide_dbxref,
                              **plain_term)
        match_dbxref = Dbxref.objects.create(accession="123455", db=so_db)
        Cvterm.objects.create(name="protein_match",
                              cv=sequence_cv,
                              dbxref=match_dbxref,
                              **plain_term)

        # RO term "contained in"
        ro_db = Db.objects.create(name="RO")
        ro_dbxref = Dbxref.objects.create(accession="00002", db=ro_db)
        relationship_cv = Cv.objects.create(name="relationship")
        Cvterm.objects.create(name="contained in",
                              cv=relationship_cv,
                              dbxref=ro_dbxref,
                              **plain_term)

        Organism.objects.create(genus="Mus", species="musculus")

        loader = FeatureLoader(filename="file.name", source="GFF_loader")
        attrs = loader.get_attributes("ID=1;name=feat1")

        self.assertEqual("1", attrs.get("id"))
        self.assertEqual("feat1", attrs.get("name"))
    def test_load_coexpression_pairs(self):
        """Run tests of load_coexpression_pairs.

        The pairs mimic the headless 'pcc.mcl.txt' output of LSTrAP, whose
        lines look like:

            AT2G44195.1.TAIR10  AT1G30080.1.TAIR10  0.18189286870895194
            AT2G44195.1.TAIR10  AT5G24750.1.TAIR10  0.1715779378273995

        The third column is a Pearson correlation coefficient minus 0.7
        (PCC - 0.7), so 0.7 is added back before the value is stored.
        """
        # multispecies organism owning every test feature
        organism, _ = Organism.objects.get_or_create(
            abbreviation="multispecies",
            genus="multispecies",
            species="multispecies",
            common_name="multispecies",
        )

        # SO (sequence) and RO (relationship) vocabularies
        so_db = Db.objects.create(name="SO")
        sequence_cv = Cv.objects.create(name="sequence")
        ro_db = Db.objects.create(name="RO")
        relationship_cv = Cv.objects.create(name="relationship")

        contained_in_dbxref = Dbxref.objects.create(accession="789",
                                                    db=ro_db)
        mrna_dbxref = Dbxref.objects.create(accession="135", db=so_db)
        correlated_dbxref = Dbxref.objects.create(accession="246", db=ro_db)
        polypeptide_dbxref = Dbxref.objects.create(accession="357",
                                                   db=so_db)
        match_dbxref = Dbxref.objects.create(accession="468", db=so_db)

        Cvterm.objects.create(
            name="mRNA",
            cv=sequence_cv,
            dbxref=mrna_dbxref,
            is_obsolete=0,
            is_relationshiptype=0,
        )
        contained_in = Cvterm.objects.create(
            name="contained in",
            cv=relationship_cv,
            dbxref=contained_in_dbxref,
            is_obsolete=0,
            is_relationshiptype=1,
        )
        term = Cvterm.objects.create(
            name="correlated with",
            cv=relationship_cv,
            dbxref=correlated_dbxref,
            is_obsolete=0,
            is_relationshiptype=1,
        )
        polypeptide = Cvterm.objects.create(
            name="polypeptide",
            cv=sequence_cv,
            dbxref=polypeptide_dbxref,
            is_obsolete=0,
            is_relationshiptype=0,
        )
        Cvterm.objects.create(
            name="protein_match",
            cv=sequence_cv,
            dbxref=match_dbxref,
            is_obsolete=0,
            is_relationshiptype=0,
        )

        fasta_db = Db.objects.create(name="FASTA_SOURCE")

        def new_feature(uniquename):
            """Create a polypeptide feature with a FASTA_SOURCE dbxref."""
            feature_dbxref = Dbxref.objects.create(db=fasta_db,
                                                   accession=uniquename)
            return Feature.objects.create(
                organism=organism,
                dbxref=feature_dbxref,
                uniquename=uniquename,
                is_analysis=False,
                type_id=polypeptide.cvterm_id,
                is_obsolete=False,
                timeaccessioned=datetime.now(),
                timelastmodified=datetime.now(),
            )

        name_a = "AT2G44195.1.TAIR10"
        name_b = "AT1G30080.1.TAIR10"
        name_c = "AT5G24750.1.TAIR10"
        feature_a = new_feature(name_a)
        feature_b = new_feature(name_b)
        feature_c = new_feature(name_c)

        # stored value is the original PCC, i.e. the file value plus 0.7
        pcc_ab = str(0.1818928687089519 + 0.7)
        pcc_ac = str(0.1715779378273995 + 0.7)

        # dummy coexpression variables
        test_filename = "pcc.mcl.dummy.txt"
        loader = FeatureLoader(source="null", filename=test_filename)
        for pair, pcc in (([name_a, name_b], pcc_ab),
                          ([name_a, name_c], pcc_ac)):
            loader.store_feature_pairs(pair=pair,
                                       soterm="polypeptide",
                                       term=term,
                                       value=pcc)

        # start checking: both relationships exist with the PCC as value
        expected = ((feature_b, pcc_ab), (feature_c, pcc_ac))
        for partner, pcc in expected:
            self.assertTrue(
                FeatureRelationship.objects.filter(
                    subject_id=feature_a.feature_id,
                    object_id=partner.feature_id,
                    value=pcc,
                ).exists())

        relationships = [
            FeatureRelationship.objects.get(
                subject_id=feature_a.feature_id,
                object_id=partner.feature_id,
                value=pcc,
            ) for partner, pcc in expected
        ]

        # each relationship carries a "contained in" prop with the file name
        for relationship in relationships:
            self.assertTrue(
                FeatureRelationshipprop.objects.filter(
                    feature_relationship=relationship,
                    type_id=contained_in.cvterm_id,
                    value=test_filename,
                ).exists())
    def test_load_coexpression_clusters(self):
        """Run tests of load_coexpression_clusters.

        The groups mimic the headless, tab separated 'mcl.clusters.txt'
        output of LSTrAP, where each line is one cluster:

            ath_coexpr_mcl_1:   AT3G18715.1.TAIR10 AT3G08790.1.TAIR10
                                AT5G42230.1.TAIR10
            ath_coexpr_mcl_2:   AT1G27040.1.TAIR10 AT1G71692.1.TAIR10
            ath_coexpr_mcl_3:   AT5G24750.1.TAIR10

        The features need to be loaded previously or won't be registered.
        """
        # multispecies organism owning every test feature
        organism = Organism.objects.create(
            abbreviation="multispecies",
            genus="multispecies",
            species="multispecies",
            common_name="multispecies",
        )

        # SO (sequence), RO (relationship) and feature_property vocabularies
        so_db = Db.objects.create(name="SO")
        sequence_cv = Cv.objects.create(name="sequence")
        ro_db = Db.objects.create(name="RO")
        relationship_cv = Cv.objects.create(name="relationship")
        property_cv = Cv.objects.create(name="feature_property")

        contained_in_dbxref = Dbxref.objects.create(accession="028",
                                                    db=so_db)
        mrna_dbxref = Dbxref.objects.create(accession="135", db=so_db)
        correlated_dbxref = Dbxref.objects.create(accession="246", db=ro_db)
        group_dbxref = Dbxref.objects.create(accession="579", db=ro_db)
        polypeptide_dbxref = Dbxref.objects.create(accession="357",
                                                   db=so_db)
        match_dbxref = Dbxref.objects.create(accession="468", db=so_db)

        Cvterm.objects.create(
            name="mRNA",
            cv=sequence_cv,
            dbxref=mrna_dbxref,
            is_obsolete=0,
            is_relationshiptype=0,
        )
        Cvterm.objects.create(
            name="contained in",
            cv=relationship_cv,
            dbxref=contained_in_dbxref,
            is_obsolete=0,
            is_relationshiptype=1,
        )
        Cvterm.objects.create(
            name="correlated with",
            cv=relationship_cv,
            dbxref=correlated_dbxref,
            is_obsolete=0,
            is_relationshiptype=1,
        )
        term = Cvterm.objects.create(
            name="coexpression group",
            cv=property_cv,
            dbxref=group_dbxref,
            is_obsolete=0,
            is_relationshiptype=0,
        )
        polypeptide = Cvterm.objects.create(
            name="polypeptide",
            cv=sequence_cv,
            dbxref=polypeptide_dbxref,
            is_obsolete=0,
            is_relationshiptype=0,
        )
        Cvterm.objects.create(
            name="protein_match",
            cv=sequence_cv,
            dbxref=match_dbxref,
            is_obsolete=0,
            is_relationshiptype=0,
        )

        fasta_db = Db.objects.create(name="FASTA_SOURCE")

        def new_feature(uniquename):
            """Create a polypeptide feature with a FASTA_SOURCE dbxref."""
            feature_dbxref = Dbxref.objects.create(db=fasta_db,
                                                   accession=uniquename)
            return Feature.objects.create(
                dbxref=feature_dbxref,
                organism=organism,
                uniquename=uniquename,
                is_analysis=False,
                type_id=polypeptide.cvterm_id,
                is_obsolete=False,
                timeaccessioned=datetime.now(),
                timelastmodified=datetime.now(),
            )

        names = [
            "AT3G18715.1.TAIR10",
            "AT3G08790.1.TAIR10",
            "AT5G42230.1.TAIR10",
            "AT1G27040.1.TAIR10",
            "AT1G71692.1.TAIR10",
            "AT5G24750.1.TAIR10",
        ]
        features = [new_feature(name) for name in names]

        # clusters setup: three, two and one member respectively
        cluster1_name = "ath_coexpr_mcl_1"
        cluster2_name = "ath_coexpr_mcl_2"
        cluster3_name = "ath_coexpr_mcl_3"
        clusters = (
            (cluster1_name, names[0:3]),
            (cluster2_name, names[3:5]),
            (cluster3_name, names[5:6]),
        )

        loader = FeatureLoader(source="null",
                               filename="mcl.clusters.dummy.txt")
        for cluster_name, members in clusters:
            loader.store_feature_groups(group=members,
                                        soterm="polypeptide",
                                        term=term,
                                        value=cluster_name)

        def has_group_prop(feature, cluster_name):
            """Tell whether the feature has the group property stored."""
            return Featureprop.objects.filter(feature_id=feature.feature_id,
                                              type=term,
                                              value=cluster_name).exists()

        # check entire cluster1 relationships (not in reverse)
        self.assertTrue(has_group_prop(features[0], cluster1_name))
        self.assertTrue(has_group_prop(features[2], cluster1_name))
        self.assertTrue(has_group_prop(features[1], cluster1_name))
        # check cluster2 relationships
        self.assertTrue(has_group_prop(features[4], cluster2_name))
        self.assertTrue(has_group_prop(features[3], cluster2_name))
        # the single-member cluster3 must not produce a property
        self.assertFalse(has_group_prop(features[5], cluster3_name))
示例#6
0
    def handle(
        self,
        file: str,
        organism: str,
        doi: str = None,
        cpu: int = 1,
        verbosity: int = 1,
        **options
    ):
        """Load the records of a tabix-indexed VCF file.

        The file must have a ``.tbi`` or ``.csi`` index next to it. Every
        row yielded by ``pysam.TabixFile.fetch`` is passed, with
        ``organism``, to ``FeatureLoader.store_tabix_VCF_feature`` on a
        thread pool. Submitted tasks are drained every ``cpu * 2`` rows so
        the list of pending futures stays small.

        Args:
            file: Path of the VCF file.
            organism: Forwarded to ``store_tabix_VCF_feature``.
            doi: Optional DOI handed to the ``FeatureLoader``.
            cpu: Number of worker threads.
            verbosity: Progress messages are written when greater than 0.
            **options: Remaining command options (unused here).

        Raises:
            CommandError: If the file fails validation, no index is found,
                the loader cannot be created, or storing a row fails with
                an ``ImportingError``.
        """
        # retrieve only the file name
        filename = os.path.basename(file)
        if verbosity > 0:
            self.stdout.write("Processing file: {}".format(filename))

        try:
            FileValidator().validate(file)
        except ImportingError as e:
            raise CommandError(e) from e

        # Prefer the .tbi index and fall back to .csi
        try:
            index_file = "{}.tbi".format(file)
            FileValidator().validate(index_file)
        except ImportingError:
            try:
                index_file = "{}.csi".format(file)
                FileValidator().validate(index_file)
            except ImportingError:
                raise CommandError("No index found (.tbi/.csi)")

        try:
            feature_file = FeatureLoader(
                filename=filename, source="VCF_SOURCE", doi=doi
            )
        except ImportingError as e:
            raise CommandError(e) from e

        def drain(pending):
            """Wait for every pending task, then empty the list."""
            for task in as_completed(pending):
                try:
                    task.result()
                except ImportingError as e:
                    raise CommandError(e) from e
            pending.clear()

        tasks = list()
        chunk_size = cpu * 2

        # Load the VCF file. pysam opens the path itself, so no separate
        # open() is needed; the handle is closed once the pool has stopped.
        tbx = pysam.TabixFile(filename=file, index=index_file)
        try:
            with ThreadPoolExecutor(max_workers=cpu) as pool:
                rows = tbx.fetch(parser=pysam.asVCF())
                for row in tqdm(rows, total=get_num_lines(file)):
                    tasks.append(
                        pool.submit(
                            feature_file.store_tabix_VCF_feature,
                            row,
                            organism,
                        )
                    )
                    if len(tasks) >= chunk_size:
                        drain(tasks)
                # whatever is left from the last, partial chunk
                drain(tasks)
        finally:
            tbx.close()

        if verbosity > 0:
            self.stdout.write(self.style.SUCCESS("Done with {}".format(filename)))
示例#7
0
    def handle(self, file: str, cpu: int = 1, verbosity: int = 0, **options):
        """Load orthologous groups from a file holding one group per line.

        A line starts with ``<group name>:`` followed by whitespace
        separated members written as ``<prefix>|<identifier>``; fields that
        do not match that shape are ignored. Groups with at least two
        members are passed to ``FeatureLoader.store_feature_groups`` on a
        thread pool, using the "orthologous group" cvterm and the
        "polypeptide" soterm.

        Args:
            file: Path of the groups file.
            cpu: Number of worker threads.
            verbosity: Progress messages are written when greater than 0.
            **options: Remaining command options (unused here).

        Raises:
            CommandError: If the file fails validation, cannot be opened,
                or a line does not start with a group name.
        """
        try:
            FileValidator().validate(file)
        except ImportingError as e:
            raise CommandError(e) from e
        # retrieve only the file name
        filename = os.path.basename(file)
        if verbosity > 0:
            self.stdout.write("Processing file: {}".format(filename))
        try:
            groups = open(file, "r")
        except OSError as e:
            # open() raises OSError, never ImportingError
            raise CommandError(e) from e

        # compiled once instead of being evaluated twice per field
        group_pattern = re.compile(r"^(\w+)\:")
        member_pattern = re.compile(r"^(\w+)\|(\S+)")

        # Both the file and the pool are released on every exit path.
        with groups, ThreadPoolExecutor(max_workers=cpu) as pool:
            tasks = list()
            cv, _ = Cv.objects.get_or_create(name="feature_property")
            ortho_db, _ = Db.objects.get_or_create(name="ORTHOMCL_SOURCE")
            ortho_dbxref, _ = Dbxref.objects.get_or_create(
                accession="ORTHOMCL_SOURCE", db=ortho_db)
            cvterm_cluster, _ = Cvterm.objects.get_or_create(
                name="orthologous group",
                cv=cv,
                dbxref=ortho_dbxref,
                is_obsolete=0,
                is_relationshiptype=0,
            )

            # hardcoded as orthomcl uses protein input
            soterm = "polypeptide"

            source = "null"
            featureloader = FeatureLoader(source=source, filename=filename)
            # each line is an orthologous group
            for line in groups:
                fields = re.split(r"\s+", line.strip())
                group_field = group_pattern.match(fields[0])
                if group_field is None:
                    raise CommandError(
                        "Cluster has no identification, check.")
                name = group_field.group(1)
                members = []
                for field in fields[1:]:
                    member_field = member_pattern.match(field)
                    if member_field is not None:
                        members.append(member_field.group(2))
                # only orthologous groups with 2 or more members allowed
                if len(members) > 1:
                    tasks.append(
                        pool.submit(
                            featureloader.store_feature_groups,
                            soterm=soterm,
                            group=members,
                            term=cvterm_cluster.cvterm_id,
                            value=name,
                        ))
            if verbosity > 0:
                self.stdout.write("Loading")
            for task in tqdm(as_completed(tasks), total=len(tasks)):
                # NOTE(review): a truthy result is re-raised, which only
                # works if store_feature_groups returns an exception
                # instance on failure - confirm against the loader.
                result = task.result()
                if result:
                    raise result
        if verbosity > 0:
            self.stdout.write(
                self.style.SUCCESS("Done with {}".format(filename)))
示例#8
0
    def test_store_feature_dbxref(self):
        """Tests - store_feature_dbxref attaches dbxref 'GI:12345' to feat2."""
        plain_term = dict(is_obsolete=0, is_relationshiptype=0)

        # "exact" dbxref under the _global db
        global_db = Db.objects.create(name="_global")
        Dbxref.objects.create(accession="exact", db=global_db)

        # RO term "contained in"
        ro_db = Db.objects.create(name="RO")
        ro_dbxref = Dbxref.objects.create(accession="00002", db=ro_db)
        relationship_cv = Cv.objects.create(name="relationship")
        Cvterm.objects.create(name="contained in",
                              cv=relationship_cv,
                              dbxref=ro_dbxref,
                              **plain_term)

        # SO terms "polypeptide" and "protein_match"
        so_db = Db.objects.create(name="SO")
        polypeptide_dbxref = Dbxref.objects.create(accession="12345",
                                                   db=so_db)
        sequence_cv = Cv.objects.create(name="sequence")
        polypeptide = Cvterm.objects.create(name="polypeptide",
                                            cv=sequence_cv,
                                            dbxref=polypeptide_dbxref,
                                            **plain_term)
        match_dbxref = Dbxref.objects.create(accession="123455", db=so_db)
        Cvterm.objects.create(name="protein_match",
                              cv=sequence_cv,
                              dbxref=match_dbxref,
                              **plain_term)

        mouse = Organism.objects.create(genus="Mus", species="musculus")

        # the feature the dbxref will be attached to
        gff_db = Db.objects.create(name="GFF_SOURCE")
        feature_dbxref = Dbxref.objects.create(accession="feat2", db=gff_db)
        feature = Feature.objects.create(
            organism=mouse,
            uniquename="feat2",
            dbxref=feature_dbxref,
            is_analysis=False,
            type_id=polypeptide.cvterm_id,
            is_obsolete=False,
            timeaccessioned=datetime.now(timezone.utc),
            timelastmodified=datetime.now(timezone.utc),
        )

        loader = FeatureLoader(filename="file.name", source="GFF_loader")

        # store the feature dbxref
        loader.store_feature_dbxref(feature="feat2",
                                    soterm="polypeptide",
                                    dbxref="GI:12345")

        stored = FeatureDbxref.objects.get(feature=feature)
        self.assertEqual("GI", stored.dbxref.db.name)
        self.assertEqual("12345", stored.dbxref.accession)
示例#9
0
    def test_store_feature_annotation(self):
        """Tests - store_feature_annotation for props, GO terms and dbxrefs."""
        plain_term = dict(is_obsolete=0, is_relationshiptype=0)

        # synonym type "exact" under the _global db
        global_db = Db.objects.create(name="_global")
        exact_dbxref = Dbxref.objects.create(accession="exact",
                                             db=global_db)
        synonym_cv = Cv.objects.create(name="synonym_type")
        Cvterm.objects.create(name="exact",
                              cv=synonym_cv,
                              dbxref=exact_dbxref,
                              **plain_term)

        # RO term "contained in"
        ro_db = Db.objects.create(name="RO")
        ro_dbxref = Dbxref.objects.create(accession="00002", db=ro_db)
        relationship_cv = Cv.objects.create(name="relationship")
        Cvterm.objects.create(name="contained in",
                              cv=relationship_cv,
                              dbxref=ro_dbxref,
                              **plain_term)

        # SO terms "polypeptide" and "protein_match"
        so_db = Db.objects.create(name="SO")
        polypeptide_dbxref = Dbxref.objects.create(accession="12345",
                                                   db=so_db)
        sequence_cv = Cv.objects.create(name="sequence")
        polypeptide = Cvterm.objects.create(name="polypeptide",
                                            cv=sequence_cv,
                                            dbxref=polypeptide_dbxref,
                                            **plain_term)
        match_dbxref = Dbxref.objects.create(accession="123455", db=so_db)
        Cvterm.objects.create(name="protein_match",
                              cv=sequence_cv,
                              dbxref=match_dbxref,
                              **plain_term)

        # GO term referenced later as "GO:12345"
        go_db = Db.objects.create(name="GO")
        go_dbxref = Dbxref.objects.create(accession="12345", db=go_db)
        process_cv = Cv.objects.create(name="biological_process")
        Cvterm.objects.create(name="go test term",
                              cv=process_cv,
                              dbxref=go_dbxref,
                              **plain_term)

        mouse = Organism.objects.create(genus="Mus", species="musculus")

        # the feature every annotation below is attached to
        gff_db = Db.objects.create(name="GFF_SOURCE")
        feature_dbxref = Dbxref.objects.create(accession="feat2", db=gff_db)
        feature = Feature.objects.create(
            organism=mouse,
            uniquename="feat2",
            dbxref=feature_dbxref,
            is_analysis=False,
            type_id=polypeptide.cvterm_id,
            is_obsolete=False,
            timeaccessioned=datetime.now(timezone.utc),
            timelastmodified=datetime.now(timezone.utc),
        )

        loader = FeatureLoader(filename="file.name", source="GFF_loader")

        def annotate(cvterm, annotation):
            """Store one annotation on feat2 without a DOI."""
            loader.store_feature_annotation(
                feature="feat2",
                soterm="polypeptide",
                cvterm=cvterm,
                annotation=annotation,
                doi=None,
            )

        # store the feature annotation
        annotate("display", "feature one")
        self.assertEqual("feature one",
                         Featureprop.objects.get(feature=feature).value)

        # replace the feature annotation: still a single Featureprop
        annotate("display", "feature new")
        self.assertEqual("feature new",
                         Featureprop.objects.get(feature=feature).value)

        # store the ontology_term
        annotate("ontology_term", "GO:12345")
        go_term = Cvterm.objects.get(name="go test term")
        self.assertIsNotNone(
            FeatureCvterm.objects.get(feature=feature, cvterm=go_term))

        # store the dbxref
        annotate("dbxref", "GEO:123456")
        geo_db = Db.objects.get(name="GEO")
        geo_dbxref = Dbxref.objects.get(db=geo_db, accession="123456")
        self.assertIsNotNone(
            FeatureDbxref.objects.get(feature=feature, dbxref=geo_dbxref))
示例#10
0
    def test_store_bio_searchio_hit(self):
        """Tests - store a bio searchio hit and check what it is linked to."""

        def register_term(term_name, accession, db, cv):
            """Create a dbxref in ``db`` and a cvterm in ``cv`` using it."""
            xref = Dbxref.objects.create(accession=accession, db=db)
            return Cvterm.objects.create(
                name=term_name,
                cv=cv,
                dbxref=xref,
                is_obsolete=0,
                is_relationshiptype=0,
            )

        # RO term: contained in
        ro_db = Db.objects.create(name="RO")
        relationship_cv = Cv.objects.create(name="relationship")
        register_term("contained in", "00002", ro_db, relationship_cv)

        # SO terms: protein_match and polypeptide
        sequence_cv = Cv.objects.create(name="sequence")
        so_db = Db.objects.create(name="SO")
        so_terms = (
            ("00001", "protein_match"),
            ("00002", "polypeptide"),
        )
        for so_accession, so_name in so_terms:
            register_term(so_name, so_accession, so_db, sequence_cv)

        # GO term referenced by the hit's dbxrefs
        go_db = Db.objects.create(name="GO")
        go_cv = Cv.objects.create(name="biological_process")
        register_term("GO:1234", "1234", go_db, go_cv)

        # the hit handed to the loader
        hit = Hit()
        hit.id = "PF1234"
        hit.accession = "PFAM mock domain"
        hit.attributes["Target"] = "PFAM"
        hit.dbxrefs = ["GO:1234", "IPR:IPR012345", "Reactome:R-HSA-12345"]

        Organism.objects.create(genus="test", species="organism")

        # store the hit, using the target name given for interproscan input
        loader = FeatureLoader(filename="file.name",
                               source="InterproScan_source")
        loader.store_bio_searchio_hit(hit, "InterPro")

        # the hit became a feature named after its accession
        stored_feature = Feature.objects.get(uniquename="PF1234")
        self.assertEqual("PFAM mock domain", stored_feature.name)

        # the IPR cross reference is attached to that feature
        ipr_dbxref = Dbxref.objects.get(accession="IPR012345")
        feature_dbxref = FeatureDbxref.objects.get(feature=stored_feature,
                                                   dbxref=ipr_dbxref)
        self.assertEqual(True, feature_dbxref.is_current)

        # the GO term is attached to that feature
        go_term = Cvterm.objects.get(name="GO:1234")
        feature_cvterm = FeatureCvterm.objects.get(feature=stored_feature,
                                                   cvterm=go_term)
        self.assertEqual(0, feature_cvterm.rank)
示例#11
0
    def test_store_tabix_GFF_feature(self):
        """Tests - store tabix GFF rows, then the relationships collected."""

        def register_term(term_name, accession, db, cv):
            """Create a dbxref in ``db`` and a cvterm in ``cv`` using it."""
            xref = Dbxref.objects.create(accession=accession, db=db)
            return Cvterm.objects.create(
                name=term_name,
                cv=cv,
                dbxref=xref,
                is_obsolete=0,
                is_relationshiptype=0,
            )

        global_db = Db.objects.create(name="_global")
        # exact term (synonym_type cv)
        synonym_cv = Cv.objects.create(name="synonym_type")
        register_term("exact", "exact", global_db, synonym_cv)
        # part_of term (sequence cv, _global db)
        sequence_cv = Cv.objects.create(name="sequence")
        register_term("part_of", "part_of", global_db, sequence_cv)

        # SO terms, all in the sequence cv
        so_db = Db.objects.create(name="SO")
        so_specs = (
            ("00001", "assembly"),
            ("00002", "gene"),
            ("00003", "exon"),
            ("00004", "polypeptide"),
            ("00005", "protein_match"),
        )
        so_terms = {}
        for so_accession, so_name in so_specs:
            so_terms[so_name] = register_term(so_name, so_accession, so_db,
                                              sequence_cv)
        assembly_term = so_terms["assembly"]

        # RO term: contained in
        ro_db = Db.objects.create(name="RO")
        relationship_cv = Cv.objects.create(name="relationship")
        register_term("contained in", "00002", ro_db, relationship_cv)

        # organism owning the source feature
        mouse = Organism.objects.create(genus="Mus", species="musculus")
        # source feature (contig) the rows are located on
        fasta_db = Db.objects.create(name="FASTA_SOURCE")
        contig_dbxref = Dbxref.objects.create(accession="contig1",
                                              db=fasta_db)
        contig = Feature.objects.create(
            dbxref=contig_dbxref,
            organism=mouse,
            name="contig1",
            type=assembly_term,
            uniquename="contig1",
            is_analysis=False,
            is_obsolete=False,
            timeaccessioned=datetime.now(timezone.utc),
            timelastmodified=datetime.now(timezone.utc),
        )

        # DOI: store a publication and check the accession of its dbxref
        bib_db = BibDatabase()
        bib_db.entries = [{
            "journal": "Nice Journal",
            "comments": "A comment",
            "pages": "12--23",
            "month": "jan",
            "abstract": "This is an abstract. This line should be "
            "long enough to test multilines...",
            "title": "An amazing title",
            "year": "2013",
            "doi": "10.1186/s12864-016-2535-300002",
            "volume": "12",
            "ID": "Teste2018",
            "author": "Foo, b. and Foo1, b. and Foo b.",
            "keyword": "keyword1, keyword2",
            "ENTRYTYPE": "article",
        }]
        for bib_entry in bib_db.entries:
            PublicationLoader().store_bibtex_entry(bib_entry)
        publication = Pub.objects.get(uniquename="Teste2018")
        publication_dbxref = PubDbxref.objects.get(pub=publication)
        doi_dbxref = Dbxref.objects.get(
            dbxref_id=publication_dbxref.dbxref_id)
        self.assertEqual("10.1186/s12864-016-2535-300002",
                         doi_dbxref.accession)

        # DOI: link the contig to the publication and read the link back
        feature_pub = None
        if contig and publication_dbxref:
            feature_pub = FeaturePub.objects.create(
                feature_id=contig.feature_id,
                pub_id=publication_dbxref.pub_id)
        linked_pub = Pub.objects.get(pub_id=feature_pub.pub_id)
        self.assertEqual("An amazing title", linked_pub.title)
        linked_pubdbxref = PubDbxref.objects.get(pub=linked_pub)
        linked_dbxref = Dbxref.objects.get(
            dbxref_id=linked_pubdbxref.dbxref_id)
        self.assertEqual("10.1186/s12864-016-2535-300002",
                         linked_dbxref.accession)

        class TabixFeature(object):
            """mock tabix feature."""

        def make_row(**fields):
            """Return a mock row carrying ``fields`` as attributes."""
            row = TabixFeature()
            for attr_name, attr_value in fields.items():
                setattr(row, attr_name, attr_value)
            return row

        gene_row = make_row(
            contig="contig1",
            feature="gene",
            start="10",
            end="100",
            strand="+",
            frame="1",
            attributes="id=id1;name=name1",
        )
        exon_row = make_row(
            contig="contig1",
            feature="exon",
            start="10",
            end="100",
            strand="-",
            frame="2",
            attributes="id=id2;name=name2;parent=id1",
        )

        loader = FeatureLoader(filename="file.name", source="GFF_source")

        organism = "Mus musculus"
        qtl = False
        # store both rows, gene first
        for row in (gene_row, exon_row):
            loader.store_tabix_GFF_feature(row, organism, qtl)

        # store the relationships the loader gathered while storing rows
        for link in loader.relationships:
            loader.store_relationship(organism, link["subject_id"],
                                      link["object_id"])

        stored_feature = Feature.objects.get(uniquename="id2")
        stored_loc = Featureloc.objects.get(feature=stored_feature)
        relationship = FeatureRelationship.objects.get(
            object=stored_feature.feature_id)
        subject_feature = Feature.objects.get(
            feature_id=relationship.subject.feature_id)
        self.assertEqual("name2", stored_feature.name)
        self.assertEqual(10, stored_loc.fmin)
        self.assertEqual("id1", subject_feature.uniquename)
示例#12
0
    def test_store_tabix_VCF_feature(self):
        """Tests - store tabix VCF feature / store relationships.

        Builds the ontology terms, organism, source contig and publication
        used as fixtures, stores two mock VCF rows through
        ``FeatureLoader.store_tabix_VCF_feature`` and checks the feature
        locations recorded for the second row (``id2``).
        """

        def register_term(term_name, accession, db, cv):
            """Create a dbxref in ``db`` and a cvterm in ``cv`` using it."""
            xref = Dbxref.objects.create(accession=accession, db=db)
            return Cvterm.objects.create(
                name=term_name,
                cv=cv,
                dbxref=xref,
                is_obsolete=0,
                is_relationshiptype=0,
            )

        test_db_global = Db.objects.create(name="_global")
        # creating exact term
        synonym_cv = Cv.objects.create(name="synonym_type")
        register_term("exact", "exact", test_db_global, synonym_cv)
        # creating part_of term
        sequence_cv = Cv.objects.create(name="sequence")
        register_term("part_of", "part_of", test_db_global, sequence_cv)

        # create SO terms: assembly, snv, snp, polypeptide, protein_match
        # and quality_value (all in the sequence cv)
        so_db = Db.objects.create(name="SO")
        test_cvterm_assembly = register_term("assembly", "00001", so_db,
                                             sequence_cv)
        other_so_terms = (
            ("00002", "snv"),
            ("00003", "snp"),
            ("00004", "polypeptide"),
            ("00005", "protein_match"),
            ("00006", "quality_value"),
        )
        for so_accession, so_name in other_so_terms:
            register_term(so_name, so_accession, so_db, sequence_cv)

        # create RO term: contained in
        ro_db = Db.objects.create(name="RO")
        relationship_cv = Cv.objects.create(name="relationship")
        register_term("contained in", "00002", ro_db, relationship_cv)

        # create an organism
        test_organism = Organism.objects.create(genus="Mus",
                                                species="musculus")
        # create a srcfeature
        fasta_db = Db.objects.create(name="FASTA_SOURCE")
        contig_dbxref = Dbxref.objects.create(accession="contig1",
                                              db=fasta_db)
        feature = Feature.objects.create(
            dbxref=contig_dbxref,
            organism=test_organism,
            name="contig1",
            type=test_cvterm_assembly,
            uniquename="contig1",
            is_analysis=False,
            is_obsolete=False,
            timeaccessioned=datetime.now(timezone.utc),
            timelastmodified=datetime.now(timezone.utc),
        )

        # DOI TESTING
        db2 = BibDatabase()
        db2.entries = [{
            "journal": "Nice Journal",
            "comments": "A comment",
            "pages": "12--23",
            "month": "jan",
            "abstract": "This is an abstract. This line should be "
            "long enough to test multilines...",
            "title": "An amazing title",
            "year": "2013",
            "doi": "10.1186/s12864-016-2535-300002",
            "volume": "12",
            "ID": "Teste2018",
            "author": "Foo, b. and Foo1, b. and Foo b.",
            "keyword": "keyword1, keyword2",
            "ENTRYTYPE": "article",
        }]
        for entry in db2.entries:
            PublicationLoader().store_bibtex_entry(entry)
        test_bibtex3 = Pub.objects.get(uniquename="Teste2018")
        test_bibtex3_pubdbxref = PubDbxref.objects.get(pub=test_bibtex3)
        test_bibtex3_dbxref = Dbxref.objects.get(
            dbxref_id=test_bibtex3_pubdbxref.dbxref_id)
        self.assertEqual("10.1186/s12864-016-2535-300002",
                         test_bibtex3_dbxref.accession)
        # DOI: try to link feature to publication's DOI
        featurepub_test = None
        if feature and test_bibtex3_pubdbxref:
            featurepub_test = FeaturePub.objects.create(
                feature_id=feature.feature_id,
                pub_id=test_bibtex3_pubdbxref.pub_id)
        test_pub = Pub.objects.get(pub_id=featurepub_test.pub_id)
        self.assertEqual("An amazing title", test_pub.title)
        test_pubdbxref = PubDbxref.objects.get(pub=test_pub)
        test_dbxref = Dbxref.objects.get(dbxref_id=test_pubdbxref.dbxref_id)
        self.assertEqual("10.1186/s12864-016-2535-300002",
                         test_dbxref.accession)

        # create a tabix feature
        class TabixFeature(object):
            """mock tabix feature."""

        def make_row(**fields):
            """Return a mock row carrying ``fields`` as attributes."""
            row = TabixFeature()
            for attr_name, attr_value in fields.items():
                setattr(row, attr_name, attr_value)
            return row

        test_tabix_feature1 = make_row(
            contig="contig1",
            feature="snp",
            pos=10,
            id="id1",
            ref="A",
            alt="T,C",
            info="TSA=snv",
            qual=10,
        )
        test_tabix_feature2 = make_row(
            contig="contig1",
            feature="snv",
            pos=100,
            id="id2",
            ref="G",
            alt="C,A",
            info="VC=snp;SAO=0",
            qual=20,
        )

        # instantiate the loader
        test_feature_file = FeatureLoader(filename="file.name",
                                          source="VCF_SOURCE")

        organism = "Mus musculus"
        # store the tabix feature
        test_feature_file.store_tabix_VCF_feature(test_tabix_feature1,
                                                  organism)
        test_feature_file.store_tabix_VCF_feature(test_tabix_feature2,
                                                  organism)

        test_feature = Feature.objects.get(uniquename="id2")
        # The assertions below expect rank 0, 1, 2 at index 0, 1, 2. A
        # queryset without order_by() gives no guarantee on row order, so
        # sort by rank explicitly and fetch the rows once instead of running
        # one query per index.
        test_featurelocs = list(
            Featureloc.objects.filter(feature=test_feature).order_by("rank"))
        self.assertEqual(100, test_featurelocs[0].fmin)
        self.assertEqual("G", test_featurelocs[0].residue_info)
        self.assertEqual("C", test_featurelocs[1].residue_info)
        self.assertEqual("A", test_featurelocs[2].residue_info)
        self.assertEqual(0, test_featurelocs[0].rank)
        self.assertEqual(1, test_featurelocs[1].rank)
        self.assertEqual(2, test_featurelocs[2].rank)
        self.assertEqual("contig1", test_featurelocs[0].srcfeature.uniquename)
示例#13
0
    def handle(self,
               file: str,
               organism: str,
               doi: str = None,
               ignore: str = None,
               qtl: bool = False,
               cpu: int = 1,
               verbosity: int = 1,
               **options):
        """Load a tabix-indexed GFF file, then the relationships it declares.

        Args:
            file: path of the data file; an index is expected next to it as
                ``<file>.tbi`` or ``<file>.csi``.
            organism: passed through to the loader for every row and
                relationship.
            doi: passed to the ``FeatureLoader`` constructor.
            ignore: rows whose ``feature`` value is ``in`` this object are
                skipped; ``None`` keeps every row.
            qtl: passed through to ``store_tabix_GFF_feature``.
            cpu: number of worker threads; rows are submitted in batches of
                ``cpu * 2`` tasks.
            verbosity: progress messages are written when greater than 0.

        Raises:
            CommandError: when the file or its index fails validation, when
                the loader cannot be created, or when a submitted task
                raises ``ImportingError``.
        """
        # retrieve only the file name
        filename = os.path.basename(file)
        if verbosity > 0:
            self.stdout.write("Processing file: {}".format(filename))

        try:
            FileValidator().validate(file)
        except ImportingError as e:
            raise CommandError(e) from e

        # the index may be either <file>.tbi or <file>.csi; try them in turn
        try:
            index_file = "{}.tbi".format(file)
            FileValidator().validate(index_file)
        except ImportingError:
            try:
                index_file = "{}.csi".format(file)
                FileValidator().validate(index_file)
            except ImportingError:
                raise CommandError("No index found (.tbi/.csi)")

        try:
            feature_file = FeatureLoader(filename=filename,
                                         source="GFF_SOURCE",
                                         doi=doi)
        except ImportingError as e:
            raise CommandError(e) from e

        def drain(pending):
            """Wait for every pending task, then empty the list."""
            for task in as_completed(pending):
                try:
                    task.result()
                except ImportingError as e:
                    raise CommandError(e) from e
            pending.clear()

        # bound the number of in-flight tasks so the whole file is never
        # queued at once
        chunk_size = cpu * 2
        tasks = list()

        # Load the GFF3 file. The path is handed to pysam directly: the
        # previous text-mode open() was only used to read the name back.
        tbx = pysam.TabixFile(filename=file, index=index_file)
        try:
            # the context manager shuts the pool down even if a task fails
            with ThreadPoolExecutor(max_workers=cpu) as pool:
                for row in tqdm(tbx.fetch(parser=pysam.asGTF()),
                                total=get_num_lines(file)):
                    if ignore is not None and row.feature in ignore:
                        continue
                    tasks.append(
                        pool.submit(feature_file.store_tabix_GFF_feature,
                                    row, organism, qtl))
                    if len(tasks) >= chunk_size:
                        drain(tasks)
                # last, partially filled batch
                drain(tasks)
        finally:
            # release the tabix handle on success and on failure
            tbx.close()

        if verbosity > 0:
            self.stdout.write("Loading relationships")

        with ThreadPoolExecutor(max_workers=cpu) as pool:
            tasks = [
                pool.submit(
                    feature_file.store_relationship,
                    organism,
                    item["subject_id"],
                    item["object_id"],
                ) for item in feature_file.relationships
            ]
            for task in tqdm(as_completed(tasks), total=len(tasks)):
                try:
                    task.result()
                except ImportingError as e:
                    raise CommandError(e) from e

        # written regardless of verbosity, as before
        if feature_file.ignored_attrs is not None:
            self.stdout.write(
                self.style.WARNING("Ignored attrs: {}".format(
                    feature_file.ignored_attrs)))

        if verbosity > 0:
            self.stdout.write(
                self.style.SUCCESS("Done with {}".format(filename)))
    def handle(
        self,
        file: str,
        cpu: int = 1,
        soterm: str = "mRNA",
        verbosity: int = 0,
        **options
    ):
        """Load feature pairs and their values from a whitespace-separated file.

        Every line must validate as three fields; the last one is parsed as
        a float and the remaining two are passed as the pair to
        ``FeatureLoader.store_feature_pairs`` together with the
        "correlated with" relationship term.

        Args:
            file: path of the pairs file.
            cpu: number of worker threads; lines are submitted in batches of
                ``cpu * 5`` tasks.
            soterm: passed through to ``store_feature_pairs``.
            verbosity: progress messages are written when greater than 0.

        Raises:
            CommandError: when the file fails validation or cannot be
                opened, or when a line does not validate as three fields.
        """
        filename = os.path.basename(file)
        if verbosity > 0:
            self.stdout.write("Processing file: {}".format(filename))

        try:
            FileValidator().validate(file)
        except ImportingError as e:
            raise CommandError(e) from e

        cvterm_corel = Cvterm.objects.get(
            name="correlated with", cv__name="relationship"
        ).cvterm_id
        # feature source is not needed here
        source = "null"
        featureloader = FeatureLoader(source=source, filename=filename)
        size = get_num_lines(file)
        # every cpu should be able to handle 5 tasks
        chunk = cpu * 5
        nfields = 3

        def drain(pending):
            """Wait for every pending task, then empty the list."""
            for task in as_completed(pending):
                # a truthy result is raised as-is, so store_feature_pairs
                # presumably returns an exception object on failure
                # NOTE(review): confirm against the loader
                failure = task.result()
                if failure:
                    raise failure
            pending.clear()

        # open() reports problems as OSError, not ImportingError
        try:
            pairs = open(file, "r")
        except OSError as e:
            raise CommandError(e) from e

        # both context managers guarantee the file is closed and the pool is
        # shut down, including when a line or a task fails
        with pairs, ThreadPoolExecutor(max_workers=cpu) as pool:
            tasks = list()
            for line in tqdm(pairs, total=size):
                fields = re.split(r"\s+", line.rstrip())
                try:
                    FieldsValidator().validate(nfields, fields)
                except ImportingError as e:
                    raise CommandError(e) from e
                # get corrected PCC value (last item from fields list)
                # NOTE(review): the 0.7 offset is undocumented here - confirm
                value = float(fields.pop()) + 0.7
                tasks.append(
                    pool.submit(
                        featureloader.store_feature_pairs,
                        pair=fields,
                        soterm=soterm,
                        term=cvterm_corel,
                        value=value,
                    )
                )
                if len(tasks) >= chunk:
                    drain(tasks)
            # last, partially filled batch
            drain(tasks)

        if verbosity > 0:
            self.stdout.write(self.style.SUCCESS("Done with {}".format(filename)))
示例#15
0
    def test_orthology(self):
        """Tests - __init__."""
        # register multispecies organism
        so_db = Db.objects.create(name="SO")
        so_cv = Cv.objects.create(name="sequence")
        # creating test RO term
        ro_db = Db.objects.create(name="RO")
        ro_cv = Cv.objects.create(name="relationship")
        fo_db = Db.objects.create(name="ORTHOMCL_SOURCE")
        fo_cv = Cv.objects.create(name="feature_property")

        # test_dbxref = Dbxref.objects.create(accession='123456', db=test_db)
        so_dbxref = Dbxref.objects.create(accession="357", db=so_db)
        so_dbxref2 = Dbxref.objects.create(accession="358", db=so_db)
        ro_dbxref = Dbxref.objects.create(accession="658", db=ro_db)
        # creating test SO term
        Cvterm.objects.create(
            name="contained in",
            cv=ro_cv,
            dbxref=ro_dbxref,
            is_obsolete=0,
            is_relationshiptype=1,
        )

        ortho_dbxref = Dbxref.objects.create(accession="ORTHOMCL_SOURCE",
                                             db=fo_db)
        term = Cvterm.objects.create(
            name="orthologous group",
            cv=fo_cv,
            dbxref=ortho_dbxref,
            is_obsolete=0,
            is_relationshiptype=0,
        )
        poly_cvterm = Cvterm.objects.create(
            name="polypeptide",
            cv=so_cv,
            dbxref=so_dbxref,
            is_obsolete=0,
            is_relationshiptype=0,
        )
        Cvterm.objects.create(
            name="protein_match",
            cv=so_cv,
            dbxref=so_dbxref2,
            is_obsolete=0,
            is_relationshiptype=0,
        )
        db_null = Db.objects.create(name="null")
        null_dbxref = Dbxref.objects.create(db=db_null, accession="null")
        null_cv = Cv.objects.create(name="null")
        Cvterm.objects.create(
            cv=null_cv,
            name="null",
            definition="",
            dbxref=null_dbxref,
            is_obsolete=0,
            is_relationshiptype=0,
        )
        # need to insert organisms first
        organism1 = Organism.objects.create(species="coerulea",
                                            genus="Aquilegia",
                                            abbreviation="Aco")
        organism2 = Organism.objects.create(species="distachyon",
                                            genus="Brachypodium",
                                            abbreviation="Brd")
        organism3 = Organism.objects.create(species="clementina",
                                            genus="Citrus",
                                            abbreviation="Ccl")
        organism4 = Organism.objects.create(species="carota",
                                            genus="Dacus",
                                            abbreviation="Dca")
        organism5 = Organism.objects.create(species="grandis",
                                            genus="Eucalyptus",
                                            abbreviation="Egr")
        organism6 = Organism.objects.create(species="vesca",
                                            genus="Fragaria",
                                            abbreviation="Fve")
        organism7 = Organism.objects.create(species="max",
                                            genus="Glycine",
                                            abbreviation="Gma")
        organism8 = Organism.objects.create(species="fedtschenkoi",
                                            genus="Kalanchoe",
                                            abbreviation="Kld")
        self.assertTrue(Organism.objects.filter(abbreviation="Aco").exists())
        self.assertTrue(Organism.objects.filter(abbreviation="Brd").exists())
        self.assertTrue(Organism.objects.filter(abbreviation="Ccl").exists())
        self.assertTrue(Organism.objects.filter(abbreviation="Dca").exists())
        self.assertTrue(Organism.objects.filter(abbreviation="Egr").exists())
        self.assertTrue(Organism.objects.filter(abbreviation="Fve").exists())
        self.assertTrue(Organism.objects.filter(abbreviation="Gma").exists())
        self.assertTrue(Organism.objects.filter(abbreviation="Kld").exists())

        # also need to insert Features from fasta file first.
        # inserting: Aqcoe0131s0001.1.v3.1
        db = Db.objects.create(name="FASTA_SOURCE")
        acc1 = "Aqcoe0131s0001.1.v3.1"
        dbxref1 = Dbxref.objects.create(db=db, accession=acc1)
        feature1 = Feature.objects.create(
            dbxref=dbxref1,
            organism=organism1,
            uniquename="Aqcoe0131s0001.1.v3.1",
            type=poly_cvterm,
            is_analysis=False,
            is_obsolete=False,
            timeaccessioned=datetime.now(timezone.utc),
            timelastmodified=datetime.now(timezone.utc),
        )
        self.assertTrue(
            Feature.objects.filter(
                type__cv__name="sequence",
                type__name="polypeptide",
                dbxref__accession=acc1,
                dbxref__db__name="FASTA_SOURCE",
            ).exists())
        # inserting: Bradi0180s00100.1.v3.1; Bradi2g20400.1.v3.1
        acc2 = "Bradi0180s00100.1.v3.1"
        dbxref2 = Dbxref.objects.create(db=db, accession=acc2)
        Feature.objects.create(
            dbxref=dbxref2,
            organism=organism2,
            uniquename="Bradi0180s00100.1.v3.1",
            type=poly_cvterm,
            is_analysis=False,
            is_obsolete=False,
            timeaccessioned=datetime.now(timezone.utc),
            timelastmodified=datetime.now(timezone.utc),
        )
        self.assertTrue(
            Feature.objects.filter(
                type__cv__name="sequence",
                type__name="polypeptide",
                dbxref__accession=acc2,
                dbxref__db__name__in=["GFF_SOURCE", "FASTA_SOURCE"],
            ).exists())
        acc3 = "Bradi2g20400.1.v3.1"
        dbxref3 = Dbxref.objects.create(db=db, accession=acc3)
        Feature.objects.create(
            dbxref=dbxref3,
            organism=organism2,
            uniquename="Bradi2g20400.1.v3.1",
            type=poly_cvterm,
            is_analysis=False,
            is_obsolete=False,
            timeaccessioned=datetime.now(timezone.utc),
            timelastmodified=datetime.now(timezone.utc),
        )
        self.assertTrue(
            Feature.objects.filter(
                type__cv__name="sequence",
                type__name="polypeptide",
                dbxref__accession=acc3,
                dbxref__db__name__in=["GFF_SOURCE", "FASTA_SOURCE"],
            ).exists())
        # inserting: Ciclev10013963m.v1.0; Ciclev10013962m.v1.0;
        # Ciclev10013970m.v1.0
        acc4 = "Ciclev10013963m.v1.0"
        dbxref4 = Dbxref.objects.create(db=db, accession=acc4)
        feature4 = Feature.objects.create(
            dbxref=dbxref4,
            organism=organism3,
            uniquename="Ciclev10013963m.v1.0",
            type=poly_cvterm,
            is_analysis=False,
            is_obsolete=False,
            timeaccessioned=datetime.now(timezone.utc),
            timelastmodified=datetime.now(timezone.utc),
        )
        self.assertTrue(
            Feature.objects.filter(
                type__cv__name="sequence",
                type__name="polypeptide",
                dbxref__accession=acc4,
                dbxref__db__name__in=["GFF_SOURCE", "FASTA_SOURCE"],
            ).exists())
        acc5 = "Ciclev10013962m.v1.0"
        dbxref5 = Dbxref.objects.create(db=db, accession=acc5)
        Feature.objects.create(
            dbxref=dbxref5,
            organism=organism3,
            uniquename="Ciclev10013962m.v1.0",
            type=poly_cvterm,
            is_analysis=False,
            is_obsolete=False,
            timeaccessioned=datetime.now(timezone.utc),
            timelastmodified=datetime.now(timezone.utc),
        )
        self.assertTrue(
            Feature.objects.filter(
                type__cv__name="sequence",
                type__name="polypeptide",
                dbxref__accession=acc5,
                dbxref__db__name__in=["GFF_SOURCE", "FASTA_SOURCE"],
            ).exists())
        acc6 = "Ciclev10013970m.v1.0"
        dbxref6 = Dbxref.objects.create(db=db, accession=acc6)
        Feature.objects.create(
            dbxref=dbxref6,
            organism=organism3,
            uniquename="Ciclev10013970m.v1.0",
            type=poly_cvterm,
            is_analysis=False,
            is_obsolete=False,
            timeaccessioned=datetime.now(timezone.utc),
            timelastmodified=datetime.now(timezone.utc),
        )
        self.assertTrue(
            Feature.objects.filter(
                type__cv__name="sequence",
                type__name="polypeptide",
                dbxref__accession=acc6,
                dbxref__db__name__in=["GFF_SOURCE", "FASTA_SOURCE"],
            ).exists())
        # inserting: DCAR_032182.v1.0.388; DCAR_031986.v1.0.388;
        # DCAR_032223.v1.0.388; DCAR_000323.v1.0.388
        acc7 = "DCAR_032182.v1.0.388"
        dbxref7 = Dbxref.objects.create(db=db, accession=acc7)
        Feature.objects.create(
            dbxref=dbxref7,
            organism=organism4,
            uniquename="DCAR_032182.v1.0.388",
            type=poly_cvterm,
            is_analysis=False,
            is_obsolete=False,
            timeaccessioned=datetime.now(timezone.utc),
            timelastmodified=datetime.now(timezone.utc),
        )
        self.assertTrue(
            Feature.objects.filter(
                type__cv__name="sequence",
                type__name="polypeptide",
                dbxref__accession=acc7,
                dbxref__db__name__in=["GFF_SOURCE", "FASTA_SOURCE"],
            ).exists())
        acc8 = "DCAR_031986.v1.0.388"
        dbxref8 = Dbxref.objects.create(db=db, accession=acc8)
        Feature.objects.create(
            dbxref=dbxref8,
            organism=organism4,
            uniquename="DCAR_031986.v1.0.388",
            type=poly_cvterm,
            is_analysis=False,
            is_obsolete=False,
            timeaccessioned=datetime.now(timezone.utc),
            timelastmodified=datetime.now(timezone.utc),
        )
        self.assertTrue(
            Feature.objects.filter(
                type__cv__name="sequence",
                type__name="polypeptide",
                dbxref__accession=acc8,
                dbxref__db__name__in=["GFF_SOURCE", "FASTA_SOURCE"],
            ).exists())
        acc9 = "DCAR_032223.v1.0.388"
        dbxref9 = Dbxref.objects.create(db=db, accession=acc9)
        feature9 = Feature.objects.create(
            dbxref=dbxref9,
            organism=organism4,
            uniquename="DCAR_032223.v1.0.388",
            type=poly_cvterm,
            is_analysis=False,
            is_obsolete=False,
            timeaccessioned=datetime.now(timezone.utc),
            timelastmodified=datetime.now(timezone.utc),
        )
        self.assertTrue(
            Feature.objects.filter(
                type__cv__name="sequence",
                type__name="polypeptide",
                dbxref__accession=acc9,
                dbxref__db__name__in=["GFF_SOURCE", "FASTA_SOURCE"],
            ).exists())
        acc10 = "DCAR_000323.v1.0.388"
        dbxref10 = Dbxref.objects.create(db=db, accession=acc10)
        feature10 = Feature.objects.create(
            dbxref=dbxref10,
            organism=organism4,
            uniquename="DCAR_000323.v1.0.388",
            type=poly_cvterm,
            is_analysis=False,
            is_obsolete=False,
            timeaccessioned=datetime.now(timezone.utc),
            timelastmodified=datetime.now(timezone.utc),
        )
        self.assertTrue(
            Feature.objects.filter(
                type__cv__name="sequence",
                type__name="polypeptide",
                dbxref__accession=acc10,
                dbxref__db__name__in=["GFF_SOURCE", "FASTA_SOURCE"],
            ).exists())
        # inserting: Eucgr.L02820.1.v2.0
        acc11 = "Eucgr.L02820.1.v2.0"
        dbxref11 = Dbxref.objects.create(db=db, accession=acc11)
        Feature.objects.create(
            dbxref=dbxref11,
            organism=organism5,
            uniquename="Eucgr.L02820.1.v2.0",
            type=poly_cvterm,
            is_analysis=False,
            is_obsolete=False,
            timeaccessioned=datetime.now(timezone.utc),
            timelastmodified=datetime.now(timezone.utc),
        )
        self.assertTrue(
            Feature.objects.filter(
                type__cv__name="sequence",
                type__name="polypeptide",
                dbxref__accession=acc11,
                dbxref__db__name__in=["GFF_SOURCE", "FASTA_SOURCE"],
            ).exists())
        # inserting: mrna13067.1-v1.0-hybrid.v1.1
        acc12 = "mrna13067.1-v1.0-hybrid.v1.1"
        dbxref12 = Dbxref.objects.create(db=db, accession=acc12)
        Feature.objects.create(
            dbxref=dbxref12,
            organism=organism6,
            uniquename="mrna13067.1-v1.0-hybrid.v1.1",
            type=poly_cvterm,
            is_analysis=False,
            is_obsolete=False,
            timeaccessioned=datetime.now(timezone.utc),
            timelastmodified=datetime.now(timezone.utc),
        )
        self.assertTrue(
            Feature.objects.filter(
                type__cv__name="sequence",
                type__name="polypeptide",
                dbxref__accession=acc12,
                dbxref__db__name__in=["GFF_SOURCE", "FASTA_SOURCE"],
            ).exists())
        # inserting: Glyma.10G030500.1.Wm82.a2.v1; Glyma.10G053100.1.Wm82.a2.v1
        acc13 = "Glyma.10G030500.1.Wm82.a2.v1"
        dbxref13 = Dbxref.objects.create(db=db, accession=acc13)
        Feature.objects.create(
            dbxref=dbxref13,
            organism=organism7,
            uniquename="Glyma.10G030500.1.Wm82.a2.v1",
            type_id=poly_cvterm.cvterm_id,
            is_analysis=False,
            is_obsolete=False,
            timeaccessioned=datetime.now(timezone.utc),
            timelastmodified=datetime.now(timezone.utc),
        )
        self.assertTrue(
            Feature.objects.filter(
                type__cv__name="sequence",
                type__name="polypeptide",
                dbxref__accession=acc13,
                dbxref__db__name__in=["GFF_SOURCE", "FASTA_SOURCE"],
            ).exists())
        acc14 = "Glyma.10G053100.1.Wm82.a2.v1"
        dbxref14 = Dbxref.objects.create(db=db, accession=acc14)
        feature14 = Feature.objects.create(
            dbxref=dbxref14,
            organism=organism7,
            uniquename="Glyma.10G053100.1.Wm82.a2.v1",
            type_id=poly_cvterm.cvterm_id,
            is_analysis=False,
            is_obsolete=False,
            timeaccessioned=datetime.now(timezone.utc),
            timelastmodified=datetime.now(timezone.utc),
        )
        self.assertTrue(
            Feature.objects.filter(
                type__cv__name="sequence",
                type__name="polypeptide",
                dbxref__accession=acc14,
                dbxref__db__name__in=["GFF_SOURCE", "FASTA_SOURCE"],
            ).exists())
        acc15 = "Glyma.10G008400.1.Wm82.a2.v1"
        dbxref15 = Dbxref.objects.create(db=db, accession=acc15)
        Feature.objects.create(
            dbxref=dbxref15,
            organism=organism7,
            uniquename="Glyma.10G008400.1.Wm82.a2.v1",
            type_id=poly_cvterm.cvterm_id,
            is_analysis=False,
            is_obsolete=False,
            timeaccessioned=datetime.now(timezone.utc),
            timelastmodified=datetime.now(timezone.utc),
        )
        self.assertTrue(
            Feature.objects.filter(
                type__cv__name="sequence",
                type__name="polypeptide",
                dbxref__accession=acc15,
                dbxref__db__name__in=["GFF_SOURCE", "FASTA_SOURCE"],
            ).exists())
        # inserting: Kaladp0598s0001.1.v1.1
        acc16 = "Kaladp0598s0001.1.v1.1"
        dbxref16 = Dbxref.objects.create(db=db, accession=acc16)
        feature16 = Feature.objects.create(
            dbxref=dbxref16,
            organism=organism8,
            uniquename="Kaladp0598s0001.1.v1.1",
            type_id=poly_cvterm.cvterm_id,
            is_analysis=False,
            is_obsolete=False,
            timeaccessioned=datetime.now(timezone.utc),
            timelastmodified=datetime.now(timezone.utc),
        )
        self.assertTrue(
            Feature.objects.filter(
                type__cv__name="sequence",
                type__name="polypeptide",
                dbxref__accession=acc16,
                dbxref__db__name__in=["GFF_SOURCE", "FASTA_SOURCE"],
            ).exists())
        acc17 = "Kaladp0598s0002.1.v1.1"
        dbxref17 = Dbxref.objects.create(db=db, accession=acc17)
        feature17 = Feature.objects.create(
            dbxref=dbxref17,
            organism=organism8,
            uniquename="Kaladp0598s0002.1.v1.1",
            type_id=poly_cvterm.cvterm_id,
            is_analysis=False,
            is_obsolete=False,
            timeaccessioned=datetime.now(timezone.utc),
            timelastmodified=datetime.now(timezone.utc),
        )
        self.assertTrue(
            Feature.objects.filter(
                type__cv__name="sequence",
                type__name="polypeptide",
                dbxref__accession=acc17,
                dbxref__db__name__in=["GFF_SOURCE", "FASTA_SOURCE"],
            ).exists())
        # ########################
        # store feature groups:
        filename = "groups.txt"
        organism, created = Organism.objects.get_or_create(
            abbreviation="multispecies",
            genus="multispecies",
            species="multispecies",
            common_name="multispecies",
        )
        source = "null"
        soterm = "polypeptide"
        test_orthology_loader = FeatureLoader(source=source, filename=filename)
        # ####################
        # test store groups
        group1_name = "machado0001"
        members1 = [
            "Aqcoe0131s0001.1.v3.1",
            "Bradi0180s00100.1.v3.1",
            "Bradi2g20400.1.v3.1",
            "Ciclev10013963m.v1.0",
            "DCAR_032223.v1.0.388",
            "UnknownProtein.v1.1",
        ]
        test_orthology_loader.store_feature_groups(group=members1,
                                                   soterm=soterm,
                                                   term=term,
                                                   value=group1_name)
        group2_name = "machado0002"
        members2 = [
            "Eucgr.L02820.1.v2.0",
            "mrna13067.1-v1.0-hybrid.v1.1",
            "Ciclev10013970m.v1.0",
            "DCAR_031986.v1.0.388",
        ]
        test_orthology_loader.store_feature_groups(group=members2,
                                                   soterm=soterm,
                                                   term=term,
                                                   value=group2_name)
        group3_name = "machado0003"
        members3 = [
            "Glyma.10G030500.1.Wm82.a2.v1",
            "Glyma.10G053100.1.Wm82.a2.v1",
            "DCAR_032182.v1.0.388",
        ]
        test_orthology_loader.store_feature_groups(group=members3,
                                                   soterm=soterm,
                                                   term=term,
                                                   value=group3_name)
        group4_name = "machado0004"
        members4 = ["Glyma.10G008400.1.Wm82.a2.v1", "", "UnknownProtein.v1.2"]
        test_orthology_loader.store_feature_groups(group=members4,
                                                   soterm=soterm,
                                                   term=term,
                                                   value=group4_name)
        group5_name = "machado0005"
        members5 = ["DCAR_000323.v1.0.388", "Kaladp0598s0002.1.v1.1"]
        test_orthology_loader.store_feature_groups(group=members5,
                                                   soterm=soterm,
                                                   term=term,
                                                   value=group5_name)
        group6_name = "machado0006"
        members6 = ["Kaladp0598s0001.1.v1.1", "UnknownProtein.v1.3"]
        test_orthology_loader.store_feature_groups(group=members6,
                                                   soterm=soterm,
                                                   term=term,
                                                   value=group6_name)
        group7_name = "machado0007"
        members7 = ["UnknownProtein.v1.4"]
        test_orthology_loader.store_feature_groups(group=members7,
                                                   soterm=soterm,
                                                   term=term,
                                                   value=group7_name)

        # ###check if relationships exist###
        # in a group (machado0001 and machado0005)
        self.assertTrue(
            Featureprop.objects.filter(feature_id=feature1.feature_id,
                                       type_id=term,
                                       value=group1_name).exists())
        self.assertTrue(
            Featureprop.objects.filter(feature_id=feature9.feature_id,
                                       type_id=term,
                                       value=group1_name).exists())
        self.assertTrue(
            Featureprop.objects.filter(feature_id=feature4.feature_id,
                                       type_id=term,
                                       value=group1_name).exists())
        # another example group5
        self.assertTrue(
            Featureprop.objects.filter(feature_id=feature10.feature_id,
                                       type_id=term,
                                       value=group5_name).exists())
        self.assertTrue(
            Featureprop.objects.filter(feature_id=feature17.feature_id,
                                       type_id=term,
                                       value=group5_name).exists())
        # another example:
        # ###check if a relationship does not exist###
        # between features from different groups (machado0004 and machado0003)
        self.assertFalse(
            Featureprop.objects.filter(feature_id=feature16.feature_id,
                                       type_id=term,
                                       value=group6_name).exists())
        self.assertFalse(
            Featureprop.objects.filter(feature_id=feature4.feature_id,
                                       type_id=term,
                                       value=group2_name).exists())
        self.assertFalse(
            Featureprop.objects.filter(feature_id=feature14.feature_id,
                                       type_id=term,
                                       value=group1_name).exists())
示例#16
0
    def test_store_feature_publication(self):
        """Verify store_feature_publication links a feature to a publication.

        A "gene" feature named ``feat_gene`` and one BibTeX article are
        stored first. The feature is then attached to the article through its
        DOI, and the resulting FeaturePub row must expose the article title.
        """
        # flags shared by every Cvterm created in this test
        term_flags = {"is_obsolete": 0, "is_relationshiptype": 0}

        # relationship term; not referenced again below - presumably looked
        # up by the loaders (TODO confirm)
        ro_db = Db.objects.create(name="RO")
        ro_dbxref = Dbxref.objects.create(accession="00002", db=ro_db)
        relationship_cv = Cv.objects.create(name="relationship")
        Cvterm.objects.create(name="contained in",
                              cv=relationship_cv,
                              dbxref=ro_dbxref,
                              **term_flags)

        # sequence ontology terms; only "gene" is used directly by the test
        so_db = Db.objects.create(name="SO")
        gene_dbxref = Dbxref.objects.create(accession="12345", db=so_db)
        sequence_cv = Cv.objects.create(name="sequence")
        gene_term = Cvterm.objects.create(name="gene",
                                          cv=sequence_cv,
                                          dbxref=gene_dbxref,
                                          **term_flags)
        extra_so_terms = (
            ("123456", "polypeptide"),
            ("123455", "protein_match"),
        )
        for accession, term_name in extra_so_terms:
            Cvterm.objects.create(
                name=term_name,
                cv=sequence_cv,
                dbxref=Dbxref.objects.create(accession=accession, db=so_db),
                **term_flags)

        mouse = Organism.objects.create(genus="Mus", species="musculus")

        # the gene feature that will receive the publication
        gff_db = Db.objects.create(name="GFF_SOURCE")
        gene_feature_dbxref = Dbxref.objects.create(accession="feat_gene",
                                                    db=gff_db)
        gene_feature = Feature.objects.create(
            organism=mouse,
            uniquename="feat_gene",
            dbxref=gene_feature_dbxref,
            is_analysis=False,
            type_id=gene_term.cvterm_id,
            is_obsolete=False,
            timeaccessioned=datetime.now(timezone.utc),
            timelastmodified=datetime.now(timezone.utc),
        )

        # the publication, stored from a single BibTeX entry
        article = {
            "journal": "Nice Journal",
            "comments": "A comment",
            "pages": "12--23",
            "month": "jan",
            "abstract": "This is an abstract. This line should be "
            "long enough to test multilines...",
            "title": "An amazing title",
            "year": "2013",
            "doi": "10.1186/s12864-016-2535-300002",
            "volume": "12",
            "ID": "Teste2018",
            "author": "Foo, b. and Foo1, b. and Foo b.",
            "keyword": "keyword1, keyword2",
            "ENTRYTYPE": "article",
        }
        bib_db = BibDatabase()
        bib_db.entries = [article]
        for bib_entry in bib_db.entries:
            PublicationLoader().store_bibtex_entry(bib_entry)

        # link the feature to the publication through the DOI
        loader = FeatureLoader(filename="file.name", source="GFF_loader")
        loader.store_feature_publication(
            feature="feat_gene",
            soterm="gene",
            doi="10.1186/s12864-016-2535-300002")

        feature_pub = FeaturePub.objects.get(feature=gene_feature)
        self.assertEqual("An amazing title", feature_pub.pub.title)
    def test_process_attributes(self):
        """Verify process_attributes stores the parsed attributes of a feature.

        An attribute string is parsed with get_attributes and applied to a
        polypeptide feature. The test then checks the stored note property,
        ontology term, GI dbxref and alias synonym, and that the GO
        accession with no matching term was recorded as ignored.
        """
        # flags shared by every Cvterm created in this test
        term_flags = {"is_obsolete": 0, "is_relationshiptype": 0}

        mouse = Organism.objects.create(genus="Mus", species="musculus")

        # GO term that matches "GO:12345" in the attribute string
        go_db = Db.objects.create(name="GO")
        go_dbxref = Dbxref.objects.create(accession="12345", db=go_db)
        go_cv = Cv.objects.create(name="biological_process")
        Cvterm.objects.create(name="go test term",
                              cv=go_cv,
                              dbxref=go_dbxref,
                              **term_flags)

        # SO terms; the feature below is typed as "polypeptide"
        so_db = Db.objects.create(name="SO")
        gene_dbxref = Dbxref.objects.create(accession="12345", db=so_db)
        sequence_cv = Cv.objects.create(name="sequence")
        Cvterm.objects.create(name="gene",
                              cv=sequence_cv,
                              dbxref=gene_dbxref,
                              **term_flags)
        polypeptide_dbxref = Dbxref.objects.create(accession="123455",
                                                   db=so_db)
        polypeptide_term = Cvterm.objects.create(name="polypeptide",
                                                 cv=sequence_cv,
                                                 dbxref=polypeptide_dbxref,
                                                 **term_flags)
        match_dbxref = Dbxref.objects.create(accession="1234555", db=so_db)
        Cvterm.objects.create(name="protein_match",
                              cv=sequence_cv,
                              dbxref=match_dbxref,
                              **term_flags)

        # feature that receives the attributes
        feature = Feature.objects.create(
            organism=mouse,
            uniquename="feat1",
            is_analysis=False,
            type_id=polypeptide_term.cvterm_id,
            is_obsolete=False,
            timeaccessioned=datetime.now(timezone.utc),
            timelastmodified=datetime.now(timezone.utc),
        )

        # "exact" synonym type
        global_db = Db.objects.create(name="_global")
        exact_dbxref = Dbxref.objects.create(accession="exact", db=global_db)
        synonym_cv = Cv.objects.create(name="synonym_type")
        Cvterm.objects.create(name="exact",
                              cv=synonym_cv,
                              dbxref=exact_dbxref,
                              **term_flags)

        # "contained in" relationship term
        ro_db = Db.objects.create(name="RO")
        ro_dbxref = Dbxref.objects.create(accession="00002", db=ro_db)
        relationship_cv = Cv.objects.create(name="relationship")
        Cvterm.objects.create(name="contained in",
                              cv=relationship_cv,
                              dbxref=ro_dbxref,
                              **term_flags)

        # instance is discarded; presumably built only for the records its
        # constructor creates (TODO confirm)
        FeatureLoader(filename="file.name", source="GFF_source")

        # parse the raw attribute string, then store it for the feature
        attrs_loader = FeatureAttributesLoader(filecontent="genome")
        parsed_attrs = attrs_loader.get_attributes(
            "ID=1;name=feat1;note=Test feature;display=feat1;gene=gene1;"
            "orf_classification=1;ontology_term=GO:12345,GO:54321;parent=2;"
            "alias=Feature1;dbxref=GI:12345,NC:12345;noecziste=True")
        attrs_loader.process_attributes(feature_id=feature.feature_id,
                                        attrs=parsed_attrs)

        # note -> Featureprop typed with the feature_property "note" term
        note_term = Cvterm.objects.get(
            name="note", cv=Cv.objects.get(name="feature_property"))
        note_prop = Featureprop.objects.get(feature=feature,
                                            type_id=note_term.cvterm_id,
                                            rank=0)
        self.assertEqual("Test feature", note_prop.value)

        # ontology_term -> a single FeatureCvterm pointing to the GO term
        feature_cvterm = FeatureCvterm.objects.get(feature=feature)
        linked_term = Cvterm.objects.get(cvterm_id=feature_cvterm.cvterm_id)
        self.assertEqual("go test term", linked_term.name)

        # dbxref -> among the feature's dbxrefs, exactly one belongs to GI
        linked_dbxref_ids = FeatureDbxref.objects.filter(
            feature=feature).values_list("dbxref_id", flat=True)
        gi_db = Db.objects.get(name="GI")
        gi_dbxref = Dbxref.objects.get(dbxref_id__in=linked_dbxref_ids,
                                       db=gi_db)
        self.assertEqual("12345", gi_dbxref.accession)

        # alias -> FeatureSynonym
        feature_synonym = FeatureSynonym.objects.select_related(
            "synonym").get(feature=feature)
        self.assertEqual("Feature1", feature_synonym.synonym.name)

        # GO:54321 has no Cvterm above, so it must be reported as ignored
        self.assertEqual("GO:54321", attrs_loader.ignored_goterms.pop())
    def handle(self,
               file: str,
               organism: str,
               soterm: str = "mRNA",
               cpu: int = 1,
               verbosity: int = 0,
               **options):
        """Load coexpression cluster groups from a file, one cluster per line.

        Each line is split on whitespace. The first field must start with
        the cluster name followed by a colon (``name:``); the remaining
        fields are handed to ``FeatureLoader.store_feature_groups`` as the
        members of the group, tagged with the "coexpression group" cvterm
        and the cluster name as value.

        Args:
            file: path of the cluster file.
            organism: organism identifier passed to ``retrieve_organism``.
            soterm: sequence ontology term of the member features.
            cpu: number of worker threads used to store the groups.
            verbosity: progress messages are written when greater than 0.
            **options: remaining command options (unused).

        Raises:
            ImportingError: if ``retrieve_organism`` raises IntegrityError.
            CommandError: if the file or a line fails validation, or if a
                line does not start with a cluster name.
        """
        filename = os.path.basename(file)
        if verbosity > 0:
            self.stdout.write("Processing file: {}".format(filename))

        try:
            # only the lookup matters here; the returned object is not used
            retrieve_organism(organism)
        except IntegrityError as e:
            # NOTE(review): the other failures below surface as CommandError;
            # this one is kept as ImportingError so callers see no change.
            raise ImportingError(e)
        try:
            FileValidator().validate(file)
        except ImportingError as e:
            raise CommandError(e)

        # cvterm used to tag every member of a cluster
        cv, _ = Cv.objects.get_or_create(name="feature_property")
        coexp_db, _ = Db.objects.get_or_create(name="LSTRAP_SOURCE")
        coexp_dbxref, _ = Dbxref.objects.get_or_create(
            accession="LSTRAP_SOURCE", db=coexp_db)
        cvterm_cluster, _ = Cvterm.objects.get_or_create(
            name="coexpression group",
            cv=cv,
            dbxref=coexp_dbxref,
            is_obsolete=0,
            is_relationshiptype=0,
        )
        # feature source is not needed here
        featureloader = FeatureLoader(source="null", filename=filename)

        # compiled once: the cluster name is the word before the first colon
        cluster_name_re = re.compile(r"^(\w+)\:")
        tasks = list()
        # Both context managers guarantee cleanup (file closed, pool shut
        # down) even when a CommandError or a task exception propagates.
        with ThreadPoolExecutor(max_workers=cpu) as pool:
            with open(file, "r") as clusters:
                # each line is a coexpression cluster group
                for line in tqdm(clusters, total=get_num_lines(file)):
                    fields = re.split(r"\s+", line.strip())
                    try:
                        FieldsValidator().validate(len(fields), fields)
                    except ImportingError as e:
                        raise CommandError(e)

                    group_field = cluster_name_re.match(fields[0])
                    if group_field is None:
                        raise CommandError(
                            "Cluster identification has problems.")
                    # the cluster name itself is not a member of the group
                    tasks.append(
                        pool.submit(
                            featureloader.store_feature_groups,
                            group=fields[1:],
                            soterm=soterm,
                            term=cvterm_cluster.cvterm_id,
                            value=group_field.group(1),
                        ))
            if verbosity > 0:
                self.stdout.write("Loading")
            for task in tqdm(as_completed(tasks), total=len(tasks)):
                # result() re-raises any exception raised inside the worker;
                # a truthy return value is raised as well - presumably
                # store_feature_groups returns None on success (TODO confirm)
                result = task.result()
                if result:
                    raise result
        if verbosity > 0:
            self.stdout.write(
                self.style.SUCCESS("Done with {}".format(filename)))