Example #1
    def handle(
        self,
        file: str,
        organism: str,
        doi: str = None,
        cpu: int = 1,
        verbosity: int = 1,
        **options
    ):
        """Execute the main function."""
        # retrieve only the file name
        filename = os.path.basename(file)
        if verbosity > 0:
            self.stdout.write("Processing file: {}".format(filename))

        try:
            FileValidator().validate(file)
        except ImportingError as e:
            raise CommandError(e)

        try:
            index_file = "{}.tbi".format(file)
            FileValidator().validate(index_file)
        except ImportingError:
            try:
                index_file = "{}.csi".format(file)
                FileValidator().validate(index_file)
            except ImportingError:
                raise CommandError("No index found (.tbi/.csi)")

        try:
            feature_file = FeatureLoader(
                filename=filename, source="VCF_SOURCE", doi=doi
            )
        except ImportingError as e:
            raise CommandError(e)

        pool = ThreadPoolExecutor(max_workers=cpu)
        tasks = list()

        # cap the number of queued futures at two per worker
        chunk_size = cpu * 2

        # Load the VCF file
        with open(file) as tbx_file:
            tbx = pysam.TabixFile(filename=tbx_file.name, index=index_file)
            for row in tqdm(tbx.fetch(parser=pysam.asVCF()), total=get_num_lines(file)):
                tasks.append(
                    pool.submit(feature_file.store_tabix_VCF_feature, row, organism)
                )

                if len(tasks) >= chunk_size:
                    for task in as_completed(tasks):
                        try:
                            task.result()
                        except ImportingError as e:
                            raise CommandError(e)
                    tasks.clear()
            else:
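                # for/else: the else clause runs once the loop completes,
                # draining tasks left over from the last partial chunk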
                for task in as_completed(tasks):
                    try:
                        task.result()
                    except ImportingError as e:
                        raise CommandError(e)
                tasks.clear()

        pool.shutdown()

        if verbosity > 0:
            self.stdout.write(self.style.SUCCESS("Done with {}".format(filename)))
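This command refuses to run unless a tabix index (.tbi or .csi) sits next to the VCF. Below is a minimal sketch of producing one with pysam, assuming a coordinate-sorted input; the file name variants.vcf is hypothetical.

import pysam

# Bgzip-compresses the sorted file and writes variants.vcf.gz plus
# variants.vcf.gz.tbi alongside it (pass csi=True for a .csi index).
compressed = pysam.tabix_index("variants.vcf", preset="vcf", force=True)
print(compressed)  # variants.vcf.gz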
Example #2
    def handle(
        self,
        file: str,
        cpu: int = 1,
        soterm: str = "mRNA",
        verbosity: int = 0,
        **options
    ):
        """Execute the main function."""
        filename = os.path.basename(file)
        if verbosity > 0:
            self.stdout.write("Processing file: {}".format(filename))

        try:
            FileValidator().validate(file)
        except ImportingError as e:
            raise CommandError(e)
        try:
            pairs = open(file, "r")
        except OSError as e:
            # open() raises OSError on failure, not ImportingError
            raise CommandError(e)

        cvterm_corel = Cvterm.objects.get(
            name="correlated with", cv__name="relationship"
        ).cvterm_id
        # feature source is not needed here
        source = "null"
        featureloader = FeatureLoader(source=source, filename=filename)
        size = get_num_lines(file)
        # let each worker queue up to five tasks per chunk
        chunk = cpu * 5
        with ThreadPoolExecutor(max_workers=cpu) as pool:
            tasks = list()
            for line in tqdm(pairs, total=size):
                nfields = 3
                fields = re.split(r"\s+", line.rstrip())
                try:
                    FieldsValidator().validate(nfields, fields)
                except ImportingError as e:
                    raise CommandError(e)
                # apply the +0.7 correction to the PCC value (last field)
                value = float(fields.pop()) + 0.7
                tasks.append(
                    pool.submit(
                        featureloader.store_feature_pairs,
                        pair=fields,
                        soterm=soterm,
                        term=cvterm_corel,
                        value=value,
                    )
                )
                if len(tasks) >= chunk:
                    for task in as_completed(tasks):
                        error = task.result()
                        if error:
                            raise error
                    tasks.clear()
            else:
                # for/else: drain the tasks left over from the last chunk
                for task in as_completed(tasks):
                    error = task.result()
                    if error:
                        raise error
                tasks.clear()
            # no explicit shutdown needed: the with-statement closes the pool
        if verbosity > 0:
            self.stdout.write(self.style.SUCCESS("Done with {}".format(filename)))
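The pairs file is read as whitespace-separated triples (two feature names plus a PCC score), which is what the FieldsValidator call with nfields=3 enforces. A small sketch of how one line is parsed; the sample line is made up.

import re

line = "AT1G01010.1  AT1G01030.1  0.85\n"
fields = re.split(r"\s+", line.rstrip())
assert len(fields) == 3
value = float(fields.pop()) + 0.7  # same +0.7 correction as above
print(fields, value)               # ['AT1G01010.1', 'AT1G01030.1'] 1.55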
Example #3
    def handle(self,
               file: str,
               organism: str,
               doi: str = None,
               ignore: str = None,
               qtl: bool = False,
               cpu: int = 1,
               verbosity: int = 1,
               **options):
        """Execute the main function."""
        # retrieve only the file name
        filename = os.path.basename(file)
        if verbosity > 0:
            self.stdout.write("Processing file: {}".format(filename))

        try:
            FileValidator().validate(file)
        except ImportingError as e:
            raise CommandError(e)

        try:
            index_file = "{}.tbi".format(file)
            FileValidator().validate(index_file)
        except ImportingError:
            try:
                index_file = "{}.csi".format(file)
                FileValidator().validate(index_file)
            except ImportingError:
                raise CommandError("No index found (.tbi/.csi)")

        try:
            feature_file = FeatureLoader(filename=filename,
                                         source="GFF_SOURCE",
                                         doi=doi)
        except ImportingError as e:
            raise CommandError(e)

        pool = ThreadPoolExecutor(max_workers=cpu)
        tasks = list()

        # cap the number of queued futures at two per worker
        chunk_size = cpu * 2

        # Load the GFF3 file
        with open(file) as tbx_file:
            tbx = pysam.TabixFile(filename=tbx_file.name, index=index_file)
            for row in tqdm(tbx.fetch(parser=pysam.asGTF()),
                            total=get_num_lines(file)):
                # skip rows whose feature type appears in the ignore string
                if ignore is not None and row.feature in ignore:
                    continue
                tasks.append(
                    pool.submit(feature_file.store_tabix_GFF_feature, row,
                                organism, qtl))

                if len(tasks) >= chunk_size:
                    for task in as_completed(tasks):
                        try:
                            task.result()
                        except ImportingError as e:
                            raise CommandError(e)
                    tasks.clear()
            else:
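                # for/else: the else clause runs once the loop completes,
                # draining tasks left over from the last partial chunk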
                for task in as_completed(tasks):
                    try:
                        task.result()
                    except ImportingError as e:
                        raise CommandError(e)
                tasks.clear()

        pool.shutdown()

        if verbosity > 0:
            self.stdout.write("Loading relationships")

        pool = ThreadPoolExecutor(max_workers=cpu)
        tasks = list()

        for item in feature_file.relationships:
            tasks.append(
                pool.submit(
                    feature_file.store_relationship,
                    organism,
                    item["subject_id"],
                    item["object_id"],
                ))

        for task in tqdm(as_completed(tasks), total=len(tasks)):
            try:
                task.result()
            except ImportingError as e:
                raise CommandError(e)
        pool.shutdown()

        if feature_file.ignored_attrs is not None:
            self.stdout.write(
                self.style.WARNING("Ignored attrs: {}".format(
                    feature_file.ignored_attrs)))

        if verbosity > 0:
            self.stdout.write(
                self.style.SUCCESS("Done with {}".format(filename)))
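Both tabix loaders share the same chunked submit-and-drain pattern, which keeps at most roughly 2 * cpu futures queued at a time. A self-contained sketch of that pattern; process_chunked and worker are illustrative names, not part of the loaders.

from concurrent.futures import ThreadPoolExecutor, as_completed

def process_chunked(items, worker, cpu=1):
    chunk_size = cpu * 2
    tasks = []
    with ThreadPoolExecutor(max_workers=cpu) as pool:
        for item in items:
            tasks.append(pool.submit(worker, item))
            if len(tasks) >= chunk_size:
                for task in as_completed(tasks):
                    task.result()  # re-raises any worker exception
                tasks.clear()
        # drain whatever is left after the loop ends
        for task in as_completed(tasks):
            task.result()

process_chunked(range(10), print, cpu=2)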
Example #4
    def handle(self,
               file: str,
               organism: str,
               soterm: str = "mRNA",
               cpu: int = 1,
               verbosity: int = 0,
               **options):
        """Execute the main function."""
        filename = os.path.basename(file)
        if verbosity > 0:
            self.stdout.write("Processing file: {}".format(filename))

        try:
            organism = retrieve_organism(organism)
        except IntegrityError as e:
            # surface the failure as a CommandError, like the other checks
            raise CommandError(e)
        try:
            FileValidator().validate(file)
        except ImportingError as e:
            raise CommandError(e)
        try:
            clusters = open(file, "r")
        except OSError as e:
            # open() raises OSError on failure, not ImportingError
            raise CommandError(e)

        tasks = list()
        # make sure the "coexpression group" cvterm exists under feature_property
        cv, created = Cv.objects.get_or_create(name="feature_property")
        coexp_db, created = Db.objects.get_or_create(name="LSTRAP_SOURCE")
        coexp_dbxref, created = Dbxref.objects.get_or_create(
            accession="LSTRAP_SOURCE", db=coexp_db)
        cvterm_cluster, created = Cvterm.objects.get_or_create(
            name="coexpression group",
            cv=cv,
            dbxref=coexp_dbxref,
            is_obsolete=0,
            is_relationshiptype=0,
        )
        # feature source is not needed here
        source = "null"
        featureloader = FeatureLoader(source=source, filename=filename)

        pool = ThreadPoolExecutor(max_workers=cpu)
        # each line is a coexpression cluster group
        for line in tqdm(clusters, total=get_num_lines(file)):
            name = ""
            fields = re.split(r"\s+", line.strip())
            nfields = len(fields)
            try:
                FieldsValidator().validate(nfields, fields)
            except ImportingError as e:
                raise CommandError(e)

            if re.search(r"^(\w+)\:", fields[0]):
                group_field = re.match(r"^(\w+)\:", fields[0])
                name = group_field.group(1)
            else:
                raise CommandError("Malformed cluster identifier "
                                   "(expected a 'name:' prefix).")
            # remove cluster name before loading
            fields.pop(0)
            # store the group members under the coexpression cvterm
            tasks.append(
                pool.submit(
                    featureloader.store_feature_groups,
                    group=fields,
                    soterm=soterm,
                    term=cvterm_cluster.cvterm_id,
                    value=name,
                ))
        if verbosity > 0:
            self.stdout.write("Loading")
        for task in tqdm(as_completed(tasks), total=len(tasks)):
            error = task.result()
            if error:
                raise error
        pool.shutdown()
        if verbosity > 0:
            self.stdout.write(
                self.style.SUCCESS("Done with {}".format(filename)))
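Each cluster line is expected to start with a "name:" label followed by the member features, which is what the ^(\w+): match enforces. A quick check against a made-up MCL-style line:

import re

line = "cluster_1: AT1G01010.1 AT1G01030.1 AT1G01040.2\n"
fields = re.split(r"\s+", line.strip())
match = re.match(r"^(\w+):", fields[0])
assert match is not None
name = match.group(1)  # "cluster_1"
members = fields[1:]   # features that will be stored in the group
print(name, members)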