示例#1
0
    def build(
        self,
        bloomfilters: hug.types.multiple,
        samples: hug.types.multiple = [],
        config: hug.types.text = None,
    ):
        """Build an index from a set of bloom filter files.

        Loads the configuration with ``get_config_from_file`` and delegates
        the work to the module-level ``build`` function.

        Args:
            bloomfilters: Bloom filter paths, forwarded as
                ``bloomfilter_filepaths``.
            samples: Sample identifiers, one per bloom filter. When empty,
                the bloom filter paths themselves are used as identifiers.
            config: Value handed to ``get_config_from_file``.

        Returns:
            Whatever the module-level ``build`` returns.

        Raises:
            ValueError: If ``samples`` is non-empty and its length differs
                from that of ``bloomfilters``.
        """
        config = get_config_from_file(config)

        if not samples:
            samples = bloomfilters
        elif len(samples) != len(bloomfilters):
            # Raised explicitly instead of asserted: an ``assert`` is removed
            # under ``python -O`` and the mismatch would go unnoticed.
            raise ValueError(
                "Got {} samples for {} bloomfilters; the counts must match".format(
                    len(samples), len(bloomfilters)
                )
            )

        # The limit is optional; a missing or empty setting means "no limit".
        max_build_mem = config.get("max_build_mem_bytes")
        max_memory_bytes = (
            humanfriendly.parse_size(max_build_mem) if max_build_mem else None
        )

        return build(
            config=config,
            bloomfilter_filepaths=bloomfilters,
            samples=samples,
            max_memory=max_memory_bytes,
        )
示例#2
0
    def bulk_search(
        self,
        fasta: hug.types.text,
        threshold: hug.types.float_number = 1.0,
        config: hug.types.text = None,
        score: hug.types.smart_boolean = False,
        format: hug.types.one_of(["json", "csv"]) = "json",
        stream: hug.types.smart_boolean = False,
    ):
        """Search every sequence of a FASTA file against the index.

        Args:
            fasta: Value handed to ``Fasta`` to open the query sequences.
            threshold: Forwarded unchanged to the search call.
            config: Value handed to ``get_config_from_file``.
            score: Forwarded unchanged to the search call.
            format: Output format, ``"json"`` or ``"csv"``.
            stream: When true, each result is printed as soon as it is
                computed; otherwise all results are collected and returned.

        Returns:
            In non-stream mode, the combined results as a CSV or JSON string.
            In stream mode nothing is returned (results are printed).
        """
        config = get_config_from_file(config)

        fasta = Fasta(fasta)
        if not stream:
            # Workers receive a copy of the config with ``nproc`` forced to 1;
            # the pool itself is sized from the original setting.
            worker_config = copy.copy(config)
            worker_config["nproc"] = 1
            nproc = config.get("nproc", 1)
            with multiprocessing.Pool(processes=nproc) as pool:
                args = [
                    (worker_config, str(seq), threshold, score)
                    for seq in fasta.values()
                ]
                # max(1, ...) keeps the chunk size positive when the FASTA
                # holds no records (ceil(0 / nproc) would be 0).
                chunk_size = max(1, math.ceil(len(args) / nproc))
                per_chunk = pool.map_async(
                    search_bigsi_parallel, chunks(args, chunk_size)
                ).get()
                dd = [item for sublist in per_chunk for item in sublist]
            if format == "csv":
                return "\n".join([d_to_csv(d, False, False) for d in dd])
            return json.dumps(dd, indent=4)

        bigsi = BIGSI(config)
        for i, seq in enumerate(fasta.values()):
            seq = str(seq)
            d = {
                "query": seq,
                "threshold": threshold,
                "results": bigsi.search(seq, threshold, score),
                "citation": "http://dx.doi.org/10.1038/s41587-018-0010-1",
            }
            if format == "csv":
                # Only the first row carries the header. Deriving the flag on
                # every iteration avoids reusing a stale value on the last
                # row, which repeated the header for two-sequence inputs.
                with_header = i == 0
                # NOTE(review): assumes len(fasta) is the number of records
                # yielded by fasta.values() -- confirm against Fasta.
                carriage_return = i != 0 and i == len(fasta) - 1
                print(d_to_csv(d, with_header, carriage_return))
            else:
                print(json.dumps(d))
示例#3
0
文件: __main__.py 项目: leoisl/BIGSI
 def bulk_search(
     self,
     fasta: hug.types.text,
     threshold: hug.types.float_number = 1.0,
     config: hug.types.text = None,
     score: hug.types.smart_boolean = False,
     format: hug.types.one_of(["json", "csv"]) = "json",
     stream: hug.types.smart_boolean = False,
 ):
     """Search every sequence of a FASTA file against the index.

     Args:
         fasta: Value handed to ``Fasta`` to open the query sequences.
         threshold: Forwarded unchanged to the search call.
         config: Value handed to ``get_config_from_file``.
         score: Forwarded unchanged to the search call.
         format: Output format, ``"json"`` or ``"csv"``.
         stream: When true, each result is printed as soon as it is
             computed; otherwise all results are collected and returned.

     Returns:
         In non-stream mode, the combined results as a CSV or JSON string.
         In stream mode nothing is returned (results are printed).
     """
     config = get_config_from_file(config)
     bigsi = BIGSI(config)
     fasta = Fasta(fasta)
     if not stream:
         nproc = config.get("nproc", 1)
         # A single BIGSI instance is shared by all pool threads.
         with ThreadPool(processes=nproc) as pool:
             args = [
                 (bigsi, str(seq), threshold, score)
                 for seq in fasta.values()
             ]
             dd = pool.starmap(search_bigsi, args)
         if format == "csv":
             return "\n".join([d_to_csv(d, False, False) for d in dd])
         return json.dumps(dd, indent=4)

     # Stream mode: results are printed one by one and deliberately not
     # accumulated, so memory use does not grow with the number of queries.
     for i, seq in enumerate(fasta.values()):
         seq = str(seq)
         d = {
             "query": seq,
             "threshold": threshold,
             "results": bigsi.search(seq, threshold, score),
             "citation": "http://dx.doi.org/10.1038/s41587-018-0010-1",
         }
         if format == "csv":
             # Only the first row carries the header. Deriving the flag on
             # every iteration avoids reusing a stale value on the last
             # row, which repeated the header for two-sequence inputs.
             with_header = i == 0
             # NOTE(review): assumes len(fasta) is the number of records
             # yielded by fasta.values() -- confirm against Fasta.
             carriage_return = i != 0 and i == len(fasta) - 1
             print(d_to_csv(d, with_header, carriage_return))
         else:
             print(json.dumps(d))
示例#4
0
    def build(
        self,
        bloomfilters: hug.types.multiple = [],
        samples: hug.types.multiple = [],
        from_file: hug.types.text = None,
        config: hug.types.text = None,
    ):
        """Build an index from a set of bloom filter files.

        The bloom filters are given either directly via ``bloomfilters`` (and
        optionally ``samples``) or through ``from_file``, a tab-separated file
        whose first column is the bloom filter path and whose second column
        is the sample identifier.

        Args:
            bloomfilters: Bloom filter paths, forwarded as
                ``bloomfilter_filepaths``.
            samples: Sample identifiers, one per bloom filter. When empty,
                the bloom filter paths themselves are used as identifiers.
            from_file: Path of the tab-separated listing described above.
                Mutually exclusive with ``bloomfilters``.
            config: Value handed to ``get_config_from_file``.

        Returns:
            Whatever the module-level ``build`` returns.

        Raises:
            ValueError: If both ``from_file`` and ``bloomfilters`` are given,
                or if ``samples`` and ``bloomfilters`` differ in length.
        """
        config = get_config_from_file(config)

        if from_file and bloomfilters:
            raise ValueError(
                "You can only specify blooms via from_file or bloomfilters, but not both"
            )
        if from_file:
            # Fresh lists: never append to the shared default arguments.
            samples = []
            bloomfilters = []
            # newline="" lets the csv module do its own line-ending handling.
            with open(from_file, "r", newline="") as tsvfile:
                for row in csv.reader(tsvfile, delimiter="\t"):
                    if not row:
                        # csv.reader yields [] for blank lines (for example a
                        # trailing newline); skip them instead of failing on
                        # row[0].
                        continue
                    bloomfilters.append(row[0])
                    samples.append(row[1])

        if not samples:
            samples = bloomfilters
        elif len(samples) != len(bloomfilters):
            # Raised explicitly instead of asserted: an ``assert`` is removed
            # under ``python -O`` and the mismatch would go unnoticed.
            raise ValueError(
                "Got {} samples for {} bloomfilters; the counts must match".format(
                    len(samples), len(bloomfilters)
                )
            )

        # The limit is optional; a missing or empty setting means "no limit".
        max_build_mem = config.get("max_build_mem_bytes")
        max_memory_bytes = (
            humanfriendly.parse_size(max_build_mem) if max_build_mem else None
        )

        return build(
            config=config,
            bloomfilter_filepaths=bloomfilters,
            samples=samples,
            max_memory=max_memory_bytes,
        )