예제 #1
0
def kyotocabinet_fetch(limit):
    """Benchmark random reads against the Kyoto Cabinet file for ``limit``.

    Opens ``/tmp/test_py_benchmark_<limit>.kch`` and performs ``limit``
    lookups of randomly chosen keys drawn from ``"0"`` .. ``str(limit - 1)``.

    Args:
        limit: number of lookups to perform; also the exclusive upper bound
            of the integer keys that are sampled.

    Returns:
        True if the database could be opened and every lookup produced a
        non-empty value, False otherwise.
    """
    path = "/tmp/test_py_benchmark_%s.kch" % limit

    db = kyotocabinet.DB()
    if not db.open(path, kyotocabinet.DB.OWRITER):
        return False

    try:
        for _ in range(limit):
            # randrange(limit) covers every key 0 .. limit-1 and, unlike
            # randrange(0, limit - 1), is also valid when limit == 1.
            key = str(random.randrange(limit))

            value = db.get(key)
            # DB.get() yields None for a missing key, so test truthiness
            # instead of calling len() on a possible None.
            if not value:
                return False
    finally:
        # Release the handle on every exit path, including early failure.
        db.close()
    return True
예제 #2
0
def kyotocabinet_store(limit):
    """Benchmark sequential writes into a fresh Kyoto Cabinet file.

    (Re)creates ``/tmp/test_py_benchmark_<limit>.kch`` and stores ``limit``
    records whose keys are ``"0"`` .. ``str(limit - 1)`` and whose values are
    random integers below 65535 rendered as strings.

    Args:
        limit: number of records to write.

    Returns:
        True if the database could be opened and every record was stored,
        False otherwise.
    """
    path = "/tmp/test_py_benchmark_%s.kch" % limit

    db = kyotocabinet.DB()
    flags = (kyotocabinet.DB.OWRITER
             | kyotocabinet.DB.OCREATE
             | kyotocabinet.DB.OTRUNCATE
             | kyotocabinet.DB.OTRYLOCK)
    if not db.open(path, flags):
        return False

    try:
        for i in range(limit):
            value = str(random.randrange(0, 65535))
            if not db.set(str(i), value):
                return False
    finally:
        # Release the handle on every exit path, including a failed set().
        db.close()
    return True
예제 #3
0
 def _create_index(self, surface_map_file, entity_list_file,
                   surface_index_name, mid_offset_index_name):
     """Build the surface-form index and the mid -> offset index on disk.

     The entity list is scanned first: for every line, the first
     tab-separated column (the mid) is mapped to the byte offset at which
     that line starts.  The surface map (tab-separated surface form, score,
     mid) is then read and, for each run of consecutive lines sharing a
     surface form, an array of doubles laid out as
     ``offset, score, offset, score, ...`` is stored under that surface form
     in the Kyoto Cabinet database ``surface_index_name``.  Finally every
     mid -> offset pair is written to ``mid_offset_index_name``.

     NOTE(review): entries are grouped only across consecutive lines, so the
     surface map is presumably sorted by surface form -- confirm.
     NOTE(review): ``line.decode`` on text-mode lines and ``iteritems`` are
     Python 2 constructs; this method is kept Python 2 compatible.

     Args:
         surface_map_file: path of the surface-form/score/mid file.
         entity_list_file: path of the entity list, one entity per line.
         surface_index_name: path of the surface index database to write.
         mid_offset_index_name: path of the mid -> offset database to write.

     Raises:
         IOError: if either Kyoto Cabinet database cannot be opened.
     """
     logging.info("Generating entities and surface index.")
     num_lines = 0
     logger.info("Reading entity offsets.")
     mid_offsets = dict()
     # Remember the offset for each entity.
     with open(entity_list_file, 'r') as f:
         mm = mmap.mmap(f.fileno(), 0, prot=mmap.PROT_READ)
         try:
             offset = mm.tell()
             line = mm.readline()
             while line:
                 num_lines += 1
                 if num_lines % 1000000 == 0:
                     logger.info('Read %s lines' % num_lines)
                 line = line.decode('utf-8')
                 cols = line.split('\t')
                 mid = cols[0]
                 mid_offsets[mid] = offset
                 # Offset of the *next* line, captured before reading it.
                 offset = mm.tell()
                 line = mm.readline()
         finally:
             # The mapping is independent of the file object; release it
             # explicitly instead of leaving it to the garbage collector.
             mm.close()
     s_index_db = kyotocabinet.DB()
     if not s_index_db.open(
             surface_index_name + '#msiz=20000000000#bnum=200000000#opts=l'):
         # Without this check every later set() would fail silently and an
         # empty index would be produced.
         raise IOError("Failed to open surface index %s: %s"
                       % (surface_index_name, s_index_db.error()))
     try:
         logging.info("Creating surface map on disk.")
         num_lines = 0
         # We now write a list of (offset, score)... floats for
         # each surface form.
         num_not_found = 0
         with open(surface_map_file, 'r') as f:
             last_surface_form = None
             surface_form_entries = array.array('d')
             for line in f:
                 num_lines += 1
                 try:
                     cols = line.decode('utf-8').split('\t')
                     surface_form = cols[0]
                     score = float(cols[1])
                     mid = cols[2].strip()
                     # Raises KeyError for mids missing from the entity list.
                     offset = float(mid_offsets[mid])
                     if surface_form != last_surface_form:
                         # A new surface form starts: flush the previous one.
                         if surface_form_entries:
                             s_index_db.set(last_surface_form,
                                            surface_form_entries.tostring())
                         last_surface_form = surface_form
                         surface_form_entries = array.array('d')
                     surface_form_entries.append(offset)
                     surface_form_entries.append(score)
                 except KeyError:
                     num_not_found += 1
                     if num_not_found < 100:
                         logger.warning(
                             "Mid %s appears in surface map but not "
                             "in entity list." % mid)
                     elif num_not_found == 100:
                         logger.warning(
                             "Suppressing further warnings about unfound mids.")
                 if num_lines % 1000000 == 0:
                     logger.info(
                         'Stored %s surface-form->entity pairs.' % num_lines)
             # Flush the entries of the final surface form.
             if surface_form_entries:
                 s_index_db.set(last_surface_form,
                                surface_form_entries.tostring())
         if num_not_found > 0:
             logger.warning(
                 "%s entries of an mid in surface map but mid not "
                 "in entity list." % num_not_found)
     finally:
         s_index_db.close()
     # store an additional index from mid -> offset
     mid_offset_db = kyotocabinet.DB()
     if not mid_offset_db.open(
             mid_offset_index_name + '#msiz=20000000000#bnum=200000000#opts=l'):
         raise IOError("Failed to open mid offset index %s: %s"
                       % (mid_offset_index_name, mid_offset_db.error()))
     try:
         logging.info("Creating entity offset index on disk.")
         for mid, offset in mid_offsets.iteritems():
             mid_offset_db.set(mid, offset)
         logging.info("Done.")
     finally:
         mid_offset_db.close()
예제 #4
0
def read_db(DB):
    """Open the Kyoto Cabinet database at path ``DB`` and return its handle.

    The database is opened with the writer and create flags.  If the open
    fails, an error line is written to stderr and the handle is returned
    anyway, so callers receive a ``kyotocabinet.DB`` object in both cases.
    """
    handle = kyotocabinet.DB()
    mode = kyotocabinet.DB.OWRITER | kyotocabinet.DB.OCREATE
    opened = handle.open(DB, mode)
    if not opened:
        message = 'ERROR: failed to open: %s\n' % handle.error()
        sys.stderr.write(message)
    return handle
예제 #5
0
 def __init__(self, path, parse=lambda v: v, unparse=lambda v: v):
     """Open the Kyoto database at ``path`` and remember the value codecs.

     ``parse`` and ``unparse`` are stored as given (both default to the
     identity function).  If the database cannot be opened, the error
     object reported by the handle is raised.
     """
     store = kyoto.DB()
     self.fk = store
     opened = store.open(path, _OPEN_MODE)
     if not opened:
         raise store.error()
     self.parse = parse
     self.unparse = unparse
예제 #6
0
파일: poll.py 프로젝트: enchuu/ahopoll
 def __init__(self):
     """Create the database handle and open ``Settings.database_name``.

     Raises:
         InvalidInputException: carrying ``PollContainer.db_error`` when the
             database cannot be opened.
     """
     handle = kyotocabinet.DB()
     self.db = handle
     if handle.open(Settings.database_name):
         return
     raise InvalidInputException(PollContainer.db_error)
예제 #7
0
    def __init__(self, dbdir, mode=Mode.READONLY):
        """dbdir -> opens a descriptor storage

        Args:
            dbdir: directory holding the raw store, the ``__molindex__``
                molecule index and the optional ``inchikey.kch``,
                ``name.kch`` and ``__options__`` files.
            mode: ``Mode.READONLY`` opens the optional lookup databases as
                readers; any other mode opens them as writers.

        After construction ``self.inchikey`` and ``self.name`` are either an
        opened kyotocabinet database or None, and ``self.options`` is the
        unpickled options object or None.

        >>> store = DescriptaStore(db)
        >>> len(store)

        # access the options used to create this store
        #  (this is optional and may not exist)
        >>> store.options
        ...

        Iterate through molecule data ([moldata, <optional name>], descriptors)
        >>> for moldata, descriptors in store:
        >>>     pass

        Iterate through only the descriptors
        >>> for i,prop in enumerate(store.descriptors()):
        >>>    pass

        If name indexed:
        >>> row = store.lookupName("ZWIMER-03065")

        If inchi key index:
        Since inchi keys may collide, this can return multiple indices
        >>>  rows = store.lookupInchiKey("BCWYEXBNOWJQJV-UHFFFAOYSA-N")
        """
        # NOTE: the attribute name is misspelt, but it is kept as-is because
        # it is visible to code outside this method.
        self.desctiporDB = dbdir
        self.db = raw.RawStore(dbdir, mode=mode)
        self.index = MolFileIndex.MolFileIndex(
            os.path.join(dbdir, "__molindex__"))

        # Default both optional lookups to None so the attributes always
        # exist, even when the file is present but kyotocabinet is missing.
        self.inchikey = None
        self.name = None

        if mode == Mode.READONLY:
            open_flag_name = "OREADER"
        else:
            open_flag_name = "OWRITER"

        inchi = os.path.join(dbdir, "inchikey.kch")
        if os.path.exists(inchi):
            if not kyotocabinet:
                print(
                    "Inchi lookup exists, but kyotocabinet is not installed.",
                    file=sys.stderr)
            else:
                self.inchikey = kyotocabinet.DB()
                self.inchikey.open(
                    inchi, getattr(kyotocabinet.DB, open_flag_name))

        name = os.path.join(dbdir, "name.kch")
        if os.path.exists(name):
            if not kyotocabinet:
                logging.warning(
                    "Name lookup exists, but kyotocabinet is not installed.")
            else:
                self.name = kyotocabinet.DB()
                self.name.open(
                    name, getattr(kyotocabinet.DB, open_flag_name))
        else:
            print("Couldn't open name db", name, file=sys.stderr)

        self.options = None
        optionsfile = os.path.join(dbdir, "__options__")
        if os.path.exists(optionsfile):
            # NOTE(review): pickle.load executes arbitrary code from the
            # file; only open stores that come from a trusted source.
            with open(optionsfile, 'rb') as f:
                self.options = pickle.load(f)

        # index the calculated flags
        # (keep only the columns whose name does not contain "_calculated")
        datacols = [(i, col) for i, col in enumerate(self.db.colnames)
                    if "_calculated" not in col]
        self.datanames = [col for _, col in datacols]
        self.dataindices = [i for i, _ in datacols]
예제 #8
0
 def __init__(self):
     """Start with a fresh, unopened database handle, no file name and a
     counter of zero."""
     self.__db = kyotocabinet.DB()
     self.__fname = None
     self.__counter = 0
예제 #9
0
 def __init__(self, kyoto_db):
     """Open the Kyoto Cabinet database at ``kyoto_db``.

     The database is opened with the writer and create flags.  If the open
     fails, "cannot open db" is printed and the process exits with status 2.
     """
     self.db = kyotocabinet.DB()
     mode = kyotocabinet.DB.OWRITER | kyotocabinet.DB.OCREATE
     if not self.db.open(kyoto_db, mode):
         # A parenthesised single-argument print produces the same output
         # as the Python 2 print statement and is also valid on Python 3.
         print("cannot open db")
         sys.exit(2)
예제 #10
0
def make_store(options):
    """Create a descriptor store on disk from a SMILES file.

    Reads the following attributes of ``options``: ``descriptors``
    (comma-separated generator names), ``index_inchikey``, ``storage``
    (output directory, must not exist yet), ``numprocs`` (-1 means all
    CPUs), ``smilesfile``, ``seperator``, ``hasHeader``, ``smilesColumn``,
    ``nameColumn`` and ``batchsize``.

    The molecules are processed in batches by a multiprocessing pool and
    each non-empty result row is written to the raw store.  When
    ``index_inchikey`` is set, an ``inchikey.kch`` lookup from inchikey to
    the list of row indices is written; when ``nameColumn`` is set, a
    ``name.kch`` lookup is written from the collected names.

    Returns:
        False when inchikey indexing is requested but kyotocabinet is not
        available.  Otherwise the function falls off the end and returns
        None.

    Raises:
        IOError: if ``options.storage`` already exists.
    """
    # Reset the module-level generator list in place, then install the
    # generator for this run.
    while props:
        props.pop()

    props.append(MakeGenerator(options.descriptors.split(",")))
    properties = props[0]
    # to test molecule

    inchiKey = options.index_inchikey
    if inchiKey and not kyotocabinet:
        logging.warning(
            "Indexing inchikeys requires kyotocabinet, please install kyotocabinet"
        )
        return False

    # make the storage directory
    if os.path.exists(options.storage):
        raise IOError("Directory for descriptastorus already exists: %s" %
                      options.storage)

    # prepare the Pool
    if options.numprocs == -1:
        num_cpus = multiprocessing.cpu_count()
    else:
        # never use more than the maximum number
        num_cpus = min(int(options.numprocs), multiprocessing.cpu_count())

    pool = multiprocessing.Pool(num_cpus)

    os.mkdir(options.storage)
    with open(os.path.join(options.storage, "__options__"), 'wb') as f:
        pickle.dump(vars(options), f)

    # index the molfile
    indexdir = os.path.join(options.storage, "__molindex__")

    sm = MolFileIndex.MakeSmilesIndex(options.smilesfile,
                                      indexdir,
                                      sep=options.seperator,
                                      hasHeader=options.hasHeader,
                                      smilesColumn=options.smilesColumn,
                                      nameColumn=options.nameColumn)
    logging.info("Creating descriptors for %s molecules...", sm.N)

    numstructs = sm.N
    s = raw.MakeStore(properties.GetColumns(),
                      sm.N,
                      options.storage,
                      checkDirectoryExists=False)

    # Pre-declare the optional lookup databases so the finally clause can
    # close whichever of them was actually opened.
    cabinet = None
    name_cabinet = None
    try:
        if options.index_inchikey:
            logging.info("Creating inchi store")
            cabinet = kyotocabinet.DB()
            inchi = os.path.join(options.storage, "inchikey.kch")
            cabinet.open(inchi,
                         kyotocabinet.DB.OWRITER | kyotocabinet.DB.OCREATE)
        else:
            logging.warning("Not logging inchi (see --index-inchkey)")

        if options.nameColumn is not None:
            logging.info("Creating name store")
            name_cabinet = kyotocabinet.DB()
            name = os.path.join(options.storage, "name.kch")
            name_cabinet.open(
                name, kyotocabinet.DB.OWRITER | kyotocabinet.DB.OCREATE)
        else:
            logging.warning("Not storing name lookup (see --nameColumn)")

        logging.info("Number of molecules to process: %s", numstructs)

        count = 0
        numOutput = 0
        batchsize = options.batchsize
        badColumnWarning = False
        inchies = {}
        names = {}
        while 1:
            if options.nameColumn is not None:
                joblist, count = getJobsAndNames(sm, options, count,
                                                 numstructs, batchsize,
                                                 num_cpus, names)
            else:
                joblist, count = getJobs(sm, options, count, numstructs,
                                         batchsize, num_cpus)

            if not joblist:
                break

            t1 = time.time()
            if options.index_inchikey:
                results = pool.map(processInchi, joblist)
            else:
                results = pool.map(process, joblist)

            procTime = time.time() - t1

            for result in results:
                numOutput += len(result)
                # Warn once if nothing at all has been produced so far.
                if numOutput == 0 and not badColumnWarning and len(
                        result) == 0:
                    badColumnWarning = True
                    logging.warning(
                        "no molecules processed in batch, check the smilesColumn"
                    )
                    logging.warning("First 10 smiles:\n")
                    logging.warning("\n".join([
                        "%i: %s" % (i, sm.get(i))
                        for i in range(0, min(sm.N, 10))
                    ]))

            # flatten the results so that we store them in index order
            flattened = [val for sublist in results for val in sublist]
            flattened.sort()

            t1 = time.time()
            for result in flattened:
                if options.index_inchikey:
                    i, v, inchi, key = result
                    if v:
                        try:
                            s.putRow(i, v)
                        except ValueError:
                            logging.exception("Columns: %s\nData: %r",
                                              properties.GetColumns(), v)
                            raise
                    # The dictionary is keyed by inchikey, so membership
                    # must be tested with the key; testing the inchi string
                    # made colliding keys overwrite each other.
                    if key in inchies:
                        inchies[key].append(i)
                    else:
                        inchies[key] = [i]

                else:
                    # Rows are stored whether or not a name column is set;
                    # the former `elif nameColumn` skipped storage entirely
                    # when neither lookup was requested.
                    i, v = result
                    if v:
                        s.putRow(i, v)

            storeTime = time.time() - t1
            logging.info(
                "Done with %s out of %s.  Processing time %0.2f store time %0.2f",
                count, sm.N, procTime, storeTime)

        if options.index_inchikey:
            logging.info("Indexing inchies")
            t1 = time.time()
            for k in sorted(inchies):
                cabinet[k] = repr(inchies[k])
            logging.info("... indexed in %2.2f seconds", (time.time() - t1))

        if names:
            t1 = time.time()
            logging.info("Indexing names")
            for name in sorted(names):
                name_cabinet[name] = names[name]
            logging.info("... indexed in %2.2f seconds", (time.time() - t1))
    finally:
        # Close the lookup databases explicitly so their contents are
        # written out even when processing fails part-way.
        for lookup_db in (cabinet, name_cabinet):
            if lookup_db is not None:
                lookup_db.close()
        sm.close()
        s.close()
        pool.close()
예제 #11
0
    print 'Specify a test: redis-normal, redis-hashes, kyoto, tokyo'
    sys.exit(2)
    
# Select the backend under test from the first command-line argument by
# switching on exactly one of the mode flags.
# NOTE(review): the flags are presumably initialised to False earlier in the
# script (not visible here); an unrecognised argument leaves them untouched.
if sys.argv[1] == 'redis-normal':
    REDIS_SETGET = True
elif sys.argv[1] == 'redis-hashes':
    REDIS_HSET = True
elif sys.argv[1] == 'kyoto':
    KYOTO = True
elif sys.argv[1] == 'tokyo':
    TOKYO = True

# Prepare the handle the selected backend needs.  No setup is done here for
# the TOKYO mode.
if REDIS_SETGET or REDIS_HSET:
    # `r` is defined outside this excerpt -- presumably a Redis client.
    p = r.pipeline()
elif KYOTO:
    k = kyotocabinet.DB()
    # Open with the writer and create flags; abort with exit status 2 if the
    # database cannot be opened.
    if not k.open("/tmp/casket.kch", kyotocabinet.DB.OWRITER | kyotocabinet.DB.OCREATE):
        print "cannot open db"
        sys.exit(2)
    
for i in range(0, NUM_ENTRIES):
    value = random.randint(0, MAX_VAL)
    if REDIS_SETGET:
        r.set(str(i), value)
    elif REDIS_HSET:
        bucket = int(i / 513)
        p.hset(bucket, i, value)
    elif KYOTO:
        k.set(str(i), value)

    if i % (NUM_ENTRIES/10) == 0:
예제 #12
0
 def open(self):
     """Create a new database handle and open the file at the stored path.

     The database is opened with the writer and create flags.

     Raises:
         PAWSError: with the handle's error text when the open fails.
     """
     handle = kyotocabinet.DB()
     self.__db = handle
     mode = kyotocabinet.DB.OWRITER | kyotocabinet.DB.OCREATE
     if handle.open(self.__dbpath, mode):
         return
     raise PAWSError('open error: ' + str(handle.error()))