示例#1
0
    def _pipe_from_textfile(self, finp):
        """Load scorer settings from an already opened text stream.

        Lines are consumed with ``finp.readline()`` until either the stream
        is exhausted or a line starting with ``end`` is met.  Everything after
        a ``##`` marker on a line is ignored.  Key and keyword matching is
        case-insensitive and prefix based.

        Recognised lines:
            ``formula... = <formula>,<score>``  append one encoded formula to
                ``self.formulas`` and its score to ``self.scores``.
            ``unknown_score... = <value>``      set ``self.unknown_score``.
            ``dict_form...``                    switch ``self.vector_form`` off.
            ``end...``                          finish; when not in vector
                form, require the ``'Formula'`` field, decode the stored
                formulas and rebind the record-processing method.

        A line containing ``=`` is only ever treated as a key/value pair, so
        it is never examined for the ``dict_form`` / ``end`` keywords.
        """
        # iter(callable, sentinel) keeps calling readline() until it returns ''.
        for raw_line in iter(finp.readline, ''):
            text = raw_line.rstrip('\n').lstrip()
            # Keep only what precedes the first '##' (whole line if absent).
            text = text.partition('##')[0]

            if '=' in text:
                key, value = text.split('=', 1)
                key = key.lower()
                if key.startswith('formula'):
                    fields = value.split(',')
                    self.formulas.append(
                        encode_formula_to_array(parse_formula(fields[0])))
                    self.scores.append(float(fields[1]))
                elif key.startswith('unknown_score'):
                    self.unknown_score = float(value)
                continue

            keyword = text.lower()
            if keyword.startswith('dict_form'):
                self.vector_form = False
                continue
            if not keyword.startswith('end'):
                continue

            # 'end' reached: finalise the configuration and stop reading.
            if not self.vector_form:
                self.required_fields = ['Formula']
                for position, encoded in enumerate(self.formulas):
                    self.formulas[position] = decode_formula_from_array(
                        encoded)
                self.process_molecular_candidate_record = self.__process_molecular_candidate_record_formula
            return
示例#2
0
    def _pipe_from_textfile(self, finp):
        """Load filter settings from an already opened text stream.

        Reads with ``finp.readline()`` until end of stream or until a line
        starting with ``end``.  Text after ``##`` on a line is discarded.
        Matching is case-insensitive and prefix based.

        Recognised lines:
            ``formula... = <formula>``  append the encoded formula to
                ``self.filter``.
            ``dict_form...``            switch ``self.vector_form`` off.
            ``end...``                  finish; when not in vector form,
                require the ``'Formula'`` field, decode the stored formulas
                and rebind ``self.rejected`` to the formula-based variant.

        Any other line, including ``key=value`` lines with a different key,
        is skipped.
        """
        # iter(callable, sentinel) keeps calling readline() until it returns ''.
        for raw_line in iter(finp.readline, ''):
            # Strip the newline / leading blanks, then drop any '##' comment.
            text = raw_line.rstrip('\n').lstrip().partition('##')[0]

            if '=' in text:
                key, value = text.split('=', 1)
                if key.lower().startswith('formula'):
                    self.filter.append(
                        encode_formula_to_array(parse_formula(value)))
                continue

            keyword = text.lower()
            if keyword.startswith('dict_form'):
                self.vector_form = False
            elif keyword.startswith('end'):
                if not self.vector_form:
                    self.required_fields = ['Formula']
                    for position, encoded in enumerate(self.filter):
                        self.filter[position] = decode_formula_from_array(
                            encoded)
                    self.rejected = self.__rejected_formula
                return
示例#3
0
    def __init__(self, formulas=None, use_vector_form=True):
        """Build the list of accepted formulas.

        Args:
            formulas: ``None`` (empty filter), a comma-separated formula
                string, a formula ``dict``, an already encoded
                ``numpy.ndarray``, or a ``list`` mixing any of those three.
            use_vector_form: when true the filter keeps encoded arrays and
                requires the ``'FormulaVector'`` record field; otherwise the
                entries are decoded and ``'Formula'`` is required.

        Raises:
            TypeError: if ``formulas`` (or a list element) has an
                unsupported type.
        """
        self.filter = []
        self.vector_form = use_vector_form
        #self.supported_adducts=set();
        self.required_fields = (['FormulaVector']
                                if self.vector_form else ['Formula'])

        if formulas is None:
            return

        def absorb(item):
            """Append ``item`` to the filter; return False if unsupported."""
            if isinstance(item, str):
                for token in item.split(','):
                    self.filter.append(
                        encode_formula_to_array(parse_formula(token)))
            elif isinstance(item, dict):
                self.filter.append(encode_formula_to_array(item))
            elif isinstance(item, np.ndarray):
                self.filter.append(item)
            else:
                return False
            return True

        if isinstance(formulas, list):
            for entry in formulas:
                if not absorb(entry):
                    raise TypeError(
                        'Wrong type argument for FormulasFilter initialization! str, dict, list of (dict or str) supported only!'
                    )
        elif not absorb(formulas):
            raise TypeError(
                'Wrong type argument for FormulasFilter initialization! str, dict, list of (dict or str) supported only!'
            )

        if not self.vector_form:
            # Dictionary mode: turn every stored array back into a formula.
            for position, encoded in enumerate(self.filter):
                self.filter[position] = decode_formula_from_array(encoded)
示例#4
0
def process_file(fname, correct_key, best_results, worst_results,
                 correct_formula):
    """Add one result file's ranking of the correct candidate to the tallies.

    The CSV file ``fname`` is read, its header row is dropped, and the
    remaining rows are thinned out: a row is removed when its key (column 9)
    equals the key of the row directly above it, or when its formula
    (column 6) is rejected by an ``ElementCompositionFilter`` or a
    ``FormulasFilter`` built from ``correct_formula``.  The positions of the
    surviving rows whose key equals ``correct_key`` are then located.

    Args:
        fname: path of the CSV result file.
        correct_key: value expected in column 9 of the correct candidate.
        best_results: mutable sequence of counters, updated in place; entry
            ``i`` is incremented when the first matching row is at position
            ``<= i``.
        worst_results: same as ``best_results`` but for the last matching
            row.  It is indexed over ``range(len(best_results))``.
        correct_formula: formula specification handed to both filters.

    Returns:
        None.  Nothing is tallied when no surviving row matches.
    """
    element_filter = ElementCompositionFilter(correct_formula,
                                              correct_formula)
    formula_filter = FormulasFilter(correct_formula)

    # csv.reader needs a text-mode stream opened with newline=''; a binary
    # ('rb') handle makes it fail on Python 3.
    with open(fname, 'r', newline='') as finp:
        # Slicing (instead of ``del rows[0]``) also copes with an empty file.
        rows = list(csv.reader(finp))[1:]

    # NOTE(review): the first data row is never passed through the filters
    # (the original loop started at index 1); kept as-is - confirm intent.
    kept = rows[:1]
    # Each row is compared with its predecessor in the unfiltered list,
    # which is what the original reverse-deletion loop did as well.
    for previous, row in zip(rows, rows[1:]):
        if row[9] == previous[9]:
            continue
        formula = parse_formula(row[6])
        if element_filter.rejected(
                {'ElementVector': formula_to_element_vector(formula)}):
            continue
        if formula_filter.rejected(
                {'FormulaVector': encode_formula_to_array(formula)}):
            continue
        kept.append(row)

    hits = [rank for rank, row in enumerate(kept) if correct_key == row[9]]
    if not hits:
        return
    first_hit = hits[0]
    last_hit = hits[-1]

    for i in range(len(best_results)):
        if first_hit <= i:
            best_results[i] += 1
        if last_hit <= i:
            worst_results[i] += 1
示例#5
0
    def _get_next_raw_record(self):
        """Return the next record read from the ``.st2`` text files.

        Files live under ``database_path/db_name/subf`` in a directory tree
        derived from the digits of the integer ``self.mzcurrent``.  When the
        current file is exhausted the method closes it and moves on to the
        next existing file, advancing ``self.mzcurrent`` one step at a time.

        Each line is tab separated.  Column 0 is the m/z value; for charged
        data columns 1 and 2 hold mass and charge.  The remaining columns are
        addressed relative to ``self.offs`` and are only decoded when the
        corresponding name appears in ``self.required_fields``.

        Returns:
            A ``MolecularRecord`` filled with the requested fields.

        Raises:
            StopIteration: once ``self.mzcurrent`` exceeds
                ``self.mzmax_int``.
        """

        def bucket_path(mz):
            # .../<mz // 1000>/<hundreds digit>/<tens digit>/<units digit>.st2
            return os.path.join(self.database_path, self.db_name, self.subf,
                                str(mz // 1000),
                                str(mz % 1000 // 100),
                                str(mz % 100 // 10),
                                '%s.st2' % str(mz % 10))

        def open_next_existing_bucket():
            # Starting at the current m/z, skip forward to the first file
            # that exists and open it; give up past the upper m/z bound.
            self.currentfile = bucket_path(self.mzcurrent)
            while (not os.path.isfile(self.currentfile)
                   and self.mzcurrent <= self.mzmax_int):
                self.mzcurrent += 1
                self.currentfile = bucket_path(self.mzcurrent)
            if self.mzcurrent <= self.mzmax_int:
                self.datafile = open(self.currentfile, 'r')
                self.record_index = -1
                return
            raise StopIteration()

        if self.currentfile == '':
            open_next_existing_bucket()

        line = self.datafile.readline()
        self.record_index += 1
        while line == '':
            # Current file is finished: move on to the following m/z bucket.
            self.datafile.close()
            self.mzcurrent += 1
            open_next_existing_bucket()
            line = self.datafile.readline()
            self.record_index += 1

        columns = line.rstrip('\n').split('\t')

        def shifted(position):
            # Column lookup relative to the charged/neutral column offset.
            return columns[position + self.offs]

        record = MolecularRecord()
        record['MZ'] = float(columns[0])
        if self.charged:
            record['Mass'] = float(columns[1])
            record['Charge'] = float(columns[2])
        else:
            record['Mass'] = record['MZ']
            record['Charge'] = 0

        wanted = self.required_fields

        if 'ShortInChI' in wanted:
            record['ShortInChI'] = parse_inchi(shifted(2))[0]

        if 'InChI' in wanted:
            record['InChI'] = shifted(2)

        if 'SMILES' in wanted:
            record['SMILES'] = shifted(3)

        if 'IDs' in wanted:
            record['IDs'] = shifted(4)

        if 'FPT' in wanted:
            record['FPT'] = decode_from_base64(shifted(5))
            # Mask FPT here !

        if 'Frag' in wanted:
            record['Frag'] = shifted(6)
            if self.charged:
                # Fixed column, not offset-relative.
                record['FragCharge'] = columns[9]

        if 'InChIKeyValues' in wanted:
            record['InChIKeyValues'] = inchikeyvalues_from_inchi(shifted(2))

        if 'InChIKey' in wanted:
            record['InChIKey'] = inchikey_from_inchi(shifted(2))

        # The formula is parsed once and shared by its three representations.
        if any(name in wanted
               for name in ('Formula', 'ElementVector', 'FormulaVector')):
            parsed_formula = parse_formula(shifted(1).split('/')[0])

            if 'Formula' in wanted:
                record['Formula'] = parsed_formula

            if 'ElementVector' in wanted:
                record['ElementVector'] = formula_to_element_vector(
                    parsed_formula)

            if 'FormulaVector' in wanted:
                record['FormulaVector'] = encode_formula_to_array(
                    parsed_formula)

        return record
示例#6
0
    def setup_scorer(self, formulas, scores, unknown_score=0.0):
        """Reset the scorer with a new set of formulas and matching scores.

        Args:
            formulas: a comma-separated formula string, a formula ``dict``,
                an encoded ``numpy.ndarray``, or a ``list`` mixing those.
            scores: a comma-separated string, a ``float``, an ``int``, or a
                ``list`` of floats, ints, strings (comma-separated) or dicts
                (dicts are stored unchanged).
            unknown_score: stored as ``self.unknown_score``.

        Raises:
            TypeError: for an unsupported ``formulas`` / ``scores`` type, or
                when the number of scores differs from the number of
                formulas.

        When ``self.vector_form`` is false the collected formulas are decoded
        back from their array encoding at the end.
        """
        self.formulas = []
        self.scores = []
        self.unknown_score = unknown_score

        def absorb_formula(item):
            """Store one str/dict/ndarray formula; False if unsupported."""
            if isinstance(item, str):
                for token in item.split(','):
                    self.formulas.append(
                        encode_formula_to_array(parse_formula(token)))
            elif isinstance(item, dict):
                self.formulas.append(encode_formula_to_array(item))
            elif isinstance(item, np.ndarray):
                self.formulas.append(item)
            else:
                return False
            return True

        def absorb_score(item):
            """Store one str/float/int score; False if unsupported."""
            if isinstance(item, str):
                for token in item.split(','):
                    self.scores.append(float(token))
            elif isinstance(item, float):
                self.scores.append(item)
            elif isinstance(item, int):
                self.scores.append(float(item))
            else:
                return False
            return True

        # Formulas are fully processed before any score is looked at.
        if isinstance(formulas, list):
            for entry in formulas:
                if not absorb_formula(entry):
                    raise TypeError(
                        'Wrong type argument (formulas) for FormulaVectors initialization! str, formula, list of (formula, formulavector or str) supported only!'
                    )
        elif not absorb_formula(formulas):
            raise TypeError(
                'Wrong type argument (formulas) for FormulaVectors initialization! str, formula, list of (formula, formulavector or str) supported only!'
            )

        if isinstance(scores, list):
            for entry in scores:
                if isinstance(entry, dict):
                    # Dictionaries are only accepted inside a list, as-is.
                    self.scores.append(entry)
                elif not absorb_score(entry):
                    raise TypeError(
                        'Wrong type argument (scores) for FormulaScorer initialization! str, float, int, list of (float, int or str) or dictionary supported only!'
                    )
        elif not absorb_score(scores):
            raise TypeError(
                'Wrong type argument (scores) for FormulaScorer initialization! str, float, int, list of (float, int or str) supported only!'
            )

        if len(self.scores) != len(self.formulas):
            raise TypeError(
                'Number of formulas and number of scores supplied do not match!'
            )

        if not self.vector_form:
            for position, encoded in enumerate(self.formulas):
                self.formulas[position] = decode_formula_from_array(encoded)
示例#7
0
    }
    test.process_molecular_candidate_record(None, record)
    print(record)

    record = {
        'Formula': parse_formula('O4'),
        'Scores': {}
    }
    test.process_molecular_candidate_record(None, record)
    print(record)
    print('FormulaVector')
    test = FormulaScorer()
    test.setup_scorer('C2H5OH,CH4,PO4', '0.3,0.1,0.5', 1.0)

    record = {
        'FormulaVector': encode_formula_to_array(parse_formula('CH4')),
        'Scores': {}
    }
    test.process_molecular_candidate_record(None, record)
    print(record)

    record = {
        'FormulaVector': encode_formula_to_array(parse_formula('PO4')),
        'Scores': {}
    }
    test.process_molecular_candidate_record(None, record)
    print(record)

    record = {
        'FormulaVector': encode_formula_to_array(parse_formula('C2H5OH')),
        'Scores': {}
示例#8
0
    print(test.rejected({'Formula': parse_formula('CH4N')}))
    #True
    print(test.rejected({'Formula': parse_formula('CH3')}))
    #True
    print(test.rejected({'Formula': parse_formula('C2H4')}))
    #True
    print(test.rejected({'Formula': parse_formula('C')}))
    #True
    print(test.rejected({'Formula': parse_formula('CO4')}))
    #True
    print('FormulasVector')
    test = FormulasFilter('C2H5OH')
    print(test.filter)
    test = FormulasFilter('C2H5OH,CH4,PO4')
    print(test.filter)
    test = FormulasFilter(encode_formula_to_array(parse_formula('C2H5OH')))
    print(test.filter)
    test = FormulasFilter(['C2H5OH,PO4', parse_formula('CH4')])

    print(test.filter)
    print(
        test.rejected(
            {'FormulaVector': encode_formula_to_array(parse_formula('CH4'))}))
    #False
    print(
        test.rejected(
            {'FormulaVector': encode_formula_to_array(parse_formula('PO4'))}))
    #False
    print(
        test.rejected(
            {'FormulaVector': encode_formula_to_array(parse_formula('H4C'))}))
示例#9
0
    def hdf5_import_from_st2raw(self,
                                inpath,
                                fptmask=np.ones((11416, ), np.uint8)):
        """Import a raw ``.st2`` text database into ``self.HDF5container``.

        Steps performed:
          1. Copy the ``key=value`` lines of ``<inpath>/dbinfo.dat`` into the
             container attributes (``DBFORMAT`` is forced to ``'3'``).
          2. Store the fingerprint mask information in the top-level
             ``FingerPrints`` group.
          3. For each of ``/Negative``, ``/Positive`` and ``/Neutral`` create
             the datasets and fill them from every
             ``<inpath><subpath>/<i>/<j>/<k>/<l>.st2`` file that exists.

        Args:
            inpath: root directory of the raw database; must contain
                ``dbinfo.dat``.
            fptmask: array with 11416 entries; positions equal to 1 mark the
                fingerprint bits that are kept.  The default array is created
                once at definition time; this method only reads it (and later
                rebinds the local name), it does not modify it.

        Raises:
            IOError: if ``dbinfo.dat`` is not found under ``inpath``.

        Side effects:
            Writes groups/datasets/attributes into ``self.HDF5container``,
            advances ``self.fragprintpos`` and ``self.asciipos``, and prints
            progress messages.
        """
        if not os.path.isfile(os.path.join(inpath, 'dbinfo.dat')):
            raise IOError('Database info file not found: %s' %
                          os.path.join(inpath, 'dbinfo.dat'))

        # Tag the container with its type and version.
        self.HDF5container.attrs['HDF5ContainerType'] = np.string_(
            'DistilledChemicalDatabase')
        self.HDF5container.attrs['HDF5ContainerVersion'] = np.string_('1.0')
        finp = open(os.path.join(inpath, 'dbinfo.dat'), 'r')
        #fout=open(os.path.join(self.folderpath,'dbinfo.dat'),'w');
        # Each line is split at the first '=' into [key, value].
        # NOTE(review): a non-blank line without '=' has no s[1] and would
        # raise IndexError below - confirm dbinfo.dat never contains one.
        for s in finp:
            s = s.rstrip('\n').lstrip().split('=', 1)
            if s[0].upper() == 'DBFORMAT':
                s[1] = '3'
            if s[0] != '':
                #fout.write('%s=%s\n'%(s[0],s[1]));
                self.HDF5container.attrs[s[0]] = np.string_(s[1])

        #fout.close();
        finp.close()

        # Indices of the fingerprint bits selected by the mask.
        fptlist = []

        for i in range(11416):
            if fptmask[i] == 1:
                fptlist.append(i)

        fptlen = len(fptlist)
        # Packed all-ones array of fptlen bits; only its length (the packed
        # width in bytes) is used before the name is rebound to a dataset.
        fptsubmask = np.packbits(np.ones((fptlen, ), np.uint8))
        fptmasklen = len(fptsubmask)
        fptindexes = np.array(fptlist, dtype=np.uint32)
        packedmask = np.packbits(fptmask)
        packedmasklen = len(packedmask)

        #hdf5_ascii_string = h5py.special_dtype(vlen=bytes);

        fptgroup = self.HDF5container.create_group('FingerPrints')
        #Original mask, packed
        fptoriginalmask = fptgroup.create_dataset("FPTOriginalMask",
                                                  (packedmasklen, ),
                                                  maxshape=(packedmasklen, ),
                                                  dtype=np.uint8)
        fptoriginalmask[:] = packedmask[:]

        #List of indices of original FPT bits (11416)
        # From here on the name `fptmask` refers to the dataset, not to the
        # mask array passed in.
        fptmask = fptgroup.create_dataset("FPTMask", (fptlen, ),
                                          maxshape=(fptlen, ),
                                          compression="gzip",
                                          compression_opts=4,
                                          dtype=np.uint32)
        fptmask[:] = fptindexes[:]

        #Mask for working bits (packed)
        # The dataset is created here but nothing is written to it in this
        # method.
        fptsubmask = fptgroup.create_dataset("FPTsubmask", (1, fptmasklen),
                                             chunks=(100, fptmasklen),
                                             maxshape=(None, fptmasklen),
                                             compression="gzip",
                                             compression_opts=4,
                                             dtype=np.uint8)

        #FPT info: 0 - original bit count=11416, 1 - length of new fpt after masking, 2 - length of the packed fpt, 3 - No of padding bits
        fptinfo = fptgroup.create_dataset("FPTInfo", (4, ),
                                          maxshape=(4, ),
                                          dtype=np.uint32)
        fptinfo[0] = 11416
        fptinfo[1] = fptlen
        fptinfo[2] = fptmasklen
        fptinfo[3] = fptmasklen * 8 - fptlen

        print('Listing input files')

        # One top-level group per charge state; each gets its own datasets.
        subpaths = ['/Negative', '/Positive', '/Neutral']
        #subpaths=['/Positive'];
        for subpath in subpaths:
            print(subpath)

            fptgroup = self.HDF5container.create_group(subpath +
                                                       '/FingerPrints')
            fraggroup = self.HDF5container.create_group(subpath +
                                                        '/FragPrints')
            chemgroup = self.HDF5container.create_group(subpath + '/ChemInfo')
            chargegroup = self.HDF5container[subpath]

            #New FPT Array, packed and trimmed to fptmask
            fptdataset = fptgroup.create_dataset("FPTArray", (1, fptmasklen),
                                                 chunks=(100, fptmasklen),
                                                 maxshape=(None, fptmasklen),
                                                 compression="gzip",
                                                 compression_opts=4,
                                                 dtype=np.uint8)

            # Charged groups store three values per record (m/z, mass,
            # charge); the neutral group stores m/z only.
            if subpath != '/Neutral':
                masschargedataset = chargegroup.create_dataset(
                    "MZMassCharge", (1, 3),
                    chunks=(10000, 3),
                    maxshape=(None, 3),
                    compression="gzip",
                    compression_opts=4,
                    dtype=np.float32)
            else:
                mzdataset = chargegroup.create_dataset("MZ", (1, ),
                                                       chunks=(10000, ),
                                                       maxshape=(None, ),
                                                       compression="gzip",
                                                       compression_opts=4,
                                                       dtype=np.float32)

            inchikey_dataset = chemgroup.create_dataset("InChiKeyValues",
                                                        (1, 15),
                                                        chunks=(10000, 15),
                                                        maxshape=(None, 15),
                                                        compression="gzip",
                                                        compression_opts=4,
                                                        dtype=np.uint8)

            elementsvector_dataset = chemgroup.create_dataset(
                "ElementsVector", (1, 12),
                chunks=(10000, 12),
                maxshape=(None, 12),
                compression="gzip",
                compression_opts=4,
                dtype=np.uint8)

            formulavector_dataset = chemgroup.create_dataset(
                "FormulaVector", (1, 96),
                chunks=(10000, 96),
                maxshape=(None, 96),
                compression="gzip",
                compression_opts=4,
                dtype=np.uint16)

            # Per-record [start, end) offsets into FragPrintValues.
            fragprintindex_dataset = fraggroup.create_dataset(
                "FragPrintIndex", (1, 2),
                chunks=(10000, 2),
                maxshape=(None, 2),
                compression="gzip",
                compression_opts=4,
                dtype=np.int64)
            fragprintvalues_dataset = fraggroup.create_dataset(
                "FragPrintValues", (1, ),
                chunks=(10000, ),
                maxshape=(None, ),
                compression="gzip",
                compression_opts=4,
                dtype=np.float32)

            # SMILES, IDs and the four InChI parts are stored as
            # [start, end) offsets into the shared ASCII byte dataset.
            smiles_dataset = chemgroup.create_dataset("SMILES", (1, 2),
                                                      chunks=(10000, 2),
                                                      maxshape=(None, 2),
                                                      compression="gzip",
                                                      compression_opts=4,
                                                      dtype=np.int64)

            inchi_dataset = chemgroup.create_dataset("InChi", (1, 4, 2),
                                                     chunks=(10000, 4, 2),
                                                     maxshape=(None, 4, 2),
                                                     compression="gzip",
                                                     compression_opts=4,
                                                     dtype=np.int64)

            ids_dataset = chemgroup.create_dataset("IDs", (1, 2),
                                                   chunks=(10000, 2),
                                                   maxshape=(None, 2),
                                                   compression="gzip",
                                                   compression_opts=4,
                                                   dtype=np.int64)

            ascii_dataset = chemgroup.create_dataset("ASCII", (1, ),
                                                     chunks=(10000, ),
                                                     maxshape=(None, ),
                                                     compression="gzip",
                                                     compression_opts=4,
                                                     dtype=np.uint8)

            # Record numbering restarts for every charge-state group.
            # NOTE(review): self.fragprintpos and self.asciipos are NOT reset
            # in this method although FragPrintValues and ASCII are created
            # anew per group - confirm that continuing offsets is intended.
            recordindex = -1

            fileslist = []

            # Probe the directory tree <i>/<j>/<k>/<l>.st2 with i in 0..1999
            # and j, k, l in 0..9, collecting the files that exist.
            for i in range(0, 2000):
                if os.path.exists(inpath + subpath + '/%s' % i):
                    print(inpath + subpath + '/%s' % i)
                    for j in range(0, 10):
                        if os.path.exists(inpath + subpath + '/%s/%s' %
                                          (i, j)):
                            for k in range(0, 10):
                                if os.path.exists(inpath + subpath +
                                                  '/%s/%s/%s' % (i, j, k)):
                                    for l in range(0, 10):
                                        if os.path.isfile(inpath + subpath +
                                                          '/%s/%s/%s/%s.st2' %
                                                          (i, j, k, l)):
                                            fileslist.append(
                                                inpath + subpath +
                                                '/%s/%s/%s/%s.st2' %
                                                (i, j, k, l))

            print('Total number of input files: %s' % len(fileslist))

            for filename in fileslist:
                fpath, fname = os.path.split(filename)
                # This overwrites the outer loop variable `subpath`; the
                # outer `for` rebinds it on its next iteration.
                subpath = fpath.replace(inpath, '')
                # Charged files carry two extra columns (mass, charge), so
                # the later columns are shifted by `offs`.
                if 'Neutral' in subpath:
                    charged = False
                    offs = 0
                else:
                    charged = True
                    offs = 2
                print('Importing: .../%s/%s' % (subpath, fname))
                # First pass: parse the whole file into memory.
                dblist = []
                with open(filename, 'r') as finp:
                    for s in finp:
                        # NOTE(review): the bare except below hides every
                        # error raised while parsing a row (the row is just
                        # reported and skipped).
                        try:
                            s = s.replace('\n', '').replace('\r',
                                                            '').split('\t')
                            mz = float(s[0])
                            if charged:
                                mass = float(s[1])
                                charge = float(s[2])
                            else:
                                mass = mz
                                charge = 0.0
                            # Rows with mass below 12.0 are dropped silently.
                            if mass >= 12.0:
                                #shortinchi=s[1+offs];
                                inchi = s[2 + offs]
                                smiles = s[3 + offs]
                                ids = s[4 + offs]
                                fpt = s[5 + offs]
                                fpt = decode_from_base64(fpt)
                                fpt = np.unpackbits(fpt)
                                frag = s[6 + offs]
                                if charged:
                                    fragcharge = s[9]
                                else:
                                    fragcharge = ''
                                # The index is only consumed once all fields
                                # above were read without an exception.
                                recordindex += 1
                                if recordindex % 1000 == 0:
                                    print('Total: %s' % (recordindex + 1))

                                dblist.append([
                                    recordindex, mz, charged, mass, charge,
                                    inchi, fpt, frag, fragcharge, smiles, ids
                                ])
                        except:
                            print('Error! Skipping!')

                # Second pass: grow the per-record datasets once per file,
                # then write the parsed rows.
                if len(dblist) > 0:
                    #expand datasets here
                    fptdataset.resize((recordindex + 1, fptmasklen))

                    if charged:
                        masschargedataset.resize((recordindex + 1, 3))
                    else:
                        mzdataset.resize((recordindex + 1, ))

                    inchikey_dataset.resize((recordindex + 1, 15))

                    elementsvector_dataset.resize((recordindex + 1, 12))

                    formulavector_dataset.resize((recordindex + 1, 96))

                    fragprintindex_dataset.resize((recordindex + 1, 2))

                    smiles_dataset.resize((recordindex + 1, 2))

                    ids_dataset.resize((recordindex + 1, 2))

                    inchi_dataset.resize((recordindex + 1, 4, 2))

                    # db layout: [recordindex, mz, charged, mass, charge,
                    #             inchi, fpt, frag, fragcharge, smiles, ids]
                    for db in dblist:
                        currentindex = db[0]
                        # Keep only the masked bits, then pack them again.
                        fptdataset[currentindex, :] = np.packbits(
                            db[6][fptindexes])[:]
                        #print(inchi)
                        # parse_inchi's result is indexed [0], [1], [2]
                        # below; element 0 starts with the formula followed
                        # by '/'-separated layers (presumably the short
                        # InChI - TODO confirm against parse_inchi).
                        inchi = parse_inchi(db[5])
                        #print(inchi)
                        inchikeyvalues = inchikeyvalues_from_inchi(db[5])

                        sformula = inchi[0].split('/', 1)[0]

                        #print(sformula);
                        formula = parse_formula(sformula)

                        elementsvector = formula_to_element_vector(formula)
                        encodedformula = encode_formula_to_array(formula)

                        charge = db[4]
                        charged = db[2]

                        if charged:
                            #print(db[7],db[8])
                            frags = parse_string_fragment_charges(
                                charge, db[7], db[8])
                            #print(frags)
                        else:
                            frags = parse_string_fragments(db[7])

                        if charged:
                            masschargedataset[currentindex, 0] = db[1]
                            masschargedataset[currentindex, 1] = db[3]
                            masschargedataset[currentindex, 2] = charge
                        else:
                            mzdataset[currentindex] = db[1]

                        inchikey_dataset[currentindex, :] = inchikeyvalues[:]

                        elementsvector_dataset[
                            currentindex, :] = elementsvector[:]

                        formulavector_dataset[
                            currentindex, :] = encodedformula[:]

                        # Append this record's fragment values to the flat
                        # value dataset and remember its [start, end) range.
                        fragcount = len(frags)
                        frags = np.array(frags, dtype=np.float32)
                        fragprintindex_dataset[currentindex,
                                               0] = self.fragprintpos
                        fragprintindex_dataset[
                            currentindex, 1] = self.fragprintpos + fragcount

                        fragprintvalues_dataset.resize(
                            (self.fragprintpos + fragcount, ))

                        fragprintvalues_dataset[self.fragprintpos:self.
                                                fragprintpos +
                                                fragcount] = frags[:]

                        self.fragprintpos += fragcount

                        # Text fields are stored as raw ASCII bytes.
                        smiles = bytearray(db[9].encode('ascii'))

                        smileslen = len(smiles)

                        smiles = np.array(smiles, dtype=np.uint8)

                        ids = bytearray(db[10].encode('ascii'))

                        idslen = len(ids)

                        ids = np.array(ids, dtype=np.uint8)

                        # Remainder of inchi[0] after the formula part
                        # (empty when there is no '/').
                        sinchi = inchi[0].split('/', 1)
                        if len(sinchi) > 1:
                            sinchi = sinchi[1]
                        else:
                            sinchi = ''

                        # Slot order in the InChi dataset: 0 = formula,
                        # 1 = remainder of inchi[0], 2 = inchi[2],
                        # 3 = inchi[1].
                        inchi0 = bytearray(sformula.encode('ascii'))
                        inchi1 = bytearray(sinchi.encode('ascii'))
                        inchi2 = bytearray(inchi[2].encode('ascii'))
                        inchi3 = bytearray(inchi[1].encode('ascii'))

                        inchi0len = len(inchi0)
                        inchi1len = len(inchi1)
                        inchi2len = len(inchi2)
                        inchi3len = len(inchi3)

                        inchi0 = np.array(inchi0, dtype=np.uint8)
                        inchi1 = np.array(inchi1, dtype=np.uint8)
                        inchi2 = np.array(inchi2, dtype=np.uint8)
                        inchi3 = np.array(inchi3, dtype=np.uint8)

                        # Grow the byte store once for all six strings of
                        # this record, then write them back to back while
                        # advancing self.asciipos.
                        ascii_dataset.resize(
                            (self.asciipos + smileslen + idslen + inchi0len +
                             inchi1len + inchi2len + inchi3len, ))

                        smiles_dataset[currentindex, 0] = self.asciipos
                        smiles_dataset[currentindex,
                                       1] = self.asciipos + smileslen
                        ascii_dataset[self.asciipos:self.asciipos +
                                      smileslen] = smiles[:]
                        self.asciipos += smileslen

                        ids_dataset[currentindex, 0] = self.asciipos
                        ids_dataset[currentindex, 1] = self.asciipos + idslen
                        ascii_dataset[self.asciipos:self.asciipos +
                                      idslen] = ids[:]
                        self.asciipos += idslen

                        inchi_dataset[currentindex, 0, 0] = self.asciipos
                        inchi_dataset[currentindex, 0,
                                      1] = self.asciipos + inchi0len
                        ascii_dataset[self.asciipos:self.asciipos +
                                      inchi0len] = inchi0[:]
                        self.asciipos += inchi0len

                        inchi_dataset[currentindex, 1, 0] = self.asciipos
                        inchi_dataset[currentindex, 1,
                                      1] = self.asciipos + inchi1len
                        ascii_dataset[self.asciipos:self.asciipos +
                                      inchi1len] = inchi1[:]
                        self.asciipos += inchi1len

                        inchi_dataset[currentindex, 2, 0] = self.asciipos
                        inchi_dataset[currentindex, 2,
                                      1] = self.asciipos + inchi2len
                        ascii_dataset[self.asciipos:self.asciipos +
                                      inchi2len] = inchi2[:]
                        self.asciipos += inchi2len

                        inchi_dataset[currentindex, 3, 0] = self.asciipos
                        inchi_dataset[currentindex, 3,
                                      1] = self.asciipos + inchi3len
                        ascii_dataset[self.asciipos:self.asciipos +
                                      inchi3len] = inchi3[:]
                        self.asciipos += inchi3len

        print('Import Finished!')