示例#1
0
    def __init__(self, compulsory_elements='', elements=''):
        """Build the allowed and compulsory element vectors of the filter.

        Each argument may be a formula string, a formula dict, a list or a
        set; the matching converter (``formula_to_element_vector``,
        ``list_to_element_vector`` or ``set_to_element_vector``) turns it
        into an element vector. ``elements`` is converted first, then
        ``compulsory_elements``. The allowed vector is finally OR-ed with
        the compulsory one.

        Raises:
            TypeError: if either argument is not a str, dict, list or set.
        """

        def as_element_vector(spec):
            # Pick the converter that matches the container type of *spec*.
            if isinstance(spec, str):
                return formula_to_element_vector(parse_formula(spec))
            if isinstance(spec, dict):
                return formula_to_element_vector(spec)
            if isinstance(spec, list):
                return list_to_element_vector(spec)
            if isinstance(spec, set):
                return set_to_element_vector(spec)
            raise TypeError('Wrong type argument for ElementCompositionFilter initialization! str, dict, set and list supported only!')

        self.element_filter = as_element_vector(elements)
        self.compulsory_element_filter = as_element_vector(compulsory_elements)

        # Every compulsory element is implicitly an allowed element as well.
        self.element_filter = np.bitwise_or(self.element_filter,
                                            self.compulsory_element_filter)

        self.required_fields = ['ElementVector']
示例#2
0
    def _pipe_from_textfile(self, finp):
        """Load scorer settings from an open text stream.

        Lines are consumed until end of file or until a line starting with
        ``end`` (case-insensitive). Everything after ``##`` on a line is
        ignored. Recognised lines:

        * ``formula...=<formula>,<score>`` appends the encoded formula to
          ``self.formulas`` and the score to ``self.scores``;
        * ``unknown_score...=<value>`` sets ``self.unknown_score``;
        * ``dict_form...`` sets ``self.vector_form`` to False;
        * ``end...`` stops reading; when ``vector_form`` is off the stored
          formulas are decoded and the formula-based record processor is
          installed.
        """
        # iter() with a sentinel stops as soon as readline() returns ''.
        for raw_line in iter(finp.readline, ''):
            line = raw_line.rstrip('\n').lstrip().partition('##')[0]

            if '=' in line:
                key, value = line.split('=', 1)
                key = key.lower()
                if key.startswith('formula'):
                    fields = value.split(',')
                    self.formulas.append(
                        encode_formula_to_array(parse_formula(fields[0])))
                    self.scores.append(float(fields[1]))
                elif key.startswith('unknown_score'):
                    self.unknown_score = float(value)
                continue

            keyword = line.lower()
            if keyword.startswith('dict_form'):
                self.vector_form = False
            elif keyword.startswith('end'):
                if not self.vector_form:
                    self.required_fields = ['Formula']
                    for position, encoded in enumerate(self.formulas):
                        self.formulas[position] = decode_formula_from_array(
                            encoded)
                    self.process_molecular_candidate_record = self.__process_molecular_candidate_record_formula
                return
示例#3
0
    def _pipe_from_textfile(self, finp):
        """Load filter settings from an open text stream.

        Lines are consumed until end of file or until a line starting with
        ``end`` (case-insensitive). Everything after ``##`` on a line is
        ignored. Recognised lines:

        * ``formula...=<formula>`` appends the encoded formula to
          ``self.filter``;
        * ``dict_form...`` sets ``self.vector_form`` to False;
        * ``end...`` stops reading; when ``vector_form`` is off the stored
          formulas are decoded and the formula-based ``rejected``
          implementation is installed.
        """
        # iter() with a sentinel stops as soon as readline() returns ''.
        for raw_line in iter(finp.readline, ''):
            line = raw_line.rstrip('\n').lstrip().partition('##')[0]

            if '=' in line:
                key, value = line.split('=', 1)
                if key.lower().startswith('formula'):
                    self.filter.append(
                        encode_formula_to_array(parse_formula(value)))
                continue

            keyword = line.lower()
            if keyword.startswith('dict_form'):
                self.vector_form = False
            elif keyword.startswith('end'):
                if not self.vector_form:
                    self.required_fields = ['Formula']
                    for position, encoded in enumerate(self.filter):
                        self.filter[position] = decode_formula_from_array(
                            encoded)
                    self.rejected = self.__rejected_formula
                return
示例#4
0
    def __init__(self, formulas=None, use_vector_form=True):
        """Collect the formulas this filter works with.

        ``formulas`` may be None (empty filter), a comma-separated formula
        string, a formula dict, an already encoded ``np.ndarray``, or a
        list mixing dicts, arrays and comma-separated strings. Formulas are
        stored encoded; when ``use_vector_form`` is false they are decoded
        again at the end and ``required_fields`` asks for ``'Formula'``
        instead of ``'FormulaVector'``.

        Raises:
            TypeError: for any other argument or list element type.
        """
        self.filter = []
        self.vector_form = use_vector_form
        self.required_fields = (['FormulaVector']
                                if self.vector_form else ['Formula'])

        if formulas is None:
            return

        type_error_text = 'Wrong type argument for FormulasFilter initialization! str, dict, list of (dict or str) supported only!'

        def add_from_text(text):
            # A string may carry several comma-separated formulas.
            for chunk in text.split(','):
                self.filter.append(
                    encode_formula_to_array(parse_formula(chunk)))

        if isinstance(formulas, str):
            add_from_text(formulas)
        elif isinstance(formulas, list):
            for entry in formulas:
                if isinstance(entry, dict):
                    self.filter.append(encode_formula_to_array(entry))
                elif isinstance(entry, np.ndarray):
                    self.filter.append(entry)
                elif isinstance(entry, str):
                    add_from_text(entry)
                else:
                    raise TypeError(type_error_text)
        elif isinstance(formulas, dict):
            self.filter.append(encode_formula_to_array(formulas))
        elif isinstance(formulas, np.ndarray):
            self.filter.append(formulas)
        else:
            raise TypeError(type_error_text)

        if not self.vector_form:
            for position, encoded in enumerate(self.filter):
                self.filter[position] = decode_formula_from_array(encoded)
示例#5
0
def process_file(fname, correct_key, best_results, worst_results,
                 correct_formula):
    """Update rank counters from one CSV result file.

    The file is read with ``csv.reader`` and its first row is dropped.
    Walking the remaining rows from the end towards index 1, a row is
    removed when column 9 equals column 9 of the row before it, or when the
    formula in column 6 is rejected by an ``ElementCompositionFilter`` or a
    ``FormulasFilter`` built from ``correct_formula``.

    Among the surviving rows the first and last positions whose column 9
    equals ``correct_key`` are located. If there is at least one, every
    ``best_results[i]`` with ``i >= first`` and every ``worst_results[i]``
    with ``i >= last`` is incremented in place.

    Returns:
        None; ``best_results`` and ``worst_results`` are mutated.
    """
    element_filter = ElementCompositionFilter(correct_formula,
                                              correct_formula)
    formula_filter = FormulasFilter(correct_formula)

    # NOTE(review): binary mode suits the Python 2 csv module; Python 3's
    # csv.reader expects a text-mode file - confirm the target version.
    with open(fname, 'rb') as finp:
        rows = list(csv.reader(finp))
    del rows[0]

    # Walk backwards so deletions never shift the rows still to be visited.
    # NOTE(review): index 0 is never tested against the two filters.
    for idx in range(len(rows) - 1, 0, -1):
        drop_row = (
            rows[idx][9] == rows[idx - 1][9]
            or element_filter.rejected({
                'ElementVector':
                formula_to_element_vector(parse_formula(rows[idx][6]))
            })
            or formula_filter.rejected({
                'FormulaVector':
                encode_formula_to_array(parse_formula(rows[idx][6]))
            })
        )
        if drop_row:
            del rows[idx]

    hit_positions = [
        pos for pos, row in enumerate(rows) if correct_key == row[9]
    ]
    if not hit_positions:
        return

    first_hit = hit_positions[0]
    last_hit = hit_positions[-1]
    for rank in range(len(best_results)):
        if first_hit <= rank:
            best_results[rank] += 1
        if last_hit <= rank:
            worst_results[rank] += 1
示例#6
0
    def _get_next_raw_record(self):
        """Read the next line from the ``.st2`` data files and build a record.

        The data file path is derived from the decimal digits of
        ``self.mzcurrent``. When no file is open yet, or the current file is
        exhausted, ``self.mzcurrent`` is advanced until an existing file is
        found. The tab-separated line is then converted into a
        ``MolecularRecord`` holding ``MZ``, ``Mass``, ``Charge`` and those
        optional fields that are listed in ``self.required_fields``.

        Raises:
            StopIteration: when ``self.mzcurrent`` exceeds
                ``self.mzmax_int`` while looking for the next file.
        """

        def path_for_current_mz():
            # Directory levels are the thousands / hundreds / tens digits of
            # mzcurrent; the file name is its units digit.
            mz = self.mzcurrent
            return os.path.join(self.database_path, self.db_name, self.subf,
                                str(mz // 1000),
                                str(mz % 1000 // 100),
                                str(mz % 100 // 10),
                                '%s.st2' % str(mz % 10))

        def open_next_existing_file():
            # Skip over m/z values without a file, then open the first hit.
            self.currentfile = path_for_current_mz()
            while (not os.path.isfile(self.currentfile)) and (
                    self.mzcurrent <= self.mzmax_int):
                self.mzcurrent += 1
                self.currentfile = path_for_current_mz()
            if self.mzcurrent > self.mzmax_int:
                raise StopIteration()
            self.datafile = open(self.currentfile, 'r')
            self.record_index = -1

        if self.currentfile == '':
            open_next_existing_file()

        line = self.datafile.readline()
        self.record_index += 1
        while line == '':
            # Current file is exhausted: move on to the next m/z value.
            self.datafile.close()
            self.mzcurrent += 1
            open_next_existing_file()
            line = self.datafile.readline()
            self.record_index += 1

        columns = line.rstrip('\n').split('\t')
        record = MolecularRecord()
        record['MZ'] = float(columns[0])
        if self.charged:
            record['Mass'] = float(columns[1])
            record['Charge'] = float(columns[2])
        else:
            record['Mass'] = record['MZ']
            record['Charge'] = 0

        wanted = self.required_fields

        if 'ShortInChI' in wanted:
            record['ShortInChI'] = parse_inchi(columns[2 + self.offs])[0]

        # Fields copied verbatim from a column (index before the offset).
        for field_name, base_column in (('InChI', 2), ('SMILES', 3),
                                        ('IDs', 4)):
            if field_name in wanted:
                record[field_name] = columns[base_column + self.offs]

        if 'FPT' in wanted:
            record['FPT'] = decode_from_base64(columns[5 + self.offs])
            # Mask FPT here !

        if 'Frag' in wanted:
            record['Frag'] = columns[6 + self.offs]
            if self.charged:
                record['FragCharge'] = columns[9]

        if 'InChIKeyValues' in wanted:
            record['InChIKeyValues'] = inchikeyvalues_from_inchi(
                columns[2 + self.offs])

        if 'InChIKey' in wanted:
            record['InChIKey'] = inchikey_from_inchi(columns[2 + self.offs])

        formula_fields = ('Formula', 'ElementVector', 'FormulaVector')
        if any(name in wanted for name in formula_fields):
            # The formula is the part of the column before the first '/'.
            fla = parse_formula(columns[1 + self.offs].split('/')[0])

            if 'Formula' in wanted:
                record['Formula'] = fla

            if 'ElementVector' in wanted:
                record['ElementVector'] = formula_to_element_vector(fla)

            if 'FormulaVector' in wanted:
                record['FormulaVector'] = encode_formula_to_array(fla)

        return record
示例#7
0
        elif isinstance(obj, np.float32):
            return float(obj)

        elif isinstance(obj, np.uint16):
            return int(obj)

        elif isinstance(obj, np.uint8):
            return int(obj)

        return json.JSONEncoder.default(self, obj)


def spectra_to_json(fname, spectral_list):
    """Write *spectral_list* to the JSON file *fname*.

    The parent directory of *fname* is created first when it does not
    exist. The output uses sorted keys and a 4-space indent, and values are
    serialised through ``MS_Data_Encoder``.

    Args:
        fname: Path of the JSON file to write.
        spectral_list: Object to serialise.
    """
    dirname = os.path.dirname(fname)
    # A bare file name has an empty directory part and os.makedirs('')
    # raises, so only create the directory when there is one to create.
    if dirname and not os.path.exists(dirname):
        os.makedirs(dirname)
    with open(fname, 'w') as fout:
        json.dump(spectral_list,
                  fout,
                  cls=MS_Data_Encoder,
                  sort_keys=True,
                  indent=4,
                  separators=(',', ': '))


if __name__ == '__main__':
    # Manual smoke test: dump a parsed formula and a number to a JSON file.
    demo_target = os.path.join(chemdistiller_path, 'testlogs/t1.jsn')
    demo_payload = [parse_formula('C2H5OH'), 25]
    spectra_to_json(demo_target, demo_payload)
示例#8
0
        s1=''.join(element_vector_to_list(self.compulsory_element_filter));
        s2=''.join(element_vector_to_list(self.element_filter));
        return 'ElementCompositionFilter(compulsory="%s", allowed="%s")'%(s1,s2);


# Make the filter class available in registered_filters under its class name.
registered_filters[ElementCompositionFilter.__name__] = ElementCompositionFilter
#full_periodic_table_filter=np.full((12,),255,dtype=np.uint8);

# Predefined filters with no compulsory elements and a fixed allowed set.
CHNOPS_filter = ElementCompositionFilter('', 'CHNOPS')

CHNOPS_halogens_filter = ElementCompositionFilter('', 'CHNOPSFClBrIAt')

if __name__ == '__main__':
    # Manual check: print the verdict of two filters (both with Cl
    # compulsory) for the same four formulas, then print a filter itself.
    probe_formulas = ('C2H5', 'C2H5Cl', 'C2H5S', 'C2H5SCl')

    for allowed_elements in ('CHON', 'CHONCl'):
        test = ElementCompositionFilter('Cl', allowed_elements)
        for probe in probe_formulas:
            probe_record = {
                'ElementVector':
                formula_to_element_vector(parse_formula(probe))
            }
            print(test.rejected(probe_record))

    test = ElementCompositionFilter('', 'CHONCl')
    print(test)
    
示例#9
0
    def setup_scorer(self, formulas, scores, unknown_score=0.0):
        """Store the formulas to score, their scores and the fallback score.

        Args:
            formulas: A comma-separated formula string, a formula dict, an
                encoded ``np.ndarray``, or a list mixing those three.
            scores: A comma-separated string, a float, an int, or a list of
                floats, ints, dicts and comma-separated strings. Numbers
                are stored as floats; dict entries are stored unchanged.
            unknown_score: Value kept in ``self.unknown_score``.

        Raises:
            TypeError: for unsupported argument types, or when the number
                of scores differs from the number of formulas.
        """
        self.formulas = []
        self.scores = []
        self.unknown_score = unknown_score

        formulas_error_text = 'Wrong type argument (formulas) for FormulaVectors initialization! str, formula, list of (formula, formulavector or str) supported only!'

        def add_formulas_from_text(text):
            # A string may carry several comma-separated formulas.
            for chunk in text.split(','):
                self.formulas.append(
                    encode_formula_to_array(parse_formula(chunk)))

        def add_scores_from_text(text):
            # A string may carry several comma-separated scores.
            for chunk in text.split(','):
                self.scores.append(float(chunk))

        if isinstance(formulas, str):
            add_formulas_from_text(formulas)
        elif isinstance(formulas, list):
            for entry in formulas:
                if isinstance(entry, dict):
                    self.formulas.append(encode_formula_to_array(entry))
                elif isinstance(entry, np.ndarray):
                    self.formulas.append(entry)
                elif isinstance(entry, str):
                    add_formulas_from_text(entry)
                else:
                    raise TypeError(formulas_error_text)
        elif isinstance(formulas, dict):
            self.formulas.append(encode_formula_to_array(formulas))
        elif isinstance(formulas, np.ndarray):
            self.formulas.append(formulas)
        else:
            raise TypeError(formulas_error_text)

        if isinstance(scores, str):
            add_scores_from_text(scores)
        elif isinstance(scores, float):
            self.scores.append(scores)
        elif isinstance(scores, int):
            self.scores.append(float(scores))
        elif isinstance(scores, list):
            for entry in scores:
                if isinstance(entry, float):
                    self.scores.append(entry)
                elif isinstance(entry, dict):
                    self.scores.append(entry)
                elif isinstance(entry, int):
                    self.scores.append(float(entry))
                elif isinstance(entry, str):
                    add_scores_from_text(entry)
                else:
                    raise TypeError(
                        'Wrong type argument (scores) for FormulaScorer initialization! str, float, int, list of (float, int or str) or dictionary supported only!'
                    )
        else:
            raise TypeError(
                'Wrong type argument (scores) for FormulaScorer initialization! str, float, int, list of (float, int or str) supported only!'
            )

        if len(self.scores) != len(self.formulas):
            raise TypeError(
                'Number of formulas and number of scores supplied do not match!'
            )

        if not self.vector_form:
            for position, encoded in enumerate(self.formulas):
                self.formulas[position] = decode_formula_from_array(encoded)
示例#10
0
                        for key in self.scores[index].keys():
                            record['Scores'][key] = self.scores[index][key]
                    break
        if not found:
            record['Scores']['Formula'] = self.unknown_score


# Register FormulaScorer in the registered_scorers mapping under its class name.
registered_scorers[FormulaScorer.__name__] = FormulaScorer

if __name__ == '__main__':

    test = FormulaScorer(use_vector_form=False)
    test.setup_scorer('C2H5OH,CH4,PO4', '0.3,0.1,0.5', 1.0)

    record = {
        'Formula': parse_formula('CH4'),
        'Scores': {}
    }
    test.process_molecular_candidate_record(None, record)
    print(record)

    record = {
        'Formula': parse_formula('PO4'),
        'Scores': {}
    }
    test.process_molecular_candidate_record(None, record)
    print(record)

    record = {
        'Formula': parse_formula('C2H5OH'),
        'Scores': {}
示例#11
0
                        accepted = False
                        break
                if accepted:
                    result = False
                    break
        return result


# Register FormulasFilter in the registered_filters mapping under its class name.
registered_filters[FormulasFilter.__name__] = FormulasFilter

if __name__ == '__main__':
    # Manual check of the constructor input forms and of rejected().

    def _build_and_show(spec):
        # Build a dict-form filter from *spec* and print its stored formulas.
        built = FormulasFilter(spec, use_vector_form=False)
        print(built.filter)
        return built

    _build_and_show('C2H5OH')
    _build_and_show('C2H5OH,CH4,PO4')
    _build_and_show(parse_formula('C2H5OH'))
    test = _build_and_show(['C2H5OH,PO4', parse_formula('CH4')])

    # Expected output noted for the first five probes:
    # False, False, False, True, True
    for probe in ('CH4', 'PO4', 'H4C', 'CH4N', 'CH3', 'C2H4'):
        print(test.rejected({'Formula': parse_formula(probe)}))
示例#12
0
        record['Scores']['Elements'] = score


# Register ElementScorer in the registered_scorers mapping under its class name.
registered_scorers[ElementScorer.__name__] = ElementScorer

if __name__ == '__main__':

    test = ElementScorer()
    test.setup_scorer({
        'C': 0.8,
        'O': 0.9,
        'Si': 0.1
    })

    record = {
        'ElementVector': formula_to_element_vector(parse_formula('CH4')),
        'Scores': {}
    }
    test.process_molecular_candidate_record(None, record)
    print(record)

    record = {
        'ElementVector': formula_to_element_vector(parse_formula('CH4Si')),
        'Scores': {}
    }
    test.process_molecular_candidate_record(None, record)
    print(record)

    record = {
        'ElementVector': formula_to_element_vector(parse_formula('CH4CO')),
        'Scores': {}
示例#13
0
    def hdf5_import_from_st2raw(self,
                                inpath,
                                fptmask=np.ones((11416, ), np.uint8)):
        if not os.path.isfile(os.path.join(inpath, 'dbinfo.dat')):
            raise IOError('Database info file not found: %s' %
                          os.path.join(inpath, 'dbinfo.dat'))

        self.HDF5container.attrs['HDF5ContainerType'] = np.string_(
            'DistilledChemicalDatabase')
        self.HDF5container.attrs['HDF5ContainerVersion'] = np.string_('1.0')
        finp = open(os.path.join(inpath, 'dbinfo.dat'), 'r')
        #fout=open(os.path.join(self.folderpath,'dbinfo.dat'),'w');
        for s in finp:
            s = s.rstrip('\n').lstrip().split('=', 1)
            if s[0].upper() == 'DBFORMAT':
                s[1] = '3'
            if s[0] != '':
                #fout.write('%s=%s\n'%(s[0],s[1]));
                self.HDF5container.attrs[s[0]] = np.string_(s[1])

        #fout.close();
        finp.close()

        fptlist = []

        for i in range(11416):
            if fptmask[i] == 1:
                fptlist.append(i)

        fptlen = len(fptlist)
        fptsubmask = np.packbits(np.ones((fptlen, ), np.uint8))
        fptmasklen = len(fptsubmask)
        fptindexes = np.array(fptlist, dtype=np.uint32)
        packedmask = np.packbits(fptmask)
        packedmasklen = len(packedmask)

        #hdf5_ascii_string = h5py.special_dtype(vlen=bytes);

        fptgroup = self.HDF5container.create_group('FingerPrints')
        #Original mask, packed
        fptoriginalmask = fptgroup.create_dataset("FPTOriginalMask",
                                                  (packedmasklen, ),
                                                  maxshape=(packedmasklen, ),
                                                  dtype=np.uint8)
        fptoriginalmask[:] = packedmask[:]

        #List of indeces of original FPT bits (11416)
        fptmask = fptgroup.create_dataset("FPTMask", (fptlen, ),
                                          maxshape=(fptlen, ),
                                          compression="gzip",
                                          compression_opts=4,
                                          dtype=np.uint32)
        fptmask[:] = fptindexes[:]

        #Mask for working bits (packed)
        fptsubmask = fptgroup.create_dataset("FPTsubmask", (1, fptmasklen),
                                             chunks=(100, fptmasklen),
                                             maxshape=(None, fptmasklen),
                                             compression="gzip",
                                             compression_opts=4,
                                             dtype=np.uint8)

        #FPT info: 0 - original bit count=11416, 1 - length of new fpt after masking, 2 - length of the packed fpt, 3 - No of padding bits
        fptinfo = fptgroup.create_dataset("FPTInfo", (4, ),
                                          maxshape=(4, ),
                                          dtype=np.uint32)
        fptinfo[0] = 11416
        fptinfo[1] = fptlen
        fptinfo[2] = fptmasklen
        fptinfo[3] = fptmasklen * 8 - fptlen

        print('Listing input files')

        subpaths = ['/Negative', '/Positive', '/Neutral']
        #subpaths=['/Positive'];
        for subpath in subpaths:
            print(subpath)

            fptgroup = self.HDF5container.create_group(subpath +
                                                       '/FingerPrints')
            fraggroup = self.HDF5container.create_group(subpath +
                                                        '/FragPrints')
            chemgroup = self.HDF5container.create_group(subpath + '/ChemInfo')
            chargegroup = self.HDF5container[subpath]

            #New FPT Array, packed and trimmed to fptmask
            fptdataset = fptgroup.create_dataset("FPTArray", (1, fptmasklen),
                                                 chunks=(100, fptmasklen),
                                                 maxshape=(None, fptmasklen),
                                                 compression="gzip",
                                                 compression_opts=4,
                                                 dtype=np.uint8)

            if subpath != '/Neutral':
                masschargedataset = chargegroup.create_dataset(
                    "MZMassCharge", (1, 3),
                    chunks=(10000, 3),
                    maxshape=(None, 3),
                    compression="gzip",
                    compression_opts=4,
                    dtype=np.float32)
            else:
                mzdataset = chargegroup.create_dataset("MZ", (1, ),
                                                       chunks=(10000, ),
                                                       maxshape=(None, ),
                                                       compression="gzip",
                                                       compression_opts=4,
                                                       dtype=np.float32)

            inchikey_dataset = chemgroup.create_dataset("InChiKeyValues",
                                                        (1, 15),
                                                        chunks=(10000, 15),
                                                        maxshape=(None, 15),
                                                        compression="gzip",
                                                        compression_opts=4,
                                                        dtype=np.uint8)

            elementsvector_dataset = chemgroup.create_dataset(
                "ElementsVector", (1, 12),
                chunks=(10000, 12),
                maxshape=(None, 12),
                compression="gzip",
                compression_opts=4,
                dtype=np.uint8)

            formulavector_dataset = chemgroup.create_dataset(
                "FormulaVector", (1, 96),
                chunks=(10000, 96),
                maxshape=(None, 96),
                compression="gzip",
                compression_opts=4,
                dtype=np.uint16)

            fragprintindex_dataset = fraggroup.create_dataset(
                "FragPrintIndex", (1, 2),
                chunks=(10000, 2),
                maxshape=(None, 2),
                compression="gzip",
                compression_opts=4,
                dtype=np.int64)
            fragprintvalues_dataset = fraggroup.create_dataset(
                "FragPrintValues", (1, ),
                chunks=(10000, ),
                maxshape=(None, ),
                compression="gzip",
                compression_opts=4,
                dtype=np.float32)

            smiles_dataset = chemgroup.create_dataset("SMILES", (1, 2),
                                                      chunks=(10000, 2),
                                                      maxshape=(None, 2),
                                                      compression="gzip",
                                                      compression_opts=4,
                                                      dtype=np.int64)

            inchi_dataset = chemgroup.create_dataset("InChi", (1, 4, 2),
                                                     chunks=(10000, 4, 2),
                                                     maxshape=(None, 4, 2),
                                                     compression="gzip",
                                                     compression_opts=4,
                                                     dtype=np.int64)

            ids_dataset = chemgroup.create_dataset("IDs", (1, 2),
                                                   chunks=(10000, 2),
                                                   maxshape=(None, 2),
                                                   compression="gzip",
                                                   compression_opts=4,
                                                   dtype=np.int64)

            ascii_dataset = chemgroup.create_dataset("ASCII", (1, ),
                                                     chunks=(10000, ),
                                                     maxshape=(None, ),
                                                     compression="gzip",
                                                     compression_opts=4,
                                                     dtype=np.uint8)

            recordindex = -1

            fileslist = []

            for i in range(0, 2000):
                if os.path.exists(inpath + subpath + '/%s' % i):
                    print(inpath + subpath + '/%s' % i)
                    for j in range(0, 10):
                        if os.path.exists(inpath + subpath + '/%s/%s' %
                                          (i, j)):
                            for k in range(0, 10):
                                if os.path.exists(inpath + subpath +
                                                  '/%s/%s/%s' % (i, j, k)):
                                    for l in range(0, 10):
                                        if os.path.isfile(inpath + subpath +
                                                          '/%s/%s/%s/%s.st2' %
                                                          (i, j, k, l)):
                                            fileslist.append(
                                                inpath + subpath +
                                                '/%s/%s/%s/%s.st2' %
                                                (i, j, k, l))

            print('Total number of input files: %s' % len(fileslist))

            for filename in fileslist:
                fpath, fname = os.path.split(filename)
                subpath = fpath.replace(inpath, '')
                if 'Neutral' in subpath:
                    charged = False
                    offs = 0
                else:
                    charged = True
                    offs = 2
                print('Importing: .../%s/%s' % (subpath, fname))
                dblist = []
                with open(filename, 'r') as finp:
                    for s in finp:
                        try:
                            s = s.replace('\n', '').replace('\r',
                                                            '').split('\t')
                            mz = float(s[0])
                            if charged:
                                mass = float(s[1])
                                charge = float(s[2])
                            else:
                                mass = mz
                                charge = 0.0
                            if mass >= 12.0:
                                #shortinchi=s[1+offs];
                                inchi = s[2 + offs]
                                smiles = s[3 + offs]
                                ids = s[4 + offs]
                                fpt = s[5 + offs]
                                fpt = decode_from_base64(fpt)
                                fpt = np.unpackbits(fpt)
                                frag = s[6 + offs]
                                if charged:
                                    fragcharge = s[9]
                                else:
                                    fragcharge = ''
                                recordindex += 1
                                if recordindex % 1000 == 0:
                                    print('Total: %s' % (recordindex + 1))

                                dblist.append([
                                    recordindex, mz, charged, mass, charge,
                                    inchi, fpt, frag, fragcharge, smiles, ids
                                ])
                        except:
                            print('Error! Skipping!')

                if len(dblist) > 0:
                    #expand datasets here
                    fptdataset.resize((recordindex + 1, fptmasklen))

                    if charged:
                        masschargedataset.resize((recordindex + 1, 3))
                    else:
                        mzdataset.resize((recordindex + 1, ))

                    inchikey_dataset.resize((recordindex + 1, 15))

                    elementsvector_dataset.resize((recordindex + 1, 12))

                    formulavector_dataset.resize((recordindex + 1, 96))

                    fragprintindex_dataset.resize((recordindex + 1, 2))

                    smiles_dataset.resize((recordindex + 1, 2))

                    ids_dataset.resize((recordindex + 1, 2))

                    inchi_dataset.resize((recordindex + 1, 4, 2))

                    for db in dblist:
                        currentindex = db[0]
                        fptdataset[currentindex, :] = np.packbits(
                            db[6][fptindexes])[:]
                        #print(inchi)
                        inchi = parse_inchi(db[5])
                        #print(inchi)
                        inchikeyvalues = inchikeyvalues_from_inchi(db[5])

                        sformula = inchi[0].split('/', 1)[0]

                        #print(sformula);
                        formula = parse_formula(sformula)

                        elementsvector = formula_to_element_vector(formula)
                        encodedformula = encode_formula_to_array(formula)

                        charge = db[4]
                        charged = db[2]

                        if charged:
                            #print(db[7],db[8])
                            frags = parse_string_fragment_charges(
                                charge, db[7], db[8])
                            #print(frags)
                        else:
                            frags = parse_string_fragments(db[7])

                        if charged:
                            masschargedataset[currentindex, 0] = db[1]
                            masschargedataset[currentindex, 1] = db[3]
                            masschargedataset[currentindex, 2] = charge
                        else:
                            mzdataset[currentindex] = db[1]

                        inchikey_dataset[currentindex, :] = inchikeyvalues[:]

                        elementsvector_dataset[
                            currentindex, :] = elementsvector[:]

                        formulavector_dataset[
                            currentindex, :] = encodedformula[:]

                        fragcount = len(frags)
                        frags = np.array(frags, dtype=np.float32)
                        fragprintindex_dataset[currentindex,
                                               0] = self.fragprintpos
                        fragprintindex_dataset[
                            currentindex, 1] = self.fragprintpos + fragcount

                        fragprintvalues_dataset.resize(
                            (self.fragprintpos + fragcount, ))

                        fragprintvalues_dataset[self.fragprintpos:self.
                                                fragprintpos +
                                                fragcount] = frags[:]

                        self.fragprintpos += fragcount

                        smiles = bytearray(db[9].encode('ascii'))

                        smileslen = len(smiles)

                        smiles = np.array(smiles, dtype=np.uint8)

                        ids = bytearray(db[10].encode('ascii'))

                        idslen = len(ids)

                        ids = np.array(ids, dtype=np.uint8)

                        sinchi = inchi[0].split('/', 1)
                        if len(sinchi) > 1:
                            sinchi = sinchi[1]
                        else:
                            sinchi = ''

                        inchi0 = bytearray(sformula.encode('ascii'))
                        inchi1 = bytearray(sinchi.encode('ascii'))
                        inchi2 = bytearray(inchi[2].encode('ascii'))
                        inchi3 = bytearray(inchi[1].encode('ascii'))

                        inchi0len = len(inchi0)
                        inchi1len = len(inchi1)
                        inchi2len = len(inchi2)
                        inchi3len = len(inchi3)

                        inchi0 = np.array(inchi0, dtype=np.uint8)
                        inchi1 = np.array(inchi1, dtype=np.uint8)
                        inchi2 = np.array(inchi2, dtype=np.uint8)
                        inchi3 = np.array(inchi3, dtype=np.uint8)

                        ascii_dataset.resize(
                            (self.asciipos + smileslen + idslen + inchi0len +
                             inchi1len + inchi2len + inchi3len, ))

                        smiles_dataset[currentindex, 0] = self.asciipos
                        smiles_dataset[currentindex,
                                       1] = self.asciipos + smileslen
                        ascii_dataset[self.asciipos:self.asciipos +
                                      smileslen] = smiles[:]
                        self.asciipos += smileslen

                        ids_dataset[currentindex, 0] = self.asciipos
                        ids_dataset[currentindex, 1] = self.asciipos + idslen
                        ascii_dataset[self.asciipos:self.asciipos +
                                      idslen] = ids[:]
                        self.asciipos += idslen

                        inchi_dataset[currentindex, 0, 0] = self.asciipos
                        inchi_dataset[currentindex, 0,
                                      1] = self.asciipos + inchi0len
                        ascii_dataset[self.asciipos:self.asciipos +
                                      inchi0len] = inchi0[:]
                        self.asciipos += inchi0len

                        inchi_dataset[currentindex, 1, 0] = self.asciipos
                        inchi_dataset[currentindex, 1,
                                      1] = self.asciipos + inchi1len
                        ascii_dataset[self.asciipos:self.asciipos +
                                      inchi1len] = inchi1[:]
                        self.asciipos += inchi1len

                        inchi_dataset[currentindex, 2, 0] = self.asciipos
                        inchi_dataset[currentindex, 2,
                                      1] = self.asciipos + inchi2len
                        ascii_dataset[self.asciipos:self.asciipos +
                                      inchi2len] = inchi2[:]
                        self.asciipos += inchi2len

                        inchi_dataset[currentindex, 3, 0] = self.asciipos
                        inchi_dataset[currentindex, 3,
                                      1] = self.asciipos + inchi3len
                        ascii_dataset[self.asciipos:self.asciipos +
                                      inchi3len] = inchi3[:]
                        self.asciipos += inchi3len

        print('Import Finished!')
示例#14
0
    def _pipe_from_textfile(self, finp):
        """Populate this record from ``key=value`` lines read from *finp*.

        Lines are pulled one at a time with ``finp.readline()`` until the
        stream is exhausted or a line that contains no ``=`` and starts with
        ``end`` (any letter case) is reached. For every line the trailing
        newline and the leading whitespace are removed, and everything from
        the first ``##`` onwards is discarded before the line is interpreted.

        The text before the first ``=`` is lower-cased and matched by prefix
        against the known field names; the text after it is converted and
        stored under the corresponding key of ``self``. ``scores`` lines hold
        a ``name:value`` pair that is added to the ``self['Scores']``
        dictionary (created on first use). Lines that match nothing are
        ignored. Conversion errors are not caught and propagate to the caller.

        Args:
            finp: Object whose ``readline()`` returns text lines and ``''``
                once the input is exhausted.
        """

        def verbatim(text):
            return text

        # (lower-case prefix, key written on self, converter). The first
        # matching prefix wins, so a longer name is listed before any shorter
        # name it begins with ('inchikeyvalues' > 'inchikey' > 'inchi').
        # File-level helpers are wrapped in lambdas so that each one is only
        # looked up when a matching line is actually parsed.
        field_table = (
            ('totalscore', 'TotalScore', float),
            ('adduct', 'Adduct', verbatim),
            ('isotopeextramass', 'IsotopeExtraMass', float),
            ('isotope', 'Isotope', int),
            ('mz', 'MZ', float),
            ('mass', 'Mass', float),
            ('charge', 'Charge', int),
            ('dbformat', 'DBFormat', int),
            ('dbindex', 'DBIndex', int),
            ('rindex', 'RIndex', int),
            ('dbname', 'DBName', verbatim),
            ('rfile', 'RFile', verbatim),
            ('smiles', 'SMILES', verbatim),
            ('ids', 'IDs', verbatim),
            ('annotation', 'Annotation', verbatim),
            ('shortinchi', 'ShortInChI', verbatim),
            ('inchikeyvalues', 'InChIKeyValues',
             lambda text: string_to_numpy_byte_array(text)),
            ('inchikey', 'InChiKey', verbatim),
            ('inchi', 'InChI', verbatim),
            ('formulavector', 'FormulaVector',
             lambda text: string_to_numpy_uint16_array(text)),
            ('elementvector', 'ElementVector',
             lambda text: string_to_numpy_byte_array(text)),
            ('frag', 'Frag', lambda text: string_to_float_list(text)),
            ('formula', 'Formula', lambda text: parse_formula(text)),
            ('fpt', 'FPT', lambda text: decode_from_base64(text)),
        )

        # readline() yields '' only at end of input; blank lines are '\n'.
        for raw in iter(finp.readline, ''):
            line = raw.rstrip('\n').lstrip()
            marker = line.find('##')
            if marker != -1:
                line = line[:marker]

            if '=' not in line:
                # Only a line without '=' can terminate the record.
                if line.lower().startswith('end'):
                    return
                continue

            pieces = line.split('=', 1)
            name = pieces[0].lower()
            value = pieces[1]

            for prefix, key, convert in field_table:
                if name.startswith(prefix):
                    self[key] = convert(value)
                    break
            else:
                if name.startswith('scores'):
                    if 'Scores' not in self:
                        self['Scores'] = {}
                    pair = value.split(':', 1)
                    self['Scores'][pair[0]] = float(pair[1])