Example #1
def load():
    p = utils.getParams()

    p['inputDir'] = 'input/dream4'

    p['metaDataFile'] = 'meta_data.tsv'
    p['priorsFile'] = 'gold_standard.tsv'
    p['goldStandardFile'] = 'gold_standard.tsv'

    # NOTE: Currently does not work using 1 bootstrap. Please use 2 or more
    p['numBoots'] = 2
    p['cores'] = 1

    p['delTMax'] = 110
    p['delTMin'] = 0
    p['tau'] = 45

    p['percTp'] = [50] * 4
    p['permTp'] = [1] * 4
    p['percFp'] = [0, 100, 250, 500]
    p['permFp'] = [1, 5, 5, 5]

    p['evalOnSubset'] = False

    p['method'] = 'BBSR'
    p['priorWeight'] = 1.26

    p['saveToDir'] = 'output/dream4_BBSR_1'

    p['verbose'] = True
    p['demo'] = True
    p['exportCLRMatrix'] = True
    p['exportBSDR'] = True

    utils.setParams(p)
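
Example #1 shows a read-modify-write pattern against a shared parameter dictionary (utils.getParams / utils.setParams); later examples instead pass a parameter file path or other arguments to getParams. As a rough sketch only, a dict-backed store consistent with Examples #1 and #13 could look like this (the module-level _params variable is an assumption, not the original utils code):

# Hypothetical sketch of a dict-backed parameter store; not the original utils module.
_params = {}

def getParams():
    # Return the shared parameter dictionary
    return _params

def setParams(p):
    # Replace the shared parameter dictionary
    global _params
    _params = p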
Example #2
def paramsFlopsCounter(models, num_classes=10, input_shape=(3, 32, 32)):
    logger = get_logger("./")
    for modelname in models:
        model = get_models(modelname, num_classes=num_classes)  # pass num_classes through instead of hard-coding 10
        model = model.eval()
        pa1 = getParams(model)
        fl1 = getFlops(model, input_shape)
        fl2, pa2 = get_model_complexity_info(model, input_shape, True)
        logger.info("{}  v1: {}--{}  v2: {}--{}".format(modelname, pa1, fl1, pa2, fl2))
Example #3
    def getSData(self, index):
        # Data handling inside a row
        # If clicked: copy CP
        # If Ctrl + Click: copy Params
        data = self.sModel.data(self.sModel.index(index.row(), 2),
                                role=self.DR).strip()
        _modifiers = QApplication.keyboardModifiers()
        if data:  # already stripped above
            if _modifiers == QtCore.Qt.ControlModifier:
                params = utils.getParams(data)
                if params:
                    logging.info(' Copied Params: ' + params)
                # self.cmdLabel.setText('Copied Params: ' + params)
                self.cb.setText(params, mode=self.cb.Clipboard)
            else:
                cb_cp = utils.getCP(data)
                if cb_cp:
                    logging.info(' Copied CP: ' + cb_cp)
                # self.cmdLabel.setText('Copied CP: ' + cb_cp)
                self.cb.setText(cb_cp, mode=self.cb.Clipboard)
Example #4
    def rec(target):
        processed.add(target)
        url = getUrl(target, True)

        params = getParams(target, '', True)  # get the parameters
        if '=' in target:  # if there's a = in the url, there should be GET parameters
            inps = []
            for name, value in params.items():
                inps.append({'name': name, 'value': value})
            forms.append({0: {'action': url, 'method': 'get', 'inputs': inps}})

        response = requester(url, params, headers, True, delay, timeout).text

        # retireJs(url, response)  # check whether the <script> content contains known vulnerabilities

        # if not skipDOM:
        #     highlighted = dom(response)
        #     clean_highlighted = ''.join([re.sub(r'^\d+\s+', '', line) for line in highlighted])
        #     if highlighted and clean_highlighted not in checkedDOMs:
        #         checkedDOMs.append(clean_highlighted)
        #         logger.good('Potentially vulnerable objects found at %s' % url)
        #         logger.red_line(level='good')
        #         for line in highlighted:
        #             logger.no_format(line, level='good')
        #         logger.red_line(level='good')
        forms.append(get_form(response))  # extract all form elements from the response
        matches = re.findall(r'<[aA].*href=["\']{0,1}(.*?)["\']', response)
        for link in matches:  # iterate over the matches
            # remove everything after a "#" to deal with in-page anchors
            link = link.split('#')[0]
            if link.endswith(('.pdf', '.png', '.jpg', '.jpeg', '.xls', '.xml',
                              '.docx', '.doc')):
                pass
            else:
                if link[:4] == 'http':
                    if link.startswith(main_url):
                        storage.add(link)
                elif link[:2] == '//':
                    if link.split('/')[2].startswith(host):
                        storage.add(schema + link)
                elif link[:1] == '/':
                    storage.add(main_url + link)
                else:
                    storage.add(main_url + '/' + link)
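
The crawler above handles absolute, protocol-relative, and relative links by hand. A minimal self-contained sketch of the same normalization rules using the standard library (normalize_link is an illustrative helper, not part of the original code):

# Sketch of the link-normalization rules from Example #4 using urllib.parse.
from urllib.parse import urljoin, urlparse

def normalize_link(link, main_url):
    link = link.split('#')[0]  # remove in-page anchors
    if link.endswith(('.pdf', '.png', '.jpg', '.jpeg', '.xls', '.xml', '.docx', '.doc')):
        return None  # skip non-HTML resources
    absolute = urljoin(main_url + '/', link)  # covers http://, //host, /path and bare relative links
    if urlparse(absolute).netloc != urlparse(main_url).netloc:
        return None  # out-of-scope host
    return absolute

# normalize_link('/login', 'http://example.com') -> 'http://example.com/login'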
Example #5
    def tssig(self):
        """
        Get the type info
        """
        if self._client.server_handle is not None:
            self.reload()
            file = self.vim.current.buffer.name
            line = self.vim.current.window.cursor[0]
            offset = self.vim.current.window.cursor[1] + 1
            info = self._client.getSignature(file, line, offset)
            if info:
                signatureHelpItems = list(map(lambda item: {
                    'variableArguments': item['isVariadic'],
                    'prefix': utils.convertToDisplayString(item['prefixDisplayParts']),
                    'suffix': utils.convertToDisplayString(item['suffixDisplayParts']),
                    'separator': utils.convertToDisplayString(item['separatorDisplayParts']),
                    'parameters': list(map(lambda p: {
                        'text': utils.convertToDisplayString(p['displayParts']),
                        'documentation': utils.convertToDisplayString(p['documentation']),
                    }, item['parameters']))
                }, info['items']))
                params = utils.getParams(signatureHelpItems[0]['parameters'],
                                         signatureHelpItems[0]['separator'])
                self.printHighlight(params)
        else:
            self.printError('Server is not running')
Example #6
    def btnGetData(self):
        # type = self.getRadioCodeType()
        freq = self.getRadioFreqType()
        year = self.getYearMonth()
        index_hs = self.getIndexHS()
        reporter = self.tab02_select_reporter.currentText()
        partner = self.tab03_select_partner.currentText()
        trade_flow = self.tab04_select_trade_flow.currentText()
        start_index = self.tab09_input_start_index.text()
        end_index = self.tab10_input_end_index.text()
        start_hs = self.tab11_input_start_hs.text()
        end_hs = self.tab12_input_end_hs.text()
        # token = self.tab01_input_token.text()
        check_input, message_input = checkInput(PeriodYear=year,
                                                StartIndex=start_index,
                                                EndIndex=end_index,
                                                StartHS=start_hs,
                                                EndHS=end_hs)
        check_select, message_select = checkSelect(Reporter=reporter,
                                                   Partner=partner,
                                                   TradeFlow=trade_flow)
        check_index, message_index = checkIndex(start_index, end_index)
        if (self.tab06_radio_freq_month.isChecked()
                and not self.check_all.isChecked()):
            check_month, message_month = self.checkMonth()
        else:
            check_month = True
            message_month = ''

        if check_input and check_select and check_index and check_month:
            repaintText(self.text_message, 'Start get data ...')
            params = getParams(year, freq, reporter, partner, trade_flow)
            data = getData(params, start_index, end_index, start_hs, index_hs,
                           self.text_message)
            message = dataToExcel(data)
            repaintText(self.text_message, message)
        else:
            repaintText(
                self.text_message,
                '{}{}{}{}'.format(message_input, message_select, message_index,
                                  message_month))
Example #7
    def tBoxChanged(self):
        #Handles the PACR type combox box signal change
        #If type is Remove, disallow any CP steps to be added or deleted
        if self.typeBox.currentText() == 'Remove':
            self.addStep.setEnabled(False)
            self.delStep.setEnabled(False)
        else:
            self.addStep.setEnabled(True)
            self.delStep.setEnabled(True)

            #Fills data if changing a step
            if self.typeBox.currentText() == 'Change':
                _row = int(self.stepEdit.text()) - 1
                if _row < self.sModel.rowCount() + 1:
                    record = self.sModel.record(_row)
                    self.rationale.setPlainText(record.field(3).value())
                    step = record.field(2).value()
                    cp = utils.getCP(step)
                    if cp:
                        cp = cp.replace('.prc', '') + '(' + utils.getParams(step) + ')'
                        self.pSteps.populate(cp)
Example #8
    def tssig(self):
        """
        Get type signature for symbol at cursor
        """
        self.reload()
        file = self.vim.current.buffer.name
        line = self.vim.current.window.cursor[0]
        offset = self.vim.current.window.cursor[1] + 1
        info = self._client.getSignature(file, line, offset)

        if info:
            signatureHelpItems = list(map(lambda item: {
                'variableArguments': item['isVariadic'],
                'prefix': utils.convertToDisplayString(item['prefixDisplayParts']),
                'suffix': utils.convertToDisplayString(item['suffixDisplayParts']),
                'separator': utils.convertToDisplayString(item['separatorDisplayParts']),
                'parameters': list(map(lambda p: {
                    'text': utils.convertToDisplayString(p['displayParts']),
                    'documentation': utils.convertToDisplayString(p['documentation']),
                }, item['parameters']))
            }, info['items']))
            params = utils.getParams(signatureHelpItems[0][
                                     'parameters'], signatureHelpItems[0]['separator'])
            self.printHighlight(params)
Example #9
def searchDatabase(features, paramFile, queue="standard"):
    # features: fully-aligned features (pandas DataFrame)
    # paramFile: parameter file

    #################################
    # Preparation of job submission #
    #################################
    m = features.shape[0]
    n = 10  # Default number of entries in each job
    if int(m / n) > 200:  # When there are too many features, limit the number of jobs to 200
        n = int(m / 200) + 1
    nJobs = math.ceil(m / n)

    # Create a temporary directory for jobs (to be removed later) and change the working directory for jobs
    cwd = os.getcwd()
    tmpDir = os.path.join(cwd, ".tmp")
    if os.path.exists(tmpDir):
        os.system("rm -rf " + tmpDir)
    os.mkdir(tmpDir)
    os.system("cp " + paramFile + " " +
              tmpDir)  # Copy the parameter file to "tmpDir"
    os.chdir(tmpDir)  # Change the working directory to a temporary one

    ##################
    # Job submission #
    ##################
    jobNumbers = []
    mem = 1000  # Default memory reserved = 1000MB
    for i in range(nJobs):
        # Split features into "nJobs" chunks and use each chunk in each job
        start = n * i
        end = min(m, n * (i + 1))
        featureFile = "features_" + str(i) + ".pickle"
        pickle.dump(features.iloc[start:end], open(featureFile, "wb"))

        # Submission of jobs to LSF
        jobNumber = submitJobs(i, featureFile, paramFile, mem, queue)
        jobNumbers.append(jobNumber)

        text = "\r  {} job(s) is/are submitted".format(i + 1)
        sys.stdout.write(text)
        sys.stdout.flush()

    # Check the status of submitted jobs
    print()
    logging.info("  {} job(s) is/are submitted".format(nJobs))
    checkJobStatus(jobNumbers)

    ########################################################
    # Check unfinished jobs (due to not enough memory) and #
    # re-submission with the memory increase               #
    ########################################################
    print()
    print("  Checking unfinished jobs")
    logging.info("")
    logging.info("  Checking unfinished jobs")
    isFinished = False
    while not isFinished:
        jobNumbers = []
        ii = 0
        for i in range(nJobs):
            csvFile = "features_" + str(i) + ".csv"
            # When a job is not finished properly, there's no corresponding .csv file
            if not os.path.exists(csvFile):
                # Extraction of the required memory by parsing the .o file
                f = open("job_" + str(i) + ".o")
                lines = f.read()
                mem = int(re.search(r"(?<=Max Memory :)\s+(\d+)",
                                    lines).group(1)) * 2  # Times 2 for safety
                f.close()

                # Re-submission of jobs
                featureFile = "features_" + str(i) + ".pickle"
                jobNumber = submitJobs(i, featureFile, paramFile, mem, queue)
                jobNumbers.append(jobNumber)

                ii += 1
                text = "\r  {} job(s) is/are submitted".format(ii)
                sys.stdout.write(text)
                sys.stdout.flush()
        logging.info("  {} job(s) is/are submitted".format(ii))
        # Check the status of submitted jobs
        if len(jobNumbers) > 0:
            print()
            checkJobStatus(jobNumbers)
        else:
            isFinished = True
    print()
    print("  All job(s) is/are finished")
    logging.info("")
    logging.info("  All job(s) is/are finished")

    ##########################
    # Postprocessing of jobs #
    ##########################
    res = pd.DataFrame()
    for i in range(nJobs):
        eachOutput = "features_" + str(i) + ".csv"
        try:
            df = pd.read_csv(eachOutput, sep="\t")
        except pd.errors.EmptyDataError:
            continue
        res = pd.concat([res, df], ignore_index=True)  # DataFrame.append was removed in pandas 2.0

    ################################
    # Generation of an output file #
    ################################
    os.chdir(cwd)  # Move back to the "current working directory"
    params = utils.getParams(paramFile)
    filePath = os.path.join(os.getcwd(), "align_" + params["output_name"])
    if not os.path.exists(filePath):
        os.mkdir(filePath)
    outputFile = os.path.join(
        filePath, "align_" + params["output_name"] + ".database_matches")
    res.to_csv(outputFile, sep="\t", index=False, na_rep="NA")

    # os.system("rm " + os.path.join(tmpDir, "features_*"))
    # os.system("rm " + os.path.join(tmpDir, "job_*"))

    return res
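
The chunking rule at the top of searchDatabase (10 features per job, enlarging the chunks once more than 200 jobs would be needed) can be checked in isolation; a small sketch using only the arithmetic from the example (planJobs is an illustrative helper):

# Sketch of the job-splitting arithmetic used in searchDatabase.
import math

def planJobs(m, n=10, maxJobs=200):
    # m: number of features; n: default number of features per job
    if int(m / n) > maxJobs:  # too many features -> enlarge the chunks
        n = int(m / maxJobs) + 1
    return n, math.ceil(m / n)

# planJobs(150) -> (10, 15); planJobs(5000) -> (26, 193)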
Example #10
def alignFeatures(fArray, xmlFiles, paramFile):
    nFiles = len(xmlFiles)

    # Pandas dataframe to numpy structured array for internal computation
    for i in range(nFiles):
        fArray[i] = fArray[i].to_records(index=False)

    ###################
    # Load parameters #
    ###################
    params = utils.getParams(paramFile)
    # Features derived from feature files are stored in fArray. For example,
    # xmlFiles = [file1, file2, file3]
    # fArray[0] = features from file1 (which has column names like 'index', 'mz', etc.)
    # fArray[1] = features from file2
    # ...
    # The array of m/z values from the first feature file can be accessed by fArray[0]['mz']

    if nFiles > 1:  # Multiple feature files -> alignment is required
        print("  Feature calibration")
        print("  ===================")
        logging.info("  Feature calibration")
        logging.info("  ===================")

        ###################################
        # Selection of a reference sample #
        ###################################
        if params["reference_feature"] == "0":
            # A run with the largest median of top 100 intensities is set to a reference run
            refNo = 0
            refIntensity = 0
            for i in range(nFiles):
                tmpIntensity = np.median(sorted(fArray[i]["intensity"], reverse=True)[0: 100])
                if tmpIntensity >= refIntensity:
                    refNo = i
                    refIntensity = tmpIntensity
        elif params["reference_feature"] == "1":
            # A run with the most number of features is set to a reference run
            refNo = 0
            refN = 0
            for i in range(nFiles):
                tmpN = len(fArray[i])
                if tmpN >= refN:
                    refNo = i
                    refN = tmpN
        else:
            try:
                refNo = xmlFiles.index(params["reference_feature"])
            except:
                sys.exit("  'reference_feature' parameter should be correctly specified")
        print("  %s is chosen as the reference run" % os.path.basename(xmlFiles[refNo]))
        logging.info("  %s is chosen as the reference run" % os.path.basename(xmlFiles[refNo]))

        ############################################################
        # Calibration of features against those in a reference run #
        ############################################################
        rtSdArray, mzSdArray = [], []
        featureNames = []
        for i in range(nFiles):
            featureName = os.path.basename(xmlFiles[i])
            featureNames.append(featureName)
            if i != refNo:
                print("  " + featureName + " is being aligned against the reference run (it may take a while)")
                logging.info("  " + featureName + " is being aligned against the reference run (it may take a while)")
                fArray[i], rtSd, mzSd = calibrateFeatures(fArray[refNo], fArray[i], params)
                rtSdArray.append(rtSd)
                mzSdArray.append(mzSd)
            else:
                rtSdArray.append("NA")
                mzSdArray.append("NA")

        print("  Calibration summary")
        print("  ===================")
        print("  After calibration, RT- and m/z-shifts of each run (against the reference run) are centered to zero")
        print("  Variations (i.e. standard deviation) of RT- and m/z-shifts are as follows,")
        print("  Filename\t\t\t#features\tSD of RT-shifts [second]\tSD of m/z-shifts [ppm]")
        logging.info("  Calibration summary")
        logging.info("  ===================")
        logging.info("  After calibration, RT- and m/z-shifts of each run (against the reference run) are centered to zero")
        logging.info("  Variations (i.e. standard deviation) of RT- and m/z-shifts are as follows,")
        logging.info("  Filename\t\t\t#features\tSD of RT-shifts [second]\tSD of m/z-shifts [ppm]")
        for i in range(nFiles):
            nFeatures = str(fArray[i].shape[0])
            if i != refNo:
                meanRtSd = "%.6f" % np.mean(rtSdArray[i])
                meanMzSd = "%.6f" % np.mean(mzSdArray[i])
            else:
                meanRtSd = "NA"
                meanMzSd = "NA"
            print("  " + featureNames[i] + "\t\t\t" + nFeatures + "\t" + meanRtSd + "\t" + meanMzSd)
            logging.info("  " + featureNames[i] + "\t\t\t" + nFeatures + "\t" + meanRtSd + "\t" + meanMzSd)
        print()
        logging.info("")

        #################################################################
        # Identification of fully-aligned features for further analysis #
        #################################################################
        print("  Feature alignment")
        print("  =================")
        logging.info("  Feature alignment")
        logging.info("  =================")
        fullFeatures, partialFeatures, unalignedFeatures = findMatchedFeatures(refNo, fArray, rtSdArray, mzSdArray,
                                                                               featureNames, params)
    else:
        print("  Since a single feature is used, the feature alignment is skipped")
        logging.info("  Since a single feature is used, the feature alignment is skipped")
        fullFeatures = np.copy(fArray[0])  # Masked array to 2D numpy array
        colNames = list(fullFeatures.dtype.names)
        featureName = os.path.splitext(os.path.basename(xmlFiles[0]))[0]
        fullFeatures.dtype.names = [featureName + "_" + c for c in colNames]
        partialFeatures, unalignedFeatures = None, None

    ################################################################
    # Write fully-, partially- and/or un-aligned features to files #
    ################################################################
    # At this step, fully-, partially- and unaligned features are written to files and saved
    # Also, those features are converted to pandas DataFrame format and returned
    dfFull, dfPartial, dfArrayUnaligned = utils.generateFeatureFile(fullFeatures, partialFeatures, unalignedFeatures,
                                                                    params)

    return dfFull, dfPartial, dfArrayUnaligned
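
When params["reference_feature"] is "0", alignFeatures picks as the reference the run whose top-100 intensities have the largest median. The same criterion in isolation (pickReference is an illustrative helper; note the original uses >=, so ties go to the last run, while np.argmax keeps the first):

# Sketch of the reference-run selection criterion from alignFeatures.
import numpy as np

def pickReference(intensityArrays):
    # Index of the run with the largest median of its top-100 intensities
    medians = [np.median(sorted(a, reverse=True)[:100]) for a in intensityArrays]
    return int(np.argmax(medians))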
Example #11
import sys, os, re, logging, pandas as pd
import utils
from featureDetection import detectFeatures
from datetime import datetime

##################
# Initialization #
##################
# For desktop debugging,
paramFile = r"jumpm.params"
inputFiles = [
    r"/home/jcho/dev/spectralLibrary/FTLD_Batch2_F50.mzXML",
    r"/home/jcho/dev/spectralLibrary/FTLD_Batch2_F51.mzXML",
    r"/home/jcho/dev/spectralLibrary/FTLD_Batch2_F52.mzXML"
]
params = utils.getParams(paramFile)

skipScans = [1, 3, 5, 7, 10]
for skipScan in skipScans:
    params["skipping_scans"] = skipScan
    logFile = "jump_m.log"
    if os.path.exists(logFile):
        os.system("rm " + logFile)
    logging.basicConfig(format='%(message)s',
                        filename=logFile,
                        level=logging.INFO)

    print()
    print("  Jump -m started")
    logging.info("  Jump -m started")
    now = datetime.now()
Example #12
def detectFeatures(inputFile, paramFile):
    ##############
    # Parameters #
    ##############
    params = utils.getParams(paramFile)
    firstScan = int(params["first_scan_extraction"])
    lastScan = int(params["last_scan_extraction"])
    gap = int(params["skipping_scans"])
    scanWindow = gap + 1
    matchPpm = float(params["mass_tolerance_peak_matching"])

    ##################
    # Initialization #
    ##################
    reader = mzxml.read(inputFile)
    f = []  # Feature array
    nFeatures = -1
    cache = []
    noise = {}  # Empty dictionary for noise level information
    oldMinInd = -1
    oldMaxInd = -1

    ############################
    # Get MS1 scan information #
    ############################
    ms = []
    with reader:
        msCount = 0
        # filename = os.path.basename(inputFile)
        # print("  Extraction of MS1 spectra from %s" % filename)
        for spec in reader:
            msLevel = int(spec["msLevel"])
            scanNum = int(spec["num"])
            if msLevel == 1 and firstScan <= scanNum <= lastScan:
                ms.append(spec)
                msCount += 1
            elif scanNum > lastScan:
                break
        # print("  Done")

    ################################
    # Feature (3D-peak) generation #
    ################################
    filename = os.path.basename(inputFile)
    print("  Feature detection from %s" % filename)
    logging.info("  Feature detection from " + filename)
    progress = utils.progressBar(msCount)
    for i in range(msCount):
        progress.increment()
        minInd = max(0, i - gap - 1)
        maxInd = min(msCount - 1, i + gap + 1)
        if i == 0:
            for j in range(maxInd + 1):
                spec = detectPeaks(ms[j], params)
                spec["index"] = j
                cache.append(spec)
        else:
            for j in range(oldMinInd, minInd):
                cache.pop(0)  # Remove the first element in cache
            for j in range(oldMaxInd + 1, maxInd + 1):
                spec = detectPeaks(ms[j], params)
                spec["index"] = j
                cache.append(spec)

        ##################
        # Reduction step #
        ##################
        p = cache[i - minInd]
        pCount = len(p["m/z array"])
        valids = np.array([])
        count = 0
        for j in range(pCount):
            cm = p["m/z array"][j]
            match = 0
            nTry = 0
            # Backward search
            for k in range(i - 1, minInd - 1, -1):
                q = cache[k - minInd]
                if q["m/z array"].size == 0:
                    continue
                else:
                    match, ind = getClosest(q, cm, matchPpm)
                if match == 1:
                    break
                nTry += 1
                if nTry > scanWindow:
                    break
            if match == 0:  # Forward search
                nTry = 0
                for k in range(i + 1, maxInd + 1):
                    q = cache[k - minInd]
                    if q["m/z array"].size == 0:
                        continue
                    else:
                        match, ind = getClosest(q, cm, matchPpm)
                    if match == 1:
                        break
                    nTry += 1
                    if nTry > scanWindow:
                        break
            if match == 1:
                valids = np.append(valids, j)

        # Peak reduction and noise-level estimation
        p, noise = reduceMS1(p, noise, valids)

        #####################
        # Peak merging step #
        #####################
        cache[i - minInd] = p
        pCount = len(p["m/z array"])
        for j in range(pCount):
            cm = p["m/z array"][j]
            match = 0
            nTry = 0
            matchedPeakInd = []
            # Backward search
            for k in range(i - 1, minInd - 1, -1):
                q = cache[k - minInd]
                if q["m/z array"].size == 0:
                    continue
                else:
                    matchIndicator, ind = getClosest(q, cm, matchPpm)
                    # $matchIndicator = 1 means that the j-th (reduced) peak in the i-th scan
                    # can form a 3D-peak with $ind-th (reduced) peak in the previous scan (%q)
                    if matchIndicator == 1:
                        matchedPeakInd.append(q["featureIndex"][ind])
                        match = 1
            if match == 1:
                matchedPeakInd = list(set(matchedPeakInd))  # Make the list unique
                fInd = None
                if len(matchedPeakInd) > 1:  # There are multiple matches to the peaks in previous scans
                    fInd = min(matchedPeakInd)
                    for m in matchedPeakInd:
                        # Merge to the lowest indexed feature and remove the "merged" features
                        if m != fInd:
                            f[fInd]["mz"].extend(f[m]["mz"])
                            f[fInd]["intensity"].extend(f[m]["intensity"])
                            f[fInd]["num"].extend(f[m]["num"])
                            f[fInd]["rt"].extend(f[m]["rt"])
                            f[fInd]["index"].extend(f[m]["index"])

                            # Revise cache array
                            for s in f[m]["index"]:
                                for t in range(len(cache)):
                                    if cache[t]["index"] == s:
                                        for u in range(len(cache[t]["featureIndex"])):
                                            if cache[t]["featureIndex"][u] == m:
                                                cache[t]["featureIndex"][u] = fInd
                            f[m] = None  # Keep the size of feature array
                else:
                    fInd = matchedPeakInd[0]
                if "featureIndex" in cache[i - minInd]:
                    cache[i - minInd]["featureIndex"].append(fInd)
                else:
                    cache[i - minInd]["featureIndex"] = [fInd]
                f[fInd]["mz"].append(p["m/z array"][j])
                f[fInd]["intensity"].append(p["intensity array"][j])
                f[fInd]["num"].append(p["num"])
                f[fInd]["rt"].append(p["retentionTime"])
                f[fInd]["index"].append(p["index"])

            if match != 1:
                if i < msCount:
                    nFeatures += 1
                    if "featureIndex" in cache[i - minInd]:
                        cache[i - minInd]["featureIndex"].append(nFeatures)
                    else:
                        cache[i - minInd]["featureIndex"] = [nFeatures]
                    f.append({"mz": [p["m/z array"][j]],
                              "intensity": [p["intensity array"][j]],
                              "num": [p["num"]],
                              "rt": [p["retentionTime"]],
                              "index": [i]})

        oldMinInd = minInd
        oldMaxInd = maxInd

    # Remove empty features
    f = [i for i in f if i is not None]

    #################################
    # Filtering features (3D-peaks) #
    #################################
    # A feature may contain multiple peaks from one scan
    # In this case, one with the largest intensity is chosen
    gMinRt, gMaxRt = 0, 0  # Global minimum and maximum RT over all features
    for i in range(len(f)):
        if len(f[i]["num"]) != len(list(set(f[i]["num"]))):
            temp = {}
            for j in range(len(f[i]["num"])):
                if f[i]["num"][j] in temp:
                    currIntensity = f[i]["intensity"][j]
                    if currIntensity > temp[f[i]["num"][j]]["intensity"]:
                        temp[f[i]["num"][j]]["intensity"] = currIntensity
                        temp[f[i]["num"][j]]["index"] = j
                else:
                    temp[f[i]["num"][j]] = {}
                    temp[f[i]["num"][j]]["intensity"] = f[i]["intensity"][j]
                    temp[f[i]["num"][j]]["index"] = j
            uInd = []
            for key in sorted(temp.keys()):
                uInd.append(temp[key]["index"])
            f[i]["mz"] = [f[i]["mz"][u] for u in uInd]
            f[i]["intensity"] = [f[i]["intensity"][u] for u in uInd]
            f[i]["num"] = [f[i]["num"][u] for u in uInd]
            f[i]["rt"] = [f[i]["rt"][u] for u in uInd]
            f[i]["index"] = [f[i]["index"][u] for u in uInd]

        if i == 0:
            gMinRt = min(f[i]["rt"])
            gMaxRt = max(f[i]["rt"])
        else:
            if min(f[i]["rt"]) < gMinRt:
                gMinRt = min(f[i]["rt"])
            if max(f[i]["rt"]) > gMaxRt:
                gMaxRt = max(f[i]["rt"])

    if gMaxRt.unit_info == "minute":
        gMaxRt = gMaxRt * 60
        gMinRt = gMinRt * 60

    ###################################
    # Organization of output features #
    ###################################
    n = 0
    ms1ToFeatures = {}
    for i in range(len(f)):
        # 1. mz: mean m/z of a feature = weighted average of m/z and intensity
        mz = np.sum(np.multiply(f[i]["mz"], f[i]["intensity"])) / np.sum(f[i]["intensity"])

        # 2. intensity: intensity of a feature (maximum intensity among the peaks that constitute the feature)
        intensity = max(f[i]["intensity"])

        # 3. z: charge of the feature, set to 1 now, but modified later
        z = 1
        isotope = 0  # Will be used later

        # 4. RT: RT of the representative peak (i.e. strongest peak) of a feature
        ind = np.argmax(f[i]["intensity"])
        rt = f[i]["rt"][ind]

        # 5. minRT and maxRT
        minRt = min(f[i]["rt"])
        maxRt = max(f[i]["rt"])

        # Conversion of RT to the unit of second
        if rt.unit_info == "minute":
            rt = rt * 60  # Convert to the unit of second
            minRt = minRt * 60
            maxRt = maxRt * 60

        # 6. MS1 scan number of the representative peak of a feature
        ms1 = f[i]["num"][ind]

        # 7. minMS1 and maxMS1
        minMs1 = min(list(map(int, f[i]["num"])))
        maxMs1 = max(list(map(int, f[i]["num"])))

        # 8. SNratio (signal-to-noise ratio of the feature)
        if ms1 in noise:
            noiseLevel = noise[ms1]
        else:
            noiseLevel = 500
        snRatio = intensity / noiseLevel
        featureIntensityThreshold = noiseLevel * float(params["signal_noise_ratio"])

        if intensity >= featureIntensityThreshold:
            # 9. Percentage of true feature
            pctTF = (maxRt - minRt) / (gMaxRt - gMinRt) * 100
            # Organize features in a structured numpy array form
            if n == 0:
                features = np.array([(mz, intensity, z, rt, minRt, maxRt, ms1, minMs1, maxMs1, snRatio, pctTF, isotope)],
                                    dtype="f8, f8, f8, f8, f8, f8, f8, f8, f8, f8, f8, f8")
                n += 1
            else:
                features = np.append(features,
                                     np.array([(mz, intensity, z, rt, minRt, maxRt, ms1, minMs1, maxMs1, snRatio, pctTF, isotope)],
                                              dtype=features.dtype))
            for j in range(len(f[i]["num"])):
                num = f[i]["num"][j]
                if num not in ms1ToFeatures:
                    ms1ToFeatures[num] = {"mz": [f[i]["mz"][j]],
                                          "intensity": [f[i]["intensity"][j]]}
                else:
                    ms1ToFeatures[num]["mz"].append(f[i]["mz"][j])
                    ms1ToFeatures[num]["intensity"].append(f[i]["intensity"][j])
        else:
            continue

    features.dtype.names = ("mz", "intensity", "z", "RT", "minRT", "maxRT", "MS1", "minMS1", "maxMS1", "SNratio", "PercentageTF", "isotope")

    ##########################
    # Decharging of features #
    ##########################
    features = dechargeFeatures(features)
    # print()

    ############################################
    # Convert the features to pandas dataframe #
    # Write features to a file                 #
    ############################################
    df = pd.DataFrame(features)
    df = df.drop(columns=["isotope"])  # The "isotope" column was only used internally and does not need to be kept

    # Create a subdirectory and save features to a file
    baseFilename = os.path.splitext(os.path.basename(filename))[0]  # i.e. filename without extension
    featureDirectory = os.path.join(os.getcwd(), baseFilename)
    if not os.path.exists(featureDirectory):
        os.mkdir(featureDirectory)

    # # Increment the number of a feature file
    # if len(glob.glob(os.path.join(featureDirectory, baseFilename + ".*.feature"))) == 0:
    #     featureFilename = os.path.splitext(os.path.basename(filename))[0] + ".1.feature"
    # else:
    #     oldNo = 0
    #     for f in glob.glob(os.path.join(featureDirectory, baseFilename + ".*.feature")):
    #         oldNo = max(oldNo, int(os.path.basename(f).split(".")[-2]))
    #     featureFilename = baseFilename + "." + str(int(oldNo) + 1) + ".feature"
    # featureFilename = os.path.join(featureDirectory, featureFilename)

    # Simply overwrite any existing feature file
    # Individual feature file still needs to be located in an input file-specific location
    # since the feature file can be directly used later
    featureFilename = baseFilename + ".feature"
    featureFilename = os.path.join(featureDirectory, featureFilename)
    df.to_csv(featureFilename, index = False, sep = "\t")

    return df  # Pandas DataFrame
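
The first summary quantity computed for each feature above is an intensity-weighted mean m/z; the same computation in isolation (the numbers are made up for illustration):

# Intensity-weighted mean m/z, as in the "Organization of output features" step above.
import numpy as np

mzs = np.array([500.2001, 500.2003, 500.1999])
intensities = np.array([1e5, 3e5, 2e5])
mz = np.sum(mzs * intensities) / np.sum(intensities)  # same as np.average(mzs, weights=intensities)
print(round(mz, 4))  # 500.2001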
Example #13
import sys
sys.path.append('scripts/')
sys.path.append('jobs/')

from imports import *
import utils
from desAndRes import designAndResponse
from priors import getPriors
from groupPredictors import groupPredictors
from miAndClr import mi, mixedCLR
from bayesianRegression import BBSR

utils.loadJob('default')
utils.loadJob(sys.argv[1])
pars = utils.getParams()
random.seed(pars['jobSeed'])

# Read in data
data = utils.readInput(
    pars['inputDir'], pars['expMatFile'], 
    pars['tfNamesFile'], pars['metaDataFile'], 
    pars['priorsFile'], pars['goldStandardFile'])

# Generate design and response matrices
desResp = designAndResponse(
    data['metaData'], data['expMat'], 
    pars['delTMin'], pars['delTMax'], pars['tau'])

# Generate priors
priors = getPriors(
    data['expMat'], data['tfNames'], 
Example #14
def ms2ForFeatures(full, mzxmlFiles, paramFile):
    print("  Identification of MS2 spectra for the features")
    print("  ==============================================")
    logging.info("  Identification of MS2 spectra for the features")
    logging.info("  ==============================================")
    # Change pd.DataFrame to np.RecArray for internal computation (speed issue)
    full = full.to_records(index=False)

    ######################################
    # Load parameters and initialization #
    ######################################
    params = utils.getParams(paramFile)
    # ppiThreshold = "max"  # Hard-coded
    ppiThreshold = params["ppi_threshold_of_features"]
    pctTfThreshold = float(params["max_percentage_RT_range"])
    tolIsolation = float(params["isolation_window"])
    tolPrecursor = float(params["tol_precursor"])
    tolIntraMS2Consolidation = float(params["tol_intra_ms2_consolidation"])
    tolInterMS2Consolidation = float(params["tol_inter_ms2_consolidation"])
    nFeatures = len(full)
    nFiles = len(mzxmlFiles)
    featureToScan = np.empty((nFeatures, nFiles), dtype=object)
    featureToSpec = np.empty((nFeatures, nFiles), dtype=object)

    #################################################
    # Assignment of MS2 spectra to features         #
    # Consolidation of MS2 spectra for each feature #
    #################################################
    for m, file in enumerate(mzxmlFiles):  # m indexes the input files
        reader = mzxml.MzXML(file)
        fileBasename, _ = os.path.splitext(os.path.basename(file))
        colNames = [
            item for item in full.dtype.names
            if item.startswith(fileBasename + "_")
        ]
        subset = full[colNames]
        subset.dtype.names = [s.split("_")[-1] for s in subset.dtype.names]
        ms2Dict = {}
        minScan = int(np.nanmin(subset["minMS1"]))
        maxScan = int(np.nanmax(subset["maxMS1"]))
        progress = utils.progressBar(maxScan - minScan + 1)
        print("  %s is being processed" % os.path.basename(file))
        print("  Looking for MS2 scan(s) responsible for each feature")
        logging.info("  %s is being processed" % os.path.basename(file))
        logging.info("  Looking for MS2 scan(s) responsible for each feature")
        for i in range(minScan, maxScan + 1):
            progress.increment()
            spec = reader[str(i)]
            msLevel = spec["msLevel"]
            if msLevel == 1:
                surveyNum = i
            elif msLevel == 2:
                # Find MS2 scans which satisfy the following conditions

                # From the discussion around June 2020,
                # 1. In ReAdW-derived mzXML files, precursor m/z values are in two tags: "precursorMz" and "filterLine"
                # 2. Through Haiyan's manual inspection, the real precursor m/z value is closer to one in "filterLine" tag
                # 3. So, in this script, precursor m/z of MS2 scan is obtained from "filterLine" tag
                # 4. Note that it may be specific to ReAdW-derived mzXML files since MSConvert-derived mzXML files do not have "filterLine" tag
                # 4.1. In this case, maybe the use of mzML (instead of mzXML) would be a solution (to-do later)

                # precMz = spec["precursorMz"][0]["precursorMz"]  # Precursor m/z from "precursorMz" tag
                p = re.search("([0-9.]+)\\@", spec["filterLine"])
                precMz = float(p.group(1))
                survey = reader[str(surveyNum)]
                fInd = np.where((surveyNum >= subset["minMS1"])
                                & (surveyNum <= subset["maxMS1"])
                                & (subset["mz"] >= (precMz - tolIsolation))
                                & (subset["mz"] <= (precMz + tolIsolation)) &
                                (subset["PercentageTF"] <= pctTfThreshold))[0]
                if len(fInd) > 0:
                    ppi = []
                    for j in range(len(fInd)):  # j avoids shadowing the scan-number loop variable i
                        mz = subset["mz"][fInd[j]]
                        lL = mz - mz * tolPrecursor / 1e6
                        uL = mz + mz * tolPrecursor / 1e6
                        ind = np.where((survey["m/z array"] >= lL)
                                       & (survey["m/z array"] <= uL))[0]
                        if len(ind) > 0:
                            ppi.append(np.max(survey["intensity array"][ind]))
                        else:
                            ppi.append(0)

                    if sum(ppi) == 0:
                        continue
                    # Convert intensities to percentage values
                    ppi = ppi / np.sum(ppi) * 100
                    if ppiThreshold == "max":
                        fInd = np.array([fInd[np.argmax(ppi)]])
                    else:
                        # ppiThreshold should be a numeric value
                        ppiThreshold = float(ppiThreshold)
                        fInd = fInd[np.where(ppi > ppiThreshold)]
                    if len(fInd) == 0:  # Last check of candidate feature indexes
                        continue
                    else:
                        # Add this MS2 scan information to ms2Dict
                        ms2Dict[spec["num"]] = {}
                        ms2Dict[spec["num"]]["mz"] = spec["m/z array"]
                        ms2Dict[spec["num"]]["intensity"] = spec["intensity array"]

                        # Mapping between features and MS2 scan numbers
                        for j in range(len(fInd)):
                            if featureToScan[fInd[j], m] is None:
                                featureToScan[fInd[j], m] = spec["num"]
                            else:
                                featureToScan[fInd[j], m] += ";" + spec["num"]

        print(
            "  Merging MS2 spectra for each feature within a run (it may take a while)"
        )
        logging.info(
            "  Merging MS2 spectra for each feature within a run (it may take a while)"
        )
        progress = utils.progressBar(nFeatures)
        for i in range(nFeatures):
            progress.increment()
            if featureToScan[i, m] is not None:
                spec = intraConsolidation(ms2Dict, featureToScan[i, m],
                                          tolIntraMS2Consolidation)
                featureToSpec[i, m] = spec

    print("  Merging MS2 spectra for each feature between runs when there are multiple runs")
    print("  Simplification of MS2 spectrum for each feature by retaining the 100 strongest peaks")
    logging.info("  Merging MS2 spectra for each feature between runs when there are multiple runs")
    logging.info("  Simplification of MS2 spectrum for each feature by retaining the 100 strongest peaks")
    specArray = np.array([])
    progress = utils.progressBar(nFeatures)
    for i in range(nFeatures):
        progress.increment()
        if np.sum(featureToSpec[i] == None) == nFiles:
            specArray = np.append(specArray, None)
        else:
            spec = interConsolidation(featureToSpec[i, :],
                                      tolInterMS2Consolidation)
            specArray = np.append(specArray, spec)

    ###############################
    # MS2 processing for features #
    ###############################
    # "specArray" is the list of (consolidated) MS2 spectra
    # specArray[i] is the MS2 spectrum corresponding to the i-th feature
    # If there's no MS2 spectrum, then specArray[i] is None
    df = utils.summarizeFeatures(full, params)
    # Add the mean m/z of feature and its charge state to the beginning of MS2 spectrum (similar to .dta file)
    for i in range(nFeatures):
        if specArray[i] is not None:
            specArray[i]["mz"] = np.insert(specArray[i]["mz"], 0,
                                           df["feature_m/z"].iloc[i])
            specArray[i]["intensity"] = np.insert(specArray[i]["intensity"], 0,
                                                  df["feature_z"].iloc[i])
    df["MS2"] = specArray
    df = df.sort_values(
        by="feature_m/z",
        ignore_index=True)  # Features are sorted by "feature_m/z"
    df.insert(loc=0, column="feature_num", value=df.index + 1)
    # df["feature_num"] = df.index + 1  # Update "feature_num" according to the ascending order of "feature_m/z" (as sorted)

    # Write MS2 spectra to files
    filePath = os.path.join(os.getcwd(), "align_" + params["output_name"])
    ms2Path = os.path.join(filePath, "MS2")
    if not os.path.exists(ms2Path):
        os.mkdir(ms2Path)
    for i in range(df.shape[0]):
        if df["MS2"].iloc[i] is not None:
            fileName = os.path.join(ms2Path, "f" + str(i + 1) + ".MS2")
            dfMS2 = pd.DataFrame.from_dict(df["MS2"].iloc[i])
            dfMS2.to_csv(fileName, index=False, header=False, sep="\t")

    # Save fully-aligned features with their MS2 spectra (i.e. res) for debugging purpose
    # When the pipeline gets mature, this part needs to be removed
    pickle.dump(df,
                open(os.path.join(filePath, ".fully_aligned_feature.pickle"),
                     "wb"))  # Make the file be hidden

    ##########################
    # Handling mzXML file(s) #
    ##########################
    # Move mzXML files to the directory(ies) where individual .feature files are located
    if params["skip_feature_detection"] == "0":
        for file in mzxmlFiles:
            baseFilename = os.path.basename(file)
            featureDirectory = os.path.join(os.getcwd(),
                                            os.path.splitext(baseFilename)[0])
            os.rename(file, os.path.join(featureDirectory, baseFilename))

    return df, featureToScan
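
The precursor-matching step in ms2ForFeatures turns the ppm tolerance into an absolute m/z window (the lL/uL bounds). The same conversion in isolation (ppmWindow is an illustrative helper, not part of the original code):

# ppm tolerance -> absolute m/z window, as used for lL/uL in ms2ForFeatures.
def ppmWindow(mz, tolPpm):
    delta = mz * tolPpm / 1e6
    return mz - delta, mz + delta

# ppmWindow(500.0, 10) -> (499.995, 500.005)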
Example #15
def searchLibrary(full, paramFile):
    ##################################
    # Load parameters and initialize #
    ##################################
    try:
        params = utils.getParams(paramFile)
    except:
        sys.exit("Parameter file cannot be found or cannot be loaded")
    condition = params["LC_column"].lower()
    if params["mode"] == "1":
        condition = condition + "p"
    elif params["mode"] == "-1":
        condition = condition + "n"
    else:
        sys.exit("'mode' parameter should be either 1 or -1")
    proton = 1.007276466812
    matchMzTol = float(params["library_mass_tolerance"])  # Unit of ppm
    adducts = adductDictionary(params)
    nFeatures = full.shape[0]
    # While full["feature_RT"] has the unit of minute, the library compounds have RTs in the unit of second
    # So, within this function, full["feature_RT"] needs to be converted to the unit of second
    full["feature_RT"] = full["feature_RT"] * 60

    ##########################
    # Perform library search #
    ##########################
    allRes = pd.DataFrame()
    nLibs = 1
    for libFile in params["library"]:
        doAlignment = int(params["library_rt_alignment"])
        print("  Library {} is being loaded".format(os.path.basename(libFile)))
        logging.info("  Library {} is being loaded".format(
            os.path.basename(libFile)))
        try:
            conn = sqlite3.connect(libFile)
        except:
            sys.exit("Library file cannot be found or cannot be loaded.")

        #####################################################
        # RT-alignment between features and library entries #
        #####################################################
        # Check whether 'rt' column of the library is numeric value or not
        hasNumericRt = 0
        cursor = conn.execute("PRAGMA table_info(library)")
        pragma = cursor.fetchall()
        for row in pragma:
            if row[1].lower() == "rt":
                if row[2].lower() == "real":
                    hasNumericRt = 1
                break

        # RT-alignment
        if doAlignment == 1:
            if hasNumericRt == 1:
                print(
                    "  RT-alignment is being performed between features and library compounds"
                )
                logging.info(
                    "  RT-alignment is being performed between features and library compounds"
                )
                x, y = prepRtAlignment(full, conn, params)
                mod = rtAlignment(x, y)
                if mod == -1:
                    print(
                        "  Since there are TOO FEW feature RTs comparable to library RTs, RT-alignment is skipped"
                    )
                    logging.info(
                        "  Since there are TOO FEW feature RTs comparable to library RTs, RT-alignment is skipped"
                    )
                    doAlignment = 0
                else:
                    # Calibration of features' RT
                    rPredict = ro.r("predict")
                    full["feature_calibrated_RT"] = None
                    full["feature_calibrated_RT"] = full[
                        "feature_RT"] - rPredict(
                            mod, FloatVector(full["feature_RT"]))
                    # Empirical CDF of alignment (absolute) residuals (will be used to calculate RT shift-based scores)
                    ecdfRt = ECDF(abs(np.array(mod.rx2("residuals"))))
            else:
                print(
                    "  Although the parameter is set to perform RT-alignment against the library, there are no valid RT values in the library"
                )
                print("  Therefore, RT-alignment is not performed")
                logging.info(
                    "  Although the parameter is set to perform RT-alignment against the library, there are no valid RT values in the library"
                )
                logging.info("  Therefore, RT-alignment is not performed")
                doAlignment = 0
        else:
            print(
                "  According to the parameter, RT-alignment is not performed between features and library compounds"
            )
            logging.info(
                "  According to the parameter, RT-alignment is not performed between features and library compounds"
            )

        ########################################
        # Match features and library compounds #
        ########################################
        # Match features and library compounds
        print("  Features are being compared with library compounds")
        logging.info("  Features are being compared with library compounds")
        res = {
            "no": [],
            "feature_index": [],
            "feature_m/z": [],
            "feature_original_RT": [],
            "feature_aligned_RT": [],
            "id": [],
            "other_id": [],
            "formula": [],
            "name": [],
            "ion": [],
            "RT": [],
            "SMILES": [],
            "InchiKey": [],
            "collision_energy": [],
            "RT_shift": [],
            "RT_score": [],
            "MS2_score": [],
            "combined_score": []
        }
        intensityCols = [
            col for col in full.columns if col.lower().endswith("_intensity")
        ]
        for c in intensityCols:
            res[c] = []
        n = 0
        progress = utils.progressBar(nFeatures)
        for i in range(nFeatures):
            progress.increment()
            # Feature information
            fZ = full["feature_z"].iloc[i]
            fSpec = full["MS2"].iloc[i]
            if np.isnan(fZ) or fSpec is None:  # Skip when the charge or MS2 spectrum of the feature is not defined
                continue
            fMz = full["feature_m/z"].iloc[i]
            fRt = full["feature_RT"].iloc[i]
            fIntensity = full[intensityCols].iloc[i]
            if params["mode"] == "1":  # Positive mode
                fMass = fZ * (fMz - proton)
            elif params["mode"] == "-1":  # Negative mode
                fMass = fZ * (fMz + proton)

            # Retrieve library compounds of which neutral masses are similar to feature mass
            df = queryLibrary(fMz, fMass, fZ, conn, adducts, matchMzTol)
            if not df.empty:
                colNameOtherId = df.filter(regex="other_ids").columns[0]
                for j in range(df.shape[0]):
                    # When there is/are library compound(s) matched to the feature,
                    # MS2 of the library compound(s) should be retrieved
                    uid = df["id"].iloc[j]
                    uid = uid.replace("##Decoy_", "")
                    sqlQuery = r"SELECT * FROM {}".format(uid)
                    try:
                        libSpec = pd.read_sql_query(sqlQuery, conn)
                    except:
                        continue
                    if not libSpec.empty:
                        n += 1
                        # Calculate the score based on MS2 spectrum
                        libSpec = libSpec.to_dict(orient="list")
                        simMs2 = calcMS2Similarity(fSpec, libSpec, params)
                        pMs2 = 1 - simMs2  # p-value-like score (the smaller, the better)
                        pMs2 = max(np.finfo(float).eps, pMs2)  # Prevent the underflow caused by 0

                        # Calculate the (similarity?) score based on RT-shift
                        if doAlignment == 1:
                            fAlignedRt = full["feature_calibrated_RT"].iloc[i]
                            rtShift = fAlignedRt - df["rt"].iloc[j]
                            # Also a p-value-like score (the smaller, the better)
                            pRt = ecdfRt(abs(rtShift))
                            pRt = max(np.finfo(float).eps, pRt)
                            simRt = 1 - pRt
                            # p = 1 / (0.5 / pMS2 + 0.5 / pRt)  # Combined p-value using harmonic mean with equal weights
                            # Fisher's method (chi-square with 4 degrees of freedom)
                            p = 1 - stats.chi2.cdf(-2 * (np.log(pMs2) + np.log(pRt)), 4)
                            # p = -2 * (np.log(pMs2) + np.log(pRt))   # Fisher's method used in Perl pipeline (the smaller, the better)
                        else:
                            fAlignedRt = "NA"
                            if hasNumericRt == 1 and df["rt"].iloc[
                                    j] is not None:
                                rtShift = fRt - df["rt"].iloc[j]
                            else:
                                rtShift = "NA"
                            # pRt = 1
                            simRt = "NA"
                            p = pMs2

                        # Output
                        libId = df["id"].iloc[j]
                        libOtherId = df[colNameOtherId].iloc[j]
                        libFormula = df["formula"].iloc[j]
                        libName = df["name"].iloc[j]
                        if hasNumericRt == 1:
                            libRt = df["rt"].iloc[j]
                        else:
                            libRt = "NA"
                        libIon = df["ion_type"].iloc[j]
                        libSmiles = df["smiles"].iloc[j]
                        libInchiKey = df["inchikey"].iloc[j]
                        libEnergy = df["collision_energy"].iloc[j]

                        res["no"].append(n)
                        res["feature_index"].append(i + 1)
                        res["feature_m/z"].append(fMz)
                        res["feature_original_RT"].append(
                            fRt / 60)  # For output, the unit of RT is minute
                        if doAlignment == 1:
                            res["feature_aligned_RT"].append(fAlignedRt / 60)
                        else:
                            res["feature_aligned_RT"].append(fAlignedRt)
                        for c in intensityCols:
                            res[c].append(fIntensity[c])
                        res["id"].append(libId)
                        res["other_id"].append(libOtherId)
                        res["formula"].append(libFormula)
                        res["name"].append(libName)
                        res["ion"].append(libIon)
                        if hasNumericRt == 1:
                            res["RT"].append(libRt / 60)
                        else:
                            res["RT"].append(libRt)
                        res["SMILES"].append(libSmiles)
                        res["InchiKey"].append(libInchiKey)
                        res["collision_energy"].append(libEnergy)
                        if rtShift != "NA":
                            rtShift = abs(rtShift) / 60  # Convert to "minute"
                        res["RT_shift"].append(rtShift)
                        # Haiyan's preference
                        # RT_score and MS2_score: 0 ~ 1 (bad to good)
                        res["RT_score"].append(simRt)
                        res["MS2_score"].append(simMs2)
                        res["combined_score"].append(abs(-np.log10(p)))

        conn.close()
        res = pd.DataFrame.from_dict(res)
        resCols = ["no", "feature_index", "feature_m/z", "feature_original_RT", "feature_aligned_RT"] + intensityCols + \
                  ["id", "other_id", "formula", "name", "ion", "RT", "SMILES", "InchiKey", "collision_energy", "RT_shift",
                   "RT_score", "MS2_score", "combined_score"]
        res = res[resCols]
        res = res.rename(columns={"other_id": colNameOtherId})

        filePath = os.path.join(os.getcwd(), "align_" + params["output_name"])
        outputFile = os.path.join(
            filePath, "align_" + params["output_name"] + "." + str(nLibs) +
            ".library_matches")
        res.to_csv(outputFile, sep="\t", index=False)
        allRes = pd.concat([allRes, res], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
        nLibs += 1

    # RT unit of "full" needs to be converted back to minute for subsequent procedures (i.e. database search)
    full["feature_RT"] = full["feature_RT"] / 60

    return allRes
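
Example #15 combines the MS2- and RT-based scores with Fisher's method (the stats.chi2.cdf call above, chi-square with 4 degrees of freedom for two p-values). A minimal standalone check (fisherCombine is an illustrative helper):

# Fisher's method for two p-value-like scores, matching the formula in searchLibrary.
import numpy as np
from scipy import stats

def fisherCombine(p1, p2):
    statistic = -2 * (np.log(p1) + np.log(p2))
    return 1 - stats.chi2.cdf(statistic, 4)  # two p-values -> 2 * 2 degrees of freedom

# fisherCombine(0.05, 0.05) -> ~0.0175 (smaller than either input alone)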