import os

import matplotlib.pyplot as plt
import numpy as np

import pipeline


def eval_ada(VERBOSE=True):

    acr = np.zeros(10)
    for _ in range(100):
        _, _, correct = pipeline.pipe(feature='lbph',
                                      clf='adaboost',
                                      _mode='test',
                                      _verbose=VERBOSE)
        acr += correct

    est_list = np.linspace(2, 3.7, num=10)
    plt.bar(est_list, acr)
    plt.savefig('graphs/ada_n_estimators.png')  # save before show(), which clears the figure
    plt.show()


def eval_pca(VERBOSE=True):
    trials = 0
    total_correct = 0
    n_comp = 2
    for _ in range(2000):
        correct = pipeline.pipe(feature='cslbcop',
                                clf='svm',
                                _mode='test',
                                _verbose=VERBOSE,
                                pca_scatter=True,
                                n_components=n_comp)
        total_correct += correct
        trials += 1
        print(
            f'Trial: {trials}\tCorrects: {total_correct}\tOverall: {total_correct/trials}'
        )


def final_eval(_mode='test', _verbose=True):
    itr = 0
    failed = 0
    data_path = 'data_tune/'
    datas = os.listdir(data_path)  # one folder per author
    for set_id in range(len(datas)):  # originally hard-coded to 159 authors
        auth = datas[set_id]
        forms = os.listdir(data_path + auth)
        form_count = len(forms)
        for form_id in range(form_count):
            itr += 1
            res = pipeline.pipe(feature='cslbcop',
                                clf='svm',
                                _verbose=_verbose,
                                _mode=_mode,
                                set_id=set_id,
                                form_id=form_id)
            if not res:
                failed += 1
            print(
                f'Trial: {itr}\tFailed: {failed}\tauthor: {set_id} \tform: {form_id}\n\n'
            )


def eval_performance_lbph_svm(MODE, VERBOSE=False):
    tr = 0  # correct predictions with confidence > 50%
    pre = 0
    conf = 0
    trueacc = np.zeros(101)   # confidence histogram of correct predictions
    falseacc = np.zeros(101)  # confidence histogram of wrong predictions
    allacc = np.zeros(101)    # confidence histogram of all predictions
    avgrtconv = 0  # summed confidence of confident-correct predictions
    predicted = -1
    cnt = 0  # confident (> 50%) but wrong predictions
    for i in range(2000):
        print("Iteration: ", i + 1)
        predicted, conf, pre = pipeline.pipe(feature='lbph',
                                             clf='svm',
                                             _mode=MODE,
                                             _verbose=VERBOSE)
        conf = int(100 * conf[predicted] + 0.5)  # confidence as a rounded percentage
        allacc[conf] += 1
        if pre == 1:
            trueacc[conf] += 1
        else:
            falseacc[conf] += 1
        if pre == 1 and conf > 50:
            tr += 1
            avgrtconv += conf
        if pre == 0 and conf > 50:
            cnt += 1
        print("correct: ", tr)
    print(tr / 20)  # confident-correct count as a percentage of the 2000 trials
    print(avgrtconv / 2000)  # summed confidence averaged over all trials
    print(cnt)  # confident but wrong predictions
    plt.plot(allacc)
    plt.show()
    plt.plot(trueacc)
    plt.show()
    plt.plot(falseacc)
    plt.show()
import evaluations


if __name__ == "__main__":
    # prepare_data.print_data_stat()
    # evaluations.eval_performance_lbph_svm(MODE, VERBOSE)
    # evaluations.eval_ada()
    # evaluations.eval_pca(VERBOSE)
    # evaluations.final_eval()

    VERBOSE = False
    MODE = 'deliver'
    data_path = 'data/'
    test_folders = sorted(os.listdir(data_path), key=int)

    with open('results.txt', 'w') as results_file, \
         open('time.txt', 'w') as time_file:
        for test_folder in test_folders:
            res, elapsed = pipeline.pipe(feature='cslbcop',
                                         clf='svm',
                                         _verbose=VERBOSE,
                                         _mode=MODE,
                                         test_folder=test_folder)
            results_file.write('{}\n'.format(res))
            results_file.flush()
            time_file.write('{:.2f}\n'.format(elapsed))
            time_file.flush()
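
# pipeline.pipe above is this project's own entry point; in 'deliver' mode the
# call expects it to return a (result, elapsed_seconds) pair. A hypothetical
# stub with that shape, useful for dry-running this script without the real
# feature extractors and classifiers:
import random
import time


def pipe_stub(feature, clf, _verbose, _mode, test_folder):
    # Stand-in for pipeline.pipe: pretend to classify one test folder and
    # report how long the "prediction" took.
    start = time.time()
    result = random.randint(1, 100)  # placeholder label for the predicted writer
    return result, time.time() - start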
Example #6
# Standard-library imports used by this example; fisher_exact is assumed to
# come from scipy. The remaining helpers (run, pipe, Panel, appris, the
# *_read_data functions, parse_colocated, chrom2int, and the BIOTYPE/REFSEQ/
# CHROM/QUAL/FILTERS/DELETE_NON_DIGIT constants) belong to the surrounding
# project and are not shown here.
import csv
import glob
import json
import math
import os
import sys
from itertools import chain

from scipy.stats import fisher_exact


def annotate_panel(vcf,
                   vep,
                   reference=None,
                   threads=None,
                   output="",
                   panel="",
                   buffer_size=None):
    if threads is None:
        threads = run(["getconf", "_NPROCESSORS_ONLN"]).stdout.strip()

    if not output:
        output = "."
    if os.path.isdir(output):
        output = os.path.join(
            output, "{}.annotation.tsv".format(
                vcf[:-4] if vcf.endswith(".vcf") else vcf))

    vepjson = "{}.vep.json".format(output[:-4])
    vep_options = [
        "--no_stats", "--dir", vep, "--format", "vcf", "--json", "--offline",
        "--everything", "--warning_file", "STDERR", "--force_overwrite"
    ]
    if reference is not None:
        reference = (glob.glob(f"{reference}/*.fna") +
                     glob.glob(f"{reference}/*.fa") +
                     glob.glob(f"{reference}/*.fasta") + [reference])[0]
        vep_options += ["--fasta", reference]
    if int(threads) > 1:
        vep_options += ["--fork", threads]
    if "refseq" in vep:
        vep_options += ["--refseq"]
    if buffer_size is not None:
        vep_options += ["--buffer_size", buffer_size]

    pipe(["vep", "-i", vcf, "-o", vepjson] + vep_options)

    get_read_data = None
    with open(vcf, "rt") as f:
        for row in f:
            if not row.startswith("#"):
                break
            if row.startswith("##source="):
                source = row[9:].strip()
                #if source == "strelka":
                if source.startswith("VarDict"):
                    get_read_data = vardict_read_data
                elif source == "VarScan2":
                    get_read_data = varscan2_read_data
                elif source == "Mutect2":
                    get_read_data = mutect2_read_data
            headings = row

    if get_read_data is None:
        sys.exit(f"Unsupported variant caller {source}")
    if len(headings.split("\t")) > 10:
        sys.exit("Multi-sample vcfs not suppored")

    targets = None
    principal = {}
    needed_genes = set()
    needed_transcripts = set()
    if panel:
        panel = Panel(panel)
        if "targets" in panel:
            targets = panel.targets

        if "names" in panel:
            for name in panel.names:
                name = name.split()
                needed_genes.add(name[0])
                if len(name) > 1:
                    needed_transcripts.add(name[1])
        if "principal" in panel.paths:
            principal = appris(panel.paths["principal"])

    if "refseq" in vep:

        def consequence_sort(cons):
            transcript, minor = cons["transcript_id"].split(".")
            prefix = transcript[:2]
            major = transcript[3:]
            return [
                transcript in needed_transcripts,
                cons["gene_symbol"] in needed_genes,
                BIOTYPE[cons["biotype"]],
                REFSEQ[prefix],
                -int(cons["gene_id"]),
                principal.get(transcript, 0),
                "canonical" in cons,
                -int(major),
                int(minor),
            ]

    else:  # ensembl transcripts

        def consequence_sort(cons):
            # Version numbers not in vep as of version 101, but who knows the future ...
            transcript = cons["transcript_id"]
            return [
                transcript in needed_transcripts,
                cons["gene_symbol"] in needed_genes,
                BIOTYPE[cons["biotype"]],
                -int(cons["gene_id"].translate(DELETE_NON_DIGIT)),
                principal.get(transcript, 0),
                "canonical" in cons,
                -int(transcript.translate(DELETE_NON_DIGIT)),
            ]

    annotations = []
    with open(vepjson) as f:
        for line in f:
            vep_output = json.loads(line)

            consequences = vep_output.get("transcript_consequences")
            if consequences:
                cons = sorted(consequences, key=consequence_sort)[-1]
                other_genes = set(c["gene_symbol"]
                                  for c in consequences) - set(
                                      [cons["gene_symbol"]])

            else:
                most_severe_consequence = vep_output["most_severe_consequence"]
                all_cons = chain(*[
                    v for k, v in vep_output.items()
                    if k.endswith("_consequences")
                ])
                for cons in sorted(all_cons,
                                   key=lambda x: x.get("biotype", ""),
                                   reverse=True):
                    # We only use biotype and impact, so it probably does not
                    # matter which consequence we pick as long as we are
                    # consistent.
                    if most_severe_consequence in cons["consequence_terms"]:
                        break
                other_genes = ()

            row = vep_output["input"].rstrip().split("\t")
            read_data = get_read_data(row)

            if read_data["alt_depth"] == "0":
                continue

            # https://gatk.broadinstitute.org/hc/en-us/articles/360035532152-Fisher-s-Exact-Test
            fisher_strand = -10 * math.log10(
                fisher_exact([read_data["ref_fr"], read_data["alt_fr"]])[1])

            demographics = parse_colocated(vep_output)

            annotations.append([
                cons.get("gene_symbol", ""),
                cons.get("transcript_id", ""),
                row[CHROM],
                vep_output["start"],
                vep_output["allele_string"],
                row[QUAL],
                row[FILTERS],
                read_data["vaf"],
                read_data["depth"],
                read_data["alt_depth"],
                "{}:{}".format(*read_data["alt_fr"]),
                "{}:{}".format(*read_data["ref_fr"]),
                "{:.1f}".format(fisher_strand),
                cons.get("hgvsc", ""),
                cons.get("hgvsp", ""),
                cons.get("biotype", ""),
                cons.get("impact", ""),
                ", ".join(demographics.get("clin_sig", ())),
                ", ".join(cons.get("consequence_terms", ())),
                cons.get("sift_prediction", ""),
                cons.get("polyphen_prediction", ""),
                "{:.10f}".format(demographics["maf"]) if "maf" in demographics else "",
                ", ".join(sorted(other_genes)),
                ", ".join(demographics.get("dbsnp", ())),
                ", ".join(demographics.get("hgmd", ())),
                ", ".join(demographics.get("cosmic", ())),
                ", ".join(demographics.get("pubmed", ())),
            ])

    os.unlink(vepjson)
    annotations.sort(key=lambda r: (chrom2int(r[2]), r[2], int(r[3]), r[4]))
    with open(output, "wt") as f:
        writer = csv.writer(f, delimiter="\t")
        writer.writerow([
            "Gene", "Transcript", "Chrom", "Pos", "Change", "Quality",
            "Filters", "VAF", "Depth", "Alt Depth", "Alt Depth F:R",
            "Ref Depth F:R", "FisherStrand", "HGVSc", "HGVSp", "Biotype",
            "Impact", "Clinical Significance (Pubmed)", "Consequences", "Sift",
            "Polyphen", "MAF", "Other Genes", "dbSNP", "HGMD", "COSMIC",
            "Pubmed"
        ])
        writer.writerows(annotations)
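
# The run() and pipe() helpers called above are project-specific and not shown
# in this example. A minimal sketch of what they might look like, assuming
# they wrap subprocess with captured text output and checked exit codes:
import subprocess


def run(cmd):
    # Run a command, capture stdout/stderr as text, raise on non-zero exit.
    return subprocess.run(cmd, capture_output=True, text=True, check=True)


def pipe(cmd):
    # Run a command, letting its output stream through, raise on non-zero exit.
    subprocess.run(cmd, check=True)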
Example #7
from flask import Flask, render_template, request, jsonify
import os, sys
# Python 2-only workaround to force UTF-8 as the default encoding:
# reload() re-exposes sys.setdefaultencoding, which site.py removes at startup.
reload(sys)
sys.setdefaultencoding('UTF8')
sys.path.append('..')
import pipeline as pp
import nltk
from utils.entity import EntityType

# Initialize the Flask application
app = Flask(__name__)
app.config['JSONIFY_PRETTYPRINT_REGULAR'] = False

pipe = pp.pipe()


@app.route('/')
def index():
    return render_template('index.html')


@app.route('/_tag_sentence')
def tag_sentence():
    a = request.args.get('input', '')  # default to '' so split() below cannot crash

    if len(a.split(" ")) < 2:
        return {}

    a = " ".join(nltk.word_tokenize(a))
    bilstm_red = pipe.tag(a)
    # The example is truncated here; presumably the tagged result is returned
    # as JSON (hypothetical response shape):
    return jsonify(result=bilstm_red)
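
# A quick client-side check of the endpoint above, assuming the app runs on
# localhost:5000; the response shape matches the hypothetical jsonify() call:
#
#   import requests
#   resp = requests.get('http://localhost:5000/_tag_sentence',
#                       params={'input': 'John lives in Berlin'})
#   print(resp.json())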
Example #8
    def analyze(self):
        # Update application copy of input/output dir (in case user didn't use buttons)
        self.inputDir = self.lineEdit_inputDir.text()
        self.outputDir = self.lineEdit_outputDir.text()
        # Get all user options
        opts = {
            'input_dir': self.inputDir,
            'output_dir': self.outputDir,
            'bool_analyze_full': self.checkBox_setTime.checkState(),
            'start_time': utils.toInt(self.lineEdit_startTime.text()),
            'end_time': utils.toInt(self.lineEdit_endTime.text()),
            'bool_removeClipped': self.checkBox_removeClipped.checkState(),
            'bool_deleteClips': self.checkBox_deleteClips.checkState(),
            'bool_makeSpectrograms': self.checkBox_spectrograms.checkState(),
            'bool_parseFilename': self.checkBox_parseFilename.checkState(),
            'filename_delimiter': self.lineEdit_filenameDelimiter.text(),
            'filename_categories': self.lineEdit_filenameCategories.text().split(','),
            'bool_useAva': self.checkBox_useAva.checkState(),
            'avaFoldername': self.lineEdit_avaFolder.text(),
            'bool_inferSilence': self.checkBox_useSilenceFile.checkState(),
            'silenceFilename': self.lineEdit_silenceFile.text(),
            'silenceThreshold': self.spinBox_silenceThresh.value(),
            'silenceBuffer': self.spinBox_bufferLen.value(),
            'silenceMinLen': self.spinBox_minSilenceLen.value(),
        }

        try:
            progress = QtWidgets.QProgressDialog(
                "Processing files...", "Abort", 0,
                len(os.listdir(opts['input_dir'])))
            progress.setWindowModality(QtCore.Qt.WindowModal)
            self.vocBatch = pipeline.pipe(opts)
        except (exceptions.TimeBoundError, exceptions.FilenameParseError,
                exceptions.NoInputError) as error:
            error_dialog = QtWidgets.QErrorMessage()
            error_dialog.setWindowModality(QtCore.Qt.WindowModal)
            error_dialog.showMessage(error.error_string())
            error_dialog.exec_()
            return

        self.loadOutputTables(self.lineEdit_outputDir.text())
        if opts['bool_makeSpectrograms']:
            self.loadSpectrograms(opts['output_dir'])
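
# The exceptions module used in analyze() is project-specific and not shown;
# the error.error_string() calls suggest custom classes along these lines
# (a sketch with a hypothetical shared base class):
class PipelineError(Exception):
    def error_string(self):
        # Human-readable message for the Qt error dialog.
        return str(self)


class TimeBoundError(PipelineError):
    pass


class FilenameParseError(PipelineError):
    pass


class NoInputError(PipelineError):
    pass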