示例#1
0
def test_model_arrays(savefile, filename, P, T, **kwargs):
    """Evaluate a pickled committee on ``P`` and write its outputs to a file.

    The committee is unpickled from ``savefile`` and its ``risk_eval`` method
    is called once per row of ``P``. The results are written to
    ``test_<savefile stem>_<filename stem>.cvs`` in the current directory.
    When targets are supplied, the C-index is printed, a Kaplan-Meier plot is
    produced through ``kaplanmeier`` and (if ``plt`` is available) saved as
    ``kaplanmeier_<savefile stem>_<filename stem>.eps``.

    Parameters:
        savefile: path of the pickled committee.
        filename: path of the data file; only its base name is used, to build
            the output file names.
        P: iterable of input rows passed one by one to ``risk_eval``.
        T: target array indexed as ``T[:, 0]`` (passed as ``time_array``) and
            ``T[:, 1]`` (passed as ``event_array``), or None/empty when no
            targets are available.
        **kwargs: forwarded unchanged to ``kaplanmeier``.

    Returns:
        The outputs array when ``T`` is None or empty; otherwise the name of
        the written output file. (The asymmetric return is pre-existing and
        is kept for backward compatibility.)
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted save files.
    # Binary mode is required by pickle on Python 3 and is harmless on Python 2.
    with open(savefile, 'rb') as FILE:
        master_com = pickle.load(FILE)

    print("Committee size: {0}".format(len(master_com)))

    save_stem = os.path.splitext(os.path.basename(savefile))[0]
    data_stem = os.path.splitext(os.path.basename(filename))[0]
    # NOTE(review): ".cvs" looks like a typo for ".csv"; the name is kept
    # because existing tooling may rely on it.
    output_file = 'test_{0}_{1}.cvs'.format(save_stem, data_stem)

    # Need double brackets for dimensions to be right for numpy
    outputs = numpy.array([[master_com.risk_eval(inputs)] for inputs in P])

    if T is None or len(T) == 0:
        # Without targets only the raw outputs can be reported.
        with open(output_file, 'w') as F:
            F.write("Outputs\n")
            F.writelines("{0}\n".format(o[0]) for o in outputs)
        return outputs

    c_index = get_C_index(T, outputs)
    print("C-Index: {0}".format(c_index))

    if plt:
        # Calculate suitable size for the figure for use in LaTeX
        fig_width_pt = 396.0  # Get this from LaTeX using \showthe\columnwidth
        inches_per_pt = 1.0 / 72.27  # Convert pt to inch
        golden_mean = (sqrt(5) - 1.0) / 2.0  # Aesthetic ratio
        fig_width = fig_width_pt * inches_per_pt  # width in inches
        fig_height = fig_width * golden_mean  # height in inches
        plt.rcParams['figure.figsize'] = [fig_width, fig_height]

    kaplanmeier(time_array=T[:, 0],
                event_array=T[:, 1],
                output_array=outputs,
                threshold=None,
                show_plot=False,
                bestcut=False,
                **kwargs)

    if plt:
        plt.savefig('kaplanmeier_{0}_{1}.eps'.format(save_stem, data_stem))

    with open(output_file, 'w') as F:
        F.write("Targets,Outputs,Events\n")
        F.writelines("{0},{1},{2}\n".format(t[0], o[0], t[1])
                     for t, o in zip(T, outputs))

    return output_file
def experiment(net, P, T, vP, vT, filename, epochs, learning_rate):
    """Train ``net`` with ``traingd`` and the Cox error, then report and plot.

    Logs the run parameters, prints how many rows of ``T`` have the event
    column (``T[:, 1]``) set versus unset, trains the network, logs the
    C-index on the training data and draws Kaplan-Meier plots for the
    training data and, when present, the validation data.

    Parameters:
        net: network object; must provide ``sim``.
        P, T: training inputs and targets (``T[:, 0]`` is passed as the time
            array and ``T[:, 1]`` as the event array).
        vP, vT: validation inputs and targets; ``vP`` may be None or empty.
        filename: label used in the log message (must be a string).
        epochs: number of training epochs passed to ``traingd``.
        learning_rate: learning rate passed to ``traingd``.

    Returns:
        The trained network, or the incoming one if training raised
        FloatingPointError.
    """
    logger.info("Running experiment for: " + filename + ' ' + str(epochs) + ", rate: " + str(learning_rate))

    event_flags = T[:, 1]
    print("Number of patients with events: " + str(event_flags.sum()))
    print("Number of censored patients: " + str((1 - event_flags).sum()))

    # The result is not used below; the call is kept as in the original flow.
    generate_timeslots(T)

    training_data = (P, T)
    validation_data = (vP, vT)
    try:
        net = traingd(net, training_data, validation_data, epochs, learning_rate,
                      block_size=100, error_module=cox_error)
    except FloatingPointError:
        # Training is best-effort: continue with whatever network we have.
        print('Aaawww....')

    train_outputs = net.sim(P)
    logger.info("C index = " + str(get_C_index(T, train_outputs)))

    kaplanmeier(time_array=T[:, 0], event_array=event_flags, output_array=train_outputs[:, 0])

    has_validation = vP is not None and len(vP) > 0
    if has_validation:
        validation_outputs = net.sim(vP)
        kaplanmeier(time_array=vT[:, 0], event_array=vT[:, 1], output_array=validation_outputs[:, 0])

    return net
示例#3
0
def survival_stat(filename, thresholds = None):
    """Print the C-index for a comma-separated data file and plot Kaplan-Meier curves.

    The file is read with ``read_data_file`` and parsed with ``parse_data``
    (input columns 2-10, first row ignored, no normalization). Columns 2 and 3
    of the parsed data are used as targets and columns -1 and 3 as outputs
    for ``get_C_index``.

    Parameters:
        filename: path of the data file.
        thresholds: forwarded to ``kaplanmeier`` as ``threshold``.
    """
    raw_rows = read_data_file(filename, ",")
    data = np.array(raw_rows)
    D, _ = parse_data(data,
                      inputcols=(2, 3, 4, 5, 6, 7, 8, 9, 10),
                      ignorerows=[0],
                      normalize=False)

    target_columns = (2, 3)
    output_columns = (-1, 3)
    C = get_C_index(D[:, target_columns], D[:, output_columns])

    print("C-index: " + str(C))
    # The reciprocal of the C-index is what this file reports as "genetic error".
    print("Genetic error: " + str(1 / C))

    split_threshold = kaplanmeier(D, 2, 3, -1, threshold=thresholds)
    print("Threshold dividing the set in two equal pieces: " + str(split_threshold))

    if not plt:
        return
    plt.show()
def committee_test():
    """Interactively train a feed-forward committee and report its C-indices.

    Prompts for network size, committee size, evolutionary parameters, input
    columns, censoring cutoff, held-out test fraction and number of
    generations; pressing ENTER selects the default shown in each prompt.
    The committee is trained with ``train_committee``/``train_evolutionary``
    on the training split only, C-indices are printed, Kaplan-Meier plots are
    shown when ``plt`` is available, and the committee risk output for the
    whole data file can optionally be printed.

    Prompts are read as raw text and parsed with ``ast.literal_eval`` so the
    function behaves the same on Python 2 and Python 3 and never evaluates
    arbitrary expressions typed by the user.
    """
    import ast

    try:
        read_line = raw_input  # Python 2: raw text instead of eval()-ing input
    except NameError:
        read_line = input  # Python 3

    def _ask(prompt, default):
        """Return the Python literal typed at *prompt*, or *default* on an empty line."""
        text = read_line(prompt).strip()
        if not text:
            return default
        try:
            return ast.literal_eval(text)
        except (ValueError, SyntaxError):
            print("Could not parse {0!r}; using default {1!r}".format(text, default))
            return default

    def _ask_text(prompt, default):
        """Return the text typed at *prompt* (surrounding quotes removed), or *default*."""
        text = read_line(prompt).strip().strip("'\"")
        return text if text else default

    netsize = _ask('Number of hidden nodes? [1]: ', 1)
    comsize = _ask('Committee size? [1]: ', 1)
    pop_size = _ask('Population size? [100]: ', 100)
    mutation_rate = _ask('Please input a mutation rate (0.05): ', 0.05)

    filename = "/home/gibson/jonask/Dropbox/Ann-Survival-Phd/Two_thirds_of_SA_1889_dataset.txt"

    columns = _ask("Which columns to include? (Do NOT forget trailing comma if only one column is used, e.g. '3,'\nAvailable columns are: 2, -4, -3, -2, -1. Just press ENTER for all columns.\n",
                   (2, -4, -3, -2, -1))

    P, T = parse_file(filename, targetcols=[4, 5], inputcols=columns, ignorerows=[0], normalize=True)

    # Remove tail censored
    cutoff = _ask('Cutoff for censored data? [9999 years]: ', 9999)
    P, T = copy_without_censored(P, T, cutoff)

    # Divide into training and held-out test sets
    try:
        test_size = float(_ask('Size of test set (not used in training)? Input in fractions. Default is [0.0]: ', 0.0))
    except (TypeError, ValueError):
        test_size = 0.0
    ((TP, TT), (VP, VT)) = get_validation_set(P, T, validation_size=test_size, binary_column=1)
    print("Length of training set: " + str(len(TP)))
    print("Length of test set: " + str(len(VP)))

    epochs = _ask("\nNumber of generations (1): ", 1)

    com = build_feedforward_committee(comsize, len(P[0]), netsize, 1, output_function='linear')

    # Train on the training split only, so the held-out test set really is
    # "not used in training" as the prompt promises.
    # 1 is the column in the target array which holds the binary censoring information
    test_errors, vald_errors, data_sets = train_committee(
        com, train_evolutionary, TP, TT, 1, epochs,
        error_function=c_index_error, population_size=pop_size, mutation_chance=mutation_rate)

    # First 0 gives training sets, second 0 gives inputs.
    com.set_training_sets([data_set[0][0] for data_set in data_sets])

    print('\nTest C_indices, Validation C_indices:')
    for terr, verr in zip(test_errors.values(), vald_errors.values()):
        print(str(1 / terr) + ", " + str(1 / verr))

    if plt:
        # Need double brackets for dimensions to be right for numpy
        outputs = numpy.array([[com.risk_eval(inputs)] for inputs in TP])
        kaplanmeier(time_array=TT[:, 0], event_array=TT[:, 1], output_array=outputs[:, 0], threshold=0.5)
        train_c_index = get_C_index(TT, outputs)
        print("\nC-index on the training set: " + str(train_c_index))
        if len(VP) > 0:
            outputs = numpy.array([[com.risk_eval(inputs)] for inputs in VP])
            test_c_index = get_C_index(VT, outputs)
            kaplanmeier(time_array=VT[:, 0], event_array=VT[:, 1], output_array=outputs[:, 0], threshold=0.5)
            print("C-index on the test set: " + str(test_c_index))

        plt.show()

    answer = _ask_text("\nDo you wish to print committee risk output? ['n']: ", 'n')
    if answer in ('n', 'no'):
        return

    inputs = read_data_file(filename)
    P, T = parse_file(filename, targetcols=[4, 5], inputcols=columns, ignorerows=[0], normalize=True)
    outputs = [[com.risk_eval(patient)] for patient in P]

    # Rows that were not parsed as patients get a column title instead, so
    # the outputs line up with the raw input rows from the top.
    missing = len(inputs) - len(outputs)
    if missing > 0:
        outputs = [["net_output"] for _ in range(missing)] + outputs

    print("\n")
    for raw_cols, out_cols in zip(inputs, outputs):
        line = "".join(str(col) + ',' for col in raw_cols)
        line += "".join(str(col) for col in out_cols)
        print(line)
def plotKM(targets, outputs, cut):
    """Draw a Kaplan-Meier plot for ``outputs`` split at ``cut`` without showing it.

    ``targets[:, 0]`` is passed to ``kaplanmeier`` as the time array and
    ``targets[:, 1]`` as the event array.
    """
    times = targets[:, 0]
    events = targets[:, 1]
    kaplanmeier(time_array=times, event_array=events,
                output_array=outputs, threshold=cut, show_plot=False)
def test_model_arrays(savefile, filename, P, T, **kwargs):
    """Evaluate a pickled committee on ``P`` and write its outputs to a file.

    The committee is unpickled from ``savefile`` and its ``risk_eval`` method
    is called once per row of ``P``. The results are written to
    ``test_<savefile stem>_<filename stem>.cvs`` in the current directory.
    When targets are supplied, the C-index is printed, a Kaplan-Meier plot is
    produced through ``kaplanmeier`` and (if ``plt`` is available) saved as
    ``kaplanmeier_<savefile stem>_<filename stem>.eps``.

    Parameters:
        savefile: path of the pickled committee.
        filename: path of the data file; only its base name is used, to build
            the output file names.
        P: iterable of input rows passed one by one to ``risk_eval``.
        T: target array indexed as ``T[:, 0]`` (passed as ``time_array``) and
            ``T[:, 1]`` (passed as ``event_array``), or None/empty when no
            targets are available.
        **kwargs: forwarded unchanged to ``kaplanmeier``.

    Returns:
        The outputs array when ``T`` is None or empty; otherwise the name of
        the written output file. (The asymmetric return is pre-existing and
        is kept for backward compatibility.)
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted save files.
    # Binary mode is required by pickle on Python 3 and is harmless on Python 2.
    with open(savefile, "rb") as FILE:
        master_com = pickle.load(FILE)

    print("Committee size: {0}".format(len(master_com)))

    save_stem = os.path.splitext(os.path.basename(savefile))[0]
    data_stem = os.path.splitext(os.path.basename(filename))[0]
    # NOTE(review): ".cvs" looks like a typo for ".csv"; the name is kept
    # because existing tooling may rely on it.
    output_file = "test_{0}_{1}.cvs".format(save_stem, data_stem)

    # Need double brackets for dimensions to be right for numpy
    outputs = numpy.array([[master_com.risk_eval(inputs)] for inputs in P])

    if T is None or len(T) == 0:
        # Without targets only the raw outputs can be reported.
        with open(output_file, "w") as F:
            F.write("Outputs\n")
            F.writelines("{0}\n".format(o[0]) for o in outputs)
        return outputs

    c_index = get_C_index(T, outputs)
    print("C-Index: {0}".format(c_index))

    if plt:
        # Calculate suitable size for the figure for use in LaTeX
        fig_width_pt = 396.0  # Get this from LaTeX using \showthe\columnwidth
        inches_per_pt = 1.0 / 72.27  # Convert pt to inch
        golden_mean = (sqrt(5) - 1.0) / 2.0  # Aesthetic ratio
        fig_width = fig_width_pt * inches_per_pt  # width in inches
        fig_height = fig_width * golden_mean  # height in inches
        plt.rcParams["figure.figsize"] = [fig_width, fig_height]

    kaplanmeier(
        time_array=T[:, 0],
        event_array=T[:, 1],
        output_array=outputs,
        threshold=None,
        show_plot=False,
        bestcut=False,
        **kwargs
    )

    if plt:
        plt.savefig("kaplanmeier_{0}_{1}.eps".format(save_stem, data_stem))

    with open(output_file, "w") as F:
        F.write("Targets,Outputs,Events\n")
        F.writelines(
            "{0},{1},{2}\n".format(t[0], o[0], t[1]) for t, o in zip(T, outputs)
        )

    return output_file
示例#7
0
def plotKM(targets, outputs, cut):
    """Draw a Kaplan-Meier plot for ``outputs`` split at ``cut`` without showing it.

    Column 0 of ``targets`` is passed to ``kaplanmeier`` as the time array
    and column 1 as the event array.
    """
    plot_args = {
        "time_array": targets[:, 0],
        "event_array": targets[:, 1],
        "output_array": outputs,
        "threshold": cut,
        "show_plot": False,
    }
    kaplanmeier(**plot_args)
示例#8
0
def train_single():
    """Interactively train a single feed-forward network on one study.

    Prompts for network size, evolutionary parameters, which study file to
    use, input columns, censoring cutoff, the number of cross-validation
    pieces and the number of generations; pressing ENTER selects the default
    shown in each prompt. For every cross-validation split the network is
    trained through ``test``, Kaplan-Meier plots are shown when ``plt`` is
    available, and the network output can optionally be written to a file
    via ``print_output``.

    Prompts are read as raw text and parsed with ``ast.literal_eval`` so the
    function behaves the same on Python 2 and Python 3 and never evaluates
    arbitrary expressions typed by the user.
    """
    import ast

    try:
        read_line = raw_input  # Python 2: raw text instead of eval()-ing input
    except NameError:
        read_line = input  # Python 3

    def _ask(prompt, default):
        """Return the Python literal typed at *prompt*, or *default* on an empty line."""
        text = read_line(prompt).strip()
        if not text:
            return default
        try:
            return ast.literal_eval(text)
        except (ValueError, SyntaxError):
            print("Could not parse {0!r}; using default {1!r}".format(text, default))
            return default

    def _ask_text(prompt, default):
        """Return the text typed at *prompt* (surrounding quotes removed), or *default*."""
        text = read_line(prompt).strip().strip("'\"")
        return text if text else default

    netsize = _ask('Number of hidden nodes? [1]: ', 1)
    pop_size = _ask('Population size? [100]: ', 100)
    mutation_rate = _ask('Please input a mutation rate (0.05): ', 0.05)

    SB22 = "/home/gibson/jonask/Dropbox/Ann-Survival-Phd/Two_thirds_of_SA_1889_dataset_SB22.txt"
    Benmargskohorten = "/home/gibson/jonask/Dropbox/Ann-Survival-Phd/Two_thirds_of_SA_1889_dataset_Benmargskohorten.txt"
    SB91b = "/home/gibson/jonask/Dropbox/Ann-Survival-Phd/Two_thirds_of_SA_1889_dataset_SB91b.txt"
    all_studies = "/home/gibson/jonask/Dropbox/Ann-Survival-Phd/Two_thirds_of_SA_1889_dataset.txt"

    # Real data
    print("Studies to choose from:")
    print("1: SB22")
    print("2: Benmargskohorten")
    print("3: SB91b")
    print("0: All combined (default)")

    study = _ask("Which study to train on? [0]: ", 0)
    if study == 1:
        filename = SB22
    elif study == 2:
        filename = Benmargskohorten
    elif study == 3:
        filename = SB91b
    else:
        # Anything else, including the default 0, selects the combined data set.
        filename = all_studies

    columns = _ask("Which columns to include? (Do NOT forget trailing comma if only one column is used, e.g. '3,'\nAvailable columns are: 2, -4, -3, -2, -1. Just press ENTER for all columns.\n",
                   (2, -4, -3, -2, -1))
    P, T = parse_file(filename, targetcols=[4, 5], inputcols=columns, ignorerows=[0], normalize=True)

    # Remove tail censored
    cutoff = _ask('Cutoff for censored data? [9999 years]: ', 9999)
    P, T = copy_without_censored(P, T, cutoff)

    # Divide into cross-validation sets
    pieces = _ask('Size of validation set? Input denominator (1 for no validation set). Default is 1/[1] parts: ', 1)
    TandV = get_cross_validation_sets(P, T, pieces, binary_column=1)

    # Network part
    p = len(P[0])  # number of input covariates
    net = build_feedforward(p, netsize, 1, output_function='linear')

    epochs = _ask("Number of generations (1): ", 1)

    for ((tP, tT), (vP, vT)) in TandV:
        # Train; the same network object is carried over between splits.
        net = test(net, tP, tT, vP, vT, filename, epochs, population_size=pop_size, mutation_rate=mutation_rate)

        if plt:
            outputs = net.sim(tP)
            threshold = kaplanmeier(time_array=tT[:, 0], event_array=tT[:, 1], output_array=outputs[:, 0])
            if len(vP) > 0:
                # Reuse the training threshold so both plots split at the same value.
                outputs = net.sim(vP)
                kaplanmeier(time_array=vT[:, 0], event_array=vT[:, 1], output_array=outputs[:, 0], threshold=threshold)
            print("\nThreshold dividing the training set in two equal pieces: " + str(threshold))

            read_line("\nPress enter to show plots...")
            plt.show()

        answer = _ask_text("Do you wish to print network output? Enter filename, or 'no' / 'n'. ['n']: ", 'n')
        # Decide on "no" before touching the file system, so an existing file
        # that happens to be called "n" or "no" is not mistaken for a target.
        if answer in ('n', 'no'):
            continue
        if os.path.exists(answer):
            print("File exists. Will add random number to front")
            # Prefix only the file name so a directory part of the path stays valid.
            directory, basename = os.path.split(answer)
            answer = os.path.join(directory, str(random.randint(0, 123456)) + basename)
        print_output(answer, net, filename, targetcols=[4, 5], inputcols=columns, ignorerows=[0], normalize=True)