示例#1
0
def Experiment2():
    """Run the pipeline that uses ``func.e2_get_count`` and writes to ``wordlength/``.

    Steps, in order:
      1. Load the four training partitions and the test set from
         'Data/hns_2018_2019.csv' via ``func.read_data``.
      2. Call ``func.e2_get_count`` once per training partition.
      3. Derive one weight per class as that class's total word count divided
         by the sum of all four totals (``Decimal`` arithmetic).
      4. Merge and save the vocabulary and removed-word lists.
      5. Call ``func.create_model`` and then ``func.test_model``.

    Returns:
        None.
    """
    story_rows, ask_rows, show_rows, poll_rows, test_set = \
        func.read_data('Data/hns_2018_2019.csv')

    # Per-class results, always in the order: story, ask_hn, show_hn, poll.
    counts, vocabularies, removed_lists, totals = [], [], [], []
    for rows in (story_rows, ask_rows, show_rows, poll_rows):
        count, vocabulary, removed_words, word_total = func.e2_get_count(rows)
        counts.append(count)
        vocabularies.append(vocabulary)
        removed_lists.append(removed_words)
        totals.append(word_total)

    # Class weights are based on word totals, exactly as the totals are reported
    # by e2_get_count.
    grand_total = totals[0] + totals[1] + totals[2] + totals[3]
    class_weights = [Decimal(word_total) / Decimal(grand_total) for word_total in totals]

    vocabulary_list = func.get_vocabulary_list(*vocabularies)
    remove_list = func.get_remove_list(*removed_lists)

    func.save_file('wordlength/wordlength_vocabulary.txt', vocabulary_list)
    func.save_file('wordlength/wordlength_remove.txt', remove_list)

    # create_model takes the merged vocabulary, then (count, total, vocabulary)
    # for each class in turn, then the output path.
    model_args = [vocabulary_list]
    for count, word_total, vocabulary in zip(counts, totals, vocabularies):
        model_args.extend((count, word_total, vocabulary))
    model_args.append('wordlength/wordlength-model.txt')
    p_story, p_ask_hn, p_show_hn, p_poll = func.create_model(*model_args)

    func.test_model(vocabulary_list, test_set, p_story, p_ask_hn, p_show_hn, p_poll,
                    class_weights[0], class_weights[1], class_weights[2], class_weights[3],
                    "wordlength/wordlength-result.txt")
示例#2
0
def main():
    """Run the pipeline that uses ``func.get_count`` and writes to ``baseline/``.

    Loads the partitions from the module-level ``data`` source, gathers
    per-class results with ``func.get_count``, saves the merged vocabulary and
    removed-word lists, then calls ``func.create_model`` and
    ``func.test_model``.

    Returns:
        None.
    """
    story_rows, ask_rows, show_rows, poll_rows, test_set = func.read_data(data)

    # One entry per class, in the fixed order: story, ask_hn, show_hn, poll.
    counts = []
    vocabularies = []
    removed_lists = []
    totals = []
    for rows in (story_rows, ask_rows, show_rows, poll_rows):
        count, vocabulary, removed_words, word_total = func.get_count(rows)
        counts.append(count)
        vocabularies.append(vocabulary)
        removed_lists.append(removed_words)
        totals.append(word_total)

    # Weight of each class = its word total over the sum of all word totals.
    grand_total = totals[0] + totals[1] + totals[2] + totals[3]
    class_weights = [Decimal(word_total) / Decimal(grand_total) for word_total in totals]

    vocabulary_list = func.get_vocabulary_list(*vocabularies)
    remove_list = func.get_remove_list(*removed_lists)

    func.save_file('baseline/vocabulary.txt', vocabulary_list)
    func.save_file('baseline/remove.txt', remove_list)

    # Positional layout expected by create_model: merged vocabulary, then
    # (count, total, vocabulary) per class, then the model output path.
    model_args = [vocabulary_list]
    for count, word_total, vocabulary in zip(counts, totals, vocabularies):
        model_args.extend((count, word_total, vocabulary))
    model_args.append('baseline/model-2018.txt')
    p_story, p_ask_hn, p_show_hn, p_poll = func.create_model(*model_args)

    func.test_model(vocabulary_list, test_set, p_story, p_ask_hn, p_show_hn, p_poll,
                    class_weights[0], class_weights[1], class_weights[2], class_weights[3],
                    "baseline/baselineresult.txt")
示例#3
0
def Experiment1():
    """Run the pipeline that uses ``func.e1_get_count`` and writes to ``stopword/``.

    Loads the partitions from the module-level ``data`` source and the stop
    words from 'Data/stopwords.txt', passes both to ``func.e1_get_count`` for
    each training partition, saves the merged vocabulary and removed-word
    lists, then calls ``func.create_model`` and ``func.test_model``.

    Returns:
        None.
    """
    story_rows, ask_rows, show_rows, poll_rows, test_set = func.read_data(data)
    stop_words_list = func.read_stop_words('Data/stopwords.txt')

    # One entry per class, in the fixed order: story, ask_hn, show_hn, poll.
    counts, vocabularies, removed_lists, totals = [], [], [], []
    for rows in (story_rows, ask_rows, show_rows, poll_rows):
        count, vocabulary, removed_words, word_total = \
            func.e1_get_count(rows, stop_words_list)
        counts.append(count)
        vocabularies.append(vocabulary)
        removed_lists.append(removed_words)
        totals.append(word_total)

    # Weight of each class = its word total over the sum of all word totals.
    grand_total = totals[0] + totals[1] + totals[2] + totals[3]
    class_weights = [Decimal(word_total) / Decimal(grand_total) for word_total in totals]

    vocabulary_list = func.get_vocabulary_list(*vocabularies)
    remove_list = func.get_remove_list(*removed_lists)

    func.save_file('stopword/stopword_vocabulary.txt', vocabulary_list)
    func.save_file('stopword/stopword_remove.txt', remove_list)

    # Positional layout expected by create_model: merged vocabulary, then
    # (count, total, vocabulary) per class, then the model output path.
    model_args = [vocabulary_list]
    for count, word_total, vocabulary in zip(counts, totals, vocabularies):
        model_args.extend((count, word_total, vocabulary))
    model_args.append('stopword/stopword-model.txt')
    p_story, p_ask_hn, p_show_hn, p_poll = func.create_model(*model_args)

    func.test_model(vocabulary_list, test_set, p_story, p_ask_hn, p_show_hn, p_poll,
                    class_weights[0], class_weights[1], class_weights[2], class_weights[3],
                    "stopword/stopword-result.txt")
示例#4
0
# Script fragment: select the catalogue entries matching one THING_ID given on
# the command line, load the corresponding data objects, fit their continua and
# keep only the entries whose fit was not flagged as bad.
# NOTE(review): uses the Python 2 `print` statement below.

# Convert the fiber IDs to integers before they are used for selection/lookup.
FIBERID = FIBERID.astype(int)

# Create shorter arrays for other variables, corresponding to specific THID
# The THING_ID now comes from the second command-line argument; the
# interactive prompt below is kept only as a commented-out alternative.
#T = input("Please enter THING_ID for {}: ".format(DR))
T = int(sys.argv[2])
# Selection mask: entries whose THID equals T (assumes THID supports
# element-wise comparison, e.g. an array -- TODO confirm).
w = (THID == T)
thid = THID[w]
plate = PLATE[w]
mjd = MJD[w]
fiberid = FIBERID[w]
ra = RA[w]
dec = DEC[w]
z = Z[w]

# Read data from relevant spec files
# `order` and `mode` are defined outside this fragment.
data = read_data(DR, thid, ra, dec, z, plate, mjd, fiberid, order, mode)


# Fit the continua
# Entries whose `bad_cont` is not None after cont_fit() are reported and
# dropped; all others are kept in their original order.
newdata = []
for d in data:
    d.cont_fit()
    if d.bad_cont is not None:
        # The message reports the requested THING_ID, not the individual entry.
        print 'bad cont: ' + str(T)
        continue
    else:
        newdata.append(d)
# Replace `data` with only the surviving entries (N is presumably the numpy
# module alias -- TODO confirm).
data = N.asarray(newdata)

# Define g in order to retrieve thid, etc for stats, continuum fitting and plot
# NOTE(review): this indexing fails if every entry was dropped above and
# `data` is empty -- confirm whether that case can occur.
g = data[0]
示例#5
0
def Experiment3_2():
    """Plot the merged vocabulary size for thresholds 0.05, 0.1, 0.15, 0.2, 0.25.

    For each threshold, ``func.e3_2_get_count`` is called on every training
    partition (story, ask_hn, show_hn, poll, in that order). The four returned
    vocabulary lists are merged with ``func.get_vocabulary_list`` and the
    length of the merged list is recorded. The sizes are then drawn as red
    dots against the thresholds and the figure is shown.

    The upper y-axis limit is derived from all five recorded sizes. Previously
    only the first three were considered, so a larger value at 0.2 or 0.25
    would have been clipped out of the plot.

    Returns:
        None. The only visible effect is the matplotlib figure.
    """
    # part 2
    thresholds = [0.05, 0.1, 0.15, 0.2, 0.25]

    # The test set is returned by read_data but is not needed for this plot.
    training_data_story, training_data_ask_hn, training_data_show_hn, training_data_poll, _ = \
        func.read_data(data)
    training_sets = (training_data_story, training_data_ask_hn,
                     training_data_show_hn, training_data_poll)

    vocabulary_sizes = []
    for threshold in thresholds:
        per_class_vocabularies = []
        for training_set in training_sets:
            # Only the vocabulary list (second item) matters here; counts,
            # removed words and word totals are discarded.
            _, vocabulary, _, _ = func.e3_2_get_count(training_set, threshold)
            per_class_vocabularies.append(vocabulary)
        merged_vocabulary = func.get_vocabulary_list(*per_class_vocabularies)
        vocabulary_sizes.append(len(merged_vocabulary))

    plt.plot(thresholds, vocabulary_sizes, 'ro')
    # Leave 50 units of headroom above the largest of ALL plotted sizes.
    plt.axis([0, 1, 0, max(vocabulary_sizes) + 50])
    plt.ylabel('number of vocabulary left')
    plt.xlabel('threshold')
    plt.title("2")
    plt.show()
示例#6
0
def Experiment3_1():
    """Plot the merged vocabulary size for thresholds 1, 5, 10 and 15.

    For each threshold, ``func.e3_1_get_count`` is called on every training
    partition (story, ask_hn, show_hn, poll, in that order). The four returned
    vocabulary lists are merged with ``func.get_vocabulary_list`` and the
    length of the merged list is recorded. The sizes are drawn as red dots
    against the thresholds and the figure is shown.

    Returns:
        None. The only visible effect is the matplotlib figure.
    """
    # part 1
    thresholds = [1, 5, 10, 15]

    story_rows, ask_rows, show_rows, poll_rows, test_set = func.read_data(data)
    partitions = (story_rows, ask_rows, show_rows, poll_rows)

    sizes = []
    for threshold in thresholds:
        vocabularies = []
        for rows in partitions:
            # Each call returns (count, vocabulary, removed words, total
            # words); only the vocabulary feeds the plot.
            count, vocabulary, removed_words, word_total = \
                func.e3_1_get_count(rows, threshold)
            vocabularies.append(vocabulary)
        sizes.append(len(func.get_vocabulary_list(*vocabularies)))

    # Red dots: remaining vocabulary size at each threshold.
    plt.plot(thresholds, sizes, 'ro')
    # x spans 0..20; y gets 50 units of headroom above the largest size.
    plt.axis([0, 20, 0, max(sizes) + 50])
    plt.ylabel('number of vocabulary left')
    plt.xlabel('threshold')
    plt.title("1")
    plt.show()